Whamcloud - gitweb
LU-8371 llite: Trust creates in revalidate too.
[fs/lustre-release.git] / lustre / llite / file.c
index 614df9b..b7bd2bf 100644 (file)
@@ -27,7 +27,7 @@
  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
  *
- * Copyright (c) 2011, 2013, Intel Corporation.
+ * Copyright (c) 2011, 2015, Intel Corporation.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
 
 #define DEBUG_SUBSYSTEM S_LLITE
 #include <lustre_dlm.h>
-#include <lustre_lite.h>
 #include <linux/pagemap.h>
 #include <linux/file.h>
 #include <linux/sched.h>
-#include "llite_internal.h"
+#include <linux/user_namespace.h>
+#ifdef HAVE_UIDGID_HEADER
+# include <linux/uidgid.h>
+#endif
 #include <lustre/ll_fiemap.h>
+
 #include <lustre_ioctl.h>
+#include <lustre_swab.h>
 
 #include "cl_object.h"
+#include "llite_internal.h"
+#include "vvp_internal.h"
 
 static int
 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
@@ -81,153 +87,117 @@ static void ll_file_data_put(struct ll_file_data *fd)
                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
 }
 
-void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
-                          struct lustre_handle *fh)
-{
-        op_data->op_fid1 = ll_i2info(inode)->lli_fid;
-        op_data->op_attr.ia_mode = inode->i_mode;
-        op_data->op_attr.ia_atime = inode->i_atime;
-        op_data->op_attr.ia_mtime = inode->i_mtime;
-        op_data->op_attr.ia_ctime = inode->i_ctime;
-        op_data->op_attr.ia_size = i_size_read(inode);
-        op_data->op_attr_blocks = inode->i_blocks;
-        ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
-                                        ll_inode_to_ext_flags(inode->i_flags);
-        op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
-        if (fh)
-                op_data->op_handle = *fh;
-        op_data->op_capa1 = ll_mdscapa_get(inode);
-
-       if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
-               op_data->op_bias |= MDS_DATA_MODIFIED;
-}
-
 /**
- * Closes the IO epoch and packs all the attributes into @op_data for
- * the CLOSE rpc.
+ * Packs all the attributes into @op_data for the CLOSE rpc.
  */
 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
                              struct obd_client_handle *och)
 {
-        ENTRY;
-
-       op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
-                                       ATTR_MTIME | ATTR_MTIME_SET |
-                                       ATTR_CTIME | ATTR_CTIME_SET;
-
-        if (!(och->och_flags & FMODE_WRITE))
-                goto out;
+       ENTRY;
 
-        if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
-                op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
-        else
-                ll_ioepoch_close(inode, op_data, &och, 0);
+       ll_prep_md_op_data(op_data, inode, NULL, NULL,
+                          0, 0, LUSTRE_OPC_ANY, NULL);
+
+       op_data->op_attr.ia_mode = inode->i_mode;
+       op_data->op_attr.ia_atime = inode->i_atime;
+       op_data->op_attr.ia_mtime = inode->i_mtime;
+       op_data->op_attr.ia_ctime = inode->i_ctime;
+       op_data->op_attr.ia_size = i_size_read(inode);
+       op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
+                                    ATTR_MTIME | ATTR_MTIME_SET |
+                                    ATTR_CTIME | ATTR_CTIME_SET;
+       op_data->op_attr_blocks = inode->i_blocks;
+       op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
+       op_data->op_handle = och->och_fh;
+
+       if (och->och_flags & FMODE_WRITE &&
+           ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
+               /* For HSM: if inode data has been modified, pack it so that
+                * MDT can set data dirty flag in the archive. */
+               op_data->op_bias |= MDS_DATA_MODIFIED;
 
-out:
-        ll_pack_inode2opdata(inode, op_data, &och->och_fh);
-        ll_prep_md_op_data(op_data, inode, NULL, NULL,
-                           0, 0, LUSTRE_OPC_ANY, NULL);
-        EXIT;
+       EXIT;
 }
 
-static int ll_close_inode_openhandle(struct obd_export *md_exp,
-                                    struct inode *inode,
+/**
+ * Perform a close, possibly with a bias.
+ * The meaning of "data" depends on the value of "bias".
+ *
+ * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
+ * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
+ * swap layouts with.
+ */
+static int ll_close_inode_openhandle(struct inode *inode,
                                     struct obd_client_handle *och,
-                                    const __u64 *data_version)
+                                    enum mds_op_bias bias, void *data)
 {
-        struct obd_export *exp = ll_i2mdexp(inode);
-        struct md_op_data *op_data;
-        struct ptlrpc_request *req = NULL;
-        struct obd_device *obd = class_exp2obd(exp);
-        int epoch_close = 1;
-        int rc;
-        ENTRY;
+       struct obd_export *md_exp = ll_i2mdexp(inode);
+       const struct ll_inode_info *lli = ll_i2info(inode);
+       struct md_op_data *op_data;
+       struct ptlrpc_request *req = NULL;
+       int rc;
+       ENTRY;
 
-        if (obd == NULL) {
-                /*
-                 * XXX: in case of LMV, is this correct to access
-                 * ->exp_handle?
-                 */
-                CERROR("Invalid MDC connection handle "LPX64"\n",
-                       ll_i2mdexp(inode)->exp_handle.h_cookie);
-                GOTO(out, rc = 0);
-        }
+       if (class_exp2obd(md_exp) == NULL) {
+               CERROR("%s: invalid MDC connection handle closing "DFID"\n",
+                      ll_get_fsname(inode->i_sb, NULL, 0),
+                      PFID(&lli->lli_fid));
+               GOTO(out, rc = 0);
+       }
 
-        OBD_ALLOC_PTR(op_data);
-        if (op_data == NULL)
-                GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
+       OBD_ALLOC_PTR(op_data);
+       /* We leak openhandle and request here on error, but not much to be
+        * done in OOM case since app won't retry close on error either. */
+       if (op_data == NULL)
+               GOTO(out, rc = -ENOMEM);
 
        ll_prepare_close(inode, op_data, och);
-       if (data_version != NULL) {
-               /* Pass in data_version implies release. */
+       switch (bias) {
+       case MDS_CLOSE_LAYOUT_SWAP:
+               LASSERT(data != NULL);
+               op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
+               op_data->op_data_version = 0;
+               op_data->op_lease_handle = och->och_lease_handle;
+               op_data->op_fid2 = *ll_inode2fid(data);
+               break;
+
+       case MDS_HSM_RELEASE:
+               LASSERT(data != NULL);
                op_data->op_bias |= MDS_HSM_RELEASE;
-               op_data->op_data_version = *data_version;
+               op_data->op_data_version = *(__u64 *)data;
                op_data->op_lease_handle = och->och_lease_handle;
                op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
-       }
-        epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
-        rc = md_close(md_exp, op_data, och->och_mod, &req);
-        if (rc == -EAGAIN) {
-                /* This close must have the epoch closed. */
-                LASSERT(epoch_close);
-                /* MDS has instructed us to obtain Size-on-MDS attribute from
-                 * OSTs and send setattr to back to MDS. */
-                rc = ll_som_update(inode, op_data);
-                if (rc) {
-                       CERROR("%s: inode "DFID" mdc Size-on-MDS update"
-                              " failed: rc = %d\n",
-                              ll_i2mdexp(inode)->exp_obd->obd_name,
-                              PFID(ll_inode2fid(inode)), rc);
-                       rc = 0;
-               }
-       } else if (rc) {
-               CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
-                      ll_i2mdexp(inode)->exp_obd->obd_name,
-                      PFID(ll_inode2fid(inode)), rc);
-       }
-
-       /* DATA_MODIFIED flag was successfully sent on close, cancel data
-        * modification flag. */
-       if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
-               struct ll_inode_info *lli = ll_i2info(inode);
+               break;
 
-               spin_lock(&lli->lli_lock);
-               lli->lli_flags &= ~LLIF_DATA_MODIFIED;
-               spin_unlock(&lli->lli_lock);
+       default:
+               LASSERT(data == NULL);
+               break;
        }
 
-        if (rc == 0) {
-                rc = ll_objects_destroy(req, inode);
-                if (rc)
-                       CERROR("%s: inode "DFID
-                              " ll_objects destroy: rc = %d\n",
-                              ll_i2mdexp(inode)->exp_obd->obd_name,
-                              PFID(ll_inode2fid(inode)), rc);
-        }
+       rc = md_close(md_exp, op_data, och->och_mod, &req);
+       if (rc != 0 && rc != -EINTR)
+               CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
+                      md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
 
-       if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
+       if (rc == 0 &&
+           op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
                struct mdt_body *body;
+
                body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
-               if (!(body->mbo_valid & OBD_MD_FLRELEASED))
+               if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
                        rc = -EBUSY;
        }
 
-        ll_finish_md_op_data(op_data);
-        EXIT;
+       ll_finish_md_op_data(op_data);
+       EXIT;
 out:
 
-        if (exp_connect_som(exp) && !epoch_close &&
-            S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
-                ll_queue_done_writing(inode, LLIF_DONE_WRITING);
-        } else {
-                md_clear_open_replay_data(md_exp, och);
-                /* Free @och if it is not waiting for DONE_WRITING. */
-                och->och_fh.cookie = DEAD_HANDLE_MAGIC;
-                OBD_FREE_PTR(och);
-        }
-        if (req) /* This is close request */
-                ptlrpc_req_finished(req);
-        return rc;
+       md_clear_open_replay_data(md_exp, och);
+       och->och_fh.cookie = DEAD_HANDLE_MAGIC;
+       OBD_FREE_PTR(och);
+
+       ptlrpc_req_finished(req);       /* This is close request */
+       return rc;
 }
 
 int ll_md_real_close(struct inode *inode, fmode_t fmode)
@@ -266,24 +236,28 @@ int ll_md_real_close(struct inode *inode, fmode_t fmode)
        if (och != NULL) {
                /* There might be a race and this handle may already
                 * be closed. */
-               rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
-                                              inode, och, NULL);
+               rc = ll_close_inode_openhandle(inode, och, 0, NULL);
        }
 
        RETURN(rc);
 }
 
-static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
-                      struct file *file)
+static int ll_md_close(struct inode *inode, struct file *file)
 {
-        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
-        struct ll_inode_info *lli = ll_i2info(inode);
-        int rc = 0;
-        ENTRY;
+       union ldlm_policy_data policy = {
+               .l_inodebits    = { MDS_INODELOCK_OPEN },
+       };
+       __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
+       struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct lustre_handle lockh;
+       enum ldlm_mode lockmode;
+       int rc = 0;
+       ENTRY;
 
-        /* clear group lock, if present */
-        if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
-                ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
+       /* clear group lock, if present */
+       if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
+               ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
 
        if (fd->fd_lease_och != NULL) {
                bool lease_broken;
@@ -298,51 +272,36 @@ static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
        }
 
        if (fd->fd_och != NULL) {
-               rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
+               rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
                fd->fd_och = NULL;
                GOTO(out, rc);
        }
 
         /* Let's see if we have good enough OPEN lock on the file and if
            we can skip talking to MDS */
-        if (file->f_dentry->d_inode) { /* Can this ever be false? */
-                int lockmode;
-               __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
-                struct lustre_handle lockh;
-                struct inode *inode = file->f_dentry->d_inode;
-                ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
-
-               mutex_lock(&lli->lli_och_mutex);
-                if (fd->fd_omode & FMODE_WRITE) {
-                        lockmode = LCK_CW;
-                        LASSERT(lli->lli_open_fd_write_count);
-                        lli->lli_open_fd_write_count--;
-                } else if (fd->fd_omode & FMODE_EXEC) {
-                        lockmode = LCK_PR;
-                        LASSERT(lli->lli_open_fd_exec_count);
-                        lli->lli_open_fd_exec_count--;
-                } else {
-                        lockmode = LCK_CR;
-                        LASSERT(lli->lli_open_fd_read_count);
-                        lli->lli_open_fd_read_count--;
-                }
-               mutex_unlock(&lli->lli_och_mutex);
+       mutex_lock(&lli->lli_och_mutex);
+       if (fd->fd_omode & FMODE_WRITE) {
+               lockmode = LCK_CW;
+               LASSERT(lli->lli_open_fd_write_count);
+               lli->lli_open_fd_write_count--;
+       } else if (fd->fd_omode & FMODE_EXEC) {
+               lockmode = LCK_PR;
+               LASSERT(lli->lli_open_fd_exec_count);
+               lli->lli_open_fd_exec_count--;
+       } else {
+               lockmode = LCK_CR;
+               LASSERT(lli->lli_open_fd_read_count);
+               lli->lli_open_fd_read_count--;
+       }
+       mutex_unlock(&lli->lli_och_mutex);
 
-                if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
-                                   LDLM_IBITS, &policy, lockmode,
-                                   &lockh)) {
-                        rc = ll_md_real_close(file->f_dentry->d_inode,
-                                              fd->fd_omode);
-                }
-        } else {
-                CERROR("Releasing a file %p with negative dentry %p. Name %s",
-                       file, file->f_dentry, file->f_dentry->d_name.name);
-        }
+       if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
+                          LDLM_IBITS, &policy, lockmode, &lockh))
+               rc = ll_md_real_close(inode, fd->fd_omode);
 
 out:
        LUSTRE_FPRIVATE(file) = NULL;
        ll_file_data_put(fd);
-       ll_capa_close(inode);
 
        RETURN(rc);
 }
@@ -363,152 +322,120 @@ int ll_file_release(struct inode *inode, struct file *file)
        CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
               PFID(ll_inode2fid(inode)), inode);
 
-#ifdef CONFIG_FS_POSIX_ACL
-       if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
-           inode == inode->i_sb->s_root->d_inode) {
-               struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
-
-               LASSERT(fd != NULL);
-               if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
-                       fd->fd_flags &= ~LL_FILE_RMTACL;
-                       rct_del(&sbi->ll_rct, current_pid());
-                       et_search_free(&sbi->ll_et, current_pid());
-               }
-       }
-#endif
-
-        if (inode->i_sb->s_root != file->f_dentry)
+       if (inode->i_sb->s_root != file_dentry(file))
                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
         fd = LUSTRE_FPRIVATE(file);
         LASSERT(fd != NULL);
 
-        /* The last ref on @file, maybe not the the owner pid of statahead.
-         * Different processes can open the same dir, "ll_opendir_key" means:
-         * it is me that should stop the statahead thread. */
-        if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
-            lli->lli_opendir_pid != 0)
-                ll_stop_statahead(inode, lli->lli_opendir_key);
+       /* The last ref on @file, maybe not the owner pid of statahead,
+        * because parent and child process can share the same file handle. */
+       if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
+               ll_deauthorize_statahead(inode, fd);
 
-        if (inode->i_sb->s_root == file->f_dentry) {
-                LUSTRE_FPRIVATE(file) = NULL;
-                ll_file_data_put(fd);
-                RETURN(0);
-        }
+       if (inode->i_sb->s_root == file_dentry(file)) {
+               LUSTRE_FPRIVATE(file) = NULL;
+               ll_file_data_put(fd);
+               RETURN(0);
+       }
 
-        if (!S_ISDIR(inode->i_mode)) {
+       if (!S_ISDIR(inode->i_mode)) {
                if (lli->lli_clob != NULL)
                        lov_read_and_clear_async_rc(lli->lli_clob);
-                lli->lli_async_rc = 0;
-        }
+               lli->lli_async_rc = 0;
+       }
 
-        rc = ll_md_close(sbi->ll_md_exp, inode, file);
+       rc = ll_md_close(inode, file);
 
-        if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
-                libcfs_debug_dumplog();
+       if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
+               libcfs_debug_dumplog();
 
-        RETURN(rc);
+       RETURN(rc);
 }
 
-static int ll_intent_file_open(struct file *file, void *lmm,
-                               int lmmsize, struct lookup_intent *itp)
+static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
+                               struct lookup_intent *itp)
 {
-        struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
-        struct dentry *parent = file->f_dentry->d_parent;
-        const char *name = file->f_dentry->d_name.name;
-        const int len = file->f_dentry->d_name.len;
-        struct md_op_data *op_data;
+       struct dentry *de = file_dentry(file);
+       struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
+       struct dentry *parent = de->d_parent;
+       const char *name = NULL;
+       int len = 0;
+       struct md_op_data *op_data;
        struct ptlrpc_request *req = NULL;
-        __u32 opc = LUSTRE_OPC_ANY;
-        int rc;
-        ENTRY;
+       int rc;
+       ENTRY;
 
-        if (!parent)
-                RETURN(-ENOENT);
-
-        /* Usually we come here only for NFSD, and we want open lock.
-           But we can also get here with pre 2.6.15 patchless kernels, and in
-           that case that lock is also ok */
-        /* We can also get here if there was cached open handle in revalidate_it
-         * but it disappeared while we were getting from there to ll_file_open.
-         * But this means this file was closed and immediatelly opened which
-         * makes a good candidate for using OPEN lock */
-        /* If lmmsize & lmm are not 0, we are just setting stripe info
-         * parameters. No need for the open lock */
-        if (lmm == NULL && lmmsize == 0) {
-                itp->it_flags |= MDS_OPEN_LOCK;
-                if (itp->it_flags & FMODE_WRITE)
-                        opc = LUSTRE_OPC_CREATE;
-        }
+       LASSERT(parent != NULL);
+       LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
 
-        op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
-                                      file->f_dentry->d_inode, name, len,
-                                      O_RDWR, opc, NULL);
-        if (IS_ERR(op_data))
-                RETURN(PTR_ERR(op_data));
+       /* if server supports open-by-fid, or file name is invalid, don't pack
+        * name in open request */
+       if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
+           lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
+               name = de->d_name.name;
+               len = de->d_name.len;
+       }
 
+       op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
+                                    name, len, 0, LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
        op_data->op_data = lmm;
        op_data->op_data_size = lmmsize;
 
-       itp->it_flags |= MDS_OPEN_BY_FID;
        rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
                            &ll_md_blocking_ast, 0);
-        ll_finish_md_op_data(op_data);
-        if (rc == -ESTALE) {
-                /* reason for keep own exit path - don`t flood log
-                * with messages with -ESTALE errors.
-                */
-                if (!it_disposition(itp, DISP_OPEN_OPEN) ||
-                     it_open_error(DISP_OPEN_OPEN, itp))
-                        GOTO(out, rc);
-                ll_release_openhandle(file->f_dentry, itp);
-                GOTO(out, rc);
-        }
+       ll_finish_md_op_data(op_data);
+       if (rc == -ESTALE) {
+               /* Keep a separate exit path for -ESTALE so that the log
+                * is not flooded with error messages.
+                */
+               if (!it_disposition(itp, DISP_OPEN_OPEN) ||
+                    it_open_error(DISP_OPEN_OPEN, itp))
+                       GOTO(out, rc);
+               ll_release_openhandle(de, itp);
+               GOTO(out, rc);
+       }
 
-        if (it_disposition(itp, DISP_LOOKUP_NEG))
-                GOTO(out, rc = -ENOENT);
+       if (it_disposition(itp, DISP_LOOKUP_NEG))
+               GOTO(out, rc = -ENOENT);
 
-        if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
-                rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
-                CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
-                GOTO(out, rc);
-        }
+       if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
+               rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
+               CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
+               GOTO(out, rc);
+       }
 
-        rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
-        if (!rc && itp->d.lustre.it_lock_mode)
-                ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
-                                 itp, NULL);
+       rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
+       if (!rc && itp->it_lock_mode)
+               ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
 
 out:
        ptlrpc_req_finished(req);
-        ll_intent_drop_lock(itp);
-
-        RETURN(rc);
-}
+       ll_intent_drop_lock(itp);
+
+       /* We did open by fid, but by the time we got to the server,
+        * the object disappeared. If this is a create, we cannot really
+        * tell the userspace that the file it was trying to create
+        * does not exist. Instead let's return -ESTALE, and the VFS will
+        * retry the create with LOOKUP_REVAL that we are going to catch
+        * in ll_revalidate_dentry() and use lookup then.
+        */
+       if (rc == -ENOENT && itp->it_op & IT_CREAT)
+               rc = -ESTALE;
 
-/**
- * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
- * not believe attributes if a few ioepoch holders exist. Attributes for
- * previous ioepoch if new one is opened are also skipped by MDS.
- */
-void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
-{
-        if (ioepoch && lli->lli_ioepoch != ioepoch) {
-                lli->lli_ioepoch = ioepoch;
-                CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
-                       ioepoch, PFID(&lli->lli_fid));
-        }
+       RETURN(rc);
 }
 
 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
                       struct obd_client_handle *och)
 {
-       struct ptlrpc_request *req = it->d.lustre.it_data;
        struct mdt_body *body;
 
-       body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+       body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
        och->och_fh = body->mbo_handle;
        och->och_fid = body->mbo_fid1;
-       och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
+       och->och_lease_handle.cookie = it->it_lock_handle;
        och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
        och->och_flags = it->it_flags;
 
@@ -518,25 +445,19 @@ static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
 static int ll_local_open(struct file *file, struct lookup_intent *it,
                         struct ll_file_data *fd, struct obd_client_handle *och)
 {
-        struct inode *inode = file->f_dentry->d_inode;
-        struct ll_inode_info *lli = ll_i2info(inode);
-        ENTRY;
+       struct inode *inode = file_inode(file);
+       ENTRY;
 
-        LASSERT(!LUSTRE_FPRIVATE(file));
+       LASSERT(!LUSTRE_FPRIVATE(file));
 
-        LASSERT(fd != NULL);
+       LASSERT(fd != NULL);
 
-        if (och) {
-                struct ptlrpc_request *req = it->d.lustre.it_data;
-                struct mdt_body *body;
-                int rc;
+       if (och) {
+               int rc;
 
                rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
                if (rc != 0)
                        RETURN(rc);
-
-               body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
-               ll_ioepoch_open(lli, body->mbo_ioepoch);
        }
 
        LUSTRE_FPRIVATE(file) = fd;
@@ -565,43 +486,35 @@ static int ll_local_open(struct file *file, struct lookup_intent *it,
  */
 int ll_file_open(struct inode *inode, struct file *file)
 {
-        struct ll_inode_info *lli = ll_i2info(inode);
-        struct lookup_intent *it, oit = { .it_op = IT_OPEN,
-                                          .it_flags = file->f_flags };
-        struct obd_client_handle **och_p = NULL;
-        __u64 *och_usecount = NULL;
-        struct ll_file_data *fd;
-        int rc = 0, opendir_set = 0;
-        ENTRY;
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct lookup_intent *it, oit = { .it_op = IT_OPEN,
+                                         .it_flags = file->f_flags };
+       struct obd_client_handle **och_p = NULL;
+       __u64 *och_usecount = NULL;
+       struct ll_file_data *fd;
+       int rc = 0;
+       ENTRY;
 
        CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
               PFID(ll_inode2fid(inode)), inode, file->f_flags);
 
-        it = file->private_data; /* XXX: compat macro */
-        file->private_data = NULL; /* prevent ll_local_open assertion */
+       it = file->private_data; /* XXX: compat macro */
+       file->private_data = NULL; /* prevent ll_local_open assertion */
 
        fd = ll_file_data_get();
        if (fd == NULL)
                GOTO(out_openerr, rc = -ENOMEM);
 
        fd->fd_file = file;
-       if (S_ISDIR(inode->i_mode)) {
-               spin_lock(&lli->lli_sa_lock);
-               if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
-                   lli->lli_opendir_pid == 0) {
-                       lli->lli_opendir_key = fd;
-                       lli->lli_opendir_pid = current_pid();
-                       opendir_set = 1;
-               }
-               spin_unlock(&lli->lli_sa_lock);
-       }
+       if (S_ISDIR(inode->i_mode))
+               ll_authorize_statahead(inode, fd);
 
-        if (inode->i_sb->s_root == file->f_dentry) {
+       if (inode->i_sb->s_root == file_dentry(file)) {
                 LUSTRE_FPRIVATE(file) = fd;
                 RETURN(0);
         }
 
-        if (!it || !it->d.lustre.it_disposition) {
+       if (!it || !it->it_disposition) {
                 /* Convert f_flags into access mode. We cannot use file->f_mode,
                  * because everything but O_ACCMODE mask was stripped from
                  * there */
@@ -654,7 +567,7 @@ restart:
                                 GOTO(out_openerr, rc);
                         }
 
-                        ll_release_openhandle(file->f_dentry, it);
+                       ll_release_openhandle(file_dentry(file), it);
                 }
                 (*och_usecount)++;
 
@@ -666,16 +579,40 @@ restart:
                 }
         } else {
                 LASSERT(*och_usecount == 0);
-                if (!it->d.lustre.it_disposition) {
+               if (!it->it_disposition) {
+                       struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
                         /* We cannot just request lock handle now, new ELC code
                            means that one of other OPEN locks for this file
                            could be cancelled, and since blocking ast handler
                            would attempt to grab och_mutex as well, that would
                            result in a deadlock */
                        mutex_unlock(&lli->lli_och_mutex);
-                        it->it_create_mode |= M_CHECK_STALE;
+                       /*
+                        * Normally called under two situations:
+                        * 1. NFS export.
+                        * 2. A race/condition on MDS resulting in no open
+                        *    handle to be returned from LOOKUP|OPEN request,
+                        *    for example if the target entry was a symlink.
+                        *
+                        *  Only fetch MDS_OPEN_LOCK if this is in NFS path,
+                        *  marked by a bit set in ll_iget_for_nfs. Clear the
+                        *  bit so that it's not confusing later callers.
+                        *
+                        *  NB: when ldd is NULL, it must have come via normal
+                        *  lookup path only, since ll_iget_for_nfs always calls
+                        *  ll_d_init().
+                        */
+                       if (ldd && ldd->lld_nfs_dentry) {
+                               ldd->lld_nfs_dentry = 0;
+                               it->it_flags |= MDS_OPEN_LOCK;
+                       }
+
+                        /*
+                        * Always specify MDS_OPEN_BY_FID because we don't want
+                        * to get file with different fid.
+                        */
+                       it->it_flags |= MDS_OPEN_BY_FID;
                         rc = ll_intent_file_open(file, NULL, 0, it);
-                        it->it_create_mode &= ~M_CHECK_STALE;
                         if (rc)
                                 GOTO(out_openerr, rc);
 
@@ -698,7 +635,7 @@ restart:
 
                LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
                         "inode %p: disposition %x, status %d\n", inode,
-                        it_disposition(it, ~0), it->d.lustre.it_status);
+                        it_disposition(it, ~0), it->it_status);
 
                rc = ll_local_open(file, it, fd, *och_p);
                if (rc)
@@ -713,14 +650,6 @@ restart:
         if (!S_ISREG(inode->i_mode))
                 GOTO(out_och_free, rc);
 
-        ll_capa_open(inode);
-
-       if (!lli->lli_has_smd &&
-           (cl_is_lov_delay_create(file->f_flags) ||
-            (file->f_mode & FMODE_WRITE) == 0)) {
-               CDEBUG(D_INODE, "object creation was delayed\n");
-               GOTO(out_och_free, rc);
-       }
        cl_lov_delay_create_clear(&file->f_flags);
        GOTO(out_och_free, rc);
 
@@ -734,16 +663,16 @@ out_och_free:
                mutex_unlock(&lli->lli_och_mutex);
 
 out_openerr:
-                if (opendir_set != 0)
-                        ll_stop_statahead(inode, lli->lli_opendir_key);
-                if (fd != NULL)
-                        ll_file_data_put(fd);
+               if (lli->lli_opendir_key == fd)
+                       ll_deauthorize_statahead(inode, fd);
+               if (fd != NULL)
+                       ll_file_data_put(fd);
         } else {
                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
         }
 
        if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
-               ptlrpc_req_finished(it->d.lustre.it_data);
+               ptlrpc_req_finished(it->it_request);
                it_clear_disposition(it, DISP_ENQ_OPEN_REF);
        }
 
@@ -774,6 +703,95 @@ static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
 }
 
 /**
+ * When setting a lease on a file, we take ownership of the lli_mds_*_och
+ * and save it as fd->fd_och so as to force the client to reopen the file
+ * even if it has an open lock in cache already.
+ *
+ * The whole operation runs under lli->lli_och_mutex.
+ *
+ * \param[in]  inode       inode whose shared open handle may be taken over
+ * \param[in]  file        open file; its private data receives the handle
+ * \param[out] old_handle  set to fd->fd_och->och_fh on success
+ *
+ * \retval 0       success, *old_handle has been filled in
+ * \retval -EBUSY  the file descriptor already has a lease handle, or the
+ *                 shared open handle is counted as used more than once
+ */
+static int ll_lease_och_acquire(struct inode *inode, struct file *file,
+                               struct lustre_handle *old_handle)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+       struct obd_client_handle **och_p;
+       __u64 *och_usecount;
+       int rc = 0;
+       ENTRY;
+
+       /* Get the openhandle of the file */
+       mutex_lock(&lli->lli_och_mutex);
+       if (fd->fd_lease_och != NULL)
+               GOTO(out_unlock, rc = -EBUSY);
+
+       /* No private handle yet: pick the inode-wide handle and use count
+        * that match the open mode of this file. */
+       if (fd->fd_och == NULL) {
+               if (file->f_mode & FMODE_WRITE) {
+                       LASSERT(lli->lli_mds_write_och != NULL);
+                       och_p = &lli->lli_mds_write_och;
+                       och_usecount = &lli->lli_open_fd_write_count;
+               } else {
+                       LASSERT(lli->lli_mds_read_och != NULL);
+                       och_p = &lli->lli_mds_read_och;
+                       och_usecount = &lli->lli_open_fd_read_count;
+               }
+
+               /* The handle is shared with other users; leave it alone. */
+               if (*och_usecount > 1)
+                       GOTO(out_unlock, rc = -EBUSY);
+
+               /* Move the handle from the inode to this file descriptor. */
+               fd->fd_och = *och_p;
+               *och_usecount = 0;
+               *och_p = NULL;
+       }
+
+       *old_handle = fd->fd_och->och_fh;
+
+       EXIT;
+out_unlock:
+       mutex_unlock(&lli->lli_och_mutex);
+       return rc;
+}
+
+/**
+ * Release ownership on lli_mds_*_och when putting back a file lease.
+ *
+ * The handle held in fd->fd_och is either handed back to the inode (when
+ * the inode has no handle for this open mode) or closed (when it has one).
+ * fd->fd_och is cleared in both cases.
+ *
+ * \param[in] inode  inode that receives the handle back
+ * \param[in] file   open file whose private data currently holds the handle
+ *
+ * \retval 0 if the handle was handed back to the inode
+ * \retval return value of ll_close_inode_openhandle() if the handle had to
+ *         be closed
+ */
+static int ll_lease_och_release(struct inode *inode, struct file *file)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+       struct obd_client_handle **och_p;
+       struct obd_client_handle *old_och = NULL;
+       __u64 *och_usecount;
+       int rc = 0;
+       ENTRY;
+
+       mutex_lock(&lli->lli_och_mutex);
+       if (file->f_mode & FMODE_WRITE) {
+               och_p = &lli->lli_mds_write_och;
+               och_usecount = &lli->lli_open_fd_write_count;
+       } else {
+               och_p = &lli->lli_mds_read_och;
+               och_usecount = &lli->lli_open_fd_read_count;
+       }
+
+       /* The file may have been opened by another process (broken lease),
+        * in which case *och_p is not NULL. We then simply increase the
+        * usecount and close fd_och.
+        */
+       if (*och_p != NULL) {
+               old_och = fd->fd_och;
+               (*och_usecount)++;
+       } else {
+               *och_p = fd->fd_och;
+               *och_usecount = 1;
+       }
+       fd->fd_och = NULL;
+       mutex_unlock(&lli->lli_och_mutex);
+
+       /* The close is issued after lli_och_mutex has been dropped. */
+       if (old_och != NULL)
+               rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
+
+       RETURN(rc);
+}
+
+/**
  * Acquire a lease and open the file.
  */
 static struct obd_client_handle *
@@ -794,45 +812,12 @@ ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
                RETURN(ERR_PTR(-EINVAL));
 
        if (file != NULL) {
-               struct ll_inode_info *lli = ll_i2info(inode);
-               struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
-               struct obd_client_handle **och_p;
-               __u64 *och_usecount;
-
                if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
                        RETURN(ERR_PTR(-EPERM));
 
-               /* Get the openhandle of the file */
-               rc = -EBUSY;
-               mutex_lock(&lli->lli_och_mutex);
-               if (fd->fd_lease_och != NULL) {
-                       mutex_unlock(&lli->lli_och_mutex);
-                       RETURN(ERR_PTR(rc));
-               }
-
-               if (fd->fd_och == NULL) {
-                       if (file->f_mode & FMODE_WRITE) {
-                               LASSERT(lli->lli_mds_write_och != NULL);
-                               och_p = &lli->lli_mds_write_och;
-                               och_usecount = &lli->lli_open_fd_write_count;
-                       } else {
-                               LASSERT(lli->lli_mds_read_och != NULL);
-                               och_p = &lli->lli_mds_read_och;
-                               och_usecount = &lli->lli_open_fd_read_count;
-                       }
-                       if (*och_usecount == 1) {
-                               fd->fd_och = *och_p;
-                               *och_p = NULL;
-                               *och_usecount = 0;
-                               rc = 0;
-                       }
-               }
-               mutex_unlock(&lli->lli_och_mutex);
-               if (rc < 0) /* more than 1 opener */
+               rc = ll_lease_och_acquire(inode, file, &old_handle);
+               if (rc)
                        RETURN(ERR_PTR(rc));
-
-               LASSERT(fd->fd_och != NULL);
-               old_handle = fd->fd_och->och_fh;
        }
 
        OBD_ALLOC_PTR(och);
@@ -878,12 +863,12 @@ ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
 
        /* already get lease, handle lease lock */
        ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
-       if (it.d.lustre.it_lock_mode == 0 ||
-           it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
+       if (it.it_lock_mode == 0 ||
+           it.it_lock_bits != MDS_INODELOCK_OPEN) {
                /* open lock must return for lease */
-               CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
-                       PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
-                       it.d.lustre.it_lock_bits);
+               CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
+                       PFID(ll_inode2fid(inode)), it.it_lock_mode,
+                       it.it_lock_bits);
                GOTO(out_close, rc = -EPROTO);
        }
 
@@ -892,13 +877,13 @@ ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
 
 out_close:
        /* Cancel open lock */
-       if (it.d.lustre.it_lock_mode != 0) {
+       if (it.it_lock_mode != 0) {
                ldlm_lock_decref_and_cancel(&och->och_lease_handle,
-                                           it.d.lustre.it_lock_mode);
-               it.d.lustre.it_lock_mode = 0;
+                                           it.it_lock_mode);
+               it.it_lock_mode = 0;
                och->och_lease_handle.cookie = 0ULL;
        }
-       rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
+       rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
        if (rc2 < 0)
                CERROR("%s: error closing file "DFID": %d\n",
                       ll_get_fsname(inode->i_sb, NULL, 0),
@@ -913,6 +898,68 @@ out:
 }
 
 /**
+ * Check whether a layout swap can be done between two inodes.
+ *
+ * The checks are made in the order listed below; the first one that fails
+ * determines the return value.
+ *
+ * \param[in] inode1  First inode to check
+ * \param[in] inode2  Second inode to check
+ *
+ * \retval 0        layout swap can be performed between both inodes
+ * \retval -EINVAL  at least one inode is not a regular file
+ * \retval -EPERM   inode_permission(MAY_WRITE) failed for at least one inode
+ * \retval -EXDEV   the inodes do not belong to the same super block
+ */
+static int ll_check_swap_layouts_validity(struct inode *inode1,
+                                         struct inode *inode2)
+{
+       if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
+               return -EINVAL;
+
+       /* Any permission failure is reported as -EPERM, whatever error
+        * code inode_permission() itself returned. */
+       if (inode_permission(inode1, MAY_WRITE) ||
+           inode_permission(inode2, MAY_WRITE))
+               return -EPERM;
+
+       if (inode1->i_sb != inode2->i_sb)
+               return -EXDEV;
+
+       return 0;
+}
+
+/**
+ * Close an open handle and ask for a layout swap as part of the close.
+ *
+ * The two inodes are first validated with ll_check_swap_layouts_validity()
+ * and must have different FIDs. The handle is then closed with
+ * MDS_CLOSE_LAYOUT_SWAP, passing \a inode2 as the swap partner.
+ *
+ * \param[in] och     open handle to close; it does not remain usable by the
+ *                    caller on any path (freed here on the early error
+ *                    paths, handed to ll_close_inode_openhandle() otherwise)
+ * \param[in] inode   inode the handle belongs to
+ * \param[in] inode2  inode to swap layouts with
+ *
+ * \retval negative error code from ll_check_swap_layouts_validity()
+ * \retval -EINVAL if both inodes have the same FID
+ * \retval return value of ll_close_inode_openhandle() otherwise
+ */
+static int ll_swap_layouts_close(struct obd_client_handle *och,
+                                struct inode *inode, struct inode *inode2)
+{
+       const struct lu_fid     *fid1 = ll_inode2fid(inode);
+       const struct lu_fid     *fid2;
+       int                      rc;
+       ENTRY;
+
+       CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
+              ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
+
+       rc = ll_check_swap_layouts_validity(inode, inode2);
+       if (rc < 0)
+               GOTO(out_free_och, rc);
+
+       /* We now know that inode2 is a lustre inode */
+       fid2 = ll_inode2fid(inode2);
+
+       /* Swapping the layout of a file with itself is rejected. */
+       rc = lu_fid_cmp(fid1, fid2);
+       if (rc == 0)
+               GOTO(out_free_och, rc = -EINVAL);
+
+       /* Close the file and swap layouts between inode & inode2.
+        * NB: lease lock handle is released in mdc_close_layout_swap_pack()
+        * because we still need it to pack l_remote_handle to MDT. */
+       rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
+                                      inode2);
+
+       och = NULL; /* freed in ll_close_inode_openhandle() */
+
+out_free_och:
+       /* Only the early error paths reach here with och still set. */
+       if (och != NULL)
+               OBD_FREE_PTR(och);
+
+       RETURN(rc);
+}
+
+/**
  * Release lease and close the file.
  * It will check if the lease has ever broken.
  */
@@ -933,168 +980,88 @@ static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
        }
 
        CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
-               PFID(&ll_i2info(inode)->lli_fid), cancelled);
+              PFID(&ll_i2info(inode)->lli_fid), cancelled);
 
        if (!cancelled)
                ldlm_cli_cancel(&och->och_lease_handle, 0);
+
        if (lease_broken != NULL)
                *lease_broken = cancelled;
 
-       rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
-                                      NULL);
-       RETURN(rc);
-}
-
-/* Fills the obdo with the attributes for the lsm */
-static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
-                         struct obd_capa *capa, struct obdo *obdo,
-                         __u64 ioepoch, int dv_flags)
-{
-        struct ptlrpc_request_set *set;
-        struct obd_info            oinfo = { { { 0 } } };
-        int                        rc;
-
-        ENTRY;
-
-        LASSERT(lsm != NULL);
-
-        oinfo.oi_md = lsm;
-        oinfo.oi_oa = obdo;
-       oinfo.oi_oa->o_oi = lsm->lsm_oi;
-        oinfo.oi_oa->o_mode = S_IFREG;
-        oinfo.oi_oa->o_ioepoch = ioepoch;
-        oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
-                               OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
-                               OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
-                               OBD_MD_FLMTIME | OBD_MD_FLCTIME |
-                               OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
-                               OBD_MD_FLDATAVERSION;
-        oinfo.oi_capa = capa;
-       if (dv_flags & (LL_DV_WR_FLUSH | LL_DV_RD_FLUSH)) {
-               oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
-               oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
-               if (dv_flags & LL_DV_WR_FLUSH)
-                       oinfo.oi_oa->o_flags |= OBD_FL_FLUSH;
-       }
-
-        set = ptlrpc_prep_set();
-        if (set == NULL) {
-                CERROR("can't allocate ptlrpc set\n");
-                rc = -ENOMEM;
-        } else {
-                rc = obd_getattr_async(exp, &oinfo, set);
-                if (rc == 0)
-                        rc = ptlrpc_set_wait(set);
-                ptlrpc_set_destroy(set);
-        }
-       if (rc == 0) {
-               oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
-                                        OBD_MD_FLATIME | OBD_MD_FLMTIME |
-                                        OBD_MD_FLCTIME | OBD_MD_FLSIZE |
-                                        OBD_MD_FLDATAVERSION | OBD_MD_FLFLAGS);
-               if (dv_flags & LL_DV_WR_FLUSH &&
-                   !(oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
-                     oinfo.oi_oa->o_flags & OBD_FL_FLUSH))
-                       RETURN(-ENOTSUPP);
-       }
+       rc = ll_close_inode_openhandle(inode, och, 0, NULL);
        RETURN(rc);
 }
 
-/**
-  * Performs the getattr on the inode and updates its fields.
-  * If @sync != 0, perform the getattr under the server-side lock.
-  */
-int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
-                     __u64 ioepoch, int sync)
-{
-       struct obd_capa      *capa = ll_mdscapa_get(inode);
-       struct lov_stripe_md *lsm;
-       int rc;
-       ENTRY;
-
-       lsm = ccc_inode_lsm_get(inode);
-       rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
-                               capa, obdo, ioepoch, sync ? LL_DV_RD_FLUSH : 0);
-       capa_put(capa);
-       if (rc == 0) {
-               struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
-
-               obdo_refresh_inode(inode, obdo, obdo->o_valid);
-               CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
-                      " blksize %lu\n", POSTID(oi), i_size_read(inode),
-                      (unsigned long long)inode->i_blocks,
-                      (unsigned long)ll_inode_blksize(inode));
-       }
-       ccc_inode_lsm_put(inode, lsm);
-       RETURN(rc);
-}
-
-int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
+/**
+ * Merge the attributes cached in ll_inode_info with those of the cl_object
+ * and store the result in the VFS inode.
+ *
+ * Runs between ll_inode_size_lock() and ll_inode_size_unlock(). The inode
+ * timestamps are first set from lli_atime/lli_mtime/lli_ctime (atime only
+ * if that moves it forward). If cl_object_attr_get() succeeds, each
+ * timestamp becomes the later of that value and the cl_attr one, and i_size
+ * and i_blocks are taken from the cl_attr. If it fails, i_size and i_blocks
+ * are left untouched.
+ *
+ * \param[in] env    execution environment
+ * \param[in] inode  inode to update
+ *
+ * \retval 0 on success
+ * \retval error returned by cl_object_attr_get() otherwise
+ */
+int ll_merge_attr(const struct lu_env *env, struct inode *inode)
 {
        struct ll_inode_info *lli = ll_i2info(inode);
        struct cl_object *obj = lli->lli_clob;
-       struct cl_attr *attr = ccc_env_thread_attr(env);
-       struct ost_lvb lvb;
+       struct cl_attr *attr = vvp_env_thread_attr(env);
+       s64 atime;
+       s64 mtime;
+       s64 ctime;
        int rc = 0;
 
        ENTRY;
 
        ll_inode_size_lock(inode);
-       /* merge timestamps the most recently obtained from mds with
-          timestamps obtained from osts */
-       LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
-       LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
-       LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
-       inode_init_lvb(inode, &lvb);
+
+       /* Merge the timestamps most recently obtained from the MDS with
+        * the timestamps obtained from the OSTs.
+        *
+        * Do not overwrite atime of inode because it may be refreshed
+        * by file_accessed() function. If the read was served by cache
+        * data, there is no RPC to be sent so that atime may not be
+        * transferred to OSTs at all. MDT only updates atime at close time
+        * if it's at least 'mdd.*.atime_diff' older.
+        * All in all, the atime in Lustre does not strictly comply with
+        * POSIX. Solving this problem would require sending an RPC to the
+        * MDT for each read, which would hurt performance. */
+       if (LTIME_S(inode->i_atime) < lli->lli_atime)
+               LTIME_S(inode->i_atime) = lli->lli_atime;
+       LTIME_S(inode->i_mtime) = lli->lli_mtime;
+       LTIME_S(inode->i_ctime) = lli->lli_ctime;
+
+       atime = LTIME_S(inode->i_atime);
+       mtime = LTIME_S(inode->i_mtime);
+       ctime = LTIME_S(inode->i_ctime);
 
        cl_object_attr_lock(obj);
        rc = cl_object_attr_get(env, obj, attr);
        cl_object_attr_unlock(obj);
 
-       if (rc == 0) {
-               if (lvb.lvb_atime < attr->cat_atime)
-                       lvb.lvb_atime = attr->cat_atime;
-               if (lvb.lvb_ctime < attr->cat_ctime)
-                       lvb.lvb_ctime = attr->cat_ctime;
-               if (lvb.lvb_mtime < attr->cat_mtime)
-                       lvb.lvb_mtime = attr->cat_mtime;
-
-               CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
-                               PFID(&lli->lli_fid), attr->cat_size);
-               cl_isize_write_nolock(inode, attr->cat_size);
-
-               inode->i_blocks = attr->cat_blocks;
-
-               LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
-               LTIME_S(inode->i_atime) = lvb.lvb_atime;
-               LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
-       }
-       ll_inode_size_unlock(inode);
+       /* Without the object attributes only the timestamps set above are
+        * kept; size and blocks are not touched. */
+       if (rc != 0)
+               GOTO(out_size_unlock, rc);
 
-       RETURN(rc);
-}
+       if (atime < attr->cat_atime)
+               atime = attr->cat_atime;
 
-int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
-                     lstat_t *st)
-{
-        struct obdo obdo = { 0 };
-        int rc;
+       if (ctime < attr->cat_ctime)
+               ctime = attr->cat_ctime;
 
-        rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
-        if (rc == 0) {
-                st->st_size   = obdo.o_size;
-                st->st_blocks = obdo.o_blocks;
-                st->st_mtime  = obdo.o_mtime;
-                st->st_atime  = obdo.o_atime;
-                st->st_ctime  = obdo.o_ctime;
-        }
-        return rc;
+       if (mtime < attr->cat_mtime)
+               mtime = attr->cat_mtime;
+
+       CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
+              PFID(&lli->lli_fid), attr->cat_size);
+
+       i_size_write(inode, attr->cat_size);
+       inode->i_blocks = attr->cat_blocks;
+
+       LTIME_S(inode->i_atime) = atime;
+       LTIME_S(inode->i_mtime) = mtime;
+       LTIME_S(inode->i_ctime) = ctime;
+
+out_size_unlock:
+       ll_inode_size_unlock(inode);
+
+       RETURN(rc);
 }
 
 static bool file_is_noatime(const struct file *file)
 {
        const struct vfsmount *mnt = file->f_path.mnt;
-       const struct inode *inode = file->f_path.dentry->d_inode;
+       const struct inode *inode = file_inode((struct file *)file);
 
        /* Adapted from file_accessed() and touch_atime().*/
        if (file->f_flags & O_NOATIME)
@@ -1120,7 +1087,7 @@ static bool file_is_noatime(const struct file *file)
 
 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
 {
-        struct inode *inode = file->f_dentry->d_inode;
+       struct inode *inode = file_inode((struct file *)file);
 
         io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
        if (write) {
@@ -1141,195 +1108,334 @@ static void ll_io_init(struct cl_io *io, const struct file *file, int write)
        io->ci_noatime = file_is_noatime(file);
 }
 
-static ssize_t
-ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
-                  struct file *file, enum cl_io_type iot,
-                  loff_t *ppos, size_t count)
+/**
+ * Common driver for the read and write entry points.
+ *
+ * Sets up the per-thread cl_io, runs cl_io_loop() on it and accumulates the
+ * number of bytes transferred. When bytes are still outstanding, rc is 0 or
+ * -ENODATA and the IO is flagged with ci_need_restart, the remaining range
+ * is retried from the "restart" label.
+ *
+ * \param[in]     env    execution environment
+ * \param[in,out] args   IO arguments; for IO_NORMAL, via_iter is refreshed
+ *                       from the vvp_io when more data remains
+ * \param[in]     file   file being read or written
+ * \param[in]     iot    IO type; CIT_READ and CIT_WRITE get statistics,
+ *                       CIT_WRITE also updates fd_write_failed
+ * \param[in,out] ppos   file position; updated from the cl_io whenever
+ *                       bytes were transferred
+ * \param[in]     count  number of bytes requested
+ *
+ * \retval number of bytes transferred, if greater than zero
+ * \retval rc otherwise (0 or a negative error code)
+ */
+static ssize_t
+ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
+                  struct file *file, enum cl_io_type iot,
+                  loff_t *ppos, size_t count)
+{
+       struct vvp_io           *vio = vvp_env_io(env);
+       struct inode            *inode = file_inode(file);
+       struct ll_inode_info    *lli = ll_i2info(inode);
+       struct ll_file_data     *fd  = LUSTRE_FPRIVATE(file);
+       struct cl_io            *io;
+       ssize_t                 result = 0;
+       int                     rc = 0;
+       struct range_lock       range;
+
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: %llu, count: %zu\n",
+               file_dentry(file)->d_name.name, iot, *ppos, count);
+
+restart:
+       io = vvp_env_thread_io(env);
+       ll_io_init(io, file, iot == CIT_WRITE);
+
+       if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
+               bool range_locked = false;
+
+               /* With O_APPEND the range covers [0, LUSTRE_EOF] instead of
+                * only the requested span. */
+               if (file->f_flags & O_APPEND)
+                       range_lock_init(&range, 0, LUSTRE_EOF);
+               else
+                       range_lock_init(&range, *ppos, *ppos + count - 1);
+
+               vio->vui_fd  = LUSTRE_FPRIVATE(file);
+               vio->vui_io_subtype = args->via_io_subtype;
+
+               switch (vio->vui_io_subtype) {
+               case IO_NORMAL:
+                       vio->vui_iter = args->u.normal.via_iter;
+                       vio->vui_iocb = args->u.normal.via_iocb;
+                       /* Direct IO reads must also take the range lock,
+                        * or multiple reads will try to work on the same
+                        * pages. See LU-6227 for details.
+                        * No range lock is taken when the file descriptor
+                        * is flagged LL_FILE_GROUP_LOCKED. */
+                       if (((iot == CIT_WRITE) ||
+                           (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
+                           !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
+                               CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
+                                      RL_PARA(&range));
+                               rc = range_lock(&lli->lli_write_tree, &range);
+                               if (rc < 0)
+                                       GOTO(out, rc);
+
+                               range_locked = true;
+                       }
+                       break;
+               case IO_SPLICE:
+                       vio->u.splice.vui_pipe = args->u.splice.via_pipe;
+                       vio->u.splice.vui_flags = args->u.splice.via_flags;
+                       break;
+               default:
+                       CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
+                       LBUG();
+               }
+
+               ll_cl_add(file, env, io, LCC_RW);
+               rc = cl_io_loop(env, io);
+               ll_cl_remove(file, env);
+
+               if (range_locked) {
+                       CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
+                              RL_PARA(&range));
+                       range_unlock(&lli->lli_write_tree, &range);
+               }
+       } else {
+               /* cl_io_rw_init() handled IO */
+               rc = io->ci_result;
+       }
+
+       /* Account for whatever this pass transferred, even a partial IO. */
+       if (io->ci_nob > 0) {
+               result += io->ci_nob;
+               count -= io->ci_nob;
+               *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
+
+               /* prepare IO restart */
+               if (count > 0 && args->via_io_subtype == IO_NORMAL)
+                       args->u.normal.via_iter = vio->vui_iter;
+       }
+       GOTO(out, rc);
+out:
+       cl_io_fini(env, io);
+
+       if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
+               CDEBUG(D_VFSTRACE,
+                      "%s: restart %s from %lld, count:%zu, result: %zd\n",
+                      file_dentry(file)->d_name.name,
+                      iot == CIT_READ ? "read" : "write",
+                      *ppos, count, result);
+               goto restart;
+       }
+
+       if (iot == CIT_READ) {
+               if (result > 0)
+                       ll_stats_ops_tally(ll_i2sbi(inode),
+                                          LPROC_LL_READ_BYTES, result);
+       } else if (iot == CIT_WRITE) {
+               if (result > 0) {
+                       ll_stats_ops_tally(ll_i2sbi(inode),
+                                          LPROC_LL_WRITE_BYTES, result);
+                       fd->fd_write_failed = false;
+               } else if (result == 0 && rc == 0) {
+                       /* Nothing written and no error so far: fall back to
+                        * the result recorded in the cl_io. */
+                       rc = io->ci_result;
+                       if (rc < 0)
+                               fd->fd_write_failed = true;
+                       else
+                               fd->fd_write_failed = false;
+               } else if (rc != -ERESTARTSYS) {
+                       fd->fd_write_failed = true;
+               }
+       }
+
+       CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
+
+       return result > 0 ? result : rc;
+}
+
+/**
+ * The purpose of fast read is to overcome per I/O overhead and improve IOPS
+ * especially for small I/O.
+ *
+ * To serve a read request, CLIO has to create and initialize a cl_io and
+ * then request DLM lock. This has turned out to have significant overhead
+ * and affects the performance of small I/O dramatically.
+ *
+ * It's not necessary to create a cl_io for each I/O. With the help of read
+ * ahead, most of the pages being read are already in memory cache and we can
+ * read those pages directly because if the pages exist, the corresponding DLM
+ * lock must exist so that page content must be valid.
+ *
+ * In fast read implementation, the llite speculatively finds and reads pages
+ * in memory cache. There are three scenarios for fast read:
+ *   - If the page exists and is uptodate, kernel VM will provide the data and
+ *     CLIO won't be involved;
+ *   - If the page was brought into memory by read ahead, it will be exported
+ *     and read ahead parameters will be updated;
+ *   - Otherwise the page is not in memory, we can't do fast read. Therefore,
+ *     it will go back and invoke normal read, i.e., a cl_io will be created
+ *     and DLM lock will be requested.
+ *
+ * POSIX compliance: the POSIX standard states that read is intended to be
+ * atomic. Lustre read implementation is in line with Linux kernel read
+ * implementation and neither of them complies with POSIX standard in this
+ * matter. Fast read doesn't make the situation worse on single node but it
+ * may interleave write results from multiple nodes due to short read handling
+ * in ll_file_aio_read().
+ *
+ * \param env - lu_env
+ * \param iocb - kiocb from kernel
+ * \param iter - user space buffers where the data will be copied
+ *
+ * \retval - number of bytes that have been read, or error code if an error
+ *           occurred. 0 is returned when fast read is not enabled for the
+ *           super block, when the file is opened with O_DIRECT, and when
+ *           the underlying read returned -ENODATA.
+ */
+static ssize_t
+ll_do_fast_read(const struct lu_env *env, struct kiocb *iocb,
+               struct iov_iter *iter)
+{
+       ssize_t result;
+
+       if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
+               return 0;
+
+       /* NB: we can't do direct IO for fast read because it will need a lock
+        * to make IO engine happy. */
+       if (iocb->ki_filp->f_flags & O_DIRECT)
+               return 0;
+
+       /* The context is registered with a NULL cl_io for the fast path. */
+       ll_cl_add(iocb->ki_filp, env, NULL, LCC_RW);
+       result = generic_file_read_iter(iocb, iter);
+       ll_cl_remove(iocb->ki_filp, env);
+
+       /* If the first page is not in cache, generic_file_read_iter() will
+        * return -ENODATA.
+        * See corresponding code in ll_readpage(). */
+       if (result == -ENODATA)
+               result = 0;
+
+       if (result > 0)
+               ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
+                               LPROC_LL_READ_BYTES, result);
+
+       return result;
+}
+
+/*
+ * Read from a file (through the page cache).
+ *
+ * ll_do_fast_read() is tried first. If it fails or leaves nothing in the
+ * iterator, its result is returned as is. Otherwise the remainder is read
+ * through ll_file_io_generic() and the byte counts of both steps are added.
+ * An error (or 0) from the second step is returned only when the fast path
+ * read nothing.
+ *
+ * \param iocb - kiocb supplying the file and the file position
+ * \param to - iterator describing where the data will be copied
+ *
+ * \retval - number of bytes read, or a negative error code
+ */
+static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
-       struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
-       struct ll_file_data  *fd  = LUSTRE_FPRIVATE(file);
-       struct cl_io         *io;
-       ssize_t               result;
-       ENTRY;
+       struct lu_env *env;
+       struct vvp_io_args *args;
+       ssize_t result;
+       ssize_t rc2;
+       __u16 refcheck;
 
-       CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zd\n",
-               file->f_dentry->d_name.name, iot, *ppos, count);
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               return PTR_ERR(env);
 
-restart:
-        io = ccc_env_thread_io(env);
-        ll_io_init(io, file, iot == CIT_WRITE);
-
-        if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
-                struct vvp_io *vio = vvp_env_io(env);
-                struct ccc_io *cio = ccc_env_io(env);
-                int write_mutex_locked = 0;
-
-               cio->cui_fd  = LUSTRE_FPRIVATE(file);
-               vio->cui_io_subtype = args->via_io_subtype;
-
-               ll_cl_add(file, env, io);
-
-                switch (vio->cui_io_subtype) {
-                case IO_NORMAL:
-                        cio->cui_iov = args->u.normal.via_iov;
-                        cio->cui_nrsegs = args->u.normal.via_nrsegs;
-                        cio->cui_tot_nrsegs = cio->cui_nrsegs;
-                        cio->cui_iocb = args->u.normal.via_iocb;
-                        if ((iot == CIT_WRITE) &&
-                            !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
-                               if (mutex_lock_interruptible(&lli->
-                                                       lli_write_mutex))
-                                       GOTO(out, result = -ERESTARTSYS);
-                               write_mutex_locked = 1;
-                       }
-                       down_read(&lli->lli_trunc_sem);
-                        break;
-                case IO_SPLICE:
-                        vio->u.splice.cui_pipe = args->u.splice.via_pipe;
-                        vio->u.splice.cui_flags = args->u.splice.via_flags;
-                        break;
-                default:
-                        CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
-                        LBUG();
-                }
-                result = cl_io_loop(env, io);
-               if (args->via_io_subtype == IO_NORMAL)
-                       up_read(&lli->lli_trunc_sem);
-               if (write_mutex_locked)
-                       mutex_unlock(&lli->lli_write_mutex);
-               ll_cl_remove(file, env);
-        } else {
-                /* cl_io_rw_init() handled IO */
-                result = io->ci_result;
-        }
+       /* Stop here on error, or when the fast path consumed the iterator. */
+       result = ll_do_fast_read(env, iocb, to);
+       if (result < 0 || iov_iter_count(to) == 0)
+               GOTO(out, result);
 
-        if (io->ci_nob > 0) {
-                result = io->ci_nob;
-                *ppos = io->u.ci_wr.wr.crw_pos;
-        }
-        GOTO(out, result);
-out:
-        cl_io_fini(env, io);
-       /* If any bit been read/written (result != 0), we just return
-        * short read/write instead of restart io. */
-       if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
-               CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
-                      iot == CIT_READ ? "read" : "write",
-                      file->f_dentry->d_name.name, *ppos, count);
-               LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
-               goto restart;
-       }
+       args = ll_env_args(env, IO_NORMAL);
+       args->u.normal.via_iter = to;
+       args->u.normal.via_iocb = iocb;
 
-        if (iot == CIT_READ) {
-                if (result >= 0)
-                        ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
-                                           LPROC_LL_READ_BYTES, result);
-        } else if (iot == CIT_WRITE) {
-                if (result >= 0) {
-                        ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
-                                           LPROC_LL_WRITE_BYTES, result);
-                       fd->fd_write_failed = false;
-               } else if (result != -ERESTARTSYS) {
-                       fd->fd_write_failed = true;
-               }
-       }
-       CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
+       /* Bytes already delivered by the fast path take precedence over a
+        * failure of the regular path. */
+       rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
+                                &iocb->ki_pos, iov_iter_count(to));
+       if (rc2 > 0)
+               result += rc2;
+       else if (result == 0)
+               result = rc2;
 
+out:
+       cl_env_put(env, &refcheck);
        return result;
 }
 
+/*
+ * Write to a file (through the page cache).
+ *
+ * Packs the iterator and the kiocb into IO_NORMAL arguments and hands the
+ * whole request to ll_file_io_generic() as a CIT_WRITE.
+ *
+ * \param iocb - kiocb supplying the file and the file position
+ * \param from - iterator describing the data to write
+ *
+ * \retval - return value of ll_file_io_generic(), or the error from
+ *           cl_env_get() if no environment could be obtained
+ */
+static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+       struct vvp_io_args *args;
+       struct lu_env *env;
+       ssize_t result;
+       __u16 refcheck;
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               return PTR_ERR(env);
+
+       args = ll_env_args(env, IO_NORMAL);
+       args->u.normal.via_iter = from;
+       args->u.normal.via_iocb = iocb;
 
+       result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
+                                   &iocb->ki_pos, iov_iter_count(from));
+       cl_env_put(env, &refcheck);
+       return result;
+}
+
+#ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
 /*
  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
  */
 static int ll_file_get_iov_count(const struct iovec *iov,
-                                 unsigned long *nr_segs, size_t *count)
+                                unsigned long *nr_segs, size_t *count)
 {
-        size_t cnt = 0;
-        unsigned long seg;
+       size_t cnt = 0;
+       unsigned long seg;
 
-        for (seg = 0; seg < *nr_segs; seg++) {
-                const struct iovec *iv = &iov[seg];
+       for (seg = 0; seg < *nr_segs; seg++) {
+               const struct iovec *iv = &iov[seg];
 
-                /*
-                 * If any segment has a negative length, or the cumulative
-                 * length ever wraps negative then return -EINVAL.
-                 */
-                cnt += iv->iov_len;
-                if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
-                        return -EINVAL;
-                if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
-                        continue;
-                if (seg == 0)
-                        return -EFAULT;
-                *nr_segs = seg;
-                cnt -= iv->iov_len;   /* This segment is no good */
-                break;
-        }
-        *count = cnt;
-        return 0;
+               /*
+                * If any segment has a negative length, or the cumulative
+                * length ever wraps negative then return -EINVAL.
+                */
+               cnt += iv->iov_len;
+               if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
+                       return -EINVAL;
+               if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
+                       continue;
+               if (seg == 0)
+                       return -EFAULT;
+               *nr_segs = seg;
+               cnt -= iv->iov_len;     /* This segment is no good */
+               break;
+       }
+       *count = cnt;
+       return 0;
 }
 
 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
-                                unsigned long nr_segs, loff_t pos)
+                               unsigned long nr_segs, loff_t pos)
 {
-        struct lu_env      *env;
-        struct vvp_io_args *args;
-        size_t              count;
-        ssize_t             result;
-        int                 refcheck;
-        ENTRY;
+       struct iov_iter to;
+       size_t iov_count;
+       ssize_t result;
+       ENTRY;
 
-        result = ll_file_get_iov_count(iov, &nr_segs, &count);
-        if (result)
-                RETURN(result);
+       result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
+       if (result)
+               RETURN(result);
 
-        env = cl_env_get(&refcheck);
-        if (IS_ERR(env))
-                RETURN(PTR_ERR(env));
+# ifdef HAVE_IOV_ITER_INIT_DIRECTION
+       iov_iter_init(&to, READ, iov, nr_segs, iov_count);
+# else /* !HAVE_IOV_ITER_INIT_DIRECTION */
+       iov_iter_init(&to, iov, nr_segs, iov_count, 0);
+# endif /* HAVE_IOV_ITER_INIT_DIRECTION */
 
-        args = vvp_env_args(env, IO_NORMAL);
-        args->u.normal.via_iov = (struct iovec *)iov;
-        args->u.normal.via_nrsegs = nr_segs;
-        args->u.normal.via_iocb = iocb;
+       result = ll_file_read_iter(iocb, &to);
 
-        result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
-                                    &iocb->ki_pos, count);
-        cl_env_put(env, &refcheck);
-        RETURN(result);
+       RETURN(result);
 }
 
-static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
-                            loff_t *ppos)
+static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
+                           loff_t *ppos)
 {
-        struct lu_env *env;
-        struct iovec  *local_iov;
+       struct iovec   iov = { .iov_base = buf, .iov_len = count };
         struct kiocb  *kiocb;
         ssize_t        result;
-        int            refcheck;
         ENTRY;
 
-        env = cl_env_get(&refcheck);
-        if (IS_ERR(env))
-                RETURN(PTR_ERR(env));
+       OBD_ALLOC_PTR(kiocb);
+       if (kiocb == NULL)
+               RETURN(-ENOMEM);
 
-        local_iov = &vvp_env_info(env)->vti_local_iov;
-        kiocb = &vvp_env_info(env)->vti_kiocb;
-        local_iov->iov_base = (void __user *)buf;
-        local_iov->iov_len = count;
         init_sync_kiocb(kiocb, file);
         kiocb->ki_pos = *ppos;
 #ifdef HAVE_KIOCB_KI_LEFT
-        kiocb->ki_left = count;
-#else
-        kiocb->ki_nbytes = count;
+       kiocb->ki_left = count;
+#elif defined(HAVE_KI_NBYTES)
+       kiocb->ki_nbytes = count;
 #endif
 
-        result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
-        *ppos = kiocb->ki_pos;
+       result = ll_file_aio_read(kiocb, &iov, 1, kiocb->ki_pos);
+       *ppos = kiocb->ki_pos;
 
-        cl_env_put(env, &refcheck);
-        RETURN(result);
+       OBD_FREE_PTR(kiocb);
+       RETURN(result);
 }
 
 /*
@@ -1337,66 +1443,59 @@ static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
  * AIO stuff
  */
 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
-                                 unsigned long nr_segs, loff_t pos)
+                                unsigned long nr_segs, loff_t pos)
 {
-        struct lu_env      *env;
-        struct vvp_io_args *args;
-        size_t              count;
-        ssize_t             result;
-        int                 refcheck;
-        ENTRY;
+       struct iov_iter from;
+       size_t iov_count;
+       ssize_t result;
+       ENTRY;
 
-        result = ll_file_get_iov_count(iov, &nr_segs, &count);
-        if (result)
-                RETURN(result);
+       result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
+       if (result)
+               RETURN(result);
 
-        env = cl_env_get(&refcheck);
-        if (IS_ERR(env))
-                RETURN(PTR_ERR(env));
+# ifdef HAVE_IOV_ITER_INIT_DIRECTION
+       iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
+# else /* !HAVE_IOV_ITER_INIT_DIRECTION */
+       iov_iter_init(&from, iov, nr_segs, iov_count, 0);
+# endif /* HAVE_IOV_ITER_INIT_DIRECTION */
 
-        args = vvp_env_args(env, IO_NORMAL);
-        args->u.normal.via_iov = (struct iovec *)iov;
-        args->u.normal.via_nrsegs = nr_segs;
-        args->u.normal.via_iocb = iocb;
+       result = ll_file_write_iter(iocb, &from);
 
-        result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
-                                  &iocb->ki_pos, count);
-        cl_env_put(env, &refcheck);
-        RETURN(result);
+       RETURN(result);
 }
 
-static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
-                             loff_t *ppos)
+static ssize_t ll_file_write(struct file *file, const char __user *buf,
+                            size_t count, loff_t *ppos)
 {
-        struct lu_env *env;
-        struct iovec  *local_iov;
+       struct lu_env *env;
+       struct iovec   iov = { .iov_base = (void __user *)buf,
+                              .iov_len = count };
         struct kiocb  *kiocb;
         ssize_t        result;
-        int            refcheck;
+       __u16          refcheck;
         ENTRY;
 
         env = cl_env_get(&refcheck);
         if (IS_ERR(env))
                 RETURN(PTR_ERR(env));
 
-        local_iov = &vvp_env_info(env)->vti_local_iov;
-        kiocb = &vvp_env_info(env)->vti_kiocb;
-        local_iov->iov_base = (void __user *)buf;
-        local_iov->iov_len = count;
+       kiocb = &ll_env_info(env)->lti_kiocb;
         init_sync_kiocb(kiocb, file);
         kiocb->ki_pos = *ppos;
 #ifdef HAVE_KIOCB_KI_LEFT
-        kiocb->ki_left = count;
-#else
-        kiocb->ki_nbytes = count;
+       kiocb->ki_left = count;
+#elif defined(HAVE_KI_NBYTES)
+       kiocb->ki_nbytes = count;
 #endif
 
-        result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
-        *ppos = kiocb->ki_pos;
+       result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
+       *ppos = kiocb->ki_pos;
 
-        cl_env_put(env, &refcheck);
-        RETURN(result);
+       cl_env_put(env, &refcheck);
+       RETURN(result);
 }
+#endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
 
 /*
  * Send file content (through pagecache) somewhere with helper
@@ -1408,14 +1507,14 @@ static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
         struct lu_env      *env;
         struct vvp_io_args *args;
         ssize_t             result;
-        int                 refcheck;
+       __u16               refcheck;
         ENTRY;
 
         env = cl_env_get(&refcheck);
         if (IS_ERR(env))
                 RETURN(PTR_ERR(env));
 
-        args = vvp_env_args(env, IO_SPLICE);
+       args = ll_env_args(env, IO_SPLICE);
         args->u.splice.via_pipe = pipe;
         args->u.splice.via_flags = flags;
 
@@ -1424,125 +1523,30 @@ static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
         RETURN(result);
 }
 
-static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
-                           obd_count ost_idx)
-{
-       struct obd_export *exp = ll_i2dtexp(inode);
-       struct obd_trans_info oti = { 0 };
-       struct obdo *oa = NULL;
-       int lsm_size;
-       int rc = 0;
-       struct lov_stripe_md *lsm = NULL, *lsm2;
-       ENTRY;
-
-       OBDO_ALLOC(oa);
-       if (oa == NULL)
-               RETURN(-ENOMEM);
-
-       lsm = ccc_inode_lsm_get(inode);
-       if (!lsm_has_objects(lsm))
-                GOTO(out, rc = -ENOENT);
-
-        lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
-                   (lsm->lsm_stripe_count));
-
-        OBD_ALLOC_LARGE(lsm2, lsm_size);
-        if (lsm2 == NULL)
-                GOTO(out, rc = -ENOMEM);
-
-       oa->o_oi = *oi;
-        oa->o_nlink = ost_idx;
-        oa->o_flags |= OBD_FL_RECREATE_OBJS;
-        oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
-        obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
-                                   OBD_MD_FLMTIME | OBD_MD_FLCTIME);
-        obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
-        memcpy(lsm2, lsm, lsm_size);
-       ll_inode_size_lock(inode);
-       rc = obd_create(NULL, exp, oa, &lsm2, &oti);
-       ll_inode_size_unlock(inode);
-
-       OBD_FREE_LARGE(lsm2, lsm_size);
-       GOTO(out, rc);
-out:
-       ccc_inode_lsm_put(inode, lsm);
-       OBDO_FREE(oa);
-       return rc;
-}
-
-static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
-{
-       struct ll_recreate_obj ucreat;
-       struct ost_id           oi;
-       ENTRY;
-
-       if (!cfs_capable(CFS_CAP_SYS_ADMIN))
-               RETURN(-EPERM);
-
-       if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
-                          sizeof(ucreat)))
-               RETURN(-EFAULT);
-
-       ostid_set_seq_mdt0(&oi);
-       ostid_set_id(&oi, ucreat.lrc_id);
-       RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
-}
-
-static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
-{
-       struct lu_fid   fid;
-       struct ost_id   oi;
-       obd_count       ost_idx;
-        ENTRY;
-
-       if (!cfs_capable(CFS_CAP_SYS_ADMIN))
-               RETURN(-EPERM);
-
-       if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
-               RETURN(-EFAULT);
-
-       fid_to_ostid(&fid, &oi);
-       ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
-       RETURN(ll_lov_recreate(inode, &oi, ost_idx));
-}
-
 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
                              __u64  flags, struct lov_user_md *lum,
                             int lum_size)
 {
-       struct lov_stripe_md *lsm = NULL;
-       struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
-       int rc = 0;
+       struct lookup_intent oit = {
+               .it_op = IT_OPEN,
+               .it_flags = flags | MDS_OPEN_BY_FID,
+       };
+       int rc;
        ENTRY;
 
-       lsm = ccc_inode_lsm_get(inode);
-       if (lsm != NULL) {
-               ccc_inode_lsm_put(inode, lsm);
-               CDEBUG(D_IOCTL, "stripe already exists for inode "DFID"\n",
-                      PFID(ll_inode2fid(inode)));
-               GOTO(out, rc = -EEXIST);
-       }
-
        ll_inode_size_lock(inode);
        rc = ll_intent_file_open(file, lum, lum_size, &oit);
-       if (rc)
-               GOTO(out_unlock, rc);
-       rc = oit.d.lustre.it_status;
        if (rc < 0)
-               GOTO(out_req_free, rc);
+               GOTO(out_unlock, rc);
 
-       ll_release_openhandle(file->f_dentry, &oit);
+       ll_release_openhandle(file_dentry(file), &oit);
 
 out_unlock:
        ll_inode_size_unlock(inode);
        ll_intent_release(&oit);
-       ccc_inode_lsm_put(inode, lsm);
-out:
        cl_lov_delay_create_clear(&file->f_flags);
+
        RETURN(rc);
-out_req_free:
-       ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
-       goto out;
 }
 
 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
@@ -1647,66 +1651,58 @@ static int ll_lov_setea(struct inode *inode, struct file *file,
        if (lump == NULL)
                 RETURN(-ENOMEM);
 
-       if (copy_from_user(lump, (struct lov_user_md  *)arg, lum_size)) {
-               OBD_FREE_LARGE(lump, lum_size);
-               RETURN(-EFAULT);
-       }
+       if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size))
+               GOTO(out_lump, rc = -EFAULT);
 
        rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
 
+out_lump:
        OBD_FREE_LARGE(lump, lum_size);
        RETURN(rc);
 }
 
+static int ll_file_getstripe(struct inode *inode,
+                            struct lov_user_md __user *lum)
+{
+       struct lu_env   *env;
+       __u16           refcheck;
+       int             rc;
+       ENTRY;
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
+       cl_env_put(env, &refcheck);
+       RETURN(rc);
+}
+
 static int ll_lov_setstripe(struct inode *inode, struct file *file,
                            unsigned long arg)
 {
-       struct lov_user_md_v3    lumv3;
-       struct lov_user_md_v1   *lumv1 = (struct lov_user_md_v1 *)&lumv3;
-       struct lov_user_md_v1   *lumv1p = (struct lov_user_md_v1 *)arg;
-       struct lov_user_md_v3   *lumv3p = (struct lov_user_md_v3 *)arg;
-       int                      lum_size, rc;
-       __u64                    flags = FMODE_WRITE;
+       struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
+       struct lov_user_md        *klum;
+       int                        lum_size, rc;
+       __u64                      flags = FMODE_WRITE;
        ENTRY;
 
-       /* first try with v1 which is smaller than v3 */
-       lum_size = sizeof(struct lov_user_md_v1);
-       if (copy_from_user(lumv1, lumv1p, lum_size))
-               RETURN(-EFAULT);
-
-       if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
-               lum_size = sizeof(struct lov_user_md_v3);
-               if (copy_from_user(&lumv3, lumv3p, lum_size))
-                       RETURN(-EFAULT);
-       }
+       rc = ll_copy_user_md(lum, &klum);
+       if (rc < 0)
+               RETURN(rc);
 
-       rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
+       lum_size = rc;
+       rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
        if (rc == 0) {
-               struct lov_stripe_md *lsm;
                __u32 gen;
 
-               put_user(0, &lumv1p->lmm_stripe_count);
+               put_user(0, &lum->lmm_stripe_count);
 
                ll_layout_refresh(inode, &gen);
-               lsm = ccc_inode_lsm_get(inode);
-               rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
-                                  0, lsm, (void *)arg);
-               ccc_inode_lsm_put(inode, lsm);
+               rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg);
        }
-       RETURN(rc);
-}
-
-static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
-{
-       struct lov_stripe_md *lsm;
-       int rc = -ENODATA;
-       ENTRY;
 
-       lsm = ccc_inode_lsm_get(inode);
-       if (lsm != NULL)
-               rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
-                                  lsm, (void *)arg);
-       ccc_inode_lsm_put(inode, lsm);
+       OBD_FREE(klum, lum_size);
        RETURN(rc);
 }
 
@@ -1715,24 +1711,29 @@ ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
 {
         struct ll_inode_info   *lli = ll_i2info(inode);
         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
-        struct ccc_grouplock    grouplock;
+       struct ll_grouplock     grouplock;
         int                     rc;
         ENTRY;
 
+       if (arg == 0) {
+               CWARN("group id for group lock must not be 0\n");
+               RETURN(-EINVAL);
+       }
+
         if (ll_file_nolock(file))
                 RETURN(-EOPNOTSUPP);
 
        spin_lock(&lli->lli_lock);
        if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
                CWARN("group lock already existed with gid %lu\n",
-                     fd->fd_grouplock.cg_gid);
+                     fd->fd_grouplock.lg_gid);
                spin_unlock(&lli->lli_lock);
                RETURN(-EINVAL);
        }
-       LASSERT(fd->fd_grouplock.cg_lock == NULL);
+       LASSERT(fd->fd_grouplock.lg_lock == NULL);
        spin_unlock(&lli->lli_lock);
 
-       rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
+       rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
                              arg, (file->f_flags & O_NONBLOCK), &grouplock);
        if (rc)
                RETURN(rc);
@@ -1753,11 +1754,12 @@ ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
        RETURN(0);
 }
 
-int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
+static int ll_put_grouplock(struct inode *inode, struct file *file,
+                           unsigned long arg)
 {
        struct ll_inode_info   *lli = ll_i2info(inode);
        struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
-       struct ccc_grouplock    grouplock;
+       struct ll_grouplock     grouplock;
        ENTRY;
 
        spin_lock(&lli->lli_lock);
@@ -1766,11 +1768,12 @@ int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
                 CWARN("no group lock held\n");
                 RETURN(-EINVAL);
         }
-        LASSERT(fd->fd_grouplock.cg_lock != NULL);
 
-        if (fd->fd_grouplock.cg_gid != arg) {
-                CWARN("group lock %lu doesn't match current id %lu\n",
-                       arg, fd->fd_grouplock.cg_gid);
+       LASSERT(fd->fd_grouplock.lg_lock != NULL);
+
+       if (fd->fd_grouplock.lg_gid != arg) {
+               CWARN("group lock %lu doesn't match current id %lu\n",
+                     arg, fd->fd_grouplock.lg_gid);
                spin_unlock(&lli->lli_lock);
                RETURN(-EINVAL);
        }
@@ -1819,12 +1822,11 @@ int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
 
        ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
 
-        rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
-                                      inode, och, NULL);
+       rc = ll_close_inode_openhandle(inode, och, 0, NULL);
 out:
        /* this one is in place of ll_file_open */
        if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
-               ptlrpc_req_finished(it->d.lustre.it_data);
+               ptlrpc_req_finished(it->it_request);
                it_clear_disposition(it, DISP_ENQ_OPEN_REF);
        }
        RETURN(rc);
@@ -1833,96 +1835,97 @@ out:
 /**
  * Get size for inode for which FIEMAP mapping is requested.
  * Make the FIEMAP get_info call and returns the result.
+ * \param fiemap       kernel buffer to hold extents
+ * \param num_bytes    kernel buffer size
  */
-static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
+static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
                        size_t num_bytes)
 {
-       struct obd_export *exp = ll_i2dtexp(inode);
-       struct lov_stripe_md *lsm = NULL;
-        struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
-       __u32 vallen = num_bytes;
-        int rc;
-        ENTRY;
+       struct lu_env                   *env;
+       __u16                           refcheck;
+       int                             rc = 0;
+       struct ll_fiemap_info_key       fmkey = { .lfik_name = KEY_FIEMAP, };
+       ENTRY;
 
-        /* Checks for fiemap flags */
-        if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
-                fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
-                return -EBADR;
-        }
+       /* Checks for fiemap flags */
+       if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
+               fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
+               return -EBADR;
+       }
 
-        /* Check for FIEMAP_FLAG_SYNC */
-        if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
-                rc = filemap_fdatawrite(inode->i_mapping);
-                if (rc)
-                        return rc;
-        }
+       /* Check for FIEMAP_FLAG_SYNC */
+       if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
+               rc = filemap_fdatawrite(inode->i_mapping);
+               if (rc)
+                       return rc;
+       }
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       if (i_size_read(inode) == 0) {
+               rc = ll_glimpse_size(inode);
+               if (rc)
+                       GOTO(out, rc);
+       }
 
-       lsm = ccc_inode_lsm_get(inode);
-       if (lsm == NULL)
-               return -ENOENT;
+       fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
+       obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
+       obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
 
-       /* If the stripe_count > 1 and the application does not understand
-        * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
-        */
-       if (lsm->lsm_stripe_count > 1 &&
-           !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
-               GOTO(out, rc = -EOPNOTSUPP);
-
-       fm_key.oa.o_oi = lsm->lsm_oi;
-        fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
-
-        obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
-        obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
-        /* If filesize is 0, then there would be no objects for mapping */
-        if (fm_key.oa.o_size == 0) {
-                fiemap->fm_mapped_extents = 0;
+       /* If filesize is 0, then there would be no objects for mapping */
+       if (fmkey.lfik_oa.o_size == 0) {
+               fiemap->fm_mapped_extents = 0;
                GOTO(out, rc = 0);
-        }
-
-        memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
+       }
 
-        rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
-                          fiemap, lsm);
-        if (rc)
-                CERROR("obd_get_info failed: rc = %d\n", rc);
+       fmkey.lfik_fiemap = *fiemap;
 
+       rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
+                             &fmkey, fiemap, &num_bytes);
 out:
-       ccc_inode_lsm_put(inode, lsm);
+       cl_env_put(env, &refcheck);
        RETURN(rc);
 }
 
-int ll_fid2path(struct inode *inode, void *arg)
+int ll_fid2path(struct inode *inode, void __user *arg)
 {
        struct obd_export       *exp = ll_i2mdexp(inode);
-       struct getinfo_fid2path *gfout, *gfin;
-       int                      outsize, rc;
+       const struct getinfo_fid2path __user *gfin = arg;
+       __u32                    pathlen;
+       struct getinfo_fid2path *gfout;
+       size_t                   outsize;
+       int                      rc;
+
        ENTRY;
 
        if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
            !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
                RETURN(-EPERM);
 
-       /* Need to get the buflen */
-       OBD_ALLOC_PTR(gfin);
-       if (gfin == NULL)
-               RETURN(-ENOMEM);
-       if (copy_from_user(gfin, arg, sizeof(*gfin))) {
-               OBD_FREE_PTR(gfin);
+       /* Only need to get the buflen */
+       if (get_user(pathlen, &gfin->gf_pathlen))
                RETURN(-EFAULT);
-       }
 
-       outsize = sizeof(*gfout) + gfin->gf_pathlen;
+       if (pathlen > PATH_MAX)
+               RETURN(-EINVAL);
+
+       outsize = sizeof(*gfout) + pathlen;
        OBD_ALLOC(gfout, outsize);
-       if (gfout == NULL) {
-               OBD_FREE_PTR(gfin);
+       if (gfout == NULL)
                RETURN(-ENOMEM);
-       }
-       memcpy(gfout, gfin, sizeof(*gfout));
-       OBD_FREE_PTR(gfin);
+
+       if (copy_from_user(gfout, arg, sizeof(*gfout)))
+               GOTO(gf_free, rc = -EFAULT);
+       /* append root FID after gfout to let MDT know the root FID so that it
+        * can lookup the correct path, this is mainly for fileset.
+        * old server without fileset mount support will ignore this. */
+       *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
 
        /* Call mdc_iocontrol */
        rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
-       if (rc)
+       if (rc != 0)
                GOTO(gf_free, rc);
 
        if (copy_to_user(arg, gfout, outsize))
@@ -1933,102 +1936,58 @@ gf_free:
        RETURN(rc);
 }
 
-static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
-{
-        struct ll_user_fiemap *fiemap_s;
-        size_t num_bytes, ret_bytes;
-        unsigned int extent_count;
-        int rc = 0;
-
-        /* Get the extent count so we can calculate the size of
-         * required fiemap buffer */
-        if (get_user(extent_count,
-            &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
-                RETURN(-EFAULT);
-        num_bytes = sizeof(*fiemap_s) + (extent_count *
-                                         sizeof(struct ll_fiemap_extent));
-
-        OBD_ALLOC_LARGE(fiemap_s, num_bytes);
-        if (fiemap_s == NULL)
-                RETURN(-ENOMEM);
-
-       /* get the fiemap value */
-       if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
-                          sizeof(*fiemap_s)))
-               GOTO(error, rc = -EFAULT);
-
-        /* If fm_extent_count is non-zero, read the first extent since
-         * it is used to calculate end_offset and device from previous
-         * fiemap call. */
-        if (extent_count) {
-                if (copy_from_user(&fiemap_s->fm_extents[0],
-                    (char __user *)arg + sizeof(*fiemap_s),
-                    sizeof(struct ll_fiemap_extent)))
-                        GOTO(error, rc = -EFAULT);
-        }
-
-        rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
-        if (rc)
-                GOTO(error, rc);
-
-        ret_bytes = sizeof(struct ll_user_fiemap);
-
-        if (extent_count != 0)
-                ret_bytes += (fiemap_s->fm_mapped_extents *
-                                 sizeof(struct ll_fiemap_extent));
-
-       if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
-               rc = -EFAULT;
-
-error:
-        OBD_FREE_LARGE(fiemap_s, num_bytes);
-        RETURN(rc);
-}
-
 /*
  * Read the data_version for inode.
  *
  * This value is computed using stripe object version on OST.
  * Version is computed using server side locking.
  *
- * @param sync if do sync on the OST side;
+ * @param flags whether and how to sync on the OST side;
  *             0: no sync
  *             LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
  *             LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
  */
 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
 {
-       struct lov_stripe_md    *lsm = NULL;
-       struct ll_sb_info       *sbi = ll_i2sbi(inode);
-       struct obdo             *obdo = NULL;
-       int                      rc;
+       struct cl_object *obj = ll_i2info(inode)->lli_clob;
+       struct lu_env *env;
+       struct cl_io *io;
+       __u16  refcheck;
+       int result;
+
        ENTRY;
 
-       /* If no stripe, we consider version is 0. */
-       lsm = ccc_inode_lsm_get(inode);
-       if (!lsm_has_objects(lsm)) {
+       /* If no file object initialized, we consider its version is 0. */
+       if (obj == NULL) {
                *data_version = 0;
-               CDEBUG(D_INODE, "No object for inode\n");
-               GOTO(out, rc = 0);
+               RETURN(0);
        }
 
-       OBD_ALLOC_PTR(obdo);
-       if (obdo == NULL)
-               GOTO(out, rc = -ENOMEM);
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
 
-       rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, flags);
-       if (rc == 0) {
-               if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
-                       rc = -EOPNOTSUPP;
-               else
-                       *data_version = obdo->o_data_version;
-       }
+       io = vvp_env_thread_io(env);
+       io->ci_obj = obj;
+       io->u.ci_data_version.dv_data_version = 0;
+       io->u.ci_data_version.dv_flags = flags;
 
-       OBD_FREE_PTR(obdo);
-       EXIT;
-out:
-       ccc_inode_lsm_put(inode, lsm);
-       RETURN(rc);
+restart:
+       if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
+               result = cl_io_loop(env, io);
+       else
+               result = io->ci_result;
+
+       *data_version = io->u.ci_data_version.dv_data_version;
+
+       cl_io_fini(env, io);
+
+       if (unlikely(io->ci_need_restart))
+               goto restart;
+
+       cl_env_put(env, &refcheck);
+
+       RETURN(result);
 }
 
 /*
@@ -2036,11 +1995,11 @@ out:
  */
 int ll_hsm_release(struct inode *inode)
 {
-       struct cl_env_nest nest;
        struct lu_env *env;
        struct obd_client_handle *och = NULL;
        __u64 data_version = 0;
        int rc;
+       __u16 refcheck;
        ENTRY;
 
        CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
@@ -2056,17 +2015,17 @@ int ll_hsm_release(struct inode *inode)
        if (rc != 0)
                GOTO(out, rc);
 
-       env = cl_env_nested_get(&nest);
+       env = cl_env_get(&refcheck);
        if (IS_ERR(env))
                GOTO(out, rc = PTR_ERR(env));
 
-       ll_merge_lvb(env, inode);
-       cl_env_nested_put(&nest, env);
+       ll_merge_attr(env, inode);
+       cl_env_put(env, &refcheck);
 
        /* Release the file.
         * NB: lease lock handle is released in mdc_hsm_release_pack() because
         * we still need it to pack l_remote_handle to MDT. */
-       rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
+       rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
                                       &data_version);
        och = NULL;
 
@@ -2079,10 +2038,12 @@ out:
 }
 
 struct ll_swap_stack {
-       struct iattr             ia1, ia2;
-       __u64                    dv1, dv2;
-       struct inode            *inode1, *inode2;
-       bool                     check_dv1, check_dv2;
+       __u64                    dv1;
+       __u64                    dv2;
+       struct inode            *inode1;
+       struct inode            *inode2;
+       bool                     check_dv1;
+       bool                     check_dv2;
 };
 
 static int ll_swap_layouts(struct file *file1, struct file *file2,
@@ -2099,18 +2060,12 @@ static int ll_swap_layouts(struct file *file1, struct file *file2,
        if (llss == NULL)
                RETURN(-ENOMEM);
 
-       llss->inode1 = file1->f_dentry->d_inode;
-       llss->inode2 = file2->f_dentry->d_inode;
+       llss->inode1 = file_inode(file1);
+       llss->inode2 = file_inode(file2);
 
-       if (!S_ISREG(llss->inode2->i_mode))
-               GOTO(free, rc = -EINVAL);
-
-       if (inode_permission(llss->inode1, MAY_WRITE) ||
-           inode_permission(llss->inode2, MAY_WRITE))
-               GOTO(free, rc = -EPERM);
-
-       if (llss->inode2->i_sb != llss->inode1->i_sb)
-               GOTO(free, rc = -EXDEV);
+       rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
+       if (rc < 0)
+               GOTO(free, rc);
 
        /* we use 2 bool because it is easier to swap than 2 bits */
        if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
@@ -2125,7 +2080,7 @@ static int ll_swap_layouts(struct file *file1, struct file *file2,
 
        rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
        if (rc == 0) /* same file, done! */
-               GOTO(free, rc = 0);
+               GOTO(free, rc);
 
        if (rc < 0) { /* sequentialize it */
                swap(llss->inode1, llss->inode2);
@@ -2147,18 +2102,6 @@ static int ll_swap_layouts(struct file *file1, struct file *file2,
                }
        }
 
-       /* to be able to restore mtime and atime after swap
-        * we need to first save them */
-       if (lsl->sl_flags &
-           (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
-               llss->ia1.ia_mtime = llss->inode1->i_mtime;
-               llss->ia1.ia_atime = llss->inode1->i_atime;
-               llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
-               llss->ia2.ia_mtime = llss->inode2->i_mtime;
-               llss->ia2.ia_atime = llss->inode2->i_atime;
-               llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
-       }
-
        /* ultimate check, before swaping the layouts we check if
         * dataversion has changed (if requested) */
        if (llss->check_dv1) {
@@ -2189,47 +2132,17 @@ static int ll_swap_layouts(struct file *file1, struct file *file2,
        if (IS_ERR(op_data))
                GOTO(free, rc = PTR_ERR(op_data));
 
-       rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
-                          sizeof(*op_data), op_data, NULL);
-       ll_finish_md_op_data(op_data);
-
-putgl:
-       if (gid != 0) {
-               ll_put_grouplock(llss->inode2, file2, gid);
-               ll_put_grouplock(llss->inode1, file1, gid);
-       }
-
-       /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
-       if (rc != 0)
-               GOTO(free, rc);
-
-       /* clear useless flags */
-       if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
-               llss->ia1.ia_valid &= ~ATTR_MTIME;
-               llss->ia2.ia_valid &= ~ATTR_MTIME;
-       }
-
-       if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
-               llss->ia1.ia_valid &= ~ATTR_ATIME;
-               llss->ia2.ia_valid &= ~ATTR_ATIME;
-       }
-
-       /* update time if requested */
-       rc = 0;
-       if (llss->ia2.ia_valid != 0) {
-               mutex_lock(&llss->inode1->i_mutex);
-               rc = ll_setattr(file1->f_dentry, &llss->ia2);
-               mutex_unlock(&llss->inode1->i_mutex);
-       }
+       rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
+                          sizeof(*op_data), op_data, NULL);
+       ll_finish_md_op_data(op_data);
 
-       if (llss->ia1.ia_valid != 0) {
-               int rc1;
+       if (rc < 0)
+               GOTO(putgl, rc);
 
-               mutex_lock(&llss->inode2->i_mutex);
-               rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
-               mutex_unlock(&llss->inode2->i_mutex);
-               if (rc == 0)
-                       rc = rc1;
+putgl:
+       if (gid != 0) {
+               ll_put_grouplock(llss->inode2, file2, gid);
+               ll_put_grouplock(llss->inode1, file1, gid);
        }
 
 free:
@@ -2239,10 +2152,15 @@ free:
        RETURN(rc);
 }
 
-static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
+int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
 {
        struct md_op_data       *op_data;
        int                      rc;
+       ENTRY;
+
+       /* Detect out-of-range masks */
+       if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
+               RETURN(-EINVAL);
 
        /* Non-root users are forbidden to set or clear flags which are
         * NOT defined in HSM_USER_MASK. */
@@ -2250,6 +2168,11 @@ static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
            !cfs_capable(CFS_CAP_SYS_ADMIN))
                RETURN(-EPERM);
 
+       /* Detect out-of-range archive ID */
+       if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
+           (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
+               RETURN(-EINVAL);
+
        op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
                                     LUSTRE_OPC_ANY, hss);
        if (IS_ERR(op_data))
@@ -2305,10 +2228,14 @@ static int ll_hsm_import(struct inode *inode, struct file *file,
                         ATTR_MTIME | ATTR_MTIME_SET |
                         ATTR_ATIME | ATTR_ATIME_SET;
 
-       rc = ll_setattr_raw(file->f_dentry, attr, true);
+       inode_lock(inode);
+
+       rc = ll_setattr_raw(file_dentry(file), attr, true);
        if (rc == -ENODATA)
                rc = 0;
 
+       inode_unlock(inode);
+
 out:
        if (hss != NULL)
                OBD_FREE_PTR(hss);
@@ -2319,10 +2246,101 @@ out:
        RETURN(rc);
 }
 
+static inline long ll_lease_type_from_fmode(fmode_t fmode)
+{
+       return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
+              ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
+}
+
+static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
+{
+       struct inode *inode = file_inode(file);
+       struct iattr ia = {
+               .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
+                           ATTR_MTIME | ATTR_MTIME_SET |
+                           ATTR_CTIME | ATTR_CTIME_SET,
+               .ia_atime = {
+                       .tv_sec = lfu->lfu_atime_sec,
+                       .tv_nsec = lfu->lfu_atime_nsec,
+               },
+               .ia_mtime = {
+                       .tv_sec = lfu->lfu_mtime_sec,
+                       .tv_nsec = lfu->lfu_mtime_nsec,
+               },
+               .ia_ctime = {
+                       .tv_sec = lfu->lfu_ctime_sec,
+                       .tv_nsec = lfu->lfu_ctime_nsec,
+               },
+       };
+       int rc;
+       ENTRY;
+
+       if (!capable(CAP_SYS_ADMIN))
+               RETURN(-EPERM);
+
+       if (!S_ISREG(inode->i_mode))
+               RETURN(-EINVAL);
+
+       inode_lock(inode);
+       rc = ll_setattr_raw(file_dentry(file), &ia, false);
+       inode_unlock(inode);
+
+       RETURN(rc);
+}
+
+/*
+ * Give file access advice
+ *
+ * The ladvise interface is similar to the Linux fadvise() system call, except
+ * it forwards the advice directly from the Lustre client to the server. The
+ * server-side code will apply appropriate read-ahead and caching techniques
+ * for the corresponding files.
+ *
+ * A typical workload for ladvise is e.g. a bunch of different clients are
+ * doing small random reads of a file, so prefetching pages into OSS cache
+ * with big linear reads before the random IO is a net benefit. Fetching
+ * all that data into each client cache with fadvise() may not be, due to
+ * much more data being sent to the client.
+ */
+static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
+                     struct llapi_lu_ladvise *ladvise)
+{
+       struct lu_env *env;
+       struct cl_io *io;
+       struct cl_ladvise_io *lio;
+       int rc;
+       __u16 refcheck;
+       ENTRY;
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       io = vvp_env_thread_io(env);
+       io->ci_obj = ll_i2info(inode)->lli_clob;
+
+       /* initialize parameters for ladvise */
+       lio = &io->u.ci_ladvise;
+       lio->li_start = ladvise->lla_start;
+       lio->li_end = ladvise->lla_end;
+       lio->li_fid = ll_inode2fid(inode);
+       lio->li_advice = ladvise->lla_advice;
+       lio->li_flags = flags;
+
+       if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
+               rc = cl_io_loop(env, io);
+       else
+               rc = io->ci_result;
+
+       cl_io_fini(env, io);
+       cl_env_put(env, &refcheck);
+       RETURN(rc);
+}
+
 static long
 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
-       struct inode            *inode = file->f_dentry->d_inode;
+       struct inode            *inode = file_inode(file);
        struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
        int                      flags, rc;
        ENTRY;
@@ -2338,14 +2356,14 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
         switch(cmd) {
         case LL_IOC_GETFLAGS:
                 /* Get the current value of the file flags */
-                return put_user(fd->fd_flags, (int *)arg);
+               return put_user(fd->fd_flags, (int __user *)arg);
         case LL_IOC_SETFLAGS:
         case LL_IOC_CLRFLAGS:
                 /* Set or clear specific file flags */
                 /* XXX This probably needs checks to ensure the flags are
                  *     not abused, and to handle any flag side effects.
                  */
-                if (get_user(flags, (int *) arg))
+               if (get_user(flags, (int __user *) arg))
                         RETURN(-EFAULT);
 
                 if (cmd == LL_IOC_SETFLAGS) {
@@ -2369,43 +2387,62 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
                struct file *file2;
                struct lustre_swap_layouts lsl;
 
-               if (copy_from_user(&lsl, (char *)arg,
+               if (copy_from_user(&lsl, (char __user *)arg,
                                       sizeof(struct lustre_swap_layouts)))
                        RETURN(-EFAULT);
 
-               if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
+               if ((file->f_flags & O_ACCMODE) == O_RDONLY)
                        RETURN(-EPERM);
 
                file2 = fget(lsl.sl_fd);
                if (file2 == NULL)
                        RETURN(-EBADF);
 
-               rc = -EPERM;
-               if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
+               /* file2 must be opened O_WRONLY or O_RDWR */
+               if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
+                       GOTO(out, rc = -EPERM);
+
+               if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
+                       struct inode                    *inode2;
+                       struct ll_inode_info            *lli;
+                       struct obd_client_handle        *och = NULL;
+
+                       if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
+                               GOTO(out, rc = -EINVAL);
+
+                       lli = ll_i2info(inode);
+                       mutex_lock(&lli->lli_och_mutex);
+                       if (fd->fd_lease_och != NULL) {
+                               och = fd->fd_lease_och;
+                               fd->fd_lease_och = NULL;
+                       }
+                       mutex_unlock(&lli->lli_och_mutex);
+                       if (och == NULL)
+                               GOTO(out, rc = -ENOLCK);
+                       inode2 = file_inode(file2);
+                       rc = ll_swap_layouts_close(och, inode, inode2);
+               } else {
                        rc = ll_swap_layouts(file, file2, &lsl);
+               }
+out:
                fput(file2);
                RETURN(rc);
        }
-        case LL_IOC_LOV_GETSTRIPE:
-                RETURN(ll_lov_getstripe(inode, arg));
-        case LL_IOC_RECREATE_OBJ:
-                RETURN(ll_lov_recreate_obj(inode, arg));
-        case LL_IOC_RECREATE_FID:
-                RETURN(ll_lov_recreate_fid(inode, arg));
-        case FSFILT_IOC_FIEMAP:
-                RETURN(ll_ioctl_fiemap(inode, arg));
+       case LL_IOC_LOV_GETSTRIPE:
+               RETURN(ll_file_getstripe(inode,
+                                        (struct lov_user_md __user *)arg));
         case FSFILT_IOC_GETFLAGS:
         case FSFILT_IOC_SETFLAGS:
                 RETURN(ll_iocontrol(inode, file, cmd, arg));
         case FSFILT_IOC_GETVERSION_OLD:
         case FSFILT_IOC_GETVERSION:
-                RETURN(put_user(inode->i_generation, (int *)arg));
+               RETURN(put_user(inode->i_generation, (int __user *)arg));
         case LL_IOC_GROUP_LOCK:
                 RETURN(ll_get_grouplock(inode, file, arg));
         case LL_IOC_GROUP_UNLOCK:
                 RETURN(ll_put_grouplock(inode, file, arg));
         case IOC_OBD_STATFS:
-                RETURN(ll_obd_statfs(inode, (void *)arg));
+               RETURN(ll_obd_statfs(inode, (void __user *)arg));
 
         /* We need to special case any other ioctls we want to handle,
          * to send them to the MDS/OST as appropriate and to properly
@@ -2416,25 +2453,29 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
        case LL_IOC_FLUSHCTX:
                RETURN(ll_flush_ctx(inode));
        case LL_IOC_PATH2FID: {
-               if (copy_to_user((void *)arg, ll_inode2fid(inode),
+               if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
                                 sizeof(struct lu_fid)))
                        RETURN(-EFAULT);
 
                RETURN(0);
        }
+       case LL_IOC_GETPARENT:
+               RETURN(ll_getparent(file, (struct getparent __user *)arg));
+
        case OBD_IOC_FID2PATH:
-               RETURN(ll_fid2path(inode, (void *)arg));
+               RETURN(ll_fid2path(inode, (void __user *)arg));
        case LL_IOC_DATA_VERSION: {
                struct ioc_data_version idv;
                int rc;
 
-               if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
+               if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
                        RETURN(-EFAULT);
 
                idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
                rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
 
-               if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
+               if (rc == 0 &&
+                   copy_to_user((char __user *)arg, &idv, sizeof(idv)))
                        RETURN(-EFAULT);
 
                RETURN(rc);
@@ -2447,7 +2488,7 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
                 if (mdtidx < 0)
                         RETURN(mdtidx);
 
-                if (put_user((int)mdtidx, (int*)arg))
+               if (put_user((int)mdtidx, (int __user *)arg))
                         RETURN(-EFAULT);
 
                 RETURN(0);
@@ -2474,7 +2515,7 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
                rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
                                   op_data, NULL);
 
-               if (copy_to_user((void *)arg, hus, sizeof(*hus)))
+               if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
                        rc = -EFAULT;
 
                ll_finish_md_op_data(op_data);
@@ -2489,7 +2530,7 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
                if (hss == NULL)
                        RETURN(-ENOMEM);
 
-               if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
+               if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
                        OBD_FREE_PTR(hss);
                        RETURN(-EFAULT);
                }
@@ -2518,7 +2559,7 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
                rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
                                   op_data, NULL);
 
-               if (copy_to_user((char *)arg, hca, sizeof(*hca)))
+               if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
                        rc = -EFAULT;
 
                ll_finish_md_op_data(op_data);
@@ -2529,20 +2570,20 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
                struct ll_inode_info *lli = ll_i2info(inode);
                struct obd_client_handle *och = NULL;
                bool lease_broken;
-               fmode_t mode = 0;
+               fmode_t fmode;
 
                switch (arg) {
-               case F_WRLCK:
+               case LL_LEASE_WRLCK:
                        if (!(file->f_mode & FMODE_WRITE))
                                RETURN(-EPERM);
-                       mode = FMODE_WRITE;
+                       fmode = FMODE_WRITE;
                        break;
-               case F_RDLCK:
+               case LL_LEASE_RDLCK:
                        if (!(file->f_mode & FMODE_READ))
                                RETURN(-EPERM);
-                       mode = FMODE_READ;
+                       fmode = FMODE_READ;
                        break;
-               case F_UNLCK:
+               case LL_LEASE_UNLCK:
                        mutex_lock(&lli->lli_och_mutex);
                        if (fd->fd_lease_och != NULL) {
                                och = fd->fd_lease_och;
@@ -2550,25 +2591,30 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
                        }
                        mutex_unlock(&lli->lli_och_mutex);
 
-                       if (och != NULL) {
-                               mode = och->och_flags &(FMODE_READ|FMODE_WRITE);
-                               rc = ll_lease_close(och, inode, &lease_broken);
-                               if (rc == 0 && lease_broken)
-                                       mode = 0;
-                       } else {
-                               rc = -ENOLCK;
-                       }
+                       if (och == NULL)
+                               RETURN(-ENOLCK);
+
+                       fmode = och->och_flags;
+                       rc = ll_lease_close(och, inode, &lease_broken);
+                       if (rc < 0)
+                               RETURN(rc);
+
+                       rc = ll_lease_och_release(inode, file);
+                       if (rc < 0)
+                               RETURN(rc);
+
+                       if (lease_broken)
+                               fmode = 0;
 
-                       /* return the type of lease or error */
-                       RETURN(rc < 0 ? rc : (int)mode);
+                       RETURN(ll_lease_type_from_fmode(fmode));
                default:
                        RETURN(-EINVAL);
                }
 
-               CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
+               CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
 
                /* apply for lease */
-               och = ll_lease_open(inode, file, mode, 0);
+               och = ll_lease_open(inode, file, fmode, 0);
                if (IS_ERR(och))
                        RETURN(PTR_ERR(och));
 
@@ -2589,8 +2635,8 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
        case LL_IOC_GET_LEASE: {
                struct ll_inode_info *lli = ll_i2info(inode);
                struct ldlm_lock *lock = NULL;
+               fmode_t fmode = 0;
 
-               rc = 0;
                mutex_lock(&lli->lli_och_mutex);
                if (fd->fd_lease_och != NULL) {
                        struct obd_client_handle *och = fd->fd_lease_och;
@@ -2599,14 +2645,15 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
                        if (lock != NULL) {
                                lock_res_and_lock(lock);
                                if (!ldlm_is_cancel(lock))
-                                       rc = och->och_flags &
-                                               (FMODE_READ | FMODE_WRITE);
+                                       fmode = och->och_flags;
+
                                unlock_res_and_lock(lock);
                                LDLM_LOCK_PUT(lock);
                        }
                }
                mutex_unlock(&lli->lli_och_mutex);
-               RETURN(rc);
+
+               RETURN(ll_lease_type_from_fmode(fmode));
        }
        case LL_IOC_HSM_IMPORT: {
                struct hsm_user_import *hui;
@@ -2615,7 +2662,7 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
                if (hui == NULL)
                        RETURN(-ENOMEM);
 
-               if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
+               if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
                        OBD_FREE_PTR(hui);
                        RETURN(-EFAULT);
                }
@@ -2625,7 +2672,66 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
                OBD_FREE_PTR(hui);
                RETURN(rc);
        }
+       case LL_IOC_FUTIMES_3: {
+               struct ll_futimes_3 lfu;
+
+               if (copy_from_user(&lfu,
+                                  (const struct ll_futimes_3 __user *)arg,
+                                  sizeof(lfu)))
+                       RETURN(-EFAULT);
+
+               RETURN(ll_file_futimes_3(file, &lfu));
+       }
+       case LL_IOC_LADVISE: {
+               struct llapi_ladvise_hdr *ladvise_hdr;
+               int i;
+               int num_advise;
+               int alloc_size = sizeof(*ladvise_hdr);
+
+               rc = 0;
+               OBD_ALLOC_PTR(ladvise_hdr);
+               if (ladvise_hdr == NULL)
+                       RETURN(-ENOMEM);
+
+               if (copy_from_user(ladvise_hdr,
+                                  (const struct llapi_ladvise_hdr __user *)arg,
+                                  alloc_size))
+                       GOTO(out_ladvise, rc = -EFAULT);
+
+               if (ladvise_hdr->lah_magic != LADVISE_MAGIC ||
+                   ladvise_hdr->lah_count < 1)
+                       GOTO(out_ladvise, rc = -EINVAL);
+
+               num_advise = ladvise_hdr->lah_count;
+               if (num_advise >= LAH_COUNT_MAX)
+                       GOTO(out_ladvise, rc = -EFBIG);
 
+               OBD_FREE_PTR(ladvise_hdr);
+               alloc_size = offsetof(typeof(*ladvise_hdr),
+                                     lah_advise[num_advise]);
+               OBD_ALLOC(ladvise_hdr, alloc_size);
+               if (ladvise_hdr == NULL)
+                       RETURN(-ENOMEM);
+
+               /*
+                * TODO: submit multiple advice entries to one server in a single RPC
+                */
+               if (copy_from_user(ladvise_hdr,
+                                  (const struct llapi_ladvise_hdr __user *)arg,
+                                  alloc_size))
+                       GOTO(out_ladvise, rc = -EFAULT);
+
+               for (i = 0; i < num_advise; i++) {
+                       rc = ll_ladvise(inode, file, ladvise_hdr->lah_flags,
+                                       &ladvise_hdr->lah_advise[i]);
+                       if (rc)
+                               break;
+               }
+
+out_ladvise:
+               OBD_FREE(ladvise_hdr, alloc_size);
+               RETURN(rc);
+       }
        default: {
                int err;
 
@@ -2634,7 +2740,7 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
                        RETURN(err);
 
                RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
-                                    (void *)arg));
+                                    (void __user *)arg));
        }
        }
 }
@@ -2659,7 +2765,7 @@ static loff_t
 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
                 loff_t maxsize, loff_t eof)
 {
-       struct inode *inode = file->f_dentry->d_inode;
+       struct inode *inode = file_inode(file);
 
        switch (origin) {
        case SEEK_END:
@@ -2679,9 +2785,9 @@ generic_file_llseek_size(struct file *file, loff_t offset, int origin,
                 * SEEK_CURs. Note that parallel writes and reads behave
                 * like SEEK_SET.
                 */
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                offset = llseek_execute(file, file->f_pos + offset, maxsize);
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
                return offset;
        case SEEK_DATA:
                /*
@@ -2708,7 +2814,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int origin,
 
 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
 {
-       struct inode *inode = file->f_dentry->d_inode;
+       struct inode *inode = file_inode(file);
        loff_t retval, eof = 0;
 
        ENTRY;
@@ -2733,7 +2839,7 @@ static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
 
 static int ll_flush(struct file *file, fl_owner_t id)
 {
-       struct inode *inode = file->f_dentry->d_inode;
+       struct inode *inode = file_inode(file);
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
        int rc, err;
@@ -2766,31 +2872,27 @@ static int ll_flush(struct file *file, fl_owner_t id)
 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
                       enum cl_fsync_mode mode, int ignore_layout)
 {
-       struct cl_env_nest nest;
        struct lu_env *env;
        struct cl_io *io;
-       struct obd_capa *capa = NULL;
        struct cl_fsync_io *fio;
        int result;
+       __u16 refcheck;
        ENTRY;
 
        if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
            mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
                RETURN(-EINVAL);
 
-       env = cl_env_nested_get(&nest);
+       env = cl_env_get(&refcheck);
        if (IS_ERR(env))
                RETURN(PTR_ERR(env));
 
-       capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
-
-       io = ccc_env_thread_io(env);
-       io->ci_obj = cl_i2info(inode)->lli_clob;
+       io = vvp_env_thread_io(env);
+       io->ci_obj = ll_i2info(inode)->lli_clob;
        io->ci_ignore_layout = ignore_layout;
 
        /* initialize parameters for sync */
        fio = &io->u.ci_fsync;
-       fio->fi_capa = capa;
        fio->fi_start = start;
        fio->fi_end = end;
        fio->fi_fid = ll_inode2fid(inode);
@@ -2804,27 +2906,25 @@ int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
        if (result == 0)
                result = fio->fi_nr_written;
        cl_io_fini(env, io);
-       cl_env_nested_put(&nest, env);
-
-       capa_put(capa);
+       cl_env_put(env, &refcheck);
 
        RETURN(result);
 }
 
 /*
- * When dentry is provided (the 'else' case), *file->f_dentry may be
+ * When dentry is provided (the 'else' case), file_dentry() may be
  * null and dentry must be used directly rather than pulled from
- * *file->f_dentry as is done otherwise.
+ * file_dentry() as is done otherwise.
  */
 
 #ifdef HAVE_FILE_FSYNC_4ARGS
 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
-       struct dentry *dentry = file->f_dentry;
+       struct dentry *dentry = file_dentry(file);
 #elif defined(HAVE_FILE_FSYNC_2ARGS)
 int ll_fsync(struct file *file, int datasync)
 {
-       struct dentry *dentry = file->f_dentry;
+       struct dentry *dentry = file_dentry(file);
        loff_t start = 0;
        loff_t end = LLONG_MAX;
 #else
@@ -2836,7 +2936,6 @@ int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
        struct inode *inode = dentry->d_inode;
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ptlrpc_request *req;
-       struct obd_capa *oc;
        int rc, err;
        ENTRY;
 
@@ -2846,7 +2945,7 @@ int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
 
 #ifdef HAVE_FILE_FSYNC_4ARGS
        rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 #else
        /* fsync's caller has already called _fdata{sync,write}, we want
         * that IO to finish before calling the osc and mdc sync methods */
@@ -2865,10 +2964,7 @@ int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
                        rc = err;
        }
 
-       oc = ll_mdscapa_get(inode);
-       err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
-                      &req);
-       capa_put(oc);
+       err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
        if (!rc)
                rc = err;
        if (!err)
@@ -2887,7 +2983,7 @@ int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
        }
 
 #ifdef HAVE_FILE_FSYNC_4ARGS
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 #endif
        RETURN(rc);
 }
@@ -2895,7 +2991,7 @@ int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
 static int
 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
 {
-       struct inode *inode = file->f_dentry->d_inode;
+       struct inode *inode = file_inode(file);
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ldlm_enqueue_info einfo = {
                .ei_type        = LDLM_FLOCK,
@@ -2903,8 +2999,8 @@ ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
                .ei_cbdata      = file_lock,
        };
        struct md_op_data *op_data;
-       struct lustre_handle lockh = {0};
-       ldlm_policy_data_t flock = {{0}};
+       struct lustre_handle lockh = { 0 };
+       union ldlm_policy_data flock = { { 0 } };
        int fl_type = file_lock->fl_type;
        __u64 flags = 0;
        int rc;
@@ -2997,8 +3093,8 @@ ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
         if (IS_ERR(op_data))
                 RETURN(PTR_ERR(op_data));
 
-       CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
-              "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
+       CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
+              "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
               flock.l_flock.pid, flags, einfo.ei_mode,
               flock.l_flock.start, flock.l_flock.end);
 
@@ -3009,6 +3105,11 @@ ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
        if (!(flags & LDLM_FL_TEST_LOCK))
                file_lock->fl_type = fl_type;
 
+#ifdef HAVE_LOCKS_LOCK_FILE_WAIT
+       if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
+           !(flags & LDLM_FL_TEST_LOCK))
+               rc2  = locks_lock_file_wait(file, file_lock);
+#else
         if ((file_lock->fl_flags & FL_FLOCK) &&
             (rc == 0 || file_lock->fl_type == F_UNLCK))
                rc2  = flock_lock_file_wait(file, file_lock);
@@ -3016,6 +3117,7 @@ ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
             !(flags & LDLM_FL_TEST_LOCK))
                rc2  = posix_lock_file_wait(file, file_lock);
+#endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
 
        if (rc2 && file_lock->fl_type != F_UNLCK) {
                einfo.ei_mode = LCK_NL;
@@ -3030,7 +3132,8 @@ ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
 }
 
 int ll_get_fid_by_name(struct inode *parent, const char *name,
-                      int namelen, struct lu_fid *fid)
+                      int namelen, struct lu_fid *fid,
+                      struct inode **inode)
 {
        struct md_op_data       *op_data = NULL;
        struct mdt_body         *body;
@@ -3043,7 +3146,7 @@ int ll_get_fid_by_name(struct inode *parent, const char *name,
        if (IS_ERR(op_data))
                RETURN(PTR_ERR(op_data));
 
-       op_data->op_valid = OBD_MD_FLID;
+       op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
        rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
        ll_finish_md_op_data(op_data);
        if (rc < 0)
@@ -3054,6 +3157,9 @@ int ll_get_fid_by_name(struct inode *parent, const char *name,
                GOTO(out_req, rc = -EFAULT);
        if (fid != NULL)
                *fid = body->mbo_fid1;
+
+       if (inode != NULL)
+               rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
 out_req:
        ptlrpc_req_finished(req);
        RETURN(rc);
@@ -3066,8 +3172,11 @@ int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
        struct inode          *child_inode = NULL;
        struct md_op_data     *op_data;
        struct ptlrpc_request *request = NULL;
+       struct obd_client_handle *och = NULL;
        struct qstr           qstr;
+       struct mdt_body         *body;
        int                    rc;
+       __u64                   data_version = 0;
        ENTRY;
 
        CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
@@ -3082,36 +3191,68 @@ int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
        qstr.hash = full_name_hash(name, namelen);
        qstr.name = name;
        qstr.len = namelen;
-       dchild = d_lookup(file->f_dentry, &qstr);
-       if (dchild != NULL && dchild->d_inode != NULL) {
-               op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
-               if (dchild->d_inode != NULL) {
+       dchild = d_lookup(file_dentry(file), &qstr);
+       if (dchild != NULL) {
+               if (dchild->d_inode != NULL)
                        child_inode = igrab(dchild->d_inode);
-                       ll_invalidate_aliases(child_inode);
-               }
                dput(dchild);
-       } else {
+       }
+
+       if (child_inode == NULL) {
                rc = ll_get_fid_by_name(parent, name, namelen,
-                                       &op_data->op_fid3);
+                                       &op_data->op_fid3, &child_inode);
                if (rc != 0)
                        GOTO(out_free, rc);
        }
 
+       if (child_inode == NULL)
+               GOTO(out_free, rc = -EINVAL);
+
+       /*
+        * The filesystem root must not be migrated: block the lfs
+        * migrate command on the client by comparing the inode being
+        * migrated against the inode of the filesystem root.
+        */
+       if (child_inode == parent->i_sb->s_root->d_inode)
+               GOTO(out_iput, rc = -EINVAL);
+
+       inode_lock(child_inode);
+       op_data->op_fid3 = *ll_inode2fid(child_inode);
        if (!fid_is_sane(&op_data->op_fid3)) {
-               CERROR("%s: migrate %s , but fid "DFID" is insane\n",
+               CERROR("%s: migrate %s, but FID "DFID" is insane\n",
                       ll_get_fsname(parent->i_sb, NULL, 0), name,
                       PFID(&op_data->op_fid3));
-               GOTO(out_free, rc = -EINVAL);
+               GOTO(out_unlock, rc = -EINVAL);
        }
 
        rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
        if (rc < 0)
-               GOTO(out_free, rc);
+               GOTO(out_unlock, rc);
 
        if (rc == mdtidx) {
-               CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
+               CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
                       PFID(&op_data->op_fid3), mdtidx);
-               GOTO(out_free, rc = 0);
+               GOTO(out_unlock, rc = 0);
+       }
+again:
+       if (S_ISREG(child_inode->i_mode)) {
+               och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
+               if (IS_ERR(och)) {
+                       rc = PTR_ERR(och);
+                       och = NULL;
+                       GOTO(out_unlock, rc);
+               }
+
+               rc = ll_data_version(child_inode, &data_version,
+                                    LL_DV_WR_FLUSH);
+               if (rc != 0)
+                       GOTO(out_close, rc);
+
+               op_data->op_handle = och->och_fh;
+               op_data->op_data = och->och_mod;
+               op_data->op_data_version = data_version;
+               op_data->op_lease_handle = och->och_lease_handle;
+               op_data->op_bias |= MDS_RENAME_MIGRATE;
        }
 
        op_data->op_mds = mdtidx;
@@ -3121,16 +3262,42 @@ int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
        if (rc == 0)
                ll_update_times(request, parent);
 
-       ptlrpc_req_finished(request);
-       if (rc != 0)
-               GOTO(out_free, rc);
+       if (request != NULL) {
+               body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
+               if (body == NULL) {
+                       ptlrpc_req_finished(request);
+                       GOTO(out_close, rc = -EPROTO);
+               }
 
-out_free:
-       if (child_inode != NULL) {
-               clear_nlink(child_inode);
-               iput(child_inode);
+               /* If the server does release the layout lock, then we clean up
+                * the client och here, otherwise release it in out_close: */
+               if (och != NULL &&
+                   body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
+                       obd_mod_put(och->och_mod);
+                       md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
+                                                 och);
+                       och->och_fh.cookie = DEAD_HANDLE_MAGIC;
+                       OBD_FREE_PTR(och);
+                       och = NULL;
+               }
+               ptlrpc_req_finished(request);
        }
 
+       /* Try again if the file layout has changed. */
+       if (rc == -EAGAIN && S_ISREG(child_inode->i_mode)) {
+               request = NULL;
+               goto again;
+       }
+out_close:
+       if (och != NULL) /* close the file */
+               ll_lease_close(och, child_inode, NULL);
+       if (rc == 0)
+               clear_nlink(child_inode);
+out_unlock:
+       inode_unlock(child_inode);
+out_iput:
+       iput(child_inode);
+out_free:
        ll_finish_md_op_data(op_data);
        RETURN(rc);
 }
@@ -3153,16 +3320,16 @@ ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
  * \param l_req_mode [IN] searched lock mode
  * \retval boolean, true iff all bits are found
  */
-int ll_have_md_lock(struct inode *inode, __u64 *bits,  ldlm_mode_t l_req_mode)
+int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
 {
-        struct lustre_handle lockh;
-        ldlm_policy_data_t policy;
-        ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
-                                (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
-        struct lu_fid *fid;
+       struct lustre_handle lockh;
+       union ldlm_policy_data policy;
+       enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
+                             (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
+       struct lu_fid *fid;
        __u64 flags;
-        int i;
-        ENTRY;
+       int i;
+       ENTRY;
 
         if (!inode)
                RETURN(0);
@@ -3194,17 +3361,17 @@ int ll_have_md_lock(struct inode *inode, __u64 *bits,  ldlm_mode_t l_req_mode)
         RETURN(*bits == 0);
 }
 
-ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
-                           struct lustre_handle *lockh, __u64 flags,
-                           ldlm_mode_t mode)
+enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
+                              struct lustre_handle *lockh, __u64 flags,
+                              enum ldlm_mode mode)
 {
-        ldlm_policy_data_t policy = { .l_inodebits = {bits}};
-        struct lu_fid *fid;
-        ldlm_mode_t rc;
-        ENTRY;
+       union ldlm_policy_data policy = { .l_inodebits = { bits } };
+       struct lu_fid *fid;
+       enum ldlm_mode rc;
+       ENTRY;
 
-        fid = &ll_i2info(inode)->lli_fid;
-        CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
+       fid = &ll_i2info(inode)->lli_fid;
+       CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
 
        rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
                           fid, LDLM_IBITS, &policy, mode, lockh);
@@ -3217,6 +3384,13 @@ static int ll_inode_revalidate_fini(struct inode *inode, int rc)
        /* Already unlinked. Just update nlink and return success */
        if (rc == -ENOENT) {
                clear_nlink(inode);
+               /* If it is a striped directory and there is a bad stripe,
+                * revalidate the dentry again instead of returning an
+                * error. */
+               if (S_ISDIR(inode->i_mode) &&
+                   ll_i2info(inode)->lli_lsm_md != NULL)
+                       return 0;
+
                /* This path cannot be hit for regular files unless in
                 * case of obscure races, so no need to to validate
                 * size. */
@@ -3264,11 +3438,9 @@ static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
                 if (IS_ERR(op_data))
                         RETURN(PTR_ERR(op_data));
 
-                oit.it_create_mode |= M_CHECK_STALE;
                rc = md_intent_lock(exp, op_data, &oit, &req,
                                    &ll_md_blocking_ast, 0);
                 ll_finish_md_op_data(op_data);
-                oit.it_create_mode &= ~M_CHECK_STALE;
                 if (rc < 0) {
                         rc = ll_inode_revalidate_fini(inode, rc);
                         GOTO (out, rc);
@@ -3284,15 +3456,18 @@ static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
                    here to preserve get_cwd functionality on 2.6.
                    Bug 10503 */
-               if (!dentry->d_inode->i_nlink)
+               if (!dentry->d_inode->i_nlink) {
+                       ll_lock_dcache(inode);
                        d_lustre_invalidate(dentry, 0);
+                       ll_unlock_dcache(inode);
+               }
 
                 ll_lookup_finish_locks(&oit, dentry);
         } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
-                struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
-                obd_valid valid = OBD_MD_FLGETATTR;
-                struct md_op_data *op_data;
-                int ealen = 0;
+               struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
+               u64 valid = OBD_MD_FLGETATTR;
+               struct md_op_data *op_data;
+               int ealen = 0;
 
                if (S_ISREG(inode->i_mode)) {
                        rc = ll_get_default_mdsize(sbi, &ealen);
@@ -3308,9 +3483,6 @@ static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
                         RETURN(PTR_ERR(op_data));
 
                 op_data->op_valid = valid;
-                /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
-                 * capa for this inode. Because we only keep capas of dirs
-                 * fresh. */
                 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
                 ll_finish_md_op_data(op_data);
                 if (rc) {
@@ -3332,16 +3504,17 @@ static int ll_merge_md_attr(struct inode *inode)
 
        LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
        rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
-                          &attr);
+                          &attr, ll_md_blocking_ast);
        if (rc != 0)
                RETURN(rc);
 
-       ll_i2info(inode)->lli_stripe_dir_size = attr.cat_size;
-       ll_i2info(inode)->lli_stripe_dir_nlink = attr.cat_nlink;
+       set_nlink(inode, attr.cat_nlink);
+       inode->i_blocks = attr.cat_blocks;
+       i_size_write(inode, attr.cat_size);
 
-       ll_i2info(inode)->lli_lvb.lvb_atime = attr.cat_atime;
-       ll_i2info(inode)->lli_lvb.lvb_mtime = attr.cat_mtime;
-       ll_i2info(inode)->lli_lvb.lvb_ctime = attr.cat_ctime;
+       ll_i2info(inode)->lli_atime = attr.cat_atime;
+       ll_i2info(inode)->lli_mtime = attr.cat_mtime;
+       ll_i2info(inode)->lli_ctime = attr.cat_ctime;
 
        RETURN(0);
 }
@@ -3366,9 +3539,9 @@ ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
                                RETURN(rc);
                }
 
-               LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
-               LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
-               LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
+               LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
+               LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
+               LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
        } else {
                /* In case of restore, the MDT has the right size and has
                 * already send it back without granting the layout lock,
@@ -3377,7 +3550,7 @@ ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
                 * restore the MDT holds the layout lock so the glimpse will
                 * block up to the end of restore (getattr will block)
                 */
-               if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
+               if (!ll_file_test_flag(ll_i2info(inode), LLIF_FILE_RESTORING))
                        rc = ll_glimpse_size(inode);
        }
        RETURN(rc);
@@ -3397,6 +3570,8 @@ int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
         if (res)
                 return res;
 
+       OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
+
        stat->dev = inode->i_sb->s_dev;
        if (ll_need_32bit_api(sbi))
                stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
@@ -3410,16 +3585,10 @@ int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
        stat->mtime = inode->i_mtime;
        stat->ctime = inode->i_ctime;
        stat->blksize = 1 << inode->i_blkbits;
-       stat->blocks = inode->i_blocks;
 
-       if (S_ISDIR(inode->i_mode) &&
-               ll_i2info(inode)->lli_lsm_md != NULL) {
-               stat->nlink = lli->lli_stripe_dir_nlink;
-               stat->size = lli->lli_stripe_dir_size;
-       } else {
-               stat->nlink = inode->i_nlink;
-               stat->size = i_size_read(inode);
-       }
+       stat->nlink = inode->i_nlink;
+       stat->size = i_size_read(inode);
+       stat->blocks = inode->i_blocks;
 
         return 0;
 }
@@ -3427,35 +3596,37 @@ int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                     __u64 start, __u64 len)
 {
-        int rc;
-        size_t num_bytes;
-        struct ll_user_fiemap *fiemap;
-        unsigned int extent_count = fieinfo->fi_extents_max;
+       int             rc;
+       size_t          num_bytes;
+       struct fiemap   *fiemap;
+       unsigned int    extent_count = fieinfo->fi_extents_max;
 
-        num_bytes = sizeof(*fiemap) + (extent_count *
-                                       sizeof(struct ll_fiemap_extent));
-        OBD_ALLOC_LARGE(fiemap, num_bytes);
+       num_bytes = sizeof(*fiemap) + (extent_count *
+                                      sizeof(struct fiemap_extent));
+       OBD_ALLOC_LARGE(fiemap, num_bytes);
 
-        if (fiemap == NULL)
-                RETURN(-ENOMEM);
+       if (fiemap == NULL)
+               RETURN(-ENOMEM);
 
-        fiemap->fm_flags = fieinfo->fi_flags;
-        fiemap->fm_extent_count = fieinfo->fi_extents_max;
-        fiemap->fm_start = start;
-        fiemap->fm_length = len;
-       if (extent_count > 0)
-               memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
-                      sizeof(struct ll_fiemap_extent));
+       fiemap->fm_flags = fieinfo->fi_flags;
+       fiemap->fm_extent_count = fieinfo->fi_extents_max;
+       fiemap->fm_start = start;
+       fiemap->fm_length = len;
+       if (extent_count > 0 &&
+           copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
+                          sizeof(struct fiemap_extent)) != 0)
+               GOTO(out, rc = -EFAULT);
 
        rc = ll_do_fiemap(inode, fiemap, num_bytes);
 
        fieinfo->fi_flags = fiemap->fm_flags;
        fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
-       if (extent_count > 0)
-               memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
-                      fiemap->fm_mapped_extents *
-                      sizeof(struct ll_fiemap_extent));
-
+       if (extent_count > 0 &&
+           copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
+                        fiemap->fm_mapped_extents *
+                        sizeof(struct fiemap_extent)) != 0)
+               GOTO(out, rc = -EFAULT);
+out:
        OBD_FREE_LARGE(fiemap, num_bytes);
        return rc;
 }
@@ -3575,12 +3746,7 @@ int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
        }
 
        ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
-
-       if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
-               rc = lustre_check_remote_perm(inode, mask);
-       else
-               rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
-
+       rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
        /* restore current process's credentials and FS capability */
        if (squash_id) {
                revert_creds(old_cred);
@@ -3592,53 +3758,80 @@ int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
 
 /* -o localflock - only provides locally consistent flock locks */
 struct file_operations ll_file_operations = {
-        .read           = ll_file_read,
-       .aio_read    = ll_file_aio_read,
-        .write          = ll_file_write,
-       .aio_write   = ll_file_aio_write,
-        .unlocked_ioctl = ll_file_ioctl,
-        .open           = ll_file_open,
-        .release        = ll_file_release,
-        .mmap           = ll_file_mmap,
-        .llseek         = ll_file_seek,
-        .splice_read    = ll_file_splice_read,
-        .fsync          = ll_fsync,
-        .flush          = ll_flush
+#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
+# ifdef HAVE_SYNC_READ_WRITE
+       .read           = new_sync_read,
+       .write          = new_sync_write,
+# endif
+       .read_iter      = ll_file_read_iter,
+       .write_iter     = ll_file_write_iter,
+#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
+       .read           = ll_file_read,
+       .aio_read       = ll_file_aio_read,
+       .write          = ll_file_write,
+       .aio_write      = ll_file_aio_write,
+#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
+       .unlocked_ioctl = ll_file_ioctl,
+       .open           = ll_file_open,
+       .release        = ll_file_release,
+       .mmap           = ll_file_mmap,
+       .llseek         = ll_file_seek,
+       .splice_read    = ll_file_splice_read,
+       .fsync          = ll_fsync,
+       .flush          = ll_flush
 };
 
 struct file_operations ll_file_operations_flock = {
-        .read           = ll_file_read,
-       .aio_read    = ll_file_aio_read,
-        .write          = ll_file_write,
-       .aio_write   = ll_file_aio_write,
-        .unlocked_ioctl = ll_file_ioctl,
-        .open           = ll_file_open,
-        .release        = ll_file_release,
-        .mmap           = ll_file_mmap,
-        .llseek         = ll_file_seek,
-        .splice_read    = ll_file_splice_read,
-        .fsync          = ll_fsync,
-        .flush          = ll_flush,
-        .flock          = ll_file_flock,
-        .lock           = ll_file_flock
+#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
+# ifdef HAVE_SYNC_READ_WRITE
+       .read           = new_sync_read,
+       .write          = new_sync_write,
+# endif /* HAVE_SYNC_READ_WRITE */
+       .read_iter      = ll_file_read_iter,
+       .write_iter     = ll_file_write_iter,
+#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
+       .read           = ll_file_read,
+       .aio_read       = ll_file_aio_read,
+       .write          = ll_file_write,
+       .aio_write      = ll_file_aio_write,
+#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
+       .unlocked_ioctl = ll_file_ioctl,
+       .open           = ll_file_open,
+       .release        = ll_file_release,
+       .mmap           = ll_file_mmap,
+       .llseek         = ll_file_seek,
+       .splice_read    = ll_file_splice_read,
+       .fsync          = ll_fsync,
+       .flush          = ll_flush,
+       .flock          = ll_file_flock,
+       .lock           = ll_file_flock
 };
 
 /* These are for -o noflock - to return ENOSYS on flock calls */
 struct file_operations ll_file_operations_noflock = {
-        .read           = ll_file_read,
-       .aio_read    = ll_file_aio_read,
-        .write          = ll_file_write,
-       .aio_write   = ll_file_aio_write,
-        .unlocked_ioctl = ll_file_ioctl,
-        .open           = ll_file_open,
-        .release        = ll_file_release,
-        .mmap           = ll_file_mmap,
-        .llseek         = ll_file_seek,
-        .splice_read    = ll_file_splice_read,
-        .fsync          = ll_fsync,
-        .flush          = ll_flush,
-        .flock          = ll_file_noflock,
-        .lock           = ll_file_noflock
+#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
+# ifdef HAVE_SYNC_READ_WRITE
+       .read           = new_sync_read,
+       .write          = new_sync_write,
+# endif /* HAVE_SYNC_READ_WRITE */
+       .read_iter      = ll_file_read_iter,
+       .write_iter     = ll_file_write_iter,
+#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
+       .read           = ll_file_read,
+       .aio_read       = ll_file_aio_read,
+       .write          = ll_file_write,
+       .aio_write      = ll_file_aio_write,
+#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
+       .unlocked_ioctl = ll_file_ioctl,
+       .open           = ll_file_open,
+       .release        = ll_file_release,
+       .mmap           = ll_file_mmap,
+       .llseek         = ll_file_seek,
+       .splice_read    = ll_file_splice_read,
+       .fsync          = ll_fsync,
+       .flush          = ll_flush,
+       .flock          = ll_file_noflock,
+       .lock           = ll_file_noflock
 };
 
 struct inode_operations ll_file_inode_operations = {
@@ -3759,48 +3952,53 @@ ll_iocontrol_call(struct inode *inode, struct file *file,
 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
 {
        struct ll_inode_info *lli = ll_i2info(inode);
-       struct cl_env_nest nest;
+       struct cl_object *obj = lli->lli_clob;
        struct lu_env *env;
-       int result;
+       int rc;
+       __u16 refcheck;
        ENTRY;
 
-       if (lli->lli_clob == NULL)
+       if (obj == NULL)
                RETURN(0);
 
-       env = cl_env_nested_get(&nest);
+       env = cl_env_get(&refcheck);
        if (IS_ERR(env))
                RETURN(PTR_ERR(env));
 
-       result = cl_conf_set(env, lli->lli_clob, conf);
-       cl_env_nested_put(&nest, env);
+       rc = cl_conf_set(env, lli->lli_clob, conf);
+       if (rc < 0)
+               GOTO(out, rc);
 
        if (conf->coc_opc == OBJECT_CONF_SET) {
                struct ldlm_lock *lock = conf->coc_lock;
+               struct cl_layout cl = {
+                       .cl_layout_gen = 0,
+               };
 
                LASSERT(lock != NULL);
                LASSERT(ldlm_has_layout(lock));
-               if (result == 0) {
-                       struct lustre_md *md = conf->u.coc_md;
-                       __u32 gen = LL_LAYOUT_GEN_EMPTY;
-
-                       /* it can only be allowed to match after layout is
-                        * applied to inode otherwise false layout would be
-                        * seen. Applying layout shoud happen before dropping
-                        * the intent lock. */
-                       ldlm_lock_allow_match(lock);
-
-                       lli->lli_has_smd = lsm_has_objects(md->lsm);
-                       if (md->lsm != NULL)
-                               gen = md->lsm->lsm_layout_gen;
-
-                       CDEBUG(D_VFSTRACE,
-                              DFID ": layout version change: %u -> %u\n",
-                              PFID(&lli->lli_fid), ll_layout_version_get(lli),
-                              gen);
-                       ll_layout_version_set(lli, gen);
-               }
+
+               /* it can only be allowed to match after layout is
+                * applied to inode otherwise false layout would be
+                * seen. Applying layout should happen before dropping
+                * the intent lock. */
+               ldlm_lock_allow_match(lock);
+
+               rc = cl_object_layout_get(env, obj, &cl);
+               if (rc < 0)
+                       GOTO(out, rc);
+
+               CDEBUG(D_VFSTRACE,
+                      DFID": layout version change: %u -> %u\n",
+                      PFID(&lli->lli_fid), ll_layout_version_get(lli),
+                      cl.cl_layout_gen);
+               ll_layout_version_set(lli, cl.cl_layout_gen);
        }
-       RETURN(result);
+
+out:
+       cl_env_put(env, &refcheck);
+
+       RETURN(rc);
 }
 
 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
@@ -3808,7 +4006,6 @@ static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
 
 {
        struct ll_sb_info *sbi = ll_i2sbi(inode);
-       struct obd_capa *oc;
        struct ptlrpc_request *req;
        struct mdt_body *body;
        void *lvbdata;
@@ -3821,7 +4018,7 @@ static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
               PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
               lock->l_lvb_data, lock->l_lvb_len);
 
-       if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
+       if (lock->l_lvb_data != NULL)
                RETURN(0);
 
        /* if layout lock was granted right away, the layout is returned
@@ -3829,13 +4026,11 @@ static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
         * blocked and then granted via completion ast, we have to fetch
         * layout here. Please note that we can't use the LVB buffer in
         * completion AST because it doesn't have a large enough buffer */
-       oc = ll_mdscapa_get(inode);
        rc = ll_get_default_mdsize(sbi, &lmmsize);
        if (rc == 0)
-               rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
+               rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
                                OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
                                lmmsize, 0, &req);
-       capa_put(oc);
        if (rc < 0)
                RETURN(rc);
 
@@ -3857,13 +4052,17 @@ static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
 
        memcpy(lvbdata, lmm, lmmsize);
        lock_res_and_lock(lock);
-       if (lock->l_lvb_data != NULL)
-               OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
-
-       lock->l_lvb_data = lvbdata;
-       lock->l_lvb_len = lmmsize;
+       if (unlikely(lock->l_lvb_data == NULL)) {
+               lock->l_lvb_type = LVB_T_LAYOUT;
+               lock->l_lvb_data = lvbdata;
+               lock->l_lvb_len = lmmsize;
+               lvbdata = NULL;
+       }
        unlock_res_and_lock(lock);
 
+       if (lvbdata)
+               OBD_FREE_LARGE(lvbdata, lmmsize);
+
        EXIT;
 
 out:
@@ -3875,13 +4074,12 @@ out:
  * Apply the layout to the inode. Layout lock is held and will be released
  * in this function.
  */
-static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
-                               struct inode *inode, __u32 *gen, bool reconf)
+static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
+                             struct inode *inode)
 {
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_sb_info    *sbi = ll_i2sbi(inode);
        struct ldlm_lock *lock;
-       struct lustre_md md = { NULL };
        struct cl_object_conf conf;
        int rc = 0;
        bool lvb_ready;
@@ -3894,11 +4092,11 @@ static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
        LASSERT(lock != NULL);
        LASSERT(ldlm_has_layout(lock));
 
-       LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured: %d\n",
-                  PFID(&lli->lli_fid), inode, reconf);
+       LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
+                  PFID(&lli->lli_fid), inode);
 
        /* in case this is a caching lock and reinstate with new inode */
-       md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
+       md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
 
        lock_res_and_lock(lock);
        lvb_ready = ldlm_is_lvb_ready(lock);
@@ -3906,54 +4104,27 @@ static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
        /* checking lvb_ready is racy but this is okay. The worst case is
         * that multi processes may configure the file on the same time. */
 
-       if (lvb_ready || !reconf) {
-               rc = -ENODATA;
-               if (lvb_ready) {
-                       /* layout_gen must be valid if layout lock is not
-                        * cancelled and stripe has already set */
-                       *gen = ll_layout_version_get(lli);
-                       rc = 0;
-               }
-               GOTO(out, rc);
-       }
+       if (lvb_ready)
+               GOTO(out, rc = 0);
 
        rc = ll_layout_fetch(inode, lock);
        if (rc < 0)
                GOTO(out, rc);
 
-       /* for layout lock, lmm is returned in lock's lvb.
+       /* for layout lock, lmm is stored in lock's lvb.
         * lvb_data is immutable if the lock is held so it's safe to access it
-        * without res lock. See the description in ldlm_lock_decref_internal()
-        * for the condition to free lvb_data of layout lock */
-       if (lock->l_lvb_data != NULL) {
-               rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
-                                 lock->l_lvb_data, lock->l_lvb_len);
-               if (rc >= 0) {
-                       *gen = LL_LAYOUT_GEN_EMPTY;
-                       if (md.lsm != NULL)
-                               *gen = md.lsm->lsm_layout_gen;
-                       rc = 0;
-               } else {
-                       CERROR("%s: file "DFID" unpackmd error: %d\n",
-                               ll_get_fsname(inode->i_sb, NULL, 0),
-                               PFID(&lli->lli_fid), rc);
-               }
-       }
-       if (rc < 0)
-               GOTO(out, rc);
-
-       /* set layout to file. Unlikely this will fail as old layout was
+        * without res lock.
+        *
+        * set layout to file. Unlikely this will fail as old layout was
         * surely eliminated */
        memset(&conf, 0, sizeof conf);
        conf.coc_opc = OBJECT_CONF_SET;
        conf.coc_inode = inode;
        conf.coc_lock = lock;
-       conf.u.coc_md = &md;
+       conf.u.coc_layout.lb_buf = lock->l_lvb_data;
+       conf.u.coc_layout.lb_len = lock->l_lvb_len;
        rc = ll_layout_conf(inode, &conf);
 
-       if (md.lsm != NULL)
-               obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
-
        /* refresh layout failed, need to wait */
        wait_layout = rc == -EBUSY;
        EXIT;
@@ -3982,27 +4153,14 @@ out:
        RETURN(rc);
 }
 
-/**
- * This function checks if there exists a LAYOUT lock on the client side,
- * or enqueues it if it doesn't have one in cache.
- *
- * This function will not hold layout lock so it may be revoked any time after
- * this function returns. Any operations depend on layout should be redone
- * in that case.
- *
- * This function should be called before lov_io_init() to get an uptodate
- * layout version, the caller should save the version number and after IO
- * is finished, this function should be called again to verify that layout
- * is not changed during IO time.
- */
-int ll_layout_refresh(struct inode *inode, __u32 *gen)
+static int ll_layout_refresh_locked(struct inode *inode)
 {
        struct ll_inode_info  *lli = ll_i2info(inode);
        struct ll_sb_info     *sbi = ll_i2sbi(inode);
        struct md_op_data     *op_data;
-       struct lookup_intent   it;
-       struct lustre_handle   lockh;
-       ldlm_mode_t            mode;
+       struct lookup_intent    it;
+       struct lustre_handle    lockh;
+       enum ldlm_mode          mode;
        struct ldlm_enqueue_info einfo = {
                .ei_type = LDLM_IBITS,
                .ei_mode = LCK_CR,
@@ -4012,65 +4170,92 @@ int ll_layout_refresh(struct inode *inode, __u32 *gen)
        int rc;
        ENTRY;
 
-       *gen = ll_layout_version_get(lli);
-       if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
-               RETURN(0);
-
-       /* sanity checks */
-       LASSERT(fid_is_sane(ll_inode2fid(inode)));
-       LASSERT(S_ISREG(inode->i_mode));
-
-       /* take layout lock mutex to enqueue layout lock exclusively. */
-       mutex_lock(&lli->lli_layout_mutex);
-
 again:
        /* mostly layout lock is caching on the local side, so try to match
         * it before grabbing layout lock mutex. */
        mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
                               LCK_CR | LCK_CW | LCK_PR | LCK_PW);
        if (mode != 0) { /* hit cached lock */
-               rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
+               rc = ll_layout_lock_set(&lockh, mode, inode);
                if (rc == -EAGAIN)
                        goto again;
 
-               mutex_unlock(&lli->lli_layout_mutex);
                RETURN(rc);
        }
 
        op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
                                     0, 0, LUSTRE_OPC_ANY, NULL);
-       if (IS_ERR(op_data)) {
-               mutex_unlock(&lli->lli_layout_mutex);
+       if (IS_ERR(op_data))
                RETURN(PTR_ERR(op_data));
-       }
 
        /* have to enqueue one */
        memset(&it, 0, sizeof(it));
        it.it_op = IT_LAYOUT;
        lockh.cookie = 0ULL;
 
-       LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)\n",
+       LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
                          ll_get_fsname(inode->i_sb, NULL, 0),
                          PFID(&lli->lli_fid), inode);
 
        rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
-       if (it.d.lustre.it_data != NULL)
-               ptlrpc_req_finished(it.d.lustre.it_data);
-       it.d.lustre.it_data = NULL;
+       if (it.it_request != NULL)
+               ptlrpc_req_finished(it.it_request);
+       it.it_request = NULL;
 
        ll_finish_md_op_data(op_data);
 
-       mode = it.d.lustre.it_lock_mode;
-       it.d.lustre.it_lock_mode = 0;
+       mode = it.it_lock_mode;
+       it.it_lock_mode = 0;
        ll_intent_drop_lock(&it);
 
        if (rc == 0) {
                /* set lock data in case this is a new lock */
                ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
-               rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
+               rc = ll_layout_lock_set(&lockh, mode, inode);
                if (rc == -EAGAIN)
                        goto again;
        }
+
+       RETURN(rc);
+}
+
+/**
+ * This function checks if there exists a LAYOUT lock on the client side,
+ * or enqueues it if it doesn't have one in cache.
+ *
+ * This function will not hold layout lock so it may be revoked any time after
+ * this function returns. Any operations that depend on the layout should be
+ * redone in that case.
+ *
+ * This function should be called before lov_io_init() to get an up-to-date
+ * layout version. The caller should save the version number and, after IO
+ * is finished, call this function again to verify that the layout has not
+ * changed during the IO.
+ */
+int ll_layout_refresh(struct inode *inode, __u32 *gen)
+{
+       struct ll_inode_info    *lli = ll_i2info(inode);
+       struct ll_sb_info       *sbi = ll_i2sbi(inode);
+       int rc;
+       ENTRY;
+
+       *gen = ll_layout_version_get(lli);
+       if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
+               RETURN(0);
+
+       /* sanity checks */
+       LASSERT(fid_is_sane(ll_inode2fid(inode)));
+       LASSERT(S_ISREG(inode->i_mode));
+
+       /* take layout lock mutex to enqueue layout lock exclusively. */
+       mutex_lock(&lli->lli_layout_mutex);
+
+       rc = ll_layout_refresh_locked(inode);
+       if (rc < 0)
+               GOTO(out, rc);
+
+       *gen = ll_layout_version_get(lli);
+out:
        mutex_unlock(&lli->lli_layout_mutex);
 
        RETURN(rc);
@@ -4099,9 +4284,8 @@ int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
        hur->hur_user_item[0].hui_extent.offset = offset;
        hur->hur_user_item[0].hui_extent.length = length;
        hur->hur_request.hr_itemcount = 1;
-       rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,
+       rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,
                           len, hur, NULL);
        OBD_FREE(hur, len);
        RETURN(rc);
 }
-