LU-744 clio: save memory allocations for cl_page
diff --git a/lustre/lov/lov_object.c b/lustre/lov/lov_object.c
index d5781b4..4b1d3af 100644
--- a/lustre/lov/lov_object.c
+++ b/lustre/lov/lov_object.c
@@ -1,6 +1,4 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
+/*
  * GPL HEADER START
  *
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  * GPL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
  * Implementation of cl_object for LOV layer.
  *
  *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
  */
 
-/** \addtogroup lov lov @{ */
-
 #define DEBUG_SUBSYSTEM S_LOV
 
 #include "lov_cl_internal.h"
+#include <lustre_debug.h>
+
+/** \addtogroup lov
+ *  @{
+ */
 
 /*****************************************************************************
  *
@@ -55,7 +59,7 @@ struct lov_layout_operations {
                         struct lov_object *lov,
                         const struct cl_object_conf *conf,
                         union lov_layout_state *state);
-        void (*llo_delete)(const struct lu_env *env, struct lov_object *lov,
+       int (*llo_delete)(const struct lu_env *env, struct lov_object *lov,
                            union lov_layout_state *state);
         void (*llo_fini)(const struct lu_env *env, struct lov_object *lov,
                          union lov_layout_state *state);
@@ -63,10 +67,8 @@ struct lov_layout_operations {
                             union lov_layout_state *state);
         int  (*llo_print)(const struct lu_env *env, void *cookie,
                           lu_printer_t p, const struct lu_object *o);
-        struct cl_page *(*llo_page_init)(const struct lu_env *env,
-                                         struct cl_object *obj,
-                                         struct cl_page *page,
-                                         cfs_page_t *vmpage);
+        int  (*llo_page_init)(const struct lu_env *env, struct cl_object *obj,
+                               struct cl_page *page, cfs_page_t *vmpage);
         int  (*llo_lock_init)(const struct lu_env *env,
                               struct cl_object *obj, struct cl_lock *lock,
                               const struct cl_io *io);
@@ -103,29 +105,6 @@ static void lov_install_raid0(const struct lu_env *env,
                               struct lov_object *lov,
                               union  lov_layout_state *state)
 {
-        lov->u = *state;
-}
-
-static void oinfo_get_fid(const struct lov_oinfo *oinfo, struct lu_fid *fid)
-{
-        __u64 idx = oinfo->loi_id;
-
-        /* See idif definition in wiki:CMD3_interoperability_architecture */
-
-        LASSERT(oinfo->loi_gr < 1ULL << 16);
-        LASSERT(oinfo->loi_id < 1ULL << 49);
-        ENTRY;
-
-        /*
-         * Now that the fid of stripe is not unique now, ost_idx have to
-         * be used to make it unique. This is ok because the stripe fids
-         * are just used in client side(to locate the objects). -jay
-         */
-        fid->f_seq = ((__u64)oinfo->loi_ost_idx) << 32 |
-                     oinfo->loi_gr << 16 | idx >> 32;
-        fid->f_oid = idx; /* truncated to 32 bits by assignment */
-        fid->f_ver = 0;
-        EXIT;
 }
 
 static struct cl_object *lov_sub_find(const struct lu_env *env,
@@ -151,16 +130,27 @@ static int lov_init_sub(const struct lu_env *env, struct lov_object *lov,
         struct lov_oinfo        *oinfo;
         int result;
 
+       if (OBD_FAIL_CHECK(OBD_FAIL_LOV_INIT)) {
+               /* For sanity test_206:
+                * do not leave the object in cache, to avoid accessing
+                * freed memory. The osc_object refers to the lov_oinfo of
+                * lsm_stripe_data, which will be freed because of this
+                * failure. */
+               cl_object_kill(env, stripe);
+               cl_object_put(env, stripe);
+               return -EIO;
+       }
+
         hdr    = cl_object_header(lov2cl(lov));
         subhdr = cl_object_header(stripe);
         parent = subhdr->coh_parent;
 
-        oinfo = r0->lo_lsm->lsm_oinfo[idx];
-        CDEBUG(D_INODE, DFID"@%p[%d] -> "DFID"@%p: id: "LPU64" gr: "LPU64
+       oinfo = lov->lo_lsm->lsm_oinfo[idx];
+        CDEBUG(D_INODE, DFID"@%p[%d] -> "DFID"@%p: id: "LPU64" seq: "LPU64
                " idx: %d gen: %d\n",
                PFID(&subhdr->coh_lu.loh_fid), subhdr, idx,
                PFID(&hdr->coh_lu.loh_fid), hdr,
-               oinfo->loi_id, oinfo->loi_gr,
+               oinfo->loi_id, oinfo->loi_seq,
                oinfo->loi_ost_idx, oinfo->loi_ost_gen);
 
         if (parent == NULL) {
@@ -172,7 +162,7 @@ static int lov_init_sub(const struct lu_env *env, struct lov_object *lov,
                 r0->lo_sub[idx]->lso_index = idx;
                 result = 0;
         } else {
-                CERROR("Stripe is already owned by other file (%i).\n", idx);
+                CERROR("Stripe is already owned by other file (%d).\n", idx);
                 LU_OBJECT_DEBUG(D_ERROR, env, &stripe->co_lu, "\n");
                 LU_OBJECT_DEBUG(D_ERROR, env, lu_object_top(&parent->coh_lu),
                                 "old\n");
@@ -199,14 +189,23 @@ static int lov_init_raid0(const struct lu_env *env,
         struct lov_layout_raid0 *r0      = &state->raid0;
 
         ENTRY;
-        r0->lo_nr  = conf->u.coc_md->lsm->lsm_stripe_count;
-        r0->lo_lsm = conf->u.coc_md->lsm;
+
+       if (lsm->lsm_magic != LOV_MAGIC_V1 && lsm->lsm_magic != LOV_MAGIC_V3) {
+               dump_lsm(D_ERROR, lsm);
+               LASSERTF(0, "magic mismatch, expected %d/%d, actual %d.\n",
+                        LOV_MAGIC_V1, LOV_MAGIC_V3, lsm->lsm_magic);
+       }
+
+       LASSERT(lov->lo_lsm == NULL);
+       lov->lo_lsm = lsm_addref(lsm);
+       r0->lo_nr  = lsm->lsm_stripe_count;
         LASSERT(r0->lo_nr <= lov_targets_nr(dev));
 
-        OBD_ALLOC(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]);
+        OBD_ALLOC_LARGE(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]);
         if (r0->lo_sub != NULL) {
                 result = 0;
                 subconf->coc_inode = conf->coc_inode;
+               spin_lock_init(&r0->lo_sub_lock);
                 /*
                  * Create stripe cl_objects.
                  */
@@ -215,9 +214,14 @@ static int lov_init_raid0(const struct lu_env *env,
                         struct lov_oinfo *oinfo = lsm->lsm_oinfo[i];
                         int ost_idx = oinfo->loi_ost_idx;
 
-                        oinfo_get_fid(oinfo, ofid);
+                        fid_ostid_unpack(ofid, &oinfo->loi_oi,
+                                         oinfo->loi_ost_idx);
                         subdev = lovsub2cl_dev(dev->ld_target[ost_idx]);
                         subconf->u.coc_oinfo = oinfo;
+                        LASSERTF(subdev != NULL, "not init ost %d\n", ost_idx);
+                       /* In the function below, .hs_keycmp resolves to
+                        * lu_obj_hop_keycmp() */
+                       /* coverity[overrun-buffer-val] */
                         stripe = lov_sub_find(env, subdev, ofid, subconf);
                         if (!IS_ERR(stripe))
                                 result = lov_init_sub(env, lov, stripe, r0, i);
@@ -229,10 +233,12 @@ static int lov_init_raid0(const struct lu_env *env,
         RETURN(result);
 }
 
-static void lov_delete_empty(const struct lu_env *env, struct lov_object *lov,
-                             union lov_layout_state *state)
+static int lov_delete_empty(const struct lu_env *env, struct lov_object *lov,
+                           union lov_layout_state *state)
 {
-        LASSERT(lov->lo_type == LLT_EMPTY);
+       LASSERT(lov->lo_type == LLT_EMPTY);
+       cl_object_prune(env, &lov->lo_cl);
+       return 0;
 }
 
 static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov,
@@ -241,12 +247,16 @@ static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov,
         struct cl_object        *sub;
         struct lov_layout_raid0 *r0;
         struct lu_site          *site;
+        struct lu_site_bkt_data *bkt;
         cfs_waitlink_t          *waiter;
 
         r0  = &lov->u.raid0;
-        sub = lovsub2cl(los);
         LASSERT(r0->lo_sub[idx] == los);
 
+        sub  = lovsub2cl(los);
+        site = sub->co_lu.lo_dev->ld_site;
+        bkt  = lu_site_bkt_from_fid(site, &sub->co_lu.lo_header->loh_fid);
+
         cl_object_kill(env, sub);
         /* release a reference to the sub-object and ... */
         lu_object_ref_del(&sub->co_lu, "lov-parent", lov);
@@ -256,40 +266,56 @@ static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov,
          * ->lo_sub[] slot in lovsub_object_fini() */
         if (r0->lo_sub[idx] == los) {
                 waiter = &lov_env_info(env)->lti_waiter;
-                site   = sub->co_lu.lo_dev->ld_site;
                 cfs_waitlink_init(waiter);
-                cfs_waitq_add(&site->ls_marche_funebre, waiter);
-                set_current_state(CFS_TASK_UNINT);
-
-                while (r0->lo_sub[idx] == los)
+                cfs_waitq_add(&bkt->lsb_marche_funebre, waiter);
+                cfs_set_current_state(CFS_TASK_UNINT);
+                while (1) {
                         /* this wait-queue is signaled at the end of
                          * lu_object_free(). */
-                        cfs_waitq_wait(waiter, CFS_TASK_UNINT);
-                cfs_waitq_del(&site->ls_marche_funebre, waiter);
+                        cfs_set_current_state(CFS_TASK_UNINT);
+                       spin_lock(&r0->lo_sub_lock);
+                       if (r0->lo_sub[idx] == los) {
+                               spin_unlock(&r0->lo_sub_lock);
+                               cfs_waitq_wait(waiter, CFS_TASK_UNINT);
+                       } else {
+                               spin_unlock(&r0->lo_sub_lock);
+                                cfs_set_current_state(CFS_TASK_RUNNING);
+                                break;
+                        }
+                }
+                cfs_waitq_del(&bkt->lsb_marche_funebre, waiter);
         }
         LASSERT(r0->lo_sub[idx] == NULL);
 }
 
-static void lov_delete_raid0(const struct lu_env *env, struct lov_object *lov,
-                             union lov_layout_state *state)
+static int lov_delete_raid0(const struct lu_env *env, struct lov_object *lov,
+                           union lov_layout_state *state)
 {
-        struct lov_layout_raid0 *r0 = &state->raid0;
-        int                      i;
+       struct lov_layout_raid0 *r0 = &state->raid0;
+       struct lov_stripe_md    *lsm = lov->lo_lsm;
+       int i;
+
+       ENTRY;
+
+       dump_lsm(D_INODE, lsm);
+       if (lov->lo_lsm_invalid && cfs_atomic_read(&lsm->lsm_refc) > 1)
+               RETURN(-EBUSY);
 
-        ENTRY;
         if (r0->lo_sub != NULL) {
                 for (i = 0; i < r0->lo_nr; ++i) {
                         struct lovsub_object *los = r0->lo_sub[i];
 
-                        if (los != NULL)
+                        if (los != NULL) {
+                               cl_locks_prune(env, &los->lso_cl, 1);
                                 /*
                                  * If top-level object is to be evicted from
                                  * the cache, so are its sub-objects.
                                  */
                                 lov_subobject_kill(env, lov, los, i);
+                       }
                 }
         }
-        EXIT;
+       RETURN(0);
 }
 
 static void lov_fini_empty(const struct lu_env *env, struct lov_object *lov,
@@ -301,14 +327,18 @@ static void lov_fini_empty(const struct lu_env *env, struct lov_object *lov,
 static void lov_fini_raid0(const struct lu_env *env, struct lov_object *lov,
                            union lov_layout_state *state)
 {
-        struct lov_layout_raid0 *r0 = &state->raid0;
+       struct lov_layout_raid0 *r0 = &state->raid0;
+       ENTRY;
 
-        ENTRY;
-        if (r0->lo_sub != NULL) {
-                OBD_FREE(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]);
-                r0->lo_sub = NULL;
-        }
-        EXIT;
+       if (r0->lo_sub != NULL) {
+               OBD_FREE_LARGE(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]);
+               r0->lo_sub = NULL;
+       }
+
+       dump_lsm(D_INODE, lov->lo_lsm);
+       lov_free_memmd(&lov->lo_lsm);
+
+       EXIT;
 }
 
 static int lov_print_empty(const struct lu_env *env, void *cookie,
@@ -355,14 +385,25 @@ static int lov_attr_get_empty(const struct lu_env *env, struct cl_object *obj,
 static int lov_attr_get_raid0(const struct lu_env *env, struct cl_object *obj,
                               struct cl_attr *attr)
 {
-        struct lov_object       *lov = cl2lov(obj);
-        struct lov_layout_raid0 *r0 = lov_r0(lov);
-        struct lov_stripe_md    *lsm = lov->u.raid0.lo_lsm;
+       struct lov_object       *lov = cl2lov(obj);
+       struct lov_layout_raid0 *r0 = lov_r0(lov);
+       struct lov_stripe_md    *lsm = lov->lo_lsm;
         struct ost_lvb          *lvb = &lov_env_info(env)->lti_lvb;
         __u64                    kms;
         int                      result = 0;
 
         ENTRY;
+
+       /* This is called without holding the type guard mutex, so it must
+        * be inside an ongoing IO, otherwise the lsm may be replaced.
+        * LU-2117: it turns out there is one exception. For mmapped files,
+        * a lock on such a file may be requested in another file's IO
+        * context, and since this function is called in ccc_lock_state(),
+        * it would hit this assertion.
+        * It is still okay to call attr_get without the type guard, as the
+        * layout cannot change while locks exist. */
+       /* LASSERT(cfs_atomic_read(&lsm->lsm_refc) > 1); */
+
         if (!r0->lo_attr_valid) {
                 /*
                  * Fill LVB with attributes already initialized by the upper
@@ -405,7 +446,7 @@ const static struct lov_layout_operations lov_dispatch[] = {
                 .llo_install   = lov_install_empty,
                 .llo_print     = lov_print_empty,
                 .llo_page_init = lov_page_init_empty,
-                .llo_lock_init = NULL,
+                .llo_lock_init = lov_lock_init_empty,
                 .llo_io_init   = lov_io_init_empty,
                 .llo_getattr   = lov_attr_get_empty
         },
@@ -436,18 +477,29 @@ const static struct lov_layout_operations lov_dispatch[] = {
         lov_dispatch[__llt].op(__VA_ARGS__);                            \
 })
 
+static inline void lov_conf_freeze(struct lov_object *lov)
+{
+       if (lov->lo_owner != cfs_current())
+               down_read(&lov->lo_type_guard);
+}
+
+static inline void lov_conf_thaw(struct lov_object *lov)
+{
+       if (lov->lo_owner != cfs_current())
+               up_read(&lov->lo_type_guard);
+}
+
 #define LOV_2DISPATCH_MAYLOCK(obj, op, lock, ...)                       \
 ({                                                                      \
         struct lov_object                      *__obj = (obj);          \
         int                                     __lock = !!(lock);      \
         typeof(lov_dispatch[0].op(__VA_ARGS__)) __result;               \
                                                                         \
-        __lock &= __obj->lo_owner != cfs_current();                     \
         if (__lock)                                                     \
-                down_read(&__obj->lo_type_guard);                       \
+                lov_conf_freeze(__obj);                                        \
         __result = LOV_2DISPATCH_NOLOCK(obj, op, __VA_ARGS__);          \
         if (__lock)                                                     \
-                up_read(&__obj->lo_type_guard);                         \
+                lov_conf_thaw(__obj);                                  \
         __result;                                                       \
 })
 
@@ -462,58 +514,103 @@ do {                                                                    \
         struct lov_object                      *__obj = (obj);          \
         enum lov_layout_type                    __llt;                  \
                                                                         \
-        if (__obj->lo_owner != cfs_current())                           \
-                down_read(&__obj->lo_type_guard);                       \
+       lov_conf_freeze(__obj);                                         \
         __llt = __obj->lo_type;                                         \
         LASSERT(0 <= __llt && __llt < ARRAY_SIZE(lov_dispatch));        \
         lov_dispatch[__llt].op(__VA_ARGS__);                            \
-        if (__obj->lo_owner != cfs_current())                           \
-                up_read(&__obj->lo_type_guard);                         \
+       lov_conf_thaw(__obj);                                           \
 } while (0)
 
-static int lov_layout_change(const struct lu_env *env,
-                             struct lov_object *obj, enum lov_layout_type llt,
-                             const struct cl_object_conf *conf)
+static void lov_conf_lock(struct lov_object *lov)
 {
-        int result;
-        union lov_layout_state       *state = &lov_env_info(env)->lti_state;
-        const struct lov_layout_operations *old_ops;
-        const struct lov_layout_operations *new_ops;
+       LASSERT(lov->lo_owner != cfs_current());
+       down_write(&lov->lo_type_guard);
+       LASSERT(lov->lo_owner == NULL);
+       lov->lo_owner = cfs_current();
+}
 
-        LASSERT(0 <= obj->lo_type && obj->lo_type < ARRAY_SIZE(lov_dispatch));
-        LASSERT(0 <= llt && llt < ARRAY_SIZE(lov_dispatch));
-        ENTRY;
+static void lov_conf_unlock(struct lov_object *lov)
+{
+       lov->lo_owner = NULL;
+       up_write(&lov->lo_type_guard);
+}
 
-        old_ops = &lov_dispatch[obj->lo_type];
-        new_ops = &lov_dispatch[llt];
-
-        result = new_ops->llo_init(env, lu2lov_dev(obj->lo_cl.co_lu.lo_dev),
-                                   obj, conf, state);
-        if (result == 0) {
-                struct cl_object_header *hdr = cl_object_header(&obj->lo_cl);
-                void                    *cookie;
-                struct lu_env           *nested;
-                int                      refcheck;
-
-                cookie = cl_env_reenter();
-                nested = cl_env_get(&refcheck);
-                if (!IS_ERR(nested))
-                        cl_object_prune(nested, &obj->lo_cl);
-                else
-                        result = PTR_ERR(nested);
-                cl_env_put(nested, &refcheck);
-                cl_env_reexit(cookie);
-
-                old_ops->llo_fini(env, obj, &obj->u);
-                LASSERT(list_empty(&hdr->coh_locks));
-                LASSERT(hdr->coh_tree.rnode == NULL);
-                LASSERT(hdr->coh_pages == 0);
-
-                new_ops->llo_install(env, obj, state);
-                obj->lo_type = llt;
-        } else
-                new_ops->llo_fini(env, obj, state);
-        RETURN(result);
+static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov)
+{
+       struct l_wait_info lwi = { 0 };
+       struct lov_stripe_md *lsm = lov->lo_lsm;
+       ENTRY;
+
+       if (!lov->lo_lsm_invalid || lsm == NULL)
+               RETURN(0);
+
+       LASSERT(cfs_atomic_read(&lsm->lsm_refc) > 0);
+       while (cfs_atomic_read(&lsm->lsm_refc) > 1 && lov->lo_lsm_invalid) {
+               lov_conf_unlock(lov);
+
+               CDEBUG(D_INODE, "file:"DFID" wait for active IO, now: %d.\n",
+                       PFID(lu_object_fid(lov2lu(lov))),
+                       cfs_atomic_read(&lsm->lsm_refc));
+
+               l_wait_event(lov->lo_waitq,
+                            cfs_atomic_read(&lsm->lsm_refc) == 1, &lwi);
+               lov_conf_lock(lov);
+       }
+       RETURN(0);
+}
+
+static int lov_layout_change(const struct lu_env *unused,
+                             struct lov_object *lov, enum lov_layout_type llt,
+                             const struct cl_object_conf *conf)
+{
+       int result;
+       union lov_layout_state *state = &lov->u;
+       const struct lov_layout_operations *old_ops;
+       const struct lov_layout_operations *new_ops;
+
+       struct cl_object_header *hdr = cl_object_header(&lov->lo_cl);
+       void *cookie;
+       struct lu_env *env;
+       int refcheck;
+
+       LASSERT(0 <= lov->lo_type && lov->lo_type < ARRAY_SIZE(lov_dispatch));
+       LASSERT(0 <= llt && llt < ARRAY_SIZE(lov_dispatch));
+       ENTRY;
+
+       cookie = cl_env_reenter();
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env)) {
+               cl_env_reexit(cookie);
+               RETURN(PTR_ERR(env));
+       }
+
+       old_ops = &lov_dispatch[lov->lo_type];
+       new_ops = &lov_dispatch[llt];
+
+       result = old_ops->llo_delete(env, lov, &lov->u);
+       if (result == 0) {
+               old_ops->llo_fini(env, lov, &lov->u);
+               LASSERT(cfs_list_empty(&hdr->coh_locks));
+               LASSERT(hdr->coh_tree.rnode == NULL);
+               LASSERT(hdr->coh_pages == 0);
+
+               lov->lo_type = LLT_EMPTY;
+               result = new_ops->llo_init(env,
+                                       lu2lov_dev(lov->lo_cl.co_lu.lo_dev),
+                                       lov, conf, state);
+               if (result == 0) {
+                       new_ops->llo_install(env, lov, state);
+                       lov->lo_type = llt;
+               } else {
+                       new_ops->llo_delete(env, lov, state);
+                       new_ops->llo_fini(env, lov, state);
+                       /* this file becomes an EMPTY file. */
+               }
+       }
+
+       cl_env_put(env, &refcheck);
+       cl_env_reexit(cookie);
+       RETURN(result);
 }
 
 /*****************************************************************************
@@ -528,12 +625,15 @@ int lov_object_init(const struct lu_env *env, struct lu_object *obj,
         struct lov_device            *dev   = lu2lov_dev(obj->lo_dev);
         struct lov_object            *lov   = lu2lov(obj);
         const struct cl_object_conf  *cconf = lu2cl_conf(conf);
-        union  lov_layout_state      *set   = &lov_env_info(env)->lti_state;
+        union  lov_layout_state      *set   = &lov->u;
         const struct lov_layout_operations *ops;
         int result;
 
         ENTRY;
-        init_rwsem(&lov->lo_type_guard);
+       init_rwsem(&lov->lo_type_guard);
+       cfs_waitq_init(&lov->lo_waitq);
+
+       cl_object_page_init(lu2cl(obj), sizeof(struct lov_page));
 
         /* no locking is necessary, as object is being created */
         lov->lo_type = cconf->u.coc_md->lsm != NULL ? LLT_RAID0 : LLT_EMPTY;
@@ -541,32 +641,70 @@ int lov_object_init(const struct lu_env *env, struct lu_object *obj,
         result = ops->llo_init(env, dev, lov, cconf, set);
         if (result == 0)
                 ops->llo_install(env, lov, set);
-        else
-                ops->llo_fini(env, lov, set);
         RETURN(result);
 }
 
 static int lov_conf_set(const struct lu_env *env, struct cl_object *obj,
                         const struct cl_object_conf *conf)
 {
-        struct lov_object *lov = cl2lov(obj);
-        int result;
-
-        ENTRY;
-        /*
-         * Currently only LLT_EMPTY -> LLT_RAID0 transition is supported.
-         */
-        LASSERT(lov->lo_owner != cfs_current());
-        down_write(&lov->lo_type_guard);
-        LASSERT(lov->lo_owner == NULL);
-        lov->lo_owner = cfs_current();
-        if (lov->lo_type == LLT_EMPTY && conf->u.coc_md->lsm != NULL)
-                result = lov_layout_change(env, lov, LLT_RAID0, conf);
-        else
-                result = -EOPNOTSUPP;
-        lov->lo_owner = NULL;
-        up_write(&lov->lo_type_guard);
-        RETURN(result);
+       struct lov_stripe_md *lsm = NULL;
+       struct lov_object *lov = cl2lov(obj);
+       int result = 0;
+       ENTRY;
+
+       lov_conf_lock(lov);
+       if (conf->coc_opc == OBJECT_CONF_INVALIDATE) {
+               lov->lo_lsm_invalid = 1;
+               GOTO(out, result = 0);
+       }
+
+       if (conf->coc_opc == OBJECT_CONF_WAIT) {
+               result = lov_layout_wait(env, lov);
+               GOTO(out, result);
+       }
+
+       LASSERT(conf->coc_opc == OBJECT_CONF_SET);
+
+       if (conf->u.coc_md != NULL)
+               lsm = conf->u.coc_md->lsm;
+       if ((lsm == NULL && lov->lo_lsm == NULL) ||
+           (lsm != NULL && lov->lo_lsm != NULL &&
+            lov->lo_lsm->lsm_layout_gen == lsm->lsm_layout_gen)) {
+               /* same version of layout */
+               lov->lo_lsm_invalid = 0;
+               GOTO(out, result = 0);
+       }
+
+       /* will change layout - check if there still exists active IO. */
+       if (lov->lo_lsm != NULL &&
+           cfs_atomic_read(&lov->lo_lsm->lsm_refc) > 1) {
+               lov->lo_lsm_invalid = 1;
+               GOTO(out, result = -EBUSY);
+       }
+
+       /*
+        * Only LLT_EMPTY <-> LLT_RAID0 transitions are supported.
+        */
+       switch (lov->lo_type) {
+       case LLT_EMPTY:
+               if (lsm != NULL)
+                       result = lov_layout_change(env, lov, LLT_RAID0, conf);
+               break;
+       case LLT_RAID0:
+               if (lsm == NULL)
+                       result = lov_layout_change(env, lov, LLT_EMPTY, conf);
+               else if (lov_stripe_md_cmp(lov->lo_lsm, lsm))
+                       result = -EOPNOTSUPP;
+               break;
+       default:
+               LBUG();
+       }
+       lov->lo_lsm_invalid = result != 0;
+       EXIT;
+
+out:
+       lov_conf_unlock(lov);
+       RETURN(result);
 }
 
 static void lov_object_delete(const struct lu_env *env, struct lu_object *obj)
@@ -595,11 +733,11 @@ static int lov_object_print(const struct lu_env *env, void *cookie,
         return LOV_2DISPATCH(lu2lov(o), llo_print, env, cookie, p, o);
 }
 
-struct cl_page *lov_page_init(const struct lu_env *env, struct cl_object *obj,
-                              struct cl_page *page, cfs_page_t *vmpage)
+int lov_page_init(const struct lu_env *env, struct cl_object *obj,
+               struct cl_page *page, cfs_page_t *vmpage)
 {
-        return LOV_2DISPATCH(cl2lov(obj),
-                             llo_page_init, env, obj, page, vmpage);
+        return LOV_2DISPATCH_NOLOCK(cl2lov(obj),
+                                   llo_page_init, env, obj, page, vmpage);
 }
 
 /**
@@ -607,23 +745,11 @@ struct cl_page *lov_page_init(const struct lu_env *env, struct cl_object *obj,
  * layer. Dispatches to the appropriate layout io initialization method.
  */
 int lov_io_init(const struct lu_env *env, struct cl_object *obj,
-                struct cl_io *io)
+               struct cl_io *io)
 {
-        CL_IO_SLICE_CLEAN(lov_env_io(env), lis_cl);
-        /*
-         * Do not take lock in case of CIT_MISC io, because
-         *
-         *     - if this is an io for a glimpse, then we don't care;
-         *
-         *     - if this not a glimpse (writepage or lock cancellation), then
-         *       layout change cannot happen because a page or a lock
-         *       already exist; and
-         *
-         *     - lock ordering (lock mutex nests within layout rw-semaphore)
-         *       is obeyed in case of lock cancellation.
-         */
-        return LOV_2DISPATCH_MAYLOCK(cl2lov(obj), llo_io_init,
-                                     io->ci_type != CIT_MISC, env, obj, io);
+       CL_IO_SLICE_CLEAN(lov_env_io(env), lis_cl);
+       return LOV_2DISPATCH_MAYLOCK(cl2lov(obj), llo_io_init,
+                                    !io->ci_ignore_layout, env, obj, io);
 }
 
 /**
@@ -649,9 +775,11 @@ static int lov_attr_set(const struct lu_env *env, struct cl_object *obj,
 }
 
 int lov_lock_init(const struct lu_env *env, struct cl_object *obj,
-                  struct cl_lock *lock, const struct cl_io *io)
+                 struct cl_lock *lock, const struct cl_io *io)
 {
-        return LOV_2DISPATCH(cl2lov(obj), llo_lock_init, env, obj, lock, io);
+       /* No need to lock because we hold a reference on the layout. */
+       return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_lock_init, env, obj, lock,
+                                   io);
 }
 
 static const struct cl_object_operations lov_ops = {
@@ -673,14 +801,14 @@ static const struct lu_object_operations lov_lu_obj_ops = {
 };
 
 struct lu_object *lov_object_alloc(const struct lu_env *env,
-                                   const struct lu_object_header *_,
+                                   const struct lu_object_header *unused,
                                    struct lu_device *dev)
 {
         struct lov_object *lov;
         struct lu_object  *obj;
 
         ENTRY;
-        OBD_SLAB_ALLOC_PTR(lov, lov_object_kmem);
+        OBD_SLAB_ALLOC_PTR_GFP(lov, lov_object_kmem, CFS_ALLOC_IO);
         if (lov != NULL) {
                 obj = lov2lu(lov);
                 lu_object_init(obj, NULL, dev);
@@ -697,4 +825,99 @@ struct lu_object *lov_object_alloc(const struct lu_env *env,
         RETURN(obj);
 }
 
+struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov)
+{
+       struct lov_stripe_md *lsm = NULL;
+
+       lov_conf_freeze(lov);
+       if (lov->lo_lsm != NULL) {
+               lsm = lsm_addref(lov->lo_lsm);
+               CDEBUG(D_INODE, "lsm %p addref %d/%d by %p.\n",
+                       lsm, cfs_atomic_read(&lsm->lsm_refc),
+                       lov->lo_lsm_invalid, cfs_current());
+       }
+       lov_conf_thaw(lov);
+       return lsm;
+}
+
+void lov_lsm_decref(struct lov_object *lov, struct lov_stripe_md *lsm)
+{
+       if (lsm == NULL)
+               return;
+
+       CDEBUG(D_INODE, "lsm %p decref %d by %p.\n",
+               lsm, cfs_atomic_read(&lsm->lsm_refc), cfs_current());
+
+       if (lov_free_memmd(&lsm) <= 1 && lov->lo_lsm_invalid)
+               cfs_waitq_signal(&lov->lo_waitq);
+}
+
+struct lov_stripe_md *lov_lsm_get(struct cl_object *clobj)
+{
+       struct lu_object *luobj;
+       struct lov_stripe_md *lsm = NULL;
+
+       if (clobj == NULL)
+               return NULL;
+
+       luobj = lu_object_locate(&cl_object_header(clobj)->coh_lu,
+                                &lov_device_type);
+       if (luobj != NULL)
+               lsm = lov_lsm_addref(lu2lov(luobj));
+       return lsm;
+}
+EXPORT_SYMBOL(lov_lsm_get);
+
+void lov_lsm_put(struct cl_object *clobj, struct lov_stripe_md *lsm)
+{
+       struct lu_object *luobj;
+
+       if (clobj == NULL || lsm == NULL)
+               return;
+
+       luobj = lu_object_locate(&cl_object_header(clobj)->coh_lu,
+                                &lov_device_type);
+       LASSERT(luobj != NULL);
+
+       lov_lsm_decref(lu2lov(luobj), lsm);
+}
+EXPORT_SYMBOL(lov_lsm_put);
+
+int lov_read_and_clear_async_rc(struct cl_object *clob)
+{
+       struct lu_object *luobj;
+       int rc = 0;
+       ENTRY;
+
+       luobj = lu_object_locate(&cl_object_header(clob)->coh_lu,
+                                &lov_device_type);
+       if (luobj != NULL) {
+               struct lov_object *lov = lu2lov(luobj);
+
+               lov_conf_freeze(lov);
+               switch (lov->lo_type) {
+               case LLT_RAID0: {
+                       struct lov_stripe_md *lsm;
+                       int i;
+
+                       lsm = lov->lo_lsm;
+                       LASSERT(lsm != NULL);
+                       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+                               struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+                               if (loi->loi_ar.ar_rc && !rc)
+                                       rc = loi->loi_ar.ar_rc;
+                               loi->loi_ar.ar_rc = 0;
+                       }
+               }
+               case LLT_EMPTY:
+                       break;
+               default:
+                       LBUG();
+               }
+               lov_conf_thaw(lov);
+       }
+       RETURN(rc);
+}
+EXPORT_SYMBOL(lov_read_and_clear_async_rc);
+
 /** @} lov */
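
For context, a minimal sketch of how a caller is expected to use the lsm reference API introduced by this change. lov_lsm_get(), lov_lsm_put(), lov_layout_wait() and lsm_stripe_count are taken from the patch above; the wrapper example_get_stripe_count() and its -ENODATA return value are hypothetical.

/* Illustrative sketch only -- not part of the patch. */
static int example_get_stripe_count(struct cl_object *clobj)
{
        struct lov_stripe_md *lsm;
        int count;

        /* lov_lsm_get() takes a reference on the layout under the
         * lo_type_guard read lock (lov_conf_freeze()), so the layout
         * cannot be swapped while the caller is using it. */
        lsm = lov_lsm_get(clobj);
        if (lsm == NULL)
                return -ENODATA;

        count = lsm->lsm_stripe_count;

        /* Dropping the reference may wake up lov_layout_wait() when a
         * pending layout change (lo_lsm_invalid) is waiting for the
         * last user reference to go away. */
        lov_lsm_put(clobj, lsm);
        return count;
}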