Whamcloud - gitweb
LU-12296 llite: improve ll_dom_lock_cancel
[fs/lustre-release.git] / lustre / lov / lov_object.c
index 3a9eb87..cc041c0 100644 (file)
@@ -23,7 +23,7 @@
  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
  *
- * Copyright (c) 2011, 2016, Intel Corporation.
+ * Copyright (c) 2011, 2017, Intel Corporation.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
@@ -37,6 +37,8 @@
 
 #define DEBUG_SUBSYSTEM S_LOV
 
+#include <linux/random.h>
+
 #include "lov_cl_internal.h"
 
 static inline struct lov_device *lov_object_dev(struct lov_object *obj)
@@ -74,6 +76,8 @@ struct lov_layout_operations {
                             struct cl_object *obj, struct cl_io *io);
         int  (*llo_getattr)(const struct lu_env *env, struct cl_object *obj,
                             struct cl_attr *attr);
+       int  (*llo_flush)(const struct lu_env *env, struct cl_object *obj,
+                         struct ldlm_lock *lock);
 };
 
 static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov);
@@ -89,30 +93,40 @@ static void lov_lsm_put(struct lov_stripe_md *lsm)
  * Lov object layout operations.
  *
  */
-static int lov_init_empty(const struct lu_env *env, struct lov_device *dev,
-                         struct lov_object *lov, struct lov_stripe_md *lsm,
-                         const struct cl_object_conf *conf,
-                         union lov_layout_state *state)
+
+static struct cl_object *lov_sub_find(const struct lu_env *env,
+                                     struct cl_device *dev,
+                                     const struct lu_fid *fid,
+                                     const struct cl_object_conf *conf)
 {
-       return 0;
+       struct lu_object *o;
+
+       ENTRY;
+
+       o = lu_object_find_at(env, cl2lu_dev(dev), fid, &conf->coc_lu);
+       LASSERT(ergo(!IS_ERR(o), o->lo_dev->ld_type == &lovsub_device_type));
+       RETURN(lu2cl(o));
 }
 
-static struct cl_object *lov_sub_find(const struct lu_env *env,
-                                      struct cl_device *dev,
-                                      const struct lu_fid *fid,
-                                      const struct cl_object_conf *conf)
+static int lov_page_slice_fixup(struct lov_object *lov,
+                               struct cl_object *stripe)
 {
-        struct lu_object *o;
+       struct cl_object_header *hdr = cl_object_header(&lov->lo_cl);
+       struct cl_object *o;
 
-        ENTRY;
-        o = lu_object_find_at(env, cl2lu_dev(dev), fid, &conf->coc_lu);
-        LASSERT(ergo(!IS_ERR(o), o->lo_dev->ld_type == &lovsub_device_type));
-        RETURN(lu2cl(o));
+       if (stripe == NULL)
+               return hdr->coh_page_bufsize - lov->lo_cl.co_slice_off -
+                      cfs_size_round(sizeof(struct lov_page));
+
+       cl_object_for_each(o, stripe)
+               o->co_slice_off += hdr->coh_page_bufsize;
+
+       return cl_object_header(stripe)->coh_page_bufsize;
 }
 
 static int lov_init_sub(const struct lu_env *env, struct lov_object *lov,
-                       struct cl_object *subobj, struct lov_layout_raid0 *r0,
-                       struct lov_oinfo *oinfo, int idx)
+                       struct cl_object *subobj, struct lov_oinfo *oinfo,
+                       int idx)
 {
        struct cl_object_header *hdr;
        struct cl_object_header *subhdr;
@@ -132,7 +146,7 @@ static int lov_init_sub(const struct lu_env *env, struct lov_object *lov,
                return -EIO;
        }
 
-       hdr    = cl_object_header(lov2cl(lov));
+       hdr = cl_object_header(lov2cl(lov));
        subhdr = cl_object_header(subobj);
 
        CDEBUG(D_INODE, DFID"@%p[%d:%d] -> "DFID"@%p: ostid: "DOSTID
@@ -145,13 +159,14 @@ static int lov_init_sub(const struct lu_env *env, struct lov_object *lov,
        spin_lock(&subhdr->coh_attr_guard);
        parent = subhdr->coh_parent;
        if (parent == NULL) {
+               struct lovsub_object *lso = cl2lovsub(subobj);
+
                subhdr->coh_parent = hdr;
                spin_unlock(&subhdr->coh_attr_guard);
                subhdr->coh_nesting = hdr->coh_nesting + 1;
                lu_object_ref_add(&subobj->co_lu, "lov-parent", lov);
-               r0->lo_sub[stripe] = cl2lovsub(subobj);
-               r0->lo_sub[stripe]->lso_super = lov;
-               r0->lo_sub[stripe]->lso_index = idx;
+               lso->lso_super = lov;
+               lso->lso_index = idx;
                result = 0;
        } else {
                struct lu_object  *old_obj;
@@ -181,42 +196,27 @@ static int lov_init_sub(const struct lu_env *env, struct lov_object *lov,
        return result;
 }
 
-static int lov_page_slice_fixup(struct lov_object *lov,
-                               struct cl_object *stripe)
-{
-       struct cl_object_header *hdr = cl_object_header(&lov->lo_cl);
-       struct cl_object *o;
-
-       if (stripe == NULL)
-               return hdr->coh_page_bufsize - lov->lo_cl.co_slice_off -
-                      cfs_size_round(sizeof(struct lov_page));
-
-       cl_object_for_each(o, stripe)
-               o->co_slice_off += hdr->coh_page_bufsize;
-
-       return cl_object_header(stripe)->coh_page_bufsize;
-}
-
 static int lov_init_raid0(const struct lu_env *env, struct lov_device *dev,
-                         struct lov_object *lov, int index,
-                         struct lov_layout_raid0 *r0)
+                         struct lov_object *lov, unsigned int index,
+                         const struct cl_object_conf *conf,
+                         struct lov_layout_entry *lle)
 {
-       struct lov_thread_info  *lti     = lov_env_info(env);
-       struct cl_object_conf   *subconf = &lti->lti_stripe_conf;
-       struct lu_fid           *ofid    = &lti->lti_fid;
-       struct cl_object        *stripe;
+       struct lov_layout_raid0 *r0 = &lle->lle_raid0;
+       struct lov_thread_info *lti = lov_env_info(env);
+       struct cl_object_conf *subconf = &lti->lti_stripe_conf;
+       struct lu_fid *ofid = &lti->lti_fid;
+       struct cl_object *stripe;
        struct lov_stripe_md_entry *lse  = lov_lse(lov, index);
        int result;
-       int psz;
+       int psz, sz;
        int i;
 
        ENTRY;
 
        spin_lock_init(&r0->lo_sub_lock);
        r0->lo_nr = lse->lsme_stripe_count;
-       LASSERT(r0->lo_nr <= lov_targets_nr(dev));
 
-       OBD_ALLOC_LARGE(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]);
+       OBD_ALLOC_LARGE(r0->lo_sub, r0->lo_nr * sizeof(r0->lo_sub[0]));
        if (r0->lo_sub == NULL)
                GOTO(out, result = -ENOMEM);
 
@@ -255,7 +255,7 @@ static int lov_init_raid0(const struct lu_env *env, struct lov_device *dev,
                if (IS_ERR(stripe))
                        GOTO(out, result = PTR_ERR(stripe));
 
-               result = lov_init_sub(env, lov, stripe, r0, oinfo,
+               result = lov_init_sub(env, lov, stripe, oinfo,
                                      lov_comp_index(index, i));
                if (result == -EAGAIN) { /* try again */
                        --i;
@@ -264,7 +264,9 @@ static int lov_init_raid0(const struct lu_env *env, struct lov_device *dev,
                }
 
                if (result == 0) {
-                       int sz = lov_page_slice_fixup(lov, stripe);
+                       r0->lo_sub[i] = cl2lovsub(stripe);
+
+                       sz = lov_page_slice_fixup(lov, stripe);
                        LASSERT(ergo(psz > 0, psz == sz));
                        psz = sz;
                }
@@ -275,16 +277,369 @@ out:
        RETURN(result);
 }
 
+static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov,
+                              struct lov_layout_raid0 *r0,
+                              struct lovsub_object *los, int idx)
+{
+       struct cl_object        *sub;
+       struct lu_site          *site;
+       wait_queue_head_t *wq;
+       wait_queue_entry_t *waiter;
+
+        LASSERT(r0->lo_sub[idx] == los);
+
+       sub = lovsub2cl(los);
+       site = sub->co_lu.lo_dev->ld_site;
+       wq = lu_site_wq_from_fid(site, &sub->co_lu.lo_header->loh_fid);
+
+        cl_object_kill(env, sub);
+        /* release a reference to the sub-object and ... */
+        lu_object_ref_del(&sub->co_lu, "lov-parent", lov);
+        cl_object_put(env, sub);
+
+       /* ... wait until it is actually destroyed---sub-object clears its
+        * ->lo_sub[] slot in lovsub_object_free() */
+       if (r0->lo_sub[idx] == los) {
+               waiter = &lov_env_info(env)->lti_waiter;
+               init_waitqueue_entry(waiter, current);
+               add_wait_queue(wq, waiter);
+               set_current_state(TASK_UNINTERRUPTIBLE);
+               while (1) {
+                       /* this wait-queue is signaled at the end of
+                        * lu_object_free(). */
+                       set_current_state(TASK_UNINTERRUPTIBLE);
+                       spin_lock(&r0->lo_sub_lock);
+                       if (r0->lo_sub[idx] == los) {
+                               spin_unlock(&r0->lo_sub_lock);
+                               schedule();
+                       } else {
+                               spin_unlock(&r0->lo_sub_lock);
+                               set_current_state(TASK_RUNNING);
+                               break;
+                       }
+               }
+               remove_wait_queue(wq, waiter);
+       }
+       LASSERT(r0->lo_sub[idx] == NULL);
+}
+
+static void lov_delete_raid0(const struct lu_env *env, struct lov_object *lov,
+                            struct lov_layout_entry *lle)
+{
+       struct lov_layout_raid0 *r0 = &lle->lle_raid0;
+
+       ENTRY;
+
+        if (r0->lo_sub != NULL) {
+               int i;
+
+               for (i = 0; i < r0->lo_nr; ++i) {
+                       struct lovsub_object *los = r0->lo_sub[i];
+
+                       if (los != NULL) {
+                               cl_object_prune(env, &los->lso_cl);
+                               /*
+                                * If top-level object is to be evicted from
+                                * the cache, so are its sub-objects.
+                                */
+                               lov_subobject_kill(env, lov, r0, los, i);
+                       }
+               }
+       }
+
+       EXIT;
+}
+
+static void lov_fini_raid0(const struct lu_env *env,
+                          struct lov_layout_entry *lle)
+{
+       struct lov_layout_raid0 *r0 = &lle->lle_raid0;
+
+       if (r0->lo_sub != NULL) {
+               OBD_FREE_LARGE(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]);
+               r0->lo_sub = NULL;
+       }
+}
+
+static int lov_print_raid0(const struct lu_env *env, void *cookie,
+                          lu_printer_t p, const struct lov_layout_entry *lle)
+{
+       const struct lov_layout_raid0 *r0 = &lle->lle_raid0;
+       int i;
+
+       for (i = 0; i < r0->lo_nr; ++i) {
+               struct lu_object *sub;
+
+               if (r0->lo_sub[i] != NULL) {
+                       sub = lovsub2lu(r0->lo_sub[i]);
+                       lu_object_print(env, cookie, p, sub);
+               } else {
+                       (*p)(env, cookie, "sub %d absent\n", i);
+               }
+       }
+       return 0;
+}
+
+static int lov_attr_get_raid0(const struct lu_env *env, struct lov_object *lov,
+                             unsigned int index, struct lov_layout_entry *lle,
+                             struct cl_attr **lov_attr)
+{
+       struct lov_layout_raid0 *r0 = &lle->lle_raid0;
+       struct lov_stripe_md *lsm = lov->lo_lsm;
+       struct ost_lvb *lvb = &lov_env_info(env)->lti_lvb;
+       struct cl_attr *attr = &r0->lo_attr;
+       __u64 kms = 0;
+       int result = 0;
+
+       if (r0->lo_attr_valid) {
+               *lov_attr = attr;
+               return 0;
+       }
+
+       memset(lvb, 0, sizeof(*lvb));
+
+       /* XXX: timestamps can be negative by sanity:test_39m,
+        * how can it be? */
+       lvb->lvb_atime = LLONG_MIN;
+       lvb->lvb_ctime = LLONG_MIN;
+       lvb->lvb_mtime = LLONG_MIN;
+
+       /*
+        * XXX that should be replaced with a loop over sub-objects,
+        * doing cl_object_attr_get() on them. But for now, let's
+        * reuse old lov code.
+        */
+
+       /*
+        * XXX take lsm spin-lock to keep lov_merge_lvb_kms()
+        * happy. It's not needed, because new code uses
+        * ->coh_attr_guard spin-lock to protect consistency of
+        * sub-object attributes.
+        */
+       lov_stripe_lock(lsm);
+       result = lov_merge_lvb_kms(lsm, index, lvb, &kms);
+       lov_stripe_unlock(lsm);
+       if (result == 0) {
+               cl_lvb2attr(attr, lvb);
+               attr->cat_kms = kms;
+               r0->lo_attr_valid = 1;
+               *lov_attr = attr;
+       }
+
+       return result;
+}
+
+static struct lov_comp_layout_entry_ops raid0_ops = {
+       .lco_init      = lov_init_raid0,
+       .lco_fini      = lov_fini_raid0,
+       .lco_getattr   = lov_attr_get_raid0,
+};
+
+static int lov_attr_get_dom(const struct lu_env *env, struct lov_object *lov,
+                           unsigned int index, struct lov_layout_entry *lle,
+                           struct cl_attr **lov_attr)
+{
+       struct lov_layout_dom *dom = &lle->lle_dom;
+       struct lov_oinfo *loi = dom->lo_loi;
+       struct cl_attr *attr = &dom->lo_dom_r0.lo_attr;
+
+       if (dom->lo_dom_r0.lo_attr_valid) {
+               *lov_attr = attr;
+               return 0;
+       }
+
+       if (OST_LVB_IS_ERR(loi->loi_lvb.lvb_blocks))
+               return OST_LVB_GET_ERR(loi->loi_lvb.lvb_blocks);
+
+       cl_lvb2attr(attr, &loi->loi_lvb);
+
+       /* DoM component size can be bigger than stripe size after
+        * client's setattr RPC, so do not count anything beyond
+        * component end. Alternatively, check that limit on server
+        * and do not allow size overflow there. */
+       if (attr->cat_size > lle->lle_extent->e_end)
+               attr->cat_size = lle->lle_extent->e_end;
+
+       attr->cat_kms = attr->cat_size;
+
+       dom->lo_dom_r0.lo_attr_valid = 1;
+       *lov_attr = attr;
+
+       return 0;
+}
+
+/**
+ * Lookup FLD to get MDS index of the given DOM object FID.
+ *
+ * \param[in]  ld      LOV device
+ * \param[in]  fid     FID to lookup
+ * \param[out] nr      index in MDC array to return back
+ *
+ * \retval             0 and \a mds filled with MDS index if successful
+ * \retval             negative value on error
+ */
+static int lov_fld_lookup(struct lov_device *ld, const struct lu_fid *fid,
+                         __u32 *nr)
+{
+       __u32 mds_idx;
+       int i, rc;
+
+       ENTRY;
+
+       rc = fld_client_lookup(&ld->ld_lmv->u.lmv.lmv_fld, fid_seq(fid),
+                              &mds_idx, LU_SEQ_RANGE_MDT, NULL);
+       if (rc) {
+               CERROR("%s: error while looking for mds number. Seq %#llx"
+                      ", err = %d\n", lu_dev_name(cl2lu_dev(&ld->ld_cl)),
+                      fid_seq(fid), rc);
+               RETURN(rc);
+       }
+
+       CDEBUG(D_INODE, "FLD lookup got mds #%x for fid="DFID"\n",
+              mds_idx, PFID(fid));
+
+       /* find proper MDC device in the array */
+       for (i = 0; i < ld->ld_md_tgts_nr; i++) {
+               if (ld->ld_md_tgts[i].ldm_mdc != NULL &&
+                   ld->ld_md_tgts[i].ldm_idx == mds_idx)
+                       break;
+       }
+
+       if (i == ld->ld_md_tgts_nr) {
+               CERROR("%s: cannot find corresponding MDC device for mds #%x "
+                      "for fid="DFID"\n", lu_dev_name(cl2lu_dev(&ld->ld_cl)),
+                      mds_idx, PFID(fid));
+               rc = -EINVAL;
+       } else {
+               *nr = i;
+       }
+       RETURN(rc);
+}
+
+/**
+ * Implementation of lov_comp_layout_entry_ops::lco_init for DOM object.
+ *
+ * Init the DOM object for the first time. It prepares also RAID0 entry
+ * for it to use in common methods with ordinary RAID0 layout entries.
+ *
+ * \param[in] env      execution environment
+ * \param[in] dev      LOV device
+ * \param[in] lov      LOV object
+ * \param[in] index    Composite layout entry index in LSM
+ * \param[in] lle      Composite LOV layout entry
+ */
+static int lov_init_dom(const struct lu_env *env, struct lov_device *dev,
+                       struct lov_object *lov, unsigned int index,
+                       const struct cl_object_conf *conf,
+                       struct lov_layout_entry *lle)
+{
+       struct lov_thread_info *lti = lov_env_info(env);
+       struct lov_stripe_md_entry *lsme = lov_lse(lov, index);
+       struct cl_object *clo;
+       struct lu_object *o = lov2lu(lov);
+       const struct lu_fid *fid = lu_object_fid(o);
+       struct cl_device *mdcdev;
+       struct lov_oinfo *loi = NULL;
+       struct cl_object_conf *sconf = &lti->lti_stripe_conf;
+
+       int rc;
+       __u32 idx = 0;
+
+       ENTRY;
+
+       LASSERT(index == 0);
+
+       /* find proper MDS device */
+       rc = lov_fld_lookup(dev, fid, &idx);
+       if (rc)
+               RETURN(rc);
+
+       LASSERTF(dev->ld_md_tgts[idx].ldm_mdc != NULL,
+                "LOV md target[%u] is NULL\n", idx);
+
+       /* check lsm is DOM, more checks are needed */
+       LASSERT(lsme->lsme_stripe_count == 0);
+
+       /*
+        * Create lower cl_objects.
+        */
+       mdcdev = dev->ld_md_tgts[idx].ldm_mdc;
+
+       LASSERTF(mdcdev != NULL, "non-initialized mdc subdev\n");
+
+       /* DoM object has no oinfo in LSM entry, create it exclusively */
+       OBD_SLAB_ALLOC_PTR_GFP(loi, lov_oinfo_slab, GFP_NOFS);
+       if (loi == NULL)
+               RETURN(-ENOMEM);
+
+       fid_to_ostid(lu_object_fid(lov2lu(lov)), &loi->loi_oi);
+
+       sconf->u.coc_oinfo = loi;
+again:
+       clo = lov_sub_find(env, mdcdev, fid, sconf);
+       if (IS_ERR(clo))
+               GOTO(out, rc = PTR_ERR(clo));
+
+       rc = lov_init_sub(env, lov, clo, loi, lov_comp_index(index, 0));
+       if (rc == -EAGAIN) /* try again */
+               goto again;
+       else if (rc != 0)
+               GOTO(out, rc);
+
+       lle->lle_dom.lo_dom = cl2lovsub(clo);
+       spin_lock_init(&lle->lle_dom.lo_dom_r0.lo_sub_lock);
+       lle->lle_dom.lo_dom_r0.lo_nr = 1;
+       lle->lle_dom.lo_dom_r0.lo_sub = &lle->lle_dom.lo_dom;
+       lle->lle_dom.lo_loi = loi;
+
+       rc = lov_page_slice_fixup(lov, clo);
+       RETURN(rc);
+
+out:
+       if (loi != NULL)
+               OBD_SLAB_FREE_PTR(loi, lov_oinfo_slab);
+       return rc;
+}
+
+/**
+ * Implementation of lov_layout_operations::llo_fini for DOM object.
+ *
+ * Finish the DOM object and free related memory.
+ *
+ * \param[in] env      execution environment
+ * \param[in] lov      LOV object
+ * \param[in] state    LOV layout state
+ */
+static void lov_fini_dom(const struct lu_env *env,
+                        struct lov_layout_entry *lle)
+{
+       if (lle->lle_dom.lo_dom != NULL)
+               lle->lle_dom.lo_dom = NULL;
+       if (lle->lle_dom.lo_loi != NULL)
+               OBD_SLAB_FREE_PTR(lle->lle_dom.lo_loi, lov_oinfo_slab);
+}
+
+static struct lov_comp_layout_entry_ops dom_ops = {
+       .lco_init = lov_init_dom,
+       .lco_fini = lov_fini_dom,
+       .lco_getattr = lov_attr_get_dom,
+};
+
 static int lov_init_composite(const struct lu_env *env, struct lov_device *dev,
                              struct lov_object *lov, struct lov_stripe_md *lsm,
                              const struct cl_object_conf *conf,
                              union lov_layout_state *state)
 {
        struct lov_layout_composite *comp = &state->composite;
+       struct lov_layout_entry *lle;
+       struct lov_mirror_entry *lre;
        unsigned int entry_count;
        unsigned int psz = 0;
+       unsigned int mirror_count;
+       int flr_state = lsm->lsm_flags & LCM_FL_FLR_MASK;
        int result = 0;
-       int i;
+       unsigned int seq;
+       int i, j;
 
        ENTRY;
 
@@ -293,167 +648,193 @@ static int lov_init_composite(const struct lu_env *env, struct lov_device *dev,
        lov->lo_lsm = lsm_addref(lsm);
        lov->lo_layout_invalid = true;
 
+       dump_lsm(D_INODE, lsm);
+
        entry_count = lsm->lsm_entry_count;
-       comp->lo_entry_count = entry_count;
+
+       spin_lock_init(&comp->lo_write_lock);
+       comp->lo_flags = lsm->lsm_flags;
+       comp->lo_mirror_count = lsm->lsm_mirror_count + 1;
+       comp->lo_entry_count = lsm->lsm_entry_count;
+       comp->lo_preferred_mirror = -1;
+
+       if (equi(flr_state == LCM_FL_NONE, comp->lo_mirror_count > 1))
+               RETURN(-EINVAL);
+
+       OBD_ALLOC(comp->lo_mirrors,
+                 comp->lo_mirror_count * sizeof(*comp->lo_mirrors));
+       if (comp->lo_mirrors == NULL)
+               RETURN(-ENOMEM);
 
        OBD_ALLOC(comp->lo_entries, entry_count * sizeof(*comp->lo_entries));
        if (comp->lo_entries == NULL)
                RETURN(-ENOMEM);
 
-       for (i = 0; i < entry_count; i++) {
-               struct lov_layout_entry *le = &comp->lo_entries[i];
+       /* Initiate all entry types and extents data at first */
+       for (i = 0, j = 0, mirror_count = 1; i < entry_count; i++) {
+               int mirror_id = 0;
+
+               lle = &comp->lo_entries[i];
+
+               lle->lle_lsme = lsm->lsm_entries[i];
+               lle->lle_type = lov_entry_type(lle->lle_lsme);
+               switch (lle->lle_type) {
+               case LOV_PATTERN_RAID0:
+                       lle->lle_comp_ops = &raid0_ops;
+                       break;
+               case LOV_PATTERN_MDT:
+                       lle->lle_comp_ops = &dom_ops;
+                       break;
+               default:
+                       CERROR("%s: unknown composite layout entry type %i\n",
+                              lov2obd(dev->ld_lov)->obd_name,
+                              lsm->lsm_entries[i]->lsme_pattern);
+                       dump_lsm(D_ERROR, lsm);
+                       RETURN(-EIO);
+               }
+
+               lle->lle_extent = &lle->lle_lsme->lsme_extent;
+               lle->lle_valid = !(lle->lle_lsme->lsme_flags & LCME_FL_STALE);
+
+               if (flr_state != LCM_FL_NONE)
+                       mirror_id = mirror_id_of(lle->lle_lsme->lsme_id);
+
+               lre = &comp->lo_mirrors[j];
+               if (i > 0) {
+                       if (mirror_id == lre->lre_mirror_id) {
+                               lre->lre_valid |= lle->lle_valid;
+                               lre->lre_stale |= !lle->lle_valid;
+                               lre->lre_end = i;
+                               continue;
+                       }
+
+                       /* new mirror detected, assume that the mirrors
+                        * are shorted in layout */
+                       ++mirror_count;
+                       ++j;
+                       if (j >= comp->lo_mirror_count)
+                               break;
+
+                       lre = &comp->lo_mirrors[j];
+               }
+
+               /* entries must be sorted by mirrors */
+               lre->lre_mirror_id = mirror_id;
+               lre->lre_start = lre->lre_end = i;
+               lre->lre_preferred = !!(lle->lle_lsme->lsme_flags &
+                                       LCME_FL_PREF_RD);
+               lre->lre_valid = lle->lle_valid;
+               lre->lre_stale = !lle->lle_valid;
+       }
+
+       /* sanity check for FLR */
+       if (mirror_count != comp->lo_mirror_count) {
+               CDEBUG(D_INODE, DFID
+                      " doesn't have the # of mirrors it claims, %u/%u\n",
+                      PFID(lu_object_fid(lov2lu(lov))), mirror_count,
+                      comp->lo_mirror_count + 1);
+
+               GOTO(out, result = -EINVAL);
+       }
+
+       lov_foreach_layout_entry(lov, lle) {
+               int index = lov_layout_entry_index(lov, lle);
 
-               le->lle_extent = lsm->lsm_entries[i]->lsme_extent;
                /**
                 * If the component has not been init-ed on MDS side, for
                 * PFL layout, we'd know that the components beyond this one
                 * will be dynamically init-ed later on file write/trunc ops.
                 */
-               if (!lsm_entry_inited(lsm, i))
-                       break;
+               if (!lsme_inited(lle->lle_lsme))
+                       continue;
 
-               result = lov_init_raid0(env, dev, lov, i, &le->lle_raid0);
+               result = lle->lle_comp_ops->lco_init(env, dev, lov, index,
+                                                    conf, lle);
                if (result < 0)
                        break;
 
                LASSERT(ergo(psz > 0, psz == result));
                psz = result;
        }
+
        if (psz > 0)
                cl_object_header(&lov->lo_cl)->coh_page_bufsize += psz;
 
-       return result > 0 ? 0 : result;
-}
+       /* decide the preferred mirror. It uses the hash value of lov_object
+        * so that different clients would use different mirrors for read. */
+       mirror_count = 0;
+       seq = hash_long((unsigned long)lov, 8);
+       for (i = 0; i < comp->lo_mirror_count; i++) {
+               unsigned int idx = (i + seq) % comp->lo_mirror_count;
 
-static int lov_init_released(const struct lu_env *env,
-                            struct lov_device *dev, struct lov_object *lov,
-                            struct lov_stripe_md *lsm,
-                            const struct cl_object_conf *conf,
-                            union lov_layout_state *state)
-{
-       LASSERT(lsm != NULL);
-       LASSERT(lsm->lsm_is_released);
-       LASSERT(lov->lo_lsm == NULL);
+               lre = lov_mirror_entry(lov, idx);
+               if (lre->lre_stale)
+                       continue;
 
-       lov->lo_lsm = lsm_addref(lsm);
-       return 0;
-}
+               mirror_count++; /* valid mirror */
 
-static struct cl_object *lov_find_subobj(const struct lu_env *env,
-                                        struct lov_object *lov,
-                                        struct lov_stripe_md *lsm,
-                                        int index)
-{
-       struct lov_device       *dev = lu2lov_dev(lov2lu(lov)->lo_dev);
-       struct lov_thread_info  *lti = lov_env_info(env);
-       struct lu_fid           *ofid = &lti->lti_fid;
-       struct lov_oinfo        *oinfo;
-       struct cl_device        *subdev;
-       int                     entry = lov_comp_entry(index);
-       int                     stripe = lov_comp_stripe(index);
-       int                     ost_idx;
-       int                     rc;
-       struct cl_object        *result;
-
-       if (lov->lo_type != LLT_COMP)
-               GOTO(out, result = NULL);
+               if (lre->lre_preferred || comp->lo_preferred_mirror < 0)
+                       comp->lo_preferred_mirror = idx;
+       }
+       if (!mirror_count) {
+               CDEBUG(D_INODE, DFID
+                      " doesn't have any valid mirrors\n",
+                      PFID(lu_object_fid(lov2lu(lov))));
 
-       if (entry >= lsm->lsm_entry_count ||
-           stripe >= lsm->lsm_entries[entry]->lsme_stripe_count)
-               GOTO(out, result = NULL);
+               comp->lo_preferred_mirror = 0;
+       }
 
-       oinfo = lsm->lsm_entries[entry]->lsme_oinfo[stripe];
-       ost_idx = oinfo->loi_ost_idx;
-       rc = ostid_to_fid(ofid, &oinfo->loi_oi, ost_idx);
-       if (rc != 0)
-               GOTO(out, result = NULL);
+       LASSERT(comp->lo_preferred_mirror >= 0);
 
-       subdev = lovsub2cl_dev(dev->ld_target[ost_idx]);
-       result = lov_sub_find(env, subdev, ofid, NULL);
+       EXIT;
 out:
-       if (result == NULL)
-               result = ERR_PTR(-EINVAL);
-       return result;
+       return result > 0 ? 0 : result;
 }
 
-static int lov_delete_empty(const struct lu_env *env, struct lov_object *lov,
-                           union lov_layout_state *state)
+static int lov_init_empty(const struct lu_env *env, struct lov_device *dev,
+                         struct lov_object *lov, struct lov_stripe_md *lsm,
+                         const struct cl_object_conf *conf,
+                         union lov_layout_state *state)
 {
-       LASSERT(lov->lo_type == LLT_EMPTY || lov->lo_type == LLT_RELEASED);
-
-       lov_layout_wait(env, lov);
        return 0;
 }
 
-static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov,
-                              struct lov_layout_raid0 *r0,
-                              struct lovsub_object *los, int idx)
+static int lov_init_released(const struct lu_env *env,
+                            struct lov_device *dev, struct lov_object *lov,
+                            struct lov_stripe_md *lsm,
+                            const struct cl_object_conf *conf,
+                            union lov_layout_state *state)
 {
-       struct cl_object        *sub;
-       struct lu_site          *site;
-       struct lu_site_bkt_data *bkt;
-       wait_queue_t          *waiter;
-
-        LASSERT(r0->lo_sub[idx] == los);
-
-        sub  = lovsub2cl(los);
-        site = sub->co_lu.lo_dev->ld_site;
-        bkt  = lu_site_bkt_from_fid(site, &sub->co_lu.lo_header->loh_fid);
-
-        cl_object_kill(env, sub);
-        /* release a reference to the sub-object and ... */
-        lu_object_ref_del(&sub->co_lu, "lov-parent", lov);
-        cl_object_put(env, sub);
+       LASSERT(lsm != NULL);
+       LASSERT(lsm->lsm_is_released);
+       LASSERT(lov->lo_lsm == NULL);
 
-        /* ... wait until it is actually destroyed---sub-object clears its
-         * ->lo_sub[] slot in lovsub_object_fini() */
-       if (r0->lo_sub[idx] == los) {
-               waiter = &lov_env_info(env)->lti_waiter;
-               init_waitqueue_entry(waiter, current);
-               add_wait_queue(&bkt->lsb_marche_funebre, waiter);
-               set_current_state(TASK_UNINTERRUPTIBLE);
-               while (1) {
-                       /* this wait-queue is signaled at the end of
-                        * lu_object_free(). */
-                       set_current_state(TASK_UNINTERRUPTIBLE);
-                       spin_lock(&r0->lo_sub_lock);
-                       if (r0->lo_sub[idx] == los) {
-                               spin_unlock(&r0->lo_sub_lock);
-                               schedule();
-                       } else {
-                               spin_unlock(&r0->lo_sub_lock);
-                               set_current_state(TASK_RUNNING);
-                               break;
-                       }
-               }
-               remove_wait_queue(&bkt->lsb_marche_funebre, waiter);
-       }
-       LASSERT(r0->lo_sub[idx] == NULL);
+       lov->lo_lsm = lsm_addref(lsm);
+       return 0;
 }
 
-static void lov_delete_raid0(const struct lu_env *env, struct lov_object *lov,
-                            struct lov_layout_raid0 *r0)
+static int lov_init_foreign(const struct lu_env *env,
+                           struct lov_device *dev, struct lov_object *lov,
+                           struct lov_stripe_md *lsm,
+                           const struct cl_object_conf *conf,
+                           union lov_layout_state *state)
 {
-       ENTRY;
-
-        if (r0->lo_sub != NULL) {
-               int i;
+       LASSERT(lsm != NULL);
+       LASSERT(lov->lo_type == LLT_FOREIGN);
+       LASSERT(lov->lo_lsm == NULL);
 
-               for (i = 0; i < r0->lo_nr; ++i) {
-                       struct lovsub_object *los = r0->lo_sub[i];
+       lov->lo_lsm = lsm_addref(lsm);
+       return 0;
+}
 
-                       if (los != NULL) {
-                               cl_object_prune(env, &los->lso_cl);
-                               /*
-                                * If top-level object is to be evicted from
-                                * the cache, so are its sub-objects.
-                                */
-                               lov_subobject_kill(env, lov, r0, los, i);
-                       }
-               }
-       }
+static int lov_delete_empty(const struct lu_env *env, struct lov_object *lov,
+                           union lov_layout_state *state)
+{
+       LASSERT(lov->lo_type == LLT_EMPTY || lov->lo_type == LLT_RELEASED ||
+               lov->lo_type == LLT_FOREIGN);
 
-       EXIT;
+       lov_layout_wait(env, lov);
+       return 0;
 }
 
 static int lov_delete_composite(const struct lu_env *env,
@@ -470,7 +851,7 @@ static int lov_delete_composite(const struct lu_env *env,
        lov_layout_wait(env, lov);
        if (comp->lo_entries)
                lov_foreach_layout_entry(lov, entry)
-                       lov_delete_raid0(env, lov, &entry->lle_raid0);
+                       lov_delete_raid0(env, lov, entry);
 
        RETURN(0);
 }
@@ -481,15 +862,6 @@ static void lov_fini_empty(const struct lu_env *env, struct lov_object *lov,
        LASSERT(lov->lo_type == LLT_EMPTY || lov->lo_type == LLT_RELEASED);
 }
 
-static void lov_fini_raid0(const struct lu_env *env,
-                          struct lov_layout_raid0 *r0)
-{
-       if (r0->lo_sub != NULL) {
-               OBD_FREE_LARGE(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]);
-               r0->lo_sub = NULL;
-       }
-}
-
 static void lov_fini_composite(const struct lu_env *env,
                               struct lov_object *lov,
                               union lov_layout_state *state)
@@ -501,13 +873,21 @@ static void lov_fini_composite(const struct lu_env *env,
                struct lov_layout_entry *entry;
 
                lov_foreach_layout_entry(lov, entry)
-                       lov_fini_raid0(env, &entry->lle_raid0);
+                       entry->lle_comp_ops->lco_fini(env, entry);
 
                OBD_FREE(comp->lo_entries,
                         comp->lo_entry_count * sizeof(*comp->lo_entries));
                comp->lo_entries = NULL;
        }
 
+       if (comp->lo_mirrors != NULL) {
+               OBD_FREE(comp->lo_mirrors,
+                        comp->lo_mirror_count * sizeof(*comp->lo_mirrors));
+               comp->lo_mirrors = NULL;
+       }
+
+       memset(comp, 0, sizeof(*comp));
+
        dump_lsm(D_INODE, lov->lo_lsm);
        lov_free_memmd(&lov->lo_lsm);
 
@@ -530,24 +910,6 @@ static int lov_print_empty(const struct lu_env *env, void *cookie,
         return 0;
 }
 
-static int lov_print_raid0(const struct lu_env *env, void *cookie,
-                          lu_printer_t p, struct lov_layout_raid0 *r0)
-{
-       int i;
-
-       for (i = 0; i < r0->lo_nr; ++i) {
-               struct lu_object *sub;
-
-               if (r0->lo_sub[i] != NULL) {
-                       sub = lovsub2lu(r0->lo_sub[i]);
-                       lu_object_print(env, cookie, p, sub);
-               } else {
-                       (*p)(env, cookie, "sub %d absent\n", i);
-               }
-       }
-       return 0;
-}
-
 static int lov_print_composite(const struct lu_env *env, void *cookie,
                               lu_printer_t p, const struct lu_object *o)
 {
@@ -563,12 +925,15 @@ static int lov_print_composite(const struct lu_env *env, void *cookie,
 
        for (i = 0; i < lsm->lsm_entry_count; i++) {
                struct lov_stripe_md_entry *lse = lsm->lsm_entries[i];
+               struct lov_layout_entry *lle = lov_entry(lov, i);
 
-               (*p)(env, cookie, DEXT ": { 0x%08X, %u, %u, %#x, %u, %u }\n",
+               (*p)(env, cookie,
+                    DEXT ": { 0x%08X, %u, %#x, %u, %#x, %u, %u }\n",
                     PEXT(&lse->lsme_extent), lse->lsme_magic,
-                    lse->lsme_id, lse->lsme_layout_gen, lse->lsme_flags,
-                    lse->lsme_stripe_count, lse->lsme_stripe_size);
-               lov_print_raid0(env, cookie, p, lov_r0(lov, i));
+                    lse->lsme_id, lse->lsme_pattern, lse->lsme_layout_gen,
+                    lse->lsme_flags, lse->lsme_stripe_count,
+                    lse->lsme_stripe_size);
+               lov_print_raid0(env, cookie, p, lle);
        }
 
        return 0;
@@ -588,6 +953,23 @@ static int lov_print_released(const struct lu_env *env, void *cookie,
        return 0;
 }
 
+static int lov_print_foreign(const struct lu_env *env, void *cookie,
+                               lu_printer_t p, const struct lu_object *o)
+{
+       struct lov_object       *lov = lu2lov(o);
+       struct lov_stripe_md    *lsm = lov->lo_lsm;
+
+       (*p)(env, cookie,
+               "foreign: %s, lsm{%p 0x%08X %d %u}:\n",
+               lov->lo_layout_invalid ? "invalid" : "valid", lsm,
+               lsm->lsm_magic, atomic_read(&lsm->lsm_refc),
+               lsm->lsm_layout_gen);
+       (*p)(env, cookie,
+               "raw_ea_content '%.*s'\n",
+               (int)lsm->lsm_foreign_size, (char *)lsm_foreign(lsm));
+       return 0;
+}
+
 /**
  * Implements cl_object_operations::coo_attr_get() method for an object
  * without stripes (LLT_EMPTY layout type).
@@ -602,51 +984,6 @@ static int lov_attr_get_empty(const struct lu_env *env, struct cl_object *obj,
         return 0;
 }
 
-static int lov_attr_get_raid0(const struct lu_env *env, struct lov_object *lov,
-                             unsigned int index, struct lov_layout_raid0 *r0)
-
-{
-       struct lov_stripe_md *lsm = lov->lo_lsm;
-       struct ost_lvb *lvb = &lov_env_info(env)->lti_lvb;
-       struct cl_attr *attr = &r0->lo_attr;
-       __u64 kms = 0;
-       int result = 0;
-
-       if (r0->lo_attr_valid)
-               return 0;
-
-       memset(lvb, 0, sizeof(*lvb));
-
-       /* XXX: timestamps can be negative by sanity:test_39m,
-        * how can it be? */
-       lvb->lvb_atime = LLONG_MIN;
-       lvb->lvb_ctime = LLONG_MIN;
-       lvb->lvb_mtime = LLONG_MIN;
-
-       /*
-        * XXX that should be replaced with a loop over sub-objects,
-        * doing cl_object_attr_get() on them. But for now, let's
-        * reuse old lov code.
-        */
-
-       /*
-        * XXX take lsm spin-lock to keep lov_merge_lvb_kms()
-        * happy. It's not needed, because new code uses
-        * ->coh_attr_guard spin-lock to protect consistency of
-        * sub-object attributes.
-        */
-       lov_stripe_lock(lsm);
-       result = lov_merge_lvb_kms(lsm, index, lvb, &kms);
-       lov_stripe_unlock(lsm);
-       if (result == 0) {
-               cl_lvb2attr(attr, lvb);
-               attr->cat_kms = kms;
-               r0->lo_attr_valid = 1;
-       }
-
-       return result;
-}
-
 static int lov_attr_get_composite(const struct lu_env *env,
                                  struct cl_object *obj,
                                  struct cl_attr *attr)
@@ -654,25 +991,34 @@ static int lov_attr_get_composite(const struct lu_env *env,
        struct lov_object       *lov = cl2lov(obj);
        struct lov_layout_entry *entry;
        int                      result = 0;
-       int                      index = 0;
 
        ENTRY;
 
        attr->cat_size = 0;
        attr->cat_blocks = 0;
        lov_foreach_layout_entry(lov, entry) {
-               struct lov_layout_raid0 *r0 = &entry->lle_raid0;
-               struct cl_attr *lov_attr = &r0->lo_attr;
+               struct cl_attr *lov_attr = NULL;
+               int index = lov_layout_entry_index(lov, entry);
+
+               if (!entry->lle_valid)
+                       continue;
 
                /* PFL: This component has not been init-ed. */
                if (!lsm_entry_inited(lov->lo_lsm, index))
-                       break;
+                       continue;
 
-               result = lov_attr_get_raid0(env, lov, index, r0);
-               if (result != 0)
-                       break;
+               result = entry->lle_comp_ops->lco_getattr(env, lov, index,
+                                                         entry, &lov_attr);
+               if (result < 0)
+                       RETURN(result);
 
-               index++;
+               if (lov_attr == NULL)
+                       continue;
+
+               CDEBUG(D_INODE, "COMP ID #%i: s=%llu m=%llu a=%llu c=%llu "
+                      "b=%llu\n", index - 1, lov_attr->cat_size,
+                      lov_attr->cat_mtime, lov_attr->cat_atime,
+                      lov_attr->cat_ctime, lov_attr->cat_blocks);
 
                /* merge results */
                attr->cat_blocks += lov_attr->cat_blocks;
@@ -687,28 +1033,45 @@ static int lov_attr_get_composite(const struct lu_env *env,
                if (attr->cat_mtime < lov_attr->cat_mtime)
                        attr->cat_mtime = lov_attr->cat_mtime;
        }
-       RETURN(result);
+
+       RETURN(0);
+}
+
+static int lov_flush_composite(const struct lu_env *env,
+                              struct cl_object *obj,
+                              struct ldlm_lock *lock)
+{
+       struct lov_object *lov = cl2lov(obj);
+       struct lovsub_object *lovsub;
+
+       ENTRY;
+
+       if (!lsme_is_dom(lov->lo_lsm->lsm_entries[0]))
+               RETURN(-EINVAL);
+
+       lovsub = lov->u.composite.lo_entries[0].lle_dom.lo_dom;
+       RETURN(cl_object_flush(env, lovsub2cl(lovsub), lock));
 }
 
 const static struct lov_layout_operations lov_dispatch[] = {
-        [LLT_EMPTY] = {
-                .llo_init      = lov_init_empty,
-                .llo_delete    = lov_delete_empty,
-                .llo_fini      = lov_fini_empty,
-                .llo_print     = lov_print_empty,
-                .llo_page_init = lov_page_init_empty,
-                .llo_lock_init = lov_lock_init_empty,
-                .llo_io_init   = lov_io_init_empty,
+       [LLT_EMPTY] = {
+               .llo_init      = lov_init_empty,
+               .llo_delete    = lov_delete_empty,
+               .llo_fini      = lov_fini_empty,
+               .llo_print     = lov_print_empty,
+               .llo_page_init = lov_page_init_empty,
+               .llo_lock_init = lov_lock_init_empty,
+               .llo_io_init   = lov_io_init_empty,
                .llo_getattr   = lov_attr_get_empty,
-        },
-        [LLT_RELEASED] = {
-                .llo_init      = lov_init_released,
-                .llo_delete    = lov_delete_empty,
-                .llo_fini      = lov_fini_released,
-                .llo_print     = lov_print_released,
-                .llo_page_init = lov_page_init_empty,
-                .llo_lock_init = lov_lock_init_empty,
-                .llo_io_init   = lov_io_init_released,
+       },
+       [LLT_RELEASED] = {
+               .llo_init      = lov_init_released,
+               .llo_delete    = lov_delete_empty,
+               .llo_fini      = lov_fini_released,
+               .llo_print     = lov_print_released,
+               .llo_page_init = lov_page_init_empty,
+               .llo_lock_init = lov_lock_init_empty,
+               .llo_io_init   = lov_io_init_released,
                .llo_getattr   = lov_attr_get_empty,
        },
        [LLT_COMP] = {
@@ -720,6 +1083,17 @@ const static struct lov_layout_operations lov_dispatch[] = {
                .llo_lock_init = lov_lock_init_composite,
                .llo_io_init   = lov_io_init_composite,
                .llo_getattr   = lov_attr_get_composite,
+               .llo_flush     = lov_flush_composite,
+       },
+       [LLT_FOREIGN] = {
+               .llo_init      = lov_init_foreign,
+               .llo_delete    = lov_delete_empty,
+               .llo_fini      = lov_fini_released,
+               .llo_print     = lov_print_foreign,
+               .llo_page_init = lov_page_init_foreign,
+               .llo_lock_init = lov_lock_init_empty,
+               .llo_io_init   = lov_io_init_empty,
+               .llo_getattr   = lov_attr_get_empty,
        },
 };
 
@@ -752,6 +1126,9 @@ static enum lov_layout_type lov_type(struct lov_stripe_md *lsm)
            lsm->lsm_magic == LOV_MAGIC_COMP_V1)
                return LLT_COMP;
 
+       if (lsm->lsm_magic == LOV_MAGIC_FOREIGN)
+               return LLT_FOREIGN;
+
        return LLT_EMPTY;
 }
 
@@ -881,12 +1258,11 @@ static int lov_layout_change(const struct lu_env *unused,
        CDEBUG(D_INODE, DFID "Apply new layout lov %p, type %d\n",
               PFID(lu_object_fid(lov2lu(lov))), lov, llt);
 
-       lov->lo_type = LLT_EMPTY;
-
        /* page bufsize fixup */
        cl_object_header(&lov->lo_cl)->coh_page_bufsize -=
                lov_page_slice_fixup(lov, NULL);
 
+       lov->lo_type = llt;
        rc = new_ops->llo_init(env, lov_dev, lov, lsm, conf, state);
        if (rc != 0) {
                struct obd_device *obd = lov2obd(lov_dev->ld_lov);
@@ -896,11 +1272,10 @@ static int lov_layout_change(const struct lu_env *unused,
                new_ops->llo_delete(env, lov, state);
                new_ops->llo_fini(env, lov, state);
                /* this file becomes an EMPTY file. */
+               lov->lo_type = LLT_EMPTY;
                GOTO(out, rc);
        }
 
-       lov->lo_type = llt;
-
 out:
        cl_env_put(env, &refcheck);
        RETURN(rc);
@@ -1056,14 +1431,19 @@ int lov_page_init(const struct lu_env *env, struct cl_object *obj,
 int lov_io_init(const struct lu_env *env, struct cl_object *obj,
                struct cl_io *io)
 {
-       CL_IO_SLICE_CLEAN(lov_env_io(env), lis_cl);
+       CL_IO_SLICE_CLEAN(lov_env_io(env), lis_preserved);
 
        CDEBUG(D_INODE, DFID "io %p type %d ignore/verify layout %d/%d\n",
               PFID(lu_object_fid(&obj->co_lu)), io, io->ci_type,
               io->ci_ignore_layout, io->ci_verify_layout);
 
+       /* IO type CIT_MISC with ci_ignore_layout set are usually invoked from
+        * the OSC layer. It shouldn't take lov layout conf lock in that case,
+        * because as long as the OSC object exists, the layout can't be
+        * reconfigured. */
        return LOV_2DISPATCH_MAYLOCK(cl2lov(obj), llo_io_init,
-                                    !io->ci_ignore_layout, env, obj, io);
+                       !(io->ci_ignore_layout && io->ci_type == CIT_MISC),
+                       env, obj, io);
 }
 
 /**
@@ -1253,6 +1633,43 @@ struct fiemap_state {
        bool                    fs_enough;
 };
 
+static struct cl_object *lov_find_subobj(const struct lu_env *env,
+                                        struct lov_object *lov,
+                                        struct lov_stripe_md *lsm,
+                                        int index)
+{
+       struct lov_device       *dev = lu2lov_dev(lov2lu(lov)->lo_dev);
+       struct lov_thread_info  *lti = lov_env_info(env);
+       struct lu_fid           *ofid = &lti->lti_fid;
+       struct lov_oinfo        *oinfo;
+       struct cl_device        *subdev;
+       int                     entry = lov_comp_entry(index);
+       int                     stripe = lov_comp_stripe(index);
+       int                     ost_idx;
+       int                     rc;
+       struct cl_object        *result;
+
+       if (lov->lo_type != LLT_COMP)
+               GOTO(out, result = NULL);
+
+       if (entry >= lsm->lsm_entry_count ||
+           stripe >= lsm->lsm_entries[entry]->lsme_stripe_count)
+               GOTO(out, result = NULL);
+
+       oinfo = lsm->lsm_entries[entry]->lsme_oinfo[stripe];
+       ost_idx = oinfo->loi_ost_idx;
+       rc = ostid_to_fid(ofid, &oinfo->loi_oi, ost_idx);
+       if (rc != 0)
+               GOTO(out, result = NULL);
+
+       subdev = lovsub2cl_dev(dev->ld_target[ost_idx]);
+       result = lov_sub_find(env, subdev, ofid, NULL);
+out:
+       if (result == NULL)
+               result = ERR_PTR(-EINVAL);
+       return result;
+}
+
 int fiemap_for_stripe(const struct lu_env *env, struct cl_object *obj,
                      struct lov_stripe_md *lsm, struct fiemap *fiemap,
                      size_t *buflen, struct ll_fiemap_info_key *fmkey,
@@ -1293,7 +1710,7 @@ int fiemap_for_stripe(const struct lu_env *env, struct cl_object *obj,
        if (lun_start == lun_end)
                return 0;
 
-       req_fm_len = obd_object_end - lun_start;
+       req_fm_len = obd_object_end - lun_start + 1;
        fs->fs_fm->fm_length = 0;
        len_mapped_single_call = 0;
 
@@ -1336,7 +1753,7 @@ int fiemap_for_stripe(const struct lu_env *env, struct cl_object *obj,
                        fs->fs_fm->fm_mapped_extents = 1;
 
                        fm_ext[0].fe_logical = lun_start;
-                       fm_ext[0].fe_length = obd_object_end - lun_start;
+                       fm_ext[0].fe_length = obd_object_end - lun_start + 1;
                        fm_ext[0].fe_flags |= FIEMAP_EXTENT_UNKNOWN;
 
                        goto inactive_tgt;
@@ -1451,8 +1868,11 @@ static int lov_object_fiemap(const struct lu_env *env, struct cl_object *obj,
        ENTRY;
 
        lsm = lov_lsm_addref(cl2lov(obj));
-       if (lsm == NULL)
-               RETURN(-ENODATA);
+       if (lsm == NULL) {
+               /* no extent: there is no object for mapping */
+               fiemap->fm_mapped_extents = 0;
+               return 0;
+       }
 
        if (!(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER)) {
                /**
@@ -1466,6 +1886,10 @@ static int lov_object_fiemap(const struct lu_env *env, struct cl_object *obj,
                        GOTO(out_lsm, rc = -ENOTSUPP);
        }
 
+       /* No support for DOM layout yet. */
+       if (lsme_is_dom(lsm->lsm_entries[0]))
+               GOTO(out_lsm, rc = -ENOTSUPP);
+
        if (lsm->lsm_is_released) {
                if (fiemap->fm_start < fmkey->lfik_oa.o_size) {
                        /**
@@ -1532,6 +1956,7 @@ static int lov_object_fiemap(const struct lu_env *env, struct cl_object *obj,
        if (start_entry == -1 || end_entry == -1)
                GOTO(out_fm_local, rc = -EINVAL);
 
+       /* TODO: rewrite it with lov_foreach_io_layout() */
        for (entry = start_entry; entry <= end_entry; entry++) {
                lsme = lsm->lsm_entries[entry];
 
@@ -1602,7 +2027,7 @@ out_lsm:
 }
 
 static int lov_object_getstripe(const struct lu_env *env, struct cl_object *obj,
-                               struct lov_user_md __user *lum)
+                               struct lov_user_md __user *lum, size_t size)
 {
        struct lov_object       *lov = cl2lov(obj);
        struct lov_stripe_md    *lsm;
@@ -1613,7 +2038,7 @@ static int lov_object_getstripe(const struct lu_env *env, struct cl_object *obj,
        if (lsm == NULL)
                RETURN(-ENODATA);
 
-       rc = lov_getstripe(cl2lov(obj), lsm, lum);
+       rc = lov_getstripe(env, cl2lov(obj), lsm, lum, size);
        lov_lsm_put(lsm);
        RETURN(rc);
 }
@@ -1637,7 +2062,18 @@ static int lov_object_layout_get(const struct lu_env *env,
 
        cl->cl_size = lov_comp_md_size(lsm);
        cl->cl_layout_gen = lsm->lsm_layout_gen;
-       cl->cl_is_composite = lsm_is_composite(lsm->lsm_magic);
+       cl->cl_dom_comp_size = 0;
+       cl->cl_is_released = lsm->lsm_is_released;
+       if (lsm_is_composite(lsm->lsm_magic)) {
+               struct lov_stripe_md_entry *lsme = lsm->lsm_entries[0];
+
+               cl->cl_is_composite = true;
+
+               if (lsme_is_dom(lsme))
+                       cl->cl_dom_comp_size = lsme->lsme_extent.e_end;
+       } else {
+               cl->cl_is_composite = false;
+       }
 
        rc = lov_lsm_pack(lsm, buf->lb_buf, buf->lb_len);
        lov_lsm_put(lsm);
@@ -1661,6 +2097,12 @@ static loff_t lov_object_maxbytes(struct cl_object *obj)
        return maxbytes;
 }
 
+static int lov_object_flush(const struct lu_env *env, struct cl_object *obj,
+                           struct ldlm_lock *lock)
+{
+       return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_flush, env, obj, lock);
+}
+
 static const struct cl_object_operations lov_ops = {
        .coo_page_init    = lov_page_init,
        .coo_lock_init    = lov_lock_init,
@@ -1672,6 +2114,7 @@ static const struct cl_object_operations lov_ops = {
        .coo_layout_get   = lov_object_layout_get,
        .coo_maxbytes     = lov_object_maxbytes,
        .coo_fiemap       = lov_object_fiemap,
+       .coo_object_flush = lov_object_flush
 };
 
 static const struct lu_object_operations lov_lu_obj_ops = {
@@ -1765,6 +2208,8 @@ int lov_read_and_clear_async_rc(struct cl_object *clob)
                }
                case LLT_RELEASED:
                case LLT_EMPTY:
+                       /* fall through */
+               case LLT_FOREIGN:
                        break;
                default:
                        LBUG();