Whamcloud - gitweb
LU-14510 dom: fiemap support for DoM files 21/55221/6
authorMikhail Pershin <mpershin@whamcloud.com>
Tue, 28 May 2024 11:02:23 +0000 (14:02 +0300)
committerOleg Drokin <green@whamcloud.com>
Sun, 24 Nov 2024 06:03:41 +0000 (06:03 +0000)
Patch adds support for fiemap to DoM files.
Server part:
- modify MDS_GET_INFO handler to return FIEMAP like
  OST_GET_INFO does
- mdt_fiemap_get() to process fiemap request
Client part:
- rewrite lov_object_fiemap() to support DoM component
- rework fiemap_for_stripe() to work with both DoM and
  RAID0 layouts
- use initialized layout entries to get subobject and
  get rid of lov_find_subobj() used by fiemap only
- fix issue with wrong resume entry/stripe count
- mdc_object_fiemap() as implementation of .coo_fiemap
  cl_object_operations to send and receive fiemap request
- treat LOV subdev errors as UNKNOWN extent
- rework FID2PATH layout description to be compatible with
  other GET_INFO keys (no protocol changes)
- add sanity.sh test_130h for DoM fiemap with resuming

To indicate an MDT device, one extra bit is taken from the stripe
number bits and given to the device number (17 bits for the device,
15 bits for the stripe number). As a result, the absolute stripe
count limit in a fiemap report is 32768.

Signed-off-by: Mikhail Pershin <mpershin@whamcloud.com>
Change-Id: I9b6df04fd62d773aec2d916440ba08dfea06faa4
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/55221
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Lai Siyao <lai.siyao@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
12 files changed:
lustre/include/lustre_req_layout.h
lustre/include/uapi/linux/lustre/lustre_fiemap.h
lustre/lov/lov_cl_internal.h
lustre/lov/lov_object.c
lustre/lov/lov_offset.c
lustre/mdc/mdc_dev.c
lustre/mdc/mdc_request.c
lustre/mdt/mdt_handler.c
lustre/mdt/mdt_internal.h
lustre/mdt/mdt_io.c
lustre/ptlrpc/layout.c
lustre/tests/sanity.sh

index b722af6..9b951aa 100644 (file)
@@ -212,6 +212,7 @@ extern struct req_format RQF_MDS_CLOSE_INTENT;
 extern struct req_format RQF_MDS_CONNECT;
 extern struct req_format RQF_MDS_DISCONNECT;
 extern struct req_format RQF_MDS_GET_INFO;
+extern struct req_format RQF_MDS_FID2PATH;
 extern struct req_format RQF_MDS_READPAGE;
 extern struct req_format RQF_MDS_REINT;
 extern struct req_format RQF_MDS_REINT_CREATE;
index 941ffa5..1514f25 100644 (file)
 
 static inline int get_fe_device(struct fiemap_extent *fe)
 {
-       return fe->fe_device & 0xffff;
+       return fe->fe_device & 0x1ffff;
 }
 static inline void set_fe_device(struct fiemap_extent *fe, int devno)
 {
-       fe->fe_device = (fe->fe_device & 0xffff0000) | (devno & 0xffff);
+       fe->fe_device = (fe->fe_device & 0xfffe0000) | (devno & 0x1ffff);
 }
 static inline int get_fe_stripenr(struct fiemap_extent *fe)
 {
-       return fe->fe_device >> 16;
+       return fe->fe_device >> 17;
 }
 static inline void set_fe_stripenr(struct fiemap_extent *fe, int nr)
 {
-       fe->fe_device = (fe->fe_device & 0xffff) | (nr << 16);
+       fe->fe_device = (fe->fe_device & 0x1ffff) | (nr << 17);
 }
 static inline void set_fe_device_stripenr(struct fiemap_extent *fe, int devno,
                                          int nr)
 {
-       fe->fe_device = (nr << 16) | (devno & 0xffff);
+       fe->fe_device = (nr << 17) | (devno & 0x1ffff);
 }
 
 static inline __kernel_size_t fiemap_count_to_size(__kernel_size_t extent_count)
index e7d0493..a9845d4 100644 (file)
@@ -190,6 +190,7 @@ struct lov_layout_dom {
        struct lov_layout_raid0 lo_dom_r0;
        struct lovsub_object *lo_dom;
        struct lov_oinfo *lo_loi;
+       unsigned short lo_mdt_idx;
 };
 
 struct lov_layout_entry {
index 53fba61..ff12797 100644 (file)
@@ -538,6 +538,7 @@ again:
        lle->lle_dom.lo_dom_r0.lo_nr = 1;
        lle->lle_dom.lo_dom_r0.lo_sub = &lle->lle_dom.lo_dom;
        lle->lle_dom.lo_loi = loi;
+       lle->lle_dom.lo_mdt_idx = idx;
 
        rc = lov_page_slice_fixup(lov, clo);
        RETURN(rc);
@@ -1619,7 +1620,7 @@ static int lov_lock_init(const struct lu_env *env, struct cl_object *obj,
 
 /**
  * We calculate on which OST the mapping will end. If the length of mapping
- * is greater than (stripe_size * stripe_count) then the last_stripe will
+ * is greater than (stripe_size * stripe_count) then the last_stripe
  * will be one just before start_stripe. Else we check if the mapping
  * intersects each OST and find last_stripe.
  * This function returns the last_stripe and also sets the stripe_count
@@ -1643,15 +1644,15 @@ static int fiemap_calc_last_stripe(struct lov_stripe_md *lsm, int index,
        int last_stripe;
        int i, j;
 
+       if (lsme_is_dom(lsme)) {
+               *stripe_count = 1;
+               return start_stripe;
+       }
+
        init_stripe = lov_stripe_number(lsm, index, ext->e_start);
 
-       if (ext->e_end - ext->e_start >
-           lsme->lsme_stripe_size * lsme->lsme_stripe_count) {
-               if (init_stripe == start_stripe) {
-                       last_stripe = (start_stripe < 1) ?
-                               lsme->lsme_stripe_count - 1 : start_stripe - 1;
-                       *stripe_count = lsme->lsme_stripe_count;
-               } else if (init_stripe < start_stripe) {
+       if (ext->e_end - ext->e_start > stripe_width(lsm, index)) {
+               if (init_stripe <= start_stripe) {
                        last_stripe = (init_stripe < 1) ?
                                lsme->lsme_stripe_count - 1 : init_stripe - 1;
                        *stripe_count = lsme->lsme_stripe_count -
@@ -1663,7 +1664,7 @@ static int fiemap_calc_last_stripe(struct lov_stripe_md *lsm, int index,
        } else {
                for (j = 0, i = start_stripe; j < lsme->lsme_stripe_count;
                     i = (i + 1) % lsme->lsme_stripe_count, j++) {
-                       if (!lov_stripe_intersects(lsm, index,  i, ext, NULL,
+                       if (!lov_stripe_intersects(lsm, index, i, ext, NULL,
                                                   NULL))
                                break;
                        if ((start_stripe != init_stripe) && (i == init_stripe))
@@ -1751,10 +1752,12 @@ static u64 fiemap_calc_fm_end_offset(struct fiemap *fiemap,
            local_end < lun_end) {
                fm_end_offset = local_end;
        } else {
+               int stripes = lsme_is_dom(lsme) ? 1 : lsme->lsme_stripe_count;
+
                /* This is a special value to indicate that caller should
                 * calculate offset in next stripe. */
                fm_end_offset = 0;
-               *start_stripe = (stripe_no + 1) % lsme->lsme_stripe_count;
+               *start_stripe = (stripe_no + 1) % stripes;
        }
 
        return fm_end_offset;
@@ -1774,74 +1777,46 @@ struct fiemap_state {
        bool                    fs_enough;      /* enough for this call */
 };
 
-static struct cl_object *lov_find_subobj(const struct lu_env *env,
-                                        struct lov_object *lov,
-                                        struct lov_stripe_md *lsm,
-                                        int index)
+static int fiemap_unknown(struct fiemap_state *fs, u64 obd_start, u64 obd_end)
 {
-       struct lov_device       *dev = lu2lov_dev(lov2lu(lov)->lo_dev);
-       struct lov_thread_info  *lti = lov_env_info(env);
-       struct lu_fid           *ofid = &lti->lti_fid;
-       struct lov_oinfo        *oinfo;
-       struct cl_device        *subdev;
-       int                     entry = lov_comp_entry(index);
-       int                     stripe = lov_comp_stripe(index);
-       int                     ost_idx;
-       int                     rc;
-       struct cl_object        *result;
-
-       if (lov->lo_type != LLT_COMP)
-               GOTO(out, result = NULL);
-
-       if (entry >= lsm->lsm_entry_count ||
-           stripe >= lsm->lsm_entries[entry]->lsme_stripe_count)
-               GOTO(out, result = NULL);
-
-       oinfo = lsm->lsm_entries[entry]->lsme_oinfo[stripe];
-       ost_idx = oinfo->loi_ost_idx;
-       rc = ostid_to_fid(ofid, &oinfo->loi_oi, ost_idx);
-       if (rc != 0)
-               GOTO(out, result = NULL);
-
-       subdev = lovsub2cl_dev(dev->ld_target[ost_idx]);
-       result = lov_sub_find(env, subdev, ofid, NULL);
-out:
-       if (result == NULL)
-               result = ERR_PTR(-EINVAL);
-       return result;
+       /* If OST is inactive or layout is not supported or available
+        * then return extent with UNKNOWN flag.
+        */
+       fs->fs_fm->fm_mapped_extents = 1;
+       if (fs->fs_fm->fm_extent_count) {
+               fs->fs_fm->fm_extents[0].fe_logical = obd_start;
+               fs->fs_fm->fm_extents[0].fe_length = obd_end - obd_start + 1;
+               fs->fs_fm->fm_extents[0].fe_flags |= FIEMAP_EXTENT_UNKNOWN;
+       }
+       fs->fs_device_done = true;
+       return 1;
 }
 
 static int fiemap_for_stripe(const struct lu_env *env, struct cl_object *obj,
                             struct lov_stripe_md *lsm, struct fiemap *fiemap,
                             size_t *buflen, struct ll_fiemap_info_key *fmkey,
-                            int index, int stripe_last, int stripeno,
+                            int index, int stripe_last, const int stripeno,
                             struct fiemap_state *fs)
 {
-       struct lov_stripe_md_entry *lsme = lsm->lsm_entries[index];
-       struct cl_object *subobj;
-       struct lov_obd *lov = lu2lov_dev(obj->co_lu.lo_dev)->ld_lov;
-       struct fiemap_extent *fm_ext = &fs->fs_fm->fm_extents[0];
+       struct lov_object *lo = cl2lov(obj);
+       struct lov_layout_entry *lle = lov_entry(lo, index);
+       struct lov_stripe_md_entry *lsme = lle->lle_lsme;
+       struct cl_object *subobj = NULL;
+       struct fiemap *fsm = fs->fs_fm;
+       struct fiemap_extent *fm_ext = &fsm->fm_extents[0];
        u64 req_fm_len; /* max requested extent coverage */
        u64 len_mapped_single_call;
        u64 obd_start;
        u64 obd_end;
        unsigned int ext_count;
-       /* EOF for object */
-       bool ost_eof = false;
-       /* done with required mapping for this OST? */
-       bool ost_done = false;
-       int ost_index;
+       int devnr = 0;
        int rc = 0;
 
-       fs->fs_device_done = false;
        /* Find out range of mapping on this stripe */
        if ((lov_stripe_intersects(lsm, index, stripeno, &fs->fs_ext,
                                   &obd_start, &obd_end)) == 0)
                return 0;
 
-       if (lov_oinfo_is_dummy(lsme->lsme_oinfo[stripeno]))
-               return -EIO;
-
        /* If this is a continuation FIEMAP call and we are on
         * starting stripe then obd_start needs to be set to
         * end_offset */
@@ -1852,18 +1827,59 @@ static int fiemap_for_stripe(const struct lu_env *env, struct cl_object *obj,
            obd_start)
                return 0;
 
+       fs->fs_device_done = false;
        req_fm_len = obd_end - obd_start + 1;
-       fs->fs_fm->fm_length = 0;
+       fsm->fm_length = 0;
        len_mapped_single_call = 0;
 
-       /* find lobsub object */
-       subobj = lov_find_subobj(env, cl2lov(obj), lsm,
-                                lov_comp_index(index, stripeno));
-       if (IS_ERR(subobj))
-               return PTR_ERR(subobj);
+       if (lo->lo_type != LLT_COMP || !lle->lle_valid) {
+               ext_count = fiemap_unknown(fs, obd_start, obd_end);
+               GOTO(out_unknown, rc = -EOPNOTSUPP);
+       }
+
+       switch (lle->lle_type) {
+       case LOV_PATTERN_RAID0:
+       {
+               struct lov_device *lov = lov_object_dev(lo);
+               const struct lov_layout_raid0 *r0 = &lle->lle_raid0;
+               struct lov_oinfo *oinfo;
+
+               if (stripeno >= r0->lo_nr)
+                       RETURN(-EINVAL);
+               subobj = lovsub2cl(r0->lo_sub[stripeno]);
+               oinfo = lsme->lsme_oinfo[stripeno];
+               if (lov_oinfo_is_dummy(oinfo))
+                       RETURN(-EIO);
+               devnr = oinfo->loi_ost_idx;
+               if (devnr < 0 || devnr >= lov_targets_nr(lov))
+                       RETURN(-EINVAL);
+               if (!lov->ld_lov->lov_tgts[devnr]->ltd_active) {
+                       ext_count = fiemap_unknown(fs, obd_start, obd_end);
+                       GOTO(out_unknown, rc = -ENODEV);
+               }
+               break;
+       }
+       case LOV_PATTERN_MDT:
+       {
+               const struct lov_layout_dom *dom = &lle->lle_dom;
+
+               subobj = lovsub2cl(dom->lo_dom);
+               if (lov_oinfo_is_dummy(dom->lo_loi))
+                       RETURN(-EIO);
+               devnr = dom->lo_mdt_idx | 0x10000ULL;
+               break;
+       }
+       default:
+               ext_count = fiemap_unknown(fs, obd_start, obd_end);
+               GOTO(out_unknown, rc = -EOPNOTSUPP);
+       }
+
+       if (!subobj)
+               RETURN(-EINVAL);
+
        /* If the output buffer is very large and the objects have many
         * extents we may need to loop on a single OST repeatedly */
-       do {
+       while (!fs->fs_device_done) {
                if (fiemap->fm_extent_count > 0) {
                        /* Don't get too many extents. */
                        if (fs->fs_cur_extent + fs->fs_cnt_need >
@@ -1873,56 +1889,35 @@ static int fiemap_for_stripe(const struct lu_env *env, struct cl_object *obj,
                }
 
                obd_start += len_mapped_single_call;
-               fs->fs_fm->fm_length = req_fm_len - len_mapped_single_call;
-               req_fm_len = fs->fs_fm->fm_length;
+               fsm->fm_length = req_fm_len - len_mapped_single_call;
+               req_fm_len = fsm->fm_length;
                /**
                 * If we've collected enough extent map, we'd request 1 more,
                 * to see whether we coincidentally finished all available
                 * extent map, so that FIEMAP_EXTENT_LAST would be set.
                 */
-               fs->fs_fm->fm_extent_count = fs->fs_enough ?
-                                            1 : fs->fs_cnt_need;
-               fs->fs_fm->fm_mapped_extents = 0;
-               fs->fs_fm->fm_flags = fiemap->fm_flags;
-
-               ost_index = lsme->lsme_oinfo[stripeno]->loi_ost_idx;
-
-               if (ost_index < 0 || ost_index >= lov->desc.ld_tgt_count)
-                       GOTO(obj_put, rc = -EINVAL);
-               /* If OST is inactive, return extent with UNKNOWN flag. */
-               if (!lov->lov_tgts[ost_index]->ltd_active) {
-
-                       fs->fs_fm->fm_mapped_extents = 1;
-                       if (fs->fs_fm->fm_extent_count == 0)
-                               goto inactive_tgt;
-
-                       fm_ext[0].fe_logical = obd_start;
-                       fm_ext[0].fe_length = obd_end - obd_start + 1;
-                       fm_ext[0].fe_flags |=
-                               FIEMAP_EXTENT_UNKNOWN | FIEMAP_EXTENT_LAST;
-
-                       goto inactive_tgt;
+               fsm->fm_extent_count = fs->fs_enough ? 1 : fs->fs_cnt_need;
+               fsm->fm_mapped_extents = 0;
+               fsm->fm_flags = fiemap->fm_flags;
+               fsm->fm_start = obd_start;
+               fsm->fm_flags &= ~FIEMAP_FLAG_DEVICE_ORDER;
+               fmkey->lfik_fiemap = *fsm;
+               *buflen = fiemap_count_to_size(fsm->fm_extent_count);
+               rc = cl_object_fiemap(env, subobj, fmkey, fsm, buflen);
+               if (rc) {
+                       /* Can we report as UNKNOWN all subdev error? */
+                       ext_count = fiemap_unknown(fs, obd_start, obd_end);
+                       GOTO(out_unknown, rc);
                }
-
-               fs->fs_fm->fm_start = obd_start;
-               fs->fs_fm->fm_flags &= ~FIEMAP_FLAG_DEVICE_ORDER;
-               memcpy(&fmkey->lfik_fiemap, fs->fs_fm, sizeof(*fs->fs_fm));
-               *buflen = fiemap_count_to_size(fs->fs_fm->fm_extent_count);
-
-               rc = cl_object_fiemap(env, subobj, fmkey, fs->fs_fm, buflen);
-               if (rc != 0)
-                       GOTO(obj_put, rc);
-inactive_tgt:
-               ext_count = fs->fs_fm->fm_mapped_extents;
+               ext_count = fsm->fm_mapped_extents;
                if (ext_count == 0) {
-                       ost_done = true;
                        fs->fs_device_done = true;
                        /* If last stripe has hold at the end,
                         * we need to return */
                        if (stripeno == fs->fs_last_stripe) {
                                fiemap->fm_mapped_extents = 0;
                                fs->fs_finish_stripe = true;
-                               GOTO(obj_put, rc);
+                               RETURN(0);
                        }
                        break;
                } else if (fs->fs_enough) {
@@ -1930,7 +1925,7 @@ inactive_tgt:
                         * We've collected enough extents and there are
                         * more extents after it.
                         */
-                       GOTO(obj_put, rc);
+                       RETURN(0);
                }
 
                /* If we just need num of extents, got to next device */
@@ -1945,24 +1940,22 @@ inactive_tgt:
                                         obd_start;
 
                /* Have we finished mapping on this device? */
-               if (req_fm_len <= len_mapped_single_call) {
-                       ost_done = true;
+               if (req_fm_len <= len_mapped_single_call)
                        fs->fs_device_done = true;
-               }
 
                /* Clear the EXTENT_LAST flag which can be present on
                 * the last extent */
                if (fm_ext[ext_count - 1].fe_flags & FIEMAP_EXTENT_LAST)
                        fm_ext[ext_count - 1].fe_flags &= ~FIEMAP_EXTENT_LAST;
+
                if (lov_stripe_size(lsm, index,
                                    fm_ext[ext_count - 1].fe_logical +
                                    fm_ext[ext_count - 1].fe_length,
                                    stripeno) >= fmkey->lfik_oa.o_size) {
-                       ost_eof = true;
                        fs->fs_device_done = true;
                }
-
-               fiemap_prepare_and_copy_exts(fiemap, fm_ext, ost_index,
+out_unknown:
+               fiemap_prepare_and_copy_exts(fiemap, fm_ext, devnr,
                                             ext_count, fs->fs_cur_extent,
                                             stripe_last + stripeno);
                fs->fs_cur_extent += ext_count;
@@ -1970,14 +1963,11 @@ inactive_tgt:
                /* Ran out of available extents? */
                if (fs->fs_cur_extent >= fiemap->fm_extent_count)
                        fs->fs_enough = true;
-       } while (!ost_done && !ost_eof);
+       }
 
        if (stripeno == fs->fs_last_stripe)
                fs->fs_finish_stripe = true;
-obj_put:
-       cl_object_put(env, subobj);
-
-       return rc;
+       return 0;
 }
 
 /**
@@ -2037,10 +2027,6 @@ static int lov_object_fiemap(const struct lu_env *env, struct cl_object *obj,
                        GOTO(out_lsm, rc = -EOPNOTSUPP);
        }
 
-       /* No support for DOM layout yet. */
-       if (lsme_is_dom(lsm->lsm_entries[0]))
-               GOTO(out_lsm, rc = -EOPNOTSUPP);
-
        if (lsm->lsm_is_released) {
                if (fiemap->fm_start < fmkey->lfik_oa.o_size) {
                        /**
@@ -2091,8 +2077,8 @@ static int lov_object_fiemap(const struct lu_env *env, struct cl_object *obj,
        if (whole_start > fmkey->lfik_oa.o_size)
                GOTO(out_fm_local, rc = -EINVAL);
        whole_end = (fiemap->fm_length == OBD_OBJECT_EOF) ?
-                                       fmkey->lfik_oa.o_size + 1 :
-                                       whole_start + fiemap->fm_length;
+                    fmkey->lfik_oa.o_size + 1 :
+                    whole_start + fiemap->fm_length;
        /**
         * If fiemap->fm_length != OBD_OBJECT_EOF but whole_end exceeds file
         * size
@@ -2115,15 +2101,18 @@ static int lov_object_fiemap(const struct lu_env *env, struct cl_object *obj,
        end_entry = lsm->lsm_entry_count - 1;
        cur_stripe = 0;
        for (entry = 0; entry <= end_entry; entry++) {
+               int stripes;
+
                lsme = lsm->lsm_entries[entry];
-               if (cur_stripe + lsme->lsme_stripe_count >= stripe_last) {
+               stripes = lsme_is_dom(lsme) ? 1 : lsme->lsme_stripe_count;
+               if (cur_stripe + stripes > stripe_last) {
                        start_entry = entry;
                        start_stripe = stripe_last - cur_stripe;
                        break;
                }
-
-               cur_stripe += lsme->lsme_stripe_count;
+               cur_stripe += stripes;
        }
+
        if (start_entry == -1) {
                CERROR(DFID": FIEMAP does not init start entry, cur_stripe=%d, "
                       "stripe_last=%d\n", PFID(lu_object_fid(&obj->co_lu)),
@@ -2140,18 +2129,21 @@ static int lov_object_fiemap(const struct lu_env *env, struct cl_object *obj,
        range.e_end = whole_end;
 
        for (entry = start_entry; entry <= end_entry; entry++) {
+               int stripes;
+
                /* remeber to update stripe_last accordingly */
                lsme = lsm->lsm_entries[entry];
+               stripes = lsme_is_dom(lsme) ? 1 : lsme->lsme_stripe_count;
 
                /* FLR could contain component holes between entries */
                if (!lsme_inited(lsme)) {
-                       stripe_last += lsme->lsme_stripe_count;
+                       stripe_last += stripes;
                        resume = false;
                        continue;
                }
 
                if (!lu_extent_is_overlapped(&range, &lsme->lsme_extent)) {
-                       stripe_last += lsme->lsme_stripe_count;
+                       stripe_last += stripes;
                        resume = false;
                        continue;
                }
@@ -2196,8 +2188,7 @@ static int lov_object_fiemap(const struct lu_env *env, struct cl_object *obj,
 
                /* Check each stripe */
                for (cur_stripe = fs.fs_start_stripe; stripe_count > 0;
-                    --stripe_count,
-                    cur_stripe = (cur_stripe + 1) % lsme->lsme_stripe_count) {
+                    --stripe_count, cur_stripe = (cur_stripe + 1) % stripes) {
                        /* reset fs_finish_stripe */
                        fs.fs_finish_stripe = false;
                        rc = fiemap_for_stripe(env, obj, lsm, fiemap, buflen,
@@ -2212,7 +2203,7 @@ static int lov_object_fiemap(const struct lu_env *env, struct cl_object *obj,
                        if (fs.fs_finish_stripe)
                                break;
                } /* for each stripe */
-               stripe_last += lsme->lsme_stripe_count;
+               stripe_last += stripes;
        } /* for covering layout component entry */
 
 finish:
@@ -2240,7 +2231,6 @@ skip_last_device_calc:
        fiemap->fm_mapped_extents = fs.fs_cur_extent;
 out_fm_local:
        OBD_FREE_LARGE(fm_local, buffer_size);
-
 out_lsm:
        lov_lsm_put(lsm);
        return rc;
index a9389ac..3a0bbbe 100644 (file)
@@ -225,7 +225,7 @@ int lov_stripe_intersects(struct lov_stripe_md *lsm, int index, int stripeno,
        u64 loc_start, loc_end;
 
        if (!lu_extent_is_overlapped(ext, &entry->lsme_extent))
-                       return 0;
+               return 0;
 
        if (!obd_start)
                obd_start = &loc_start;
index e0d96bc..fe96b96 100644 (file)
@@ -1392,7 +1392,7 @@ static int mdc_io_init(const struct lu_env *env, struct cl_object *obj,
 }
 
 static void mdc_build_res_name(struct osc_object *osc,
-                                  struct ldlm_res_id *resname)
+                              struct ldlm_res_id *resname)
 {
        fid_build_reg_res_name(lu_object_fid(osc2lu(osc)), resname);
 }
@@ -1506,6 +1506,89 @@ static int mdc_object_flush(const struct lu_env *env, struct cl_object *obj,
        RETURN(mdc_dlm_canceling(env, lock));
 }
 
+static int mdc_object_fiemap(const struct lu_env *env, struct cl_object *obj,
+                            struct ll_fiemap_info_key *fmkey,
+                            struct fiemap *fiemap, size_t *buflen)
+{
+       struct osc_thread_info *info = osc_env_info(env);
+       struct osc_object *osc = cl2osc(obj);
+       struct obd_export *exp = osc_export(osc);
+       struct lustre_handle lockh;
+       enum ldlm_mode mode = LCK_MINMODE;
+       struct ptlrpc_request *req;
+       struct fiemap *repbuf;
+       struct ll_fiemap_info_key *rq_fmkey;
+       char *fmbuf;
+       __u64 flags;
+       int rc;
+
+       ENTRY;
+
+       fmkey->lfik_oa.o_oi = osc->oo_oinfo->loi_oi;
+
+       if (fmkey->lfik_fiemap.fm_flags & FIEMAP_FLAG_SYNC) {
+               struct ldlm_res_id *resid = &osc_env_info(env)->oti_resname;
+               union ldlm_policy_data *policy = &info->oti_policy;
+
+               mdc_build_res_name(osc, resid);
+               mdc_lock_build_policy(env, NULL, policy);
+               flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_LVB_READY;
+               mode = mdc_dom_lock_match(env, exp, resid, LDLM_IBITS, policy,
+                                         LCK_PR | LCK_PW | LCK_GROUP,
+                                         &flags, osc, &lockh, 0);
+               fmkey->lfik_oa.o_valid |= OBD_MD_FLFLAGS;
+               if (mode) { /* lock is cached on client */
+                       fmkey->lfik_oa.o_flags &= ~OBD_FL_SRVLOCK;
+                       if (mode != LCK_PR) {
+                               ldlm_lock_addref(&lockh, LCK_PR);
+                               ldlm_lock_decref(&lockh, mode);
+                       }
+               } else {
+                       /* no cached lock, needs acquire lock on server side */
+                       fmkey->lfik_oa.o_flags |= OBD_FL_SRVLOCK;
+               }
+       }
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                  &RQF_OST_GET_INFO_FIEMAP);
+       if (!req)
+               GOTO(drop_lock, rc = -ENOMEM);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_KEY, RCL_CLIENT,
+                            sizeof(*fmkey));
+       req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL, RCL_CLIENT,
+                            *buflen);
+       req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL, RCL_SERVER,
+                            *buflen);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GET_INFO);
+       if (rc != 0) {
+               ptlrpc_request_free(req);
+               GOTO(drop_lock, rc);
+       }
+       rq_fmkey = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_KEY);
+       *rq_fmkey = *fmkey;
+       fmbuf = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_VAL);
+       memcpy(fmbuf, fiemap, *buflen);
+       ptlrpc_request_set_replen(req);
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               GOTO(fini_req, rc);
+
+       repbuf = req_capsule_server_get(&req->rq_pill, &RMF_FIEMAP_VAL);
+       if (!repbuf)
+               GOTO(fini_req, rc = -EPROTO);
+       memcpy(fiemap, repbuf, *buflen);
+
+fini_req:
+       ptlrpc_req_put(req);
+drop_lock:
+       if (mode)
+               ldlm_lock_decref(&lockh, LCK_PR);
+       RETURN(rc);
+}
+
 static const struct cl_object_operations mdc_ops = {
        .coo_page_init = osc_page_init,
        .coo_lock_init = mdc_lock_init,
@@ -1515,7 +1598,8 @@ static const struct cl_object_operations mdc_ops = {
        .coo_glimpse = osc_object_glimpse,
        .coo_req_attr_set = mdc_req_attr_set,
        .coo_prune = mdc_object_prune,
-       .coo_object_flush = mdc_object_flush
+       .coo_object_flush = mdc_object_flush,
+       .coo_fiemap = mdc_object_fiemap,
 };
 
 static const struct osc_object_operations mdc_object_ops = {
index c0953cc..6eee865 100644 (file)
@@ -2380,35 +2380,36 @@ static int mdc_get_info_rpc(struct obd_export *exp,
                            u32 keylen, void *key,
                            u32 vallen, void *val)
 {
-        struct obd_import      *imp = class_exp2cliimp(exp);
-        struct ptlrpc_request  *req;
-        char                   *tmp;
-        int                     rc = -EINVAL;
-        ENTRY;
+       struct obd_import *imp = class_exp2cliimp(exp);
+       struct ptlrpc_request *req;
+       char *tmp;
+       int rc = -EINVAL;
 
-        req = ptlrpc_request_alloc(imp, &RQF_MDS_GET_INFO);
-        if (req == NULL)
-                RETURN(-ENOMEM);
+       ENTRY;
+
+       req = ptlrpc_request_alloc(imp, &RQF_MDS_FID2PATH);
+       if (req == NULL)
+               RETURN(-ENOMEM);
 
-        req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_KEY,
-                             RCL_CLIENT, keylen);
-        req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VALLEN,
+       req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_KEY,
+                            RCL_CLIENT, keylen);
+       req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VALLEN,
                             RCL_CLIENT, sizeof(vallen));
 
-        rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GET_INFO);
-        if (rc) {
-                ptlrpc_request_free(req);
-                RETURN(rc);
-        }
+       rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GET_INFO);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
 
-        tmp = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_KEY);
-        memcpy(tmp, key, keylen);
-        tmp = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_VALLEN);
+       tmp = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_KEY);
+       memcpy(tmp, key, keylen);
+       tmp = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_VALLEN);
        memcpy(tmp, &vallen, sizeof(vallen));
 
-        req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VAL,
-                             RCL_SERVER, vallen);
-        ptlrpc_request_set_replen(req);
+       req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VAL,
+                            RCL_SERVER, vallen);
+       ptlrpc_request_set_replen(req);
 
        /* if server failed to resolve FID, and OI scrub not able to fix it, it
         * will return -EINPROGRESS, ptlrpc_queue_wait() will keep retrying,
index b398af3..68c32b4 100644 (file)
@@ -7798,49 +7798,55 @@ static int mdt_rpc_fid2path(struct mdt_thread_info *info, void *key, int keylen,
 
+/*
+ * Handle a get-info request by dispatching on the key found in
+ * RMF_GETINFO_KEY: KEY_FID2PATH is served here through mdt_rpc_fid2path(),
+ * KEY_FIEMAP is passed to mdt_fiemap_get(), and any other key is rejected
+ * with err_serious(-EOPNOTSUPP). The reply is packed per key, not up front.
+ */
 int mdt_get_info(struct tgt_session_info *tsi)
 {
-       char    *key;
-       int      keylen;
-       __u32   *vallen;
-       void    *valout;
-       int      rc;
+       char *key;
+       int keylen;
+       int rc;
 
        ENTRY;
 
        key = req_capsule_client_get(tsi->tsi_pill, &RMF_GETINFO_KEY);
-       if (key == NULL) {
-               CDEBUG(D_IOCTL, "No GETINFO key\n");
-               RETURN(err_serious(-EFAULT));
+       if (!key) {
+               DEBUG_REQ(D_IOCTL, tgt_ses_req(tsi), "no GETINFO key");
+               RETURN(err_serious(-EPROTO));
        }
        keylen = req_capsule_get_size(tsi->tsi_pill, &RMF_GETINFO_KEY,
                                      RCL_CLIENT);
-
-       vallen = req_capsule_client_get(tsi->tsi_pill, &RMF_GETINFO_VALLEN);
-       if (vallen == NULL) {
-               CDEBUG(D_IOCTL, "%s: cannot get RMF_GETINFO_VALLEN buffer\n",
-                               tgt_name(tsi->tsi_tgt));
-               RETURN(err_serious(-EFAULT));
-       }
-
-       req_capsule_set_size(tsi->tsi_pill, &RMF_GETINFO_VAL, RCL_SERVER,
-                            *vallen);
-       rc = req_capsule_server_pack(tsi->tsi_pill);
-       if (rc)
-               RETURN(err_serious(rc));
-
-       valout = req_capsule_server_get(tsi->tsi_pill, &RMF_GETINFO_VAL);
-       if (valout == NULL) {
-               CDEBUG(D_IOCTL, "%s: cannot get get-info RPC out buffer\n",
-                               tgt_name(tsi->tsi_tgt));
-               RETURN(err_serious(-EFAULT));
-       }
-
        if (KEY_IS(KEY_FID2PATH)) {
-               struct mdt_thread_info  *info = tsi2mdt_info(tsi);
+               struct mdt_thread_info *info;
+               __u32 *vallen;
+               void *valout;
+
+               /*
+                * RMF_GETINFO_VALLEN is only part of the FID2PATH client
+                * layout, so switch the capsule to that format before
+                * reading the value length from the request.
+                */
+               req_capsule_extend(tsi->tsi_pill, &RQF_MDS_FID2PATH);
+               vallen = req_capsule_client_get(tsi->tsi_pill,
+                                               &RMF_GETINFO_VALLEN);
+               if (!vallen) {
+                       CDEBUG(D_IOCTL,
+                              "%s: cannot get RMF_GETINFO_VALLEN buffer\n",
+                              tgt_name(tsi->tsi_tgt));
+                       RETURN(err_serious(-EPROTO));
+               }
 
+               /* reply value buffer is sized by the client-supplied length */
+               req_capsule_set_size(tsi->tsi_pill, &RMF_GETINFO_VAL,
+                                    RCL_SERVER, *vallen);
+               rc = req_capsule_server_pack(tsi->tsi_pill);
+               if (rc)
+                       RETURN(err_serious(rc));
+
+               valout = req_capsule_server_get(tsi->tsi_pill,
+                                               &RMF_GETINFO_VAL);
+               if (!valout) {
+                       CDEBUG(D_IOCTL,
+                              "%s: cannot get get-info RPC out buffer\n",
+                              tgt_name(tsi->tsi_tgt));
+                       RETURN(-ENOMEM);
+               }
+               info = tsi2mdt_info(tsi);
                rc = mdt_rpc_fid2path(info, key, keylen, valout, *vallen);
                mdt_thread_info_fini(info);
+       } else if (KEY_IS(KEY_FIEMAP)) {
+               rc = mdt_fiemap_get(tsi);
        } else {
-               rc = -EINVAL;
+               rc = err_serious(-EOPNOTSUPP);
        }
        RETURN(rc);
 }
index 6de73df..3cec8b8 100644 (file)
@@ -1420,6 +1420,7 @@ int mdt_obd_commitrw(const struct lu_env *env, int cmd, struct obd_export *exp,
                     ktime_t kstart);
 int mdt_punch_hdl(struct tgt_session_info *tsi);
 int mdt_fallocate_hdl(struct tgt_session_info *tsi);
+int mdt_fiemap_get(struct tgt_session_info *tsi);
 int mdt_glimpse_enqueue(struct mdt_thread_info *mti, struct ldlm_namespace *ns,
                        struct ldlm_lock **lockp, __u64 flags);
 int mdt_brw_enqueue(struct mdt_thread_info *info, struct ldlm_namespace *ns,
index 9f84170..73c7bce 100644 (file)
@@ -1079,6 +1079,120 @@ out:
        return rc;
 }
 
+/*
+ * Check if there is any sparse area in DoM segment.
+ *
+ * Walks the fm_mapped_extents entries of \a fiemap in array order and
+ * reports a hole when an extent starts beyond the end of the previous one
+ * (beyond fm_start for the first extent), or when the last extent ends
+ * before fm_start + fm_length.
+ *
+ * NOTE(review): fm_start + fm_length is evaluated in __u64 without a
+ * wrap-around check; confirm callers never pass a range that overflows.
+ *
+ * \param[in] fiemap   fiemap buffer with fm_mapped_extents valid extents
+ *
+ * \retval             true if part of the requested range is not mapped
+ * \retval             false if the extents cover the whole requested range
+ */
+static bool dom_has_zero_regions(const struct fiemap *fiemap)
+{
+       const struct fiemap_extent *ext = fiemap->fm_extents;
+       __u64 begin = fiemap->fm_start;
+       unsigned int i;
+
+       for (i = 0; i < fiemap->fm_mapped_extents; i++) {
+               /* gap between the range covered so far and this extent */
+               if (ext[i].fe_logical > begin)
+                       return true;
+               begin = ext[i].fe_logical + ext[i].fe_length;
+       }
+       /* is the tail of the requested range left uncovered? */
+       return begin < (fiemap->fm_start + fiemap->fm_length);
+}
+
+/*
+ * Collect the extent mapping of the MDT object identified by \a fid.
+ *
+ * The object is looked up with mdt_object_find() and checked under
+ * mdt_dom_read_lock(): it must exist, be local and be a regular file
+ * before dt_fiemap_get() is called on it. The object reference is
+ * dropped before returning.
+ *
+ * Only used inside this file and not declared in any header, hence static.
+ *
+ * \param[in]     env     execution environment
+ * \param[in]     mdt     MDT device used for the object lookup
+ * \param[in]     fid     FID of the object to map
+ * \param[in,out] fiemap  request on input, passed on to dt_fiemap_get()
+ *
+ * \retval                result of dt_fiemap_get() if the checks pass
+ * \retval                PTR_ERR() of the failed object lookup
+ * \retval                -ENOENT if the object does not exist
+ * \retval                -EREMOTE if the object is remote
+ * \retval                -EBADF if the object is not a regular file
+ */
+static int mdt_dom_fiemap(const struct lu_env *env, struct mdt_device *mdt,
+                         const struct lu_fid *fid, struct fiemap *fiemap)
+{
+       struct mdt_object *mo;
+       int rc;
+
+       ENTRY;
+
+       mo = mdt_object_find(env, mdt, fid);
+       if (IS_ERR(mo))
+               RETURN(PTR_ERR(mo));
+
+       mdt_dom_read_lock(mo);
+       if (!mdt_object_exists(mo))
+               GOTO(out, rc = -ENOENT);
+       if (mdt_object_remote(mo))
+               GOTO(out, rc = -EREMOTE);
+       if (!S_ISREG(lu_object_attr(&mo->mot_obj)))
+               GOTO(out, rc = -EBADF);
+
+       rc = dt_fiemap_get(env, mdt_obj2dt(mo), fiemap);
+out:
+       mdt_dom_read_unlock(mo);
+       lu_object_put(env, &mo->mot_obj);
+       RETURN(rc);
+}
+
+/**
+ * Get FIEMAP (FIle Extent MAPping) for object with the given FID.
+ *
+ * This function returns a list of extents which describes how a file's
+ * blocks are laid out on the disk.
+ *
+ * The request capsule is extended to RQF_OST_GET_INFO_FIEMAP, the reply
+ * buffer is sized from the client's fm_extent_count and the extents are
+ * filled in by mdt_dom_fiemap(). If the client asked for server-side
+ * locking (OBD_FL_SRVLOCK) and the first pass left part of the requested
+ * range unmapped, a PR lock is taken with tgt_mdt_data_lock() and the
+ * mapping is collected once more while it is held (presumably so that
+ * dirty client data is written back first -- confirm against the OST
+ * handler). The lock is released before returning.
+ *
+ * \param[in] tsi      target session environment for this request
+ *
+ * \retval             0 if the reply fiemap is filled with data successfully
+ * \retval             negative value on error
+ */
+int mdt_fiemap_get(struct tgt_session_info *tsi)
+{
+       struct ldlm_namespace *ns = tsi->tsi_tgt->lut_obd->obd_namespace;
+       struct obd_export *exp = tsi->tsi_exp;
+       struct mdt_device *mdt = mdt_dev(exp->exp_obd->obd_lu_dev);
+       struct ll_fiemap_info_key *fm_key;
+       struct obdo *oa;
+       struct fiemap *fiemap;
+       const struct lu_fid *fid;
+       int rlen, rc;
+       bool srvlock;
+
+       ENTRY;
+
+       req_capsule_extend(tsi->tsi_pill, &RQF_OST_GET_INFO_FIEMAP);
+
+       fm_key = req_capsule_client_get(tsi->tsi_pill, &RMF_FIEMAP_KEY);
+       if (!fm_key)
+               RETURN(err_serious(-EPROTO));
+
+       /* reply buffer holds the header plus the requested extent count */
+       rlen = fiemap_count_to_size(fm_key->lfik_fiemap.fm_extent_count);
+       req_capsule_set_size(tsi->tsi_pill, &RMF_FIEMAP_VAL, RCL_SERVER, rlen);
+       rc = req_capsule_server_pack(tsi->tsi_pill);
+       if (rc)
+               RETURN(err_serious(rc));
+
+       fiemap = req_capsule_server_get(tsi->tsi_pill, &RMF_FIEMAP_VAL);
+       if (!fiemap)
+               RETURN(-ENOMEM);
+
+       oa = &fm_key->lfik_oa;
+       rc = tgt_validate_obdo(tsi, oa);
+       if (rc)
+               RETURN(rc);
+
+       fid = &oa->o_oi.oi_fid;
+       /* seed the reply with the client's request header */
+       *fiemap = fm_key->lfik_fiemap;
+
+       CDEBUG(D_INODE, "get FIEMAP of object "DFID"\n", PFID(fid));
+       rc = mdt_dom_fiemap(tsi->tsi_env, mdt, fid, fiemap);
+       if (rc)
+               RETURN(rc);
+
+       srvlock = (exp_connect_flags(exp) & OBD_CONNECT_SRVLOCK) &&
+                 oa->o_valid & OBD_MD_FLFLAGS &&
+                 oa->o_flags & OBD_FL_SRVLOCK;
+
+       if (srvlock && dom_has_zero_regions(fiemap)) {
+               __u64 flg = 0;
+               struct lustre_handle lh = { 0 };
+
+               CDEBUG(D_OTHER, "FIEMAP: lock "DFID" due to sparse areas\n",
+                      PFID(fid));
+               fid_build_reg_res_name(fid, &tsi->tsi_resid);
+               rc = tgt_mdt_data_lock(ns, &tsi->tsi_resid, &lh, LCK_PR, &flg);
+               if (rc)
+                       RETURN(rc);
+
+               rc = mdt_dom_fiemap(tsi->tsi_env, mdt, fid, fiemap);
+               /* drop the lock reference taken above on every path */
+               tgt_data_unlock(&lh, LCK_PR);
+       }
+
+       RETURN(rc);
+}
+
 static int mdt_object_punch(const struct lu_env *env, struct dt_device *dt,
                            struct dt_object *dob, __u64 start, __u64 end,
                            struct lu_attr *la)
@@ -2012,3 +2126,4 @@ void mdt_dom_discard_data(struct mdt_thread_info *info,
 
        RETURN_EXIT;
 }
+
index 7eb7716..7bfbae3 100644 (file)
@@ -404,6 +404,11 @@ static const struct req_msg_field *ost_grant_shrink_client[] = {
 static const struct req_msg_field *mds_getinfo_client[] = {
        &RMF_PTLRPC_BODY,
        &RMF_GETINFO_KEY,
+};
+
+/*
+ * Client-side fields of a FID2PATH get-info request: the generic get-info
+ * key followed by the length of the value buffer the client expects back.
+ */
+static const struct req_msg_field *mds_fid2path_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_GETINFO_KEY,
        &RMF_GETINFO_VALLEN
 };
 
@@ -823,6 +828,7 @@ static struct req_format *req_formats[] = {
        &RQF_MDS_CONNECT,
        &RQF_MDS_DISCONNECT,
        &RQF_MDS_GET_INFO,
+       &RQF_MDS_FID2PATH,
        &RQF_MDS_GET_ROOT,
        &RQF_MDS_STATFS,
        &RQF_MDS_STATFS_NEW,
@@ -1588,6 +1594,11 @@ struct req_format RQF_MDS_GET_INFO =
                        mds_getinfo_server);
 EXPORT_SYMBOL(RQF_MDS_GET_INFO);
 
+/*
+ * FID2PATH flavour of get-info: client layout from mds_fid2path_client,
+ * server layout shared with RQF_MDS_GET_INFO (mds_getinfo_server).
+ */
+struct req_format RQF_MDS_FID2PATH =
+       DEFINE_REQ_FMT0("MDS_FID2PATH", mds_fid2path_client,
+                       mds_getinfo_server);
+EXPORT_SYMBOL(RQF_MDS_FID2PATH);
+
 struct req_format RQF_MDS_BATCH =
        DEFINE_REQ_FMT0("MDS_BATCH", mds_batch_client,
                        mds_batch_server);
index 7c47980..d3cda5d 100755 (executable)
@@ -16821,6 +16821,66 @@ test_130h() {
 }
 run_test 130h "FIEMAP deadlock"
 
+# FIEMAP on a composite file whose first 1M component is DoM and whose
+# second component is striped over two OSTs. The file is written sparsely
+# and filefrag must report three devices with the same mapped length each.
+test_130i() {
+       (( $MDS1_VERSION >= $(version_code 2.15.63.195) )) ||
+               skip "Need MDS version at least 2.15.63.195 for DoM support"
+       local filefrag_op=$(filefrag -l 2>&1 | grep "invalid option")
+       [[ -z "$filefrag_op" ]] || skip_env "filefrag missing Lustre support"
+       [[ "$ost1_FSTYPE" != "zfs" ]] ||
+               skip "LU-1941: FIEMAP unimplemented on ZFS"
+       # the DoM component is stored on the MDT, so its backend matters too
+       [[ "$mds1_FSTYPE" != "zfs" ]] ||
+               skip "LU-1941: FIEMAP unimplemented on ZFS"
+
+       local dom_file=$DIR/$tfile
+
+       stack_trap "rm -f $dom_file"
+
+       $LFS setstripe -E 1M -L mdt -E -1 -c2 -S 131072 -o1,0 $dom_file ||
+               error "setstripe on $dom_file"
+
+       # 384 4k blocks at every second 4k offset cover 3M of file space:
+       # 128 blocks (512K) land on the MDT and 128 on each of the two OSTs
+       local blks=$((128 * 3))
+       local expected=$(((blks / 3) * 4))
+       local i
+
+       for ((i = 0; i < $blks; i++)); do
+               dd if=/dev/zero of=$dom_file count=1 bs=4k seek=$((2 * i)) \
+                       conv=notrunc > /dev/null 2>&1 ||
+                       error "dd failed to $dom_file"
+       done
+
+       filefrag -ves $dom_file | (head -7; echo ; tail -5)
+       (( ! ${PIPESTATUS[0]} )) || error "filefrag $dom_file failed"
+
+       # keep only the extent lines between the "ext:" header and the summary
+       filefrag_op=$(filefrag -ve -k $dom_file |
+                     sed -n '/ext:/,/found/{/ext:/d; /found/d; p}')
+
+       # colon-separated field 5 is used as the device number and field 4
+       # as the extent length
+       local last_lun=$(echo $filefrag_op | cut -d: -f5)
+       local lun_len=0
+       local num_luns=1
+       local line
+
+       while IFS=$'\n' read -r line; do
+               local frag_lun=$(echo $line | cut -d: -f5)
+               local ext_len=$(echo $line | cut -d: -f4)
+
+               if (( $frag_lun != $last_lun )); then
+                       if (( lun_len != expected )); then
+                               error "dev #$last_lun: $lun_len != $expected"
+                       else
+                               (( num_luns += 1 ))
+                               lun_len=0
+                       fi
+               fi
+               (( lun_len += ext_len ))
+               last_lun=$frag_lun
+       done <<< "$filefrag_op"
+
+       if (( num_luns != 3 )); then
+               error "num devices: $num_luns, but 3 expected"
+       fi
+       if (( lun_len != expected )); then
+               error "dev #$last_lun: $lun_len != $expected"
+       fi
+       echo "FIEMAP on DoM file succeeded"
+}
+run_test 130i "FIEMAP (DoM file)"
+
 # Test for writev/readv
 test_131a() {
        rwv -f $DIR/$tfile -w -n 3 524288 1048576 1572864 ||