Whamcloud - gitweb
LU-11974 llapi: improve llapi_layout_get_by_xattr(3) API
[fs/lustre-release.git] / lustre / utils / liblustreapi_layout.c
index e85b39d..b967356 100644 (file)
@@ -60,6 +60,7 @@ struct llapi_layout_comp {
        struct lu_extent        llc_extent;     /* [start, end) of component */
        uint32_t                llc_id;         /* unique ID of component */
        uint32_t                llc_flags;      /* LCME_FL_* flags */
+       uint64_t                llc_timestamp;  /* snapshot timestamp */
        struct list_head        llc_list;       /* linked to the llapi_layout
                                                   components list */
 };
@@ -148,6 +149,7 @@ llapi_layout_swab_lov_user_md(struct lov_user_md *lum, int lum_size)
                        ent = &comp_v1->lcm_entries[i];
                        __swab32s(&ent->lcme_id);
                        __swab32s(&ent->lcme_flags);
+                       __swab64s(&ent->lcme_timestamp);
                        __swab64s(&ent->lcme_extent.e_start);
                        __swab64s(&ent->lcme_extent.e_end);
                        __swab32s(&ent->lcme_offset);
@@ -355,28 +357,134 @@ struct llapi_layout *llapi_layout_alloc(void)
 }
 
 /**
+ * Check if the given \a lum_size is large enough to hold the required
+ * fields in \a lum.
+ *
+ * The magic is matched against both the host-order and the byte-swapped
+ * LOV_MAGIC_* values, so this check does not depend on \a lum having been
+ * converted to host byte order first.
+ *
+ * Only a minimum size is enforced: lov_user_md_size(0, magic) for
+ * LOV_MAGIC_V1 and LOV_MAGIC_V3, and sizeof(struct lov_comp_md_v1) for
+ * LOV_MAGIC_COMP_V1. Nothing beyond that minimum is examined here.
+ * A buffer too small to contain lmm_magic, or one whose magic is not
+ * recognized, is reported as truncated as well.
+ *
+ * \param[in] lum      the struct lov_user_md to check
+ * \param[in] lum_size the number of bytes in \a lum
+ *
+ * \retval true                the \a lum_size is too small
+ * \retval false       the \a lum_size is large enough
+ */
+static bool llapi_layout_lum_truncated(struct lov_user_md *lum, size_t lum_size)
+{
+       uint32_t magic;
+
+       if (lum_size < sizeof(lum->lmm_magic)) /* cannot even read the magic */
+               return true;
+
+       if (lum->lmm_magic == LOV_MAGIC_V1 ||
+           lum->lmm_magic == __swab32(LOV_MAGIC_V1))
+               magic = LOV_MAGIC_V1;
+       else if (lum->lmm_magic == LOV_MAGIC_V3 ||
+                lum->lmm_magic == __swab32(LOV_MAGIC_V3))
+               magic = LOV_MAGIC_V3;
+       else if (lum->lmm_magic == LOV_MAGIC_COMP_V1 ||
+                lum->lmm_magic == __swab32(LOV_MAGIC_COMP_V1))
+               magic = LOV_MAGIC_COMP_V1;
+       else
+               return true; /* unrecognized magic is treated as truncated */
+
+       if (magic == LOV_MAGIC_V1 || magic == LOV_MAGIC_V3)
+               return lum_size < lov_user_md_size(0, magic);
+       else
+               return lum_size < sizeof(struct lov_comp_md_v1);
+}
+
+/* Verify if the objects count in lum is consistent with the
+ * stripe count in lum. It applies to regular file only.
+ *
+ * The magic is compared against the host-order LOV_MAGIC_* values only,
+ * so \a lum must already be in host byte order when this is called.
+ *
+ * For LOV_MAGIC_V1 and LOV_MAGIC_V3 the object count reported by
+ * llapi_layout_objects_in_lum() must equal lmm_stripe_count.
+ * For LOV_MAGIC_COMP_V1 each of the lcm_entry_count entries is visited
+ * through its lcme_offset and lcme_size, and an entry that does not have
+ * LCME_FL_INIT set must report zero objects; entries that have
+ * LCME_FL_INIT set are accepted without a further check.
+ *
+ * NOTE(review): lcme_offset and lcme_size are not validated against
+ * \a lum_size in this function -- confirm that is done elsewhere.
+ *
+ * \param[in] lum      the struct lov_user_md to check
+ * \param[in] lum_size the number of bytes in \a lum
+ *
+ * \retval true                every checked count is consistent
+ * \retval false       the magic is unknown or a count is inconsistent
+ */
+static bool llapi_layout_lum_valid(struct lov_user_md *lum, int lum_size)
+{
+       struct lov_comp_md_v1 *comp_v1 = NULL;
+       int i, ent_count, obj_count;
+
+       if (lum->lmm_magic == LOV_MAGIC_COMP_V1) {
+               comp_v1 = (struct lov_comp_md_v1 *)lum;
+               ent_count = comp_v1->lcm_entry_count;
+       } else if (lum->lmm_magic == LOV_MAGIC_V1 ||
+                  lum->lmm_magic == LOV_MAGIC_V3) {
+               ent_count = 1; /* a plain layout is handled as one entry */
+       } else {
+               return false;
+       }
+
+       for (i = 0; i < ent_count; i++) {
+               if (comp_v1) {
+                       /* point lum/lum_size at this component's own blob */
+                       lum = (struct lov_user_md *)((char *)comp_v1 +
+                               comp_v1->lcm_entries[i].lcme_offset);
+                       lum_size = comp_v1->lcm_entries[i].lcme_size;
+               }
+               obj_count = llapi_layout_objects_in_lum(lum, lum_size);
+
+               if (comp_v1) {
+                       /* a component without LCME_FL_INIT must be empty */
+                       if (!(comp_v1->lcm_entries[i].lcme_flags &
+                                LCME_FL_INIT) && obj_count != 0)
+                               return false;
+               } else if (obj_count != lum->lmm_stripe_count) {
+                       return false;
+               }
+       }
+       return true;
+}
+
+/**
  * Convert the data from a lov_user_md to a newly allocated llapi_layout.
  * The caller is responsible for freeing the returned pointer.
  *
- * \param[in] lum      LOV user metadata structure to copy data from
- * \param[in] lum_size size the the lum passed in
+ * \param[in] lov_xattr                LOV user metadata xattr to copy data from
+ * \param[in] lov_xattr_size   size of the \a lov_xattr buffer, in bytes
+ * \param[in] flags            bitwise-or'd flags to control the behavior
  *
  * \retval             valid llapi_layout pointer on success
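- * \retval             NULL if memory allocation fails
+ * \retval             NULL on failure: invalid arguments, a truncated or
+ *                     inconsistent xattr, an unsupported layout magic,
+ *                     or a memory allocation failure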
  */
-static struct llapi_layout *
-llapi_layout_from_lum(const struct lov_user_md *lum, int lum_size)
+struct llapi_layout *llapi_layout_get_by_xattr(void *lov_xattr,
+                                              ssize_t lov_xattr_size,
+                                              uint32_t flags)
 {
+       struct lov_user_md *lum = lov_xattr;
        struct lov_comp_md_v1 *comp_v1 = NULL;
        struct lov_comp_md_entry_v1 *ent;
        struct lov_user_md *v1;
-       struct llapi_layout *layout;
+       struct llapi_layout *layout = NULL;
        struct llapi_layout_comp *comp;
        int i, ent_count = 0, obj_count;
 
-       layout = __llapi_layout_alloc();
-       if (layout == NULL)
+       if (lov_xattr == NULL || lov_xattr_size <= 0) {
+               errno = EINVAL;
                return NULL;
+       }
+
+       /* Return an error if we got back a partial layout. */
+       if (llapi_layout_lum_truncated(lov_xattr, lov_xattr_size)) {
+               errno = ERANGE;
+               return NULL;
+       }
+
+#if __BYTE_ORDER == __BIG_ENDIAN
+       if (flags & LLAPI_LXF_COPY) {
+               lum = malloc(lov_xattr_size);
+               if (lum == NULL) {
+                       errno = ENOMEM;
+                       return NULL;
+               }
+               memcpy(lum, lov_xattr, lov_xattr_size);
+       }
+#endif
+
+       llapi_layout_swab_lov_user_md(lum, lov_xattr_size);
+
+       if ((flags & LLAPI_LXF_CHECK) &&
+           !llapi_layout_lum_valid(lum, lov_xattr_size)) {
+               errno = EBADSLT;
+               goto out;
+       }
+
+       layout = __llapi_layout_alloc();
+       if (layout == NULL) {
+               errno = ENOMEM;
+               goto out;
+       }
 
        if (lum->lmm_magic == LOV_MAGIC_COMP_V1) {
                comp_v1 = (struct lov_comp_md_v1 *)lum;
@@ -390,11 +498,19 @@ llapi_layout_from_lum(const struct lov_user_md *lum, int lum_size)
                   lum->lmm_magic == LOV_MAGIC_V3) {
                ent_count = 1;
                layout->llot_is_composite = false;
+
+               if (lov_xattr_size <= 0) {
+                       errno = EINVAL;
+                       goto out_layout;
+               }
+       } else {
+               errno = EOPNOTSUPP;
+               goto out_layout;
        }
 
        if (ent_count == 0) {
                errno = EINVAL;
-               goto error;
+               goto out_layout;
        }
 
        v1 = (struct lov_user_md *)lum;
@@ -403,21 +519,23 @@ llapi_layout_from_lum(const struct lov_user_md *lum, int lum_size)
                        ent = &comp_v1->lcm_entries[i];
                        v1 = (struct lov_user_md *)((char *)comp_v1 +
                                ent->lcme_offset);
-                       lum_size = ent->lcme_size;
+                       lov_xattr_size = ent->lcme_size;
                } else {
                        ent = NULL;
                }
 
-               obj_count = llapi_layout_objects_in_lum(v1, lum_size);
+               obj_count = llapi_layout_objects_in_lum(v1, lov_xattr_size);
                comp = __llapi_comp_alloc(obj_count);
                if (comp == NULL)
-                       goto error;
+                       goto out_layout;
 
                if (ent != NULL) {
                        comp->llc_extent.e_start = ent->lcme_extent.e_start;
                        comp->llc_extent.e_end = ent->lcme_extent.e_end;
                        comp->llc_id = ent->lcme_id;
                        comp->llc_flags = ent->lcme_flags;
+                       if (comp->llc_flags & LCME_FL_NOSYNC)
+                               comp->llc_timestamp = ent->lcme_timestamp;
                } else {
                        comp->llc_extent.e_start = 0;
                        comp->llc_extent.e_end = LUSTRE_EOF;
@@ -472,10 +590,14 @@ llapi_layout_from_lum(const struct lov_user_md *lum, int lum_size)
                layout->llot_cur_comp = comp;
        }
 
+out:
+       if (lum != lov_xattr)
+               free(lum);
        return layout;
-error:
+out_layout:
        llapi_layout_free(layout);
-       return NULL;
+       layout = NULL;
+       goto out;
 }
 
 /**
@@ -622,6 +744,8 @@ llapi_layout_to_lum(const struct llapi_layout *layout)
                        ent = &comp_v1->lcm_entries[ent_idx];
                        ent->lcme_id = comp->llc_id;
                        ent->lcme_flags = comp->llc_flags;
+                       if (ent->lcme_flags & LCME_FL_NOSYNC)
+                               ent->lcme_timestamp = comp->llc_timestamp;
                        ent->lcme_extent.e_start = comp->llc_extent.e_start;
                        ent->lcme_extent.e_end = comp->llc_extent.e_end;
                        ent->lcme_size = blob_size;
@@ -749,77 +873,6 @@ static bool is_any_specified(const struct llapi_layout *layout)
 }
 
 /**
- * Check if the given \a lum_size is large enough to hold the required
- * fields in \a lum.
- *
- * \param[in] lum      the struct lov_user_md to check
- * \param[in] lum_size the number of bytes in \a lum
- *
- * \retval true                the \a lum_size is too small
- * \retval false       the \a lum_size is large enough
- */
-static bool llapi_layout_lum_truncated(struct lov_user_md *lum, size_t lum_size)
-{
-       uint32_t magic;
-
-       if (lum_size < sizeof(lum->lmm_magic))
-               return true;
-
-       if (lum->lmm_magic == LOV_MAGIC_V1 ||
-           lum->lmm_magic == __swab32(LOV_MAGIC_V1))
-               magic = LOV_MAGIC_V1;
-       else if (lum->lmm_magic == LOV_MAGIC_V3 ||
-                lum->lmm_magic == __swab32(LOV_MAGIC_V3))
-               magic = LOV_MAGIC_V3;
-       else if (lum->lmm_magic == LOV_MAGIC_COMP_V1 ||
-                lum->lmm_magic == __swab32(LOV_MAGIC_COMP_V1))
-               magic = LOV_MAGIC_COMP_V1;
-       else
-               return true;
-
-       if (magic == LOV_MAGIC_V1 || magic == LOV_MAGIC_V3)
-               return lum_size < lov_user_md_size(0, magic);
-       else
-               return lum_size < sizeof(struct lov_comp_md_v1);
-}
-
-/* Verify if the objects count in lum is consistent with the
- * stripe count in lum. It applies to regular file only. */
-static bool llapi_layout_lum_valid(struct lov_user_md *lum, int lum_size)
-{
-       struct lov_comp_md_v1 *comp_v1 = NULL;
-       int i, ent_count, obj_count;
-
-       if (lum->lmm_magic == LOV_MAGIC_COMP_V1) {
-               comp_v1 = (struct lov_comp_md_v1 *)lum;
-               ent_count = comp_v1->lcm_entry_count;
-       } else if (lum->lmm_magic == LOV_MAGIC_V1 ||
-                  lum->lmm_magic == LOV_MAGIC_V3) {
-               ent_count = 1;
-       } else {
-               return false;
-       }
-
-       for (i = 0; i < ent_count; i++) {
-               if (comp_v1) {
-                       lum = (struct lov_user_md *)((char *)comp_v1 +
-                               comp_v1->lcm_entries[i].lcme_offset);
-                       lum_size = comp_v1->lcm_entries[i].lcme_size;
-               }
-               obj_count = llapi_layout_objects_in_lum(lum, lum_size);
-
-               if (comp_v1) {
-                       if (!(comp_v1->lcm_entries[i].lcme_flags &
-                                LCME_FL_INIT) && obj_count != 0)
-                               return false;
-               } else if (obj_count != lum->lmm_stripe_count) {
-                       return false;
-               }
-       }
-       return true;
-}
-
-/**
  * Get the striping layout for the file referenced by file descriptor \a fd.
  *
  * If the filesystem does not support the "lustre." xattr namespace, the
@@ -858,14 +911,6 @@ struct llapi_layout *llapi_layout_get_by_fd(int fd, uint32_t flags)
                goto out;
        }
 
-       /* Return an error if we got back a partial layout. */
-       if (llapi_layout_lum_truncated(lum, bytes_read)) {
-               errno = EINTR;
-               goto out;
-       }
-
-       llapi_layout_swab_lov_user_md(lum, bytes_read);
-
        /* Directories may have a positive non-zero lum->lmm_stripe_count
         * yet have an empty lum->lmm_objects array. For non-directories the
         * amount of data returned from the kernel must be consistent
@@ -873,12 +918,8 @@ struct llapi_layout *llapi_layout_get_by_fd(int fd, uint32_t flags)
        if (fstat(fd, &st) < 0)
                goto out;
 
-       if (!S_ISDIR(st.st_mode) && !llapi_layout_lum_valid(lum, bytes_read)) {
-               errno = EINTR;
-               goto out;
-       }
-
-       layout = llapi_layout_from_lum(lum, bytes_read);
+       layout = llapi_layout_get_by_xattr(lum, bytes_read,
+               S_ISDIR(st.st_mode) ? 0 : LLAPI_LXF_CHECK);
 out:
        free(lum);
        return layout;
@@ -2457,6 +2498,10 @@ int llapi_mirror_find_stale(struct llapi_layout *layout,
                        /* not in the specified mirror */
                        if (j == ids_nr)
                                goto next;
+               } else if (flags & LCME_FL_NOSYNC) {
+                       /* if no mirrors were specified, do not resync
+                        * "nosync" mirrors */
+                       goto next;
                }
 
                rc = llapi_layout_comp_id_get(layout, &id);
@@ -2491,9 +2536,9 @@ error:
 }
 
 /* locate @layout to a valid component covering file [file_start, file_end) */
-static uint32_t llapi_mirror_find(struct llapi_layout *layout,
-                                 uint64_t file_start, uint64_t file_end,
-                                 uint64_t *endp)
+uint32_t llapi_mirror_find(struct llapi_layout *layout,
+                          uint64_t file_start, uint64_t file_end,
+                          uint64_t *endp)
 {
        uint32_t mirror_id = 0;
        int rc;
@@ -2546,12 +2591,21 @@ static uint32_t llapi_mirror_find(struct llapi_layout *layout,
        return mirror_id;
 }
 
-ssize_t llapi_mirror_resync_one(int fd, struct llapi_layout *layout,
-                               uint32_t dst, uint64_t start, uint64_t end)
+int llapi_mirror_resync_many(int fd, struct llapi_layout *layout,
+                            struct llapi_resync_comp *comp_array,
+                            int comp_size,  uint64_t start, uint64_t end)
 {
-       uint64_t mirror_end = 0;
-       ssize_t result = 0;
-       size_t count;
+       uint64_t count;
+       size_t page_size = sysconf(_SC_PAGESIZE);
+       const size_t buflen = 4 << 20; /* 4M */
+       void *buf;
+       uint64_t pos = start;
+       int i;
+       int rc;
+
+       rc = posix_memalign(&buf, page_size, buflen);
+       if (rc)
+               return -rc;
 
        if (end == OBD_OBJECT_EOF)
                count = OBD_OBJECT_EOF;
@@ -2560,30 +2614,99 @@ ssize_t llapi_mirror_resync_one(int fd, struct llapi_layout *layout,
 
        while (count > 0) {
                uint32_t src;
-               size_t to_copy;
-               ssize_t copied;
+               uint64_t mirror_end = 0;
+               uint64_t bytes_left;
+               ssize_t bytes_read;
+               size_t to_read;
+               size_t to_write;
 
-               src = llapi_mirror_find(layout, start, end, &mirror_end);
+               src = llapi_mirror_find(layout, pos, end, &mirror_end);
                if (src == 0)
                        return -ENOENT;
 
-               if (mirror_end == OBD_OBJECT_EOF)
-                       to_copy = count;
-               else
-                       to_copy = MIN(count, mirror_end - start);
-
-               copied = llapi_mirror_copy(fd, src, dst, start, to_copy);
-               if (copied < 0)
-                       return copied;
+               if (mirror_end == OBD_OBJECT_EOF) {
+                       bytes_left = count;
+               } else {
+                       bytes_left = MIN(count, mirror_end - pos);
+                       bytes_left = ((bytes_left - 1) | (page_size - 1)) + 1;
+               }
+               to_read = MIN(buflen, bytes_left);
 
-               result += copied;
-               if (copied < to_copy) /* end of file */
+               bytes_read = llapi_mirror_read(fd, src, buf, to_read, pos);
+               if (bytes_read == 0) {
+                       /* end of file */
+                       break;
+               }
+               if (bytes_read < 0) {
+                       rc = bytes_read;
                        break;
+               }
+
+               /* round up to a page boundary to keep direct I/O happy. */
+               to_write = ((bytes_read - 1) | (page_size - 1)) + 1;
+
+               for (i = 0; i < comp_size; i++) {
+                       ssize_t written;
+                       off_t pos2 = pos;
+                       size_t to_write2 = to_write;
+
+                       /* skip non-overlapped component */
+                       if (pos >= comp_array[i].lrc_end ||
+                           pos + to_write <= comp_array[i].lrc_start)
+                               continue;
+
+                       if (pos < comp_array[i].lrc_start)
+                               pos2 = comp_array[i].lrc_start;
+
+                       to_write2 -= pos2 - pos;
+
+                       if ((pos + to_write) > comp_array[i].lrc_end)
+                               to_write2 -= pos + to_write -
+                                            comp_array[i].lrc_end;
+
+                       written = llapi_mirror_write(fd,
+                                       comp_array[i].lrc_mirror_id,
+                                       buf + pos2 - pos,
+                                       to_write2, pos2);
+                       if (written < 0) {
+                               /**
+                                * This component was not written successfully.
+                                * Record that by setting its lrc_synced flag,
+                                * which is expected to be false on entry, so
+                                * inside this loop "true" means "write failed".
+                                *
+                                * On the success path, before this function
+                                * returns, the lrc_synced flag of every
+                                * comp_array element is inverted so that it
+                                * regains its real meaning.
+                                */
+                               comp_array[i].lrc_synced = true;
+                               continue;
+                       }
+                       assert(written == to_write2);
+               }
 
-               if (count != OBD_OBJECT_EOF)
-                       count -= copied;
-               start += copied;
+               pos += bytes_read;
+               count -= bytes_read;
        }
 
-       return result;
+       free(buf);
+
+       if (rc < 0) {
+               for (i = 0; i < comp_size; i++)
+                       comp_array[i].lrc_synced = false;
+               return rc;
+       }
+
+       for (i = 0; i < comp_size; i++) {
+               comp_array[i].lrc_synced = !comp_array[i].lrc_synced;
+               if (comp_array[i].lrc_synced && pos & (page_size - 1)) {
+                       rc = llapi_mirror_truncate(fd,
+                                       comp_array[i].lrc_mirror_id, pos);
+                       if (rc < 0)
+                               comp_array[i].lrc_synced = false;
+               }
+       }
+
+       /* a partial success is still reported as success */
+       return 0;
 }