Whamcloud - gitweb
LU-17204 lod: don't panic on short LOVEA 27/52727/3
author Alex Zhuravlev <bzzz@whamcloud.com>
Tue, 17 Oct 2023 12:21:18 +0000 (15:21 +0300)
committer Oleg Drokin <green@whamcloud.com>
Wed, 8 Nov 2023 22:05:15 +0000 (22:05 +0000)
When we request the LOVEA and find that the existing buffer is too small,
we ask for the LOVEA's size and reallocate the buffer. But the LOVEA can
shrink in parallel (e.g. due to a new default striping), so our
expectation that the size must be greater than the size of the existing
buffer is not correct. Replace the corresponding assertion with a simple
retry, plus an extra check for a livelock.

Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
Change-Id: I26ad5091228bf78858f8538478dbcbdb235cddf4
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/52727
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Mikhail Pershin <mpershin@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
lustre/lod/lod_lov.c

index ca90629..e99082f 100644 (file)
@@ -458,7 +458,6 @@ int lod_ea_store_resize(struct lod_thread_info *info, size_t size)
 
        if (info->lti_ea_store) {
                LASSERT(info->lti_ea_store_size);
-               LASSERT(info->lti_ea_store_size < round);
                CDEBUG(D_INFO, "EA store size %d is not enough, need %d\n",
                       info->lti_ea_store_size, round);
                OBD_FREE_LARGE(info->lti_ea_store, info->lti_ea_store_size);
@@ -1042,7 +1041,7 @@ int lod_get_ea(const struct lu_env *env, struct lod_object *lo,
 {
        struct lod_thread_info  *info = lod_env_info(env);
        struct dt_object        *next = dt_object_child(&lo->ldo_obj);
-       int                     rc;
+       int rc, count = 0;
        ENTRY;
 
        LASSERT(info);
@@ -1070,6 +1069,11 @@ repeat:
                        RETURN(rc);
 
                LASSERT(rc > 0);
+               if (rc <= info->lti_ea_store_size) {
+                       /* sometimes LOVEA can shrink in parallel */
+                       LASSERT(count++ < 10);
+                       goto repeat;
+               }
                rc = lod_ea_store_resize(info, rc);
                if (rc)
                        RETURN(rc);