Whamcloud - gitweb
EX-9895 osc: preserve compressed pages for OST_WRITE replay
author: Alex Zhuravlev <bzzz@whamcloud.com>
Wed, 5 Jun 2024 16:43:51 +0000 (19:43 +0300)
committer: Andreas Dilger <adilger@whamcloud.com>
Wed, 19 Jun 2024 05:34:21 +0000 (05:34 +0000)
It is incorrect to release compressed pages right after the reply,
as we may need to resend them during OST_WRITE replay.

Test-Parameters: env=ONLY=1081 testlist=sanity-compr
Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
Change-Id: I3edc16d6556ddd60735d2f14fe879fc0f45231d7
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/55323
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Sergey Cheremencev <scherementsev@ddn.com>
Reviewed-by: Zhenyu Xu <bobijam@hotmail.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
lustre/include/lustre_net.h
lustre/include/lustre_osc.h
lustre/osc/osc_request.c
lustre/tests/sanity-compr.sh

index 38ad79e..f19675e 100644 (file)
@@ -567,7 +567,7 @@ union ptlrpc_async_args {
         * a pointer to it here.  The pointer_arg ensures this struct is at
         * least big enough for that.
         */
-       void    *pointer_arg[11];
+       void    *pointer_arg[12];
        __u64   space[7];
 };
 
index 5b30224..0efe93a 100644 (file)
@@ -567,6 +567,7 @@ struct osc_brw_async_args {
        struct client_obd       *aa_cli;
        struct list_head         aa_oaps;
        struct list_head         aa_exts;
+       atomic_t                 aa_ppga_refc;
 };
 
 extern struct kmem_cache *osc_lock_kmem;
index 1738717..580f898 100644 (file)
@@ -2064,9 +2064,11 @@ no_bulk:
        ptlrpc_request_set_replen(req);
 
        aa = ptlrpc_req_async_args(aa, req);
+       atomic_set(&aa->aa_ppga_refc, 0);
        if (opc == OST_WRITE &&  compressed) {
                aa->aa_ncpage_count = ncpcount;
                aa->aa_ncppga = ncpga;
+               atomic_set(&aa->aa_ppga_refc, 2);
        }
 
        aa->aa_oa = oa;
@@ -2788,13 +2790,14 @@ static int brw_interpret(const struct lu_env *env,
                       aa->aa_requested_nob :
                       req->rq_bulk->bd_nob_transferred);
 
-       if (aa->aa_ncppga) {
-               osc_release_ppga(aa->aa_ncppga, aa->aa_ncpage_count);
+       if (aa->aa_ncpage_count) {
                /*
                 * allocated aa_ncpage_count for ppga and used
                 * only aa_page_count
                 */
-               free_cpga(aa->aa_ppga, aa->aa_ncpage_count);
+               osc_release_ppga(aa->aa_ncppga, aa->aa_ncpage_count);
+               if (atomic_dec_and_test(&aa->aa_ppga_refc))
+                       free_cpga(aa->aa_ppga, aa->aa_ncpage_count);
        } else {
                osc_release_ppga(aa->aa_ppga, aa->aa_page_count);
        }
@@ -2818,6 +2821,12 @@ static int brw_interpret(const struct lu_env *env,
 
 static void brw_commit(struct ptlrpc_request *req)
 {
+       struct osc_brw_async_args *aa = NULL;
+
+       aa = ptlrpc_req_async_args(aa, req);
+       if (aa->aa_ncpage_count && atomic_dec_and_test(&aa->aa_ppga_refc))
+               free_cpga(aa->aa_ppga, aa->aa_ncpage_count);
+
        /* If osc_inc_unstable_pages (via osc_extent_finish) races with
         * this called via the rq_commit_cb, I need to ensure
         * osc_dec_unstable_pages is still called. Otherwise unstable
index b2b54ab..17d8d1d 100644 (file)
@@ -1628,6 +1628,64 @@ test_1080() {
 }
 run_test 1080 "Compression header error tolerance"
 
+test_1081() {
+       which dbench > /dev/null 2>&1 || skip_env "No dbench installed"
+
+       local DBENCHDIR=$DIR/$tdir
+       mkdir -p $DBENCHDIR
+       stack_trap "rm -rf $DBENCHDIR"
+
+       local SPACE=`df -P $MOUNT | tail -n 1 | awk '{ print $4 }'`
+       DB_THREADS=$((SPACE / 50000))
+       (( $THREADS < $DB_THREADS )) && DB_THREADS=$THREADS
+
+       myUID=$RUNAS_ID
+       myGID=$RUNAS_GID
+       myRUNAS=$RUNAS
+       FAIL_ON_ERROR=false check_runas_id_ret $myUID $myGID $myRUNAS ||
+               { myRUNAS="" && myUID=$UID && myGID=`id -g $USER`; }
+       chown $myUID:$myGID $DBENCHDIR
+
+       # enable cache in OSD to help with false failures
+       # when we skip writes after replay_barrier, but keep reading
+       # for read-modify-write operations
+       readcache=$(do_facet ost1 $LCTL get_param -n osd*.*OST0000.read_cache_enable)
+       writecache=$(do_facet ost1 $LCTL get_param -n osd*.*OST0000.writethrough_cache_enable)
+       stack_trap "do_facet ost1 $LCTL set_param osd*.*OST0000.writethrough_cache_enable=$writecache"
+       stack_trap "do_facet ost1 $LCTL set_param osd*.*OST0000.read_cache_enable=$readcache"
+       do_facet ost1 $LCTL set_param osd*.*OST0000*.writethrough_cache_enable=1
+       do_facet ost1 $LCTL set_param osd*.*OST0000*.read_cache_enable=1
+
+       local pids=""
+       local duration="120"
+       [[ "$SLOW" == "yes" ]] && duration="240"
+       for ((i = 0; i < 2; i++)); do
+               mkdir -p $DBENCHDIR/$i
+               $LFS setstripe -c -1 -Eeof -Z lz4:0 --compress-chunk=64 \
+                       $DBENCHDIR/$i || error "can't set striping"
+               chown $myUID:$myGID $DBENCHDIR/$i
+
+               $myRUNAS bash rundbench -D $DBENCHDIR/$i $DB_THREADS -t $duration &
+               pids="$pids $!"
+       done
+
+       local start=$SECONDS
+       while (( (SECONDS - start) < duration )); do
+               sleep 1
+               replay_barrier ost1
+               sleep 0.5
+               fail ost1
+       done
+
+       for i in $pids; do
+               wait $i
+               rc=$?
+               echo "$i finished with $rc"
+       done
+       return $rc
+}
+run_test 1081 "failover dbench"
+
 test_1121() {
        (( $OST1_VERSION >= $(version_code 2.14.0-ddn152) )) ||
                skip "Need OST >= 2.14.0-ddn152 for get compr statistics"