Whamcloud - gitweb
LU-7593 target: umount vs tgt_last_rcvd_update deadlock
[fs/lustre-release.git] / lustre / target / tgt_lastrcvd.c
index d1afc9d..55e5995 100644 (file)
@@ -27,7 +27,7 @@
  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
  *
- * Copyright (c) 2011, 2014, Intel Corporation.
+ * Copyright (c) 2011, 2015, Intel Corporation.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
@@ -50,7 +50,8 @@ static int tgt_bitmap_chunk_alloc(struct lu_target *lut, int chunk)
 {
        unsigned long *bm;
 
-       OBD_ALLOC(bm, BITS_TO_LONGS(LUT_REPLY_SLOTS_PER_CHUNK) * sizeof(long));
+       OBD_ALLOC_LARGE(bm, BITS_TO_LONGS(LUT_REPLY_SLOTS_PER_CHUNK) *
+                       sizeof(long));
        if (bm == NULL)
                return -ENOMEM;
 
@@ -59,7 +60,7 @@ static int tgt_bitmap_chunk_alloc(struct lu_target *lut, int chunk)
        if (lut->lut_reply_bitmap[chunk] != NULL) {
                /* someone else already allocated the bitmap for this chunk */
                spin_unlock(&lut->lut_client_bitmap_lock);
-               OBD_FREE(bm, BITS_TO_LONGS(LUT_REPLY_SLOTS_PER_CHUNK) *
+               OBD_FREE_LARGE(bm, BITS_TO_LONGS(LUT_REPLY_SLOTS_PER_CHUNK) *
                         sizeof(long));
                return 0;
        }
@@ -155,6 +156,12 @@ static int tgt_clear_reply_slot(struct lu_target *lut, int idx)
        LASSERT(chunk < LUT_REPLY_SLOTS_MAX_CHUNKS);
        LASSERT(b < LUT_REPLY_SLOTS_PER_CHUNK);
 
+       if (lut->lut_reply_bitmap[chunk] == NULL) {
+               CERROR("%s: slot %d not allocated\n",
+                      tgt_name(lut), idx);
+               return -ENOENT;
+       }
+
        if (test_and_clear_bit(b, lut->lut_reply_bitmap[chunk]) == 0) {
                CERROR("%s: slot %d already clear in bitmap\n",
                       tgt_name(lut), idx);
@@ -308,7 +315,7 @@ static void tgt_free_reply_data(struct lu_target *lut,
 {
        CDEBUG(D_TRACE, "%s: free reply data %p: xid %llu, transno %llu, "
               "client gen %u, slot idx %d\n",
-              tgt_name(lut), trd, trd->trd_reply.lrd_xid,
+              lut == NULL ? "" : tgt_name(lut), trd, trd->trd_reply.lrd_xid,
               trd->trd_reply.lrd_transno, trd->trd_reply.lrd_client_gen,
               trd->trd_index);
 
@@ -316,7 +323,8 @@ static void tgt_free_reply_data(struct lu_target *lut,
 
        list_del(&trd->trd_list);
        ted->ted_reply_cnt--;
-       tgt_clear_reply_slot(lut, trd->trd_index);
+       if (lut != NULL)
+               tgt_clear_reply_slot(lut, trd->trd_index);
        OBD_FREE_PTR(trd);
 }
 
@@ -331,7 +339,7 @@ static void tgt_release_reply_data(struct lu_target *lut,
 {
        CDEBUG(D_TRACE, "%s: release reply data %p: xid %llu, transno %llu, "
               "client gen %u, slot idx %d\n",
-              tgt_name(lut), trd, trd->trd_reply.lrd_xid,
+              lut == NULL ? "" : tgt_name(lut), trd, trd->trd_reply.lrd_xid,
               trd->trd_reply.lrd_transno, trd->trd_reply.lrd_client_gen,
               trd->trd_index);
 
@@ -377,6 +385,9 @@ int tgt_client_alloc(struct obd_export *exp)
        ENTRY;
        LASSERT(exp != exp->exp_obd->obd_self_export);
 
+       spin_lock_init(&exp->exp_target_data.ted_nodemap_lock);
+       INIT_LIST_HEAD(&exp->exp_target_data.ted_nodemap_member);
+
        OBD_ALLOC_PTR(exp->exp_target_data.ted_lcd);
        if (exp->exp_target_data.ted_lcd == NULL)
                RETURN(-ENOMEM);
@@ -418,9 +429,12 @@ void tgt_client_free(struct obd_export *exp)
        OBD_FREE_PTR(ted->ted_lcd);
        ted->ted_lcd = NULL;
 
-       /* Slot may be not yet assigned */
-       if (ted->ted_lr_idx < 0)
+       /* Target may have been freed (see LU-7430), or the
+        * slot may not have been assigned yet */
+       if (exp->exp_obd->u.obt.obt_magic != OBT_MAGIC ||
+           ted->ted_lr_idx < 0)
                return;
+
        /* Clear bit when lcd is freed */
        LASSERT(lut && lut->lut_client_bitmap);
        if (!test_and_clear_bit(ted->ted_lr_idx, lut->lut_client_bitmap)) {
@@ -503,6 +517,7 @@ static int tgt_client_data_update(const struct lu_env *env,
                RETURN(PTR_ERR(th));
 
        tti_buf_lcd(tti);
+       mutex_lock(&ted->ted_lcd_lock);
        rc = dt_declare_record_write(env, tgt->lut_last_rcvd,
                                     &tti->tti_buf,
                                     ted->ted_lr_off, th);
@@ -532,6 +547,7 @@ static int tgt_client_data_update(const struct lu_env *env,
        rc = tgt_client_data_write(env, tgt, ted->ted_lcd, &tti->tti_off, th);
        EXIT;
 out:
+       mutex_unlock(&ted->ted_lcd_lock);
        dt_trans_stop(env, tgt->lut_bottom, th);
        CDEBUG(D_INFO, "%s: update last_rcvd client data for UUID = %s, "
               "last_transno = "LPU64": rc = %d\n", tgt->lut_obd->obd_name,
@@ -745,7 +761,7 @@ void tgt_boot_epoch_update(struct lu_target *tgt)
 }
 
 /**
- * commit callback, need to update last_commited value
+ * commit callback, need to update last_committed value
  */
 struct tgt_last_committed_callback {
        struct dt_txn_commit_cb  llcc_cb;
@@ -761,21 +777,29 @@ static void tgt_cb_last_committed(struct lu_env *env, struct thandle *th,
 
        ccb = container_of0(cb, struct tgt_last_committed_callback, llcc_cb);
 
+       LASSERT(ccb->llcc_exp);
        LASSERT(ccb->llcc_tgt != NULL);
        LASSERT(ccb->llcc_exp->exp_obd == ccb->llcc_tgt->lut_obd);
 
+       /* Fast path without the spinlock: if exp_last_committed already
+        * holds this transno or a higher one, there is no need to take
+        * the spinlock and re-check, nor to update obd_last_committed. */
+       if (ccb->llcc_transno <= ccb->llcc_exp->exp_last_committed)
+               goto out;
        spin_lock(&ccb->llcc_tgt->lut_translock);
        if (ccb->llcc_transno > ccb->llcc_tgt->lut_obd->obd_last_committed)
                ccb->llcc_tgt->lut_obd->obd_last_committed = ccb->llcc_transno;
 
-       LASSERT(ccb->llcc_exp);
        if (ccb->llcc_transno > ccb->llcc_exp->exp_last_committed) {
                ccb->llcc_exp->exp_last_committed = ccb->llcc_transno;
                spin_unlock(&ccb->llcc_tgt->lut_translock);
+
                ptlrpc_commit_replies(ccb->llcc_exp);
+               tgt_cancel_slc_locks(ccb->llcc_transno);
        } else {
                spin_unlock(&ccb->llcc_tgt->lut_translock);
        }
+out:
        class_export_cb_put(ccb->llcc_exp);
        if (ccb->llcc_transno)
                CDEBUG(D_HA, "%s: transno "LPD64" is committed\n",
@@ -1047,10 +1071,8 @@ int tgt_client_del(const struct lu_env *env, struct obd_export *exp)
                RETURN(rc);
        }
 
-       mutex_lock(&ted->ted_lcd_lock);
        memset(ted->ted_lcd->lcd_uuid, 0, sizeof ted->ted_lcd->lcd_uuid);
        rc = tgt_client_data_update(env, exp);
-       mutex_unlock(&ted->ted_lcd_lock);
 
        CDEBUG(rc == 0 ? D_INFO : D_ERROR,
               "%s: zeroing out client %s at idx %u (%llu), rc %d\n",
@@ -1125,7 +1147,7 @@ static int tgt_last_rcvd_update(const struct lu_env *env, struct lu_target *tgt,
        struct tg_export_data   *ted;
        __u64                   *transno_p;
        int                      rc = 0;
-       bool                     lw_client, update = false;
+       bool                     lw_client;
 
        ENTRY;
 
@@ -1184,27 +1206,21 @@ static int tgt_last_rcvd_update(const struct lu_env *env, struct lu_target *tgt,
                spin_lock(&tgt->lut_translock);
                if (tti->tti_transno > tgt->lut_lsd.lsd_last_transno) {
                        tgt->lut_lsd.lsd_last_transno = tti->tti_transno;
-                       update = true;
+                       spin_unlock(&tgt->lut_translock);
+                       /* Although lightweight (LW) connections have no slot
+                        * in the last_rcvd, we still want to maintain
+                        * the in-memory lsd_client_data structure in order to
+                        * properly handle reply reconstruction. */
+                       rc = tgt_server_data_write(env, tgt, th);
+               } else {
+                       spin_unlock(&tgt->lut_translock);
                }
-               spin_unlock(&tgt->lut_translock);
-               /* Although lightweight (LW) connections have no slot in
-                * last_rcvd, we still want to maintain the in-memory
-                * lsd_client_data structure in order to properly handle reply
-                * reconstruction. */
        } else if (ted->ted_lr_off == 0) {
                CERROR("%s: client idx %d has offset %lld\n",
                       tgt_name(tgt), ted->ted_lr_idx, ted->ted_lr_off);
                RETURN(-EINVAL);
        }
 
-       /* if the export has already been disconnected, we have no last_rcvd
-        * slot, update server data with latest transno then */
-       if (ted->ted_lcd == NULL) {
-               CWARN("commit transaction for disconnected client %s: rc %d\n",
-                     exp->exp_client_uuid.uuid, rc);
-               GOTO(srv_update, rc = 0);
-       }
-
        /* Target that supports multiple reply data */
        if (tgt_is_multimodrpcs_client(exp)) {
                struct tg_reply_data    *trd;
@@ -1214,7 +1230,7 @@ static int tgt_last_rcvd_update(const struct lu_env *env, struct lu_target *tgt,
 
                OBD_ALLOC_PTR(trd);
                if (unlikely(trd == NULL))
-                       GOTO(srv_update, rc = -ENOMEM);
+                       RETURN(-ENOMEM);
 
                /* fill reply data information */
                lrd = &trd->trd_reply;
@@ -1245,12 +1261,14 @@ static int tgt_last_rcvd_update(const struct lu_env *env, struct lu_target *tgt,
                }
 
                rc = tgt_add_reply_data(env, tgt, ted, trd, th, write_update);
-               GOTO(srv_update, rc);
+               if (rc < 0)
+                       OBD_FREE_PTR(trd);
+               return rc;
        }
 
        /* Enough for update replay, let's return */
        if (req == NULL)
-               GOTO(srv_update, rc);
+               RETURN(rc);
 
        mutex_lock(&ted->ted_lcd_lock);
        LASSERT(ergo(tti->tti_transno == 0, th->th_result != 0));
@@ -1278,21 +1296,27 @@ static int tgt_last_rcvd_update(const struct lu_env *env, struct lu_target *tgt,
 
        /* Update transno in slot only if non-zero number, i.e. no errors */
        if (likely(tti->tti_transno != 0)) {
-               if (*transno_p > tti->tti_transno &&
-                   !tgt->lut_no_reconstruct) {
-                       CERROR("%s: trying to overwrite bigger transno:"
-                              "on-disk: "LPU64", new: "LPU64" replay: %d. "
-                              "see LU-617.\n", tgt_name(tgt), *transno_p,
-                              tti->tti_transno, req_is_replay(req));
-                       if (req_is_replay(req)) {
-                               spin_lock(&req->rq_export->exp_lock);
-                               req->rq_export->exp_vbr_failed = 1;
-                               spin_unlock(&req->rq_export->exp_lock);
+               /* Don't overwrite a bigger transaction number with a lower
+                * one. That is not always a sign of a problem, but in any
+                * case this value should only increase monotonically. */
+               if (*transno_p > tti->tti_transno) {
+                       if (!tgt->lut_no_reconstruct) {
+                               CERROR("%s: trying to overwrite bigger transno:"
+                                      "on-disk: "LPU64", new: "LPU64" replay: "
+                                      "%d. See LU-617.\n", tgt_name(tgt),
+                                      *transno_p, tti->tti_transno,
+                                      req_is_replay(req));
+                               if (req_is_replay(req)) {
+                                       spin_lock(&req->rq_export->exp_lock);
+                                       req->rq_export->exp_vbr_failed = 1;
+                                       spin_unlock(&req->rq_export->exp_lock);
+                               }
+                               mutex_unlock(&ted->ted_lcd_lock);
+                               RETURN(req_is_replay(req) ? -EOVERFLOW : 0);
                        }
-                       mutex_unlock(&ted->ted_lcd_lock);
-                       RETURN(req_is_replay(req) ? -EOVERFLOW : 0);
+               } else {
+                       *transno_p = tti->tti_transno;
                }
-               *transno_p = tti->tti_transno;
        }
 
        if (!lw_client) {
@@ -1304,11 +1328,7 @@ static int tgt_last_rcvd_update(const struct lu_env *env, struct lu_target *tgt,
                }
        }
        mutex_unlock(&ted->ted_lcd_lock);
-       EXIT;
-srv_update:
-       if (update)
-               rc = tgt_server_data_write(env, tgt, th);
-       return rc;
+       RETURN(rc);
 }
 
 /*
@@ -1459,6 +1479,12 @@ static int tgt_clients_data_init(const struct lu_env *env,
 
                class_export_put(exp);
 
+               rc = rev_import_init(exp);
+               if (rc != 0) {
+                       class_unlink_export(exp);
+                       GOTO(err_out, rc);
+               }
+
                /* Need to check last_rcvd even for duplicated exports. */
                CDEBUG(D_OTHER, "client at idx %d has last_transno = "LPU64"\n",
                       cl_idx, last_transno);
@@ -1698,19 +1724,32 @@ int tgt_txn_start_cb(const struct lu_env *env, struct thandle *th,
        if (tsi->tsi_exp == NULL)
                return 0;
 
-       dto = dt_object_locate(tgt->lut_last_rcvd, th->th_dev);
-       tti_buf_lcd(tti);
-
-       rc = dt_declare_record_write(env, dto, &tti->tti_buf,
-                                    tsi->tsi_exp->exp_target_data.ted_lr_off,
-                                    th);
-       if (rc)
-               return rc;
-
-       tti_buf_lsd(tti);
-       rc = dt_declare_record_write(env, dto, &tti->tti_buf, 0, th);
-       if (rc)
-               return rc;
+       if (tgt_is_multimodrpcs_client(tsi->tsi_exp)) {
+               /*
+                * Use maximum possible file offset for declaration to ensure
+                * ZFS will reserve enough credits for a write anywhere in this
+                * file, since we don't know where in the file the write will be
+                * because a replay slot has not been assigned.  This should be
+                * replaced by dmu_tx_hold_append() when available.
+                */
+               tti->tti_off = atomic_read(&tgt->lut_num_clients) * 8 *
+                               sizeof(struct lsd_reply_data);
+               tti->tti_buf.lb_buf = NULL;
+               tti->tti_buf.lb_len = sizeof(struct lsd_reply_data);
+               dto = dt_object_locate(tgt->lut_reply_data, th->th_dev);
+               rc = dt_declare_record_write(env, dto, &tti->tti_buf,
+                                            tti->tti_off, th);
+               if (rc)
+                       return rc;
+       } else {
+               dto = dt_object_locate(tgt->lut_last_rcvd, th->th_dev);
+               tti_buf_lcd(tti);
+               tti->tti_off = tsi->tsi_exp->exp_target_data.ted_lr_off;
+               rc = dt_declare_record_write(env, dto, &tti->tti_buf,
+                                            tti->tti_off, th);
+               if (rc)
+                       return rc;
+       }
 
        if (tsi->tsi_vbr_obj != NULL &&
            !lu_object_remote(&tsi->tsi_vbr_obj->do_lu)) {
@@ -1853,7 +1892,11 @@ int tgt_reply_data_init(const struct lu_env *env, struct lu_target *tgt)
 
                        /* create in-memory reply_data and link it to
                         * target export's reply list */
-                       tgt_set_reply_slot(tgt, idx);
+                       rc = tgt_set_reply_slot(tgt, idx);
+                       if (rc != 0) {
+                               mutex_unlock(&ted->ted_lcd_lock);
+                               GOTO(out, rc);
+                       }
                        trd->trd_reply = *lrd;
                        trd->trd_pre_versions[0] = 0;
                        trd->trd_pre_versions[1] = 0;