X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Ftarget%2Ftgt_lastrcvd.c;h=061f088f8a91967b9888b14c84c028384bf57995;hp=cfcf18c266e0b824b3e50a42d41d2035265a00a8;hb=300858ccfcd00b52663de45e0bb472012242f342;hpb=72057a3af19ee02d9a686bd7e7d074917e381310 diff --git a/lustre/target/tgt_lastrcvd.c b/lustre/target/tgt_lastrcvd.c index cfcf18c..061f088 100644 --- a/lustre/target/tgt_lastrcvd.c +++ b/lustre/target/tgt_lastrcvd.c @@ -23,7 +23,7 @@ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2015, Intel Corporation. + * Copyright (c) 2011, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -40,6 +40,8 @@ #include "tgt_internal.h" +/** version recovery epoch */ +#define LR_EPOCH_BITS 32 /* Allocate a bitmap for a chunk of reply data slots */ static int tgt_bitmap_chunk_alloc(struct lu_target *lut, int chunk) @@ -146,6 +148,13 @@ static int tgt_clear_reply_slot(struct lu_target *lut, int idx) int chunk; int b; + if (lut->lut_obd->obd_stopping) + /* + * in case of failover keep the bit set in order to + * avoid overwriting slots in reply_data which might + * be required by resent rpcs + */ + return 0; chunk = idx / LUT_REPLY_SLOTS_PER_CHUNK; b = idx % LUT_REPLY_SLOTS_PER_CHUNK; @@ -214,6 +223,9 @@ static int tgt_reply_header_write(const struct lu_env *env, tgt->lut_obd->obd_name, REPLY_DATA, lrh->lrh_magic, lrh->lrh_header_size, lrh->lrh_reply_size); + if (tgt->lut_bottom->dd_rdonly) + RETURN(0); + buf.lrh_magic = cpu_to_le32(lrh->lrh_magic); buf.lrh_header_size = cpu_to_le32(lrh->lrh_header_size); buf.lrh_reply_size = cpu_to_le32(lrh->lrh_reply_size); @@ -319,7 +331,7 @@ static void tgt_free_reply_data(struct lu_target *lut, list_del(&trd->trd_list); ted->ted_reply_cnt--; - if (lut != NULL) + if (lut != NULL && trd->trd_index != TRD_INDEX_MEMORY) tgt_clear_reply_slot(lut, trd->trd_index); OBD_FREE_PTR(trd); } @@ -383,6 +395,8 @@ int tgt_client_alloc(struct obd_export *exp) spin_lock_init(&exp->exp_target_data.ted_nodemap_lock); INIT_LIST_HEAD(&exp->exp_target_data.ted_nodemap_member); + spin_lock_init(&exp->exp_target_data.ted_fmd_lock); + INIT_LIST_HEAD(&exp->exp_target_data.ted_fmd_list); OBD_ALLOC_PTR(exp->exp_target_data.ted_lcd); if (exp->exp_target_data.ted_lcd == NULL) @@ -406,6 +420,8 @@ void tgt_client_free(struct obd_export *exp) LASSERT(exp != exp->exp_obd->obd_self_export); + tgt_fmd_cleanup(exp); + /* free reply data */ mutex_lock(&ted->ted_lcd_lock); list_for_each_entry_safe(trd, tmp, &ted->ted_reply_list, trd_list) { @@ -444,8 +460,22 @@ void tgt_client_free(struct obd_export *exp) } EXPORT_SYMBOL(tgt_client_free); -int tgt_client_data_read(const struct lu_env *env, struct lu_target *tgt, - struct lsd_client_data *lcd, loff_t *off, int index) +static inline void tgt_check_lcd(const char *obd_name, int index, + struct lsd_client_data *lcd) +{ + size_t uuid_size = sizeof(lcd->lcd_uuid); + + if (strnlen((char*)lcd->lcd_uuid, uuid_size) == uuid_size) { + lcd->lcd_uuid[uuid_size - 1] = '\0'; + + LCONSOLE_ERROR("the client UUID (%s) on %s for exports stored in last_rcvd(index = %d) is bad!\n", + lcd->lcd_uuid, obd_name, index); + } +} + +static int tgt_client_data_read(const struct lu_env *env, struct lu_target *tgt, + struct lsd_client_data *lcd, + loff_t *off, int index) { struct tgt_thread_info *tti = tgt_th_info(env); int rc; @@ -453,7 +483,7 @@ int tgt_client_data_read(const struct lu_env *env, struct lu_target *tgt, tti_buf_lcd(tti); rc = dt_record_read(env, tgt->lut_last_rcvd, &tti->tti_buf, off); if (rc == 0) { - check_lcd(tgt->lut_obd->obd_name, index, &tti->tti_lcd); + tgt_check_lcd(tgt->lut_obd->obd_name, index, &tti->tti_lcd); lcd_le_to_cpu(&tti->tti_lcd, lcd); lcd->lcd_last_result = ptlrpc_status_ntoh(lcd->lcd_last_result); lcd->lcd_last_close_result = @@ -471,9 +501,10 @@ int tgt_client_data_read(const struct lu_env *env, struct lu_target *tgt, return rc; } -int tgt_client_data_write(const struct lu_env *env, struct lu_target *tgt, - struct lsd_client_data *lcd, loff_t *off, - struct thandle *th) +static int tgt_client_data_write(const struct lu_env *env, + struct lu_target *tgt, + struct lsd_client_data *lcd, + loff_t *off, struct thandle *th) { struct tgt_thread_info *tti = tgt_th_info(env); struct dt_object *dto; @@ -488,6 +519,59 @@ int tgt_client_data_write(const struct lu_env *env, struct lu_target *tgt, return dt_record_write(env, dto, &tti->tti_buf, off, th); } +struct tgt_new_client_callback { + struct dt_txn_commit_cb lncc_cb; + struct obd_export *lncc_exp; +}; + +static void tgt_cb_new_client(struct lu_env *env, struct thandle *th, + struct dt_txn_commit_cb *cb, int err) +{ + struct tgt_new_client_callback *ccb; + + ccb = container_of(cb, struct tgt_new_client_callback, lncc_cb); + + LASSERT(ccb->lncc_exp->exp_obd); + + CDEBUG(D_RPCTRACE, "%s: committing for initial connect of %s\n", + ccb->lncc_exp->exp_obd->obd_name, + ccb->lncc_exp->exp_client_uuid.uuid); + + spin_lock(&ccb->lncc_exp->exp_lock); + + ccb->lncc_exp->exp_need_sync = 0; + + spin_unlock(&ccb->lncc_exp->exp_lock); + class_export_cb_put(ccb->lncc_exp); + + OBD_FREE_PTR(ccb); +} + +int tgt_new_client_cb_add(struct thandle *th, struct obd_export *exp) +{ + struct tgt_new_client_callback *ccb; + struct dt_txn_commit_cb *dcb; + int rc; + + OBD_ALLOC_PTR(ccb); + if (ccb == NULL) + return -ENOMEM; + + ccb->lncc_exp = class_export_cb_get(exp); + + dcb = &ccb->lncc_cb; + dcb->dcb_func = tgt_cb_new_client; + INIT_LIST_HEAD(&dcb->dcb_linkage); + strlcpy(dcb->dcb_name, "tgt_cb_new_client", sizeof(dcb->dcb_name)); + + rc = dt_trans_cb_add(th, dcb); + if (rc) { + class_export_cb_put(exp); + OBD_FREE_PTR(ccb); + } + return rc; +} + /** * Update client data in last_rcvd */ @@ -508,12 +592,14 @@ static int tgt_client_data_update(const struct lu_env *env, RETURN(-EINVAL); } + if (tgt->lut_bottom->dd_rdonly) + RETURN(0); + th = dt_trans_create(env, tgt->lut_bottom); if (IS_ERR(th)) RETURN(PTR_ERR(th)); tti_buf_lcd(tti); - mutex_lock(&ted->ted_lcd_lock); rc = dt_declare_record_write(env, tgt->lut_last_rcvd, &tti->tti_buf, ted->ted_lr_off, th); @@ -523,6 +609,9 @@ static int tgt_client_data_update(const struct lu_env *env, rc = dt_trans_start_local(env, tgt->lut_bottom, th); if (rc) GOTO(out, rc); + + mutex_lock(&ted->ted_lcd_lock); + /* * Until this operations will be committed the sync is needed * for this export. This should be done _after_ starting the @@ -541,9 +630,11 @@ static int tgt_client_data_update(const struct lu_env *env, tti->tti_off = ted->ted_lr_off; rc = tgt_client_data_write(env, tgt, ted->ted_lcd, &tti->tti_off, th); + + mutex_unlock(&ted->ted_lcd_lock); + EXIT; out: - mutex_unlock(&ted->ted_lcd_lock); dt_trans_stop(env, tgt->lut_bottom, th); CDEBUG(D_INFO, "%s: update last_rcvd client data for UUID = %s, " "last_transno = %llu: rc = %d\n", tgt->lut_obd->obd_name, @@ -552,7 +643,7 @@ out: return rc; } -int tgt_server_data_read(const struct lu_env *env, struct lu_target *tgt) +static int tgt_server_data_read(const struct lu_env *env, struct lu_target *tgt) { struct tgt_thread_info *tti = tgt_th_info(env); int rc; @@ -570,8 +661,8 @@ int tgt_server_data_read(const struct lu_env *env, struct lu_target *tgt) return rc; } -int tgt_server_data_write(const struct lu_env *env, struct lu_target *tgt, - struct thandle *th) +static int tgt_server_data_write(const struct lu_env *env, + struct lu_target *tgt, struct thandle *th) { struct tgt_thread_info *tti = tgt_th_info(env); struct dt_object *dto; @@ -615,6 +706,9 @@ int tgt_server_data_update(const struct lu_env *env, struct lu_target *tgt, tgt->lut_lsd.lsd_last_transno = tgt->lut_last_transno; spin_unlock(&tgt->lut_translock); + if (tgt->lut_bottom->dd_rdonly) + RETURN(0); + th = dt_trans_create(env, tgt->lut_bottom); if (IS_ERR(th)) RETURN(PTR_ERR(th)); @@ -642,8 +736,8 @@ out: } EXPORT_SYMBOL(tgt_server_data_update); -int tgt_truncate_last_rcvd(const struct lu_env *env, struct lu_target *tgt, - loff_t size) +static int tgt_truncate_last_rcvd(const struct lu_env *env, + struct lu_target *tgt, loff_t size) { struct dt_object *dt = tgt->lut_last_rcvd; struct thandle *th; @@ -652,6 +746,9 @@ int tgt_truncate_last_rcvd(const struct lu_env *env, struct lu_target *tgt, ENTRY; + if (tgt->lut_bottom->dd_rdonly) + RETURN(0); + attr.la_size = size; attr.la_valid = LA_SIZE; @@ -700,7 +797,7 @@ void tgt_boot_epoch_update(struct lu_target *tgt) struct lu_env env; struct ptlrpc_request *req; __u32 start_epoch; - struct list_head client_list; + LIST_HEAD(client_list); int rc; if (tgt->lut_obd->obd_stopping) @@ -714,12 +811,11 @@ void tgt_boot_epoch_update(struct lu_target *tgt) } spin_lock(&tgt->lut_translock); - start_epoch = lr_epoch(tgt->lut_last_transno) + 1; + start_epoch = (tgt->lut_last_transno >> LR_EPOCH_BITS) + 1; tgt->lut_last_transno = (__u64)start_epoch << LR_EPOCH_BITS; tgt->lut_lsd.lsd_start_epoch = start_epoch; spin_unlock(&tgt->lut_translock); - INIT_LIST_HEAD(&client_list); /** * The recovery is not yet finished and final queue can still be updated * with resend requests. Move final list to separate one for processing @@ -747,7 +843,7 @@ void tgt_boot_epoch_update(struct lu_target *tgt) * - there is no client to recover or the recovery was aborted */ if (!strncmp(tgt->lut_obd->obd_type->typ_name, LUSTRE_MDT_NAME, 3) && - (tgt->lut_obd->obd_max_recoverable_clients == 0 || + (atomic_read(&tgt->lut_obd->obd_max_recoverable_clients) == 0 || tgt->lut_obd->obd_abort_recovery)) tgt->lut_lsd.lsd_feature_incompat &= ~OBD_INCOMPAT_MULTI_RPCS; @@ -771,7 +867,7 @@ static void tgt_cb_last_committed(struct lu_env *env, struct thandle *th, { struct tgt_last_committed_callback *ccb; - ccb = container_of0(cb, struct tgt_last_committed_callback, llcc_cb); + ccb = container_of(cb, struct tgt_last_committed_callback, llcc_cb); LASSERT(ccb->llcc_exp); LASSERT(ccb->llcc_tgt != NULL); @@ -800,16 +896,21 @@ static void tgt_cb_last_committed(struct lu_env *env, struct thandle *th, } else { spin_unlock(&ccb->llcc_tgt->lut_translock); } + + CDEBUG(D_HA, "%s: transno %lld is committed\n", + ccb->llcc_tgt->lut_obd->obd_name, ccb->llcc_transno); + out: class_export_cb_put(ccb->llcc_exp); - if (ccb->llcc_transno) - CDEBUG(D_HA, "%s: transno %lld is committed\n", - ccb->llcc_tgt->lut_obd->obd_name, ccb->llcc_transno); OBD_FREE_PTR(ccb); } -int tgt_last_commit_cb_add(struct thandle *th, struct lu_target *tgt, - struct obd_export *exp, __u64 transno) +/** + * Add commit callback function, it returns a non-zero value to inform + * caller to use sync transaction if necessary. + */ +static int tgt_last_commit_cb_add(struct thandle *th, struct lu_target *tgt, + struct obd_export *exp, __u64 transno) { struct tgt_last_committed_callback *ccb; struct dt_txn_commit_cb *dcb; @@ -838,60 +939,29 @@ int tgt_last_commit_cb_add(struct thandle *th, struct lu_target *tgt, /* report failure to force synchronous operation */ return -EPERM; - return rc; + /* if exp_need_sync is set, return non-zero value to force + * a sync transaction. */ + return rc ? rc : exp->exp_need_sync; } -struct tgt_new_client_callback { - struct dt_txn_commit_cb lncc_cb; - struct obd_export *lncc_exp; -}; - -static void tgt_cb_new_client(struct lu_env *env, struct thandle *th, - struct dt_txn_commit_cb *cb, int err) +static int tgt_is_local_client(const struct lu_env *env, + struct obd_export *exp) { - struct tgt_new_client_callback *ccb; - - ccb = container_of0(cb, struct tgt_new_client_callback, lncc_cb); - - LASSERT(ccb->lncc_exp->exp_obd); - - CDEBUG(D_RPCTRACE, "%s: committing for initial connect of %s\n", - ccb->lncc_exp->exp_obd->obd_name, - ccb->lncc_exp->exp_client_uuid.uuid); - - spin_lock(&ccb->lncc_exp->exp_lock); - - ccb->lncc_exp->exp_need_sync = 0; + struct lu_target *tgt = class_exp2tgt(exp); + struct tgt_session_info *tsi = tgt_ses_info(env); + struct ptlrpc_request *req = tgt_ses_req(tsi); - spin_unlock(&ccb->lncc_exp->exp_lock); - class_export_cb_put(ccb->lncc_exp); + if (exp_connect_flags(exp) & OBD_CONNECT_MDS || + exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS) + return 0; + if (tgt->lut_local_recovery) + return 0; + if (!req) + return 0; + if (!LNetIsPeerLocal(req->rq_peer.nid)) + return 0; - OBD_FREE_PTR(ccb); -} - -int tgt_new_client_cb_add(struct thandle *th, struct obd_export *exp) -{ - struct tgt_new_client_callback *ccb; - struct dt_txn_commit_cb *dcb; - int rc; - - OBD_ALLOC_PTR(ccb); - if (ccb == NULL) - return -ENOMEM; - - ccb->lncc_exp = class_export_cb_get(exp); - - dcb = &ccb->lncc_cb; - dcb->dcb_func = tgt_cb_new_client; - INIT_LIST_HEAD(&dcb->dcb_linkage); - strlcpy(dcb->dcb_name, "tgt_cb_new_client", sizeof(dcb->dcb_name)); - - rc = dt_trans_cb_add(th, dcb); - if (rc) { - class_export_cb_put(exp); - OBD_FREE_PTR(ccb); - } - return rc; + return 1; } /** @@ -915,6 +985,13 @@ int tgt_client_new(const struct lu_env *env, struct obd_export *exp) if (exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) RETURN(0); + if (tgt_is_local_client(env, exp)) { + LCONSOLE_WARN("%s: local client %s w/o recovery\n", + exp->exp_obd->obd_name, ted->ted_lcd->lcd_uuid); + exp->exp_no_recovery = 1; + RETURN(0); + } + /* the bitmap operations can handle cl_idx > sizeof(long) * 8, so * there's no need for extra complication here */ @@ -1042,7 +1119,8 @@ int tgt_client_del(const struct lu_env *env, struct obd_export *exp) /* XXX if lcd_uuid were a real obd_uuid, I could use obd_uuid_equals */ if (!strcmp((char *)ted->ted_lcd->lcd_uuid, (char *)tgt->lut_obd->obd_uuid.uuid) || - exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) + exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT || + exp->exp_no_recovery) RETURN(0); /* Slot may be not yet assigned, use case is race between Client @@ -1069,6 +1147,9 @@ int tgt_client_del(const struct lu_env *env, struct obd_export *exp) if (exp->exp_flags & OBD_OPT_FAILOVER) RETURN(0); + if (OBD_FAIL_CHECK(OBD_FAIL_TGT_CLIENT_DEL)) + RETURN(0); + /* Make sure the server's last_transno is up to date. * This should be done before zeroing client slot so last_transno will * be in server data or in client data in case of failure */ @@ -1091,12 +1172,35 @@ int tgt_client_del(const struct lu_env *env, struct obd_export *exp) } EXPORT_SYMBOL(tgt_client_del); -int tgt_add_reply_data(const struct lu_env *env, struct lu_target *tgt, +static void tgt_clean_by_tag(struct obd_export *exp, __u64 xid, __u16 tag) +{ + struct tg_export_data *ted = &exp->exp_target_data; + struct lu_target *lut = class_exp2tgt(exp); + struct tg_reply_data *trd, *tmp; + + if (tag == 0) + return; + + list_for_each_entry_safe(trd, tmp, &ted->ted_reply_list, trd_list) { + if (trd->trd_tag != tag) + continue; + + LASSERT(ergo(tgt_is_increasing_xid_client(exp), + trd->trd_reply.lrd_xid <= xid)); + + ted->ted_release_tag++; + tgt_release_reply_data(lut, ted, trd); + } +} + +static int tgt_add_reply_data(const struct lu_env *env, struct lu_target *tgt, struct tg_export_data *ted, struct tg_reply_data *trd, + struct ptlrpc_request *req, struct thandle *th, bool update_lrd_file) { struct lsd_reply_data *lrd; int i; + int rc; lrd = &trd->trd_reply; /* update export last transno */ @@ -1105,30 +1209,51 @@ int tgt_add_reply_data(const struct lu_env *env, struct lu_target *tgt, ted->ted_lcd->lcd_last_transno = lrd->lrd_transno; mutex_unlock(&ted->ted_lcd_lock); - /* find a empty slot */ - i = tgt_find_free_reply_slot(tgt); - if (unlikely(i < 0)) { - CERROR("%s: couldn't find a slot for reply data: " - "rc = %d\n", tgt_name(tgt), i); - RETURN(i); - } - trd->trd_index = i; + if (tgt != NULL) { + /* find a empty slot */ + i = tgt_find_free_reply_slot(tgt); + if (unlikely(i < 0)) { + CERROR("%s: couldn't find a slot for reply data: " + "rc = %d\n", tgt_name(tgt), i); + RETURN(i); + } + trd->trd_index = i; - if (update_lrd_file) { - loff_t off; - int rc; + if (update_lrd_file) { + loff_t off; - /* write reply data to disk */ - off = sizeof(struct lsd_reply_header) + sizeof(*lrd) * i; - rc = tgt_reply_data_write(env, tgt, lrd, off, th); - if (unlikely(rc != 0)) { - CERROR("%s: can't update %s file: rc = %d\n", - tgt_name(tgt), REPLY_DATA, rc); - RETURN(rc); + /* write reply data to disk */ + off = sizeof(struct lsd_reply_header) + sizeof(*lrd) * i; + rc = tgt_reply_data_write(env, tgt, lrd, off, th); + if (unlikely(rc != 0)) { + CERROR("%s: can't update %s file: rc = %d\n", + tgt_name(tgt), REPLY_DATA, rc); + GOTO(free_slot, rc); + } } + } else { + trd->trd_index = TRD_INDEX_MEMORY; } + /* add reply data to target export's reply list */ mutex_lock(&ted->ted_lcd_lock); + if (req != NULL) { + int exclude = tgt_is_increasing_xid_client(req->rq_export) ? + MSG_REPLAY : MSG_REPLAY|MSG_RESENT; + + if (req->rq_obsolete) { + CDEBUG(D_INFO, + "drop reply data update for obsolete req xid=%llu," + "transno=%llu, tag=%hu\n", req->rq_xid, + lrd->lrd_transno, trd->trd_tag); + mutex_unlock(&ted->ted_lcd_lock); + GOTO(free_slot, rc = -EBADR); + } + + if (!(lustre_msg_get_flags(req->rq_reqmsg) & exclude)) + tgt_clean_by_tag(req->rq_export, req->rq_xid, + trd->trd_tag); + } list_add(&trd->trd_list, &ted->ted_reply_list); ted->ted_reply_cnt++; if (ted->ted_reply_cnt > ted->ted_reply_max) @@ -1138,10 +1263,76 @@ int tgt_add_reply_data(const struct lu_env *env, struct lu_target *tgt, CDEBUG(D_TRACE, "add reply %p: xid %llu, transno %llu, " "tag %hu, client gen %u, slot idx %d\n", trd, lrd->lrd_xid, lrd->lrd_transno, - trd->trd_tag, lrd->lrd_client_gen, i); + trd->trd_tag, lrd->lrd_client_gen, trd->trd_index); + RETURN(0); + +free_slot: + if (tgt != NULL) + tgt_clear_reply_slot(tgt, trd->trd_index); + return rc; +} + +int tgt_mk_reply_data(const struct lu_env *env, + struct lu_target *tgt, + struct tg_export_data *ted, + struct ptlrpc_request *req, + __u64 opdata, + struct thandle *th, + bool write_update, + __u64 transno) +{ + struct tg_reply_data *trd; + struct lsd_reply_data *lrd; + __u64 *pre_versions = NULL; + int rc; + + OBD_ALLOC_PTR(trd); + if (unlikely(trd == NULL)) + RETURN(-ENOMEM); + + /* fill reply data information */ + lrd = &trd->trd_reply; + lrd->lrd_transno = transno; + if (req != NULL) { + lrd->lrd_xid = req->rq_xid; + trd->trd_tag = lustre_msg_get_tag(req->rq_reqmsg); + lrd->lrd_client_gen = ted->ted_lcd->lcd_generation; + if (write_update) { + pre_versions = lustre_msg_get_versions(req->rq_repmsg); + lrd->lrd_result = th->th_result; + } + } else { + struct tgt_session_info *tsi; + + LASSERT(env != NULL); + tsi = tgt_ses_info(env); + LASSERT(tsi->tsi_xid != 0); + + lrd->lrd_xid = tsi->tsi_xid; + lrd->lrd_result = tsi->tsi_result; + lrd->lrd_client_gen = tsi->tsi_client_gen; + } + + lrd->lrd_data = opdata; + if (pre_versions) { + trd->trd_pre_versions[0] = pre_versions[0]; + trd->trd_pre_versions[1] = pre_versions[1]; + trd->trd_pre_versions[2] = pre_versions[2]; + trd->trd_pre_versions[3] = pre_versions[3]; + } + + rc = tgt_add_reply_data(env, tgt, ted, trd, req, + th, write_update); + if (rc < 0) { + OBD_FREE_PTR(trd); + if (rc == -EBADR) + rc = 0; + } + return rc; + } -EXPORT_SYMBOL(tgt_add_reply_data); +EXPORT_SYMBOL(tgt_mk_reply_data); /* * last_rcvd & last_committed update callbacks @@ -1152,11 +1343,11 @@ static int tgt_last_rcvd_update(const struct lu_env *env, struct lu_target *tgt, { struct tgt_thread_info *tti = tgt_th_info(env); struct tgt_session_info *tsi = tgt_ses_info(env); - struct obd_export *exp = tsi->tsi_exp; - struct tg_export_data *ted; - __u64 *transno_p; - int rc = 0; - bool lw_client; + struct obd_export *exp = tsi->tsi_exp; + struct tg_export_data *ted; + __u64 *transno_p; + bool nolcd = false; + int rc = 0; ENTRY; @@ -1164,11 +1355,15 @@ static int tgt_last_rcvd_update(const struct lu_env *env, struct lu_target *tgt, LASSERT(exp != NULL); ted = &exp->exp_target_data; - lw_client = exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT; - if (ted->ted_lr_idx < 0 && !lw_client) - /* ofd connect may cause transaction before export has - * last_rcvd slot */ - RETURN(0); + /* Some clients don't support recovery, and they don't have last_rcvd + * client data: + * 1. lightweight clients. + * 2. local clients on MDS which doesn't enable "localrecov". + * 3. OFD connect may cause transaction before export has last_rcvd + * slot. + */ + if (ted->ted_lr_idx < 0) + nolcd = true; if (req != NULL) tti->tti_transno = lustre_msg_get_transno(req->rq_reqmsg); @@ -1209,14 +1404,13 @@ static int tgt_last_rcvd_update(const struct lu_env *env, struct lu_target *tgt, /* if can't add callback, do sync write */ th->th_sync |= !!tgt_last_commit_cb_add(th, tgt, exp, tti->tti_transno); - if (lw_client) { - /* All operations performed by LW clients are synchronous and - * we store the committed transno in the last_rcvd header */ + if (nolcd) { + /* store transno in the last_rcvd header */ spin_lock(&tgt->lut_translock); if (tti->tti_transno > tgt->lut_lsd.lsd_last_transno) { tgt->lut_lsd.lsd_last_transno = tti->tti_transno; spin_unlock(&tgt->lut_translock); - /* Although lightweight (LW) connections have no slot + /* Although current connection doesn't have slot * in the last_rcvd, we still want to maintain * the in-memory lsd_client_data structure in order to * properly handle reply reconstruction. */ @@ -1232,47 +1426,8 @@ static int tgt_last_rcvd_update(const struct lu_env *env, struct lu_target *tgt, /* Target that supports multiple reply data */ if (tgt_is_multimodrpcs_client(exp)) { - struct tg_reply_data *trd; - struct lsd_reply_data *lrd; - __u64 *pre_versions; - bool write_update; - - OBD_ALLOC_PTR(trd); - if (unlikely(trd == NULL)) - RETURN(-ENOMEM); - - /* fill reply data information */ - lrd = &trd->trd_reply; - lrd->lrd_transno = tti->tti_transno; - if (req != NULL) { - lrd->lrd_xid = req->rq_xid; - trd->trd_tag = lustre_msg_get_tag(req->rq_reqmsg); - pre_versions = lustre_msg_get_versions(req->rq_repmsg); - lrd->lrd_result = th->th_result; - lrd->lrd_client_gen = ted->ted_lcd->lcd_generation; - write_update = true; - } else { - LASSERT(tsi->tsi_xid != 0); - lrd->lrd_xid = tsi->tsi_xid; - lrd->lrd_result = tsi->tsi_result; - lrd->lrd_client_gen = tsi->tsi_client_gen; - trd->trd_tag = 0; - pre_versions = NULL; - write_update = false; - } - - lrd->lrd_data = opdata; - if (pre_versions) { - trd->trd_pre_versions[0] = pre_versions[0]; - trd->trd_pre_versions[1] = pre_versions[1]; - trd->trd_pre_versions[2] = pre_versions[2]; - trd->trd_pre_versions[3] = pre_versions[3]; - } - - rc = tgt_add_reply_data(env, tgt, ted, trd, th, write_update); - if (rc < 0) - OBD_FREE_PTR(trd); - return rc; + return tgt_mk_reply_data(env, tgt, ted, req, opdata, th, + !!(req != NULL), tti->tti_transno); } /* Enough for update replay, let's return */ @@ -1328,9 +1483,13 @@ static int tgt_last_rcvd_update(const struct lu_env *env, struct lu_target *tgt, } } - if (!lw_client) { + if (!nolcd) { tti->tti_off = ted->ted_lr_off; - rc = tgt_client_data_write(env, tgt, ted->ted_lcd, &tti->tti_off, th); + if (CFS_FAIL_CHECK(OBD_FAIL_TGT_RCVD_EIO)) + rc = -EIO; + else + rc = tgt_client_data_write(env, tgt, ted->ted_lcd, + &tti->tti_off, th); if (rc < 0) { mutex_unlock(&ted->ted_lcd_lock); RETURN(rc); @@ -1401,8 +1560,11 @@ static int tgt_clients_data_init(const struct lu_env *env, ENTRY; - CLASSERT(offsetof(struct lsd_client_data, lcd_padding) + - sizeof(lcd->lcd_padding) == LR_CLIENT_SIZE); + if (tgt->lut_bottom->dd_rdonly) + RETURN(0); + + BUILD_BUG_ON(offsetof(struct lsd_client_data, lcd_padding) + + sizeof(lcd->lcd_padding) != LR_CLIENT_SIZE); OBD_ALLOC_PTR(lcd); if (lcd == NULL) @@ -1467,7 +1629,7 @@ static int tgt_clients_data_init(const struct lu_env *env, exp->exp_connecting = 0; exp->exp_in_recovery = 0; spin_unlock(&exp->exp_lock); - obd->obd_max_recoverable_clients++; + atomic_inc(&obd->obd_max_recoverable_clients); if (tgt->lut_lsd.lsd_feature_incompat & OBD_INCOMPAT_MULTI_RPCS && @@ -1556,8 +1718,8 @@ int tgt_server_data_init(const struct lu_env *env, struct lu_target *tgt) last_rcvd_size = (unsigned long)tti->tti_attr.la_size; /* ensure padding in the struct is the correct size */ - CLASSERT(offsetof(struct lr_server_data, lsd_padding) + - sizeof(lsd->lsd_padding) == LR_SERVER_SIZE); + BUILD_BUG_ON(offsetof(struct lr_server_data, lsd_padding) + + sizeof(lsd->lsd_padding) != LR_SERVER_SIZE); rc = server_name2index(tgt_name(tgt), &index, NULL); if (rc < 0) { @@ -1600,19 +1762,29 @@ int tgt_server_data_init(const struct lu_env *env, struct lu_target *tgt) RETURN(rc); } if (strcmp(lsd->lsd_uuid, tgt->lut_obd->obd_uuid.uuid)) { - LCONSOLE_ERROR_MSG(0x157, "Trying to start OBD %s " - "using the wrong disk %s. Were the" - " /dev/ assignments rearranged?\n", - tgt->lut_obd->obd_uuid.uuid, - lsd->lsd_uuid); - RETURN(-EINVAL); + if (tgt->lut_bottom->dd_rdonly) { + /* Such difference may be caused by mounting + * up snapshot with new fsname under rd_only + * mode. But even if it was NOT, it will not + * damage the system because of "rd_only". */ + memcpy(lsd->lsd_uuid, + tgt->lut_obd->obd_uuid.uuid, + sizeof(lsd->lsd_uuid)); + } else { + LCONSOLE_ERROR_MSG(0x157, "Trying to start " + "OBD %s using the wrong " + "disk %s. Were the /dev/ " + "assignments rearranged?\n", + tgt->lut_obd->obd_uuid.uuid, + lsd->lsd_uuid); + RETURN(-EINVAL); + } } if (lsd->lsd_osd_index != index) { - LCONSOLE_ERROR_MSG(0x157, "%s: index %d in last rcvd " - "is different with the index %d in" - "config log, It might be disk" - "corruption!\n", tgt_name(tgt), + LCONSOLE_ERROR_MSG(0x157, + "%s: index %d in last rcvd is different with the index %d in config log, It might be disk corruption!\n", + tgt_name(tgt), lsd->lsd_osd_index, index); RETURN(-EINVAL); } @@ -1722,6 +1894,14 @@ int tgt_txn_start_cb(const struct lu_env *env, struct thandle *th, struct dt_object *dto; int rc; + /* For readonly case, the caller should have got failure + * when start the transaction. If the logic comes here, + * there must be something wrong. */ + if (unlikely(tgt->lut_bottom->dd_rdonly)) { + dump_stack(); + LBUG(); + } + /* if there is no session, then this transaction is not result of * request processing but some local operation */ if (env->le_ses == NULL) @@ -1741,13 +1921,10 @@ int tgt_txn_start_cb(const struct lu_env *env, struct thandle *th, * because a replay slot has not been assigned. This should be * replaced by dmu_tx_hold_append() when available. */ - tti->tti_off = atomic_read(&tgt->lut_num_clients) * 8 * - sizeof(struct lsd_reply_data); tti->tti_buf.lb_buf = NULL; tti->tti_buf.lb_len = sizeof(struct lsd_reply_data); dto = dt_object_locate(tgt->lut_reply_data, th->th_dev); - rc = dt_declare_record_write(env, dto, &tti->tti_buf, - tti->tti_off, th); + rc = dt_declare_record_write(env, dto, &tti->tti_buf, -1, th); if (rc) return rc; } else { @@ -1823,7 +2000,6 @@ int tgt_reply_data_init(const struct lu_env *env, struct lu_target *tgt) unsigned long reply_data_size; int rc; struct lsd_reply_header *lrh = NULL; - struct lsd_client_data *lcd = NULL; struct tg_reply_data *trd = NULL; int idx; loff_t off; @@ -1872,10 +2048,6 @@ int tgt_reply_data_init(const struct lu_env *env, struct lu_target *tgt) if (hash == NULL) GOTO(out, rc = -ENODEV); - OBD_ALLOC_PTR(lcd); - if (lcd == NULL) - GOTO(out, rc = -ENOMEM); - OBD_ALLOC_PTR(trd); if (trd == NULL) GOTO(out, rc = -ENOMEM); @@ -1927,6 +2099,13 @@ int tgt_reply_data_init(const struct lu_env *env, struct lu_target *tgt) /* update export last committed transation */ exp->exp_last_committed = max(exp->exp_last_committed, lrd->lrd_transno); + /* Update lcd_last_transno as well for check in + * tgt_release_reply_data() or the latest client + * transno can be lost. + */ + ted->ted_lcd->lcd_last_transno = + max(ted->ted_lcd->lcd_last_transno, + exp->exp_last_committed); mutex_unlock(&ted->ted_lcd_lock); class_export_put(exp); @@ -1958,8 +2137,6 @@ int tgt_reply_data_init(const struct lu_env *env, struct lu_target *tgt) out: if (hash != NULL) cfs_hash_putref(hash); - if (lcd != NULL) - OBD_FREE_PTR(lcd); if (trd != NULL) OBD_FREE_PTR(trd); if (lrh != NULL) @@ -1967,43 +2144,70 @@ out: return rc; } -struct tg_reply_data *tgt_lookup_reply_by_xid(struct tg_export_data *ted, - __u64 xid) +static int tgt_check_lookup_req(struct ptlrpc_request *req, int lookup, + struct tg_reply_data *trd) { - struct tg_reply_data *found = NULL; - struct tg_reply_data *reply; + struct tg_export_data *ted = &req->rq_export->exp_target_data; + struct lu_target *lut = class_exp2tgt(req->rq_export); + __u16 tag = lustre_msg_get_tag(req->rq_reqmsg); + int rc = 0; + struct tg_reply_data *reply; + bool check_increasing; + + if (tag == 0) + return 0; + + check_increasing = tgt_is_increasing_xid_client(req->rq_export) && + !(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY); + if (!lookup && !check_increasing) + return 0; - mutex_lock(&ted->ted_lcd_lock); list_for_each_entry(reply, &ted->ted_reply_list, trd_list) { - if (reply->trd_reply.lrd_xid == xid) { - found = reply; + if (lookup && reply->trd_reply.lrd_xid == req->rq_xid) { + rc = 1; + if (trd != NULL) + *trd = *reply; + break; + } else if (check_increasing && reply->trd_tag == tag && + reply->trd_reply.lrd_xid > req->rq_xid) { + rc = -EPROTO; + CERROR("%s: busy tag=%u req_xid=%llu, trd=%p: xid=%llu transno=%llu client_gen=%u slot_idx=%d: rc = %d\n", + tgt_name(lut), tag, req->rq_xid, trd, + reply->trd_reply.lrd_xid, + reply->trd_reply.lrd_transno, + reply->trd_reply.lrd_client_gen, + reply->trd_index, rc); break; } } - mutex_unlock(&ted->ted_lcd_lock); - return found; + + return rc; } -EXPORT_SYMBOL(tgt_lookup_reply_by_xid); /* Look for a reply data matching specified request @req * A copy is returned in @trd if the pointer is not NULL */ -bool tgt_lookup_reply(struct ptlrpc_request *req, struct tg_reply_data *trd) +int tgt_lookup_reply(struct ptlrpc_request *req, struct tg_reply_data *trd) { - struct tg_export_data *ted = &req->rq_export->exp_target_data; - struct tg_reply_data *reply; - bool found = false; - - reply = tgt_lookup_reply_by_xid(ted, req->rq_xid); - if (reply != NULL) { - found = true; - if (trd != NULL) - *trd = *reply; + struct tg_export_data *ted = &req->rq_export->exp_target_data; + int found = 0; + bool not_replay = !(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY); + + mutex_lock(&ted->ted_lcd_lock); + if (not_replay && req->rq_xid <= req->rq_export->exp_last_xid) { + /* A check for the last_xid is needed here in case there is + * no reply data is left in the list. It may happen if another + * RPC on another slot increased the last_xid between our + * process_req_last_xid & tgt_lookup_reply calls */ + found = -EPROTO; + } else { + found = tgt_check_lookup_req(req, 1, trd); } + mutex_unlock(&ted->ted_lcd_lock); - CDEBUG(D_TRACE, "%s: lookup reply xid %llu, found %d\n", - tgt_name(class_exp2tgt(req->rq_export)), req->rq_xid, - found ? 1 : 0); + CDEBUG(D_TRACE, "%s: lookup reply xid %llu, found %d last_xid %llu\n", + tgt_name(class_exp2tgt(req->rq_export)), req->rq_xid, found, + req->rq_export->exp_last_xid); return found; } @@ -2015,37 +2219,19 @@ int tgt_handle_received_xid(struct obd_export *exp, __u64 rcvd_xid) struct lu_target *lut = class_exp2tgt(exp); struct tg_reply_data *trd, *tmp; - mutex_lock(&ted->ted_lcd_lock); + list_for_each_entry_safe(trd, tmp, &ted->ted_reply_list, trd_list) { if (trd->trd_reply.lrd_xid > rcvd_xid) continue; ted->ted_release_xid++; tgt_release_reply_data(lut, ted, trd); } - mutex_unlock(&ted->ted_lcd_lock); return 0; } -int tgt_handle_tag(struct obd_export *exp, __u16 tag) +int tgt_handle_tag(struct ptlrpc_request *req) { - struct tg_export_data *ted = &exp->exp_target_data; - struct lu_target *lut = class_exp2tgt(exp); - struct tg_reply_data *trd, *tmp; - - if (tag == 0) - return 0; - - mutex_lock(&ted->ted_lcd_lock); - list_for_each_entry_safe(trd, tmp, &ted->ted_reply_list, trd_list) { - if (trd->trd_tag != tag) - continue; - ted->ted_release_tag++; - tgt_release_reply_data(lut, ted, trd); - break; - } - mutex_unlock(&ted->ted_lcd_lock); - - return 0; + return tgt_check_lookup_req(req, 0, NULL); }