From fa32b478599bbdfe6d51e41d8715665632d011b3 Mon Sep 17 00:00:00 2001 From: Johann Lombardi Date: Tue, 11 Sep 2012 00:18:01 +0200 Subject: [PATCH] LU-1789 protocol: add support for lightweight connection A lightweight connection has the following uncommon properties: - no entry is added to last_rcvd file, so no recovery is possible - a lightweight connection can be set up while the target is in recovery, therefore it should be used with caution - locks can still be acquired through this connection, although they won't be replayed. We might also consider disabling pings in the future. Signed-off-by: Johann Lombardi Change-Id: I2b2d1a28d0fd4ca278806cab6103f96fbd88a07d Reviewed-on: http://review.whamcloud.com/3925 Reviewed-by: Niu Yawei Tested-by: Hudson Reviewed-by: wangdi Tested-by: Maloo Reviewed-by: Oleg Drokin --- lustre/include/lustre/lustre_idl.h | 8 +++++--- lustre/include/obd_support.h | 1 + lustre/ldlm/ldlm_lib.c | 18 +++++++++++------- lustre/llite/llite_lib.c | 6 ++++++ lustre/mdt/mdt_recovery.c | 34 +++++++++++++++++++++++++++++----- lustre/obdclass/genops.c | 5 +++++ lustre/ofd/ofd_trans.c | 31 ++++++++++++++++++++++++++++--- lustre/osd-ldiskfs/osd_handler.c | 6 ++++-- lustre/ptlrpc/import.c | 9 +++++---- lustre/ptlrpc/target.c | 18 ++++++++++++++---- lustre/tests/recovery-small.sh | 36 ++++++++++++++++++++++++++++++++++++ 11 files changed, 144 insertions(+), 28 deletions(-) diff --git a/lustre/include/lustre/lustre_idl.h b/lustre/include/lustre/lustre_idl.h index 6416808..a41e82a 100644 --- a/lustre/include/lustre/lustre_idl.h +++ b/lustre/include/lustre/lustre_idl.h @@ -1198,8 +1198,9 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); OBD_CONNECT_FID | LRU_RESIZE_CONNECT_FLAG | \ OBD_CONNECT_VBR | OBD_CONNECT_LOV_V3 | \ OBD_CONNECT_SOM | OBD_CONNECT_FULL20 | \ - OBD_CONNECT_64BITHASH | \ - OBD_CONNECT_EINPROGRESS | OBD_CONNECT_JOBSTATS) + OBD_CONNECT_64BITHASH | OBD_CONNECT_JOBSTATS | \ + OBD_CONNECT_EINPROGRESS | \ + 
OBD_CONNECT_LIGHTWEIGHT) #define OST_CONNECT_SUPPORTED (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \ OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \ OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \ @@ -1214,7 +1215,8 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); OBD_CONNECT_GRANT_SHRINK | OBD_CONNECT_FULL20 | \ OBD_CONNECT_64BITHASH | OBD_CONNECT_MAXBYTES | \ OBD_CONNECT_MAX_EASIZE | \ - OBD_CONNECT_EINPROGRESS | OBD_CONNECT_JOBSTATS) + OBD_CONNECT_EINPROGRESS | \ + OBD_CONNECT_JOBSTATS | OBD_CONNECT_LIGHTWEIGHT) #define ECHO_CONNECT_SUPPORTED (0) #define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION | OBD_CONNECT_AT | \ OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | \ diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 13ca2c0..3b4f041 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -395,6 +395,7 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type, #define OBD_FAIL_MDC_OLD_EXT_FLAGS 0x802 #define OBD_FAIL_MDC_GETATTR_ENQUEUE 0x803 #define OBD_FAIL_MDC_RPCS_SEM 0x804 +#define OBD_FAIL_MDC_LIGHTWEIGHT 0x805 #define OBD_FAIL_MGS 0x900 #define OBD_FAIL_MGS_ALL_REQUEST_NET 0x901 diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 2fa220b..d058a45 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -744,7 +744,7 @@ int target_handle_connect(struct ptlrpc_request *req) int rc = 0; char *target_start; int target_len; - int mds_conn = 0; + bool mds_conn = false, lw_client = false; struct obd_connect_data *data, *tmpdata; int size, tmpsize; lnet_nid_t *client_nid = NULL; @@ -870,7 +870,10 @@ int target_handle_connect(struct ptlrpc_request *req) if ((lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_INITIAL) && (data->ocd_connect_flags & OBD_CONNECT_MDS)) - mds_conn = 1; + mds_conn = true; + + if ((data->ocd_connect_flags & OBD_CONNECT_LIGHTWEIGHT) != 0) + lw_client = true; /* lctl gets a backstage, all-access pass. 
*/ if (obd_uuid_equals(&cluuid, &target->obd_uuid)) @@ -978,8 +981,8 @@ no_export: export ? (long)export->exp_last_request_time : 0); /* If this is the first time a client connects, reset the recovery - * timer */ - if (rc == 0 && target->obd_recovering) + * timer. Discard lightweight connections which might be local */ + if (!lw_client && rc == 0 && target->obd_recovering) check_and_start_recovery_timer(target, req, export == NULL); /* We want to handle EALREADY but *not* -EALREADY from @@ -997,7 +1000,8 @@ no_export: client_nid = &req->rq_peer.nid; if (export == NULL) { - if (target->obd_recovering) { + /* allow lightweight connections during recovery */ + if (target->obd_recovering && !lw_client) { cfs_time_t t; int c; /* connected */ int i; /* in progress */ @@ -1128,7 +1132,7 @@ dont_check_exports: &export->exp_nid_hash); } - if (target->obd_recovering && !export->exp_in_recovery) { + if (target->obd_recovering && !export->exp_in_recovery && !lw_client) { int has_transno; __u64 transno = data->ocd_transno; @@ -1168,7 +1172,7 @@ dont_check_exports: } /* Tell the client we're in recovery, when client is involved in it. 
*/ - if (target->obd_recovering) + if (target->obd_recovering && !lw_client) lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECOVERING); tmp = req_capsule_client_get(&req->rq_pill, &RMF_CONN); diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index e05431e..23b04d1 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -227,6 +227,12 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, #ifdef CONFIG_FS_POSIX_ACL data->ocd_connect_flags |= OBD_CONNECT_ACL; #endif + + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_LIGHTWEIGHT)) + /* flag mdc connection as lightweight, only used for test + * purpose, use with care */ + data->ocd_connect_flags |= OBD_CONNECT_LIGHTWEIGHT; + data->ocd_ibits_known = MDS_INODELOCK_FULL; data->ocd_version = LUSTRE_VERSION_CODE; diff --git a/lustre/mdt/mdt_recovery.c b/lustre/mdt/mdt_recovery.c index 6c4b38f..057b840 100644 --- a/lustre/mdt/mdt_recovery.c +++ b/lustre/mdt/mdt_recovery.c @@ -419,14 +419,38 @@ static int mdt_last_rcvd_update(struct mdt_thread_info *mti, lcd->lcd_last_data = mti->mti_opdata; } - if (off <= 0) { - CERROR("client idx %d has offset %lld\n", ted->ted_lr_idx, off); - err = -EINVAL; - } else { + if ((mti->mti_exp->exp_connect_flags & OBD_CONNECT_LIGHTWEIGHT) != 0) { + /* Although lightweight (LW) connections have no slot in + * last_rcvd, we still want to maintain the in-memory + * lsd_client_data structure in order to properly handle reply + * reconstruction. 
*/ + struct lu_target *tg = &mdt->mdt_lut; + bool update = false; + + cfs_mutex_unlock(&ted->ted_lcd_lock); + err = 0; + + /* All operations performed by LW clients are synchronous and + * we store the committed transno in the last_rcvd header */ + cfs_spin_lock(&tg->lut_translock); + if (mti->mti_transno > tg->lut_lsd.lsd_last_transno) { + tg->lut_lsd.lsd_last_transno = mti->mti_transno; + update = true; + } + cfs_spin_unlock(&tg->lut_translock); + + if (update) + err = lut_server_data_write(mti->mti_env, tg, th); + } else if (off <= 0) { + CERROR("%s: client idx %d has offset %lld\n", + mdt2obd_dev(mdt)->obd_name, ted->ted_lr_idx, off); + cfs_mutex_unlock(&ted->ted_lcd_lock); + err = -EINVAL; + } else { err = lut_client_data_write(mti->mti_env, &mdt->mdt_lut, lcd, &off, th); + cfs_mutex_unlock(&ted->ted_lcd_lock); } - cfs_mutex_unlock(&ted->ted_lcd_lock); RETURN(err); } diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index 3d6a145..8b07fb1 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -1295,6 +1295,11 @@ void class_disconnect_stale_exports(struct obd_device *obd, &exp->exp_obd->obd_uuid)) continue; + /* don't evict clients which have no slot in last_rcvd + * (e.g. 
lightweight connection) */ + if (exp->exp_target_data.ted_lr_idx == -1) + continue; + cfs_spin_lock(&exp->exp_lock); if (test_export(exp)) { cfs_spin_unlock(&exp->exp_lock); diff --git a/lustre/ofd/ofd_trans.c b/lustre/ofd/ofd_trans.c index d9f26fe..cd238f3 100644 --- a/lustre/ofd/ofd_trans.c +++ b/lustre/ofd/ofd_trans.c @@ -115,12 +115,16 @@ static int ofd_last_rcvd_update(struct ofd_thread_info *info, __u64 *transno_p; loff_t off; int err; + bool lw_client = false; ENTRY; LASSERT(ofd); LASSERT(info->fti_exp); + if ((info->fti_exp->exp_connect_flags & OBD_CONNECT_LIGHTWEIGHT) != 0) + lw_client = true; + fed = &info->fti_exp->exp_filter_data; LASSERT(fed); lcd = fed->fed_ted.ted_lcd; @@ -134,7 +138,7 @@ static int ofd_last_rcvd_update(struct ofd_thread_info *info, } /* ofd connect may cause transaction before export has last_rcvd * slot */ - if (fed->fed_ted.ted_lr_idx < 0) + if (fed->fed_ted.ted_lr_idx < 0 && !lw_client) RETURN(0); off = fed->fed_ted.ted_lr_off; @@ -157,9 +161,30 @@ static int ofd_last_rcvd_update(struct ofd_thread_info *info, } *transno_p = info->fti_transno; - LASSERT(fed->fed_ted.ted_lr_off > 0); - err = lut_client_data_write(info->fti_env, &ofd->ofd_lut, lcd, + if (lw_client) { + /* Although lightweight (LW) connections have no slot in + * last_rcvd, we still want to maintain the in-memory + * lsd_client_data structure in order to properly handle reply + * reconstruction. 
*/ + struct lu_target *tg =&ofd->ofd_lut; + bool update = false; + + err = 0; + /* All operations performed by LW clients are synchronous and + * we store the committed transno in the last_rcvd header */ + cfs_spin_lock(&tg->lut_translock); + if (info->fti_transno > tg->lut_lsd.lsd_last_transno) { + tg->lut_lsd.lsd_last_transno = info->fti_transno; + update = true; + } + cfs_spin_unlock(&tg->lut_translock); + if (update) + err = lut_server_data_write(info->fti_env, tg, th); + } else { + LASSERT(fed->fed_ted.ted_lr_off > 0); + err = lut_client_data_write(info->fti_env, &ofd->ofd_lut, lcd, &off, th); + } RETURN(err); } diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c index 366454d..05cb0bc 100644 --- a/lustre/osd-ldiskfs/osd_handler.c +++ b/lustre/osd-ldiskfs/osd_handler.c @@ -826,8 +826,6 @@ static int osd_trans_stop(const struct lu_env *env, struct thandle *th) if (oh->ot_handle != NULL) { handle_t *hdl = oh->ot_handle; - hdl->h_sync = th->th_sync; - /* * add commit callback * notice we don't do this in osd_trans_start() @@ -855,6 +853,10 @@ static int osd_trans_stop(const struct lu_env *env, struct thandle *th) rc = dt_txn_hook_stop(env, th); if (rc != 0) CERROR("Failure in transaction hook: %d\n", rc); + + /* hook functions might modify th_sync */ + hdl->h_sync = th->th_sync; + oh->ot_handle = NULL; OSD_CHECK_SLOW_TH(oh, oti->oti_dev, rc = ldiskfs_journal_stop(hdl)); diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index b7e7c84..d93c17a 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -1318,10 +1318,11 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) /* Don't care about MGC eviction */ if (strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_MGC_NAME) != 0) { - LCONSOLE_ERROR_MSG(0x167, "This client was evicted by " - "%.*s; in progress operations using " - "this service will fail.\n", - target_len, target_start); + LCONSOLE_ERROR_MSG(0x167, "%s: This client was evicted " + "by %.*s; in 
progress operations " + "using this service will fail.\n", + imp->imp_obd->obd_name, target_len, + target_start); } CDEBUG(D_HA, "evicted from %s@%s; invalidating\n", obd2cli_tgt(imp->imp_obd), diff --git a/lustre/ptlrpc/target.c b/lustre/ptlrpc/target.c index c4dcb16..d0e99de 100644 --- a/lustre/ptlrpc/target.c +++ b/lustre/ptlrpc/target.c @@ -575,6 +575,11 @@ int lut_last_commit_cb_add(struct thandle *th, struct lu_target *lut, class_export_cb_put(exp); OBD_FREE_PTR(ccb); } + + if ((exp->exp_connect_flags & OBD_CONNECT_LIGHTWEIGHT) != 0) + /* report failure to force synchronous operation */ + return -EPERM; + return rc; } EXPORT_SYMBOL(lut_last_commit_cb_add); @@ -649,6 +654,11 @@ int lut_client_new(const struct lu_env *env, struct obd_export *exp) if (!strcmp(ted->ted_lcd->lcd_uuid, tg->lut_obd->obd_uuid.uuid)) RETURN(0); + cfs_mutex_init(&ted->ted_lcd_lock); + + if ((exp->exp_connect_flags & OBD_CONNECT_LIGHTWEIGHT) != 0) + RETURN(0); + /* the bitmap operations can handle cl_idx > sizeof(long) * 8, so * there's no need for extra complication here */ @@ -673,8 +683,6 @@ repeat: ted->ted_lr_off = tg->lut_lsd.lsd_client_start + idx * tg->lut_lsd.lsd_client_size; - cfs_mutex_init(&ted->ted_lcd_lock); - LASSERTF(ted->ted_lr_off > 0, "ted_lr_off = %llu\n", ted->ted_lr_off); CDEBUG(D_INFO, "%s: new client at index %d (%llu) with UUID '%s'\n", @@ -711,7 +719,8 @@ int lut_client_add(const struct lu_env *env, struct obd_export *exp, int idx) LASSERT(tg->lut_client_bitmap != NULL); LASSERTF(idx >= 0, "%d\n", idx); - if (!strcmp(ted->ted_lcd->lcd_uuid, tg->lut_obd->obd_uuid.uuid)) + if (!strcmp(ted->ted_lcd->lcd_uuid, tg->lut_obd->obd_uuid.uuid) || + (exp->exp_connect_flags & OBD_CONNECT_LIGHTWEIGHT) != 0) RETURN(0); if (cfs_test_and_set_bit(idx, tg->lut_client_bitmap)) { @@ -747,7 +756,8 @@ int lut_client_del(const struct lu_env *env, struct obd_export *exp) /* XXX if lcd_uuid were a real obd_uuid, I could use obd_uuid_equals */ if (!strcmp((char *)ted->ted_lcd->lcd_uuid, 
- (char *)tg->lut_obd->obd_uuid.uuid)) + (char *)tg->lut_obd->obd_uuid.uuid) || + (exp->exp_connect_flags & OBD_CONNECT_LIGHTWEIGHT) != 0) RETURN(0); CDEBUG(D_INFO, "%s: del client at idx %u, off %lld, UUID '%s'\n", diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 02b500a..9936a45 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -1491,6 +1491,42 @@ test_105() } run_test 105 "IR: NON IR clients support" +cleanup_106() { + trap 0 + umount_client $DIR2 +} + +test_106() { # LU-1789 +#define OBD_FAIL_MDC_LIGHTWEIGHT 0x805 + $LCTL set_param fail_loc=0x805 + + trap cleanup_106 EXIT + + # enable lightweight flag on mdc connection + mount_client $DIR2 + + local MDS_NEXP=$(do_facet $SINGLEMDS \ + lctl get_param -n mdt.${mds1_svc}.num_exports | + cut -d' ' -f2) + $LCTL set_param fail_loc=0 + + touch $DIR2/$tfile || error "failed to create empty file" + replay_barrier $SINGLEMDS + facet_failover $SINGLEMDS + + # lightweight connection must be evicted + wait_client_evicted $SINGLEMDS $MDS_NEXP $((TIMEOUT * 3)) || \ + error "lightweight client not evicted by mds" + + # and all operations performed by lightweight client should be + # synchronous, so the file created before mds restart should be there + $CHECKSTAT -t file $DIR/$tfile || error "file not present" + rm -f $DIR/$tfile + + cleanup_106 +} +run_test 106 "lightweight connection support" + complete $(basename $0) $SECONDS check_and_cleanup_lustre exit_status -- 1.8.3.1