From c4ec46c0ca89b9c8ed5dec0763b6d7537363e65f Mon Sep 17 00:00:00 2001 From: bobijam Date: Thu, 14 Feb 2008 03:03:16 +0000 Subject: [PATCH] Branch HEAD b=3462 i=johann i=adilger patch addressing replay-single test53 set port for HEAD. --- lustre/ChangeLog | 8 + lustre/include/obd_support.h | 7 +- lustre/ldlm/ldlm_lib.c | 30 ++-- lustre/mds/handler.c | 15 +- lustre/mds/mds_open.c | 4 +- lustre/mds/mds_reint.c | 20 ++- lustre/mdt/mdt_handler.c | 18 +- lustre/mdt/mdt_open.c | 23 ++- lustre/tests/replay-single.sh | 362 ++++++++++++++++++++++++++++++++++------- lustre/tests/test-framework.sh | 75 +++++---- 10 files changed, 417 insertions(+), 145 deletions(-) diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 0e2d9b1..456edee 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -12,6 +12,14 @@ tbd Sun Microsystems, Inc. * RHEL 4 and RHEL 5/SLES 10 clients behaves differently on 'cd' to a removed cwd "./" (refer to Bugzilla 14399). +Severity : normal +Frequency : very rare +Bugzilla : 3462 +Description: Fix replay if there is an un-replied request and open +Details : In some cases, older replay request will revert the + mcd->mcd_last_xid on MDS which is used to record the client's + latest sent request. + Severity : enhancement Bugzilla : 14720 Description: Update to RHEL5 latest kernel-2.6.18-53.1.6.el5. 
diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 8d86823..ce5ecf0 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -140,6 +140,7 @@ int __obd_fail_check_set(__u32 id, __u32 value, int set); #define OBD_FAIL_MDS_LLOG_CREATE_FAILED 0x13c #define OBD_FAIL_MDS_OSC_PRECREATE 0x13d #define OBD_FAIL_MDS_LOV_SYNC_RACE 0x13e +#define OBD_FAIL_MDS_CLOSE_NET_REP 0x13f #define OBD_FAIL_OST 0x200 #define OBD_FAIL_OST_CONNECT_NET 0x201 @@ -370,7 +371,7 @@ static inline void obd_race(__u32 id) extern atomic_t libcfs_kmemory; -#ifdef LPROCFS +#ifdef LPROCFS #define obd_memory_add(size) \ lprocfs_counter_add(obd_memory, OBD_MEMORY_STAT, (long)(size)) #define obd_memory_sub(size) \ @@ -412,7 +413,7 @@ static inline void obd_memory_sub(long size) obd_alloc -= size; } -static inline void obd_pages_add(int order) +static inline void obd_pages_add(int order) { obd_pages += 1<< order; if (obd_pages > obd_max_pages) @@ -709,7 +710,7 @@ do { \ cfs_mem_cache_free((slab), (ptr)); \ (ptr) = NULL; \ 0; \ -}) +}) #define OBD_SLAB_ALLOC(ptr, slab, type, size) \ do { \ LASSERT(!in_interrupt()); \ diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 923401a..d31b78c 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -475,7 +475,7 @@ int client_disconnect_export(struct obd_export *exp) spin_lock(&imp->imp_lock); imp->imp_deactive = 1; spin_unlock(&imp->imp_lock); - + /* Some non-replayable imports (MDS's OSCs) are pinged, so just * delete it regardless. (It's safe to delete an import that was * never added.) 
*/ @@ -609,7 +609,7 @@ int target_handle_connect(struct ptlrpc_request *req) if (target->obd_no_conn) { LCONSOLE_WARN("%s: temporarily refusing client connection " - "from %s\n", target->obd_name, + "from %s\n", target->obd_name, libcfs_nid2str(req->rq_peer.nid)); GOTO(out, rc = -EAGAIN); } @@ -752,7 +752,7 @@ int target_handle_connect(struct ptlrpc_request *req) } else { OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_RECONNECT, 2 * obd_timeout); if (req->rq_export == NULL && initial_conn) - export->exp_last_request_time = + export->exp_last_request_time = max(export->exp_last_request_time, (time_t)CURRENT_SECONDS); } @@ -770,7 +770,7 @@ int target_handle_connect(struct ptlrpc_request *req) CWARN("%s: connection from %s@%s %st"LPU64" exp %p cur %ld last %ld\n", target->obd_name, cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), target->obd_recovering ? "recovering/" : "", data->ocd_transno, - export, (long)CURRENT_SECONDS, + export, (long)CURRENT_SECONDS, export ? (long)export->exp_last_request_time : 0); @@ -860,7 +860,7 @@ dont_check_exports: if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) { export->exp_libclient = 1; spin_unlock(&export->exp_lock); - + spin_lock(&target->obd_dev_lock); list_del_init(&export->exp_obd_chain_timed); spin_unlock(&target->obd_dev_lock); @@ -877,8 +877,8 @@ dont_check_exports: spin_lock(&target->obd_dev_lock); /* Export might be hashed already, e.g. 
if this is reconnect */ if (hlist_unhashed(&export->exp_nid_hash)) - lustre_hash_additem(export->exp_obd->obd_nid_hash_body, - &export->exp_connection->c_peer.nid, + lustre_hash_additem(export->exp_obd->obd_nid_hash_body, + &export->exp_connection->c_peer.nid, &export->exp_nid_hash); spin_unlock(&target->obd_dev_lock); @@ -887,7 +887,7 @@ dont_check_exports: spin_lock(&export->exp_lock); export->exp_in_recovery = 1; export->exp_req_replay_needed = 1; - export->exp_lock_replay_needed = 1; + export->exp_lock_replay_needed = 1; spin_unlock(&export->exp_lock); if ((lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_TRANSNO) && data->ocd_transno < target->obd_next_recovery_transno) @@ -897,7 +897,7 @@ dont_check_exports: target->obd_recoverable_clients++; atomic_inc(&target->obd_req_replay_clients); atomic_inc(&target->obd_lock_replay_clients); - if (target->obd_connected_clients == + if (target->obd_connected_clients == target->obd_max_recoverable_clients) wake_up(&target->obd_next_transno_waitq); } @@ -907,7 +907,7 @@ dont_check_exports: if (export->exp_imp_reverse != NULL) { /* destroyed import can be still referenced in ctxt */ - obd_set_info_async(export, strlen(KEY_REVIMP_UPD), + obd_set_info_async(export, strlen(KEY_REVIMP_UPD), KEY_REVIMP_UPD, 0, NULL, NULL); /* in some recovery senarios, previous ctx init rpc handled @@ -979,7 +979,7 @@ int target_handle_disconnect(struct ptlrpc_request *req) /* keep the rq_export around so we can send the reply */ req->rq_status = obd_disconnect(class_export_get(req->rq_export)); - + RETURN(0); } @@ -1109,7 +1109,7 @@ static void abort_lock_replay_queue(struct obd_device *obd) list_for_each_entry_safe(req, n, &obd->obd_lock_replay_queue, rq_list){ DEBUG_REQ(D_ERROR, req, "aborted:"); req->rq_status = -ENOTCONN; - if (ptlrpc_error(req)) { + if (ptlrpc_error(req)) { DEBUG_REQ(D_ERROR, req, "failed abort_lock_reply; skipping"); } @@ -1618,7 +1618,7 @@ void target_recovery_init(struct obd_device *obd, svc_handler_t handler) { 
if (obd->obd_max_recoverable_clients == 0) return; - + CWARN("RECOVERY: service %s, %d recoverable clients, " "last_transno "LPU64"\n", obd->obd_name, obd->obd_max_recoverable_clients, obd->obd_last_committed); @@ -1824,13 +1824,13 @@ int target_pack_pool_reply(struct ptlrpc_request *req) { struct ldlm_pool *pl; ENTRY; - + if (!req->rq_export || !exp_connect_lru_resize(req->rq_export)) { lustre_msg_set_slv(req->rq_repmsg, 0); lustre_msg_set_limit(req->rq_repmsg, 0); RETURN(0); } - + pl = ldlm_exp2pl(req->rq_export); spin_lock(&pl->pl_lock); diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index 44ef204..1f52b90 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -377,7 +377,7 @@ int mds_init_export(struct obd_export *exp) INIT_LIST_HEAD(&med->med_open_head); spin_lock_init(&med->med_open_lock); - + spin_lock(&exp->exp_lock); exp->exp_connecting = 1; spin_unlock(&exp->exp_lock); @@ -1706,6 +1706,7 @@ int mds_handle(struct ptlrpc_request *req) if (OBD_FAIL_CHECK(OBD_FAIL_MDS_CLOSE_NET)) RETURN(0); rc = mds_close(req, REQ_REC_OFF); + fail = OBD_FAIL_MDS_CLOSE_NET_REP; break; case MDS_DONE_WRITING: @@ -1988,7 +1989,7 @@ static int mds_setup(struct obd_device *obd, struct lustre_cfg* lcfg) /* We mounted in lustre_fill_super. 
lcfg bufs 1, 2, 4 (device, fstype, mount opts) are ignored.*/ - + lsi = s2lsi(lmi->lmi_sb); fsoptions_to_mds_flags(mds, lsi->lsi_ldd->ldd_mount_opts); fsoptions_to_mds_flags(mds, lsi->lsi_lmd->lmd_opts); @@ -2589,12 +2590,12 @@ static int mdt_setup(struct obd_device *obd, struct lustre_cfg *lcfg) mds_max_threads = mds_min_threads = mds_num_threads; } else { /* Base min threads on memory and cpus */ - mds_min_threads = num_possible_cpus() * num_physpages >> + mds_min_threads = num_possible_cpus() * num_physpages >> (27 - CFS_PAGE_SHIFT); if (mds_min_threads < MDS_THREADS_MIN) mds_min_threads = MDS_THREADS_MIN; /* Largest auto threads start value */ - if (mds_min_threads > 32) + if (mds_min_threads > 32) mds_min_threads = 32; mds_max_threads = min(MDS_THREADS_MAX, mds_min_threads * 4); } @@ -2604,7 +2605,7 @@ static int mdt_setup(struct obd_device *obd, struct lustre_cfg *lcfg) MDS_MAXREPSIZE, MDS_REQUEST_PORTAL, MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_TIMEOUT, mds_handle, LUSTRE_MDS_NAME, - obd->obd_proc_entry, NULL, + obd->obd_proc_entry, NULL, mds_min_threads, mds_max_threads, "ll_mdt", 0); if (!mds->mds_service) { @@ -2638,7 +2639,7 @@ static int mdt_setup(struct obd_device *obd, struct lustre_cfg *lcfg) MDS_MAXREPSIZE, MDS_READPAGE_PORTAL, MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_TIMEOUT, mds_handle, "mds_readpage", - obd->obd_proc_entry, NULL, + obd->obd_proc_entry, NULL, MDS_THREADS_MIN_READPAGE, mds_max_threads, "ll_mdt_rdpg", 0); if (!mds->mds_readpage_service) { @@ -2652,7 +2653,7 @@ static int mdt_setup(struct obd_device *obd, struct lustre_cfg *lcfg) GOTO(err_thread3, rc); ping_evictor_start(); - + RETURN(0); err_thread3: diff --git a/lustre/mds/mds_open.c b/lustre/mds/mds_open.c index bb6607b..420855f 100644 --- a/lustre/mds/mds_open.c +++ b/lustre/mds/mds_open.c @@ -970,14 +970,14 @@ int mds_open(struct mds_update_record *rec, int offset, } /* Step 2: Lookup the child */ - + if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) && (rec->ur_flags & 
MDS_OPEN_LOCK) && (rec->ur_namelen == 1)) { /* hack for nfsd with no_subtree_check, it will use anon * dentry w/o filename to open the file. the anon dentry's * parent was set to itself, so rec->ur_fid1 is the file. * And in MDC it cannot derive the dentry's parent dentry, - * hence the file's name, so we hack here in MDS, + * hence the file's name, so we hack here in MDS, * refer to bug 13030. */ dchild = mds_fid2dentry(mds, rec->ur_fid1, NULL); } else { diff --git a/lustre/mds/mds_reint.c b/lustre/mds/mds_reint.c index eec3581..4dadd10 100644 --- a/lustre/mds/mds_reint.c +++ b/lustre/mds/mds_reint.c @@ -102,7 +102,7 @@ static void mds_cancel_cookies_cb(struct obd_device *obd, __u64 transno, /* Assumes caller has already pushed us into the kernel context. */ int mds_finish_transno(struct mds_obd *mds, struct inode *inode, void *handle, - struct ptlrpc_request *req, int rc, __u32 op_data, + struct ptlrpc_request *req, int rc, __u32 op_data, int force_sync) { struct mds_export_data *med = &req->rq_export->exp_mds_data; @@ -172,10 +172,14 @@ int mds_finish_transno(struct mds_obd *mds, struct inode *inode, void *handle, mcd->mcd_last_close_data = cpu_to_le32(op_data); } else { prev_transno = le64_to_cpu(mcd->mcd_last_transno); - mcd->mcd_last_transno = cpu_to_le64(transno); - mcd->mcd_last_xid = cpu_to_le64(req->rq_xid); - mcd->mcd_last_result = cpu_to_le32(rc); - mcd->mcd_last_data = cpu_to_le32(op_data); + if (((lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_RESENT | MSG_REPLAY)) == 0) || + (transno > prev_transno)) { + mcd->mcd_last_transno = cpu_to_le64(transno); + mcd->mcd_last_xid = cpu_to_le64(req->rq_xid); + mcd->mcd_last_result = cpu_to_le32(rc); + mcd->mcd_last_data = cpu_to_le32(op_data); + } } /* update the server data to not lose the greatest transno. 
Bug 11125 */ if ((transno == 0) && (prev_transno == mds->mds_last_transno)) @@ -188,7 +192,7 @@ int mds_finish_transno(struct mds_obd *mds, struct inode *inode, void *handle, struct obd_export *exp = req->rq_export; if (!force_sync) - force_sync = fsfilt_add_journal_cb(exp->exp_obd,transno, + force_sync = fsfilt_add_journal_cb(exp->exp_obd,transno, handle, mds_commit_cb, NULL); @@ -610,7 +614,7 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset, if (logcookies == NULL) GOTO(cleanup, rc = -ENOMEM); - if (mds_log_op_setattr(obd, inode->i_uid, inode->i_gid, + if (mds_log_op_setattr(obd, inode->i_uid, inode->i_gid, lmm, lmm_size, logcookies, cookie_size) <= 0) { OBD_FREE(logcookies, cookie_size); @@ -1825,7 +1829,7 @@ static int mds_reint_link(struct mds_update_record *rec, int offset, if (rec->ur_dlm) ldlm_request_cancel(req, rec->ur_dlm, 0); - + /* Step 1: Lookup the source inode and target directory by FID */ de_src = mds_fid2dentry(mds, rec->ur_fid1, NULL); if (IS_ERR(de_src)) diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c index 331f970..fdb9f45 100644 --- a/lustre/mdt/mdt_handler.c +++ b/lustre/mdt/mdt_handler.c @@ -348,7 +348,7 @@ void mdt_pack_size2body(struct mdt_thread_info *info, struct mdt_object *o) { struct mdt_body *b; struct lu_attr *attr = &info->mti_attr.ma_attr; - + b = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY); /* Check if Size-on-MDS is enabled. 
*/ @@ -392,7 +392,7 @@ void mdt_pack_attr2body(struct mdt_thread_info *info, struct mdt_body *b, b->ino = fid_oid(fid); /* 1.6 compatibility */ b->generation = fid_ver(fid); /* 1.6 compatibility */ b->valid |= OBD_MD_FLGENER; /* 1.6 compatibility */ - + CDEBUG(D_INODE, DFID": nlink=%d, mode=%o, size="LPU64"\n", PFID(fid), b->nlink, b->mode, b->size); } @@ -778,7 +778,7 @@ static int mdt_getattr_name_lock(struct mdt_thread_info *info, if (name == NULL) RETURN(err_serious(-EFAULT)); - namelen = req_capsule_get_size(info->mti_pill, &RMF_NAME, + namelen = req_capsule_get_size(info->mti_pill, &RMF_NAME, RCL_CLIENT) - 1; LASSERT(namelen >= 0); @@ -1489,6 +1489,12 @@ static int mdt_reint(struct mdt_thread_info *info) ENTRY; + if (OBD_FAIL_CHECK_RESET(OBD_FAIL_MDS_REINT_NET, + OBD_FAIL_MDS_REINT_NET)) { + info->mti_fail_id = OBD_FAIL_MDS_REINT_NET; + RETURN(0); + } + opc = mdt_reint_opcode(info, reint_fmts); if (opc >= 0) { /* @@ -3659,7 +3665,7 @@ static void mdt_fini(const struct lu_env *env, struct mdt_device *m) ENTRY; ping_evictor_stop(); - + target_recovery_fini(obd); mdt_stop_ptlrpc_service(m); @@ -4546,11 +4552,11 @@ int mdt_postrecov(const struct lu_env *env, struct mdt_device *mdt) struct obd_device *obd = mdt->mdt_md_dev.md_lu_dev.ld_obd; int rc, lost; ENTRY; - /* if some clients didn't participate in recovery then we can possibly + /* if some clients didn't participate in recovery then we can possibly * lost sequence. 
Now we should increase sequence for safe value */ lost = obd->obd_max_recoverable_clients - obd->obd_connected_clients; mdt_seq_adjust(env, mdt, lost); - + rc = ld->ld_ops->ldo_recovery_complete(env, ld); RETURN(rc); } diff --git a/lustre/mdt/mdt_open.c b/lustre/mdt/mdt_open.c index 3c992d9..49626c7 100644 --- a/lustre/mdt/mdt_open.c +++ b/lustre/mdt/mdt_open.c @@ -125,7 +125,7 @@ void mdt_sizeonmds_enable(struct mdt_thread_info *info, { spin_lock(&info->mti_mdt->mdt_ioepoch_lock); if (info->mti_epoch->ioepoch == mo->mot_ioepoch) { - LASSERT(!mdt_epoch_opened(mo)); + LASSERT(!mdt_epoch_opened(mo)); mo->mot_ioepoch = 0; mo->mot_flags = 0; } @@ -141,7 +141,7 @@ int mdt_epoch_open(struct mdt_thread_info *info, struct mdt_object *o) int rc = 0; ENTRY; - if (!(mdt_conn_flags(info) & OBD_CONNECT_SOM) || + if (!(mdt_conn_flags(info) & OBD_CONNECT_SOM) || !S_ISREG(lu_object_attr(&o->mot_obj.mo_lu))) RETURN(0); @@ -451,7 +451,7 @@ static int mdt_mfd_open(struct mdt_thread_info *info, struct mdt_object *p, /* Check wheather old cookie already exist in * the list, becasue when do recovery, client * might be disconnected from server, and - * restart replay, so there maybe some orphan + * restart replay, so there maybe some orphan * mfd here, we should remove them */ LASSERT(info->mti_rr.rr_handle != NULL); old_mfd = mdt_handle2mfd(info, info->mti_rr.rr_handle); @@ -1087,8 +1087,8 @@ int mdt_mfd_close(struct mdt_thread_info *info, struct mdt_file_data *mfd) mdt_write_allow(info->mti_mdt, o); } else if (mode & FMODE_EPOCH) { ret = mdt_epoch_close(info, o); - } - + } + /* Update atime on close only. 
*/ if ((mode & MDS_FMODE_EXEC || mode & FMODE_READ || mode & FMODE_WRITE) && (ma->ma_valid & MA_INODE) && (ma->ma_attr.la_valid & LA_ATIME)) { @@ -1151,6 +1151,12 @@ int mdt_close(struct mdt_thread_info *info) int rc, ret = 0; ENTRY; + if (OBD_FAIL_CHECK_RESET(OBD_FAIL_MDS_CLOSE_NET, + OBD_FAIL_MDS_CLOSE_NET)) { + info->mti_fail_id = OBD_FAIL_MDS_CLOSE_NET; + RETURN(0); + } + /* Close may come with the Size-on-MDS update. Unpack it. */ rc = mdt_close_unpack(info); if (rc) @@ -1215,6 +1221,9 @@ int mdt_close(struct mdt_thread_info *info) if (OBD_FAIL_CHECK(OBD_FAIL_MDS_CLOSE_PACK)) RETURN(err_serious(-ENOMEM)); + if (OBD_FAIL_CHECK_RESET(OBD_FAIL_MDS_CLOSE_NET_REP, + OBD_FAIL_MDS_CLOSE_NET_REP)) + info->mti_fail_id = OBD_FAIL_MDS_CLOSE_NET_REP; RETURN(rc ? rc : ret); } @@ -1252,8 +1261,8 @@ int mdt_done_writing(struct mdt_thread_info *info) ": cookie = "LPX64"\n", PFID(info->mti_rr.rr_fid1), info->mti_epoch->handle.cookie); RETURN(-ESTALE); - } - + } + LASSERT(mfd->mfd_mode == FMODE_EPOCH || mfd->mfd_mode == FMODE_EPOCHLCK); class_handle_unhash(&mfd->mfd_handle); diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index afae2aa3..40bc1d2 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -42,7 +42,7 @@ test_0() { run_test 0 "empty replay" test_0b() { - # this test attempts to trigger a race in the precreation code, + # this test attempts to trigger a race in the precreation code, # and must run before any other objects are created on the filesystem fail ost1 createmany -o $DIR/$tfile 20 || return 1 @@ -54,14 +54,14 @@ seq_set_width() { local mds=$1 local width=$2 - local file=`ls /proc/fs/lustre/seq/cli-srv-$mds-mdc-*/width` + local file=`ls /proc/fs/lustre/seq/cli-srv-$mds-mdc-*/width` echo $width > $file } seq_get_width() { local mds=$1 - local file=`ls /proc/fs/lustre/seq/cli-srv-$mds-mdc-*/width` + local file=`ls /proc/fs/lustre/seq/cli-srv-$mds-mdc-*/width` cat $file } @@ -73,15 +73,15 @@ seq_get_width() # 
(1) fld_create replay should happen; # # (2) fld_create replay should not return -EEXISTS, if it does -# this means sequence manager recovery code is buggy and allocated +# this means sequence manager recovery code is buggy and allocated # same sequence two times after recovery. # # multi-mds # --------- -# (1) fld_create replay may not happen, because its home MDS is +# (1) fld_create replay may not happen, because its home MDS is # MDS2 which is not involved to revovery; # -# (2) as fld_create does not happen on MDS1, it does not make any +# (2) as fld_create does not happen on MDS1, it does not make any # problem. test_0c() { local label=`mdsdevlabel 1` @@ -89,30 +89,30 @@ test_0c() { replay_barrier $SINGLEMDS local sw=`seq_get_width $label` - - # make seq manager switch to next sequence each + + # make seq manager switch to next sequence each # time as new fid is needed. seq_set_width $label 1 - - # make sure that fld has created at least one new + + # make sure that fld has created at least one new # entry on server touch $DIR/$tfile || return 2 seq_set_width $label $sw - + # fail $SINGLEMDS and start recovery, replay RPCs, etc. 
fail $SINGLEMDS - + # wait for recovery finish sleep 10 df $MOUNT - - # flush fld cache and dentry cache to make it lookup + + # flush fld cache and dentry cache to make it lookup # created entry instead of revalidating existent one umount $MOUNT zconf_mount `hostname` $MOUNT - - # issue lookup which should call fld lookup which - # should fail if client did not replay fld create + + # issue lookup which should call fld lookup which + # should fail if client did not replay fld create # correctly and server has no fld entry touch $DIR/$tfile || return 3 rm $DIR/$tfile || return 4 @@ -186,11 +186,11 @@ test_4() { replay_barrier $SINGLEMDS for i in `seq 10`; do echo "tag-$i" > $DIR/$tfile-$i - done + done fail $SINGLEMDS for i in `seq 10`; do grep -q "tag-$i" $DIR/$tfile-$i || error "$tfile-$i" - done + done } run_test 4 "|x| 10 open(O_CREAT)s" @@ -202,17 +202,17 @@ test_4b() { } run_test 4b "|x| rm 10 files" -# The idea is to get past the first block of precreated files on both +# The idea is to get past the first block of precreated files on both # osts, and then replay. test_5() { replay_barrier $SINGLEMDS for i in `seq 220`; do echo "tag-$i" > $DIR/$tfile-$i - done + done fail $SINGLEMDS for i in `seq 220`; do grep -q "tag-$i" $DIR/$tfile-$i || error "f1c-$i" - done + done rm -rf $DIR/$tfile-* sleep 3 # waiting for commitment of removal @@ -235,7 +235,7 @@ test_6b() { replay_barrier $SINGLEMDS rm -rf $DIR/$tdir fail $SINGLEMDS - $CHECKSTAT -t dir $DIR/$tdir && return 1 || true + $CHECKSTAT -t dir $DIR/$tdir && return 1 || true } run_test 6b "|X| rmdir" @@ -303,7 +303,7 @@ test_11() { mv $DIR/$tfile $DIR/$tfile-2 replay_barrier $SINGLEMDS echo "new" > $DIR/$tfile - grep new $DIR/$tfile + grep new $DIR/$tfile grep old $DIR/$tfile-2 fail $SINGLEMDS grep new $DIR/$tfile || return 1 @@ -312,7 +312,7 @@ test_11() { run_test 11 "create open write rename |X| create-old-name read" test_12() { - mcreate $DIR/$tfile + mcreate $DIR/$tfile multiop $DIR/$tfile o_tSc & pid=$! 
# give multiop a chance to open @@ -330,13 +330,13 @@ run_test 12 "open, unlink |X| close" # 1777 - replay open after committed chmod that would make -# a regular open a failure +# a regular open a failure test_13() { - mcreate $DIR/$tfile + mcreate $DIR/$tfile multiop $DIR/$tfile O_wc & pid=$! # give multiop a chance to open - sleep 1 + sleep 1 chmod 0 $DIR/$tfile $CHECKSTAT -p 0 $DIR/$tfile replay_barrier $SINGLEMDS @@ -353,7 +353,7 @@ test_14() { multiop $DIR/$tfile O_tSc & pid=$! # give multiop a chance to open - sleep 1 + sleep 1 rm -f $DIR/$tfile replay_barrier $SINGLEMDS kill -USR1 $pid || return 1 @@ -369,7 +369,7 @@ test_15() { multiop $DIR/$tfile O_tSc & pid=$! # give multiop a chance to open - sleep 1 + sleep 1 rm -f $DIR/$tfile replay_barrier $SINGLEMDS touch $DIR/g11 || return 1 @@ -401,7 +401,7 @@ test_17() { multiop $DIR/$tfile O_c & pid=$! # give multiop a chance to open - sleep 1 + sleep 1 fail $SINGLEMDS kill -USR1 $pid || return 1 wait $pid || return 2 @@ -415,7 +415,7 @@ test_18() { multiop $DIR/$tfile O_tSc & pid=$! # give multiop a chance to open - sleep 1 + sleep 1 rm -f $DIR/$tfile touch $DIR/$tfile-2 || return 1 echo "pid: $pid will close" @@ -450,7 +450,7 @@ test_20() { multiop $DIR/$tfile O_tSc & pid=$! # give multiop a chance to open - sleep 1 + sleep 1 rm -f $DIR/$tfile fail $SINGLEMDS @@ -479,7 +479,7 @@ test_20b() { # bug 10480 df -P $DIR || df -P $DIR || true # reconnect wait_mds_recovery_done || error "MDS recovery not done" - # FIXME just because recovery is done doesn't mean we've finished + # FIXME just because recovery is done doesn't mean we've finished # orphan cleanup. Fake it with a sleep for now... sleep 10 AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'` @@ -514,7 +514,7 @@ test_21() { multiop $DIR/$tfile O_tSc & pid=$! # give multiop a chance to open - sleep 1 + sleep 1 rm -f $DIR/$tfile touch $DIR/g11 || return 1 @@ -531,7 +531,7 @@ test_22() { multiop $DIR/$tfile O_tSc & pid=$! 
# give multiop a chance to open - sleep 1 + sleep 1 replay_barrier $SINGLEMDS rm -f $DIR/$tfile @@ -548,7 +548,7 @@ test_23() { multiop $DIR/$tfile O_tSc & pid=$! # give multiop a chance to open - sleep 1 + sleep 1 replay_barrier $SINGLEMDS rm -f $DIR/$tfile @@ -567,7 +567,7 @@ test_24() { multiop $DIR/$tfile O_tSc & pid=$! # give multiop a chance to open - sleep 1 + sleep 1 replay_barrier $SINGLEMDS fail $SINGLEMDS @@ -583,7 +583,7 @@ test_25() { multiop $DIR/$tfile O_tSc & pid=$! # give multiop a chance to open - sleep 1 + sleep 1 rm -f $DIR/$tfile replay_barrier $SINGLEMDS @@ -602,7 +602,7 @@ test_26() { multiop $DIR/$tfile-2 O_tSc & pid2=$! # give multiop a chance to open - sleep 1 + sleep 1 rm -f $DIR/$tfile-1 rm -f $DIR/$tfile-2 kill -USR1 $pid2 @@ -624,7 +624,7 @@ test_27() { multiop $DIR/$tfile-2 O_tSc & pid2=$! # give multiop a chance to open - sleep 1 + sleep 1 rm -f $DIR/$tfile-1 rm -f $DIR/$tfile-2 @@ -645,7 +645,7 @@ test_28() { multiop $DIR/$tfile-2 O_tSc & pid2=$! # give multiop a chance to open - sleep 1 + sleep 1 replay_barrier $SINGLEMDS rm -f $DIR/$tfile-1 rm -f $DIR/$tfile-2 @@ -667,7 +667,7 @@ test_29() { multiop $DIR/$tfile-2 O_tSc & pid2=$! # give multiop a chance to open - sleep 1 + sleep 1 replay_barrier $SINGLEMDS rm -f $DIR/$tfile-1 rm -f $DIR/$tfile-2 @@ -689,7 +689,7 @@ test_30() { multiop $DIR/$tfile-2 O_tSc & pid2=$! # give multiop a chance to open - sleep 1 + sleep 1 rm -f $DIR/$tfile-1 rm -f $DIR/$tfile-2 @@ -711,7 +711,7 @@ test_31() { multiop $DIR/$tfile-2 O_tSc & pid2=$! 
# give multiop a chance to open - sleep 1 + sleep 1 rm -f $DIR/$tfile-1 replay_barrier $SINGLEMDS @@ -751,13 +751,13 @@ test_33() { createmany -o $DIR/$tfile-%d 100 fail_abort $SINGLEMDS # this file should be gone, because the replay was aborted - $CHECKSTAT -t file $DIR/$tfile-* && return 3 + $CHECKSTAT -t file $DIR/$tfile-* && return 3 unlinkmany $DIR/$tfile-%d 0 100 return 0 } run_test 33 "abort recovery before client does replay" -# Stale FID sequence +# Stale FID sequence test_33a() { replay_barrier $SINGLEMDS createmany -o $DIR/$tfile-%d 10 @@ -774,7 +774,7 @@ test_34() { multiop $DIR/$tfile O_c & pid=$! # give multiop a chance to open - sleep 1 + sleep 1 rm -f $DIR/$tfile replay_barrier $SINGLEMDS @@ -786,7 +786,7 @@ test_34() { } run_test 34 "abort recovery before client does replay (test mds_cleanup_orphans)" -# bug 2278 - generate one orphan on OST, then destroy it during recovery from llog +# bug 2278 - generate one orphan on OST, then destroy it during recovery from llog test_35() { touch $DIR/$tfile @@ -810,7 +810,7 @@ test_36() { checkstat $DIR/$tfile facet_failover $SINGLEMDS cancel_lru_locks mdc - if dmesg | grep "unknown lock cookie"; then + if dmesg | grep "unknown lock cookie"; then echo "cancel after replay failed" return 1 fi @@ -824,7 +824,7 @@ test_37() { multiop $DIR/$tfile dD_c & pid=$! # give multiop a chance to open - sleep 1 + sleep 1 rmdir $DIR/$tfile replay_barrier $SINGLEMDS @@ -866,7 +866,7 @@ count_ost_writes() { #b=2477,2532 test_40(){ - $LCTL mark multiop $MOUNT/$tfile OS_c + $LCTL mark multiop $MOUNT/$tfile OS_c multiop $MOUNT/$tfile OS_c & PID=$! 
writeme -s $MOUNT/${tfile}-2 & @@ -880,7 +880,7 @@ test_40(){ sleep $TIMEOUT stat2=`count_ost_writes` echo "$stat1, $stat2" - if [ $stat1 -lt $stat2 ]; then + if [ $stat1 -lt $stat2 ]; then echo "writes continuing during recovery" RC=0 else @@ -889,7 +889,7 @@ test_40(){ fi echo "waiting for writeme $WRITE_PID" kill $WRITE_PID - wait $WRITE_PID + wait $WRITE_PID echo "waiting for multiop $PID" wait $PID || return 2 @@ -914,7 +914,7 @@ test_41() { local f=$MOUNT/$tfile # make sure the start of the file is ost1 - lfs setstripe $f -s $((128 * 1024)) -i 0 + lfs setstripe $f -s $((128 * 1024)) -i 0 do_facet client dd if=/dev/zero of=$f bs=4k count=1 || return 3 cancel_lru_locks osc # fail ost2 and read from ost1 @@ -936,7 +936,7 @@ test_42() { debugsave sysctl -w lnet.debug=-1 facet_failover ost1 - + # osc is evicted, fs is smaller (but only with failout OSTs (bug 7287) #blocks_after=`df -P $MOUNT | tail -n 1 | awk '{ print $2 }'` #[ $blocks_after -lt $blocks ] || return 1 @@ -1000,7 +1000,7 @@ test_45() { pid=$! sleep 1 - # This will cause the CLOSE to fail before even + # This will cause the CLOSE to fail before even # allocating a reply buffer $LCTL --device $mdcdev deactivate || return 4 @@ -1028,7 +1028,7 @@ test_46() { run_test 46 "Don't leak file handle after open resend (3325)" test_47() { # bug 2824 - # create some files to make sure precreate has been done on all + # create some files to make sure precreate has been done on all # OSTs. (just in case this test is run independently) createmany -o $DIR/$tfile 20 || return 1 @@ -1038,10 +1038,10 @@ test_47() { # bug 2824 df $MOUNT || return 2 # let the MDS discover the OST failure, attempt to recover, fail - # and recover again. + # and recover again. 
sleep $((3 * TIMEOUT)) - # Without 2824, this createmany would hang + # Without 2824, this createmany would hang createmany -o $DIR/$tfile 20 || return 3 unlinkmany $DIR/$tfile 20 || return 4 @@ -1092,7 +1092,245 @@ test_52() { } run_test 52 "time out lock replay (3764)" -#b_cray 53 "|X| open request and close reply while two MDC requests in flight" +# bug 3462 - simultaneous MDC requests +test_53a() { + mkdir -p $DIR/${tdir}-1 + mkdir -p $DIR/${tdir}-2 + multiop $DIR/${tdir}-1/f O_c & + close_pid=$! + # give multiop a chance to open + sleep 1 + + #define OBD_FAIL_MDS_CLOSE_NET 0x115 + do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x80000115" + kill -USR1 $close_pid + cancel_lru_locks mdc # force the close + do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0" + + mcreate $DIR/${tdir}-2/f || return 1 + + # close should still be here + [ -d /proc/$close_pid ] || return 2 + + replay_barrier_nodf $SINGLEMDS + fail $SINGLEMDS + wait $close_pid || return 3 + + $CHECKSTAT -t file $DIR/${tdir}-1/f || return 4 + $CHECKSTAT -t file $DIR/${tdir}-2/f || return 5 + rm -rf $DIR/${tdir}-* +} +run_test 53a "|X| close request while two MDC requests in flight" + +test_53b() { + mkdir -p $DIR/${tdir}-1 + mkdir -p $DIR/${tdir}-2 + multiop $DIR/${tdir}-1/f O_c & + close_pid=$! + + #define OBD_FAIL_MDS_REINT_NET 0x107 + do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x80000107" + mcreate $DIR/${tdir}-2/f & + open_pid=$! 
+ sleep 1 + + do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0" + kill -USR1 $close_pid + cancel_lru_locks mdc # force the close + wait $close_pid || return 1 + # open should still be here + [ -d /proc/$open_pid ] || return 2 + + replay_barrier_nodf $SINGLEMDS + fail $SINGLEMDS + wait $open_pid || return 3 + + $CHECKSTAT -t file $DIR/${tdir}-1/f || return 4 + $CHECKSTAT -t file $DIR/${tdir}-2/f || return 5 + rm -rf $DIR/${tdir}-* +} +run_test 53b "|X| open request while two MDC requests in flight" + +test_53c() { + mkdir -p $DIR/${tdir}-1 + mkdir -p $DIR/${tdir}-2 + multiop $DIR/${tdir}-1/f O_c & + close_pid=$! + + #define OBD_FAIL_MDS_REINT_NET 0x107 + do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x80000107" + mcreate $DIR/${tdir}-2/f & + open_pid=$! + sleep 1 + + #define OBD_FAIL_MDS_CLOSE_NET 0x115 + do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x80000115" + kill -USR1 $close_pid + cancel_lru_locks mdc # force the close + + replay_barrier_nodf $SINGLEMDS + fail_nodf $SINGLEMDS + wait $open_pid || return 1 + sleep 2 + # close should be gone + [ -d /proc/$close_pid ] && return 2 + do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0" + + $CHECKSTAT -t file $DIR/${tdir}-1/f || return 3 + $CHECKSTAT -t file $DIR/${tdir}-2/f || return 4 + rm -rf $DIR/${tdir}-* +} +run_test 53c "|X| open request and close request while two MDC requests in flight" + +test_53d() { + mkdir -p $DIR/${tdir}-1 + mkdir -p $DIR/${tdir}-2 + multiop $DIR/${tdir}-1/f O_c & + close_pid=$! 
+ # give multiop a chance to open + sleep 1 + + #define OBD_FAIL_MDS_CLOSE_NET_REP 0x13f + do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x8000013f" + kill -USR1 $close_pid + cancel_lru_locks mdc # force the close + do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0" + mcreate $DIR/${tdir}-2/f || return 1 + + # close should still be here + [ -d /proc/$close_pid ] || return 2 + fail $SINGLEMDS + wait $close_pid || return 3 + + $CHECKSTAT -t file $DIR/${tdir}-1/f || return 4 + $CHECKSTAT -t file $DIR/${tdir}-2/f || return 5 + rm -rf $DIR/${tdir}-* +} +run_test 53d "|X| close reply while two MDC requests in flight" + +test_53e() { + mkdir -p $DIR/${tdir}-1 + mkdir -p $DIR/${tdir}-2 + multiop $DIR/${tdir}-1/f O_c & + close_pid=$! + + #define OBD_FAIL_MDS_REINT_NET_REP 0x119 + do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x119" + mcreate $DIR/${tdir}-2/f & + open_pid=$! + sleep 1 + + do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0" + kill -USR1 $close_pid + cancel_lru_locks mdc # force the close + wait $close_pid || return 1 + # open should still be here + [ -d /proc/$open_pid ] || return 2 + + replay_barrier_nodf $SINGLEMDS + fail $SINGLEMDS + wait $open_pid || return 3 + + $CHECKSTAT -t file $DIR/${tdir}-1/f || return 4 + $CHECKSTAT -t file $DIR/${tdir}-2/f || return 5 + rm -rf $DIR/${tdir}-* +} +run_test 53e "|X| open reply while two MDC requests in flight" + +test_53f() { + mkdir -p $DIR/${tdir}-1 + mkdir -p $DIR/${tdir}-2 + multiop $DIR/${tdir}-1/f O_c & + close_pid=$! + + #define OBD_FAIL_MDS_REINT_NET_REP 0x119 + do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x119" + mcreate $DIR/${tdir}-2/f & + open_pid=$! 
+ sleep 1 + + #define OBD_FAIL_MDS_CLOSE_NET_REP 0x13f + do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x8000013f" + kill -USR1 $close_pid + cancel_lru_locks mdc # force the close + + replay_barrier_nodf $SINGLEMDS + fail_nodf $SINGLEMDS + wait $open_pid || return 1 + sleep 2 + # close should be gone + [ -d /proc/$close_pid ] && return 2 + do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0" + + $CHECKSTAT -t file $DIR/${tdir}-1/f || return 3 + $CHECKSTAT -t file $DIR/${tdir}-2/f || return 4 + rm -rf $DIR/${tdir}-* +} +run_test 53f "|X| open reply and close reply while two MDC requests in flight" + +test_53g() { + mkdir -p $DIR/${tdir}-1 + mkdir -p $DIR/${tdir}-2 + multiop $DIR/${tdir}-1/f O_c & + close_pid=$! + + #define OBD_FAIL_MDS_REINT_NET_REP 0x119 + do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x119" + mcreate $DIR/${tdir}-2/f & + open_pid=$! + sleep 1 + + #define OBD_FAIL_MDS_CLOSE_NET 0x115 + do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x80000115" + kill -USR1 $close_pid + cancel_lru_locks mdc # force the close + + do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0" + replay_barrier_nodf $SINGLEMDS + fail_nodf $SINGLEMDS + wait $open_pid || return 1 + sleep 2 + # close should be gone + [ -d /proc/$close_pid ] && return 2 + + $CHECKSTAT -t file $DIR/${tdir}-1/f || return 3 + $CHECKSTAT -t file $DIR/${tdir}-2/f || return 4 + rm -rf $DIR/${tdir}-* +} +run_test 53g "|X| drop open reply and close request while close and open are both in flight" + +test_53h() { + mkdir -p $DIR/${tdir}-1 + mkdir -p $DIR/${tdir}-2 + multiop $DIR/${tdir}-1/f O_c & + close_pid=$! + + #define OBD_FAIL_MDS_REINT_NET 0x107 + do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x80000107" + mcreate $DIR/${tdir}-2/f & + open_pid=$! 
+ sleep 1 + + #define OBD_FAIL_MDS_CLOSE_NET_REP 0x13f + do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x8000013f" + kill -USR1 $close_pid + cancel_lru_locks mdc # force the close + sleep 1 + + replay_barrier_nodf $SINGLEMDS + fail_nodf $SINGLEMDS + wait $open_pid || return 1 + sleep 2 + # close should be gone + [ -d /proc/$close_pid ] && return 2 + do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0" + + $CHECKSTAT -t file $DIR/${tdir}-1/f || return 3 + $CHECKSTAT -t file $DIR/${tdir}-2/f || return 4 + rm -rf $DIR/${tdir}-* +} +run_test 53h "|X| open request and close reply while two MDC requests in flight" + #b_cray 54 "|X| open request and close reply while two MDC requests in flight" #b3761 ASSERTION(hash != 0) failed @@ -1172,7 +1410,7 @@ test_60() { fail $SINGLEMDS unlinkmany $DIR/$tdir/$tfile-%d 100 100 local no_ctxt=`dmesg | grep "No ctxt"` - [ -z "$no_ctxt" ] || error "ctxt is not initialized in recovery" + [ -z "$no_ctxt" ] || error "ctxt is not initialized in recovery" } run_test 60 "test llog post recovery init vs llog unlink" diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index c657e32..c3d8dfe 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -47,7 +47,7 @@ print_summary () { local o=$(echo $O | tr "[:upper:]" "[:lower:]") o=${o//_/-} o=${o//tyn/tyN} - local log=${TMP}/${o}.log + local log=${TMP}/${o}.log [ -f $log ] && skipped=$(grep excluded $log | awk '{ printf " %s", $3 }' | sed 's/test_//g') [ -f $log ] && slow=$(grep SLOW $log | awk '{ printf " %s", $3 }' | sed 's/test_//g') [ "${!O}" = "done" ] && \ @@ -79,9 +79,9 @@ init_test_env() { export PATH=:$PATH:$LUSTRE/utils:$LUSTRE/utils/gss:$LUSTRE/tests export LCTL=${LCTL:-"$LUSTRE/utils/lctl"} - [ ! -f "$LCTL" ] && export LCTL=$(which lctl) + [ ! -f "$LCTL" ] && export LCTL=$(which lctl) export LFS=${LFS:-"$LUSTRE/utils/lfs"} - [ ! -f "$LFS" ] && export LFS=$(which lfs) + [ ! 
-f "$LFS" ] && export LFS=$(which lfs) export L_GETIDENTITY=${L_GETIDENTITY:-"$LUSTRE/utils/l_getidentity"} if [ ! -f "$L_GETIDENTITY" ]; then if `which l_getidentity > /dev/null 2>&1`; then @@ -91,9 +91,9 @@ init_test_env() { fi fi export MKFS=${MKFS:-"$LUSTRE/utils/mkfs.lustre"} - [ ! -f "$MKFS" ] && export MKFS=$(which mkfs.lustre) + [ ! -f "$MKFS" ] && export MKFS=$(which mkfs.lustre) export TUNEFS=${TUNEFS:-"$LUSTRE/utils/tunefs.lustre"} - [ ! -f "$TUNEFS" ] && export TUNEFS=$(which tunefs.lustre) + [ ! -f "$TUNEFS" ] && export TUNEFS=$(which tunefs.lustre) export CHECKSTAT="${CHECKSTAT:-"checkstat -v"} " export FSYTPE=${FSTYPE:-"ldiskfs"} export NAME=${NAME:-local} @@ -102,7 +102,7 @@ init_test_env() { [ "$GSS_PIPEFS" = "true" ] && [ ! -f "$LGSSD" ] && \ export LGSSD=$(which lgssd) export LSVCGSSD=${LSVCGSSD:-"$LUSTRE/utils/gss/lsvcgssd"} - [ ! -f "$LSVCGSSD" ] && export LSVCGSSD=$(which lsvcgssd) + [ ! -f "$LSVCGSSD" ] && export LSVCGSSD=$(which lsvcgssd) export KRB5DIR=${KRB5DIR:-"/usr/kerberos"} export DIR2 @@ -127,14 +127,14 @@ init_test_env() { ;; esac - # Paths on remote nodes, if different + # Paths on remote nodes, if different export RLUSTRE=${RLUSTRE:-$LUSTRE} export RPWD=${RPWD:-$PWD} export I_MOUNTED=${I_MOUNTED:-"no"} # command line - - while getopts "rvf:" opt $*; do + + while getopts "rvf:" opt $*; do case $opt in f) CONFIG=$OPTARG;; r) REFORMAT=--reformat;; @@ -270,7 +270,7 @@ unload_modules() { MODULES=$($LCTL modules | awk '{ print $2 }') if [ -n "$MODULES" ]; then echo "Modules still loaded: " - echo $MODULES + echo $MODULES if [ -e $LPROC ]; then echo "Lustre still loaded" cat $LPROC/devices || true @@ -391,7 +391,7 @@ ostdevlabel() { } # Facet functions -# start facet device options +# start facet device options start() { facet=$1 shift @@ -399,12 +399,12 @@ start() { shift echo "Starting ${facet}: $@ ${device} ${MOUNT%/*}/${facet}" do_facet ${facet} mkdir -p ${MOUNT%/*}/${facet} - do_facet ${facet} mount -t lustre $@ ${device} 
${MOUNT%/*}/${facet} + do_facet ${facet} mount -t lustre $@ ${device} ${MOUNT%/*}/${facet} RC=${PIPESTATUS[0]} if [ $RC -ne 0 ]; then - echo mount -t lustre $@ ${device} ${MOUNT%/*}/${facet} + echo mount -t lustre $@ ${device} ${MOUNT%/*}/${facet} echo Start of ${device} on ${facet} failed ${RC} - else + else do_facet ${facet} "sysctl -w lnet.debug=$PTLDEBUG; \ sysctl -w lnet.subsystem_debug=${SUBSYSTEM# }; \ sysctl -w lnet.debug_mb=${DEBUG_SIZE}" @@ -452,7 +452,7 @@ zconf_mount() { exit 1 fi - echo "Starting client: $OPTIONS $device $mnt" + echo "Starting client: $OPTIONS $device $mnt" do_node $client mkdir -p $mnt do_node $client mount -t lustre $OPTIONS $device $mnt || return 1 @@ -479,7 +479,7 @@ shutdown_facet() { facet=$1 if [ "$FAILURE_MODE" = HARD ]; then $POWER_DOWN `facet_active_host $facet` - sleep 2 + sleep 2 elif [ "$FAILURE_MODE" = SOFT ]; then stop $facet fi @@ -570,7 +570,7 @@ wait_mds_recovery_done () { echo "Waiting $(($MAX - $WAIT)) secs for MDS recovery done" done echo "MDS recovery not done in $MAX sec" - return 1 + return 1 } wait_exit_ST () { @@ -668,6 +668,11 @@ fail() { df $MOUNT || error "post-failover df: $?" 
} +fail_nodf() { + local facet=$1 + facet_failover $facet +} + fail_abort() { local facet=$1 stop $facet @@ -696,7 +701,7 @@ h2gm () { h2name_or_ip() { if [ "$1" = "client" -o "$1" = "'*'" ]; then echo \'*\'; else - echo $1"@$2" + echo $1"@$2" fi } @@ -759,7 +764,7 @@ facet_active() { fi active=${!activevar} - if [ -z "$active" ] ; then + if [ -z "$active" ] ; then echo -n ${facet} else echo -n ${active} @@ -778,7 +783,7 @@ facet_active_host() { change_active() { local facet=$1 - failover=${facet}failover + failover=${facet}failover host=`facet_host $failover` [ -z "$host" ] && return curactive=`facet_active $facet` @@ -853,8 +858,8 @@ stopall() { if [ $activemds != "mds1" ]; then fail mds1 fi - - # assume client mount is local + + # assume client mount is local grep " $MOUNT " /proc/mounts && zconf_umount $HOSTNAME $MOUNT $* grep " $MOUNT2 " /proc/mounts && zconf_umount $HOSTNAME $MOUNT2 $* [ "$CLIENTONLY" ] && return @@ -993,7 +998,7 @@ cleanup_and_setup_lustre() { if [ "$ONLY" == "cleanup" -o "`mount | grep $MOUNT`" ]; then sysctl -w lnet.debug=0 || true cleanupall - if [ "$ONLY" == "cleanup" ]; then + if [ "$ONLY" == "cleanup" ]; then exit 0 fi fi @@ -1010,7 +1015,7 @@ check_and_cleanup_lustre() { unset I_MOUNTED } -####### +####### # General functions check_network() { @@ -1159,7 +1164,7 @@ debugrestore() { } ################################## -# Test interface +# Test interface ################################## error() { @@ -1260,7 +1265,7 @@ run_test() { fi run_one $1 "$2" - + return $? 
} @@ -1352,8 +1357,8 @@ canonical_path() { } sync_clients() { - [ -d $DIR1 ] && cd $DIR1 && sync; sleep 1; sync - [ -d $DIR2 ] && cd $DIR2 && sync; sleep 1; sync + [ -d $DIR1 ] && cd $DIR1 && sync; sleep 1; sync + [ -d $DIR2 ] && cd $DIR2 && sync; sleep 1; sync cd $SAVE_PWD } @@ -1371,14 +1376,14 @@ check_grant() { for i in `seq $OSTCOUNT`; do $LFS setstripe $DIR1/${tfile}_check_grant_$i -i $(($i -1)) -c 1 dd if=/dev/zero of=$DIR1/${tfile}_check_grant_$i bs=4k \ - count=1 > /dev/null 2>&1 + count=1 > /dev/null 2>&1 done # sync all the data and make sure no pending data on server sync_clients - #get client grant and server grant + #get client grant and server grant client_grant=0 - for d in ${LPROC}/osc/*/cur_grant_bytes; do + for d in ${LPROC}/osc/*/cur_grant_bytes; do client_grant=$((client_grant + `cat $d`)) done server_grant=0 @@ -1391,7 +1396,7 @@ check_grant() { rm $DIR1/${tfile}_check_grant_$i done - #check whether client grant == server grant + #check whether client grant == server grant if [ $client_grant != $server_grant ]; then echo "failed: client:${client_grant} server: ${server_grant}" return 1 @@ -1468,7 +1473,7 @@ nodes_list () { # FIXME. We need a list of clients local myNODES=$HOSTNAME local myNODES_sort - + if [ "$PDSH" -a "$PDSH" != "no_dsh" ]; then myNODES="$myNODES $(osts_nodes) $(mdts_nodes)" fi @@ -1491,8 +1496,8 @@ check_runas_id() { chmod 0755 $DIR chown $myRUNAS_ID:$myRUNAS_ID $DIR/d0_runas_test $myRUNAS touch $DIR/d0_runas_test/f$$ || \ - error "unable to write to $DIR/d0_runas_test as UID $myRUNAS_ID. - Please set RUNAS_ID to some UID which exists on MDS and client or + error "unable to write to $DIR/d0_runas_test as UID $myRUNAS_ID. + Please set RUNAS_ID to some UID which exists on MDS and client or add user $myRUNAS_ID:$myRUNAS_ID on these nodes." rm -rf $DIR/d0_runas_test } -- 1.8.3.1