From 4ac619e1ac93e2ba5812733e39861b73db72f9f9 Mon Sep 17 00:00:00 2001 From: yury Date: Thu, 22 Jan 2009 13:09:15 +0000 Subject: [PATCH] b=17194 - HEAD version of both patches in this one. --- lustre/include/linux/lvfs.h | 4 ++ lustre/include/lustre_export.h | 8 +++- lustre/include/obd.h | 3 -- lustre/include/obd_class.h | 13 +++++- lustre/ldlm/ldlm_lib.c | 12 +++-- lustre/lvfs/lvfs_linux.c | 52 +++++++++++++++++++++ lustre/mdt/mdt_recovery.c | 100 +++++++++++++++++++++++++++++------------ lustre/obdclass/genops.c | 16 +++---- lustre/obdclass/obd_config.c | 4 +- lustre/obdclass/obd_mount.c | 4 ++ lustre/obdfilter/filter.c | 24 +++++----- lustre/tests/replay-single.sh | 6 +-- 12 files changed, 182 insertions(+), 64 deletions(-) diff --git a/lustre/include/linux/lvfs.h b/lustre/include/linux/lvfs.h index 17576c3..e90b155 100644 --- a/lustre/include/linux/lvfs.h +++ b/lustre/include/linux/lvfs.h @@ -104,6 +104,10 @@ int lustre_fread(struct file *file, void *buf, int len, loff_t *off); int lustre_fwrite(struct file *file, const void *buf, int len, loff_t *off); int lustre_fsync(struct file *file); long l_readdir(struct file * file, struct list_head *dentry_list); +int l_notify_change(struct vfsmount *mnt, struct dentry *dchild, + struct iattr *newattrs); +int simple_truncate(struct dentry *dir, struct vfsmount *mnt, + char *name, loff_t length); static inline void l_dput(struct dentry *de) { diff --git a/lustre/include/lustre_export.h b/lustre/include/lustre_export.h index 94033ef..c043411 100644 --- a/lustre/include/lustre_export.h +++ b/lustre/include/lustre_export.h @@ -113,6 +113,12 @@ typedef struct nid_stat { int nid_exp_ref_count; }nid_stat_t; +enum obd_option { + OBD_OPT_FORCE = 0x0001, + OBD_OPT_FAILOVER = 0x0002, + OBD_OPT_ABORT_RECOV = 0x0004, +}; + struct obd_export { struct portals_handle exp_handle; atomic_t exp_refcount; @@ -137,7 +143,7 @@ struct obd_export { spinlock_t exp_lock; /* protects flags int below */ /* ^ protects exp_outstanding_replies too */ __u64 exp_connect_flags; - int exp_flags; + enum obd_option exp_flags; unsigned long exp_failed:1, exp_in_recovery:1, exp_disconnected:1, diff --git a/lustre/include/obd.h b/lustre/include/obd.h index f250c58..941bae3 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -1075,9 +1075,6 @@ struct obd_device { struct lu_ref obd_reference; }; -#define OBD_OPT_FORCE 0x0001 -#define OBD_OPT_FAILOVER 0x0002 - #define OBD_LLOG_FL_SENDNOW 0x0001 enum obd_cleanup_stage { diff --git a/lustre/include/obd_class.h b/lustre/include/obd_class.h index 1bc75e1..e65da9f 100644 --- a/lustre/include/obd_class.h +++ b/lustre/include/obd_class.h @@ -206,9 +206,18 @@ int class_connect(struct lustre_handle *conn, struct obd_device *obd, int class_disconnect(struct obd_export *exp); void class_fail_export(struct obd_export *exp); void class_disconnect_exports(struct obd_device *obddev); -int class_disconnect_stale_exports(struct obd_device *, - int (*test_export)(struct obd_export *)); int class_manual_cleanup(struct obd_device *obd); +int class_disconnect_stale_exports(struct obd_device *, + int (*test_export)(struct obd_export *), + enum obd_option flags); + +static inline enum obd_option exp_flags_from_obd(struct obd_device *obd) +{ + return ((obd->obd_fail ? OBD_OPT_FAILOVER : 0) | + (obd->obd_force ? OBD_OPT_FORCE : 0) | + (obd->obd_abort_recovery ? OBD_OPT_ABORT_RECOV : 0) | + 0); +} void obdo_cpy_md(struct obdo *dst, struct obdo *src, obd_flag valid); void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj); diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index a9b6f08..4bf5368 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -1667,7 +1667,9 @@ static int target_recovery_thread(void *arg) "evict them\n", obd->obd_connected_clients, obd->obd_max_recoverable_clients); obd->obd_abort_recovery = obd->obd_stopping; - class_disconnect_stale_exports(obd, connect_done); + class_disconnect_stale_exports(obd, connect_done, + exp_flags_from_obd(obd) | + OBD_OPT_ABORT_RECOV); } /* next stage: replay requests */ delta = jiffies; @@ -1697,7 +1699,9 @@ static int target_recovery_thread(void *arg) if (obd->obd_abort_recovery) { CDEBUG(D_ERROR, "req replay timed out, aborting ...\n"); obd->obd_abort_recovery = obd->obd_stopping; - class_disconnect_stale_exports(obd, req_replay_done); + class_disconnect_stale_exports(obd, req_replay_done, + exp_flags_from_obd(obd) | + OBD_OPT_ABORT_RECOV); abort_req_replay_queue(obd); } @@ -1722,7 +1726,9 @@ static int target_recovery_thread(void *arg) int stale; CERROR("lock replay timed out, aborting ...\n"); obd->obd_abort_recovery = obd->obd_stopping; - stale = class_disconnect_stale_exports(obd, lock_replay_done); + stale = class_disconnect_stale_exports(obd, lock_replay_done, + exp_flags_from_obd(obd) | + OBD_OPT_ABORT_RECOV); abort_lock_replay_queue(obd); } diff --git a/lustre/lvfs/lvfs_linux.c b/lustre/lvfs/lvfs_linux.c index 5d07875..88b4334 100644 --- a/lustre/lvfs/lvfs_linux.c +++ b/lustre/lvfs/lvfs_linux.c @@ -421,6 +421,58 @@ long l_readdir(struct file *file, struct list_head *dentry_list) } EXPORT_SYMBOL(l_readdir); +int l_notify_change(struct vfsmount *mnt, struct dentry *dchild, + struct iattr *newattrs) +{ + int rc; + + LOCK_INODE_MUTEX(dchild->d_inode); +#ifdef HAVE_SECURITY_PLUG + rc = notify_change(dchild, mnt, newattrs); +#else + rc = notify_change(dchild, newattrs); +#endif + UNLOCK_INODE_MUTEX(dchild->d_inode); + return rc; +} +EXPORT_SYMBOL(l_notify_change); + +/* utility to truncate a file */ +int simple_truncate(struct dentry *dir, struct vfsmount *mnt, + char *name, loff_t length) +{ + struct dentry *dchild; + struct iattr newattrs; + int err = 0; + ENTRY; + + CDEBUG(D_INODE, "truncating file %.*s to %lld\n", (int)strlen(name), + name, (long long)length); + dchild = ll_lookup_one_len(name, dir, strlen(name)); + if (IS_ERR(dchild)) + GOTO(out, err = PTR_ERR(dchild)); + + if (dchild->d_inode) { + int old_mode = dchild->d_inode->i_mode; + if (S_ISDIR(old_mode)) { + CERROR("found %s (%lu/%u) is mode %o\n", name, + dchild->d_inode->i_ino, + dchild->d_inode->i_generation, old_mode); + GOTO(out_dput, err = -EISDIR); + } + + newattrs.ia_size = length; + newattrs.ia_valid = ATTR_SIZE; + err = l_notify_change(mnt, dchild, &newattrs); + } + EXIT; +out_dput: + dput(dchild); +out: + return err; +} +EXPORT_SYMBOL(simple_truncate); + #ifdef LUSTRE_KERNEL_VERSION #ifndef HAVE_CLEAR_RDONLY_ON_PUT #error rdonly patchset must be updated [cfs bz11248] diff --git a/lustre/mdt/mdt_recovery.c b/lustre/mdt/mdt_recovery.c index b5e263e..c32ae5a 100644 --- a/lustre/mdt/mdt_recovery.c +++ b/lustre/mdt/mdt_recovery.c @@ -49,7 +49,8 @@ #include "mdt_internal.h" static int mdt_server_data_update(const struct lu_env *env, - struct mdt_device *mdt); + struct mdt_device *mdt, + int need_sync); struct lu_buf *mdt_buf(const struct lu_env *env, void *area, ssize_t len) { @@ -243,8 +244,16 @@ static inline int mdt_last_rcvd_header_read(const struct lu_env *env, return rc; } +static void mdt_client_cb(const struct mdt_device *mdt, __u64 transno, + void *data, int err) +{ + struct obd_device *obd = mdt2obd_dev(mdt); + target_client_add_cb(obd, transno, data, err); +} + static inline int mdt_last_rcvd_header_write(const struct lu_env *env, - struct mdt_device *mdt) + struct mdt_device *mdt, + int need_sync) { struct mdt_thread_info *mti; struct thandle *th; @@ -253,6 +262,11 @@ static inline int mdt_last_rcvd_header_write(const struct lu_env *env, mti = lu_context_key_get(&env->le_ctx, &mdt_thread_key); + if (mti->mti_exp) { + spin_lock(&mti->mti_exp->exp_lock); + mti->mti_exp->exp_need_sync = need_sync; + spin_unlock(&mti->mti_exp->exp_lock); + } mdt_trans_credit_init(env, mdt, MDT_TXN_LAST_RCVD_WRITE_OP); th = mdt_trans_start(env, mdt); if (IS_ERR(th)) @@ -261,6 +275,9 @@ static inline int mdt_last_rcvd_header_write(const struct lu_env *env, mti->mti_off = 0; lsd_cpu_to_le(&mdt->mdt_lsd, &mti->mti_lsd); + if (need_sync && mti->mti_exp) + mdt_trans_add_cb(th, mdt_client_cb, mti->mti_exp); + rc = mdt_record_write(env, mdt->mdt_last_rcvd, mdt_buf_const(env, &mti->mti_lsd, sizeof(mti->mti_lsd)), @@ -561,7 +578,8 @@ static int mdt_server_data_init(const struct lu_env *env, lsd->lsd_mount_count = mdt->mdt_mount_count; /* save it, so mount count and last_transno is current */ - rc = mdt_server_data_update(env, mdt); + rc = mdt_server_data_update(env, mdt, (mti->mti_exp && + mti->mti_exp->exp_need_sync)); if (rc) GOTO(err_client, rc); @@ -574,7 +592,8 @@ out: } static int mdt_server_data_update(const struct lu_env *env, - struct mdt_device *mdt) + struct mdt_device *mdt, + int need_sync) { int rc = 0; ENTRY; @@ -591,18 +610,10 @@ static int mdt_server_data_update(const struct lu_env *env, * mdt->mdt_last_rcvd may be NULL that time. */ if (mdt->mdt_last_rcvd != NULL) - rc = mdt_last_rcvd_header_write(env, mdt); + rc = mdt_last_rcvd_header_write(env, mdt, need_sync); RETURN(rc); } -void mdt_cb_new_client(const struct mdt_device *mdt, __u64 transno, - void *data, int err) -{ - struct obd_device *obd = mdt2obd_dev(mdt); - - target_client_add_cb(obd, transno, data, err); -} - int mdt_client_new(const struct lu_env *env, struct mdt_device *mdt) { unsigned long *bitmap = mdt->mdt_client_bitmap; @@ -651,16 +662,22 @@ int mdt_client_new(const struct lu_env *env, struct mdt_device *mdt) init_mutex(&med->med_lcd_lock); LASSERTF(med->med_lr_off > 0, "med_lr_off = %llu\n", med->med_lr_off); - /* write new client data */ + + /* Write new client data. */ off = med->med_lr_off; mdt_trans_credit_init(env, mdt, MDT_TXN_LAST_RCVD_WRITE_OP); + th = mdt_trans_start(env, mdt); if (IS_ERR(th)) RETURN(PTR_ERR(th)); - /* until this operations will be committed the sync is needed for this - * export */ - mdt_trans_add_cb(th, mdt_cb_new_client, mti->mti_exp); + /* + * Until this operations will be committed the sync is needed + * for this export. This should be done _after_ starting the + * transaction so that many connecting clients will not bring + * server down with lots of sync writes. + */ + mdt_trans_add_cb(th, mdt_client_cb, mti->mti_exp); spin_lock(&mti->mti_exp->exp_lock); mti->mti_exp->exp_need_sync = 1; spin_unlock(&mti->mti_exp->exp_lock); @@ -730,21 +747,24 @@ int mdt_client_del(const struct lu_env *env, struct mdt_device *mdt) struct mdt_export_data *med; struct lsd_client_data *lcd; struct obd_device *obd = mdt2obd_dev(mdt); - struct thandle *th; - loff_t off; - int rc = 0; + struct obd_export *exp; + struct thandle *th; + int need_sync; + loff_t off; + int rc = 0; ENTRY; mti = lu_context_key_get(&env->le_ctx, &mdt_thread_key); LASSERT(mti != NULL); - med = &mti->mti_exp->exp_mdt_data; + exp = mti->mti_exp; + med = &exp->exp_mdt_data; lcd = med->med_lcd; if (!lcd) RETURN(0); /* XXX: If lcd_uuid were a real obd_uuid, I could use obd_uuid_equals */ - if (!strcmp(med->med_lcd->lcd_uuid, obd->obd_uuid.uuid)) + if (!strcmp(lcd->lcd_uuid, obd->obd_uuid.uuid)) GOTO(free, 0); CDEBUG(D_INFO, "freeing client at idx %u, offset %lld\n", @@ -772,16 +792,34 @@ int mdt_client_del(const struct lu_env *env, struct mdt_device *mdt) LBUG(); } + /* Don't force sync on disconnect if aborting recovery, + * or it does num_clients * num_osts. b=17194 */ + need_sync = (!exp->exp_libclient || exp->exp_need_sync) && + !(exp->exp_flags & OBD_OPT_ABORT_RECOV); + /* * This may be called from difficult reply handler path and * mdt->mdt_last_rcvd may be NULL that time. */ if (mdt->mdt_last_rcvd != NULL) { mdt_trans_credit_init(env, mdt, MDT_TXN_LAST_RCVD_WRITE_OP); + + spin_lock(&exp->exp_lock); + exp->exp_need_sync = need_sync; + spin_unlock(&exp->exp_lock); + th = mdt_trans_start(env, mdt); if (IS_ERR(th)) GOTO(free, rc = PTR_ERR(th)); + if (need_sync) { + /* + * Until this operations will be committed the sync + * is needed for this export. + */ + mdt_trans_add_cb(th, mdt_client_cb, exp); + } + mutex_down(&med->med_lcd_lock); memset(lcd, 0, sizeof *lcd); @@ -791,18 +829,20 @@ int mdt_client_del(const struct lu_env *env, struct mdt_device *mdt) } CDEBUG(rc == 0 ? D_INFO : D_ERROR, "Zeroing out client idx %u in " - "%s rc %d\n", med->med_lr_idx, LAST_RCVD, rc); + "%s %ssync rc %d\n", med->med_lr_idx, LAST_RCVD, + need_sync ? "" : "a", rc); spin_lock(&mdt->mdt_client_bitmap_lock); clear_bit(med->med_lr_idx, mdt->mdt_client_bitmap); spin_unlock(&mdt->mdt_client_bitmap_lock); - /* - * Make sure the server's last_transno is up to date. Do this after the - * client is freed so we know all the client's transactions have been - * committed. + /* + * Make sure the server's last_transno is up to date. Do this + * after the client is freed so we know all the client's + * transactions have been committed. */ - mdt_server_data_update(env, mdt); + mdt_server_data_update(env, mdt, need_sync); + EXIT; free: OBD_FREE_PTR(lcd); @@ -866,7 +906,9 @@ static int mdt_last_rcvd_update(struct mdt_thread_info *mti, */ if (mti->mti_transno == 0 && *transno_p == mdt->mdt_last_transno) - mdt_server_data_update(mti->mti_env, mdt); + mdt_server_data_update(mti->mti_env, mdt, + (mti->mti_exp && + mti->mti_exp->exp_need_sync)); *transno_p = mti->mti_transno; diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index 63f443a..7341aaa 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -1068,7 +1068,8 @@ int class_disconnect(struct obd_export *export) RETURN(0); } -static void class_disconnect_export_list(struct list_head *list, int flags) +static void class_disconnect_export_list(struct list_head *list, + enum obd_option flags) { int rc; struct lustre_handle fake_conn; @@ -1118,12 +1119,6 @@ static void class_disconnect_export_list(struct list_head *list, int flags) EXIT; } -static inline int get_exp_flags_from_obd(struct obd_device *obd) -{ - return ((obd->obd_fail ? OBD_OPT_FAILOVER : 0) | - (obd->obd_force ? OBD_OPT_FORCE : 0)); -} - void class_disconnect_exports(struct obd_device *obd) { struct list_head work_list; @@ -1139,7 +1134,7 @@ void class_disconnect_exports(struct obd_device *obd) CDEBUG(D_HA, "OBD device %d (%p) has exports, " "disconnecting them\n", obd->obd_minor, obd); class_disconnect_export_list(&work_list, - get_exp_flags_from_obd(obd)); + exp_flags_from_obd(obd)); } else CDEBUG(D_HA, "OBD device %d (%p) has no exports\n", obd->obd_minor, obd); @@ -1150,7 +1145,8 @@ EXPORT_SYMBOL(class_disconnect_exports); /* Remove exports that have not completed recovery. */ int class_disconnect_stale_exports(struct obd_device *obd, - int (*test_export)(struct obd_export *)) + int (*test_export)(struct obd_export *), + enum obd_option flags) { struct list_head work_list; struct list_head *pos, *n; @@ -1182,7 +1178,7 @@ int class_disconnect_stale_exports(struct obd_device *obd, CDEBUG(D_ERROR, "%s: disconnecting %d stale clients\n", obd->obd_name, cnt); - class_disconnect_export_list(&work_list, get_exp_flags_from_obd(obd)); + class_disconnect_export_list(&work_list, flags); RETURN(cnt); } EXPORT_SYMBOL(class_disconnect_stale_exports); diff --git a/lustre/obdclass/obd_config.c b/lustre/obdclass/obd_config.c index eb8d415..7783a3a 100644 --- a/lustre/obdclass/obd_config.c +++ b/lustre/obdclass/obd_config.c @@ -598,9 +598,7 @@ void class_decref(struct obd_device *obd, const char *scope, const void *source) be no more in-progress ops by this point.*/ spin_lock(&obd->obd_self_export->exp_lock); - obd->obd_self_export->exp_flags |= - (obd->obd_fail ? OBD_OPT_FAILOVER : 0) | - (obd->obd_force ? OBD_OPT_FORCE : 0); + obd->obd_self_export->exp_flags |= exp_flags_from_obd(obd); spin_unlock(&obd->obd_self_export->exp_lock); /* note that we'll recurse into class_decref again */ diff --git a/lustre/obdclass/obd_mount.c b/lustre/obdclass/obd_mount.c index f568574..b632fa3 100644 --- a/lustre/obdclass/obd_mount.c +++ b/lustre/obdclass/obd_mount.c @@ -1358,6 +1358,10 @@ static struct vfsmount *server_kernel_mount(struct super_block *sb) GOTO(out_free, rc); } + if (lmd->lmd_flags & LMD_FLG_ABORT_RECOV) + simple_truncate(mnt->mnt_sb->s_root, mnt, LAST_RCVD, + LR_CLIENT_START); + OBD_PAGE_FREE(__page); lsi->lsi_ldd = ldd; /* freed at lsi cleanup */ CDEBUG(D_SUPER, "%s: mnt = %p\n", lmd->lmd_dev, mnt); diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index c258b8f..0b7ad33 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -357,12 +357,13 @@ static int filter_client_add(struct obd_device *obd, struct obd_export *exp, RETURN(0); } +struct lsd_client_data zero_lcd; /* globals are implicitly zeroed */ + static int filter_client_free(struct obd_export *exp) { struct filter_export_data *fed = &exp->exp_filter_data; struct filter_obd *filter = &exp->exp_obd->u.filter; struct obd_device *obd = exp->exp_obd; - struct lsd_client_data zero_lcd; struct lvfs_run_ctxt saved; int rc; loff_t off; @@ -399,23 +400,26 @@ static int filter_client_free(struct obd_export *exp) } if (!(exp->exp_flags & OBD_OPT_FAILOVER)) { - memset(&zero_lcd, 0, sizeof zero_lcd); + /* Don't force sync on disconnect if aborting recovery, + * or it does num_clients * num_osts. b=17194 */ + int need_sync = (!exp->exp_libclient || exp->exp_need_sync) && + !(exp->exp_flags&OBD_OPT_ABORT_RECOV); push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); rc = fsfilt_write_record(obd, filter->fo_rcvd_filp, &zero_lcd, - sizeof(zero_lcd), &off, - (!exp->exp_libclient || - exp->exp_need_sync)); + sizeof(zero_lcd), &off, 0); + + /* Make sure the server's last_transno is up to date. Do this + * after the client is freed so we know all the client's + * transactions have been committed. */ if (rc == 0) - /* update server's transno */ filter_update_server_data(obd, filter->fo_rcvd_filp, - filter->fo_fsd, - !exp->exp_libclient); + filter->fo_fsd, need_sync); pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); CDEBUG(rc == 0 ? D_INFO : D_ERROR, - "zeroing out client %s at idx %u (%llu) in %s rc %d\n", + "zero out client %s at idx %u/%llu in %s %ssync rc %d\n", fed->fed_lcd->lcd_uuid, fed->fed_lr_idx, fed->fed_lr_off, - LAST_RCVD, rc); + LAST_RCVD, need_sync ? "" : "a", rc); } if (!test_and_clear_bit(fed->fed_lr_idx, filter->fo_last_rcvd_slots)) { diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 925a089..5a2958a 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -19,8 +19,8 @@ GRANT_CHECK_LIST=${GRANT_CHECK_LIST:-""} remote_mds_nodsh && log "SKIP: remote MDS with nodsh" && exit 0 # Skip these tests -# bug number: 17466 -ALWAYS_EXCEPT="61d $REPLAY_SINGLE_EXCEPT" +# bug number: 17466 15962 +ALWAYS_EXCEPT="61d 33b $REPLAY_SINGLE_EXCEPT" if [ "$FAILURE_MODE" = "HARD" ] && mixed_ost_devs; then CONFIG_EXCEPTIONS="0b 42 47 61a 61c" @@ -730,7 +730,7 @@ test_33a() { # was test_33 } run_test 33a "abort recovery before client does replay" -# Stale FID sequence +# Stale FID sequence bug 15962 test_33b() { # was test_33a replay_barrier $SINGLEMDS createmany -o $DIR/$tfile-%d 10 -- 1.8.3.1