From 940dcd5282df4c8503053a5b3e932e8a7dc00486 Mon Sep 17 00:00:00 2001
From: deen
Date: Wed, 19 Sep 2007 21:35:53 +0000
Subject: [PATCH] Allow llrd to evict clients directly on OSTs. b=12418 i=green i=adilger

---
 lustre/ChangeLog                   |  6 +++
 lustre/include/lprocfs_status.h    |  2 +
 lustre/include/obd.h               |  5 ++-
 lustre/mds/handler.c               |  1 +
 lustre/mds/lproc_mds.c             | 43 ++++++++++++++++----
 lustre/obdclass/lprocfs_status.c   | 35 +++++++++++++++-
 lustre/obdclass/obd_config.c       |  1 +
 lustre/obdfilter/lproc_obdfilter.c |  3 +-
 lustre/ost/ost_handler.c           | 82 +++++++++++++++++++++++++++++---------
 lustre/ptlrpc/lproc_ptlrpc.c       |  7 +++-
 10 files changed, 156 insertions(+), 29 deletions(-)

diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index 1eb2f2e..3b33686 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -268,6 +268,12 @@ Details    : Don't take the BKL in fsfilt_ext3_setattr() for 2.6 kernels.
              It causes scheduling issues when removing large files (17TB in the
              present case).
 
+Severity   : normal
+Frequency  : only with liblustre clients on XT3
+Bugzilla   : 12418
+Description: evictions taking too long
+Details    : allow llrd to evict clients directly on OSTs
+
 --------------------------------------------------------------------------------
 
 2007-08-10 Cluster File Systems, Inc.
diff --git a/lustre/include/lprocfs_status.h b/lustre/include/lprocfs_status.h
index f9aaac7..a3f32ba 100644
--- a/lustre/include/lprocfs_status.h
+++ b/lustre/include/lprocfs_status.h
@@ -44,6 +44,7 @@ struct lprocfs_vars {
         cfs_read_proc_t *read_fptr;
         cfs_write_proc_t *write_fptr;
         void *data;
+        struct file_operations *fops;
 };
 
 struct lprocfs_static_vars {
@@ -241,6 +242,7 @@ extern cfs_proc_dir_entry_t *lprocfs_srch(cfs_proc_dir_entry_t *root,
 
 extern int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list);
 extern int lprocfs_obd_cleanup(struct obd_device *obd);
+extern struct file_operations lprocfs_evict_client_fops;
 
 extern int lprocfs_seq_create(cfs_proc_dir_entry_t *parent, char *name,
                               mode_t mode, struct file_operations *seq_fops,
diff --git a/lustre/include/obd.h b/lustre/include/obd.h
index 8c1489f..6e09db1 100644
--- a/lustre/include/obd.h
+++ b/lustre/include/obd.h
@@ -535,7 +535,8 @@ struct mds_obd {
         struct semaphore mds_health_sem;
         unsigned long mds_lov_objids_valid:1,
                       mds_fl_user_xattr:1,
-                      mds_fl_acl:1;
+                      mds_fl_acl:1,
+                      mds_evict_ost_nids:1;
 
         struct upcall_cache *mds_identity_cache;
 
@@ -911,6 +912,8 @@ struct obd_device {
         cfs_proc_dir_entry_t *obd_svc_procroot;
         struct lprocfs_stats *obd_svc_stats;
         struct semaphore obd_proc_exp_sem;
+        atomic_t obd_evict_inprogress;
+        cfs_waitq_t obd_evict_inprogress_waitq;
 };
 
 #define OBD_OPT_FORCE 0x0001
diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c
index c1eccb3..9741b80 100644
--- a/lustre/mds/handler.c
+++ b/lustre/mds/handler.c
@@ -1952,6 +1952,7 @@ static int mds_setup(struct obd_device *obd, struct lustre_cfg* lcfg)
         mds->mds_max_mdsize = sizeof(struct lov_mds_md);
         mds->mds_max_cookiesize = sizeof(struct llog_cookie);
         mds->mds_atime_diff = MAX_ATIME_DIFF;
+        mds->mds_evict_ost_nids = 1;
 
         sprintf(ns_name, "mds-%s", obd->obd_uuid.uuid);
         obd->obd_namespace = ldlm_namespace_new(ns_name, LDLM_NAMESPACE_SERVER);
diff --git a/lustre/mds/lproc_mds.c b/lustre/mds/lproc_mds.c
index f273d0e..05ae719 100644
--- a/lustre/mds/lproc_mds.c
+++ b/lustre/mds/lproc_mds.c
@@ -46,6 +46,31 @@ static int lprocfs_mds_rd_mntdev(char *page, char **start, off_t off, int count,
         return snprintf(page, count, "%s\n",obd->u.mds.mds_vfsmnt->mnt_devname);
 }
 
+static int lprocfs_mds_rd_evictostnids(char *page, char **start, off_t off,
+                                       int count, int *eof, void *data)
+{
+        struct obd_device* obd = (struct obd_device *)data;
+
+        LASSERT(obd != NULL);
+
+        return snprintf(page, count, "%d\n", obd->u.mds.mds_evict_ost_nids);
+}
+
+static int lprocfs_mds_wr_evictostnids(struct file *file, const char *buffer,
+                                       unsigned long count, void *data)
+{
+        struct obd_device *obd = data;
+        int val, rc;
+
+        rc = lprocfs_write_helper(buffer, count, &val);
+        if (rc)
+                return rc;
+
+        obd->u.mds.mds_evict_ost_nids = !!val;
+
+        return count;
+}
+
 static int lprocfs_mds_wr_evict_client(struct file *file, const char *buffer,
                                        unsigned long count, void *data)
 {
@@ -64,14 +89,16 @@ static int lprocfs_mds_wr_evict_client(struct file *file, const char *buffer,
         if (!set)
                 return -ENOMEM;
 
-        rc = obd_set_info_async(mds->mds_osc_exp, strlen("evict_by_nid"),
-                                "evict_by_nid", strlen(tmpbuf + 4) + 1,
-                                tmpbuf + 4, set);
-        if (rc)
-                CERROR("Failed to evict nid %s from OSTs: rc %d\n", tmpbuf + 4,
-                       rc);
+        if (obd->u.mds.mds_evict_ost_nids) {
+                rc = obd_set_info_async(mds->mds_osc_exp,strlen("evict_by_nid"),
+                                        "evict_by_nid", strlen(tmpbuf + 4) + 1,
+                                        tmpbuf + 4, set);
+                if (rc)
+                        CERROR("Failed to evict nid %s from OSTs: rc %d\n",
+                               tmpbuf + 4, rc);
 
-        ptlrpc_check_set(set);
+                ptlrpc_check_set(set);
+        }
 
         /* See the comments in function lprocfs_wr_evict_client()
          * in ptlrpc/lproc_ptlrpc.c for details. - jay */
@@ -293,6 +320,8 @@ struct lprocfs_vars lprocfs_mds_obd_vars[] = {
         { "mntdev", lprocfs_mds_rd_mntdev, 0, 0 },
         { "recovery_status", lprocfs_obd_rd_recovery_status, 0, 0 },
         { "evict_client", 0, lprocfs_mds_wr_evict_client, 0 },
+        { "evict_ost_nids", lprocfs_mds_rd_evictostnids,
+                            lprocfs_mds_wr_evictostnids, 0 },
         { "num_exports", lprocfs_rd_num_exports, 0, 0 },
 #ifdef HAVE_QUOTA_SUPPORT
         { "quota_bunit_sz", lprocfs_rd_bunit, lprocfs_wr_bunit, 0 },
diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c
index 4a7fa3e..23427d2 100644
--- a/lustre/obdclass/lprocfs_status.c
+++ b/lustre/obdclass/lprocfs_status.c
@@ -181,6 +181,36 @@ static struct file_operations lprocfs_generic_fops = {
         .write = lprocfs_fops_write,
 };
 
+int lprocfs_evict_client_open(struct inode *inode, struct file *f)
+{
+        struct proc_dir_entry *dp = PDE(f->f_dentry->d_inode);
+        struct obd_device *obd = dp->data;
+
+        atomic_inc(&obd->obd_evict_inprogress);
+
+        return 0;
+}
+
+int lprocfs_evict_client_release(struct inode *inode, struct file *f)
+{
+        struct proc_dir_entry *dp = PDE(f->f_dentry->d_inode);
+        struct obd_device *obd = dp->data;
+
+        atomic_dec(&obd->obd_evict_inprogress);
+        wake_up(&obd->obd_evict_inprogress_waitq);
+
+        return 0;
+}
+
+struct file_operations lprocfs_evict_client_fops = {
+        .owner = THIS_MODULE,
+        .read = lprocfs_fops_read,
+        .write = lprocfs_fops_write,
+        .open = lprocfs_evict_client_open,
+        .release = lprocfs_evict_client_release,
+};
+EXPORT_SYMBOL(lprocfs_evict_client_fops);
+
 int lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list,
                      void *data)
 {
@@ -237,7 +267,10 @@ int lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list,
                         return -ENOMEM;
                 }
 
-                proc->proc_fops = &lprocfs_generic_fops;
+                if (list->fops)
+                        proc->proc_fops = list->fops;
+                else
+                        proc->proc_fops = &lprocfs_generic_fops;
                 proc->read_proc = list->read_fptr;
                 proc->write_proc = list->write_fptr;
                 proc->data = (list->data ? list->data : data);
diff --git a/lustre/obdclass/obd_config.c b/lustre/obdclass/obd_config.c
index f5284b5..0ae76a3 100644
--- a/lustre/obdclass/obd_config.c
+++ b/lustre/obdclass/obd_config.c
@@ -179,6 +179,7 @@ int class_attach(struct lustre_cfg *lcfg)
         cfs_init_timer(&obd->obd_recovery_timer);
         spin_lock_init(&obd->obd_processing_task_lock);
         cfs_waitq_init(&obd->obd_next_transno_waitq);
+        cfs_waitq_init(&obd->obd_evict_inprogress_waitq);
         CFS_INIT_LIST_HEAD(&obd->obd_req_replay_queue);
         CFS_INIT_LIST_HEAD(&obd->obd_lock_replay_queue);
         CFS_INIT_LIST_HEAD(&obd->obd_final_req_queue);
diff --git a/lustre/obdfilter/lproc_obdfilter.c b/lustre/obdfilter/lproc_obdfilter.c
index a206d81..123c1ed 100644
--- a/lustre/obdfilter/lproc_obdfilter.c
+++ b/lustre/obdfilter/lproc_obdfilter.c
@@ -282,7 +282,8 @@ static struct lprocfs_vars lprocfs_obd_vars[] = {
         { "tot_pending", lprocfs_filter_rd_tot_pending, 0, 0 },
         { "tot_granted", lprocfs_filter_rd_tot_granted, 0, 0 },
         { "recovery_status", lprocfs_obd_rd_recovery_status, 0, 0 },
-        { "evict_client", 0, lprocfs_wr_evict_client, 0 },
+        { "evict_client", 0, lprocfs_wr_evict_client, 0,
+                          &lprocfs_evict_client_fops},
         { "num_exports", lprocfs_rd_num_exports, 0, 0 },
         { "readcache_max_filesize",
                           lprocfs_filter_rd_readcache,
diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c
index d2a8ade..c18d919 100644
--- a/lustre/ost/ost_handler.c
+++ b/lustre/ost/ost_handler.c
@@ -660,6 +660,7 @@ static void ost_prolong_locks(struct obd_export *exp, struct obd_ioobj *obj,
 static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
 {
         struct ptlrpc_bulk_desc *desc;
+        struct obd_export *exp = req->rq_export;
         struct niobuf_remote *remote_nb;
         struct niobuf_remote *pp_rnb = NULL;
         struct niobuf_local *local_nb;
@@ -681,6 +682,17 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
         OBD_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK | OBD_FAIL_ONCE,
                          (obd_timeout + 1) / 4);
 
+        /* Check if there is eviction in progress, and if so, wait for it to
+         * finish */
+        if (unlikely(atomic_read(&exp->exp_obd->obd_evict_inprogress))) {
+                lwi = LWI_INTR(NULL, NULL); // We do not care how long it takes
+                rc = l_wait_event(exp->exp_obd->obd_evict_inprogress_waitq,
+                        !atomic_read(&exp->exp_obd->obd_evict_inprogress),
+                        &lwi);
+        }
+        if (exp->exp_failed)
+                GOTO(out, rc = -ENOTCONN);
+
         body = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*body),
                                   lustre_swab_ost_body);
         if (body == NULL) {
@@ -743,7 +755,7 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
         if (desc == NULL)
                 GOTO(out, rc = -ENOMEM);
 
-        rc = ost_brw_lock_get(LCK_PR, req->rq_export, ioo, pp_rnb, &lockh);
+        rc = ost_brw_lock_get(LCK_PR, exp, ioo, pp_rnb, &lockh);
         if (rc != 0)
                 GOTO(out_bulk, rc);
 
@@ -761,12 +773,12 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
                 GOTO(out_lock, rc = -ETIMEDOUT);
         }
 
-        rc = obd_preprw(OBD_BRW_READ, req->rq_export, &body->oa, 1,
+        rc = obd_preprw(OBD_BRW_READ, exp, &body->oa, 1,
                         ioo, npages, pp_rnb, local_nb, oti, capa);
         if (rc != 0)
                 GOTO(out_lock, rc);
 
-        ost_prolong_locks(req->rq_export, ioo, pp_rnb, LCK_PW | LCK_PR);
+        ost_prolong_locks(exp, ioo, pp_rnb, LCK_PW | LCK_PR);
 
         nob = 0;
         for (i = 0; i < npages; i++) {
@@ -807,7 +819,18 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
         /* Check if client was evicted while we were doing i/o before touching
            network */
         if (rc == 0) {
-                if (desc->bd_export->exp_failed)
+                /* Check if there is eviction in progress, and if so, wait for
+                 * it to finish */
+                if (unlikely(atomic_read(&exp->exp_obd->
+                                                obd_evict_inprogress))) {
+                        lwi = LWI_INTR(NULL, NULL);
+                        rc = l_wait_event(exp->exp_obd->
+                                                obd_evict_inprogress_waitq,
+                                          !atomic_read(&exp->exp_obd->
+                                                        obd_evict_inprogress),
+                                          &lwi);
+                }
+                if (exp->exp_failed)
                         rc = -ENOTCONN;
                 else {
                         sptlrpc_svc_wrap_bulk(req, desc);
@@ -820,12 +843,12 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
                                                    ost_bulk_timeout, desc);
                         rc = l_wait_event(desc->bd_waitq,
                                           !ptlrpc_bulk_active(desc) ||
-                                          desc->bd_export->exp_failed, &lwi);
+                                          exp->exp_failed, &lwi);
                         LASSERT(rc == 0 || rc == -ETIMEDOUT);
                         if (rc == -ETIMEDOUT) {
                                 DEBUG_REQ(D_ERROR, req, "timeout on bulk PUT");
                                 ptlrpc_abort_bulk(desc);
-                        } else if (desc->bd_export->exp_failed) {
+                        } else if (exp->exp_failed) {
                                 DEBUG_REQ(D_ERROR, req, "Eviction on bulk PUT");
                                 rc = -ENOTCONN;
                                 ptlrpc_abort_bulk(desc);
@@ -846,7 +869,7 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
         }
 
         /* Must commit after prep above in all cases */
-        rc = obd_commitrw(OBD_BRW_READ, req->rq_export, &body->oa, 1,
+        rc = obd_commitrw(OBD_BRW_READ, exp, &body->oa, 1,
                           ioo, npages, local_nb, oti, rc);
 
         ost_nio_pages_put(req, local_nb, npages);
@@ -880,9 +903,9 @@ out:
                 }
                 CWARN("%s: ignoring bulk IO comm error with %s@%s id %s - "
                       "client will retry\n",
-                      req->rq_export->exp_obd->obd_name,
-                      req->rq_export->exp_client_uuid.uuid,
-                      req->rq_export->exp_connection->c_remote_uuid.uuid,
+                      exp->exp_obd->obd_name,
+                      exp->exp_client_uuid.uuid,
+                      exp->exp_connection->c_remote_uuid.uuid,
                       libcfs_id2str(req->rq_peer));
         }
 
@@ -892,6 +915,7 @@ out:
 static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
 {
         struct ptlrpc_bulk_desc *desc;
+        struct obd_export *exp = req->rq_export;
         struct niobuf_remote *remote_nb;
         struct niobuf_remote *pp_rnb;
         struct niobuf_local *local_nb;
@@ -917,6 +941,17 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
         OBD_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK | OBD_FAIL_ONCE,
                          (obd_timeout + 1) / 4);
 
+        /* Check if there is eviction in progress, and if so, wait for it to
+         * finish */
+        if (unlikely(atomic_read(&exp->exp_obd->obd_evict_inprogress))) {
+                lwi = LWI_INTR(NULL, NULL); // We do not care how long it takes
+                rc = l_wait_event(exp->exp_obd->obd_evict_inprogress_waitq,
+                        !atomic_read(&exp->exp_obd->obd_evict_inprogress),
+                        &lwi);
+        }
+        if (exp->exp_failed)
+                GOTO(out, rc = -ENOTCONN);
+
         swab = lustre_msg_swabbed(req->rq_reqmsg);
         body = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*body),
                                   lustre_swab_ost_body);
@@ -1000,7 +1035,7 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
         if (desc == NULL)
                 GOTO(out, rc = -ENOMEM);
 
-        rc = ost_brw_lock_get(LCK_PW, req->rq_export, ioo, pp_rnb, &lockh);
+        rc = ost_brw_lock_get(LCK_PW, exp, ioo, pp_rnb, &lockh);
         if (rc != 0)
                 GOTO(out_bulk, rc);
 
@@ -1018,7 +1053,7 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
                 GOTO(out_lock, rc = -ETIMEDOUT);
         }
 
-        ost_prolong_locks(req->rq_export, ioo, pp_rnb, LCK_PW);
+        ost_prolong_locks(exp, ioo, pp_rnb, LCK_PW);
 
         /* obd_preprw clobbers oa->valid, so save what we need */
         client_cksum = body->oa.o_valid & OBD_MD_FLCKSUM ? body->oa.o_cksum : 0;
@@ -1031,7 +1066,7 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
                 body->oa.o_valid &= ~OBD_MD_FLGRANT;
         }
 
-        rc = obd_preprw(OBD_BRW_WRITE, req->rq_export, &body->oa, objcount,
+        rc = obd_preprw(OBD_BRW_WRITE, exp, &body->oa, objcount,
                         ioo, npages, pp_rnb, local_nb, oti, capa);
         if (rc != 0)
                 GOTO(out_lock, rc);
@@ -1099,8 +1134,19 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
 
         sptlrpc_svc_unwrap_bulk(req, desc);
 
+        /* Check if there is eviction in progress, and if so, wait for
+         * it to finish */
+        if (unlikely(atomic_read(&exp->exp_obd->obd_evict_inprogress))) {
+                lwi = LWI_INTR(NULL, NULL);
+                rc = l_wait_event(exp->exp_obd->obd_evict_inprogress_waitq,
+                        !atomic_read(&exp->exp_obd->obd_evict_inprogress),
+                        &lwi);
+        }
+        if (rc == 0 && exp->exp_failed)
+                rc = -ENOTCONN;
+
         /* Must commit after prep above in all cases */
-        rc = obd_commitrw(OBD_BRW_WRITE, req->rq_export, &repbody->oa,
+        rc = obd_commitrw(OBD_BRW_WRITE, exp, &repbody->oa,
                           objcount, ioo, npages, local_nb, oti, rc);
 
         if (unlikely(client_cksum != server_cksum && rc == 0)) {
@@ -1126,7 +1172,7 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
                 LCONSOLE_ERROR_MSG(0x168, "%s: BAD WRITE CHECKSUM: %s from "
                                    "%s%s%s inum "LPU64"/"LPU64" object "
                                    LPU64"/"LPU64" extent ["LPU64"-"LPU64"]\n",
-                                   req->rq_export->exp_obd->obd_name, msg,
+                                   exp->exp_obd->obd_name, msg,
                                    libcfs_id2str(req->rq_peer),
                                    via, router,
                                    body->oa.o_valid & OBD_MD_FLFID ?
@@ -1186,9 +1232,9 @@ out:
                 }
                 CWARN("%s: ignoring bulk IO comm error with %s@%s id %s - "
                       "client will retry\n",
-                      req->rq_export->exp_obd->obd_name,
-                      req->rq_export->exp_client_uuid.uuid,
-                      req->rq_export->exp_connection->c_remote_uuid.uuid,
+                      exp->exp_obd->obd_name,
+                      exp->exp_client_uuid.uuid,
+                      exp->exp_connection->c_remote_uuid.uuid,
                       libcfs_id2str(req->rq_peer));
         }
         RETURN(rc);
diff --git a/lustre/ptlrpc/lproc_ptlrpc.c b/lustre/ptlrpc/lproc_ptlrpc.c
index 038e810..0730b0f 100644
--- a/lustre/ptlrpc/lproc_ptlrpc.c
+++ b/lustre/ptlrpc/lproc_ptlrpc.c
@@ -501,7 +501,12 @@ int lprocfs_wr_evict_client(struct file *file, const char *buffer,
         LPROCFS_EXIT();
 
         sscanf(buffer, "%40s", tmpbuf);
-        obd_export_evict_by_uuid(obd, tmpbuf);
+        if (strncmp(tmpbuf, "nid:", 4) == 0)
+                obd_export_evict_by_nid(obd, tmpbuf + 4);
+        else if (strncmp(tmpbuf, "uuid:", 5) == 0)
+                obd_export_evict_by_uuid(obd, tmpbuf + 5);
+        else
+                obd_export_evict_by_uuid(obd, tmpbuf);
         LPROCFS_ENTRY();
         class_decref(obd);
 
-- 
1.8.3.1