From 8cad1d968ad4efee46d08aee1e6a6266d4713e84 Mon Sep 17 00:00:00 2001
From: Rahul Deshmukh
Date: Thu, 7 Oct 2010 17:31:46 +0530
Subject: [PATCH 1/1] b=22176 Add .sync_fs super block handler

Add sync_fs support by wiring a .sync_fs handler into the llite super
block operations.

ll_sync_fs() hands the request to the data export through the new
obd_sync_fs()/o_sync_fs method.  LOV builds one request per active
target and forwards the call to each OSC; when "wait" is set it then
sleeps until every request has been reported back.  Each OSC marks the
write pages pending at that instant as ASYNC_SYNCFS, moves the owning
lois to a separate cl_loi_sync_fs_list and drains that list.

i=oleg
i=eric.mei
---
 lustre/include/obd.h             |   14 ++++-
 lustre/include/obd_class.h       |   14 +++++
 lustre/ldlm/ldlm_lib.c           |    1 +
 lustre/llite/llite_internal.h    |    1 +
 lustre/llite/llite_lib.c         |   13 +++++
 lustre/llite/super25.c           |    1 +
 lustre/lov/lov_internal.h        |    4 ++
 lustre/lov/lov_obd.c             |   35 ++++++++++++
 lustre/lov/lov_request.c         |   67 ++++++++++++++++++++++
 lustre/obdclass/lprocfs_status.c |    1 +
 lustre/osc/osc_internal.h        |    1 +
 lustre/osc/osc_request.c         |  116 ++++++++++++++++++++++++++++++++++++---
 12 files changed, 260 insertions(+), 8 deletions(-)

diff --git a/lustre/include/obd.h b/lustre/include/obd.h
index 4a341d0..46f8bb7 100644
--- a/lustre/include/obd.h
+++ b/lustre/include/obd.h
@@ -96,7 +96,7 @@ struct lov_oinfo { /* per-stripe data structure */
         cfs_list_t loi_hp_ready_item;
         cfs_list_t loi_write_item;
         cfs_list_t loi_read_item;
-
+        cfs_list_t loi_sync_fs_item;
         unsigned long loi_kms_valid:1;
         __u64 loi_kms; /* known minimum size */
         struct ost_lvb loi_lvb;
@@ -123,6 +123,7 @@ static inline void loi_init(struct lov_oinfo *loi)
         CFS_INIT_LIST_HEAD(&loi->loi_hp_ready_item);
         CFS_INIT_LIST_HEAD(&loi->loi_write_item);
         CFS_INIT_LIST_HEAD(&loi->loi_read_item);
+        CFS_INIT_LIST_HEAD(&loi->loi_sync_fs_item);
 }
 
 struct lov_stripe_md {
@@ -159,6 +160,12 @@ struct obd_info;
 
 typedef int (*obd_enqueue_update_f)(void *cookie, int rc);
 
+struct osc_sync_fs_wait {
+        struct obd_info *sfw_oi;
+        obd_enqueue_update_f sfw_upcall;
+        int started;
+};
+
 /* obd info for a particular level (lov, osc). */
 struct obd_info {
         /* Lock policy. It keeps an extent which is specific for a particular
@@ -440,6 +447,7 @@ struct client_obd {
         cfs_list_t cl_loi_hp_ready_list;
         cfs_list_t cl_loi_write_list;
         cfs_list_t cl_loi_read_list;
+        cfs_list_t cl_loi_sync_fs_list;
         int cl_r_in_flight;
         int cl_w_in_flight;
         /* just a sum of the loi/lop pending numbers to be exported by /proc */
@@ -487,7 +495,9 @@ struct client_obd {
         struct lu_client_seq *cl_seq;
 
         cfs_atomic_t cl_resends; /* resend count */
+        struct osc_sync_fs_wait cl_sf_wait;
 };
+
 #define obd2cli_tgt(obd) ((char *)(obd)->u.cli.cl_target_uuid.uuid)
 
 #define CL_NOT_QUOTACHECKED 1 /* client->cl_qchk_stat init value */
@@ -1406,6 +1416,8 @@ struct obd_ops {
                           char *ostname);
         void (*o_getref)(struct obd_device *obd);
         void (*o_putref)(struct obd_device *obd);
+        int (*o_sync_fs)(struct obd_device *obd, struct obd_info *oinfo,
+                         int wait);
         /*
          * NOTE: If adding ops, add another LPROCFS_OBD_OP_INIT() line
          * to lprocfs_alloc_obd_stats() in obdclass/lprocfs_status.c.
diff --git a/lustre/include/obd_class.h b/lustre/include/obd_class.h
index f211b1d..7163582 100644
--- a/lustre/include/obd_class.h
+++ b/lustre/include/obd_class.h
@@ -1259,6 +1259,20 @@ static inline int obd_sync(struct obd_export *exp, struct obdo *oa,
         RETURN(rc);
 }
 
+static inline int obd_sync_fs(struct obd_device *obd, struct obd_info *oinfo,
+                              int wait)
+{
+        int rc;
+        ENTRY;
+
+        OBD_CHECK_DT_OP(obd, sync_fs, -EOPNOTSUPP);
+        OBD_COUNTER_INCREMENT(obd, sync_fs);
+
+        rc = OBP(obd, sync_fs)(obd, oinfo, wait);
+
+        RETURN(rc);
+}
+
 static inline int obd_punch_rqset(struct obd_export *exp,
                                   struct obd_info *oinfo,
                                   struct obd_trans_info *oti)
diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c
index 93f2366..c5e46eb 100644
--- a/lustre/ldlm/ldlm_lib.c
+++ b/lustre/ldlm/ldlm_lib.c
@@ -282,6 +282,7 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
         CFS_INIT_LIST_HEAD(&cli->cl_loi_hp_ready_list);
         CFS_INIT_LIST_HEAD(&cli->cl_loi_write_list);
         CFS_INIT_LIST_HEAD(&cli->cl_loi_read_list);
+        CFS_INIT_LIST_HEAD(&cli->cl_loi_sync_fs_list);
         client_obd_list_lock_init(&cli->cl_loi_list_lock);
         cli->cl_r_in_flight = 0;
         cli->cl_w_in_flight = 0;
diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h
index 03436ec..982645f 100644
--- a/lustre/llite/llite_internal.h
+++ b/lustre/llite/llite_internal.h
@@ -733,6 +733,7 @@ void ll_umount_begin(struct super_block *sb);
 #endif
 int ll_remount_fs(struct super_block *sb, int *flags, char *data);
 int ll_show_options(struct seq_file *seq, struct vfsmount *vfs);
+int ll_sync_fs(struct super_block *sb, int wait);
 int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req,
                   struct super_block *);
 void lustre_dump_dentry(struct dentry *, int recur);
diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c
index a984eb1..0a4db55 100644
--- a/lustre/llite/llite_lib.c
+++ b/lustre/llite/llite_lib.c
@@ -2107,3 +2107,16 @@ int ll_show_options(struct seq_file *seq, struct vfsmount *vfs)
 
         RETURN(0);
 }
+
+int ll_sync_fs(struct super_block *sb, int wait)
+{
+        struct ll_sb_info *sbi = ll_s2sbi(sb);
+        int rc = 0;
+        ENTRY;
+
+        rc = obd_sync_fs(class_exp2obd(sbi->ll_dt_exp), NULL, wait);
+        if (rc)
+                CERROR("sync_fs fails: rc = %d\n", rc);
+
+        RETURN(rc);
+}
diff --git a/lustre/llite/super25.c b/lustre/llite/super25.c
index 358b769..0bf72df 100644
--- a/lustre/llite/super25.c
+++ b/lustre/llite/super25.c
@@ -99,6 +99,7 @@ struct super_operations lustre_super_operations =
         .umount_begin = ll_umount_begin,
         .remount_fs = ll_remount_fs,
         .show_options = ll_show_options,
+        .sync_fs = ll_sync_fs,
 };
 
 
diff --git a/lustre/lov/lov_internal.h b/lustre/lov/lov_internal.h
index 9d899fc..9e640b2 100644
--- a/lustre/lov/lov_internal.h
+++ b/lustre/lov/lov_internal.h
@@ -258,6 +258,10 @@ int lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs,
 int lov_fini_statfs_set(struct lov_request_set *set);
 int lov_statfs_interpret(struct ptlrpc_request_set *rqset, void *data,
                          int rc);
+int cb_sync_fs_update(void *cookie, int rc);
+int lov_fini_sync_fs_set(struct lov_request_set *set);
+int lov_prep_sync_fs_set(struct obd_device *obd, struct obd_info *oinfo,
+                         struct lov_request_set **reqset);
 /* lov_obd.c */
 void lov_fix_desc(struct lov_desc *desc);
 void lov_fix_desc_stripe_size(__u64 *val);
diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c
index b82202a..1ab938a 100644
--- a/lustre/lov/lov_obd.c
+++ b/lustre/lov/lov_obd.c
@@ -1592,6 +1592,40 @@ static int lov_sync(struct obd_export *exp, struct obdo *oa,
         RETURN(rc);
 }
 
+static int lov_sync_fs(struct obd_device *obd, struct obd_info *dummy,
+                       int wait)
+{
+        struct lov_obd *lov;
+        struct obd_info oinfo = { { { 0 } } };
+        struct lov_request *req;
+        struct lov_request_set *set;
+        struct l_wait_info lwi = { 0 };
+        cfs_list_t *pos;
+        int rc = 0, err;
+        ENTRY;
+
+        lov = &obd->u.lov;
+        rc = lov_prep_sync_fs_set(obd, &oinfo, &set);
+        if (rc)
+                RETURN(rc);
+
+        cfs_list_for_each(pos, &set->set_list) {
+                struct obd_device *osc_obd;
+                req = cfs_list_entry(pos, struct lov_request, rq_link);
+
+                osc_obd = class_exp2obd(lov->lov_tgts[req->rq_idx]->ltd_exp);
+                err = obd_sync_fs(osc_obd, &req->rq_oi, wait);
+                if (err) /* complete it ourselves or the wait never ends */
+                        rc = cb_sync_fs_update(&req->rq_oi, err);
+        }
+        /* if wait then check if all sync_fs IO's are done */
+        if (wait)
+                l_wait_event(set->set_waitq, lov_finished_set(set), &lwi);
+
+        lov_fini_sync_fs_set(set);
+        RETURN(rc);
+}
+
 static int lov_brw_check(struct lov_obd *lov, struct obd_info *lov_oinfo,
                          obd_count oa_bufs, struct brw_page *pga)
 {
@@ -2822,6 +2856,7 @@ struct obd_ops lov_obd_ops = {
         .o_pool_del = lov_pool_del,
         .o_getref = lov_getref,
         .o_putref = lov_putref,
+        .o_sync_fs = lov_sync_fs,
 };
 
 static quota_interface_t *quota_interface;
diff --git a/lustre/lov/lov_request.c b/lustre/lov/lov_request.c
index 79d2943..3ad8f6f 100644
--- a/lustre/lov/lov_request.c
+++ b/lustre/lov/lov_request.c
@@ -1524,6 +1524,17 @@ int lov_fini_statfs_set(struct lov_request_set *set)
         RETURN(rc);
 }
 
+int lov_fini_sync_fs_set(struct lov_request_set *set)
+{
+        int rc = 0;
+        ENTRY;
+
+        if (set == NULL)
+                RETURN(rc);
+        lov_put_reqset(set);
+        RETURN(rc);
+}
+
 void lov_update_statfs(struct obd_statfs *osfs, struct obd_statfs *lov_sfs,
                        int success)
 {
@@ -1703,3 +1714,59 @@ out_set:
         lov_fini_statfs_set(set);
         RETURN(rc);
 }
+
+int cb_sync_fs_update(void *cookie, int rc)
+{
+        struct obd_info *oinfo = cookie;
+        struct lov_request *lovreq;
+        ENTRY;
+
+        lovreq = container_of(oinfo, struct lov_request, rq_oi);
+        lov_update_set(lovreq->rq_rqset, lovreq, rc);
+
+        RETURN(rc);
+}
+
+int lov_prep_sync_fs_set(struct obd_device *obd, struct obd_info *oinfo,
+                         struct lov_request_set **request)
+{
+        struct lov_request_set *set;
+        struct lov_obd *lov = &obd->u.lov;
+        int rc = 0;
+        int i;
+
+        ENTRY;
+
+        OBD_ALLOC(set, sizeof(*set));
+        if (set == NULL)
+                RETURN(-ENOMEM);
+        lov_init_set(set);
+        set->set_obd = obd;
+        set->set_oi = oinfo;
+
+        for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+                struct lov_request *req;
+
+                if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_active ||
+                    !lov->lov_tgts[i]->ltd_exp) {
+                        CDEBUG(D_INFO, "lov idx %d inactive or disabled\n", i);
+                        continue;
+                }
+
+                OBD_ALLOC(req, sizeof(*req));
+                if (req == NULL)
+                        GOTO(out, rc = -ENOMEM);
+
+                req->rq_idx = i;
+                req->rq_oi.oi_cb_up = cb_sync_fs_update;
+
+                lov_set_add_req(req, set);
+        }
+        if (!set->set_count)
+                GOTO(out, rc = -EIO);
+        *request = set;
+        RETURN(rc);
+out:
+        lov_fini_sync_fs_set(set);
+        RETURN(rc);
+}
diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c
index 0eac9ea..1ed1903 100644
--- a/lustre/obdclass/lprocfs_status.c
+++ b/lustre/obdclass/lprocfs_status.c
@@ -1473,6 +1473,7 @@ void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats)
         LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_del);
         LPROCFS_OBD_OP_INIT(num_private_stats, stats, getref);
         LPROCFS_OBD_OP_INIT(num_private_stats, stats, putref);
+        LPROCFS_OBD_OP_INIT(num_private_stats, stats, sync_fs);
 }
 
 int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats)
diff --git a/lustre/osc/osc_internal.h b/lustre/osc/osc_internal.h
index 9928169..b30fc9b 100644
--- a/lustre/osc/osc_internal.h
+++ b/lustre/osc/osc_internal.h
@@ -49,6 +49,7 @@ enum async_flags {
                                      to give the caller a chance to update
                                      or cancel the size of the io */
         ASYNC_HP = 0x10,
+        ASYNC_SYNCFS = 0x20,
 };
 
 struct obd_async_page_ops {
diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c
index 7144f0b..f4c5dd0 100644
--- a/lustre/osc/osc_request.c
+++ b/lustre/osc/osc_request.c
@@ -864,6 +864,19 @@ static unsigned long rpcs_in_flight(struct client_obd *cli)
         return cli->cl_r_in_flight + cli->cl_w_in_flight;
 }
 
+/* caller must hold loi_list_lock */
+int osc_wake_sync_fs(struct client_obd *cli)
+{
+        ENTRY;
+        if (cfs_list_empty(&cli->cl_loi_sync_fs_list) &&
+            cli->cl_sf_wait.started) {
+                cli->cl_sf_wait.sfw_upcall(cli->cl_sf_wait.sfw_oi, 0);
+                cli->cl_sf_wait.started = 0;
+                CDEBUG(D_CACHE, "sync_fs_loi list is empty\n");
+        }
+        RETURN(0);
+}
+
 /* caller must hold loi_list_lock */
 void osc_wake_cache_waiters(struct client_obd *cli)
 {
@@ -1924,6 +1937,24 @@ static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap,
 
         osc_release_write_grant(cli, &oap->oap_brw_page, sent);
 }
+static int lop_makes_syncfs_rpc(struct loi_oap_pages *lop)
+{
+        struct osc_async_page *oap;
+        ENTRY;
+
+        if (cfs_list_empty(&lop->lop_urgent))
+                RETURN(0);
+
+        oap = cfs_list_entry(lop->lop_urgent.next,
+                             struct osc_async_page, oap_urgent_item);
+
+        if (oap->oap_async_flags & ASYNC_SYNCFS) {
+                CDEBUG(D_CACHE, "syncfs request forcing RPC\n");
+                RETURN(1);
+        }
+
+        RETURN(0);
+}
 
 /* This maintains the lists of pending pages to read/write for a given object
  * (lop). This is used by osc_check_rpcs->osc_next_loi() and loi_list_maint()
@@ -2012,10 +2043,19 @@ void loi_list_maint(struct client_obd *cli, struct lov_oinfo *loi)
                 on_list(&loi->loi_ready_item, &cli->cl_loi_ready_list, 0);
                 on_list(&loi->loi_hp_ready_item, &cli->cl_loi_hp_ready_list, 1);
         } else {
-                on_list(&loi->loi_hp_ready_item, &cli->cl_loi_hp_ready_list, 0);
-                on_list(&loi->loi_ready_item, &cli->cl_loi_ready_list,
-                        lop_makes_rpc(cli, &loi->loi_write_lop, OBD_BRW_WRITE)||
-                        lop_makes_rpc(cli, &loi->loi_read_lop, OBD_BRW_READ));
+                if (lop_makes_syncfs_rpc(&loi->loi_write_lop)) {
+                        on_list(&loi->loi_sync_fs_item,
+                                &cli->cl_loi_sync_fs_list,
+                                loi->loi_write_lop.lop_num_pending);
+                } else {
+                        on_list(&loi->loi_hp_ready_item,
+                                &cli->cl_loi_hp_ready_list, 0);
+                        on_list(&loi->loi_ready_item, &cli->cl_loi_ready_list,
+                                lop_makes_rpc(cli, &loi->loi_write_lop,
+                                              OBD_BRW_WRITE)||
+                                lop_makes_rpc(cli, &loi->loi_read_lop,
+                                              OBD_BRW_READ));
+                }
         }
 
         on_list(&loi->loi_write_item, &cli->cl_loi_write_list,
@@ -2231,6 +2271,7 @@ static int brw_interpret(const struct lu_env *env,
                         osc_release_write_grant(aa->aa_cli, aa->aa_ppga[i], 1);
         }
         osc_wake_cache_waiters(cli);
+        osc_wake_sync_fs(cli);
         osc_check_rpcs(env, cli);
         client_obd_list_unlock(&cli->cl_loi_list_lock);
         if (!async)
@@ -2403,7 +2444,7 @@ osc_send_oap_rpc(const struct lu_env *env, struct client_obd *cli,
          * to be canceled, the pages covered by the lock will be sent out
          * with ASYNC_HP. We have to send out them as soon as possible. */
         cfs_list_for_each_entry_safe(oap, tmp, &lop->lop_urgent, oap_urgent_item) {
-                if (oap->oap_async_flags & ASYNC_HP)
+                if (oap->oap_async_flags & ASYNC_HP)
                         cfs_list_move(&oap->oap_pending_item, &tmp_list);
                 else
                         cfs_list_move_tail(&oap->oap_pending_item, &tmp_list);
@@ -2563,7 +2604,7 @@ osc_send_oap_rpc(const struct lu_env *env, struct client_obd *cli,
         }
 
         osc_wake_cache_waiters(cli);
-
+        osc_wake_sync_fs(cli);
         loi_list_maint(cli, loi);
 
         client_obd_list_unlock(&cli->cl_loi_list_lock);
@@ -2656,6 +2697,9 @@ struct lov_oinfo *osc_next_loi(struct client_obd *cli)
         if (!cfs_list_empty(&cli->cl_loi_ready_list))
                 RETURN(cfs_list_entry(cli->cl_loi_ready_list.next,
                                       struct lov_oinfo, loi_ready_item));
+        if (!cfs_list_empty(&cli->cl_loi_sync_fs_list))
+                RETURN(cfs_list_entry(cli->cl_loi_sync_fs_list.next,
+                                      struct lov_oinfo, loi_sync_fs_item));
 
         /* then if we have cache waiters, return all objects with queued
          * writes. This is especially important when many small files
@@ -2771,6 +2815,8 @@ void osc_check_rpcs(const struct lu_env *env, struct client_obd *cli)
                         cfs_list_del_init(&loi->loi_write_item);
                 if (!cfs_list_empty(&loi->loi_read_item))
                         cfs_list_del_init(&loi->loi_read_item);
+                if (!cfs_list_empty(&loi->loi_sync_fs_item))
+                        cfs_list_del_init(&loi->loi_sync_fs_item);
 
                 loi_list_maint(cli, loi);
 
@@ -3035,6 +3081,21 @@ int osc_set_async_flags_base(struct client_obd *cli,
         if ((oap->oap_async_flags & async_flags) == async_flags)
                 RETURN(0);
 
+        /* XXX: This introduces a tiny insignificant race for the case if this
+         * loi already had other urgent items.
+         */
+        if (SETTING(oap->oap_async_flags, async_flags, ASYNC_SYNCFS) &&
+            cfs_list_empty(&oap->oap_rpc_item) &&
+            cfs_list_empty(&oap->oap_urgent_item)) {
+                cfs_list_add_tail(&oap->oap_urgent_item, &lop->lop_urgent);
+                flags |= ASYNC_SYNCFS;
+                cfs_spin_lock(&oap->oap_lock);
+                oap->oap_async_flags |= flags;
+                cfs_spin_unlock(&oap->oap_lock);
+                loi_list_maint(cli, loi);
+                RETURN(0);
+        }
+
         if (SETTING(oap->oap_async_flags, async_flags, ASYNC_READY))
                 flags |= ASYNC_READY;
 
@@ -3091,7 +3152,8 @@ int osc_teardown_async_page(struct obd_export *exp,
         if (!cfs_list_empty(&oap->oap_urgent_item)) {
                 cfs_list_del_init(&oap->oap_urgent_item);
                 cfs_spin_lock(&oap->oap_lock);
-                oap->oap_async_flags &= ~(ASYNC_URGENT | ASYNC_HP);
+                oap->oap_async_flags &= ~(ASYNC_URGENT | ASYNC_HP |
+                                          ASYNC_SYNCFS);
                 cfs_spin_unlock(&oap->oap_lock);
         }
         if (!cfs_list_empty(&oap->oap_pending_item)) {
@@ -4513,6 +4575,45 @@ int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg)
         return(rc);
 }
 
+static int osc_sync_fs(struct obd_device *obd, struct obd_info *oinfo,
+                       int wait)
+{
+        struct client_obd *cli;
+        struct lov_oinfo *loi, *tloi;
+        struct osc_async_page *oap, *toap;
+        struct loi_oap_pages *lop;
+        struct lu_env *env;
+        int refcheck, rc = 0;
+        ENTRY;
+
+        env = cl_env_get(&refcheck);
+        if (IS_ERR(env))
+                RETURN(PTR_ERR(env));
+
+        cli = &obd->u.cli;
+        client_obd_list_lock(&cli->cl_loi_list_lock);
+        /* a non-waiting caller may drop @oinfo at once: never call it back */
+        if (wait && oinfo != NULL) {
+                cli->cl_sf_wait.sfw_oi = oinfo;
+                cli->cl_sf_wait.sfw_upcall = oinfo->oi_cb_up;
+                cli->cl_sf_wait.started = 1;
+        }
+        /* creating cl_loi_sync_fs list */
+        cfs_list_for_each_entry_safe(loi, tloi, &cli->cl_loi_write_list,
+                                     loi_write_item) {
+                lop = &loi->loi_write_lop;
+                cfs_list_for_each_entry_safe(oap, toap, &lop->lop_pending,
+                                             oap_pending_item)
+                        osc_set_async_flags_base(cli, loi, oap, ASYNC_SYNCFS);
+        }
+
+        osc_check_rpcs(env, cli);
+        osc_wake_sync_fs(cli);
+        client_obd_list_unlock(&cli->cl_loi_list_lock);
+        cl_env_put(env, &refcheck);
+        RETURN(rc);
+}
+
 static int osc_process_config(struct obd_device *obd, obd_count len, void *buf)
 {
         return osc_process_config_base(obd, buf);
@@ -4555,6 +4656,7 @@ struct obd_ops osc_obd_ops = {
         .o_llog_init = osc_llog_init,
         .o_llog_finish = osc_llog_finish,
         .o_process_config = osc_process_config,
+        .o_sync_fs = osc_sync_fs,
 };
 
 extern struct lu_kmem_descr osc_caches[];
-- 
1.8.3.1