From b1026acd3a05c4dccf80c5b0e499e198e62d4fca Mon Sep 17 00:00:00 2001 From: bobijam Date: Fri, 6 Mar 2009 04:54:00 +0000 Subject: [PATCH] Branch HEAD b=17536 o=johann i=zhenyu.xu (bobijam) i=adilger MDS create should not wait for statfs RPC while holding DLM lock. --- lustre/ChangeLog | 12 ++-- lustre/include/lustre/lustre_idl.h | 7 ++- lustre/include/lustre_net.h | 9 +-- lustre/include/obd.h | 11 +++- lustre/lov/lov_internal.h | 3 + lustre/lov/lov_obd.c | 18 ++++-- lustre/lov/lov_qos.c | 113 +++++++++++++++++++++++++++++++++++++ lustre/lov/lov_request.c | 18 ++++-- lustre/ptlrpc/ptlrpcd.c | 24 +++++++- 9 files changed, 193 insertions(+), 22 deletions(-) diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 79589dc..768f607 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -14,24 +14,28 @@ tbd Sun Microsystems, Inc. * File join has been disabled in this release, refer to Bugzilla 16929. Severity : enhancement +Bugzilla : 17536 +Description: MDS create should not wait for statfs RPC while holding DLM lock. + +Severity : enhancement Bugzilla : 18289 Description: Update to RHEL5U3 kernel-2.6.18-128.1.1.el5. Severity : normal -Frequency : normal +Frequency : normal Bugzilla : 12069 -Descriptoin: OST grant too much space to client even there are not enough space. +Descriptoin: OST grant too much space to client even there are not enough space. Details : Client will shrink its grant cache to OST if there are no write activity over 6 mins (GRANT_SHRINK_INTERVAL), and OST will retrieve this grant cache if there are already not enough avaible space - (left_space < total_clients * 32M). + (left_space < total_clients * 32M). Severity : normal Frequency : start MDS on uncleanly shutdowned MDS device Bugzilla : 16839 Descriptoin: ll_sync thread stay in waiting mds<>ost recovery finished Details : stay in waiting mds<>ost recovery finished produce random bugs - due race between two ll_sync thread for one lov target. 
send + due race between two ll_sync thread for one lov target. send ACTIVATE event only if connect realy finished and import have FULL state. diff --git a/lustre/include/lustre/lustre_idl.h b/lustre/include/lustre/lustre_idl.h index df4d5b4..f7941cb 100644 --- a/lustre/include/lustre/lustre_idl.h +++ b/lustre/include/lustre/lustre_idl.h @@ -1053,9 +1053,14 @@ struct obd_statfs { extern void lustre_swab_obd_statfs (struct obd_statfs *os); #define OBD_STATFS_NODELAY 0x0001 /* requests should be send without delay * and resends for avoid deadlocks */ - #define OBD_STATFS_FROM_CACHE 0x0002 /* the statfs callback should not update * obd_osfs_age */ +#define OBD_STATFS_PTLRPCD 0x0004 /* requests will be sent via ptlrpcd + * instead of a specific set. This + * means that we cannot rely on the set + * interpret routine to be called. + * lov_fini_statfs() must thus be called + * by the request interpret routine */ /* ost_body.data values for OST_BRW */ diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index b4d2e63..70c3150 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -1147,19 +1147,19 @@ ptlrpc_rqphase_move(struct ptlrpc_request *req, enum rq_phase new_phase) { if (req->rq_phase == new_phase) return; - + if (new_phase == RQ_PHASE_UNREGISTERING) { req->rq_next_phase = req->rq_phase; if (req->rq_import) atomic_inc(&req->rq_import->imp_unregistering); } - + if (req->rq_phase == RQ_PHASE_UNREGISTERING) { if (req->rq_import) atomic_dec(&req->rq_import->imp_unregistering); } - DEBUG_REQ(D_RPCTRACE, req, "move req \"%s\" -> \"%s\"", + DEBUG_REQ(D_RPCTRACE, req, "move req \"%s\" -> \"%s\"", ptlrpc_rqphase2str(req), ptlrpc_phase2str(new_phase)); req->rq_phase = new_phase; @@ -1274,7 +1274,7 @@ int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid); /* ptlrpc/pinger.c */ enum timeout_event { - TIMEOUT_GRANT = 1 + TIMEOUT_GRANT = 1 }; struct timeout_item; typedef int (*timeout_cb_t)(struct timeout_item *, void 
*); @@ -1316,6 +1316,7 @@ int ptlrpcd_start(const char *name, struct ptlrpcd_ctl *pc); void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force); void ptlrpcd_wake(struct ptlrpc_request *req); void ptlrpcd_add_req(struct ptlrpc_request *req, enum ptlrpcd_scope scope); +void ptlrpcd_add_rqset(struct ptlrpc_request_set *set); int ptlrpcd_addref(void); void ptlrpcd_decref(void); diff --git a/lustre/include/obd.h b/lustre/include/obd.h index 5d58050..cb5babd 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -648,6 +648,10 @@ struct lov_qos_rr { unsigned long lqr_dirty:1; /* recalc round-robin list */ }; +struct lov_statfs_data { + struct obd_info lsd_oi; + struct obd_statfs lsd_statfs; +}; /* Stripe placement optimization */ struct lov_qos { struct list_head lq_oss_list; /* list of OSSs that targets use */ @@ -659,7 +663,12 @@ struct lov_qos { unsigned long lq_dirty:1, /* recalc qos data */ lq_same_space:1,/* the ost's all have approx. the same space avail */ - lq_reset:1; /* zero current penalties */ + lq_reset:1, /* zero current penalties */ + lq_statfs_in_progress:1; /* statfs op in progress */ + /* qos statfs data */ + struct lov_statfs_data *lq_statfs_data; + cfs_waitq_t lq_statfs_waitq; /* waitqueue to notify statfs + * requests completion */ }; struct lov_tgt_desc { diff --git a/lustre/lov/lov_internal.h b/lustre/lov/lov_internal.h index 2aaaff4..69f6848 100644 --- a/lustre/lov/lov_internal.h +++ b/lustre/lov/lov_internal.h @@ -166,6 +166,8 @@ int qos_del_tgt(struct obd_device *obd, struct lov_tgt_desc *tgt); void qos_shrink_lsm(struct lov_request_set *set); int qos_prep_create(struct obd_export *exp, struct lov_request_set *set); void qos_update(struct lov_obd *lov); +void qos_statfs_done(struct lov_obd *lov); +void qos_statfs_update(struct obd_device *obd, __u64 max_age, int wait); int qos_remedy_create(struct lov_request_set *set, struct lov_request *req); /* lov_request.c */ @@ -236,6 +238,7 @@ void lov_update_statfs(struct obd_statfs *osfs, struct 
obd_statfs *lov_sfs, int lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs, int success); int lov_fini_statfs_set(struct lov_request_set *set); +int lov_statfs_interpret(struct ptlrpc_request_set *rqset, void *data, int rc); /* lov_obd.c */ void lov_fix_desc(struct lov_desc *desc); diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index 0d4fe78..e51b0e2 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -810,6 +810,11 @@ int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg) lov->lov_qos.lq_prio_free = 232; /* Default threshold for rr (roughly 17%) */ lov->lov_qos.lq_threshold_rr = 43; + /* Init statfs fields */ + OBD_ALLOC_PTR(lov->lov_qos.lq_statfs_data); + if (NULL == lov->lov_qos.lq_statfs_data) + RETURN(-ENOMEM); + cfs_waitq_init(&lov->lov_qos.lq_statfs_waitq); lov->lov_pools_hash_body = lustre_hash_init("POOLS", 7, 7, &pool_hash_operations, 0); @@ -914,6 +919,7 @@ static int lov_cleanup(struct obd_device *obd) /* clear pools parent proc entry only after all pools is killed */ lprocfs_obd_cleanup(obd); + OBD_FREE_PTR(lov->lov_qos.lq_statfs_data); RETURN(0); } @@ -1106,8 +1112,6 @@ static int lov_create(struct obd_export *exp, struct obdo *src_oa, struct obd_info oinfo; struct lov_request_set *set = NULL; struct lov_request *req; - struct obd_statfs osfs; - __u64 maxage; int rc = 0; ENTRY; @@ -1133,8 +1137,11 @@ static int lov_create(struct obd_export *exp, struct obdo *src_oa, GOTO(out, rc); } - maxage = cfs_time_shift_64(-lov->desc.ld_qos_maxage); - obd_statfs_rqset(exp->exp_obd, &osfs, maxage, OBD_STATFS_NODELAY); + /* issue statfs rpcs if the osfs data is older than qos_maxage - 1s, + * later in alloc_qos(), we will wait for those rpcs to complete if + * the osfs age is older than 2 * qos_maxage */ + qos_statfs_update(exp->exp_obd, + cfs_time_shift_64(-lov->desc.ld_qos_maxage) + HZ, 0); rc = lov_prep_create_set(exp, &oinfo, ea, src_oa, oti, &set); if (rc) @@ -1847,8 +1854,7 @@ static int lov_cancel_unused(struct 
obd_export *exp, RETURN(rc); } -static int lov_statfs_interpret(struct ptlrpc_request_set *rqset, - void *data, int rc) +int lov_statfs_interpret(struct ptlrpc_request_set *rqset, void *data, int rc) { struct lov_request_set *lovset = (struct lov_request_set *)data; int err; diff --git a/lustre/lov/lov_qos.c b/lustre/lov/lov_qos.c index 8913d90..084e2f2 100644 --- a/lustre/lov/lov_qos.c +++ b/lustre/lov/lov_qos.c @@ -47,6 +47,7 @@ #include #include +#include #include "lov_internal.h" /* #define QOS_DEBUG 1 */ @@ -765,6 +766,11 @@ static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt, lov_getref(exp->exp_obd); + /* wait for fresh statfs info if needed, the rpcs are sent in + * lov_create() */ + qos_statfs_update(exp->exp_obd, + cfs_time_shift_64(-2 * lov->desc.ld_qos_maxage), 1); + /* Detect -EAGAIN early, before expensive lock is taken. */ if (!lov->lov_qos.lq_dirty && lov->lov_qos.lq_same_space) GOTO(out_nolock, rc = -EAGAIN); @@ -1103,3 +1109,110 @@ void qos_update(struct lov_obd *lov) ENTRY; lov->lov_qos.lq_dirty = 1; } + +void qos_statfs_done(struct lov_obd *lov) +{ + LASSERT(lov->lov_qos.lq_statfs_in_progress); + down_write(&lov->lov_qos.lq_rw_sem); + lov->lov_qos.lq_statfs_in_progress = 0; + /* wake up any threads waiting for the statfs rpcs to complete */ + cfs_waitq_signal(&lov->lov_qos.lq_statfs_waitq); + up_write(&lov->lov_qos.lq_rw_sem); +} + +static int qos_statfs_ready(struct obd_device *obd, __u64 max_age) +{ + struct lov_obd *lov = &obd->u.lov; + int rc; + ENTRY; + down_read(&lov->lov_qos.lq_rw_sem); + rc = lov->lov_qos.lq_statfs_in_progress == 0 || + cfs_time_beforeq_64(max_age, obd->obd_osfs_age); + up_read(&lov->lov_qos.lq_rw_sem); + RETURN(rc); +} + +/* + * Update statfs data if the current osfs age is older than max_age. + * If wait is not set, it means that we are called from lov_create() + * and we should just issue the rpcs without waiting for them to complete. 
+ * If wait is set, we are called from alloc_qos() and we just have + * to wait for the request set to complete. + */ +void qos_statfs_update(struct obd_device *obd, __u64 max_age, int wait) +{ + struct lov_obd *lov = &obd->u.lov; + struct obd_info *oinfo; + int rc = 0; + struct ptlrpc_request_set *set = NULL; + ENTRY; + + if (cfs_time_beforeq_64(max_age, obd->obd_osfs_age)) + /* statfs data are quite recent, don't need to refresh it */ + RETURN_EXIT; + + if (!wait && lov->lov_qos.lq_statfs_in_progress) + /* statfs already in progress */ + RETURN_EXIT; + + down_write(&lov->lov_qos.lq_rw_sem); + if (lov->lov_qos.lq_statfs_in_progress) { + up_write(&lov->lov_qos.lq_rw_sem); + GOTO(out, rc = 0); + } + /* no statfs in flight, send rpcs */ + lov->lov_qos.lq_statfs_in_progress = 1; + up_write(&lov->lov_qos.lq_rw_sem); + + if (wait) + CDEBUG(D_QOS, "%s: did not manage to get fresh statfs data " + "in a timely manner (osfs age "LPU64", max age "LPU64")" + ", sending new statfs rpcs\n", + obd_uuid2str(&lov->desc.ld_uuid), obd->obd_osfs_age, + max_age); + + /* need to send statfs rpcs */ + CDEBUG(D_QOS, "sending new statfs requests\n"); + memset(lov->lov_qos.lq_statfs_data, 0, + sizeof(*lov->lov_qos.lq_statfs_data)); + oinfo = &lov->lov_qos.lq_statfs_data->lsd_oi; + oinfo->oi_osfs = &lov->lov_qos.lq_statfs_data->lsd_statfs; + oinfo->oi_flags = OBD_STATFS_NODELAY; + set = ptlrpc_prep_set(); + if (!set) + GOTO(out_failed, rc = -ENOMEM); + + rc = obd_statfs_async(obd, oinfo, max_age, set); + if (rc || list_empty(&set->set_requests)) { + if (rc) + CWARN("statfs failed with %d\n", rc); + GOTO(out_failed, rc); + } + /* send requests via ptlrpcd */ + oinfo->oi_flags |= OBD_STATFS_PTLRPCD; + ptlrpcd_add_rqset(set); + GOTO(out, rc); + +out_failed: + down_write(&lov->lov_qos.lq_rw_sem); + lov->lov_qos.lq_statfs_in_progress = 0; + /* wake up any threads waiting for the statfs rpcs to complete */ + cfs_waitq_signal(&lov->lov_qos.lq_statfs_waitq); + up_write(&lov->lov_qos.lq_rw_sem); + 
wait = 0; +out: + if (set) + ptlrpc_set_destroy(set); + if (wait) { + struct l_wait_info lwi = { 0 }; + CDEBUG(D_QOS, "waiting for statfs requests to complete\n"); + l_wait_event(lov->lov_qos.lq_statfs_waitq, + qos_statfs_ready(obd, max_age), &lwi); + if (cfs_time_before_64(obd->obd_osfs_age, max_age)) + CDEBUG(D_QOS, "%s: still no fresh statfs data after " + "waiting (osfs age "LPU64", max age " + LPU64")\n", + obd_uuid2str(&lov->desc.ld_uuid), + obd->obd_osfs_age, max_age); + } +} diff --git a/lustre/lov/lov_request.c b/lustre/lov/lov_request.c index f4aab31..249f0b1 100644 --- a/lustre/lov/lov_request.c +++ b/lustre/lov/lov_request.c @@ -955,7 +955,7 @@ int lov_fini_getattr_set(struct lov_request_set *set) } /* The callback for osc_getattr_async that finilizes a request info when a - * response is recieved. */ + * response is received. */ static int cb_getattr_update(void *cookie, int rc) { struct obd_info *oinfo = cookie; @@ -1145,7 +1145,7 @@ int lov_update_setattr_set(struct lov_request_set *set, } /* The callback for osc_setattr_async that finilizes a request info when a - * response is recieved. */ + * response is received. */ static int cb_setattr_update(void *cookie, int rc) { struct obd_info *oinfo = cookie; @@ -1281,7 +1281,7 @@ int lov_update_punch_set(struct lov_request_set *set, } /* The callback for osc_punch that finilizes a request info when a response - * is recieved. */ + * is received. */ static int cb_update_punch(void *cookie, int rc) { struct obd_info *oinfo = cookie; @@ -1566,7 +1566,7 @@ void lov_update_statfs(struct obd_statfs *osfs, struct obd_statfs *lov_sfs, } /* The callback for osc_statfs_async that finilizes a request info when a - * response is recieved. */ + * response is received. 
*/ static int cb_statfs_update(void *cookie, int rc) { struct obd_info *oinfo = cookie; @@ -1593,7 +1593,7 @@ static int cb_statfs_update(void *cookie, int rc) if (rc && !(lov->lov_tgts[lovreq->rq_idx] && lov->lov_tgts[lovreq->rq_idx]->ltd_active)) rc = 0; - RETURN(rc); + GOTO(out, rc); } spin_lock(&obd->obd_osfs_lock); @@ -1604,6 +1604,14 @@ static int cb_statfs_update(void *cookie, int rc) lov_update_statfs(osfs, lov_sfs, success); qos_update(lov); +out: + if (lovreq->rq_rqset->set_oi->oi_flags & OBD_STATFS_PTLRPCD && + lovreq->rq_rqset->set_count == lovreq->rq_rqset->set_completes) { + lov_statfs_interpret(NULL, lovreq->rq_rqset, + lovreq->rq_rqset->set_success != + lovreq->rq_rqset->set_count); + qos_statfs_done(lov); + } RETURN(0); } diff --git a/lustre/ptlrpc/ptlrpcd.c b/lustre/ptlrpc/ptlrpcd.c index 867ec66..2f21170 100644 --- a/lustre/ptlrpc/ptlrpcd.c +++ b/lustre/ptlrpc/ptlrpcd.c @@ -103,6 +103,28 @@ void ptlrpcd_wake(struct ptlrpc_request *req) } /* + * Move all requests from an existing request set to the ptlrpcd queue. + * All requests from the set must be in phase RQ_PHASE_NEW. + */ +void ptlrpcd_add_rqset(struct ptlrpc_request_set *set) +{ + struct list_head *tmp, *pos; + + list_for_each_safe(pos, tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(pos, struct ptlrpc_request, rq_set_chain); + + LASSERT(req->rq_phase == RQ_PHASE_NEW); + list_del_init(&req->rq_set_chain); + req->rq_set = NULL; + ptlrpcd_add_req(req, PSCOPE_OTHER); + set->set_remaining--; + } + LASSERT(set->set_remaining == 0); +} +EXPORT_SYMBOL(ptlrpcd_add_rqset); + +/* * Requests that are added to the ptlrpcd queue are sent via * ptlrpcd_check->ptlrpc_check_set(). */ @@ -263,7 +285,7 @@ static int ptlrpcd(void *arg) exit++; } - /* + /* * Let's make one more loop to make sure that ptlrpcd_check() * copied all raced new rpcs into the set so we can kill them. */ -- 1.8.3.1