From: yury Date: Mon, 10 Dec 2007 16:53:39 +0000 (+0000) Subject: b=13766 X-Git-Tag: v1_7_0_51~411 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=2f83d64b5485e139e626580b1fa18630acbbb850 b=13766 r=nikita,adilger,shadow - lots of fixes and cleanups in ldlm pools code. --- diff --git a/lustre/include/lustre_dlm.h b/lustre/include/lustre_dlm.h index 7e021b2..8eec429 100644 --- a/lustre/include/lustre_dlm.h +++ b/lustre/include/lustre_dlm.h @@ -217,10 +217,12 @@ struct ldlm_lock; struct ldlm_resource; struct ldlm_namespace; -typedef int (*ldlm_pool_recalc_t)(struct ldlm_pool *pl); - -typedef int (*ldlm_pool_shrink_t)(struct ldlm_pool *pl, - int nr, unsigned int gfp_mask); +struct ldlm_pool_ops { + int (*po_recalc)(struct ldlm_pool *pl); + int (*po_shrink)(struct ldlm_pool *pl, int nr, + unsigned int gfp_mask); + int (*po_setup)(struct ldlm_pool *pl, int limit); +}; enum { LDLM_POOL_CTL_RECALC = 1 << 0, /* Pool recalc is enabled */ @@ -235,39 +237,39 @@ enum { #define LDLM_POOLS_MODEST_MARGIN (5) /* A change to SLV in % after which we want to wake up pools thread asap. */ -#define LDLM_POOLS_FAST_SLV_CHANGE (5) +#define LDLM_POOLS_FAST_SLV_CHANGE (50) struct ldlm_pool { /* Common pool fields */ - cfs_proc_dir_entry_t *pl_proc_dir; /* Pool proc directory. */ - char pl_name[100]; /* Pool name, should be long - * enough to contain complex - * proc entry name. */ - spinlock_t pl_lock; /* Lock for protecting slv/clv - * updates. */ - atomic_t pl_limit; /* Number of allowed locks in - * in pool, both, client and - * server side. */ - atomic_t pl_granted; /* Number of granted locks. */ - atomic_t pl_grant_rate; /* Grant rate per T. */ - atomic_t pl_cancel_rate; /* Cancel rate per T. */ - atomic_t pl_grant_speed; /* Grant speed (GR - CR) per T. */ - __u64 pl_server_lock_volume; /* Server lock volume. Protected - * by pl_lock. */ - cfs_time_t pl_update_time; /* Time when last slv from server - * was obtained. */ - ldlm_pool_recalc_t pl_recalc; /* Recalc callback func pointer. */ - ldlm_pool_shrink_t pl_shrink; /* Shrink callback func pointer. */ - int pl_control; /* Pool features mask */ + cfs_proc_dir_entry_t *pl_proc_dir; /* Pool proc directory. */ + char pl_name[100]; /* Pool name, should be long + * enough to contain complex + * proc entry name. */ + spinlock_t pl_lock; /* Lock for protecting slv/clv + * updates. */ + atomic_t pl_limit; /* Number of allowed locks in + * in pool, both, client and + * server side. */ + atomic_t pl_granted; /* Number of granted locks. */ + atomic_t pl_grant_rate; /* Grant rate per T. */ + atomic_t pl_cancel_rate; /* Cancel rate per T. */ + atomic_t pl_grant_speed; /* Grant speed (GR-CR) per T. */ + __u64 pl_server_lock_volume; /* Server lock volume. + * Protected by pl_lock */ + atomic_t pl_lock_volume_factor; /* Lock volume factor. */ + + time_t pl_recalc_time; /* Time when last slv from + * server was obtained. */ + struct ldlm_pool_ops *pl_ops; /* Recalc and shrink ops. */ + + int pl_control; /* Pool features mask */ - /* Server side pool fields */ - atomic_t pl_grant_plan; /* Planned number of granted - * locks for next T. */ - atomic_t pl_grant_step; /* Grant plan step for next T. */ + atomic_t pl_grant_plan; /* Planned number of granted + * locks for next T. */ + atomic_t pl_grant_step; /* Grant plan step for next + * T. */ - /* Client side pool related fields */ - atomic_t pl_lock_volume_factor; /* Lock volume factor. */ - struct lprocfs_stats *pl_stats; /* Pool statistics. 
*/ + struct lprocfs_stats *pl_stats; /* Pool statistics. */ }; static inline int pool_recalc_enabled(struct ldlm_pool *pl) @@ -295,6 +297,12 @@ typedef enum { LDLM_NAMESPACE_MODEST = 1 << 1 } ldlm_appetite_t; +/* Default value for ->ns_shrink_thumb. If lock is not extent one its cost + * is one page. Here we have 256 pages which is 1M on i386. Thus by default + * all extent locks which have more than 1M long extent will be kept in lru, + * others (including ibits locks) will be canceled on memory pressure event. */ +#define LDLM_LOCK_SHRINK_THUMB 256 + struct ldlm_namespace { char *ns_name; ldlm_side_t ns_client; /* is this a client-side lock tree? */ @@ -315,6 +323,9 @@ struct ldlm_namespace { unsigned int ns_max_unused; unsigned int ns_max_age; + + /* Lower limit to number of pages in lock to keep it in cache */ + unsigned int ns_shrink_thumb; cfs_time_t ns_next_dump; /* next debug dump, jiffies */ atomic_t ns_locks; @@ -805,7 +816,7 @@ int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, int ldlm_pool_shrink(struct ldlm_pool *pl, int nr, unsigned int gfp_mask); void ldlm_pool_fini(struct ldlm_pool *pl); -int ldlm_pool_setup(struct ldlm_pool *pl, __u32 limit); +int ldlm_pool_setup(struct ldlm_pool *pl, int limit); int ldlm_pool_recalc(struct ldlm_pool *pl); __u64 ldlm_pool_get_slv(struct ldlm_pool *pl); __u32 ldlm_pool_get_limit(struct ldlm_pool *pl); diff --git a/lustre/ldlm/ldlm_internal.h b/lustre/ldlm/ldlm_internal.h index 25ac511..73adcb6 100644 --- a/lustre/ldlm/ldlm_internal.h +++ b/lustre/ldlm/ldlm_internal.h @@ -36,9 +36,15 @@ typedef enum { } ldlm_sync_t; /* Cancel lru flag, it indicates we cancel aged locks. */ -#define LDLM_CANCEL_AGED 0x00000001 +enum { + LDLM_CANCEL_AGED = 1 << 0, /* Cancel aged locks (non lru resize). */ + LDLM_CANCEL_PASSED = 1 << 1, /* Cancel passed number of locks. */ + LDLM_CANCEL_SHRINK = 1 << 2, /* Cancel locks from shrinker. */ + LDLM_CANCEL_LRUR = 1 << 3 /* Cancel locks from lru resize. */ +}; -int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, ldlm_sync_t sync); +int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, ldlm_sync_t sync, + int flags); int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels, int count, int max, int flags); diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 41d241f..abf719f 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -1794,18 +1794,16 @@ int target_pack_pool_reply(struct ptlrpc_request *req) struct ldlm_pool *pl; ENTRY; - if (req->rq_export == NULL) { + if (!req->rq_export || !exp_connect_lru_resize(req->rq_export)) { lustre_msg_set_slv(req->rq_repmsg, 0); lustre_msg_set_limit(req->rq_repmsg, 0); RETURN(0); } - if (!exp_connect_lru_resize(req->rq_export)) - RETURN(0); - pl = ldlm_exp2pl(req->rq_export); spin_lock(&pl->pl_lock); + LASSERT(ldlm_pool_get_slv(pl) != 0 && ldlm_pool_get_limit(pl) != 0); lustre_msg_set_slv(req->rq_repmsg, ldlm_pool_get_slv(pl)); lustre_msg_set_limit(req->rq_repmsg, ldlm_pool_get_limit(pl)); spin_unlock(&pl->pl_lock); @@ -1829,7 +1827,6 @@ int target_send_reply_msg(struct ptlrpc_request *req, int rc, int fail_id) DEBUG_REQ(D_NET, req, "sending reply"); } - target_pack_pool_reply(req); return (ptlrpc_send_reply(req, 1)); } diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c index b42d6a5..59c6e72 100644 --- a/lustre/ldlm/ldlm_lock.c +++ b/lustre/ldlm/ldlm_lock.c @@ -635,7 +635,7 @@ void ldlm_lock_decref_internal(struct ldlm_lock *lock, __u32 mode) * enqueue. 
*/ if (!exp_connect_cancelset(lock->l_conn_export) && !ns_connect_lru_resize(ns)) - ldlm_cancel_lru(ns, 0, LDLM_ASYNC); + ldlm_cancel_lru(ns, 0, LDLM_ASYNC, 0); } else { unlock_res_and_lock(lock); } diff --git a/lustre/ldlm/ldlm_pool.c b/lustre/ldlm/ldlm_pool.c index 704b4cd..979de47 100644 --- a/lustre/ldlm/ldlm_pool.c +++ b/lustre/ldlm/ldlm_pool.c @@ -138,11 +138,17 @@ static inline __u64 ldlm_pool_slv_min(__u32 L) } enum { - LDLM_POOL_GRANTED_STAT = 0, + LDLM_POOL_FIRST_STAT = 0, + LDLM_POOL_GRANTED_STAT = LDLM_POOL_FIRST_STAT, + LDLM_POOL_GRANT_STAT, + LDLM_POOL_CANCEL_STAT, LDLM_POOL_GRANT_RATE_STAT, LDLM_POOL_CANCEL_RATE_STAT, LDLM_POOL_GRANT_PLAN_STAT, LDLM_POOL_SLV_STAT, + LDLM_POOL_SHRINK_REQTD_STAT, + LDLM_POOL_SHRINK_FREED_STAT, + LDLM_POOL_RECALC_STAT, LDLM_POOL_LAST_STAT }; @@ -218,8 +224,7 @@ static int ldlm_srv_pool_recalc(struct ldlm_pool *pl) ENTRY; spin_lock(&pl->pl_lock); - recalc_interval_sec = cfs_duration_sec(cfs_time_current() - - pl->pl_update_time); + recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time; if (recalc_interval_sec > 0) { /* Update statistics */ ldlm_pool_recalc_stats(pl); @@ -230,12 +235,12 @@ static int ldlm_srv_pool_recalc(struct ldlm_pool *pl) /* Update grant_plan for new period. */ ldlm_pool_recalc_grant_plan(pl); - pl->pl_update_time = cfs_time_current(); /* Zero out all rates and speed for the last period. */ atomic_set(&pl->pl_grant_rate, 0); atomic_set(&pl->pl_cancel_rate, 0); atomic_set(&pl->pl_grant_speed, 0); + pl->pl_recalc_time = cfs_time_current_sec(); } spin_unlock(&pl->pl_lock); RETURN(0); @@ -246,30 +251,36 @@ static int ldlm_srv_pool_recalc(struct ldlm_pool *pl) static int ldlm_srv_pool_shrink(struct ldlm_pool *pl, int nr, unsigned int gfp_mask) { - __u32 granted, limit; - __u64 slv_delta; + __u32 limit; ENTRY; - /* Client already canceled locks but server is already in shrinker and - * can't cancel anything. Let's catch this race. */ - if ((granted = atomic_read(&pl->pl_granted)) == 0) + /* VM is asking how many entries may be potentially freed. */ + if (nr == 0) + RETURN(atomic_read(&pl->pl_granted)); + + /* Client already canceled locks but server is already in shrinker + * and can't cancel anything. Let's catch this race. */ + if (atomic_read(&pl->pl_granted) == 0) RETURN(0); spin_lock(&pl->pl_lock); - /* Simple proportion but it gives impression on how much should be - * SLV changed for request @nr of locks to be canceled.*/ - slv_delta = nr * ldlm_pool_get_slv(pl); - limit = ldlm_pool_get_limit(pl); - do_div(slv_delta, granted); - - /* As SLV has some dependence on historical data, that is new value - * is based on old one, this decreasing will make clients get some - * locks back to the server and after some time it will stabilize.*/ - if (slv_delta < ldlm_pool_get_slv(pl)) - ldlm_pool_set_slv(pl, ldlm_pool_get_slv(pl) - slv_delta); - else + /* We want shrinker to possibly cause cancelation of @nr locks from + * clients or grant approximately @nr locks smaller next intervals. + * + * This is why we decresed SLV by @nr. This effect will only be as + * long as one re-calc interval (1s these days) and this should be + * enough to pass this decreased SLV to all clients. On next recalc + * interval pool will either increase SLV if locks load is not high + * or will keep on same level or even decrease again, thus, shrinker + * decreased SLV will affect next recalc intervals and this way will + * make locking load lower. 
*/ + if (nr < ldlm_pool_get_slv(pl)) { + ldlm_pool_set_slv(pl, ldlm_pool_get_slv(pl) - nr); + } else { + limit = ldlm_pool_get_limit(pl); ldlm_pool_set_slv(pl, ldlm_pool_slv_min(limit)); + } spin_unlock(&pl->pl_lock); /* We did not really free any memory here so far, it only will be @@ -277,6 +288,13 @@ static int ldlm_srv_pool_shrink(struct ldlm_pool *pl, RETURN(0); } +static int ldlm_srv_pool_setup(struct ldlm_pool *pl, int limit) +{ + ENTRY; + ldlm_pool_set_limit(pl, limit); + RETURN(0); +} + static int ldlm_cli_pool_recalc(struct ldlm_pool *pl) { time_t recalc_interval_sec; @@ -284,8 +302,7 @@ static int ldlm_cli_pool_recalc(struct ldlm_pool *pl) spin_lock(&pl->pl_lock); - recalc_interval_sec = cfs_duration_sec(cfs_time_current() - - pl->pl_update_time); + recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time; if (recalc_interval_sec > 0) { /* Update statistics only every T */ ldlm_pool_recalc_stats(pl); @@ -294,28 +311,63 @@ static int ldlm_cli_pool_recalc(struct ldlm_pool *pl) atomic_set(&pl->pl_grant_rate, 0); atomic_set(&pl->pl_cancel_rate, 0); atomic_set(&pl->pl_grant_speed, 0); + pl->pl_recalc_time = cfs_time_current_sec(); } spin_unlock(&pl->pl_lock); - /* Recalc client pool is done without taking into account pl_update_time - * as this may be called voluntary in the case of emergency. Client - * recalc does not calculate anything, we do not risk to have skew - * of some pool param. */ - ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LDLM_ASYNC); - RETURN(0); + /* Do not cancel locks in case lru resize is disabled for this ns */ + if (!ns_connect_lru_resize(ldlm_pl2ns(pl))) + RETURN(0); + + /* In the time of canceling locks on client we do not need to maintain + * sharp timing, we only want to cancel locks asap according to new SLV. + * This may be called when SLV has changed much, this is why we do not + * take into account pl->pl_recalc_time here. */ + RETURN(ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LDLM_ASYNC, + LDLM_CANCEL_LRUR)); } static int ldlm_cli_pool_shrink(struct ldlm_pool *pl, int nr, unsigned int gfp_mask) { ENTRY; - RETURN(ldlm_cancel_lru(ldlm_pl2ns(pl), nr, LDLM_SYNC)); + + /* Do not cancel locks in case lru resize is disabled for this ns */ + if (!ns_connect_lru_resize(ldlm_pl2ns(pl))) + RETURN(0); + + /* Find out how many locks may be released according to shrink + * policy. 
*/ + if (nr == 0) + RETURN(ldlm_cancel_lru_local(ldlm_pl2ns(pl), NULL, 0, + 0, LDLM_CANCEL_SHRINK)); + + /* Cancel @nr locks accoding to shrink policy */ + RETURN(ldlm_cancel_lru(ldlm_pl2ns(pl), nr, LDLM_SYNC, + LDLM_CANCEL_SHRINK)); } +struct ldlm_pool_ops ldlm_srv_pool_ops = { + .po_recalc = ldlm_srv_pool_recalc, + .po_shrink = ldlm_srv_pool_shrink, + .po_setup = ldlm_srv_pool_setup +}; + +struct ldlm_pool_ops ldlm_cli_pool_ops = { + .po_recalc = ldlm_cli_pool_recalc, + .po_shrink = ldlm_cli_pool_shrink +}; + int ldlm_pool_recalc(struct ldlm_pool *pl) { - if (pl->pl_recalc != NULL && pool_recalc_enabled(pl)) - return pl->pl_recalc(pl); + int count; + + if (pl->pl_ops->po_recalc != NULL && pool_recalc_enabled(pl)) { + count = pl->pl_ops->po_recalc(pl); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_RECALC_STAT, + count); + return count; + } return 0; } EXPORT_SYMBOL(ldlm_pool_recalc); @@ -323,22 +375,32 @@ EXPORT_SYMBOL(ldlm_pool_recalc); int ldlm_pool_shrink(struct ldlm_pool *pl, int nr, unsigned int gfp_mask) { - if (pl->pl_shrink != NULL && pool_shrink_enabled(pl)) { - CDEBUG(D_DLMTRACE, "%s: request to shrink %d locks\n", - pl->pl_name, nr); - return pl->pl_shrink(pl, nr, gfp_mask); + int cancel = 0; + + if (pl->pl_ops->po_shrink != NULL && pool_shrink_enabled(pl)) { + cancel = pl->pl_ops->po_shrink(pl, nr, gfp_mask); + if (nr > 0) { + lprocfs_counter_add(pl->pl_stats, + LDLM_POOL_SHRINK_REQTD_STAT, + nr); + lprocfs_counter_add(pl->pl_stats, + LDLM_POOL_SHRINK_FREED_STAT, + cancel); + CDEBUG(D_DLMTRACE, "%s: request to shrink %d locks, " + "shrunk %d\n", pl->pl_name, nr, cancel); + } } - return 0; + return cancel; } EXPORT_SYMBOL(ldlm_pool_shrink); /* The purpose of this function is to re-setup limit and maximal allowed * slv according to the passed limit. 
*/ -int ldlm_pool_setup(struct ldlm_pool *pl, __u32 limit) +int ldlm_pool_setup(struct ldlm_pool *pl, int limit) { ENTRY; - if (ns_is_server(ldlm_pl2ns(pl))) - ldlm_pool_set_limit(pl, limit); + if (pl->pl_ops->po_setup != NULL) + RETURN(pl->pl_ops->po_setup(pl, limit)); RETURN(0); } EXPORT_SYMBOL(ldlm_pool_setup); @@ -368,10 +430,9 @@ static int lprocfs_rd_pool_state(char *page, char **start, off_t off, pl->pl_name); nr += snprintf(page + nr, count - nr, " SLV: "LPU64"\n", slv); - if (ns_is_client(ldlm_pl2ns(pl))) { - nr += snprintf(page + nr, count - nr, " LVF: %d\n", - atomic_read(&pl->pl_lock_volume_factor)); - } + nr += snprintf(page + nr, count - nr, " LVF: %d\n", + atomic_read(&pl->pl_lock_volume_factor)); + nr += snprintf(page + nr, count - nr, " GSP: %d%%\n", grant_step); nr += snprintf(page + nr, count - nr, " GP: %d\n", @@ -469,13 +530,11 @@ static int ldlm_pool_proc_init(struct ldlm_pool *pl) pool_vars[0].write_fptr = lprocfs_wr_atomic; lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0); - if (ns_is_client(ns)) { - snprintf(var_name, MAX_STRING_SIZE, "lock_volume_factor"); - pool_vars[0].data = &pl->pl_lock_volume_factor; - pool_vars[0].read_fptr = lprocfs_rd_uint; - pool_vars[0].write_fptr = lprocfs_wr_uint; - lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0); - } + snprintf(var_name, MAX_STRING_SIZE, "lock_volume_factor"); + pool_vars[0].data = &pl->pl_lock_volume_factor; + pool_vars[0].read_fptr = lprocfs_rd_uint; + pool_vars[0].write_fptr = lprocfs_wr_uint; + lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0); snprintf(var_name, MAX_STRING_SIZE, "state"); pool_vars[0].data = pl; @@ -483,13 +542,17 @@ static int ldlm_pool_proc_init(struct ldlm_pool *pl) lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0); pl->pl_stats = lprocfs_alloc_stats(LDLM_POOL_LAST_STAT - - LDLM_POOL_GRANTED_STAT, 0); + LDLM_POOL_FIRST_STAT, 0); if (!pl->pl_stats) GOTO(out_free_name, rc = -ENOMEM); lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANTED_STAT, LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, "granted", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_STAT, 0, + "grant", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_STAT, 0, + "cancel", "locks"); lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT, LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, "grant_rate", "locks/s"); @@ -502,6 +565,15 @@ static int ldlm_pool_proc_init(struct ldlm_pool *pl) lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SLV_STAT, LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, "slv", "slv"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_REQTD_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "shrink_request", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_FREED_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "shrink_freed", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_RECALC_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "recalc_freed", "locks"); lprocfs_register_stats(pl->pl_proc_dir, "stats", pl->pl_stats); EXIT; @@ -534,7 +606,7 @@ int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, spin_lock_init(&pl->pl_lock); atomic_set(&pl->pl_granted, 0); - pl->pl_update_time = cfs_time_current(); + pl->pl_recalc_time = cfs_time_current_sec(); atomic_set(&pl->pl_lock_volume_factor, 1); atomic_set(&pl->pl_grant_rate, 0); @@ -548,15 +620,13 @@ int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, ns->ns_name, idx); if (client == LDLM_NAMESPACE_SERVER) { - pl->pl_recalc = ldlm_srv_pool_recalc; - pl->pl_shrink = 
ldlm_srv_pool_shrink; + pl->pl_ops = &ldlm_srv_pool_ops; ldlm_pool_set_limit(pl, LDLM_POOL_HOST_L); ldlm_pool_set_slv(pl, ldlm_pool_slv_max(LDLM_POOL_HOST_L)); } else { ldlm_pool_set_slv(pl, 1); ldlm_pool_set_limit(pl, 1); - pl->pl_recalc = ldlm_cli_pool_recalc; - pl->pl_shrink = ldlm_cli_pool_shrink; + pl->pl_ops = &ldlm_cli_pool_ops; } rc = ldlm_pool_proc_init(pl); @@ -573,8 +643,7 @@ void ldlm_pool_fini(struct ldlm_pool *pl) { ENTRY; ldlm_pool_proc_fini(pl); - pl->pl_recalc = NULL; - pl->pl_shrink = NULL; + pl->pl_ops = NULL; EXIT; } EXPORT_SYMBOL(ldlm_pool_fini); @@ -586,9 +655,12 @@ void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock) atomic_inc(&pl->pl_grant_rate); atomic_inc(&pl->pl_grant_speed); - /* No need to recalc client pools here as this is already done - * on enqueue/cancel and locks to cancel already packed to the - * rpc. */ + lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_GRANT_STAT); + + /* Do not do pool recalc for client side as all locks which + * potentially may be canceled has already been packed into + * enqueue/cancel rpc. Also we do not want to run out of stack + * with too long call paths. */ if (ns_is_server(ldlm_pl2ns(pl))) ldlm_pool_recalc(pl); EXIT; @@ -603,7 +675,8 @@ void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock) atomic_inc(&pl->pl_cancel_rate); atomic_dec(&pl->pl_grant_speed); - /* Same as in ldlm_pool_add() */ + lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_CANCEL_STAT); + if (ns_is_server(ldlm_pl2ns(pl))) ldlm_pool_recalc(pl); EXIT; @@ -675,11 +748,22 @@ static int ldlm_pools_shrink(ldlm_side_t client, int nr, nr, client == LDLM_NAMESPACE_CLIENT ? "client" : "server"); /* Find out how many resources we may release. */ - mutex_down(ldlm_namespace_lock(client)); - list_for_each_entry(ns, ldlm_namespace_list(client), ns_list_chain) - total += ldlm_pool_granted(&ns->ns_pool); - mutex_up(ldlm_namespace_lock(client)); - + for (nr_ns = atomic_read(ldlm_namespace_nr(client)); + nr_ns > 0; nr_ns--) + { + mutex_down(ldlm_namespace_lock(client)); + if (list_empty(ldlm_namespace_list(client))) { + mutex_up(ldlm_namespace_lock(client)); + return 0; + } + ns = ldlm_namespace_first(client); + ldlm_namespace_get(ns); + ldlm_namespace_move(ns, client); + mutex_up(ldlm_namespace_lock(client)); + total += ldlm_pool_shrink(&ns->ns_pool, 0, gfp_mask); + ldlm_namespace_put(ns, 1); + } + if (nr == 0 || total == 0) return total; @@ -727,15 +811,18 @@ void ldlm_pools_recalc(ldlm_side_t client) { __u32 nr_l = 0, nr_p = 0, l; struct ldlm_namespace *ns; - int rc, nr, equal = 0; + int nr, equal = 0; - /* Check all modest namespaces. */ - mutex_down(ldlm_namespace_lock(client)); - list_for_each_entry(ns, ldlm_namespace_list(client), ns_list_chain) { - if (ns->ns_appetite != LDLM_NAMESPACE_MODEST) - continue; + /* No need to setup pool limit for client pools. */ + if (client == LDLM_NAMESPACE_SERVER) { + /* Check all modest namespaces first. */ + mutex_down(ldlm_namespace_lock(client)); + list_for_each_entry(ns, ldlm_namespace_list(client), + ns_list_chain) + { + if (ns->ns_appetite != LDLM_NAMESPACE_MODEST) + continue; - if (client == LDLM_NAMESPACE_SERVER) { l = ldlm_pool_granted(&ns->ns_pool); if (l == 0) l = 1; @@ -747,21 +834,24 @@ void ldlm_pools_recalc(ldlm_side_t client) nr_l += l; nr_p++; } - } - /* Make sure that modest namespaces did not eat more that 2/3 of limit */ - if (nr_l >= 2 * (LDLM_POOL_HOST_L / 3)) { - CWARN("Modest pools eat out 2/3 of locks limit. %d of %lu. 
" - "Upgrade server!\n", nr_l, LDLM_POOL_HOST_L); - equal = 1; - } + /* Make sure that modest namespaces did not eat more that 2/3 + * of limit */ + if (nr_l >= 2 * (LDLM_POOL_HOST_L / 3)) { + CWARN("\"Modest\" pools eat out 2/3 of server locks " + "limit (%d of %lu). This means that you have too " + "many clients for this amount of server RAM. " + "Upgrade server!\n", nr_l, LDLM_POOL_HOST_L); + equal = 1; + } - /* The rest is given to greedy namespaces. */ - list_for_each_entry(ns, ldlm_namespace_list(client), ns_list_chain) { - if (!equal && ns->ns_appetite != LDLM_NAMESPACE_GREEDY) - continue; + /* The rest is given to greedy namespaces. */ + list_for_each_entry(ns, ldlm_namespace_list(client), + ns_list_chain) + { + if (!equal && ns->ns_appetite != LDLM_NAMESPACE_GREEDY) + continue; - if (client == LDLM_NAMESPACE_SERVER) { if (equal) { /* In the case 2/3 locks are eaten out by * modest pools, we re-setup equal limit @@ -777,8 +867,8 @@ void ldlm_pools_recalc(ldlm_side_t client) } ldlm_pool_setup(&ns->ns_pool, l); } + mutex_up(ldlm_namespace_lock(client)); } - mutex_up(ldlm_namespace_lock(client)); /* Recalc at least ldlm_namespace_nr(client) namespaces. */ for (nr = atomic_read(ldlm_namespace_nr(client)); nr > 0; nr--) { @@ -798,11 +888,7 @@ void ldlm_pools_recalc(ldlm_side_t client) mutex_up(ldlm_namespace_lock(client)); /* After setup is done - recalc the pool. */ - rc = ldlm_pool_recalc(&ns->ns_pool); - if (rc) - CERROR("%s: pool recalculation error " - "%d\n", ns->ns_pool.pl_name, rc); - + ldlm_pool_recalc(&ns->ns_pool); ldlm_namespace_put(ns, 1); } } diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index 3e2bce2..2b34473 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -515,15 +515,20 @@ struct ptlrpc_request *ldlm_prep_enqueue_req(struct obd_export *exp, /* Estimate the amount of available space in the request. */ int avail = ldlm_req_handles_avail(exp, size, bufcount, LDLM_ENQUEUE_CANCEL_OFF); + int flags, cancel; + LASSERT(avail >= count); + flags = ns_connect_lru_resize(ns) ? + LDLM_CANCEL_LRUR : LDLM_CANCEL_AGED; + cancel = ns_connect_lru_resize(ns) ? 0 : 1; + /* Cancel lru locks here _only_ if the server supports * EARLY_CANCEL. Otherwise we have to send extra CANCEL * rpc right on enqueue, what will make it slower, vs. * asynchronous rpc in blocking thread. */ - count += ldlm_cancel_lru_local(ns, cancels, - ns_connect_lru_resize(ns) ? 0 : 1, - avail - count, LDLM_CANCEL_AGED); + count += ldlm_cancel_lru_local(ns, cancels, cancel, + avail - count, flags); size[DLM_LOCKREQ_OFF] = ldlm_request_bufsize(count, LDLM_ENQUEUE); } @@ -964,27 +969,42 @@ int ldlm_cli_update_pool(struct ptlrpc_request *req) pl = ldlm_imp2pl(req->rq_import); spin_lock(&pl->pl_lock); -#ifdef __KERNEL__ + + /* Check if we need to wakeup pools thread for fast SLV change. + * This is only done when threads period is noticably long like + * 10s or more. 
*/ +#if defined(__KERNEL__) && (LDLM_POOLS_THREAD_PERIOD >= 10) { - __u64 old_slv, fast_slv_change; + __u64 old_slv, new_slv, fast_change; old_slv = ldlm_pool_get_slv(pl); - fast_slv_change = old_slv * LDLM_POOLS_FAST_SLV_CHANGE; - do_div(fast_slv_change, 100); -#endif - pl->pl_update_time = cfs_time_current(); - ldlm_pool_set_slv(pl, lustre_msg_get_slv(req->rq_repmsg)); - ldlm_pool_set_limit(pl, lustre_msg_get_limit(req->rq_repmsg)); -#ifdef __KERNEL__ + new_slv = lustre_msg_get_slv(req->rq_repmsg); + fast_change = old_slv * LDLM_POOLS_FAST_SLV_CHANGE; + do_div(fast_change, 100); + /* Wake up pools thread only if SLV has changed more than - * 5% since last update. In this case we want to react asap. + * 50% since last update. In this case we want to react asap. * Otherwise it is no sense to wake up pools as they are - * re-calculated every 1s anyways. */ - if (old_slv > ldlm_pool_get_slv(pl) && - old_slv - ldlm_pool_get_slv(pl) > fast_slv_change) + * re-calculated every LDLM_POOLS_THREAD_PERIOD anyways. */ + if (old_slv > new_slv && old_slv - new_slv > fast_change) ldlm_pools_wakeup(); } #endif + /* In some cases RPC may contain slv and limit zeroed out. This is + * the case when server does not support lru resize feature. This is + * also possible in some recovery cases when server side reqs have no + * ref to obd export and thus access to server side namespace is no + * possible. */ + if (lustre_msg_get_slv(req->rq_repmsg) != 0 && + lustre_msg_get_limit(req->rq_repmsg) != 0) { + ldlm_pool_set_slv(pl, lustre_msg_get_slv(req->rq_repmsg)); + ldlm_pool_set_limit(pl, lustre_msg_get_limit(req->rq_repmsg)); + } else { + DEBUG_REQ(D_HA, req, "zero SLV or Limit found " + "(SLV: "LPU64", Limit: %u)", + lustre_msg_get_slv(req->rq_repmsg), + lustre_msg_get_limit(req->rq_repmsg)); + } spin_unlock(&pl->pl_lock); RETURN(0); @@ -1011,13 +1031,18 @@ int ldlm_cli_cancel(struct lustre_handle *lockh) if (rc == LDLM_FL_BL_AST) { rc = ldlm_cli_cancel_req(lock->l_conn_export, &cancels, 1, 0); } else if (rc == LDLM_FL_CANCELING) { + struct ldlm_namespace *ns = lock->l_resource->lr_namespace; int avail = ldlm_cancel_handles_avail(lock->l_conn_export); - int count = 1; + int flags, cancel; LASSERT(avail > 0); - count += ldlm_cancel_lru_local(lock->l_resource->lr_namespace, - &cancels, 0, avail - 1, - LDLM_CANCEL_AGED); - ldlm_cli_cancel_list(&cancels, count, NULL, 0, 0); + + flags = ns_connect_lru_resize(ns) ? + LDLM_CANCEL_LRUR : LDLM_CANCEL_AGED; + cancel = ns_connect_lru_resize(ns) ? 0 : 1; + + cancel += ldlm_cancel_lru_local(ns, &cancels, 0, + avail - cancel, flags); + ldlm_cli_cancel_list(&cancels, cancel, NULL, 0, 0); } if (rc != LDLM_FL_CANCELING) LDLM_LOCK_PUT(lock); @@ -1081,6 +1106,123 @@ static int ldlm_cancel_list_local(struct list_head *cancels, int count) RETURN(count); } +/* Return 1 if @lock should be canceled according to shrinker policy. + * Return zero otherwise. */ +static int ldlm_cancel_shrink_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int unused, int added, + int asked) +{ + int lock_cost; + __u64 page_nr; + + if (lock->l_resource->lr_type == LDLM_EXTENT) { + struct ldlm_extent *l_extent; + + /* For all extent locks cost is 1 + number of pages in + * their extent. */ + l_extent = &lock->l_policy_data.l_extent; + page_nr = (l_extent->end - l_extent->start); + do_div(page_nr, CFS_PAGE_SIZE); + +#ifdef __KERNEL__ + /* XXX: In fact this is evil hack, we can't access inode + * here. For doing it right we need somehow to have number + * of covered by lock. 
This should be fixed later when 10718 + * is landed. */ + if (lock->l_ast_data != NULL) { + struct inode *inode = lock->l_ast_data; + if (page_nr > inode->i_mapping->nrpages) + page_nr = inode->i_mapping->nrpages; + } +#endif + lock_cost = 1 + page_nr; + } else { + /* For all locks which are not extent ones cost is 1 */ + lock_cost = 1; + } + + /* Keep all expensive locks in lru for the memory pressure time + * cancel policy. They anyways may be canceled by lru resize + * pplicy if they have not small enough CLV. */ + return (lock_cost <= ns->ns_shrink_thumb); +} + +/* Return 1 if @lock should be canceled according to lru resize policy. + * Return zero otherwise. */ +static int ldlm_cancel_lrur_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int unused, int added, + int asked) +{ + cfs_time_t cur = cfs_time_current(); + struct ldlm_pool *pl = &ns->ns_pool; + __u64 slv, lvf, lv; + cfs_time_t la; + + spin_lock(&pl->pl_lock); + slv = ldlm_pool_get_slv(pl); + lvf = atomic_read(&pl->pl_lock_volume_factor); + spin_unlock(&pl->pl_lock); + + la = cfs_duration_sec(cfs_time_sub(cur, + lock->l_last_used)); + + /* Stop when slv is not yet come from server or + * lv is smaller than it is. */ + lv = lvf * la * unused; + return (slv > 1 && lv >= slv); +} + +/* Return 1 if @lock should be canceled according to passed policy. + * Return zero otherwise. */ +static int ldlm_cancel_passed_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int unused, int added, + int asked) +{ + /* Do nothing here, we allow canceling all locks which + * are passed here from upper layer logic. So that locks + * number to be canceled will be limited by @count and + * @max in ldlm_cancel_lru_local(). */ + return 1; +} + +/* Return 1 if @lock should be canceled according to aged policy. + * Return zero otherwise. */ +static int ldlm_cancel_aged_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int unused, int added, + int asked) +{ + /* Cancel old locks if reached asked limit. */ + return !((added >= asked) && + cfs_time_before_64(cfs_time_current(), + cfs_time_add(lock->l_last_used, + ns->ns_max_age))); +} + +typedef int (*ldlm_cancel_lru_policy_t)(struct ldlm_namespace *, + struct ldlm_lock *, int, + int, int); + +static ldlm_cancel_lru_policy_t +ldlm_cancel_lru_policy(struct ldlm_namespace *ns, int flags) +{ + if (ns_connect_lru_resize(ns)) { + if (flags & LDLM_CANCEL_SHRINK) + return ldlm_cancel_shrink_policy; + else if (flags & LDLM_CANCEL_LRUR) + return ldlm_cancel_lrur_policy; + else if (flags & LDLM_CANCEL_PASSED) + return ldlm_cancel_passed_policy; + } else { + if (flags & LDLM_CANCEL_AGED) + return ldlm_cancel_aged_policy; + } + return NULL; +} + /* - Free space in lru for @count new locks, * redundant unused locks are canceled locally; * - also cancel locally unused aged locks; @@ -1092,14 +1234,25 @@ static int ldlm_cancel_list_local(struct list_head *cancels, int count) * There are the following use cases: ldlm_cancel_resource_local(), * ldlm_cancel_lru_local() and ldlm_cli_cancel(), which check&set this * flag properly. As any attempt to cancel a lock rely on this flag, - * l_bl_ast list is accessed later without any special locking. */ + * l_bl_ast list is accessed later without any special locking. 
+ * + * Calling policies for enabled lru resize: + * ---------------------------------------- + * flags & LDLM_CANCEL_LRUR - use lru resize policy (SLV from server) to + * cancel not more than @count locks; + * + * flags & LDLM_CANCEL_PASSED - cancel @count number of old locks (located at + * the beginning of lru list); + * + * flags & LDLM_CANCEL_SHRINK - cancel not more than @count locks according to + * memory pressre policy function. + */ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels, int count, int max, int flags) { - cfs_time_t cur = cfs_time_current(); - int added = 0, unused; - struct ldlm_lock *lock; - __u64 slv, lvf, lv; + ldlm_cancel_lru_policy_t cancel_lru_policy_func; + int added = 0, unused, cancel; + struct ldlm_lock *lock, *next; ENTRY; spin_lock(&ns->ns_unused_lock); @@ -1108,103 +1261,85 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels, if (!ns_connect_lru_resize(ns)) count += unused - ns->ns_max_unused; - while (!list_empty(&ns->ns_unused_list)) { - struct ldlm_pool *pl = &ns->ns_pool; - - LASSERT(unused >= 0); + cancel_lru_policy_func = ldlm_cancel_lru_policy(ns, flags); + + list_for_each_entry_safe(lock, next, &ns->ns_unused_list, l_lru) { + /* Make sure that we skip locks being already in cancel. */ + if ((lock->l_flags & LDLM_FL_CANCELING) || + (lock->l_flags & LDLM_FL_BL_AST)) + continue; - if (max && added >= max) + /* For any flags, stop scanning if @max or passed @count is + * reached. */ + if ((max && added >= max) || (count && added >= count)) break; - list_for_each_entry(lock, &ns->ns_unused_list, l_lru) { - /* somebody is already doing CANCEL or there is a - * blocking request will send cancel. */ - if (!(lock->l_flags & LDLM_FL_CANCELING) && - !(lock->l_flags & LDLM_FL_BL_AST)) + /* Pass the lock through the policy filter and see if it + * should stay in lru. */ + if (cancel_lru_policy_func != NULL) { + cancel = cancel_lru_policy_func(ns, lock, unused, + added, count); + + /* Take next lock for shrink policy, we need to check + * whole list. Stop scanning for other policies. */ + if ((flags & LDLM_CANCEL_SHRINK) && !cancel) + continue; + else if (!cancel) break; } - if (&lock->l_lru == &ns->ns_unused_list) - break; - if (ns_connect_lru_resize(ns)) { - cfs_time_t la; - - /* Take into account SLV only if cpount == 0. */ - if (count == 0) { - /* Calculate lv for every lock. */ - spin_lock(&pl->pl_lock); - slv = ldlm_pool_get_slv(pl); - lvf = atomic_read(&pl->pl_lock_volume_factor); - spin_unlock(&pl->pl_lock); - - la = cfs_duration_sec(cfs_time_sub(cur, - lock->l_last_used)); - if (la == 0) - la = 1; - - /* Stop when slv is not yet come from server - * or lv is smaller than it is. */ - lv = lvf * la * unused; - if (slv == 1 || lv < slv) - break; - } else { - if (added >= count) - break; + if (cancels != NULL) { + LDLM_LOCK_GET(lock); /* dropped by bl thread */ + spin_unlock(&ns->ns_unused_lock); + + lock_res_and_lock(lock); + /* Check flags again under the lock. */ + if ((lock->l_flags & LDLM_FL_CANCELING) || + (lock->l_flags & LDLM_FL_BL_AST) || + (ldlm_lock_remove_from_lru(lock) == 0)) { + /* other thread is removing lock from lru or + * somebody is already doing CANCEL or + * there is a blocking request which will send + * cancel by itseft. 
*/ + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + spin_lock(&ns->ns_unused_lock); + continue; } - } else { - if ((added >= count) && - (!(flags & LDLM_CANCEL_AGED) || - cfs_time_before_64(cur, ns->ns_max_age + - lock->l_last_used))) - break; - } - - LDLM_LOCK_GET(lock); /* dropped by bl thread */ - spin_unlock(&ns->ns_unused_lock); - - lock_res_and_lock(lock); - /* Check flags again under the lock. */ - if ((lock->l_flags & LDLM_FL_CANCELING) || - (lock->l_flags & LDLM_FL_BL_AST) || - (ldlm_lock_remove_from_lru(lock) == 0)) { - /* other thread is removing lock from lru or - * somebody is already doing CANCEL or - * there is a blocking request which will send - * cancel by itseft. */ + LASSERT(!lock->l_readers && !lock->l_writers); + + /* If we have chosen to cancel this lock voluntarily, we + * better send cancel notification to server, so that it + * frees appropriate state. This might lead to a race + * where while we are doing cancel here, server is also + * silently cancelling this lock. */ + lock->l_flags &= ~LDLM_FL_CANCEL_ON_BLOCK; + + /* Setting the CBPENDING flag is a little misleading, but + * prevents an important race; namely, once CBPENDING is + * set, the lock can accumulate no more readers/writers. + * Since readers and writers are already zero here, + * ldlm_lock_decref() won't see this flag and call + * l_blocking_ast */ + lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING; + + /* We can't re-add to l_lru as it confuses the refcounting + * in ldlm_lock_remove_from_lru() if an AST arrives after + * we drop ns_lock below. We use l_bl_ast and can't use + * l_pending_chain as it is used both on server and client + * nevertheless bug 5666 says it is used only on server */ + LASSERT(list_empty(&lock->l_bl_ast)); + list_add(&lock->l_bl_ast, cancels); unlock_res_and_lock(lock); - LDLM_LOCK_PUT(lock); spin_lock(&ns->ns_unused_lock); - continue; } - LASSERT(!lock->l_readers && !lock->l_writers); - - /* If we have chosen to canecl this lock voluntarily, we better - send cancel notification to server, so that it frees - appropriate state. This might lead to a race where while - we are doing cancel here, server is also silently - cancelling this lock. */ - lock->l_flags &= ~LDLM_FL_CANCEL_ON_BLOCK; - - /* Setting the CBPENDING flag is a little misleading, but - * prevents an important race; namely, once CBPENDING is set, - * the lock can accumulate no more readers/writers. Since - * readers and writers are already zero here, ldlm_lock_decref - * won't see this flag and call l_blocking_ast */ - lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING; - /* We can't re-add to l_lru as it confuses the refcounting in - * ldlm_lock_remove_from_lru() if an AST arrives after we drop - * ns_lock below. We use l_bl_ast and can't use l_pending_chain - * as it is used both on server and client nevertheles bug 5666 - * says it is used only on server. --umka */ - - LASSERT(list_empty(&lock->l_bl_ast)); - list_add(&lock->l_bl_ast, cancels); - unlock_res_and_lock(lock); - spin_lock(&ns->ns_unused_lock); added++; unused--; } spin_unlock(&ns->ns_unused_lock); + + if (cancels == NULL) + RETURN(added); RETURN(ldlm_cancel_list(cancels, added)); } @@ -1213,7 +1348,8 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels, * in a thread and this function will return after the thread has been * asked to call the callback. when called with LDLM_SYNC the blocking * callback will be performed in this function. 
*/ -int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, ldlm_sync_t sync) +int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, ldlm_sync_t sync, + int flags) { CFS_LIST_HEAD(cancels); int count, rc; @@ -1222,7 +1358,7 @@ int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, ldlm_sync_t sync) #ifndef __KERNEL__ sync = LDLM_SYNC; /* force to be sync in user space */ #endif - count = ldlm_cancel_lru_local(ns, &cancels, nr, 0, 0); + count = ldlm_cancel_lru_local(ns, &cancels, nr, 0, flags); if (sync == LDLM_ASYNC) { rc = ldlm_bl_to_thread_list(ns, NULL, &cancels, count); if (rc == 0) diff --git a/lustre/ldlm/ldlm_resource.c b/lustre/ldlm/ldlm_resource.c index 1a2e3de..16d831f 100644 --- a/lustre/ldlm/ldlm_resource.c +++ b/lustre/ldlm/ldlm_resource.c @@ -152,7 +152,8 @@ static int lprocfs_wr_lru_size(struct file *file, const char *buffer, int canceled, unused = ns->ns_nr_unused; /* Try to cancel all @ns_nr_unused locks. */ - canceled = ldlm_cancel_lru(ns, unused, LDLM_SYNC); + canceled = ldlm_cancel_lru(ns, unused, LDLM_SYNC, + LDLM_CANCEL_PASSED); if (canceled < unused) { CERROR("not all requested locks are canceled, " "requested: %d, canceled: %d\n", unused, @@ -162,7 +163,7 @@ static int lprocfs_wr_lru_size(struct file *file, const char *buffer, } else { tmp = ns->ns_max_unused; ns->ns_max_unused = 0; - ldlm_cancel_lru(ns, 0, LDLM_SYNC); + ldlm_cancel_lru(ns, 0, LDLM_SYNC, LDLM_CANCEL_PASSED); ns->ns_max_unused = tmp; } return count; @@ -185,7 +186,7 @@ static int lprocfs_wr_lru_size(struct file *file, const char *buffer, CDEBUG(D_DLMTRACE, "changing namespace %s unused locks from %u to %u\n", ns->ns_name, ns->ns_nr_unused, (unsigned int)tmp); - ldlm_cancel_lru(ns, (unsigned int)tmp, LDLM_ASYNC); + ldlm_cancel_lru(ns, (unsigned int)tmp, LDLM_ASYNC, LDLM_CANCEL_PASSED); if (!lru_resize) { CDEBUG(D_DLMTRACE, "disable lru_resize for namespace %s\n", @@ -196,7 +197,7 @@ static int lprocfs_wr_lru_size(struct file *file, const char *buffer, CDEBUG(D_DLMTRACE, "changing namespace %s max_unused from %u to %u\n", ns->ns_name, ns->ns_max_unused, (unsigned int)tmp); ns->ns_max_unused = (unsigned int)tmp; - ldlm_cancel_lru(ns, 0, LDLM_ASYNC); + ldlm_cancel_lru(ns, 0, LDLM_ASYNC, LDLM_CANCEL_PASSED); /* Make sure that originally lru resize was supported before * turning it on here. 
*/ @@ -248,13 +249,19 @@ void ldlm_proc_namespace(struct ldlm_namespace *ns) lock_vars[0].write_fptr = lprocfs_wr_lru_size; lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0); + snprintf(lock_name, MAX_STRING_SIZE, "%s/shrink_thumb", + ns->ns_name); + lock_vars[0].data = ns; + lock_vars[0].read_fptr = lprocfs_rd_uint; + lock_vars[0].write_fptr = lprocfs_wr_uint; + lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0); + snprintf(lock_name, MAX_STRING_SIZE, "%s/lru_max_age", ns->ns_name); lock_vars[0].data = &ns->ns_max_age; lock_vars[0].read_fptr = lprocfs_rd_uint; lock_vars[0].write_fptr = lprocfs_wr_uint; lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0); - } } #undef MAX_STRING_SIZE @@ -284,6 +291,7 @@ struct ldlm_namespace *ldlm_namespace_new(char *name, ldlm_side_t client, if (!ns->ns_hash) GOTO(out_ns, NULL); + ns->ns_shrink_thumb = LDLM_LOCK_SHRINK_THUMB; ns->ns_appetite = apt; namelen = strlen(name); OBD_ALLOC(ns->ns_name, namelen + 1); diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index fb1bb1d..5da190c 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -342,6 +342,9 @@ int ptlrpc_send_reply (struct ptlrpc_request *req, int may_be_difficult) lustre_msg_set_opc(req->rq_repmsg, req->rq_reqmsg ? lustre_msg_get_opc(req->rq_reqmsg) : 0); + if (req->rq_export && req->rq_export->exp_obd) + target_pack_pool_reply(req); + if (req->rq_export == NULL || req->rq_export->exp_connection == NULL) conn = ptlrpc_get_connection(req->rq_peer, req->rq_self, NULL); else diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 6e0de6e..486f185 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -4672,12 +4672,21 @@ test_121() { #bug #10589 } run_test 121 "read cancel race =========" +cmd_cancel_lru_locks() { + NS=$1 + test "x$NS" = "x" && NS="mdc" + for d in `find $LPROC/ldlm/namespaces | grep $NS`; do + if test -f $d/lru_size; then + cancel_lru_locks $d + fi + done +} + test_124a() { [ -z "`grep lru_resize $LPROC/mdc/*/connect_flags`" ] && \ skip "no lru resize on server" && return 0 - cancel_lru_locks mdc + cmd_cancel_lru_locks "mdc" lru_resize_enable - NSDIR=`find $LPROC/ldlm/namespaces | grep mdc | head -1` # we want to test main pool functionality, that is cancel based on SLV # this is why shrinkers are disabled @@ -4687,20 +4696,33 @@ test_124a() { NR=2000 mkdir -p $DIR/$tdir || error "failed to create $DIR/$tdir" - LRU_SIZE=`cat $NSDIR/lru_size` - # use touch to produce $NR new locks log "create $NR files at $DIR/$tdir" for ((i=0;i<$NR;i++)); do touch $DIR/$tdir/f$i; done + + NSDIR="" + LRU_SIZE=0 + for d in `find $LPROC/ldlm/namespaces | grep mdc-`; do + if test -f $d/lru_size; then + LRU_SIZE=`cat $d/lru_size` + if test $LRU_SIZE -gt 0; then + log "using $d namespace" + NSDIR=$d + break + fi + fi + done - LRU_SIZE_B=`cat $NSDIR/lru_size` - if test $LRU_SIZE -ge $LRU_SIZE_B; then + if test -z $NSDIR; then skip "No cached locks created!" - cat $NSDIR/pool/state return 0 fi - LRU_SIZE_B=$((LRU_SIZE_B-LRU_SIZE)) - log "created $LRU_SIZE_B lock(s)" + + if test $LRU_SIZE -lt 100; then + skip "Not enough cached locks created!" 
+ return 0 + fi + log "created $LRU_SIZE lock(s)" # we want to sleep 30s to not make test too long SLEEP=30 @@ -4718,6 +4740,7 @@ test_124a() { # Use $LRU_SIZE_B here to take into account real number of locks created # in the case of CMD, LRU_SIZE_B != $NR in most of cases LVF=$(($MAX_HRS * 60 * 60 * $LIMIT / $SLEEP)) + LRU_SIZE_B=$LRU_SIZE log "make client drop locks $LVF times faster so that ${SLEEP}s is enough to cancel $LRU_SIZE_B lock(s)" OLD_LVF=`cat $NSDIR/pool/lock_volume_factor` echo "$LVF" > $NSDIR/pool/lock_volume_factor @@ -4740,39 +4763,82 @@ test_124a() { } run_test 124a "lru resize =======================================" +set_lru_size() { + NS=$1 + SIZE=$2 + test "x$NS" = "x" && NS="mdc" + test "x$SIZE" = "x" && SIZE="0" + test $SIZE -lt 0 && SIZE="0" + test $SIZE -gt 0 && ACTION="disabled" || ACTION="enabled" + for d in `find $LPROC/ldlm/namespaces | grep $NS`; do + if test -f $d/lru_size; then + log "$(basename $d):" + log " lru resize $ACTION" + log " lru_size=$SIZE" + echo $SIZE > $d/lru_size + fi + done +} + +get_lru_size() { + NS=$1 + test "x$NS" = "x" && NS="mdc" + for d in `find $LPROC/ldlm/namespaces | grep $NS`; do + if test -f $d/lru_size; then + log "$(basename $d):" + log " lru_size=$(cat $d/lru_size)" + fi + done +} + test_124b() { [ -z "`grep lru_resize $LPROC/mdc/*/connect_flags`" ] && \ skip "no lru resize on server" && return 0 - cleanup -f || error "failed to unmount" - MOUNTOPT="$MOUNTOPT,nolruresize" - setup - NR=2000 - mkdir -p $DIR/$tdir || error "failed to create $DIR/$tdir" + NSDIR=`find $LPROC/ldlm/namespaces | grep mdc | head -1` + LIMIT=`cat $NSDIR/pool/limit` + + NR_CPU=$(awk '/processor/' /proc/cpuinfo | wc -l) + # 100 locks here is default value for non-shrinkable lru as well + # as the order to switch to static lru managing policy + # define LDLM_DEFAULT_LRU_SIZE (100 * num_online_cpus()) + LDLM_DEFAULT_LRU_SIZE=$((100 * NR_CPU)) + + NR=$((LIMIT-(LIMIT/3))) + log "starting lru resize disable cycle" + set_lru_size "mdc-" $LDLM_DEFAULT_LRU_SIZE - createmany -o $DIR/$tdir/f $NR - log "doing ls -la $DIR/$tdir 3 times (lru resize disabled)" + mkdir -p $DIR/$tdir/disable_lru_resize || + error "failed to create $DIR/$tdir/disable_lru_resize" + + createmany -o $DIR/$tdir/disable_lru_resize/f $NR + log "doing ls -la $DIR/$tdir/disable_lru_resize 3 times" stime=`date +%s` - ls -la $DIR/$tdir > /dev/null - ls -la $DIR/$tdir > /dev/null - ls -la $DIR/$tdir > /dev/null + ls -la $DIR/$tdir/disable_lru_resize > /dev/null + ls -la $DIR/$tdir/disable_lru_resize > /dev/null + ls -la $DIR/$tdir/disable_lru_resize > /dev/null etime=`date +%s` nolruresize_delta=$((etime-stime)) log "ls -la time: $nolruresize_delta seconds" + get_lru_size "mdc-" + + log "starting lru resize enable cycle" + mkdir -p $DIR/$tdir/enable_lru_resize || + error "failed to create $DIR/$tdir/enable_lru_resize" - cleanup -f || error "failed to unmount" - MOUNTOPT=`echo $MOUNTOPT | sed "s/nolruresize/lruresize/"` - setup + # 0 locks means here flush lru and switch to lru resize policy + set_lru_size "mdc-" 0 - createmany -o $DIR/$tdir/f $NR - log "doing ls -la $DIR/$tdir 3 times (lru resize enabled)" + createmany -o $DIR/$tdir/enable_lru_resize/f $NR + log "doing ls -la $DIR/$tdir/enable_lru_resize 3 times" stime=`date +%s` - ls -la $DIR/$tdir > /dev/null - ls -la $DIR/$tdir > /dev/null - ls -la $DIR/$tdir > /dev/null + ls -la $DIR/$tdir/enable_lru_resize > /dev/null + ls -la $DIR/$tdir/enable_lru_resize > /dev/null + ls -la $DIR/$tdir/enable_lru_resize > /dev/null etime=`date +%s` 
lruresize_delta=$((etime-stime)) log "ls -la time: $lruresize_delta seconds" + get_lru_size "mdc-" if test $lruresize_delta -gt $nolruresize_delta; then log "ls -la is $((lruresize_delta - $nolruresize_delta))s slower with lru resize enabled" @@ -4781,8 +4847,6 @@ test_124b() { else log "lru resize performs the same with no lru resize" fi - - unlinkmany $DIR/$tdir/f $NR } run_test 124b "lru resize (performance test) ======================="
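
The client-side cancel policy added above (ldlm_cancel_lrur_policy in lustre/ldlm/ldlm_request.c) reduces to one comparison: a cached lock's volume, computed as lock_volume_factor * idle-time-in-seconds * number-of-unused-locks, is weighed against the server lock volume (SLV) carried back in each reply, and the lock becomes a cancel candidate once its volume reaches the SLV. The following is a minimal user-space sketch of that decision rule only; the struct and helper names are illustrative and do not exist in the Lustre tree.

/*
 * Hedged sketch of the lru-resize cancel rule. Only the formula
 * lv = lvf * age * unused and the comparison against SLV mirror the
 * patch; everything else here is hypothetical scaffolding.
 */
#include <stdint.h>
#include <stdio.h>

struct pool_sample {
        uint64_t slv;   /* server lock volume from the last reply */
        uint32_t lvf;   /* lock_volume_factor (procfs tunable) */
};

/* Return 1 if a lock idle for @age_sec should be cancelled while
 * @unused locks sit on the namespace lru, 0 otherwise. */
static int lrur_should_cancel(const struct pool_sample *p,
                              uint64_t age_sec, unsigned int unused)
{
        uint64_t lv = (uint64_t)p->lvf * age_sec * unused;

        /* slv <= 1 means no SLV has been obtained from the server yet. */
        return p->slv > 1 && lv >= p->slv;
}

int main(void)
{
        struct pool_sample p = { .slv = 1000000, .lvf = 1 };

        printf("young lock: %d\n", lrur_should_cancel(&p, 5, 100));
        printf("old lock:   %d\n", lrur_should_cancel(&p, 3600, 500));
        return 0;
}

Raising lock_volume_factor (as test_124a does via pool/lock_volume_factor) simply scales lv, making clients shed cached locks proportionally sooner for the same SLV.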
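
The shrinker policy (ldlm_cancel_shrink_policy) instead ranks locks by cost: an extent lock costs one unit plus one unit per covered page, any other lock (including ibits) costs one unit, and only locks whose cost does not exceed ns_shrink_thumb (LDLM_LOCK_SHRINK_THUMB, 256 pages, i.e. 1M with 4K pages) are cancelled on memory pressure, so expensive extents stay in the lru. Below is a hedged stand-alone sketch of that cost rule under a fixed 4K page size; the helper names are illustrative, not part of the patch.

/*
 * Sketch of the shrinker cost rule: extent locks cost 1 + pages covered,
 * other locks cost 1, and only costs at or below the shrink thumb are
 * cancelled under memory pressure.
 */
#include <stdint.h>
#include <stdio.h>

#define SHRINK_THUMB_PAGES 256U    /* LDLM_LOCK_SHRINK_THUMB default */
#define PAGE_SIZE_BYTES    4096ULL /* CFS_PAGE_SIZE on i386 */

/* Cost of an extent lock covering [start, end) in bytes. */
static uint64_t extent_lock_cost(uint64_t start, uint64_t end)
{
        return 1 + (end - start) / PAGE_SIZE_BYTES;
}

/* Non-extent (e.g. inodebits) locks always cost one unit. */
static uint64_t plain_lock_cost(void)
{
        return 1;
}

/* 1 - cheap enough to cancel on memory pressure, 0 - keep in the lru. */
static int shrink_should_cancel(uint64_t cost, unsigned int shrink_thumb)
{
        return cost <= shrink_thumb;
}

int main(void)
{
        /* A 512K extent (128 pages) is cancelled, a 4M extent is kept. */
        printf("small extent: %d\n",
               shrink_should_cancel(extent_lock_cost(0, 512 * 1024),
                                    SHRINK_THUMB_PAGES));
        printf("large extent: %d\n",
               shrink_should_cancel(extent_lock_cost(0, 4 * 1024 * 1024),
                                    SHRINK_THUMB_PAGES));
        printf("ibits lock:   %d\n",
               shrink_should_cancel(plain_lock_cost(), SHRINK_THUMB_PAGES));
        return 0;
}

The per-namespace threshold is exported as the new shrink_thumb proc entry, so the 256-page default can be tuned without rebuilding.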