* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
* Use is subject to license terms.
*
- * Copyright (c) 2011, 2012, Whamcloud, Inc.
+ * Copyright (c) 2011, 2012, Intel Corporation.
*/
/*
* This file is part of Lustre, http://www.lustre.org/
#include <lustre_fid.h>
#include <lu_object.h>
#include <libcfs/list.h>
-/* lu_time_global_{init,fini}() */
-#include <lu_time.h>
static void lu_object_free(const struct lu_env *env, struct lu_object *o);
struct lu_site *site;
struct lu_object *orig;
cfs_hash_bd_t bd;
+ const struct lu_fid *fid;
top = o->lo_header;
site = o->lo_dev->ld_site;
orig = o;
+ /*
+ * Till we have full FIDs-on-OST implemented, anonymous objects
+ * are possible in OSP. Such an object is not listed in the site
+ * hash, so we should not remove it from the site.
+ */
+ fid = lu_object_fid(o);
+ if (fid_is_zero(fid)) {
+ LASSERT(top->loh_hash.next == NULL
+ && top->loh_hash.pprev == NULL);
+ LASSERT(cfs_list_empty(&top->loh_lru));
+ if (!cfs_atomic_dec_and_test(&top->loh_ref))
+ return;
+ cfs_list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) {
+ if (o->lo_ops->loo_object_release != NULL)
+ o->lo_ops->loo_object_release(env, o);
+ }
+ lu_object_free(env, orig);
+ return;
+ }
+
cfs_hash_bd_get(site->ls_obj_hash, &top->loh_fid, &bd);
bkt = cfs_hash_bd_extra_get(site->ls_obj_hash, &bd);
* and LRU lock, no race with concurrent object lookup is possible
* and we can safely destroy object below.
*/
- cfs_hash_bd_del_locked(site->ls_obj_hash, &bd, &top->loh_hash);
+ if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags))
+ cfs_hash_bd_del_locked(site->ls_obj_hash, &bd, &top->loh_hash);
cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1);
/*
* Object was already removed from hash and lru above, can
*/
void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o)
{
- cfs_set_bit(LU_OBJECT_HEARD_BANSHEE,
- &o->lo_header->loh_flags);
+ /* switch from the cfs_ wrapper to the kernel's native set_bit();
+ * NOTE(review): HEARD_BANSHEE appears to make the final put free
+ * the object instead of caching it — confirm against lu_object_put() */
+ set_bit(LU_OBJECT_HEARD_BANSHEE, &o->lo_header->loh_flags);
return lu_object_put(env, o);
}
EXPORT_SYMBOL(lu_object_put_nocache);
/**
+ * Kill the object and take it out of LRU cache.
+ * Currently used by client code for layout change.
+ */
+void lu_object_unhash(const struct lu_env *env, struct lu_object *o)
+{
+ struct lu_object_header *top;
+
+ top = o->lo_header;
+ set_bit(LU_OBJECT_HEARD_BANSHEE, &top->loh_flags);
+ /* LU_OBJECT_UNHASHED is test-and-set so the object is removed from
+ * the site hash exactly once, even if lu_object_put() (which uses the
+ * same bit) races with us */
+ if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags)) {
+ cfs_hash_t *obj_hash = o->lo_dev->ld_site->ls_obj_hash;
+ cfs_hash_bd_t bd;
+
+ cfs_hash_bd_get_and_lock(obj_hash, &top->loh_fid, &bd, 1);
+ /* drop the object from both the LRU and the hash while holding
+ * the hash bucket lock */
+ cfs_list_del_init(&top->loh_lru);
+ cfs_hash_bd_del_locked(obj_hash, &bd, &top->loh_hash);
+ cfs_hash_bd_unlock(obj_hash, &bd, 1);
+ }
+}
+EXPORT_SYMBOL(lu_object_unhash);
+
+/**
* Allocate new object.
*
* This follows object creation protocol, described in the comment within
int result;
ENTRY;
- /*
- * Create top-level object slice. This will also create
- * lu_object_header.
- */
- top = dev->ld_ops->ldo_object_alloc(env, NULL, dev);
- if (top == NULL)
- RETURN(ERR_PTR(-ENOMEM));
+ /*
+ * Create top-level object slice. This will also create
+ * lu_object_header.
+ */
+ top = dev->ld_ops->ldo_object_alloc(env, NULL, dev);
+ if (top == NULL)
+ RETURN(ERR_PTR(-ENOMEM));
+ if (IS_ERR(top))
+ RETURN(top);
/*
* This is the only place where object fid is assigned. It's constant
* after this point.
*/
- LASSERT(fid_is_igif(f) || fid_ver(f) == 0);
top->lo_header->loh_fid = *f;
layers = &top->lo_header->loh_layers;
do {
*version = ver;
bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, bd);
- /* cfs_hash_bd_lookup_intent is a somehow "internal" function
- * of cfs_hash, but we don't want refcount on object right now */
- hnode = cfs_hash_bd_lookup_locked(s->ls_obj_hash, bd, (void *)f);
+ /* cfs_hash_bd_peek_locked() is somewhat of an "internal" function
+ * of cfs_hash; it does not take a refcount on the object. */
+ hnode = cfs_hash_bd_peek_locked(s->ls_obj_hash, bd, (void *)f);
if (hnode == NULL) {
lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_MISS);
return NULL;
h = container_of0(hnode, struct lu_object_header, loh_hash);
if (likely(!lu_object_is_dying(h))) {
+ cfs_hash_get(s->ls_obj_hash, hnode);
lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_HIT);
cfs_list_del_init(&h->loh_lru);
return lu_object_top(h);
* returned (to assure that references to dying objects are eventually
* drained), and moreover, lookup has to wait until object is freed.
*/
- cfs_atomic_dec(&h->loh_ref);
cfs_waitlink_init(waiter);
cfs_waitq_add(&bkt->lsb_marche_funebre, waiter);
int lu_device_type_init(struct lu_device_type *ldt)
{
- int result;
+ int result = 0;
- CFS_INIT_LIST_HEAD(&ldt->ldt_linkage);
- result = ldt->ldt_ops->ldto_init(ldt);
- if (result == 0)
- cfs_list_add(&ldt->ldt_linkage, &lu_device_types);
- return result;
+ CFS_INIT_LIST_HEAD(&ldt->ldt_linkage);
+ /* ldto_init is now optional; a type without it is registered
+ * unconditionally (result stays 0) */
+ if (ldt->ldt_ops->ldto_init)
+ result = ldt->ldt_ops->ldto_init(ldt);
+ if (result == 0)
+ cfs_list_add(&ldt->ldt_linkage, &lu_device_types);
+ return result;
}
EXPORT_SYMBOL(lu_device_type_init);
void lu_device_type_fini(struct lu_device_type *ldt)
{
- cfs_list_del_init(&ldt->ldt_linkage);
- ldt->ldt_ops->ldto_fini(ldt);
+ cfs_list_del_init(&ldt->ldt_linkage);
+ /* ldto_fini is optional, mirroring the NULL check added in
+ * lu_device_type_init() */
+ if (ldt->ldt_ops->ldto_fini)
+ ldt->ldt_ops->ldto_fini(ldt);
}
EXPORT_SYMBOL(lu_device_type_fini);
{
struct lu_device_type *ldt;
- cfs_list_for_each_entry(ldt, &lu_device_types, ldt_linkage) {
- if (ldt->ldt_device_nr == 0)
- ldt->ldt_ops->ldto_stop(ldt);
- }
+ cfs_list_for_each_entry(ldt, &lu_device_types, ldt_linkage) {
+ if (ldt->ldt_device_nr == 0 && ldt->ldt_ops->ldto_stop)
+ ldt->ldt_ops->ldto_stop(ldt);
+ }
}
EXPORT_SYMBOL(lu_types_stop);
* Global list of all sites on this node
*/
static CFS_LIST_HEAD(lu_sites);
-static CFS_DEFINE_MUTEX(lu_sites_guard);
+static DEFINE_MUTEX(lu_sites_guard);
/**
* Global environment used by site shrinker.
void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d)
{
- cfs_spin_lock(&s->ls_ld_lock);
+ spin_lock(&s->ls_ld_lock);
+ /* idempotent: only link the device if it is not already on the site */
if (cfs_list_empty(&d->ld_linkage))
cfs_list_add(&d->ld_linkage, &s->ls_ld_linkage);
- cfs_spin_unlock(&s->ls_ld_lock);
+ spin_unlock(&s->ls_ld_lock);
}
EXPORT_SYMBOL(lu_dev_add_linkage);
void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d)
{
- cfs_spin_lock(&s->ls_ld_lock);
+ spin_lock(&s->ls_ld_lock);
+ /* list_del_init() leaves ld_linkage empty, so a later
+ * lu_dev_add_linkage() can safely re-add the device */
cfs_list_del_init(&d->ld_linkage);
- cfs_spin_unlock(&s->ls_ld_lock);
+ spin_unlock(&s->ls_ld_lock);
}
EXPORT_SYMBOL(lu_dev_del_linkage);
lu_ref_add(&top->ld_reference, "site-top", s);
CFS_INIT_LIST_HEAD(&s->ls_ld_linkage);
- cfs_spin_lock_init(&s->ls_ld_lock);
+ spin_lock_init(&s->ls_ld_lock);
lu_dev_add_linkage(s, top);
- RETURN(0);
+ RETURN(0);
}
EXPORT_SYMBOL(lu_site_init);
*/
void lu_site_fini(struct lu_site *s)
{
- cfs_mutex_lock(&lu_sites_guard);
+ mutex_lock(&lu_sites_guard);
cfs_list_del_init(&s->ls_linkage);
- cfs_mutex_unlock(&lu_sites_guard);
+ mutex_unlock(&lu_sites_guard);
if (s->ls_obj_hash != NULL) {
cfs_hash_putref(s->ls_obj_hash);
int lu_site_init_finish(struct lu_site *s)
{
int result;
- cfs_mutex_lock(&lu_sites_guard);
+ mutex_lock(&lu_sites_guard);
+ /* lu_sites_guard serializes both the global lu_sites list and
+ * lu_shrink_env (also taken by the cache shrinker) */
result = lu_context_refill(&lu_shrink_env.le_ctx);
if (result == 0)
cfs_list_add(&s->ls_linkage, &lu_sites);
- cfs_mutex_unlock(&lu_sites_guard);
+ mutex_unlock(&lu_sites_guard);
return result;
}
EXPORT_SYMBOL(lu_site_init_finish);
/**
* Maximal number of tld slots.
*/
- LU_CONTEXT_KEY_NR = 32
+ LU_CONTEXT_KEY_NR = 40
};
static struct lu_context_key *lu_keys[LU_CONTEXT_KEY_NR] = { NULL, };
LASSERT(key->lct_owner != NULL);
result = -ENFILE;
- cfs_spin_lock(&lu_keys_guard);
+ spin_lock(&lu_keys_guard);
for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) {
if (lu_keys[i] == NULL) {
key->lct_index = i;
break;
}
}
- cfs_spin_unlock(&lu_keys_guard);
- return result;
+ spin_unlock(&lu_keys_guard);
+ return result;
}
EXPORT_SYMBOL(lu_context_key_register);
*/
void lu_context_key_degister(struct lu_context_key *key)
{
- LASSERT(cfs_atomic_read(&key->lct_used) >= 1);
- LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys));
+ LASSERT(cfs_atomic_read(&key->lct_used) >= 1);
+ LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys));
- lu_context_key_quiesce(key);
+ lu_context_key_quiesce(key);
- ++key_set_version;
- cfs_spin_lock(&lu_keys_guard);
- key_fini(&lu_shrink_env.le_ctx, key->lct_index);
- if (lu_keys[key->lct_index]) {
- lu_keys[key->lct_index] = NULL;
- lu_ref_fini(&key->lct_reference);
- }
- cfs_spin_unlock(&lu_keys_guard);
+ ++key_set_version;
+ spin_lock(&lu_keys_guard);
+ /* release the key's value in the shrinker env and clear its slot in
+ * lu_keys[] under lu_keys_guard, so concurrent lookups see either a
+ * valid key or NULL */
+ key_fini(&lu_shrink_env.le_ctx, key->lct_index);
+ if (lu_keys[key->lct_index]) {
+ lu_keys[key->lct_index] = NULL;
+ lu_ref_fini(&key->lct_reference);
+ }
+ spin_unlock(&lu_keys_guard);
- LASSERTF(cfs_atomic_read(&key->lct_used) == 1,
- "key has instances: %d\n",
- cfs_atomic_read(&key->lct_used));
+ LASSERTF(cfs_atomic_read(&key->lct_used) == 1,
+ "key has instances: %d\n",
+ cfs_atomic_read(&key->lct_used));
}
EXPORT_SYMBOL(lu_context_key_degister);
/*
* XXX memory barrier has to go here.
*/
- cfs_spin_lock(&lu_keys_guard);
- cfs_list_for_each_entry(ctx, &lu_context_remembered,
- lc_remember)
- key_fini(ctx, key->lct_index);
- cfs_spin_unlock(&lu_keys_guard);
- ++key_set_version;
- }
+ spin_lock(&lu_keys_guard);
+ cfs_list_for_each_entry(ctx, &lu_context_remembered,
+ lc_remember)
+ key_fini(ctx, key->lct_index);
+ spin_unlock(&lu_keys_guard);
+ ++key_set_version;
+ }
}
EXPORT_SYMBOL(lu_context_key_quiesce);
{
int rc;
- memset(ctx, 0, sizeof *ctx);
- ctx->lc_state = LCS_INITIALIZED;
- ctx->lc_tags = tags;
- if (tags & LCT_REMEMBER) {
- cfs_spin_lock(&lu_keys_guard);
- cfs_list_add(&ctx->lc_remember, &lu_context_remembered);
- cfs_spin_unlock(&lu_keys_guard);
+ memset(ctx, 0, sizeof *ctx);
+ ctx->lc_state = LCS_INITIALIZED;
+ ctx->lc_tags = tags;
+ if (tags & LCT_REMEMBER) {
+ spin_lock(&lu_keys_guard);
+ cfs_list_add(&ctx->lc_remember, &lu_context_remembered);
+ spin_unlock(&lu_keys_guard);
} else {
CFS_INIT_LIST_HEAD(&ctx->lc_remember);
}
keys_fini(ctx);
} else { /* could race with key degister */
- cfs_spin_lock(&lu_keys_guard);
+ spin_lock(&lu_keys_guard);
keys_fini(ctx);
cfs_list_del_init(&ctx->lc_remember);
- cfs_spin_unlock(&lu_keys_guard);
+ spin_unlock(&lu_keys_guard);
}
}
EXPORT_SYMBOL(lu_context_fini);
void lu_context_tags_update(__u32 tags)
{
- cfs_spin_lock(&lu_keys_guard);
- lu_context_tags_default |= tags;
- key_set_version ++;
- cfs_spin_unlock(&lu_keys_guard);
+ spin_lock(&lu_keys_guard);
+ lu_context_tags_default |= tags;
+ /* bump key_set_version under lu_keys_guard so the key-set change is
+ * observable; presumably existing contexts use it to refill — confirm */
+ key_set_version++;
+ spin_unlock(&lu_keys_guard);
}
EXPORT_SYMBOL(lu_context_tags_update);
void lu_context_tags_clear(__u32 tags)
{
- cfs_spin_lock(&lu_keys_guard);
- lu_context_tags_default &= ~tags;
- key_set_version ++;
- cfs_spin_unlock(&lu_keys_guard);
+ spin_lock(&lu_keys_guard);
+ /* drop the given bits from the default context tags and publish the
+ * change via key_set_version, all under lu_keys_guard */
+ lu_context_tags_default &= ~tags;
+ key_set_version++;
+ spin_unlock(&lu_keys_guard);
}
EXPORT_SYMBOL(lu_context_tags_clear);
void lu_session_tags_update(__u32 tags)
{
- cfs_spin_lock(&lu_keys_guard);
- lu_session_tags_default |= tags;
- key_set_version ++;
- cfs_spin_unlock(&lu_keys_guard);
+ spin_lock(&lu_keys_guard);
+ /* session-tag counterpart of lu_context_tags_update() */
+ lu_session_tags_default |= tags;
+ key_set_version++;
+ spin_unlock(&lu_keys_guard);
}
EXPORT_SYMBOL(lu_session_tags_update);
void lu_session_tags_clear(__u32 tags)
{
- cfs_spin_lock(&lu_keys_guard);
- lu_session_tags_default &= ~tags;
- key_set_version ++;
- cfs_spin_unlock(&lu_keys_guard);
+ spin_lock(&lu_keys_guard);
+ /* session-tag counterpart of lu_context_tags_clear() */
+ lu_session_tags_default &= ~tags;
+ key_set_version++;
+ spin_unlock(&lu_keys_guard);
}
EXPORT_SYMBOL(lu_session_tags_clear);
#ifdef __KERNEL__
+/*
+ * There exists a potential lock inversion deadlock scenario when using
+ * Lustre on top of ZFS. This occurs between one of ZFS's
+ * buf_hash_table.ht_lock's, and Lustre's lu_sites_guard lock. Essentially,
+ * thread A will take the lu_sites_guard lock and sleep on the ht_lock,
+ * while thread B will take the ht_lock and sleep on the lu_sites_guard
+ * lock. Obviously neither thread will wake and drop their respective hold
+ * on their lock.
+ *
+ * To prevent this from happening we must ensure the lu_sites_guard lock is
+ * not taken while down this code path. ZFS reliably does not set the
+ * __GFP_FS bit in its code paths, so this can be used to determine if it
+ * is safe to take the lu_sites_guard lock.
+ *
+ * Ideally we should accurately return the remaining number of cached
+ * objects without taking the lu_sites_guard lock, but this is not
+ * possible in the current implementation.
+ */
static int lu_cache_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask))
{
lu_site_stats_t stats;
int remain = shrink_param(sc, nr_to_scan);
CFS_LIST_HEAD(splice);
- if (remain != 0) {
- if (!(shrink_param(sc, gfp_mask) & __GFP_FS))
+ if (!(shrink_param(sc, gfp_mask) & __GFP_FS)) {
+ if (remain != 0)
return -1;
- CDEBUG(D_INODE, "Shrink %d objects\n", remain);
+ else
+ /* We must not take the lu_sites_guard lock when
+ * __GFP_FS is *not* set because of the deadlock
+ * possibility detailed above. Additionally,
+ * since we cannot determine the number of
+ * objects in the cache without taking this
+ * lock, we're in a particularly tough spot. As
+ * a result, we'll just lie and say our cache is
+ * empty. This _should_ be ok, as we can't
+ * reclaim objects when __GFP_FS is *not* set
+ * anyways.
+ */
+ return 0;
}
- cfs_mutex_lock(&lu_sites_guard);
+ CDEBUG(D_INODE, "Shrink %d objects\n", remain);
+
+ mutex_lock(&lu_sites_guard);
cfs_list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) {
if (shrink_param(sc, nr_to_scan) != 0) {
remain = lu_site_purge(&lu_shrink_env, s, remain);
break;
}
cfs_list_splice(&splice, lu_sites.prev);
- cfs_mutex_unlock(&lu_sites_guard);
+ mutex_unlock(&lu_sites_guard);
cached = (cached / 100) * sysctl_vfs_cache_pressure;
if (shrink_param(sc, nr_to_scan) == 0)
return 0;
}
-void lu_debugging_setup(void)
+int lu_debugging_setup(void)
{
- lu_env_init(&lu_debugging_env, ~0);
+ /* propagate lu_env_init() failure to the caller instead of silently
+ * dropping it (return type changed from void to int) */
+ return lu_env_init(&lu_debugging_env, ~0);
}
void lu_context_keys_dump(void)
int llo_global_init(void);
void llo_global_fini(void);
+/* context key constructor/destructor: lu_ucred_key_init, lu_ucred_key_fini */
+LU_KEY_INIT_FINI(lu_ucred, struct lu_ucred);
+
+static struct lu_context_key lu_ucred_key = {
+ .lct_tags = LCT_SESSION,
+ .lct_init = lu_ucred_key_init,
+ .lct_fini = lu_ucred_key_fini
+};
+
+/**
+ * Get ucred key if session exists and ucred key is allocated on it.
+ * Return NULL otherwise (no session on the env, or no ucred value
+ * stored in the session context).
+ */
+struct lu_ucred *lu_ucred(const struct lu_env *env)
+{
+ if (!env->le_ses)
+ return NULL;
+ return lu_context_key_get(env->le_ses, &lu_ucred_key);
+}
+EXPORT_SYMBOL(lu_ucred);
+
+/**
+ * Get ucred key and check if it is properly initialized.
+ * Return NULL if the key is absent, or if it exists but its uc_valid
+ * is neither UCRED_OLD nor UCRED_NEW.
+ */
+struct lu_ucred *lu_ucred_check(const struct lu_env *env)
+{
+ struct lu_ucred *uc = lu_ucred(env);
+ if (uc && uc->uc_valid != UCRED_OLD && uc->uc_valid != UCRED_NEW)
+ return NULL;
+ return uc;
+}
+EXPORT_SYMBOL(lu_ucred_check);
+
+/**
+ * Get ucred key, which must exist and must be properly initialized.
+ * Assert otherwise; never returns NULL, so callers may dereference
+ * the result without checking.
+ */
+struct lu_ucred *lu_ucred_assert(const struct lu_env *env)
+{
+ struct lu_ucred *uc = lu_ucred_check(env);
+ LASSERT(uc != NULL);
+ return uc;
+}
+EXPORT_SYMBOL(lu_ucred_assert);
+
/**
* Initialization of global lu_* data.
*/
result = lu_context_key_register(&lu_global_key);
if (result != 0)
return result;
+
+ LU_CONTEXT_KEY_INIT(&lu_ucred_key);
+ result = lu_context_key_register(&lu_ucred_key);
+ if (result != 0)
+ return result;
+
/*
* At this level, we don't know what tags are needed, so allocate them
* conservatively. This should not be too bad, because this
* environment is global.
*/
- cfs_mutex_lock(&lu_sites_guard);
+ mutex_lock(&lu_sites_guard);
result = lu_env_init(&lu_shrink_env, LCT_SHRINKER);
- cfs_mutex_unlock(&lu_sites_guard);
+ mutex_unlock(&lu_sites_guard);
if (result != 0)
return result;
if (lu_site_shrinker == NULL)
return -ENOMEM;
- result = lu_time_global_init();
- if (result)
- GOTO(out, result);
-
#ifdef __KERNEL__
- result = dt_global_init();
- if (result)
- GOTO(out, result);
+ result = dt_global_init();
+ if (result != 0)
+ return result;
- result = llo_global_init();
- if (result)
- GOTO(out, result);
+ result = llo_global_init();
+ if (result != 0)
+ return result;
#endif
result = cl_global_init();
-out:
return result;
}
llo_global_fini();
dt_global_fini();
#endif
- lu_time_global_fini();
if (lu_site_shrinker != NULL) {
cfs_remove_shrinker(lu_site_shrinker);
lu_site_shrinker = NULL;
}
lu_context_key_degister(&lu_global_key);
+ lu_context_key_degister(&lu_ucred_key);
/*
* Tear shrinker environment down _after_ de-registering
* lu_global_key, because the latter has a value in the former.
*/
- cfs_mutex_lock(&lu_sites_guard);
+ mutex_lock(&lu_sites_guard);
lu_env_fini(&lu_shrink_env);
- cfs_mutex_unlock(&lu_sites_guard);
+ mutex_unlock(&lu_sites_guard);
lu_ref_global_fini();
}
}
EXPORT_SYMBOL(lu_site_stats_print);
-const char *lu_time_names[LU_TIME_NR] = {
- [LU_TIME_FIND_LOOKUP] = "find_lookup",
- [LU_TIME_FIND_ALLOC] = "find_alloc",
- [LU_TIME_FIND_INSERT] = "find_insert"
-};
-EXPORT_SYMBOL(lu_time_names);
-
/**
* Helper function to initialize a number of kmem slab caches at once.
*/
cfs_hash_bd_unlock(hs, &bd, 1);
}
EXPORT_SYMBOL(lu_object_assign_fid);
+
+/**
+ * Allocate an object with a zero (not yet assigned) FID.
+ * XXX: temporary solution to be able to assign the FID in ->do_create()
+ * till we have fully-functional OST FIDs
+ */
+struct lu_object *lu_object_anon(const struct lu_env *env,
+ struct lu_device *dev,
+ const struct lu_object_conf *conf)
+{
+ struct lu_fid fid;
+ struct lu_object *o;
+
+ /* the zero FID marks the object as anonymous: lu_object_put() skips
+ * the site hash/LRU for objects whose fid_is_zero() */
+ fid_zero(&fid);
+ o = lu_object_alloc(env, dev, &fid, conf);
+
+ return o;
+}
+EXPORT_SYMBOL(lu_object_anon);