Whamcloud - gitweb
LU-2282 obdclass: Slightly increase the size of lu_keys[]
[fs/lustre-release.git] / lustre / obdclass / lu_object.c
index a91b4b6..74c1cc9 100644 (file)
@@ -75,11 +75,32 @@ void lu_object_put(const struct lu_env *env, struct lu_object *o)
         struct lu_site          *site;
         struct lu_object        *orig;
         cfs_hash_bd_t            bd;
+       const struct lu_fid     *fid;
 
         top  = o->lo_header;
         site = o->lo_dev->ld_site;
         orig = o;
 
+       /*
+        * Until full fids-on-OST support is implemented, anonymous
+        * objects are possible in OSP. Such an object isn't listed in
+        * the site, so we should not remove it from the site.
+        */
+       fid = lu_object_fid(o);
+       if (fid_is_zero(fid)) {
+               LASSERT(top->loh_hash.next == NULL
+                       && top->loh_hash.pprev == NULL);
+               LASSERT(cfs_list_empty(&top->loh_lru));
+               if (!cfs_atomic_dec_and_test(&top->loh_ref))
+                       return;
+               cfs_list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) {
+                       if (o->lo_ops->loo_object_release != NULL)
+                               o->lo_ops->loo_object_release(env, o);
+               }
+               lu_object_free(env, orig);
+               return;
+       }
+
         cfs_hash_bd_get(site->ls_obj_hash, &top->loh_fid, &bd);
         bkt = cfs_hash_bd_extra_get(site->ls_obj_hash, &bd);
 
@@ -135,6 +156,18 @@ void lu_object_put(const struct lu_env *env, struct lu_object *o)
 EXPORT_SYMBOL(lu_object_put);
 
 /**
+ * Put object and don't keep it in cache: set LU_OBJECT_HEARD_BANSHEE in
+ * the header flags, then release the reference with lu_object_put().
+ * This is a temporary solution for multi-site objects whose layering is
+ * not constant.
+ *
+ * \param env  environment handed through to lu_object_put()
+ * \param o    object to put; its header flags are modified
+ */
+void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o)
+{
+       cfs_set_bit(LU_OBJECT_HEARD_BANSHEE, &o->lo_header->loh_flags);
+       /* plain call: a void function must not "return <expression>;" */
+       lu_object_put(env, o);
+}
+EXPORT_SYMBOL(lu_object_put_nocache);
+
+/**
  * Allocate new object.
  *
  * This follows object creation protocol, described in the comment within
@@ -1231,7 +1264,7 @@ enum {
         /**
          * Maximal number of tld slots.
          */
-        LU_CONTEXT_KEY_NR = 32
+        LU_CONTEXT_KEY_NR = 40
 };
 
 static struct lu_context_key *lu_keys[LU_CONTEXT_KEY_NR] = { NULL, };
@@ -1764,6 +1797,24 @@ static void lu_site_stats_get(cfs_hash_t *hs,
 
 #ifdef __KERNEL__
 
+/*
+ * There exists a potential lock inversion deadlock scenario when using
+ * Lustre on top of ZFS. This occurs between one of ZFS's
+ * buf_hash_table.ht_lock's, and Lustre's lu_sites_guard lock. Essentially,
+ * thread A will take the lu_sites_guard lock and sleep on the ht_lock,
+ * while thread B will take the ht_lock and sleep on the lu_sites_guard
+ * lock. Obviously neither thread will wake and drop their respective hold
+ * on their lock.
+ *
+ * To prevent this from happening we must ensure the lu_sites_guard lock is
+ * not taken while down this code path. ZFS reliably does not set the
+ * __GFP_FS bit in its code paths, so this can be used to determine if it
+ * is safe to take the lu_sites_guard lock.
+ *
+ * Ideally we should accurately return the remaining number of cached
+ * objects without taking the lu_sites_guard lock, but this is not
+ * possible in the current implementation.
+ */
 static int lu_cache_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask))
 {
         lu_site_stats_t stats;
@@ -1773,12 +1824,26 @@ static int lu_cache_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask))
         int remain = shrink_param(sc, nr_to_scan);
         CFS_LIST_HEAD(splice);
 
-        if (remain != 0) {
-                if (!(shrink_param(sc, gfp_mask) & __GFP_FS))
+       if (!(shrink_param(sc, gfp_mask) & __GFP_FS)) {
+               if (remain != 0)
                         return -1;
-                CDEBUG(D_INODE, "Shrink %d objects\n", remain);
+               else
+                       /* We must not take the lu_sites_guard lock when
+                        * __GFP_FS is *not* set because of the deadlock
+                        * possibility detailed above. Additionally,
+                        * since we cannot determine the number of
+                        * objects in the cache without taking this
+                        * lock, we're in a particularly tough spot. As
+                        * a result, we'll just lie and say our cache is
+                        * empty. This _should_ be ok, as we can't
+                        * reclaim objects when __GFP_FS is *not* set
+                        * anyways.
+                        */
+                       return 0;
         }
 
+       CDEBUG(D_INODE, "Shrink %d objects\n", remain);
+
         cfs_mutex_lock(&lu_sites_guard);
         cfs_list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) {
                 if (shrink_param(sc, nr_to_scan) != 0) {
@@ -2048,3 +2113,53 @@ void lu_kmem_fini(struct lu_kmem_descr *caches)
         }
 }
 EXPORT_SYMBOL(lu_kmem_fini);
+
+/**
+ * Give a fid to an object whose header still carries a zero fid, and add
+ * that header to the site's object hash under the new fid.
+ *
+ * Temporary solution to be able to assign fid in ->do_create()
+ * till we have fully-functional OST fids
+ *
+ * \param env  not used by this function
+ * \param o    object to update; its header fid must be zero (asserted)
+ * \param fid  fid to assign; the lookup for it in the site hash must
+ *             come back NULL (asserted)
+ */
+void lu_object_assign_fid(const struct lu_env *env, struct lu_object *o,
+                         const struct lu_fid *fid)
+{
+       struct lu_site          *site = o->lo_dev->ld_site;
+       cfs_hash_t              *hash = site->ls_obj_hash;
+       struct lu_fid           *cur = &o->lo_header->loh_fid;
+       struct lu_site_bkt_data *bucket;
+       struct lu_object        *dup;
+       cfs_waitlink_t           waiter;
+       cfs_hash_bd_t            bd;
+       __u64                    version = 0;
+
+       LASSERT(fid_is_zero(cur));
+
+       /* everything up to the unlock below runs with the bucket locked */
+       cfs_hash_bd_get_and_lock(hash, (void *)fid, &bd, 1);
+
+       /* supposed to be unique */
+       dup = htable_lookup(site, &bd, fid, &waiter, &version);
+       LASSERT(dup == NULL);
+
+       /* store the fid first, then add the header to the bucket */
+       *cur = *fid;
+       cfs_hash_bd_add_locked(hash, &bd, &o->lo_header->loh_hash);
+
+       /* bump the busy counter kept in the bucket's extra data */
+       bucket = cfs_hash_bd_extra_get(hash, &bd);
+       bucket->lsb_busy++;
+
+       cfs_hash_bd_unlock(hash, &bd, 1);
+}
+EXPORT_SYMBOL(lu_object_assign_fid);
+
+/**
+ * Allocate an object with a zero (not yet assigned) fid.
+ *
+ * XXX: temporary solution to be able to assign fid in ->do_create()
+ *      till we have fully-functional OST fids
+ *
+ * \param env   handed through to lu_object_alloc()
+ * \param dev   handed through to lu_object_alloc()
+ * \param conf  handed through to lu_object_alloc()
+ *
+ * \return the result of lu_object_alloc() for the zero fid, unchanged
+ */
+struct lu_object *lu_object_anon(const struct lu_env *env,
+                                struct lu_device *dev,
+                                const struct lu_object_conf *conf)
+{
+       struct lu_fid anon_fid;
+
+       fid_zero(&anon_fid);
+       return lu_object_alloc(env, dev, &anon_fid, conf);
+}
+EXPORT_SYMBOL(lu_object_anon);