struct lu_site *site;
struct lu_object *orig;
cfs_hash_bd_t bd;
+ const struct lu_fid *fid;
top = o->lo_header;
site = o->lo_dev->ld_site;
orig = o;
+ /*
+ * Till we have full fids-on-OST implemented, anonymous objects
+ * are possible in OSP. Such an object is not listed in the site,
+ * so we should not remove it from the site.
+ */
+ fid = lu_object_fid(o);
+ if (fid_is_zero(fid)) {
+ LASSERT(top->loh_hash.next == NULL &&
+         top->loh_hash.pprev == NULL);
+ LASSERT(cfs_list_empty(&top->loh_lru));
+ if (!cfs_atomic_dec_and_test(&top->loh_ref))
+ return;
+ cfs_list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) {
+ if (o->lo_ops->loo_object_release != NULL)
+ o->lo_ops->loo_object_release(env, o);
+ }
+ lu_object_free(env, orig);
+ return;
+ }
+
cfs_hash_bd_get(site->ls_obj_hash, &top->loh_fid, &bd);
bkt = cfs_hash_bd_extra_get(site->ls_obj_hash, &bd);
/**
* Maximal number of tld slots.
*/
- LU_CONTEXT_KEY_NR = 32
+ LU_CONTEXT_KEY_NR = 40
};
static struct lu_context_key *lu_keys[LU_CONTEXT_KEY_NR] = { NULL, };
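/*
 * For context (editor's sketch, not part of this patch): each context key
 * a module registers claims one slot in lu_keys[], which is why
 * LU_CONTEXT_KEY_NR must grow as new layers (e.g. OSP) add keys. A minimal
 * key definition, using hypothetical names (example_thread_info,
 * example_thread_key):
 */
struct example_thread_info {
        int eti_scratch;
};

static void *example_key_init(const struct lu_context *ctx,
                              struct lu_context_key *key)
{
        struct example_thread_info *info;

        OBD_ALLOC_PTR(info);
        return info != NULL ? info : ERR_PTR(-ENOMEM);
}

static void example_key_fini(const struct lu_context *ctx,
                             struct lu_context_key *key, void *data)
{
        struct example_thread_info *info = data;

        OBD_FREE_PTR(info);
}

/* registering this key via lu_context_key_register() consumes one slot */
static struct lu_context_key example_thread_key = {
        .lct_tags = LCT_MD_THREAD,
        .lct_init = example_key_init,
        .lct_fini = example_key_fini,
};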
#ifdef __KERNEL__
+/*
+ * There exists a potential lock inversion deadlock scenario when using
+ * Lustre on top of ZFS. It occurs between one of ZFS's
+ * buf_hash_table.ht_lock locks and Lustre's lu_sites_guard lock.
+ * Essentially, thread A takes the lu_sites_guard lock and sleeps on the
+ * ht_lock, while thread B takes the ht_lock and sleeps on the
+ * lu_sites_guard lock. Neither thread can ever wake and drop its hold
+ * on its respective lock.
+ *
+ * To prevent this from happening, we must ensure the lu_sites_guard
+ * lock is not taken while on this code path. ZFS reliably does not set
+ * the __GFP_FS bit in its code paths, so this bit can be used to
+ * determine whether it is safe to take the lu_sites_guard lock.
+ *
+ * Ideally we should accurately return the remaining number of cached
+ * objects without taking the lu_sites_guard lock, but this is not
+ * possible in the current implementation.
+ */
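/*
 * Illustrative interleaving (editor's sketch, not from the patch):
 *
 *   Thread A (Lustre shrinker)          Thread B (ZFS)
 *   ------------------------------      ------------------------------
 *   mutex_lock(&lu_sites_guard)
 *                                       spin_lock(ht_lock)
 *   allocation recurses into ZFS,
 *   blocks on ht_lock
 *                                       allocation enters the shrinker,
 *                                       blocks on lu_sites_guard
 *
 *   -> neither thread can make progress
 */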
static int lu_cache_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask))
{
lu_site_stats_t stats;
int remain = shrink_param(sc, nr_to_scan);
CFS_LIST_HEAD(splice);
- if (remain != 0) {
- if (!(shrink_param(sc, gfp_mask) & __GFP_FS))
+ if (!(shrink_param(sc, gfp_mask) & __GFP_FS)) {
+ if (remain != 0)
return -1;
- CDEBUG(D_INODE, "Shrink %d objects\n", remain);
+ /*
+ * We must not take the lu_sites_guard lock when __GFP_FS is
+ * *not* set, because of the deadlock possibility detailed
+ * above. Additionally, since we cannot determine the number
+ * of objects in the cache without taking this lock, we are
+ * in a particularly tough spot. As a result, we simply lie
+ * and say our cache is empty. This _should_ be OK, as we
+ * cannot reclaim objects when __GFP_FS is *not* set anyway.
+ */
+ return 0;
}
+ CDEBUG(D_INODE, "Shrink %d objects\n", remain);
+
cfs_mutex_lock(&lu_sites_guard);
cfs_list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) {
if (shrink_param(sc, nr_to_scan) != 0) {
cfs_hash_bd_unlock(hs, &bd, 1);
}
EXPORT_SYMBOL(lu_object_assign_fid);
+
+/**
+ * Allocate an object with a zero (not yet assigned) FID.
+ *
+ * XXX: temporary solution to allow assigning the FID in ->do_create()
+ * till we have fully functional OST FIDs.
+ */
+struct lu_object *lu_object_anon(const struct lu_env *env,
+ struct lu_device *dev,
+ const struct lu_object_conf *conf)
+{
+ struct lu_fid fid;
+ struct lu_object *o;
+
+ fid_zero(&fid);
+ o = lu_object_alloc(env, dev, &fid, conf);
+
+ return o;
+}
+EXPORT_SYMBOL(lu_object_anon);
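/*
 * Usage sketch (editor's illustration, not part of this patch): how a
 * ->do_create() implementation might pair lu_object_anon() with
 * lu_object_assign_fid() once the real FID becomes known. The helper
 * example_ost_fid_alloc() is hypothetical.
 */
static struct lu_object *example_create_with_late_fid(const struct lu_env *env,
                                                      struct lu_device *dev,
                                                      const struct lu_object_conf *conf)
{
        struct lu_fid fid;
        struct lu_object *o;
        int rc;

        /* allocate an anonymous object; it carries a zero FID and is
         * never inserted into the site hash, so lu_object_put() frees
         * it directly (see the fid_is_zero() path above) */
        o = lu_object_anon(env, dev, conf);
        if (IS_ERR(o))
                return o;

        /* hypothetical: obtain the real FID from the OST */
        rc = example_ost_fid_alloc(env, &fid);
        if (rc != 0) {
                lu_object_put(env, o);
                return ERR_PTR(rc);
        }

        /* publish the object under its real FID */
        lu_object_assign_fid(env, o, &fid);
        return o;
}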