LU-3857 osd: cleanup procfs after osd_shutdown

[fs/lustre-release.git] / lustre / osd-ldiskfs / osd_handler.c
diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c

index dec4da4..c13c682 100644 (file)
--- a/lustre/osd-ldiskfs/osd_handler.c
+++ b/lustre/osd-ldiskfs/osd_handler.c
@@ -64,6 +64,7 @@
  #include <lustre_fid.h>
  
  #include "osd_internal.h"
+#include "osd_dynlocks.h"
  
  /* llo_* api support */
  #include <md_object.h>
@@ -77,6 +78,20 @@ int ldiskfs_track_declares_assert;
  CFS_MODULE_PARM(ldiskfs_track_declares_assert, "i", int, 0644,
                 "LBUG during tracking of declares");
  
+/* Slab to allocate dynlocks */
+struct kmem_cache *dynlock_cachep;
+
+static struct lu_kmem_descr ldiskfs_caches[] = {
+       {
+               .ckd_cache = &dynlock_cachep,
+               .ckd_name  = "dynlock_cache",
+               .ckd_size  = sizeof(struct dynlock_handle)
+       },
+       {
+               .ckd_cache = NULL
+       }
+};
+
  static const char dot[] = ".";
  static const char dotdot[] = "..";
  static const char remote_obj_dir[] = "REM_OBJ_DIR";
@@ -304,6 +319,7 @@ int osd_get_idif(struct osd_thread_info *info, struct inode *inode,
  static int osd_check_lma(const struct lu_env *env, struct osd_object *obj)
  {
         struct osd_thread_info  *info   = osd_oti_get(env);
+       struct osd_device       *osd    = osd_obj2dev(obj);
         struct lustre_mdt_attrs *lma    = &info->oti_mdt_attrs;
         struct inode            *inode  = obj->oo_inode;
         struct dentry           *dentry = &info->oti_obj_dentry;
@@ -318,11 +334,41 @@ static int osd_check_lma(const struct lu_env *env, struct osd_object *obj)
         rc = __osd_xattr_get(inode, dentry, XATTR_NAME_LMA,
                              info->oti_mdt_attrs_old, LMA_OLD_SIZE);
         if (rc == -ENODATA && !fid_is_igif(lu_object_fid(&obj->oo_dt.do_lu)) &&
-           osd_obj2dev(obj)->od_check_ff) {
+           osd->od_check_ff) {
                 fid = &lma->lma_self_fid;
                 rc = osd_get_idif(info, inode, dentry, fid);
-               if (rc > 0)
+               if ((rc > 0) || (rc == -ENODATA && osd->od_lma_self_repair)) {
+                       handle_t *jh;
+
+                       /* For the given OST-object, if it has neither LMA nor
+                        * FID in XATTR_NAME_FID, then the given FID (which is
+                        * contained in the @obj, from client RPC for locating
+                        * the OST-object) is trusted. We use it to generate
+                        * the LMA. */
+
+                       LASSERT(current->journal_info == NULL);
+
+                       jh = osd_journal_start_sb(osd_sb(osd), LDISKFS_HT_MISC,
+                                       osd_dto_credits_noquota[DTO_XATTR_SET]);
+                       if (IS_ERR(jh)) {
+                               CWARN("%s: cannot start journal for "
+                                     "lma_self_repair: rc = %ld\n",
+                                     osd_name(osd), PTR_ERR(jh));
+                               RETURN(0);
+                       }
+
+                       rc = osd_ea_fid_set(info, inode,
+                               lu_object_fid(&obj->oo_dt.do_lu),
+                               fid_is_on_ost(info, osd,
+                                             lu_object_fid(&obj->oo_dt.do_lu),
+                                             OI_CHECK_FLD) ?
+                               LMAC_FID_ON_OST : 0, 0);
+                       if (rc != 0)
+                               CWARN("%s: cannot self repair the LMA: "
+                                     "rc = %d\n", osd_name(osd), rc);
+                       ldiskfs_journal_stop(jh);
                         RETURN(0);
+               }
         }
  
         if (unlikely(rc == -ENODATA))
@@ -337,8 +383,7 @@ static int osd_check_lma(const struct lu_env *env, struct osd_object *obj)
                 if (unlikely((lma->lma_incompat & ~LMA_INCOMPAT_SUPP) ||
                              CFS_FAIL_CHECK(OBD_FAIL_OSD_LMA_INCOMPAT))) {
                         CWARN("%s: unsupported incompat LMA feature(s) %#x for "
-                             "fid = "DFID", ino = %lu\n",
-                             osd_obj2dev(obj)->od_svname,
+                             "fid = "DFID", ino = %lu\n", osd_name(osd),
                               lma->lma_incompat & ~LMA_INCOMPAT_SUPP,
                               PFID(lu_object_fid(&obj->oo_dt.do_lu)),
                               inode->i_ino);
@@ -351,8 +396,7 @@ static int osd_check_lma(const struct lu_env *env, struct osd_object *obj)
         if (fid != NULL &&
             unlikely(!lu_fid_eq(lu_object_fid(&obj->oo_dt.do_lu), fid))) {
                 CDEBUG(D_INODE, "%s: FID "DFID" != self_fid "DFID"\n",
-                      osd_obj2dev(obj)->od_svname,
-                      PFID(lu_object_fid(&obj->oo_dt.do_lu)),
+                      osd_name(osd), PFID(lu_object_fid(&obj->oo_dt.do_lu)),
                        PFID(&lma->lma_self_fid));
                 rc = -EREMCHG;
         }
@@ -392,8 +436,17 @@ static int osd_fid_lookup(const struct lu_env *env, struct osd_object *obj,
         if (OBD_FAIL_CHECK(OBD_FAIL_OST_ENOENT))
                 RETURN(-ENOENT);
  
+       /* The object is being created as a locking anchor, or is about to
+        * be created on disk. There is no need to call osd_oi_lookup() here
+        * because a FID should never be re-used; if it really is a duplicate
+        * FID for some unexpected reason, we will detect it later when
+        * do_create->osd_oi_insert() is called. */
+       if (conf != NULL && conf->loc_flags & LOC_F_NEW)
+               GOTO(out, result = 0);
+
         /* Search order: 1. per-thread cache. */
-       if (lu_fid_eq(fid, &oic->oic_fid)) {
+       if (lu_fid_eq(fid, &oic->oic_fid) &&
+           likely(oic->oic_dev == dev)) {
                 id = &oic->oic_lid;
                 goto iget;
         }
@@ -406,16 +459,6 @@ static int osd_fid_lookup(const struct lu_env *env, struct osd_object *obj,
                         goto iget;
         }
  
-       /*
-        * Objects are created as locking anchors or place holders for objects
-        * yet to be created. No need to osd_oi_lookup() at here because FID
-        * shouldn't never be re-used, if it's really a duplicate FID from
-        * unexpected reason, we should be able to detect it later by calling
-        * do_create->osd_oi_insert()
-        */
-       if (conf != NULL && conf->loc_flags & LOC_F_NEW)
-               GOTO(out, result = 0);
-
         /* Search order: 3. OI files. */
         result = osd_oi_lookup(info, dev, fid, id, OI_CHECK_FLD);
         if (result == -ENOENT) {
@@ -440,7 +483,7 @@ iget:
                 if (result == -ENOENT || result == -ESTALE) {
                         if (!in_oi) {
                                 fid_zero(&oic->oic_fid);
-                               GOTO(out, result = 0);
+                               GOTO(out, result = -ENOENT);
                         }
  
                         /* XXX: There are three possible cases:
@@ -458,10 +501,6 @@ iget:
                         if (result == 0)
                                 /* It is the case 1 or 2. */
                                 goto trigger;
-
-                       if (result == -ENOENT)
-                               /* It is the case 3. */
-                               result = 0;
                 } else if (result == -EREMCHG) {
  
  trigger:
@@ -878,7 +917,7 @@ int osd_trans_start(const struct lu_env *env, struct dt_device *d,
           * XXX temporary stuff. Some abstraction layer should
           * be used.
           */
-        jh = ldiskfs_journal_start_sb(osd_sb(dev), oh->ot_credits);
+        jh = osd_journal_start_sb(osd_sb(dev), LDISKFS_HT_MISC, oh->ot_credits);
          osd_th_started(oh);
          if (!IS_ERR(jh)) {
                  oh->ot_handle = jh;
@@ -898,6 +937,28 @@ out:
          RETURN(rc);
  }
  
+static int osd_seq_exists(const struct lu_env *env,
+                             struct osd_device *osd, obd_seq seq)
+{
+       struct lu_seq_range     *range = &osd_oti_get(env)->oti_seq_range;
+       struct seq_server_site  *ss = osd_seq_site(osd);
+       int                     rc;
+       ENTRY;
+
+       if (ss == NULL)
+               RETURN(1);
+
+       rc = osd_fld_lookup(env, osd, seq, range);
+       if (rc != 0) {
+               if (rc != -ENOENT)
+                       CERROR("%s: can't lookup FLD sequence "LPX64
+                              ": rc = %d\n", osd_name(osd), seq, rc);
+               RETURN(0);
+       }
+
+       RETURN(ss->ss_node_id == range->lsr_index);
+}
+
  /*
   * Concurrency: shouldn't matter.
   */
@@ -947,21 +1008,21 @@ static int osd_trans_stop(const struct lu_env *env, struct thandle *th)
                  OBD_FREE_PTR(oh);
          }
  
-        /* as we want IO to journal and data IO be concurrent, we don't block
-         * awaiting data IO completion in osd_do_bio(), instead we wait here
-         * once transaction is submitted to the journal. all reqular requests
-         * don't do direct IO (except read/write), thus this wait_event becomes
-         * no-op for them.
-         *
-         * IMPORTANT: we have to wait till any IO submited by the thread is
-         * completed otherwise iobuf may be corrupted by different request
-         */
-        cfs_wait_event(iobuf->dr_wait,
-                       cfs_atomic_read(&iobuf->dr_numreqs) == 0);
-        if (!rc)
-                rc = iobuf->dr_error;
+       /* As we want journal IO and data IO to be concurrent, we don't block
+        * awaiting data IO completion in osd_do_bio(); instead we wait here
+        * once the transaction is submitted to the journal. All regular
+        * requests don't do direct IO (except read/write), thus this
+        * wait_event becomes a no-op for them.
+        *
+        * IMPORTANT: we have to wait till any IO submitted by the thread is
+        * completed, otherwise iobuf may be corrupted by a different request
+        */
+       wait_event(iobuf->dr_wait,
+                      cfs_atomic_read(&iobuf->dr_numreqs) == 0);
+       if (!rc)
+               rc = iobuf->dr_error;
  
-        RETURN(rc);
+       RETURN(rc);
  }
  
  static int osd_trans_cb_add(struct thandle *th, struct dt_txn_commit_cb *dcb)
@@ -1114,7 +1175,6 @@ static void osd_conf_get(const struct lu_env *env,
          /*
           * XXX should be taken from not-yet-existing fs abstraction layer.
           */
-       param->ddp_mnt = osd_dt_dev(dev)->od_mnt;
          param->ddp_max_name_len = LDISKFS_NAME_LEN;
          param->ddp_max_nlink    = LDISKFS_LINK_MAX;
         param->ddp_block_shift  = sb->s_blocksize_bits;
@@ -1419,48 +1479,50 @@ static int capa_is_sane(const struct lu_env *env,
  }
  
  int osd_object_auth(const struct lu_env *env, struct dt_object *dt,
-                    struct lustre_capa *capa, __u64 opc)
+                   struct lustre_capa *capa, __u64 opc)
  {
-        const struct lu_fid *fid = lu_object_fid(&dt->do_lu);
-        struct osd_device *dev = osd_dev(dt->do_lu.lo_dev);
-        struct md_capainfo *ci;
-        int rc;
+       const struct lu_fid *fid = lu_object_fid(&dt->do_lu);
+       struct osd_device *osd = osd_dev(dt->do_lu.lo_dev);
+       struct lu_capainfo *lci;
+       int rc;
  
-        if (!dev->od_fl_capa)
-                return 0;
+       if (!osd->od_fl_capa)
+               return 0;
  
-        if (capa == BYPASS_CAPA)
-                return 0;
+       if (capa == BYPASS_CAPA)
+               return 0;
  
-        ci = md_capainfo(env);
-        if (unlikely(!ci))
-                return 0;
+       lci = lu_capainfo_get(env);
+       if (unlikely(lci == NULL))
+               return 0;
  
-        if (ci->mc_auth == LC_ID_NONE)
-                return 0;
+       if (lci->lci_auth == LC_ID_NONE)
+               return 0;
  
-        if (!capa) {
-                CERROR("no capability is provided for fid "DFID"\n", PFID(fid));
-                return -EACCES;
-        }
+       if (capa == NULL) {
+               CERROR("%s: no capability provided for FID "DFID": rc = %d\n",
+                      osd_name(osd), PFID(fid), -EACCES);
+               return -EACCES;
+       }
  
-        if (!lu_fid_eq(fid, &capa->lc_fid)) {
-                DEBUG_CAPA(D_ERROR, capa, "fid "DFID" mismatch with",
-                           PFID(fid));
-                return -EACCES;
-        }
+       if (!lu_fid_eq(fid, &capa->lc_fid)) {
+               DEBUG_CAPA(D_ERROR, capa, "fid "DFID" mismatch with",
+                          PFID(fid));
+               return -EACCES;
+       }
  
-        if (!capa_opc_supported(capa, opc)) {
-                DEBUG_CAPA(D_ERROR, capa, "opc "LPX64" not supported by", opc);
-                return -EACCES;
-        }
+       if (!capa_opc_supported(capa, opc)) {
+               DEBUG_CAPA(D_ERROR, capa, "opc "LPX64" not supported by", opc);
+               return -EACCES;
+       }
  
-        if ((rc = capa_is_sane(env, dev, capa, dev->od_capa_keys))) {
-                DEBUG_CAPA(D_ERROR, capa, "insane (rc %d)", rc);
-                return -EACCES;
-        }
+       rc = capa_is_sane(env, osd, capa, osd->od_capa_keys);
+       if (rc != 0) {
+               DEBUG_CAPA(D_ERROR, capa, "insane: rc = %d", rc);
+               return -EACCES;
+       }
  
-        return 0;
+       return 0;
  }
  
  static struct timespec *osd_inode_time(const struct lu_env *env,
@@ -1796,9 +1858,8 @@ struct dentry *osd_child_dentry_get(const struct lu_env *env,
  }
  
  static int osd_mkfile(struct osd_thread_info *info, struct osd_object *obj,
-                      cfs_umode_t mode,
-                      struct dt_allocation_hint *hint,
-                      struct thandle *th)
+                     umode_t mode, struct dt_allocation_hint *hint,
+                     struct thandle *th)
  {
          int result;
          struct osd_device  *osd = osd_obj2dev(obj);
@@ -1931,16 +1992,16 @@ static int osd_mksym(struct osd_thread_info *info, struct osd_object *obj,
  }
  
  static int osd_mknod(struct osd_thread_info *info, struct osd_object *obj,
-                     struct lu_attr *attr,
-                     struct dt_allocation_hint *hint,
-                     struct dt_object_format *dof,
-                     struct thandle *th)
+                    struct lu_attr *attr,
+                    struct dt_allocation_hint *hint,
+                    struct dt_object_format *dof,
+                    struct thandle *th)
  {
-        cfs_umode_t mode = attr->la_mode & (S_IFMT | S_IALLUGO | S_ISVTX);
-        int result;
+       umode_t mode = attr->la_mode & (S_IFMT | S_IALLUGO | S_ISVTX);
+       int result;
  
-        LINVRNT(osd_invariant(obj));
-        LASSERT(obj->oo_inode == NULL);
+       LINVRNT(osd_invariant(obj));
+       LASSERT(obj->oo_inode == NULL);
          LASSERT(S_ISCHR(mode) || S_ISBLK(mode) ||
                  S_ISFIFO(mode) || S_ISSOCK(mode));
  
@@ -1995,7 +2056,7 @@ static osd_obj_type_f osd_create_type_f(enum dt_format_type type)
  
  static void osd_ah_init(const struct lu_env *env, struct dt_allocation_hint *ah,
                         struct dt_object *parent, struct dt_object *child,
-                       cfs_umode_t child_mode)
+                       umode_t child_mode)
  {
          LASSERT(ah);
  
@@ -2082,31 +2143,34 @@ static int __osd_object_create(struct osd_thread_info *info,
   * \retval 0, on success
   */
  static int __osd_oi_insert(const struct lu_env *env, struct osd_object *obj,
-                           const struct lu_fid *fid, struct thandle *th)
+                          const struct lu_fid *fid, struct thandle *th)
  {
-        struct osd_thread_info *info = osd_oti_get(env);
-        struct osd_inode_id    *id   = &info->oti_id;
-        struct osd_device      *osd  = osd_obj2dev(obj);
+       struct osd_thread_info *info = osd_oti_get(env);
+       struct osd_inode_id    *id   = &info->oti_id;
+       struct osd_device      *osd  = osd_obj2dev(obj);
+       struct osd_thandle     *oh;
  
-        LASSERT(obj->oo_inode != NULL);
+       LASSERT(obj->oo_inode != NULL);
+
+       oh = container_of0(th, struct osd_thandle, ot_super);
+       LASSERT(oh->ot_handle);
  
         osd_id_gen(id, obj->oo_inode->i_ino, obj->oo_inode->i_generation);
-       return osd_oi_insert(info, osd, fid, id, th, OI_CHECK_FLD);
+       return osd_oi_insert(info, osd, fid, id, oh->ot_handle, OI_CHECK_FLD);
  }
  
  int osd_fld_lookup(const struct lu_env *env, struct osd_device *osd,
-                  const struct lu_fid *fid, struct lu_seq_range *range)
+                  obd_seq seq, struct lu_seq_range *range)
  {
         struct seq_server_site  *ss = osd_seq_site(osd);
-       int                     rc;
  
-       if (fid_is_idif(fid)) {
+       if (fid_seq_is_idif(seq)) {
                 fld_range_set_ost(range);
-               range->lsr_index = fid_idif_ost_idx(fid);
+               range->lsr_index = idif_ost_idx(seq);
                 return 0;
         }
  
-       if (!fid_seq_in_fldb(fid_seq(fid))) {
+       if (!fid_seq_in_fldb(seq)) {
                 fld_range_set_mdt(range);
                 if (ss != NULL)
                         /* FIXME: If ss is NULL, it suppose not get lsr_index
@@ -2117,12 +2181,8 @@ int osd_fld_lookup(const struct lu_env *env, struct osd_device *osd,
  
         LASSERT(ss != NULL);
         fld_range_set_any(range);
-       rc = fld_server_lookup(env, ss->ss_server_fld, fid_seq(fid), range);
-       if (rc != 0) {
-               CERROR("%s: cannot find FLD range for "DFID": rc = %d\n",
-                      osd_name(osd), PFID(fid), rc);
-       }
-       return rc;
+       /* OSD will only do local fld lookup */
+       return fld_local_lookup(env, ss->ss_server_fld, seq, range);
  }
  
  /*
@@ -2135,7 +2195,6 @@ static int osd_declare_object_create(const struct lu_env *env,
                                      struct dt_object_format *dof,
                                      struct thandle *handle)
  {
-       struct lu_seq_range     *range = &osd_oti_get(env)->oti_seq_range;
         struct osd_thandle      *oh;
         int                      rc;
         ENTRY;
@@ -2173,16 +2232,6 @@ static int osd_declare_object_create(const struct lu_env *env,
         if (rc != 0)
                 RETURN(rc);
  
-       /* It does fld look up inside declare, and the result will be
-        * added to fld cache, so the following fld lookup inside insert
-        * does not need send RPC anymore, so avoid send rpc with holding
-        * transaction */
-       if (fid_is_norm(lu_object_fid(&dt->do_lu)) &&
-               !fid_is_last_id(lu_object_fid(&dt->do_lu)))
-               osd_fld_lookup(env, osd_dt_dev(handle->th_dev),
-                              lu_object_fid(&dt->do_lu), range);
-
-
         RETURN(rc);
  }
  
@@ -2280,7 +2329,8 @@ static int osd_object_destroy(const struct lu_env *env,
                 RETURN(-EPERM);
  
         if (S_ISDIR(inode->i_mode)) {
-               LASSERT(osd_inode_unlinked(inode) || inode->i_nlink == 1);
+               LASSERT(osd_inode_unlinked(inode) || inode->i_nlink == 1 ||
+                       inode->i_nlink == 2);
                 /* it will check/delete the inode from remote parent,
                  * how to optimize it? unlink performance impaction XXX */
                 result = osd_delete_from_remote_parent(env, osd, obj, oh);
@@ -2296,7 +2346,8 @@ static int osd_object_destroy(const struct lu_env *env,
  
         osd_trans_exec_op(env, th, OSD_OT_DESTROY);
  
-        result = osd_oi_delete(osd_oti_get(env), osd, fid, th, OI_CHECK_FLD);
+       result = osd_oi_delete(osd_oti_get(env), osd, fid, oh->ot_handle,
+                              OI_CHECK_FLD);
  
          /* XXX: add to ext3 orphan list */
          /* rc = ext3_orphan_add(handle_t *handle, struct inode *inode) */
@@ -2576,20 +2627,22 @@ static int osd_declare_object_ref_add(const struct lu_env *env,
  static int osd_object_ref_add(const struct lu_env *env,
                                struct dt_object *dt, struct thandle *th)
  {
-       struct osd_object *obj = osd_dt_obj(dt);
-       struct inode      *inode = obj->oo_inode;
-       bool               need_dirty = false;
-       int                rc = 0;
+       struct osd_object  *obj = osd_dt_obj(dt);
+       struct inode       *inode = obj->oo_inode;
+       struct osd_thandle *oh;
+       int                 rc = 0;
  
          LINVRNT(osd_invariant(obj));
         LASSERT(dt_object_exists(dt) && !dt_object_remote(dt));
          LASSERT(osd_write_locked(env, obj));
          LASSERT(th != NULL);
  
+        oh = container_of0(th, struct osd_thandle, ot_super);
+        LASSERT(oh->ot_handle != NULL);
+
         osd_trans_exec_op(env, th, OSD_OT_REF_ADD);
  
-       /* This based on ldiskfs_inc_count(), which is not exported.
-        *
+       /*
          * The DIR_NLINK feature allows directories to exceed LDISKFS_LINK_MAX
          * (65000) subdirectories by storing "1" in i_nlink if the link count
          * would otherwise overflow. Directory tranversal tools understand
@@ -2601,28 +2654,11 @@ static int osd_object_ref_add(const struct lu_env *env,
          * in case they are being linked into the PENDING directory
          */
         spin_lock(&obj->oo_guard);
-       if (unlikely(!S_ISDIR(inode->i_mode) &&
-                    inode->i_nlink >= LDISKFS_LINK_MAX)) {
-               /* MDD should have checked this, but good to be safe */
-               rc = -EMLINK;
-       } else if (unlikely(inode->i_nlink == 0 ||
-                           (S_ISDIR(inode->i_mode) &&
-                            inode->i_nlink >= LDISKFS_LINK_MAX))) {
-               /* inc_nlink from 0 may cause WARN_ON */
-               set_nlink(inode, 1);
-               need_dirty = true;
-       } else if (!S_ISDIR(inode->i_mode) ||
-                  (S_ISDIR(inode->i_mode) && inode->i_nlink >= 2)) {
-               inc_nlink(inode);
-               need_dirty = true;
-       } /* else (S_ISDIR(inode->i_mode) && inode->i_nlink == 1) { ; } */
-
+       ldiskfs_inc_count(oh->ot_handle, inode);
         LASSERT(inode->i_nlink <= LDISKFS_LINK_MAX);
         spin_unlock(&obj->oo_guard);
  
-       if (need_dirty)
-               ll_dirty_inode(inode, I_DIRTY_DATASYNC);
-
+       ll_dirty_inode(inode, I_DIRTY_DATASYNC);
         LINVRNT(osd_invariant(obj));
  
         return rc;
@@ -2652,37 +2688,40 @@ static int osd_declare_object_ref_del(const struct lu_env *env,
  static int osd_object_ref_del(const struct lu_env *env, struct dt_object *dt,
                                struct thandle *th)
  {
-        struct osd_object *obj = osd_dt_obj(dt);
-        struct inode      *inode = obj->oo_inode;
+       struct osd_object       *obj = osd_dt_obj(dt);
+       struct inode            *inode = obj->oo_inode;
+       struct osd_device       *osd = osd_dev(dt->do_lu.lo_dev);
+       struct osd_thandle      *oh;
  
-        LINVRNT(osd_invariant(obj));
+       LINVRNT(osd_invariant(obj));
         LASSERT(dt_object_exists(dt) && !dt_object_remote(dt));
-        LASSERT(osd_write_locked(env, obj));
-        LASSERT(th != NULL);
+       LASSERT(osd_write_locked(env, obj));
+       LASSERT(th != NULL);
+
+        oh = container_of0(th, struct osd_thandle, ot_super);
+        LASSERT(oh->ot_handle != NULL);
  
         osd_trans_exec_op(env, th, OSD_OT_REF_DEL);
  
         spin_lock(&obj->oo_guard);
-       LASSERT(inode->i_nlink > 0);
-
-       /* This based on ldiskfs_dec_count(), which is not exported.
-        *
-        * If a directory already has nlink == 1, then do not drop the nlink
-        * count to 0, even temporarily, to avoid race conditions with other
-        * threads not holding oo_guard seeing i_nlink == 0 in rare cases.
-        *
-        * nlink == 1 means the directory has/had > EXT4_LINK_MAX subdirs.
-        * */
-       if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 1) {
-               drop_nlink(inode);
-
-               spin_unlock(&obj->oo_guard);
-               ll_dirty_inode(inode, I_DIRTY_DATASYNC);
-               LINVRNT(osd_invariant(obj));
-       } else {
+       /* A zero nlink can result from an upgrade from an old Lustre version
+        * and applies only to local files.  Just skip this ref_del call.
+        * ext4_unlink() only treats this as a warning, so don't LASSERT. */
+       if (inode->i_nlink == 0) {
+               CDEBUG_LIMIT(fid_is_norm(lu_object_fid(&dt->do_lu)) ?
+                            D_ERROR : D_INODE, "%s: nlink == 0 on "DFID
+                            ", maybe an upgraded file? (LU-3915)\n",
+                            osd_name(osd), PFID(lu_object_fid(&dt->do_lu)));
                 spin_unlock(&obj->oo_guard);
+               return 0;
         }
  
+       ldiskfs_dec_count(oh->ot_handle, inode);
+       spin_unlock(&obj->oo_guard);
+
+       ll_dirty_inode(inode, I_DIRTY_DATASYNC);
+       LINVRNT(osd_invariant(obj));
+
         return 0;
  }
  
@@ -2878,90 +2917,89 @@ static int osd_xattr_del(const struct lu_env *env, struct dt_object *dt,
  }
  
  static struct obd_capa *osd_capa_get(const struct lu_env *env,
-                                     struct dt_object *dt,
-                                     struct lustre_capa *old,
-                                     __u64 opc)
+                                    struct dt_object *dt,
+                                    struct lustre_capa *old, __u64 opc)
  {
-        struct osd_thread_info *info = osd_oti_get(env);
-        const struct lu_fid *fid = lu_object_fid(&dt->do_lu);
-        struct osd_object *obj = osd_dt_obj(dt);
-        struct osd_device *dev = osd_obj2dev(obj);
-        struct lustre_capa_key *key = &info->oti_capa_key;
-        struct lustre_capa *capa = &info->oti_capa;
-        struct obd_capa *oc;
-        struct md_capainfo *ci;
-        int rc;
-        ENTRY;
+       struct osd_thread_info *info = osd_oti_get(env);
+       const struct lu_fid *fid = lu_object_fid(&dt->do_lu);
+       struct osd_object *obj = osd_dt_obj(dt);
+       struct osd_device *osd = osd_obj2dev(obj);
+       struct lustre_capa_key *key = &info->oti_capa_key;
+       struct lustre_capa *capa = &info->oti_capa;
+       struct obd_capa *oc;
+       struct lu_capainfo *lci;
+       int rc;
+       ENTRY;
  
-        if (!dev->od_fl_capa)
-                RETURN(ERR_PTR(-ENOENT));
+       if (!osd->od_fl_capa)
+               RETURN(ERR_PTR(-ENOENT));
  
         LASSERT(dt_object_exists(dt) && !dt_object_remote(dt));
-        LINVRNT(osd_invariant(obj));
+       LINVRNT(osd_invariant(obj));
  
-        /* renewal sanity check */
-        if (old && osd_object_auth(env, dt, old, opc))
-                RETURN(ERR_PTR(-EACCES));
-
-        ci = md_capainfo(env);
-        if (unlikely(!ci))
-                RETURN(ERR_PTR(-ENOENT));
-
-        switch (ci->mc_auth) {
-        case LC_ID_NONE:
-                RETURN(NULL);
-        case LC_ID_PLAIN:
-                capa->lc_uid = obj->oo_inode->i_uid;
-                capa->lc_gid = obj->oo_inode->i_gid;
-                capa->lc_flags = LC_ID_PLAIN;
-                break;
-        case LC_ID_CONVERT: {
-                __u32 d[4], s[4];
-
-                s[0] = obj->oo_inode->i_uid;
-                cfs_get_random_bytes(&(s[1]), sizeof(__u32));
-                s[2] = obj->oo_inode->i_gid;
-                cfs_get_random_bytes(&(s[3]), sizeof(__u32));
-                rc = capa_encrypt_id(d, s, key->lk_key, CAPA_HMAC_KEY_MAX_LEN);
-                if (unlikely(rc))
-                        RETURN(ERR_PTR(rc));
-
-                capa->lc_uid   = ((__u64)d[1] << 32) | d[0];
-                capa->lc_gid   = ((__u64)d[3] << 32) | d[2];
-                capa->lc_flags = LC_ID_CONVERT;
-                break;
-        }
-        default:
-                RETURN(ERR_PTR(-EINVAL));
+       /* renewal sanity check */
+       if (old && osd_object_auth(env, dt, old, opc))
+               RETURN(ERR_PTR(-EACCES));
+
+       lci = lu_capainfo_get(env);
+       if (unlikely(lci == NULL))
+               RETURN(ERR_PTR(-ENOENT));
+
+       switch (lci->lci_auth) {
+       case LC_ID_NONE:
+               RETURN(NULL);
+       case LC_ID_PLAIN:
+               capa->lc_uid = obj->oo_inode->i_uid;
+               capa->lc_gid = obj->oo_inode->i_gid;
+               capa->lc_flags = LC_ID_PLAIN;
+               break;
+       case LC_ID_CONVERT: {
+               __u32 d[4], s[4];
+
+               s[0] = obj->oo_inode->i_uid;
+               cfs_get_random_bytes(&(s[1]), sizeof(__u32));
+               s[2] = obj->oo_inode->i_gid;
+               cfs_get_random_bytes(&(s[3]), sizeof(__u32));
+               rc = capa_encrypt_id(d, s, key->lk_key, CAPA_HMAC_KEY_MAX_LEN);
+               if (unlikely(rc))
+                       RETURN(ERR_PTR(rc));
+
+               capa->lc_uid   = ((__u64)d[1] << 32) | d[0];
+               capa->lc_gid   = ((__u64)d[3] << 32) | d[2];
+               capa->lc_flags = LC_ID_CONVERT;
+               break;
          }
+       default:
+               RETURN(ERR_PTR(-EINVAL));
+       }
  
-        capa->lc_fid = *fid;
-        capa->lc_opc = opc;
-        capa->lc_flags |= dev->od_capa_alg << 24;
-        capa->lc_timeout = dev->od_capa_timeout;
-        capa->lc_expiry = 0;
+       capa->lc_fid = *fid;
+       capa->lc_opc = opc;
+       capa->lc_flags |= osd->od_capa_alg << 24;
+       capa->lc_timeout = osd->od_capa_timeout;
+       capa->lc_expiry = 0;
  
-        oc = capa_lookup(dev->od_capa_hash, capa, 1);
-        if (oc) {
-                LASSERT(!capa_is_expired(oc));
-                RETURN(oc);
-        }
+       oc = capa_lookup(osd->od_capa_hash, capa, 1);
+       if (oc) {
+               LASSERT(!capa_is_expired(oc));
+               RETURN(oc);
+       }
  
         spin_lock(&capa_lock);
-       *key = dev->od_capa_keys[1];
+       *key = osd->od_capa_keys[1];
         spin_unlock(&capa_lock);
  
-        capa->lc_keyid = key->lk_keyid;
-        capa->lc_expiry = cfs_time_current_sec() + dev->od_capa_timeout;
+       capa->lc_keyid = key->lk_keyid;
+       capa->lc_expiry = cfs_time_current_sec() + osd->od_capa_timeout;
  
-        rc = capa_hmac(capa->lc_hmac, capa, key->lk_key);
-        if (rc) {
-                DEBUG_CAPA(D_ERROR, capa, "HMAC failed: %d for", rc);
-                RETURN(ERR_PTR(rc));
-        }
+       rc = capa_hmac(capa->lc_hmac, capa, key->lk_key);
+       if (rc) {
+               DEBUG_CAPA(D_ERROR, capa, "HMAC failed: %d for", rc);
+               RETURN(ERR_PTR(rc));
+       }
  
-        oc = capa_add(dev->od_capa_hash, capa);
-        RETURN(oc);
+       oc = capa_add(osd->od_capa_hash, capa);
+       RETURN(oc);
  }
  
  static int osd_object_sync(const struct lu_env *env, struct dt_object *dt)
@@ -2980,6 +3018,7 @@ static int osd_object_sync(const struct lu_env *env, struct dt_object *dt)
         file->f_dentry = dentry;
         file->f_mapping = inode->i_mapping;
         file->f_op = inode->i_fop;
+       set_file_inode(file, inode);
  #ifndef HAVE_FILE_FSYNC_4ARGS
         mutex_lock(&inode->i_mutex);
  #endif
@@ -3325,23 +3364,16 @@ static inline int osd_get_fid_from_dentry(struct ldiskfs_dir_entry_2 *de,
  static int osd_remote_fid(const struct lu_env *env, struct osd_device *osd,
                           struct lu_fid *fid)
  {
-       struct lu_seq_range     *range = &osd_oti_get(env)->oti_seq_range;
-       struct seq_server_site  *ss = osd_seq_site(osd);
-       int                     rc;
         ENTRY;
  
-       /* Those FID seqs, which are not in FLDB, must be local seq */
-       if (unlikely(!fid_seq_in_fldb(fid_seq(fid)) || ss == NULL))
+       /* FID seqs that are not in FLDB must be local seqs */
+       if (unlikely(!fid_seq_in_fldb(fid_seq(fid))))
                 RETURN(0);
  
-       rc = osd_fld_lookup(env, osd, fid, range);
-       if (rc != 0) {
-               CERROR("%s: Can not lookup fld for "DFID"\n",
-                      osd_name(osd), PFID(fid));
-               RETURN(rc);
-       }
+       if (osd_seq_exists(env, osd, fid_seq(fid)))
+               RETURN(0);
  
-       RETURN(ss->ss_node_id != range->lsr_index);
+       RETURN(1);
  }
  
  /**
@@ -3396,7 +3428,7 @@ static int osd_index_ea_delete(const struct lu_env *env, struct dt_object *dt,
                 down_write(&obj->oo_ext_idx_sem);
          }
  
-        bh = ldiskfs_find_entry(dir, &dentry->d_name, &de, hlock);
+        bh = osd_ldiskfs_find_entry(dir, &dentry->d_name, &de, NULL, hlock);
          if (bh) {
                 __u32 ino = 0;
  
@@ -3880,6 +3912,7 @@ int osd_add_oi_cache(struct osd_thread_info *info, struct osd_device *osd,
                id->oii_ino, id->oii_gen, info);
         info->oti_cache.oic_lid = *id;
         info->oti_cache.oic_fid = *fid;
+       info->oti_cache.oic_dev = osd;
  
         return 0;
  }
@@ -3918,7 +3951,7 @@ static int osd_ea_lookup_rec(const struct lu_env *env, struct osd_object *obj,
                 down_read(&obj->oo_ext_idx_sem);
          }
  
-        bh = osd_ldiskfs_find_entry(dir, dentry, &de, hlock);
+        bh = osd_ldiskfs_find_entry(dir, &dentry->d_name, &de, NULL, hlock);
          if (bh) {
                 struct osd_thread_info *oti = osd_oti_get(env);
                 struct osd_inode_id *id = &oti->oti_id;
@@ -3942,7 +3975,7 @@ static int osd_ea_lookup_rec(const struct lu_env *env, struct osd_object *obj,
                         rc = osd_ea_fid_get(env, obj, ino, fid, id);
                 else
                         osd_id_gen(id, ino, OSD_OII_NOGEN);
-               if (rc != 0 || osd_remote_fid(env, dev, fid)) {
+               if (rc != 0) {
                         fid_zero(&oic->oic_fid);
                         GOTO(out, rc);
                 }
@@ -4021,10 +4054,11 @@ struct osd_object *osd_object_find(const struct lu_env *env,
                         lu_object_put(env, luch);
                          child = ERR_PTR(-ENOENT);
                  }
-        } else
-                child = (void *)luch;
+       } else {
+               child = ERR_CAST(luch);
+       }
  
-        return child;
+       return child;
  }
  
  /**
@@ -4173,8 +4207,8 @@ static int osd_index_ea_insert(const struct lu_env *env, struct dt_object *dt,
                         CERROR("%s: Can not find object "DFID"%u:%u: rc = %d\n",
                                osd_name(osd), PFID(fid),
                                id->oii_ino, id->oii_gen,
-                              (int)PTR_ERR(child_inode));
-                       RETURN(PTR_ERR(child_inode));
+                              (int)PTR_ERR(child));
+                       RETURN(PTR_ERR(child));
                 }
                 child_inode = igrab(child->oo_inode);
         }
@@ -4506,6 +4540,8 @@ static struct dt_it *osd_it_ea_init(const struct lu_env *env,
         file->f_dentry          = obj_dentry;
         file->f_mapping         = obj->oo_inode->i_mapping;
         file->f_op              = obj->oo_inode->i_fop;
+       set_file_inode(file, obj->oo_inode);
+
         lu_object_get(lo);
         RETURN((struct dt_it *) it);
  }
@@ -4914,7 +4950,7 @@ osd_dirent_check_repair(const struct lu_env *env, struct osd_object *obj,
  
  again:
         if (dev->od_dirent_journal) {
-               jh = ldiskfs_journal_start_sb(sb, credits);
+               jh = osd_journal_start_sb(sb, LDISKFS_HT_MISC, credits);
                 if (IS_ERR(jh)) {
                         rc = PTR_ERR(jh);
                         CERROR("%.16s: fail to start trans for dirent "
@@ -4944,7 +4980,7 @@ again:
                 }
         }
  
-       bh = osd_ldiskfs_find_entry(dir, dentry, &de, hlock);
+       bh = osd_ldiskfs_find_entry(dir, &dentry->d_name, &de, NULL, hlock);
         /* For dot/dotdot entry, if there is not enough space to hold the
          * FID-in-dirent, just keep them there. It only happens when the
          * device upgraded from 1.8 or restored from MDT file-level backup.
@@ -5354,11 +5390,6 @@ static void osd_umount(const struct lu_env *env, struct osd_device *o)
  {
         ENTRY;
  
-       if (o->od_fsops) {
-               fsfilt_put_ops(o->od_fsops);
-               o->od_fsops = NULL;
-       }
-
         if (o->od_mnt != NULL) {
                 shrink_dcache_sb(osd_sb(o));
                 osd_sync(env, &o->od_dt_dev);
@@ -5394,13 +5425,6 @@ static int osd_mount(const struct lu_env *env,
                 RETURN(-E2BIG);
         strcpy(o->od_mntdev, dev);
  
-       o->od_fsops = fsfilt_get_ops(mt_str(LDD_MT_LDISKFS));
-       if (IS_ERR(o->od_fsops)) {
-               CERROR("%s: Can't find fsfilt_ldiskfs\n", name);
-               o->od_fsops = NULL;
-               RETURN(-ENOTSUPP);
-       }
-
         OBD_PAGE_ALLOC(__page, GFP_IOFS);
         if (__page == NULL)
                 GOTO(out, rc = -ENOMEM);
@@ -5431,7 +5455,7 @@ static int osd_mount(const struct lu_env *env,
         }
  
         o->od_mnt = vfs_kern_mount(type, s_flags, dev, options);
-       cfs_module_put(type->owner);
+       module_put(type->owner);
  
         if (IS_ERR(o->od_mnt)) {
                 rc = PTR_ERR(o->od_mnt);
@@ -5475,8 +5499,6 @@ out_mnt:
  out:
         if (__page)
                 OBD_PAGE_FREE(__page);
-       if (rc)
-               fsfilt_put_ops(o->od_fsops);
  
         return rc;
  }
@@ -5487,8 +5509,8 @@ static struct lu_device *osd_device_fini(const struct lu_env *env,
         struct osd_device *o = osd_dev(d);
         ENTRY;
  
-       osd_procfs_fini(o);
         osd_shutdown(env, o);
+       osd_procfs_fini(o);
         osd_scrub_cleanup(env, o);
         osd_obj_map_fini(o);
         osd_umount(env, o);
@@ -5538,6 +5560,9 @@ static int osd_device_init0(const struct lu_env *env,
                 GOTO(out_mnt, rc);
         }
  
+       if (server_name_is_ost(o->od_svname))
+               o->od_is_ost = 1;
+
         rc = osd_obj_map_init(env, o);
         if (rc != 0)
                 GOTO(out_mnt, rc);
@@ -5551,6 +5576,9 @@ static int osd_device_init0(const struct lu_env *env,
         if (rc != 0)
                 GOTO(out_site, rc);
  
+       /* self-repair LMA by default */
+       o->od_lma_self_repair = 1;
+
         CFS_INIT_LIST_HEAD(&o->od_ios_list);
         /* setup scrub, including OI files initialization */
         rc = osd_scrub_setup(env, o);
@@ -5793,16 +5821,29 @@ static struct obd_ops osd_obd_device_ops = {
  static int __init osd_mod_init(void)
  {
          struct lprocfs_static_vars lvars;
+       int rc;
+
+       osd_oi_mod_init();
+       lprocfs_osd_init_vars(&lvars);
  
-        osd_oi_mod_init();
-        lprocfs_osd_init_vars(&lvars);
-        return class_register_type(&osd_obd_device_ops, NULL, lvars.module_vars,
-                                  LUSTRE_OSD_LDISKFS_NAME, &osd_device_type);
+       rc = lu_kmem_init(ldiskfs_caches);
+       if (rc)
+               return rc;
+
+       rc = class_register_type(&osd_obd_device_ops, NULL, NULL,
+#ifndef HAVE_ONLY_PROCFS_SEQ
+                               lvars.module_vars,
+#endif
+                               LUSTRE_OSD_LDISKFS_NAME, &osd_device_type);
+       if (rc)
+               lu_kmem_fini(ldiskfs_caches);
+       return rc;
  }
  
  static void __exit osd_mod_exit(void)
  {
         class_unregister_type(LUSTRE_OSD_LDISKFS_NAME);
+       lu_kmem_fini(ldiskfs_caches);
  }
  
  MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");