LU-4482 grant: don't use cache data in osd_statfs()

[fs/lustre-release.git] / lustre / osd-ldiskfs / osd_handler.c
diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c

index 241fc19..3fd2e4c 100644 (file)
--- a/lustre/osd-ldiskfs/osd_handler.c
+++ b/lustre/osd-ldiskfs/osd_handler.c
@@ -62,6 +62,8 @@
  /* struct ptlrpc_thread */
  #include <lustre_net.h>
  #include <lustre_fid.h>
+/* process_config */
+#include <lustre_param.h>
  
  #include "osd_internal.h"
  #include "osd_dynlocks.h"
@@ -229,8 +231,8 @@ struct inode *osd_iget(struct osd_thread_info *info, struct osd_device *dev,
                        id->oii_ino, PTR_ERR(inode));
         } else if (id->oii_gen != OSD_OII_NOGEN &&
                    inode->i_generation != id->oii_gen) {
-               CDEBUG(D_INODE, "unmatched inode: ino = %u, gen0 = %u, "
-                      "gen1 = %u\n",
+               CDEBUG(D_INODE, "unmatched inode: ino = %u, oii_gen = %u, "
+                      "i_generation = %u\n",
                        id->oii_ino, id->oii_gen, inode->i_generation);
                 iput(inode);
                 inode = ERR_PTR(-ESTALE);
@@ -238,7 +240,6 @@ struct inode *osd_iget(struct osd_thread_info *info, struct osd_device *dev,
                 /* due to parallel readdir and unlink,
                 * we can have dead inode here. */
                 CDEBUG(D_INODE, "stale inode: ino = %u\n", id->oii_ino);
-               make_bad_inode(inode);
                 iput(inode);
                 inode = ERR_PTR(-ESTALE);
         } else if (is_bad_inode(inode)) {
@@ -288,6 +289,119 @@ osd_iget_fid(struct osd_thread_info *info, struct osd_device *dev,
         return inode;
  }
  
+static struct inode *osd_iget_check(struct osd_thread_info *info,
+                                   struct osd_device *dev,
+                                   const struct lu_fid *fid,
+                                   struct osd_inode_id *id,
+                                   bool in_oi)
+{
+       struct inode    *inode;
+       int              rc     = 0;
+       ENTRY;
+
+       inode = ldiskfs_iget(osd_sb(dev), id->oii_ino);
+       if (IS_ERR(inode)) {
+               rc = PTR_ERR(inode);
+               if (!in_oi || (rc != -ENOENT && rc != -ESTALE)) {
+                       CDEBUG(D_INODE, "no inode: ino = %u, rc = %d\n",
+                              id->oii_ino, rc);
+
+                       GOTO(put, rc);
+               }
+
+               goto check_oi;
+       }
+
+       if (is_bad_inode(inode)) {
+               rc = -ENOENT;
+               if (!in_oi) {
+                       CDEBUG(D_INODE, "bad inode: ino = %u\n", id->oii_ino);
+
+                       GOTO(put, rc);
+               }
+
+               goto check_oi;
+       }
+
+       if (id->oii_gen != OSD_OII_NOGEN &&
+           inode->i_generation != id->oii_gen) {
+               rc = -ESTALE;
+               if (!in_oi) {
+                       CDEBUG(D_INODE, "unmatched inode: ino = %u, "
+                              "oii_gen = %u, i_generation = %u\n",
+                              id->oii_ino, id->oii_gen, inode->i_generation);
+
+                       GOTO(put, rc);
+               }
+
+               goto check_oi;
+       }
+
+       if (inode->i_nlink == 0) {
+               rc = -ENOENT;
+               if (!in_oi) {
+                       CDEBUG(D_INODE, "stale inode: ino = %u\n", id->oii_ino);
+
+                       GOTO(put, rc);
+               }
+
+               goto check_oi;
+       }
+
+check_oi:
+       if (rc != 0) {
+               LASSERTF(rc == -ESTALE || rc == -ENOENT, "rc = %d\n", rc);
+
+               rc = osd_oi_lookup(info, dev, fid, id, OI_CHECK_FLD);
+               /* XXX: There are some possible cases:
+                *      1. rc = 0.
+                *         Backup/restore made the OI mapping invalid.
+                *      2. rc = 0.
+                *         Someone unlinked the object but did NOT remove
+                *         the OI mapping, e.g. by mounting the target
+                *         device as ldiskfs and modifying it directly.
+                *      3. rc = -ENOENT.
+                *         Someone just removed the object between the
+                *         former oi_lookup and the iget. It is normal.
+                *      4. Other failure cases.
+                *
+                *      Generally, when the device is mounted, it will
+                *      automatically check whether the system was restored
+                *      from a file-level backup. We trust that detection
+                *      to distinguish the 1st case from the 2nd case. */
+               if (rc == 0) {
+                       if (!IS_ERR(inode) && inode->i_generation != 0 &&
+                           inode->i_generation == id->oii_gen)
+                               rc = -ENOENT;
+                       else
+                               rc = -EREMCHG;
+               }
+       } else {
+               if (id->oii_gen == OSD_OII_NOGEN)
+                       osd_id_gen(id, inode->i_ino, inode->i_generation);
+
+               /* Do not update file c/mtime in ldiskfs.
+                * NB: we don't have any lock to protect this because we don't
+                * have reference on osd_object now, but contention with
+                * another lookup + attr_set can't happen in the tiny window
+                * between if (...) and set S_NOCMTIME. */
+               if (!(inode->i_flags & S_NOCMTIME))
+                       inode->i_flags |= S_NOCMTIME;
+       }
+
+       GOTO(put, rc);
+
+put:
+       if (rc != 0) {
+               if (!IS_ERR(inode))
+                       iput(inode);
+
+               inode = ERR_PTR(rc);
+       }
+
+       return inode;
+}
+
  /**
   * \retval +v: new filter_fid, does not contain self-fid
   * \retval 0:  filter_fid_old, contains self-fid
@@ -316,13 +430,41 @@ int osd_get_idif(struct osd_thread_info *info, struct inode *inode,
         return rc;
  }
  
+static int osd_lma_self_repair(struct osd_thread_info *info,
+                              struct osd_device *osd, struct inode *inode,
+                              const struct lu_fid *fid, __u32 compat)
+{
+       handle_t *jh;
+       int       rc;
+
+       LASSERT(current->journal_info == NULL);
+
+       jh = osd_journal_start_sb(osd_sb(osd), LDISKFS_HT_MISC,
+                                 osd_dto_credits_noquota[DTO_XATTR_SET]);
+       if (IS_ERR(jh)) {
+               rc = PTR_ERR(jh);
+               CWARN("%s: cannot start journal for lma_self_repair: rc = %d\n",
+                     osd_name(osd), rc);
+               return rc;
+       }
+
+       rc = osd_ea_fid_set(info, inode, fid, compat, 0);
+       if (rc != 0)
+               CWARN("%s: cannot self repair the LMA: rc = %d\n",
+                     osd_name(osd), rc);
+       ldiskfs_journal_stop(jh);
+       return rc;
+}
+
  static int osd_check_lma(const struct lu_env *env, struct osd_object *obj)
  {
         struct osd_thread_info  *info   = osd_oti_get(env);
+       struct osd_device       *osd    = osd_obj2dev(obj);
         struct lustre_mdt_attrs *lma    = &info->oti_mdt_attrs;
         struct inode            *inode  = obj->oo_inode;
         struct dentry           *dentry = &info->oti_obj_dentry;
         struct lu_fid           *fid    = NULL;
+       const struct lu_fid     *rfid   = lu_object_fid(&obj->oo_dt.do_lu);
         int                      rc;
         ENTRY;
  
@@ -332,12 +474,20 @@ static int osd_check_lma(const struct lu_env *env, struct osd_object *obj)
         CLASSERT(LMA_OLD_SIZE >= sizeof(*lma));
         rc = __osd_xattr_get(inode, dentry, XATTR_NAME_LMA,
                              info->oti_mdt_attrs_old, LMA_OLD_SIZE);
-       if (rc == -ENODATA && !fid_is_igif(lu_object_fid(&obj->oo_dt.do_lu)) &&
-           osd_obj2dev(obj)->od_check_ff) {
+       if (rc == -ENODATA && !fid_is_igif(rfid) && osd->od_check_ff) {
                 fid = &lma->lma_self_fid;
                 rc = osd_get_idif(info, inode, dentry, fid);
-               if (rc > 0)
+               if ((rc > 0) || (rc == -ENODATA && osd->od_lma_self_repair)) {
+                       /* For the given OST-object, if it has neither LMA nor
+                        * FID in XATTR_NAME_FID, then the given FID (which is
+                        * contained in the @obj, from client RPC for locating
+                        * the OST-object) is trusted. We use it to generate
+                        * the LMA. */
+                       osd_lma_self_repair(info, osd, inode, rfid,
+                               fid_is_on_ost(info, osd, fid, OI_CHECK_FLD) ?
+                               LMAC_FID_ON_OST : 0);
                         RETURN(0);
+               }
         }
  
         if (unlikely(rc == -ENODATA))
@@ -352,23 +502,41 @@ static int osd_check_lma(const struct lu_env *env, struct osd_object *obj)
                 if (unlikely((lma->lma_incompat & ~LMA_INCOMPAT_SUPP) ||
                              CFS_FAIL_CHECK(OBD_FAIL_OSD_LMA_INCOMPAT))) {
                         CWARN("%s: unsupported incompat LMA feature(s) %#x for "
-                             "fid = "DFID", ino = %lu\n",
-                             osd_obj2dev(obj)->od_svname,
+                             "fid = "DFID", ino = %lu\n", osd_name(osd),
                               lma->lma_incompat & ~LMA_INCOMPAT_SUPP,
-                             PFID(lu_object_fid(&obj->oo_dt.do_lu)),
-                             inode->i_ino);
+                             PFID(rfid), inode->i_ino);
                         rc = -EOPNOTSUPP;
                 } else if (!(lma->lma_compat & LMAC_NOT_IN_OI)) {
                         fid = &lma->lma_self_fid;
                 }
         }
  
-       if (fid != NULL &&
-           unlikely(!lu_fid_eq(lu_object_fid(&obj->oo_dt.do_lu), fid))) {
+       if (fid != NULL && unlikely(!lu_fid_eq(rfid, fid))) {
+               if (fid_is_idif(rfid) && fid_is_idif(fid)) {
+                       struct ost_id   *oi   = &info->oti_ostid;
+                       struct lu_fid   *fid1 = &info->oti_fid3;
+                       __u32            idx  = fid_idif_ost_idx(rfid);
+
+                       /* For old IDIF, the OST index is not part of the IDIF,
+                        * which means different OSTs may have the same IDIFs.
+                        * In that case, we need a compatibility check to
+                        * make sure OI scrub is triggered properly. */
+                       if (idx != 0 && fid_idif_ost_idx(fid) == 0) {
+                               /* Given @rfid is new, LMA is old. */
+                               fid_to_ostid(fid, oi);
+                               ostid_to_fid(fid1, oi, idx);
+                               if (lu_fid_eq(fid1, rfid)) {
+                                       if (osd->od_lma_self_repair)
+                                               osd_lma_self_repair(info, osd,
+                                                       inode, rfid,
+                                                       LMAC_FID_ON_OST);
+                                       RETURN(0);
+                               }
+                       }
+               }
+
                 CDEBUG(D_INODE, "%s: FID "DFID" != self_fid "DFID"\n",
-                      osd_obj2dev(obj)->od_svname,
-                      PFID(lu_object_fid(&obj->oo_dt.do_lu)),
-                      PFID(&lma->lma_self_fid));
+                      osd_name(osd), PFID(rfid), PFID(fid));
                 rc = -EREMCHG;
         }
  
@@ -448,33 +616,20 @@ static int osd_fid_lookup(const struct lu_env *env, struct osd_object *obj,
         in_oi = true;
  
  iget:
-       inode = osd_iget(info, dev, id);
+       inode = osd_iget_check(info, dev, fid, id, in_oi);
         if (IS_ERR(inode)) {
                 result = PTR_ERR(inode);
                 if (result == -ENOENT || result == -ESTALE) {
-                       if (!in_oi) {
+                       if (!in_oi)
                                 fid_zero(&oic->oic_fid);
-                               GOTO(out, result = -ENOENT);
-                       }
  
-                       /* XXX: There are three possible cases:
-                        *      1. Backup/restore caused the OI invalid.
-                        *      2. Someone unlinked the object but NOT removed
-                        *         the OI mapping, such as mount target device
-                        *         as ldiskfs, and modify something directly.
-                        *      3. Someone just removed the object between the
-                        *         former oi_lookup and the iget. It is normal.
-                        *
-                        *      It is diffcult to distinguish the 2nd from the
-                        *      1st case. Relatively speaking, the 1st case is
-                        *      common than the 2nd case, trigger OI scrub. */
-                       result = osd_oi_lookup(info, dev, fid, id, true);
-                       if (result == 0)
-                               /* It is the case 1 or 2. */
-                               goto trigger;
+                       GOTO(out, result = -ENOENT);
                 } else if (result == -EREMCHG) {
  
  trigger:
+                       if (!in_oi)
+                               fid_zero(&oic->oic_fid);
+
                         if (unlikely(triggered))
                                 GOTO(out, result = saved);
  
@@ -483,10 +638,9 @@ trigger:
                                 result = -EINPROGRESS;
                         } else if (!dev->od_noscrub) {
                                 result = osd_scrub_start(dev);
-                               LCONSOLE_ERROR("%.16s: trigger OI scrub by RPC "
-                                              "for "DFID", rc = %d [1]\n",
-                                              LDISKFS_SB(osd_sb(dev))->s_es->\
-                                              s_volume_name,PFID(fid), result);
+                               LCONSOLE_WARN("%.16s: trigger OI scrub by RPC "
+                                             "for "DFID", rc = %d [1]\n",
+                                             osd_name(dev), PFID(fid),result);
                                 if (result == 0 || result == -EALREADY)
                                         result = -EINPROGRESS;
                                 else
@@ -529,8 +683,18 @@ trigger:
         if (result != 0) {
                 iput(inode);
                 obj->oo_inode = NULL;
-               if (result == -EREMCHG)
+               if (result == -EREMCHG) {
+                       if (!in_oi) {
+                               result = osd_oi_lookup(info, dev, fid, id,
+                                                      OI_CHECK_FLD);
+                               if (result != 0) {
+                                       fid_zero(&oic->oic_fid);
+                                       GOTO(out, result);
+                               }
+                       }
+
                         goto trigger;
+               }
  
                 GOTO(out, result);
         }
@@ -767,20 +931,20 @@ static void osd_trans_commit_cb(struct super_block *sb,
  
          lu_context_exit(&th->th_ctx);
          lu_context_fini(&th->th_ctx);
-        OBD_FREE_PTR(oh);
+       thandle_put(th);
  }
  
  static struct thandle *osd_trans_create(const struct lu_env *env,
-                                        struct dt_device *d)
+                                       struct dt_device *d)
  {
-        struct osd_thread_info *oti = osd_oti_get(env);
-        struct osd_iobuf       *iobuf = &oti->oti_iobuf;
-        struct osd_thandle     *oh;
-        struct thandle         *th;
-        ENTRY;
+       struct osd_thread_info  *oti = osd_oti_get(env);
+       struct osd_iobuf        *iobuf = &oti->oti_iobuf;
+       struct osd_thandle      *oh;
+       struct thandle          *th;
+       ENTRY;
  
-        /* on pending IO in this thread should left from prev. request */
-        LASSERT(cfs_atomic_read(&iobuf->dr_numreqs) == 0);
+       /* no pending IO in this thread should be left from prev. request */
+       LASSERT(atomic_read(&iobuf->dr_numreqs) == 0);
  
          th = ERR_PTR(-ENOMEM);
         OBD_ALLOC_GFP(oh, sizeof *oh, __GFP_IO);
@@ -792,7 +956,9 @@ static struct thandle *osd_trans_create(const struct lu_env *env,
                  th->th_result = 0;
                  th->th_tags = LCT_TX_HANDLE;
                  oh->ot_credits = 0;
-                oti->oti_dev = osd_dt_dev(d);
+               atomic_set(&th->th_refc, 1);
+               th->th_alloc_size = sizeof(*oh);
+               oti->oti_dev = osd_dt_dev(d);
                  CFS_INIT_LIST_HEAD(&oh->ot_dcb_list);
                  osd_th_alloced(oh);
  
@@ -888,7 +1054,7 @@ int osd_trans_start(const struct lu_env *env, struct dt_device *d,
           * XXX temporary stuff. Some abstraction layer should
           * be used.
           */
-        jh = ldiskfs_journal_start_sb(osd_sb(dev), oh->ot_credits);
+        jh = osd_journal_start_sb(osd_sb(dev), LDISKFS_HT_MISC, oh->ot_credits);
          osd_th_started(oh);
          if (!IS_ERR(jh)) {
                  oh->ot_handle = jh;
@@ -908,10 +1074,33 @@ out:
          RETURN(rc);
  }
  
+static int osd_seq_exists(const struct lu_env *env,
+                             struct osd_device *osd, obd_seq seq)
+{
+       struct lu_seq_range     *range = &osd_oti_get(env)->oti_seq_range;
+       struct seq_server_site  *ss = osd_seq_site(osd);
+       int                     rc;
+       ENTRY;
+
+       if (ss == NULL)
+               RETURN(1);
+
+       rc = osd_fld_lookup(env, osd, seq, range);
+       if (rc != 0) {
+               if (rc != -ENOENT)
+                       CERROR("%s: can't lookup FLD sequence "LPX64
+                              ": rc = %d\n", osd_name(osd), seq, rc);
+               RETURN(0);
+       }
+
+       RETURN(ss->ss_node_id == range->lsr_index);
+}
+
  /*
   * Concurrency: shouldn't matter.
   */
-static int osd_trans_stop(const struct lu_env *env, struct thandle *th)
+static int osd_trans_stop(const struct lu_env *env, struct dt_device *dt,
+                         struct thandle *th)
  {
          int                     rc = 0;
          struct osd_thandle     *oh;
@@ -954,7 +1143,7 @@ static int osd_trans_stop(const struct lu_env *env, struct thandle *th)
                  if (rc != 0)
                          CERROR("Failure to stop transaction: %d\n", rc);
          } else {
-                OBD_FREE_PTR(oh);
+               thandle_put(&oh->ot_super);
          }
  
         /* as we want IO to journal and data IO be concurrent, we don't block
@@ -967,7 +1156,7 @@ static int osd_trans_stop(const struct lu_env *env, struct thandle *th)
          * completed otherwise iobuf may be corrupted by different request
          */
         wait_event(iobuf->dr_wait,
-                      cfs_atomic_read(&iobuf->dr_numreqs) == 0);
+                      atomic_read(&iobuf->dr_numreqs) == 0);
         if (!rc)
                 rc = iobuf->dr_error;
  
@@ -1056,6 +1245,8 @@ static int osd_object_print(const struct lu_env *env, void *cookie,
                      d ? d->id_ops->id_name : "plain");
  }
  
+#define GRANT_FOR_LOCAL_OIDS 32 /* 128kB for last_rcvd, quota files, ... */
+
  /*
   * Concurrency: shouldn't matter.
   */
@@ -1080,24 +1271,28 @@ int osd_statfs(const struct lu_env *env, struct dt_device *d,
          }
  
         spin_lock(&osd->od_osfs_lock);
-       /* cache 1 second */
-       if (cfs_time_before_64(osd->od_osfs_age, cfs_time_shift_64(-1))) {
-               result = sb->s_op->statfs(sb->s_root, ksfs);
-               if (likely(result == 0)) { /* N.B. statfs can't really fail */
-                       osd->od_osfs_age = cfs_time_current_64();
-                       statfs_pack(&osd->od_statfs, ksfs);
-                       if (sb->s_flags & MS_RDONLY)
-                               sfs->os_state = OS_STATE_READONLY;
-               }
+       result = sb->s_op->statfs(sb->s_root, ksfs);
+       if (likely(result == 0)) { /* N.B. statfs can't really fail */
+               statfs_pack(sfs, ksfs);
+               if (sb->s_flags & MS_RDONLY)
+                       sfs->os_state = OS_STATE_READONLY;
         }
  
-       if (likely(result == 0))
-               *sfs = osd->od_statfs;
         spin_unlock(&osd->od_osfs_lock);
  
-        if (unlikely(env == NULL))
+       if (unlikely(env == NULL))
                  OBD_FREE_PTR(ksfs);
  
+       /* Reserve a small amount of space for local objects like last_rcvd,
+        * llog, quota files, ... */
+       if (sfs->os_bavail <= GRANT_FOR_LOCAL_OIDS) {
+               sfs->os_bavail = 0;
+       } else {
+               sfs->os_bavail -= GRANT_FOR_LOCAL_OIDS;
+               /** Take out metadata overhead for indirect blocks */
+               sfs->os_bavail -= sfs->os_bavail >> (sb->s_blocksize_bits - 3);
+       }
+
          return result;
  }
  
@@ -1124,7 +1319,6 @@ static void osd_conf_get(const struct lu_env *env,
          /*
           * XXX should be taken from not-yet-existing fs abstraction layer.
           */
-       param->ddp_mnt = osd_dt_dev(dev)->od_mnt;
          param->ddp_max_name_len = LDISKFS_NAME_LEN;
          param->ddp_max_nlink    = LDISKFS_LINK_MAX;
         param->ddp_block_shift  = sb->s_blocksize_bits;
@@ -2010,7 +2204,6 @@ static void osd_ah_init(const struct lu_env *env, struct dt_allocation_hint *ah,
  {
          LASSERT(ah);
  
-        memset(ah, 0, sizeof(*ah));
          ah->dah_parent = parent;
          ah->dah_mode = child_mode;
  }
@@ -2093,23 +2286,26 @@ static int __osd_object_create(struct osd_thread_info *info,
   * \retval 0, on success
   */
  static int __osd_oi_insert(const struct lu_env *env, struct osd_object *obj,
-                           const struct lu_fid *fid, struct thandle *th)
+                          const struct lu_fid *fid, struct thandle *th)
  {
-        struct osd_thread_info *info = osd_oti_get(env);
-        struct osd_inode_id    *id   = &info->oti_id;
-        struct osd_device      *osd  = osd_obj2dev(obj);
+       struct osd_thread_info *info = osd_oti_get(env);
+       struct osd_inode_id    *id   = &info->oti_id;
+       struct osd_device      *osd  = osd_obj2dev(obj);
+       struct osd_thandle     *oh;
  
-        LASSERT(obj->oo_inode != NULL);
+       LASSERT(obj->oo_inode != NULL);
+
+       oh = container_of0(th, struct osd_thandle, ot_super);
+       LASSERT(oh->ot_handle);
  
         osd_id_gen(id, obj->oo_inode->i_ino, obj->oo_inode->i_generation);
-       return osd_oi_insert(info, osd, fid, id, th, OI_CHECK_FLD);
+       return osd_oi_insert(info, osd, fid, id, oh->ot_handle, OI_CHECK_FLD);
  }
  
  int osd_fld_lookup(const struct lu_env *env, struct osd_device *osd,
                    obd_seq seq, struct lu_seq_range *range)
  {
         struct seq_server_site  *ss = osd_seq_site(osd);
-       int                     rc;
  
         if (fid_seq_is_idif(seq)) {
                 fld_range_set_ost(range);
@@ -2128,12 +2324,8 @@ int osd_fld_lookup(const struct lu_env *env, struct osd_device *osd,
  
         LASSERT(ss != NULL);
         fld_range_set_any(range);
-       rc = fld_server_lookup(env, ss->ss_server_fld, seq, range);
-       if (rc != 0) {
-               CERROR("%s: cannot find FLD range for "LPX64": rc = %d\n",
-                      osd_name(osd), seq, rc);
-       }
-       return rc;
+       /* OSD will only do local fld lookup */
+       return fld_local_lookup(env, ss->ss_server_fld, seq, range);
  }
  
  /*
@@ -2146,7 +2338,6 @@ static int osd_declare_object_create(const struct lu_env *env,
                                      struct dt_object_format *dof,
                                      struct thandle *handle)
  {
-       struct lu_seq_range     *range = &osd_oti_get(env)->oti_seq_range;
         struct osd_thandle      *oh;
         int                      rc;
         ENTRY;
@@ -2184,16 +2375,6 @@ static int osd_declare_object_create(const struct lu_env *env,
         if (rc != 0)
                 RETURN(rc);
  
-       /* It does fld look up inside declare, and the result will be
-        * added to fld cache, so the following fld lookup inside insert
-        * does not need send RPC anymore, so avoid send rpc with holding
-        * transaction */
-       if (fid_is_norm(lu_object_fid(&dt->do_lu)) &&
-               !fid_is_last_id(lu_object_fid(&dt->do_lu)))
-               osd_fld_lookup(env, osd_dt_dev(handle->th_dev),
-                              fid_seq(lu_object_fid(&dt->do_lu)), range);
-
-
         RETURN(rc);
  }
  
@@ -2291,7 +2472,8 @@ static int osd_object_destroy(const struct lu_env *env,
                 RETURN(-EPERM);
  
         if (S_ISDIR(inode->i_mode)) {
-               LASSERT(osd_inode_unlinked(inode) || inode->i_nlink == 1);
+               LASSERT(osd_inode_unlinked(inode) || inode->i_nlink == 1 ||
+                       inode->i_nlink == 2);
                 /* it will check/delete the inode from remote parent,
                  * how to optimize it? unlink performance impaction XXX */
                 result = osd_delete_from_remote_parent(env, osd, obj, oh);
@@ -2307,7 +2489,8 @@ static int osd_object_destroy(const struct lu_env *env,
  
         osd_trans_exec_op(env, th, OSD_OT_DESTROY);
  
-        result = osd_oi_delete(osd_oti_get(env), osd, fid, th, OI_CHECK_FLD);
+       result = osd_oi_delete(osd_oti_get(env), osd, fid, oh->ot_handle,
+                              OI_CHECK_FLD);
  
          /* XXX: add to ext3 orphan list */
          /* rc = ext3_orphan_add(handle_t *handle, struct inode *inode) */
@@ -2587,20 +2770,22 @@ static int osd_declare_object_ref_add(const struct lu_env *env,
  static int osd_object_ref_add(const struct lu_env *env,
                                struct dt_object *dt, struct thandle *th)
  {
-       struct osd_object *obj = osd_dt_obj(dt);
-       struct inode      *inode = obj->oo_inode;
-       bool               need_dirty = false;
-       int                rc = 0;
+       struct osd_object  *obj = osd_dt_obj(dt);
+       struct inode       *inode = obj->oo_inode;
+       struct osd_thandle *oh;
+       int                 rc = 0;
  
          LINVRNT(osd_invariant(obj));
         LASSERT(dt_object_exists(dt) && !dt_object_remote(dt));
          LASSERT(osd_write_locked(env, obj));
          LASSERT(th != NULL);
  
+        oh = container_of0(th, struct osd_thandle, ot_super);
+        LASSERT(oh->ot_handle != NULL);
+
         osd_trans_exec_op(env, th, OSD_OT_REF_ADD);
  
-       /* This based on ldiskfs_inc_count(), which is not exported.
-        *
+       /*
          * The DIR_NLINK feature allows directories to exceed LDISKFS_LINK_MAX
          * (65000) subdirectories by storing "1" in i_nlink if the link count
          * would otherwise overflow. Directory tranversal tools understand
@@ -2612,28 +2797,11 @@ static int osd_object_ref_add(const struct lu_env *env,
          * in case they are being linked into the PENDING directory
          */
         spin_lock(&obj->oo_guard);
-       if (unlikely(!S_ISDIR(inode->i_mode) &&
-                    inode->i_nlink >= LDISKFS_LINK_MAX)) {
-               /* MDD should have checked this, but good to be safe */
-               rc = -EMLINK;
-       } else if (unlikely(inode->i_nlink == 0 ||
-                           (S_ISDIR(inode->i_mode) &&
-                            inode->i_nlink >= LDISKFS_LINK_MAX))) {
-               /* inc_nlink from 0 may cause WARN_ON */
-               set_nlink(inode, 1);
-               need_dirty = true;
-       } else if (!S_ISDIR(inode->i_mode) ||
-                  (S_ISDIR(inode->i_mode) && inode->i_nlink >= 2)) {
-               inc_nlink(inode);
-               need_dirty = true;
-       } /* else (S_ISDIR(inode->i_mode) && inode->i_nlink == 1) { ; } */
-
+       ldiskfs_inc_count(oh->ot_handle, inode);
         LASSERT(inode->i_nlink <= LDISKFS_LINK_MAX);
         spin_unlock(&obj->oo_guard);
  
-       if (need_dirty)
-               ll_dirty_inode(inode, I_DIRTY_DATASYNC);
-
+       ll_dirty_inode(inode, I_DIRTY_DATASYNC);
         LINVRNT(osd_invariant(obj));
  
         return rc;
@@ -2666,12 +2834,16 @@ static int osd_object_ref_del(const struct lu_env *env, struct dt_object *dt,
         struct osd_object       *obj = osd_dt_obj(dt);
         struct inode            *inode = obj->oo_inode;
         struct osd_device       *osd = osd_dev(dt->do_lu.lo_dev);
+       struct osd_thandle      *oh;
  
         LINVRNT(osd_invariant(obj));
         LASSERT(dt_object_exists(dt) && !dt_object_remote(dt));
         LASSERT(osd_write_locked(env, obj));
         LASSERT(th != NULL);
  
+        oh = container_of0(th, struct osd_thandle, ot_super);
+        LASSERT(oh->ot_handle != NULL);
+
         osd_trans_exec_op(env, th, OSD_OT_REF_DEL);
  
         spin_lock(&obj->oo_guard);
@@ -2687,23 +2859,11 @@ static int osd_object_ref_del(const struct lu_env *env, struct dt_object *dt,
                 return 0;
         }
  
-       /* This based on ldiskfs_dec_count(), which is not exported.
-        *
-        * If a directory already has nlink == 1, then do not drop the nlink
-        * count to 0, even temporarily, to avoid race conditions with other
-        * threads not holding oo_guard seeing i_nlink == 0 in rare cases.
-        *
-        * nlink == 1 means the directory has/had > EXT4_LINK_MAX subdirs.
-        */
-       if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 1) {
-               drop_nlink(inode);
+       ldiskfs_dec_count(oh->ot_handle, inode);
+       spin_unlock(&obj->oo_guard);
  
-               spin_unlock(&obj->oo_guard);
-               ll_dirty_inode(inode, I_DIRTY_DATASYNC);
-               LINVRNT(osd_invariant(obj));
-       } else {
-               spin_unlock(&obj->oo_guard);
-       }
+       ll_dirty_inode(inode, I_DIRTY_DATASYNC);
+       LINVRNT(osd_invariant(obj));
  
         return 0;
  }
@@ -2818,6 +2978,9 @@ static int osd_xattr_set(const struct lu_env *env, struct dt_object *dt,
          if (osd_object_auth(env, dt, capa, CAPA_OPC_META_WRITE))
                  return -EACCES;
  
+       CDEBUG(D_INODE, DFID" set xattr '%s' with size %zd\n",
+              PFID(lu_object_fid(&dt->do_lu)), name, buf->lb_len);
+
         osd_trans_exec_op(env, handle, OSD_OT_XATTR_SET);
         if (fl & LU_XATTR_REPLACE)
                 fs_flags |= XATTR_REPLACE;
@@ -3001,6 +3164,7 @@ static int osd_object_sync(const struct lu_env *env, struct dt_object *dt)
         file->f_dentry = dentry;
         file->f_mapping = inode->i_mapping;
         file->f_op = inode->i_fop;
+       set_file_inode(file, inode);
  #ifndef HAVE_FILE_FSYNC_4ARGS
         mutex_lock(&inode->i_mutex);
  #endif
@@ -3343,31 +3507,6 @@ static inline int osd_get_fid_from_dentry(struct ldiskfs_dir_entry_2 *de,
         return rc;
  }
  
-static int osd_mdt_seq_exists(const struct lu_env *env,
-                             struct osd_device *osd, obd_seq seq)
-{
-       struct lu_seq_range     *range = &osd_oti_get(env)->oti_seq_range;
-       struct seq_server_site  *ss = osd_seq_site(osd);
-       int                     rc;
-       ENTRY;
-
-       if (ss == NULL)
-               RETURN(1);
-
-       /* XXX: currently, each MDT only store avaible sequence on disk, and no
-        * allocated sequences information on disk, so we have to lookup FLDB,
-        * but it probably makes more sense also store allocated sequence
-        * locally, so we do not need do remote FLDB lookup in OSD */
-       rc = osd_fld_lookup(env, osd, seq, range);
-       if (rc != 0) {
-               CERROR("%s: Can not lookup fld for "LPX64"\n",
-                      osd_name(osd), seq);
-               RETURN(0);
-       }
-
-       RETURN(ss->ss_node_id == range->lsr_index);
-}
-
  static int osd_remote_fid(const struct lu_env *env, struct osd_device *osd,
                           struct lu_fid *fid)
  {
@@ -3377,8 +3516,7 @@ static int osd_remote_fid(const struct lu_env *env, struct osd_device *osd,
         if (unlikely(!fid_seq_in_fldb(fid_seq(fid))))
                 RETURN(0);
  
-       /* Currently only check this for FID on MDT */
-       if (osd_mdt_seq_exists(env, osd, fid_seq(fid)))
+       if (osd_seq_exists(env, osd, fid_seq(fid)))
                 RETURN(0);
  
         RETURN(1);
@@ -3436,7 +3574,7 @@ static int osd_index_ea_delete(const struct lu_env *env, struct dt_object *dt,
                 down_write(&obj->oo_ext_idx_sem);
          }
  
-        bh = ldiskfs_find_entry(dir, &dentry->d_name, &de, hlock);
+        bh = osd_ldiskfs_find_entry(dir, &dentry->d_name, &de, NULL, hlock);
          if (bh) {
                 __u32 ino = 0;
  
@@ -3874,13 +4012,11 @@ again:
         }
  
         if (!dev->od_noscrub && ++once == 1) {
-               CDEBUG(D_LFSCK, "Trigger OI scrub by RPC for "DFID"\n",
-                      PFID(fid));
                 rc = osd_scrub_start(dev);
-               LCONSOLE_ERROR("%.16s: trigger OI scrub by RPC for "DFID
-                              ", rc = %d [2]\n",
-                              LDISKFS_SB(osd_sb(dev))->s_es->s_volume_name,
-                              PFID(fid), rc);
+               LCONSOLE_WARN("%.16s: trigger OI scrub by RPC for "DFID
+                             ", rc = %d [2]\n",
+                             LDISKFS_SB(osd_sb(dev))->s_es->s_volume_name,
+                             PFID(fid), rc);
                 if (rc == 0)
                         goto again;
         }
@@ -3959,7 +4095,7 @@ static int osd_ea_lookup_rec(const struct lu_env *env, struct osd_object *obj,
                 down_read(&obj->oo_ext_idx_sem);
          }
  
-        bh = osd_ldiskfs_find_entry(dir, dentry, &de, hlock);
+        bh = osd_ldiskfs_find_entry(dir, &dentry->d_name, &de, NULL, hlock);
          if (bh) {
                 struct osd_thread_info *oti = osd_oti_get(env);
                 struct osd_inode_id *id = &oti->oti_id;
@@ -3988,6 +4124,9 @@ static int osd_ea_lookup_rec(const struct lu_env *env, struct osd_object *obj,
                         GOTO(out, rc);
                 }
  
+               if (osd_remote_fid(env, dev, fid))
+                       GOTO(out, rc = 0);
+
                 rc = osd_add_oi_cache(osd_oti_get(env), osd_obj2dev(obj), id,
                                       fid);
                 if (rc != 0)
@@ -4548,6 +4687,8 @@ static struct dt_it *osd_it_ea_init(const struct lu_env *env,
         file->f_dentry          = obj_dentry;
         file->f_mapping         = obj->oo_inode->i_mapping;
         file->f_op              = obj->oo_inode->i_fop;
+       set_file_inode(file, obj->oo_inode);
+
         lu_object_get(lo);
         RETURN((struct dt_it *) it);
  }
@@ -4701,14 +4842,16 @@ static int osd_ldiskfs_it_fill(const struct lu_env *env,
          else
                 up_read(&obj->oo_ext_idx_sem);
  
-        if (it->oie_rd_dirent == 0) {
-                result = -EIO;
-        } else {
-                it->oie_dirent = it->oie_buf;
-                it->oie_it_dirent = 1;
-        }
+       if (it->oie_rd_dirent == 0) {
+               /* If no dirent was read, the end of the directory has
+                * been reached. */
+               it->oie_file.f_pos = ldiskfs_get_htree_eof(&it->oie_file);
+       } else {
+               it->oie_dirent = it->oie_buf;
+               it->oie_it_dirent = 1;
+       }
  
-        RETURN(result);
+       RETURN(result);
  }
  
  /**
@@ -4956,7 +5099,7 @@ osd_dirent_check_repair(const struct lu_env *env, struct osd_object *obj,
  
  again:
         if (dev->od_dirent_journal) {
-               jh = ldiskfs_journal_start_sb(sb, credits);
+               jh = osd_journal_start_sb(sb, LDISKFS_HT_MISC, credits);
                 if (IS_ERR(jh)) {
                         rc = PTR_ERR(jh);
                         CERROR("%.16s: fail to start trans for dirent "
@@ -4986,7 +5129,7 @@ again:
                 }
         }
  
-       bh = osd_ldiskfs_find_entry(dir, dentry, &de, hlock);
+       bh = osd_ldiskfs_find_entry(dir, &dentry->d_name, &de, NULL, hlock);
         /* For dot/dotdot entry, if there is not enough space to hold the
          * FID-in-dirent, just keep them there. It only happens when the
          * device upgraded from 1.8 or restored from MDT file-level backup.
@@ -5479,13 +5622,17 @@ static int osd_mount(const struct lu_env *env,
  #endif
  
         if (!LDISKFS_HAS_COMPAT_FEATURE(o->od_mnt->mnt_sb,
-           LDISKFS_FEATURE_COMPAT_HAS_JOURNAL)) {
+                                       LDISKFS_FEATURE_COMPAT_HAS_JOURNAL)) {
                 CERROR("%s: device %s is mounted w/o journal\n", name, dev);
                 GOTO(out_mnt, rc = -EINVAL);
         }
  
+#ifdef LDISKFS_MOUNT_DIRDATA
+       if (LDISKFS_HAS_INCOMPAT_FEATURE(o->od_mnt->mnt_sb,
+                                        LDISKFS_FEATURE_INCOMPAT_DIRDATA))
+               LDISKFS_SB(osd_sb(o))->s_mount_opt |= LDISKFS_MOUNT_DIRDATA;
+#endif
         inode = osd_sb(o)->s_root->d_inode;
-       ldiskfs_set_inode_state(inode, LDISKFS_STATE_LUSTRE_NO_OI);
         lu_local_obj_fid(fid, OSD_FS_ROOT_OID);
         rc = osd_ea_fid_set(info, inode, fid, LMAC_NOT_IN_OI, 0);
         if (rc != 0) {
@@ -5515,8 +5662,8 @@ static struct lu_device *osd_device_fini(const struct lu_env *env,
         struct osd_device *o = osd_dev(d);
         ENTRY;
  
-       osd_procfs_fini(o);
         osd_shutdown(env, o);
+       osd_procfs_fini(o);
         osd_scrub_cleanup(env, o);
         osd_obj_map_fini(o);
         osd_umount(env, o);
@@ -5545,7 +5692,6 @@ static int osd_device_init0(const struct lu_env *env,
  
         spin_lock_init(&o->od_osfs_lock);
         mutex_init(&o->od_otable_mutex);
-       o->od_osfs_age = cfs_time_shift_64(-1000);
  
         o->od_capa_hash = init_capa_hash();
         if (o->od_capa_hash == NULL)
@@ -5582,6 +5728,9 @@ static int osd_device_init0(const struct lu_env *env,
         if (rc != 0)
                 GOTO(out_site, rc);
  
+       /* self-repair LMA by default */
+       o->od_lma_self_repair = 1;
+
         CFS_INIT_LIST_HEAD(&o->od_ios_list);
         /* setup scrub, including OI files initialization */
         rc = osd_scrub_setup(env, o);
@@ -5674,23 +5823,32 @@ static struct lu_device *osd_device_free(const struct lu_env *env,
  static int osd_process_config(const struct lu_env *env,
                                struct lu_device *d, struct lustre_cfg *cfg)
  {
-        struct osd_device *o = osd_dev(d);
-        int err;
-        ENTRY;
+       struct osd_device               *o = osd_dev(d);
+       int                             rc;
+       ENTRY;
  
-        switch(cfg->lcfg_command) {
-        case LCFG_SETUP:
-                err = osd_mount(env, o, cfg);
-                break;
-        case LCFG_CLEANUP:
+       switch (cfg->lcfg_command) {
+       case LCFG_SETUP:
+               rc = osd_mount(env, o, cfg);
+               break;
+       case LCFG_CLEANUP:
                 lu_dev_del_linkage(d->ld_site, d);
-               err = osd_shutdown(env, o);
+               rc = osd_shutdown(env, o);
                 break;
-        default:
-                err = -ENOSYS;
-        }
+       case LCFG_PARAM:
+               LASSERT(&o->od_dt_dev);
+               rc = class_process_proc_param(PARAM_OSD, lprocfs_osd_obd_vars,
+                                             cfg, &o->od_dt_dev);
+               if (rc > 0 || rc == -ENOSYS)
+                       rc = class_process_proc_param(PARAM_OST,
+                                                     lprocfs_osd_obd_vars,
+                                                     cfg, &o->od_dt_dev);
+               break;
+       default:
+               rc = -ENOSYS;
+       }
  
-        RETURN(err);
+       RETURN(rc);
  }
  
  static int osd_recovery_complete(const struct lu_env *env,
@@ -5823,18 +5981,19 @@ static struct obd_ops osd_obd_device_ops = {
  
  static int __init osd_mod_init(void)
  {
-        struct lprocfs_static_vars lvars;
         int rc;
  
         osd_oi_mod_init();
-       lprocfs_osd_init_vars(&lvars);
  
         rc = lu_kmem_init(ldiskfs_caches);
         if (rc)
                 return rc;
  
-       rc = class_register_type(&osd_obd_device_ops, NULL, lvars.module_vars,
-                                LUSTRE_OSD_LDISKFS_NAME, &osd_device_type);
+       rc = class_register_type(&osd_obd_device_ops, NULL, NULL,
+#ifndef HAVE_ONLY_PROCFS_SEQ
+                               lprocfs_osd_module_vars,
+#endif
+                               LUSTRE_OSD_LDISKFS_NAME, &osd_device_type);
         if (rc)
                 lu_kmem_fini(ldiskfs_caches);
         return rc;