Whamcloud - gitweb
LU-3857 osd: cleanup procfs after osd_shutdown
[fs/lustre-release.git] / lustre / osd-ldiskfs / osd_handler.c
index 241fc19..c13c682 100644 (file)
@@ -319,6 +319,7 @@ int osd_get_idif(struct osd_thread_info *info, struct inode *inode,
 static int osd_check_lma(const struct lu_env *env, struct osd_object *obj)
 {
        struct osd_thread_info  *info   = osd_oti_get(env);
+       struct osd_device       *osd    = osd_obj2dev(obj);
        struct lustre_mdt_attrs *lma    = &info->oti_mdt_attrs;
        struct inode            *inode  = obj->oo_inode;
        struct dentry           *dentry = &info->oti_obj_dentry;
@@ -333,11 +334,41 @@ static int osd_check_lma(const struct lu_env *env, struct osd_object *obj)
        rc = __osd_xattr_get(inode, dentry, XATTR_NAME_LMA,
                             info->oti_mdt_attrs_old, LMA_OLD_SIZE);
        if (rc == -ENODATA && !fid_is_igif(lu_object_fid(&obj->oo_dt.do_lu)) &&
-           osd_obj2dev(obj)->od_check_ff) {
+           osd->od_check_ff) {
                fid = &lma->lma_self_fid;
                rc = osd_get_idif(info, inode, dentry, fid);
-               if (rc > 0)
+               if ((rc > 0) || (rc == -ENODATA && osd->od_lma_self_repair)) {
+                       handle_t *jh;
+
+                       /* For the given OST-object, if it has neither LMA nor
+                        * FID in XATTR_NAME_FID, then the given FID (which is
+                        * contained in the @obj, from client RPC for locating
+                        * the OST-object) is trusted. We use it to generate
+                        * the LMA. */
+
+                       LASSERT(current->journal_info == NULL);
+
+                       jh = osd_journal_start_sb(osd_sb(osd), LDISKFS_HT_MISC,
+                                       osd_dto_credits_noquota[DTO_XATTR_SET]);
+                       if (IS_ERR(jh)) {
+                               CWARN("%s: cannot start journal for "
+                                     "lma_self_repair: rc = %ld\n",
+                                     osd_name(osd), PTR_ERR(jh));
+                               RETURN(0);
+                       }
+
+                       rc = osd_ea_fid_set(info, inode,
+                               lu_object_fid(&obj->oo_dt.do_lu),
+                               fid_is_on_ost(info, osd,
+                                             lu_object_fid(&obj->oo_dt.do_lu),
+                                             OI_CHECK_FLD) ?
+                               LMAC_FID_ON_OST : 0, 0);
+                       if (rc != 0)
+                               CWARN("%s: cannot self repair the LMA: "
+                                     "rc = %d\n", osd_name(osd), rc);
+                       ldiskfs_journal_stop(jh);
                        RETURN(0);
+               }
        }
 
        if (unlikely(rc == -ENODATA))
@@ -352,8 +383,7 @@ static int osd_check_lma(const struct lu_env *env, struct osd_object *obj)
                if (unlikely((lma->lma_incompat & ~LMA_INCOMPAT_SUPP) ||
                             CFS_FAIL_CHECK(OBD_FAIL_OSD_LMA_INCOMPAT))) {
                        CWARN("%s: unsupported incompat LMA feature(s) %#x for "
-                             "fid = "DFID", ino = %lu\n",
-                             osd_obj2dev(obj)->od_svname,
+                             "fid = "DFID", ino = %lu\n", osd_name(osd),
                              lma->lma_incompat & ~LMA_INCOMPAT_SUPP,
                              PFID(lu_object_fid(&obj->oo_dt.do_lu)),
                              inode->i_ino);
@@ -366,8 +396,7 @@ static int osd_check_lma(const struct lu_env *env, struct osd_object *obj)
        if (fid != NULL &&
            unlikely(!lu_fid_eq(lu_object_fid(&obj->oo_dt.do_lu), fid))) {
                CDEBUG(D_INODE, "%s: FID "DFID" != self_fid "DFID"\n",
-                      osd_obj2dev(obj)->od_svname,
-                      PFID(lu_object_fid(&obj->oo_dt.do_lu)),
+                      osd_name(osd), PFID(lu_object_fid(&obj->oo_dt.do_lu)),
                       PFID(&lma->lma_self_fid));
                rc = -EREMCHG;
        }
@@ -888,7 +917,7 @@ int osd_trans_start(const struct lu_env *env, struct dt_device *d,
          * XXX temporary stuff. Some abstraction layer should
          * be used.
          */
-        jh = ldiskfs_journal_start_sb(osd_sb(dev), oh->ot_credits);
+        jh = osd_journal_start_sb(osd_sb(dev), LDISKFS_HT_MISC, oh->ot_credits);
         osd_th_started(oh);
         if (!IS_ERR(jh)) {
                 oh->ot_handle = jh;
@@ -908,6 +937,28 @@ out:
         RETURN(rc);
 }
 
+static int osd_seq_exists(const struct lu_env *env,
+                             struct osd_device *osd, obd_seq seq)
+{
+       struct lu_seq_range     *range = &osd_oti_get(env)->oti_seq_range;
+       struct seq_server_site  *ss = osd_seq_site(osd);
+       int                     rc;
+       ENTRY;
+       /* Does @seq's FLD range index match this site's node id? */
+       if (ss == NULL) /* no seq site: answer 1 without any lookup */
+               RETURN(1);
+       /* Look up the FLD range of @seq into the env's oti_seq_range. */
+       rc = osd_fld_lookup(env, osd, seq, range);
+       if (rc != 0) {
+               if (rc != -ENOENT) /* -ENOENT is not logged */
+                       CERROR("%s: can't lookup FLD sequence "LPX64
+                              ": rc = %d\n", osd_name(osd), seq, rc);
+               RETURN(0); /* any lookup failure yields 0 */
+       }
+       /* Compare this site's node id with the looked-up range index. */
+       RETURN(ss->ss_node_id == range->lsr_index);
+}
+
 /*
  * Concurrency: shouldn't matter.
  */
@@ -1124,7 +1175,6 @@ static void osd_conf_get(const struct lu_env *env,
         /*
          * XXX should be taken from not-yet-existing fs abstraction layer.
          */
-       param->ddp_mnt = osd_dt_dev(dev)->od_mnt;
         param->ddp_max_name_len = LDISKFS_NAME_LEN;
         param->ddp_max_nlink    = LDISKFS_LINK_MAX;
        param->ddp_block_shift  = sb->s_blocksize_bits;
@@ -2093,23 +2143,26 @@ static int __osd_object_create(struct osd_thread_info *info,
  * \retval 0, on success
  */
 static int __osd_oi_insert(const struct lu_env *env, struct osd_object *obj,
-                           const struct lu_fid *fid, struct thandle *th)
+                          const struct lu_fid *fid, struct thandle *th)
 {
-        struct osd_thread_info *info = osd_oti_get(env);
-        struct osd_inode_id    *id   = &info->oti_id;
-        struct osd_device      *osd  = osd_obj2dev(obj);
+       struct osd_thread_info *info = osd_oti_get(env);
+       struct osd_inode_id    *id   = &info->oti_id;
+       struct osd_device      *osd  = osd_obj2dev(obj);
+       struct osd_thandle     *oh;
 
-        LASSERT(obj->oo_inode != NULL);
+       LASSERT(obj->oo_inode != NULL);
+
+       oh = container_of0(th, struct osd_thandle, ot_super);
+       LASSERT(oh->ot_handle);
 
        osd_id_gen(id, obj->oo_inode->i_ino, obj->oo_inode->i_generation);
-       return osd_oi_insert(info, osd, fid, id, th, OI_CHECK_FLD);
+       return osd_oi_insert(info, osd, fid, id, oh->ot_handle, OI_CHECK_FLD);
 }
 
 int osd_fld_lookup(const struct lu_env *env, struct osd_device *osd,
                   obd_seq seq, struct lu_seq_range *range)
 {
        struct seq_server_site  *ss = osd_seq_site(osd);
-       int                     rc;
 
        if (fid_seq_is_idif(seq)) {
                fld_range_set_ost(range);
@@ -2128,12 +2181,8 @@ int osd_fld_lookup(const struct lu_env *env, struct osd_device *osd,
 
        LASSERT(ss != NULL);
        fld_range_set_any(range);
-       rc = fld_server_lookup(env, ss->ss_server_fld, seq, range);
-       if (rc != 0) {
-               CERROR("%s: cannot find FLD range for "LPX64": rc = %d\n",
-                      osd_name(osd), seq, rc);
-       }
-       return rc;
+       /* OSD will only do local fld lookup */
+       return fld_local_lookup(env, ss->ss_server_fld, seq, range);
 }
 
 /*
@@ -2146,7 +2195,6 @@ static int osd_declare_object_create(const struct lu_env *env,
                                     struct dt_object_format *dof,
                                     struct thandle *handle)
 {
-       struct lu_seq_range     *range = &osd_oti_get(env)->oti_seq_range;
        struct osd_thandle      *oh;
        int                      rc;
        ENTRY;
@@ -2184,16 +2232,6 @@ static int osd_declare_object_create(const struct lu_env *env,
        if (rc != 0)
                RETURN(rc);
 
-       /* It does fld look up inside declare, and the result will be
-        * added to fld cache, so the following fld lookup inside insert
-        * does not need send RPC anymore, so avoid send rpc with holding
-        * transaction */
-       if (fid_is_norm(lu_object_fid(&dt->do_lu)) &&
-               !fid_is_last_id(lu_object_fid(&dt->do_lu)))
-               osd_fld_lookup(env, osd_dt_dev(handle->th_dev),
-                              fid_seq(lu_object_fid(&dt->do_lu)), range);
-
-
        RETURN(rc);
 }
 
@@ -2291,7 +2329,8 @@ static int osd_object_destroy(const struct lu_env *env,
                RETURN(-EPERM);
 
        if (S_ISDIR(inode->i_mode)) {
-               LASSERT(osd_inode_unlinked(inode) || inode->i_nlink == 1);
+               LASSERT(osd_inode_unlinked(inode) || inode->i_nlink == 1 ||
+                       inode->i_nlink == 2);
                /* it will check/delete the inode from remote parent,
                 * how to optimize it? unlink performance impaction XXX */
                result = osd_delete_from_remote_parent(env, osd, obj, oh);
@@ -2307,7 +2346,8 @@ static int osd_object_destroy(const struct lu_env *env,
 
        osd_trans_exec_op(env, th, OSD_OT_DESTROY);
 
-        result = osd_oi_delete(osd_oti_get(env), osd, fid, th, OI_CHECK_FLD);
+       result = osd_oi_delete(osd_oti_get(env), osd, fid, oh->ot_handle,
+                              OI_CHECK_FLD);
 
         /* XXX: add to ext3 orphan list */
         /* rc = ext3_orphan_add(handle_t *handle, struct inode *inode) */
@@ -2587,20 +2627,22 @@ static int osd_declare_object_ref_add(const struct lu_env *env,
 static int osd_object_ref_add(const struct lu_env *env,
                               struct dt_object *dt, struct thandle *th)
 {
-       struct osd_object *obj = osd_dt_obj(dt);
-       struct inode      *inode = obj->oo_inode;
-       bool               need_dirty = false;
-       int                rc = 0;
+       struct osd_object  *obj = osd_dt_obj(dt);
+       struct inode       *inode = obj->oo_inode;
+       struct osd_thandle *oh;
+       int                 rc = 0;
 
         LINVRNT(osd_invariant(obj));
        LASSERT(dt_object_exists(dt) && !dt_object_remote(dt));
         LASSERT(osd_write_locked(env, obj));
         LASSERT(th != NULL);
 
+        oh = container_of0(th, struct osd_thandle, ot_super);
+        LASSERT(oh->ot_handle != NULL);
+
        osd_trans_exec_op(env, th, OSD_OT_REF_ADD);
 
-       /* This based on ldiskfs_inc_count(), which is not exported.
-        *
+       /*
         * The DIR_NLINK feature allows directories to exceed LDISKFS_LINK_MAX
         * (65000) subdirectories by storing "1" in i_nlink if the link count
         * would otherwise overflow. Directory tranversal tools understand
@@ -2612,28 +2654,11 @@ static int osd_object_ref_add(const struct lu_env *env,
         * in case they are being linked into the PENDING directory
         */
        spin_lock(&obj->oo_guard);
-       if (unlikely(!S_ISDIR(inode->i_mode) &&
-                    inode->i_nlink >= LDISKFS_LINK_MAX)) {
-               /* MDD should have checked this, but good to be safe */
-               rc = -EMLINK;
-       } else if (unlikely(inode->i_nlink == 0 ||
-                           (S_ISDIR(inode->i_mode) &&
-                            inode->i_nlink >= LDISKFS_LINK_MAX))) {
-               /* inc_nlink from 0 may cause WARN_ON */
-               set_nlink(inode, 1);
-               need_dirty = true;
-       } else if (!S_ISDIR(inode->i_mode) ||
-                  (S_ISDIR(inode->i_mode) && inode->i_nlink >= 2)) {
-               inc_nlink(inode);
-               need_dirty = true;
-       } /* else (S_ISDIR(inode->i_mode) && inode->i_nlink == 1) { ; } */
-
+       ldiskfs_inc_count(oh->ot_handle, inode);
        LASSERT(inode->i_nlink <= LDISKFS_LINK_MAX);
        spin_unlock(&obj->oo_guard);
 
-       if (need_dirty)
-               ll_dirty_inode(inode, I_DIRTY_DATASYNC);
-
+       ll_dirty_inode(inode, I_DIRTY_DATASYNC);
        LINVRNT(osd_invariant(obj));
 
        return rc;
@@ -2666,12 +2691,16 @@ static int osd_object_ref_del(const struct lu_env *env, struct dt_object *dt,
        struct osd_object       *obj = osd_dt_obj(dt);
        struct inode            *inode = obj->oo_inode;
        struct osd_device       *osd = osd_dev(dt->do_lu.lo_dev);
+       struct osd_thandle      *oh;
 
        LINVRNT(osd_invariant(obj));
        LASSERT(dt_object_exists(dt) && !dt_object_remote(dt));
        LASSERT(osd_write_locked(env, obj));
        LASSERT(th != NULL);
 
+        oh = container_of0(th, struct osd_thandle, ot_super);
+        LASSERT(oh->ot_handle != NULL);
+
        osd_trans_exec_op(env, th, OSD_OT_REF_DEL);
 
        spin_lock(&obj->oo_guard);
@@ -2687,23 +2716,11 @@ static int osd_object_ref_del(const struct lu_env *env, struct dt_object *dt,
                return 0;
        }
 
-       /* This based on ldiskfs_dec_count(), which is not exported.
-        *
-        * If a directory already has nlink == 1, then do not drop the nlink
-        * count to 0, even temporarily, to avoid race conditions with other
-        * threads not holding oo_guard seeing i_nlink == 0 in rare cases.
-        *
-        * nlink == 1 means the directory has/had > EXT4_LINK_MAX subdirs.
-        */
-       if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 1) {
-               drop_nlink(inode);
+       ldiskfs_dec_count(oh->ot_handle, inode);
+       spin_unlock(&obj->oo_guard);
 
-               spin_unlock(&obj->oo_guard);
-               ll_dirty_inode(inode, I_DIRTY_DATASYNC);
-               LINVRNT(osd_invariant(obj));
-       } else {
-               spin_unlock(&obj->oo_guard);
-       }
+       ll_dirty_inode(inode, I_DIRTY_DATASYNC);
+       LINVRNT(osd_invariant(obj));
 
        return 0;
 }
@@ -3001,6 +3018,7 @@ static int osd_object_sync(const struct lu_env *env, struct dt_object *dt)
        file->f_dentry = dentry;
        file->f_mapping = inode->i_mapping;
        file->f_op = inode->i_fop;
+       set_file_inode(file, inode);
 #ifndef HAVE_FILE_FSYNC_4ARGS
        mutex_lock(&inode->i_mutex);
 #endif
@@ -3343,31 +3361,6 @@ static inline int osd_get_fid_from_dentry(struct ldiskfs_dir_entry_2 *de,
        return rc;
 }
 
-static int osd_mdt_seq_exists(const struct lu_env *env,
-                             struct osd_device *osd, obd_seq seq)
-{
-       struct lu_seq_range     *range = &osd_oti_get(env)->oti_seq_range;
-       struct seq_server_site  *ss = osd_seq_site(osd);
-       int                     rc;
-       ENTRY;
-
-       if (ss == NULL)
-               RETURN(1);
-
-       /* XXX: currently, each MDT only store avaible sequence on disk, and no
-        * allocated sequences information on disk, so we have to lookup FLDB,
-        * but it probably makes more sense also store allocated sequence
-        * locally, so we do not need do remote FLDB lookup in OSD */
-       rc = osd_fld_lookup(env, osd, seq, range);
-       if (rc != 0) {
-               CERROR("%s: Can not lookup fld for "LPX64"\n",
-                      osd_name(osd), seq);
-               RETURN(0);
-       }
-
-       RETURN(ss->ss_node_id == range->lsr_index);
-}
-
 static int osd_remote_fid(const struct lu_env *env, struct osd_device *osd,
                          struct lu_fid *fid)
 {
@@ -3377,8 +3370,7 @@ static int osd_remote_fid(const struct lu_env *env, struct osd_device *osd,
        if (unlikely(!fid_seq_in_fldb(fid_seq(fid))))
                RETURN(0);
 
-       /* Currently only check this for FID on MDT */
-       if (osd_mdt_seq_exists(env, osd, fid_seq(fid)))
+       if (osd_seq_exists(env, osd, fid_seq(fid)))
                RETURN(0);
 
        RETURN(1);
@@ -3436,7 +3428,7 @@ static int osd_index_ea_delete(const struct lu_env *env, struct dt_object *dt,
                down_write(&obj->oo_ext_idx_sem);
         }
 
-        bh = ldiskfs_find_entry(dir, &dentry->d_name, &de, hlock);
+        bh = osd_ldiskfs_find_entry(dir, &dentry->d_name, &de, NULL, hlock);
         if (bh) {
                __u32 ino = 0;
 
@@ -3959,7 +3951,7 @@ static int osd_ea_lookup_rec(const struct lu_env *env, struct osd_object *obj,
                down_read(&obj->oo_ext_idx_sem);
         }
 
-        bh = osd_ldiskfs_find_entry(dir, dentry, &de, hlock);
+        bh = osd_ldiskfs_find_entry(dir, &dentry->d_name, &de, NULL, hlock);
         if (bh) {
                struct osd_thread_info *oti = osd_oti_get(env);
                struct osd_inode_id *id = &oti->oti_id;
@@ -4548,6 +4540,8 @@ static struct dt_it *osd_it_ea_init(const struct lu_env *env,
        file->f_dentry          = obj_dentry;
        file->f_mapping         = obj->oo_inode->i_mapping;
        file->f_op              = obj->oo_inode->i_fop;
+       set_file_inode(file, obj->oo_inode);
+
        lu_object_get(lo);
        RETURN((struct dt_it *) it);
 }
@@ -4956,7 +4950,7 @@ osd_dirent_check_repair(const struct lu_env *env, struct osd_object *obj,
 
 again:
        if (dev->od_dirent_journal) {
-               jh = ldiskfs_journal_start_sb(sb, credits);
+               jh = osd_journal_start_sb(sb, LDISKFS_HT_MISC, credits);
                if (IS_ERR(jh)) {
                        rc = PTR_ERR(jh);
                        CERROR("%.16s: fail to start trans for dirent "
@@ -4986,7 +4980,7 @@ again:
                }
        }
 
-       bh = osd_ldiskfs_find_entry(dir, dentry, &de, hlock);
+       bh = osd_ldiskfs_find_entry(dir, &dentry->d_name, &de, NULL, hlock);
        /* For dot/dotdot entry, if there is not enough space to hold the
         * FID-in-dirent, just keep them there. It only happens when the
         * device upgraded from 1.8 or restored from MDT file-level backup.
@@ -5515,8 +5509,8 @@ static struct lu_device *osd_device_fini(const struct lu_env *env,
        struct osd_device *o = osd_dev(d);
        ENTRY;
 
-       osd_procfs_fini(o);
        osd_shutdown(env, o);
+       osd_procfs_fini(o);
        osd_scrub_cleanup(env, o);
        osd_obj_map_fini(o);
        osd_umount(env, o);
@@ -5582,6 +5576,9 @@ static int osd_device_init0(const struct lu_env *env,
        if (rc != 0)
                GOTO(out_site, rc);
 
+       /* self-repair LMA by default */
+       o->od_lma_self_repair = 1;
+
        CFS_INIT_LIST_HEAD(&o->od_ios_list);
        /* setup scrub, including OI files initialization */
        rc = osd_scrub_setup(env, o);
@@ -5833,8 +5830,11 @@ static int __init osd_mod_init(void)
        if (rc)
                return rc;
 
-       rc = class_register_type(&osd_obd_device_ops, NULL, lvars.module_vars,
-                                LUSTRE_OSD_LDISKFS_NAME, &osd_device_type);
+       rc = class_register_type(&osd_obd_device_ops, NULL, NULL,
+#ifndef HAVE_ONLY_PROCFS_SEQ
+                               lvars.module_vars,
+#endif
+                               LUSTRE_OSD_LDISKFS_NAME, &osd_device_type);
        if (rc)
                lu_kmem_fini(ldiskfs_caches);
        return rc;