Whamcloud - gitweb
b=10719
authornathan <nathan>
Tue, 20 Feb 2007 17:18:36 +0000 (17:18 +0000)
committernathan <nathan>
Tue, 20 Feb 2007 17:18:36 +0000 (17:18 +0000)
r=adilger
set external journal device read-only

12 files changed:
lustre/ChangeLog
lustre/include/linux/lustre_fsfilt.h
lustre/include/linux/lvfs_linux.h
lustre/include/linux/obd_support.h
lustre/lvfs/fsfilt_ext3.c
lustre/lvfs/lvfs_linux.c
lustre/mds/handler.c
lustre/mds/mds_lov.c
lustre/mds/mds_reint.c
lustre/mds/mds_xattr.c
lustre/obdclass/obd_mount.c
lustre/obdfilter/filter.c

index 6fbead0..8ef4590 100644 (file)
@@ -20,23 +20,6 @@ tbd         Cluster File Systems, Inc. <info@clusterfs.com>
        * Recommended e2fsprogs version: 1.39.cfs2-0
        * bug fixes
 
-Severity   : major
-Frequency  : liblustre (e.g. catamount) on a large cluster with >= 8 OSTs
-             per OSS
-Bugzilla   : 11684
-Description: System hang on startup
-Details    : This bug allowed the liblustre (e.g. catamount) client to
-             return to the app before handling all startup RPCs.  This
-            could leave the node unresponsive to lustre network traffic
-            and manifested as a server ptllnd timeout.
-
-Severity   : enhancement
-Bugzilla   : 11667
-Description: Add "/proc/sys/lustre/debug_peer_on_timeout"
-             (liblustre envirable: LIBLUSTRE_DEBUG_PEER_ON_TIMEOUT)
-            boolean to control whether to print peer debug info when a
-            client's RPC times out.
-
 Severity   : enhancement
 Bugzilla   : 8007
 Description: MountConf
@@ -223,7 +206,24 @@ Details    : for a short time RPCs with bulk IO are in the replay list,
             but replay of bulk IOs is unimplemented.  If the OST filesystem
             is corrupted due to disk cache incoherency and then replay is
             started it is possible to trip an assertion.  Avoid putting
-            committed RPCs into the replay list at all to avoid this issue.         
+            committed RPCs into the replay list at all to avoid this issue.
+
+Severity   : major
+Frequency  : liblustre (e.g. catamount) on a large cluster with >= 8 OSTs
+             per OSS
+Bugzilla   : 11684
+Description: System hang on startup
+Details    : This bug allowed the liblustre (e.g. catamount) client to
+             return to the app before handling all startup RPCs.  This
+            could leave the node unresponsive to lustre network traffic
+            and manifested as a server ptllnd timeout.
+
+Severity   : enhancement
+Bugzilla   : 11667
+Description: Add "/proc/sys/lustre/debug_peer_on_timeout"
+             (liblustre envirable: LIBLUSTRE_DEBUG_PEER_ON_TIMEOUT)
+            boolean to control whether to print peer debug info when a
+            client's RPC times out.
 
 Severity   : minor
 Frequency  : only for kernels with patches from Lustre below 1.4.3  
@@ -231,6 +231,14 @@ Bugzilla   : 11248
 Description: Remove old rdonly API
 Details    : Remove old rdonly API, which has been unused since at least Lustre 1.4.3
 
+Severity   : major
+Frequency  : only for devices with external journals
+Bugzilla   : 10719
+Description: Set external journal device read-only also
+Details    : During a commanded failover stop, we set the disk device
+            read-only while the server shuts down. We now also set any
+            external journal device read-only at the same time. 
+       
 ------------------------------------------------------------------------------
 
 TBD         Cluster File Systems, Inc. <info@clusterfs.com>
index 9cdb99c..70a4bc8 100644 (file)
@@ -108,6 +108,7 @@ struct fsfilt_operations {
         int     (* fs_qids)(struct file *file, struct inode *inode, int type,
                             struct list_head *list);
         int     (* fs_dquot)(struct lustre_dquot *dquot, int cmd);
+        lvfs_sbdev_type (* fs_journal_sbdev)(struct super_block *sb);
 };
 
 extern int fsfilt_register_ops(struct fsfilt_operations *fs_ops);
@@ -142,6 +143,14 @@ static inline __u8 *fsfilt_uuid(struct obd_device *obd, struct super_block *sb)
         return obd->obd_fsops->fs_uuid(sb);
 }
 
+static inline lvfs_sbdev_type fsfilt_journal_sbdev(struct obd_device *obd,
+                                                   struct super_block *sb)
+{
+        if (obd && obd->obd_fsops && obd->obd_fsops->fs_journal_sbdev)
+                return obd->obd_fsops->fs_journal_sbdev(sb);
+        return (lvfs_sbdev_type)0;
+}
+
 #define FSFILT_OP_UNLINK         1
 #define FSFILT_OP_RMDIR          2
 #define FSFILT_OP_RENAME         3
index 9c41cd0..ab759c0 100644 (file)
@@ -50,8 +50,11 @@ struct l_readdir_callback {
 #  define lvfs_sbdev_sync      fsync_dev
 # endif
 
-void lvfs_set_rdonly(lvfs_sbdev_type dev);
+/* A macro so lvfs need not call into fsfilt itself (a layering violation) */
+#define lvfs_set_rdonly(obd, sb) \
+        __lvfs_set_rdonly(lvfs_sbdev(sb), fsfilt_journal_sbdev(obd, sb))
+
+void __lvfs_set_rdonly(lvfs_sbdev_type dev, lvfs_sbdev_type jdev);
 int lvfs_check_rdonly(lvfs_sbdev_type dev);
-void lvfs_clear_rdonly(lvfs_sbdev_type dev);
 
-#endif
+#endif /*  __LVFS_LINUX_H__ */
index df22b43..5c466c3 100644 (file)
@@ -75,16 +75,16 @@ static inline __u32 crc32_le(__u32 crc, unsigned char const *p, size_t len)
 # include <linux/blkdev.h>
 # include <lvfs.h>
 
-static inline void OBD_FAIL_WRITE(int id, struct super_block *sb)
-{
-        if (OBD_FAIL_CHECK(id)) {
-                BDEVNAME_DECLARE_STORAGE(tmp);
-                CERROR("obd_fail_loc=%x, fail write operation on %s\n",
-                       id, ll_bdevname(sb, tmp));
-                lvfs_set_rdonly(lvfs_sbdev(sb));
-                /* We set FAIL_ONCE because we never "un-fail" a device */
-                obd_fail_loc |= OBD_FAILED | OBD_FAIL_ONCE;
-        }
+#define OBD_FAIL_WRITE(obd, id, sb)                                          \
+{                                                                            \
+        if (OBD_FAIL_CHECK(id)) {                                            \
+                BDEVNAME_DECLARE_STORAGE(tmp);                               \
+                CERROR("obd_fail_loc=%x, fail write operation on %s\n",      \
+                       id, ll_bdevname(sb, tmp));                            \
+                lvfs_set_rdonly(obd, sb);                                    \
+                /* We set FAIL_ONCE because we never "un-fail" a device */   \
+                obd_fail_loc |= OBD_FAILED | OBD_FAIL_ONCE;                  \
+        }                                                                    \
 }
 
 #define OBD_SLEEP_ON(wq, state)  wait_event_interruptible(wq, state)
index f83949c..80cc962 100644 (file)
@@ -1989,6 +1989,15 @@ static int fsfilt_ext3_dquot(struct lustre_dquot *dquot, int cmd)
 }
 #endif
 
+static lvfs_sbdev_type fsfilt_ext3_journal_sbdev(struct super_block *sb)
+{
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+        return (EXT3_SB(sb)->journal_bdev);
+#else
+        return kdev_t_to_nr(EXT3_SB(sb)->s_journal->j_dev);
+#endif
+}
+
 static struct fsfilt_operations fsfilt_ext3_ops = {
         .fs_type                = "ext3",
         .fs_owner               = THIS_MODULE,
@@ -2023,6 +2032,7 @@ static struct fsfilt_operations fsfilt_ext3_ops = {
         .fs_qids                = fsfilt_ext3_qids,
         .fs_dquot               = fsfilt_ext3_dquot,
 #endif
+        .fs_journal_sbdev       = fsfilt_ext3_journal_sbdev,
 };
 
 static int __init fsfilt_ext3_init(void)
index 21cd56a..71c9ede 100644 (file)
@@ -431,19 +431,22 @@ EXPORT_SYMBOL(obd_memory);
 EXPORT_SYMBOL(obd_memmax);
 
 #ifdef LUSTRE_KERNEL_VERSION
-
 #ifndef HAVE_CLEAR_RDONLY_ON_PUT
-#error rdonly patchset must be updated
+#error rdonly patchset must be updated [cfs bz11248]
 #endif
 
 void dev_set_rdonly(lvfs_sbdev_type dev);
-void dev_clear_rdonly(lvfs_sbdev_type dev);
 int dev_check_rdonly(lvfs_sbdev_type dev);
 
-void lvfs_set_rdonly(lvfs_sbdev_type dev)
+void __lvfs_set_rdonly(lvfs_sbdev_type dev, lvfs_sbdev_type jdev)
 {
-        CDEBUG(D_IOCTL | D_HA, "set dev %lx rdonly\n", (long)dev);
         lvfs_sbdev_sync(dev);
+        if (jdev && (jdev != dev)) {
+                CDEBUG(D_IOCTL | D_HA, "set journal dev %lx rdonly\n",
+                       (long)jdev);
+                dev_set_rdonly(jdev);
+        }
+        CDEBUG(D_IOCTL | D_HA, "set dev %lx rdonly\n", (long)dev);
         dev_set_rdonly(dev);
 }
 
@@ -452,16 +455,9 @@ int lvfs_check_rdonly(lvfs_sbdev_type dev)
         return dev_check_rdonly(dev);
 }
 
-void lvfs_clear_rdonly(lvfs_sbdev_type dev)
-{
-        CDEBUG(D_IOCTL | D_HA, "(will unset dev %lx rdonly on put)\n",
-               (long)dev);
-}
-
-EXPORT_SYMBOL(lvfs_set_rdonly);
+EXPORT_SYMBOL(__lvfs_set_rdonly);
 EXPORT_SYMBOL(lvfs_check_rdonly);
-EXPORT_SYMBOL(lvfs_clear_rdonly);
-#endif
+#endif /* LUSTRE_KERNEL_VERSION */
 
 int lvfs_check_io_health(struct obd_device *obd, struct file *file)
 {
index d8f314f..0f8788b 100644 (file)
@@ -2070,14 +2070,7 @@ err_ns:
 err_ops:
         fsfilt_put_ops(obd->obd_fsops);
 err_put:
-        if (lmi) {
-                server_put_mount(obd->obd_name, mnt);
-        } else {
-                /* old method */
-                unlock_kernel();
-                mntput(mnt);
-                lock_kernel();
-        }               
+        server_put_mount(obd->obd_name, mnt);
         obd->u.obt.obt_sb = NULL;
         return rc;
 }
@@ -2238,8 +2231,6 @@ static int mds_cleanup(struct obd_device *obd)
 {
         struct mds_obd *mds = &obd->u.mds;
         lvfs_sbdev_type save_dev;
-        int must_put = 0;
-        int must_relock = 0;
         ENTRY;
 
         if (obd->u.obt.obt_sb == NULL)
@@ -2264,21 +2255,7 @@ static int mds_cleanup(struct obd_device *obd)
         upcall_cache_cleanup(mds->mds_group_hash);
         mds->mds_group_hash = NULL;
 
-        must_put = server_put_mount(obd->obd_name, mds->mds_vfsmnt);
-        /* must_put is for old method (l_p_m returns non-0 on err) */
-
-        /* We can only unlock kernel if we are in the context of sys_ioctl,
-           otherwise we never called lock_kernel */
-        if (ll_kernel_locked()) {
-                unlock_kernel();
-                must_relock++;
-        }
-        
-        if (must_put) {
-                /* In case we didn't mount with lustre_get_mount -- old method*/
-                mntput(mds->mds_vfsmnt);
-                lvfs_clear_rdonly(save_dev);
-        }
+        server_put_mount(obd->obd_name, mds->mds_vfsmnt);
         obd->u.obt.obt_sb = NULL;
 
         ldlm_namespace_free(obd->obd_namespace, obd->obd_force);
@@ -2290,9 +2267,6 @@ static int mds_cleanup(struct obd_device *obd)
         }
         spin_unlock_bh(&obd->obd_processing_task_lock);
 
-        if (must_relock)
-                lock_kernel();
-
         fsfilt_put_ops(obd->obd_fsops);
 
         LCONSOLE_INFO("MDT %s has stopped.\n", obd->obd_name);
index 55782e5..43278a2 100644 (file)
@@ -564,7 +564,7 @@ int mds_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
                 CDEBUG(D_HA, "syncing mds %s\n", obd->obd_name);
                 rc = fsfilt_sync(obd, obd->u.obt.obt_sb);
 
-                lvfs_set_rdonly(lvfs_sbdev(obd->u.obt.obt_sb));
+                lvfs_set_rdonly(obd, obd->u.obt.obt_sb);
                 RETURN(0);
         }
 
index fb94b5a..96ba59d 100644 (file)
@@ -550,7 +550,7 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset,
                 need_lock = 0;
         }
 
-        OBD_FAIL_WRITE(OBD_FAIL_MDS_REINT_SETATTR_WRITE, inode->i_sb);
+        OBD_FAIL_WRITE(obd, OBD_FAIL_MDS_REINT_SETATTR_WRITE, inode->i_sb);
 
         /* start a log journal handle if needed */
         if (S_ISREG(inode->i_mode) &&
@@ -802,7 +802,7 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
 
         cleanup_phase = 2; /* child dentry */
 
-        OBD_FAIL_WRITE(OBD_FAIL_MDS_REINT_CREATE_WRITE, dir->i_sb);
+        OBD_FAIL_WRITE(obd, OBD_FAIL_MDS_REINT_CREATE_WRITE, dir->i_sb);
 
         if (req->rq_export->exp_connect_flags & OBD_CONNECT_RDONLY) {
                 if (dchild->d_inode)
@@ -1610,7 +1610,7 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset,
 
         cleanup_phase = 3; /* child inum lock */
 
-        OBD_FAIL_WRITE(OBD_FAIL_MDS_REINT_UNLINK_WRITE, dparent->d_inode->i_sb);
+        OBD_FAIL_WRITE(obd, OBD_FAIL_MDS_REINT_UNLINK_WRITE, dparent->d_inode->i_sb);
 
         /* ldlm_reply in buf[0] if called via intent */
         if (offset == DLM_INTENT_REC_OFF)
@@ -1860,7 +1860,7 @@ static int mds_reint_link(struct mds_update_record *rec, int offset,
         }
 
         /* Step 4: Do it. */
-        OBD_FAIL_WRITE(OBD_FAIL_MDS_REINT_LINK_WRITE, de_src->d_inode->i_sb);
+        OBD_FAIL_WRITE(obd, OBD_FAIL_MDS_REINT_LINK_WRITE, de_src->d_inode->i_sb);
 
         if (req->rq_export->exp_connect_flags & OBD_CONNECT_RDONLY)
                 GOTO(cleanup, rc = -EROFS);
@@ -2212,7 +2212,7 @@ static int mds_reint_rename(struct mds_update_record *rec, int offset,
         }
 
 no_unlink:
-        OBD_FAIL_WRITE(OBD_FAIL_MDS_REINT_RENAME_WRITE,
+        OBD_FAIL_WRITE(obd, OBD_FAIL_MDS_REINT_RENAME_WRITE,
                        de_srcdir->d_inode->i_sb);
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
index f9681b3..5b16864 100644 (file)
@@ -268,7 +268,7 @@ int mds_setxattr_internal(struct ptlrpc_request *req, struct mds_body *body)
         inode = de->d_inode;
         LASSERT(inode);
 
-        OBD_FAIL_WRITE(OBD_FAIL_MDS_SETXATTR_WRITE, inode->i_sb);
+        OBD_FAIL_WRITE(obd, OBD_FAIL_MDS_SETXATTR_WRITE, inode->i_sb);
 
         /* filter_op simply use setattr one */
         handle = fsfilt_start(obd, inode, FSFILT_OP_SETATTR, NULL);
index e1b1ed6..8822965 100644 (file)
@@ -175,8 +175,12 @@ int server_put_mount(char *name, struct vfsmount *mnt)
 {
         struct lustre_mount_info *lmi;
         struct lustre_sb_info *lsi;
+        int count = atomic_read(&mnt->mnt_count) - 1;
         ENTRY;
 
+        /* May drop the last mount ref; don't dereference mnt after this */
+        unlock_mntput(mnt);
+        
         down(&lustre_mount_info_lock);
         lmi = server_find_mount(name);
         up(&lustre_mount_info_lock);
@@ -186,20 +190,16 @@ int server_put_mount(char *name, struct vfsmount *mnt)
         }
         lsi = s2lsi(lmi->lmi_sb);
         LASSERT(lmi->lmi_mnt == mnt);
-        unlock_mntput(lmi->lmi_mnt);
 
         CDEBUG(D_MOUNT, "put_mnt %p from %s, refs=%d, vfscount=%d\n",
-               lmi->lmi_mnt, name, atomic_read(&lsi->lsi_mounts),
-               atomic_read(&lmi->lmi_mnt->mnt_count));
+               lmi->lmi_mnt, name, atomic_read(&lsi->lsi_mounts), count);
 
         if (lustre_put_lsi(lmi->lmi_sb)) {
                 CDEBUG(D_MOUNT, "Last put of mnt %p from %s, vfscount=%d\n",
-                       lmi->lmi_mnt, name,
-                       atomic_read(&lmi->lmi_mnt->mnt_count));
+                       lmi->lmi_mnt, name, count);
                 /* last mount is the One True Mount */
-                if (atomic_read(&lmi->lmi_mnt->mnt_count) > 1)
-                        CERROR("%s: mount busy, vfscount=%d!\n", name,
-                               atomic_read(&lmi->lmi_mnt->mnt_count));
+                if (count > 1)
+                        CERROR("%s: mount busy, vfscount=%d!\n", name, count);
         }
 
         /* this obd should never need the mount again */
@@ -1311,7 +1311,6 @@ static void server_put_super(struct super_block *sb)
         struct lustre_sb_info *lsi = s2lsi(sb);
         struct obd_device     *obd;
         struct vfsmount       *mnt = lsi->lsi_srv_mnt;
-        lvfs_sbdev_type        save_dev;
         char *tmpname, *extraname = NULL;
         int tmpname_sz;
         int lddflags = lsi->lsi_ldd->ldd_flags;
@@ -1369,8 +1368,6 @@ static void server_put_super(struct super_block *sb)
                 server_stop_mgs(sb);
         }
 
-        save_dev = lvfs_sbdev(sb);
-
         /* Clean the mgc and sb */
         rc = lustre_common_put_super(sb);
         /* FIXME how can I report a failure to umount? */
@@ -1381,9 +1378,6 @@ static void server_put_super(struct super_block *sb)
 
         /* drop the One True Mount */
         unlock_mntput(mnt);
-#ifndef LUSTRE_PATCHLESS
-        lvfs_clear_rdonly(save_dev);
-#endif
 
         /* Stop the servers (MDS, OSS) if no longer needed.  We must wait
            until the target is really gone so that our type refcount check
index 537060c..9fae669 100644 (file)
@@ -1740,14 +1740,7 @@ err_ops:
         fsfilt_put_ops(obd->obd_fsops);
         filter_iobuf_pool_done(filter);
 err_mntput:
-        if (lmi) {
-                server_put_mount(obd->obd_name, mnt);
-        } else {
-                /* old method */
-                unlock_kernel();
-                mntput(mnt);
-                lock_kernel();
-        }
+        server_put_mount(obd->obd_name, mnt);
         obd->u.obt.obt_sb = 0;
         return rc;
 }
@@ -1870,8 +1863,6 @@ static int filter_precleanup(struct obd_device *obd,
 static int filter_cleanup(struct obd_device *obd)
 {
         struct filter_obd *filter = &obd->u.filter;
-        lvfs_sbdev_type save_dev;
-        int must_relock = 0, must_put = 0;
         ENTRY;
 
         if (obd->obd_fail)
@@ -1896,7 +1887,6 @@ static int filter_cleanup(struct obd_device *obd)
 
         if (obd->u.obt.obt_sb == NULL)
                 RETURN(0);
-        save_dev = lvfs_sbdev(obd->u.obt.obt_sb);
 
         filter_post(obd);
 
@@ -1904,26 +1894,9 @@ static int filter_cleanup(struct obd_device *obd)
 
         LL_DQUOT_OFF(obd->u.obt.obt_sb);
 
-        must_put = server_put_mount(obd->obd_name, filter->fo_vfsmnt);
-        /* must_put is for old method (l_p_m returns non-0 on err) */
-
-        /* We can only unlock kernel if we are in the context of sys_ioctl,
-           otherwise we never called lock_kernel */
-        if (ll_kernel_locked()) {
-                unlock_kernel();
-                must_relock++;
-        }
-        
-        if (must_put) {
-                /* In case we didn't mount with lustre_get_mount -- old method*/
-                mntput(filter->fo_vfsmnt);
-                lvfs_clear_rdonly(save_dev);
-        }
+        server_put_mount(obd->obd_name, filter->fo_vfsmnt);
         obd->u.obt.obt_sb = NULL;
 
-        if (must_relock)
-                lock_kernel();
-
         fsfilt_put_ops(obd->obd_fsops);
 
         filter_iobuf_pool_done(filter);
@@ -3301,7 +3274,7 @@ int filter_iocontrol(unsigned int cmd, struct obd_export *exp,
                 CDEBUG(D_HA, "syncing ost %s\n", obd->obd_name);
                 rc = fsfilt_sync(obd, obd->u.obt.obt_sb);
 
-                lvfs_set_rdonly(lvfs_sbdev(obd->u.obt.obt_sb));
+                lvfs_set_rdonly(obd, obd->u.obt.obt_sb);
                 RETURN(0);
         }