Whamcloud - gitweb
b=18539
authorzhanghc <zhanghc>
Tue, 14 Jul 2009 13:34:03 +0000 (13:34 +0000)
committerzhanghc <zhanghc>
Tue, 14 Jul 2009 13:34:03 +0000 (13:34 +0000)
don't allocate new files on degraded OSTs

i=adilger@sun.com
i=hongchao.zhang@sun.com

lustre/include/obd.h
lustre/obdfilter/filter.c
lustre/obdfilter/lproc_obdfilter.c
lustre/osc/osc_create.c
lustre/osc/osc_internal.h
lustre/osc/osc_request.c
lustre/tests/sanity.sh

index 3848808..79b7391 100644 (file)
@@ -331,7 +331,8 @@ struct filter_obd {
         obd_size             fo_readcache_max_filesize;
         int                  fo_read_cache:1,   /**< enable read-only cache */
                              fo_writethrough_cache:1,/**< read cache writes */
-                             fo_mds_ost_sync:1; /**< MDS-OST orphan recovery*/
+                             fo_mds_ost_sync:1, /**< MDS-OST orphan recovery*/
+                             fo_raid_degraded:1;/**< RAID device degraded */
 
         struct obd_import   *fo_mdc_imp;
         struct obd_uuid      fo_mdc_uuid;
index 8fc2f31..e3987c5 100644 (file)
@@ -3739,8 +3739,13 @@ static int filter_statfs(struct obd_device *obd, struct obd_statfs *osfs,
         /* set EROFS to state field if FS is mounted as RDONLY. The goal is to
          * stop creating files on MDS if OST is not good shape to create
          * objects.*/
-        osfs->os_state = (filter->fo_obt.obt_sb->s_flags & MS_RDONLY) ?
-                OS_STATE_READONLY : 0;
+        osfs->os_state = 0;
+
+        if (filter->fo_obt.obt_sb->s_flags & MS_RDONLY)
+                osfs->os_state = OS_STATE_READONLY;
+
+        if (filter->fo_raid_degraded)
+                osfs->os_state |= OS_STATE_DEGRADED;
         RETURN(rc);
 }
 
index bcc46ec..fdd0eba 100644 (file)
@@ -341,6 +341,30 @@ static int lprocfs_filter_rd_mds_sync(char *page, char **start, off_t off,
         return snprintf(page, count, "%u\n", obd->u.filter.fo_mds_ost_sync);
 }
 
+int lprocfs_filter_rd_degraded(char *page, char **start, off_t off,
+                               int count, int *eof, void *data)
+{
+        struct obd_device *obd = data;
+        /* 1-bit plain "int" bitfield may read back as -1: normalise to 0/1 */
+        return snprintf(page, count, "%u\n", !!obd->u.filter.fo_raid_degraded);
+}
+
+int lprocfs_filter_wr_degraded(struct file *file, const char *buffer,
+                               unsigned long count, void *data)
+{
+        struct obd_device *obd = data;
+        int val, rc;
+        /* set fo_raid_degraded from the written value (non-zero => degraded) */
+        rc = lprocfs_write_helper(buffer, count, &val);
+        if (rc)
+                return rc;
+
+        spin_lock(&obd->obd_osfs_lock);
+        obd->u.filter.fo_raid_degraded = !!val; /* stored as a boolean */
+        spin_unlock(&obd->obd_osfs_lock);
+        return count; /* on success the full byte count is returned */
+}
+
 static struct lprocfs_vars lprocfs_filter_obd_vars[] = {
         { "uuid",         lprocfs_rd_uuid,          0, 0 },
         { "blocksize",    lprocfs_rd_blksize,       0, 0 },
@@ -383,6 +407,8 @@ static struct lprocfs_vars lprocfs_filter_obd_vars[] = {
         { "writethrough_cache_enable", lprocfs_filter_rd_wcache,
                           lprocfs_filter_wr_wcache, 0},
         { "mds_sync",     lprocfs_filter_rd_mds_sync, 0, 0},
+        { "degraded",     lprocfs_filter_rd_degraded,
+                          lprocfs_filter_wr_degraded, 0 },
         { 0 }
 };
 
index 53d6912..d422453 100644 (file)
@@ -123,18 +123,20 @@ static int osc_interpret_create(const struct lu_env *env,
                 spin_unlock(&oscc->oscc_lock);
                 break;
         }
-        case -ENOSPC:
         case -EROFS:
-        case -EFBIG: {
-                oscc->oscc_flags |= OSCC_FLAG_NOSPC;
-                if (body && rc == -ENOSPC) {
-                        oscc->oscc_grow_count = OST_MIN_PRECREATE;
-                        oscc->oscc_last_id = body->oa.o_id;
+                oscc->oscc_flags |= OSCC_FLAG_RDONLY; /* fall through */
+        case -ENOSPC:
+        case -EFBIG:
+                if (rc != -EROFS) { /* rc is negative; skip NOSPC for EROFS */
+                        oscc->oscc_flags |= OSCC_FLAG_NOSPC;
+                        if (body && rc == -ENOSPC) {
+                                oscc->oscc_last_id = body->oa.o_id;
+                                oscc->oscc_grow_count = OST_MIN_PRECREATE;
+                        }
                 }
                 spin_unlock(&oscc->oscc_lock);
                 DEBUG_REQ(D_INODE, req, "OST out of space, flagging");
                 break;
-        }
         case -EIO: {
                 /* filter always set body->oa.o_id as the last_id
                  * of filter (see filter_handle_precreate for detail)*/
@@ -274,23 +276,22 @@ static int oscc_has_objects(struct osc_creator *oscc, int count)
 static int oscc_wait_for_objects(struct osc_creator *oscc, int count)
 {
         int have_objs;
-        int ost_full;
-        int osc_invalid;
+        int ost_unusable;
 
-        osc_invalid = oscc->oscc_obd->u.cli.cl_import->imp_invalid;
+        ost_unusable = oscc->oscc_obd->u.cli.cl_import->imp_invalid;
 
         spin_lock(&oscc->oscc_lock);
-        ost_full = (oscc->oscc_flags & OSCC_FLAG_NOSPC);
+        ost_unusable |= (OSCC_FLAG_NOSPC | OSCC_FLAG_RDONLY |
+                         OSCC_FLAG_EXITING) & oscc->oscc_flags;
         have_objs = oscc_has_objects_nolock(oscc, count);
-        osc_invalid |= oscc->oscc_flags & OSCC_FLAG_EXITING;
 
-        if (!ost_full && !osc_invalid)
+        if (!ost_unusable)
                 /* they release lock himself */
                 oscc_internal_create(oscc);
         else
                 spin_unlock(&oscc->oscc_lock);
 
-        return have_objs || ost_full || osc_invalid;
+        return have_objs || ost_unusable;
 }
 
 static int oscc_precreate(struct osc_creator *oscc)
@@ -312,23 +313,15 @@ static int oscc_precreate(struct osc_creator *oscc)
         if (!oscc_has_objects(oscc, 1) || (oscc->oscc_flags & OSCC_FLAG_NOSPC))
                 rc = -ENOSPC;
 
+        if (oscc->oscc_flags & OSCC_FLAG_RDONLY)
+                rc = -EROFS;
+
         if (oscc->oscc_obd->u.cli.cl_import->imp_invalid)
                 rc = -EIO;
 
         RETURN(rc);
 }
 
-static int oscc_recovering(struct osc_creator *oscc)
-{
-        int recov;
-
-        spin_lock(&oscc->oscc_lock);
-        recov = oscc->oscc_flags & OSCC_FLAG_RECOVERING;
-        spin_unlock(&oscc->oscc_lock);
-
-        return recov;
-}
-
 static int oscc_in_sync(struct osc_creator *oscc)
 {
         int sync;
@@ -358,16 +351,24 @@ int osc_precreate(struct obd_export *exp)
                 RETURN(1000);
 
         /* until oscc in recovery - other flags is wrong */
-        if (oscc_recovering(oscc))
-                RETURN(2);
-
-        if (oscc->oscc_flags & OSCC_FLAG_NOSPC)
+        spin_lock(&oscc->oscc_lock);
+        if (oscc->oscc_flags & OSCC_FLAG_NOSPC ||
+            oscc->oscc_flags & OSCC_FLAG_RDONLY) {
+                spin_unlock(&oscc->oscc_lock);
                 RETURN(1000);
+        }
 
-        if (oscc_has_objects(oscc, oscc->oscc_grow_count / 2))
+        if (oscc->oscc_flags & OSCC_FLAG_RECOVERING ||
+            oscc->oscc_flags & OSCC_FLAG_DEGRADED) {
+                spin_unlock(&oscc->oscc_lock);
+                RETURN(2);
+        }
+
+        if (oscc_has_objects_nolock(oscc, oscc->oscc_grow_count / 2)) {
+                spin_unlock(&oscc->oscc_lock);
                 RETURN(0);
+        }
 
-        spin_lock(&oscc->oscc_lock);
         if ((oscc->oscc_flags & OSCC_FLAG_SYNC_IN_PROGRESS) ||
             (oscc->oscc_flags & OSCC_FLAG_CREATING)) {
                 spin_unlock(&oscc->oscc_lock);
@@ -412,6 +413,9 @@ static int handle_async_create(struct ptlrpc_request *req, int rc)
         if (oscc->oscc_flags & OSCC_FLAG_NOSPC)
                 GOTO(out_wake, rc = -ENOSPC);
 
+        if (oscc->oscc_flags & OSCC_FLAG_RDONLY)
+                GOTO(out_wake, rc = -EROFS);
+
         /* we not have objects now - continue wait */
         RETURN(-EAGAIN);
 
@@ -633,6 +637,10 @@ int osc_create(struct obd_export *exp, struct obdo *oa,
                         rc = -ENOSPC;
                         spin_unlock(&oscc->oscc_lock);
                         break;
+                } else if (oscc->oscc_flags & OSCC_FLAG_RDONLY) {
+                        rc = -EROFS;
+                        spin_unlock(&oscc->oscc_lock);
+                        break;
                 }
 
                 spin_unlock(&oscc->oscc_lock);
index 1476019..90c2590 100644 (file)
@@ -101,6 +101,8 @@ struct osc_cache_waiter {
 #define OSCC_FLAG_SYNC_IN_PROGRESS   0x08 /* only allow one thread to sync */
 #define OSCC_FLAG_LOW                0x10
 #define OSCC_FLAG_EXITING            0x20
+#define OSCC_FLAG_DEGRADED           0x40
+#define OSCC_FLAG_RDONLY             0x80
 
 int osc_precreate(struct obd_export *exp);
 int osc_create(struct obd_export *exp, struct obdo *oa,
index 10b27ba..e48a181 100644 (file)
@@ -3420,6 +3420,7 @@ static int osc_statfs_interpret(const struct lu_env *env,
                                 struct ptlrpc_request *req,
                                 struct osc_async_args *aa, int rc)
 {
+        struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
         struct obd_statfs *msfs;
         ENTRY;
 
@@ -3435,6 +3436,17 @@ static int osc_statfs_interpret(const struct lu_env *env,
                 GOTO(out, rc = -EPROTO);
         }
 
+        /* Reinitialize the RDONLY and DEGRADED flags at the client
+         * on each statfs, so they don't stay set permanently. */
+        spin_lock(&cli->cl_oscc.oscc_lock);
+        cli->cl_oscc.oscc_flags &= ~(OSCC_FLAG_RDONLY | OSCC_FLAG_DEGRADED);
+        if (msfs->os_state & OS_STATE_DEGRADED)
+                cli->cl_oscc.oscc_flags |= OSCC_FLAG_DEGRADED;
+        if (msfs->os_state & OS_STATE_READONLY)
+                cli->cl_oscc.oscc_flags |= OSCC_FLAG_RDONLY;
+        spin_unlock(&cli->cl_oscc.oscc_lock);
         *aa->aa_oi->oi_osfs = *msfs;
 out:
         rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc);
index d4af63b..4fcdf11 100644 (file)
@@ -1194,6 +1194,65 @@ test_27w() { # bug 10997
 }
 run_test 27w "check lfs setstripe -c -s -i options ============="
 
+test_27x() { # new files must not be placed on a degraded OST
+       [ "$OSTCOUNT" -lt "2" ] && skip "$OSTCOUNT < 2 OSTs" && return
+       DELAY=$(do_facet mds lctl get_param -n lov.*.qos_maxage | awk '{print $1 + 2}')
+       OFFSET=$(($OSTCOUNT - 1))
+       OSTIDX=0
+       local OST=$(lfs osts | awk '/'${OSTIDX}': / { print $2 }' | sed -e 's/_UUID$//')
+
+       mkdir -p $DIR/$tdir
+       $SETSTRIPE $DIR/$tdir -c 1      # 1 stripe per file
+       do_facet ost$OSTIDX lctl set_param -n obdfilter.$OST.degraded 1
+       sleep $DELAY                    # qos_maxage + 2 seconds
+       createmany -o $DIR/$tdir/$tfile $OSTCOUNT
+       for i in `seq 0 $OFFSET`; do
+               [ `$GETSTRIPE $DIR/$tdir/$tfile$i | grep -A 10 obdidx | awk '{print $1}' | grep -w "$OSTIDX"` ] &&
+               error "OST0 was degraded but new created file still use it"
+       done
+       do_facet ost$OSTIDX lctl set_param -n obdfilter.$OST.degraded 0
+}
+run_test 27x "create files while OST0 is degraded"
+
+test_27y() { # a degraded OST is still used when all other OSCs are inactive
+        [ "$OSTCOUNT" -lt "2" ] && skip "$OSTCOUNT < 2 OSTs -- skipping" && return
+        remote_mds_nodsh && skip "remote MDS with nodsh" && return
+
+        MDS_OSCS=`do_facet mds lctl dl | awk '/[oO][sS][cC].*md[ts]/ { print $4 }'`
+        DELAY=$(do_facet mds lctl get_param -n lov.*.qos_maxage | awk '{print $1 + 2}')
+        OFFSET=$(($OSTCOUNT-1))
+        OST=-1  # sentinel: the first OSC seen stays active, the rest are deactivated
+        for OSC in $MDS_OSCS; do
+                if [ $OST == -1 ]; then {
+                        OST=`osc_to_ost $OSC`
+                } else {
+                        echo $OSC "is Deactivate:"
+                        do_facet mds lctl --device  %$OSC deactivate
+                } fi
+        done
+
+        OSTIDX=$(lfs osts | grep ${OST} | awk '{print $1}' | sed -e 's/://')
+        mkdir -p $DIR/$tdir
+        $SETSTRIPE $DIR/$tdir -c 1      # 1 stripe / file
+
+        do_facet ost$OSTIDX lctl set_param -n obdfilter.$OST.degraded 1 
+        sleep $DELAY 
+        createmany -o $DIR/$tdir/$tfile $OSTCOUNT
+        do_facet ost$OSTIDX lctl set_param -n obdfilter.$OST.degraded 0 
+
+        for i in `seq 0 $OFFSET`; do
+                [ `$GETSTRIPE $DIR/$tdir/$tfile$i | grep -A 10 obdidx | awk '{print $1}'| grep -w "$OSTIDX"` ] || \
+                      error "files created on deactivated OSTs instead of degraded OST"
+        done
+        for OSC in $MDS_OSCS; do        # re-activate every OSC deactivated above
+                [ `osc_to_ost $OSC` != $OST  ] && {
+                        echo $OSC "is activate"
+                        do_facet mds lctl --device %$OSC activate
+                }
+        done
+}
+run_test 27y "create files while OST0 is degraded and the rest inactive"
+
 # createtest also checks that device nodes are created and
 # then visible correctly (#2091)
 test_28() { # bug 2091