*/
struct job_stat {
- struct hlist_node js_hash;
- struct list_head js_list;
- atomic_t js_refcount;
- char js_jobid[LUSTRE_JOBID_SIZE];
- time_t js_timestamp; /* seconds */
- struct lprocfs_stats *js_stats;
- struct obd_job_stats *js_jobstats;
+ struct hlist_node js_hash; /* hash struct for this jobid */
+ struct list_head js_list; /* on ojs_list, with ojs_lock */
+ atomic_t js_refcount; /* num users of this struct */
+ char js_jobid[LUSTRE_JOBID_SIZE]; /* job name */
+ time_t js_timestamp; /* seconds of most recent stat */
+ struct lprocfs_stats *js_stats; /* per-job statistics */
+ struct obd_job_stats *js_jobstats; /* for accessing ojs_lock */
};
static unsigned
static void job_free(struct job_stat *job)
{
LASSERT(atomic_read(&job->js_refcount) == 0);
- LASSERT(job->js_jobstats);
+ LASSERT(job->js_jobstats != NULL);
write_lock(&job->js_jobstats->ojs_lock);
list_del_init(&job->js_list);
.hs_exit = job_stat_exit,
};
-static int job_iter_callback(struct cfs_hash *hs, struct cfs_hash_bd *bd,
- struct hlist_node *hnode, void *data)
+/**
+ * Jobstats expiry iterator to clean up old jobids
+ *
+ * Called for each job_stat structure on this device, it should delete stats
+ * older than the specified \a oldest_time in seconds. If \a oldest_time is
+ * in the future then this will delete all statistics (e.g. during shutdown).
+ *
+ * \param[in] hs hash of all jobids on this device
+ * \param[in] bd hash bucket containing this jobid
+ * \param[in] hnode hash structure for this jobid
+ * \param[in] data pointer to stats expiry time in seconds
+ */
+static int job_cleanup_iter_callback(struct cfs_hash *hs,
+ struct cfs_hash_bd *bd,
+ struct hlist_node *hnode, void *data)
{
- time_t oldest = *((time_t *)data);
+ time_t oldest_time = *((time_t *)data);
struct job_stat *job;
job = hlist_entry(hnode, struct job_stat, js_hash);
- if (!oldest || job->js_timestamp < oldest)
+ if (job->js_timestamp < oldest_time)
cfs_hash_bd_del_locked(hs, bd, hnode);
return 0;
}
-static void lprocfs_job_cleanup(struct obd_job_stats *stats, bool force)
+/**
+ * Clean up jobstats that were updated more than \a before seconds ago.
+ *
+ * Since this function may be called frequently, do not scan all of the
+ * jobstats on each call, only twice per cleanup interval. That means stats
+ * may be around on average cleanup_interval / 4 longer than necessary,
+ * but that is not considered harmful.
+ *
+ * If \a before is negative then this will force clean up all jobstats due
+ * to the expiry time being in the future (e.g. at shutdown).
+ *
+ * If there is already another thread doing jobstats cleanup, don't try to
+ * do this again in the current thread unless this is a force cleanup.
+ *
+ * \param[in] stats structure tracking all job stats for this device
+ * \param[in] before expire jobstats updated more than this many seconds ago
+ */
+static void lprocfs_job_cleanup(struct obd_job_stats *stats, int before)
{
- time_t oldest, now;
+ time_t now = cfs_time_current_sec();
+ time_t oldest;
- if (stats->ojs_cleanup_interval == 0)
- return;
+ if (likely(before >= 0)) {
+ unsigned int cleanup_interval = stats->ojs_cleanup_interval;
+
+ if (cleanup_interval == 0 || before == 0)
+ return;
- now = cfs_time_current_sec();
- if (!force && now < stats->ojs_last_cleanup +
- stats->ojs_cleanup_interval)
+ if (now < stats->ojs_last_cleanup + cleanup_interval / 2)
+ return;
+
+ if (stats->ojs_cleaning)
+ return;
+ }
+
+ write_lock(&stats->ojs_lock);
+ if (before >= 0 && stats->ojs_cleaning) {
+ write_unlock(&stats->ojs_lock);
return;
+ }
- oldest = now - stats->ojs_cleanup_interval;
- cfs_hash_for_each_safe(stats->ojs_hash, job_iter_callback,
+ stats->ojs_cleaning = true;
+ write_unlock(&stats->ojs_lock);
+
+ /* Can't hold ojs_lock over hash iteration, since it is grabbed by
+ * job_cleanup_iter_callback()
+ * ->cfs_hash_bd_del_locked()
+ * ->job_putref()
+ * ->job_free()
+ *
+ * Holding ojs_lock isn't necessary for safety of the hash iteration,
+ * since locking of the hash is handled internally, but there isn't
+ * any benefit to having multiple threads doing cleanup at one time.
+ */
+ oldest = now - before;
+ cfs_hash_for_each_safe(stats->ojs_hash, job_cleanup_iter_callback,
&oldest);
+
+ write_lock(&stats->ojs_lock);
+ stats->ojs_cleaning = false;
stats->ojs_last_cleanup = cfs_time_current_sec();
+ write_unlock(&stats->ojs_lock);
}
static struct job_stat *job_alloc(char *jobid, struct obd_job_stats *jobs)
{
struct job_stat *job;
- LASSERT(jobs->ojs_cntr_num && jobs->ojs_cntr_init_fn);
-
OBD_ALLOC_PTR(job);
if (job == NULL)
return NULL;
struct job_stat *job, *job2;
ENTRY;
- LASSERT(stats && stats->ojs_hash);
+ LASSERT(stats != NULL);
+ LASSERT(stats->ojs_hash != NULL);
- lprocfs_job_cleanup(stats, false);
+ if (event >= stats->ojs_cntr_num)
+ RETURN(-EINVAL);
- if (!jobid || !strlen(jobid))
+ if (jobid == NULL || strlen(jobid) == 0)
RETURN(-EINVAL);
if (strlen(jobid) >= LUSTRE_JOBID_SIZE) {
if (job)
goto found;
+ lprocfs_job_cleanup(stats, stats->ojs_cleanup_interval);
+
job = job_alloc(jobid, stats);
if (job == NULL)
RETURN(-ENOMEM);
found:
LASSERT(stats == job->js_jobstats);
- LASSERT(stats->ojs_cntr_num > event);
job->js_timestamp = cfs_time_current_sec();
lprocfs_counter_add(job->js_stats, event, amount);
job_putref(job);
+
RETURN(0);
}
EXPORT_SYMBOL(lprocfs_job_stats_log);
void lprocfs_job_stats_fini(struct obd_device *obd)
{
struct obd_job_stats *stats = &obd->u.obt.obt_jobstats;
- time_t oldest = 0;
if (stats->ojs_hash == NULL)
return;
- cfs_hash_for_each_safe(stats->ojs_hash, job_iter_callback, &oldest);
+
+ lprocfs_job_cleanup(stats, -99);
cfs_hash_putref(stats->ojs_hash);
stats->ojs_hash = NULL;
LASSERT(list_empty(&stats->ojs_list));
* Example of output on MDT:
*
* job_stats:
- * - job_id: test_id.222.25844
+ * - job_id: dd.4854
* snapshot_time: 1322494486
- * open: { samples: 3, unit: reqs }
- * close: { samples: 3, unit: reqs }
+ * open: { samples: 1, unit: reqs }
+ * close: { samples: 1, unit: reqs }
* mknod: { samples: 0, unit: reqs }
* link: { samples: 0, unit: reqs }
* unlink: { samples: 0, unit: reqs }
* mkdir: { samples: 0, unit: reqs }
* rmdir: { samples: 0, unit: reqs }
- * rename: { samples: 1, unit: reqs }
- * getattr: { samples: 7, unit: reqs }
+ * rename: { samples: 0, unit: reqs }
+ * getattr: { samples: 1, unit: reqs }
* setattr: { samples: 0, unit: reqs }
* getxattr: { samples: 0, unit: reqs }
* setxattr: { samples: 0, unit: reqs }
* Example of output on OST:
*
* job_stats:
- * - job_id 4854
+ * - job_id dd.4854
* snapshot_time: 1322494602
- * read: { samples: 0, unit: bytes, min: 0, max: 0, sum: 0 }
- * write: { samples: 1, unit: bytes, min: 10, max: 10, sum: 10 }
- * setattr: { samples: 0, unit: reqs }
- * punch: { samples: 0, unit: reqs }
- * sync: { samples: 0, unit: reqs }
+ * read: { samples: 0, unit: bytes, min: 0, max: 0, sum: 0 }
+ * write: { samples: 1, unit: bytes, min: 4096, max: 4096, sum: 4096 }
+ * setattr: { samples: 0, unit: reqs }
+ * punch: { samples: 0, unit: reqs }
+ * sync: { samples: 0, unit: reqs }
*/
static const char spaces[] = " ";
struct seq_file *seq = file->private_data;
struct obd_job_stats *stats = seq->private;
char jobid[LUSTRE_JOBID_SIZE];
- int all = 0;
struct job_stat *job;
if (len == 0 || len >= LUSTRE_JOBID_SIZE)
return -EINVAL;
+ if (stats->ojs_hash == NULL)
+ return -ENODEV;
+
if (copy_from_user(jobid, buf, len))
return -EFAULT;
jobid[len] = 0;
if (jobid[len - 1] == '\n')
jobid[len - 1] = 0;
- if (strcmp(jobid, "clear") == 0)
- all = 1;
+ if (strcmp(jobid, "clear") == 0) {
+ lprocfs_job_cleanup(stats, -99);
- LASSERT(stats->ojs_hash);
- if (all) {
- time_t oldest = 0;
- cfs_hash_for_each_safe(stats->ojs_hash, job_iter_callback,
- &oldest);
return len;
}
- if (!strlen(jobid))
+ if (strlen(jobid) == 0)
return -EINVAL;
job = cfs_hash_lookup(stats->ojs_hash, jobid);
return len;
}
+/**
+ * Clean up the seq file state when the /proc file is closed.
+ *
+ * This also expires old job stats from the cache after they have been
+ * printed in case the system is idle and not generating new jobstats.
+ *
+ * \param[in] inode struct inode for seq file being closed
+ * \param[in] file struct file for seq file being closed
+ *
+ * \retval 0 on success
+ * \retval negative errno on failure
+ */
+static int lprocfs_jobstats_seq_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct obd_job_stats *stats = seq->private;
+
+ lprocfs_job_cleanup(stats, stats->ojs_cleanup_interval);
+
+ return lprocfs_seq_release(inode, file);
+}
+
static const struct file_operations lprocfs_jobstats_seq_fops = {
.owner = THIS_MODULE,
.open = lprocfs_jobstats_seq_open,
.read = seq_read,
.write = lprocfs_jobstats_seq_write,
.llseek = seq_lseek,
- .release = lprocfs_seq_release,
+ .release = lprocfs_jobstats_seq_release,
};
int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num,
LASSERT(obd->obd_proc_entry != NULL);
LASSERT(obd->obd_type->typ_name);
- if (strcmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME) &&
- strcmp(obd->obd_type->typ_name, LUSTRE_OST_NAME)) {
- CERROR("Invalid obd device type.\n");
+ if (cntr_num <= 0)
+ RETURN(-EINVAL);
+
+ if (init_fn == NULL)
+ RETURN(-EINVAL);
+
+ /* Currently needs to be a target due to the use of obt_jobstats. */
+ if (strcmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME) != 0 &&
+ strcmp(obd->obd_type->typ_name, LUSTRE_OST_NAME) != 0) {
+ CERROR("%s: invalid device type %s for job stats: rc = %d\n",
+ obd->obd_name, obd->obd_type->typ_name, -EINVAL);
RETURN(-EINVAL);
}
stats = &obd->u.obt.obt_jobstats;
struct obd_device *obd = m->private;
struct obd_job_stats *stats;
- LASSERT(obd != NULL);
+ if (obd == NULL)
+ return -ENODEV;
+
stats = &obd->u.obt.obt_jobstats;
return seq_printf(m, "%d\n", stats->ojs_cleanup_interval);
}
lprocfs_job_interval_seq_write(struct file *file, const char *buffer,
size_t count, loff_t *off)
{
- struct obd_device *obd = ((struct seq_file *)file->private_data)->private;
+ struct obd_device *obd;
struct obd_job_stats *stats;
int val, rc;
- LASSERT(obd != NULL);
+ obd = ((struct seq_file *)file->private_data)->private;
+ if (obd == NULL)
+ return -ENODEV;
+
stats = &obd->u.obt.obt_jobstats;
rc = lprocfs_write_helper(buffer, count, &val);
return rc;
stats->ojs_cleanup_interval = val;
- lprocfs_job_cleanup(stats, true);
+ lprocfs_job_cleanup(stats, stats->ojs_cleanup_interval);
return count;
}
EXPORT_SYMBOL(lprocfs_job_interval_seq_write);
fi
verify_jobstats() {
- local cmd=$1
- local target=$2
-
- # clear old jobstats
- do_facet $SINGLEMDS lctl set_param mdt.*.job_stats="clear"
- do_facet ost1 lctl set_param obdfilter.*.job_stats="clear"
-
- # use a new JobID for this test, or we might see an old one
- [ "$JOBENV" = "FAKE_JOBID" ] && FAKE_JOBID=test_id.$testnum.$RANDOM
+ local cmd=($1)
+ shift
+ local facets="$@"
+
+# we don't really need to clear the stats for this test to work, since each
+# command has a unique jobid, but it makes debugging easier if needed.
+# for facet in $facets; do
+# local dev=$(convert_facet2label $facet)
+# # clear old jobstats
+# do_facet $facet lctl set_param *.$dev.job_stats="clear"
+# done
+
+ # use a new JobID for each test, or we might see an old one
+ [ "$JOBENV" = "FAKE_JOBID" ] &&
+ FAKE_JOBID=id.$testnum.$(basename ${cmd[0]}).$RANDOM
JOBVAL=${!JOBENV}
- log "Test: $cmd"
+ log "Test: ${cmd[*]}"
log "Using JobID environment variable $JOBENV=$JOBVAL"
if [ $JOBENV = "FAKE_JOBID" ]; then
- FAKE_JOBID=$JOBVAL $cmd
+ FAKE_JOBID=$JOBVAL ${cmd[*]}
else
- $cmd
+ ${cmd[*]}
fi
- if [ "$target" = "mdt" -o "$target" = "both" ]; then
- FACET="$SINGLEMDS" # will need to get MDS number for DNE
- do_facet $FACET lctl get_param mdt.*.job_stats |
- grep $JOBVAL || error "No job stats found on MDT $FACET"
- fi
- if [ "$target" = "ost" -o "$target" = "both" ]; then
- FACET=ost1
- do_facet $FACET lctl get_param obdfilter.*.job_stats |
- grep $JOBVAL || error "No job stats found on OST $FACET"
- fi
+ # all files are created on OST0000
+ for facet in $facets; do
+ local stats="*.$(convert_facet2label $facet).job_stats"
+ if [ $(do_facet $facet lctl get_param $stats |
+ grep -c $JOBVAL) -ne 1 ]; then
+ do_facet $facet lctl get_param $stats
+ error "No jobstats for $JOBVAL found on $facet::$stats"
+ fi
+ done
}
jobstats_set() {
wait_update $HOSTNAME "$LCTL get_param -n jobid_var" $NEW_JOBENV
}
+cleanup_205() {
+ do_facet $SINGLEMDS \
+ $LCTL set_param mdt.*.job_cleanup_interval=$OLD_INTERVAL
+ [ $OLD_JOBENV != $JOBENV ] && jobstats_set $OLD_JOBENV
+ do_facet $SINGLEMDS lctl --device $MDT0 changelog_deregister $CL_USER
+}
+
test_205() { # Job stats
[ $PARALLEL == "yes" ] && skip "skip parallel run" && return
remote_mgs_nodsh && skip "remote MGS with nodsh" && return
skip "Server doesn't support jobstats" && return 0
[[ $JOBID_VAR = disable ]] && skip "jobstats is disabled" && return
- local cmd
OLD_JOBENV=$($LCTL get_param -n jobid_var)
if [ $OLD_JOBENV != $JOBENV ]; then
jobstats_set $JOBENV
- trap jobstats_set EXIT
+ trap cleanup_205 EXIT
fi
- local user=$(do_facet $SINGLEMDS $LCTL --device $MDT0 \
- changelog_register -n)
- echo "Registered as changelog user $user"
+ CL_USER=$(do_facet $SINGLEMDS lctl --device $MDT0 changelog_register -n)
+ echo "Registered as changelog user $CL_USER"
+
+ OLD_INTERVAL=$(do_facet $SINGLEMDS \
+ lctl get_param -n mdt.*.job_cleanup_interval)
+ local interval_new=5
+ do_facet $SINGLEMDS \
+ $LCTL set_param mdt.*.job_cleanup_interval=$interval_new
+ local start=$SECONDS
+ local cmd
# mkdir
- cmd="mkdir $DIR/$tfile"
- verify_jobstats "$cmd" "mdt"
+ cmd="mkdir $DIR/$tdir"
+ verify_jobstats "$cmd" "$SINGLEMDS"
# rmdir
- cmd="rm -fr $DIR/$tfile"
- verify_jobstats "$cmd" "mdt"
+ cmd="rmdir $DIR/$tdir"
+ verify_jobstats "$cmd" "$SINGLEMDS"
+ # mkdir on secondary MDT
+ if [ $MDSCOUNT -gt 1 ]; then
+ cmd="lfs mkdir -i 1 $DIR/$tdir.remote"
+ verify_jobstats "$cmd" "mds2"
+ fi
# mknod
cmd="mknod $DIR/$tfile c 1 3"
- verify_jobstats "$cmd" "mdt"
+ verify_jobstats "$cmd" "$SINGLEMDS"
# unlink
cmd="rm -f $DIR/$tfile"
- verify_jobstats "$cmd" "mdt"
+ verify_jobstats "$cmd" "$SINGLEMDS"
+ # create all files on OST0000 so verify_jobstats can find OST stats
# open & close
cmd="$SETSTRIPE -i 0 -c 1 $DIR/$tfile"
- verify_jobstats "$cmd" "mdt"
+ verify_jobstats "$cmd" "$SINGLEMDS"
# setattr
cmd="touch $DIR/$tfile"
- verify_jobstats "$cmd" "both"
+ verify_jobstats "$cmd" "$SINGLEMDS ost1"
# write
cmd="dd if=/dev/zero of=$DIR/$tfile bs=1M count=1 oflag=sync"
- verify_jobstats "$cmd" "ost"
+ verify_jobstats "$cmd" "ost1"
# read
cmd="dd if=$DIR/$tfile of=/dev/null bs=1M count=1 iflag=direct"
- verify_jobstats "$cmd" "ost"
+ verify_jobstats "$cmd" "ost1"
# truncate
cmd="$TRUNCATE $DIR/$tfile 0"
- verify_jobstats "$cmd" "both"
+ verify_jobstats "$cmd" "$SINGLEMDS ost1"
# rename
- cmd="mv -f $DIR/$tfile $DIR/jobstats_test_rename"
- verify_jobstats "$cmd" "mdt"
+ cmd="mv -f $DIR/$tfile $DIR/$tdir.rename"
+ verify_jobstats "$cmd" "$SINGLEMDS"
+ # jobstats expiry - sleep until old stats should be expired
+ local left=$((interval_new - (SECONDS - start)))
+ [ $left -ge 0 ] && echo "sleep $left for expiry" && sleep $((left + 1))
+ cmd="mkdir $DIR/$tdir.expire"
+ verify_jobstats "$cmd" "$SINGLEMDS"
+ [ $(do_facet $SINGLEMDS lctl get_param *.*.job_stats |
+ grep -c "job_id.*mkdir") -gt 1 ] && error "old jobstats not expired"
# Ensure that jobid are present in changelog (if supported by MDS)
if [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.6.52) ]
error "Unexpected jobids when jobid_var=$JOBENV"
fi
- # cleanup
- rm -f $DIR/jobstats_test_rename
-
- [ $OLD_JOBENV != $JOBENV ] && jobstats_set $OLD_JOBENV
+ cleanup_205
}
run_test 205 "Verify job stats"