b=24037 Changes for the 2.6.32 kernel.

[fs/lustre-release.git] / lustre / obdfilter / filter.c
diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c

index e4bed09..530762d 100644 (file)
--- a/lustre/obdfilter/filter.c
+++ b/lustre/obdfilter/filter.c
@@ -26,7 +26,7 @@
   * GPL HEADER END
   */
  /*
- * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
   * Use is subject to license terms.
   */
  /*
@@ -241,6 +241,7 @@ static int lprocfs_init_rw_stats(struct obd_device *obd,
     plus the procfs overhead :( */
  static int filter_export_stats_init(struct obd_device *obd,
                                      struct obd_export *exp,
+                                    int reconnect,
                                      void *client_nid)
  {
          int rc, newnid = 0;
@@ -250,7 +251,7 @@ static int filter_export_stats_init(struct obd_device *obd,
                  /* Self-export gets no proc entry */
                  RETURN(0);
  
-        rc = lprocfs_exp_setup(exp, client_nid, &newnid);
+        rc = lprocfs_exp_setup(exp, client_nid, reconnect, &newnid);
          if (rc) {
                  /* Mask error for already created
                   * /proc entries */
@@ -289,7 +290,6 @@ static int filter_export_stats_init(struct obd_device *obd,
  
          RETURN(0);
   clean:
-        lprocfs_exp_cleanup(exp);
          return rc;
  }
  
@@ -869,7 +869,7 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp)
                  fed = &exp->exp_filter_data;
                  *fed->fed_ted.ted_lcd = *lcd;
                  fed->fed_group = 0; /* will be assigned at connect */
-                filter_export_stats_init(obd, exp, NULL);
+                filter_export_stats_init(obd, exp, 0, NULL);
                  rc = filter_client_add(obd, exp, cl_idx);
                  /* can't fail for existing client */
                  LASSERTF(rc == 0, "rc = %d\n", rc);
@@ -880,9 +880,7 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp)
                  exp->exp_connecting = 0;
                  exp->exp_in_recovery = 0;
                  cfs_spin_unlock(&exp->exp_lock);
-                cfs_spin_lock_bh(&obd->obd_processing_task_lock);
                  obd->obd_max_recoverable_clients++;
-                cfs_spin_unlock_bh(&obd->obd_processing_task_lock);
                  class_export_put(exp);
  
                  if (last_rcvd > le64_to_cpu(lsd->lsd_last_transno))
@@ -892,8 +890,8 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp)
  
          obd->obd_last_committed = le64_to_cpu(lsd->lsd_last_transno);
  out:
-        lut->lut_mount_count = mount_count + 1;
-        lsd->lsd_mount_count = cpu_to_le64(lut->lut_mount_count);
+        obd->u.obt.obt_mount_count = mount_count + 1;
+        lsd->lsd_mount_count = cpu_to_le64(obd->u.obt.obt_mount_count);
  
          /* save it, so mount count and last_transno is current */
          rc = filter_update_server_data(obd);
@@ -1197,7 +1195,7 @@ static int filter_prep_groups(struct obd_device *obd)
          loff_t off = 0;
          ENTRY;
  
-        O_dentry = simple_mkdir(current->fs->pwd, obd->u.obt.obt_vfsmnt,
+        O_dentry = simple_mkdir(cfs_fs_pwd(current->fs), obd->u.obt.obt_vfsmnt,
                                  "O", 0700, 1);
          CDEBUG(D_INODE, "got/created O: %p\n", O_dentry);
          if (IS_ERR(O_dentry)) {
@@ -1486,8 +1484,8 @@ struct dentry *filter_fid2dentry(struct obd_device *obd,
          if (dir_dentry == NULL)
                  filter_parent_unlock(dparent);
          if (IS_ERR(dchild)) {
-                CERROR("%s: child lookup error %ld\n", obd->obd_name,
-                       PTR_ERR(dchild));
+                CERROR("%s: object "LPU64":"LPU64" lookup error: rc %ld\n",
+                       obd->obd_name, id, group, PTR_ERR(dchild));
                  RETURN(dchild);
          }
  
@@ -1563,7 +1561,7 @@ int filter_vfs_unlink(struct inode *dir, struct dentry *dentry,
                  GOTO(out, rc = -EPERM);
  
          /* check_sticky() */
-        if ((dentry->d_inode->i_uid != current->fsuid &&
+        if ((dentry->d_inode->i_uid != cfs_curproc_fsuid() &&
               !cfs_capable(CFS_CAP_FOWNER)) || IS_APPEND(dentry->d_inode) ||
              IS_IMMUTABLE(dentry->d_inode))
                  GOTO(out, rc = -EPERM);
@@ -1571,7 +1569,7 @@ int filter_vfs_unlink(struct inode *dir, struct dentry *dentry,
          /* NOTE: This might need to go outside i_mutex, though it isn't clear if
           *       that was done because of journal_start (which is already done
           *       here) or some other ordering issue. */
-        DQUOT_INIT(dir);
+        ll_vfs_dq_init(dir);
  
          rc = ll_security_inode_unlink(dir, dentry, mnt);
          if (rc)
@@ -1597,7 +1595,10 @@ static int filter_destroy_internal(struct obd_device *obd, obd_id objid,
          struct inode *inode = dchild->d_inode;
          int rc;
  
-        if (inode->i_nlink != 1 || atomic_read(&inode->i_count) != 1) {
+        /* There should be 2 references to the inode:
+         *  1) taken by filter_prepare_destroy
+         *  2) taken by filter_destroy */
+        if (inode->i_nlink != 1 || atomic_read(&inode->i_count) != 2) {
                  CERROR("destroying objid %.*s ino %lu nlink %lu count %d\n",
                         dchild->d_name.len, dchild->d_name.name, inode->i_ino,
                         (unsigned long)inode->i_nlink,
@@ -1701,7 +1702,7 @@ static int filter_intent_policy(struct ldlm_namespace *ns,
           * lock, and should not be granted if the lock will be blocked.
           */
  
-        LASSERT(ns == res->lr_namespace);
+        LASSERT(ns == ldlm_res_to_ns(res));
          lock_res(res);
          rc = policy(lock, &tmpflags, 0, &err, &rpc_list);
          check_res_locked(res);
@@ -1723,7 +1724,7 @@ static int filter_intent_policy(struct ldlm_namespace *ns,
          if (rc == LDLM_ITER_CONTINUE) {
                  /* do not grant locks to the liblustre clients: they cannot
                   * handle ASTs robustly.  We need to do this while still
-                 * holding ns_lock to avoid the lock remaining on the res_link
+                 * holding lr_lock to avoid the lock remaining on the res_link
                   * list (and potentially being added to l_pending_list by an
                   * AST) when we are going to drop this lock ASAP. */
                  if (lock->l_export->exp_libclient ||
@@ -1746,7 +1747,7 @@ static int filter_intent_policy(struct ldlm_namespace *ns,
          *reply_lvb = *res_lvb;
  
          /*
-         * ->ns_lock guarantees that no new locks are granted, and,
+         * lr_lock guarantees that no new locks are granted, and,
           * therefore, that res->lr_lvb_data cannot increase beyond the
           * end of already granted lock. As a result, it is safe to
           * check against "stale" reply_lvb->lvb_size value without
@@ -1799,13 +1800,6 @@ static int filter_intent_policy(struct ldlm_namespace *ns,
  
          LASSERTF(l->l_glimpse_ast != NULL, "l == %p", l);
          rc = l->l_glimpse_ast(l, NULL); /* this will update the LVB */
-        /* Update the LVB from disk if the AST failed (this is a legal race) */
-        /*
-         * XXX nikita: situation when ldlm_server_glimpse_ast() failed before
-         * sending ast is not handled. This can result in lost client writes.
-         */
-        if (rc != 0)
-                ldlm_res_lvbo_update(res, NULL, 1);
  
          lock_res(res);
          *reply_lvb = *res_lvb;
@@ -1954,7 +1948,7 @@ int filter_common_setup(struct obd_device *obd, struct lustre_cfg* lcfg,
          __u8 *uuid_ptr;
          char *str, *label;
          char ns_name[48];
-        request_queue_t *q;
+        struct request_queue *q;
          int rc, i;
          ENTRY;
  
@@ -2019,6 +2013,7 @@ int filter_common_setup(struct obd_device *obd, struct lustre_cfg* lcfg,
  
          obd->u.obt.obt_vfsmnt = mnt;
          obd->u.obt.obt_sb = mnt->mnt_sb;
+        obd->u.obt.obt_magic = OBT_MAGIC;
          filter->fo_fstype = mnt->mnt_sb->s_type->name;
          CDEBUG(D_SUPER, "%s: mnt = %p\n", filter->fo_fstype, mnt);
  
@@ -2040,11 +2035,14 @@ int filter_common_setup(struct obd_device *obd, struct lustre_cfg* lcfg,
          CFS_INIT_LIST_HEAD(&filter->fo_export_list);
          cfs_sema_init(&filter->fo_alloc_lock, 1);
          init_brw_stats(&filter->fo_filter_stats);
+        cfs_spin_lock_init(&filter->fo_flags_lock);
          filter->fo_read_cache = 1; /* enable read-only cache by default */
          filter->fo_writethrough_cache = 1; /* enable writethrough cache */
          filter->fo_readcache_max_filesize = FILTER_MAX_CACHE_SIZE;
          filter->fo_fmd_max_num = FILTER_FMD_MAX_NUM_DEFAULT;
          filter->fo_fmd_max_age = FILTER_FMD_MAX_AGE_DEFAULT;
+        filter->fo_syncjournal = 0; /* Don't sync journals on i/o by default */
+        filter_slc_set(filter); /* initialize sync on lock cancel */
  
          rc = filter_prep(obd);
          if (rc)
@@ -2061,8 +2059,10 @@ int filter_common_setup(struct obd_device *obd, struct lustre_cfg* lcfg,
                  GOTO(err_post, rc = -ENOMEM);
  
          sprintf(ns_name, "filter-%s", obd->obd_uuid.uuid);
-        obd->obd_namespace = ldlm_namespace_new(obd, ns_name, LDLM_NAMESPACE_SERVER,
-                                                LDLM_NAMESPACE_GREEDY);
+        obd->obd_namespace = ldlm_namespace_new(obd, ns_name,
+                                                LDLM_NAMESPACE_SERVER,
+                                                LDLM_NAMESPACE_GREEDY,
+                                                LDLM_NS_TYPE_OST);
          if (obd->obd_namespace == NULL)
                  GOTO(err_post, rc = -ENOMEM);
          obd->obd_namespace->ns_lvbp = obd;
@@ -2088,13 +2088,13 @@ int filter_common_setup(struct obd_device *obd, struct lustre_cfg* lcfg,
                  GOTO(err_post, rc);
  
          q = bdev_get_queue(mnt->mnt_sb->s_bdev);
-        if (q->max_sectors < q->max_hw_sectors &&
-            q->max_sectors < PTLRPC_MAX_BRW_SIZE >> 9)
+        if (queue_max_sectors(q) < queue_max_hw_sectors(q) &&
+            queue_max_sectors(q) < PTLRPC_MAX_BRW_SIZE >> 9)
                  LCONSOLE_INFO("%s: underlying device %s should be tuned "
                                "for larger I/O requests: max_sectors = %u "
                                "could be up to max_hw_sectors=%u\n",
                                obd->obd_name, mnt->mnt_sb->s_id,
-                              q->max_sectors, q->max_hw_sectors);
+                              queue_max_sectors(q), queue_max_hw_sectors(q));
  
          uuid_ptr = fsfilt_uuid(obd, obd->u.obt.obt_sb);
          if (uuid_ptr != NULL) {
@@ -2507,9 +2507,9 @@ static int filter_llog_connect(struct obd_export *exp,
                obd->obd_name, body->lgdc_logid.lgl_oid,
                body->lgdc_logid.lgl_oseq, body->lgdc_logid.lgl_ogen);
  
-        cfs_spin_lock_bh(&obd->obd_processing_task_lock);
+        cfs_spin_lock(&obd->u.filter.fo_flags_lock);
          obd->u.filter.fo_mds_ost_sync = 1;
-        cfs_spin_unlock_bh(&obd->obd_processing_task_lock);
+        cfs_spin_unlock(&obd->u.filter.fo_flags_lock);
          rc = llog_connect(ctxt, &body->lgdc_logid,
                            &body->lgdc_gen, NULL);
          llog_ctxt_put(ctxt);
@@ -2602,7 +2602,7 @@ static int filter_cleanup(struct obd_device *obd)
  
          filter_post(obd);
  
-        LL_DQUOT_OFF(obd->u.obt.obt_sb);
+        ll_vfs_dq_off(obd->u.obt.obt_sb, 0);
          shrink_dcache_sb(obd->u.obt.obt_sb);
  
          server_put_mount(obd->obd_name, obd->u.obt.obt_vfsmnt);
@@ -2746,7 +2746,7 @@ static int filter_reconnect(const struct lu_env *env,
  
          rc = filter_connect_internal(exp, data, 1);
          if (rc == 0)
-                filter_export_stats_init(obd, exp, localdata);
+                filter_export_stats_init(obd, exp, 1, localdata);
  
          RETURN(rc);
  }
@@ -2777,7 +2777,7 @@ static int filter_connect(const struct lu_env *env,
          if (rc)
                  GOTO(cleanup, rc);
  
-        filter_export_stats_init(obd, lexp, localdata);
+        filter_export_stats_init(obd, lexp, 0, localdata);
          if (obd->obd_replayable) {
                  struct lsd_client_data *lcd = lexp->exp_target_data.ted_lcd;
                  LASSERT(lcd);
@@ -2805,7 +2805,6 @@ static int filter_connect(const struct lu_env *env,
  cleanup:
          if (rc) {
                  class_disconnect(lexp);
-                lprocfs_exp_cleanup(lexp);
                  *exp = NULL;
          } else {
                  *exp = lexp;
@@ -3189,13 +3188,15 @@ int filter_setattr_internal(struct obd_export *exp, struct dentry *dentry,
                          *fcc = oa->o_lcookie;
          }
          if (ia_valid & (ATTR_SIZE | ATTR_UID | ATTR_GID)) {
-                DQUOT_INIT(inode);
+                unsigned long now = jiffies;
+                ll_vfs_dq_init(inode);
                  /* Filter truncates and writes are serialized by
                   * i_alloc_sem, see the comment in
                   * filter_preprw_write.*/
                  if (ia_valid & ATTR_SIZE)
                          down_write(&inode->i_alloc_sem);
                  LOCK_INODE_MUTEX(inode);
+                fsfilt_check_slow(exp->exp_obd, now, "i_alloc_sem and i_mutex");
                  old_size = i_size_read(inode);
          }
  
@@ -3273,11 +3274,14 @@ int filter_setattr_internal(struct obd_export *exp, struct dentry *dentry,
          if (OBD_FAIL_CHECK(OBD_FAIL_OST_SETATTR_CREDITS))
                  fsfilt_extend(exp->exp_obd, inode, 0, handle);
  
-        /* The truncate might have used up our transaction credits.  Make
-         * sure we have one left for the last_rcvd update. */
-        err = fsfilt_extend(exp->exp_obd, inode, 1, handle);
+        /* The truncate might have used up our transaction credits.  Make sure
+         * we have two left for the last_rcvd and VBR inode version updates. */
+        err = fsfilt_extend(exp->exp_obd, inode, 2, handle);
+
+        /* Update inode version only if data has changed => size has changed */
+        rc = filter_finish_transno(exp, ia_valid & ATTR_SIZE ? inode : NULL,
+                                   oti, rc, sync);
  
-        rc = filter_finish_transno(exp, inode, oti, rc, sync);
          if (sync) {
                  filter_cancel_cookies_cb(exp->exp_obd, 0, fcc, rc);
                  fcc = NULL;
@@ -3392,7 +3396,9 @@ int filter_setattr(struct obd_export *exp, struct obd_info *oinfo,
           */
          if (oa->o_valid &
              (OBD_MD_FLMTIME | OBD_MD_FLATIME | OBD_MD_FLCTIME)) {
+                unsigned long now = jiffies;
                  down_write(&dentry->d_inode->i_alloc_sem);
+                fsfilt_check_slow(exp->exp_obd, now, "i_alloc_sem");
                  fmd = filter_fmd_get(exp, oa->o_id, oa->o_seq);
                  if (fmd && fmd->fmd_mactime_xid < oti->oti_xid)
                          fmd->fmd_mactime_xid = oti->oti_xid;
@@ -3538,7 +3544,14 @@ static int filter_destroy_precreated(struct obd_export *exp, struct obdo *oa,
                  filter_set_last_id(filter, id, doa.o_seq);
                  rc = filter_update_last_objid(exp->exp_obd, doa.o_seq, 1);
          } else {
-                /* don't reuse orphan object, return last used objid */
+                /*
+                 * We have destroyed orphan objects, but don't want to reuse
+                 * them. Therefore we don't reset last_id to the last created
+                 * object. Instead, we report back to the MDS the object id
+                 * of the last orphan, so that the MDS can restart allocating
+                 * objects from this id + 1 and thus skip the whole orphan
+                 * object id range.
+                 */
                  oa->o_id = last;
                  rc = 0;
          }
@@ -3758,7 +3771,8 @@ static int filter_precreate(struct obd_device *obd, struct obdo *oa,
                  OBD_ALLOC(osfs, sizeof(*osfs));
                  if (osfs == NULL)
                          RETURN(-ENOMEM);
-                rc = filter_statfs(obd, osfs, cfs_time_current_64() - CFS_HZ,
+                rc = filter_statfs(obd, osfs,
+                                   cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
                                     0);
                  if (rc == 0 && osfs->os_bavail < (osfs->os_blocks >> 10)) {
                          CDEBUG(D_RPCTRACE,"%s: not enough space for create "
@@ -3798,10 +3812,15 @@ static int filter_precreate(struct obd_device *obd, struct obdo *oa,
                  } else
                          next_id = filter_last_id(filter, group) + 1;
  
-                /* Temporary solution for oid in CMD before fid-on-OST */
-                if ((fid_seq_is_mdt0(oa->o_seq) && next_id >= IDIF_MAX_OID) &&
-                    (fid_seq_is_cmd(oa->o_seq) && next_id >= OBIF_MAX_OID)) {
-                        CERROR("%s:"POSTID" hit the max IDIF_MAX_OID(1<<48)!\n",
+                /* Don't create objects beyond the valid range for this SEQ */
+                if (unlikely(fid_seq_is_mdt0(group) &&
+                            next_id >= IDIF_MAX_OID)) {
+                        CERROR("%s:"POSTID" hit the IDIF_MAX_OID (1<<48)!\n",
+                                obd->obd_name, next_id, group);
+                        GOTO(cleanup, rc = -ENOSPC);
+               } else if (unlikely(!fid_seq_is_mdt0(group) &&
+                                   next_id >= OBIF_MAX_OID)) {
+                        CERROR("%s:"POSTID" hit the OBIF_MAX_OID (1<<32)!\n",
                                  obd->obd_name, next_id, group);
                          GOTO(cleanup, rc = -ENOSPC);
                  }
@@ -3904,6 +3923,7 @@ set_last_id:
                  if (rc)
                          break;
                  if (cfs_time_after(jiffies, enough_time)) {
+                        i++;
                          CDEBUG(D_RPCTRACE,
                                 "%s: precreate slow - want %d got %d \n",
                                 obd->obd_name, *num, i);
@@ -3919,8 +3939,8 @@ set_last_id:
          RETURN(rc);
  }
  
-static int filter_create(struct obd_export *exp, struct obdo *oa,
-                         struct lov_stripe_md **ea, struct obd_trans_info *oti)
+int filter_create(struct obd_export *exp, struct obdo *oa,
+                  struct lov_stripe_md **ea, struct obd_trans_info *oti)
  {
          struct obd_device *obd = exp->exp_obd;
          struct filter_export_data *fed;
@@ -3959,7 +3979,8 @@ static int filter_create(struct obd_export *exp, struct obdo *oa,
  
          if ((oa->o_valid & OBD_MD_FLFLAGS) &&
              (oa->o_flags & OBD_FL_RECREATE_OBJS)) {
-                if (oa->o_id > filter_last_id(filter, oa->o_seq)) {
+                if (!obd->obd_recovering ||
+                    oa->o_id > filter_last_id(filter, oa->o_seq)) {
                          CERROR("recreate objid "LPU64" > last id "LPU64"\n",
                                 oa->o_id, filter_last_id(filter, oa->o_seq));
                          rc = -EINVAL;
@@ -4002,6 +4023,7 @@ int filter_destroy(struct obd_export *exp, struct obdo *oa,
          struct llog_cookie *fcc = NULL;
          int rc, rc2, cleanup_phase = 0, sync = 0;
          struct iattr iattr;
+        unsigned long now;
          ENTRY;
  
          rc = filter_auth_capa(exp, NULL, oa->o_seq,
@@ -4056,7 +4078,7 @@ int filter_destroy(struct obd_export *exp, struct obdo *oa,
                  if (fcc != NULL)
                          *fcc = oa->o_lcookie;
          }
-        DQUOT_INIT(dchild->d_inode);
+        ll_vfs_dq_init(dchild->d_inode);
  
          /* we're gonna truncate it first in order to avoid possible deadlock:
           *      P1                      P2
@@ -4070,8 +4092,10 @@ int filter_destroy(struct obd_export *exp, struct obdo *oa,
           * between page lock, i_mutex & starting new journal handle.
           * (see bug 20321) -johann
           */
+        now = jiffies;
          down_write(&dchild->d_inode->i_alloc_sem);
          LOCK_INODE_MUTEX(dchild->d_inode);
+        fsfilt_check_slow(exp->exp_obd, now, "i_alloc_sem and i_mutex");
  
          /* VBR: version recovery check */
          rc = filter_version_get_check(exp, oti, dchild->d_inode);
@@ -4335,6 +4359,12 @@ static int filter_get_info(struct obd_export *exp, __u32 keylen,
                  RETURN(rc);
          }
  
+        if (KEY_IS(KEY_SYNC_LOCK_CANCEL)) {
+                *((__u32 *) val) = obd->u.filter.fo_sync_lock_cancel;
+                *vallen = sizeof(__u32);
+                RETURN(0);
+        }
+
          CDEBUG(D_IOCTL, "invalid key\n");
          RETURN(-EINVAL);
  }