#include <linux/mount.h>
#include <linux/buffer_head.h>
+#include <obd_cksum.h>
#include <obd_class.h>
#include <obd_lov.h>
#include <lustre_dlm.h>
spin_lock_init(&brw_stats->hist[i].oh_lock);
}
+/* Allocate and initialise an lprocfs stats block for an OST client:
+ * one counter slot per obd_ops (dt_ops) method plus the filter-private
+ * counters up to LPROC_FILTER_LAST, with read/write byte counters
+ * tracking avg/min/max.  On success the caller owns *stats and is
+ * responsible for freeing it (presumably via lprocfs_free_stats --
+ * confirm against the matching cleanup path).
+ *
+ * Returns 0 on success, -ENOMEM if the stats block cannot be allocated. */
+static int lprocfs_init_rw_stats(struct obd_device *obd,
+ struct lprocfs_stats **stats)
+{
+ int num_stats;
+
+ /* one slot per dt_ops method pointer + filter-specific counters */
+ num_stats = (sizeof(*obd->obd_type->typ_dt_ops) / sizeof(void *)) +
+ LPROC_FILTER_LAST - 1;
+ *stats = lprocfs_alloc_stats(num_stats, 0);
+ if (*stats == NULL)
+ return -ENOMEM;
+
+ lprocfs_init_ops_stats(LPROC_FILTER_LAST, *stats);
+ lprocfs_counter_init(*stats, LPROC_FILTER_READ_BYTES,
+ LPROCFS_CNTR_AVGMINMAX, "read_bytes", "bytes");
+ lprocfs_counter_init(*stats, LPROC_FILTER_WRITE_BYTES,
+ LPROCFS_CNTR_AVGMINMAX, "write_bytes", "bytes");
+
+ return(0);
+}
+
/* brw_stats are 2128, ops are 3916, ldlm are 204, so 6248 bytes per client,
plus the procfs overhead :( */
+/* Set up per-export statistics.  Statistics are now kept per client NID
+ * (exports from the same NID share a struct nid_stat), so the brw_stats
+ * and "stats" proc entries are only created when lprocfs_exp_setup()
+ * reports a NID not seen before (newnid != 0).  client_nid is passed
+ * through to lprocfs_exp_setup(); callers in this hunk pass NULL or the
+ * connect-time localdata. */
static int filter_export_stats_init(struct obd_device *obd,
- struct obd_export *exp)
+ struct obd_export *exp,
+ void *client_nid)
{
struct filter_export_data *fed = &exp->exp_filter_data;
- struct proc_dir_entry *brw_entry;
- int rc, num_stats;
+ int rc, newnid = 0;
ENTRY;
init_brw_stats(&fed->fed_brw_stats);
/* Self-export gets no proc entry */
+ /* NOTE(review): the guard condition for the self-export case appears
+ * elided from this hunk -- the RETURN(0) below is presumably inside an
+ * obd_uuid_equals()-style check in the full source; confirm there. */
RETURN(0);
- rc = lprocfs_exp_setup(exp);
+ rc = lprocfs_exp_setup(exp, client_nid, &newnid);
if (rc)
RETURN(rc);
- /* Create a per export proc entry for brw_stats */
- brw_entry = create_proc_entry("brw_stats", 0644, exp->exp_proc);
- if (brw_entry == NULL)
- RETURN(-ENOMEM);
- brw_entry->proc_fops = &filter_per_export_stats_fops;
- brw_entry->data = fed;
+ /* First export from this NID: create the shared per-NID stat objects. */
+ if (newnid) {
+ struct nid_stat *tmp = exp->exp_nid_stats;
+ LASSERT(tmp != NULL);
+
+ OBD_ALLOC(tmp->nid_brw_stats, sizeof(struct brw_stats));
+ if (tmp->nid_brw_stats == NULL)
+ RETURN(-ENOMEM);
+
+ init_brw_stats(tmp->nid_brw_stats);
+ /* the brw_stats proc file is best-effort: warn and carry on */
+ rc = lprocfs_seq_create(exp->exp_nid_stats->nid_proc, "brw_stats",
+ 0644, &filter_per_nid_stats_fops,
+ exp->exp_nid_stats);
+ if (rc)
+ CWARN("Error adding the brw_stats file\n");
+
+ /* NOTE(review): the error returns below do not free nid_brw_stats
+ * here -- presumably it is reclaimed together with the nid_stat on
+ * export teardown; confirm against the cleanup path. */
+ rc = lprocfs_init_rw_stats(obd, &exp->exp_nid_stats->nid_stats);
+ if (rc)
+ RETURN(rc);
+
+ rc = lprocfs_register_stats(tmp->nid_proc, "stats",
+ tmp->nid_stats);
+ if (rc)
+ RETURN(rc);
+ }
- /* Create a per export proc entry for ops stats */
- num_stats = (sizeof(*obd->obd_type->typ_dt_ops) / sizeof(void *)) +
- LPROC_FILTER_LAST - 1;
- exp->exp_ops_stats = lprocfs_alloc_stats(num_stats,
- LPROCFS_STATS_FLAG_NOPERCPU);
- if (exp->exp_ops_stats == NULL)
- RETURN(-ENOMEM);
- lprocfs_init_ops_stats(LPROC_FILTER_LAST, exp->exp_ops_stats);
- lprocfs_counter_init(exp->exp_ops_stats, LPROC_FILTER_READ_BYTES,
- LPROCFS_CNTR_AVGMINMAX, "read_bytes", "bytes");
- lprocfs_counter_init(exp->exp_ops_stats, LPROC_FILTER_WRITE_BYTES,
- LPROCFS_CNTR_AVGMINMAX, "write_bytes", "bytes");
- lprocfs_register_stats(exp->exp_proc, "stats", exp->exp_ops_stats);
RETURN(0);
}
static int filter_init_export(struct obd_export *exp)
{
spin_lock_init(&exp->exp_filter_data.fed_lock);
- INIT_LIST_HEAD(&exp->exp_filter_data.fed_mod_list);
-
+ CFS_INIT_LIST_HEAD(&exp->exp_filter_data.fed_mod_list);
+
spin_lock(&exp->exp_lock);
exp->exp_connecting = 1;
spin_unlock(&exp->exp_lock);
fed = &exp->exp_filter_data;
fed->fed_fcd = fcd;
fed->fed_group = le32_to_cpu(fcd->fcd_group);
- filter_export_stats_init(obd, exp);
+ filter_export_stats_init(obd, exp, NULL);
rc = filter_client_add(obd, exp, cl_idx);
/* can't fail for existing client */
LASSERTF(rc == 0, "rc = %d\n", rc);
return(rc);
}
+/* State threaded through filter_intent_cb() while walking the granted
+ * extent-lock interval trees during glimpse (intent) processing. */
+struct filter_intent_args {
+ struct ldlm_lock **victim; /* out: lock chosen to receive the glimpse
+ * AST; a reference is taken on it */
+ __u64 size; /* currently known file size (lvb_size) */
+ int *liblustre; /* in/out: zeroed once a non-liblustre
+ * lock holder is found */
+};
+
+/* Interval-tree iteration callback: select the granted lock ("victim")
+ * that should be sent a glimpse AST to learn the up-to-date file size.
+ * Among locks whose extents reach past the known size, the one with the
+ * highest extent start is kept (with an LDLM_LOCK_GET reference held;
+ * the previous candidate's reference is dropped when replaced). */
+static enum interval_iter filter_intent_cb(struct interval_node *n,
+ void *args)
+{
+ struct ldlm_interval *node = (struct ldlm_interval *)n;
+ struct filter_intent_args *arg = (struct filter_intent_args*)args;
+ __u64 size = arg->size;
+ struct ldlm_lock **v = arg->victim;
+ struct ldlm_lock *lck;
+
+ /* Intervals that do not extend past the known file size need no
+ * glimpse; the caller iterates with interval_iterate_reverse()
+ * (highest intervals first), so the whole walk can stop here. */
+ if (interval_high(n) <= size)
+ return INTERVAL_ITER_STOP;
+
+ list_for_each_entry(lck, &node->li_group, l_sl_policy) {
+ /* Don't send glimpse ASTs to liblustre clients.
+ * They aren't listening for them, and they do
+ * entirely synchronous I/O anyways. */
+ if (lck->l_export == NULL ||
+ lck->l_export->exp_libclient == 1)
+ continue;
+
+ /* found a real client: clear the "only liblustre" flag */
+ if (*arg->liblustre)
+ *arg->liblustre = 0;
+
+ /* keep the candidate whose extent starts highest */
+ if (*v == NULL) {
+ *v = LDLM_LOCK_GET(lck);
+ } else if ((*v)->l_policy_data.l_extent.start <
+ lck->l_policy_data.l_extent.start) {
+ LDLM_LOCK_PUT(*v);
+ *v = LDLM_LOCK_GET(lck);
+ }
+
+ /* the same policy group - every lock has the
+ * same extent, so needn't do it any more */
+ break;
+ }
+
+ return INTERVAL_ITER_CONT;
+}
+
static int filter_intent_policy(struct ldlm_namespace *ns,
struct ldlm_lock **lockp, void *req_cookie,
ldlm_mode_t mode, int flags, void *data)
{
- struct list_head rpc_list = LIST_HEAD_INIT(rpc_list);
+ CFS_LIST_HEAD(rpc_list);
struct ptlrpc_request *req = req_cookie;
struct ldlm_lock *lock = *lockp, *l = NULL;
struct ldlm_resource *res = lock->l_resource;
ldlm_processing_policy policy;
struct ost_lvb *res_lvb, *reply_lvb;
struct ldlm_reply *rep;
- struct list_head *tmp;
ldlm_error_t err;
- int rc, tmpflags = 0, only_liblustre = 0;
+ int idx, rc, tmpflags = 0, only_liblustre = 1;
+ struct ldlm_interval_tree *tree;
+ struct filter_intent_args arg;
int repsize[3] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
[DLM_LOCKREPLY_OFF] = sizeof(*rep),
[DLM_REPLY_REC_OFF] = sizeof(*reply_lvb) };
/* If we grant any lock at all, it will be a whole-file read lock.
* Call the extent policy function to see if our request can be
- * granted, or is blocked. */
+ * granted, or is blocked.
+ * If the OST lock has LDLM_FL_HAS_INTENT set, it means a glimpse lock
+ */
lock->l_policy_data.l_extent.start = 0;
lock->l_policy_data.l_extent.end = OBD_OBJECT_EOF;
lock->l_req_mode = LCK_PR;
LASSERT(res_lvb != NULL);
*reply_lvb = *res_lvb;
- list_for_each(tmp, &res->lr_granted) {
- struct ldlm_lock *tmplock =
- list_entry(tmp, struct ldlm_lock, l_res_link);
-
- if (tmplock->l_granted_mode == LCK_PR)
- continue;
- /*
- * ->ns_lock guarantees that no new locks are granted, and,
- * therefore, that res->lr_lvb_data cannot increase beyond the
- * end of already granted lock. As a result, it is safe to
- * check against "stale" reply_lvb->lvb_size value without
- * res->lr_lvb_sem.
- */
- if (tmplock->l_policy_data.l_extent.end <= reply_lvb->lvb_size)
- continue;
-
- /* Don't send glimpse ASTs to liblustre clients. They aren't
- * listening for them, and they do entirely synchronous I/O
- * anyways. */
- if (tmplock->l_export == NULL ||
- tmplock->l_export->exp_libclient == 1) {
- only_liblustre = 1;
- continue;
- }
-
- if (l == NULL) {
- l = LDLM_LOCK_GET(tmplock);
- continue;
- }
-
- if (l->l_policy_data.l_extent.start >
- tmplock->l_policy_data.l_extent.start)
+ /*
+ * ->ns_lock guarantees that no new locks are granted, and,
+ * therefore, that res->lr_lvb_data cannot increase beyond the
+ * end of already granted lock. As a result, it is safe to
+ * check against "stale" reply_lvb->lvb_size value without
+ * res->lr_lvb_sem.
+ */
+ arg.size = reply_lvb->lvb_size;
+ arg.victim = &l;
+ arg.liblustre = &only_liblustre;
+ for (idx = 0; idx < LCK_MODE_NUM; idx++) {
+ tree = &res->lr_itree[idx];
+ if (tree->lit_mode == LCK_PR)
continue;
- LDLM_LOCK_PUT(l);
- l = LDLM_LOCK_GET(tmplock);
+ interval_iterate_reverse(tree->lit_root,
+ filter_intent_cb, &arg);
}
unlock_res(res);
filter->fo_fmd_max_num = FILTER_FMD_MAX_NUM_DEFAULT;
filter->fo_fmd_max_age = FILTER_FMD_MAX_AGE_DEFAULT;
- INIT_LIST_HEAD(&filter->fo_llog_list);
+ CFS_INIT_LIST_HEAD(&filter->fo_llog_list);
spin_lock_init(&filter->fo_llog_list_lock);
filter->fo_sptlrpc_lock = RW_LOCK_UNLOCKED;
sptlrpc_rule_set_init(&filter->fo_sptlrpc_rset);
filter->fo_fl_oss_capa = 0;
- INIT_LIST_HEAD(&filter->fo_capa_keys);
+ CFS_INIT_LIST_HEAD(&filter->fo_capa_keys);
filter->fo_capa_hash = init_capa_hash();
if (filter->fo_capa_hash == NULL)
GOTO(err_ops, rc = -ENOMEM);
"write_bytes", "bytes");
lproc_filter_attach_seqstat(obd);
- obd->obd_proc_exports = proc_mkdir("exports",
- obd->obd_proc_entry);
+ obd->obd_proc_exports_entry = lprocfs_register("exports",
+ obd->obd_proc_entry,
+ NULL, NULL);
+ if (IS_ERR(obd->obd_proc_exports_entry)) {
+ rc = PTR_ERR(obd->obd_proc_exports_entry);
+ CERROR("error %d setting up lprocfs for %s\n",
+ rc, "exports");
+ obd->obd_proc_exports_entry = NULL;
+ }
}
+ if (obd->obd_proc_exports_entry)
+ lprocfs_add_simple(obd->obd_proc_exports_entry, "clear",
+ lprocfs_nid_stats_clear_read,
+ lprocfs_nid_stats_clear_write, obd);
memcpy((void *)addr, lustre_cfg_buf(lcfg, 4),
LUSTRE_CFG_BUFLEN(lcfg, 4));
OBD_PAGE_FREE(page);
if (rc) {
- lprocfs_obd_cleanup(obd);
+ lprocfs_remove_proc_entry("clear", obd->obd_proc_exports_entry);
+ lprocfs_free_per_client_stats(obd);
lprocfs_free_obd_stats(obd);
+ lprocfs_obd_cleanup(obd);
}
return rc;
}
}
- lprocfs_obd_cleanup(obd);
+ lprocfs_remove_proc_entry("clear", obd->obd_proc_exports_entry);
+ lprocfs_free_per_client_stats(obd);
lprocfs_free_obd_stats(obd);
+ lprocfs_obd_cleanup(obd);
lquota_cleanup(filter_quota_interface_ref, obd);
/* Stop recovery before namespace cleanup. */
filter_post(obd);
- shrink_dcache_parent(obd->u.obt.obt_sb->s_root);
-
LL_DQUOT_OFF(obd->u.obt.obt_sb);
+ shrink_dcache_sb(obd->u.obt.obt_sb);
server_put_mount(obd->obd_name, filter->fo_vfsmnt);
obd->u.obt.obt_sb = NULL;
LASSERT(data->ocd_brw_size);
}
+ if (data->ocd_connect_flags & OBD_CONNECT_CKSUM) {
+ __u32 cksum_types = data->ocd_cksum_types;
+
+ /* The client set in ocd_cksum_types the checksum types it
+ * supports. We have to mask off the algorithms that we don't
+ * support */
+ if (cksum_types & OBD_CKSUM_ALL)
+ data->ocd_cksum_types &= OBD_CKSUM_ALL;
+ else
+ data->ocd_cksum_types = OBD_CKSUM_CRC32;
+
+ CDEBUG(D_RPCTRACE, "%s: cli %s supports cksum type %x, return "
+ "%x\n", exp->exp_obd->obd_name,
+ obd_export_nid2str(exp), cksum_types,
+ data->ocd_cksum_types);
+ } else {
+ /* This client does not support OBD_CONNECT_CKSUM
+ * fall back to CRC32 */
+ CDEBUG(D_RPCTRACE, "%s: cli %s does not support "
+ "OBD_CONNECT_CKSUM, CRC32 will be used\n",
+ exp->exp_obd->obd_name,
+ obd_export_nid2str(exp));
+ }
+
/* FIXME: Do the same with the MDS UUID and fsd_peeruuid.
* FIXME: We don't strictly need the COMPAT flag for that,
* FIXME: as fsd_peeruuid[0] will tell us if that is set.
static int filter_connect(const struct lu_env *env,
struct lustre_handle *conn, struct obd_device *obd,
struct obd_uuid *cluuid,
- struct obd_connect_data *data)
+ struct obd_connect_data *data, void *localdata)
{
struct lvfs_run_ctxt saved;
struct obd_export *exp;
if (rc)
GOTO(cleanup, rc);
- filter_export_stats_init(obd, exp);
+ filter_export_stats_init(obd, exp, localdata);
group = data->ocd_group;
if (obd->obd_replayable) {
OBD_ALLOC(fcd, sizeof(*fcd));
if (rc)
RETURN(rc);
+ /* This would be very bad - accidentally truncating a file when
+ * changing the time or similar - bug 12203. */
+ if (oinfo->oi_oa->o_valid & OBD_MD_FLSIZE &&
+ oinfo->oi_policy.l_extent.end != OBD_OBJECT_EOF) {
+ static char mdsinum[48];
+
+ if (oinfo->oi_oa->o_valid & OBD_MD_FLFID)
+ snprintf(mdsinum, sizeof(mdsinum) - 1,
+ " of inode "LPU64"/%u", oinfo->oi_oa->o_fid,
+ oinfo->oi_oa->o_generation);
+ else
+ mdsinum[0] = '\0';
+
+ CERROR("%s: setattr from %s trying to truncate objid "LPU64
+ " %s\n",
+ exp->exp_obd->obd_name, obd_export_nid2str(exp),
+ oinfo->oi_oa->o_id, mdsinum);
+ RETURN(-EPERM);
+ }
+
dentry = __filter_oa2dentry(exp->exp_obd, oinfo->oi_oa,
__FUNCTION__, 1);
if (IS_ERR(dentry))
}
static int filter_statfs(struct obd_device *obd, struct obd_statfs *osfs,
- __u64 max_age)
+ __u64 max_age, __u32 flags)
{
struct filter_obd *filter = &obd->u.filter;
int blockbits = obd->u.obt.obt_sb->s_blocksize_bits;
OBD_ALLOC(osfs, sizeof(*osfs));
if (osfs == NULL)
RETURN(-ENOMEM);
- rc = filter_statfs(obd, osfs, cfs_time_current_64() - HZ);
+ rc = filter_statfs(obd, osfs, cfs_time_current_64() - HZ, 0);
if (rc == 0 && osfs->os_bavail < (osfs->os_blocks >> 10)) {
CDEBUG(D_RPCTRACE,"%s: not enough space for create "
LPU64"\n", obd->obd_name, osfs->os_bavail <<