- update from HEAD

author alex <alex>

Fri, 6 Feb 2009 21:14:37 +0000 (21:14 +0000)

committer alex <alex>

Fri, 6 Feb 2009 21:14:37 +0000 (21:14 +0000)
author alex <alex>
Fri, 6 Feb 2009 21:14:37 +0000 (21:14 +0000)
committer alex <alex>
Fri, 6 Feb 2009 21:14:37 +0000 (21:14 +0000)
diff --git a/lustre/ChangeLog b/lustre/ChangeLog

index b8a9007..7d14da4 100644 (file)
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -14,6 +14,30 @@ tbd  Sun Microsystems, Inc.
         * File join has been disabled in this release, refer to Bugzilla 16929.
  
  Severity   : normal
+Frequency  : when starting an MDS on an uncleanly shut down MDS device
+Bugzilla   : 16839
+Description: ll_sync thread stays waiting for mds<>ost recovery to finish
+Details    : waiting for mds<>ost recovery to finish produces random bugs
+             due to a race between two ll_sync threads for one lov target.
+             Send the ACTIVATE event only if the connect has really finished
+             and the import is in the FULL state.
+
+Severity   : normal
+Frequency  : rare, connecting and disconnecting a target at the same time
+Bugzilla   : 17310
+Description: ASSERTION(atomic_read(&imp->imp_inflight) == 0)
+Details    : don't call obd_disconnect under lov_lock. It is a long-running
+             operation and can block ptlrpcd, which answers connect requests.
+
+Severity   : normal
+Frequency  : rare
+Bugzilla   : 18154
+Description: don't lose wakeup for imp_recovery_waitq
+Details    : recover_import_no_retry or invalidate_import and import_close can
+             both sleep on imp_recovery_waitq, but only one wakeup was sent
+             to the sleep queue.
+
+Severity   : normal
  Frequency  : always with long access acl
  Bugzilla   : 17636
  Descriptoin: mds can't pack reply with long acl.
@@ -1923,6 +1947,22 @@ Details    : enable OBD_CONNECT_MDT flag when connecting from the MDS so that
              from a different NID, so we do not need to wait for the export to be
              evicted
  
+Severity   : major
+Frequency  : rare, only if using MMP with Linux RAID
+Bugzilla   : 17895
+Description: MMP doesn't work with Linux RAID
+Details    : While using HA for Lustre servers with Linux RAID, it is possible
+             that MMP will not detect multiple mounts. To make this work we
+             need to unplug the device queue in RAID when the MMP block is being
+             written. Also while reading the MMP block, we should read it from
+             disk and not the cached one.
+
+Severity   : enhancement
+Bugzilla   : 17187
+Description: open file using fid
+Details    : A file can be opened using just its fid, like
+            <mntpt>/.lustre/fid/SEQ:OID:VER - this is needed for HSM and replication
+
  --------------------------------------------------------------------------------
  
  2007-08-10         Cluster File Systems, Inc. <info@clusterfs.com>
diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4

index 4e00bf6..04023c7 100644 (file)
--- a/lustre/autoconf/lustre-core.m4
+++ b/lustre/autoconf/lustre-core.m4
@@ -623,7 +623,7 @@ dnl the AES symbol usually tied with arch, e.g. CRYPTO_AES_586
  dnl FIXME
  AC_DEFUN([LC_CONFIG_RMTCLIENT],
  [LB_LINUX_CONFIG_IM([CRYPTO_AES],[],[
-       AC_MSG_ERROR([Lustre remote client require that CONFIG_CRYPTO_AES is enabled in your kernel.])
+        AC_MSG_WARN([Lustre remote client require that CONFIG_CRYPTO_AES is enabled in your kernel.])
  ])
  ])
  
@@ -654,19 +654,19 @@ AC_DEFUN([LC_CONFIG_SUNRPC],
  AC_DEFUN([LC_CONFIG_GSS_KEYRING],
  [AC_MSG_CHECKING([whether to enable gss keyring backend])
   AC_ARG_ENABLE([gss_keyring],
-              [AC_HELP_STRING([--disable-gss-keyring],
+               [AC_HELP_STRING([--disable-gss-keyring],
                                 [disable gss keyring backend])],
-              [],[enable_gss_keyring='yes'])
+               [],[enable_gss_keyring='yes'])
   AC_MSG_RESULT([$enable_gss_keyring])
  
   if test x$enable_gss_keyring != xno; then
-       LB_LINUX_CONFIG_IM([KEYS],[],
+        LB_LINUX_CONFIG_IM([KEYS],[],
                             [AC_MSG_ERROR([GSS keyring backend require that CONFIG_KEYS be enabled in your kernel.])])
  
-       AC_CHECK_LIB([keyutils], [keyctl_search], [],
+        AC_CHECK_LIB([keyutils], [keyctl_search], [],
                       [AC_MSG_ERROR([libkeyutils is not found, which is required by gss keyring backend])],)
  
-       AC_DEFINE([HAVE_GSS_KEYRING], [1],
+        AC_DEFINE([HAVE_GSS_KEYRING], [1],
                    [Define this if you enable gss keyring backend])
   fi
  ])
@@ -685,37 +685,29 @@ AC_DEFUN([LC_CONFIG_GSS],
   AC_MSG_RESULT([$enable_gss])
  
   if test x$enable_gss == xyes; then
-       LC_CONFIG_GSS_KEYRING
+        LC_CONFIG_GSS_KEYRING
          LC_CONFIG_SUNRPC
  
+        AC_DEFINE([HAVE_GSS], [1], [Define this if you enable gss])
+
          LB_LINUX_CONFIG_IM([CRYPTO_MD5],[],
                             [AC_MSG_WARN([kernel MD5 support is recommended by using GSS.])])
-       LB_LINUX_CONFIG_IM([CRYPTO_SHA1],[],
+        LB_LINUX_CONFIG_IM([CRYPTO_SHA1],[],
                             [AC_MSG_WARN([kernel SHA1 support is recommended by using GSS.])])
-       LB_LINUX_CONFIG_IM([CRYPTO_SHA256],[],
+        LB_LINUX_CONFIG_IM([CRYPTO_SHA256],[],
                             [AC_MSG_WARN([kernel SHA256 support is recommended by using GSS.])])
-       LB_LINUX_CONFIG_IM([CRYPTO_SHA512],[],
+        LB_LINUX_CONFIG_IM([CRYPTO_SHA512],[],
                             [AC_MSG_WARN([kernel SHA512 support is recommended by using GSS.])])
-       LB_LINUX_CONFIG_IM([CRYPTO_WP512],[],
-                           [AC_MSG_WARN([kernel WP512 support is recommended by using GSS.])])
-       LB_LINUX_CONFIG_IM([CRYPTO_ARC4],[],
-                           [AC_MSG_WARN([kernel ARC4 support is recommended by using GSS.])])
-        LB_LINUX_CONFIG_IM([CRYPTO_DES],[],
-                           [AC_MSG_WARN([kernel DES support is recommended by using GSS.])])
-        LB_LINUX_CONFIG_IM([CRYPTO_TWOFISH],[],
-                           [AC_MSG_WARN([kernel TWOFISH support is recommended by using GSS.])])
-        LB_LINUX_CONFIG_IM([CRYPTO_CAST6],[],
-                           [AC_MSG_WARN([kernel CAST6 support is recommended by using GSS.])])
-
-       AC_CHECK_LIB([gssapi], [gss_init_sec_context],
+
+        AC_CHECK_LIB([gssapi], [gss_init_sec_context],
                       [GSSAPI_LIBS="$GSSAPI_LDFLAGS -lgssapi"],
                       [AC_CHECK_LIB([gssglue], [gss_init_sec_context],
                                     [GSSAPI_LIBS="$GSSAPI_LDFLAGS -lgssglue"],
                                     [AC_MSG_ERROR([libgssapi or libgssglue is not found, which is required by GSS.])])],)
  
-       AC_SUBST(GSSAPI_LIBS)
+        AC_SUBST(GSSAPI_LIBS)
  
-       AC_KERBEROS_V5
+        AC_KERBEROS_V5
   fi
  ])
  
diff --git a/lustre/autoconf/lustre-version.ac b/lustre/autoconf/lustre-version.ac

index 2663bf9..b367e2f 100644 (file)
--- a/lustre/autoconf/lustre-version.ac
+++ b/lustre/autoconf/lustre-version.ac
@@ -1,6 +1,6 @@
  m4_define([LUSTRE_MAJOR],[1])
  m4_define([LUSTRE_MINOR],[9])
-m4_define([LUSTRE_PATCH],[130])
+m4_define([LUSTRE_PATCH],[150])
  m4_define([LUSTRE_FIX],[0])
  
  dnl # don't forget to update the service tags info
diff --git a/lustre/cmm/cmm_device.c b/lustre/cmm/cmm_device.c

index 01f319d..681e2db 100644 (file)
--- a/lustre/cmm/cmm_device.c
+++ b/lustre/cmm/cmm_device.c
@@ -817,15 +817,29 @@ static void lprocfs_cmm_init_vars(struct lprocfs_static_vars *lvars)
  static int __init cmm_mod_init(void)
  {
          struct lprocfs_static_vars lvars;
+        int rc;
+
+        /* 
+         * Kludge code: this should be moved to mdc_device.c if
+         * mdc_(mds)_device is really stacked.
+         */
+        rc = lu_device_type_init(&mdc_device_type);
+        if (rc)
+                return rc;
  
          lprocfs_cmm_init_vars(&lvars);
-        return class_register_type(&cmm_obd_device_ops, NULL, lvars.module_vars,
-                                   LUSTRE_CMM_NAME, &cmm_device_type);
+        rc = class_register_type(&cmm_obd_device_ops, NULL, lvars.module_vars,
+                                 LUSTRE_CMM_NAME, &cmm_device_type);
+        if (rc)
+                lu_device_type_fini(&mdc_device_type);
+
+        return rc;
  }
  
  static void __exit cmm_mod_exit(void)
  {
          class_unregister_type(LUSTRE_CMM_NAME);
+        lu_device_type_fini(&mdc_device_type);
  }
  
  MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
diff --git a/lustre/cmm/mdc_device.c b/lustre/cmm/mdc_device.c

index db2d0b1..d3a7c3b 100644 (file)
--- a/lustre/cmm/mdc_device.c
+++ b/lustre/cmm/mdc_device.c
@@ -130,7 +130,6 @@ static int mdc_obd_add(const struct lu_env *env,
                  CERROR("target %s not set up\n", mdc->obd_name);
                  rc = -EINVAL;
          } else {
-                struct lustre_handle *conn = &desc->cl_conn;
                  struct obd_connect_data *ocd;
  
                  CDEBUG(D_CONFIG, "connect to %s(%s)\n",
@@ -153,13 +152,12 @@ static int mdc_obd_add(const struct lu_env *env,
                                           OBD_CONNECT_MDS_MDS |
                                           OBD_CONNECT_FID |
                                           OBD_CONNECT_AT;
-                rc = obd_connect(env, conn, mdc, &mdc->obd_uuid, ocd, NULL);
+                rc = obd_connect(env, &desc->cl_exp, mdc, &mdc->obd_uuid, ocd, NULL);
                  OBD_FREE_PTR(ocd);
                  if (rc) {
                          CERROR("target %s connect error %d\n",
                                 mdc->obd_name, rc);
                  } else {
-                        desc->cl_exp = class_conn2export(conn);
                          /* set seq controller export for MDC0 if exists */
                          if (mc->mc_num == 0)
                                  ms->ms_control_exp =
diff --git a/lustre/cmm/mdc_internal.h b/lustre/cmm/mdc_internal.h

index 774912b..e7a1d13 100644 (file)
--- a/lustre/cmm/mdc_internal.h
+++ b/lustre/cmm/mdc_internal.h
@@ -50,7 +50,6 @@
  #include <md_object.h>
  
  struct mdc_cli_desc {
-        struct lustre_handle     cl_conn;
          /* uuid of remote MDT to connect */
          struct obd_uuid          cl_srv_uuid;
          /* mdc uuid */
diff --git a/lustre/fid/fid_lib.c b/lustre/fid/fid_lib.c

index 76e779a..ab6422c 100644 (file)
--- a/lustre/fid/fid_lib.c
+++ b/lustre/fid/fid_lib.c
@@ -70,6 +70,7 @@
   *
   * The first 0x400 sequences of normal FID are reserved for special purpose.
   * FID_SEQ_START + 1 is for local file id generation.
+ * FID_SEQ_START + 2 is for .lustre directory and its objects
   */
  const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE = {
          FID_SEQ_START + 0x400ULL,
@@ -89,3 +90,15 @@ const struct lu_fid LUSTRE_BFL_FID = { .f_seq = 0x0000000000000003,
                                         .f_oid = 0x0000000000000001,
                                         .f_ver = 0x0000000000000000 };
  EXPORT_SYMBOL(LUSTRE_BFL_FID);
+
+/** Special fid for ".lustre" directory */
+const struct lu_fid LU_DOT_LUSTRE_FID = { .f_seq = LU_DOT_LUSTRE_SEQ,
+                                          .f_oid = 0x0000000000000001,
+                                          .f_ver = 0x0000000000000000 };
+EXPORT_SYMBOL(LU_DOT_LUSTRE_FID);
+
+/** Special fid for "fid" special object in .lustre */
+const struct lu_fid LU_OBF_FID = { .f_seq = LU_DOT_LUSTRE_SEQ,
+                                   .f_oid = 0x0000000000000002,
+                                   .f_ver = 0x0000000000000000 };
+EXPORT_SYMBOL(LU_OBF_FID);
diff --git a/lustre/fld/fld_handler.c b/lustre/fld/fld_handler.c

index 2b6ab12..5092ac1 100644 (file)
--- a/lustre/fld/fld_handler.c
+++ b/lustre/fld/fld_handler.c
@@ -67,6 +67,7 @@
  #include <lustre_fid.h>
  #include <lustre_req_layout.h>
  #include "fld_internal.h"
+#include <lustre_fid.h>
  
  #ifdef __KERNEL__
  
@@ -466,6 +467,7 @@ int fld_server_init(struct lu_server_fld *fld, struct dt_device *dt,
                      int mds_node_id)
  {
          int cache_size, cache_threshold;
+        struct lu_seq_range range;
          int rc;
          ENTRY;
  
@@ -499,6 +501,13 @@ int fld_server_init(struct lu_server_fld *fld, struct dt_device *dt,
                  GOTO(out, rc);
  
          fld->lsf_control_exp = NULL;
+
+        /* Insert reserved sequence number of ".lustre" into fld cache. */
+        range.lsr_start = LU_DOT_LUSTRE_SEQ;
+        range.lsr_end = LU_DOT_LUSTRE_SEQ + 1;
+        range.lsr_mdt = 0;
+        fld_cache_insert(fld->lsf_cache, &range);
+
          EXIT;
  out:
          if (rc)
diff --git a/lustre/include/cl_object.h b/lustre/include/cl_object.h

index ad4b468..5b922bd 100644 (file)
--- a/lustre/include/cl_object.h
+++ b/lustre/include/cl_object.h
@@ -2100,16 +2100,22 @@ enum cl_enq_flags {
           */
          CEF_DISCARD_DATA = 0x00000004,
          /**
-         * tell the sub layers that it must be a `real' lock.
+         * tell the sub layers that it must be a `real' lock. This is used for
+         * mmapped-buffer locks and glimpse locks that must be never converted
+         * into lockless mode.
+         *
+         * \see vvp_mmap_locks(), cl_glimpse_lock().
           */
          CEF_MUST         = 0x00000008,
          /**
-         * tell the sub layers that never request a `real' lock.
-         * currently, the CEF_MUST & CEF_NEVER are only used for mmap locks.
-         * cl_io::ci_lockreq and these two flags: ci_lockreq just describes
-         * generic information of lock requirement for this IO, especially for
-         * locks which belong to the object doing IO; however, lock itself may
-         * have precise requirements, this is described by the latter.
+         * tell the sub layers that never request a `real' lock. This flag is
+         * not used currently.
+         *
+         * cl_io::ci_lockreq and CEF_{MUST,NEVER} flags specify lockless
+         * conversion policy: ci_lockreq describes generic information of lock
+         * requirement for this IO, especially for locks which belong to the
+         * object doing IO; however, lock itself may have precise requirements
+         * that are described by the enqueue flags.
           */
          CEF_NEVER        = 0x00000010,
          /**
diff --git a/lustre/include/class_hash.h b/lustre/include/class_hash.h

index 6210c7f..37bd8d2 100644 (file)
--- a/lustre/include/class_hash.h
+++ b/lustre/include/class_hash.h
@@ -85,11 +85,9 @@ lh_hash(lustre_hash_t *lh, void *key, unsigned mask)
  {
          LASSERT(lh);
          LASSERT(LHO(lh));
+        LASSERT(LHP(lh, hash));
  
-        if (LHP(lh, hash))
-                return LHP(lh, hash)(lh, key, mask);
-
-        return -EOPNOTSUPP;
+        return LHP(lh, hash)(lh, key, mask);
  }
  
  static inline void *
diff --git a/lustre/include/linux/lvfs.h b/lustre/include/linux/lvfs.h

index 17576c3..e90b155 100644 (file)
--- a/lustre/include/linux/lvfs.h
+++ b/lustre/include/linux/lvfs.h
@@ -104,6 +104,10 @@ int lustre_fread(struct file *file, void *buf, int len, loff_t *off);
  int lustre_fwrite(struct file *file, const void *buf, int len, loff_t *off);
  int lustre_fsync(struct file *file);
  long l_readdir(struct file * file, struct list_head *dentry_list);
+int l_notify_change(struct vfsmount *mnt, struct dentry *dchild,
+                    struct iattr *newattrs);
+int simple_truncate(struct dentry *dir, struct vfsmount *mnt,
+                               char *name, loff_t length);
  
  static inline void l_dput(struct dentry *de)
  {
diff --git a/lustre/include/lprocfs_status.h b/lustre/include/lprocfs_status.h

index 7763498..f20dae2 100644 (file)
--- a/lustre/include/lprocfs_status.h
+++ b/lustre/include/lprocfs_status.h
@@ -222,9 +222,9 @@ static inline int opcode_offset(__u32 opc) {
                          (LDLM_LAST_OPC - LDLM_FIRST_OPC) +
                          (MDS_LAST_OPC - MDS_FIRST_OPC) +
                          (OST_LAST_OPC - OST_FIRST_OPC));
-        } else if (opc < FLD_LAST_OPC) {
-                /* FLD opcode */
-                return (opc - FLD_FIRST_OPC +
+        } else if (opc < QUOTA_LAST_OPC) {
+                /* LQUOTA Opcode */
+                return (opc - QUOTA_FIRST_OPC +
                          (LLOG_LAST_OPC - LLOG_FIRST_OPC) +
                          (OBD_LAST_OPC - OBD_FIRST_OPC) +
                          (MGS_LAST_OPC - MGS_FIRST_OPC) +
@@ -234,7 +234,7 @@ static inline int opcode_offset(__u32 opc) {
          } else if (opc < SEQ_LAST_OPC) {
                  /* SEQ opcode */
                  return (opc - SEQ_FIRST_OPC +
-                        (FLD_LAST_OPC - FLD_FIRST_OPC) +
+                        (QUOTA_LAST_OPC- QUOTA_FIRST_OPC) +
                          (LLOG_LAST_OPC - LLOG_FIRST_OPC) +
                          (OBD_LAST_OPC - OBD_FIRST_OPC) +
                          (MGS_LAST_OPC - MGS_FIRST_OPC) +
@@ -245,19 +245,19 @@ static inline int opcode_offset(__u32 opc) {
                  /* SEC opcode */
                  return (opc - SEC_FIRST_OPC +
                          (SEQ_LAST_OPC - SEQ_FIRST_OPC) +
-                        (FLD_LAST_OPC - FLD_FIRST_OPC) +
+                        (QUOTA_LAST_OPC- QUOTA_FIRST_OPC) +
                          (LLOG_LAST_OPC - LLOG_FIRST_OPC) +
                          (OBD_LAST_OPC - OBD_FIRST_OPC) +
                          (MGS_LAST_OPC - MGS_FIRST_OPC) +
                          (LDLM_LAST_OPC - LDLM_FIRST_OPC) +
                          (MDS_LAST_OPC - MDS_FIRST_OPC) +
                          (OST_LAST_OPC - OST_FIRST_OPC));
-        } else if (opc < QUOTA_LAST_OPC) {
-                /* LQUOTA Opcode */
-                return (opc - QUOTA_FIRST_OPC +
+        } else if (opc < FLD_LAST_OPC) {
+                /* FLD opcode */
+                 return (opc - FLD_FIRST_OPC +
                          (SEC_LAST_OPC - SEC_FIRST_OPC) +
                          (SEQ_LAST_OPC - SEQ_FIRST_OPC) +
-                        (FLD_LAST_OPC - FLD_FIRST_OPC) +
+                        (QUOTA_LAST_OPC- QUOTA_FIRST_OPC) +
                          (LLOG_LAST_OPC - LLOG_FIRST_OPC) +
                          (OBD_LAST_OPC - OBD_FIRST_OPC) +
                          (MGS_LAST_OPC - MGS_FIRST_OPC) +
@@ -270,16 +270,17 @@ static inline int opcode_offset(__u32 opc) {
          }
  }
  
-#define LUSTRE_MAX_OPCODES ((LDLM_LAST_OPC - LDLM_FIRST_OPC)   + \
+
+#define LUSTRE_MAX_OPCODES ((OST_LAST_OPC - OST_FIRST_OPC)     + \
                              (MDS_LAST_OPC - MDS_FIRST_OPC)     + \
-                            (OST_LAST_OPC - OST_FIRST_OPC)     + \
-                            (OBD_LAST_OPC - OBD_FIRST_OPC)     + \
-                            (FLD_LAST_OPC - FLD_FIRST_OPC)     + \
-                            (SEQ_LAST_OPC - SEQ_FIRST_OPC)     + \
+                            (LDLM_LAST_OPC - LDLM_FIRST_OPC)   + \
                              (MGS_LAST_OPC - MGS_FIRST_OPC)     + \
+                            (OBD_LAST_OPC - OBD_FIRST_OPC)     + \
                              (LLOG_LAST_OPC - LLOG_FIRST_OPC)   + \
+                            (QUOTA_LAST_OPC - QUOTA_FIRST_OPC) + \
+                            (SEQ_LAST_OPC - SEQ_FIRST_OPC)     + \
                              (SEC_LAST_OPC - SEC_FIRST_OPC)     + \
-                            (QUOTA_LAST_OPC - QUOTA_FIRST_OPC))
+                            (FLD_LAST_OPC - FLD_FIRST_OPC))
  
  #define EXTRA_MAX_OPCODES ((PTLRPC_LAST_CNTR - PTLRPC_FIRST_CNTR)  + \
                             (EXTRA_LAST_OPC - EXTRA_FIRST_OPC))
diff --git a/lustre/include/lustre/ll_fiemap.h b/lustre/include/lustre/ll_fiemap.h

index e8620bf..8bac0f4 100644 (file)
--- a/lustre/include/lustre/ll_fiemap.h
+++ b/lustre/include/lustre/ll_fiemap.h
@@ -48,27 +48,27 @@
  #ifndef HAVE_LINUX_FIEMAP_H
  
  struct ll_fiemap_extent {
-        __u64   fe_logical;  /* logical offset in bytes for the start of
-                              * the extent from the beginning of the file */
-        __u64   fe_physical; /* physical offset in bytes for the start
-                              * of the extent from the beginning of the disk */
-        __u64   fe_length;   /* length in bytes for the extent */
-        __u32   fe_flags;    /* FIEMAP_EXTENT_* flags for the extent */
-        __u32   fe_device;   /* device number for this extent */
+        __u64 fe_logical;  /* logical offset in bytes for the start of
+                            * the extent from the beginning of the file */
+        __u64 fe_physical; /* physical offset in bytes for the start
+                            * of the extent from the beginning of the disk */
+        __u64 fe_length;   /* length in bytes for this extent */
+        __u64 fe_reserved64[2];
+        __u32 fe_flags;    /* FIEMAP_EXTENT_* flags for this extent */
+        __u32 fe_device;   /* device number for this extent */
+        __u32 fe_reserved[2];
  };
  
  struct ll_user_fiemap {
-        __u64   fm_start;         /* logical offset (inclusive) at
-                                   * which to start mapping (in) */
-        __u64   fm_length;        /* logical length of mapping which
-                                   * userspace wants (in) */
-        __u32   fm_flags;         /* FIEMAP_FLAG_* flags for request (in/out) */
-        __u32   fm_mapped_extents;/* number of extents that were mapped (out) */
-        __u32   fm_extent_count;  /* size of fm_extents array (in) */
-        __u32   fm_reserved;
-        struct  ll_fiemap_extent   fm_extents[0]; /* array of mapped extents (out).
-                                                   * Lustre uses first extent to
-                                                   * send end_offset */
+        __u64 fm_start;  /* logical offset (inclusive) at
+                          * which to start mapping (in) */
+        __u64 fm_length; /* logical length of mapping which
+                          * userspace wants (in) */
+        __u32 fm_flags;  /* FIEMAP_FLAG_* flags for request (in/out) */
+        __u32 fm_mapped_extents;/* number of extents that were mapped (out) */
+        __u32 fm_extent_count;  /* size of fm_extents array (in) */
+        __u32 fm_reserved;
+        struct ll_fiemap_extent fm_extents[0]; /* array of mapped extents (out) */
  };
  
  #define FIEMAP_MAX_OFFSET      (~0ULL)
@@ -80,30 +80,31 @@ struct ll_user_fiemap {
  #define FIEMAP_FLAGS_COMPAT    (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR | \
                                  FIEMAP_FLAG_DEVICE_ORDER)
  
-#define FIEMAP_EXTENT_LAST             0x00000001 /* Last extent in file. */
-#define FIEMAP_EXTENT_UNKNOWN          0x00000002 /* Data location unknown. */
-#define FIEMAP_EXTENT_DELALLOC         0x00000004 /* Location still pending.
-                                                   * Sets EXTENT_UNKNOWN. */
-#define FIEMAP_EXTENT_NO_DIRECT        0x00000008 /* Data mapping undefined */
-#define FIEMAP_EXTENT_SECONDARY        0x00000010 /* Data copied offline. May
-                                                   * set EXTENT_NO_DIRECT. */
-#define FIEMAP_EXTENT_NET              0x00000020 /* Data stored remotely.
-                                                   * Sets EXTENT_NO_DIRECT. */
-#define FIEMAP_EXTENT_DATA_COMPRESSED  0x00000040 /* Data is compressed by fs.
-                                                   * Sets EXTENT_NO_DIRECT. */
-#define FIEMAP_EXTENT_DATA_ENCRYPTED   0x00000080 /* Data is encrypted by fs.
-                                                   * Sets EXTENT_NO_DIRECT. */
-#define FIEMAP_EXTENT_NOT_ALIGNED      0x00000100 /* Extent offsets may not be
-                                                   * block aligned. */
-#define FIEMAP_EXTENT_DATA_INLINE      0x00000200 /* Data mixed with metadata.
-                                                   * Sets EXTENT_NOT_ALIGNED.*/
-#define FIEMAP_EXTENT_DATA_TAIL        0x00000400 /* Multiple files in block.
-                                                   * Sets EXTENT_NOT_ALIGNED.*/
-#define FIEMAP_EXTENT_UNWRITTEN        0x00000800 /* Space allocated, but
-                                                   * no data (i.e. zero). */
-#define FIEMAP_EXTENT_MERGED           0x00001000 /* File does not natively
-                                                   * support extents. Result
-                                                   * merged for efficiency. */
+
+#define FIEMAP_EXTENT_LAST              0x00000001 /* Last extent in file. */
+#define FIEMAP_EXTENT_UNKNOWN           0x00000002 /* Data location unknown. */
+#define FIEMAP_EXTENT_DELALLOC          0x00000004 /* Location still pending.
+                                                    * Sets EXTENT_UNKNOWN. */
+#define FIEMAP_EXTENT_ENCODED           0x00000008 /* Data can not be read
+                                                    * while fs is unmounted */
+#define FIEMAP_EXTENT_DATA_ENCRYPTED    0x00000080 /* Data is encrypted by fs.
+                                                    * Sets EXTENT_NO_DIRECT. */
+#define FIEMAP_EXTENT_NOT_ALIGNED       0x00000100 /* Extent offsets may not be
+                                                    * block aligned. */
+#define FIEMAP_EXTENT_DATA_INLINE       0x00000200 /* Data mixed with metadata.
+                                                    * Sets EXTENT_NOT_ALIGNED.*/
+#define FIEMAP_EXTENT_DATA_TAIL         0x00000400 /* Multiple files in block.
+                                                    * Sets EXTENT_NOT_ALIGNED.*/
+#define FIEMAP_EXTENT_UNWRITTEN         0x00000800 /* Space allocated, but
+                                                    * no data (i.e. zero). */
+#define FIEMAP_EXTENT_MERGED            0x00001000 /* File does not natively
+                                                    * support extents. Result
+                                                    * merged for efficiency. */
+
+/* Lustre specific flags - use a high bit, don't conflict with upstream flag */
+#define FIEMAP_EXTENT_NO_DIRECT         0x40000000 /* Data mapping undefined */
+#define FIEMAP_EXTENT_NET               0x80000000 /* Data stored remotely.
+                                                    * Sets NO_DIRECT flag */
  
  #else
  
diff --git a/lustre/include/lustre/lustre_idl.h b/lustre/include/lustre/lustre_idl.h

index c80dd5e..76327b2 100644 (file)
--- a/lustre/include/lustre/lustre_idl.h
+++ b/lustre/include/lustre/lustre_idl.h
@@ -42,10 +42,6 @@
   *
   * Lustre wire protocol definitions.
   *
- * We assume all nodes are either little-endian or big-endian, and we
- * always send messages in the sender's native format.  The receiver
- * detects the message format by checking the 'magic' field of the message
- * (see lustre_msg_swabbed() below).
   * ALL structs passing over the wire should be declared here.  Structs
   * that are used in interfaces with userspace should go in lustre_user.h.
   *
@@ -72,6 +68,11 @@
   * in the code to ensure that new/old clients that see this larger struct
   * do not fail, otherwise you need to implement protocol compatibility).
   *
+ * We assume all nodes are either little-endian or big-endian, and we
+ * always send messages in the sender's native format.  The receiver
+ * detects the message format by checking the 'magic' field of the message
+ * (see lustre_msg_swabbed() below).
+ *
   * Each wire type has corresponding 'lustre_swab_xxxtypexxx()' routines,
   * implemented either here, inline (trivial implementations) or in
   * ptlrpc/pack_generic.c.  These 'swabbers' convert the type from "other"
@@ -371,6 +372,7 @@ static inline __u32 lu_igif_gen(const struct lu_fid *fid)
  }
  
  #define DFID "["LPX64":0x%x:0x%x]"
+#define SFID "0x%llx:0x%x:0x%x"
  
  #define PFID(fid)     \
          fid_seq(fid), \
@@ -654,48 +656,50 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb);
   * Flags for all connect opcodes (MDS_CONNECT, OST_CONNECT)
   */
  
-#define MSG_CONNECT_RECOVERING  0x1
-#define MSG_CONNECT_RECONNECT   0x2
-#define MSG_CONNECT_REPLAYABLE  0x4
+#define MSG_CONNECT_RECOVERING  0x00000001
+#define MSG_CONNECT_RECONNECT   0x00000002
+#define MSG_CONNECT_REPLAYABLE  0x00000004
  //#define MSG_CONNECT_PEER        0x8
-#define MSG_CONNECT_LIBCLIENT   0x10
-#define MSG_CONNECT_INITIAL     0x20
-#define MSG_CONNECT_ASYNC       0x40
-#define MSG_CONNECT_NEXT_VER    0x80  /* use next version of lustre_msg */
-#define MSG_CONNECT_TRANSNO     0x100 /* report transno */
+#define MSG_CONNECT_LIBCLIENT   0x00000010
+#define MSG_CONNECT_INITIAL     0x00000020
+#define MSG_CONNECT_ASYNC       0x00000040
+#define MSG_CONNECT_NEXT_VER    0x00000080 /* use next version of lustre_msg */
+#define MSG_CONNECT_TRANSNO     0x00000100 /* report transno */
  
  /* Connect flags */
-#define OBD_CONNECT_RDONLY     0x00000001ULL /* client allowed read-only access */
-#define OBD_CONNECT_INDEX      0x00000002ULL /* connect to specific LOV idx */
-#define OBD_CONNECT_MDS        0x00000004ULL /* connect from MDT to OST */
-#define OBD_CONNECT_GRANT      0x00000008ULL /* OSC acquires grant at connect */
-#define OBD_CONNECT_SRVLOCK    0x00000010ULL /* server takes locks for client */
-#define OBD_CONNECT_VERSION    0x00000020ULL /* Server supports versions in ocd */
-#define OBD_CONNECT_REQPORTAL  0x00000040ULL /* Separate portal for non-IO reqs */
-#define OBD_CONNECT_ACL        0x00000080ULL /* client uses access control lists */
-#define OBD_CONNECT_XATTR      0x00000100ULL /* client using extended attributes*/
-#define OBD_CONNECT_TRUNCLOCK  0x00000400ULL /* locks on server for punch b=9528 */
-#define OBD_CONNECT_IBITS      0x00001000ULL /* support for inodebits locks */
-#define OBD_CONNECT_JOIN       0x00002000ULL /* files can be concatenated */
-#define OBD_CONNECT_ATTRFID    0x00004000ULL /* Server supports GetAttr By Fid */
-#define OBD_CONNECT_NODEVOH    0x00008000ULL /* No open handle for special nodes */
-#define OBD_CONNECT_RMT_CLIENT 0x00010000ULL /* Remote client */
-#define OBD_CONNECT_RMT_CLIENT_FORCE 0x00020000ULL /* Remote client by force */
-#define OBD_CONNECT_BRW_SIZE   0x00040000ULL /* Max bytes per rpc */
-#define OBD_CONNECT_QUOTA64    0x00080000ULL /* 64bit qunit_data.qd_count b=10707*/
-#define OBD_CONNECT_MDS_CAPA   0x00100000ULL /* MDS capability */
-#define OBD_CONNECT_OSS_CAPA   0x00200000ULL /* OSS capability */
-#define OBD_CONNECT_CANCELSET  0x00400000ULL /* Early batched cancels. */
-#define OBD_CONNECT_SOM        0x00800000ULL /* SOM feature */
-#define OBD_CONNECT_AT         0x01000000ULL /* client uses adaptive timeouts */
-#define OBD_CONNECT_LRU_RESIZE 0x02000000ULL /* Lru resize feature. */
-#define OBD_CONNECT_MDS_MDS    0x04000000ULL /* MDS-MDS connection*/
-#define OBD_CONNECT_REAL       0x08000000ULL /* real connection */
-#define OBD_CONNECT_CHANGE_QS  0x10000000ULL /* shrink/enlarge qunit b=10600 */
-#define OBD_CONNECT_CKSUM      0x20000000ULL /* support several cksum algos */
-#define OBD_CONNECT_FID        0x40000000ULL /* FID is supported by server */
-#define OBD_CONNECT_LOV_V3    0x100000000ULL /* client supports lov v3 ea */
-
+#define OBD_CONNECT_RDONLY            0x1ULL /*client allowed read-only access*/
+#define OBD_CONNECT_INDEX             0x2ULL /*connect to specific LOV idx */
+#define OBD_CONNECT_MDS               0x4ULL /*connect from MDT to OST */
+#define OBD_CONNECT_GRANT             0x8ULL /*OSC acquires grant at connect */
+#define OBD_CONNECT_SRVLOCK          0x10ULL /*server takes locks for client */
+#define OBD_CONNECT_VERSION          0x20ULL /*Lustre versions in ocd */
+#define OBD_CONNECT_REQPORTAL        0x40ULL /*Separate non-IO request portal */
+#define OBD_CONNECT_ACL              0x80ULL /*access control lists */
+#define OBD_CONNECT_XATTR           0x100ULL /*client use extended attributes */
+#define OBD_CONNECT_CROW            0x200ULL /*MDS+OST create objects on write*/
+#define OBD_CONNECT_TRUNCLOCK       0x400ULL /*locks on server for punch */
+#define OBD_CONNECT_TRANSNO         0x800ULL /*replay sends initial transno */
+#define OBD_CONNECT_IBITS          0x1000ULL /*support for inodebits locks */
+#define OBD_CONNECT_JOIN           0x2000ULL /*files can be concatenated */
+#define OBD_CONNECT_ATTRFID        0x4000ULL /*Server supports GetAttr By Fid */
+#define OBD_CONNECT_NODEVOH        0x8000ULL /*No open handle on special nodes*/
+#define OBD_CONNECT_RMT_CLIENT 0x00010000ULL /*Remote client */
+#define OBD_CONNECT_RMT_CLIENT_FORCE 0x00020000ULL /*Remote client by force */
+#define OBD_CONNECT_BRW_SIZE      0x40000ULL /*Max bytes per rpc */
+#define OBD_CONNECT_QUOTA64       0x80000ULL /*64bit qunit_data.qd_count */
+#define OBD_CONNECT_MDS_CAPA     0x100000ULL /*MDS capability */
+#define OBD_CONNECT_OSS_CAPA     0x200000ULL /*OSS capability */
+#define OBD_CONNECT_CANCELSET    0x400000ULL /*Early batched cancels. */
+#define OBD_CONNECT_SOM        0x00800000ULL /*Size on MDS */
+#define OBD_CONNECT_AT         0x01000000ULL /*client uses adaptive timeouts */
+#define OBD_CONNECT_LRU_RESIZE 0x02000000ULL /*LRU resize feature. */
+#define OBD_CONNECT_MDS_MDS    0x04000000ULL /*MDS-MDS connection */
+#define OBD_CONNECT_REAL       0x08000000ULL /*real connection */
+#define OBD_CONNECT_CHANGE_QS  0x10000000ULL /*shrink/enlarge qunit b=10600 */
+#define OBD_CONNECT_CKSUM      0x20000000ULL /*support several cksum algos */
+#define OBD_CONNECT_FID        0x40000000ULL /*FID is supported by server */
+#define OBD_CONNECT_VBR        0x80000000ULL /*version based recovery */
+#define OBD_CONNECT_LOV_V3    0x100000000ULL /*client supports LOV v3 EA */
  /* also update obd_connect_names[] for lprocfs_rd_connect_flags()
   * and lustre/utils/wirecheck.c */
  
@@ -709,27 +713,26 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb);
                                  OBD_CONNECT_ACL | OBD_CONNECT_XATTR | \
                                  OBD_CONNECT_IBITS | OBD_CONNECT_JOIN | \
                                  OBD_CONNECT_NODEVOH |/* OBD_CONNECT_ATTRFID |*/\
+                                OBD_CONNECT_CANCELSET | OBD_CONNECT_AT | \
                                  OBD_CONNECT_RMT_CLIENT | \
                                  OBD_CONNECT_RMT_CLIENT_FORCE | \
                                  OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA | \
-                                OBD_CONNECT_MDS_MDS | OBD_CONNECT_CANCELSET | \
-                                OBD_CONNECT_FID | \
-                                LRU_RESIZE_CONNECT_FLAG | OBD_CONNECT_AT | \
+                                OBD_CONNECT_MDS_MDS | OBD_CONNECT_FID | \
+                                LRU_RESIZE_CONNECT_FLAG | \
                                  OBD_CONNECT_LOV_V3)
  #define OST_CONNECT_SUPPORTED  (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \
                                  OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \
                                  OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \
                                  OBD_CONNECT_BRW_SIZE | OBD_CONNECT_QUOTA64 | \
-                                OBD_CONNECT_OSS_CAPA | OBD_CONNECT_CANCELSET | \
-                                OBD_CONNECT_CKSUM | LRU_RESIZE_CONNECT_FLAG | \
-                                OBD_CONNECT_AT | OBD_CONNECT_CHANGE_QS | \
-                                OBD_CONNECT_RMT_CLIENT | \
-                                OBD_CONNECT_RMT_CLIENT_FORCE | OBD_CONNECT_MDS)
+                                OBD_CONNECT_CANCELSET | OBD_CONNECT_AT | \
+                                LRU_RESIZE_CONNECT_FLAG | OBD_CONNECT_CKSUM | \
+                                OBD_CONNECT_CHANGE_QS | \
+                                OBD_CONNECT_OSS_CAPA  | OBD_CONNECT_RMT_CLIENT | \
+                                OBD_CONNECT_RMT_CLIENT_FORCE | \
+                                OBD_CONNECT_MDS)
  #define ECHO_CONNECT_SUPPORTED (0)
  #define MGS_CONNECT_SUPPORTED  (OBD_CONNECT_VERSION | OBD_CONNECT_AT)
  
-#define MAX_QUOTA_COUNT32 (0xffffffffULL)
-
  #define OBD_OCD_VERSION(major,minor,patch,fix) (((major)<<24) + ((minor)<<16) +\
                                                  ((patch)<<8) + (fix))
  #define OBD_OCD_VERSION_MAJOR(version) ((int)((version)>>24)&255)
@@ -805,12 +808,12 @@ typedef __u64 obd_time;
  typedef __u64 obd_size;
  typedef __u64 obd_off;
  typedef __u64 obd_blocks;
+typedef __u64 obd_valid;
  typedef __u32 obd_blksize;
  typedef __u32 obd_mode;
  typedef __u32 obd_uid;
  typedef __u32 obd_gid;
  typedef __u32 obd_flag;
-typedef __u64 obd_valid;
  typedef __u32 obd_count;
  
  #define OBD_FL_INLINEDATA    (0x00000001)
@@ -822,6 +825,7 @@ typedef __u32 obd_count;
  #define OBD_FL_DEBUG_CHECK   (0x00000040) /* echo client/server debug check */
  #define OBD_FL_NO_USRQUOTA   (0x00000100) /* the object's owner is over quota */
  #define OBD_FL_NO_GRPQUOTA   (0x00000200) /* the object's group is over quota */
+#define OBD_FL_CREATE_CROW   (0x00000400) /* object is to be created on write */
  
  /**
   * Set this to delegate DLM locking during obd_punch() to the OSTs. Only OSTs
@@ -920,7 +924,7 @@ struct lov_mds_md_v3 {            /* LOV EA mds/wire data (little-endian) */
  #define OBD_MD_FLHANDLE    (0x00080000ULL) /* file/lock handle */
  #define OBD_MD_FLCKSUM     (0x00100000ULL) /* bulk data checksum */
  #define OBD_MD_FLQOS       (0x00200000ULL) /* quality of service stats */
-#define OBD_MD_FLOSCOPQ    (0x00400000ULL) /* osc opaque data */
+/*#define OBD_MD_FLOSCOPQ    (0x00400000ULL) osc opaque data, never used */
  #define OBD_MD_FLCOOKIE    (0x00800000ULL) /* log cancellation cookie */
  #define OBD_MD_FLGROUP     (0x01000000ULL) /* group */
  #define OBD_MD_FLFID       (0x02000000ULL) /* ->ost write inline fid */
@@ -1202,9 +1206,19 @@ static inline int ll_inode_to_ext_flags(int oflags, int iflags)
  }
  #endif
  
-struct mdt_body {
-        struct lu_fid  fid1;
-        struct lu_fid  fid2;
+/*
+ * mds_body is used to interact with 1.6, mdt_body to interact with 2.0.
+ * Both must have the same field layout, because on the client side one
+ * may be dynamically cast to the other.
+ *
+ * mdt_body is larger than mds_body, with unused padding (48 bytes) at
+ * the end. The client always uses the size of mdt_body to prepare
+ * request/reply buffers, and the actual data may be interpreted as
+ * mdt_body or mds_body accordingly.
+ */
+struct mds_body {
+        struct ll_fid  fid1;
+        struct ll_fid  fid2;
          struct lustre_handle handle;
          __u64          valid;
          __u64          size;   /* Offset, in the case of MDS_READPAGE */
@@ -1212,8 +1226,8 @@ struct mdt_body {
          __u64          atime;
          __u64          ctime;
          __u64          blocks; /* XID, in the case of MDS_READPAGE */
-        __u64          ioepoch;
-        __u64          ino;    /* for 1.6 compatibility */
+        __u64          io_epoch;
+        __u64          ino;
          __u32          fsuid;
          __u32          fsgid;
          __u32          capability;
@@ -1223,24 +1237,20 @@ struct mdt_body {
          __u32          flags; /* from vfs for pin/unpin, MDS_BFLAG for close */
          __u32          rdev;
          __u32          nlink; /* #bytes to read in the case of MDS_READPAGE */
-        __u32          generation; /* for 1.6 compatibility */
+        __u32          generation;
          __u32          suppgid;
          __u32          eadatasize;
          __u32          aclsize;
          __u32          max_mdsize;
          __u32          max_cookiesize;
-        __u32          padding_4; /* also fix lustre_swab_mdt_body */
-        __u64          padding_5;
-        __u64          padding_6;
-        __u64          padding_7;
-        __u64          padding_8;
-        __u64          padding_9;
-        __u64          padding_10;
+        __u32          padding_4; /* also fix lustre_swab_mds_body */
  };
  
-struct mds_body {
-        struct ll_fid  fid1;
-        struct ll_fid  fid2;
+extern void lustre_swab_mds_body (struct mds_body *b);
+
+struct mdt_body {
+        struct lu_fid  fid1;
+        struct lu_fid  fid2;
          struct lustre_handle handle;
          __u64          valid;
          __u64          size;   /* Offset, in the case of MDS_READPAGE */
@@ -1248,8 +1258,8 @@ struct mds_body {
          __u64          atime;
          __u64          ctime;
          __u64          blocks; /* XID, in the case of MDS_READPAGE */
-        __u64          io_epoch;
-        __u64          ino;
+        __u64          ioepoch;
+        __u64          ino;    /* for 1.6 compatibility */
          __u32          fsuid;
          __u32          fsgid;
          __u32          capability;
@@ -1259,16 +1269,21 @@ struct mds_body {
          __u32          flags; /* from vfs for pin/unpin, MDS_BFLAG for close */
          __u32          rdev;
          __u32          nlink; /* #bytes to read in the case of MDS_READPAGE */
-        __u32          generation;
+        __u32          generation; /* for 1.6 compatibility */
          __u32          suppgid;
          __u32          eadatasize;
          __u32          aclsize;
          __u32          max_mdsize;
          __u32          max_cookiesize;
-        __u32          padding_4; /* also fix lustre_swab_mds_body */
-};
+        __u32          padding_4; /* also fix lustre_swab_mdt_body */
+        __u64          padding_5;
+        __u64          padding_6;
+        __u64          padding_7;
+        __u64          padding_8;
+        __u64          padding_9;
+        __u64          padding_10;
+}; /* 216 */
  
-extern void lustre_swab_mds_body (struct mds_body *b);
  extern void lustre_swab_mdt_body (struct mdt_body *b);
  
  struct mdt_epoch {
@@ -1507,20 +1522,6 @@ enum {
          MDS_QUOTA_IGNORE = 1 << 5
  };
  
-struct mds_rec_join {
-        struct ll_fid  jr_fid;
-        __u64          jr_headsize;
-};
-
-extern void lustre_swab_mds_rec_join (struct mds_rec_join *jr);
-
-struct mdt_rec_join {
-        struct lu_fid  jr_fid;
-        __u64          jr_headsize;
-};
-
-extern void lustre_swab_mdt_rec_join (struct mdt_rec_join *jr);
-
  struct mds_rec_create {
          __u32           cr_opcode;
          __u32           cr_fsuid;
@@ -1555,7 +1556,7 @@ struct mdt_rec_create {
          __u32           cr_suppgid2_h;
          struct lu_fid   cr_fid1;
          struct lu_fid   cr_fid2;
-        struct lustre_handle cr_old_handle; /* u64 handle in case of open replay */
+        struct lustre_handle cr_old_handle; /* handle in case of open replay */
          __u64           cr_time;
          __u64           cr_rdev;
          __u64           cr_ioepoch;
@@ -1570,6 +1571,20 @@ struct mdt_rec_create {
  
  extern void lustre_swab_mdt_rec_create (struct mdt_rec_create *cr);
  
+struct mds_rec_join {
+        struct ll_fid  jr_fid;
+        __u64          jr_headsize;
+};
+
+extern void lustre_swab_mds_rec_join (struct mds_rec_join *jr);
+
+struct mdt_rec_join {
+        struct lu_fid  jr_fid;
+        __u64          jr_headsize;
+};
+
+extern void lustre_swab_mdt_rec_join (struct mdt_rec_join *jr);
+
  struct mds_rec_link {
          __u32           lk_opcode;
          __u32           lk_fsuid;
@@ -1761,13 +1776,49 @@ extern void lustre_swab_mdt_rec_reint(struct mdt_rec_reint *rr);
  struct lmv_desc {
          __u32 ld_tgt_count;                /* how many MDS's */
          __u32 ld_active_tgt_count;         /* how many active */
+        __u32 ld_default_stripe_count;     /* how many objects are used */
+        __u32 ld_pattern;                  /* default MEA_MAGIC_* */
+        __u64 ld_default_hash_size;
+        __u64 ld_padding_1;                /* also fix lustre_swab_lmv_desc */
+        __u32 ld_padding_2;                /* also fix lustre_swab_lmv_desc */
+        __u32 ld_qos_maxage;               /* in second */
+        __u32 ld_padding_3;                /* also fix lustre_swab_lmv_desc */
+        __u32 ld_padding_4;                /* also fix lustre_swab_lmv_desc */
          struct obd_uuid ld_uuid;
  };
  
  extern void lustre_swab_lmv_desc (struct lmv_desc *ld);
  
+/* TODO: lmv_stripe_md should contain mds capabilities for all slave fids */
+struct lmv_stripe_md {
+        __u32         mea_magic;
+        __u32         mea_count;
+        __u32         mea_master;
+        __u32         mea_padding;
+        char          mea_pool_name[LOV_MAXPOOLNAME];
+        struct lu_fid mea_ids[0];
+};
+
+extern void lustre_swab_lmv_stripe_md(struct lmv_stripe_md *mea);
+
+/* lmv structures */
+#define MEA_MAGIC_LAST_CHAR      0xb2221ca1
+#define MEA_MAGIC_ALL_CHARS      0xb222a11c
+#define MEA_MAGIC_HASH_SEGMENT   0xb222a11b
+
+#define MAX_HASH_SIZE_32         0x7fffffffUL
+#define MAX_HASH_SIZE            0x7fffffffffffffffULL
+#define MAX_HASH_HIGHEST_BIT     0x1000000000000000ULL
+
+struct md_fld {
+        seqno_t mf_seq;
+        mdsno_t mf_mds;
+};
+
+extern void lustre_swab_md_fld (struct md_fld *mf);
+
  enum fld_rpc_opc {
-        FLD_QUERY                       = 600,
+        FLD_QUERY                       = 900,
          FLD_LAST_OPC,
          FLD_FIRST_OPC                   = FLD_QUERY
  };
@@ -1787,7 +1838,8 @@ enum seq_op {
   *  LOV data structures
   */
  
-#define LOV_MIN_STRIPE_SIZE 65536   /* maximum PAGE_SIZE (ia64), power of 2 */
+#define LOV_MIN_STRIPE_BITS 16   /* log2 of maximum PAGE_SIZE (ia64) */
+#define LOV_MIN_STRIPE_SIZE (1<<LOV_MIN_STRIPE_BITS) /* 65536, power of 2 */
  #define LOV_MAX_STRIPE_COUNT  160   /* until bug 4424 is fixed */
  #define LOV_V1_INSANE_STRIPE_COUNT 65532 /* maximum stripe count bz13933 */
  
@@ -2023,7 +2075,6 @@ struct cfg_marker {
  extern void lustre_swab_cfg_marker(struct cfg_marker *marker,
                                     int swab, int size);
  
-
  /*
   * Opcodes for multiple servers.
   */
@@ -2416,6 +2467,7 @@ extern void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum);
  extern void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod,
                                              int stripe_count);
  extern void lustre_swab_lov_user_md_join(struct lov_user_md_join *lumj);
+extern void lustre_swab_lov_mds_md(struct lov_mds_md *lmm);
  
  /* llog_swab.c */
  extern void lustre_swab_llogd_body (struct llogd_body *d);
@@ -2471,8 +2523,8 @@ extern int quota_copy_qdata(void *request, struct qunit_data *qdata,
                              int is_req, int is_exp);
  
  typedef enum {
-        QUOTA_DQACQ     = 901,
-        QUOTA_DQREL     = 902,
+        QUOTA_DQACQ     = 601,
+        QUOTA_DQREL     = 602,
          QUOTA_LAST_OPC
  } quota_cmd_t;
  #define QUOTA_FIRST_OPC QUOTA_DQACQ
@@ -2490,6 +2542,7 @@ typedef enum {
  #define QUOTA_RET_NOLIMIT      2 /**< quota limit isn't set */
  #define QUOTA_RET_ACQUOTA      4 /**< need to acquire extra quota */
  
+
  /* security opcodes */
  typedef enum {
          SEC_CTX_INIT            = 801,
diff --git a/lustre/include/lustre/lustre_user.h b/lustre/include/lustre/lustre_user.h

index 12a0f0e..da7ed9b 100644 (file)
--- a/lustre/include/lustre/lustre_user.h
+++ b/lustre/include/lustre/lustre_user.h
@@ -62,7 +62,7 @@
  #define EXT3_IOC_SETVERSION             _IOW('f', 4, long)
  #define EXT3_IOC_GETVERSION_OLD         _IOR('v', 1, long)
  #define EXT3_IOC_SETVERSION_OLD         _IOW('v', 2, long)
-#define EXT3_IOC_FIEMAP                 _IOWR('f', 10, struct ll_user_fiemap)
+#define EXT3_IOC_FIEMAP                 _IOWR('f', 11, struct ll_user_fiemap)
  #endif
  
  /* FIEMAP flags supported by Lustre */
@@ -174,6 +174,11 @@ struct lov_user_md_v3 {           /* LOV EA user data (host-endian) */
          struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */
  } __attribute__((packed));
  
+#define copy_lov_mds2user(user_md, mds_md) do { /* multi-evaluates args */ \
+        memcpy(user_md, mds_md, sizeof(*(user_md))); \
+        (user_md)->lmm_stripe_offset = 0; \
+        (user_md)->lmm_stripe_count = (mds_md)->lmm_stripe_count; } while(0)
+
  /* Compile with -D_LARGEFILE64_SOURCE or -D_GNU_SOURCE (or #define) to
   * use this.  It is unsafe to #define those values in this header as it
   * is possible the application has already #included <sys/stat.h>. */
diff --git a/lustre/include/lustre_dlm.h b/lustre/include/lustre_dlm.h

index 96c5543..9edf487 100644 (file)
--- a/lustre/include/lustre_dlm.h
+++ b/lustre/include/lustre_dlm.h
@@ -189,7 +189,14 @@ typedef enum {
  /* Flags sent in AST lock_flags to be mapped into the receiving lock. */
  #define LDLM_AST_FLAGS         (LDLM_FL_DISCARD_DATA)
  
-/* Used for marking lock as an target for -EINTR while cp_ast sleep situation
+/*
+ * --------------------------------------------------------------------------
+ * NOTE! From this point on, i.e. for LDLM_FL_* flags with values above
+ * 0x80000000, the flags are not sent over the wire.
+ * --------------------------------------------------------------------------
+ */
+
+/* Used for marking lock as a target for -EINTR while cp_ast sleep
   * emulation + race with upcoming bl_ast.  */
  #define LDLM_FL_FAIL_LOC       0x100000000ULL
  
@@ -370,14 +377,6 @@ typedef enum {
  } ldlm_appetite_t;
  
  /*
- * Default value for ->ns_shrink_thumb. If lock is not extent one its cost
- * is one page. Here we have 256 pages which is 1M on i386. Thus by default
- * all extent locks which have more than 1M long extent will be kept in lru,
- * others (including ibits locks) will be canceled on memory pressure event.
- */
-#define LDLM_LOCK_SHRINK_THUMB 256
-
-/*
   * Default values for the "max_nolock_size", "contention_time" and
   * "contended_locks" namespace tunables.
   */
@@ -444,11 +443,6 @@ struct ldlm_namespace {
          unsigned int           ns_ctime_age_limit;
  
          /**
-         * Lower limit to number of pages in lock to keep it in cache.
-         */
-        unsigned long          ns_shrink_thumb;
-
-        /**
           * Next debug dump, jiffies.
           */
          cfs_time_t             ns_next_dump;
@@ -645,7 +639,11 @@ struct ldlm_lock {
           */
          cfs_waitq_t           l_waitq;
  
-        struct timeval        l_enqueued_time;
+        /**
+         * Seconds. Updated whenever there is any activity related to
+         * the lock, e.g. enqueueing the lock or sending a blocking AST.
+         */
+        cfs_time_t            l_last_activity;
  
          /**
           * Jiffies. Should be converted to time if needed.
@@ -796,13 +794,16 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, __u32 mask,
                        ...)
          __attribute__ ((format (printf, 4, 5)));
  
-#define LDLM_ERROR(lock, fmt, a...) do {                                \
+#define LDLM_DEBUG_LIMIT(mask, lock, fmt, a...) do {                    \
          static cfs_debug_limit_state_t _ldlm_cdls;                      \
-        ldlm_lock_debug(&_ldlm_cdls, D_ERROR, lock,                     \
+        ldlm_lock_debug(&_ldlm_cdls, mask, lock,                        \
                          __FILE__, __FUNCTION__, __LINE__,               \
                          "### " fmt , ##a);                              \
  } while (0)
  
+#define LDLM_ERROR(lock, fmt, a...) LDLM_DEBUG_LIMIT(D_ERROR, lock, fmt, ## a)
+#define LDLM_WARN(lock, fmt, a...)  LDLM_DEBUG_LIMIT(D_WARNING, lock, fmt, ## a)
+
  #define LDLM_DEBUG(lock, fmt, a...)   do {                              \
          ldlm_lock_debug(NULL, D_DLMTRACE, lock,                         \
                          __FILE__, __FUNCTION__, __LINE__,               \
@@ -963,6 +964,7 @@ int  ldlm_lock_addref_try(struct lustre_handle *lockh, __u32 mode);
  void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode);
  void ldlm_lock_decref_and_cancel(struct lustre_handle *lockh, __u32 mode);
  void ldlm_lock_allow_match(struct ldlm_lock *lock);
+void ldlm_lock_allow_match_locked(struct ldlm_lock *lock);
  ldlm_mode_t ldlm_lock_match(struct ldlm_namespace *ns, int flags,
                              const struct ldlm_res_id *, ldlm_type_t type,
                              ldlm_policy_data_t *, ldlm_mode_t mode,
diff --git a/lustre/include/lustre_export.h b/lustre/include/lustre_export.h

index 94033ef..6ee97d6 100644 (file)
--- a/lustre/include/lustre_export.h
+++ b/lustre/include/lustre_export.h
@@ -92,7 +92,6 @@ struct filter_export_data {
          int                        fed_mod_count;/* items in fed_writing list */
          long                       fed_pending;  /* bytes just being written */
          __u32                      fed_group;
-        struct brw_stats           fed_brw_stats;
  };
  
  typedef struct nid_stat_uuid {
@@ -113,6 +112,12 @@ typedef struct nid_stat {
          int                      nid_exp_ref_count;
  }nid_stat_t;
  
+enum obd_option {
+        OBD_OPT_FORCE =         0x0001,
+        OBD_OPT_FAILOVER =      0x0002,
+        OBD_OPT_ABORT_RECOV =   0x0004,
+};
+
  struct obd_export {
          struct portals_handle     exp_handle;
          atomic_t                  exp_refcount;
@@ -137,7 +142,7 @@ struct obd_export {
          spinlock_t                exp_lock; /* protects flags int below */
          /* ^ protects exp_outstanding_replies too */
          __u64                     exp_connect_flags;
-        int                       exp_flags;
+        enum obd_option           exp_flags;
          unsigned long             exp_failed:1,
                                    exp_in_recovery:1,
                                    exp_disconnected:1,
diff --git a/lustre/include/lustre_fid.h b/lustre/include/lustre_fid.h

index 7c8085f..921b423 100644 (file)
--- a/lustre/include/lustre_fid.h
+++ b/lustre/include/lustre_fid.h
@@ -57,6 +57,8 @@ struct lu_context;
  extern const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE;
  extern const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE;
  extern const struct lu_fid LUSTRE_BFL_FID;
+extern const struct lu_fid LU_OBF_FID;
+extern const struct lu_fid LU_DOT_LUSTRE_FID;
  
  enum {
          /*
@@ -82,6 +84,9 @@ enum {
  /** special fid seq: used for local object create. */
  #define FID_SEQ_LOCAL_FILE      (FID_SEQ_START + 1)
  
+/** special fid seq: used for .lustre objects. */
+#define LU_DOT_LUSTRE_SEQ       (FID_SEQ_START + 0x02ULL)
+
  /** special OID for local objects */
  enum {
          /** \see osd_oi_index_create */
diff --git a/lustre/include/lustre_lib.h b/lustre/include/lustre_lib.h

index 2ec0f44..a058fda 100644 (file)
--- a/lustre/include/lustre_lib.h
+++ b/lustre/include/lustre_lib.h
@@ -74,8 +74,6 @@ void target_client_add_cb(struct obd_device *obd, __u64 transno, void *cb_data,
  int target_handle_connect(struct ptlrpc_request *req);
  int target_handle_disconnect(struct ptlrpc_request *req);
  void target_destroy_export(struct obd_export *exp);
-int target_handle_reconnect(struct lustre_handle *conn, struct obd_export *exp,
-                            struct obd_uuid *cluuid, int);
  int target_pack_pool_reply(struct ptlrpc_request *req);
  int target_handle_ping(struct ptlrpc_request *req);
  void target_committed_to_req(struct ptlrpc_request *req);
diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h

index efef5b4..67efebc 100644 (file)
--- a/lustre/include/lustre_net.h
+++ b/lustre/include/lustre_net.h
@@ -636,8 +636,12 @@ struct ptlrpc_bulk_desc {
          lnet_handle_md_t       bd_md_h;         /* associated MD */
          lnet_nid_t             bd_sender;       /* stash event::sender */
  
-        cfs_page_t           **bd_enc_pages;
  #if defined(__KERNEL__)
+        /*
+         * encrypt iov, size is either 0 or bd_iov_count.
+         */
+        lnet_kiov_t           *bd_enc_iov;
+
          lnet_kiov_t            bd_iov[0];
  #else
          lnet_md_iovec_t        bd_iov[0];
@@ -1267,7 +1271,7 @@ static inline int ptlrpc_req_get_repsize(struct ptlrpc_request *req)
  int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg);
  int client_obd_cleanup(struct obd_device *obddev);
  int client_connect_import(const struct lu_env *env,
-                          struct lustre_handle *conn, struct obd_device *obd,
+                          struct obd_export **exp, struct obd_device *obd,
                            struct obd_uuid *cluuid, struct obd_connect_data *,
                            void *localdata);
  int client_disconnect_export(struct obd_export *exp);
diff --git a/lustre/include/lustre_sec.h b/lustre/include/lustre_sec.h

index 57a58c7..50274fc 100644 (file)
--- a/lustre/include/lustre_sec.h
+++ b/lustre/include/lustre_sec.h
@@ -94,99 +94,163 @@ enum sptlrpc_service_type {
          SPTLRPC_SVC_MAX,
  };
  
+enum sptlrpc_bulk_type {
+        SPTLRPC_BULK_DEFAULT            = 0,    /* follow rpc flavor */
+        SPTLRPC_BULK_HASH               = 1,    /* hash integrity */
+        SPTLRPC_BULK_MAX,
+};
+
+enum sptlrpc_bulk_service {
+        SPTLRPC_BULK_SVC_NULL           = 0,
+        SPTLRPC_BULK_SVC_AUTH           = 1,
+        SPTLRPC_BULK_SVC_INTG           = 2,
+        SPTLRPC_BULK_SVC_PRIV           = 3,
+        SPTLRPC_BULK_SVC_MAX,
+};
+
  /*
- * rpc flavor compose/extract, represented as 16 bits
+ * rpc flavor compose/extract, represented as 32 bits. Currently the
+ * high 12 bits are unused and must be set to 0.
   *
- * 4b (reserved) | 4b (svc) | 4b (mech)  | 4b (policy)
+ * 4b (bulk svc) | 4b (bulk type) | 4b (svc) | 4b (mech)  | 4b (policy)
   */
-#define RPC_FLVR_POLICY_OFFSET        (0)
-#define RPC_FLVR_MECH_OFFSET          (4)
-#define RPC_FLVR_SVC_OFFSET           (8)
-
-#define MAKE_RPC_FLVR(policy, mech, svc)                                \
-        (((__u16)(policy) << RPC_FLVR_POLICY_OFFSET) |                  \
-         ((__u16)(mech) << RPC_FLVR_MECH_OFFSET) |                      \
-         ((__u16)(svc) << RPC_FLVR_SVC_OFFSET))
+#define FLVR_POLICY_OFFSET              (0)
+#define FLVR_MECH_OFFSET                (4)
+#define FLVR_SVC_OFFSET                 (8)
+#define FLVR_BULK_TYPE_OFFSET           (12)
+#define FLVR_BULK_SVC_OFFSET            (16)
+
+#define MAKE_FLVR(policy, mech, svc, btype, bsvc)                       \
+        (((__u32)(policy) << FLVR_POLICY_OFFSET) |                      \
+         ((__u32)(mech) << FLVR_MECH_OFFSET) |                          \
+         ((__u32)(svc) << FLVR_SVC_OFFSET) |                            \
+         ((__u32)(btype) << FLVR_BULK_TYPE_OFFSET) |                    \
+         ((__u32)(bsvc) << FLVR_BULK_SVC_OFFSET))
  
-#define MAKE_RPC_SUBFLVR(mech, svc)                                     \
-        ((__u16)(mech) |                                                \
-         ((__u16)(svc) << (RPC_FLVR_SVC_OFFSET - RPC_FLVR_MECH_OFFSET)))
-
-#define RPC_FLVR_SUB(flavor)                                            \
-        ((((__u16)(flavor)) >> RPC_FLVR_MECH_OFFSET) & 0xFF)
-
-#define RPC_FLVR_POLICY(flavor)                                         \
-        ((((__u16)(flavor)) >> RPC_FLVR_POLICY_OFFSET) & 0xF)
-#define RPC_FLVR_MECH(flavor)                                           \
-        ((((__u16)(flavor)) >> RPC_FLVR_MECH_OFFSET) & 0xF)
-#define RPC_FLVR_SVC(flavor)                                            \
-        ((((__u16)(flavor)) >> RPC_FLVR_SVC_OFFSET) & 0xF)
+/*
+ * extraction
+ */
+#define SPTLRPC_FLVR_POLICY(flavor)                                     \
+        ((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xF)
+#define SPTLRPC_FLVR_MECH(flavor)                                       \
+        ((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xF)
+#define SPTLRPC_FLVR_SVC(flavor)                                        \
+        ((((__u32)(flavor)) >> FLVR_SVC_OFFSET) & 0xF)
+#define SPTLRPC_FLVR_BULK_TYPE(flavor)                                  \
+        ((((__u32)(flavor)) >> FLVR_BULK_TYPE_OFFSET) & 0xF)
+#define SPTLRPC_FLVR_BULK_SVC(flavor)                                   \
+        ((((__u32)(flavor)) >> FLVR_BULK_SVC_OFFSET) & 0xF)
+
+#define SPTLRPC_FLVR_BASE(flavor)                                       \
+        ((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xFFF)
+#define SPTLRPC_FLVR_BASE_SUB(flavor)                                   \
+        ((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xFF)
  
  /*
   * gss subflavors
   */
+#define MAKE_BASE_SUBFLVR(mech, svc)                                    \
+        ((__u32)(mech) |                                                \
+         ((__u32)(svc) << (FLVR_SVC_OFFSET - FLVR_MECH_OFFSET)))
+
  #define SPTLRPC_SUBFLVR_KRB5N                                           \
-        MAKE_RPC_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_NULL)
+        MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_NULL)
  #define SPTLRPC_SUBFLVR_KRB5A                                           \
-        MAKE_RPC_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_AUTH)
+        MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_AUTH)
  #define SPTLRPC_SUBFLVR_KRB5I                                           \
-        MAKE_RPC_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_INTG)
+        MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_INTG)
  #define SPTLRPC_SUBFLVR_KRB5P                                           \
-        MAKE_RPC_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_PRIV)
+        MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_PRIV)
  
  /*
   * "end user" flavors
   */
  #define SPTLRPC_FLVR_NULL                               \
-        MAKE_RPC_FLVR(SPTLRPC_POLICY_NULL,              \
-                      SPTLRPC_MECH_NULL,                \
-                      SPTLRPC_SVC_NULL)
+        MAKE_FLVR(SPTLRPC_POLICY_NULL,                  \
+                  SPTLRPC_MECH_NULL,                    \
+                  SPTLRPC_SVC_NULL,                     \
+                  SPTLRPC_BULK_DEFAULT,                 \
+                  SPTLRPC_BULK_SVC_NULL)
  #define SPTLRPC_FLVR_PLAIN                              \
-        MAKE_RPC_FLVR(SPTLRPC_POLICY_PLAIN,             \
-                      SPTLRPC_MECH_PLAIN,               \
-                      SPTLRPC_SVC_NULL)
+        MAKE_FLVR(SPTLRPC_POLICY_PLAIN,                 \
+                  SPTLRPC_MECH_PLAIN,                   \
+                  SPTLRPC_SVC_NULL,                     \
+                  SPTLRPC_BULK_HASH,                    \
+                  SPTLRPC_BULK_SVC_INTG)
  #define SPTLRPC_FLVR_KRB5N                              \
-        MAKE_RPC_FLVR(SPTLRPC_POLICY_GSS,               \
-                      SPTLRPC_MECH_GSS_KRB5,            \
-                      SPTLRPC_SVC_NULL)
+        MAKE_FLVR(SPTLRPC_POLICY_GSS,                   \
+                  SPTLRPC_MECH_GSS_KRB5,                \
+                  SPTLRPC_SVC_NULL,                     \
+                  SPTLRPC_BULK_DEFAULT,                 \
+                  SPTLRPC_BULK_SVC_NULL)
  #define SPTLRPC_FLVR_KRB5A                              \
-        MAKE_RPC_FLVR(SPTLRPC_POLICY_GSS,               \
-                      SPTLRPC_MECH_GSS_KRB5,            \
-                      SPTLRPC_SVC_AUTH)
+        MAKE_FLVR(SPTLRPC_POLICY_GSS,                   \
+                  SPTLRPC_MECH_GSS_KRB5,                \
+                  SPTLRPC_SVC_AUTH,                     \
+                  SPTLRPC_BULK_DEFAULT,                 \
+                  SPTLRPC_BULK_SVC_NULL)
  #define SPTLRPC_FLVR_KRB5I                              \
-        MAKE_RPC_FLVR(SPTLRPC_POLICY_GSS,               \
-                      SPTLRPC_MECH_GSS_KRB5,            \
-                      SPTLRPC_SVC_INTG)
+        MAKE_FLVR(SPTLRPC_POLICY_GSS,                   \
+                  SPTLRPC_MECH_GSS_KRB5,                \
+                  SPTLRPC_SVC_INTG,                     \
+                  SPTLRPC_BULK_DEFAULT,                 \
+                  SPTLRPC_BULK_SVC_INTG)
  #define SPTLRPC_FLVR_KRB5P                              \
-        MAKE_RPC_FLVR(SPTLRPC_POLICY_GSS,               \
-                      SPTLRPC_MECH_GSS_KRB5,            \
-                      SPTLRPC_SVC_PRIV)
-
-#define SPTLRPC_FLVR_ANY                ((__u16) 0xf000)
-#define SPTLRPC_FLVR_INVALID            ((__u16) 0xffff)
+        MAKE_FLVR(SPTLRPC_POLICY_GSS,                   \
+                  SPTLRPC_MECH_GSS_KRB5,                \
+                  SPTLRPC_SVC_PRIV,                     \
+                  SPTLRPC_BULK_DEFAULT,                 \
+                  SPTLRPC_BULK_SVC_PRIV)
  
  #define SPTLRPC_FLVR_DEFAULT            SPTLRPC_FLVR_NULL
  
+#define SPTLRPC_FLVR_INVALID            ((__u32) 0xFFFFFFFF)
+#define SPTLRPC_FLVR_ANY                ((__u32) 0xFFF00000)
+
  /*
- * 32 bits wire flavor (msg->lm_secflvr), lower 12 bits is the rpc flavor,
- * higher 20 bits is not defined right now.
+ * extract the defined part (low 20 bits) of the wire flavor
   */
-#define WIRE_FLVR_RPC(wflvr)            (((__u16) (wflvr)) & 0x0FFF)
+#define WIRE_FLVR(wflvr)                (((__u32) (wflvr)) & 0x000FFFFF)
  
-static inline void rpc_flvr_set_svc(__u16 *flvr, __u16 svc)
+static inline void flvr_set_svc(__u32 *flvr, __u32 svc)
  {
          LASSERT(svc < SPTLRPC_SVC_MAX);
-        *flvr = MAKE_RPC_FLVR(RPC_FLVR_POLICY(*flvr),
-                              RPC_FLVR_MECH(*flvr),
-                              svc);
+        *flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr),
+                          SPTLRPC_FLVR_MECH(*flvr),
+                          svc,
+                          SPTLRPC_FLVR_BULK_TYPE(*flvr),
+                          SPTLRPC_FLVR_BULK_SVC(*flvr));
  }
  
+static inline void flvr_set_bulk_svc(__u32 *flvr, __u32 svc)
+{       /* replace only the bulk-svc field of *flvr, in place */
+        LASSERT(svc < SPTLRPC_BULK_SVC_MAX);    /* valid bulk svc only */
+        *flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr),    /* unchanged */
+                          SPTLRPC_FLVR_MECH(*flvr),      /* unchanged */
+                          SPTLRPC_FLVR_SVC(*flvr),       /* unchanged */
+                          SPTLRPC_FLVR_BULK_TYPE(*flvr), /* unchanged */
+                          svc); /* new value; bits 20-31 are not kept */
+}
+
+struct bulk_spec_hash {
+        __u8    hash_alg;
+};
  
  struct sptlrpc_flavor {
-        __u16   sf_rpc;         /* rpc flavor */
-        __u8    sf_bulk_ciph;   /* bulk cipher alg */
-        __u8    sf_bulk_hash;   /* bulk hash alg */
+        __u32   sf_rpc;         /* wire flavor - should be renamed to sf_wire */
          __u32   sf_flags;       /* general flags */
+        /*
+         * rpc flavor specification
+         */
+        union {
+                /* nothing for now */
+        } u_rpc;
+        /*
+         * bulk flavor specification
+         */
+        union {
+                struct bulk_spec_hash hash;
+        } u_bulk;
  };
  
  enum lustre_sec_part {
@@ -216,6 +280,7 @@ struct sptlrpc_rule_set {
  };
  
  int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr);
+int sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr);
  
  static inline void sptlrpc_rule_set_init(struct sptlrpc_rule_set *set)
  {
@@ -223,10 +288,9 @@ static inline void sptlrpc_rule_set_init(struct sptlrpc_rule_set *set)
  }
  
  void sptlrpc_rule_set_free(struct sptlrpc_rule_set *set);
-int  sptlrpc_rule_set_expand(struct sptlrpc_rule_set *set, int expand);
+int  sptlrpc_rule_set_expand(struct sptlrpc_rule_set *set);
  int  sptlrpc_rule_set_merge(struct sptlrpc_rule_set *set,
-                            struct sptlrpc_rule *rule,
-                            int expand);
+                            struct sptlrpc_rule *rule);
  int sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset,
                              enum lustre_sec_part from,
                              enum lustre_sec_part to,
@@ -396,10 +460,12 @@ struct ptlrpc_sec_sops {
                                                  int msgsize);
          void                    (*free_rs)     (struct ptlrpc_reply_state *rs);
          void                    (*free_ctx)    (struct ptlrpc_svc_ctx *ctx);
-        /* reverse credential */
+        /* reverse context */
          int                     (*install_rctx)(struct obd_import *imp,
                                                  struct ptlrpc_svc_ctx *ctx);
          /* bulk transform */
+        int                     (*prep_bulk)   (struct ptlrpc_request *req,
+                                                struct ptlrpc_bulk_desc *desc);
          int                     (*unwrap_bulk) (struct ptlrpc_request *req,
                                                  struct ptlrpc_bulk_desc *desc);
          int                     (*wrap_bulk)   (struct ptlrpc_request *req,
@@ -481,55 +547,30 @@ enum sptlrpc_bulk_hash_alg {
          BULK_HASH_ALG_SHA256,
          BULK_HASH_ALG_SHA384,
          BULK_HASH_ALG_SHA512,
-        BULK_HASH_ALG_WP256,
-        BULK_HASH_ALG_WP384,
-        BULK_HASH_ALG_WP512,
          BULK_HASH_ALG_MAX
  };
  
-enum sptlrpc_bulk_cipher_alg {
-        BULK_CIPH_ALG_NULL      = 0,
-        BULK_CIPH_ALG_ARC4,
-        BULK_CIPH_ALG_AES128,
-        BULK_CIPH_ALG_AES192,
-        BULK_CIPH_ALG_AES256,
-        BULK_CIPH_ALG_CAST128,
-        BULK_CIPH_ALG_CAST256,
-        BULK_CIPH_ALG_TWOFISH128,
-        BULK_CIPH_ALG_TWOFISH256,
-        BULK_CIPH_ALG_MAX
-};
-
  struct sptlrpc_hash_type {
          char           *sht_name;
          char           *sht_tfm_name;
          unsigned int    sht_size;
  };
  
-struct sptlrpc_ciph_type {
-        char           *sct_name;
-        char           *sct_tfm_name;
-        __u32           sct_tfm_flags;
-        unsigned int    sct_ivsize;
-        unsigned int    sct_keysize;
-};
-
  const struct sptlrpc_hash_type *sptlrpc_get_hash_type(__u8 hash_alg);
  const char * sptlrpc_get_hash_name(__u8 hash_alg);
-const struct sptlrpc_ciph_type *sptlrpc_get_ciph_type(__u8 ciph_alg);
-const char *sptlrpc_get_ciph_name(__u8 ciph_alg);
+__u8 sptlrpc_get_hash_alg(const char *algname);
  
-#define CIPHER_MAX_BLKSIZE      (16)
-#define CIPHER_MAX_KEYSIZE      (64)
+enum {
+        BSD_FL_ERR      = 1,
+};
  
  struct ptlrpc_bulk_sec_desc {
-        __u8            bsd_version;
-        __u8            bsd_flags;
-        __u8            bsd_pad[4];
-        __u8            bsd_hash_alg;                /* hash algorithm */
-        __u8            bsd_ciph_alg;                /* cipher algorithm */
-        __u8            bsd_key[CIPHER_MAX_KEYSIZE]; /* encrypt key seed */
-        __u8            bsd_csum[0];
+        __u8            bsd_version;    /* 0 */
+        __u8            bsd_type;       /* SPTLRPC_BULK_XXX */
+        __u8            bsd_svc;        /* SPTLRPC_BULK_SVC_XXXX */
+        __u8            bsd_flags;      /* flags */
+        __u32           bsd_nob;        /* nob of bulk data */
+        __u8            bsd_data[0];    /* policy-specific token */
  };
  
  
@@ -567,9 +608,12 @@ void _sptlrpc_enlarge_msg_inplace(struct lustre_msg *msg,
  int sptlrpc_register_policy(struct ptlrpc_sec_policy *policy);
  int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy);
  
-__u16 sptlrpc_name2rpcflavor(const char *name);
-const char *sptlrpc_rpcflavor2name(__u16 flavor);
-int sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize);
+__u32 sptlrpc_name2flavor_base(const char *name);
+const char *sptlrpc_flavor2name_base(__u32 flvr);
+char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf,
+                               char *buf, int bufsize);
+char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize);
+char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize);
  
  static inline
  struct ptlrpc_sec_policy *sptlrpc_policy_get(struct ptlrpc_sec_policy *policy)
@@ -672,7 +716,7 @@ void sptlrpc_request_out_callback(struct ptlrpc_request *req);
   */
  int sptlrpc_import_sec_adapt(struct obd_import *imp,
                               struct ptlrpc_svc_ctx *ctx,
-                             __u16 rpc_flavor);
+                             struct sptlrpc_flavor *flvr);
  struct ptlrpc_sec *sptlrpc_import_sec_ref(struct obd_import *imp);
  void sptlrpc_import_sec_put(struct obd_import *imp);
  
@@ -737,15 +781,23 @@ void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc);
  int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req,
                            struct ptlrpc_bulk_desc *desc);
  int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req,
-                                 int nob, obd_count pg_count,
-                                 struct brw_page **pga);
+                                 struct ptlrpc_bulk_desc *desc,
+                                 int nob);
  int sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req,
                                    struct ptlrpc_bulk_desc *desc);
+int sptlrpc_svc_prep_bulk(struct ptlrpc_request *req,
+                          struct ptlrpc_bulk_desc *desc);
  int sptlrpc_svc_wrap_bulk(struct ptlrpc_request *req,
                            struct ptlrpc_bulk_desc *desc);
  int sptlrpc_svc_unwrap_bulk(struct ptlrpc_request *req,
                              struct ptlrpc_bulk_desc *desc);
  
+/* bulk helpers (internal use only by policies) */
+int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg,
+                              void *buf, int buflen);
+
+int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset);
+
  /* user descriptor helpers */
  static inline int sptlrpc_user_desc_size(int ngroups)
  {
@@ -756,18 +808,6 @@ int sptlrpc_current_user_desc_size(void);
  int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset);
  int sptlrpc_unpack_user_desc(struct lustre_msg *msg, int offset);
  
-/* bulk helpers (internal use only by policies) */
-int bulk_sec_desc_size(__u8 hash_alg, int request, int read);
-int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset);
-
-int bulk_csum_cli_request(struct ptlrpc_bulk_desc *desc, int read,
-                          __u32 alg, struct lustre_msg *rmsg, int roff);
-int bulk_csum_cli_reply(struct ptlrpc_bulk_desc *desc, int read,
-                        struct lustre_msg *rmsg, int roff,
-                        struct lustre_msg *vmsg, int voff);
-int bulk_csum_svc(struct ptlrpc_bulk_desc *desc, int read,
-                  struct ptlrpc_bulk_sec_desc *bsdv, int vsize,
-                  struct ptlrpc_bulk_sec_desc *bsdr, int rsize);
  
  #define CFS_CAP_CHOWN_MASK (1 << CFS_CAP_CHOWN)
  #define CFS_CAP_SYS_RESOURCE_MASK (1 << CFS_CAP_SYS_RESOURCE)
diff --git a/lustre/include/obd.h b/lustre/include/obd.h

index 12daa90..f4ee771 100644 (file)
--- a/lustre/include/obd.h
+++ b/lustre/include/obd.h
@@ -648,6 +648,7 @@ struct lov_qos {
  };
  
  struct lov_tgt_desc {
+        struct list_head    ltd_kill;
          struct obd_uuid     ltd_uuid;
          struct obd_export  *ltd_exp;
          struct ltd_qos      ltd_qos;     /* qos info per target */
@@ -857,6 +858,8 @@ static inline void oti_free_cookies(struct obd_trans_info *oti)
   * Events signalled through obd_notify() upcall-chain.
   */
  enum obd_notify_event {
+        /* Device connect start */
+        OBD_NOTIFY_CONNECT,
          /* Device activated */
          OBD_NOTIFY_ACTIVE,
          /* Device deactivated */
@@ -1075,9 +1078,6 @@ struct obd_device {
          struct lu_ref          obd_reference;
  };
  
-#define OBD_OPT_FORCE           0x0001
-#define OBD_OPT_FAILOVER        0x0002
-
  #define OBD_LLOG_FL_SENDNOW     0x0001
  
  enum obd_cleanup_stage {
@@ -1111,7 +1111,7 @@ enum obd_cleanup_stage {
  #define KEY_CLEAR_FS            "clear_fs"
  #define KEY_BLOCKSIZE           "blocksize"
  #define KEY_BLOCKSIZE_BITS      "blocksize_bits"
-#define KEY_FIEMAP              "FIEMAP"
+#define KEY_FIEMAP              "fiemap"
  #define KEY_SPTLRPC_CONF        "sptlrpc_conf"
  #define KEY_MGSSEC              "mgssec"
  /* XXX unused ?*/
@@ -1217,7 +1217,7 @@ struct obd_ops {
           * granted by the target, which are guaranteed to be a subset of flags
           * asked for. If @ocd == NULL, use default parameters. */
          int (*o_connect)(const struct lu_env *env,
-                         struct lustre_handle *conn, struct obd_device *src,
+                         struct obd_export **exp, struct obd_device *src,
                           struct obd_uuid *cluuid, struct obd_connect_data *ocd,
                           void *localdata);
          int (*o_reconnect)(const struct lu_env *env,
@@ -1361,15 +1361,6 @@ struct obd_ops {
           * Also, add a wrapper function in include/linux/obd_class.h. */
  };
  
-/* TODO: lmv_stripe_md should contain mds capabilities for all slave fids */
-struct lmv_stripe_md {
-        __u32         mea_magic;
-        __u32         mea_count;
-        __u32         mea_master;
-        __u32         mea_padding;
-        struct lu_fid mea_ids[0];
-};
-
  enum {
          LUSTRE_OPC_MKDIR    = (1 << 0),
          LUSTRE_OPC_SYMLINK  = (1 << 1),
diff --git a/lustre/include/obd_class.h b/lustre/include/obd_class.h

index 1bc75e1..b47d60b 100644 (file)
--- a/lustre/include/obd_class.h
+++ b/lustre/include/obd_class.h
@@ -206,9 +206,18 @@ int class_connect(struct lustre_handle *conn, struct obd_device *obd,
  int class_disconnect(struct obd_export *exp);
  void class_fail_export(struct obd_export *exp);
  void class_disconnect_exports(struct obd_device *obddev);
-int class_disconnect_stale_exports(struct obd_device *,
-                                    int (*test_export)(struct obd_export *));
  int class_manual_cleanup(struct obd_device *obd);
+int class_disconnect_stale_exports(struct obd_device *,
+                                   int (*test_export)(struct obd_export *),
+                                   enum obd_option flags);
+  
+static inline enum obd_option exp_flags_from_obd(struct obd_device *obd)
+{
+        return ((obd->obd_fail ? OBD_OPT_FAILOVER : 0) |
+                (obd->obd_force ? OBD_OPT_FORCE : 0) |
+                (obd->obd_abort_recovery ? OBD_OPT_ABORT_RECOV : 0) |
+                0);
+}
  
  void obdo_cpy_md(struct obdo *dst, struct obdo *src, obd_flag valid);
  void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj);
@@ -813,7 +822,7 @@ static inline struct obd_uuid *obd_get_uuid(struct obd_export *exp)
  }
  
  static inline int obd_connect(const struct lu_env *env,
-                              struct lustre_handle *conn,struct obd_device *obd,
+                              struct obd_export **exp,struct obd_device *obd,
                                struct obd_uuid *cluuid,
                                struct obd_connect_data *d,
                                void *localdata)
@@ -827,7 +836,7 @@ static inline int obd_connect(const struct lu_env *env,
          OBD_CHECK_DT_OP(obd, connect, -EOPNOTSUPP);
          OBD_COUNTER_INCREMENT(obd, connect);
  
-        rc = OBP(obd, connect)(env, conn, obd, cluuid, d, localdata);
+        rc = OBP(obd, connect)(env, exp, obd, cluuid, d, localdata);
          /* check that only subset is granted */
          LASSERT(ergo(d != NULL,
                       (d->ocd_connect_flags & ocf) == d->ocd_connect_flags));
diff --git a/lustre/kernel_patches/patches/md-mmp-unplug-dev-sles10.patch b/lustre/kernel_patches/patches/md-mmp-unplug-dev-sles10.patch

new file mode 100644 (file)

index 0000000..8bfdef3
--- /dev/null
+++ b/lustre/kernel_patches/patches/md-mmp-unplug-dev-sles10.patch
@@ -0,0 +1,22 @@
+Index: linux-2.6.16.60-0.33/drivers/md/raid5.c
+===================================================================
+--- linux-2.6.16.60-0.33.orig/drivers/md/raid5.c
++++ linux-2.6.16.60-0.33/drivers/md/raid5.c
+@@ -900,6 +900,8 @@ static int add_stripe_bio(struct stripe_
+               bi->bi_next = *bip;
+       *bip = bi;
+       bi->bi_phys_segments ++;
++      if (bio_sync(bi) && !forwrite)
++              clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); /* force to read from disk. */
+       spin_unlock_irq(&conf->device_lock);
+       spin_unlock(&sh->lock);
+ 
+@@ -1617,6 +1619,8 @@ static int make_request (request_queue_t
+               bi->bi_end_io(bi, bytes, 0);
+       }
+       spin_unlock_irq(&conf->device_lock);
++      if (bio_sync(bi))
++              raid5_unplug_device(q);
+       return 0;
+ }
+ 
diff --git a/lustre/kernel_patches/patches/md-mmp-unplug-dev.patch b/lustre/kernel_patches/patches/md-mmp-unplug-dev.patch

new file mode 100644 (file)

index 0000000..0334abd
--- /dev/null
+++ b/lustre/kernel_patches/patches/md-mmp-unplug-dev.patch
@@ -0,0 +1,22 @@
+Index: linux-2.6.22.14/drivers/md/raid5.c
+===================================================================
+--- linux-2.6.22.14.orig/drivers/md/raid5.c
++++ linux-2.6.22.14/drivers/md/raid5.c
+@@ -1268,6 +1268,8 @@ static int add_stripe_bio(struct stripe_
+               bi->bi_next = *bip;
+       *bip = bi;
+       bi->bi_phys_segments ++;
++      if (bio_sync(bi) && !forwrite)
++              clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); /* force to read from disk. */
+       spin_unlock_irq(&conf->device_lock);
+       spin_unlock(&sh->lock);
+ 
+@@ -2972,6 +2974,8 @@ static int make_request(request_queue_t 
+                             test_bit(BIO_UPTODATE, &bi->bi_flags)
+                               ? 0 : -EIO);
+       }
++      if (bio_sync(bi))
++              raid5_unplug_device(q);
+       return 0;
+ }
+ 
diff --git a/lustre/kernel_patches/series/2.6-rhel5.series b/lustre/kernel_patches/series/2.6-rhel5.series

index 0fc2b97..97060fd 100644 (file)
--- a/lustre/kernel_patches/series/2.6-rhel5.series
+++ b/lustre/kernel_patches/series/2.6-rhel5.series
@@ -21,3 +21,4 @@ md-rebuild-policy.patch
  md-soft-lockups.patch
  jbd-journal-chksum-2.6.18-vanilla.patch
  quota-large-limits-rhel5.patch
+md-mmp-unplug-dev.patch
diff --git a/lustre/kernel_patches/series/2.6-sles10.series b/lustre/kernel_patches/series/2.6-sles10.series

index 070f943..dc85e99 100644 (file)
--- a/lustre/kernel_patches/series/2.6-sles10.series
+++ b/lustre/kernel_patches/series/2.6-sles10.series
@@ -16,3 +16,4 @@ proc-sleep-2.6.16-sles10.patch
  export-nr_free_buffer_pages.patch 
  fmode-exec-2.6-sles10.patch
  quota-large-limits-sles10.patch
+md-mmp-unplug-dev-sles10.patch
diff --git a/lustre/kernel_patches/series/2.6.22-vanilla.series b/lustre/kernel_patches/series/2.6.22-vanilla.series

index fe32803..6fad0bd 100644 (file)
--- a/lustre/kernel_patches/series/2.6.22-vanilla.series
+++ b/lustre/kernel_patches/series/2.6.22-vanilla.series
@@ -12,3 +12,4 @@ export-2.6.18-vanilla.patch
  export-show_task-2.6.18-vanilla.patch 
  sd_iostats-2.6.22-vanilla.patch
  quota-large-limits-rhel5.patch
+md-mmp-unplug-dev.patch
diff --git a/lustre/lclient/glimpse.c b/lustre/lclient/glimpse.c

index 78acee6..ed81f15 100644 (file)
--- a/lustre/lclient/glimpse.c
+++ b/lustre/lclient/glimpse.c
@@ -118,11 +118,17 @@ int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io,
                          *descr = whole_file;
                          descr->cld_obj   = clob;
                          descr->cld_mode  = CLM_PHANTOM;
-                        /* The lockreq for glimpse should be mandatory,
-                         * otherwise, osc may decide to use lockless */
-                        io->ci_lockreq = CILR_MANDATORY;
                          cio->cui_glimpse = 1;
-                        lock = cl_lock_request(env, io, descr, CEF_ASYNC,
+                        /*
+                         * CEF_ASYNC is used because glimpse sub-locks cannot
+                         * deadlock (because they never conflict with other
+                         * locks) and, hence, can be enqueued out-of-order.
+                         *
+                         * CEF_MUST protects glimpse lock from conversion into
+                         * a lockless mode.
+                         */
+                        lock = cl_lock_request(env, io, descr,
+                                               CEF_ASYNC|CEF_MUST,
                                                 "glimpse", cfs_current());
                          cio->cui_glimpse = 0;
                          if (!IS_ERR(lock)) {
diff --git a/lustre/lclient/lcommon_cl.c b/lustre/lclient/lcommon_cl.c

index 6b56b4e..d68ba37 100644 (file)
--- a/lustre/lclient/lcommon_cl.c
+++ b/lustre/lclient/lcommon_cl.c
@@ -635,7 +635,7 @@ int ccc_lock_fits_into(const struct lu_env *env,
  }
  
  /**
- * Implements cl_lock_operations::clo_state() method for vvp layer, invoked
+ * Implements cl_lock_operations::clo_state() method for ccc layer, invoked
   * whenever lock state changes. Transfers object attributes, that might be
   * updated as a result of lock acquiring into inode.
   */
diff --git a/lustre/ldlm/Makefile.am b/lustre/ldlm/Makefile.am

index 600c679..57897f2 100644 (file)
--- a/lustre/ldlm/Makefile.am
+++ b/lustre/ldlm/Makefile.am
@@ -39,7 +39,7 @@
  #
  
  MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
-DIST_SOURCES = ldlm_extent.c ldlm_flock.c ldlm_internal.h ldlm_lib.c \
+EXTRA_DIST = ldlm_extent.c ldlm_flock.c ldlm_internal.h ldlm_lib.c \
         ldlm_lock.c ldlm_lockd.c ldlm_plain.c ldlm_request.c         \
         ldlm_resource.c l_lock.c ldlm_inodebits.c ldlm_pool.c        \
         interval_tree.c
diff --git a/lustre/ldlm/ldlm_internal.h b/lustre/ldlm/ldlm_internal.h

index 5ff07e7..c01a702 100644 (file)
--- a/lustre/ldlm/ldlm_internal.h
+++ b/lustre/ldlm/ldlm_internal.h
@@ -79,8 +79,6 @@ int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, ldlm_sync_t sync,
                      int flags);
  int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels,
                            int count, int max, int cancel_flags, int flags);
-int ldlm_cancel_lru_estimate(struct ldlm_namespace *ns, int count, int max,
-                             int flags);
  extern int ldlm_enqueue_min;
  int ldlm_get_enq_timeout(struct ldlm_lock *lock);
  
diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c

index a71aa3d..b400a27 100644 (file)
--- a/lustre/ldlm/ldlm_lib.c
+++ b/lustre/ldlm/ldlm_lib.c
@@ -385,27 +385,29 @@ int client_obd_cleanup(struct obd_device *obddev)
  
  /* ->o_connect() method for client side (OSC and MDC and MGC) */
  int client_connect_import(const struct lu_env *env,
-                          struct lustre_handle *dlm_handle,
+                          struct obd_export **exp,
                            struct obd_device *obd, struct obd_uuid *cluuid,
                            struct obd_connect_data *data, void *localdata)
  {
          struct client_obd *cli = &obd->u.cli;
          struct obd_import *imp = cli->cl_import;
-        struct obd_export *exp;
          struct obd_connect_data *ocd;
          struct ldlm_namespace *to_be_freed = NULL;
+        struct lustre_handle conn = { 0 };
          int rc;
          ENTRY;
  
+        *exp = NULL;
          down_write(&cli->cl_sem);
-        rc = class_connect(dlm_handle, obd, cluuid);
+        rc = class_connect(&conn, obd, cluuid);
          if (rc)
                  GOTO(out_sem, rc);
  
+        *exp = class_conn2export(&conn);
+
          cli->cl_conn_count++;
          if (cli->cl_conn_count > 1)
                  GOTO(out_sem, rc);
-        exp = class_conn2export(dlm_handle);
  
          if (obd->obd_namespace != NULL)
                  CERROR("already have namespace!\n");
@@ -415,7 +417,7 @@ int client_connect_import(const struct lu_env *env,
          if (obd->obd_namespace == NULL)
                  GOTO(out_disco, rc = -ENOMEM);
  
-        imp->imp_dlm_handle = *dlm_handle;
+        imp->imp_dlm_handle = conn;
          rc = ptlrpc_init_import(imp);
          if (rc != 0)
                  GOTO(out_ldlm, rc);
@@ -431,7 +433,7 @@ int client_connect_import(const struct lu_env *env,
                  LASSERT (imp->imp_state == LUSTRE_IMP_DISCON);
                  GOTO(out_ldlm, rc);
          }
-        LASSERT(exp->exp_connection);
+        LASSERT((*exp)->exp_connection);
  
          if (data) {
                  LASSERTF((ocd->ocd_connect_flags & data->ocd_connect_flags) ==
@@ -451,9 +453,8 @@ out_ldlm:
                  obd->obd_namespace = NULL;
  out_disco:
                  cli->cl_conn_count--;
-                class_disconnect(exp);
-        } else {
-                class_export_put(exp);
+                class_disconnect(*exp);
+                *exp = NULL;
          }
  out_sem:
          up_write(&cli->cl_sem);
@@ -513,7 +514,13 @@ int client_disconnect_export(struct obd_export *exp)
                  to_be_freed = obd->obd_namespace;
          }
  
+        /*
+         * there's no need to hold sem while disconnecting an import,
+         * and actually it may cause deadlock in gss.
+         */
+        up_write(&cli->cl_sem);
          rc = ptlrpc_disconnect_import(imp, 0);
+        down_write(&cli->cl_sem);
  
          ptlrpc_invalidate_import(imp);
          /* set obd_namespace to NULL only after invalidate, because we can have
@@ -545,11 +552,12 @@ int client_disconnect_export(struct obd_export *exp)
   * from old lib/target.c
   * -------------------------------------------------------------------------- */
  
-int target_handle_reconnect(struct lustre_handle *conn, struct obd_export *exp,
-                            struct obd_uuid *cluuid, int initial_conn)
+static int target_handle_reconnect(struct lustre_handle *conn,
+                                   struct obd_export *exp,
+                                   struct obd_uuid *cluuid)
  {
          ENTRY;
-        if (exp->exp_connection && exp->exp_imp_reverse && !initial_conn) {
+        if (exp->exp_connection && exp->exp_imp_reverse) {
                  struct lustre_handle *hdl;
                  hdl = &exp->exp_imp_reverse->imp_remote_handle;
                  /* Might be a re-connect after a partition. */
@@ -611,7 +619,7 @@ int target_handle_connect(struct ptlrpc_request *req)
          struct obd_uuid remote_uuid;
          char *str;
          int rc = 0;
-        int initial_conn = 0;
+        int mds_conn = 0;
          struct obd_connect_data *data, *tmpdata;
          lnet_nid_t *client_nid = NULL;
          ENTRY;
@@ -717,17 +725,27 @@ int target_handle_connect(struct ptlrpc_request *req)
                  }
          }
  
-        if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_INITIAL)
-                initial_conn = 1;
+        if ((lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_INITIAL) &&
+            (data->ocd_connect_flags & OBD_CONNECT_MDS))
+                mds_conn = 1;
  
          /* lctl gets a backstage, all-access pass. */
          if (obd_uuid_equals(&cluuid, &target->obd_uuid))
                  goto dont_check_exports;
  
-        spin_lock(&target->obd_dev_lock);
          export = lustre_hash_lookup(target->obd_uuid_hash, &cluuid);
  
-        if (export != NULL && export->exp_connecting) { /* bug 9635, et. al. */
+        if (export != NULL && mds_conn) {
+                /* mds reconnected after failover */
+                class_fail_export(export);
+                CWARN("%s: received MDS connection from NID %s,"
+                      " removing former export from NID %s\n",
+                      target->obd_name, libcfs_nid2str(req->rq_peer.nid),
+                      libcfs_nid2str(export->exp_connection->c_peer.nid));
+                class_export_put(export);
+                export = NULL;
+                rc = 0;
+        } else if (export != NULL && export->exp_connecting) { /* bug 9635, et. al. */
                  CWARN("%s: exp %p already connecting\n",
                        export->exp_obd->obd_name, export);
                  class_export_put(export);
@@ -735,25 +753,14 @@ int target_handle_connect(struct ptlrpc_request *req)
                  rc = -EALREADY;
          } else if (export != NULL && export->exp_connection != NULL &&
                     req->rq_peer.nid != export->exp_connection->c_peer.nid) {
-                /* make darn sure this is coming from the same peer
-                 * if the UUIDs matched */
-                if (data && data->ocd_connect_flags & OBD_CONNECT_MDS) {
-                        /* the MDS UUID can be reused, don't need to wait
-                         * for the export to be evicted */
-                        CWARN("%s: received MDS connection from a new NID %s,"
-                              " removing former export from NID %s\n",
-                            target->obd_name,
-                            libcfs_nid2str(req->rq_peer.nid),
-                            libcfs_nid2str(export->exp_connection->c_peer.nid));
-                        class_fail_export(export);
-                } else {
-                        CWARN("%s: cookie %s seen on new NID %s when "
-                              "existing NID %s is already connected\n",
-                            target->obd_name, cluuid.uuid,
-                            libcfs_nid2str(req->rq_peer.nid),
-                            libcfs_nid2str(export->exp_connection->c_peer.nid));
-                        rc = -EALREADY;
-                }
+                /* in mds failover the uuid is static but the nid can be
+                 * changed */
+                CWARN("%s: cookie %s seen on new NID %s when "
+                      "existing NID %s is already connected\n",
+                      target->obd_name, cluuid.uuid,
+                      libcfs_nid2str(req->rq_peer.nid),
+                      libcfs_nid2str(export->exp_connection->c_peer.nid));
+                rc = -EALREADY;
                  class_export_put(export);
                  export = NULL;
          } else if (export != NULL) {
@@ -761,15 +768,13 @@ int target_handle_connect(struct ptlrpc_request *req)
                  export->exp_connecting = 1;
                  spin_unlock(&export->exp_lock);
                  class_export_put(export);
-                spin_unlock(&target->obd_dev_lock);
                  LASSERT(export->exp_obd == target);
  
-                rc = target_handle_reconnect(&conn, export, &cluuid, initial_conn);
+                rc = target_handle_reconnect(&conn, export, &cluuid);
          }
  
          /* If we found an export, we already unlocked. */
          if (!export) {
-                spin_unlock(&target->obd_dev_lock);
                  OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_CONNECT, 2 * obd_timeout);
          } else if (req->rq_export == NULL &&
                     atomic_read(&export->exp_rpc_count) > 0) {
@@ -785,18 +790,13 @@ int target_handle_connect(struct ptlrpc_request *req)
                        libcfs_nid2str(req->rq_peer.nid),
                        export, atomic_read(&export->exp_rpc_count));
                  GOTO(out, rc = -EBUSY);
-        } else if (lustre_msg_get_conn_cnt(req->rq_reqmsg) == 1 &&
-                   !initial_conn) {
+        } else if (lustre_msg_get_conn_cnt(req->rq_reqmsg) == 1) {
                  CERROR("%s: NID %s (%s) reconnected with 1 conn_cnt; "
                         "cookies not random?\n", target->obd_name,
                         libcfs_nid2str(req->rq_peer.nid), cluuid.uuid);
                  GOTO(out, rc = -EALREADY);
          } else {
                  OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_RECONNECT, 2 * obd_timeout);
-                if (req->rq_export == NULL && initial_conn)
-                       export->exp_last_request_time =
-                               max(export->exp_last_request_time,
-                                   (time_t)cfs_time_current_sec());
          }
  
          if (rc < 0) {
@@ -849,12 +849,17 @@ int target_handle_connect(struct ptlrpc_request *req)
                  } else {
  dont_check_exports:
                          rc = obd_connect(req->rq_svc_thread->t_env,
-                                         &conn, target, &cluuid, data,
+                                         &export, target, &cluuid, data,
                                           client_nid);
+                        if (rc == 0)
+                                conn.cookie = export->exp_handle.h_cookie;
                  }
          } else {
                  rc = obd_reconnect(req->rq_svc_thread->t_env,
                                     export, target, &cluuid, data, client_nid);
+                if (rc == 0)
+                        /* previously done via class_conn2export */
+                        class_export_get(export);
          }
          if (rc)
                  GOTO(out, rc);
@@ -872,15 +877,6 @@ dont_check_exports:
  
          lustre_msg_set_handle(req->rq_repmsg, &conn);
  
-        /* ownership of this export ref transfers to the request AFTER we
-         * drop any previous reference the request had, but we don't want
-         * that to go to zero before we get our new export reference. */
-        export = class_conn2export(&conn);
-        if (!export) {
-                DEBUG_REQ(D_ERROR, req, "Missing export!");
-                GOTO(out, rc = -ENODEV);
-        }
-
          /* If the client and the server are the same node, we will already
           * have an export that really points to the client's DLM export,
           * because we have a shared handles table.
@@ -894,9 +890,7 @@ dont_check_exports:
          req->rq_export = export;
  
          spin_lock(&export->exp_lock);
-        if (initial_conn) {
-                lustre_msg_set_conn_cnt(req->rq_repmsg, export->exp_conn_cnt + 1);
-        } else if (export->exp_conn_cnt >= lustre_msg_get_conn_cnt(req->rq_reqmsg)) {
+        if (export->exp_conn_cnt >= lustre_msg_get_conn_cnt(req->rq_reqmsg)) {
                  spin_unlock(&export->exp_lock);
                  CERROR("%s: %s already connected at higher conn_cnt: %d > %d\n",
                         cluuid.uuid, libcfs_nid2str(req->rq_peer.nid),
@@ -1003,8 +997,7 @@ dont_check_exports:
          else
                  revimp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
  
-        rc = sptlrpc_import_sec_adapt(revimp, req->rq_svc_ctx,
-                                      req->rq_flvr.sf_rpc);
+        rc = sptlrpc_import_sec_adapt(revimp, req->rq_svc_ctx, &req->rq_flvr);
          if (rc) {
                  CERROR("Failed to get sec for reverse import: %d\n", rc);
                  export->exp_imp_reverse = NULL;
@@ -1676,7 +1669,9 @@ static int target_recovery_thread(void *arg)
                         "evict them\n", obd->obd_connected_clients,
                         obd->obd_max_recoverable_clients);
                  obd->obd_abort_recovery = obd->obd_stopping;
-                class_disconnect_stale_exports(obd, connect_done);
+                class_disconnect_stale_exports(obd, connect_done, 
+                                               exp_flags_from_obd(obd) | 
+                                               OBD_OPT_ABORT_RECOV);
          }
          /* next stage: replay requests */
          delta = jiffies;
@@ -1706,7 +1701,9 @@ static int target_recovery_thread(void *arg)
          if (obd->obd_abort_recovery) {
                  CDEBUG(D_ERROR, "req replay timed out, aborting ...\n");
                  obd->obd_abort_recovery = obd->obd_stopping;
-                class_disconnect_stale_exports(obd, req_replay_done);
+                class_disconnect_stale_exports(obd, req_replay_done, 
+                                               exp_flags_from_obd(obd) | 
+                                               OBD_OPT_ABORT_RECOV);
                  abort_req_replay_queue(obd);
          }
  
@@ -1731,7 +1728,9 @@ static int target_recovery_thread(void *arg)
                  int stale;
                  CERROR("lock replay timed out, aborting ...\n");
                  obd->obd_abort_recovery = obd->obd_stopping;
-                stale = class_disconnect_stale_exports(obd, lock_replay_done);
+                stale = class_disconnect_stale_exports(obd, lock_replay_done, 
+                                                       exp_flags_from_obd(obd) | 
+                                                       OBD_OPT_ABORT_RECOV);
                  abort_lock_replay_queue(obd);
          }
  
diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c

index c0c566a..b3b36a8 100644 (file)
--- a/lustre/ldlm/ldlm_lock.c
+++ b/lustre/ldlm/ldlm_lock.c
@@ -187,8 +187,8 @@ int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock)
                  struct ldlm_namespace *ns = lock->l_resource->lr_namespace;
                  LASSERT(lock->l_resource->lr_type != LDLM_FLOCK);
                  list_del_init(&lock->l_lru);
+                LASSERT(ns->ns_nr_unused > 0);
                  ns->ns_nr_unused--;
-                LASSERT(ns->ns_nr_unused >= 0);
                  rc = 1;
          }
          return rc;
@@ -998,11 +998,16 @@ static struct ldlm_lock *search_queue(struct list_head *queue,
          return NULL;
  }
  
-void ldlm_lock_allow_match(struct ldlm_lock *lock)
+void ldlm_lock_allow_match_locked(struct ldlm_lock *lock)
  {
-        lock_res_and_lock(lock);
          lock->l_flags |= LDLM_FL_LVB_READY;
          cfs_waitq_signal(&lock->l_waitq);
+}
+
+void ldlm_lock_allow_match(struct ldlm_lock *lock)
+{
+        lock_res_and_lock(lock);
+        ldlm_lock_allow_match_locked(lock);
          unlock_res_and_lock(lock);
  }
  
@@ -1211,7 +1216,7 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns,
          struct ldlm_interval *node = NULL;
          ENTRY;
  
-        do_gettimeofday(&lock->l_enqueued_time);
+        lock->l_last_activity = cfs_time_current_sec();
          /* policies are not executed on the client or during replay */
          if ((*flags & (LDLM_FL_HAS_INTENT|LDLM_FL_REPLAY)) == LDLM_FL_HAS_INTENT
              && !local && ns->ns_policy) {
diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c

index b849987..58875a9 100644 (file)
--- a/lustre/ldlm/ldlm_lockd.c
+++ b/lustre/ldlm/ldlm_lockd.c
@@ -336,7 +336,7 @@ repeat:
                  lock->l_resource->lr_namespace->ns_timeouts++;
                  LDLM_ERROR(lock, "lock callback timer expired after %lds: "
                             "evicting client at %s ",
-                           cfs_time_current_sec()- lock->l_enqueued_time.tv_sec,
+                           cfs_time_current_sec()- lock->l_last_activity,
                             libcfs_nid2str(
                                     lock->l_export->exp_connection->c_peer.nid));
  
@@ -391,7 +391,7 @@ static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, int seconds)
  
          if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT) ||
              OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT))
-                seconds = 2;
+                seconds = 1;
  
          timeout = cfs_time_shift(seconds);
          if (likely(cfs_time_after(timeout, lock->l_callback_timeout)))
@@ -795,7 +795,6 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data)
          struct ldlm_cb_set_arg *arg = data;
          struct ldlm_request    *body;
          struct ptlrpc_request  *req;
-        struct timeval          granted_time;
          long                    total_enqueue_wait;
          int                     instant_cancel = 0;
          int                     rc = 0;
@@ -804,14 +803,13 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data)
          LASSERT(lock != NULL);
          LASSERT(data != NULL);
  
-        do_gettimeofday(&granted_time);
-        total_enqueue_wait = cfs_timeval_sub(&granted_time,
-                                             &lock->l_enqueued_time, NULL);
+        total_enqueue_wait = cfs_time_sub(cfs_time_current_sec(),
+                                          lock->l_last_activity);
  
-        if (total_enqueue_wait / ONE_MILLION > obd_timeout)
+        if (total_enqueue_wait > obd_timeout)
                  /* non-fatal with AT - change to LDLM_DEBUG? */
-                LDLM_ERROR(lock, "enqueue wait took %luus from "CFS_TIME_T,
-                           total_enqueue_wait, lock->l_enqueued_time.tv_sec);
+                LDLM_WARN(lock, "enqueue wait took %lus from "CFS_TIME_T,
+                          total_enqueue_wait, lock->l_last_activity);
  
          req = ptlrpc_request_alloc(lock->l_export->exp_imp_reverse,
                                      &RQF_LDLM_CP_CALLBACK);
@@ -848,13 +846,13 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data)
                  unlock_res_and_lock(lock);
          }
  
-        LDLM_DEBUG(lock, "server preparing completion AST (after %ldus wait)",
+        LDLM_DEBUG(lock, "server preparing completion AST (after %lds wait)",
                     total_enqueue_wait);
  
          /* Server-side enqueue wait time estimate, used in
              __ldlm_add_waiting_lock to set future enqueue timers */
          at_add(&lock->l_resource->lr_namespace->ns_at_estimate,
-               total_enqueue_wait / ONE_MILLION);
+               total_enqueue_wait);
  
          ptlrpc_request_set_replen(req);
  
@@ -867,6 +865,8 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data)
          lock_res_and_lock(lock);
          if (lock->l_flags & LDLM_FL_AST_SENT) {
                  body->lock_flags |= LDLM_FL_AST_SENT;
+                /* copy ast flags like LDLM_FL_DISCARD_DATA */
+                body->lock_flags |= (lock->l_flags & LDLM_AST_FLAGS);
  
                  /* We might get here prior to ldlm_handle_enqueue setting
                   * LDLM_FL_CANCEL_ON_BLOCK flag. Then we will put this lock
@@ -1090,7 +1090,7 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns,
          if (!lock)
                  GOTO(out, rc = -ENOMEM);
  
-        do_gettimeofday(&lock->l_enqueued_time);
+        lock->l_last_activity = cfs_time_current_sec();
          lock->l_remote_handle = dlm_req->lock_handle[0];
          LDLM_DEBUG(lock, "server-side enqueue handler, new lock created");
  
@@ -1303,7 +1303,7 @@ int ldlm_handle_convert0(struct ptlrpc_request *req,
  
                  LDLM_DEBUG(lock, "server-side convert handler START");
  
-                do_gettimeofday(&lock->l_enqueued_time);
+                lock->l_last_activity = cfs_time_current_sec();
                  res = ldlm_lock_convert(lock, dlm_req->lock_desc.l_req_mode,
                                          &dlm_rep->lock_flags);
                  if (res) {
@@ -1812,7 +1812,7 @@ static int ldlm_callback_handler(struct ptlrpc_request *req)
                  RETURN(0);
          }
  
-        if ((lock->l_flags & LDLM_FL_FAIL_LOC) && 
+        if ((lock->l_flags & LDLM_FL_FAIL_LOC) &&
              lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK)
                  OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE);
  
@@ -2435,8 +2435,8 @@ int __init ldlm_init(void)
                  return -ENOMEM;
  
          ldlm_lock_slab = cfs_mem_cache_create("ldlm_locks",
-                                           sizeof(struct ldlm_lock), 0,
-                                           SLAB_HWCACHE_ALIGN);
+                                      sizeof(struct ldlm_lock), 0,
+                                      SLAB_HWCACHE_ALIGN | SLAB_DESTROY_BY_RCU);
          if (ldlm_lock_slab == NULL) {
                  cfs_mem_cache_destroy(ldlm_resource_slab);
                  return -ENOMEM;
@@ -2492,6 +2492,7 @@ EXPORT_SYMBOL(ldlm_lock_dump);
  EXPORT_SYMBOL(ldlm_lock_dump_handle);
  EXPORT_SYMBOL(ldlm_cancel_locks_for_export);
  EXPORT_SYMBOL(ldlm_reprocess_all_ns);
+EXPORT_SYMBOL(ldlm_lock_allow_match_locked);
  EXPORT_SYMBOL(ldlm_lock_allow_match);
  EXPORT_SYMBOL(ldlm_lock_downgrade);
  EXPORT_SYMBOL(ldlm_lock_convert);
diff --git a/lustre/ldlm/ldlm_pool.c b/lustre/ldlm/ldlm_pool.c

index 09b9590..54c0cf5 100644 (file)
--- a/lustre/ldlm/ldlm_pool.c
+++ b/lustre/ldlm/ldlm_pool.c
@@ -381,13 +381,12 @@ static int ldlm_srv_pool_shrink(struct ldlm_pool *pl,
                                  int nr, unsigned int gfp_mask)
  {
          __u32 limit;
-        ENTRY;
  
          /*
           * VM is asking how many entries may be potentially freed.
           */
          if (nr == 0)
-                RETURN(atomic_read(&pl->pl_granted));
+                return atomic_read(&pl->pl_granted);
  
          /*
           * Client already canceled locks but server is already in shrinker
@@ -427,7 +426,7 @@ static int ldlm_srv_pool_shrink(struct ldlm_pool *pl,
           * We did not really free any memory here so far, it only will be
           * freed later may be, so that we return 0 to not confuse VM.
           */
-        RETURN(0);
+        return 0;
  }
  
  /**
@@ -508,7 +507,7 @@ static int ldlm_cli_pool_recalc(struct ldlm_pool *pl)
           * It may be called when SLV has changed much, this is why we do not
           * take into account pl->pl_recalc_time here.
           */
-        RETURN(ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LDLM_ASYNC,
+        RETURN(ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LDLM_SYNC, 
                                 LDLM_CANCEL_LRUR));
  }
  
@@ -520,12 +519,15 @@ static int ldlm_cli_pool_recalc(struct ldlm_pool *pl)
  static int ldlm_cli_pool_shrink(struct ldlm_pool *pl,
                                  int nr, unsigned int gfp_mask)
  {
-        ENTRY;
+        struct ldlm_namespace *ns;
+        int canceled = 0, unused;
+
+        ns = ldlm_pl2ns(pl);
  
          /*
           * Do not cancel locks in case lru resize is disabled for this ns.
           */
-        if (!ns_connect_lru_resize(ldlm_pl2ns(pl)))
+        if (!ns_connect_lru_resize(ns))
                  RETURN(0);
  
          /*
@@ -533,19 +535,22 @@ static int ldlm_cli_pool_shrink(struct ldlm_pool *pl,
           */
          ldlm_cli_pool_pop_slv(pl);
  
+        spin_lock(&ns->ns_unused_lock);
+        unused = ns->ns_nr_unused;
+        spin_unlock(&ns->ns_unused_lock);
+        
+        if (nr) {
+                canceled = ldlm_cancel_lru(ns, nr, LDLM_SYNC, 
+                                           LDLM_CANCEL_SHRINK);
+        }
+#ifdef __KERNEL__
          /*
-         * Find out how many locks may be released according to shrink
-         * policy.
-         */
-        if (nr == 0)
-                RETURN(ldlm_cancel_lru_estimate(ldlm_pl2ns(pl), 0, 0,
-                                                LDLM_CANCEL_SHRINK));
-
-        /*
-         * Cancel @nr locks accoding to shrink policy.
+         * Return the number of potentially reclaimable locks.
           */
-        RETURN(ldlm_cancel_lru(ldlm_pl2ns(pl), nr, LDLM_SYNC,
-                               LDLM_CANCEL_SHRINK));
+        return ((unused - canceled) / 100) * sysctl_vfs_cache_pressure;
+#else
+        return unused - canceled;
+#endif
  }
  
  struct ldlm_pool_ops ldlm_srv_pool_ops = {
diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c

index 872ad36..b6fb1d1 100644 (file)
--- a/lustre/ldlm/ldlm_request.c
+++ b/lustre/ldlm/ldlm_request.c
@@ -80,9 +80,9 @@ int ldlm_expired_completion_wait(void *data)
                  LDLM_ERROR(lock, "lock timed out (enqueued at "CFS_TIME_T", "
                             CFS_DURATION_T"s ago); not entering recovery in "
                             "server code, just going back to sleep",
-                           lock->l_enqueued_time.tv_sec,
+                           lock->l_last_activity,
                             cfs_time_sub(cfs_time_current_sec(),
-                           lock->l_enqueued_time.tv_sec));
+                           lock->l_last_activity));
                  if (cfs_time_after(cfs_time_current(), next_dump)) {
                          last_dump = next_dump;
                          next_dump = cfs_time_shift(300);
@@ -99,9 +99,8 @@ int ldlm_expired_completion_wait(void *data)
          ptlrpc_fail_import(imp, lwd->lwd_conn_cnt);
          LDLM_ERROR(lock, "lock timed out (enqueued at "CFS_TIME_T", "
                    CFS_DURATION_T"s ago), entering recovery for %s@%s",
-                  lock->l_enqueued_time.tv_sec,
-                  cfs_time_sub(cfs_time_current_sec(),
-                  lock->l_enqueued_time.tv_sec),
+                  lock->l_last_activity,
+                  cfs_time_sub(cfs_time_current_sec(), lock->l_last_activity),
                    obd2cli_tgt(obd), imp->imp_connection->c_remote_uuid.uuid);
  
          RETURN(0);
@@ -136,7 +135,7 @@ static int ldlm_completion_tail(struct ldlm_lock *lock)
                  result = -EIO;
          } else {
                  delay = cfs_time_sub(cfs_time_current_sec(),
-                                     lock->l_enqueued_time.tv_sec);
+                                     lock->l_last_activity);
                  LDLM_DEBUG(lock, "client-side enqueue: granted after "
                             CFS_DURATION_T"s", delay);
  
@@ -1314,65 +1313,6 @@ static int ldlm_cancel_list(struct list_head *cancels, int count, int flags)
  }
  
  /**
- * Callback function for shrink policy. Makes decision whether to keep
- * \a lock in LRU for current \a LRU size \a unused, added in current scan
- * \a added and number of locks to be preferably canceled \a count.
- *
- * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning
- *
- * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU
- */
-static ldlm_policy_res_t ldlm_cancel_shrink_policy(struct ldlm_namespace *ns,
-                                                   struct ldlm_lock *lock,
-                                                   int unused, int added,
-                                                   int count)
-{
-        int lock_cost;
-        __u64 page_nr;
-
-        /*
-         * Stop lru processing when we reached passed @count or checked all
-         * locks in lru.
-         */
-        if (count && added >= count)
-                return LDLM_POLICY_KEEP_LOCK;
-
-        if (lock->l_resource->lr_type == LDLM_EXTENT) {
-                if (lock->l_weigh_ast) {
-                        /*
-                         * For liblustre, l_weigh_ast should return 0 since it
-                         * don't cache pages
-                         */
-                        page_nr = lock->l_weigh_ast(lock);
-                } else {
-                        struct ldlm_extent *l_extent;
-
-                        /*
-                         * For all extent locks cost is 1 + number of pages in
-                         * their extent.
-                         */
-                        l_extent = &lock->l_policy_data.l_extent;
-                        page_nr = l_extent->end - l_extent->start;
-                        do_div(page_nr, CFS_PAGE_SIZE);
-                }
-                lock_cost = 1 + page_nr;
-        } else {
-                /*
-                 * For all locks which are not extent ones cost is 1
-                 */
-                lock_cost = 1;
-        }
-
-        /*
-         * Keep all expensive locks in lru for the memory pressure time
-         * cancel policy. They anyways may be canceled by lru resize
-         * pplicy if they have not small enough CLV.
-         */
-        return lock_cost > ns->ns_shrink_thumb ?
-                LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK;
-}
-
-/**
   * Callback function for lru-resize policy. Makes decision whether to keep
   * \a lock in LRU for current \a LRU size \a unused, added in current scan
   * \a added and number of locks to be preferably canceled \a count.
@@ -1495,7 +1435,8 @@ ldlm_cancel_lru_policy(struct ldlm_namespace *ns, int flags)
  {
          if (ns_connect_lru_resize(ns)) {
                  if (flags & LDLM_CANCEL_SHRINK)
-                        return ldlm_cancel_shrink_policy;
+                        /* Cancel the passed number of old locks. */
+                        return ldlm_cancel_passed_policy;
                  else if (flags & LDLM_CANCEL_LRUR)
                          return ldlm_cancel_lrur_policy;
                  else if (flags & LDLM_CANCEL_PASSED)
@@ -1647,61 +1588,6 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels,
          RETURN(ldlm_cancel_list(cancels, added, cancel_flags));
  }
  
-/* Returns number of locks which could be canceled next time when
- * ldlm_cancel_lru() is called. Used from locks pool shrinker. */
-int ldlm_cancel_lru_estimate(struct ldlm_namespace *ns,
-                             int count, int max, int flags)
-{
-        struct list_head disp = CFS_LIST_HEAD_INIT(disp);
-        ldlm_cancel_lru_policy_t pf;
-        struct ldlm_lock *lock;
-        int added = 0, unused;
-        int loop_stop = 0;
-        ENTRY;
-
-        pf = ldlm_cancel_lru_policy(ns, flags);
-        LASSERT(pf != NULL);
-        spin_lock(&ns->ns_unused_lock);
-        unused = ns->ns_nr_unused;
-        list_splice_init(&ns->ns_unused_list, &disp);
-        while (!list_empty(&disp)) {
-                lock = list_entry(disp.next, struct ldlm_lock, l_lru);
-                list_move_tail(&lock->l_lru, &ns->ns_unused_list);
-
-                /* For any flags, stop scanning if @max is reached. */
-                if (max && added >= max)
-                        break;
-
-                /* Somebody is already doing CANCEL or there is a
-                 * blocking request will send cancel. Let's not count
-                 * this lock. */
-                if ((lock->l_flags & LDLM_FL_CANCELING) ||
-                    (lock->l_flags & LDLM_FL_BL_AST))
-                        continue;
-
-                LDLM_LOCK_GET(lock);
-                spin_unlock(&ns->ns_unused_lock);
-                lu_ref_add(&lock->l_reference, __FUNCTION__, cfs_current());
-
-                /* Pass the lock through the policy filter and see if it
-                 * should stay in lru. */
-                if (pf(ns, lock, unused, added, count) == LDLM_POLICY_KEEP_LOCK)
-                        loop_stop = 1;
-
-                lu_ref_del(&lock->l_reference, __FUNCTION__, cfs_current());
-                LDLM_LOCK_RELEASE(lock);
-                spin_lock(&ns->ns_unused_lock);
-                if (loop_stop)
-                        break;
-
-                added++;
-                unused--;
-        }
-        list_splice(&disp, ns->ns_unused_list.prev);
-        spin_unlock(&ns->ns_unused_lock);
-        RETURN(added);
-}
-
  /* when called with LDLM_ASYNC the blocking callback will be handled
   * in a thread and this function will return after the thread has been
   * asked to call the callback.  when called with LDLM_SYNC the blocking
@@ -1723,8 +1609,8 @@ int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, ldlm_sync_t sync,
                          RETURN(count);
          }
  
-        /* If an error occured in ASYNC mode, or
-         * this is SYNC mode, cancel the list. */
+        /* If an error occurred in ASYNC mode, or this is SYNC mode,
+         * cancel the list. */
          ldlm_cli_cancel_list(&cancels, count, NULL, 0);
          RETURN(count);
  }
diff --git a/lustre/ldlm/ldlm_resource.c b/lustre/ldlm/ldlm_resource.c

index 320a870..c04d948 100644 (file)
--- a/lustre/ldlm/ldlm_resource.c
+++ b/lustre/ldlm/ldlm_resource.c
@@ -266,13 +266,6 @@ void ldlm_proc_namespace(struct ldlm_namespace *ns)
                  lock_vars[0].write_fptr = lprocfs_wr_lru_size;
                  lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
  
-                snprintf(lock_name, MAX_STRING_SIZE, "%s/shrink_thumb",
-                         ns->ns_name);
-                lock_vars[0].data = ns;
-                lock_vars[0].read_fptr = lprocfs_rd_uint;
-                lock_vars[0].write_fptr = lprocfs_wr_uint;
-                lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0);
-
                  snprintf(lock_name, MAX_STRING_SIZE, "%s/lru_max_age",
                           ns->ns_name);
                  lock_vars[0].data = &ns->ns_max_age;
@@ -342,7 +335,6 @@ struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name,
          if (!ns->ns_hash)
                  GOTO(out_ns, NULL);
  
-        ns->ns_shrink_thumb = LDLM_LOCK_SHRINK_THUMB;
          ns->ns_appetite = apt;
  
          LASSERT(obd != NULL);
diff --git a/lustre/liblustre/llite_lib.c b/lustre/liblustre/llite_lib.c

index 232ce2b..5a34a82 100644 (file)
--- a/lustre/liblustre/llite_lib.c
+++ b/lustre/liblustre/llite_lib.c
@@ -94,7 +94,6 @@ int liblustre_process_log(struct config_llog_instance *cfg,
          struct lustre_cfg *lcfg;
          char  *peer = "MGS_UUID";
          struct obd_device *obd;
-        struct lustre_handle mgc_conn = {0, };
          struct obd_export *exp;
          char  *name = "mgc_dev";
          class_uuid_t uuid;
@@ -184,15 +183,13 @@ int liblustre_process_log(struct config_llog_instance *cfg,
  #endif
          ocd->ocd_version = LUSTRE_VERSION_CODE;
  
-        rc = obd_connect(NULL, &mgc_conn, obd, &mgc_uuid, ocd, NULL);
+        rc = obd_connect(NULL, &exp, obd, &mgc_uuid, ocd, NULL);
          if (rc) {
                  CERROR("cannot connect to %s at %s: rc = %d\n",
                         LUSTRE_MGS_OBDNAME, mgsnid, rc);
                  GOTO(out_cleanup, rc);
          }
  
-        exp = class_conn2export(&mgc_conn);
-
          ctxt = llog_get_context(exp->exp_obd, LLOG_CONFIG_REPL_CTXT);
          cfg->cfg_flags |= CFG_F_COMPAT146;
          rc = class_config_parse_llog(ctxt, profile, cfg);
diff --git a/lustre/liblustre/super.c b/lustre/liblustre/super.c

index a410d2f..5f4c017 100644 (file)
--- a/lustre/liblustre/super.c
+++ b/lustre/liblustre/super.c
@@ -1937,8 +1937,6 @@ llu_fsswop_mount(const char *source,
          struct obd_statfs osfs;
          static struct qstr noname = { NULL, 0, 0 };
          struct ptlrpc_request *request = NULL;
-        struct lustre_handle md_conn = {0, };
-        struct lustre_handle dt_conn = {0, };
          struct lustre_md md;
          class_uuid_t uuid;
          struct config_llog_instance cfg = {0, };
@@ -2026,12 +2024,11 @@ llu_fsswop_mount(const char *source,
          ocd.ocd_version = LUSTRE_VERSION_CODE;
  
          /* setup mdc */
-        err = obd_connect(NULL, &md_conn, obd, &sbi->ll_sb_uuid, &ocd, NULL);
+        err = obd_connect(NULL, &sbi->ll_md_exp, obd, &sbi->ll_sb_uuid, &ocd, NULL);
          if (err) {
                  CERROR("cannot connect to %s: rc = %d\n", mdc, err);
                  GOTO(out_free, err);
          }
-        sbi->ll_md_exp = class_conn2export(&md_conn);
  
          err = obd_statfs(obd, &osfs, 100000000, 0);
          if (err)
@@ -2057,12 +2054,11 @@ llu_fsswop_mount(const char *source,
                                  OBD_CONNECT_VERSION | OBD_CONNECT_TRUNCLOCK |
                                  OBD_CONNECT_FID | OBD_CONNECT_AT;
          ocd.ocd_version = LUSTRE_VERSION_CODE;
-        err = obd_connect(NULL, &dt_conn, obd, &sbi->ll_sb_uuid, &ocd, NULL);
+        err = obd_connect(NULL, &sbi->ll_dt_exp, obd, &sbi->ll_sb_uuid, &ocd, NULL);
          if (err) {
                  CERROR("cannot connect to %s: rc = %d\n", osc, err);
                  GOTO(out_md, err);
          }
-        sbi->ll_dt_exp = class_conn2export(&dt_conn);
          sbi->ll_lco.lco_flags = ocd.ocd_connect_flags;
          sbi->ll_lco.lco_md_exp = sbi->ll_md_exp;
          sbi->ll_lco.lco_dt_exp = sbi->ll_dt_exp;
diff --git a/lustre/llite/Makefile.in b/lustre/llite/Makefile.in

index 848c26b..09689d2 100644 (file)
--- a/lustre/llite/Makefile.in
+++ b/lustre/llite/Makefile.in
@@ -8,4 +8,8 @@ lustre-objs += vvp_dev.o vvp_page.o vvp_lock.o vvp_io.o vvp_object.o
  
  llite_lloop-objs := lloop.o
  
+EXTRA_DIST := $(lustre-objs:.o=.c) llite_internal.h rw26.c super25.c
+EXTRA_DIST += $(llite_lloop-objs:.o=.c)
+EXTRA_DIST += vvp_internal.h
+
  @INCLUDE_RULES@
diff --git a/lustre/llite/autoMakefile.am b/lustre/llite/autoMakefile.am

index d5d1c10..391a8f6 100644 (file)
--- a/lustre/llite/autoMakefile.am
+++ b/lustre/llite/autoMakefile.am
@@ -38,7 +38,4 @@ if MODULES
  modulefs_DATA = lustre$(KMODEXT) llite_lloop$(KMODEXT)
  endif
  
-DIST_SOURCES := $(lustre-objs:.o=.c) llite_internal.h rw26.c super25.c 
-DIST_SOURCES += $(llite_lloop-objs:.o=.c)
-DIST_SOURCES += vvp_internal.h
  MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
diff --git a/lustre/llite/dcache.c b/lustre/llite/dcache.c

index bafb293..31fdff5 100644 (file)
--- a/lustre/llite/dcache.c
+++ b/lustre/llite/dcache.c
@@ -496,6 +496,13 @@ do_lock:
                  if (rc != -ESTALE) {
                          CDEBUG(D_INFO, "ll_intent_lock: rc %d : it->it_status "
                                 "%d\n", rc, it->d.lustre.it_status);
+                } else {
+#ifndef HAVE_VFS_INTENT_PATCHES
+                        if (it_disposition(it, DISP_OPEN_OPEN) &&
+                            !it_open_error(DISP_OPEN_OPEN, it))
+                                /* server has a valid open - close the file first */
+                                ll_release_openhandle(de, it);
+#endif
                  }
                  GOTO(out, rc = 0);
          }
@@ -763,7 +770,7 @@ int ll_revalidate_nd(struct dentry *dentry, struct nameidata *nd)
   * nd->intent.open.file for error, so we need to return it as lookup's result
   * instead */
                                  if (IS_ERR(filp))
-                                        rc = 0;
+                                        rc = PTR_ERR(filp);
  #endif
                          }
  #else
diff --git a/lustre/llite/file.c b/lustre/llite/file.c

index 9850774..cfa7f03 100644 (file)
--- a/lustre/llite/file.c
+++ b/lustre/llite/file.c
@@ -696,15 +696,14 @@ out_openerr:
          return rc;
  }
  
-/* Fills the obdo with the attributes for the inode defined by lsm */
-int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
+/* Fills the obdo with the attributes for the lsm */
+static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
+                          struct obd_capa *capa, struct obdo *obdo)
  {
          struct ptlrpc_request_set *set;
-        struct ll_inode_info *lli = ll_i2info(inode);
-        struct lov_stripe_md *lsm = lli->lli_smd;
+        struct obd_info            oinfo = { { { 0 } } };
+        int                        rc;
  
-        struct obd_info oinfo = { { { 0 } } };
-        int rc;
          ENTRY;
  
          LASSERT(lsm != NULL);
@@ -719,32 +718,44 @@ int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
                                 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
                                 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
                                 OBD_MD_FLGROUP;
-        oinfo.oi_capa = ll_mdscapa_get(inode);
+        oinfo.oi_capa = capa;
  
          set = ptlrpc_prep_set();
          if (set == NULL) {
                  CERROR("can't allocate ptlrpc set\n");
                  rc = -ENOMEM;
          } else {
-                rc = obd_getattr_async(ll_i2dtexp(inode), &oinfo, set);
+                rc = obd_getattr_async(exp, &oinfo, set);
                  if (rc == 0)
                          rc = ptlrpc_set_wait(set);
                  ptlrpc_set_destroy(set);
          }
-        capa_put(oinfo.oi_capa);
-        if (rc)
-                RETURN(rc);
+        if (rc == 0)
+                oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
+                                         OBD_MD_FLATIME | OBD_MD_FLMTIME |
+                                         OBD_MD_FLCTIME | OBD_MD_FLSIZE);
+        RETURN(rc);
+}
  
-        oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
-                                 OBD_MD_FLATIME | OBD_MD_FLMTIME |
-                                 OBD_MD_FLCTIME | OBD_MD_FLSIZE);
+/* Fills the obdo with the attributes for the inode defined by lsm */
+int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
+{
+        struct ll_inode_info *lli  = ll_i2info(inode);
+        struct obd_capa      *capa = ll_mdscapa_get(inode);
+        int rc;
+        ENTRY;
  
-        obdo_refresh_inode(inode, oinfo.oi_oa, oinfo.oi_oa->o_valid);
-        CDEBUG(D_INODE, "objid "LPX64" size %Lu, blocks %llu, blksize %lu\n",
-               lli->lli_smd->lsm_object_id, i_size_read(inode),
-               (unsigned long long)inode->i_blocks,
-               (unsigned long)ll_inode_blksize(inode));
-        RETURN(0);
+        rc = ll_lsm_getattr(lli->lli_smd, ll_i2dtexp(inode), capa, obdo);
+        capa_put(capa);
+        if (rc == 0) {
+                obdo_refresh_inode(inode, obdo, obdo->o_valid);
+                CDEBUG(D_INODE,
+                       "objid "LPX64" size %Lu, blocks %llu, blksize %lu\n",
+                       lli->lli_smd->lsm_object_id, i_size_read(inode),
+                       (unsigned long long)inode->i_blocks,
+                       (unsigned long)ll_inode_blksize(inode));
+        }
+        RETURN(rc);
  }
  
  int ll_merge_lvb(struct inode *inode)
@@ -773,8 +784,18 @@ int ll_merge_lvb(struct inode *inode)
  int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
                       lstat_t *st)
  {
-        /* XXX */
-        return -ENOSYS;
+        struct obdo obdo = { 0 };
+        int rc;
+
+        rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo);
+        if (rc == 0) {
+                st->st_size   = obdo.o_size;
+                st->st_blocks = obdo.o_blocks;
+                st->st_mtime  = obdo.o_mtime;
+                st->st_atime  = obdo.o_atime;
+                st->st_ctime  = obdo.o_ctime;
+        }
+        return rc;
  }
  
  void ll_io_init(struct cl_io *io, const struct file *file, int write)
diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h

index a03e1bf..9576150 100644 (file)
--- a/lustre/llite/llite_internal.h
+++ b/lustre/llite/llite_internal.h
@@ -635,7 +635,6 @@ extern ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
                                     struct lustre_handle *lockh);
  int ll_file_open(struct inode *inode, struct file *file);
  int ll_file_release(struct inode *inode, struct file *file);
-int ll_lsm_getattr(struct obd_export *, struct lov_stripe_md *, struct obdo *);
  int ll_glimpse_ioctl(struct ll_sb_info *sbi,
                       struct lov_stripe_md *lsm, lstat_t *st);
  int ll_local_open(struct file *file,
@@ -1216,4 +1215,25 @@ static inline int cl_merge_lvb(struct inode *inode)
  
  struct obd_capa *cl_capa_lookup(struct inode *inode, enum cl_req_type crt);
  
+/** direct write pages */
+struct ll_dio_pages {
+        /** page array to be written. we don't support
+         * partial pages except the last one. */
+        struct page **ldp_pages;
+        /* offset of each page */
+        loff_t       *ldp_offsets;
+        /** if ldp_offsets is NULL, the pages to be written are
+         * sequential, and this is the file offset of the
+         * first page. */
+        loff_t        ldp_start_offset;
+        /** how many bytes are to be written. */
+        size_t        ldp_size;
+        /** # of pages in the array. */
+        int           ldp_nr;
+};
+
+extern ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io,
+                                  int rw, struct inode *inode,
+                                  struct ll_dio_pages *pv);
+
  #endif /* LLITE_INTERNAL_H */
diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c

index 7532fa8..2f20b80 100644 (file)
--- a/lustre/llite/llite_lib.c
+++ b/lustre/llite/llite_lib.c
@@ -166,8 +166,6 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt)
          struct obd_capa *oc = NULL;
          struct obd_statfs osfs;
          struct ptlrpc_request *request = NULL;
-        struct lustre_handle dt_conn = {0, };
-        struct lustre_handle md_conn = {0, };
          struct obd_connect_data *data = NULL;
          struct obd_uuid *uuid;
          struct lustre_md lmd;
@@ -232,7 +230,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt)
          if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
                  data->ocd_connect_flags |= OBD_CONNECT_RMT_CLIENT_FORCE;
  
-        err = obd_connect(NULL, &md_conn, obd, &sbi->ll_sb_uuid, data, NULL);
+        err = obd_connect(NULL, &sbi->ll_md_exp, obd, &sbi->ll_sb_uuid, data, NULL);
          if (err == -EBUSY) {
                  LCONSOLE_ERROR_MSG(0x14f, "An MDT (md %s) is performing "
                                     "recovery, of which this client is not a "
@@ -243,7 +241,6 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt)
                  CERROR("cannot connect to %s: rc = %d\n", md, err);
                  GOTO(out, err);
          }
-        sbi->ll_md_exp = class_conn2export(&md_conn);
  
          err = obd_fid_init(sbi->ll_md_exp);
          if (err) {
@@ -372,7 +369,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt)
          obd->obd_upcall.onu_upcall = cl_ocd_update;
          data->ocd_brw_size = PTLRPC_MAX_BRW_PAGES << CFS_PAGE_SHIFT;
  
-        err = obd_connect(NULL, &dt_conn, obd, &sbi->ll_sb_uuid, data, NULL);
+        err = obd_connect(NULL, &sbi->ll_dt_exp, obd, &sbi->ll_sb_uuid, data, NULL);
          if (err == -EBUSY) {
                  LCONSOLE_ERROR_MSG(0x150, "An OST (dt %s) is performing "
                                     "recovery, of which this client is not a "
@@ -384,8 +381,6 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt)
                  GOTO(out_md_fid, err);
          }
  
-        sbi->ll_dt_exp = class_conn2export(&dt_conn);
-
          err = obd_fid_init(sbi->ll_dt_exp);
          if (err) {
                  CERROR("Can't init data layer FID infrastructure, "
diff --git a/lustre/llite/lloop.c b/lustre/llite/lloop.c

index 05026f1..f3a4410 100644 (file)
--- a/lustre/llite/lloop.c
+++ b/lustre/llite/lloop.c
@@ -42,9 +42,6 @@
   * Copyright 1993 by Theodore Ts'o.  Redistribution of this file is
   * permitted under the GNU General Public License.
   *
- * DES encryption plus some minor changes by Werner Almesberger, 30-MAY-1993
- * more DES encryption plus IDEA encryption by Nicholas J. Leon, June 20, 1996
- *
   * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994
   * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996
   *
@@ -56,10 +53,6 @@
   *
   * Loadable modules and other fixes by AK, 1998
   *
- * Make real block number available to downstream transfer functions, enables
- * CBC (and relatives) mode encryption requiring unique IVs per data block.
- * Reed H. Petty, rhp@draper.net
- *
   * Maximum number of loop devices now dynamic via max_loop module parameter.
   * Russell Kroll <rkroll@exploits.org> 19990701
   *
@@ -129,37 +122,40 @@ enum {
  };
  
  struct lloop_device {
-        int                lo_number;
-        int                lo_refcnt;
-        loff_t             lo_offset;
-        loff_t             lo_sizelimit;
-        int                lo_flags;
+        int                  lo_number;
+        int                  lo_refcnt;
+        loff_t               lo_offset;
+        loff_t               lo_sizelimit;
+        int                  lo_flags;
          int                (*ioctl)(struct lloop_device *, int cmd,
-                                 unsigned long arg);
+                                    unsigned long arg);
  
-        struct file *      lo_backing_file;
+        struct file         *lo_backing_file;
          struct block_device *lo_device;
-        unsigned           lo_blocksize;
+        unsigned             lo_blocksize;
  
-        int                old_gfp_mask;
+        int                  old_gfp_mask;
  
-        spinlock_t         lo_lock;
-        struct bio         *lo_bio;
-        struct bio         *lo_biotail;
-        int                lo_state;
-        struct semaphore   lo_sem;
-        struct semaphore   lo_ctl_mutex;
-        struct semaphore   lo_bh_mutex;
-        atomic_t           lo_pending;
+        spinlock_t           lo_lock;
+        struct bio          *lo_bio;
+        struct bio          *lo_biotail;
+        int                  lo_state;
+        struct semaphore     lo_sem;
+        struct semaphore     lo_ctl_mutex;
+        atomic_t             lo_pending;
+        wait_queue_head_t    lo_bh_wait;
  
-        request_queue_t    *lo_queue;
+        request_queue_t     *lo_queue;
+
+        const struct lu_env *lo_env;
+        struct cl_io         lo_io;
+        struct ll_dio_pages  lo_pvec;
  
          /* data to handle bio for lustre. */
          struct lo_request_data {
-                struct brw_page    lrd_pages[LLOOP_MAX_SEGMENTS];
-                struct obdo        lrd_oa;
+                struct page *lrd_pages[LLOOP_MAX_SEGMENTS];
+                loff_t       lrd_offsets[LLOOP_MAX_SEGMENTS];
          } lo_requests[1];
-
  };
  
  /*
@@ -170,7 +166,8 @@ enum {
  };
  
  static int lloop_major;
-static int max_loop = 8;
+#define MAX_LOOP_DEFAULT  16
+static int max_loop = MAX_LOOP_DEFAULT;
  static struct lloop_device *loop_dev;
  static struct gendisk **disks;
  static struct semaphore lloop_mutex;
@@ -194,63 +191,88 @@ static loff_t get_loop_size(struct lloop_device *lo, struct file *file)
          return loopsize >> 9;
  }
  
-static int do_bio_filebacked(struct lloop_device *lo, struct bio *bio)
+static int do_bio_lustrebacked(struct lloop_device *lo, struct bio *head)
  {
-        struct inode *inode = lo->lo_backing_file->f_dentry->d_inode;
-        struct ll_inode_info *lli = ll_i2info(inode);
-        struct lov_stripe_md *lsm = lli->lli_smd;
-        struct obd_info oinfo = {{{ 0 }}};
-        struct brw_page *pg = lo->lo_requests[0].lrd_pages;
-        struct obdo *oa = &lo->lo_requests[0].lrd_oa;
-        pgoff_t offset;
-        int ret, cmd, i, opc;
-        struct bio_vec *bvec;
-
-        BUG_ON(bio->bi_hw_segments > LLOOP_MAX_SEGMENTS);
-
-        offset = (pgoff_t)(bio->bi_sector << 9) + lo->lo_offset;
-        bio_for_each_segment(bvec, bio, i) {
-                BUG_ON(bvec->bv_offset != 0);
-                BUG_ON(bvec->bv_len != CFS_PAGE_SIZE);
-
-                pg->pg = bvec->bv_page;
-                pg->off = offset;
-                pg->count = bvec->bv_len;
-                pg->flag = OBD_BRW_SRVLOCK;
-
-                pg++;
-                offset += bvec->bv_len;
+        const struct lu_env  *env   = lo->lo_env;
+        struct cl_io         *io    = &lo->lo_io;
+        struct inode         *inode = lo->lo_backing_file->f_dentry->d_inode;
+        struct cl_object     *obj = ll_i2info(inode)->lli_clob;
+        pgoff_t               offset;
+        int                   ret;
+        int                   i;
+        int                   rw;
+        obd_count             page_count = 0;
+        struct bio_vec       *bvec;
+        struct bio           *bio;
+        ssize_t               bytes;
+
+        struct ll_dio_pages  *pvec = &lo->lo_pvec;
+        struct page         **pages = pvec->ldp_pages;
+        loff_t               *offsets = pvec->ldp_offsets;
+
+        truncate_inode_pages(inode->i_mapping, 0);
+
+        /* initialize the IO */
+        memset(io, 0, sizeof(*io));
+        io->ci_obj = obj;
+        ret = cl_io_init(env, io, CIT_MISC, obj);
+        if (ret)
+                return io->ci_result;
+        io->ci_lockreq = CILR_NEVER;
+
+        LASSERT(head != NULL);
+        rw = head->bi_rw;
+        for (bio = head; bio != NULL; bio = bio->bi_next) {
+                LASSERT(rw == bio->bi_rw);
+
+                offset = (pgoff_t)(bio->bi_sector << 9) + lo->lo_offset;
+                bio_for_each_segment(bvec, bio, i) {
+                        BUG_ON(bvec->bv_offset != 0);
+                        BUG_ON(bvec->bv_len != CFS_PAGE_SIZE);
+
+                        pages[page_count] = bvec->bv_page;
+                        offsets[page_count] = offset;
+                        page_count++;
+                        offset += bvec->bv_len;
+                }
+                LASSERT(page_count <= LLOOP_MAX_SEGMENTS);
          }
  
-        oa->o_mode = inode->i_mode;
-        oa->o_id = lsm->lsm_object_id;
-        oa->o_gr = lsm->lsm_object_gr;
-        oa->o_valid = OBD_MD_FLID | OBD_MD_FLMODE |
-                      OBD_MD_FLTYPE |OBD_MD_FLGROUP;
-        obdo_from_inode(oa, inode, OBD_MD_FLFID | OBD_MD_FLGENER);
-
-        cmd = OBD_BRW_READ;
-        if (bio_rw(bio) == WRITE)
-                cmd = OBD_BRW_WRITE;
-
-        if (cmd == OBD_BRW_WRITE)
-                ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_BRW_WRITE, bio->bi_size);
-        else
-                ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_BRW_READ, bio->bi_size);
-        oinfo.oi_oa = oa;
-        oinfo.oi_md = lsm;
-        opc = cmd & OBD_BRW_WRITE ? CAPA_OPC_OSS_WRITE : CAPA_OPC_OSS_RW;
-        oinfo.oi_capa = ll_osscapa_get(inode, opc);
-        ret = obd_brw(cmd, ll_i2dtexp(inode), &oinfo,
-                      (obd_count)(i - bio->bi_idx),
-                      lo->lo_requests[0].lrd_pages, NULL);
-        capa_put(oinfo.oi_capa);
-        if (ret == 0)
-                obdo_to_inode(inode, oa, OBD_MD_FLBLOCKS);
-        return ret;
+        ll_stats_ops_tally(ll_i2sbi(inode),
+                        (rw == WRITE) ? LPROC_LL_BRW_WRITE : LPROC_LL_BRW_READ,
+                        page_count << PAGE_CACHE_SHIFT);
+
+        pvec->ldp_size = page_count << PAGE_CACHE_SHIFT;
+        pvec->ldp_nr = page_count;
+
+        /* FIXME: in ll_direct_rw_pages, it has to allocate many cl_page{}s to
+         * write those pages into OST. Even worse case is that more pages
+         * would be asked to write out to swap space, and then finally get here
+         * again.
+         * Unfortunately this is NOT easy to fix.
+         * Thoughts on solution:
+         * 0. Define a reserved pool for cl_pages, which could be a list of
+         *    pre-allocated cl_pages from cl_page_kmem;
+         * 1. Define a new operation in cl_object_operations{}, says clo_depth,
+         *    which measures how many layers for this lustre object. Generally
+         *    speaking, the depth would be 2, one for llite, and one for lovsub.
+         *    However, for SNS, there will be more since we need additional page
+         *    to store parity;
+         * 2. Reserve the # of (page_count * depth) cl_pages from the reserved
+         *    pool. Afterwards, the clio would allocate the pages from reserved
+         *    pool, this guarantees we needn't allocate the cl_pages from
+         *    generic cl_page slab cache.
+         *    Of course, if there is NOT enough pages in the pool, we might
+         *    be asked to write less pages once, this purely depends on
+         *    implementation. Anyway, we should be careful to avoid deadlocking.
+         */
+        LOCK_INODE_MUTEX(inode);
+        bytes = ll_direct_rw_pages(env, io, rw, inode, pvec);
+        UNLOCK_INODE_MUTEX(inode);
+        cl_io_fini(env, io);
+        return (bytes == pvec->ldp_size) ? 0 : (int)bytes;
  }
  
-
  /*
   * Add bio to back of pending list
   */
@@ -266,41 +288,77 @@ static void loop_add_bio(struct lloop_device *lo, struct bio *bio)
                  lo->lo_bio = lo->lo_biotail = bio;
          spin_unlock_irqrestore(&lo->lo_lock, flags);
  
-        up(&lo->lo_bh_mutex);
+        atomic_inc(&lo->lo_pending);
+        if (waitqueue_active(&lo->lo_bh_wait))
+                wake_up(&lo->lo_bh_wait);
  }
  
  /*
   * Grab first pending buffer
   */
-static struct bio *loop_get_bio(struct lloop_device *lo)
+static unsigned int loop_get_bio(struct lloop_device *lo, struct bio **req)
  {
-        struct bio *bio;
+        struct bio *first;
+        struct bio **bio;
+        unsigned int count = 0;
+        unsigned int page_count = 0;
+        int rw;
  
          spin_lock_irq(&lo->lo_lock);
-        if ((bio = lo->lo_bio)) {
-                if (bio == lo->lo_biotail)
-                        lo->lo_biotail = NULL;
-                lo->lo_bio = bio->bi_next;
-                bio->bi_next = NULL;
+        first = lo->lo_bio;
+        if (unlikely(first == NULL)) {
+                spin_unlock_irq(&lo->lo_lock);
+                return 0;
          }
-        spin_unlock_irq(&lo->lo_lock);
  
-        return bio;
+        /* TODO: need to split the bio, too bad. */
+        LASSERT(first->bi_vcnt <= LLOOP_MAX_SEGMENTS);
+
+        rw = first->bi_rw;
+        bio = &lo->lo_bio;
+        while (*bio && (*bio)->bi_rw == rw) {
+                CDEBUG(D_INFO, "bio sector %llu size %u count %u vcnt%u \n",
+                       (unsigned long long)(*bio)->bi_sector, (*bio)->bi_size,
+                       page_count, (*bio)->bi_vcnt);
+                if (page_count + (*bio)->bi_vcnt > LLOOP_MAX_SEGMENTS)
+                        break;
+
+
+                page_count += (*bio)->bi_vcnt;
+                count++;
+                bio = &(*bio)->bi_next;
+        }
+        if (*bio) {
+                /* Some of the bios can't be merged. */
+                lo->lo_bio = *bio;
+                *bio = NULL;
+        } else {
+                /* Hit the end of queue */
+                lo->lo_biotail = NULL;
+                lo->lo_bio = NULL;
+        }
+        *req = first;
+        spin_unlock_irq(&lo->lo_lock);
+        return count;
  }
  
  static int loop_make_request(request_queue_t *q, struct bio *old_bio)
  {
          struct lloop_device *lo = q->queuedata;
          int rw = bio_rw(old_bio);
+        int inactive;
  
          if (!lo)
-                goto out;
+                goto err;
+
+        CDEBUG(D_INFO, "submit bio sector %llu size %u\n",
+               (unsigned long long)old_bio->bi_sector, old_bio->bi_size);
  
          spin_lock_irq(&lo->lo_lock);
-        if (lo->lo_state != LLOOP_BOUND)
-                goto inactive;
-        atomic_inc(&lo->lo_pending);
+        inactive = (lo->lo_state != LLOOP_BOUND);
          spin_unlock_irq(&lo->lo_lock);
+        if (inactive)
+                goto err;
  
          if (rw == WRITE) {
                  if (lo->lo_flags & LO_FLAGS_READ_ONLY)
@@ -314,14 +372,8 @@ static int loop_make_request(request_queue_t *q, struct bio *old_bio)
          loop_add_bio(lo, old_bio);
          return 0;
  err:
-        if (atomic_dec_and_test(&lo->lo_pending))
-                up(&lo->lo_bh_mutex);
-out:
          bio_io_error(old_bio, old_bio->bi_size);
          return 0;
-inactive:
-        spin_unlock_irq(&lo->lo_lock);
-        goto out;
  }
  
  /*
@@ -338,27 +390,50 @@ static void loop_unplug(request_queue_t *q)
  static inline void loop_handle_bio(struct lloop_device *lo, struct bio *bio)
  {
          int ret;
-        ret = do_bio_filebacked(lo, bio);
-        bio_endio(bio, bio->bi_size, ret);
+        ret = do_bio_lustrebacked(lo, bio);
+        while (bio) {
+                struct bio *tmp = bio->bi_next;
+                bio->bi_next = NULL;
+                bio_endio(bio, bio->bi_size, ret);
+                bio = tmp;
+        }
+}
+
+static inline int loop_active(struct lloop_device *lo)
+{
+        return atomic_read(&lo->lo_pending) || (lo->lo_state == LLOOP_RUNDOWN);
  }
  
  /*
   * worker thread that handles reads/writes to file backed loop devices,
- * to avoid blocking in our make_request_fn. it also does loop decrypting
- * on reads for block backed loop, as that is too heavy to do from
- * b_end_io context where irqs may be disabled.
+ * to avoid blocking in our make_request_fn.
   */
  static int loop_thread(void *data)
  {
          struct lloop_device *lo = data;
          struct bio *bio;
+        unsigned int count;
+        unsigned long times = 0;
+        unsigned long total_count = 0;
+
+        struct lu_env *env;
+        int refcheck;
+        int ret = 0;
  
          daemonize("lloop%d", lo->lo_number);
  
          set_user_nice(current, -20);
  
          lo->lo_state = LLOOP_BOUND;
-        atomic_inc(&lo->lo_pending);
+
+        env = cl_env_get(&refcheck);
+        if (IS_ERR(env))
+                GOTO(out, ret = PTR_ERR(env));
+
+        lo->lo_env = env;
+        memset(&lo->lo_pvec, 0, sizeof(lo->lo_pvec));
+        lo->lo_pvec.ldp_pages   = lo->lo_requests[0].lrd_pages;
+        lo->lo_pvec.ldp_offsets = lo->lo_requests[0].lrd_offsets;
  
          /*
           * up sem, we are running
@@ -366,40 +441,54 @@ static int loop_thread(void *data)
          up(&lo->lo_sem);
  
          for (;;) {
-                down_interruptible(&lo->lo_bh_mutex);
-                /*
-                 * could be upped because of tear-down, not because of
-                 * pending work
-                 */
-                if (!atomic_read(&lo->lo_pending))
-                        break;
+                wait_event(lo->lo_bh_wait, loop_active(lo));
+                if (!atomic_read(&lo->lo_pending)) {
+                        int exiting = 0;
+                        spin_lock_irq(&lo->lo_lock);
+                        exiting = (lo->lo_state == LLOOP_RUNDOWN);
+                        spin_unlock_irq(&lo->lo_lock);
+                        if (exiting)
+                                break;
+                }
  
-                bio = loop_get_bio(lo);
-                if (!bio) {
+                bio = NULL;
+                count = loop_get_bio(lo, &bio);
+                if (!count) {
                          CWARN("lloop(minor: %d): missing bio\n", lo->lo_number);
                          continue;
                  }
-                loop_handle_bio(lo, bio);
  
-                /*
-                 * upped both for pending work and tear-down, lo_pending
-                 * will hit zero then
-                 */
-                if (atomic_dec_and_test(&lo->lo_pending))
-                        break;
+                total_count += count;
+                if (total_count < count) {     /* overflow */
+                        total_count = count;
+                        times = 1;
+                } else {
+                        times++;
+                }
+                if ((times & 127) == 0) {
+                        CDEBUG(D_INFO, "total: %lu, count: %lu, avg: %lu\n",
+                               total_count, times, total_count / times);
+                }
+
+                LASSERT(bio != NULL);
+                LASSERT(count <= atomic_read(&lo->lo_pending));
+                loop_handle_bio(lo, bio);
+                atomic_sub(count, &lo->lo_pending);
          }
+        cl_env_put(env, &refcheck);
  
+out:
          up(&lo->lo_sem);
-        return 0;
+        return ret;
  }
  
  static int loop_set_fd(struct lloop_device *lo, struct file *unused,
                         struct block_device *bdev, struct file *file)
  {
-        struct inode        *inode;
+        struct inode         *inode;
          struct address_space *mapping;
-        int                lo_flags = 0;
-        int                error;
+        int                   lo_flags = 0;
+        int                   error;
          loff_t                size;
  
          if (!try_module_get(THIS_MODULE))
@@ -452,8 +541,10 @@ static int loop_set_fd(struct lloop_device *lo, struct file *unused,
  
          /* queue parameters */
          blk_queue_hardsect_size(lo->lo_queue, CFS_PAGE_SIZE);
-        blk_queue_max_sectors(lo->lo_queue, LLOOP_MAX_SEGMENTS);
+        blk_queue_max_sectors(lo->lo_queue,
+                              LLOOP_MAX_SEGMENTS << (CFS_PAGE_SHIFT - 9));
          blk_queue_max_phys_segments(lo->lo_queue, LLOOP_MAX_SEGMENTS);
+        blk_queue_max_hw_segments(lo->lo_queue, LLOOP_MAX_SEGMENTS);
  
          set_capacity(disks[lo->lo_number], size);
          bd_set_size(bdev, size << 9);
@@ -487,9 +578,8 @@ static int loop_clr_fd(struct lloop_device *lo, struct block_device *bdev,
  
          spin_lock_irq(&lo->lo_lock);
          lo->lo_state = LLOOP_RUNDOWN;
-        if (atomic_dec_and_test(&lo->lo_pending))
-                up(&lo->lo_bh_mutex);
          spin_unlock_irq(&lo->lo_lock);
+        wake_up(&lo->lo_bh_wait);
  
          down(&lo->lo_sem);
          lo->lo_backing_file = NULL;
@@ -533,7 +623,7 @@ static int lo_release(struct inode *inode, struct file *file)
  
  /* lloop device node's ioctl function. */
  static int lo_ioctl(struct inode *inode, struct file *unused,
-        unsigned int cmd, unsigned long arg)
+                    unsigned int cmd, unsigned long arg)
  {
          struct lloop_device *lo = inode->i_bdev->bd_disk->private_data;
          struct block_device *bdev = inode->i_bdev;
@@ -578,12 +668,13 @@ static struct block_device_operations lo_fops = {
  /* dynamic iocontrol callback.
   * This callback is registered in lloop_init and will be called by
   * ll_iocontrol_call.
+ *
   * This is a llite regular file ioctl function. It takes the responsibility
- * of attaching a file, and detaching a file by a lloop's device numner.
+ * of attaching or detaching a file by a lloop's device number.
   */
  static enum llioc_iter lloop_ioctl(struct inode *unused, struct file *file,
-                unsigned int cmd, unsigned long arg,
-                void *magic, int *rcp)
+                                   unsigned int cmd, unsigned long arg,
+                                   void *magic, int *rcp)
  {
          struct lloop_device *lo = NULL;
          struct block_device *bdev = NULL;
@@ -684,25 +775,27 @@ static int __init lloop_init(void)
          };
  
          if (max_loop < 1 || max_loop > 256) {
+                max_loop = MAX_LOOP_DEFAULT;
                  CWARN("lloop: invalid max_loop (must be between"
-                      " 1 and 256), using default (8)\n");
-                max_loop = 8;
+                      " 1 and 256), using default (%u)\n", max_loop);
          }
  
          lloop_major = register_blkdev(0, "lloop");
          if (lloop_major < 0)
                  return -EIO;
  
+        CDEBUG(D_CONFIG, "registered lloop major %d with %u minors\n",
+               lloop_major, max_loop);
+
          ll_iocontrol_magic = ll_iocontrol_register(lloop_ioctl, 2, cmdlist);
          if (ll_iocontrol_magic == NULL)
                  goto out_mem1;
  
-        loop_dev = kmalloc(max_loop * sizeof(struct lloop_device), GFP_KERNEL);
+        OBD_ALLOC_WAIT(loop_dev, max_loop * sizeof(*loop_dev));
          if (!loop_dev)
                  goto out_mem1;
-        memset(loop_dev, 0, max_loop * sizeof(struct lloop_device));
  
-        disks = kmalloc(max_loop * sizeof(struct gendisk *), GFP_KERNEL);
+        OBD_ALLOC_WAIT(disks, max_loop * sizeof(*disks));
          if (!disks)
                  goto out_mem2;
  
@@ -718,14 +811,13 @@ static int __init lloop_init(void)
                  struct lloop_device *lo = &loop_dev[i];
                  struct gendisk *disk = disks[i];
  
-                memset(lo, 0, sizeof(*lo));
                  lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
                  if (!lo->lo_queue)
                          goto out_mem4;
  
                  init_MUTEX(&lo->lo_ctl_mutex);
                  init_MUTEX_LOCKED(&lo->lo_sem);
-                init_MUTEX_LOCKED(&lo->lo_bh_mutex);
+                init_waitqueue_head(&lo->lo_bh_wait);
                  lo->lo_number = i;
                  spin_lock_init(&lo->lo_lock);
                  disk->major = lloop_major;
@@ -748,9 +840,9 @@ out_mem4:
  out_mem3:
          while (i--)
                  put_disk(disks[i]);
-        kfree(disks);
+        OBD_FREE(disks, max_loop * sizeof(*disks));
  out_mem2:
-        kfree(loop_dev);
+        OBD_FREE(loop_dev, max_loop * sizeof(*loop_dev));
  out_mem1:
          unregister_blkdev(lloop_major, "lloop");
          ll_iocontrol_unregister(ll_iocontrol_magic);
@@ -770,9 +862,11 @@ static void lloop_exit(void)
          }
          if (ll_unregister_blkdev(lloop_major, "lloop"))
                  CWARN("lloop: cannot unregister blkdev\n");
+        else
+                CDEBUG(D_CONFIG, "unregistered lloop major %d\n", lloop_major);
  
-        kfree(disks);
-        kfree(loop_dev);
+        OBD_FREE(disks, max_loop * sizeof(*disks));
+        OBD_FREE(loop_dev, max_loop * sizeof(*loop_dev));
  }
  
  module_init(lloop_init);
diff --git a/lustre/llite/rw26.c b/lustre/llite/rw26.c

index 031b1ab..fac56d7 100644 (file)
--- a/lustre/llite/rw26.c
+++ b/lustre/llite/rw26.c
@@ -216,11 +216,9 @@ static void ll_free_user_pages(struct page **pages, int npages, int do_dirty)
          OBD_FREE(pages, npages * sizeof(*pages));
  }
  
-static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io,
-                                   int rw, struct inode *inode,
-                                   struct address_space *mapping,
-                                   size_t size, loff_t file_offset,
-                                   struct page **pages, int page_count)
+ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io,
+                           int rw, struct inode *inode,
+                           struct ll_dio_pages *pv)
  {
          struct cl_page    *clp;
          struct ccc_page   *clup;
@@ -229,8 +227,11 @@ static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io,
          struct cl_sync_io *anchor = &ccc_env_info(env)->cti_sync_io;
          int i;
          ssize_t rc = 0;
-        ssize_t size_orig = size;
-        size_t page_size  = cl_page_size(obj);
+        loff_t file_offset  = pv->ldp_start_offset;
+        size_t size         = pv->ldp_size;
+        int page_count      = pv->ldp_nr;
+        struct page **pages = pv->ldp_pages;
+        size_t page_size    = cl_page_size(obj);
          ENTRY;
  
          cl_sync_io_init(anchor, page_count);
@@ -238,8 +239,11 @@ static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io,
          queue = &io->ci_queue;
          cl_2queue_init(queue);
          for (i = 0; i < page_count; i++) {
+                if (pv->ldp_offsets)
+                    file_offset = pv->ldp_offsets[i];
+                LASSERT(!(file_offset & (page_size - 1)));
                  clp = cl_page_find(env, obj, cl_index(obj, file_offset),
-                                   pages[i], CPT_TRANSIENT);
+                                   pv->ldp_pages[i], CPT_TRANSIENT);
                  if (IS_ERR(clp)) {
                          rc = PTR_ERR(clp);
                          break;
@@ -319,7 +323,7 @@ static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io,
                                  cl_sync_io_note(anchor, +1);
                          /* wait for the IO to be finished. */
                          rc = cl_sync_io_wait(env, io, &queue->c2_qout,
-                                             anchor) ?: size_orig;
+                                             anchor) ?: pv->ldp_size;
                  }
          }
  
@@ -328,6 +332,23 @@ static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io,
          cl_2queue_fini(env, queue);
          RETURN(rc);
  }
+EXPORT_SYMBOL(ll_direct_rw_pages);
+
+static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io,
+                                   int rw, struct inode *inode,
+                                   struct address_space *mapping,
+                                   size_t size, loff_t file_offset,
+                                   struct page **pages, int page_count)
+{
+    struct ll_dio_pages pvec = { .ldp_pages        = pages,
+                                 .ldp_nr           = page_count,
+                                 .ldp_size         = size,
+                                 .ldp_offsets      = NULL,
+                                 .ldp_start_offset = file_offset
+                               };
+
+    return ll_direct_rw_pages(env, io, rw, inode, &pvec);
+}
  
  /* This is the maximum size of a single O_DIRECT request, based on a 128kB
   * kmalloc limit.  We need to fit all of the brw_page structs, each one
diff --git a/lustre/llite/vvp_page.c b/lustre/llite/vvp_page.c

index d199ad6..b698f52 100644 (file)
--- a/lustre/llite/vvp_page.c
+++ b/lustre/llite/vvp_page.c
@@ -243,7 +243,12 @@ static void vvp_page_completion_common(const struct lu_env *env,
          struct cl_sync_io *anchor = cp->cpg_sync_io;
  
          LINVRNT(cl_page_is_vmlocked(env, clp));
-        KLASSERT(!PageWriteback(vmpage));
+
+        /* Don't assert the page writeback bit here because the lustre file
+         * may be used as a backend of swap space. In this case, the page
+         * writeback bit is set by the VM, and obviously we shouldn't clear it.
+         * Fortunately, pages of this type are all TRANSIENT pages. */
+        KLASSERT(ergo(clp->cp_type == CPT_CACHEABLE, !PageWriteback(vmpage)));
  
          vvp_vmpage_error(inode, vmpage, ioret);
  
diff --git a/lustre/lmv/lmv_obd.c b/lustre/lmv/lmv_obd.c

index 7543a8c..f98f511 100644 (file)
--- a/lustre/lmv/lmv_obd.c
+++ b/lustre/lmv/lmv_obd.c
@@ -239,7 +239,7 @@ static int lmv_notify(struct obd_device *obd, struct obd_device *watched,
   * caller that everything is okay. Real connection will be performed later.
   */
  static int lmv_connect(const struct lu_env *env,
-                       struct lustre_handle *conn, struct obd_device *obd,
+                       struct obd_export **exp, struct obd_device *obd,
                         struct obd_uuid *cluuid, struct obd_connect_data *data,
                         void *localdata)
  {
@@ -247,29 +247,30 @@ static int lmv_connect(const struct lu_env *env,
          struct proc_dir_entry *lmv_proc_dir;
  #endif
          struct lmv_obd        *lmv = &obd->u.lmv;
-        struct obd_export     *exp;
+        struct lustre_handle  conn = { 0 };
          int                    rc = 0;
          ENTRY;
  
-        rc = class_connect(conn, obd, cluuid);
-        if (rc) {
-                CERROR("class_connection() returned %d\n", rc);
-                RETURN(rc);
-        }
-
-        exp = class_conn2export(conn);
-
          /*
           * We don't want to actually do the underlying connections more than
           * once, so keep track.
           */
          lmv->refcount++;
          if (lmv->refcount > 1) {
-                class_export_put(exp);
+                *exp = NULL;
                  RETURN(0);
          }
  
-        lmv->exp = exp;
+        rc = class_connect(&conn, obd, cluuid);
+        if (rc) {
+                CERROR("class_connection() returned %d\n", rc);
+                RETURN(rc);
+        }
+
+        *exp = class_conn2export(&conn);
+        class_export_get(*exp);
+
+        lmv->exp = *exp;
          lmv->connected = 0;
          lmv->cluuid = *cluuid;
  
@@ -383,7 +384,6 @@ int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt)
          struct obd_uuid         *cluuid = &lmv->cluuid;
          struct obd_connect_data *mdc_data = NULL;
          struct obd_uuid          lmv_mdc_uuid = { "LMV_MDC_UUID" };
-        struct lustre_handle     conn = {0, };
          struct obd_device       *mdc_obd;
          struct obd_export       *mdc_exp;
          struct lu_fld_target     target;
@@ -407,15 +407,13 @@ int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt)
                  RETURN(-EINVAL);
          }
  
-        rc = obd_connect(NULL, &conn, mdc_obd, &lmv_mdc_uuid,
+        rc = obd_connect(NULL, &mdc_exp, mdc_obd, &lmv_mdc_uuid,
                           &lmv->conn_data, NULL);
          if (rc) {
                  CERROR("target %s connect error %d\n", tgt->ltd_uuid.uuid, rc);
                  RETURN(rc);
          }
  
-        mdc_exp = class_conn2export(&conn);
-
          /*
           * Init fid sequence client for this mdc and add new fld target.
           */
diff --git a/lustre/lov/Makefile.in b/lustre/lov/Makefile.in

index 5a2aad7..59f7c79 100644 (file)
--- a/lustre/lov/Makefile.in
+++ b/lustre/lov/Makefile.in
@@ -1,4 +1,6 @@
  MODULES := lov
  lov-objs := lov_log.o lov_obd.o lov_pack.o lproc_lov.o lov_offset.o lov_merge.o lov_request.o lov_qos.o lov_ea.o lov_dev.o lov_object.o lov_page.o lov_lock.o lov_io.o lovsub_dev.o lovsub_object.o lovsub_page.o lovsub_lock.o lovsub_io.o lov_pool.o
  
+EXTRA_DIST = $(lov-objs:.o=.c) lov_internal.h lov_cl_internal.h
+
  @INCLUDE_RULES@
diff --git a/lustre/lov/autoMakefile.am b/lustre/lov/autoMakefile.am

index e18070c..77c91b0 100644 (file)
--- a/lustre/lov/autoMakefile.am
+++ b/lustre/lov/autoMakefile.am
@@ -84,5 +84,4 @@ endif # MODULES
  
  install-data-hook: $(install_data_hook)
  
-DIST_SOURCES = $(lov-objs:.o=.c) lov_internal.h lov_cl_internal.h
  MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
diff --git a/lustre/lov/lov_cl_internal.h b/lustre/lov/lov_cl_internal.h

index 6a98fbc..98c1270 100644 (file)
--- a/lustre/lov/lov_cl_internal.h
+++ b/lustre/lov/lov_cl_internal.h
@@ -88,14 +88,14 @@
   *       cl_lock::cll_guard, and will be automatically cleared by the sub-lock
   *       when the latter is destroyed. When a sub-lock is canceled, a
   *       reference to it is removed from the top-lock array, and top-lock is
- *       moved into CLS_NEW state. It is guaranteed that all sub-locks exits
+ *       moved into CLS_NEW state. It is guaranteed that all sub-locks exist
   *       while their top-lock is in CLS_HELD or CLS_CACHED states.
   *
   *     - IO's are not reference counted.
   *
   * To implement a connection between top and sub entities, lov layer is split
   * into two pieces: lov ("upper half"), and lovsub ("bottom half"), both
- * implementing full set of cl-interfaces. For example, top-object has clu and
+ * implementing full set of cl-interfaces. For example, top-object has vvp and
   * lov layers, and it's sub-object has lovsub and osc layers. lovsub layer is
   * used to track child-parent relationship.
   *
diff --git a/lustre/lov/lov_internal.h b/lustre/lov/lov_internal.h

index c6c3a69..2aaaff4 100644 (file)
--- a/lustre/lov/lov_internal.h
+++ b/lustre/lov/lov_internal.h
@@ -162,7 +162,7 @@ int lov_stripe_number(struct lov_stripe_md *lsm, obd_off lov_off);
  #define LOV_USES_ASSIGNED_STRIPE        0
  #define LOV_USES_DEFAULT_STRIPE         1
  int qos_add_tgt(struct obd_device *obd, __u32 index);
-int qos_del_tgt(struct obd_device *obd, __u32 index);
+int qos_del_tgt(struct obd_device *obd, struct lov_tgt_desc *tgt);
  void qos_shrink_lsm(struct lov_request_set *set);
  int qos_prep_create(struct obd_export *exp, struct lov_request_set *set);
  void qos_update(struct lov_obd *lov);
@@ -320,5 +320,6 @@ int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname);
  void lov_dump_pool(int level, struct pool_desc *pool);
  struct pool_desc *lov_find_pool(struct lov_obd *lov, char *poolname);
  int lov_check_index_in_pool(__u32 idx, struct pool_desc *pool);
+void lov_pool_putref(struct pool_desc *pool);
  
  #endif
diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c

index 86058ed..00ce37d 100644 (file)
--- a/lustre/lov/lov_obd.c
+++ b/lustre/lov/lov_obd.c
@@ -82,26 +82,43 @@ void lov_getref(struct obd_device *obd)
          return;
  }
  
-static void __lov_del_obd(struct obd_device *obd, __u32 index);
+static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt);
  
  void lov_putref(struct obd_device *obd)
  {
          struct lov_obd *lov = &obd->u.lov;
+
          mutex_down(&lov->lov_lock);
          /* ok to dec to 0 more than once -- ltd_exp's will be null */
          if (atomic_dec_and_test(&lov->lov_refcount) && lov->lov_death_row) {
+                CFS_LIST_HEAD(kill);
                  int i;
+                struct lov_tgt_desc *tgt, *n;
                  CDEBUG(D_CONFIG, "destroying %d lov targets\n",
                         lov->lov_death_row);
                  for (i = 0; i < lov->desc.ld_tgt_count; i++) {
-                        if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_reap)
+                        tgt = lov->lov_tgts[i];
+
+                        if (!tgt || !tgt->ltd_reap)
                                  continue;
-                        /* Disconnect and delete from list */
-                        __lov_del_obd(obd, i);
+                        list_add(&tgt->ltd_kill, &kill);
+                        /* XXX - right now there is a dependency on ld_tgt_count
+                         * being the maximum tgt index for computing the
+                         * mds_max_easize. So we can't shrink it. */
+                        lov_ost_pool_remove(&lov->lov_packed, i);
+                        lov->lov_tgts[i] = NULL;
                          lov->lov_death_row--;
                  }
+                mutex_up(&lov->lov_lock);
+
+                list_for_each_entry_safe(tgt, n, &kill, ltd_kill) {
+                        list_del(&tgt->ltd_kill);
+                        /* Disconnect */
+                        __lov_del_obd(obd, tgt);
+                }
+        } else {
+                mutex_up(&lov->lov_lock);
          }
-        mutex_up(&lov->lov_lock);
  }
  
  static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid,
@@ -118,7 +135,6 @@ int lov_connect_obd(struct obd_device *obd, __u32 index, int activate,
          struct obd_uuid tgt_uuid;
          struct obd_device *tgt_obd;
          struct obd_uuid lov_osc_uuid = { "LOV_OSC_UUID" };
-        struct lustre_handle conn = {0, };
          struct obd_import *imp;
  
  #ifdef __KERNEL__
@@ -162,39 +178,28 @@ int lov_connect_obd(struct obd_device *obd, __u32 index, int activate,
                  ptlrpc_activate_import(imp);
          }
  
+        rc = obd_register_observer(tgt_obd, obd);
+        if (rc) {
+                CERROR("Target %s register_observer error %d\n",
+                       obd_uuid2str(&tgt_uuid), rc);
+                RETURN(rc);
+        }
+
+
          if (imp->imp_invalid) {
                  CERROR("not connecting OSC %s; administratively "
                         "disabled\n", obd_uuid2str(&tgt_uuid));
-                rc = obd_register_observer(tgt_obd, obd);
-                if (rc) {
-                        CERROR("Target %s register_observer error %d; "
-                               "will not be able to reactivate\n",
-                               obd_uuid2str(&tgt_uuid), rc);
-                }
                  RETURN(0);
          }
  
-        rc = obd_connect(NULL, &conn, tgt_obd, &lov_osc_uuid, data, NULL);
-        if (rc) {
+        rc = obd_connect(NULL, &lov->lov_tgts[index]->ltd_exp, tgt_obd,
+                         &lov_osc_uuid, data, NULL);
+        if (rc || !lov->lov_tgts[index]->ltd_exp) {
                  CERROR("Target %s connect error %d\n",
                         obd_uuid2str(&tgt_uuid), rc);
-                RETURN(rc);
-        }
-        lov->lov_tgts[index]->ltd_exp = class_conn2export(&conn);
-        if (!lov->lov_tgts[index]->ltd_exp) {
-                CERROR("Target %s: null export!\n", obd_uuid2str(&tgt_uuid));
                  RETURN(-ENODEV);
          }
  
-        rc = obd_register_observer(tgt_obd, obd);
-        if (rc) {
-                CERROR("Target %s register_observer error %d\n",
-                       obd_uuid2str(&tgt_uuid), rc);
-                obd_disconnect(lov->lov_tgts[index]->ltd_exp);
-                lov->lov_tgts[index]->ltd_exp = NULL;
-                RETURN(rc);
-        }
-
          lov->lov_tgts[index]->ltd_reap = 0;
          if (activate) {
                  lov->lov_tgts[index]->ltd_active = 1;
@@ -207,7 +212,7 @@ int lov_connect_obd(struct obd_device *obd, __u32 index, int activate,
  #ifdef __KERNEL__
          lov_proc_dir = lprocfs_srch(obd->obd_proc_entry, "target_obds");
          if (lov_proc_dir) {
-                struct obd_device *osc_obd = class_conn2obd(&conn);
+                struct obd_device *osc_obd = lov->lov_tgts[index]->ltd_exp->exp_obd;
                  cfs_proc_dir_entry_t *osc_symlink;
                  char name[MAX_STRING_SIZE];
  
@@ -237,21 +242,24 @@ int lov_connect_obd(struct obd_device *obd, __u32 index, int activate,
  }
  
  static int lov_connect(const struct lu_env *env,
-                       struct lustre_handle *conn, struct obd_device *obd,
+                       struct obd_export **exp, struct obd_device *obd,
                         struct obd_uuid *cluuid, struct obd_connect_data *data,
                         void *localdata)
  {
          struct lov_obd *lov = &obd->u.lov;
          struct lov_tgt_desc *tgt;
+        struct lustre_handle conn;
          int i, rc;
          ENTRY;
  
          CDEBUG(D_CONFIG, "connect #%d\n", lov->lov_connects);
  
-        rc = class_connect(conn, obd, cluuid);
+        rc = class_connect(&conn, obd, cluuid);
          if (rc)
                  RETURN(rc);
  
+        *exp = class_conn2export(&conn);
+
          /* Why should there ever be more than 1 connect? */
          lov->lov_connects++;
          LASSERT(lov->lov_connects == 1);
@@ -277,7 +285,7 @@ static int lov_connect(const struct lu_env *env,
                          continue;
  
                  rc = lov_notify(obd, lov->lov_tgts[i]->ltd_exp->exp_obd,
-                                OBD_NOTIFY_ACTIVE, (void *)&i);
+                                OBD_NOTIFY_CONNECT, (void *)&i);
                  if (rc) {
                          CERROR("%s error sending notify %d\n",
                                 obd->obd_name, rc);
@@ -288,26 +296,22 @@ static int lov_connect(const struct lu_env *env,
          RETURN(0);
  }
  
-static int lov_disconnect_obd(struct obd_device *obd, __u32 index)
+static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt)
  {
          cfs_proc_dir_entry_t *lov_proc_dir;
          struct lov_obd *lov = &obd->u.lov;
          struct obd_device *osc_obd;
          int rc;
-
          ENTRY;
  
-        if (lov->lov_tgts[index] == NULL)
-                RETURN(-EINVAL);
-
-        osc_obd = class_exp2obd(lov->lov_tgts[index]->ltd_exp);
+        osc_obd = class_exp2obd(tgt->ltd_exp);
          CDEBUG(D_CONFIG, "%s: disconnecting target %s\n",
                 obd->obd_name, osc_obd->obd_name);
  
-        if (lov->lov_tgts[index]->ltd_active) {
-                lov->lov_tgts[index]->ltd_active = 0;
+        if (tgt->ltd_active) {
+                tgt->ltd_active = 0;
                  lov->desc.ld_active_tgt_count--;
-                lov->lov_tgts[index]->ltd_exp->exp_obd->obd_inactive = 1;
+                tgt->ltd_exp->exp_obd->obd_inactive = 1;
          }
  
          lov_proc_dir = lprocfs_srch(obd->obd_proc_entry, "target_obds");
@@ -336,16 +340,16 @@ static int lov_disconnect_obd(struct obd_device *obd, __u32 index)
  
          obd_register_observer(osc_obd, NULL);
  
-        rc = obd_disconnect(lov->lov_tgts[index]->ltd_exp);
+        rc = obd_disconnect(tgt->ltd_exp);
          if (rc) {
                  CERROR("Target %s disconnect error %d\n",
-                       lov_uuid2str(lov, index), rc);
+                       tgt->ltd_uuid.uuid, rc);
                  rc = 0;
          }
  
-        qos_del_tgt(obd, index);
+        qos_del_tgt(obd, tgt);
  
-        lov->lov_tgts[index]->ltd_exp = NULL;
+        tgt->ltd_exp = NULL;
          RETURN(0);
  }
  
@@ -615,7 +619,7 @@ int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
                  GOTO(out, rc = 0);
  
          rc = lov_notify(obd, tgt->ltd_exp->exp_obd,
-                        active ? OBD_NOTIFY_ACTIVE : OBD_NOTIFY_INACTIVE,
+                        active ? OBD_NOTIFY_CONNECT : OBD_NOTIFY_INACTIVE,
                          (void *)&index);
  
  out:
@@ -671,12 +675,9 @@ out:
          RETURN(rc);
  }
  
-/* We are holding lov_lock */
-static void __lov_del_obd(struct obd_device *obd, __u32 index)
+static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt)
  {
-        struct lov_obd *lov = &obd->u.lov;
          struct obd_device *osc_obd;
-        struct lov_tgt_desc *tgt = lov->lov_tgts[index];
  
          LASSERT(tgt);
          LASSERT(tgt->ltd_reap);
@@ -684,18 +685,12 @@ static void __lov_del_obd(struct obd_device *obd, __u32 index)
          osc_obd = class_exp2obd(tgt->ltd_exp);
  
          CDEBUG(D_CONFIG, "Removing tgt %s : %s\n",
-               lov_uuid2str(lov, index),
+               tgt->ltd_uuid.uuid,
                 osc_obd ? osc_obd->obd_name : "<no obd>");
  
          if (tgt->ltd_exp)
-                lov_disconnect_obd(obd, index);
-
-        /* XXX - right now there is a dependency on ld_tgt_count being the
-         * maximum tgt index for computing the mds_max_easize. So we can't
-         * shrink it. */
+                lov_disconnect_obd(obd, tgt);
  
-        lov_ost_pool_remove(&lov->lov_packed, index);
-        lov->lov_tgts[index] = NULL;
          OBD_FREE_PTR(tgt);
  
          /* Manual cleanup - no cleanup logs to clean up the osc's.  We must
@@ -846,11 +841,12 @@ int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
  static int lov_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
  {
          int rc = 0;
+        struct lov_obd *lov = &obd->u.lov;
+
          ENTRY;
  
          switch (stage) {
          case OBD_CLEANUP_EARLY: {
-                struct lov_obd *lov = &obd->u.lov;
                  int i;
                  for (i = 0; i < lov->desc.ld_tgt_count; i++) {
                          if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_active)
@@ -875,22 +871,19 @@ static int lov_cleanup(struct obd_device *obd)
          struct list_head *pos, *tmp;
          struct pool_desc *pool;
  
-        lprocfs_obd_cleanup(obd);
-
-        /* Delete hash entries and kill hash table before freeing pools
-         * and get to use after free issue. */
-        lustre_hash_exit(lov->lov_pools_hash_body);
-
          list_for_each_safe(pos, tmp, &lov->lov_pool_list) {
                  pool = list_entry(pos, struct pool_desc, pool_list);
                  /* free pool structs */
+                CDEBUG(D_INFO, "delete pool %p\n", pool);
                  lov_pool_del(obd, pool->pool_name);
          }
+        lustre_hash_exit(lov->lov_pools_hash_body);
          lov_ost_pool_free(&(lov->lov_qos.lq_rr.lqr_pool));
          lov_ost_pool_free(&lov->lov_packed);
  
          if (lov->lov_tgts) {
                  int i;
+                lov_getref(obd);
                  for (i = 0; i < lov->desc.ld_tgt_count; i++) {
                          if (!lov->lov_tgts[i])
                                  continue;
@@ -907,11 +900,15 @@ static int lov_cleanup(struct obd_device *obd)
                                         atomic_read(&lov->lov_refcount));
                          lov_del_target(obd, i, 0, 0);
                  }
+                lov_putref(obd);
                  OBD_FREE(lov->lov_tgts, sizeof(*lov->lov_tgts) *
                           lov->lov_tgt_size);
                  lov->lov_tgt_size = 0;
          }
  
+        /* clear pools parent proc entry only after all pools are killed */
+        lprocfs_obd_cleanup(obd);
+
          RETURN(0);
  }
  
diff --git a/lustre/lov/lov_pack.c b/lustre/lov/lov_pack.c

index 12c2d28..7fd5470 100644 (file)
--- a/lustre/lov/lov_pack.c
+++ b/lustre/lov/lov_pack.c
@@ -484,8 +484,7 @@ static int __lov_setstripe(struct obd_export *exp, struct lov_stripe_md **lsmp,
                          rc = lov_check_index_in_pool(lumv3.lmm_stripe_offset,
                                                       pool);
                          if (rc < 0) {
-                                lh_put(lov->lov_pools_hash_body,
-                                       &pool->pool_hash);
+                                lov_pool_putref(pool);
                                  RETURN(-EINVAL);
                          }
                  }
@@ -493,7 +492,7 @@ static int __lov_setstripe(struct obd_export *exp, struct lov_stripe_md **lsmp,
                  if (stripe_count > pool_tgt_count(pool))
                          stripe_count = pool_tgt_count(pool);
  
-                lh_put(lov->lov_pools_hash_body, &pool->pool_hash);
+                lov_pool_putref(pool);
          }
  
          if ((__u64)lumv1->lmm_stripe_size * stripe_count > ~0UL) {
@@ -640,14 +639,21 @@ int lov_getstripe(struct obd_export *exp, struct lov_stripe_md *lsm,
                  CLASSERT(sizeof lum.lmm_objects[0] ==
                           sizeof lmmk->lmm_objects[0]);
  
+                if ((cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) &&
+                    (lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC)))
+                        lustre_swab_lov_mds_md(lmmk);
                  /* User wasn't expecting this many OST entries */
                  if (lum.lmm_stripe_count == 0) {
-                        if (copy_to_user(lump, lmmk, lum_size))
+                        copy_lov_mds2user(&lum, lmmk);
+                        if (copy_to_user(lump, &lum, lum_size))
                                  rc = -EFAULT;
                  } else if (lum.lmm_stripe_count < lmmk->lmm_stripe_count) {
                          rc = -EOVERFLOW;
-                } else if (copy_to_user(lump, lmmk, lmm_size))
-                        rc = -EFAULT;
+                } else {
+                        copy_lov_mds2user(&lum, lmmk);
+                        if (copy_to_user(lump, &lum, lmm_size))
+                                rc = -EFAULT;
+                }
  
                  obd_free_diskmd(exp, &lmmk);
          }
diff --git a/lustre/lov/lov_pool.c b/lustre/lov/lov_pool.c

index 764a494..3df37a7 100644 (file)
--- a/lustre/lov/lov_pool.c
+++ b/lustre/lov/lov_pool.c
@@ -38,6 +38,8 @@
   * OST pool methods
   *
   * Author: Jacques-Charles LAFOUCRIERE <jc.lafoucriere@cea.fr>
+ * Author: Alex Lyashkov <Alexey.Lyashkov@Sun.COM>
+ * Author: Nathaniel Rutman <Nathan.Rutman@Sun.COM>
   */
  
  #define DEBUG_SUBSYSTEM S_LOV
@@ -51,15 +53,23 @@
  #include <obd.h>
  #include "lov_internal.h"
  
-static void lov_pool_getref(struct pool_desc *pool) {
+static void lov_pool_getref(struct pool_desc *pool)
+{
+        CDEBUG(D_INFO, "pool %p\n", pool);
          atomic_inc(&pool->pool_refcount);
  }
  
-static void lov_pool_putref(struct pool_desc *pool) {
+void lov_pool_putref(struct pool_desc *pool) 
+{
+        CDEBUG(D_INFO, "pool %p\n", pool);
          if (atomic_dec_and_test(&pool->pool_refcount)) {
+                LASSERT(hlist_unhashed(&pool->pool_hash));
+                LASSERT(list_empty(&pool->pool_list));
+                LASSERT(pool->pool_proc_entry == NULL);
                  lov_ost_pool_free(&(pool->pool_rr.lqr_pool));
                  lov_ost_pool_free(&(pool->pool_obds));
                  OBD_FREE_PTR(pool);
+                EXIT;
          }
  }
  
@@ -302,6 +312,8 @@ void lov_dump_pool(int level, struct pool_desc *pool)
  #define LOV_POOL_INIT_COUNT 2
  int lov_ost_pool_init(struct ost_pool *op, unsigned int count)
  {
+        ENTRY;
+
          if (count == 0)
                  count = LOV_POOL_INIT_COUNT;
          op->op_array = NULL;
@@ -311,8 +323,9 @@ int lov_ost_pool_init(struct ost_pool *op, unsigned int count)
          OBD_ALLOC(op->op_array, op->op_size * sizeof(op->op_array[0]));
          if (op->op_array == NULL) {
                  op->op_size = 0;
-                return -ENOMEM;
+                RETURN(-ENOMEM);
          }
+        EXIT;
          return 0;
  }
  
@@ -359,6 +372,7 @@ int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count)
          /* ost not found we add it */
          op->op_array[op->op_count] = idx;
          op->op_count++;
+        EXIT;
  out:
          up_write(&op->op_rw_sem);
          return rc;
@@ -367,6 +381,7 @@ out:
  int lov_ost_pool_remove(struct ost_pool *op, __u32 idx)
  {
          int i;
+        ENTRY;
  
          down_write(&op->op_rw_sem);
  
@@ -376,18 +391,21 @@ int lov_ost_pool_remove(struct ost_pool *op, __u32 idx)
                                  (op->op_count - i - 1) * sizeof(op->op_array[0]));
                          op->op_count--;
                          up_write(&op->op_rw_sem);
+                        EXIT;
                          return 0;
                  }
          }
  
          up_write(&op->op_rw_sem);
-        return -EINVAL;
+        RETURN(-EINVAL);
  }
  
  int lov_ost_pool_free(struct ost_pool *op)
  {
+        ENTRY;
+
          if (op->op_size == 0)
-                return 0;
+                RETURN(0);
  
          down_write(&op->op_rw_sem);
  
@@ -397,7 +415,7 @@ int lov_ost_pool_free(struct ost_pool *op)
          op->op_size = 0;
  
          up_write(&op->op_rw_sem);
-        return 0;
+        RETURN(0);
  }
  
  
@@ -430,48 +448,54 @@ int lov_pool_new(struct obd_device *obd, char *poolname)
  
          memset(&(new_pool->pool_rr), 0, sizeof(struct lov_qos_rr));
          rc = lov_ost_pool_init(&new_pool->pool_rr.lqr_pool, 0);
-        if (rc) {
-                lov_ost_pool_free(&new_pool->pool_obds);
-                GOTO(out_err, rc);
-        }
+        if (rc)
+                GOTO(out_free_pool_obds, rc);
  
          INIT_HLIST_NODE(&new_pool->pool_hash);
-        rc = lustre_hash_add_unique(lov->lov_pools_hash_body, poolname,
-                                    &new_pool->pool_hash);
-        if (rc) {
-                lov_ost_pool_free(&new_pool->pool_rr.lqr_pool);
-                lov_ost_pool_free(&new_pool->pool_obds);
-                GOTO(out_err, rc = -EEXIST);
-        }
-
-        spin_lock(&obd->obd_dev_lock);
-        list_add_tail(&new_pool->pool_list, &lov->lov_pool_list);
-        lov->lov_pool_count++;
-
-        spin_unlock(&obd->obd_dev_lock);
-
-        CDEBUG(D_CONFIG, LOV_POOLNAMEF" is pool #%d\n",
-               poolname, lov->lov_pool_count);
  
  #ifdef LPROCFS
-        /* ifdef needed for liblustre */
+        /* ifdef needed: seq_file is not implemented for liblustre */
          /* get ref for /proc file */
          lov_pool_getref(new_pool);
          new_pool->pool_proc_entry = lprocfs_add_simple(lov->lov_pool_proc_entry,
                                                         poolname, NULL, NULL,
                                                         new_pool,
                                                         &pool_proc_operations);
-#endif
-
          if (IS_ERR(new_pool->pool_proc_entry)) {
                  CWARN("Cannot add proc pool entry "LOV_POOLNAMEF"\n", poolname);
                  new_pool->pool_proc_entry = NULL;
                  lov_pool_putref(new_pool);
          }
+        CDEBUG(D_INFO, "pool %p - proc %p\n", new_pool, new_pool->pool_proc_entry);
+#endif
+
+        spin_lock(&obd->obd_dev_lock);
+        list_add_tail(&new_pool->pool_list, &lov->lov_pool_list);
+        lov->lov_pool_count++;
+        spin_unlock(&obd->obd_dev_lock);
+
+        /* make the pool findable only once it is fully ready */
+        rc = lustre_hash_add_unique(lov->lov_pools_hash_body, poolname,
+                                    &new_pool->pool_hash);
+        if (rc)
+                GOTO(out_err, rc = -EEXIST);
+
+        CDEBUG(D_CONFIG, LOV_POOLNAMEF" is pool #%d\n",
+               poolname, lov->lov_pool_count);
  
          RETURN(0);
  
  out_err:
+        spin_lock(&obd->obd_dev_lock);
+        list_del_init(&new_pool->pool_list);
+        lov->lov_pool_count--;
+        spin_unlock(&obd->obd_dev_lock);
+
+        lprocfs_remove(&new_pool->pool_proc_entry);
+
+        lov_ost_pool_free(&new_pool->pool_rr.lqr_pool);
+out_free_pool_obds:
+        lov_ost_pool_free(&new_pool->pool_obds);
          OBD_FREE_PTR(new_pool);
          return rc;
  }
@@ -484,33 +508,23 @@ int lov_pool_del(struct obd_device *obd, char *poolname)
  
          lov = &(obd->u.lov);
  
-        spin_lock(&obd->obd_dev_lock);
-
-        pool = lustre_hash_lookup(lov->lov_pools_hash_body, poolname);
-        if (pool == NULL) {
-                spin_unlock(&obd->obd_dev_lock);
+        /* lookup and kill hash reference */
+        pool = lustre_hash_del_key(lov->lov_pools_hash_body, poolname);
+        if (pool == NULL)
                  RETURN(-ENOENT);
-        }
  
-#ifdef LPROCFS
          if (pool->pool_proc_entry != NULL) {
-                remove_proc_entry(pool->pool_proc_entry->name,
-                                  pool->pool_proc_entry->parent);
-                /* remove ref for /proc file */
+                CDEBUG(D_INFO, "proc entry %p\n", pool->pool_proc_entry);
+                lprocfs_remove(&pool->pool_proc_entry);
                  lov_pool_putref(pool);
          }
-#endif
  
-        lustre_hash_del_key(lov->lov_pools_hash_body, poolname);
+        spin_lock(&obd->obd_dev_lock);
          list_del_init(&pool->pool_list);
-
          lov->lov_pool_count--;
-        lh_put(lov->lov_pools_hash_body, &pool->pool_hash);
          spin_unlock(&obd->obd_dev_lock);
  
-        /* remove ref got when pool was created in memory
-         * pool will be freed when refount will reach 0
-         */
+        /* release last reference */
          lov_pool_putref(pool);
  
          RETURN(0);
@@ -522,7 +536,7 @@ int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname)
          struct obd_uuid ost_uuid;
          struct lov_obd *lov;
          struct pool_desc *pool;
-        unsigned int i, lov_idx;
+        unsigned int lov_idx;
          int rc;
          ENTRY;
  
@@ -536,22 +550,17 @@ int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname)
  
  
          /* search ost in lov array */
-        mutex_down(&lov->lov_lock);
-        for (i = 0; i < lov->desc.ld_tgt_count; i++) {
-                if (!lov->lov_tgts[i])
+        lov_getref(obd);
+        for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) {
+                if (!lov->lov_tgts[lov_idx])
                          continue;
-                if (obd_uuid_equals(&ost_uuid, &(lov->lov_tgts[i]->ltd_uuid)))
+                if (obd_uuid_equals(&ost_uuid,
+                                    &(lov->lov_tgts[lov_idx]->ltd_uuid)))
                          break;
          }
-
          /* test if ost found in lov */
-        if (i == lov->desc.ld_tgt_count) {
-                mutex_up(&lov->lov_lock);
+        if (lov_idx == lov->desc.ld_tgt_count)
                  GOTO(out, rc = -EINVAL);
-        }
-        mutex_up(&lov->lov_lock);
-
-        lov_idx = i;
  
          rc = lov_ost_pool_add(&pool->pool_obds, lov_idx, lov->lov_tgt_size);
          if (rc)
@@ -564,7 +573,8 @@ int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname)
  
          EXIT;
  out:
-        lh_put(lov->lov_pools_hash_body, &pool->pool_hash);
+        lov_putref(obd);
+        lov_pool_putref(pool);
          return rc;
  }
  
@@ -573,39 +583,32 @@ int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname)
          struct obd_uuid ost_uuid;
          struct lov_obd *lov;
          struct pool_desc *pool;
-        unsigned int i, lov_idx;
+        unsigned int lov_idx;
          int rc = 0;
          ENTRY;
  
          lov = &(obd->u.lov);
  
-        spin_lock(&obd->obd_dev_lock);
          pool = lustre_hash_lookup(lov->lov_pools_hash_body, poolname);
-        if (pool == NULL) {
-                spin_unlock(&obd->obd_dev_lock);
+        if (pool == NULL)
                  RETURN(-ENOENT);
-        }
  
          obd_str2uuid(&ost_uuid, ostname);
  
+        lov_getref(obd);
          /* search ost in lov array, to get index */
-        for (i = 0; i < lov->desc.ld_tgt_count; i++) {
-                if (!lov->lov_tgts[i])
+        for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) {
+                if (!lov->lov_tgts[lov_idx])
                          continue;
  
-                if (obd_uuid_equals(&ost_uuid, &(lov->lov_tgts[i]->ltd_uuid)))
+                if (obd_uuid_equals(&ost_uuid,
+                                    &(lov->lov_tgts[lov_idx]->ltd_uuid)))
                          break;
          }
  
          /* test if ost found in lov */
-        if (i == lov->desc.ld_tgt_count) {
-                spin_unlock(&obd->obd_dev_lock);
+        if (lov_idx == lov->desc.ld_tgt_count)
                  GOTO(out, rc = -EINVAL);
-        }
-
-        spin_unlock(&obd->obd_dev_lock);
-
-        lov_idx = i;
  
          lov_ost_pool_remove(&pool->pool_obds, lov_idx);
  
@@ -616,7 +619,8 @@ int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname)
  
          EXIT;
  out:
-        lh_put(lov->lov_pools_hash_body, &pool->pool_hash);
+        lov_putref(obd);
+        lov_pool_putref(pool);
          return rc;
  }
  
@@ -660,7 +664,7 @@ struct pool_desc *lov_find_pool(struct lov_obd *lov, char *poolname)
                          CWARN("Request for an empty pool ("LOV_POOLNAMEF")\n",
                                 poolname);
                          /* pool is ignored, so we remove ref on it */
-                        lh_put(lov->lov_pools_hash_body, &pool->pool_hash);
+                        lov_pool_putref(pool);
                          pool = NULL;
                  }
          }
diff --git a/lustre/lov/lov_qos.c b/lustre/lov/lov_qos.c

index 45245ed..94539b3 100644 (file)
--- a/lustre/lov/lov_qos.c
+++ b/lustre/lov/lov_qos.c
@@ -121,19 +121,16 @@ out:
          RETURN(rc);
  }
  
-int qos_del_tgt(struct obd_device *obd, __u32 index)
+int qos_del_tgt(struct obd_device *obd, struct lov_tgt_desc *tgt)
  {
          struct lov_obd *lov = &obd->u.lov;
          struct lov_qos_oss *oss;
          int rc = 0;
          ENTRY;
  
-        if (!lov->lov_tgts[index])
-                RETURN(0);
-
          down_write(&lov->lov_qos.lq_rw_sem);
  
-        oss = lov->lov_tgts[index]->ltd_qos.ltq_oss;
+        oss = tgt->ltd_qos.ltq_oss;
          if (!oss)
                  GOTO(out, rc = -ENOENT);
  
@@ -640,7 +637,7 @@ out:
          if (pool != NULL) {
                  up_read(&pool_tgt_rw_sem(pool));
                  /* put back ref got by lov_find_pool() */
-                lh_put(lov->lov_pools_hash_body, &pool->pool_hash);
+                lov_pool_putref(pool);
          }
  
          RETURN(rc);
@@ -732,7 +729,7 @@ out:
          if (pool != NULL) {
                  up_read(&pool_tgt_rw_sem(pool));
                  /* put back ref got by lov_find_pool() */
-                lh_put(lov->lov_pools_hash_body, &pool->pool_hash);
+                lov_pool_putref(pool);
          }
  
          RETURN(rc);
@@ -927,7 +924,7 @@ out_nolock:
          if (pool != NULL) {
                  up_read(&pool_tgt_rw_sem(pool));
                  /* put back ref got by lov_find_pool() */
-                lh_put(lov->lov_pools_hash_body, &pool->pool_hash);
+                lov_pool_putref(pool);
          }
  
          if (rc == -EAGAIN)
diff --git a/lustre/lvfs/Makefile.in b/lustre/lvfs/Makefile.in

index 4b8773b..80687ea 100644 (file)
--- a/lustre/lvfs/Makefile.in
+++ b/lustre/lvfs/Makefile.in
@@ -12,6 +12,11 @@ fsfilt_@BACKINGFS@-objs := fsfilt-@BACKINGFS@.o
  $(obj)/fsfilt-%.c: $(obj)/fsfilt_%.c
         ln -s $< $@
  
+EXTRA_DIST = $(lvfs-objs:.o=.c)  $(quotafmt-objs:.o=.c) \
+       fsfilt_ext3.c fsfilt_reiserfs.c \
+       lvfs_internal.h lvfs_userfs.c \
+       lustre_quota_fmt.c lustre_quota_fmt.h quotafmt_test.c
+
  # for <ext3/xattr.h> on 2.6
  EXTRA_PRE_CFLAGS := -I@LINUX@/fs -I@LDISKFS_DIR@ -I@LDISKFS_DIR@/ldiskfs
  
diff --git a/lustre/lvfs/autoMakefile.am b/lustre/lvfs/autoMakefile.am

index b80a28d..a7122cc 100644 (file)
--- a/lustre/lvfs/autoMakefile.am
+++ b/lustre/lvfs/autoMakefile.am
@@ -101,10 +101,5 @@ endif # MODULES
  
  install-data-hook: $(install_data_hook)
  
-DIST_SOURCES = fsfilt.c fsfilt_ext3.c fsfilt_reiserfs.c lvfs_common.c \
-       lvfs_internal.h lvfs_linux.c lvfs_userfs.c \
-       upcall_cache.c prng.c lvfs_lib.c \
-       lustre_quota_fmt.c lustre_quota_fmt.h quotafmt_test.c
-
  MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
  CLEANFILES = fsfilt-*.c fsfilt_ldiskfs*.c fsfilt_extN.c sources
diff --git a/lustre/lvfs/lvfs_linux.c b/lustre/lvfs/lvfs_linux.c

index 5d07875..88b4334 100644 (file)
--- a/lustre/lvfs/lvfs_linux.c
+++ b/lustre/lvfs/lvfs_linux.c
@@ -421,6 +421,58 @@ long l_readdir(struct file *file, struct list_head *dentry_list)
  }
  EXPORT_SYMBOL(l_readdir);
  
+/*
+ * Apply the attribute changes in @newattrs to @dchild through
+ * notify_change(), bracketed by LOCK_INODE_MUTEX()/UNLOCK_INODE_MUTEX()
+ * on the dentry's inode.  @mnt is only forwarded to notify_change() when
+ * HAVE_SECURITY_PLUG is defined.  Returns what notify_change() returns.
+ */
+int l_notify_change(struct vfsmount *mnt, struct dentry *dchild,
+                 struct iattr *newattrs)
+{
+        struct inode *inode = dchild->d_inode;
+        int result;
+
+        LOCK_INODE_MUTEX(inode);
+#ifdef HAVE_SECURITY_PLUG
+        result = notify_change(dchild, mnt, newattrs);
+#else
+        result = notify_change(dchild, newattrs);
+#endif
+        UNLOCK_INODE_MUTEX(inode);
+
+        return result;
+}
+EXPORT_SYMBOL(l_notify_change);
+
+/*
+ * Utility to truncate the file called @name inside directory @dir to
+ * @length bytes.
+ *
+ * The name is resolved with ll_lookup_one_len().  A failed lookup returns
+ * its error code, a dentry without an inode yields 0, a directory yields
+ * -EISDIR, and otherwise the result of the ATTR_SIZE change made through
+ * l_notify_change() is returned.
+ */
+int simple_truncate(struct dentry *dir, struct vfsmount *mnt,
+                 char *name, loff_t length)
+{
+        struct dentry *dentry;
+        struct inode *inode;
+        struct iattr attr;
+        int namelen = strlen(name);
+        int rc = 0;
+        ENTRY;
+
+        CDEBUG(D_INODE, "truncating file %.*s to %lld\n", namelen, name,
+               (long long)length);
+
+        dentry = ll_lookup_one_len(name, dir, namelen);
+        if (IS_ERR(dentry))
+                GOTO(out, rc = PTR_ERR(dentry));
+
+        inode = dentry->d_inode;
+        if (inode != NULL) {
+                int mode = inode->i_mode;
+
+                if (S_ISDIR(mode)) {
+                        CERROR("found %s (%lu/%u) is mode %o\n", name,
+                               inode->i_ino, inode->i_generation, mode);
+                        GOTO(out_dput, rc = -EISDIR);
+                }
+
+                /* only the size is changed; ia_valid says so */
+                attr.ia_valid = ATTR_SIZE;
+                attr.ia_size = length;
+                rc = l_notify_change(mnt, dentry, &attr);
+        }
+        EXIT;
+out_dput:
+        dput(dentry);
+out:
+        return rc;
+}
+EXPORT_SYMBOL(simple_truncate);
+
  #ifdef LUSTRE_KERNEL_VERSION
  #ifndef HAVE_CLEAR_RDONLY_ON_PUT
  #error rdonly patchset must be updated [cfs bz11248]
diff --git a/lustre/mdc/Makefile.in b/lustre/mdc/Makefile.in

index b9b9793..f007298 100644 (file)
--- a/lustre/mdc/Makefile.in
+++ b/lustre/mdc/Makefile.in
@@ -1,4 +1,6 @@
  MODULES := mdc
  mdc-objs := mdc_request.o mdc_reint.o lproc_mdc.o mdc_lib.o mdc_locks.o
  
+EXTRA_DIST = $(mdc-objs:.o=.c) mdc_internal.h
+
  @INCLUDE_RULES@
diff --git a/lustre/mdc/autoMakefile.am b/lustre/mdc/autoMakefile.am

index 65be657..ace974d 100644 (file)
--- a/lustre/mdc/autoMakefile.am
+++ b/lustre/mdc/autoMakefile.am
@@ -45,5 +45,4 @@ if MODULES
  modulefs_DATA = mdc$(KMODEXT)
  endif
  
-DIST_SOURCES = $(mdc-objs:.o=.c) mdc_internal.h
  MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c

index 48c79ee..a5698b5 100644 (file)
--- a/lustre/mdc/mdc_request.c
+++ b/lustre/mdc/mdc_request.c
@@ -961,7 +961,10 @@ int mdc_sendpage(struct obd_export *exp, const struct lu_fid *fid,
  
          ptlrpc_request_set_replen(req);
          rc = ptlrpc_queue_wait(req);
-        GOTO(out, rc);
+        if (rc)
+                GOTO(out, rc);
+
+        rc = sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk);
  out:
          ptlrpc_req_finished(req);
          return rc;
@@ -1011,6 +1014,13 @@ int mdc_readpage(struct obd_export *exp, const struct lu_fid *fid,
                  RETURN(rc);
          }
  
+        rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk,
+                                          req->rq_bulk->bd_nob_transferred);
+        if (rc < 0) {
+                ptlrpc_req_finished(req);
+                RETURN(rc);
+        }
+
          if (req->rq_bulk->bd_nob_transferred != CFS_PAGE_SIZE) {
                  CERROR("Unexpected # bytes transferred: %d (%ld expected)\n",
                          req->rq_bulk->bd_nob_transferred, CFS_PAGE_SIZE);
@@ -1789,7 +1799,7 @@ static int mdc_renew_capa(struct obd_export *exp, struct obd_capa *oc,
  }
  
  static int mdc_connect(const struct lu_env *env,
-                       struct lustre_handle *dlm_handle,
+                       struct obd_export **exp,
                         struct obd_device *obd, struct obd_uuid *cluuid,
                         struct obd_connect_data *data,
                         void *localdata)
@@ -1806,7 +1816,7 @@ static int mdc_connect(const struct lu_env *env,
                         obd->obd_name);
          }
  
-        return client_connect_import(env, dlm_handle, obd, cluuid, data, NULL);
+        return client_connect_import(env, exp, obd, cluuid, data, NULL);
  }
  
  struct obd_ops mdc_obd_ops = {
diff --git a/lustre/mdd/mdd_device.c b/lustre/mdd/mdd_device.c

index e9bfcfc..07924a0 100644 (file)
--- a/lustre/mdd/mdd_device.c
+++ b/lustre/mdd/mdd_device.c
@@ -60,12 +60,16 @@
  #include <lustre/lustre_idl.h>
  #include <lustre_disk.h>      /* for changelogs */
  #include <lustre_param.h>
+#include <lustre_fid.h>
  
  #include "mdd_internal.h"
  
  const struct md_device_operations mdd_ops;
+static struct lu_device_type mdd_device_type;
  
  static const char mdd_root_dir_name[] = "ROOT";
+static const char mdd_obf_dir_name[] = "fid";
+static const char mdd_dot_lustre_name[] = ".lustre";
  
  static int mdd_device_init(const struct lu_env *env, struct lu_device *d,
                             const char *name, struct lu_device *next)
@@ -112,6 +116,8 @@ static void mdd_device_shutdown(const struct lu_env *env,
          ENTRY;
          mdd_changelog_fini(env, m);
          dt_txn_callback_del(m->mdd_child, &m->mdd_txn_cb);
+        mdd_object_put(env, m->mdd_dot_lustre_objs.mdd_obf);
+        mdd_object_put(env, m->mdd_dot_lustre);
          if (m->mdd_obd_dev)
                  mdd_fini_obd(env, m, cfg);
          orph_index_fini(env, m);
@@ -300,6 +306,369 @@ int mdd_changelog_write_header(struct mdd_device *mdd, int markerflags)
          RETURN(rc);
  }
  
+/**
+ * Create ".lustre" directory.
+ *
+ * Copies LU_DOT_LUSTRE_FID into the per-thread mti_fid and asks
+ * llo_store_create_index() for a directory index named ".lustre" under
+ * "ROOT".  An -EEXIST result is not treated as a failure.
+ *
+ * Returns 0 on success (or if the directory already exists), otherwise the
+ * negative errno reported by the create call.
+ *
+ * NOTE(review): on success the md_object returned by
+ * llo_store_create_index() is not released here -- confirm whether that
+ * reference is kept on purpose (see mdd_dot_lustre_setup()).
+ */
+static int create_dot_lustre_dir(const struct lu_env *env, struct mdd_device *m)
+{
+        struct lu_fid *fid = &mdd_env_info(env)->mti_fid;
+        struct md_object *mdo;
+        int rc;
+        ENTRY;          /* pairs with the RETURN()s below */
+
+        memcpy(fid, &LU_DOT_LUSTRE_FID, sizeof(struct lu_fid));
+        mdo = llo_store_create_index(env, &m->mdd_md_dev, m->mdd_child,
+                                     mdd_root_dir_name, mdd_dot_lustre_name,
+                                     fid, &dt_directory_features);
+        /* .lustre dir may be already present */
+        if (IS_ERR(mdo) && PTR_ERR(mdo) != -EEXIST) {
+                rc = PTR_ERR(mdo);
+                CERROR("creating obj [%s] fid = "DFID" rc = %d\n",
+                        mdd_dot_lustre_name, PFID(fid), rc);
+                RETURN(rc);
+        }
+
+        RETURN(0);
+}
+
+/*
+ * moo_attr_get for ".lustre": hand the request straight to
+ * mdd_attr_get_internal_locked() on the underlying mdd object and return
+ * its result.
+ */
+static int dot_lustre_attr_get(const struct lu_env *env, struct md_object *obj,
+                               struct md_attr *ma)
+{
+        return mdd_attr_get_internal_locked(env, md2mdd_obj(obj), ma);
+}
+
+static int dot_lustre_attr_set(const struct lu_env *env, struct md_object *obj,
+                               const struct md_attr *ma)
+{
+        return -EPERM; /* attributes of ".lustre" are never changed */
+}
+
+static int dot_lustre_xattr_get(const struct lu_env *env,
+                                struct md_object *obj, struct lu_buf *buf,
+                                const char *name)
+{
+        return 0; /* always 0; buf is never written, whatever the name */
+}
+
+/**
+ * Direct access to the ".lustre" directory is not allowed.
+ * Every open attempt is refused with -EPERM, whatever the flags.
+ */
+static int dot_lustre_mdd_open(const struct lu_env *env, struct md_object *obj,
+                               int flags)
+{
+        return -EPERM;
+}
+
+static int dot_lustre_path(const struct lu_env *env, struct md_object *obj,
+                           char *path, int pathlen, __u64 recno, int *linkno)
+{
+        return -ENOSYS; /* not implemented; path and linkno are untouched */
+}
+
+static struct md_object_operations mdd_dot_lustre_obj_ops = {
+        .moo_attr_get   = dot_lustre_attr_get,  /* forwards to mdd layer */
+        .moo_attr_set   = dot_lustre_attr_set,  /* always -EPERM */
+        .moo_xattr_get  = dot_lustre_xattr_get, /* always 0 */
+        .moo_open       = dot_lustre_mdd_open,  /* always -EPERM */
+        .moo_path       = dot_lustre_path       /* always -ENOSYS */
+};
+
+/*
+ * Name lookup inside ".lustre".  The only entry known here is the "fid"
+ * directory (mdd_obf_dir_name), which resolves to LU_OBF_FID; any other
+ * name gives -ENOENT and leaves *f untouched.
+ */
+static int dot_lustre_lookup(const struct lu_env *env, struct md_object *p,
+                             const struct lu_name *lname, struct lu_fid *f,
+                             struct md_op_spec *spec)
+{
+        if (strcmp(lname->ln_name, mdd_obf_dir_name) != 0)
+                return -ENOENT;
+
+        *f = LU_OBF_FID;
+        return 0;
+}
+
+static int dot_lustre_create(const struct lu_env *env, struct md_object *pobj,
+                             const struct lu_name *lname,
+                             struct md_object *child, struct md_op_spec *spec,
+                             struct md_attr* ma)
+{
+        return -EPERM; /* creating entries in ".lustre" is refused */
+}
+
+static int dot_lustre_rename(const struct lu_env *env,
+                             struct md_object *src_pobj,
+                             struct md_object *tgt_pobj,
+                             const struct lu_fid *lf,
+                             const struct lu_name *lsname,
+                             struct md_object *tobj,
+                             const struct lu_name *ltname, struct md_attr *ma)
+{
+        return -EPERM; /* renames through ".lustre" are refused */
+}
+
+static int dot_lustre_link(const struct lu_env *env, struct md_object *tgt_obj,
+                           struct md_object *src_obj,
+                           const struct lu_name *lname, struct md_attr *ma)
+{
+        return -EPERM; /* hard links through ".lustre" are refused */
+}
+
+static int dot_lustre_unlink(const struct lu_env *env, struct md_object *pobj,
+                             struct md_object *cobj, const struct lu_name *lname,
+                             struct md_attr *ma)
+{
+        return -EPERM; /* removing entries of ".lustre" is refused */
+}
+
+static struct md_dir_operations mdd_dot_lustre_dir_ops = {
+        .mdo_lookup = dot_lustre_lookup, /* resolves only the "fid" entry */
+        .mdo_create = dot_lustre_create, /* always -EPERM */
+        .mdo_rename = dot_lustre_rename, /* always -EPERM */
+        .mdo_link   = dot_lustre_link,   /* always -EPERM */
+        .mdo_unlink = dot_lustre_unlink, /* always -EPERM */
+};
+
+/*
+ * moo_attr_get for the virtual "fid" directory.
+ *
+ * MA_INODE requests are answered with the attributes of ".lustre".  For
+ * MA_LOV, and only when MA_LOV_DEF is requested as well, the result of
+ * mdd_get_default_md() is reported.  Returns 0, or the error of whichever
+ * helper failed.
+ */
+static int obf_attr_get(const struct lu_env *env, struct md_object *obj,
+                        struct md_attr *ma)
+{
+        struct mdd_object *mdd_obj;
+        int rc;
+
+        if (ma->ma_need & MA_INODE) {
+                struct mdd_device *mdd = mdo2mdd(obj);
+
+                /* "fid" is a virtual object and hence does not have any "real"
+                 * attributes. So we reuse attributes of .lustre for "fid" dir */
+                rc = dot_lustre_attr_get(env,
+                                         &mdd->mdd_dot_lustre->mod_obj, ma);
+                if (rc)
+                        return rc;
+                ma->ma_valid |= MA_INODE;
+        }
+
+        /* "fid" directory does not have any striping information. */
+        if (!(ma->ma_need & MA_LOV))
+                return 0;
+
+        /* striping already filled in */
+        if (ma->ma_valid & MA_LOV)
+                return 0;
+
+        mdd_obj = md2mdd_obj(obj);
+        if (!S_ISREG(mdd_object_type(mdd_obj)) &&
+            !S_ISDIR(mdd_object_type(mdd_obj)))
+                return 0;
+
+        if (!(ma->ma_need & MA_LOV_DEF))
+                return 0;
+
+        rc = mdd_get_default_md(mdd_obj, ma->ma_lmm, &ma->ma_lmm_size);
+        if (rc > 0) {
+                /* a positive result means a default was produced */
+                ma->ma_valid |= MA_LOV;
+                rc = 0;
+        }
+
+        return rc;
+}
+
+static int obf_attr_set(const struct lu_env *env, struct md_object *obj,
+                        const struct md_attr *ma)
+{
+        return -EPERM; /* attributes of the "fid" dir are never changed */
+}
+
+static int obf_xattr_get(const struct lu_env *env,
+                         struct md_object *obj, struct lu_buf *buf,
+                         const char *name)
+{
+        return 0; /* always 0; buf is never written, whatever the name */
+}
+
+/*
+ * moo_open for the "fid" directory: bump the object's mod_count while
+ * holding its write lock.  @flags is not looked at; always returns 0.
+ */
+static int obf_mdd_open(const struct lu_env *env, struct md_object *obj,
+                        int flags)
+{
+        struct mdd_object *obf = md2mdd_obj(obj);
+
+        mdd_write_lock(env, obf, MOR_TGT_CHILD);
+        ++obf->mod_count;
+        mdd_write_unlock(env, obf);
+
+        return 0;
+}
+
+/*
+ * moo_close for the "fid" directory: drop the object's mod_count by one
+ * while holding its write lock.  @ma is not touched; always returns 0.
+ */
+static int obf_mdd_close(const struct lu_env *env, struct md_object *obj,
+                         struct md_attr *ma)
+{
+        struct mdd_object *obf = md2mdd_obj(obj);
+
+        mdd_write_lock(env, obf, MOR_TGT_CHILD);
+        --obf->mod_count;
+        mdd_write_unlock(env, obf);
+
+        return 0;
+}
+
+/** Nothing to list in "fid" directory */
+static int obf_mdd_readpage(const struct lu_env *env, struct md_object *obj,
+                            const struct lu_rdpg *rdpg)
+{
+        return -EPERM; /* reading the directory is refused outright */
+}
+
+static int obf_path(const struct lu_env *env, struct md_object *obj,
+                    char *path, int pathlen, __u64 recno, int *linkno)
+{
+        return -ENOSYS; /* not implemented; path and linkno are untouched */
+}
+
+static struct md_object_operations mdd_obf_obj_ops = {
+        .moo_attr_get   = obf_attr_get,     /* reuses .lustre attributes */
+        .moo_attr_set   = obf_attr_set,     /* always -EPERM */
+        .moo_xattr_get  = obf_xattr_get,    /* always 0 */
+        .moo_open       = obf_mdd_open,     /* mod_count++ */
+        .moo_close      = obf_mdd_close,    /* mod_count-- */
+        .moo_readpage   = obf_mdd_readpage, /* always -EPERM */
+        .moo_path       = obf_path          /* always -ENOSYS */
+};
+
+/**
+ * Lookup method for "fid" object. Only filenames with correct SEQ:OID format
+ * are valid. We also check if object with passed fid exists or not.
+ *
+ * Leading '[' characters of the name are skipped before it is parsed with
+ * the SFID format into \a f.  \a f is cleared first, and at least the
+ * sequence and object id must parse; a version that is not present stays 0.
+ *
+ * Returns 0 if the name parsed to a sane FID whose object exists,
+ * -EINVAL for a malformed name or an insane FID, -ENOENT if the object
+ * does not exist, or the error reported by mdd_object_find().
+ */
+static int obf_lookup(const struct lu_env *env, struct md_object *p,
+                      const struct lu_name *lname, struct lu_fid *f,
+                      struct md_op_spec *spec)
+{
+        const char *name = lname->ln_name;
+        struct mdd_device *mdd = mdo2mdd(p);
+        struct mdd_object *child;
+        int nfields;
+        int rc = 0;
+
+        while (*name == '[')
+                name++;
+
+        /* Start from a clean FID: fields that sscanf() does not reach must
+         * not keep whatever the caller happened to have in *f, or stale
+         * data could slip through the sanity check below. */
+        memset(f, 0, sizeof(*f));
+        nfields = sscanf(name, SFID, &(f->f_seq), &(f->f_oid),
+                         &(f->f_ver));
+        if (nfields < 2 || !fid_is_sane(f)) {
+                CWARN("bad FID format [%s], should be "DFID"\n", lname->ln_name,
+                      (__u64)1, 2, 0);
+                GOTO(out, rc = -EINVAL);
+        }
+
+        /* Check if object with this fid exists */
+        child = mdd_object_find(env, mdd, f);
+        /* NOTE(review): a NULL result is reported as success without any
+         * existence check -- confirm this is intended. */
+        if (child == NULL)
+                GOTO(out, rc = 0);
+        if (IS_ERR(child))
+                GOTO(out, rc = PTR_ERR(child));
+
+        if (mdd_object_exists(child) == 0)
+                rc = -ENOENT;
+
+        mdd_object_put(env, child);
+
+out:
+        return rc;
+}
+
+static int obf_create(const struct lu_env *env, struct md_object *pobj,
+                      const struct lu_name *lname, struct md_object *child,
+                      struct md_op_spec *spec, struct md_attr* ma)
+{
+        return -EPERM; /* creating entries in the "fid" dir is refused */
+}
+
+static int obf_rename(const struct lu_env *env,
+                      struct md_object *src_pobj, struct md_object *tgt_pobj,
+                      const struct lu_fid *lf, const struct lu_name *lsname,
+                      struct md_object *tobj, const struct lu_name *ltname,
+                      struct md_attr *ma)
+{
+        return -EPERM; /* renames through the "fid" dir are refused */
+}
+
+static int obf_link(const struct lu_env *env, struct md_object *tgt_obj,
+                    struct md_object *src_obj, const struct lu_name *lname,
+                    struct md_attr *ma)
+{
+        return -EPERM; /* hard links through the "fid" dir are refused */
+}
+
+static int obf_unlink(const struct lu_env *env, struct md_object *pobj,
+                      struct md_object *cobj, const struct lu_name *lname,
+                      struct md_attr *ma)
+{
+        return -EPERM; /* removing entries of the "fid" dir is refused */
+}
+
+static struct md_dir_operations mdd_obf_dir_ops = {
+        .mdo_lookup = obf_lookup, /* parses the name as a FID */
+        .mdo_create = obf_create, /* always -EPERM */
+        .mdo_rename = obf_rename, /* always -EPERM */
+        .mdo_link   = obf_link,   /* always -EPERM */
+        .mdo_unlink = obf_unlink  /* always -EPERM */
+};
+
+/**
+ * Create special in-memory "fid" object for open-by-fid.
+ *
+ * Looks up LU_OBF_FID, installs the obf operation tables on the object,
+ * flags it IMMUTE_OBJ and marks its header as an existing directory.
+ * The object is published in m->mdd_dot_lustre_objs.mdd_obf only on
+ * success; on failure that field is set to NULL, so an ERR_PTR value is
+ * never left behind in the device.
+ *
+ * NOTE(review): mdd_device_shutdown() puts mdd_obf unconditionally --
+ * confirm it copes with NULL after a failed setup.
+ *
+ * Returns 0 on success, -ENOENT if the lookup returned NULL, or the error
+ * encoded in the pointer returned by mdd_object_find().
+ */
+static int mdd_obf_setup(const struct lu_env *env, struct mdd_device *m)
+{
+        struct mdd_object *mdd_obf;
+        struct lu_object *obf_lu_obj;
+        int rc = 0;
+
+        mdd_obf = mdd_object_find(env, m, &LU_OBF_FID);
+        if (mdd_obf == NULL) {
+                m->mdd_dot_lustre_objs.mdd_obf = NULL;
+                GOTO(out, rc = -ENOENT);
+        }
+        if (IS_ERR(mdd_obf)) {
+                /* keep the real cause instead of folding it into -ENOENT */
+                rc = PTR_ERR(mdd_obf);
+                m->mdd_dot_lustre_objs.mdd_obf = NULL;
+                GOTO(out, rc);
+        }
+
+        mdd_obf->mod_obj.mo_dir_ops = &mdd_obf_dir_ops;
+        mdd_obf->mod_obj.mo_ops = &mdd_obf_obj_ops;
+        /* Don't allow objects to be created in "fid" dir */
+        mdd_obf->mod_flags |= IMMUTE_OBJ;
+
+        obf_lu_obj = mdd2lu_obj(mdd_obf);
+        obf_lu_obj->lo_header->loh_attr |= (LOHA_EXISTS | S_IFDIR);
+
+        m->mdd_dot_lustre_objs.mdd_obf = mdd_obf;
+
+out:
+        return rc;
+}
+
+/**
+ * Setup ".lustre" directory object.
+ *
+ * Makes sure the ".lustre" directory exists, opens it through the dt
+ * layer, locates the mdd layer of that object, installs the ".lustre"
+ * operation tables on it and finally sets up the "fid" object.
+ *
+ * Returns 0 on success or a negative errno; -ENOENT is returned when the
+ * opened object has no mdd layer.
+ *
+ * NOTE(review): the reference obtained by dt_store_open() is dropped in
+ * this function, while mdd_device_shutdown() also puts m->mdd_dot_lustre
+ * -- confirm the reference accounting.
+ */
+static int mdd_dot_lustre_setup(const struct lu_env *env, struct mdd_device *m)
+{
+        struct dt_object *dt_dot_lustre;
+        struct lu_object *mdd_layer;
+        struct lu_fid *fid = &mdd_env_info(env)->mti_fid;
+        int rc;
+        ENTRY;          /* pairs with the RETURN()s below */
+
+        rc = create_dot_lustre_dir(env, m);
+        if (rc)
+                RETURN(rc);
+
+        dt_dot_lustre = dt_store_open(env, m->mdd_child, mdd_root_dir_name,
+                                      mdd_dot_lustre_name, fid);
+        if (IS_ERR(dt_dot_lustre)) {
+                rc = PTR_ERR(dt_dot_lustre);
+                GOTO(out, rc);
+        }
+
+        mdd_layer = lu_object_locate(dt_dot_lustre->do_lu.lo_header,
+                                     &mdd_device_type);
+
+        lu_object_put(env, &dt_dot_lustre->do_lu);
+
+        /* without an mdd layer there is nothing to hang the ops on */
+        if (mdd_layer == NULL)
+                GOTO(out, rc = -ENOENT);
+
+        /* references are released in mdd_device_shutdown() */
+        m->mdd_dot_lustre = lu2mdd_obj(mdd_layer);
+
+        m->mdd_dot_lustre->mod_obj.mo_dir_ops = &mdd_dot_lustre_dir_ops;
+        m->mdd_dot_lustre->mod_obj.mo_ops = &mdd_dot_lustre_obj_ops;
+
+        rc = mdd_obf_setup(env, m);
+        if (rc)
+                CERROR("Error initializing \"fid\" object - %d.\n", rc);
+
+out:
+        RETURN(rc);
+}
+
  static int mdd_process_config(const struct lu_env *env,
                                struct lu_device *d, struct lustre_cfg *cfg)
  {
@@ -435,8 +804,17 @@ static int mdd_prepare(const struct lu_env *env,
                  LASSERT(root != NULL);
                  lu_object_put(env, &root->do_lu);
                  rc = orph_index_init(env, mdd);
-        } else
+        } else {
                  rc = PTR_ERR(root);
+        }
+        if (rc)
+                GOTO(out, rc);
+
+        rc = mdd_dot_lustre_setup(env, mdd);
+        if (rc) {
+                CERROR("Error(%d) initializing .lustre objects\n", rc);
+                GOTO(out, rc);
+        }
  
  out:
          RETURN(rc);
diff --git a/lustre/mdd/mdd_internal.h b/lustre/mdd/mdd_internal.h

index c855f80..0d66a68 100644 (file)
--- a/lustre/mdd/mdd_internal.h
+++ b/lustre/mdd/mdd_internal.h
@@ -110,6 +110,11 @@ struct mdd_changelog {
          __u64                            mc_starttime;
  };
  
+/** Objects in .lustre dir */
+struct mdd_dot_lustre_objs {
+        struct mdd_object *mdd_obf; /* "fid" object, see mdd_obf_setup() */
+};
+
  struct mdd_device {
          struct md_device                 mdd_md_dev;
          struct dt_device                *mdd_child;
@@ -123,6 +128,8 @@ struct mdd_device {
          struct mdd_txn_op_descr          mdd_tod[MDD_TXN_LAST_OP];
          struct mdd_changelog             mdd_cl;
          unsigned long                    mdd_atime_diff;
+        struct mdd_object               *mdd_dot_lustre;
+        struct mdd_dot_lustre_objs       mdd_dot_lustre_objs;
  };
  
  enum mod_flags {
@@ -362,6 +369,8 @@ extern const struct lu_device_operations mdd_lu_ops;
  struct mdd_object *mdd_object_find(const struct lu_env *env,
                                     struct mdd_device *d,
                                     const struct lu_fid *f);
+int mdd_get_default_md(struct mdd_object *mdd_obj, struct lov_mds_md *lmm,
+                       int *size);
  
  /* mdd_quota.c*/
  #ifdef HAVE_QUOTA_SUPPORT
diff --git a/lustre/mdd/mdd_object.c b/lustre/mdd/mdd_object.c

index 0cf918a..9d0dbc1 100644 (file)
--- a/lustre/mdd/mdd_object.c
+++ b/lustre/mdd/mdd_object.c
@@ -582,8 +582,8 @@ int mdd_iattr_get(const struct lu_env *env, struct mdd_object *mdd_obj,
          RETURN(rc);
  }
  
-static int mdd_get_default_md(struct mdd_object *mdd_obj,
-                struct lov_mds_md *lmm, int *size)
+int mdd_get_default_md(struct mdd_object *mdd_obj, struct lov_mds_md *lmm,
+                       int *size)
  {
          struct lov_desc *ldesc;
          struct mdd_device *mdd = mdo2mdd(&mdd_obj->mod_obj);
diff --git a/lustre/mds/Makefile.in b/lustre/mds/Makefile.in

index a6400b8..0bb2876 100644 (file)
--- a/lustre/mds/Makefile.in
+++ b/lustre/mds/Makefile.in
@@ -1,4 +1,6 @@
  MODULES := mds
  mds-objs := handler.o  lproc_mds.o  mds_fs.o  mds_log.o  mds_lov.o
  
+EXTRA_DIST := $(mds-objs:%.o=%.c) mds_internal.h
+
  @INCLUDE_RULES@
diff --git a/lustre/mds/autoMakefile.am b/lustre/mds/autoMakefile.am

index d2aafc6..4cc5dea 100644 (file)
--- a/lustre/mds/autoMakefile.am
+++ b/lustre/mds/autoMakefile.am
@@ -39,4 +39,3 @@ modulefs_DATA = mds$(KMODEXT)
  endif
  
  MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
-DIST_SOURCES := $(mds-objs:%.o=%.c) mds_internal.h
diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c

index d0a2076..9c6142f 100644 (file)
--- a/lustre/mds/mds_lov.c
+++ b/lustre/mds/mds_lov.c
@@ -601,7 +601,6 @@ out:
  int mds_lov_connect(struct obd_device *obd, char * lov_name)
  {
          struct mds_obd *mds = &obd->u.mds;
-        struct lustre_handle conn = {0,};
          struct obd_connect_data *data;
          int rc;
          ENTRY;
@@ -655,14 +654,13 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name)
          /* send the list of supported checksum types */
          data->ocd_cksum_types = OBD_CKSUM_ALL;
          /* NB: lov_connect() needs to fill in .ocd_index for each OST */
-        rc = obd_connect(NULL, &conn, mds->mds_osc_obd, &obd->obd_uuid, data, NULL);
+        rc = obd_connect(NULL, &mds->mds_osc_exp, mds->mds_osc_obd, &obd->obd_uuid, data, NULL);
          OBD_FREE(data, sizeof(*data));
          if (rc) {
                  CERROR("MDS cannot connect to LOV %s (%d)\n", lov_name, rc);
                  mds->mds_osc_obd = ERR_PTR(rc);
                  RETURN(rc);
          }
-        mds->mds_osc_exp = class_conn2export(&conn);
  
          /* I want to see a callback happen when the OBD moves to a
           * "For General Use" state, and that's when we'll call
diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c

index 7acac5d..b46baaf 100644 (file)
--- a/lustre/mdt/mdt_handler.c
+++ b/lustre/mdt/mdt_handler.c
@@ -1168,6 +1168,10 @@ static int mdt_sendpage(struct mdt_thread_info *info,
          }
  
          LASSERT(desc->bd_nob == rdpg->rp_count);
+        rc = sptlrpc_svc_wrap_bulk(req, desc);
+        if (rc)
+                GOTO(free_desc, rc);
+
          rc = ptlrpc_start_bulk_transfer(desc);
          if (rc)
                  GOTO(free_desc, rc);
@@ -1327,6 +1331,9 @@ static int mdt_writepage(struct mdt_thread_info *info)
          ptlrpc_prep_bulk_page(desc, page, (int)reqbody->size,
                                (int)reqbody->nlink);
  
+        rc = sptlrpc_svc_prep_bulk(req, desc);
+        if (rc != 0)
+                GOTO(cleanup_page, rc);
          /*
           * Check if client was evicted while we were doing i/o before touching
           * network.
@@ -2771,6 +2778,15 @@ static int mdt_handle0(struct ptlrpc_request *req,
          if (likely(rc == 0)) {
                  rc = mdt_recovery(info);
                  if (likely(rc == +1)) {
+                        switch (lustre_msg_get_opc(msg)) {
+                        case MDS_READPAGE:
+                                req->rq_bulk_read = 1;
+                                break;
+                        case MDS_WRITEPAGE:
+                                req->rq_bulk_write = 1;
+                                break;
+                        }
+
                          h = mdt_handler_find(lustre_msg_get_opc(msg),
                                               supported);
                          if (likely(h != NULL)) {
@@ -4826,39 +4842,40 @@ static int mdt_connect_check_sptlrpc(struct mdt_device *mdt,
  
  /* mds_connect copy */
  static int mdt_obd_connect(const struct lu_env *env,
-                           struct lustre_handle *conn, struct obd_device *obd,
+                           struct obd_export **exp, struct obd_device *obd,
                             struct obd_uuid *cluuid,
                             struct obd_connect_data *data,
                             void *localdata)
  {
          struct mdt_thread_info *info;
          struct lsd_client_data *lcd;
-        struct obd_export      *exp;
+        struct obd_export      *lexp;
+        struct lustre_handle    conn = { 0 };
          struct mdt_device      *mdt;
          struct ptlrpc_request  *req;
          int                     rc;
          ENTRY;
  
          LASSERT(env != NULL);
-        if (!conn || !obd || !cluuid)
+        if (!exp || !obd || !cluuid)
                  RETURN(-EINVAL);
  
          info = lu_context_key_get(&env->le_ctx, &mdt_thread_key);
          req = info->mti_pill->rc_req;
          mdt = mdt_dev(obd->obd_lu_dev);
  
-        rc = class_connect(conn, obd, cluuid);
+        rc = class_connect(&conn, obd, cluuid);
          if (rc)
                  RETURN(rc);
  
-        exp = class_conn2export(conn);
-        LASSERT(exp != NULL);
+        lexp = class_conn2export(&conn);
+        LASSERT(lexp != NULL);
  
-        rc = mdt_connect_check_sptlrpc(mdt, exp, req);
+        rc = mdt_connect_check_sptlrpc(mdt, lexp, req);
          if (rc)
                  GOTO(out, rc);
  
-        rc = mdt_connect_internal(exp, mdt, data);
+        rc = mdt_connect_internal(lexp, mdt, data);
          if (rc == 0) {
                  OBD_ALLOC_PTR(lcd);
                  if (lcd != NULL) {
@@ -4866,15 +4883,15 @@ static int mdt_obd_connect(const struct lu_env *env,
                          mti = lu_context_key_get(&env->le_ctx,
                                                   &mdt_thread_key);
                          LASSERT(mti != NULL);
-                        mti->mti_exp = exp;
+                        mti->mti_exp = lexp;
                          memcpy(lcd->lcd_uuid, cluuid, sizeof lcd->lcd_uuid);
-                        exp->exp_mdt_data.med_lcd = lcd;
+                        lexp->exp_mdt_data.med_lcd = lcd;
                          rc = mdt_client_new(env, mdt);
                          if (rc != 0) {
                                  OBD_FREE_PTR(lcd);
-                                exp->exp_mdt_data.med_lcd = NULL;
+                                lexp->exp_mdt_data.med_lcd = NULL;
                          } else {
-                                mdt_export_stats_init(obd, exp, localdata);
+                                mdt_export_stats_init(obd, lexp, localdata);
                          }
                  } else
                          rc = -ENOMEM;
@@ -4882,9 +4899,9 @@ static int mdt_obd_connect(const struct lu_env *env,
  
  out:
          if (rc != 0)
-                class_disconnect(exp);
+                class_disconnect(lexp);
          else
-                class_export_put(exp);
+                *exp = lexp;
  
          RETURN(rc);
  }
diff --git a/lustre/mdt/mdt_open.c b/lustre/mdt/mdt_open.c

index e2e8802..ba2d4c2 100644 (file)
--- a/lustre/mdt/mdt_open.c
+++ b/lustre/mdt/mdt_open.c
@@ -896,9 +896,9 @@ int mdt_reint_open(struct mdt_thread_info *info, struct mdt_lock_handle *lhc)
          LASSERT(info->mti_pill->rc_fmt == &RQF_LDLM_INTENT_OPEN);
          ldlm_rep = req_capsule_server_get(info->mti_pill, &RMF_DLM_REP);
  
-        /* TODO: JOIN file */
+        /* JOIN file was deprecated since 1.6.5, but may be revived one day */
          if (create_flags & MDS_OPEN_JOIN_FILE) {
-                CERROR("JOIN file will be supported soon\n");
+                CERROR("file join is unsupported in this version of Lustre\n");
                  GOTO(out, result = err_serious(-EOPNOTSUPP));
          }
          msg_flags = lustre_msg_get_flags(req->rq_reqmsg);
@@ -1234,8 +1234,11 @@ int mdt_close(struct mdt_thread_info *info)
          req_capsule_set_size(info->mti_pill, &RMF_LOGCOOKIES, RCL_SERVER,
                               info->mti_mdt->mdt_max_cookiesize);
          rc = req_capsule_server_pack(info->mti_pill);
-        if (mdt_check_resent(info, mdt_reconstruct_generic, NULL))
+        if (mdt_check_resent(info, mdt_reconstruct_generic, NULL)) {
+                if (rc == 0)
+                        mdt_shrink_reply(info);
                  RETURN(lustre_msg_get_status(req->rq_repmsg));
+        }
  
          /* Continue to close handle even if we can not pack reply */
          if (rc == 0) {
diff --git a/lustre/mdt/mdt_recovery.c b/lustre/mdt/mdt_recovery.c

index b5e263e..c32ae5a 100644 (file)
--- a/lustre/mdt/mdt_recovery.c
+++ b/lustre/mdt/mdt_recovery.c
@@ -49,7 +49,8 @@
  #include "mdt_internal.h"
  
  static int mdt_server_data_update(const struct lu_env *env,
-                                  struct mdt_device *mdt);
+                                  struct mdt_device *mdt,
+                                  int need_sync);
  
  struct lu_buf *mdt_buf(const struct lu_env *env, void *area, ssize_t len)
  {
@@ -243,8 +244,16 @@ static inline int mdt_last_rcvd_header_read(const struct lu_env *env,
          return rc;
  }
  
+/*
+ * Transaction callback: resolve the obd device of @mdt and pass
+ * @transno, @data and @err on to target_client_add_cb().
+ */
+static void mdt_client_cb(const struct mdt_device *mdt, __u64 transno,
+                          void *data, int err)
+{
+        target_client_add_cb(mdt2obd_dev(mdt), transno, data, err);
+}
+
  static inline int mdt_last_rcvd_header_write(const struct lu_env *env,
-                                             struct mdt_device *mdt)
+                                             struct mdt_device *mdt,
+                                             int need_sync)
  {
          struct mdt_thread_info *mti;
          struct thandle *th;
@@ -253,6 +262,11 @@ static inline int mdt_last_rcvd_header_write(const struct lu_env *env,
  
          mti = lu_context_key_get(&env->le_ctx, &mdt_thread_key);
  
+        if (mti->mti_exp) {
+                spin_lock(&mti->mti_exp->exp_lock);
+                mti->mti_exp->exp_need_sync = need_sync;
+                spin_unlock(&mti->mti_exp->exp_lock);
+        }
          mdt_trans_credit_init(env, mdt, MDT_TXN_LAST_RCVD_WRITE_OP);
          th = mdt_trans_start(env, mdt);
          if (IS_ERR(th))
@@ -261,6 +275,9 @@ static inline int mdt_last_rcvd_header_write(const struct lu_env *env,
          mti->mti_off = 0;
          lsd_cpu_to_le(&mdt->mdt_lsd, &mti->mti_lsd);
  
+        if (need_sync && mti->mti_exp)
+                mdt_trans_add_cb(th, mdt_client_cb, mti->mti_exp);
+
          rc = mdt_record_write(env, mdt->mdt_last_rcvd,
                                mdt_buf_const(env, &mti->mti_lsd,
                                              sizeof(mti->mti_lsd)),
@@ -561,7 +578,8 @@ static int mdt_server_data_init(const struct lu_env *env,
          lsd->lsd_mount_count = mdt->mdt_mount_count;
  
          /* save it, so mount count and last_transno is current */
-        rc = mdt_server_data_update(env, mdt);
+        rc = mdt_server_data_update(env, mdt, (mti->mti_exp && 
+                                               mti->mti_exp->exp_need_sync));
          if (rc)
                  GOTO(err_client, rc);
  
@@ -574,7 +592,8 @@ out:
  }
  
  static int mdt_server_data_update(const struct lu_env *env,
-                                  struct mdt_device *mdt)
+                                  struct mdt_device *mdt,
+                                  int need_sync)
  {
          int rc = 0;
          ENTRY;
@@ -591,18 +610,10 @@ static int mdt_server_data_update(const struct lu_env *env,
           * mdt->mdt_last_rcvd may be NULL that time.
           */
          if (mdt->mdt_last_rcvd != NULL)
-                rc = mdt_last_rcvd_header_write(env, mdt);
+                rc = mdt_last_rcvd_header_write(env, mdt, need_sync);
          RETURN(rc);
  }
  
-void mdt_cb_new_client(const struct mdt_device *mdt, __u64 transno,
-                                  void *data, int err)
-{
-        struct obd_device *obd = mdt2obd_dev(mdt);
-
-        target_client_add_cb(obd, transno, data, err);
-}
-
  int mdt_client_new(const struct lu_env *env, struct mdt_device *mdt)
  {
          unsigned long *bitmap = mdt->mdt_client_bitmap;
@@ -651,16 +662,22 @@ int mdt_client_new(const struct lu_env *env, struct mdt_device *mdt)
          init_mutex(&med->med_lcd_lock);
  
          LASSERTF(med->med_lr_off > 0, "med_lr_off = %llu\n", med->med_lr_off);
-        /* write new client data */
+
+        /* Write new client data. */
          off = med->med_lr_off;
          mdt_trans_credit_init(env, mdt, MDT_TXN_LAST_RCVD_WRITE_OP);
+
          th = mdt_trans_start(env, mdt);
          if (IS_ERR(th))
                  RETURN(PTR_ERR(th));
  
-        /* until this operations will be committed the sync is needed for this
-         * export */
-        mdt_trans_add_cb(th, mdt_cb_new_client, mti->mti_exp);
+        /*
+         * Until this operation is committed, a sync is needed for
+         * this export. This should be done _after_ starting the
+         * transaction so that many connecting clients will not bring
+         * the server down with lots of sync writes.
+         */
+        mdt_trans_add_cb(th, mdt_client_cb, mti->mti_exp);
          spin_lock(&mti->mti_exp->exp_lock);
          mti->mti_exp->exp_need_sync = 1;
          spin_unlock(&mti->mti_exp->exp_lock);
@@ -730,21 +747,24 @@ int mdt_client_del(const struct lu_env *env, struct mdt_device *mdt)
          struct mdt_export_data *med;
          struct lsd_client_data *lcd;
          struct obd_device      *obd = mdt2obd_dev(mdt);
-        struct thandle *th;
-        loff_t off;
-        int rc = 0;
+        struct obd_export      *exp;
+        struct thandle         *th;
+        int                     need_sync;
+        loff_t                  off;
+        int                     rc = 0;
          ENTRY;
  
          mti = lu_context_key_get(&env->le_ctx, &mdt_thread_key);
          LASSERT(mti != NULL);
  
-        med = &mti->mti_exp->exp_mdt_data;
+        exp = mti->mti_exp;
+        med = &exp->exp_mdt_data;
          lcd = med->med_lcd;
          if (!lcd)
                  RETURN(0);
  
          /* XXX: If lcd_uuid were a real obd_uuid, I could use obd_uuid_equals */
-        if (!strcmp(med->med_lcd->lcd_uuid, obd->obd_uuid.uuid))
+        if (!strcmp(lcd->lcd_uuid, obd->obd_uuid.uuid))
                  GOTO(free, 0);
  
          CDEBUG(D_INFO, "freeing client at idx %u, offset %lld\n",
@@ -772,16 +792,34 @@ int mdt_client_del(const struct lu_env *env, struct mdt_device *mdt)
                  LBUG();
          }
  
+        /* Don't force a sync on disconnect if aborting recovery,
+         * or it does num_clients * num_osts syncs.  b=17194 */
+        need_sync = (!exp->exp_libclient || exp->exp_need_sync) &&
+                     !(exp->exp_flags & OBD_OPT_ABORT_RECOV);
+
          /*
           * This may be called from difficult reply handler path and
           * mdt->mdt_last_rcvd may be NULL that time.
           */
          if (mdt->mdt_last_rcvd != NULL) {
                  mdt_trans_credit_init(env, mdt, MDT_TXN_LAST_RCVD_WRITE_OP);
+
+                spin_lock(&exp->exp_lock);
+                exp->exp_need_sync = need_sync;
+                spin_unlock(&exp->exp_lock);
+
                  th = mdt_trans_start(env, mdt);
                  if (IS_ERR(th))
                          GOTO(free, rc = PTR_ERR(th));
  
+                if (need_sync) {
+                        /*
+                         * Until this operation is committed, a sync is
+                         * needed for this export.
+                         */
+                        mdt_trans_add_cb(th, mdt_client_cb, exp);
+                }
+
                  mutex_down(&med->med_lcd_lock);
                  memset(lcd, 0, sizeof *lcd);
  
@@ -791,18 +829,20 @@ int mdt_client_del(const struct lu_env *env, struct mdt_device *mdt)
          }
  
          CDEBUG(rc == 0 ? D_INFO : D_ERROR, "Zeroing out client idx %u in "
-               "%s rc %d\n",  med->med_lr_idx, LAST_RCVD, rc);
+               "%s %ssync rc %d\n",  med->med_lr_idx, LAST_RCVD, 
+               need_sync ? "" : "a", rc);
  
          spin_lock(&mdt->mdt_client_bitmap_lock);
          clear_bit(med->med_lr_idx, mdt->mdt_client_bitmap);
          spin_unlock(&mdt->mdt_client_bitmap_lock);
  
-        /*
-         * Make sure the server's last_transno is up to date. Do this after the
-         * client is freed so we know all the client's transactions have been
-         * committed.
+        /* 
+         * Make sure the server's last_transno is up to date. Do this
+         * after the client is freed so we know all the client's
+         * transactions have been committed. 
           */
-        mdt_server_data_update(env, mdt);
+        mdt_server_data_update(env, mdt, need_sync);
+
          EXIT;
  free:
          OBD_FREE_PTR(lcd);
@@ -866,7 +906,9 @@ static int mdt_last_rcvd_update(struct mdt_thread_info *mti,
           */
          if (mti->mti_transno == 0 &&
              *transno_p == mdt->mdt_last_transno)
-                mdt_server_data_update(mti->mti_env, mdt);
+                mdt_server_data_update(mti->mti_env, mdt, 
+                                      (mti->mti_exp && 
+                                       mti->mti_exp->exp_need_sync));
  
          *transno_p = mti->mti_transno;
  
diff --git a/lustre/mgc/Makefile.in b/lustre/mgc/Makefile.in

index 8adca32..7ce8a37 100644 (file)
--- a/lustre/mgc/Makefile.in
+++ b/lustre/mgc/Makefile.in
@@ -1,4 +1,6 @@
  MODULES := mgc
  mgc-objs := mgc_request.o lproc_mgc.o
  
+EXTRA_DIST := $(mgc-objs:%.o=%.c) libmgc.c mgc_internal.h
+
  @INCLUDE_RULES@
diff --git a/lustre/mgc/autoMakefile.am b/lustre/mgc/autoMakefile.am

index db9a433..e337ea9 100644 (file)
--- a/lustre/mgc/autoMakefile.am
+++ b/lustre/mgc/autoMakefile.am
@@ -46,4 +46,3 @@ modulefs_DATA = mgc$(KMODEXT)
  endif
  
  MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
-DIST_SOURCES := $(mgc-objs:%.o=%.c) libmgc.c mgc_internal.h
diff --git a/lustre/mgs/Makefile.in b/lustre/mgs/Makefile.in

index 8bb6a5f..413f381 100644 (file)
--- a/lustre/mgs/Makefile.in
+++ b/lustre/mgs/Makefile.in
@@ -1,4 +1,6 @@
  MODULES := mgs
  mgs-objs := mgs_handler.o mgs_fs.o mgs_llog.o lproc_mgs.o
  
+EXTRA_DIST := $(mgs-objs:%.o=%.c) mgs_internal.h
+
  @INCLUDE_RULES@
diff --git a/lustre/mgs/autoMakefile.am b/lustre/mgs/autoMakefile.am

index c538cb4..a57c433 100644 (file)
--- a/lustre/mgs/autoMakefile.am
+++ b/lustre/mgs/autoMakefile.am
@@ -39,4 +39,3 @@ modulefs_DATA = mgs$(KMODEXT)
  endif
  
  MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
-DIST_SOURCES := $(mgs-objs:%.o=%.c) mgs_internal.h
diff --git a/lustre/mgs/mgs_handler.c b/lustre/mgs/mgs_handler.c

index 581f1e1..8861a75 100644 (file)
--- a/lustre/mgs/mgs_handler.c
+++ b/lustre/mgs/mgs_handler.c
@@ -63,37 +63,39 @@
  
  /* Establish a connection to the MGS.*/
  static int mgs_connect(const struct lu_env *env,
-                       struct lustre_handle *conn, struct obd_device *obd,
+                       struct obd_export **exp, struct obd_device *obd,
                         struct obd_uuid *cluuid, struct obd_connect_data *data,
                         void *localdata)
  {
-        struct obd_export *exp;
+        struct obd_export *lexp;
+        struct lustre_handle conn = { 0 };
          int rc;
          ENTRY;
  
-        if (!conn || !obd || !cluuid)
+        if (!exp || !obd || !cluuid)
                  RETURN(-EINVAL);
  
-        rc = class_connect(conn, obd, cluuid);
+        rc = class_connect(&conn, obd, cluuid);
          if (rc)
                  RETURN(rc);
-        exp = class_conn2export(conn);
-        LASSERT(exp);
  
-        mgs_counter_incr(exp, LPROC_MGS_CONNECT);
+        lexp = class_conn2export(&conn);
+        LASSERT(lexp);
+
+        mgs_counter_incr(lexp, LPROC_MGS_CONNECT);
  
          if (data != NULL) {
                  data->ocd_connect_flags &= MGS_CONNECT_SUPPORTED;
-                exp->exp_connect_flags = data->ocd_connect_flags;
+                lexp->exp_connect_flags = data->ocd_connect_flags;
                  data->ocd_version = LUSTRE_VERSION_CODE;
          }
  
-        rc = mgs_client_add(obd, exp, localdata);
+        rc = mgs_client_add(obd, lexp, localdata);
  
          if (rc) {
-                class_disconnect(exp);
+                class_disconnect(lexp);
          } else {
-                class_export_put(exp);
+                *exp = lexp;
          }
  
          RETURN(rc);
@@ -220,7 +222,11 @@ static int mgs_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
          ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL,
                             "mgs_ldlm_client", &obd->obd_ldlm_client);
  
-        LASSERT(!lvfs_check_rdonly(lvfs_sbdev(mnt->mnt_sb)));
+        if (lvfs_check_rdonly(lvfs_sbdev(mnt->mnt_sb))) {
+                CERROR("%s: Underlying device is marked as read-only. "
+                       "Setup failed\n", obd->obd_name);
+                GOTO(err_ops, rc = -EROFS);
+        }
  
          rc = mgs_fs_setup(obd, mnt);
          if (rc) {
diff --git a/lustre/mgs/mgs_llog.c b/lustre/mgs/mgs_llog.c

index e328f33..e5adb2f 100644 (file)
--- a/lustre/mgs/mgs_llog.c
+++ b/lustre/mgs/mgs_llog.c
@@ -2032,7 +2032,7 @@ static int mgs_srpc_set_param_mem(struct fs_db *fsdb,
                  rset = &fsdb->fsdb_srpc_gen;
          }
  
-        rc = sptlrpc_rule_set_merge(rset, &rule, 1);
+        rc = sptlrpc_rule_set_merge(rset, &rule);
  
          RETURN(rc);
  }
@@ -2046,6 +2046,9 @@ static int mgs_srpc_set_param(struct obd_device *obd,
          int                     rc, copy_size;
          ENTRY;
  
+#ifndef HAVE_GSS
+        RETURN(-EINVAL);
+#endif
          /* keep a copy of original param, which could be destroied
           * during parsing */
          copy_size = strlen(param) + 1;
diff --git a/lustre/obdclass/Makefile.in b/lustre/obdclass/Makefile.in

index 90d898e..1ab1d54 100644 (file)
--- a/lustre/obdclass/Makefile.in
+++ b/lustre/obdclass/Makefile.in
@@ -26,4 +26,7 @@ llog_test-objs := llog-test.o
  $(obj)/llog-test.c: $(obj)/llog_test.c
         ln -sf $< $@
  
+EXTRA_DIST  = $(filter-out llog-test.c,$(obdclass-all-objs:.o=.c)) $(llog-test-objs:.o=.c) llog_test.c llog_internal.h
+EXTRA_DIST += cl_internal.h
+
  @INCLUDE_RULES@
diff --git a/lustre/obdclass/autoMakefile.am b/lustre/obdclass/autoMakefile.am

index b7fb43e..af30e10 100644 (file)
--- a/lustre/obdclass/autoMakefile.am
+++ b/lustre/obdclass/autoMakefile.am
@@ -54,4 +54,3 @@ install-data-hook: $(install_data_hook)
  
  MOSTLYCLEANFILES := @MOSTLYCLEANFILES@  llog-test.c
  MOSTLYCLEANFILES += linux/*.o darwin/*.o
-DIST_SOURCES = $(filter-out llog-test.c,$(obdclass-all-objs:.o=.c)) $(llog-test-objs:.o=.c) llog_test.c llog_internal.h cl_internal.h
diff --git a/lustre/obdclass/cl_page.c b/lustre/obdclass/cl_page.c

index feac1ff..e88427b 100644 (file)
--- a/lustre/obdclass/cl_page.c
+++ b/lustre/obdclass/cl_page.c
@@ -1259,7 +1259,12 @@ void cl_page_completion(const struct lu_env *env,
                                 (const struct lu_env *,
                                  const struct cl_page_slice *, int), ioret);
  
-        KLASSERT(!PageWriteback(cl_page_vmpage(env, pg)));
+        /* Don't assert the page writeback bit here because the Lustre file
+         * may be used as a backend for swap space. In this case, the page
+         * writeback bit is set by the VM, and obviously we shouldn't clear
+         * it at all. Fortunately such pages are all TRANSIENT pages. */
+        KLASSERT(ergo(pg->cp_type == CPT_CACHEABLE,
+                      !PageWriteback(cl_page_vmpage(env, pg))));
          EXIT;
  }
  EXPORT_SYMBOL(cl_page_completion);
diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c

index 63f443a..7341aaa 100644 (file)
--- a/lustre/obdclass/genops.c
+++ b/lustre/obdclass/genops.c
@@ -1068,7 +1068,8 @@ int class_disconnect(struct obd_export *export)
          RETURN(0);
  }
  
-static void class_disconnect_export_list(struct list_head *list, int flags)
+static void class_disconnect_export_list(struct list_head *list,
+                                         enum obd_option flags)
  {
          int rc;
          struct lustre_handle fake_conn;
@@ -1118,12 +1119,6 @@ static void class_disconnect_export_list(struct list_head *list, int flags)
          EXIT;
  }
  
-static inline int get_exp_flags_from_obd(struct obd_device *obd)
-{
-        return ((obd->obd_fail ? OBD_OPT_FAILOVER : 0) |
-                (obd->obd_force ? OBD_OPT_FORCE : 0));
-}
-
  void class_disconnect_exports(struct obd_device *obd)
  {
          struct list_head work_list;
@@ -1139,7 +1134,7 @@ void class_disconnect_exports(struct obd_device *obd)
                  CDEBUG(D_HA, "OBD device %d (%p) has exports, "
                         "disconnecting them\n", obd->obd_minor, obd);
                  class_disconnect_export_list(&work_list,
-                                             get_exp_flags_from_obd(obd));
+                                             exp_flags_from_obd(obd));
          } else
                  CDEBUG(D_HA, "OBD device %d (%p) has no exports\n",
                         obd->obd_minor, obd);
@@ -1150,7 +1145,8 @@ EXPORT_SYMBOL(class_disconnect_exports);
  /* Remove exports that have not completed recovery.
   */
  int class_disconnect_stale_exports(struct obd_device *obd,
-                                   int (*test_export)(struct obd_export *))
+                                   int (*test_export)(struct obd_export *),
+                                   enum obd_option flags)
  {
          struct list_head work_list;
          struct list_head *pos, *n;
@@ -1182,7 +1178,7 @@ int class_disconnect_stale_exports(struct obd_device *obd,
  
          CDEBUG(D_ERROR, "%s: disconnecting %d stale clients\n",
                 obd->obd_name, cnt);
-        class_disconnect_export_list(&work_list, get_exp_flags_from_obd(obd));
+        class_disconnect_export_list(&work_list, flags);
          RETURN(cnt);
  }
  EXPORT_SYMBOL(class_disconnect_stale_exports);
diff --git a/lustre/obdclass/llog_test.c b/lustre/obdclass/llog_test.c

index 21be99b..ce887c8 100644 (file)
--- a/lustre/obdclass/llog_test.c
+++ b/lustre/obdclass/llog_test.c
@@ -501,7 +501,6 @@ static int llog_test_6(struct obd_device *obd, char *name)
          struct obd_device *mgc_obd;
          struct llog_ctxt *ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
          struct obd_uuid *mgs_uuid = &ctxt->loc_exp->exp_obd->obd_uuid;
-        struct lustre_handle exph = {0, };
          struct obd_export *exp;
          struct obd_uuid uuid = {"LLOG_TEST6_UUID"};
          struct llog_handle *llh = NULL;
@@ -516,13 +515,13 @@ static int llog_test_6(struct obd_device *obd, char *name)
                  GOTO(ctxt_release, rc = -ENOENT);
          }
  
-        rc = obd_connect(NULL, &exph, mgc_obd, &uuid,
+        rc = obd_connect(NULL, &exp, mgc_obd, &uuid,
                           NULL /* obd_connect_data */, NULL);
          if (rc) {
                  CERROR("6: failed to connect to MGC: %s\n", mgc_obd->obd_name);
                  GOTO(ctxt_release, rc);
          }
-        exp = class_conn2export(&exph);
+        LASSERTF(exp->exp_obd == mgc_obd, "%p - %p - %p\n", exp, exp->exp_obd, mgc_obd);
  
          nctxt = llog_get_context(mgc_obd, LLOG_CONFIG_REPL_CTXT);
          rc = llog_create(nctxt, &llh, NULL, name);
@@ -552,6 +551,8 @@ parse_out:
          if (rc) {
                  CERROR("6: llog_close failed: rc = %d\n", rc);
          }
+       CDEBUG(D_INFO, "obd %p - %p - %p - %p\n",
+              mgc_obd, exp, exp->exp_obd, exp->exp_obd->obd_type);
          rc = obd_disconnect(exp);
  ctxt_release:
          llog_ctxt_put(ctxt);
diff --git a/lustre/obdclass/lu_object.c b/lustre/obdclass/lu_object.c

index 5cae3c1..d5f6f5c 100644 (file)
--- a/lustre/obdclass/lu_object.c
+++ b/lustre/obdclass/lu_object.c
@@ -1193,6 +1193,7 @@ void *lu_context_key_get(const struct lu_context *ctx,
  {
          LINVRNT(ctx->lc_state == LCS_ENTERED);
          LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys));
+        LASSERT(lu_keys[key->lct_index] == key);
          return ctx->lc_value[key->lct_index];
  }
  EXPORT_SYMBOL(lu_context_key_get);
diff --git a/lustre/obdclass/obd_config.c b/lustre/obdclass/obd_config.c

index eb8d415..7783a3a 100644 (file)
--- a/lustre/obdclass/obd_config.c
+++ b/lustre/obdclass/obd_config.c
@@ -598,9 +598,7 @@ void class_decref(struct obd_device *obd, const char *scope, const void *source)
                     be no more in-progress ops by this point.*/
  
                  spin_lock(&obd->obd_self_export->exp_lock);
-                obd->obd_self_export->exp_flags |=
-                        (obd->obd_fail ? OBD_OPT_FAILOVER : 0) |
-                        (obd->obd_force ? OBD_OPT_FORCE : 0);
+                obd->obd_self_export->exp_flags |= exp_flags_from_obd(obd);
                  spin_unlock(&obd->obd_self_export->exp_lock);
  
                  /* note that we'll recurse into class_decref again */
diff --git a/lustre/obdclass/obd_mount.c b/lustre/obdclass/obd_mount.c

index f568574..04d012c 100644 (file)
--- a/lustre/obdclass/obd_mount.c
+++ b/lustre/obdclass/obd_mount.c
@@ -567,7 +567,6 @@ DECLARE_MUTEX(mgc_start_lock);
   */
  static int lustre_start_mgc(struct super_block *sb)
  {
-        struct lustre_handle mgc_conn = {0, };
          struct obd_connect_data *data = NULL;
          struct lustre_sb_info *lsi = s2lsi(sb);
          struct obd_device *obd;
@@ -768,14 +767,13 @@ static int lustre_start_mgc(struct super_block *sb)
          data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_FID |
                                    OBD_CONNECT_AT;
          data->ocd_version = LUSTRE_VERSION_CODE;
-        rc = obd_connect(NULL, &mgc_conn, obd, &(obd->obd_uuid), data, NULL);
+        rc = obd_connect(NULL, &exp, obd, &(obd->obd_uuid), data, NULL);
          OBD_FREE_PTR(data);
          if (rc) {
                  CERROR("connect failed %d\n", rc);
                  GOTO(out, rc);
          }
  
-        exp = class_conn2export(&mgc_conn);
          obd->u.cli.cl_mgc_mgsexp = exp;
  
  out:
@@ -1358,6 +1356,10 @@ static struct vfsmount *server_kernel_mount(struct super_block *sb)
                  GOTO(out_free, rc);
          }
  
+        if (lmd->lmd_flags & LMD_FLG_ABORT_RECOV)
+                simple_truncate(mnt->mnt_sb->s_root, mnt, LAST_RCVD,
+                                LR_CLIENT_START);
+
          OBD_PAGE_FREE(__page);
          lsi->lsi_ldd = ldd;   /* freed at lsi cleanup */
          CDEBUG(D_SUPER, "%s: mnt = %p\n", lmd->lmd_dev, mnt);
diff --git a/lustre/obdecho/Makefile.in b/lustre/obdecho/Makefile.in

index 66e61ed..c9069e5 100644 (file)
--- a/lustre/obdecho/Makefile.in
+++ b/lustre/obdecho/Makefile.in
@@ -1,4 +1,6 @@
  MODULES := obdecho
  obdecho-objs := echo.o echo_client.o lproc_echo.o
  
+EXTRA_DIST = $(obdecho-objs:%.o=%.c) echo_internal.h
+
  @INCLUDE_RULES@
diff --git a/lustre/obdecho/autoMakefile.am b/lustre/obdecho/autoMakefile.am

index c8b7df3..313b0f8 100644 (file)
--- a/lustre/obdecho/autoMakefile.am
+++ b/lustre/obdecho/autoMakefile.am
@@ -68,4 +68,3 @@ endif # MODULES
  install-data-hook: $(install_data_hook)
  
  MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
-DIST_SOURCES = $(obdecho-objs:%.o=%.c) echo_internal.h
diff --git a/lustre/obdecho/echo.c b/lustre/obdecho/echo.c

index d69cf59..6684c7a 100644 (file)
--- a/lustre/obdecho/echo.c
+++ b/lustre/obdecho/echo.c
@@ -66,12 +66,22 @@ enum {
  };
  
  static int echo_connect(const struct lu_env *env,
-                        struct lustre_handle *conn, struct obd_device *obd,
+                        struct obd_export **exp, struct obd_device *obd,
                          struct obd_uuid *cluuid, struct obd_connect_data *data,
                          void *localdata)
  {
+        struct lustre_handle conn = { 0 };
+        int rc;
+
          data->ocd_connect_flags &= ECHO_CONNECT_SUPPORTED;
-        return class_connect(conn, obd, cluuid);
+        rc = class_connect(&conn, obd, cluuid);
+        if (rc) {
+                CERROR("can't connect %d\n", rc);
+                return rc;
+        }
+        *exp = class_conn2export(&conn);
+
+        return 0;
  }
  
  static int echo_disconnect(struct obd_export *exp)
diff --git a/lustre/obdecho/echo_client.c b/lustre/obdecho/echo_client.c

index 01b9572..16cab35 100644 (file)
--- a/lustre/obdecho/echo_client.c
+++ b/lustre/obdecho/echo_client.c
@@ -1870,7 +1870,6 @@ static int echo_client_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
  {
          struct echo_client_obd *ec = &obddev->u.echo_client;
          struct obd_device *tgt;
-        struct lustre_handle conn = {0, };
          struct obd_uuid echo_uuid = { "ECHO_UUID" };
          struct obd_connect_data *ocd = NULL;
          int rc;
@@ -1906,7 +1905,7 @@ static int echo_client_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
          ocd->ocd_version = LUSTRE_VERSION_CODE;
          ocd->ocd_group = FILTER_GROUP_ECHO;
  
-        rc = obd_connect(NULL, &conn, tgt, &echo_uuid, ocd, NULL);
+        rc = obd_connect(NULL, &ec->ec_exp, tgt, &echo_uuid, ocd, NULL);
  
          OBD_FREE(ocd, sizeof(*ocd));
  
@@ -1915,7 +1914,6 @@ static int echo_client_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
                         lustre_cfg_string(lcfg, 1));
                  return (rc);
          }
-        ec->ec_exp = class_conn2export(&conn);
  
          RETURN(rc);
  }
@@ -1939,18 +1937,17 @@ static int echo_client_cleanup(struct obd_device *obddev)
  }
  
  static int echo_client_connect(const struct lu_env *env,
-                               struct lustre_handle *conn,
+                               struct obd_export **exp,
                                 struct obd_device *src, struct obd_uuid *cluuid,
                                 struct obd_connect_data *data, void *localdata)
  {
-        struct obd_export *exp;
          int                rc;
+        struct lustre_handle conn = { 0 };
  
          ENTRY;
-        rc = class_connect(conn, src, cluuid);
+        rc = class_connect(&conn, src, cluuid);
          if (rc == 0) {
-                exp = class_conn2export(conn);
-                class_export_put(exp);
+                *exp = class_conn2export(&conn);
          }
  
          RETURN (rc);
diff --git a/lustre/obdfilter/Makefile.in b/lustre/obdfilter/Makefile.in

index 2a15c71..c1ebc18 100644 (file)
--- a/lustre/obdfilter/Makefile.in
+++ b/lustre/obdfilter/Makefile.in
@@ -4,4 +4,6 @@ obdfilter-objs := filter.o filter_io.o filter_log.o
  obdfilter-objs += lproc_obdfilter.o filter_lvb.o filter_capa.o
  obdfilter-objs += filter_io_26.o
  
+EXTRA_DIST = $(obdfilter-objs:%.o=%.c) filter_io_26.c filter_internal.h
+
  @INCLUDE_RULES@
diff --git a/lustre/obdfilter/autoMakefile.am b/lustre/obdfilter/autoMakefile.am

index 89490fb..cfef4e9 100644 (file)
--- a/lustre/obdfilter/autoMakefile.am
+++ b/lustre/obdfilter/autoMakefile.am
@@ -39,4 +39,3 @@ modulefs_DATA = obdfilter$(KMODEXT)
  endif
  
  MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
-DIST_SOURCES = $(obdfilter-objs:%.o=%.c) filter_io_26.c filter_internal.h
diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c

index d1678d7..191b3e5 100644 (file)
--- a/lustre/obdfilter/filter.c
+++ b/lustre/obdfilter/filter.c
@@ -202,12 +202,9 @@ static int filter_export_stats_init(struct obd_device *obd,
                                      struct obd_export *exp,
                                      void *client_nid)
  {
-        struct filter_export_data *fed = &exp->exp_filter_data;
          int rc, newnid = 0;
          ENTRY;
  
-        init_brw_stats(&fed->fed_brw_stats);
-
          if (obd_uuid_equals(&exp->exp_client_uuid, &obd->obd_uuid))
                  /* Self-export gets no proc entry */
                  RETURN(0);
@@ -357,12 +354,13 @@ static int filter_client_add(struct obd_device *obd, struct obd_export *exp,
          RETURN(0);
  }
  
+struct lsd_client_data zero_lcd; /* globals are implicitly zeroed */
+
  static int filter_client_free(struct obd_export *exp)
  {
          struct filter_export_data *fed = &exp->exp_filter_data;
          struct filter_obd *filter = &exp->exp_obd->u.filter;
          struct obd_device *obd = exp->exp_obd;
-        struct lsd_client_data zero_lcd;
          struct lvfs_run_ctxt saved;
          int rc;
          loff_t off;
@@ -399,23 +397,26 @@ static int filter_client_free(struct obd_export *exp)
          }
  
          if (!(exp->exp_flags & OBD_OPT_FAILOVER)) {
-                memset(&zero_lcd, 0, sizeof zero_lcd);
+                /* Don't force a sync on disconnect if aborting recovery,
+                 * or it does num_clients * num_osts syncs.  b=17194 */
+                int need_sync = (!exp->exp_libclient || exp->exp_need_sync) &&
+                                 !(exp->exp_flags&OBD_OPT_ABORT_RECOV);
                  push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
                  rc = fsfilt_write_record(obd, filter->fo_rcvd_filp, &zero_lcd,
-                                         sizeof(zero_lcd), &off,
-                                         (!exp->exp_libclient ||
-                                          exp->exp_need_sync));
+                                         sizeof(zero_lcd), &off, 0);
+
+                /* Make sure the server's last_transno is up to date. Do this
+                 * after the client is freed so we know all the client's
+                 * transactions have been committed. */
                  if (rc == 0)
-                        /* update server's transno */
                          filter_update_server_data(obd, filter->fo_rcvd_filp,
-                                                  filter->fo_fsd,
-                                                  !exp->exp_libclient);
+                                                  filter->fo_fsd, need_sync);
                  pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
  
                  CDEBUG(rc == 0 ? D_INFO : D_ERROR,
-                       "zeroing out client %s at idx %u (%llu) in %s rc %d\n",
+                       "zero out client %s at idx %u/%llu in %s %ssync rc %d\n",
                         fed->fed_lcd->lcd_uuid, fed->fed_lr_idx, fed->fed_lr_off,
-                       LAST_RCVD, rc);
+                       LAST_RCVD, need_sync ? "" : "a", rc);
          }
  
          if (!test_and_clear_bit(fed->fed_lr_idx, filter->fo_last_rcvd_slots)) {
@@ -2012,7 +2013,11 @@ int filter_common_setup(struct obd_device *obd, struct lustre_cfg* lcfg,
          if (rc != 0)
                  GOTO(err_ops, rc);
  
-        LASSERT(!lvfs_check_rdonly(lvfs_sbdev(mnt->mnt_sb)));
+        if (lvfs_check_rdonly(lvfs_sbdev(mnt->mnt_sb))) {
+                CERROR("%s: Underlying device is marked as read-only. "
+                       "Setup failed\n", obd->obd_name);
+                GOTO(err_ops, rc = -EROFS);
+        }
  
          /* failover is the default */
          obd->obd_replayable = 1;
@@ -2765,34 +2770,35 @@ static int filter_reconnect(const struct lu_env *env,
  
  /* nearly identical to mds_connect */
  static int filter_connect(const struct lu_env *env,
-                          struct lustre_handle *conn, struct obd_device *obd,
+                          struct obd_export **exp, struct obd_device *obd,
                            struct obd_uuid *cluuid,
                            struct obd_connect_data *data, void *localdata)
  {
          struct lvfs_run_ctxt saved;
-        struct obd_export *exp;
+        struct lustre_handle conn = { 0 };
+        struct obd_export *lexp;
          struct filter_export_data *fed;
          struct lsd_client_data *lcd = NULL;
          __u32 group;
          int rc;
          ENTRY;
  
-        if (conn == NULL || obd == NULL || cluuid == NULL)
+        if (exp == NULL || obd == NULL || cluuid == NULL)
                  RETURN(-EINVAL);
  
-        rc = class_connect(conn, obd, cluuid);
+        rc = class_connect(&conn, obd, cluuid);
          if (rc)
                  RETURN(rc);
-        exp = class_conn2export(conn);
-        LASSERT(exp != NULL);
+        lexp = class_conn2export(&conn);
+        LASSERT(lexp != NULL);
  
-        fed = &exp->exp_filter_data;
+        fed = &lexp->exp_filter_data;
  
-        rc = filter_connect_internal(exp, data);
+        rc = filter_connect_internal(lexp, data);
          if (rc)
                  GOTO(cleanup, rc);
  
-        filter_export_stats_init(obd, exp, localdata);
+        filter_export_stats_init(obd, lexp, localdata);
          if (obd->obd_replayable) {
                  OBD_ALLOC(lcd, sizeof(*lcd));
                  if (!lcd) {
@@ -2802,7 +2808,7 @@ static int filter_connect(const struct lu_env *env,
  
                  memcpy(lcd->lcd_uuid, cluuid, sizeof(lcd->lcd_uuid));
                  fed->fed_lcd = lcd;
-                rc = filter_client_add(obd, exp, -1);
+                rc = filter_client_add(obd, lexp, -1);
                  if (rc)
                          GOTO(cleanup, rc);
          }
@@ -2810,7 +2816,7 @@ static int filter_connect(const struct lu_env *env,
          group = data->ocd_group;
  
          CWARN("%s: Received MDS connection ("LPX64"); group %d\n",
-              obd->obd_name, exp->exp_handle.h_cookie, group);
+              obd->obd_name, lexp->exp_handle.h_cookie, group);
  
          push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
          rc = filter_read_groups(obd, group, 1);
@@ -2828,9 +2834,10 @@ cleanup:
                          OBD_FREE_PTR(lcd);
                          fed->fed_lcd = NULL;
                  }
-                class_disconnect(exp);
+                class_disconnect(lexp);
+                *exp = NULL;
          } else {
-                class_export_put(exp);
+                *exp = lexp;
          }
  
          RETURN(rc);
@@ -3051,6 +3058,8 @@ static int filter_disconnect(struct obd_export *exp)
          /* Flush any remaining cancel messages out to the target */
          filter_sync_llogs(obd, exp);
  
+        lquota_clearinfo(filter_quota_interface_ref, exp, exp->exp_obd);
+
          /* Disconnect early so that clients can't keep using export */
          rc = class_disconnect(exp);
          if (exp->exp_obd->obd_namespace != NULL)
@@ -4327,9 +4336,11 @@ static int filter_set_info_async(struct obd_export *exp, __u32 keylen,
          obd->u.filter.fo_mdc_conn.cookie = exp->exp_handle.h_cookie;
  
          /* setup llog imports */
-        LASSERT(val != NULL);
+        if (val != NULL)
+                group = (int)(*(__u32 *)val);
+        else
+                group = 0; /* default value */
  
-        group = (int)(*(__u32 *)val);
          LASSERT_MDS_GROUP(group);
          rc = filter_setup_llog_group(exp, obd, group);
          if (rc)
diff --git a/lustre/obdfilter/filter_io_26.c b/lustre/obdfilter/filter_io_26.c

index 960b097..68b2ef4 100644 (file)
--- a/lustre/obdfilter/filter_io_26.c
+++ b/lustre/obdfilter/filter_io_26.c
@@ -80,20 +80,32 @@ static void record_start_io(struct filter_iobuf *iobuf, int rw, int size,
                  atomic_inc(&filter->fo_r_in_flight);
                  lprocfs_oh_tally(&filter->fo_filter_stats.hist[BRW_R_RPC_HIST],
                                   atomic_read(&filter->fo_r_in_flight));
-                lprocfs_oh_tally_log2(&filter->fo_filter_stats.hist[BRW_R_DISK_IOSIZE],
+                lprocfs_oh_tally_log2(&filter->
+                                       fo_filter_stats.hist[BRW_R_DISK_IOSIZE],
                                        size);
-                lprocfs_oh_tally(&exp->exp_filter_data.fed_brw_stats.hist[BRW_R_RPC_HIST],
-                                 atomic_read(&filter->fo_r_in_flight));
-                lprocfs_oh_tally_log2(&exp->exp_filter_data.fed_brw_stats.hist[BRW_R_DISK_IOSIZE], size);
+                if (exp->exp_nid_stats && exp->exp_nid_stats->nid_brw_stats) {
+                        lprocfs_oh_tally(&exp->exp_nid_stats->nid_brw_stats->
+                                          hist[BRW_R_RPC_HIST],
+                                         atomic_read(&filter->fo_r_in_flight));
+                        lprocfs_oh_tally_log2(&exp->exp_nid_stats->
+                                         nid_brw_stats->hist[BRW_R_DISK_IOSIZE],
+                                              size);
+                }
          } else {
                  atomic_inc(&filter->fo_w_in_flight);
                  lprocfs_oh_tally(&filter->fo_filter_stats.hist[BRW_W_RPC_HIST],
                                   atomic_read(&filter->fo_w_in_flight));
-                lprocfs_oh_tally_log2(&filter->fo_filter_stats.hist[BRW_W_DISK_IOSIZE],
+                lprocfs_oh_tally_log2(&filter->
+                                       fo_filter_stats.hist[BRW_W_DISK_IOSIZE],
                                        size);
-                lprocfs_oh_tally(&exp->exp_filter_data.fed_brw_stats.hist[BRW_W_RPC_HIST],
-                                 atomic_read(&filter->fo_w_in_flight));
-                lprocfs_oh_tally_log2(&exp->exp_filter_data.fed_brw_stats.hist[BRW_W_DISK_IOSIZE], size);
+                if (exp->exp_nid_stats && exp->exp_nid_stats->nid_brw_stats) {
+                        lprocfs_oh_tally(&exp->exp_nid_stats->nid_brw_stats->
+                                          hist[BRW_W_RPC_HIST],
+                                         atomic_read(&filter->fo_w_in_flight));
+                        lprocfs_oh_tally_log2(&exp->exp_nid_stats->
+                                        nid_brw_stats->hist[BRW_W_DISK_IOSIZE],
+                                              size);
+                }
          }
  }
  
@@ -405,31 +417,32 @@ int filter_do_bio(struct obd_export *exp, struct inode *inode,
          wait_event(iobuf->dr_wait, atomic_read(&iobuf->dr_numreqs) == 0);
  
          if (rw == OBD_BRW_READ) {
-                lprocfs_oh_tally(&obd->u.filter.fo_filter_stats.hist[BRW_R_DIO_FRAGS],
-                                 frags);
-                lprocfs_oh_tally(&exp->exp_filter_data.fed_brw_stats.hist[BRW_R_DIO_FRAGS],
+                lprocfs_oh_tally(&obd->u.filter.fo_filter_stats.
+                                  hist[BRW_R_DIO_FRAGS],
                                   frags);
-                lprocfs_oh_tally_log2(&obd->u.filter.fo_filter_stats.hist[BRW_R_IO_TIME],
+                lprocfs_oh_tally_log2(&obd->u.filter.
+                                       fo_filter_stats.hist[BRW_R_IO_TIME],
                                        jiffies - start_time);
-                lprocfs_oh_tally_log2(&exp->exp_filter_data.fed_brw_stats.hist[BRW_R_IO_TIME], jiffies - start_time);
                  if (exp->exp_nid_stats && exp->exp_nid_stats->nid_brw_stats) {
-                        lprocfs_oh_tally(&exp->exp_nid_stats->nid_brw_stats->hist[BRW_R_DIO_FRAGS],
+                        lprocfs_oh_tally(&exp->exp_nid_stats->nid_brw_stats->
+                                          hist[BRW_R_DIO_FRAGS],
                                           frags);
-                        lprocfs_oh_tally_log2(&exp->exp_nid_stats->nid_brw_stats->hist[BRW_R_IO_TIME],
+                        lprocfs_oh_tally_log2(&exp->exp_nid_stats->
+                                             nid_brw_stats->hist[BRW_R_IO_TIME],
                                                jiffies - start_time);
                  }
          } else {
-                lprocfs_oh_tally(&obd->u.filter.fo_filter_stats.hist[BRW_W_DIO_FRAGS],
-                                 frags);
-                lprocfs_oh_tally(&exp->exp_filter_data.fed_brw_stats.hist[BRW_W_DIO_FRAGS],
-                                 frags);
-                lprocfs_oh_tally_log2(&obd->u.filter.fo_filter_stats.hist[BRW_W_IO_TIME],
+                lprocfs_oh_tally(&obd->u.filter.fo_filter_stats.
+                                  hist[BRW_W_DIO_FRAGS], frags);
+                lprocfs_oh_tally_log2(&obd->u.filter.fo_filter_stats.
+                                       hist[BRW_W_IO_TIME],
                                        jiffies - start_time);
-                lprocfs_oh_tally_log2(&exp->exp_filter_data.fed_brw_stats.hist[BRW_W_IO_TIME], jiffies - start_time);
                  if (exp->exp_nid_stats && exp->exp_nid_stats->nid_brw_stats) {
-                        lprocfs_oh_tally(&exp->exp_nid_stats->nid_brw_stats->hist[BRW_W_DIO_FRAGS],
+                        lprocfs_oh_tally(&exp->exp_nid_stats->nid_brw_stats->
+                                          hist[BRW_W_DIO_FRAGS],
                                           frags);
-                        lprocfs_oh_tally_log2(&exp->exp_nid_stats->nid_brw_stats->hist[BRW_W_IO_TIME],
+                        lprocfs_oh_tally_log2(&exp->exp_nid_stats->
+                                             nid_brw_stats->hist[BRW_W_IO_TIME],
                                                jiffies - start_time);
                  }
          }
diff --git a/lustre/obdfilter/lproc_obdfilter.c b/lustre/obdfilter/lproc_obdfilter.c

index 5e5f1c7..7d4da23 100644 (file)
--- a/lustre/obdfilter/lproc_obdfilter.c
+++ b/lustre/obdfilter/lproc_obdfilter.c
@@ -381,7 +381,6 @@ void filter_tally(struct obd_export *exp, struct page **pages, int nr_pages,
                    unsigned long *blocks, int blocks_per_page, int wr)
  {
          struct filter_obd *filter = &exp->exp_obd->u.filter;
-        struct filter_export_data *fed = &exp->exp_filter_data;
          struct page *last_page = NULL;
          unsigned long *last_block = NULL;
          unsigned long discont_pages = 0;
@@ -393,8 +392,6 @@ void filter_tally(struct obd_export *exp, struct page **pages, int nr_pages,
  
          lprocfs_oh_tally_log2(&filter->fo_filter_stats.hist[BRW_R_PAGES + wr],
                                nr_pages);
-        lprocfs_oh_tally_log2(&fed->fed_brw_stats.hist[BRW_R_PAGES + wr],
-                              nr_pages);
          if (exp->exp_nid_stats && exp->exp_nid_stats->nid_brw_stats)
                  lprocfs_oh_tally_log2(&exp->exp_nid_stats->nid_brw_stats->
                                          hist[BRW_R_PAGES + wr], nr_pages);
@@ -413,12 +410,8 @@ void filter_tally(struct obd_export *exp, struct page **pages, int nr_pages,
  
          lprocfs_oh_tally(&filter->fo_filter_stats.hist[BRW_R_DISCONT_PAGES +wr],
                           discont_pages);
-        lprocfs_oh_tally(&fed->fed_brw_stats.hist[BRW_R_DISCONT_PAGES + wr],
-                         discont_pages);
          lprocfs_oh_tally(&filter->fo_filter_stats.hist[BRW_R_DISCONT_BLOCKS+wr],
                           discont_blocks);
-        lprocfs_oh_tally(&fed->fed_brw_stats.hist[BRW_R_DISCONT_BLOCKS + wr],
-                         discont_blocks);
  
          if (exp->exp_nid_stats && exp->exp_nid_stats->nid_brw_stats) {
                  lprocfs_oh_tally_log2(&exp->exp_nid_stats->nid_brw_stats->
@@ -546,30 +539,6 @@ int lproc_filter_attach_seqstat(struct obd_device *dev)
                                        &filter_brw_stats_fops, dev);
  }
  
-static int filter_per_export_stats_seq_show(struct seq_file *seq, void *v)
-{
-        struct filter_export_data *fed = seq->private;
-
-        brw_stats_show(seq, &fed->fed_brw_stats);
-
-        return 0;
-}
-
-static ssize_t filter_per_export_stats_seq_write(struct file *file,
-                                       const char *buf, size_t len, loff_t *off)
-{
-        struct seq_file *seq = file->private_data;
-        struct filter_export_data *fed = seq->private;
-        int i;
-
-        for (i = 0; i < BRW_LAST; i++)
-                lprocfs_oh_clear(&fed->fed_brw_stats.hist[i]);
-
-        return len;
-}
-
-LPROC_SEQ_FOPS(filter_per_export_stats);
-
  void lprocfs_filter_init_vars(struct lprocfs_static_vars *lvars)
  {
      lvars->module_vars  = lprocfs_filter_module_vars;
@@ -578,10 +547,10 @@ void lprocfs_filter_init_vars(struct lprocfs_static_vars *lvars)
  
  static int filter_per_nid_stats_seq_show(struct seq_file *seq, void *v)
  {
-        nid_stat_t *tmp = seq->private;
+        nid_stat_t * stat = seq->private;
  
-        if (tmp->nid_brw_stats)
-                brw_stats_show(seq, tmp->nid_brw_stats);
+        if (stat->nid_brw_stats)
+                brw_stats_show(seq, stat->nid_brw_stats);
  
          return 0;
  }
@@ -590,13 +559,13 @@ static ssize_t filter_per_nid_stats_seq_write(struct file *file,
                                                const char *buf, size_t len,
                                                loff_t *off)
  {
-        struct seq_file *seq = file->private_data;
-        nid_stat_t *tmp = seq->private;
+        struct seq_file *seq  = file->private_data;
+        nid_stat_t      *stat = seq->private;
          int i;
  
-        if (tmp->nid_brw_stats)
+        if (stat->nid_brw_stats)
                  for (i = 0; i < BRW_LAST; i++)
-                        lprocfs_oh_clear(&tmp->nid_brw_stats->hist[i]);
+                        lprocfs_oh_clear(&stat->nid_brw_stats->hist[i]);
  
          return len;
  }
diff --git a/lustre/osc/Makefile.in b/lustre/osc/Makefile.in

index 438ce4c..40ffa16 100644 (file)
--- a/lustre/osc/Makefile.in
+++ b/lustre/osc/Makefile.in
@@ -1,4 +1,6 @@
  MODULES := osc
  osc-objs := osc_request.o lproc_osc.o osc_create.o osc_dev.o osc_object.o osc_page.o osc_lock.o osc_io.o
  
+EXTRA_DIST = $(osc-objs:%.o=%.c) osc_internal.h osc_cl_internal.h
+
  @INCLUDE_RULES@
diff --git a/lustre/osc/autoMakefile.am b/lustre/osc/autoMakefile.am

index cf370ba..59c37b2 100644 (file)
--- a/lustre/osc/autoMakefile.am
+++ b/lustre/osc/autoMakefile.am
@@ -75,4 +75,3 @@ endif
  install-data-hook: $(install_data_hook)
  
  MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
-DIST_SOURCES = $(osc-objs:%.o=%.c) osc_internal.h osc_cl_internal.h
diff --git a/lustre/osc/osc_lock.c b/lustre/osc/osc_lock.c

index 8f2b2b5..247c044 100644 (file)
--- a/lustre/osc/osc_lock.c
+++ b/lustre/osc/osc_lock.c
@@ -312,6 +312,8 @@ static void osc_ast_data_put(const struct lu_env *env, struct osc_lock *olck)
   *
   * This can be optimized to not update attributes when lock is a result of a
   * local match.
+ *
+ * Called under lock and resource spin-locks.
   */
  static void osc_lock_lvb_update(const struct lu_env *env, struct osc_lock *olck,
                                  int rc)
@@ -344,6 +346,8 @@ static void osc_lock_lvb_update(const struct lu_env *env, struct osc_lock *olck,
                  dlmlock = olck->ols_lock;
                  LASSERT(dlmlock != NULL);
  
+                /* re-grab LVB from a dlm lock under DLM spin-locks. */
+                *lvb = *(struct ost_lvb *)dlmlock->l_lvb_data;
                  size = lvb->lvb_size;
                  /* Extend KMS up to the end of this lock and no further
                   * A lock on [x,y] means a KMS of up to y + 1 bytes! */
@@ -360,7 +364,7 @@ static void osc_lock_lvb_update(const struct lu_env *env, struct osc_lock *olck,
                                     lvb->lvb_size, oinfo->loi_kms,
                                     dlmlock->l_policy_data.l_extent.end);
                  }
-                ldlm_lock_allow_match(dlmlock);
+                ldlm_lock_allow_match_locked(dlmlock);
          } else if (rc == -ENAVAIL && olck->ols_glimpse) {
                  CDEBUG(D_INODE, "glimpsed, setting rss="LPU64"; leaving"
                         " kms="LPU64"\n", lvb->lvb_size, oinfo->loi_kms);
@@ -375,6 +379,13 @@ static void osc_lock_lvb_update(const struct lu_env *env, struct osc_lock *olck,
          EXIT;
  }
  
+/**
+ * Called when a lock is granted, from an upcall (when server returned a
+ * granted lock), or from completion AST, when server returned a blocked lock.
+ *
+ * Called under lock and resource spin-locks, that are released temporarily
+ * here.
+ */
  static void osc_lock_granted(const struct lu_env *env, struct osc_lock *olck,
                               struct ldlm_lock *dlmlock, int rc)
  {
@@ -399,11 +410,19 @@ static void osc_lock_granted(const struct lu_env *env, struct osc_lock *olck,
                   * tell upper layers the extent of the lock that was actually
                   * granted
                   */
-                cl_lock_modify(env, lock, descr);
                  LINVRNT(osc_lock_invariant(olck));
                  olck->ols_state = OLS_GRANTED;
                  osc_lock_lvb_update(env, olck, rc);
+
+                /* release DLM spin-locks to allow cl_lock_{modify,signal}()
+                 * to take a semaphore on a parent lock. This is safe, because
+                 * spin-locks are needed to protect consistency of
+                 * dlmlock->l_*_mode and LVB, and we have finished processing
+                 * them. */
+                unlock_res_and_lock(dlmlock);
+                cl_lock_modify(env, lock, descr);
                  cl_lock_signal(env, lock);
+                lock_res_and_lock(dlmlock);
          }
          EXIT;
  }
@@ -424,7 +443,6 @@ static void osc_lock_upcall0(const struct lu_env *env, struct osc_lock *olck)
          LASSERT(olck->ols_lock == NULL);
          olck->ols_lock = dlmlock;
          spin_unlock(&osc_ast_guard);
-        unlock_res_and_lock(dlmlock);
  
          /*
           * Lock might be not yet granted. In this case, completion ast
@@ -433,6 +451,8 @@ static void osc_lock_upcall0(const struct lu_env *env, struct osc_lock *olck)
           */
          if (dlmlock->l_granted_mode == dlmlock->l_req_mode)
                  osc_lock_granted(env, olck, dlmlock, 0);
+        unlock_res_and_lock(dlmlock);
+
          /*
           * osc_enqueue_interpret() decrefs asynchronous locks, counter
           * this.
@@ -751,6 +771,7 @@ static int osc_ldlm_completion_ast(struct ldlm_lock *dlmlock,
                           * to lock->l_lvb_data, store it in osc_lock.
                           */
                          LASSERT(dlmlock->l_lvb_data != NULL);
+                        lock_res_and_lock(dlmlock);
                          olck->ols_lvb = *(struct ost_lvb *)dlmlock->l_lvb_data;
                          if (olck->ols_lock == NULL)
                                  /*
@@ -767,6 +788,7 @@ static int osc_ldlm_completion_ast(struct ldlm_lock *dlmlock,
                                  osc_lock_granted(env, olck, dlmlock, dlmrc);
                          if (dlmrc != 0)
                                  cl_lock_error(env, lock, dlmrc);
+                        unlock_res_and_lock(dlmlock);
                          cl_lock_mutex_put(env, lock);
                          osc_ast_data_put(env, olck);
                          result = 0;
@@ -1038,6 +1060,7 @@ static void osc_lock_to_lockless(const struct lu_env *env,
                          slice->cls_ops = &osc_lock_lockless_ops;
                  }
          }
+        LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols)));
  }
  
  /**
@@ -1273,7 +1296,7 @@ static int osc_lock_enqueue(const struct lu_env *env,
                          ols->ols_state = OLS_GRANTED;
                  }
          }
-
+        LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols)));
          RETURN(result);
  }
  
diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c

index 1812828..1f7e465 100644 (file)
--- a/lustre/osc/osc_request.c
+++ b/lustre/osc/osc_request.c
@@ -1175,7 +1175,7 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa,
          /* size[REQ_REC_OFF] still sizeof (*body) */
          if (opc == OST_WRITE) {
                  if (unlikely(cli->cl_checksum) &&
-                    req->rq_flvr.sf_bulk_hash == BULK_HASH_ALG_NULL) {
+                    !sptlrpc_flavor_has_bulk(&req->rq_flvr)) {
                          /* store cl_cksum_type in a local variable since
                           * it can be changed via lprocfs */
                          cksum_type_t cksum_type = cli->cl_cksum_type;
@@ -1204,7 +1204,7 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa,
                                       sizeof(__u32) * niocount);
          } else {
                  if (unlikely(cli->cl_checksum) &&
-                    req->rq_flvr.sf_bulk_hash == BULK_HASH_ALG_NULL) {
+                    !sptlrpc_flavor_has_bulk(&req->rq_flvr)) {
                          if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0)
                                  body->oa.o_flags = 0;
                          body->oa.o_flags |= cksum_type_pack(cli->cl_cksum_type);
@@ -1331,6 +1331,9 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc)
                  }
                  LASSERT(req->rq_bulk->bd_nob == aa->aa_requested_nob);
  
+                if (sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk))
+                        RETURN(-EAGAIN);
+
                  if ((aa->aa_oa->o_valid & OBD_MD_FLCKSUM) && client_cksum &&
                      check_write_checksum(&body->oa, peer, client_cksum,
                                           body->oa.o_cksum, aa->aa_requested_nob,
@@ -1338,15 +1341,17 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc)
                                           cksum_type_unpack(aa->aa_oa->o_flags)))
                          RETURN(-EAGAIN);
  
-                if (sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk))
-                        RETURN(-EAGAIN);
-
                  rc = check_write_rcs(req, aa->aa_requested_nob,aa->aa_nio_count,
                                       aa->aa_page_count, aa->aa_ppga);
                  GOTO(out, rc);
          }
  
          /* The rest of this function executes only for OST_READs */
+
+        rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc);
+        if (rc < 0)
+                GOTO(out, rc);
+
          if (rc > aa->aa_requested_nob) {
                  CERROR("Unexpected rc %d (%d requested)\n", rc,
                         aa->aa_requested_nob);
@@ -1362,10 +1367,6 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc)
          if (rc < aa->aa_requested_nob)
                  handle_short_read(rc, aa->aa_page_count, aa->aa_ppga);
  
-        if (sptlrpc_cli_unwrap_bulk_read(req, rc, aa->aa_page_count,
-                                         aa->aa_ppga))
-                GOTO(out, rc = -EAGAIN);
-
          if (body->oa.o_valid & OBD_MD_FLCKSUM) {
                  static int cksum_counter;
                  __u32      server_cksum = body->oa.o_cksum;
diff --git a/lustre/osd/osd_handler.c b/lustre/osd/osd_handler.c

index 97452a6..363b81a 100644 (file)
--- a/lustre/osd/osd_handler.c
+++ b/lustre/osd/osd_handler.c
@@ -3033,9 +3033,10 @@ static void osd_it_ea_fini(const struct lu_env *env, struct dt_it *di)
  {
          struct osd_it_ea     *it   = (struct osd_it_ea *)di;
          struct osd_object    *obj  = it->oie_obj;
-
+        struct inode       *inode  = obj->oo_inode;
  
          ENTRY;
+        it->oie_file.f_op->release(inode, &it->oie_file);
          lu_object_put(env, &obj->oo_dt.do_lu);
          EXIT;
  }
@@ -3088,8 +3089,6 @@ static int osd_ldiskfs_filldir(char *buf, const char *name, int namelen,
  {
          struct osd_it_ea   *it     = (struct osd_it_ea *)buf;
          struct dirent64    *dirent = &it->oie_dirent64;
-        int                 reclen = LDISKFS_DIR_REC_LEN(namelen);
-
  
          ENTRY;
          if (it->oie_namelen)
@@ -3101,8 +3100,6 @@ static int osd_ldiskfs_filldir(char *buf, const char *name, int namelen,
          strncpy(dirent->d_name, name, LDISKFS_NAME_LEN);
          dirent->d_name[namelen] = 0;
          dirent->d_ino           = ino;
-        dirent->d_off           = offset;
-        dirent->d_reclen        = reclen;
          it->oie_namelen         = namelen;
          it->oie_curr_pos        = offset;
  
@@ -3134,7 +3131,7 @@ int osd_ldiskfs_it_fill(const struct dt_it *di)
  
          it->oie_next_pos = it->oie_file.f_pos;
  
-        if(!result && it->oie_namelen == 0)
+        if (it->oie_namelen == 0)
                  result = -EIO;
  
          RETURN(result);
@@ -3232,6 +3229,8 @@ static struct dt_rec *osd_it_ea_rec(const struct lu_env *env,
          }
  
          rc = osd_ea_fid_get(env, dentry, (struct dt_rec*) rec);
+        if (rc != 0)
+                rec = ERR_PTR(rc);
  
          iput(inode);
          RETURN((struct dt_rec *)rec);
@@ -3270,7 +3269,7 @@ static int osd_it_ea_load(const struct lu_env *env,
          int rc;
  
          ENTRY;
-        it->oie_curr_pos = it->oie_next_pos = hash;
+        it->oie_curr_pos = hash;
  
          rc =  osd_ldiskfs_it_fill(di);
          if (rc == 0)
diff --git a/lustre/ost/Makefile.in b/lustre/ost/Makefile.in

index 99002e4..6bd8be3 100644 (file)
--- a/lustre/ost/Makefile.in
+++ b/lustre/ost/Makefile.in
@@ -1,4 +1,6 @@
  MODULES := ost
  ost-objs := ost_handler.o lproc_ost.o
  
+EXTRA_DIST = $(ost-objs:%.o=%.c) ost_internal.h
+
  @INCLUDE_RULES@
diff --git a/lustre/ost/autoMakefile.am b/lustre/ost/autoMakefile.am

index 8db3fe4..907a0e0 100644 (file)
--- a/lustre/ost/autoMakefile.am
+++ b/lustre/ost/autoMakefile.am
@@ -39,4 +39,3 @@ modulefs_DATA = ost$(KMODEXT)
  endif
  
  MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
-DIST_SOURCES = $(ost-objs:%.o=%.c) ost_internal.h
diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c

index 28430a4..cb520bb 100644 (file)
--- a/lustre/ost/ost_handler.c
+++ b/lustre/ost/ost_handler.c
@@ -751,9 +751,9 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
                  if (exp->exp_failed)
                          rc = -ENOTCONN;
                  else {
-                        sptlrpc_svc_wrap_bulk(req, desc);
-
-                        rc = ptlrpc_start_bulk_transfer(desc);
+                        rc = sptlrpc_svc_wrap_bulk(req, desc);
+                        if (rc == 0)
+                                rc = ptlrpc_start_bulk_transfer(desc);
                  }
  
                  if (rc == 0) {
@@ -978,6 +978,10 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
                                        local_nb[i].offset & ~CFS_PAGE_MASK,
                                        local_nb[i].len);
  
+        rc = sptlrpc_svc_prep_bulk(req, desc);
+        if (rc != 0)
+                GOTO(out_lock, rc);
+
          /* Check if client was evicted while we were doing i/o before touching
             network */
          if (desc->bd_export->exp_failed)
@@ -1012,23 +1016,18 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
                          DEBUG_REQ(D_ERROR, req, "Eviction on bulk GET");
                          rc = -ENOTCONN;
                          ptlrpc_abort_bulk(desc);
-                } else if (!desc->bd_success ||
-                           desc->bd_nob_transferred != desc->bd_nob) {
-                        DEBUG_REQ(D_ERROR, req, "%s bulk GET %d(%d)",
-                                  desc->bd_success ?
-                                  "truncated" : "network error on",
-                                  desc->bd_nob_transferred, desc->bd_nob);
+                } else if (!desc->bd_success) {
+                        DEBUG_REQ(D_ERROR, req, "network error on bulk GET");
                          /* XXX should this be a different errno? */
                          rc = -ETIMEDOUT;
+                } else {
+                        rc = sptlrpc_svc_unwrap_bulk(req, desc);
                  }
          } else {
                  DEBUG_REQ(D_ERROR, req, "ptlrpc_bulk_get failed: rc %d", rc);
          }
          no_reply = rc != 0;
  
-        if (rc == 0)
-                sptlrpc_svc_unwrap_bulk(req, desc);
-
          repbody = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
                                   sizeof(*repbody));
          memcpy(&repbody->oa, &body->oa, sizeof(repbody->oa));
@@ -1606,6 +1605,11 @@ static int ost_rw_hpreq_lock_match(struct ptlrpc_request *req,
          end = (nb[ioo->ioo_bufcnt - 1].offset +
                 nb[ioo->ioo_bufcnt - 1].len - 1) | ~CFS_PAGE_MASK;
  
+        LASSERT(lock->l_resource != NULL);
+        if (!osc_res_name_eq(ioo->ioo_id, ioo->ioo_gr, 
+                             &lock->l_resource->lr_name))
+                RETURN(0);
+
          if (!(lock->l_granted_mode & mode))
                  RETURN(0);
  
diff --git a/lustre/ptlrpc/Makefile.in b/lustre/ptlrpc/Makefile.in

index aee4786..3660c7f 100644 (file)
--- a/lustre/ptlrpc/Makefile.in
+++ b/lustre/ptlrpc/Makefile.in
@@ -30,6 +30,7 @@ l_lock.c: @LUSTRE@/ldlm/l_lock.c
  interval_tree.c: @LUSTRE@/ldlm/interval_tree.c
         ln -sf $< $@
  
+EXTRA_DIST = $(ptlrpc_objs:.o=.c) ptlrpc_internal.h
  EXTRA_PRE_CFLAGS := -I@LUSTRE@/ldlm
  
  @INCLUDE_RULES@
diff --git a/lustre/ptlrpc/autoMakefile.am b/lustre/ptlrpc/autoMakefile.am

index c0d8c41..35577b5 100644 (file)
--- a/lustre/ptlrpc/autoMakefile.am
+++ b/lustre/ptlrpc/autoMakefile.am
@@ -117,5 +117,4 @@ SUBDIRS = gss
  endif
  
  install-data-hook: $(install_data_hook)
-DIST_SOURCES = $(ptlrpc_objs:.o=.c) ptlrpc_internal.h
  MOSTLYCLEANFILES := @MOSTLYCLEANFILES@  ldlm_*.c l_lock.c interval_tree.c
diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c

index 75f365a..1188d41 100644 (file)
--- a/lustre/ptlrpc/client.c
+++ b/lustre/ptlrpc/client.c
@@ -1427,7 +1427,7 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
                  spin_unlock(&imp->imp_lock);
  
                  set->set_remaining--;
-                cfs_waitq_signal(&imp->imp_recovery_waitq);
+                cfs_waitq_broadcast(&imp->imp_recovery_waitq);
          }
  
          /* If we hit an error, we want to recover promptly. */
@@ -2299,7 +2299,7 @@ after_send:
  
          LASSERT(!req->rq_receiving_reply);
          ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
-        cfs_waitq_signal(&imp->imp_recovery_waitq);
+        cfs_waitq_broadcast(&imp->imp_recovery_waitq);
          RETURN(rc);
  }
  
diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c

index 2d20ab7..92cdd7b 100644 (file)
--- a/lustre/ptlrpc/events.c
+++ b/lustre/ptlrpc/events.c
@@ -194,7 +194,9 @@ void client_bulk_callback (lnet_event_t *ev)
                  desc->bd_sender = ev->sender;
          }
  
-        sptlrpc_enc_pool_put_pages(desc);
+        /* release the encrypted pages for write */
+        if (desc->bd_req->rq_bulk_write)
+                sptlrpc_enc_pool_put_pages(desc);
  
          /* NB don't unlock till after wakeup; desc can disappear under us
           * otherwise */
diff --git a/lustre/ptlrpc/gss/gss_api.h b/lustre/ptlrpc/gss/gss_api.h

index 11b1c37..3b20c99 100644 (file)
--- a/lustre/ptlrpc/gss/gss_api.h
+++ b/lustre/ptlrpc/gss/gss_api.h
@@ -51,11 +51,15 @@ __u32 lgss_get_mic(
                  struct gss_ctx          *ctx,
                  int                      msgcnt,
                  rawobj_t                *msgs,
+                int                      iovcnt,
+                lnet_kiov_t             *iovs,
                  rawobj_t                *mic_token);
  __u32 lgss_verify_mic(
                  struct gss_ctx          *ctx,
                  int                      msgcnt,
                  rawobj_t                *msgs,
+                int                      iovcnt,
+                lnet_kiov_t             *iovs,
                  rawobj_t                *mic_token);
  __u32 lgss_wrap(
                  struct gss_ctx          *ctx,
@@ -68,12 +72,18 @@ __u32 lgss_unwrap(
                  rawobj_t                *gsshdr,
                  rawobj_t                *token,
                  rawobj_t                *out_msg);
-__u32 lgss_plain_encrypt(
-                struct gss_ctx          *ctx,
-                int                      decrypt,
-                int                      length,
-                void                    *in_buf,
-                void                    *out_buf);
+__u32 lgss_prep_bulk(
+                struct gss_ctx          *gctx,
+                struct ptlrpc_bulk_desc *desc);
+__u32 lgss_wrap_bulk(
+                struct gss_ctx          *gctx,
+                struct ptlrpc_bulk_desc *desc,
+                rawobj_t                *token,
+                int                      adj_nob);
+__u32 lgss_unwrap_bulk(
+                struct gss_ctx          *gctx,
+                struct ptlrpc_bulk_desc *desc,
+                rawobj_t                *token);
  __u32 lgss_delete_sec_context(
                  struct gss_ctx         **ctx);
  int lgss_display(
@@ -115,11 +125,15 @@ struct gss_api_ops {
                          struct gss_ctx         *ctx,
                          int                     msgcnt,
                          rawobj_t               *msgs,
+                        int                     iovcnt,
+                        lnet_kiov_t            *iovs,
                          rawobj_t               *mic_token);
          __u32 (*gss_verify_mic)(
                          struct gss_ctx         *ctx,
                          int                     msgcnt,
                          rawobj_t               *msgs,
+                        int                     iovcnt,
+                        lnet_kiov_t            *iovs,
                          rawobj_t               *mic_token);
          __u32 (*gss_wrap)(
                          struct gss_ctx         *ctx,
@@ -132,12 +146,18 @@ struct gss_api_ops {
                          rawobj_t               *gsshdr,
                          rawobj_t               *token,
                          rawobj_t               *out_msg);
-        __u32 (*gss_plain_encrypt)(
-                        struct gss_ctx         *ctx,
-                        int                     decrypt,
-                        int                     length,
-                        void                   *in_buf,
-                        void                   *out_buf);
+        __u32 (*gss_prep_bulk)(
+                        struct gss_ctx         *gctx,
+                        struct ptlrpc_bulk_desc *desc);
+        __u32 (*gss_wrap_bulk)(
+                        struct gss_ctx         *gctx,
+                        struct ptlrpc_bulk_desc *desc,
+                        rawobj_t               *token,
+                        int                     adj_nob);
+        __u32 (*gss_unwrap_bulk)(
+                        struct gss_ctx         *gctx,
+                        struct ptlrpc_bulk_desc *desc,
+                        rawobj_t               *token);
          void (*gss_delete_sec_context)(
                          void                   *ctx);
          int  (*gss_display)(
diff --git a/lustre/ptlrpc/gss/gss_bulk.c b/lustre/ptlrpc/gss/gss_bulk.c

index 03fd0ce..f8723f5 100644 (file)
--- a/lustre/ptlrpc/gss/gss_bulk.c
+++ b/lustre/ptlrpc/gss/gss_bulk.c
@@ -67,391 +67,26 @@
  #include "gss_internal.h"
  #include "gss_api.h"
  
-static __u8 zero_iv[CIPHER_MAX_BLKSIZE] = { 0, };
-
-static void buf_to_sl(struct scatterlist *sl,
-                      void *buf, unsigned int len)
-{
-        sl->page = virt_to_page(buf);
-        sl->offset = offset_in_page(buf);
-        sl->length = len;
-}
-
-/*
- * CTS CBC encryption:
- * 1. X(n-1) = P(n-1)
- * 2. E(n-1) = Encrypt(K, X(n-1))
- * 3. C(n)   = HEAD(E(n-1))
- * 4. P      = P(n) | 0
- * 5. D(n)   = E(n-1) XOR P
- * 6. C(n-1) = Encrypt(K, D(n))
- *
- * CTS encryption using standard CBC interface:
- * 1. pad the last partial block with 0.
- * 2. do CBC encryption.
- * 3. swap the last two ciphertext blocks.
- * 4. truncate to original plaintext size.
- */
-static int cbc_cts_encrypt(struct ll_crypto_cipher *tfm,
-                           struct scatterlist      *sld,
-                           struct scatterlist      *sls)
-{
-        struct scatterlist      slst, sldt;
-        struct blkcipher_desc   desc;
-        void                   *data;
-        __u8                    sbuf[CIPHER_MAX_BLKSIZE];
-        __u8                    dbuf[CIPHER_MAX_BLKSIZE];
-        unsigned int            blksize, blks, tail;
-        int                     rc;
-
-        blksize = ll_crypto_blkcipher_blocksize(tfm);
-        blks = sls->length / blksize;
-        tail = sls->length % blksize;
-        LASSERT(blks > 0 && tail > 0);
-
-        /* pad tail block with 0, copy to sbuf */
-        data = cfs_kmap(sls->page);
-        memcpy(sbuf, data + sls->offset + blks * blksize, tail);
-        memset(sbuf + tail, 0, blksize - tail);
-        cfs_kunmap(sls->page);
-
-        buf_to_sl(&slst, sbuf, blksize);
-        buf_to_sl(&sldt, dbuf, blksize);
-        desc.tfm   = tfm;
-        desc.flags = 0;
-
-        /* encrypt head */
-        rc = ll_crypto_blkcipher_encrypt(&desc, sld, sls, sls->length - tail);
-        if (unlikely(rc)) {
-                CERROR("encrypt head (%u) data: %d\n", sls->length - tail, rc);
-                return rc;
-        }
-        /* encrypt tail */
-        rc = ll_crypto_blkcipher_encrypt(&desc, &sldt, &slst, blksize);
-        if (unlikely(rc)) {
-                CERROR("encrypt tail (%u) data: %d\n", slst.length, rc);
-                return rc;
-        }
-
-        /* swab C(n) and C(n-1), if n == 1, then C(n-1) is the IV */
-        data = cfs_kmap(sld->page);
-
-        memcpy(data + sld->offset + blks * blksize,
-               data + sld->offset + (blks - 1) * blksize, tail);
-        memcpy(data + sld->offset + (blks - 1) * blksize, dbuf, blksize);
-        cfs_kunmap(sld->page);
-
-        return 0;
-}
-
-/*
- * CTS CBC decryption:
- * 1. D(n)   = Decrypt(K, C(n-1))
- * 2. C      = C(n) | 0
- * 3. X(n)   = D(n) XOR C
- * 4. P(n)   = HEAD(X(n))
- * 5. E(n-1) = C(n) | TAIL(X(n))
- * 6. X(n-1) = Decrypt(K, E(n-1))
- * 7. P(n-1) = X(n-1) XOR C(n-2)
- *
- * CTS decryption using standard CBC interface:
- * 1. D(n)   = Decrypt(K, C(n-1))
- * 2. C(n)   = C(n) | TAIL(D(n))
- * 3. swap the last two ciphertext blocks.
- * 4. do CBC decryption.
- * 5. truncate to original ciphertext size.
- */
-static int cbc_cts_decrypt(struct ll_crypto_cipher *tfm,
-                           struct scatterlist *sld,
-                           struct scatterlist *sls)
-{
-        struct blkcipher_desc   desc;
-        struct scatterlist      slst, sldt;
-        void                   *data;
-        __u8                    sbuf[CIPHER_MAX_BLKSIZE];
-        __u8                    dbuf[CIPHER_MAX_BLKSIZE];
-        unsigned int            blksize, blks, tail;
-        int                     rc;
-
-        blksize = ll_crypto_blkcipher_blocksize(tfm);
-        blks = sls->length / blksize;
-        tail = sls->length % blksize;
-        LASSERT(blks > 0 && tail > 0);
-
-        /* save current IV, and set IV to zero */
-        ll_crypto_blkcipher_get_iv(tfm, sbuf, blksize);
-        ll_crypto_blkcipher_set_iv(tfm, zero_iv, blksize);
-
-        /* D(n) = Decrypt(K, C(n-1)) */
-        slst = *sls;
-        slst.offset += (blks - 1) * blksize;
-        slst.length = blksize;
-
-        buf_to_sl(&sldt, dbuf, blksize);
-        desc.tfm   = tfm;
-        desc.flags = 0;
-
-        rc = ll_crypto_blkcipher_decrypt(&desc, &sldt, &slst, blksize);
-        if (unlikely(rc)) {
-                CERROR("decrypt C(n-1) (%u): %d\n", slst.length, rc);
-                return rc;
-        }
-
-        /* restore IV */
-        ll_crypto_blkcipher_set_iv(tfm, sbuf, blksize);
-
-        data = cfs_kmap(sls->page);
-        /* C(n) = C(n) | TAIL(D(n)) */
-        memcpy(dbuf, data + sls->offset + blks * blksize, tail);
-        /* swab C(n) and C(n-1) */
-        memcpy(sbuf, data + sls->offset + (blks - 1) * blksize, blksize);
-        memcpy(data + sls->offset + (blks - 1) * blksize, dbuf, blksize);
-        cfs_kunmap(sls->page);
-
-        /* do cbc decrypt */
-        buf_to_sl(&slst, sbuf, blksize);
-        buf_to_sl(&sldt, dbuf, blksize);
-
-        /* decrypt head */
-        rc = ll_crypto_blkcipher_decrypt(&desc, sld, sls, sls->length - tail);
-        if (unlikely(rc)) {
-                CERROR("decrypt head (%u) data: %d\n", sls->length - tail, rc);
-                return rc;
-        }
-        /* decrypt tail */
-        rc = ll_crypto_blkcipher_decrypt(&desc, &sldt, &slst, blksize);
-        if (unlikely(rc)) {
-                CERROR("decrypt tail (%u) data: %d\n", slst.length, rc);
-                return rc;
-        }
-
-        /* truncate to original ciphertext size */
-        data = cfs_kmap(sld->page);
-        memcpy(data + sld->offset + blks * blksize, dbuf, tail);
-        cfs_kunmap(sld->page);
-
-        return 0;
-}
-
-static inline int do_cts_tfm(struct ll_crypto_cipher *tfm,
-                             int encrypt,
-                             struct scatterlist *sld,
-                             struct scatterlist *sls)
-{
-#ifndef HAVE_ASYNC_BLOCK_CIPHER
-        LASSERT(tfm->crt_cipher.cit_mode == CRYPTO_TFM_MODE_CBC);
-#endif
-
-        if (encrypt)
-                return cbc_cts_encrypt(tfm, sld, sls);
-        else
-                return cbc_cts_decrypt(tfm, sld, sls);
-}
-
-/*
- * normal encrypt/decrypt of data of even blocksize
- */
-static inline int do_cipher_tfm(struct ll_crypto_cipher *tfm,
-                                int encrypt,
-                                struct scatterlist *sld,
-                                struct scatterlist *sls)
-{
-        struct blkcipher_desc desc;
-        desc.tfm   = tfm;
-        desc.flags = 0;
-        if (encrypt)
-                return ll_crypto_blkcipher_encrypt(&desc, sld, sls, sls->length);
-        else
-                return ll_crypto_blkcipher_decrypt(&desc, sld, sls, sls->length);
-}
-
-static struct ll_crypto_cipher *get_stream_cipher(__u8 *key, unsigned int keylen)
-{
-        const struct sptlrpc_ciph_type *ct;
-        struct ll_crypto_cipher        *tfm;
-        int                             rc;
-
-        /* using ARC4, the only stream cipher in linux for now */
-        ct = sptlrpc_get_ciph_type(BULK_CIPH_ALG_ARC4);
-        LASSERT(ct);
-
-        tfm = ll_crypto_alloc_blkcipher(ct->sct_tfm_name, 0, 0);
-        if (tfm == NULL) {
-                CERROR("Failed to allocate stream TFM %s\n", ct->sct_name);
-                return NULL;
-        }
-        LASSERT(ll_crypto_blkcipher_blocksize(tfm));
-
-        if (keylen > ct->sct_keysize)
-                keylen = ct->sct_keysize;
-
-        LASSERT(keylen >= crypto_tfm_alg_min_keysize(tfm));
-        LASSERT(keylen <= crypto_tfm_alg_max_keysize(tfm));
-
-        rc = ll_crypto_blkcipher_setkey(tfm, key, keylen);
-        if (rc) {
-                CERROR("Failed to set key for TFM %s: %d\n", ct->sct_name, rc);
-                ll_crypto_free_blkcipher(tfm);
-                return NULL;
-        }
-
-        return tfm;
-}
-
-static int do_bulk_privacy(struct gss_ctx *gctx,
-                           struct ptlrpc_bulk_desc *desc,
-                           int encrypt, __u32 alg,
-                           struct ptlrpc_bulk_sec_desc *bsd)
-{
-        const struct sptlrpc_ciph_type *ct = sptlrpc_get_ciph_type(alg);
-        struct ll_crypto_cipher  *tfm;
-        struct ll_crypto_cipher  *stfm = NULL; /* backup stream cipher */
-        struct scatterlist        sls, sld, *sldp;
-        unsigned int              blksize, keygen_size;
-        int                       i, rc;
-        __u8                      key[CIPHER_MAX_KEYSIZE];
-
-        LASSERT(ct);
-
-        if (encrypt)
-                bsd->bsd_ciph_alg = BULK_CIPH_ALG_NULL;
-
-        if (alg == BULK_CIPH_ALG_NULL)
-                return 0;
-
-        if (desc->bd_iov_count <= 0) {
-                if (encrypt)
-                        bsd->bsd_ciph_alg = alg;
-                return 0;
-        }
-
-        tfm = ll_crypto_alloc_blkcipher(ct->sct_tfm_name, 0, 0 );
-        if (tfm == NULL) {
-                CERROR("Failed to allocate TFM %s\n", ct->sct_name);
-                return -ENOMEM;
-        }
-        blksize = ll_crypto_blkcipher_blocksize(tfm);
-
-        LASSERT(crypto_tfm_alg_max_keysize(tfm) >= ct->sct_keysize);
-        LASSERT(crypto_tfm_alg_min_keysize(tfm) <= ct->sct_keysize);
-        LASSERT(ct->sct_ivsize == 0 ||
-                ll_crypto_blkcipher_ivsize(tfm) == ct->sct_ivsize);
-        LASSERT(ct->sct_keysize <= CIPHER_MAX_KEYSIZE);
-        LASSERT(blksize <= CIPHER_MAX_BLKSIZE);
-
-        /* generate ramdom key seed and compute the secret key based on it.
-         * note determined by algorithm which lgss_plain_encrypt use, it
-         * might require the key size be its (blocksize * n). so here for
-         * simplicity, we force it's be n * MAX_BLKSIZE by padding 0 */
-        keygen_size = (ct->sct_keysize + CIPHER_MAX_BLKSIZE - 1) &
-                      ~(CIPHER_MAX_BLKSIZE - 1);
-        if (encrypt) {
-                get_random_bytes(bsd->bsd_key, ct->sct_keysize);
-                if (ct->sct_keysize < keygen_size)
-                        memset(bsd->bsd_key + ct->sct_keysize, 0,
-                               keygen_size - ct->sct_keysize);
-        }
-
-        rc = lgss_plain_encrypt(gctx, 0, keygen_size, bsd->bsd_key, key);
-        if (rc) {
-                CERROR("failed to compute secret key: %d\n", rc);
-                goto out;
-        }
-
-        rc = ll_crypto_blkcipher_setkey(tfm, key, ct->sct_keysize);
-        if (rc) {
-                CERROR("Failed to set key for TFM %s: %d\n", ct->sct_name, rc);
-                goto out;
-        }
-
-        /* stream cipher doesn't need iv */
-        if (blksize > 1)
-                ll_crypto_blkcipher_set_iv(tfm, zero_iv, blksize);
-
-        for (i = 0; i < desc->bd_iov_count; i++) {
-                sls.page = desc->bd_iov[i].kiov_page;
-                sls.offset = desc->bd_iov[i].kiov_offset;
-                sls.length = desc->bd_iov[i].kiov_len;
-
-                if (unlikely(sls.length == 0)) {
-                        CWARN("page %d with 0 length data?\n", i);
-                        continue;
-                }
-
-                if (unlikely(sls.offset % blksize)) {
-                        CERROR("page %d with odd offset %u, TFM %s\n",
-                               i, sls.offset, ct->sct_name);
-                        rc = -EINVAL;
-                        goto out;
-                }
-
-                if (desc->bd_enc_pages) {
-                        sld.page = desc->bd_enc_pages[i];
-                        sld.offset = desc->bd_iov[i].kiov_offset;
-                        sld.length = desc->bd_iov[i].kiov_len;
-
-                        sldp = &sld;
-                } else {
-                        sldp = &sls;
-                }
-
-                if (likely(sls.length % blksize == 0)) {
-                        /* data length is n * blocksize, do the normal tfm */
-                        rc = do_cipher_tfm(tfm, encrypt, sldp, &sls);
-                } else if (sls.length < blksize) {
-                        /* odd data length, and smaller than 1 block, CTS
-                         * doesn't work in this case because it requires
-                         * transfer a modified IV to peer. here we use a
-                         * "backup" stream cipher to do the tfm */
-                        if (stfm == NULL) {
-                                stfm = get_stream_cipher(key, ct->sct_keysize);
-                                if (tfm == NULL) {
-                                        rc = -ENOMEM;
-                                        goto out;
-                                }
-                        }
-                        rc = do_cipher_tfm(stfm, encrypt, sldp, &sls);
-                } else {
-                        /* odd data length but > 1 block, do CTS tfm */
-                        rc = do_cts_tfm(tfm, encrypt, sldp, &sls);
-                }
-
-                if (unlikely(rc)) {
-                        CERROR("error %s page %d/%d: %d\n",
-                               encrypt ? "encrypt" : "decrypt",
-                               i + 1, desc->bd_iov_count, rc);
-                        goto out;
-                }
-
-                if (desc->bd_enc_pages)
-                        desc->bd_iov[i].kiov_page = desc->bd_enc_pages[i];
-        }
-
-        if (encrypt)
-                bsd->bsd_ciph_alg = alg;
-
-out:
-        if (stfm)
-                ll_crypto_free_blkcipher(stfm);
-
-        ll_crypto_free_blkcipher(tfm);
-        return rc;
-}
-
  int gss_cli_ctx_wrap_bulk(struct ptlrpc_cli_ctx *ctx,
                            struct ptlrpc_request *req,
                            struct ptlrpc_bulk_desc *desc)
  {
          struct gss_cli_ctx              *gctx;
          struct lustre_msg               *msg;
-        struct ptlrpc_bulk_sec_desc     *bsdr;
-        int                              offset, rc;
+        struct ptlrpc_bulk_sec_desc     *bsd;
+        rawobj_t                         token;
+        __u32                            maj;
+        int                              offset;
+        int                              rc;
          ENTRY;
  
          LASSERT(req->rq_pack_bulk);
          LASSERT(req->rq_bulk_read || req->rq_bulk_write);
  
-        switch (RPC_FLVR_SVC(req->rq_flvr.sf_rpc)) {
+        gctx = container_of(ctx, struct gss_cli_ctx, gc_base);
+        LASSERT(gctx->gc_mechctx);
+
+        switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) {
          case SPTLRPC_SVC_NULL:
                  LASSERT(req->rq_reqbuf->lm_bufcount >= 3);
                  msg = req->rq_reqbuf;
@@ -472,42 +107,68 @@ int gss_cli_ctx_wrap_bulk(struct ptlrpc_cli_ctx *ctx,
                  LBUG();
          }
  
-        /* make checksum */
-        rc = bulk_csum_cli_request(desc, req->rq_bulk_read,
-                                   req->rq_flvr.sf_bulk_hash, msg, offset);
-        if (rc) {
-                CERROR("client bulk %s: failed to generate checksum: %d\n",
-                       req->rq_bulk_read ? "read" : "write", rc);
-                RETURN(rc);
-        }
+        bsd = lustre_msg_buf(msg, offset, sizeof(*bsd));
+        bsd->bsd_version = 0;
+        bsd->bsd_flags = 0;
+        bsd->bsd_type = SPTLRPC_BULK_DEFAULT;
+        bsd->bsd_svc = SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc);
  
-        if (req->rq_flvr.sf_bulk_ciph == BULK_CIPH_ALG_NULL)
+        if (bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL)
                  RETURN(0);
  
-        /* previous bulk_csum_cli_request() has verified bsdr is good */
-        bsdr = lustre_msg_buf(msg, offset, 0);
+        LASSERT(bsd->bsd_svc == SPTLRPC_BULK_SVC_INTG ||
+                bsd->bsd_svc == SPTLRPC_BULK_SVC_PRIV);
  
          if (req->rq_bulk_read) {
-                bsdr->bsd_ciph_alg = req->rq_flvr.sf_bulk_ciph;
-                RETURN(0);
-        }
-
-        /* it turn out to be bulk write */
-        rc = sptlrpc_enc_pool_get_pages(desc);
-        if (rc) {
-                CERROR("bulk write: failed to allocate encryption pages\n");
-                RETURN(rc);
-        }
+                /*
+                 * bulk read: prepare receiving pages only for privacy mode.
+                 */
+                if (bsd->bsd_svc == SPTLRPC_BULK_SVC_PRIV)
+                        return gss_cli_prep_bulk(req, desc);
+        } else {
+                /*
+                 * bulk write: sign or encrypt bulk pages.
+                 */
+                bsd->bsd_nob = desc->bd_nob;
+
+                if (bsd->bsd_svc == SPTLRPC_BULK_SVC_INTG) {
+                        /* integrity mode */
+                        token.data = bsd->bsd_data;
+                        token.len = lustre_msg_buflen(msg, offset) -
+                                    sizeof(*bsd);
+
+                        maj = lgss_get_mic(gctx->gc_mechctx, 0, NULL,
+                                           desc->bd_iov_count, desc->bd_iov,
+                                           &token);
+                        if (maj != GSS_S_COMPLETE) {
+                                CWARN("failed to sign bulk data: %x\n", maj);
+                                RETURN(-EACCES);
+                        }
+                } else {
+                        /* privacy mode */
+                        if (desc->bd_iov_count == 0)
+                                RETURN(0);
+
+                        rc = sptlrpc_enc_pool_get_pages(desc);
+                        if (rc) {
+                                CERROR("bulk write: failed to allocate "
+                                       "encryption pages: %d\n", rc);
+                                RETURN(rc);
+                        }
  
-        gctx = container_of(ctx, struct gss_cli_ctx, gc_base);
-        LASSERT(gctx->gc_mechctx);
+                        token.data = bsd->bsd_data;
+                        token.len = lustre_msg_buflen(msg, offset) -
+                                    sizeof(*bsd);
  
-        rc = do_bulk_privacy(gctx->gc_mechctx, desc, 1,
-                             req->rq_flvr.sf_bulk_ciph, bsdr);
-        if (rc)
-                CERROR("bulk write: client failed to encrypt pages\n");
+                        maj = lgss_wrap_bulk(gctx->gc_mechctx, desc, &token, 0);
+                        if (maj != GSS_S_COMPLETE) {
+                                CWARN("fail to encrypt bulk data: %x\n", maj);
+                                RETURN(-EACCES);
+                        }
+                }
+        }
  
-        RETURN(rc);
+        RETURN(0);
  }
  
  int gss_cli_ctx_unwrap_bulk(struct ptlrpc_cli_ctx *ctx,
@@ -517,13 +178,15 @@ int gss_cli_ctx_unwrap_bulk(struct ptlrpc_cli_ctx *ctx,
          struct gss_cli_ctx              *gctx;
          struct lustre_msg               *rmsg, *vmsg;
          struct ptlrpc_bulk_sec_desc     *bsdr, *bsdv;
-        int                              roff, voff, rc;
+        rawobj_t                         token;
+        __u32                            maj;
+        int                              roff, voff;
          ENTRY;
  
          LASSERT(req->rq_pack_bulk);
          LASSERT(req->rq_bulk_read || req->rq_bulk_write);
  
-        switch (RPC_FLVR_SVC(req->rq_flvr.sf_rpc)) {
+        switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) {
          case SPTLRPC_SVC_NULL:
                  vmsg = req->rq_repdata;
                  voff = vmsg->lm_bufcount - 1;
@@ -556,34 +219,158 @@ int gss_cli_ctx_unwrap_bulk(struct ptlrpc_cli_ctx *ctx,
                  LBUG();
          }
  
-        if (req->rq_bulk_read) {
-                bsdr = lustre_msg_buf(rmsg, roff, 0);
-                if (bsdr->bsd_ciph_alg == BULK_CIPH_ALG_NULL)
-                        goto verify_csum;
-
-                bsdv = lustre_msg_buf(vmsg, voff, 0);
-                if (bsdr->bsd_ciph_alg != bsdv->bsd_ciph_alg) {
-                        CERROR("bulk read: cipher algorithm mismatch: client "
-                               "request %s but server reply with %s. try to "
-                               "use the new one for decryption\n",
-                               sptlrpc_get_ciph_name(bsdr->bsd_ciph_alg),
-                               sptlrpc_get_ciph_name(bsdv->bsd_ciph_alg));
+        bsdr = lustre_msg_buf(rmsg, roff, sizeof(*bsdr));
+        bsdv = lustre_msg_buf(vmsg, voff, sizeof(*bsdv));
+        LASSERT(bsdr && bsdv);
+
+        if (bsdr->bsd_version != bsdv->bsd_version ||
+            bsdr->bsd_type != bsdv->bsd_type ||
+            bsdr->bsd_svc != bsdv->bsd_svc) {
+                CERROR("bulk security descriptor mismatch: "
+                       "(%u,%u,%u) != (%u,%u,%u)\n",
+                       bsdr->bsd_version, bsdr->bsd_type, bsdr->bsd_svc,
+                       bsdv->bsd_version, bsdv->bsd_type, bsdv->bsd_svc);
+                RETURN(-EPROTO);
+        }
+
+        LASSERT(bsdv->bsd_svc == SPTLRPC_BULK_SVC_NULL ||
+                bsdv->bsd_svc == SPTLRPC_BULK_SVC_INTG ||
+                bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV);
+
+        /*
+         * in privacy mode if return success, make sure bd_nob_transferred
+         * is the actual size of the clear text, otherwise upper layer
+         * may be surprised.
+         */
+        if (req->rq_bulk_write) {
+                if (bsdv->bsd_flags & BSD_FL_ERR) {
+                        CERROR("server reported bulk i/o failure\n");
+                        RETURN(-EIO);
                  }
  
+                if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV)
+                        desc->bd_nob_transferred = desc->bd_nob;
+        } else {
+                /*
+                 * bulk read, upon return success, bd_nob_transferred is
+                 * the size of plain text actually received.
+                 */
                  gctx = container_of(ctx, struct gss_cli_ctx, gc_base);
                  LASSERT(gctx->gc_mechctx);
  
-                rc = do_bulk_privacy(gctx->gc_mechctx, desc, 0,
-                                     bsdv->bsd_ciph_alg, bsdv);
-                if (rc) {
-                        CERROR("bulk read: client failed to decrypt data\n");
-                        RETURN(rc);
+                if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_INTG) {
+                        int i, nob;
+
+                        /* fix the actual data size */
+                        for (i = 0, nob = 0; i < desc->bd_iov_count; i++) {
+                                if (desc->bd_iov[i].kiov_len + nob >
+                                    desc->bd_nob_transferred) {
+                                        desc->bd_iov[i].kiov_len =
+                                                desc->bd_nob_transferred - nob;
+                                }
+                                nob += desc->bd_iov[i].kiov_len;
+                        }
+
+                        token.data = bsdv->bsd_data;
+                        token.len = lustre_msg_buflen(vmsg, voff) -
+                                    sizeof(*bsdv);
+
+                        maj = lgss_verify_mic(gctx->gc_mechctx, 0, NULL,
+                                              desc->bd_iov_count, desc->bd_iov,
+                                              &token);
+                        if (maj != GSS_S_COMPLETE) {
+                                CERROR("failed to verify bulk read: %x\n", maj);
+                                RETURN(-EACCES);
+                        }
+                } else if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV) {
+                        desc->bd_nob = bsdv->bsd_nob;
+                        if (desc->bd_nob == 0)
+                                RETURN(0);
+
+                        token.data = bsdv->bsd_data;
+                        token.len = lustre_msg_buflen(vmsg, voff) -
+                                    sizeof(*bsdr);
+
+                        maj = lgss_unwrap_bulk(gctx->gc_mechctx, desc, &token);
+                        if (maj != GSS_S_COMPLETE) {
+                                CERROR("failed to decrypt bulk read: %x\n",
+                                       maj);
+                                RETURN(-EACCES);
+                        }
+
+                        desc->bd_nob_transferred = desc->bd_nob;
                  }
          }
  
-verify_csum:
-        rc = bulk_csum_cli_reply(desc, req->rq_bulk_read,
-                                 rmsg, roff, vmsg, voff);
+        RETURN(0);
+}
+/*
+ * Common helper behind gss_cli_prep_bulk() and gss_svc_prep_bulk().
+ *
+ * For a descriptor with no iovs this is a no-op. Otherwise it first calls
+ * sptlrpc_enc_pool_get_pages() on \a desc and then lgss_prep_bulk() with
+ * the given mechanism context.
+ *
+ * Returns 0 on success, the error code of sptlrpc_enc_pool_get_pages() if
+ * that call fails, or -EACCES if lgss_prep_bulk() reports anything other
+ * than GSS_S_COMPLETE.
+ *
+ * NOTE(review): pages obtained from sptlrpc_enc_pool_get_pages() are not
+ * released here when lgss_prep_bulk() fails; presumably they are returned
+ * when the descriptor is freed -- confirm against the caller.
+ */
+static int gss_prep_bulk(struct ptlrpc_bulk_desc *desc,
+                         struct gss_ctx *mechctx)
+{
+        int     rc;
+
+        if (desc->bd_iov_count == 0)
+                return 0; /* empty descriptor: nothing to prepare */
+
+        rc = sptlrpc_enc_pool_get_pages(desc);
+        if (rc)
+                return rc;
+
+        if (lgss_prep_bulk(mechctx, desc) != GSS_S_COMPLETE)
+                return -EACCES; /* GSS major status is not propagated */
+
+        return 0;
+}
+/*
+ * Client side: prepare a bulk read descriptor before the request is sent.
+ *
+ * Asserts that \a req has a client context, has rq_pack_bulk set and is a
+ * bulk read. Unless the bulk service part of the request flavor is
+ * SPTLRPC_BULK_SVC_PRIV nothing is done and 0 is returned; in privacy mode
+ * \a desc is handed to gss_prep_bulk() together with the mechanism context
+ * of the request's client context.
+ *
+ * Returns 0 on success, otherwise the error from gss_prep_bulk(), which is
+ * also reported through CERROR.
+ */
+int gss_cli_prep_bulk(struct ptlrpc_request *req,
+                      struct ptlrpc_bulk_desc *desc)
+{
+        int             rc;
+        ENTRY;
+
+        LASSERT(req->rq_cli_ctx);
+        LASSERT(req->rq_pack_bulk);
+        LASSERT(req->rq_bulk_read);
+
+        if (SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc) != SPTLRPC_BULK_SVC_PRIV)
+                RETURN(0); /* only privacy mode needs preparation */
+
+        rc = gss_prep_bulk(desc, ctx2gctx(req->rq_cli_ctx)->gc_mechctx);
+        if (rc)
+                CERROR("bulk read: failed to prepare encryption "
+                       "pages: %d\n", rc);
+
+        RETURN(rc);
+}
+/*
+ * Server side: prepare a bulk write descriptor before data is received.
+ *
+ * Asserts that \a req has a server context, has rq_pack_bulk set and is a
+ * bulk write, and that the per-request GSS context carries both the request
+ * and reply bulk security descriptors plus a mechanism context. The service
+ * type is taken from the request descriptor (src_reqbsd): unless it is
+ * SPTLRPC_BULK_SVC_PRIV nothing is done and 0 is returned; in privacy mode
+ * \a desc is handed to gss_prep_bulk().
+ *
+ * Returns 0 on success, otherwise the error from gss_prep_bulk(), which is
+ * also reported through CERROR.
+ */
+int gss_svc_prep_bulk(struct ptlrpc_request *req,
+                      struct ptlrpc_bulk_desc *desc)
+{
+        struct gss_svc_reqctx        *grctx;
+        struct ptlrpc_bulk_sec_desc  *bsd;
+        int                           rc;
+        ENTRY;
+
+        LASSERT(req->rq_svc_ctx);
+        LASSERT(req->rq_pack_bulk);
+        LASSERT(req->rq_bulk_write);
+
+        grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+        LASSERT(grctx->src_reqbsd);
+        LASSERT(grctx->src_repbsd);
+        LASSERT(grctx->src_ctx);
+        LASSERT(grctx->src_ctx->gsc_mechctx);
+
+        bsd = grctx->src_reqbsd;
+        if (bsd->bsd_svc != SPTLRPC_BULK_SVC_PRIV)
+                RETURN(0); /* only privacy mode needs preparation */
+
+        rc = gss_prep_bulk(desc, grctx->src_ctx->gsc_mechctx);
+        if (rc)
+                CERROR("bulk write: failed to prepare encryption "
+                       "pages: %d\n", rc);
+
          RETURN(rc);
  }
  
@@ -591,7 +378,9 @@ int gss_svc_unwrap_bulk(struct ptlrpc_request *req,
                          struct ptlrpc_bulk_desc *desc)
  {
          struct gss_svc_reqctx        *grctx;
-        int                           rc;
+        struct ptlrpc_bulk_sec_desc  *bsdr, *bsdv;
+        rawobj_t                      token;
+        __u32                         maj;
          ENTRY;
  
          LASSERT(req->rq_svc_ctx);
@@ -605,29 +394,64 @@ int gss_svc_unwrap_bulk(struct ptlrpc_request *req,
          LASSERT(grctx->src_ctx);
          LASSERT(grctx->src_ctx->gsc_mechctx);
  
-        /* decrypt bulk data if it's encrypted */
-        if (grctx->src_reqbsd->bsd_ciph_alg != BULK_CIPH_ALG_NULL) {
-                rc = do_bulk_privacy(grctx->src_ctx->gsc_mechctx, desc, 0,
-                                     grctx->src_reqbsd->bsd_ciph_alg,
-                                     grctx->src_reqbsd);
-                if (rc) {
-                        CERROR("bulk write: server failed to decrypt data\n");
-                        RETURN(rc);
+        bsdr = grctx->src_reqbsd;
+        bsdv = grctx->src_repbsd;
+
+        /* bsdr has been sanity checked during unpacking */
+        bsdv->bsd_version = 0;
+        bsdv->bsd_type = SPTLRPC_BULK_DEFAULT;
+        bsdv->bsd_svc = bsdr->bsd_svc;
+        bsdv->bsd_flags = 0;
+
+        switch (bsdv->bsd_svc) {
+        case SPTLRPC_BULK_SVC_INTG:
+                token.data = bsdr->bsd_data;
+                token.len = grctx->src_reqbsd_size - sizeof(*bsdr);
+
+                maj = lgss_verify_mic(grctx->src_ctx->gsc_mechctx, 0, NULL,
+                                      desc->bd_iov_count, desc->bd_iov, &token);
+                if (maj != GSS_S_COMPLETE) {
+                        bsdv->bsd_flags |= BSD_FL_ERR;
+                        CERROR("failed to verify bulk signature: %x\n", maj);
+                        RETURN(-EACCES);
+                }
+                break;
+        case SPTLRPC_BULK_SVC_PRIV:
+                if (bsdr->bsd_nob != desc->bd_nob) {
+                        bsdv->bsd_flags |= BSD_FL_ERR;
+                        CERROR("prepared nob %d doesn't match the actual "
+                               "nob %d\n", desc->bd_nob, bsdr->bsd_nob);
+                        RETURN(-EPROTO);
                  }
-        }
  
-        /* verify bulk data checksum */
-        rc = bulk_csum_svc(desc, req->rq_bulk_read,
-                           grctx->src_reqbsd, grctx->src_reqbsd_size,
-                           grctx->src_repbsd, grctx->src_repbsd_size);
+                if (desc->bd_iov_count == 0) {
+                        LASSERT(desc->bd_nob == 0);
+                        break;
+                }
  
-        RETURN(rc);
+                token.data = bsdr->bsd_data;
+                token.len = grctx->src_reqbsd_size - sizeof(*bsdr);
+
+                maj = lgss_unwrap_bulk(grctx->src_ctx->gsc_mechctx,
+                                       desc, &token);
+                if (maj != GSS_S_COMPLETE) {
+                        bsdv->bsd_flags |= BSD_FL_ERR;
+                        CERROR("failed decrypt bulk data: %x\n", maj);
+                        RETURN(-EACCES);
+                }
+                break;
+        }
+
+        RETURN(0);
  }
  
  int gss_svc_wrap_bulk(struct ptlrpc_request *req,
                        struct ptlrpc_bulk_desc *desc)
  {
          struct gss_svc_reqctx        *grctx;
+        struct ptlrpc_bulk_sec_desc  *bsdr, *bsdv;
+        rawobj_t                      token;
+        __u32                         maj;
          int                           rc;
          ENTRY;
  
@@ -642,22 +466,56 @@ int gss_svc_wrap_bulk(struct ptlrpc_request *req,
          LASSERT(grctx->src_ctx);
          LASSERT(grctx->src_ctx->gsc_mechctx);
  
-        /* generate bulk data checksum */
-        rc = bulk_csum_svc(desc, req->rq_bulk_read,
-                           grctx->src_reqbsd, grctx->src_reqbsd_size,
-                           grctx->src_repbsd, grctx->src_repbsd_size);
-        if (rc)
-                RETURN(rc);
-
-        /* encrypt bulk data if required */
-        if (grctx->src_reqbsd->bsd_ciph_alg != BULK_CIPH_ALG_NULL) {
-                rc = do_bulk_privacy(grctx->src_ctx->gsc_mechctx, desc, 1,
-                                     grctx->src_reqbsd->bsd_ciph_alg,
-                                     grctx->src_repbsd);
-                if (rc)
-                        CERROR("bulk read: server failed to encrypt data: "
-                               "rc %d\n", rc);
+        bsdr = grctx->src_reqbsd;
+        bsdv = grctx->src_repbsd;
+
+        /* bsdr has been sanity checked during unpacking */
+        bsdv->bsd_version = 0;
+        bsdv->bsd_type = SPTLRPC_BULK_DEFAULT;
+        bsdv->bsd_svc = bsdr->bsd_svc;
+        bsdv->bsd_flags = 0;
+
+        switch (bsdv->bsd_svc) {
+        case SPTLRPC_BULK_SVC_INTG:
+                token.data = bsdv->bsd_data;
+                token.len = grctx->src_repbsd_size - sizeof(*bsdv);
+
+                maj = lgss_get_mic(grctx->src_ctx->gsc_mechctx, 0, NULL,
+                                   desc->bd_iov_count, desc->bd_iov, &token);
+                if (maj != GSS_S_COMPLETE) {
+                        bsdv->bsd_flags |= BSD_FL_ERR;
+                        CERROR("failed to sign bulk data: %x\n", maj);
+                        RETURN(-EACCES);
+                }
+                break;
+        case SPTLRPC_BULK_SVC_PRIV:
+                bsdv->bsd_nob = desc->bd_nob;
+
+                if (desc->bd_iov_count == 0) {
+                        LASSERT(desc->bd_nob == 0);
+                        break;
+                }
+
+                rc = sptlrpc_enc_pool_get_pages(desc);
+                if (rc) {
+                        bsdv->bsd_flags |= BSD_FL_ERR;
+                        CERROR("bulk read: failed to allocate encryption "
+                               "pages: %d\n", rc);
+                        RETURN(rc);
+                }
+
+                token.data = bsdv->bsd_data;
+                token.len = grctx->src_repbsd_size - sizeof(*bsdv);
+
+                maj = lgss_wrap_bulk(grctx->src_ctx->gsc_mechctx,
+                                     desc, &token, 1);
+                if (maj != GSS_S_COMPLETE) {
+                        bsdv->bsd_flags |= BSD_FL_ERR;
+                        CERROR("failed to encrypt bulk data: %x\n", maj);
+                        RETURN(-EACCES);
+                }
+                break;
          }
  
-        RETURN(rc);
+        RETURN(0);
  }
diff --git a/lustre/ptlrpc/gss/gss_internal.h b/lustre/ptlrpc/gss/gss_internal.h

index afbb614..66afd61 100644 (file)
--- a/lustre/ptlrpc/gss/gss_internal.h
+++ b/lustre/ptlrpc/gss/gss_internal.h
@@ -433,12 +433,16 @@ int  __init gss_init_pipefs(void);
  void __exit gss_exit_pipefs(void);
  
  /* gss_bulk.c */
+int gss_cli_prep_bulk(struct ptlrpc_request *req,
+                      struct ptlrpc_bulk_desc *desc);
  int gss_cli_ctx_wrap_bulk(struct ptlrpc_cli_ctx *ctx,
                            struct ptlrpc_request *req,
                            struct ptlrpc_bulk_desc *desc);
  int gss_cli_ctx_unwrap_bulk(struct ptlrpc_cli_ctx *ctx,
                              struct ptlrpc_request *req,
                              struct ptlrpc_bulk_desc *desc);
+int gss_svc_prep_bulk(struct ptlrpc_request *req,
+                      struct ptlrpc_bulk_desc *desc);
  int gss_svc_unwrap_bulk(struct ptlrpc_request *req,
                          struct ptlrpc_bulk_desc *desc);
  int gss_svc_wrap_bulk(struct ptlrpc_request *req,
diff --git a/lustre/ptlrpc/gss/gss_keyring.c b/lustre/ptlrpc/gss/gss_keyring.c

index 74c786d..8906109 100644 (file)
--- a/lustre/ptlrpc/gss/gss_keyring.c
+++ b/lustre/ptlrpc/gss/gss_keyring.c
@@ -1450,6 +1450,7 @@ static struct ptlrpc_sec_sops gss_sec_keyring_sops = {
          .authorize              = gss_svc_authorize,
          .free_rs                = gss_svc_free_rs,
          .free_ctx               = gss_svc_free_ctx,
+        .prep_bulk              = gss_svc_prep_bulk,
          .unwrap_bulk            = gss_svc_unwrap_bulk,
          .wrap_bulk              = gss_svc_wrap_bulk,
          .install_rctx           = gss_svc_install_rctx_kr,
diff --git a/lustre/ptlrpc/gss/gss_krb5_mech.c b/lustre/ptlrpc/gss/gss_krb5_mech.c

index a9a5388..7eb0c95 100644 (file)
--- a/lustre/ptlrpc/gss/gss_krb5_mech.c
+++ b/lustre/ptlrpc/gss/gss_krb5_mech.c
@@ -531,7 +531,7 @@ void gss_delete_sec_context_kerberos(void *internal_ctx)
  }
  
  static
-void buf_to_sg(struct scatterlist *sg, char *ptr, int len)
+void buf_to_sg(struct scatterlist *sg, void *ptr, int len)
  {
          sg->page = virt_to_page(ptr);
          sg->offset = offset_in_page(ptr);
@@ -582,13 +582,15 @@ out:
          return(ret);
  }
  
+#ifdef HAVE_ASYNC_BLOCK_CIPHER
+
  static inline
  int krb5_digest_hmac(struct ll_crypto_hash *tfm,
                       rawobj_t *key,
                       struct krb5_header *khdr,
                       int msgcnt, rawobj_t *msgs,
+                     int iovcnt, lnet_kiov_t *iovs,
                       rawobj_t *cksum)
-#ifdef HAVE_ASYNC_BLOCK_CIPHER
  {
          struct hash_desc   desc;
          struct scatterlist sg[1];
@@ -607,6 +609,15 @@ int krb5_digest_hmac(struct ll_crypto_hash *tfm,
                  ll_crypto_hash_update(&desc, sg, msgs[i].len);
          }
  
+        for (i = 0; i < iovcnt; i++) {
+                if (iovs[i].kiov_len == 0)
+                        continue;
+                sg[0].page = iovs[i].kiov_page;
+                sg[0].offset = iovs[i].kiov_offset;
+                sg[0].length = iovs[i].kiov_len;
+                ll_crypto_hash_update(&desc, sg, iovs[i].kiov_len);
+        }
+
          if (khdr) {
                  buf_to_sg(sg, (char *) khdr, sizeof(*khdr));
                  ll_crypto_hash_update(&desc, sg, sizeof(*khdr));
@@ -614,7 +625,16 @@ int krb5_digest_hmac(struct ll_crypto_hash *tfm,
  
          return ll_crypto_hash_final(&desc, cksum->data);
  }
-#else /* HAVE_ASYNC_BLOCK_CIPHER */
+
+#else /* ! HAVE_ASYNC_BLOCK_CIPHER */
+
+static inline
+int krb5_digest_hmac(struct ll_crypto_hash *tfm,
+                     rawobj_t *key,
+                     struct krb5_header *khdr,
+                     int msgcnt, rawobj_t *msgs,
+                     int iovcnt, lnet_kiov_t *iovs,
+                     rawobj_t *cksum)
  {
          struct scatterlist sg[1];
          __u32              keylen = key->len, i;
@@ -628,6 +648,15 @@ int krb5_digest_hmac(struct ll_crypto_hash *tfm,
                  crypto_hmac_update(tfm, sg, 1);
          }
  
+        for (i = 0; i < iovcnt; i++) {
+                if (iovs[i].kiov_len == 0)
+                        continue;
+                sg[0].page = iovs[i].kiov_page;
+                sg[0].offset = iovs[i].kiov_offset;
+                sg[0].length = iovs[i].kiov_len;
+                crypto_hmac_update(tfm, sg, 1);
+        }
+
          if (khdr) {
                  buf_to_sg(sg, (char *) khdr, sizeof(*khdr));
                  crypto_hmac_update(tfm, sg, 1);
@@ -636,6 +665,7 @@ int krb5_digest_hmac(struct ll_crypto_hash *tfm,
          crypto_hmac_final(tfm, key->data, &keylen, cksum->data);
          return 0;
  }
+
  #endif /* HAVE_ASYNC_BLOCK_CIPHER */
  
  static inline
@@ -643,6 +673,7 @@ int krb5_digest_norm(struct ll_crypto_hash *tfm,
                       struct krb5_keyblock *kb,
                       struct krb5_header *khdr,
                       int msgcnt, rawobj_t *msgs,
+                     int iovcnt, lnet_kiov_t *iovs,
                       rawobj_t *cksum)
  {
          struct hash_desc   desc;
@@ -662,6 +693,15 @@ int krb5_digest_norm(struct ll_crypto_hash *tfm,
                  ll_crypto_hash_update(&desc, sg, msgs[i].len);
          }
  
+        for (i = 0; i < iovcnt; i++) {
+                if (iovs[i].kiov_len == 0)
+                        continue;
+                sg[0].page = iovs[i].kiov_page;
+                sg[0].offset = iovs[i].kiov_offset;
+                sg[0].length = iovs[i].kiov_len;
+                ll_crypto_hash_update(&desc, sg, iovs[i].kiov_len);
+        }
+
          if (khdr) {
                  buf_to_sg(sg, (char *) khdr, sizeof(*khdr));
                  ll_crypto_hash_update(&desc, sg, sizeof(*khdr));
@@ -682,6 +722,7 @@ __s32 krb5_make_checksum(__u32 enctype,
                           struct krb5_keyblock *kb,
                           struct krb5_header *khdr,
                           int msgcnt, rawobj_t *msgs,
+                         int iovcnt, lnet_kiov_t *iovs,
                           rawobj_t *cksum)
  {
          struct krb5_enctype   *ke = &enctypes[enctype];
@@ -703,10 +744,10 @@ __s32 krb5_make_checksum(__u32 enctype,
  
          if (ke->ke_hash_hmac)
                  rc = krb5_digest_hmac(tfm, &kb->kb_key,
-                                      khdr, msgcnt, msgs, cksum);
+                                      khdr, msgcnt, msgs, iovcnt, iovs, cksum);
          else
                  rc = krb5_digest_norm(tfm, kb,
-                                      khdr, msgcnt, msgs, cksum);
+                                      khdr, msgcnt, msgs, iovcnt, iovs, cksum);
  
          if (rc == 0)
                  code = GSS_S_COMPLETE;
@@ -715,38 +756,96 @@ out_tfm:
          return code;
  }
  
+static void fill_krb5_header(struct krb5_ctx *kctx,
+                             struct krb5_header *khdr,
+                             int privacy)
+{
+        unsigned char acceptor_flag;
+
+        acceptor_flag = kctx->kc_initiate ? 0 : FLAG_SENDER_IS_ACCEPTOR;
+
+        if (privacy) {
+                khdr->kh_tok_id = cpu_to_be16(KG_TOK_WRAP_MSG);
+                khdr->kh_flags = acceptor_flag | FLAG_WRAP_CONFIDENTIAL;
+                khdr->kh_ec = cpu_to_be16(0);
+                khdr->kh_rrc = cpu_to_be16(0);
+        } else {
+                khdr->kh_tok_id = cpu_to_be16(KG_TOK_MIC_MSG);
+                khdr->kh_flags = acceptor_flag;
+                khdr->kh_ec = cpu_to_be16(0xffff);
+                khdr->kh_rrc = cpu_to_be16(0xffff);
+        }
+
+        khdr->kh_filler = 0xff;
+        spin_lock(&krb5_seq_lock);
+        khdr->kh_seq = cpu_to_be64(kctx->kc_seq_send++);
+        spin_unlock(&krb5_seq_lock);
+}
+
+static __u32 verify_krb5_header(struct krb5_ctx *kctx,
+                                struct krb5_header *khdr,
+                                int privacy)
+{
+        unsigned char acceptor_flag;
+        __u16         tok_id, ec_rrc;
+
+        acceptor_flag = kctx->kc_initiate ? FLAG_SENDER_IS_ACCEPTOR : 0;
+
+        if (privacy) {
+                tok_id = KG_TOK_WRAP_MSG;
+                ec_rrc = 0x0;
+        } else {
+                tok_id = KG_TOK_MIC_MSG;
+                ec_rrc = 0xffff;
+        }
+
+        /* sanity checks */
+        if (be16_to_cpu(khdr->kh_tok_id) != tok_id) {
+                CERROR("bad token id\n");
+                return GSS_S_DEFECTIVE_TOKEN;
+        }
+        if ((khdr->kh_flags & FLAG_SENDER_IS_ACCEPTOR) != acceptor_flag) {
+                CERROR("bad direction flag\n");
+                return GSS_S_BAD_SIG;
+        }
+        if (privacy && (khdr->kh_flags & FLAG_WRAP_CONFIDENTIAL) == 0) {
+                CERROR("missing confidential flag\n");
+                return GSS_S_BAD_SIG;
+        }
+        if (khdr->kh_filler != 0xff) {
+                CERROR("bad filler\n");
+                return GSS_S_DEFECTIVE_TOKEN;
+        }
+        if (be16_to_cpu(khdr->kh_ec) != ec_rrc ||
+            be16_to_cpu(khdr->kh_rrc) != ec_rrc) {
+                CERROR("bad EC or RRC\n");
+                return GSS_S_DEFECTIVE_TOKEN;
+        }
+        return GSS_S_COMPLETE;
+}
+
  static
  __u32 gss_get_mic_kerberos(struct gss_ctx *gctx,
                             int msgcnt,
                             rawobj_t *msgs,
+                           int iovcnt,
+                           lnet_kiov_t *iovs,
                             rawobj_t *token)
  {
          struct krb5_ctx     *kctx = gctx->internal_ctx_id;
          struct krb5_enctype *ke = &enctypes[kctx->kc_enctype];
          struct krb5_header  *khdr;
-        unsigned char        acceptor_flag;
          rawobj_t             cksum = RAWOBJ_EMPTY;
-        __u32                rc = GSS_S_FAILURE;
-
-        acceptor_flag = kctx->kc_initiate ? 0 : FLAG_SENDER_IS_ACCEPTOR;
  
          /* fill krb5 header */
          LASSERT(token->len >= sizeof(*khdr));
          khdr = (struct krb5_header *) token->data;
-
-        khdr->kh_tok_id = cpu_to_be16(KG_TOK_MIC_MSG);
-        khdr->kh_flags = acceptor_flag;
-        khdr->kh_filler = 0xff;
-        khdr->kh_ec = cpu_to_be16(0xffff);
-        khdr->kh_rrc = cpu_to_be16(0xffff);
-        spin_lock(&krb5_seq_lock);
-        khdr->kh_seq = cpu_to_be64(kctx->kc_seq_send++);
-        spin_unlock(&krb5_seq_lock);
+        fill_krb5_header(kctx, khdr, 0);
  
          /* checksum */
          if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc,
-                               khdr, msgcnt, msgs, &cksum))
-                goto out_err;
+                               khdr, msgcnt, msgs, iovcnt, iovs, &cksum))
+                return GSS_S_FAILURE;
  
          LASSERT(cksum.len >= ke->ke_hash_size);
          LASSERT(token->len >= sizeof(*khdr) + ke->ke_hash_size);
@@ -754,26 +853,23 @@ __u32 gss_get_mic_kerberos(struct gss_ctx *gctx,
                 ke->ke_hash_size);
  
          token->len = sizeof(*khdr) + ke->ke_hash_size;
-        rc = GSS_S_COMPLETE;
-out_err:
          rawobj_free(&cksum);
-        return rc;
+        return GSS_S_COMPLETE;
  }
  
  static
  __u32 gss_verify_mic_kerberos(struct gss_ctx *gctx,
                                int msgcnt,
                                rawobj_t *msgs,
+                              int iovcnt,
+                              lnet_kiov_t *iovs,
                                rawobj_t *token)
  {
          struct krb5_ctx     *kctx = gctx->internal_ctx_id;
          struct krb5_enctype *ke = &enctypes[kctx->kc_enctype];
          struct krb5_header  *khdr;
-        unsigned char        acceptor_flag;
          rawobj_t             cksum = RAWOBJ_EMPTY;
-        __u32                rc = GSS_S_FAILURE;
-
-        acceptor_flag = kctx->kc_initiate ? FLAG_SENDER_IS_ACCEPTOR : 0;
+        __u32                major;
  
          if (token->len < sizeof(*khdr)) {
                  CERROR("short signature: %u\n", token->len);
@@ -782,47 +878,34 @@ __u32 gss_verify_mic_kerberos(struct gss_ctx *gctx,
  
          khdr = (struct krb5_header *) token->data;
  
-        /* sanity checks */
-        if (be16_to_cpu(khdr->kh_tok_id) != KG_TOK_MIC_MSG) {
-                CERROR("bad token id\n");
-                return GSS_S_DEFECTIVE_TOKEN;
-        }
-        if ((khdr->kh_flags & FLAG_SENDER_IS_ACCEPTOR) != acceptor_flag) {
-                CERROR("bad direction flag\n");
-                return GSS_S_BAD_SIG;
-        }
-        if (khdr->kh_filler != 0xff) {
-                CERROR("bad filler\n");
-                return GSS_S_DEFECTIVE_TOKEN;
-        }
-        if (be16_to_cpu(khdr->kh_ec) != 0xffff ||
-            be16_to_cpu(khdr->kh_rrc) != 0xffff) {
-                CERROR("bad EC or RRC\n");
-                return GSS_S_DEFECTIVE_TOKEN;
+        major = verify_krb5_header(kctx, khdr, 0);
+        if (major != GSS_S_COMPLETE) {
+                CERROR("bad krb5 header\n");
+                return major;
          }
  
          if (token->len < sizeof(*khdr) + ke->ke_hash_size) {
                  CERROR("short signature: %u, require %d\n",
                         token->len, (int) sizeof(*khdr) + ke->ke_hash_size);
-                goto out;
+                return GSS_S_FAILURE;
          }
  
          if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc,
-                               khdr, msgcnt, msgs, &cksum))
+                               khdr, msgcnt, msgs, iovcnt, iovs, &cksum)) {
+                CERROR("failed to make checksum\n");
                  return GSS_S_FAILURE;
+        }
  
          LASSERT(cksum.len >= ke->ke_hash_size);
          if (memcmp(khdr + 1, cksum.data + cksum.len - ke->ke_hash_size,
                     ke->ke_hash_size)) {
                  CERROR("checksum mismatch\n");
-                rc = GSS_S_BAD_SIG;
-                goto out;
+                rawobj_free(&cksum);
+                return GSS_S_BAD_SIG;
          }
  
-        rc = GSS_S_COMPLETE;
-out:
          rawobj_free(&cksum);
-        return rc;
+        return GSS_S_COMPLETE;
  }
  
  static
@@ -902,6 +985,195 @@ int krb5_encrypt_rawobjs(struct ll_crypto_cipher *tfm,
  }
  
  static
+int krb5_encrypt_bulk(struct ll_crypto_cipher *tfm,
+                      struct krb5_header *khdr,
+                      char *confounder,
+                      struct ptlrpc_bulk_desc *desc,
+                      rawobj_t *cipher,
+                      int adj_nob)
+{
+        struct blkcipher_desc   ciph_desc;
+        __u8                    local_iv[16] = {0};
+        struct scatterlist      src, dst;
+        int                     blocksize, i, rc, nob = 0;
+
+        LASSERT(desc->bd_iov_count);
+        LASSERT(desc->bd_enc_iov);
+
+        blocksize = ll_crypto_blkcipher_blocksize(tfm);
+        LASSERT(blocksize > 1);
+        LASSERT(cipher->len == blocksize + sizeof(*khdr));
+
+        ciph_desc.tfm  = tfm;
+        ciph_desc.info = local_iv;
+        ciph_desc.flags = 0;
+
+        /* encrypt confounder */
+        buf_to_sg(&src, confounder, blocksize);
+        buf_to_sg(&dst, cipher->data, blocksize);
+
+        rc = ll_crypto_blkcipher_encrypt_iv(&ciph_desc, &dst, &src, blocksize);
+        if (rc) {
+                CERROR("error to encrypt confounder: %d\n", rc);
+                return rc;
+        }
+
+        /* encrypt clear pages; each length is rounded up to blocksize */
+        for (i = 0; i < desc->bd_iov_count; i++) {
+                src.page = desc->bd_iov[i].kiov_page;
+                src.offset = desc->bd_iov[i].kiov_offset;
+                src.length = (desc->bd_iov[i].kiov_len + blocksize - 1) &
+                             (~(blocksize - 1));
+
+                if (adj_nob)
+                        nob += src.length;
+
+                dst.page = desc->bd_enc_iov[i].kiov_page;
+                dst.offset = src.offset;
+                dst.length = src.length;
+
+                desc->bd_enc_iov[i].kiov_offset = dst.offset;
+                desc->bd_enc_iov[i].kiov_len = dst.length;
+
+                rc = ll_crypto_blkcipher_encrypt_iv(&ciph_desc, &dst, &src,
+                                                    src.length);
+                if (rc) {
+                        CERROR("error to encrypt page: %d\n", rc);
+                        return rc;
+                }
+        }
+
+        /* encrypt krb5 header */
+        buf_to_sg(&src, khdr, sizeof(*khdr));
+        buf_to_sg(&dst, cipher->data + blocksize, sizeof(*khdr));
+
+        rc = ll_crypto_blkcipher_encrypt_iv(&ciph_desc,
+                                            &dst, &src, sizeof(*khdr));
+        if (rc) {
+                CERROR("error to encrypt krb5 header: %d\n", rc);
+                return rc;
+        }
+        /* on request, report the padded cipher text size as bd_nob */
+        if (adj_nob)
+                desc->bd_nob = nob;
+
+        return 0;
+}
+
+/*
+ * desc->bd_nob_transferred is the size of the cipher text received;
+ * desc->bd_nob is the expected size of the resulting plain text.
+ */
+static
+int krb5_decrypt_bulk(struct ll_crypto_cipher *tfm,
+                      struct krb5_header *khdr,
+                      struct ptlrpc_bulk_desc *desc,
+                      rawobj_t *cipher,
+                      rawobj_t *plain)
+{
+        struct blkcipher_desc   ciph_desc;
+        __u8                    local_iv[16] = {0};
+        struct scatterlist      src, dst;
+        int                     ct_nob = 0, pt_nob = 0;
+        int                     blocksize, i, rc;
+
+        LASSERT(desc->bd_iov_count);
+        LASSERT(desc->bd_enc_iov);
+        LASSERT(desc->bd_nob_transferred);
+
+        blocksize = ll_crypto_blkcipher_blocksize(tfm);
+        LASSERT(blocksize > 1);
+        LASSERT(cipher->len == blocksize + sizeof(*khdr));
+
+        ciph_desc.tfm  = tfm;
+        ciph_desc.info = local_iv;
+        ciph_desc.flags = 0;
+
+        if (desc->bd_nob_transferred % blocksize) {
+                CERROR("odd transferred nob: %d\n", desc->bd_nob_transferred);
+                return -EPROTO;
+        }
+
+        /* decrypt head (confounder) */
+        buf_to_sg(&src, cipher->data, blocksize);
+        buf_to_sg(&dst, plain->data, blocksize);
+
+        rc = ll_crypto_blkcipher_decrypt_iv(&ciph_desc, &dst, &src, blocksize);
+        if (rc) {
+                CERROR("error to decrypt confounder: %d\n", rc);
+                return rc;
+        }
+
+        /*
+         * decrypt the cipher pages. note the enc_iov was prepared by
+         * prep_bulk(), which has already done some sanity checking.
+         *
+         * desc->bd_nob is the plain text size expected to be
+         * transferred; desc->bd_nob_transferred is the size of the
+         * cipher text actually received.
+         */
+        for (i = 0; i < desc->bd_iov_count && ct_nob < desc->bd_nob_transferred;
+             i++) {
+                if (desc->bd_enc_iov[i].kiov_len == 0)
+                        continue;
+
+                if (ct_nob + desc->bd_enc_iov[i].kiov_len >
+                    desc->bd_nob_transferred)
+                        desc->bd_enc_iov[i].kiov_len =
+                                desc->bd_nob_transferred - ct_nob;
+
+                desc->bd_iov[i].kiov_len = desc->bd_enc_iov[i].kiov_len;
+                if (pt_nob + desc->bd_enc_iov[i].kiov_len > desc->bd_nob)
+                        desc->bd_iov[i].kiov_len = desc->bd_nob - pt_nob;
+
+                src.page = desc->bd_enc_iov[i].kiov_page;
+                src.offset = desc->bd_enc_iov[i].kiov_offset;
+                src.length = desc->bd_enc_iov[i].kiov_len;
+
+                dst = src;
+
+                if (desc->bd_iov[i].kiov_offset % blocksize == 0)
+                        dst.page = desc->bd_iov[i].kiov_page;
+
+                rc = ll_crypto_blkcipher_decrypt_iv(&ciph_desc, &dst, &src,
+                                                    src.length);
+                if (rc) {
+                        CERROR("error to decrypt page: %d\n", rc);
+                        return rc;
+                }
+                /* unaligned offset: decrypted in place, copy to clear page */
+                if (desc->bd_iov[i].kiov_offset % blocksize) {
+                        memcpy(cfs_page_address(desc->bd_iov[i].kiov_page) +
+                               desc->bd_iov[i].kiov_offset,
+                               cfs_page_address(desc->bd_enc_iov[i].kiov_page) +
+                               desc->bd_iov[i].kiov_offset,
+                               desc->bd_iov[i].kiov_len);
+                }
+
+                ct_nob += desc->bd_enc_iov[i].kiov_len;
+                pt_nob += desc->bd_iov[i].kiov_len;
+        }
+
+        /* decrypt tail (krb5 header) */
+        buf_to_sg(&src, cipher->data + blocksize, sizeof(*khdr));
+        buf_to_sg(&dst, cipher->data + blocksize, sizeof(*khdr));
+
+        rc = ll_crypto_blkcipher_decrypt_iv(&ciph_desc,
+                                            &dst, &src, sizeof(*khdr));
+        if (rc) {
+                CERROR("error to decrypt tail: %d\n", rc);
+                return rc;
+        }
+
+        if (memcmp(cipher->data + blocksize, khdr, sizeof(*khdr))) {
+                CERROR("krb5 header doesn't match\n");
+                return -EACCES;
+        }
+
+        return 0;
+}
+
+static
  __u32 gss_wrap_kerberos(struct gss_ctx *gctx,
                          rawobj_t *gsshdr,
                          rawobj_t *msg,
@@ -911,12 +1183,11 @@ __u32 gss_wrap_kerberos(struct gss_ctx *gctx,
          struct krb5_ctx     *kctx = gctx->internal_ctx_id;
          struct krb5_enctype *ke = &enctypes[kctx->kc_enctype];
          struct krb5_header  *khdr;
-        unsigned char        acceptor_flag;
          int                  blocksize;
          rawobj_t             cksum = RAWOBJ_EMPTY;
-        rawobj_t             data_desc[4], cipher;
+        rawobj_t             data_desc[3], cipher;
          __u8                 conf[GSS_MAX_CIPHER_BLOCK];
-        int                  enc_rc = 0;
+        int                  rc = 0;
  
          LASSERT(ke);
          LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK);
@@ -934,16 +1205,7 @@ __u32 gss_wrap_kerberos(struct gss_ctx *gctx,
          /* fill krb5 header */
          LASSERT(token->len >= sizeof(*khdr));
          khdr = (struct krb5_header *) token->data;
-        acceptor_flag = kctx->kc_initiate ? 0 : FLAG_SENDER_IS_ACCEPTOR;
-
-        khdr->kh_tok_id = cpu_to_be16(KG_TOK_WRAP_MSG);
-        khdr->kh_flags = acceptor_flag | FLAG_WRAP_CONFIDENTIAL;
-        khdr->kh_filler = 0xff;
-        khdr->kh_ec = cpu_to_be16(0);
-        khdr->kh_rrc = cpu_to_be16(0);
-        spin_lock(&krb5_seq_lock);
-        khdr->kh_seq = cpu_to_be64(kctx->kc_seq_send++);
-        spin_unlock(&krb5_seq_lock);
+        fill_krb5_header(kctx, khdr, 1);
  
          /* generate confounder */
          get_random_bytes(conf, ke->ke_conf_size);
@@ -975,12 +1237,10 @@ __u32 gss_wrap_kerberos(struct gss_ctx *gctx,
          data_desc[1].len = gsshdr->len;
          data_desc[2].data = msg->data;
          data_desc[2].len = msg->len;
-        data_desc[3].data = (__u8 *) khdr;
-        data_desc[3].len = sizeof(*khdr);
  
          /* compute checksum */
          if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi,
-                               khdr, 4, data_desc, &cksum))
+                               khdr, 3, data_desc, 0, NULL, &cksum))
                  return GSS_S_FAILURE;
          LASSERT(cksum.len >= ke->ke_hash_size);
  
@@ -1007,26 +1267,26 @@ __u32 gss_wrap_kerberos(struct gss_ctx *gctx,
                  struct ll_crypto_cipher *arc4_tfm;
  
                  if (krb5_make_checksum(ENCTYPE_ARCFOUR_HMAC, &kctx->kc_keyi,
-                                       NULL, 1, &cksum, &arc4_keye)) {
+                                       NULL, 1, &cksum, 0, NULL, &arc4_keye)) {
                          CERROR("failed to obtain arc4 enc key\n");
-                        GOTO(arc4_out, enc_rc = -EACCES);
+                        GOTO(arc4_out, rc = -EACCES);
                  }
  
                  arc4_tfm = ll_crypto_alloc_blkcipher("ecb(arc4)", 0, 0);
                  if (arc4_tfm == NULL) {
                          CERROR("failed to alloc tfm arc4 in ECB mode\n");
-                        GOTO(arc4_out_key, enc_rc = -EACCES);
+                        GOTO(arc4_out_key, rc = -EACCES);
                  }
  
                  if (ll_crypto_blkcipher_setkey(arc4_tfm, arc4_keye.data,
                                                 arc4_keye.len)) {
                          CERROR("failed to set arc4 key, len %d\n",
                                 arc4_keye.len);
-                        GOTO(arc4_out_tfm, enc_rc = -EACCES);
+                        GOTO(arc4_out_tfm, rc = -EACCES);
                  }
  
-                enc_rc = krb5_encrypt_rawobjs(arc4_tfm, 1,
-                                              3, data_desc, &cipher, 1);
+                rc = krb5_encrypt_rawobjs(arc4_tfm, 1,
+                                          3, data_desc, &cipher, 1);
  arc4_out_tfm:
                  ll_crypto_free_blkcipher(arc4_tfm);
  arc4_out_key:
@@ -1034,11 +1294,155 @@ arc4_out_key:
  arc4_out:
                  do {} while(0); /* just to avoid compile warning */
          } else {
-                enc_rc = krb5_encrypt_rawobjs(kctx->kc_keye.kb_tfm, 0,
-                                              3, data_desc, &cipher, 1);
+                rc = krb5_encrypt_rawobjs(kctx->kc_keye.kb_tfm, 0,
+                                          3, data_desc, &cipher, 1);
+        }
+
+        if (rc != 0) {
+                rawobj_free(&cksum);
+                return GSS_S_FAILURE;
+        }
+
+        /* fill in checksum */
+        LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size);
+        memcpy((char *)(khdr + 1) + cipher.len,
+               cksum.data + cksum.len - ke->ke_hash_size,
+               ke->ke_hash_size);
+        rawobj_free(&cksum);
+
+        /* final token length */
+        token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size;
+        return GSS_S_COMPLETE;
+}
+
+static
+__u32 gss_prep_bulk_kerberos(struct gss_ctx *gctx,
+                             struct ptlrpc_bulk_desc *desc)
+{
+        struct krb5_ctx     *kctx = gctx->internal_ctx_id;
+        int                  blocksize, i;
+
+        LASSERT(desc->bd_iov_count);
+        LASSERT(desc->bd_enc_iov);
+        LASSERT(kctx->kc_keye.kb_tfm);
+
+        blocksize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm);
+        /* fill enc_iov from bd_iov, lengths rounded up to blocksize */
+        for (i = 0; i < desc->bd_iov_count; i++) {
+                LASSERT(desc->bd_enc_iov[i].kiov_page);
+                /*
+                 * reject offsets that are not cipher-block aligned: test
+                 * every bit below blocksize (assumed to be a power of two).
+                 */
+                if (desc->bd_iov[i].kiov_offset & (blocksize - 1)) {
+                        CERROR("odd offset %d in page %d\n",
+                               desc->bd_iov[i].kiov_offset, i);
+                        return GSS_S_FAILURE;
+                }
+
+                desc->bd_enc_iov[i].kiov_offset = desc->bd_iov[i].kiov_offset;
+                desc->bd_enc_iov[i].kiov_len = (desc->bd_iov[i].kiov_len +
+                                                blocksize - 1) & (~(blocksize - 1));
+        }
+
+        return GSS_S_COMPLETE;
+}
+
+static
+__u32 gss_wrap_bulk_kerberos(struct gss_ctx *gctx,
+                             struct ptlrpc_bulk_desc *desc,
+                             rawobj_t *token, int adj_nob)
+{
+        struct krb5_ctx     *kctx = gctx->internal_ctx_id;
+        struct krb5_enctype *ke = &enctypes[kctx->kc_enctype];
+        struct krb5_header  *khdr;
+        int                  blocksize;
+        rawobj_t             cksum = RAWOBJ_EMPTY;
+        rawobj_t             data_desc[1], cipher;
+        __u8                 conf[GSS_MAX_CIPHER_BLOCK];
+        int                  rc = 0;
+
+        LASSERT(ke);
+        LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK);
+
+        /*
+         * final token format:
+         * --------------------------------------------------
+         * | krb5 header | head/tail cipher text | checksum |
+         * --------------------------------------------------
+         */
+
+        /* fill krb5 header */
+        LASSERT(token->len >= sizeof(*khdr));
+        khdr = (struct krb5_header *) token->data;
+        fill_krb5_header(kctx, khdr, 1);
+
+        /* generate confounder */
+        get_random_bytes(conf, ke->ke_conf_size);
+
+        /* get encryption blocksize. note kc_keye might not associated with
+         * a tfm, currently only for arcfour-hmac */
+        if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+                LASSERT(kctx->kc_keye.kb_tfm == NULL);
+                blocksize = 1;
+        } else {
+                LASSERT(kctx->kc_keye.kb_tfm);
+                blocksize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm);
+        }
+
+        /*
+         * we assume the size of krb5_header (16 bytes) must be n * blocksize.
+         * the bulk token size would be exactly (sizeof(krb5_header) +
+         * blocksize + sizeof(krb5_header) + hashsize)
+         */
+        LASSERT(blocksize <= ke->ke_conf_size);
+        LASSERT(sizeof(*khdr) >= blocksize && sizeof(*khdr) % blocksize == 0);
+        LASSERT(token->len >= sizeof(*khdr) + blocksize + sizeof(*khdr) + 16);
+
+        /*
+         * clear text layout for checksum:
+         * ------------------------------------------
+         * | confounder | clear pages | krb5 header |
+         * ------------------------------------------
+         */
+        data_desc[0].data = conf;
+        data_desc[0].len = ke->ke_conf_size;
+
+        /* compute checksum */
+        if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi,
+                               khdr, 1, data_desc,
+                               desc->bd_iov_count, desc->bd_iov,
+                               &cksum))
+                return GSS_S_FAILURE;
+        LASSERT(cksum.len >= ke->ke_hash_size);
+
+        /*
+         * clear text layout for encryption:
+         * ------------------------------------------
+         * | confounder | clear pages | krb5 header |
+         * ------------------------------------------
+         *        |              |             |
+         *        ----------  (cipher pages)   |
+         * result token:   |                   |
+         * -------------------------------------------
+         * | krb5 header | cipher text | cipher text |
+         * -------------------------------------------
+         */
+        data_desc[0].data = conf;
+        data_desc[0].len = ke->ke_conf_size;
+
+        cipher.data = (__u8 *) (khdr + 1);
+        cipher.len = blocksize + sizeof(*khdr);
+
+        if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+                LBUG();
+                rc = 0;
+        } else {
+                rc = krb5_encrypt_bulk(kctx->kc_keye.kb_tfm, khdr,
+                                       conf, desc, &cipher, adj_nob);
          }
  
-        if (enc_rc != 0) {
+        if (rc != 0) {
                  rawobj_free(&cksum);
                  return GSS_S_FAILURE;
          }
@@ -1064,18 +1468,16 @@ __u32 gss_unwrap_kerberos(struct gss_ctx  *gctx,
          struct krb5_ctx     *kctx = gctx->internal_ctx_id;
          struct krb5_enctype *ke = &enctypes[kctx->kc_enctype];
          struct krb5_header  *khdr;
-        unsigned char        acceptor_flag;
          unsigned char       *tmpbuf;
          int                  blocksize, bodysize;
          rawobj_t             cksum = RAWOBJ_EMPTY;
          rawobj_t             cipher_in, plain_out;
          rawobj_t             hash_objs[3];
-        __u32                rc = GSS_S_FAILURE, enc_rc = 0;
+        int                  rc = 0;
+        __u32                major;
  
          LASSERT(ke);
  
-        acceptor_flag = kctx->kc_initiate ? FLAG_SENDER_IS_ACCEPTOR : 0;
-
          if (token->len < sizeof(*khdr)) {
                  CERROR("short signature: %u\n", token->len);
                  return GSS_S_DEFECTIVE_TOKEN;
@@ -1083,27 +1485,10 @@ __u32 gss_unwrap_kerberos(struct gss_ctx  *gctx,
  
          khdr = (struct krb5_header *) token->data;
  
-        /* sanity check header */
-        if (be16_to_cpu(khdr->kh_tok_id) != KG_TOK_WRAP_MSG) {
-                CERROR("bad token id\n");
-                return GSS_S_DEFECTIVE_TOKEN;
-        }
-        if ((khdr->kh_flags & FLAG_SENDER_IS_ACCEPTOR) != acceptor_flag) {
-                CERROR("bad direction flag\n");
-                return GSS_S_BAD_SIG;
-        }
-        if ((khdr->kh_flags & FLAG_WRAP_CONFIDENTIAL) == 0) {
-                CERROR("missing confidential flag\n");
-                return GSS_S_BAD_SIG;
-        }
-        if (khdr->kh_filler != 0xff) {
-                CERROR("bad filler\n");
-                return GSS_S_DEFECTIVE_TOKEN;
-        }
-        if (be16_to_cpu(khdr->kh_ec) != 0x0 ||
-            be16_to_cpu(khdr->kh_rrc) != 0x0) {
-                CERROR("bad EC or RRC\n");
-                return GSS_S_DEFECTIVE_TOKEN;
+        major = verify_krb5_header(kctx, khdr, 1);
+        if (major != GSS_S_COMPLETE) {
+                CERROR("bad krb5 header\n");
+                return major;
          }
  
          /* block size */
@@ -1143,6 +1528,8 @@ __u32 gss_unwrap_kerberos(struct gss_ctx  *gctx,
          if (!tmpbuf)
                  return GSS_S_FAILURE;
  
+        major = GSS_S_FAILURE;
+
          cipher_in.data = (__u8 *) (khdr + 1);
          cipher_in.len = bodysize;
          plain_out.data = tmpbuf;
@@ -1156,26 +1543,26 @@ __u32 gss_unwrap_kerberos(struct gss_ctx  *gctx,
                  cksum.len = ke->ke_hash_size;
  
                  if (krb5_make_checksum(ENCTYPE_ARCFOUR_HMAC, &kctx->kc_keyi,
-                                       NULL, 1, &cksum, &arc4_keye)) {
+                                       NULL, 1, &cksum, 0, NULL, &arc4_keye)) {
                          CERROR("failed to obtain arc4 enc key\n");
-                        GOTO(arc4_out, enc_rc = -EACCES);
+                        GOTO(arc4_out, rc = -EACCES);
                  }
  
                  arc4_tfm = ll_crypto_alloc_blkcipher("ecb(arc4)", 0, 0);
                  if (arc4_tfm == NULL) {
                          CERROR("failed to alloc tfm arc4 in ECB mode\n");
-                        GOTO(arc4_out_key, enc_rc = -EACCES);
+                        GOTO(arc4_out_key, rc = -EACCES);
                  }
  
                  if (ll_crypto_blkcipher_setkey(arc4_tfm,
                                           arc4_keye.data, arc4_keye.len)) {
                          CERROR("failed to set arc4 key, len %d\n",
                                 arc4_keye.len);
-                        GOTO(arc4_out_tfm, enc_rc = -EACCES);
+                        GOTO(arc4_out_tfm, rc = -EACCES);
                  }
  
-                enc_rc = krb5_encrypt_rawobjs(arc4_tfm, 1,
-                                              1, &cipher_in, &plain_out, 0);
+                rc = krb5_encrypt_rawobjs(arc4_tfm, 1,
+                                          1, &cipher_in, &plain_out, 0);
  arc4_out_tfm:
                  ll_crypto_free_blkcipher(arc4_tfm);
  arc4_out_key:
@@ -1183,11 +1570,11 @@ arc4_out_key:
  arc4_out:
                  cksum = RAWOBJ_EMPTY;
          } else {
-                enc_rc = krb5_encrypt_rawobjs(kctx->kc_keye.kb_tfm, 0,
-                                              1, &cipher_in, &plain_out, 0);
+                rc = krb5_encrypt_rawobjs(kctx->kc_keye.kb_tfm, 0,
+                                          1, &cipher_in, &plain_out, 0);
          }
  
-        if (enc_rc != 0) {
+        if (rc != 0) {
                  CERROR("error decrypt\n");
                  goto out_free;
          }
@@ -1215,46 +1602,119 @@ arc4_out:
          hash_objs[0].data = plain_out.data;
          hash_objs[1].len = gsshdr->len;
          hash_objs[1].data = gsshdr->data;
-        hash_objs[2].len = plain_out.len - ke->ke_conf_size;
+        hash_objs[2].len = plain_out.len - ke->ke_conf_size - sizeof(*khdr);
          hash_objs[2].data = plain_out.data + ke->ke_conf_size;
          if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi,
-                               khdr, 3, hash_objs, &cksum))
+                               khdr, 3, hash_objs, 0, NULL, &cksum))
                  goto out_free;
  
          LASSERT(cksum.len >= ke->ke_hash_size);
          if (memcmp((char *)(khdr + 1) + bodysize,
                     cksum.data + cksum.len - ke->ke_hash_size,
                     ke->ke_hash_size)) {
-                CERROR("cksum mismatch\n");
+                CERROR("checksum mismatch\n");
                  goto out_free;
          }
  
          msg->len =  bodysize - ke->ke_conf_size - sizeof(*khdr);
          memcpy(msg->data, tmpbuf + ke->ke_conf_size, msg->len);
  
-        rc = GSS_S_COMPLETE;
+        major = GSS_S_COMPLETE;
  out_free:
          OBD_FREE(tmpbuf, bodysize);
          rawobj_free(&cksum);
-        return rc;
+        return major;
  }
  
  static
-__u32 gss_plain_encrypt_kerberos(struct gss_ctx  *ctx,
-                                 int              decrypt,
-                                 int              length,
-                                 void            *in_buf,
-                                 void            *out_buf)
+__u32 gss_unwrap_bulk_kerberos(struct gss_ctx *gctx,
+                               struct ptlrpc_bulk_desc *desc,
+                               rawobj_t *token)
  {
-        struct krb5_ctx        *kctx = ctx->internal_ctx_id;
-        __u32                   rc;
+        struct krb5_ctx     *kctx = gctx->internal_ctx_id;
+        struct krb5_enctype *ke = &enctypes[kctx->kc_enctype];
+        struct krb5_header  *khdr;
+        int                  blocksize;
+        rawobj_t             cksum = RAWOBJ_EMPTY;
+        rawobj_t             cipher, plain;
+        rawobj_t             data_desc[1];
+        int                  rc;
+        __u32                major;
+
+        LASSERT(ke);
+
+        if (token->len < sizeof(*khdr)) {
+                CERROR("short signature: %u\n", token->len);
+                return GSS_S_DEFECTIVE_TOKEN;
+        }
+
+        khdr = (struct krb5_header *) token->data;
+
+        major = verify_krb5_header(kctx, khdr, 1);
+        if (major != GSS_S_COMPLETE) {
+                CERROR("bad krb5 header\n");
+                return major;
+        }
+
+        /* block size */
+        if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+                LASSERT(kctx->kc_keye.kb_tfm == NULL);
+                blocksize = 1;
+                LBUG();
+        } else {
+                LASSERT(kctx->kc_keye.kb_tfm);
+                blocksize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm);
+        }
+        LASSERT(sizeof(*khdr) >= blocksize && sizeof(*khdr) % blocksize == 0);
+
+        /*
+         * token format is expected as:
+         * -----------------------------------------------
+         * | krb5 header | head/tail cipher text | cksum |
+         * -----------------------------------------------
+         */
+        if (token->len < sizeof(*khdr) + blocksize + sizeof(*khdr) +
+                         ke->ke_hash_size) {
+                CERROR("short token size: %u\n", token->len);
+                return GSS_S_DEFECTIVE_TOKEN;
+        }
+
+        cipher.data = (__u8 *) (khdr + 1);
+        cipher.len = blocksize + sizeof(*khdr);
+        plain.data = cipher.data;
+        plain.len = cipher.len;
  
-        rc = krb5_encrypt(kctx->kc_keye.kb_tfm, decrypt,
-                          NULL, in_buf, out_buf, length);
+        rc = krb5_decrypt_bulk(kctx->kc_keye.kb_tfm, khdr,
+                               desc, &cipher, &plain);
          if (rc)
-                CERROR("plain encrypt error: %d\n", rc);
+                return GSS_S_DEFECTIVE_TOKEN;
+
+        /*
+         * verify checksum, compose clear text as layout:
+         * ------------------------------------------
+         * | confounder | clear pages | krb5 header |
+         * ------------------------------------------
+         */
+        data_desc[0].data = plain.data;
+        data_desc[0].len = blocksize;
+
+        if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi,
+                               khdr, 1, data_desc,
+                               desc->bd_iov_count, desc->bd_iov,
+                               &cksum))
+                return GSS_S_FAILURE;
+        LASSERT(cksum.len >= ke->ke_hash_size);
+
+        if (memcmp(plain.data + blocksize + sizeof(*khdr),
+                   cksum.data + cksum.len - ke->ke_hash_size,
+                   ke->ke_hash_size)) {
+                CERROR("checksum mismatch\n");
+                rawobj_free(&cksum);
+                return GSS_S_BAD_SIG;
+        }
  
-        return rc;
+        rawobj_free(&cksum);
+        return GSS_S_COMPLETE;
  }
  
  int gss_display_kerberos(struct gss_ctx        *ctx,
@@ -1277,7 +1737,9 @@ static struct gss_api_ops gss_kerberos_ops = {
          .gss_verify_mic             = gss_verify_mic_kerberos,
          .gss_wrap                   = gss_wrap_kerberos,
          .gss_unwrap                 = gss_unwrap_kerberos,
-        .gss_plain_encrypt          = gss_plain_encrypt_kerberos,
+        .gss_prep_bulk              = gss_prep_bulk_kerberos,
+        .gss_wrap_bulk              = gss_wrap_bulk_kerberos,
+        .gss_unwrap_bulk            = gss_unwrap_bulk_kerberos,
          .gss_delete_sec_context     = gss_delete_sec_context_kerberos,
          .gss_display                = gss_display_kerberos,
  };
diff --git a/lustre/ptlrpc/gss/gss_mech_switch.c b/lustre/ptlrpc/gss/gss_mech_switch.c

index 8a4e627..ca55fe8 100644 (file)
--- a/lustre/ptlrpc/gss/gss_mech_switch.c
+++ b/lustre/ptlrpc/gss/gss_mech_switch.c
@@ -214,6 +214,8 @@ __u32 lgss_inquire_context(struct gss_ctx *context_handle,
  __u32 lgss_get_mic(struct gss_ctx *context_handle,
                     int msgcnt,
                     rawobj_t *msg,
+                   int iovcnt,
+                   lnet_kiov_t *iovs,
                     rawobj_t *mic_token)
  {
          LASSERT(context_handle);
@@ -225,6 +227,8 @@ __u32 lgss_get_mic(struct gss_ctx *context_handle,
                  ->gss_get_mic(context_handle,
                                msgcnt,
                                msg,
+                              iovcnt,
+                              iovs,
                                mic_token);
  }
  
@@ -232,6 +236,8 @@ __u32 lgss_get_mic(struct gss_ctx *context_handle,
  __u32 lgss_verify_mic(struct gss_ctx *context_handle,
                        int msgcnt,
                        rawobj_t *msg,
+                      int iovcnt,
+                      lnet_kiov_t *iovs,
                        rawobj_t *mic_token)
  {
          LASSERT(context_handle);
@@ -243,6 +249,8 @@ __u32 lgss_verify_mic(struct gss_ctx *context_handle,
                  ->gss_verify_mic(context_handle,
                                   msgcnt,
                                   msg,
+                                 iovcnt,
+                                 iovs,
                                   mic_token);
  }
  
@@ -276,19 +284,43 @@ __u32 lgss_unwrap(struct gss_ctx *context_handle,
  }
  
  
-__u32 lgss_plain_encrypt(struct gss_ctx *ctx,
-                         int decrypt,
-                         int length,
-                         void *in_buf,
-                         void *out_buf)
+/*
+ * lgss_prep_bulk: forward @desc to the gss_prep_bulk operation of the
+ * mechanism bound to @context_handle and return that operation's result.
+ * The context, its mechanism, the ops table and the hook itself are all
+ * asserted to be non-NULL.
+ */
+__u32 lgss_prep_bulk(struct gss_ctx *context_handle,
+                     struct ptlrpc_bulk_desc *desc)
  {
-        LASSERT(ctx);
-        LASSERT(ctx->mech_type);
-        LASSERT(ctx->mech_type->gm_ops);
-        LASSERT(ctx->mech_type->gm_ops->gss_plain_encrypt);
+        LASSERT(context_handle);
+        LASSERT(context_handle->mech_type);
+        LASSERT(context_handle->mech_type->gm_ops);
+        LASSERT(context_handle->mech_type->gm_ops->gss_prep_bulk);
  
-        return ctx->mech_type->gm_ops
-                ->gss_plain_encrypt(ctx, decrypt, length, in_buf, out_buf);
+        return context_handle->mech_type->gm_ops
+                ->gss_prep_bulk(context_handle, desc);
+}
+
+/*
+ * lgss_wrap_bulk: forward @desc, @token and @adj_nob unchanged to the
+ * gss_wrap_bulk operation of the mechanism bound to @context_handle and
+ * return that operation's result. The context, its mechanism, the ops
+ * table and the hook itself are all asserted to be non-NULL.
+ */
+__u32 lgss_wrap_bulk(struct gss_ctx *context_handle,
+                     struct ptlrpc_bulk_desc *desc,
+                     rawobj_t *token,
+                     int adj_nob)
+{
+        LASSERT(context_handle);
+        LASSERT(context_handle->mech_type);
+        LASSERT(context_handle->mech_type->gm_ops);
+        LASSERT(context_handle->mech_type->gm_ops->gss_wrap_bulk);
+
+        return context_handle->mech_type->gm_ops
+                ->gss_wrap_bulk(context_handle, desc, token, adj_nob);
+}
+
+/*
+ * lgss_unwrap_bulk: forward @desc and @token unchanged to the
+ * gss_unwrap_bulk operation of the mechanism bound to @context_handle and
+ * return that operation's result. The context, its mechanism, the ops
+ * table and the hook itself are all asserted to be non-NULL.
+ */
+__u32 lgss_unwrap_bulk(struct gss_ctx *context_handle,
+                       struct ptlrpc_bulk_desc *desc,
+                       rawobj_t *token)
+{
+        LASSERT(context_handle);
+        LASSERT(context_handle->mech_type);
+        LASSERT(context_handle->mech_type->gm_ops);
+        LASSERT(context_handle->mech_type->gm_ops->gss_unwrap_bulk);
+
+        return context_handle->mech_type->gm_ops
+                ->gss_unwrap_bulk(context_handle, desc, token);
  }
  
  /* gss_delete_sec_context: free all resources associated with context_handle.
diff --git a/lustre/ptlrpc/gss/sec_gss.c b/lustre/ptlrpc/gss/sec_gss.c

index f3aae3f..9b531f2 100644 (file)
--- a/lustre/ptlrpc/gss/sec_gss.c
+++ b/lustre/ptlrpc/gss/sec_gss.c
@@ -182,7 +182,7 @@ static int gss_sign_msg(struct lustre_msg *msg,
                          rawobj_t *handle)
  {
          struct gss_header      *ghdr;
-        rawobj_t                text[3], mic;
+        rawobj_t                text[4], mic;
          int                     textcnt, max_textcnt, mic_idx;
          __u32                   major;
  
@@ -223,7 +223,7 @@ static int gss_sign_msg(struct lustre_msg *msg,
          mic.len = msg->lm_buflens[mic_idx];
          mic.data = lustre_msg_buf(msg, mic_idx, 0);
  
-        major = lgss_get_mic(mechctx, textcnt, text, &mic);
+        major = lgss_get_mic(mechctx, textcnt, text, 0, NULL, &mic);
          if (major != GSS_S_COMPLETE) {
                  CERROR("fail to generate MIC: %08x\n", major);
                  return -EPERM;
@@ -241,7 +241,7 @@ __u32 gss_verify_msg(struct lustre_msg *msg,
                       struct gss_ctx *mechctx,
                       __u32 svc)
  {
-        rawobj_t        text[3], mic;
+        rawobj_t        text[4], mic;
          int             textcnt, max_textcnt;
          int             mic_idx;
          __u32           major;
@@ -262,7 +262,7 @@ __u32 gss_verify_msg(struct lustre_msg *msg,
          mic.len = msg->lm_buflens[mic_idx];
          mic.data = lustre_msg_buf(msg, mic_idx, 0);
  
-        major = lgss_verify_mic(mechctx, textcnt, text, &mic);
+        major = lgss_verify_mic(mechctx, textcnt, text, 0, NULL, &mic);
          if (major != GSS_S_COMPLETE)
                  CERROR("mic verify error: %08x\n", major);
  
@@ -584,6 +584,33 @@ static inline int gss_cli_payload(struct ptlrpc_cli_ctx *ctx,
          return gss_mech_payload(NULL, msgsize, privacy);
  }
  
+/*
+ * Size of the bulk security descriptor segment for a client message.
+ *
+ * Every message gets at least sizeof(struct ptlrpc_bulk_sec_desc). When
+ * @reply and @read are both zero or both non-zero, the size reported by
+ * gss_cli_payload() for the flavor's bulk service is added on top (nothing
+ * for the null service). The request-buffer allocators pass @reply = 0,
+ * the reply-buffer allocators pass @reply = 1.
+ */
+static int gss_cli_bulk_payload(struct ptlrpc_cli_ctx *ctx,
+                                struct sptlrpc_flavor *flvr,
+                                int reply, int read)
+{
+        int     size = sizeof(struct ptlrpc_bulk_sec_desc);
+
+        LASSERT(SPTLRPC_FLVR_BULK_TYPE(flvr->sf_rpc) == SPTLRPC_BULK_DEFAULT);
+
+        /* @reply and @read disagree: the bare descriptor is all we need */
+        if (!reply != !read)
+                return size;
+
+        switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) {
+        case SPTLRPC_BULK_SVC_NULL:
+                break;
+        case SPTLRPC_BULK_SVC_INTG:
+                size += gss_cli_payload(ctx, 0, 0);
+                break;
+        case SPTLRPC_BULK_SVC_PRIV:
+                size += gss_cli_payload(ctx, 0, 1);
+                break;
+        case SPTLRPC_BULK_SVC_AUTH:
+        default:
+                LBUG();
+        }
+
+        return size;
+}
+
  int gss_cli_ctx_match(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred)
  {
          return (ctx->cc_vcred.vc_uid == vcred->vc_uid);
@@ -627,7 +654,7 @@ int gss_cli_ctx_sign(struct ptlrpc_cli_ctx *ctx,
          if (req->rq_ctx_init)
                  RETURN(0);
  
-        svc = RPC_FLVR_SVC(req->rq_flvr.sf_rpc);
+        svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc);
          if (req->rq_pack_bulk)
                  flags |= LUSTRE_GSS_PACK_BULK;
          if (req->rq_pack_udesc)
@@ -798,8 +825,10 @@ int gss_cli_ctx_verify(struct ptlrpc_cli_ctx *ctx,
                          gss_header_swabber(ghdr);
  
                  major = gss_verify_msg(msg, gctx->gc_mechctx, reqhdr->gh_svc);
-                if (major != GSS_S_COMPLETE)
+                if (major != GSS_S_COMPLETE) {
+                        CERROR("failed to verify reply: %x\n", major);
                          RETURN(-EPERM);
+                }
  
                  if (req->rq_early && reqhdr->gh_svc == SPTLRPC_SVC_NULL) {
                          __u32 cksum;
@@ -996,6 +1025,7 @@ int gss_cli_ctx_unseal(struct ptlrpc_cli_ctx *ctx,
                  major = gss_unseal_msg(gctx->gc_mechctx, msg,
                                         &msglen, req->rq_repdata_len);
                  if (major != GSS_S_COMPLETE) {
+                        CERROR("failed to unwrap reply: %x\n", major);
                          rc = -EPERM;
                          break;
                  }
@@ -1018,7 +1048,7 @@ int gss_cli_ctx_unseal(struct ptlrpc_cli_ctx *ctx,
                          }
  
                          /* bulk checksum is the last segment */
-                        if (bulk_sec_desc_unpack(msg, msg->lm_bufcount-1))
+                        if (bulk_sec_desc_unpack(msg, msg->lm_bufcount - 1))
                                  RETURN(-EPROTO);
                  }
  
@@ -1067,12 +1097,13 @@ int gss_sec_create_common(struct gss_sec *gsec,
          struct ptlrpc_sec   *sec;
  
          LASSERT(imp);
-        LASSERT(RPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_GSS);
+        LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_GSS);
  
-        gsec->gs_mech = lgss_subflavor_to_mech(RPC_FLVR_SUB(sf->sf_rpc));
+        gsec->gs_mech = lgss_subflavor_to_mech(
+                                SPTLRPC_FLVR_BASE_SUB(sf->sf_rpc));
          if (!gsec->gs_mech) {
                  CERROR("gss backend 0x%x not found\n",
-                       RPC_FLVR_SUB(sf->sf_rpc));
+                       SPTLRPC_FLVR_BASE_SUB(sf->sf_rpc));
                  return -EOPNOTSUPP;
          }
  
@@ -1099,8 +1130,7 @@ int gss_sec_create_common(struct gss_sec *gsec,
                  sec->ps_gc_interval = 0;
          }
  
-        if (sec->ps_flvr.sf_bulk_ciph != BULK_CIPH_ALG_NULL &&
-            sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_BULK)
+        if (SPTLRPC_FLVR_BULK_SVC(sec->ps_flvr.sf_rpc) == SPTLRPC_BULK_SVC_PRIV)
                  sptlrpc_enc_pool_add_user();
  
          CDEBUG(D_SEC, "create %s%s@%p\n", (svcctx ? "reverse " : ""),
@@ -1124,8 +1154,7 @@ void gss_sec_destroy_common(struct gss_sec *gsec)
  
          class_import_put(sec->ps_import);
  
-        if (sec->ps_flvr.sf_bulk_ciph != BULK_CIPH_ALG_NULL &&
-            sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_BULK)
+        if (SPTLRPC_FLVR_BULK_SVC(sec->ps_flvr.sf_rpc) == SPTLRPC_BULK_SVC_PRIV)
                  sptlrpc_enc_pool_del_user();
  
          EXIT;
@@ -1247,9 +1276,9 @@ int gss_alloc_reqbuf_intg(struct ptlrpc_sec *sec,
          }
  
          if (req->rq_pack_bulk) {
-                buflens[bufcnt] = bulk_sec_desc_size(
-                                                req->rq_flvr.sf_bulk_hash, 1,
-                                                req->rq_bulk_read);
+                buflens[bufcnt] = gss_cli_bulk_payload(req->rq_cli_ctx,
+                                                       &req->rq_flvr,
+                                                       0, req->rq_bulk_read);
                  if (svc == SPTLRPC_SVC_INTG)
                          txtsize += buflens[bufcnt];
                  bufcnt++;
@@ -1313,9 +1342,9 @@ int gss_alloc_reqbuf_priv(struct ptlrpc_sec *sec,
          if (req->rq_pack_udesc)
                  ibuflens[ibufcnt++] = sptlrpc_current_user_desc_size();
          if (req->rq_pack_bulk)
-                ibuflens[ibufcnt++] = bulk_sec_desc_size(
-                                                req->rq_flvr.sf_bulk_hash, 1,
-                                                req->rq_bulk_read);
+                ibuflens[ibufcnt++] = gss_cli_bulk_payload(req->rq_cli_ctx,
+                                                           &req->rq_flvr, 0,
+                                                           req->rq_bulk_read);
  
          clearsize = lustre_msg_size_v2(ibufcnt, ibuflens);
          /* to allow append padding during encryption */
@@ -1375,7 +1404,7 @@ int gss_alloc_reqbuf(struct ptlrpc_sec *sec,
                       struct ptlrpc_request *req,
                       int msgsize)
  {
-        int     svc = RPC_FLVR_SVC(req->rq_flvr.sf_rpc);
+        int     svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc);
  
          LASSERT(!req->rq_pack_bulk ||
                  (req->rq_bulk_read || req->rq_bulk_write));
@@ -1400,7 +1429,7 @@ void gss_free_reqbuf(struct ptlrpc_sec *sec,
          ENTRY;
  
          LASSERT(!req->rq_pool || req->rq_reqbuf);
-        privacy = RPC_FLVR_SVC(req->rq_flvr.sf_rpc) == SPTLRPC_SVC_PRIV;
+        privacy = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc) == SPTLRPC_SVC_PRIV;
  
          if (!req->rq_clrbuf)
                  goto release_reqbuf;
@@ -1477,9 +1506,9 @@ int gss_alloc_repbuf_intg(struct ptlrpc_sec *sec,
                  txtsize += buflens[1];
  
          if (req->rq_pack_bulk) {
-                buflens[bufcnt] = bulk_sec_desc_size(
-                                                req->rq_flvr.sf_bulk_hash, 0,
-                                                req->rq_bulk_read);
+                buflens[bufcnt] = gss_cli_bulk_payload(req->rq_cli_ctx,
+                                                       &req->rq_flvr,
+                                                       1, req->rq_bulk_read);
                  if (svc == SPTLRPC_SVC_INTG)
                          txtsize += buflens[bufcnt];
                  bufcnt++;
@@ -1513,9 +1542,9 @@ int gss_alloc_repbuf_priv(struct ptlrpc_sec *sec,
          buflens[0] = msgsize;
  
          if (req->rq_pack_bulk)
-                buflens[bufcnt++] = bulk_sec_desc_size(
-                                                req->rq_flvr.sf_bulk_hash, 0,
-                                                req->rq_bulk_read);
+                buflens[bufcnt++] = gss_cli_bulk_payload(req->rq_cli_ctx,
+                                                         &req->rq_flvr,
+                                                         1, req->rq_bulk_read);
          txtsize = lustre_msg_size_v2(bufcnt, buflens);
          txtsize += GSS_MAX_CIPHER_BLOCK;
  
@@ -1535,7 +1564,7 @@ int gss_alloc_repbuf(struct ptlrpc_sec *sec,
                       struct ptlrpc_request *req,
                       int msgsize)
  {
-        int     svc = RPC_FLVR_SVC(req->rq_flvr.sf_rpc);
+        int     svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc);
          ENTRY;
  
          LASSERT(!req->rq_pack_bulk ||
@@ -1771,7 +1800,7 @@ int gss_enlarge_reqbuf(struct ptlrpc_sec *sec,
                         struct ptlrpc_request *req,
                         int segment, int newsize)
  {
-        int     svc = RPC_FLVR_SVC(req->rq_flvr.sf_rpc);
+        int     svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc);
  
          LASSERT(!req->rq_ctx_init && !req->rq_ctx_fini);
  
@@ -2066,8 +2095,10 @@ int gss_svc_verify_request(struct ptlrpc_request *req,
          }
  
          *major = gss_verify_msg(msg, gctx->gsc_mechctx, gw->gw_svc);
-        if (*major != GSS_S_COMPLETE)
+        if (*major != GSS_S_COMPLETE) {
+                CERROR("failed to verify request: %x\n", *major);
                  RETURN(-EACCES);
+        }
  
          if (gctx->gsc_reverse == 0 &&
              gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 1)) {
@@ -2094,10 +2125,10 @@ verified:
                  offset++;
          }
  
-        /* check bulk cksum data */
+        /* check bulk_sec_desc data */
          if (gw->gw_flags & LUSTRE_GSS_PACK_BULK) {
                  if (msg->lm_bufcount < (offset + 1)) {
-                        CERROR("no bulk checksum included\n");
+                        CERROR("missing bulk sec descriptor\n");
                          RETURN(-EINVAL);
                  }
  
@@ -2133,8 +2164,10 @@ int gss_svc_unseal_request(struct ptlrpc_request *req,
  
          *major = gss_unseal_msg(gctx->gsc_mechctx, msg,
                                 &msglen, req->rq_reqdata_len);
-        if (*major != GSS_S_COMPLETE)
+        if (*major != GSS_S_COMPLETE) {
+                CERROR("failed to unwrap request: %x\n", *major);
                  RETURN(-EACCES);
+        }
  
          if (gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 1)) {
                  CERROR("phase 1+: discard replayed req: seq %u\n", gw->gw_seq);
@@ -2405,6 +2438,31 @@ int gss_svc_payload(struct gss_svc_reqctx *grctx, int early,
          return gss_mech_payload(NULL, msgsize, privacy);
  }
  
+/*
+ * Size of the bulk security descriptor segment for a server reply.
+ *
+ * Always at least sizeof(struct ptlrpc_bulk_sec_desc). For a bulk read
+ * (@read non-zero) the size reported by gss_mech_payload() for the flavor's
+ * bulk service is added on top (nothing for the null service). @gctx is
+ * not consulted by the current implementation.
+ */
+static int gss_svc_bulk_payload(struct gss_svc_ctx *gctx,
+                                struct sptlrpc_flavor *flvr,
+                                int read)
+{
+        int     size = sizeof(struct ptlrpc_bulk_sec_desc);
+
+        /* not a bulk read: the bare descriptor is all we need */
+        if (!read)
+                return size;
+
+        switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) {
+        case SPTLRPC_BULK_SVC_NULL:
+                break;
+        case SPTLRPC_BULK_SVC_INTG:
+                size += gss_mech_payload(NULL, 0, 0);
+                break;
+        case SPTLRPC_BULK_SVC_PRIV:
+                size += gss_mech_payload(NULL, 0, 1);
+                break;
+        case SPTLRPC_BULK_SVC_AUTH:
+        default:
+                LBUG();
+        }
+
+        return size;
+}
+
  int gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen)
  {
          struct gss_svc_reqctx       *grctx;
@@ -2422,7 +2480,7 @@ int gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen)
                  RETURN(-EPROTO);
          }
  
-        svc = RPC_FLVR_SVC(req->rq_flvr.sf_rpc);
+        svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc);
          early = (req->rq_packed_final == 0);
  
          grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
@@ -2440,9 +2498,10 @@ int gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen)
                          LASSERT(grctx->src_reqbsd);
  
                          bsd_off = ibufcnt;
-                        ibuflens[ibufcnt++] = bulk_sec_desc_size(
-                                                grctx->src_reqbsd->bsd_hash_alg,
-                                                0, req->rq_bulk_read);
+                        ibuflens[ibufcnt++] = gss_svc_bulk_payload(
+                                                        grctx->src_ctx,
+                                                        &req->rq_flvr,
+                                                        req->rq_bulk_read);
                  }
  
                  txtsize = lustre_msg_size_v2(ibufcnt, ibuflens);
@@ -2465,9 +2524,10 @@ int gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen)
                          LASSERT(grctx->src_reqbsd);
  
                          bsd_off = bufcnt;
-                        buflens[bufcnt] = bulk_sec_desc_size(
-                                                grctx->src_reqbsd->bsd_hash_alg,
-                                                0, req->rq_bulk_read);
+                        buflens[bufcnt] = gss_svc_bulk_payload(
+                                                        grctx->src_ctx,
+                                                        &req->rq_flvr,
+                                                        req->rq_bulk_read);
                          if (svc == SPTLRPC_SVC_INTG)
                                  txtsize += buflens[bufcnt];
                          bufcnt++;
diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c

index 3b22441..65eedd1 100644 (file)
--- a/lustre/ptlrpc/import.c
+++ b/lustre/ptlrpc/import.c
@@ -351,7 +351,7 @@ out:
          sptlrpc_import_flush_all_ctx(imp);
  
          atomic_dec(&imp->imp_inval_count);
-        cfs_waitq_signal(&imp->imp_recovery_waitq);
+        cfs_waitq_broadcast(&imp->imp_recovery_waitq);
  }
  
  /* unset imp_invalid */
@@ -810,14 +810,7 @@ static int ptlrpc_connect_interpret(const struct lu_env *env,
                          IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS);
                  } else {
                          IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
-                }
-
-                spin_lock(&imp->imp_lock);
-                if (imp->imp_invalid) {
-                        spin_unlock(&imp->imp_lock);
                          ptlrpc_activate_import(imp);
-                } else {
-                        spin_unlock(&imp->imp_lock);
                  }
  
                  GOTO(finish, rc = 0);
@@ -1146,7 +1139,7 @@ out:
          imp->imp_last_recon = 0;
          spin_unlock(&imp->imp_lock);
  
-        cfs_waitq_signal(&imp->imp_recovery_waitq);
+        cfs_waitq_broadcast(&imp->imp_recovery_waitq);
          RETURN(rc);
  }
  
@@ -1326,7 +1319,7 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp)
          }
  
          if (imp->imp_state == LUSTRE_IMP_FULL) {
-                cfs_waitq_signal(&imp->imp_recovery_waitq);
+                cfs_waitq_broadcast(&imp->imp_recovery_waitq);
                  ptlrpc_wake_delayed(imp);
          }
  
diff --git a/lustre/ptlrpc/lproc_ptlrpc.c b/lustre/ptlrpc/lproc_ptlrpc.c

index 5c3aaca..7303382 100644 (file)
--- a/lustre/ptlrpc/lproc_ptlrpc.c
+++ b/lustre/ptlrpc/lproc_ptlrpc.c
@@ -117,13 +117,13 @@ struct ll_rpc_opcode {
          { LLOG_CATINFO,                  "llog_catinfo" },
          { LLOG_ORIGIN_HANDLE_PREV_BLOCK, "llog_origin_handle_prev_block" },
          { LLOG_ORIGIN_HANDLE_DESTROY,    "llog_origin_handle_destroy" },
-        { FLD_QUERY,        "fld_query" },
+        { QUOTA_DQACQ,      "quota_acquire" },
+        { QUOTA_DQREL,      "quota_release" },
          { SEQ_QUERY,        "seq_query" },
          { SEC_CTX_INIT,     "sec_ctx_init" },
          { SEC_CTX_INIT_CONT,"sec_ctx_init_cont" },
          { SEC_CTX_FINI,     "sec_ctx_fini" },
-        { QUOTA_DQACQ,      "quota_acquire" },
-        { QUOTA_DQREL,      "quota_release" }
+        { FLD_QUERY,        "fld_query" }
  };
  
  struct ll_eopcode {
diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c

index 11f6641..357e559 100644 (file)
--- a/lustre/ptlrpc/niobuf.c
+++ b/lustre/ptlrpc/niobuf.c
@@ -529,6 +529,9 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
          lustre_msghdr_set_flags(request->rq_reqmsg,
                                  request->rq_import->imp_msghdr_flags);
  
+        if (request->rq_resend)
+                lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT);
+
          rc = sptlrpc_cli_wrap_request(request);
          if (rc)
                  RETURN(rc);
@@ -540,9 +543,6 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
                          RETURN(rc);
          }
  
-        if (request->rq_resend)
-                lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT);
-
          if (!noreply) {
                  LASSERT (request->rq_replen != 0);
                  if (request->rq_repbuf == NULL) {
diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c

index 1f41109..e85951a 100644 (file)
--- a/lustre/ptlrpc/pack_generic.c
+++ b/lustre/ptlrpc/pack_generic.c
@@ -389,8 +389,9 @@ void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, int n, int min_size)
  
          buflen = m->lm_buflens[n];
          if (unlikely(buflen < min_size)) {
-                CERROR("msg %p buffer[%d] size %d too small (required %d)\n",
-                       m, n, buflen, min_size);
+                CERROR("msg %p buffer[%d] size %d too small "
+                       "(required %d, opc=%d)\n",
+                       m, n, buflen, min_size, lustre_msg_get_opc(m));
                  return NULL;
          }
  
@@ -1951,14 +1952,26 @@ void lustre_swab_lov_desc (struct lov_desc *ld)
          /* uuid endian insensitive */
  }
  
-/*begin adding MDT by huanghua@clusterfs.com*/
  void lustre_swab_lmv_desc (struct lmv_desc *ld)
  {
          __swab32s (&ld->ld_tgt_count);
          __swab32s (&ld->ld_active_tgt_count);
+        __swab32s (&ld->ld_default_stripe_count);
+        __swab32s (&ld->ld_pattern);
+        __swab64s (&ld->ld_default_hash_size);
+        __swab32s (&ld->ld_qos_maxage);
          /* uuid endian insensitive */
  }
  
+/*
+ * Byte-swap, in place, the 32-bit mea_magic, mea_count and mea_master
+ * fields of @mea.  No other field of the structure is touched here.
+ */
+void lustre_swab_lmv_stripe_md (struct lmv_stripe_md *mea)
+{
+        __swab32s(&mea->mea_magic);
+        __swab32s(&mea->mea_count);
+        __swab32s(&mea->mea_master);
+        /*
+         * compile-time check that mea_padding is not the first member;
+         * NOTE(review): presumably a reminder to revisit this function if
+         * the padding is ever put to use - confirm.
+         */
+        CLASSERT(offsetof(typeof(*mea), mea_padding) != 0);
+}
+
+
  static void print_lum (struct lov_user_md *lum)
  {
          CDEBUG(D_OTHER, "lov_user_md %p:\n", lum);
@@ -2014,6 +2027,19 @@ void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum)
          EXIT;
  }
  
+/*
+ * Byte-swap, in place, the fixed header fields of @lmm: lmm_magic,
+ * lmm_pattern, lmm_stripe_size and lmm_stripe_count as 32-bit values,
+ * lmm_object_id and lmm_object_gr as 64-bit values.  Nothing beyond these
+ * six fields is swabbed by this function.
+ */
+void lustre_swab_lov_mds_md(struct lov_mds_md *lmm)
+{
+        ENTRY;
+        CDEBUG(D_IOCTL, "swabbing lov_mds_md\n");
+        __swab32s(&lmm->lmm_magic);
+        __swab32s(&lmm->lmm_pattern);
+        __swab64s(&lmm->lmm_object_id);
+        __swab64s(&lmm->lmm_object_gr);
+        __swab32s(&lmm->lmm_stripe_size);
+        __swab32s(&lmm->lmm_stripe_count);
+        EXIT;
+}
+
  void lustre_swab_lov_user_md_join(struct lov_user_md_join *lumj)
  {
          ENTRY;
diff --git a/lustre/ptlrpc/pers.c b/lustre/ptlrpc/pers.c

index d53d42c..1b5f1ed 100644 (file)
--- a/lustre/ptlrpc/pers.c
+++ b/lustre/ptlrpc/pers.c
@@ -57,8 +57,11 @@ void ptlrpc_fill_bulk_md (lnet_md_t *md, struct ptlrpc_bulk_desc *desc)
          LASSERT (!(md->options & (LNET_MD_IOVEC | LNET_MD_KIOV | LNET_MD_PHYS)));
  
          md->options |= LNET_MD_KIOV;
-        md->start = &desc->bd_iov[0];
          md->length = desc->bd_iov_count;
+        if (desc->bd_enc_iov)
+                md->start = desc->bd_enc_iov;
+        else
+                md->start = desc->bd_iov;
  }
  
  void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, cfs_page_t *page,
diff --git a/lustre/ptlrpc/ptlrpc_module.c b/lustre/ptlrpc/ptlrpc_module.c

index e4f0a0e..c097f65 100644 (file)
--- a/lustre/ptlrpc/ptlrpc_module.c
+++ b/lustre/ptlrpc/ptlrpc_module.c
@@ -264,6 +264,7 @@ EXPORT_SYMBOL(lustre_swab_lov_user_md_v1);
  EXPORT_SYMBOL(lustre_swab_lov_user_md_v3);
  EXPORT_SYMBOL(lustre_swab_lov_user_md_objects);
  EXPORT_SYMBOL(lustre_swab_lov_user_md_join);
+EXPORT_SYMBOL(lustre_swab_lov_mds_md);
  EXPORT_SYMBOL(lustre_swab_ldlm_res_id);
  EXPORT_SYMBOL(lustre_swab_ldlm_policy_data);
  EXPORT_SYMBOL(lustre_swab_ldlm_intent);
diff --git a/lustre/ptlrpc/recov_thread.c b/lustre/ptlrpc/recov_thread.c

index dc4b13b..e90142b 100644 (file)
--- a/lustre/ptlrpc/recov_thread.c
+++ b/lustre/ptlrpc/recov_thread.c
@@ -591,6 +591,7 @@ int llog_obd_repl_cancel(struct llog_ctxt *ctxt,
  
          mutex_down(&ctxt->loc_sem);
          lcm = ctxt->loc_lcm;
+        CDEBUG(D_INFO, "cancel on lsm %p\n", lcm);
  
          /*
           * Let's check if we have all structures alive. We also check for
diff --git a/lustre/ptlrpc/sec.c b/lustre/ptlrpc/sec.c

index d268380..69e618f 100644 (file)
--- a/lustre/ptlrpc/sec.c
+++ b/lustre/ptlrpc/sec.c
@@ -118,12 +118,13 @@ int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy)
  EXPORT_SYMBOL(sptlrpc_unregister_policy);
  
  static
-struct ptlrpc_sec_policy * sptlrpc_rpcflavor2policy(__u16 flavor)
+struct ptlrpc_sec_policy * sptlrpc_wireflavor2policy(__u32 flavor)
  {
          static DECLARE_MUTEX(load_mutex);
          static atomic_t           loaded = ATOMIC_INIT(0);
          struct ptlrpc_sec_policy *policy;
-        __u16                     number = RPC_FLVR_POLICY(flavor), flag = 0;
+        __u16                     number = SPTLRPC_FLVR_POLICY(flavor);
+        __u16                     flag = 0;
  
          if (number >= SPTLRPC_POLICY_MAX)
                  return NULL;
@@ -157,7 +158,7 @@ struct ptlrpc_sec_policy * sptlrpc_rpcflavor2policy(__u16 flavor)
          return policy;
  }
  
-__u16 sptlrpc_name2rpcflavor(const char *name)
+__u32 sptlrpc_name2flavor_base(const char *name)
  {
          if (!strcmp(name, "null"))
                  return SPTLRPC_FLVR_NULL;
@@ -174,51 +175,86 @@ __u16 sptlrpc_name2rpcflavor(const char *name)
  
          return SPTLRPC_FLVR_INVALID;
  }
-EXPORT_SYMBOL(sptlrpc_name2rpcflavor);
+EXPORT_SYMBOL(sptlrpc_name2flavor_base);
  
-const char *sptlrpc_rpcflavor2name(__u16 flavor)
+const char *sptlrpc_flavor2name_base(__u32 flvr)
  {
-        switch (flavor) {
-        case SPTLRPC_FLVR_NULL:
+        __u32   base = SPTLRPC_FLVR_BASE(flvr);
+
+        if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_NULL))
                  return "null";
-        case SPTLRPC_FLVR_PLAIN:
+        else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN))
                  return "plain";
-        case SPTLRPC_FLVR_KRB5N:
+        else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5N))
                  return "krb5n";
-        case SPTLRPC_FLVR_KRB5A:
+        else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5A))
                  return "krb5a";
-        case SPTLRPC_FLVR_KRB5I:
+        else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5I))
                  return "krb5i";
-        case SPTLRPC_FLVR_KRB5P:
+        else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5P))
                  return "krb5p";
-        default:
-                CERROR("invalid rpc flavor 0x%x(p%u,s%u,v%u)\n", flavor,
-                       RPC_FLVR_POLICY(flavor), RPC_FLVR_MECH(flavor),
-                       RPC_FLVR_SVC(flavor));
-        }
-        return "unknown";
+
+        CERROR("invalid wire flavor 0x%x\n", flvr);
+        return "invalid";
  }
-EXPORT_SYMBOL(sptlrpc_rpcflavor2name);
+EXPORT_SYMBOL(sptlrpc_flavor2name_base);
  
-int sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize)
+char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf,
+                               char *buf, int bufsize)
  {
-        char           *bulk;
-
-        if (sf->sf_bulk_ciph != BULK_CIPH_ALG_NULL)
-                bulk = "bulkp";
-        else if (sf->sf_bulk_hash != BULK_HASH_ALG_NULL)
-                bulk = "bulki";
+        if (SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN)
+                snprintf(buf, bufsize, "hash:%s",
+                         sptlrpc_get_hash_name(sf->u_bulk.hash.hash_alg));
          else
-                bulk = "bulkn";
+                snprintf(buf, bufsize, "%s",
+                         sptlrpc_flavor2name_base(sf->sf_rpc));
  
-        snprintf(buf, bufsize, "%s-%s:%s/%s",
-                 sptlrpc_rpcflavor2name(sf->sf_rpc), bulk,
-                 sptlrpc_get_hash_name(sf->sf_bulk_hash),
-                 sptlrpc_get_ciph_name(sf->sf_bulk_ciph));
-        return 0;
+        buf[bufsize - 1] = '\0';
+        return buf;
+}
+EXPORT_SYMBOL(sptlrpc_flavor2name_bulk);
+
+/*
+ * Format the name of flavor @sf into @buf (at most @bufsize bytes including
+ * the terminating NUL): the base flavor name, followed for the plain policy
+ * by "-" and the bulk specification produced by sptlrpc_flavor2name_bulk().
+ * Output that does not fit is truncated.  Returns @buf.
+ */
+char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize)
+{
+        snprintf(buf, bufsize, "%s", sptlrpc_flavor2name_base(sf->sf_rpc));
+
+        /*
+         * currently we don't support customized bulk specification for
+         * flavors other than plain
+         */
+        if (SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN) {
+                char bspec[16];
+
+                bspec[0] = '-';
+                sptlrpc_flavor2name_bulk(sf, &bspec[1], sizeof(bspec) - 1);
+                /*
+                 * strncat() limits the bytes appended, not the size of the
+                 * destination, so pass the room still left in @buf.
+                 */
+                strncat(buf, bspec, bufsize - strlen(buf) - 1);
+        }
+
+        buf[bufsize - 1] = '\0';
+        return buf;
  }
  EXPORT_SYMBOL(sptlrpc_flavor2name);
  
+/*
+ * Render the PTLRPC_SEC_FL_* bits set in @flags as a list of comma-terminated
+ * names ("reverse,", "rootonly,", "udesc,", "bulk,"), or "-," when none of
+ * them is set.  @buf must hold at least one byte; output that does not fit
+ * in @bufsize bytes (including the terminating NUL) is truncated.
+ * Returns @buf.
+ */
+char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize)
+{
+        buf[0] = '\0';
+
+        /*
+         * strncat() limits the bytes appended, not the size of the
+         * destination, so each call passes the room still left in @buf.
+         */
+        if (flags & PTLRPC_SEC_FL_REVERSE)
+                strncat(buf, "reverse,", bufsize - strlen(buf) - 1);
+        if (flags & PTLRPC_SEC_FL_ROOTONLY)
+                strncat(buf, "rootonly,", bufsize - strlen(buf) - 1);
+        if (flags & PTLRPC_SEC_FL_UDESC)
+                strncat(buf, "udesc,", bufsize - strlen(buf) - 1);
+        if (flags & PTLRPC_SEC_FL_BULK)
+                strncat(buf, "bulk,", bufsize - strlen(buf) - 1);
+        if (buf[0] == '\0')
+                strncat(buf, "-,", bufsize - strlen(buf) - 1);
+
+        buf[bufsize - 1] = '\0';
+        return buf;
+}
+EXPORT_SYMBOL(sptlrpc_secflags2str);
+
  /**************************************************
   * client context APIs                            *
   **************************************************/
@@ -752,9 +788,11 @@ void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode)
          /* special security flags accoding to opcode */
          switch (opcode) {
          case OST_READ:
+        case MDS_READPAGE:
                  req->rq_bulk_read = 1;
                  break;
          case OST_WRITE:
+        case MDS_WRITEPAGE:
                  req->rq_bulk_write = 1;
                  break;
          case SEC_CTX_INIT:
@@ -783,9 +821,9 @@ void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode)
          /* force SVC_NULL for context initiation rpc, SVC_INTG for context
           * destruction rpc */
          if (unlikely(req->rq_ctx_init))
-                rpc_flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_NULL);
+                flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_NULL);
          else if (unlikely(req->rq_ctx_fini))
-                rpc_flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_INTG);
+                flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_INTG);
  
          /* user descriptor flag, null security can't do it anyway */
          if ((sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_UDESC) &&
@@ -794,14 +832,13 @@ void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode)
  
          /* bulk security flag */
          if ((req->rq_bulk_read || req->rq_bulk_write) &&
-            (req->rq_flvr.sf_bulk_ciph != BULK_CIPH_ALG_NULL ||
-             req->rq_flvr.sf_bulk_hash != BULK_HASH_ALG_NULL))
+            sptlrpc_flavor_has_bulk(&req->rq_flvr))
                  req->rq_pack_bulk = 1;
  }
  
  void sptlrpc_request_out_callback(struct ptlrpc_request *req)
  {
-        if (RPC_FLVR_SVC(req->rq_flvr.sf_rpc) != SPTLRPC_SVC_PRIV)
+        if (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc) != SPTLRPC_SVC_PRIV)
                  return;
  
          LASSERT(req->rq_clrbuf);
@@ -885,7 +922,7 @@ int sptlrpc_cli_wrap_request(struct ptlrpc_request *req)
                          RETURN(rc);
          }
  
-        switch (RPC_FLVR_SVC(req->rq_flvr.sf_rpc)) {
+        switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) {
          case SPTLRPC_SVC_NULL:
          case SPTLRPC_SVC_AUTH:
          case SPTLRPC_SVC_INTG:
@@ -913,7 +950,7 @@ static int do_cli_unwrap_reply(struct ptlrpc_request *req)
  {
          struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
          int                    rc;
-        __u16                  rpc_flvr;
+        __u32                  flvr;
          ENTRY;
  
          LASSERT(ctx);
@@ -929,26 +966,26 @@ static int do_cli_unwrap_reply(struct ptlrpc_request *req)
          }
  
          /* v2 message, check request/reply policy match */
-        rpc_flvr = WIRE_FLVR_RPC(req->rq_repdata->lm_secflvr);
+        flvr = WIRE_FLVR(req->rq_repdata->lm_secflvr);
  
          if (req->rq_repdata->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED)
-                __swab16s(&rpc_flvr);
+                __swab32s(&flvr);
  
-        if (RPC_FLVR_POLICY(rpc_flvr) !=
-            RPC_FLVR_POLICY(req->rq_flvr.sf_rpc)) {
+        if (SPTLRPC_FLVR_POLICY(flvr) !=
+            SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc)) {
                  CERROR("request policy was %u while reply with %u\n",
-                       RPC_FLVR_POLICY(req->rq_flvr.sf_rpc),
-                       RPC_FLVR_POLICY(rpc_flvr));
+                       SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc),
+                       SPTLRPC_FLVR_POLICY(flvr));
                  RETURN(-EPROTO);
          }
  
          /* do nothing if it's null policy; otherwise unpack the
           * wrapper message */
-        if (RPC_FLVR_POLICY(rpc_flvr) != SPTLRPC_POLICY_NULL &&
+        if (SPTLRPC_FLVR_POLICY(flvr) != SPTLRPC_POLICY_NULL &&
              lustre_unpack_msg(req->rq_repdata, req->rq_repdata_len))
                  RETURN(-EPROTO);
  
-        switch (RPC_FLVR_SVC(req->rq_flvr.sf_rpc)) {
+        switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) {
          case SPTLRPC_SVC_NULL:
          case SPTLRPC_SVC_AUTH:
          case SPTLRPC_SVC_INTG:
@@ -1188,7 +1225,7 @@ void sptlrpc_sec_put(struct ptlrpc_sec *sec)
  EXPORT_SYMBOL(sptlrpc_sec_put);
  
  /*
- * it's policy module responsible for taking refrence of import
+ * policy module is responsible for taking a reference on the import
   */
  static
  struct ptlrpc_sec * sptlrpc_sec_create(struct obd_import *imp,
@@ -1198,6 +1235,7 @@ struct ptlrpc_sec * sptlrpc_sec_create(struct obd_import *imp,
  {
          struct ptlrpc_sec_policy *policy;
          struct ptlrpc_sec        *sec;
+        char                      str[32];
          ENTRY;
  
          if (svc_ctx) {
@@ -1206,7 +1244,7 @@ struct ptlrpc_sec * sptlrpc_sec_create(struct obd_import *imp,
                  CDEBUG(D_SEC, "%s %s: reverse sec using flavor %s\n",
                         imp->imp_obd->obd_type->typ_name,
                         imp->imp_obd->obd_name,
-                       sptlrpc_rpcflavor2name(sf->sf_rpc));
+                       sptlrpc_flavor2name(sf, str, sizeof(str)));
  
                  policy = sptlrpc_policy_get(svc_ctx->sc_policy);
                  sf->sf_flags |= PTLRPC_SEC_FL_REVERSE | PTLRPC_SEC_FL_ROOTONLY;
@@ -1216,9 +1254,9 @@ struct ptlrpc_sec * sptlrpc_sec_create(struct obd_import *imp,
                  CDEBUG(D_SEC, "%s %s: select security flavor %s\n",
                         imp->imp_obd->obd_type->typ_name,
                         imp->imp_obd->obd_name,
-                       sptlrpc_rpcflavor2name(sf->sf_rpc));
+                       sptlrpc_flavor2name(sf, str, sizeof(str)));
  
-                policy = sptlrpc_rpcflavor2policy(sf->sf_rpc);
+                policy = sptlrpc_wireflavor2policy(sf->sf_rpc);
                  if (!policy) {
                          CERROR("invalid flavor 0x%x\n", sf->sf_rpc);
                          RETURN(NULL);
@@ -1272,52 +1310,49 @@ static void sptlrpc_import_sec_install(struct obd_import *imp,
          }
  }
  
+/*
+ * Return non-zero when @sf1 and @sf2 are byte-for-byte identical.
+ * The comparison is a memcmp() over the whole structure.
+ * NOTE(review): if struct sptlrpc_flavor contains padding or partially used
+ * union members, both operands must have been zero-filled first - confirm.
+ */
+static inline
+int flavor_equal(struct sptlrpc_flavor *sf1, struct sptlrpc_flavor *sf2)
+{
+        return (memcmp(sf1, sf2, sizeof(*sf1)) == 0);
+}
+
+/*
+ * Copy the whole flavor @src into @dst by structure assignment.
+ */
+static inline
+void flavor_copy(struct sptlrpc_flavor *dst, struct sptlrpc_flavor *src)
+{
+        *dst = *src;
+}
+
  static void sptlrpc_import_sec_adapt_inplace(struct obd_import *imp,
                                               struct ptlrpc_sec *sec,
                                               struct sptlrpc_flavor *sf)
  {
-        if (sf->sf_bulk_ciph != sec->ps_flvr.sf_bulk_ciph ||
-            sf->sf_bulk_hash != sec->ps_flvr.sf_bulk_hash) {
-                CWARN("imp %p (%s->%s): changing bulk flavor %s/%s -> %s/%s\n",
-                      imp, imp->imp_obd->obd_name,
-                      obd_uuid2str(&imp->imp_connection->c_remote_uuid),
-                      sptlrpc_get_ciph_name(sec->ps_flvr.sf_bulk_ciph),
-                      sptlrpc_get_hash_name(sec->ps_flvr.sf_bulk_hash),
-                      sptlrpc_get_ciph_name(sf->sf_bulk_ciph),
-                      sptlrpc_get_hash_name(sf->sf_bulk_hash));
-
-                spin_lock(&sec->ps_lock);
-                sec->ps_flvr.sf_bulk_ciph = sf->sf_bulk_ciph;
-                sec->ps_flvr.sf_bulk_hash = sf->sf_bulk_hash;
-                spin_unlock(&sec->ps_lock);
-        }
+        char    str1[32], str2[32];
  
-        if (!equi(sf->sf_flags & PTLRPC_SEC_FL_UDESC,
-                  sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_UDESC)) {
-                CWARN("imp %p (%s->%s): %s shipping user descriptor\n",
-                      imp, imp->imp_obd->obd_name,
-                      obd_uuid2str(&imp->imp_connection->c_remote_uuid),
-                      (sf->sf_flags & PTLRPC_SEC_FL_UDESC) ? "start" : "stop");
+        if (sec->ps_flvr.sf_flags != sf->sf_flags)
+                CWARN("changing sec flags: %s -> %s\n",
+                      sptlrpc_secflags2str(sec->ps_flvr.sf_flags,
+                                           str1, sizeof(str1)),
+                      sptlrpc_secflags2str(sf->sf_flags,
+                                           str2, sizeof(str2)));
  
-                spin_lock(&sec->ps_lock);
-                sec->ps_flvr.sf_flags &= ~PTLRPC_SEC_FL_UDESC;
-                sec->ps_flvr.sf_flags |= sf->sf_flags & PTLRPC_SEC_FL_UDESC;
-                spin_unlock(&sec->ps_lock);
-        }
+        spin_lock(&sec->ps_lock);
+        flavor_copy(&sec->ps_flvr, sf);
+        spin_unlock(&sec->ps_lock);
  }
  
  /*
- * for normal import, @svc_ctx should be NULL and @rpc_flavor is ignored;
- * for reverse import, @svc_ctx and @rpc_flavor is from incoming request.
+ * for a normal import, @svc_ctx should be NULL and @flvr is ignored;
+ * for a reverse import, @svc_ctx and @flvr come from the incoming request.
   */
  int sptlrpc_import_sec_adapt(struct obd_import *imp,
                               struct ptlrpc_svc_ctx *svc_ctx,
-                             __u16 rpc_flavor)
+                             struct sptlrpc_flavor *flvr)
  {
          struct ptlrpc_connection   *conn;
          struct sptlrpc_flavor       sf;
          struct ptlrpc_sec          *sec, *newsec;
          enum lustre_sec_part        sp;
+        char                        str[24];
          int                         rc;
  
          might_sleep();
@@ -1344,57 +1379,45 @@ int sptlrpc_import_sec_adapt(struct obd_import *imp,
                  sp = imp->imp_obd->u.cli.cl_sp_me;
          } else {
                  /* reverse import, determine flavor from incoming reqeust */
-                sf.sf_rpc = rpc_flavor;
-                sf.sf_bulk_ciph = BULK_CIPH_ALG_NULL;
-                sf.sf_bulk_hash = BULK_HASH_ALG_NULL;
-                sf.sf_flags = PTLRPC_SEC_FL_REVERSE | PTLRPC_SEC_FL_ROOTONLY;
+                sf = *flvr;
+
+                if (sf.sf_rpc != SPTLRPC_FLVR_NULL)
+                        sf.sf_flags = PTLRPC_SEC_FL_REVERSE |
+                                      PTLRPC_SEC_FL_ROOTONLY;
  
                  sp = sptlrpc_target_sec_part(imp->imp_obd);
          }
  
          sec = sptlrpc_import_sec_ref(imp);
          if (sec) {
-                if (svc_ctx == NULL) {
-                        /* normal import, only check rpc flavor, if just bulk
-                         * flavor or flags changed, we can handle it on the fly
-                         * without switching sec. */
-                        if (sf.sf_rpc == sec->ps_flvr.sf_rpc) {
-                                sptlrpc_import_sec_adapt_inplace(imp, sec, &sf);
-
-                                rc = 0;
-                                goto out;
-                        }
-                } else {
-                        /* reverse import, do not compare bulk flavor */
-                        if (sf.sf_rpc == sec->ps_flvr.sf_rpc) {
-                                rc = 0;
-                                goto out;
-                        }
-                }
+                char    str2[24];
+
+                if (flavor_equal(&sf, &sec->ps_flvr))
+                        goto out;
  
                  CWARN("%simport %p (%s%s%s): changing flavor "
-                      "(%s, %s/%s) -> (%s, %s/%s)\n",
-                      svc_ctx ? "reverse " : "",
+                      "%s -> %s\n", svc_ctx ? "reverse " : "",
                        imp, imp->imp_obd->obd_name,
                        svc_ctx == NULL ? "->" : "<-",
                        obd_uuid2str(&conn->c_remote_uuid),
-                      sptlrpc_rpcflavor2name(sec->ps_flvr.sf_rpc),
-                      sptlrpc_get_hash_name(sec->ps_flvr.sf_bulk_hash),
-                      sptlrpc_get_ciph_name(sec->ps_flvr.sf_bulk_ciph),
-                      sptlrpc_rpcflavor2name(sf.sf_rpc),
-                      sptlrpc_get_hash_name(sf.sf_bulk_hash),
-                      sptlrpc_get_ciph_name(sf.sf_bulk_ciph));
+                      sptlrpc_flavor2name(&sec->ps_flvr, str, sizeof(str)),
+                      sptlrpc_flavor2name(&sf, str2, sizeof(str2)));
+
+                if (SPTLRPC_FLVR_POLICY(sf.sf_rpc) ==
+                    SPTLRPC_FLVR_POLICY(sec->ps_flvr.sf_rpc) &&
+                    SPTLRPC_FLVR_MECH(sf.sf_rpc) ==
+                    SPTLRPC_FLVR_MECH(sec->ps_flvr.sf_rpc)) {
+                        sptlrpc_import_sec_adapt_inplace(imp, sec, &sf);
+                        goto out;
+                }
          } else {
-                CWARN("%simport %p (%s%s%s) netid %x: "
-                      "select initial flavor (%s, %s/%s)\n",
+                CWARN("%simport %p (%s%s%s) netid %x: select flavor %s\n",
                        svc_ctx == NULL ? "" : "reverse ",
                        imp, imp->imp_obd->obd_name,
                        svc_ctx == NULL ? "->" : "<-",
                        obd_uuid2str(&conn->c_remote_uuid),
                        LNET_NIDNET(conn->c_self),
-                      sptlrpc_rpcflavor2name(sf.sf_rpc),
-                      sptlrpc_get_hash_name(sf.sf_bulk_hash),
-                      sptlrpc_get_ciph_name(sf.sf_bulk_ciph));
+                      sptlrpc_flavor2name(&sf, str, sizeof(str)));
          }
  
          mutex_down(&imp->imp_sec_mutex);
@@ -1659,8 +1682,9 @@ static int flavor_allowed(struct sptlrpc_flavor *exp,
                  return 1;
  
          if ((req->rq_ctx_init || req->rq_ctx_fini) &&
-            RPC_FLVR_POLICY(exp->sf_rpc) == RPC_FLVR_POLICY(flvr->sf_rpc) &&
-            RPC_FLVR_MECH(exp->sf_rpc) == RPC_FLVR_MECH(flvr->sf_rpc))
+            SPTLRPC_FLVR_POLICY(exp->sf_rpc) ==
+            SPTLRPC_FLVR_POLICY(flvr->sf_rpc) &&
+            SPTLRPC_FLVR_MECH(exp->sf_rpc) == SPTLRPC_FLVR_MECH(flvr->sf_rpc))
                  return 1;
  
          return 0;
@@ -1725,7 +1749,7 @@ int sptlrpc_target_export_check(struct obd_export *exp,
                  spin_unlock(&exp->exp_lock);
  
                  return sptlrpc_import_sec_adapt(exp->exp_imp_reverse,
-                                                req->rq_svc_ctx, flavor.sf_rpc);
+                                                req->rq_svc_ctx, &flavor);
          }
  
          /* if it equals to the current flavor, we accept it, but need to
@@ -1759,7 +1783,7 @@ int sptlrpc_target_export_check(struct obd_export *exp,
  
                          return sptlrpc_import_sec_adapt(exp->exp_imp_reverse,
                                                          req->rq_svc_ctx,
-                                                        flavor.sf_rpc);
+                                                        &flavor);
                  } else {
                          CDEBUG(D_SEC, "exp %p (%x|%x|%x): is current flavor, "
                                 "install rvs ctx\n", exp, exp->exp_flvr.sf_rpc,
@@ -1866,7 +1890,7 @@ void sptlrpc_target_update_exp_flavor(struct obd_device *obd,
                                               exp->exp_connection->c_peer.nid,
                                               &new_flvr);
                  if (exp->exp_flvr_changed ||
-                    memcmp(&new_flvr, &exp->exp_flvr, sizeof(new_flvr))) {
+                    !flavor_equal(&new_flvr, &exp->exp_flvr)) {
                          exp->exp_flvr_old[1] = new_flvr;
                          exp->exp_flvr_expire[1] = 0;
                          exp->exp_flvr_changed = 1;
@@ -1931,13 +1955,14 @@ static int sptlrpc_svc_check_from(struct ptlrpc_request *req, int svc_rc)
  int sptlrpc_svc_unwrap_request(struct ptlrpc_request *req)
  {
          struct ptlrpc_sec_policy *policy;
-        struct lustre_msg *msg = req->rq_reqbuf;
-        int rc;
+        struct lustre_msg        *msg = req->rq_reqbuf;
+        int                       rc;
          ENTRY;
  
          LASSERT(msg);
          LASSERT(req->rq_reqmsg == NULL);
          LASSERT(req->rq_repmsg == NULL);
+        LASSERT(req->rq_svc_ctx == NULL);
  
          req->rq_sp_from = LUSTRE_SP_ANY;
          req->rq_auth_uid = INVALID_UID;
@@ -1949,19 +1974,28 @@ int sptlrpc_svc_unwrap_request(struct ptlrpc_request *req)
          }
  
          /*
-         * v2 message.
+         * only expect v2 message.
           */
-        if (msg->lm_magic == LUSTRE_MSG_MAGIC_V2)
-                req->rq_flvr.sf_rpc = WIRE_FLVR_RPC(msg->lm_secflvr);
-        else
-                req->rq_flvr.sf_rpc = WIRE_FLVR_RPC(__swab32(msg->lm_secflvr));
+        switch (msg->lm_magic) {
+        case LUSTRE_MSG_MAGIC_V2:
+                req->rq_flvr.sf_rpc = WIRE_FLVR(msg->lm_secflvr);
+                break;
+        case LUSTRE_MSG_MAGIC_V2_SWABBED:
+                req->rq_flvr.sf_rpc = WIRE_FLVR(__swab32(msg->lm_secflvr));
+                break;
+        default:
+                CERROR("invalid magic %x\n", msg->lm_magic);
+                RETURN(SECSVC_DROP);
+        }
  
          /* unpack the wrapper message if the policy is not null */
-        if ((RPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) &&
-             lustre_unpack_msg(msg, req->rq_reqdata_len))
+        if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL &&
+            lustre_unpack_msg(msg, req->rq_reqdata_len)) {
+                CERROR("invalid wrapper msg format\n");
                  RETURN(SECSVC_DROP);
+        }
  
-        policy = sptlrpc_rpcflavor2policy(req->rq_flvr.sf_rpc);
+        policy = sptlrpc_wireflavor2policy(req->rq_flvr.sf_rpc);
          if (!policy) {
                  CERROR("unsupported rpc flavor %x\n", req->rq_flvr.sf_rpc);
                  RETURN(SECSVC_DROP);
@@ -1971,22 +2005,11 @@ int sptlrpc_svc_unwrap_request(struct ptlrpc_request *req)
          rc = policy->sp_sops->accept(req);
  
          LASSERT(req->rq_reqmsg || rc != SECSVC_OK);
+        LASSERT(req->rq_svc_ctx || rc == SECSVC_DROP);
          sptlrpc_policy_put(policy);
  
          /* sanity check for the request source */
          rc = sptlrpc_svc_check_from(req, rc);
-
-        /* FIXME move to proper place */
-        if (rc == SECSVC_OK) {
-                __u32 opc = lustre_msg_get_opc(req->rq_reqmsg);
-
-                if (opc == OST_WRITE)
-                        req->rq_bulk_write = 1;
-                else if (opc == OST_READ)
-                        req->rq_bulk_read = 1;
-        }
-
-        LASSERT(req->rq_svc_ctx || rc == SECSVC_DROP);
          RETURN(rc);
  }
  
@@ -2111,11 +2134,11 @@ int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req,
  {
          struct ptlrpc_cli_ctx *ctx;
  
+        LASSERT(req->rq_bulk_read || req->rq_bulk_write);
+
          if (!req->rq_pack_bulk)
                  return 0;
  
-        LASSERT(req->rq_bulk_read || req->rq_bulk_write);
-
          ctx = req->rq_cli_ctx;
          if (ctx->cc_ops->wrap_bulk)
                  return ctx->cc_ops->wrap_bulk(ctx, req, desc);
@@ -2123,79 +2146,61 @@ int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req,
  }
  EXPORT_SYMBOL(sptlrpc_cli_wrap_bulk);
  
-static
-void pga_to_bulk_desc(int nob, obd_count pg_count, struct brw_page **pga,
-                      struct ptlrpc_bulk_desc *desc)
-{
-        int i;
-
-        LASSERT(pga);
-        LASSERT(*pga);
-
-        for (i = 0; i < pg_count && nob > 0; i++) {
-#ifdef __KERNEL__
-                desc->bd_iov[i].kiov_page = pga[i]->pg;
-                desc->bd_iov[i].kiov_len = pga[i]->count > nob ?
-                                           nob : pga[i]->count;
-                desc->bd_iov[i].kiov_offset = pga[i]->off & ~CFS_PAGE_MASK;
-#else
-                /* FIXME currently liblustre doesn't support bulk encryption.
-                 * if we do, check again following may not be right. */
-                LASSERTF(0, "Bulk encryption not implemented for liblustre\n");
-                desc->bd_iov[i].iov_base = pga[i]->pg->addr;
-                desc->bd_iov[i].iov_len = pga[i]->count > nob ?
-                                           nob : pga[i]->count;
-#endif
-
-                desc->bd_iov_count++;
-                nob -= pga[i]->count;
-        }
-}
-
+/*
+ * return nob of actual plain text size received, or error code.
+ */
  int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req,
-                                 int nob, obd_count pg_count,
-                                 struct brw_page **pga)
+                                 struct ptlrpc_bulk_desc *desc,
+                                 int nob)
  {
-        struct ptlrpc_bulk_desc *desc;
-        struct ptlrpc_cli_ctx *ctx;
-        int rc = 0;
-
-        if (!req->rq_pack_bulk)
-                return 0;
+        struct ptlrpc_cli_ctx  *ctx;
+        int                     rc;
  
          LASSERT(req->rq_bulk_read && !req->rq_bulk_write);
  
-        OBD_ALLOC(desc, offsetof(struct ptlrpc_bulk_desc, bd_iov[pg_count]));
-        if (desc == NULL) {
-                CERROR("out of memory, can't verify bulk read data\n");
-                return -ENOMEM;
-        }
-
-        pga_to_bulk_desc(nob, pg_count, pga, desc);
+        if (!req->rq_pack_bulk)
+                return desc->bd_nob_transferred;
  
          ctx = req->rq_cli_ctx;
-        if (ctx->cc_ops->unwrap_bulk)
+        if (ctx->cc_ops->unwrap_bulk) {
                  rc = ctx->cc_ops->unwrap_bulk(ctx, req, desc);
-
-        OBD_FREE(desc, offsetof(struct ptlrpc_bulk_desc, bd_iov[pg_count]));
-
-        return rc;
+                if (rc < 0)
+                        return rc;
+        }
+        return desc->bd_nob_transferred;
  }
  EXPORT_SYMBOL(sptlrpc_cli_unwrap_bulk_read);
  
+/*
+ * Client side post-processing of a bulk write described by @desc.
+ *
+ * The request must be flagged as a bulk write and not as a bulk read.  When
+ * the request does not pack bulk security data nothing is checked.
+ * Otherwise the context's unwrap_bulk() method, if any, is called and the
+ * transferred byte count is compared with the expected one.
+ *
+ * return 0 for success or a negative error code: the error returned by
+ * unwrap_bulk(), or -EPROTO when bd_nob and bd_nob_transferred differ.
+ */
  int sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req,
                                    struct ptlrpc_bulk_desc *desc)
  {
-        struct ptlrpc_cli_ctx *ctx;
+        struct ptlrpc_cli_ctx  *ctx;
+        int                     rc;
+
+        LASSERT(!req->rq_bulk_read && req->rq_bulk_write);
  
          if (!req->rq_pack_bulk)
                  return 0;
  
-        LASSERT(!req->rq_bulk_read && req->rq_bulk_write);
-
          ctx = req->rq_cli_ctx;
-        if (ctx->cc_ops->unwrap_bulk)
-                return ctx->cc_ops->unwrap_bulk(ctx, req, desc);
+        if (ctx->cc_ops->unwrap_bulk) {
+                rc = ctx->cc_ops->unwrap_bulk(ctx, req, desc);
+                if (rc < 0)
+                        return rc;
+        }
+
+        /*
+         * if everything went right, nob should equal nob_transferred.
+         * in privacy mode nob_transferred needs to be adjusted first
+         * (NOTE(review): presumably by unwrap_bulk() above - confirm).
+         */
+        if (desc->bd_nob != desc->bd_nob_transferred) {
+                CERROR("nob %d doesn't match transferred nob %d\n",
+                       desc->bd_nob, desc->bd_nob_transferred);
+                return -EPROTO;
+        }
  
          return 0;
  }
@@ -2206,11 +2211,11 @@ int sptlrpc_svc_wrap_bulk(struct ptlrpc_request *req,
  {
          struct ptlrpc_svc_ctx *ctx;
  
+        LASSERT(req->rq_bulk_read);
+
          if (!req->rq_pack_bulk)
                  return 0;
  
-        LASSERT(req->rq_bulk_read || req->rq_bulk_write);
-
          ctx = req->rq_svc_ctx;
          if (ctx->sc_policy->sp_sops->wrap_bulk)
                  return ctx->sc_policy->sp_sops->wrap_bulk(req, desc);
@@ -2223,20 +2228,50 @@ int sptlrpc_svc_unwrap_bulk(struct ptlrpc_request *req,
                              struct ptlrpc_bulk_desc *desc)
  {
          struct ptlrpc_svc_ctx *ctx;
+        int                    rc;
+
+        LASSERT(req->rq_bulk_write);
+
+        if (desc->bd_nob_transferred != desc->bd_nob &&
+            SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc) !=
+            SPTLRPC_BULK_SVC_PRIV) {
+                DEBUG_REQ(D_ERROR, req, "truncated bulk GET %d(%d)",
+                          desc->bd_nob_transferred, desc->bd_nob);
+                return -ETIMEDOUT;
+        }
  
          if (!req->rq_pack_bulk)
                  return 0;
  
-        LASSERT(req->rq_bulk_read || req->rq_bulk_write);
-
          ctx = req->rq_svc_ctx;
-        if (ctx->sc_policy->sp_sops->unwrap_bulk);
-                return ctx->sc_policy->sp_sops->unwrap_bulk(req, desc);
+        if (ctx->sc_policy->sp_sops->unwrap_bulk) {
+                rc = ctx->sc_policy->sp_sops->unwrap_bulk(req, desc);
+                if (rc)
+                        CERROR("error unwrap bulk: %d\n", rc);
+        }
  
+        /* return 0 even on unwrap error so that the reply is still sent */
          return 0;
  }
  EXPORT_SYMBOL(sptlrpc_svc_unwrap_bulk);
  
+int sptlrpc_svc_prep_bulk(struct ptlrpc_request *req,
+                          struct ptlrpc_bulk_desc *desc)
+{
+        struct ptlrpc_svc_ctx *ctx;
+
+        LASSERT(req->rq_bulk_write);    /* caller must pass a bulk write */
+
+        if (!req->rq_pack_bulk)
+                return 0;               /* flag unset: skip the policy hook */
+
+        ctx = req->rq_svc_ctx;
+        if (ctx->sc_policy->sp_sops->prep_bulk)
+                return ctx->sc_policy->sp_sops->prep_bulk(req, desc);
+
+        return 0;                       /* policy has no prep_bulk hook */
+}
+EXPORT_SYMBOL(sptlrpc_svc_prep_bulk);
  
  /****************************************
   * user descriptor helpers              *
@@ -2337,6 +2372,21 @@ const char * sec2target_str(struct ptlrpc_sec *sec)
  }
  EXPORT_SYMBOL(sec2target_str);
  
+/*
+ * return 1 if bulk data is protected (bulk svc is INTG or PRIV), else 0
+ */
+int sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr)
+{
+        switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) {
+        case SPTLRPC_BULK_SVC_INTG:
+        case SPTLRPC_BULK_SVC_PRIV:
+                return 1;
+        default:
+                return 0;
+        }
+}
+EXPORT_SYMBOL(sptlrpc_flavor_has_bulk);
+
  /****************************************
   * crypto API helper/alloc blkciper     *
   ****************************************/
diff --git a/lustre/ptlrpc/sec_bulk.c b/lustre/ptlrpc/sec_bulk.c

index 12ff171..c09cf0c 100644 (file)
--- a/lustre/ptlrpc/sec_bulk.c
+++ b/lustre/ptlrpc/sec_bulk.c
@@ -456,8 +456,10 @@ out:
  
  static inline void enc_pools_wakeup(void)
  {
+        LASSERT_SPIN_LOCKED(&page_pools.epp_lock);
+        LASSERT(page_pools.epp_waitqlen >= 0);
+
          if (unlikely(page_pools.epp_waitqlen)) {
-                LASSERT(page_pools.epp_waitqlen > 0);
                  LASSERT(cfs_waitq_active(&page_pools.epp_waitq));
                  cfs_waitq_broadcast(&page_pools.epp_waitq);
          }
@@ -476,11 +478,15 @@ static int enc_pools_should_grow(int page_needed, long now)
          if (page_pools.epp_total_pages < page_needed)
                  return 1;
  
-        /* if we just did a shrink due to memory tight, we'd better
-         * wait a while to grow again.
+        /*
+         * we wanted to return 0 here if a shrink had happened just a
+         * moment ago, but this may cause a deadlock if both client and ost
+         * live on a single node.
           */
+#if 0
          if (now - page_pools.epp_last_shrink < 2)
                  return 0;
+#endif
  
          /*
           * here we perhaps need consider other factors like wait queue
@@ -503,32 +509,32 @@ int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc)
          int             p_idx, g_idx;
          int             i;
  
-        LASSERT(desc->bd_max_iov > 0);
-        LASSERT(desc->bd_max_iov <= page_pools.epp_max_pages);
+        LASSERT(desc->bd_iov_count > 0);
+        LASSERT(desc->bd_iov_count <= page_pools.epp_max_pages);
  
-        /* resent bulk, enc pages might have been allocated previously */
-        if (desc->bd_enc_pages != NULL)
+        /* resent bulk, enc iov might have been allocated previously */
+        if (desc->bd_enc_iov != NULL)
                  return 0;
  
-        OBD_ALLOC(desc->bd_enc_pages,
-                  desc->bd_max_iov * sizeof(*desc->bd_enc_pages));
-        if (desc->bd_enc_pages == NULL)
+        OBD_ALLOC(desc->bd_enc_iov,
+                  desc->bd_iov_count * sizeof(*desc->bd_enc_iov));
+        if (desc->bd_enc_iov == NULL)
                  return -ENOMEM;
  
          spin_lock(&page_pools.epp_lock);
  
          page_pools.epp_st_access++;
  again:
-        if (unlikely(page_pools.epp_free_pages < desc->bd_max_iov)) {
+        if (unlikely(page_pools.epp_free_pages < desc->bd_iov_count)) {
                  if (tick == 0)
                          tick = cfs_time_current();
  
                  now = cfs_time_current_sec();
  
                  page_pools.epp_st_missings++;
-                page_pools.epp_pages_short += desc->bd_max_iov;
+                page_pools.epp_pages_short += desc->bd_iov_count;
  
-                if (enc_pools_should_grow(desc->bd_max_iov, now)) {
+                if (enc_pools_should_grow(desc->bd_iov_count, now)) {
                          page_pools.epp_growing = 1;
  
                          spin_unlock(&page_pools.epp_lock);
@@ -536,6 +542,8 @@ again:
                          spin_lock(&page_pools.epp_lock);
  
                          page_pools.epp_growing = 0;
+
+                        enc_pools_wakeup();
                  } else {
                          if (++page_pools.epp_waitqlen >
                              page_pools.epp_st_max_wqlen)
@@ -549,14 +557,13 @@ again:
                          spin_unlock(&page_pools.epp_lock);
                          cfs_waitq_wait(&waitlink, CFS_TASK_UNINT);
                          cfs_waitq_del(&page_pools.epp_waitq, &waitlink);
-                        spin_lock(&page_pools.epp_lock);
-
                          LASSERT(page_pools.epp_waitqlen > 0);
+                        spin_lock(&page_pools.epp_lock);
                          page_pools.epp_waitqlen--;
                  }
  
-                LASSERT(page_pools.epp_pages_short >= desc->bd_max_iov);
-                page_pools.epp_pages_short -= desc->bd_max_iov;
+                LASSERT(page_pools.epp_pages_short >= desc->bd_iov_count);
+                page_pools.epp_pages_short -= desc->bd_iov_count;
  
                  this_idle = 0;
                  goto again;
@@ -570,14 +577,15 @@ again:
          }
  
          /* proceed with rest of allocation */
-        page_pools.epp_free_pages -= desc->bd_max_iov;
+        page_pools.epp_free_pages -= desc->bd_iov_count;
  
          p_idx = page_pools.epp_free_pages / PAGES_PER_POOL;
          g_idx = page_pools.epp_free_pages % PAGES_PER_POOL;
  
-        for (i = 0; i < desc->bd_max_iov; i++) {
+        for (i = 0; i < desc->bd_iov_count; i++) {
                  LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL);
-                desc->bd_enc_pages[i] = page_pools.epp_pools[p_idx][g_idx];
+                desc->bd_enc_iov[i].kiov_page =
+                                        page_pools.epp_pools[p_idx][g_idx];
                  page_pools.epp_pools[p_idx][g_idx] = NULL;
  
                  if (++g_idx == PAGES_PER_POOL) {
@@ -612,26 +620,27 @@ void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc)
          int     p_idx, g_idx;
          int     i;
  
-        if (desc->bd_enc_pages == NULL)
-                return;
-        if (desc->bd_max_iov == 0)
+        if (desc->bd_enc_iov == NULL)
                  return;
  
+        LASSERT(desc->bd_iov_count > 0);
+
          spin_lock(&page_pools.epp_lock);
  
          p_idx = page_pools.epp_free_pages / PAGES_PER_POOL;
          g_idx = page_pools.epp_free_pages % PAGES_PER_POOL;
  
-        LASSERT(page_pools.epp_free_pages + desc->bd_max_iov <=
+        LASSERT(page_pools.epp_free_pages + desc->bd_iov_count <=
                  page_pools.epp_total_pages);
          LASSERT(page_pools.epp_pools[p_idx]);
  
-        for (i = 0; i < desc->bd_max_iov; i++) {
-                LASSERT(desc->bd_enc_pages[i] != NULL);
+        for (i = 0; i < desc->bd_iov_count; i++) {
+                LASSERT(desc->bd_enc_iov[i].kiov_page != NULL);
                  LASSERT(g_idx != 0 || page_pools.epp_pools[p_idx]);
                  LASSERT(page_pools.epp_pools[p_idx][g_idx] == NULL);
  
-                page_pools.epp_pools[p_idx][g_idx] = desc->bd_enc_pages[i];
+                page_pools.epp_pools[p_idx][g_idx] =
+                                        desc->bd_enc_iov[i].kiov_page;
  
                  if (++g_idx == PAGES_PER_POOL) {
                          p_idx++;
@@ -639,15 +648,15 @@ void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc)
                  }
          }
  
-        page_pools.epp_free_pages += desc->bd_max_iov;
+        page_pools.epp_free_pages += desc->bd_iov_count;
  
          enc_pools_wakeup();
  
          spin_unlock(&page_pools.epp_lock);
  
-        OBD_FREE(desc->bd_enc_pages,
-                 desc->bd_max_iov * sizeof(*desc->bd_enc_pages));
-        desc->bd_enc_pages = NULL;
+        OBD_FREE(desc->bd_enc_iov,
+                 desc->bd_iov_count * sizeof(*desc->bd_enc_iov));
+        desc->bd_enc_iov = NULL;
  }
  EXPORT_SYMBOL(sptlrpc_enc_pool_put_pages);
  
@@ -668,7 +677,8 @@ int sptlrpc_enc_pool_add_user(void)
          spin_unlock(&page_pools.epp_lock);
  
          if (need_grow) {
-                enc_pools_add_pages(PTLRPC_MAX_BRW_PAGES);
+                enc_pools_add_pages(PTLRPC_MAX_BRW_PAGES +
+                                    PTLRPC_MAX_BRW_PAGES);
  
                  spin_lock(&page_pools.epp_lock);
                  page_pools.epp_growing = 0;
@@ -815,9 +825,6 @@ static struct sptlrpc_hash_type hash_types[] = {
          [BULK_HASH_ALG_SHA256]  = { "sha256",   "sha256",       32 },
          [BULK_HASH_ALG_SHA384]  = { "sha384",   "sha384",       48 },
          [BULK_HASH_ALG_SHA512]  = { "sha512",   "sha512",       64 },
-        [BULK_HASH_ALG_WP256]   = { "wp256",    "wp256",        32 },
-        [BULK_HASH_ALG_WP384]   = { "wp384",    "wp384",        48 },
-        [BULK_HASH_ALG_WP512]   = { "wp512",    "wp512",        64 },
  };
  
  const struct sptlrpc_hash_type *sptlrpc_get_hash_type(__u8 hash_alg)
@@ -845,24 +852,21 @@ const char * sptlrpc_get_hash_name(__u8 hash_alg)
  }
  EXPORT_SYMBOL(sptlrpc_get_hash_name);
  
-int bulk_sec_desc_size(__u8 hash_alg, int request, int read)
+__u8 sptlrpc_get_hash_alg(const char *algname)
  {
-        int size = sizeof(struct ptlrpc_bulk_sec_desc);
-
-        LASSERT(hash_alg < BULK_HASH_ALG_MAX);
-
-        /* read request don't need extra data */
-        if (!(read && request))
-                size += hash_types[hash_alg].sht_size;
+        int     i;      /* index into hash_types[] */
  
-        return size;
+        for (i = 0; i < BULK_HASH_ALG_MAX; i++)
+                if (!strcmp(hash_types[i].sht_name, algname))
+                        break;
+        return i;       /* BULK_HASH_ALG_MAX if no name matched */
  }
-EXPORT_SYMBOL(bulk_sec_desc_size);
+EXPORT_SYMBOL(sptlrpc_get_hash_alg);
  
  int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset)
  {
          struct ptlrpc_bulk_sec_desc *bsd;
-        int    size = msg->lm_buflens[offset];
+        int                          size = msg->lm_buflens[offset];
  
          bsd = lustre_msg_buf(msg, offset, sizeof(*bsd));
          if (bsd == NULL) {
@@ -870,35 +874,27 @@ int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset)
                  return -EINVAL;
          }
  
-        /* nothing to swab */
+        if (lustre_msg_swabbed(msg)) {
+                __swab32s(&bsd->bsd_nob);
+        }
  
          if (unlikely(bsd->bsd_version != 0)) {
                  CERROR("Unexpected version %u\n", bsd->bsd_version);
                  return -EPROTO;
          }
  
-        if (unlikely(bsd->bsd_flags != 0)) {
-                CERROR("Unexpected flags %x\n", bsd->bsd_flags);
+        if (unlikely(bsd->bsd_type >= SPTLRPC_BULK_MAX)) {
+                CERROR("Invalid type %u\n", bsd->bsd_type);
                  return -EPROTO;
          }
  
-        if (unlikely(!sptlrpc_get_hash_type(bsd->bsd_hash_alg))) {
-                CERROR("Unsupported checksum algorithm %u\n",
-                       bsd->bsd_hash_alg);
-                return -EINVAL;
-        }
+        /* FIXME more sanity check here */
  
-        if (unlikely(!sptlrpc_get_ciph_type(bsd->bsd_ciph_alg))) {
-                CERROR("Unsupported cipher algorithm %u\n",
-                       bsd->bsd_ciph_alg);
-                return -EINVAL;
-        }
-
-        if (unlikely(size > sizeof(*bsd)) &&
-            size < sizeof(*bsd) + hash_types[bsd->bsd_hash_alg].sht_size) {
-                CERROR("Mal-formed checksum data: csum alg %u, size %d\n",
-                       bsd->bsd_hash_alg, size);
-                return -EINVAL;
+        if (unlikely(bsd->bsd_svc != SPTLRPC_BULK_SVC_NULL &&
+                     bsd->bsd_svc != SPTLRPC_BULK_SVC_INTG &&
+                     bsd->bsd_svc != SPTLRPC_BULK_SVC_PRIV)) {
+                CERROR("Invalid svc %u\n", bsd->bsd_svc);
+                return -EPROTO;
          }
  
          return 0;
@@ -957,14 +953,17 @@ static int do_bulk_checksum_crc32(struct ptlrpc_bulk_desc *desc, void *buf)
          return 0;
  }
  
-static int do_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u32 alg, void *buf)
+int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg,
+                              void *buf, int buflen)
  {
          struct hash_desc    hdesc;
-        struct scatterlist *sl;
-        int i, rc = 0, bytes = 0;
+        int                 hashsize;
+        char                hashbuf[64];
+        struct scatterlist  sl;
+        int                 i;
  
-        LASSERT(alg > BULK_HASH_ALG_NULL &&
-                alg < BULK_HASH_ALG_MAX);
+        LASSERT(alg > BULK_HASH_ALG_NULL && alg < BULK_HASH_ALG_MAX);
+        LASSERT(buflen >= 4);
  
          switch (alg) {
          case BULK_HASH_ALG_ADLER32:
@@ -983,35 +982,35 @@ static int do_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u32 alg, void *buf)
                  CERROR("Unable to allocate TFM %s\n", hash_types[alg].sht_name);
                  return -ENOMEM;
          }
+
          hdesc.flags = 0;
+        ll_crypto_hash_init(&hdesc);
  
-        OBD_ALLOC(sl, sizeof(*sl) * desc->bd_iov_count);
-        if (sl == NULL) {
-                rc = -ENOMEM;
-                goto out_tfm;
-        }
+        hashsize = ll_crypto_hash_digestsize(hdesc.tfm);
  
          for (i = 0; i < desc->bd_iov_count; i++) {
-                sl[i].page = desc->bd_iov[i].kiov_page;
-                sl[i].offset = desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK;
-                sl[i].length = desc->bd_iov[i].kiov_len;
-                bytes += desc->bd_iov[i].kiov_len;
+                sl.page = desc->bd_iov[i].kiov_page;
+                sl.offset = desc->bd_iov[i].kiov_offset;
+                sl.length = desc->bd_iov[i].kiov_len;
+                ll_crypto_hash_update(&hdesc, &sl, sl.length);
          }
  
-        ll_crypto_hash_init(&hdesc);
-        ll_crypto_hash_update(&hdesc, sl, bytes);
-        ll_crypto_hash_final(&hdesc, buf);
-
-        OBD_FREE(sl, sizeof(*sl) * desc->bd_iov_count);
+        if (hashsize > buflen) {
+                ll_crypto_hash_final(&hdesc, hashbuf);
+                memcpy(buf, hashbuf, buflen);
+        } else {
+                ll_crypto_hash_final(&hdesc, buf);
+        }
  
-out_tfm:
          ll_crypto_free_hash(hdesc.tfm);
-        return rc;
+        return 0;
  }
+EXPORT_SYMBOL(sptlrpc_get_bulk_checksum);
  
  #else /* !__KERNEL__ */
  
-static int do_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u32 alg, void *buf)
+int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg,
+                              void *buf, int buflen)
  {
          __u32   csum32;
          int     i;
@@ -1048,328 +1047,3 @@ static int do_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u32 alg, void *buf)
  }
  
  #endif /* __KERNEL__ */
-
-/*
- * perform algorithm @alg checksum on @desc, store result in @buf.
- * if anything goes wrong, leave 'alg' be BULK_HASH_ALG_NULL.
- */
-static
-int generate_bulk_csum(struct ptlrpc_bulk_desc *desc, __u32 alg,
-                       struct ptlrpc_bulk_sec_desc *bsd, int bsdsize)
-{
-        int rc;
-
-        LASSERT(bsd);
-        LASSERT(alg < BULK_HASH_ALG_MAX);
-
-        bsd->bsd_hash_alg = BULK_HASH_ALG_NULL;
-
-        if (alg == BULK_HASH_ALG_NULL)
-                return 0;
-
-        LASSERT(bsdsize >= sizeof(*bsd) + hash_types[alg].sht_size);
-
-        rc = do_bulk_checksum(desc, alg, bsd->bsd_csum);
-        if (rc == 0)
-                bsd->bsd_hash_alg = alg;
-
-        return rc;
-}
-
-static
-int verify_bulk_csum(struct ptlrpc_bulk_desc *desc, int read,
-                     struct ptlrpc_bulk_sec_desc *bsdv, int bsdvsize,
-                     struct ptlrpc_bulk_sec_desc *bsdr, int bsdrsize)
-{
-        char *csum_p;
-        char *buf = NULL;
-        int   csum_size, rc = 0;
-
-        LASSERT(bsdv);
-        LASSERT(bsdv->bsd_hash_alg < BULK_HASH_ALG_MAX);
-
-        if (bsdr)
-                bsdr->bsd_hash_alg = BULK_HASH_ALG_NULL;
-
-        if (bsdv->bsd_hash_alg == BULK_HASH_ALG_NULL)
-                return 0;
-
-        /* for all supported algorithms */
-        csum_size = hash_types[bsdv->bsd_hash_alg].sht_size;
-
-        if (bsdvsize < sizeof(*bsdv) + csum_size) {
-                CERROR("verifier size %d too small, require %d\n",
-                       bsdvsize, (int) sizeof(*bsdv) + csum_size);
-                return -EINVAL;
-        }
-
-        if (bsdr) {
-                LASSERT(bsdrsize >= sizeof(*bsdr) + csum_size);
-                csum_p = (char *) bsdr->bsd_csum;
-        } else {
-                OBD_ALLOC(buf, csum_size);
-                if (buf == NULL)
-                        return -EINVAL;
-                csum_p = buf;
-        }
-
-        rc = do_bulk_checksum(desc, bsdv->bsd_hash_alg, csum_p);
-
-        if (memcmp(bsdv->bsd_csum, csum_p, csum_size)) {
-                CERROR("BAD %s CHECKSUM (%s), data mutated during "
-                       "transfer!\n", read ? "READ" : "WRITE",
-                       hash_types[bsdv->bsd_hash_alg].sht_name);
-                rc = -EINVAL;
-        } else {
-                CDEBUG(D_SEC, "bulk %s checksum (%s) verified\n",
-                      read ? "read" : "write",
-                      hash_types[bsdv->bsd_hash_alg].sht_name);
-        }
-
-        if (bsdr) {
-                bsdr->bsd_hash_alg = bsdv->bsd_hash_alg;
-                memcpy(bsdr->bsd_csum, csum_p, csum_size);
-        } else {
-                LASSERT(buf);
-                OBD_FREE(buf, csum_size);
-        }
-
-        return rc;
-}
-
-int bulk_csum_cli_request(struct ptlrpc_bulk_desc *desc, int read,
-                          __u32 alg, struct lustre_msg *rmsg, int roff)
-{
-        struct ptlrpc_bulk_sec_desc *bsdr;
-        int    rsize, rc = 0;
-
-        rsize = rmsg->lm_buflens[roff];
-        bsdr = lustre_msg_buf(rmsg, roff, sizeof(*bsdr));
-
-        LASSERT(bsdr);
-        LASSERT(rsize >= sizeof(*bsdr));
-        LASSERT(alg < BULK_HASH_ALG_MAX);
-
-        if (read) {
-                bsdr->bsd_hash_alg = alg;
-        } else {
-                rc = generate_bulk_csum(desc, alg, bsdr, rsize);
-                if (rc)
-                        CERROR("bulk write: client failed to compute "
-                               "checksum: %d\n", rc);
-
-                /* For sending we only compute the wrong checksum instead
-                 * of corrupting the data so it is still correct on a redo */
-                if (rc == 0 && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND) &&
-                    bsdr->bsd_hash_alg != BULK_HASH_ALG_NULL)
-                        bsdr->bsd_csum[0] ^= 0x1;
-        }
-
-        return rc;
-}
-EXPORT_SYMBOL(bulk_csum_cli_request);
-
-int bulk_csum_cli_reply(struct ptlrpc_bulk_desc *desc, int read,
-                        struct lustre_msg *rmsg, int roff,
-                        struct lustre_msg *vmsg, int voff)
-{
-        struct ptlrpc_bulk_sec_desc *bsdv, *bsdr;
-        int    rsize, vsize;
-
-        rsize = rmsg->lm_buflens[roff];
-        vsize = vmsg->lm_buflens[voff];
-        bsdr = lustre_msg_buf(rmsg, roff, 0);
-        bsdv = lustre_msg_buf(vmsg, voff, 0);
-
-        if (bsdv == NULL || vsize < sizeof(*bsdv)) {
-                CERROR("Invalid checksum verifier from server: size %d\n",
-                       vsize);
-                return -EINVAL;
-        }
-
-        LASSERT(bsdr);
-        LASSERT(rsize >= sizeof(*bsdr));
-        LASSERT(vsize >= sizeof(*bsdv));
-
-        if (bsdr->bsd_hash_alg != bsdv->bsd_hash_alg) {
-                CERROR("bulk %s: checksum algorithm mismatch: client request "
-                       "%s but server reply with %s. try to use the new one "
-                       "for checksum verification\n",
-                       read ? "read" : "write",
-                       hash_types[bsdr->bsd_hash_alg].sht_name,
-                       hash_types[bsdv->bsd_hash_alg].sht_name);
-        }
-
-        if (read)
-                return verify_bulk_csum(desc, 1, bsdv, vsize, NULL, 0);
-        else {
-                char *cli, *srv, *new = NULL;
-                int csum_size = hash_types[bsdr->bsd_hash_alg].sht_size;
-
-                LASSERT(bsdr->bsd_hash_alg < BULK_HASH_ALG_MAX);
-                if (bsdr->bsd_hash_alg == BULK_HASH_ALG_NULL)
-                        return 0;
-
-                if (vsize < sizeof(*bsdv) + csum_size) {
-                        CERROR("verifier size %d too small, require %d\n",
-                               vsize, (int) sizeof(*bsdv) + csum_size);
-                        return -EINVAL;
-                }
-
-                cli = (char *) (bsdr + 1);
-                srv = (char *) (bsdv + 1);
-
-                if (!memcmp(cli, srv, csum_size)) {
-                        /* checksum confirmed */
-                        CDEBUG(D_SEC, "bulk write checksum (%s) confirmed\n",
-                               hash_types[bsdr->bsd_hash_alg].sht_name);
-                        return 0;
-                }
-
-                /* checksum mismatch, re-compute a new one and compare with
-                 * others, give out proper warnings. */
-                OBD_ALLOC(new, csum_size);
-                if (new == NULL)
-                        return -ENOMEM;
-
-                do_bulk_checksum(desc, bsdr->bsd_hash_alg, new);
-
-                if (!memcmp(new, srv, csum_size)) {
-                        CERROR("BAD WRITE CHECKSUM (%s): pages were mutated "
-                               "on the client after we checksummed them\n",
-                               hash_types[bsdr->bsd_hash_alg].sht_name);
-                } else if (!memcmp(new, cli, csum_size)) {
-                        CERROR("BAD WRITE CHECKSUM (%s): pages were mutated "
-                               "in transit\n",
-                               hash_types[bsdr->bsd_hash_alg].sht_name);
-                } else {
-                        CERROR("BAD WRITE CHECKSUM (%s): pages were mutated "
-                               "in transit, and the current page contents "
-                               "don't match the originals and what the server "
-                               "received\n",
-                               hash_types[bsdr->bsd_hash_alg].sht_name);
-                }
-                OBD_FREE(new, csum_size);
-
-                return -EINVAL;
-        }
-}
-EXPORT_SYMBOL(bulk_csum_cli_reply);
-
-#ifdef __KERNEL__
-static void corrupt_bulk_data(struct ptlrpc_bulk_desc *desc)
-{
-        char           *ptr;
-        unsigned int    off, i;
-
-        for (i = 0; i < desc->bd_iov_count; i++) {
-                if (desc->bd_iov[i].kiov_len == 0)
-                        continue;
-
-                ptr = cfs_kmap(desc->bd_iov[i].kiov_page);
-                off = desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK;
-                ptr[off] ^= 0x1;
-                cfs_kunmap(desc->bd_iov[i].kiov_page);
-                return;
-        }
-}
-#else
-static void corrupt_bulk_data(struct ptlrpc_bulk_desc *desc)
-{
-}
-#endif /* __KERNEL__ */
-
-int bulk_csum_svc(struct ptlrpc_bulk_desc *desc, int read,
-                  struct ptlrpc_bulk_sec_desc *bsdv, int vsize,
-                  struct ptlrpc_bulk_sec_desc *bsdr, int rsize)
-{
-        int    rc;
-
-        LASSERT(vsize >= sizeof(*bsdv));
-        LASSERT(rsize >= sizeof(*bsdr));
-        LASSERT(bsdv && bsdr);
-
-        if (read) {
-                rc = generate_bulk_csum(desc, bsdv->bsd_hash_alg, bsdr, rsize);
-                if (rc)
-                        CERROR("bulk read: server failed to generate %s "
-                               "checksum: %d\n",
-                               hash_types[bsdv->bsd_hash_alg].sht_name, rc);
-
-                /* corrupt the data after we compute the checksum, to
-                 * simulate an OST->client data error */
-                if (rc == 0 && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE))
-                        corrupt_bulk_data(desc);
-        } else {
-                rc = verify_bulk_csum(desc, 0, bsdv, vsize, bsdr, rsize);
-        }
-
-        return rc;
-}
-EXPORT_SYMBOL(bulk_csum_svc);
-
-/****************************************
- * Helpers to assist policy modules to  *
- * implement encryption funcationality  *
- ****************************************/
-
-/* FIXME */
-#ifndef __KERNEL__
-#define CRYPTO_TFM_MODE_ECB     (0)
-#define CRYPTO_TFM_MODE_CBC     (1)
-#endif
-
-static struct sptlrpc_ciph_type cipher_types[] = {
-        [BULK_CIPH_ALG_NULL]    = {
-                "null",         "null",       0,                   0,  0
-        },
-        [BULK_CIPH_ALG_ARC4]    = {
-                "arc4",         "ecb(arc4)",       0, 0,  16
-        },
-        [BULK_CIPH_ALG_AES128]  = {
-                "aes128",       "cbc(aes)",        0, 16, 16
-        },
-        [BULK_CIPH_ALG_AES192]  = {
-                "aes192",       "cbc(aes)",        0, 16, 24
-        },
-        [BULK_CIPH_ALG_AES256]  = {
-                "aes256",       "cbc(aes)",        0, 16, 32
-        },
-        [BULK_CIPH_ALG_CAST128] = {
-                "cast128",      "cbc(cast5)",      0, 8,  16
-        },
-        [BULK_CIPH_ALG_CAST256] = {
-                "cast256",      "cbc(cast6)",      0, 16, 32
-        },
-        [BULK_CIPH_ALG_TWOFISH128] = {
-                "twofish128",   "cbc(twofish)",    0, 16, 16
-        },
-        [BULK_CIPH_ALG_TWOFISH256] = {
-                "twofish256",   "cbc(twofish)",    0, 16, 32
-        },
-};
-
-const struct sptlrpc_ciph_type *sptlrpc_get_ciph_type(__u8 ciph_alg)
-{
-        struct sptlrpc_ciph_type *ct;
-
-        if (ciph_alg < BULK_CIPH_ALG_MAX) {
-                ct = &cipher_types[ciph_alg];
-                if (ct->sct_tfm_name)
-                        return ct;
-        }
-        return NULL;
-}
-EXPORT_SYMBOL(sptlrpc_get_ciph_type);
-
-const char *sptlrpc_get_ciph_name(__u8 ciph_alg)
-{
-        const struct sptlrpc_ciph_type *ct;
-
-        ct = sptlrpc_get_ciph_type(ciph_alg);
-        if (ct)
-                return ct->sct_name;
-        else
-                return "unknown";
-}
-EXPORT_SYMBOL(sptlrpc_get_ciph_name);
diff --git a/lustre/ptlrpc/sec_config.c b/lustre/ptlrpc/sec_config.c

index b54a3a4..e9fe66f 100644 (file)
--- a/lustre/ptlrpc/sec_config.c
+++ b/lustre/ptlrpc/sec_config.c
@@ -102,222 +102,67 @@ EXPORT_SYMBOL(sptlrpc_target_sec_part);
   * user supplied flavor string parsing  *
   ****************************************/
  
-#ifdef HAVE_ADLER
-#define BULK_HASH_ALG_DEFAULT   BULK_HASH_ALG_ADLER32
-#else
-#define BULK_HASH_ALG_DEFAULT   BULK_HASH_ALG_CRC32
-#endif
-
-typedef enum {
-        BULK_TYPE_N = 0,
-        BULK_TYPE_I = 1,
-        BULK_TYPE_P = 2
-} bulk_type_t;
-
-static void get_default_flavor(struct sptlrpc_flavor *sf)
-{
-        sf->sf_rpc = SPTLRPC_FLVR_NULL;
-        sf->sf_bulk_ciph = BULK_CIPH_ALG_NULL;
-        sf->sf_bulk_hash = BULK_HASH_ALG_NULL;
-        sf->sf_flags = 0;
-}
-
-static void get_flavor_by_rpc(struct sptlrpc_flavor *flvr, __u16 rpc_flavor)
-{
-        get_default_flavor(flvr);
-
-        flvr->sf_rpc = rpc_flavor;
-
-        switch (rpc_flavor) {
-        case SPTLRPC_FLVR_NULL:
-                break;
-        case SPTLRPC_FLVR_PLAIN:
-        case SPTLRPC_FLVR_KRB5N:
-        case SPTLRPC_FLVR_KRB5A:
-                flvr->sf_bulk_hash = BULK_HASH_ALG_DEFAULT;
-                break;
-        case SPTLRPC_FLVR_KRB5P:
-                flvr->sf_bulk_ciph = BULK_CIPH_ALG_AES128;
-                /* fall through */
-        case SPTLRPC_FLVR_KRB5I:
-                flvr->sf_bulk_hash = BULK_HASH_ALG_SHA1;
-                break;
-        default:
-                LBUG();
-        }
-}
-
-static void get_flavor_by_bulk(struct sptlrpc_flavor *flvr,
-                               __u16 rpc_flavor, bulk_type_t bulk_type)
-{
-        switch (bulk_type) {
-        case BULK_TYPE_N:
-                flvr->sf_bulk_hash = BULK_HASH_ALG_NULL;
-                flvr->sf_bulk_ciph = BULK_CIPH_ALG_NULL;
-                break;
-        case BULK_TYPE_I:
-                switch (rpc_flavor) {
-                case SPTLRPC_FLVR_PLAIN:
-                case SPTLRPC_FLVR_KRB5N:
-                case SPTLRPC_FLVR_KRB5A:
-                        flvr->sf_bulk_hash = BULK_HASH_ALG_DEFAULT;
-                        break;
-                case SPTLRPC_FLVR_KRB5I:
-                case SPTLRPC_FLVR_KRB5P:
-                        flvr->sf_bulk_hash = BULK_HASH_ALG_SHA1;
-                        break;
-                default:
-                        LBUG();
-                }
-                flvr->sf_bulk_ciph = BULK_CIPH_ALG_NULL;
-                break;
-        case BULK_TYPE_P:
-                flvr->sf_bulk_hash = BULK_HASH_ALG_SHA1;
-                flvr->sf_bulk_ciph = BULK_CIPH_ALG_AES128;
-                break;
-        default:
-                LBUG();
-        }
-}
-
-static __u16 __flavors[] = {
-        SPTLRPC_FLVR_NULL,
-        SPTLRPC_FLVR_PLAIN,
-        SPTLRPC_FLVR_KRB5N,
-        SPTLRPC_FLVR_KRB5A,
-        SPTLRPC_FLVR_KRB5I,
-        SPTLRPC_FLVR_KRB5P,
-};
-
-#define __nflavors      ARRAY_SIZE(__flavors)
-
  /*
- * flavor string format: rpc[-bulk{n|i|p}[:cksum/enc]]
- * for examples:
- *  null
- *  plain-bulki
- *  krb5p-bulkn
- *  krb5i-bulkp
- *  krb5i-bulkp:sha512/arc4
+ * format: <base_flavor>[-<bulk_type:alg_spec>]
   */
  int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr)
  {
-        const char     *f;
-        char           *bulk, *alg, *enc;
-        char            buf[64];
-        bulk_type_t     bulk_type;
-        __u8            i;
-        ENTRY;
+        char            buf[32];
+        char           *bulk, *alg;
+
+        memset(flvr, 0, sizeof(*flvr));
  
          if (str == NULL || str[0] == '\0') {
                  flvr->sf_rpc = SPTLRPC_FLVR_INVALID;
-                goto out;
+                return 0;
          }
  
-        for (i = 0; i < __nflavors; i++) {
-                f = sptlrpc_rpcflavor2name(__flavors[i]);
-                if (strncmp(str, f, strlen(f)) == 0)
-                        break;
-        }
-
-        if (i >= __nflavors)
-                GOTO(invalid, -EINVAL);
+        strncpy(buf, str, sizeof(buf));
+        buf[sizeof(buf) - 1] = '\0';
  
-        /* prepare local buffer thus we can modify it as we want */
-        strncpy(buf, str, 64);
-        buf[64 - 1] = '\0';
-
-        /* find bulk string */
          bulk = strchr(buf, '-');
          if (bulk)
                  *bulk++ = '\0';
  
-        /* now the first part must equal to rpc flavor name */
-        if (strcmp(buf, f) != 0)
-                GOTO(invalid, -EINVAL);
-
-        get_flavor_by_rpc(flvr, __flavors[i]);
-
-        if (bulk == NULL)
-                goto out;
-
-        /* find bulk algorithm string */
-        alg = strchr(bulk, ':');
-        if (alg)
-                *alg++ = '\0';
-
-        /* verify bulk section */
-        if (strcmp(bulk, "bulkn") == 0) {
-                flvr->sf_bulk_hash = BULK_HASH_ALG_NULL;
-                flvr->sf_bulk_ciph = BULK_CIPH_ALG_NULL;
-                bulk_type = BULK_TYPE_N;
-        } else if (strcmp(bulk, "bulki") == 0)
-                bulk_type = BULK_TYPE_I;
-        else if (strcmp(bulk, "bulkp") == 0)
-                bulk_type = BULK_TYPE_P;
-        else
-                GOTO(invalid, -EINVAL);
-
-        /* null flavor don't support bulk i/p */
-        if (__flavors[i] == SPTLRPC_FLVR_NULL && bulk_type != BULK_TYPE_N)
-                GOTO(invalid, -EINVAL);
-
-        /* plain policy dosen't support bulk p */
-        if (__flavors[i] == SPTLRPC_FLVR_PLAIN && bulk_type == BULK_TYPE_P)
-                GOTO(invalid, -EINVAL);
-
-        get_flavor_by_bulk(flvr, __flavors[i], bulk_type);
-
-        if (alg == NULL)
-                goto out;
-
-        /* find encryption algorithm string */
-        enc = strchr(alg, '/');
-        if (enc)
-                *enc++ = '\0';
-
-        /* checksum algorithm */
-        for (i = 0; i < BULK_HASH_ALG_MAX; i++) {
-                if (strcmp(alg, sptlrpc_get_hash_name(i)) == 0) {
-                        flvr->sf_bulk_hash = i;
-                        break;
-                }
-        }
-        if (i >= BULK_HASH_ALG_MAX)
-                GOTO(invalid, -EINVAL);
-
-        /* privacy algorithm */
-        if (enc) {
-                for (i = 0; i < BULK_CIPH_ALG_MAX; i++) {
-                        if (strcmp(enc, sptlrpc_get_ciph_name(i)) == 0) {
-                                flvr->sf_bulk_ciph = i;
-                                break;
-                        }
-                }
-                if (i >= BULK_CIPH_ALG_MAX)
-                        GOTO(invalid, -EINVAL);
-        }
+        flvr->sf_rpc = sptlrpc_name2flavor_base(buf);
+        if (flvr->sf_rpc == SPTLRPC_FLVR_INVALID)
+                goto err_out;
  
          /*
-         * bulk combination sanity checks
+         * currently only base flavor "plain" can have bulk specification.
           */
-        if (bulk_type == BULK_TYPE_P &&
-            flvr->sf_bulk_ciph == BULK_CIPH_ALG_NULL)
-                GOTO(invalid, -EINVAL);
-
-        if (bulk_type == BULK_TYPE_I &&
-            (flvr->sf_bulk_hash == BULK_HASH_ALG_NULL ||
-             flvr->sf_bulk_ciph != BULK_CIPH_ALG_NULL))
-                GOTO(invalid, -EINVAL);
+        if (flvr->sf_rpc == SPTLRPC_FLVR_PLAIN) {
+                flvr->u_bulk.hash.hash_alg = BULK_HASH_ALG_ADLER32;
+                if (bulk) {
+                        /*
+                         * format: plain-hash:<hash_alg>
+                         */
+                        alg = strchr(bulk, ':');
+                        if (alg == NULL)
+                                goto err_out;
+                        *alg++ = '\0';
+
+                        if (strcmp(bulk, "hash"))
+                                goto err_out;
+
+                        flvr->u_bulk.hash.hash_alg = sptlrpc_get_hash_alg(alg);
+                        if (flvr->u_bulk.hash.hash_alg >= BULK_HASH_ALG_MAX)
+                                goto err_out;
+                }
  
-        if (bulk_type == BULK_TYPE_N &&
-            (flvr->sf_bulk_hash != BULK_HASH_ALG_NULL ||
-             flvr->sf_bulk_ciph != BULK_CIPH_ALG_NULL))
-                GOTO(invalid, -EINVAL);
+                if (flvr->u_bulk.hash.hash_alg == BULK_HASH_ALG_NULL)
+                        flvr_set_bulk_svc(&flvr->sf_rpc, SPTLRPC_BULK_SVC_NULL);
+                else
+                        flvr_set_bulk_svc(&flvr->sf_rpc, SPTLRPC_BULK_SVC_INTG);
+        } else {
+                if (bulk)
+                        goto err_out;
+        }
  
-out:
+        flvr->sf_flags = 0;
          return 0;
-invalid:
+
+err_out:
          CERROR("invalid flavor string: %s\n", str);
          return -EINVAL;
  }
@@ -327,6 +172,14 @@ EXPORT_SYMBOL(sptlrpc_parse_flavor);
   * configure rules                      *
   ****************************************/
  
+static void get_default_flavor(struct sptlrpc_flavor *sf)
+{
+        memset(sf, 0, sizeof(*sf));
+
+        sf->sf_rpc = SPTLRPC_FLVR_NULL;
+        sf->sf_flags = 0;
+}
+
  static void sptlrpc_rule_init(struct sptlrpc_rule *rule)
  {
          rule->sr_netid = LNET_NIDNET(LNET_NID_ANY);
@@ -411,19 +264,17 @@ EXPORT_SYMBOL(sptlrpc_rule_set_free);
  
  /*
   * return 0 if the rule set could accomodate one more rule.
- * if @expand != 0, the rule set might be expanded.
   */
-int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *rset, int expand)
+int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *rset)
  {
          struct sptlrpc_rule *rules;
          int nslot;
  
+        might_sleep();
+
          if (rset->srs_nrule < rset->srs_nslot)
                  return 0; 
  
-        if (expand == 0)
-                return -E2BIG;
-
          nslot = rset->srs_nslot + 8;
  
          /* better use realloc() if available */
@@ -468,16 +319,17 @@ static inline int rule_match_net(struct sptlrpc_rule *r1,
  
  /*
   * merge @rule into @rset.
- * if @expand != 0 then @rset slots might be expanded.
+ * the @rset slots might be expanded.
   */
  int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *rset, 
-                           struct sptlrpc_rule *rule,
-                           int expand)
+                           struct sptlrpc_rule *rule)
  {
          struct sptlrpc_rule      *p = rset->srs_rules;
          int                       spec_dir, spec_net;
          int                       rc, n, match = 0;
  
+        might_sleep();
+
          spec_net = rule_spec_net(rule);
          spec_dir = rule_spec_dir(rule);
  
@@ -537,7 +389,7 @@ int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *rset,
                  LASSERT(n >= 0 && n <= rset->srs_nrule);
  
                  if (rule->sr_flvr.sf_rpc != SPTLRPC_FLVR_INVALID) {
-                        rc = sptlrpc_rule_set_expand(rset, expand);
+                        rc = sptlrpc_rule_set_expand(rset);
                          if (rc)
                                  return rc;
  
@@ -616,6 +468,8 @@ static int sptlrpc_rule_set_extract(struct sptlrpc_rule_set *gen,
          struct sptlrpc_rule     *rule;
          int                      i, n, rc;
  
+        might_sleep();
+
          /* merge general rules firstly, then target-specific rules */
          for (i = 0; i < 2; i++) {
                  if (src[i] == NULL)
@@ -633,7 +487,7 @@ static int sptlrpc_rule_set_extract(struct sptlrpc_rule_set *gen,
                              rule->sr_to != to)
                                  continue;
  
-                        rc = sptlrpc_rule_set_merge(rset, rule, 1);
+                        rc = sptlrpc_rule_set_merge(rset, rule);
                          if (rc) {
                                  CERROR("can't merge: %d\n", rc);
                                  return rc;
@@ -800,7 +654,7 @@ static int sptlrpc_conf_merge_rule(struct sptlrpc_conf *conf,
                  }
          }
  
-        return sptlrpc_rule_set_merge(rule_set, rule, 1);
+        return sptlrpc_rule_set_merge(rule_set, rule);
  }
  
  /**
@@ -829,7 +683,7 @@ static int __sptlrpc_process_config(struct lustre_cfg *lcfg,
                  RETURN(-EINVAL);
          }
  
-        CDEBUG(D_SEC, "got one rule: %s.%s\n", target, param);
+        CDEBUG(D_SEC, "processing rule: %s.%s\n", target, param);
  
          /* parse rule to make sure the format is correct */
          if (strncmp(param, PARAM_SRPC_FLVR, sizeof(PARAM_SRPC_FLVR) - 1) != 0) {
@@ -974,6 +828,13 @@ static void inline flavor_set_flags(struct sptlrpc_flavor *sf,
                                      enum lustre_sec_part to,
                                      unsigned int fl_udesc)
  {
+        /*
+         * null flavor doesn't need any flags set, and in fact we'd
+         * better not set them because everybody shares a single sec.
+         */
+        if (sf->sf_rpc == SPTLRPC_FLVR_NULL)
+                return;
+
          if (from == LUSTRE_SP_MDT) {
                  /* MDT->MDT; MDT->OST */
                  sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY;
diff --git a/lustre/ptlrpc/sec_lproc.c b/lustre/ptlrpc/sec_lproc.c

index 51bace7..5a6fae9 100644 (file)
--- a/lustre/ptlrpc/sec_lproc.c
+++ b/lustre/ptlrpc/sec_lproc.c
@@ -66,7 +66,7 @@
  struct proc_dir_entry *sptlrpc_proc_root = NULL;
  EXPORT_SYMBOL(sptlrpc_proc_root);
  
-void sec_flags2str(unsigned long flags, char *buf, int bufsize)
+char *sec_flags2str(unsigned long flags, char *buf, int bufsize)
  {
          buf[0] = '\0';
  
@@ -82,7 +82,7 @@ void sec_flags2str(unsigned long flags, char *buf, int bufsize)
                  strncat(buf, "-,", bufsize);
  
          buf[strlen(buf) - 1] = '\0';
-
+        return buf;
  }
  
  static int sptlrpc_info_lprocfs_seq_show(struct seq_file *seq, void *v)
@@ -90,7 +90,7 @@ static int sptlrpc_info_lprocfs_seq_show(struct seq_file *seq, void *v)
          struct obd_device *dev = seq->private;
          struct client_obd *cli = &dev->u.cli;
          struct ptlrpc_sec *sec = NULL;
-        char               flags_str[32];
+        char               str[32];
  
          LASSERT(strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 ||
                  strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 ||
@@ -101,14 +101,14 @@ static int sptlrpc_info_lprocfs_seq_show(struct seq_file *seq, void *v)
          if (sec == NULL)
                  goto out;
  
-        sec_flags2str(sec->ps_flvr.sf_flags, flags_str, sizeof(flags_str));
+        sec_flags2str(sec->ps_flvr.sf_flags, str, sizeof(str));
  
          seq_printf(seq, "rpc flavor:    %s\n",
-                   sptlrpc_rpcflavor2name(sec->ps_flvr.sf_rpc));
-        seq_printf(seq, "bulk flavor:   %s/%s\n",
-                   sptlrpc_get_hash_name(sec->ps_flvr.sf_bulk_hash),
-                   sptlrpc_get_ciph_name(sec->ps_flvr.sf_bulk_ciph));
-        seq_printf(seq, "flags:         %s\n", flags_str);
+                   sptlrpc_flavor2name_base(sec->ps_flvr.sf_rpc));
+        seq_printf(seq, "bulk flavor:   %s\n",
+                   sptlrpc_flavor2name_bulk(&sec->ps_flvr, str, sizeof(str)));
+        seq_printf(seq, "flags:         %s\n",
+                   sec_flags2str(sec->ps_flvr.sf_flags, str, sizeof(str)));
          seq_printf(seq, "id:            %d\n", sec->ps_id);
          seq_printf(seq, "refcount:      %d\n", atomic_read(&sec->ps_refcount));
          seq_printf(seq, "nctx:          %d\n", atomic_read(&sec->ps_nctx));
diff --git a/lustre/ptlrpc/sec_null.c b/lustre/ptlrpc/sec_null.c

index 7b4368d..08baf12 100644 (file)
--- a/lustre/ptlrpc/sec_null.c
+++ b/lustre/ptlrpc/sec_null.c
@@ -59,13 +59,13 @@ static struct ptlrpc_cli_ctx    null_cli_ctx;
  static struct ptlrpc_svc_ctx    null_svc_ctx;
  
  /*
- * null sec temporarily use the third byte of lm_secflvr to identify
+ * we can temporarily use the topmost 8 bits of lm_secflvr to identify
   * the source sec part.
   */
  static inline
  void null_encode_sec_part(struct lustre_msg *msg, enum lustre_sec_part sp)
  {
-        msg->lm_secflvr |= (((__u32) sp) & 0xFF) << 16;
+        msg->lm_secflvr |= (((__u32) sp) & 0xFF) << 24;
  }
  
  static inline
@@ -73,9 +73,9 @@ enum lustre_sec_part null_decode_sec_part(struct lustre_msg *msg)
  {
          switch (msg->lm_magic) {
          case LUSTRE_MSG_MAGIC_V2:
-                return (msg->lm_secflvr >> 16) & 0xFF;
+                return (msg->lm_secflvr >> 24) & 0xFF;
          case LUSTRE_MSG_MAGIC_V2_SWABBED:
-                return (msg->lm_secflvr >> 8) & 0xFF;
+                return (msg->lm_secflvr) & 0xFF;
          default:
                  return LUSTRE_SP_ANY;
          }
@@ -135,14 +135,7 @@ struct ptlrpc_sec *null_create_sec(struct obd_import *imp,
                                     struct ptlrpc_svc_ctx *svc_ctx,
                                     struct sptlrpc_flavor *sf)
  {
-        LASSERT(RPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_NULL);
-
-        if (sf->sf_bulk_ciph != BULK_CIPH_ALG_NULL ||
-            sf->sf_bulk_hash != BULK_HASH_ALG_NULL) {
-                CERROR("null sec don't support bulk algorithm: %u/%u\n",
-                       sf->sf_bulk_ciph, sf->sf_bulk_hash);
-                return NULL;
-        }
+        LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_NULL);
  
          /* general layer has take a module reference for us, because we never
           * really destroy the sec, simply release the reference here.
@@ -300,7 +293,8 @@ static struct ptlrpc_svc_ctx null_svc_ctx = {
  static
  int null_accept(struct ptlrpc_request *req)
  {
-        LASSERT(RPC_FLVR_POLICY(req->rq_flvr.sf_rpc) == SPTLRPC_POLICY_NULL);
+        LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) ==
+                SPTLRPC_POLICY_NULL);
  
          if (req->rq_flvr.sf_rpc != SPTLRPC_FLVR_NULL) {
                  CERROR("Invalid rpc flavor 0x%x\n", req->rq_flvr.sf_rpc);
@@ -428,8 +422,6 @@ static void null_init_internal(void)
          null_sec.ps_id = -1;
          null_sec.ps_import = NULL;
          null_sec.ps_flvr.sf_rpc = SPTLRPC_FLVR_NULL;
-        null_sec.ps_flvr.sf_bulk_ciph = BULK_CIPH_ALG_NULL;
-        null_sec.ps_flvr.sf_bulk_hash = BULK_HASH_ALG_NULL;
          null_sec.ps_flvr.sf_flags = 0;
          null_sec.ps_part = LUSTRE_SP_ANY;
          null_sec.ps_dying = 0;
diff --git a/lustre/ptlrpc/sec_plain.c b/lustre/ptlrpc/sec_plain.c

index eb9ee82..9b03d77 100644 (file)
--- a/lustre/ptlrpc/sec_plain.c
+++ b/lustre/ptlrpc/sec_plain.c
@@ -71,44 +71,124 @@ static struct ptlrpc_svc_ctx    plain_svc_ctx;
  static unsigned int plain_at_offset;
  
  /*
- * flavor flags (maximum 8 flags)
+ * for simplicity, plain policy rpc uses a fixed layout.
   */
-#define PLAIN_WFLVR_FLAGS_OFFSET        (12)
-#define PLAIN_WFLVR_FLAG_BULK           (1 << (0 + PLAIN_WFLVR_FLAGS_OFFSET))
-#define PLAIN_WFLVR_FLAG_USER           (1 << (1 + PLAIN_WFLVR_FLAGS_OFFSET))
+#define PLAIN_PACK_SEGMENTS             (4)
+
+#define PLAIN_PACK_HDR_OFF              (0)
+#define PLAIN_PACK_MSG_OFF              (1)
+#define PLAIN_PACK_USER_OFF             (2)
+#define PLAIN_PACK_BULK_OFF             (3)
+
+#define PLAIN_FL_USER                   (0x01)
+#define PLAIN_FL_BULK                   (0x02)
+
+struct plain_header {
+        __u8            ph_ver;            /* 0 */
+        __u8            ph_flags;
+        __u8            ph_sp;             /* source */
+        __u8            ph_bulk_hash_alg;  /* complete flavor desc */
+        __u8            ph_pad[4];
+};
  
-#define PLAIN_WFLVR_HAS_BULK(wflvr)      \
-        (((wflvr) & PLAIN_WFLVR_FLAG_BULK) != 0)
-#define PLAIN_WFLVR_HAS_USER(wflvr)      \
-        (((wflvr) & PLAIN_WFLVR_FLAG_USER) != 0)
+struct plain_bulk_token {
+        __u8            pbt_hash[8];
+};
  
-#define PLAIN_WFLVR_TO_RPC(wflvr)       \
-        ((wflvr) & ((1 << PLAIN_WFLVR_FLAGS_OFFSET) - 1))
+#define PLAIN_BSD_SIZE \
+        (sizeof(struct ptlrpc_bulk_sec_desc) + sizeof(struct plain_bulk_token))
  
-/*
- * similar to null sec, temporarily use the third byte of lm_secflvr to identify
- * the source sec part.
- */
-static inline
-void plain_encode_sec_part(struct lustre_msg *msg, enum lustre_sec_part sp)
+/****************************************
+ * bulk checksum helpers                *
+ ****************************************/
+
+static int plain_unpack_bsd(struct lustre_msg *msg)
  {
-        msg->lm_secflvr |= (((__u32) sp) & 0xFF) << 16;
+        struct ptlrpc_bulk_sec_desc *bsd;
+
+        if (bulk_sec_desc_unpack(msg, PLAIN_PACK_BULK_OFF))
+                return -EPROTO;
+
+        bsd = lustre_msg_buf(msg, PLAIN_PACK_BULK_OFF, PLAIN_BSD_SIZE);
+        if (bsd == NULL) {
+                CERROR("bulk sec desc has short size %d\n",
+                       lustre_msg_buflen(msg, PLAIN_PACK_BULK_OFF));
+                return -EPROTO;
+        }
+
+        if (bsd->bsd_svc != SPTLRPC_BULK_SVC_NULL &&
+            bsd->bsd_svc != SPTLRPC_BULK_SVC_INTG) {
+                CERROR("invalid bulk svc %u\n", bsd->bsd_svc);
+                return -EPROTO;
+        }
+
+        return 0;
  }
  
-static inline
-enum lustre_sec_part plain_decode_sec_part(struct lustre_msg *msg)
+static int plain_generate_bulk_csum(struct ptlrpc_bulk_desc *desc,
+                                    __u8 hash_alg,
+                                    struct plain_bulk_token *token)
  {
-        return (msg->lm_secflvr >> 16) & 0xFF;
+        if (hash_alg == BULK_HASH_ALG_NULL)
+                return 0;
+
+        memset(token->pbt_hash, 0, sizeof(token->pbt_hash));
+        return sptlrpc_get_bulk_checksum(desc, hash_alg, token->pbt_hash,
+                                         sizeof(token->pbt_hash));
  }
  
-/*
- * for simplicity, plain policy rpc use fixed layout.
- */
-#define PLAIN_PACK_SEGMENTS             (3)
+static int plain_verify_bulk_csum(struct ptlrpc_bulk_desc *desc,
+                                  __u8 hash_alg,
+                                  struct plain_bulk_token *tokenr)
+{
+        struct plain_bulk_token tokenv;
+        int                     rc;
+
+        if (hash_alg == BULK_HASH_ALG_NULL)
+                return 0;
  
-#define PLAIN_PACK_MSG_OFF              (0)
-#define PLAIN_PACK_USER_OFF             (1)
-#define PLAIN_PACK_BULK_OFF             (2)
+        memset(&tokenv.pbt_hash, 0, sizeof(tokenv.pbt_hash));
+        rc = sptlrpc_get_bulk_checksum(desc, hash_alg, tokenv.pbt_hash,
+                                       sizeof(tokenv.pbt_hash));
+        if (rc)
+                return rc;
+
+        if (memcmp(tokenr->pbt_hash, tokenv.pbt_hash, sizeof(tokenr->pbt_hash)))
+                return -EACCES;
+        return 0;
+}
+
+#ifdef __KERNEL__
+static void corrupt_bulk_data(struct ptlrpc_bulk_desc *desc)
+{
+        char           *ptr;
+        unsigned int    off, i;
+
+        for (i = 0; i < desc->bd_iov_count; i++) {
+                if (desc->bd_iov[i].kiov_len == 0)
+                        continue;
+
+                ptr = cfs_kmap(desc->bd_iov[i].kiov_page);
+                off = desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK;
+                ptr[off] ^= 0x1;
+                cfs_kunmap(desc->bd_iov[i].kiov_page);
+                return;
+        }
+}
+#else
+static void corrupt_bulk_data(struct ptlrpc_bulk_desc *desc)
+{
+        unsigned int    i;
+
+        for (i = 0; i < desc->bd_iov_count; i++) {
+                if (desc->bd_iov[i].iov_len == 0)
+                        continue;
+
+                ((char *)desc->bd_iov[i].iov_base)[i] ^= 0x1;
+                return;
+        }
+}
+#endif /* __KERNEL__ */
  
  /****************************************
   * cli_ctx apis                         *
@@ -131,16 +211,22 @@ int plain_ctx_validate(struct ptlrpc_cli_ctx *ctx)
  static
  int plain_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req)
  {
-        struct lustre_msg_v2 *msg = req->rq_reqbuf;
+        struct lustre_msg   *msg = req->rq_reqbuf;
+        struct plain_header *phdr;
          ENTRY;
  
          msg->lm_secflvr = req->rq_flvr.sf_rpc;
-        if (req->rq_pack_bulk)
-                msg->lm_secflvr |= PLAIN_WFLVR_FLAG_BULK;
-        if (req->rq_pack_udesc)
-                msg->lm_secflvr |= PLAIN_WFLVR_FLAG_USER;
  
-        plain_encode_sec_part(msg, ctx->cc_sec->ps_part);
+        phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, 0);
+        phdr->ph_ver = 0;
+        phdr->ph_flags = 0;
+        phdr->ph_sp = ctx->cc_sec->ps_part;
+        phdr->ph_bulk_hash_alg = req->rq_flvr.u_bulk.hash.hash_alg;
+
+        if (req->rq_pack_udesc)
+                phdr->ph_flags |= PLAIN_FL_USER;
+        if (req->rq_pack_bulk)
+                phdr->ph_flags |= PLAIN_FL_BULK;
  
          req->rq_reqdata_len = lustre_msg_size_v2(msg->lm_bufcount,
                                                   msg->lm_buflens);
@@ -150,8 +236,9 @@ int plain_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req)
  static
  int plain_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req)
  {
-        struct lustre_msg *msg = req->rq_repdata;
-        __u32              cksum;
+        struct lustre_msg   *msg = req->rq_repdata;
+        struct plain_header *phdr;
+        __u32                cksum;
          ENTRY;
  
          if (msg->lm_bufcount != PLAIN_PACK_SEGMENTS) {
@@ -159,12 +246,29 @@ int plain_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req)
                  RETURN(-EPROTO);
          }
  
+        phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, sizeof(*phdr));
+        if (phdr == NULL) {
+                CERROR("missing plain header\n");
+                RETURN(-EPROTO);
+        }
+
+        if (phdr->ph_ver != 0) {
+                CERROR("Invalid header version\n");
+                RETURN(-EPROTO);
+        }
+
          /* expect no user desc in reply */
-        if (PLAIN_WFLVR_HAS_USER(msg->lm_secflvr)) {
+        if (phdr->ph_flags & PLAIN_FL_USER) {
                  CERROR("Unexpected udesc flag in reply\n");
                  RETURN(-EPROTO);
          }
  
+        if (phdr->ph_bulk_hash_alg != req->rq_flvr.u_bulk.hash.hash_alg) {
+                CERROR("reply bulk flavor %u != %u\n", phdr->ph_bulk_hash_alg,
+                       req->rq_flvr.u_bulk.hash.hash_alg);
+                RETURN(-EPROTO);
+        }
+
          if (unlikely(req->rq_early)) {
                  cksum = crc32_le(!(__u32) 0,
                                   lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0),
@@ -179,16 +283,15 @@ int plain_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req)
                   * in reply, except for early reply */
                  if (!req->rq_early &&
                      !equi(req->rq_pack_bulk == 1,
-                          PLAIN_WFLVR_HAS_BULK(msg->lm_secflvr))) {
+                          phdr->ph_flags & PLAIN_FL_BULK)) {
                          CERROR("%s bulk checksum in reply\n",
                                 req->rq_pack_bulk ? "Missing" : "Unexpected");
                          RETURN(-EPROTO);
                  }
  
-                if (PLAIN_WFLVR_HAS_BULK(msg->lm_secflvr) &&
-                    bulk_sec_desc_unpack(msg, PLAIN_PACK_BULK_OFF)) {
-                        CERROR("Mal-formed bulk checksum reply\n");
-                        RETURN(-EINVAL);
+                if (phdr->ph_flags & PLAIN_FL_BULK) {
+                        if (plain_unpack_bsd(msg))
+                                RETURN(-EPROTO);
                  }
          }
  
@@ -202,13 +305,42 @@ int plain_cli_wrap_bulk(struct ptlrpc_cli_ctx *ctx,
                          struct ptlrpc_request *req,
                          struct ptlrpc_bulk_desc *desc)
  {
+        struct ptlrpc_bulk_sec_desc *bsd;
+        struct plain_bulk_token     *token;
+        int                          rc;
+
          LASSERT(req->rq_pack_bulk);
          LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS);
  
-        return bulk_csum_cli_request(desc, req->rq_bulk_read,
-                                     req->rq_flvr.sf_bulk_hash,
-                                     req->rq_reqbuf,
-                                     PLAIN_PACK_BULK_OFF);
+        bsd = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0);
+        token = (struct plain_bulk_token *) bsd->bsd_data;
+
+        bsd->bsd_version = 0;
+        bsd->bsd_flags = 0;
+        bsd->bsd_type = SPTLRPC_BULK_DEFAULT;
+        bsd->bsd_svc = SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc);
+
+        if (bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL)
+                RETURN(0);
+
+        if (req->rq_bulk_read)
+                RETURN(0);
+
+        rc = plain_generate_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg,
+                                      token);
+        if (rc) {
+                CERROR("bulk write: failed to compute checksum: %d\n", rc);
+        } else {
+                /*
+                 * for sending we only corrupt the computed checksum instead
+                 * of the data, so the data is still correct on a redo
+                 */
+                if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND) &&
+                    req->rq_flvr.u_bulk.hash.hash_alg != BULK_HASH_ALG_NULL)
+                        token->pbt_hash[0] ^= 0x1;
+        }
+
+        return rc;
  }
  
  static
@@ -216,13 +348,45 @@ int plain_cli_unwrap_bulk(struct ptlrpc_cli_ctx *ctx,
                            struct ptlrpc_request *req,
                            struct ptlrpc_bulk_desc *desc)
  {
+        struct ptlrpc_bulk_sec_desc *bsdr, *bsdv;
+        struct plain_bulk_token     *tokenr, *tokenv;
+        int                          rc;
+#ifdef __KERNEL__
+        int                          i, nob;
+#endif
+
          LASSERT(req->rq_pack_bulk);
          LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS);
          LASSERT(req->rq_repdata->lm_bufcount == PLAIN_PACK_SEGMENTS);
  
-        return bulk_csum_cli_reply(desc, req->rq_bulk_read,
-                                   req->rq_reqbuf, PLAIN_PACK_BULK_OFF,
-                                   req->rq_repdata, PLAIN_PACK_BULK_OFF);
+        bsdr = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0);
+        tokenr = (struct plain_bulk_token *) bsdr->bsd_data;
+        bsdv = lustre_msg_buf(req->rq_repdata, PLAIN_PACK_BULK_OFF, 0);
+        tokenv = (struct plain_bulk_token *) bsdv->bsd_data;
+
+        if (req->rq_bulk_write) {
+                if (bsdv->bsd_flags & BSD_FL_ERR)
+                        return -EIO;
+                return 0;
+        }
+
+#ifdef __KERNEL__
+        /* fix the actual data size */
+        for (i = 0, nob = 0; i < desc->bd_iov_count; i++) {
+                if (desc->bd_iov[i].kiov_len + nob > desc->bd_nob_transferred) {
+                        desc->bd_iov[i].kiov_len =
+                                desc->bd_nob_transferred - nob;
+                }
+                nob += desc->bd_iov[i].kiov_len;
+        }
+#endif
+
+        rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg,
+                                    tokenv);
+        if (rc)
+                CERROR("bulk read: client verify failed: %d\n", rc);
+
+        return rc;
  }
  
  /****************************************
@@ -303,13 +467,7 @@ struct ptlrpc_sec *plain_create_sec(struct obd_import *imp,
          struct ptlrpc_cli_ctx  *ctx;
          ENTRY;
  
-        LASSERT(RPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN);
-
-        if (sf->sf_bulk_ciph != BULK_CIPH_ALG_NULL) {
-                CERROR("plain policy don't support bulk cipher: %u\n",
-                       sf->sf_bulk_ciph);
-                RETURN(NULL);
-        }
+        LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN);
  
          OBD_ALLOC_PTR(plsec);
          if (plsec == NULL)
@@ -410,9 +568,10 @@ int plain_alloc_reqbuf(struct ptlrpc_sec *sec,
                         int msgsize)
  {
          __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, };
-        int alloc_len;
+        int   alloc_len;
          ENTRY;
  
+        buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header);
          buflens[PLAIN_PACK_MSG_OFF] = msgsize;
  
          if (req->rq_pack_udesc)
@@ -420,10 +579,7 @@ int plain_alloc_reqbuf(struct ptlrpc_sec *sec,
  
          if (req->rq_pack_bulk) {
                  LASSERT(req->rq_bulk_read || req->rq_bulk_write);
-
-                buflens[PLAIN_PACK_BULK_OFF] = bulk_sec_desc_size(
-                                                req->rq_flvr.sf_bulk_hash, 1,
-                                                req->rq_bulk_read);
+                buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE;
          }
  
          alloc_len = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens);
@@ -444,7 +600,7 @@ int plain_alloc_reqbuf(struct ptlrpc_sec *sec,
          }
  
          lustre_init_msg_v2(req->rq_reqbuf, PLAIN_PACK_SEGMENTS, buflens, NULL);
-        req->rq_reqmsg = lustre_msg_buf_v2(req->rq_reqbuf, 0, 0);
+        req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, 0);
  
          if (req->rq_pack_udesc)
                  sptlrpc_pack_user_desc(req->rq_reqbuf, PLAIN_PACK_USER_OFF);
@@ -476,13 +632,12 @@ int plain_alloc_repbuf(struct ptlrpc_sec *sec,
          int alloc_len;
          ENTRY;
  
+        buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header);
          buflens[PLAIN_PACK_MSG_OFF] = msgsize;
  
          if (req->rq_pack_bulk) {
                  LASSERT(req->rq_bulk_read || req->rq_bulk_write);
-                buflens[PLAIN_PACK_BULK_OFF] = bulk_sec_desc_size(
-                                                req->rq_flvr.sf_bulk_hash, 0,
-                                                req->rq_bulk_read);
+                buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE;
          }
  
          alloc_len = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens);
@@ -581,24 +736,46 @@ static struct ptlrpc_svc_ctx plain_svc_ctx = {
  static
  int plain_accept(struct ptlrpc_request *req)
  {
-        struct lustre_msg *msg = req->rq_reqbuf;
+        struct lustre_msg   *msg = req->rq_reqbuf;
+        struct plain_header *phdr;
          ENTRY;
  
-        LASSERT(RPC_FLVR_POLICY(req->rq_flvr.sf_rpc) == SPTLRPC_POLICY_PLAIN);
+        LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) ==
+                SPTLRPC_POLICY_PLAIN);
+
+        if (SPTLRPC_FLVR_BASE(req->rq_flvr.sf_rpc) !=
+            SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN) ||
+            SPTLRPC_FLVR_BULK_TYPE(req->rq_flvr.sf_rpc) !=
+            SPTLRPC_FLVR_BULK_TYPE(SPTLRPC_FLVR_PLAIN)) {
+                CERROR("Invalid rpc flavor %x\n", req->rq_flvr.sf_rpc);
+                RETURN(SECSVC_DROP);
+        }
  
          if (msg->lm_bufcount < PLAIN_PACK_SEGMENTS) {
                  CERROR("unexpected request buf count %u\n", msg->lm_bufcount);
                  RETURN(SECSVC_DROP);
          }
  
-        if (req->rq_flvr.sf_rpc != SPTLRPC_FLVR_PLAIN) {
-                CERROR("Invalid rpc flavor %x\n", req->rq_flvr.sf_rpc);
-                RETURN(SECSVC_DROP);
+        phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, sizeof(*phdr));
+        if (phdr == NULL) {
+                CERROR("missing plain header\n");
+                RETURN(-EPROTO);
          }
  
-        req->rq_sp_from = plain_decode_sec_part(msg);
+        if (phdr->ph_ver != 0) {
+                CERROR("Invalid header version\n");
+                RETURN(-EPROTO);
+        }
  
-        if (PLAIN_WFLVR_HAS_USER(msg->lm_secflvr)) {
+        if (phdr->ph_bulk_hash_alg >= BULK_HASH_ALG_MAX) {
+                CERROR("invalid hash algorithm: %u\n", phdr->ph_bulk_hash_alg);
+                RETURN(-EPROTO);
+        }
+
+        req->rq_sp_from = phdr->ph_sp;
+        req->rq_flvr.u_bulk.hash.hash_alg = phdr->ph_bulk_hash_alg;
+
+        if (phdr->ph_flags & PLAIN_FL_USER) {
                  if (sptlrpc_unpack_user_desc(msg, PLAIN_PACK_USER_OFF)) {
                          CERROR("Mal-formed user descriptor\n");
                          RETURN(SECSVC_DROP);
@@ -608,11 +785,9 @@ int plain_accept(struct ptlrpc_request *req)
                  req->rq_user_desc = lustre_msg_buf(msg, PLAIN_PACK_USER_OFF, 0);
          }
  
-        if (PLAIN_WFLVR_HAS_BULK(msg->lm_secflvr)) {
-                if (bulk_sec_desc_unpack(msg, PLAIN_PACK_BULK_OFF)) {
-                        CERROR("Mal-formed bulk checksum request\n");
+        if (phdr->ph_flags & PLAIN_FL_BULK) {
+                if (plain_unpack_bsd(msg))
                          RETURN(SECSVC_DROP);
-                }
  
                  req->rq_pack_bulk = 1;
          }
@@ -630,24 +805,18 @@ static
  int plain_alloc_rs(struct ptlrpc_request *req, int msgsize)
  {
          struct ptlrpc_reply_state   *rs;
-        struct ptlrpc_bulk_sec_desc *bsd;
          __u32                        buflens[PLAIN_PACK_SEGMENTS] = { 0, };
          int                          rs_size = sizeof(*rs);
          ENTRY;
  
          LASSERT(msgsize % 8 == 0);
  
+        buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header);
          buflens[PLAIN_PACK_MSG_OFF] = msgsize;
  
-        if (req->rq_pack_bulk && (req->rq_bulk_read || req->rq_bulk_write)) {
-                bsd = lustre_msg_buf(req->rq_reqbuf,
-                                     PLAIN_PACK_BULK_OFF, sizeof(*bsd));
-                LASSERT(bsd);
+        if (req->rq_pack_bulk && (req->rq_bulk_read || req->rq_bulk_write))
+                buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE;
  
-                buflens[PLAIN_PACK_BULK_OFF] = bulk_sec_desc_size(
-                                                        bsd->bsd_hash_alg, 0,
-                                                        req->rq_bulk_read);
-        }
          rs_size += lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens);
  
          rs = req->rq_reply_state;
@@ -693,6 +862,7 @@ int plain_authorize(struct ptlrpc_request *req)
  {
          struct ptlrpc_reply_state *rs = req->rq_reply_state;
          struct lustre_msg_v2      *msg = rs->rs_repbuf;
+        struct plain_header       *phdr;
          int                        len;
          ENTRY;
  
@@ -706,8 +876,14 @@ int plain_authorize(struct ptlrpc_request *req)
                  len = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
  
          msg->lm_secflvr = req->rq_flvr.sf_rpc;
+
+        phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, 0);
+        phdr->ph_ver = 0;
+        phdr->ph_flags = 0;
+        phdr->ph_bulk_hash_alg = req->rq_flvr.u_bulk.hash.hash_alg;
+
          if (req->rq_pack_bulk)
-                msg->lm_secflvr |= PLAIN_WFLVR_FLAG_BULK;
+                phdr->ph_flags |= PLAIN_FL_BULK;
  
          rs->rs_repdata_len = len;
  
@@ -730,44 +906,73 @@ static
  int plain_svc_unwrap_bulk(struct ptlrpc_request *req,
                            struct ptlrpc_bulk_desc *desc)
  {
-        struct ptlrpc_reply_state      *rs = req->rq_reply_state;
+        struct ptlrpc_reply_state   *rs = req->rq_reply_state;
+        struct ptlrpc_bulk_sec_desc *bsdr, *bsdv;
+        struct plain_bulk_token     *tokenr, *tokenv;
+        int                          rc;
  
-        LASSERT(rs);
+        LASSERT(req->rq_bulk_write);
          LASSERT(req->rq_pack_bulk);
-        LASSERT(req->rq_reqbuf->lm_bufcount >= PLAIN_PACK_SEGMENTS);
-        LASSERT(rs->rs_repbuf->lm_bufcount == PLAIN_PACK_SEGMENTS);
  
-        return bulk_csum_svc(desc, req->rq_bulk_read,
-                             lustre_msg_buf(req->rq_reqbuf,
-                                            PLAIN_PACK_BULK_OFF, 0),
-                             lustre_msg_buflen(req->rq_reqbuf,
-                                               PLAIN_PACK_BULK_OFF),
-                             lustre_msg_buf(rs->rs_repbuf,
-                                            PLAIN_PACK_BULK_OFF, 0),
-                             lustre_msg_buflen(rs->rs_repbuf,
-                                               PLAIN_PACK_BULK_OFF));
+        bsdr = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0);
+        tokenr = (struct plain_bulk_token *) bsdr->bsd_data;
+        bsdv = lustre_msg_buf(rs->rs_repbuf, PLAIN_PACK_BULK_OFF, 0);
+        tokenv = (struct plain_bulk_token *) bsdv->bsd_data;
+
+        bsdv->bsd_version = 0;
+        bsdv->bsd_type = SPTLRPC_BULK_DEFAULT;
+        bsdv->bsd_svc = bsdr->bsd_svc;
+        bsdv->bsd_flags = 0;
+
+        if (bsdr->bsd_svc == SPTLRPC_BULK_SVC_NULL)
+                return 0;
+
+        rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg,
+                                    tokenr);
+        if (rc) {
+                bsdv->bsd_flags |= BSD_FL_ERR;
+                CERROR("bulk write: server verify failed: %d\n", rc);
+        }
+
+        return rc;
  }
  
  static
  int plain_svc_wrap_bulk(struct ptlrpc_request *req,
                          struct ptlrpc_bulk_desc *desc)
  {
-        struct ptlrpc_reply_state      *rs = req->rq_reply_state;
+        struct ptlrpc_reply_state   *rs = req->rq_reply_state;
+        struct ptlrpc_bulk_sec_desc *bsdr, *bsdv;
+        struct plain_bulk_token     *tokenr, *tokenv;
+        int                          rc;
  
-        LASSERT(rs);
+        LASSERT(req->rq_bulk_read);
          LASSERT(req->rq_pack_bulk);
-        LASSERT(req->rq_reqbuf->lm_bufcount >= PLAIN_PACK_SEGMENTS);
-        LASSERT(rs->rs_repbuf->lm_bufcount == PLAIN_PACK_SEGMENTS);
  
-        return bulk_csum_svc(desc, req->rq_bulk_read,
-                             lustre_msg_buf(req->rq_reqbuf,
-                                            PLAIN_PACK_BULK_OFF, 0),
-                             lustre_msg_buflen(req->rq_reqbuf,
-                                               PLAIN_PACK_BULK_OFF),
-                             lustre_msg_buf(rs->rs_repbuf,
-                                            PLAIN_PACK_BULK_OFF, 0),
-                             lustre_msg_buflen(rs->rs_repbuf,
-                                               PLAIN_PACK_BULK_OFF));
+        bsdr = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0);
+        tokenr = (struct plain_bulk_token *) bsdr->bsd_data;
+        bsdv = lustre_msg_buf(rs->rs_repbuf, PLAIN_PACK_BULK_OFF, 0);
+        tokenv = (struct plain_bulk_token *) bsdv->bsd_data;
+
+        bsdv->bsd_version = 0;
+        bsdv->bsd_type = SPTLRPC_BULK_DEFAULT;
+        bsdv->bsd_svc = bsdr->bsd_svc;
+        bsdv->bsd_flags = 0;
+
+        if (bsdr->bsd_svc == SPTLRPC_BULK_SVC_NULL)
+                return 0;
+
+        rc = plain_generate_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg,
+                                      tokenv);
+        if (rc) {
+                CERROR("bulk read: server failed to compute "
+                       "checksum: %d\n", rc);
+        } else {
+                if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE))
+                        corrupt_bulk_data(desc);
+        }
+
+        return rc;
  }
  
  static struct ptlrpc_ctx_ops plain_ctx_ops = {
@@ -787,8 +992,8 @@ static struct ptlrpc_sec_cops plain_sec_cops = {
          .release_ctx            = plain_release_ctx,
          .flush_ctx_cache        = plain_flush_ctx_cache,
          .alloc_reqbuf           = plain_alloc_reqbuf,
-        .alloc_repbuf           = plain_alloc_repbuf,
          .free_reqbuf            = plain_free_reqbuf,
+        .alloc_repbuf           = plain_alloc_repbuf,
          .free_repbuf            = plain_free_repbuf,
          .enlarge_reqbuf         = plain_enlarge_reqbuf,
  };
diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c

index cb101cf..a8d0785 100644 (file)
--- a/lustre/ptlrpc/service.c
+++ b/lustre/ptlrpc/service.c
@@ -1311,6 +1311,17 @@ ptlrpc_server_handle_req_in(struct ptlrpc_service *svc)
                  goto err_req;
          }
  
+        switch(lustre_msg_get_opc(req->rq_reqmsg)) {
+        case MDS_WRITEPAGE:
+        case OST_WRITE:
+                req->rq_bulk_write = 1;
+                break;
+        case MDS_READPAGE:
+        case OST_READ:
+                req->rq_bulk_read = 1;
+                break;
+        }
+
          CDEBUG(D_NET, "got req "LPD64"\n", req->rq_xid);
  
          req->rq_export = class_conn2export(
diff --git a/lustre/ptlrpc/wiretest.c b/lustre/ptlrpc/wiretest.c

index d1f4475..c93be5d 100644 (file)
--- a/lustre/ptlrpc/wiretest.c
+++ b/lustre/ptlrpc/wiretest.c
@@ -65,8 +65,8 @@ void lustre_assert_wire_constants(void)
  {
          /* Wire protocol assertions generated by 'wirecheck'
           * (make -C lustre/utils newwiretest)
-         * running on Linux lin2 2.6.18-92.1.17-prep #3 Sun Nov 23 14:29:36 IST 2008 i686 i686 i386 G
-         * with gcc version 3.4.6 20060404 (Red Hat 3.4.6-10) */
+         * running on Linux localhost.localdomain 2.6.18-prep #3 SMP Sun Nov 23 08:04:44 EST 2008 i68
+         * with gcc version 4.1.1 20061011 (Red Hat 4.1.1-30) */
  
  
          /* Constants... */
@@ -254,9 +254,9 @@ void lustre_assert_wire_constants(void)
                   (long long)OBD_QC_CALLBACK);
          LASSERTF(OBD_LAST_OPC == 403, " found %lld\n",
                   (long long)OBD_LAST_OPC);
-        LASSERTF(QUOTA_DQACQ == 901, " found %lld\n",
+        LASSERTF(QUOTA_DQACQ == 601, " found %lld\n",
                   (long long)QUOTA_DQACQ);
-        LASSERTF(QUOTA_DQREL == 902, " found %lld\n",
+        LASSERTF(QUOTA_DQREL == 602, " found %lld\n",
                   (long long)QUOTA_DQREL);
          LASSERTF(MGS_CONNECT == 250, " found %lld\n",
                   (long long)MGS_CONNECT);
@@ -447,31 +447,31 @@ void lustre_assert_wire_constants(void)
                   (long long)(int)offsetof(struct obd_connect_data, padding2));
          LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding2) == 8, " found %lld\n",
                   (long long)(int)sizeof(((struct obd_connect_data *)0)->padding2));
-        CLASSERT(OBD_CONNECT_RDONLY == 0x00000001ULL);
-        CLASSERT(OBD_CONNECT_INDEX == 0x00000002ULL);
-        CLASSERT(OBD_CONNECT_GRANT == 0x00000008ULL);
-        CLASSERT(OBD_CONNECT_SRVLOCK == 0x00000010ULL);
-        CLASSERT(OBD_CONNECT_VERSION == 0x00000020ULL);
-        CLASSERT(OBD_CONNECT_REQPORTAL == 0x00000040ULL);
-        CLASSERT(OBD_CONNECT_ACL == 0x00000080ULL);
-        CLASSERT(OBD_CONNECT_XATTR == 0x00000100ULL);
+        CLASSERT(OBD_CONNECT_RDONLY == 0x1ULL);
+        CLASSERT(OBD_CONNECT_INDEX == 0x2ULL);
+        CLASSERT(OBD_CONNECT_GRANT == 0x8ULL);
+        CLASSERT(OBD_CONNECT_SRVLOCK == 0x10ULL);
+        CLASSERT(OBD_CONNECT_VERSION == 0x20ULL);
+        CLASSERT(OBD_CONNECT_REQPORTAL == 0x40ULL);
+        CLASSERT(OBD_CONNECT_ACL == 0x80ULL);
+        CLASSERT(OBD_CONNECT_XATTR == 0x100ULL);
          CLASSERT(OBD_CONNECT_REAL == 0x08000000ULL);
          CLASSERT(OBD_CONNECT_CKSUM == 0x20000000ULL);
-        CLASSERT(OBD_CONNECT_TRUNCLOCK == 0x00000400ULL);
-        CLASSERT(OBD_CONNECT_IBITS == 0x00001000ULL);
-        CLASSERT(OBD_CONNECT_JOIN == 0x00002000ULL);
-        CLASSERT(OBD_CONNECT_ATTRFID == 0x00004000ULL);
-        CLASSERT(OBD_CONNECT_NODEVOH == 0x00008000ULL);
+        CLASSERT(OBD_CONNECT_TRUNCLOCK == 0x400ULL);
+        CLASSERT(OBD_CONNECT_IBITS == 0x1000ULL);
+        CLASSERT(OBD_CONNECT_JOIN == 0x2000ULL);
+        CLASSERT(OBD_CONNECT_ATTRFID == 0x4000ULL);
+        CLASSERT(OBD_CONNECT_NODEVOH == 0x8000ULL);
          CLASSERT(OBD_CONNECT_RMT_CLIENT == 0x00010000ULL);
          CLASSERT(OBD_CONNECT_RMT_CLIENT_FORCE == 0x00020000ULL);
-        CLASSERT(OBD_CONNECT_BRW_SIZE == 0x00040000ULL);
-        CLASSERT(OBD_CONNECT_QUOTA64 == 0x00080000ULL);
-        CLASSERT(OBD_CONNECT_MDS_CAPA == 0x00100000ULL);
-        CLASSERT(OBD_CONNECT_OSS_CAPA == 0x00200000ULL);
+        CLASSERT(OBD_CONNECT_BRW_SIZE == 0x40000ULL);
+        CLASSERT(OBD_CONNECT_QUOTA64 == 0x80000ULL);
+        CLASSERT(OBD_CONNECT_MDS_CAPA == 0x100000ULL);
+        CLASSERT(OBD_CONNECT_OSS_CAPA == 0x200000ULL);
          CLASSERT(OBD_CONNECT_MDS_MDS == 0x04000000ULL);
          CLASSERT(OBD_CONNECT_SOM == 0x00800000ULL);
          CLASSERT(OBD_CONNECT_AT == 0x01000000ULL);
-        CLASSERT(OBD_CONNECT_CANCELSET == 0x00400000ULL);
+        CLASSERT(OBD_CONNECT_CANCELSET == 0x400000ULL);
          CLASSERT(OBD_CONNECT_LRU_RESIZE == 0x02000000ULL);
  
          /* Checks for struct obdo */
@@ -2389,7 +2389,7 @@ void lustre_assert_wire_constants(void)
          CLASSERT(FIEMAP_FLAG_DEVICE_ORDER == 0x40000000);
  
          /* Checks for struct ll_fiemap_extent */
-        LASSERTF((int)sizeof(struct ll_fiemap_extent) == 32, " found %lld\n",
+        LASSERTF((int)sizeof(struct ll_fiemap_extent) == 56, " found %lld\n",
                   (long long)(int)sizeof(struct ll_fiemap_extent));
          LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_logical) == 0, " found %lld\n",
                   (long long)(int)offsetof(struct ll_fiemap_extent, fe_logical));
@@ -2403,27 +2403,26 @@ void lustre_assert_wire_constants(void)
                   (long long)(int)offsetof(struct ll_fiemap_extent, fe_length));
          LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_length) == 8, " found %lld\n",
                   (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_length));
-        LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_flags) == 24, " found %lld\n",
+        LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_flags) == 40, " found %lld\n",
                   (long long)(int)offsetof(struct ll_fiemap_extent, fe_flags));
          LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_flags) == 4, " found %lld\n",
                   (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_flags));
-        LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_device) == 28, " found %lld\n",
+        LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_device) == 44, " found %lld\n",
                   (long long)(int)offsetof(struct ll_fiemap_extent, fe_device));
          LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_device) == 4, " found %lld\n",
                   (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_device));
          CLASSERT(FIEMAP_EXTENT_LAST == 0x00000001);
          CLASSERT(FIEMAP_EXTENT_UNKNOWN == 0x00000002);
          CLASSERT(FIEMAP_EXTENT_DELALLOC == 0x00000004);
-        CLASSERT(FIEMAP_EXTENT_NO_DIRECT == 0x00000008);
-        CLASSERT(FIEMAP_EXTENT_SECONDARY == 0x00000010);
-        CLASSERT(FIEMAP_EXTENT_NET == 0x00000020);
-        CLASSERT(FIEMAP_EXTENT_DATA_COMPRESSED == 0x00000040);
+        CLASSERT(FIEMAP_EXTENT_ENCODED == 0x00000008);
          CLASSERT(FIEMAP_EXTENT_DATA_ENCRYPTED == 0x00000080);
          CLASSERT(FIEMAP_EXTENT_NOT_ALIGNED == 0x00000100);
          CLASSERT(FIEMAP_EXTENT_DATA_INLINE == 0x00000200);
          CLASSERT(FIEMAP_EXTENT_DATA_TAIL == 0x00000400);
          CLASSERT(FIEMAP_EXTENT_UNWRITTEN == 0x00000800);
          CLASSERT(FIEMAP_EXTENT_MERGED == 0x00001000);
+        CLASSERT(FIEMAP_EXTENT_NO_DIRECT == 0x40000000);
+        CLASSERT(FIEMAP_EXTENT_NET == 0x80000000);
  #ifdef LIBLUSTRE_POSIX_ACL
  
          /* Checks for type posix_acl_xattr_entry */
diff --git a/lustre/quota/Makefile.in b/lustre/quota/Makefile.in

index f052b42..50efef3 100644 (file)
--- a/lustre/quota/Makefile.in
+++ b/lustre/quota/Makefile.in
@@ -3,5 +3,7 @@ MODULES := lquota
  lquota-objs := quota_check.o quota_context.o quota_ctl.o quota_interface.o
  lquota-objs += quota_master.o quota_adjust_qunit.o lproc_quota.o
  
+EXTRA_DIST := $(lquota-objs:%.o=%.c) $(quotactl-objs:%.o=%.c) $(quotacheck-objs:%.o=%.c) quota_internal.h
+
  @INCLUDE_RULES@
  
diff --git a/lustre/quota/autoMakefile.am b/lustre/quota/autoMakefile.am

index 9a20d28..0c9bd1f 100644 (file)
--- a/lustre/quota/autoMakefile.am
+++ b/lustre/quota/autoMakefile.am
@@ -46,4 +46,3 @@ modulefs_DATA = lquota$(KMODEXT)
  endif
  
  MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
-DIST_SOURCES := $(lquota-objs:%.o=%.c) quota_internal.h
diff --git a/lustre/quota/quota_adjust_qunit.c b/lustre/quota/quota_adjust_qunit.c

index df2115d..abe57dd 100644 (file)
--- a/lustre/quota/quota_adjust_qunit.c
+++ b/lustre/quota/quota_adjust_qunit.c
@@ -331,7 +331,8 @@ int filter_quota_adjust_qunit(struct obd_export *exp,
  
          if (rc > 0) {
                  rc = qctxt_adjust_qunit(obd, qctxt, uid, gid, 1, 0, NULL);
-                if (rc == -EDQUOT || rc == -EBUSY || rc == -EAGAIN) {
+                if (rc == -EDQUOT || rc == -EBUSY ||
+                    rc == QUOTA_REQ_RETURNED || rc == -EAGAIN) {
                          CDEBUG(D_QUOTA, "rc: %d.\n", rc);
                          rc = 0;
                  }
diff --git a/lustre/quota/quota_check.c b/lustre/quota/quota_check.c

index 62fc1f0..c2238e2 100644 (file)
--- a/lustre/quota/quota_check.c
+++ b/lustre/quota/quota_check.c
@@ -114,6 +114,7 @@ static int target_quotacheck_thread(void *data)
          pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
  
          rc = target_quotacheck_callback(exp, oqctl);
+        class_export_put(exp);
  
          atomic_inc(qta->qta_sem);
  
@@ -155,6 +156,9 @@ int target_quota_check(struct obd_device *obd, struct obd_export *exp,
                  }
          }
  
+        /* we get ref for exp because target_quotacheck_callback() will use this
+         * export later b=18126 */
+        class_export_get(exp);
          rc = kernel_thread(target_quotacheck_thread, qta, CLONE_VM|CLONE_FILES);
          if (rc >= 0) {
                  CDEBUG(D_INFO, "%s: target_quotacheck_thread: %d\n",
@@ -162,6 +166,7 @@ int target_quota_check(struct obd_device *obd, struct obd_export *exp,
                  RETURN(0);
          }
  
+        class_export_put(exp);
          CERROR("%s: error starting quotacheck_thread: %d\n",
                 obd->obd_name, rc);
          OBD_FREE_PTR(qta);
@@ -274,12 +279,14 @@ int lov_quota_check(struct obd_device *unused, struct obd_export *exp,
          ENTRY;
  
          for (i = 0; i < lov->desc.ld_tgt_count; i++) {
-                int err;
-
                  if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_active) {
                          CERROR("lov idx %d inactive\n", i);
                          RETURN(-EIO);
                  }
+        }
+
+        for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+                int err;
  
                  err = obd_quotacheck(lov->lov_tgts[i]->ltd_exp, oqctl);
                  if (err && !rc)
diff --git a/lustre/quota/quota_context.c b/lustre/quota/quota_context.c

index bb0bbd2..162f0af 100644 (file)
--- a/lustre/quota/quota_context.c
+++ b/lustre/quota/quota_context.c
@@ -636,8 +636,10 @@ out:
  
          compute_lqs_after_removing_qunit(qunit);
  
-        /* wake up all waiters */
+        if (rc == 0)
+                rc = QUOTA_REQ_RETURNED;
          QUNIT_SET_STATE_AND_RC(qunit, QUNIT_FINISHED, rc);
+        /* wake up all waiters */
          wake_up_all(&qunit->lq_waitq);
  
          /* this is for dqacq_in_flight() */
@@ -664,7 +666,7 @@ out:
                   CERROR("adjust slave's qunit size failed!(rc:%d)\n", rc1);
                   RETURN(rc1);
           }
-         if (err || (rc && rc != -EBUSY && rc1 == 0) || is_master(qctxt))
+         if (err || (rc < 0 && rc != -EBUSY && rc1 == 0) || is_master(qctxt))
                  RETURN(err);
  
          /* reschedule another dqacq/dqrel if needed */
@@ -774,25 +776,56 @@ int check_qm(struct lustre_quota_ctxt *qctxt)
          RETURN(rc);
  }
  
+/* wake every thread waiting on a hashed qunit of @qctxt (lqc_import gone) */
+void dqacq_interrupt(struct lustre_quota_ctxt *qctxt)
+{
+        struct lustre_qunit *cur, *next;
+        int bucket;
+        ENTRY;
+
+        spin_lock(&qunit_hash_lock);
+        for (bucket = 0; bucket < NR_DQHASH; bucket++) {
+                list_for_each_entry_safe(cur, next, &qunit_hash[bucket],
+                                         lq_hash) {
+                        /* Only waiters of this context are woken, and
+                         * neither lq_state nor lq_rc is modified here;
+                         * got_qunit() releases them because the import
+                         * is NULL or the context is no longer valid.
+                         * NOTE(review): original note says they then
+                         * return -EAGAIN to the caller; confirm. */
+                        if (cur->lq_ctxt == qctxt)
+                                wake_up_all(&cur->lq_waitq);
+                }
+        }
+        spin_unlock(&qunit_hash_lock);
+        EXIT;
+}
+
  static int got_qunit(struct lustre_qunit *qunit)
  {
-        int rc;
+        struct lustre_quota_ctxt *qctxt = qunit->lq_ctxt;
+        int rc = 0;
          ENTRY;
  
          spin_lock(&qunit->lq_lock);
          switch (qunit->lq_state) {
          case QUNIT_IN_HASH:
          case QUNIT_RM_FROM_HASH:
-                rc = 0;
                  break;
          case QUNIT_FINISHED:
                  rc = 1;
                  break;
          default:
-                rc = 0;
                  CERROR("invalid qunit state %d\n", qunit->lq_state);
          }
          spin_unlock(&qunit->lq_lock);
+        /* not finished: also stop waiting once lqc_import/lqc_valid is unset */
+        if (!rc) {
+                spin_lock(&qctxt->lqc_lock);
+                rc = !qctxt->lqc_import || !qctxt->lqc_valid;
+                spin_unlock(&qctxt->lqc_lock);
+        }
+
          RETURN(rc);
  }
  
@@ -952,16 +985,14 @@ wait_completion:
  
                  QDATA_DEBUG(p, "qunit(%p) is waiting for dqacq.\n", qunit);
                  l_wait_event(qunit->lq_waitq, got_qunit(qunit), &lwi);
-                /* rc = -EAGAIN, it means a quota req is finished;
+                /* rc = -EAGAIN, it means the quota master isn't ready yet
+                 * rc = QUOTA_REQ_RETURNED, it means a quota req is finished;
                   * rc = -EDQUOT, it means out of quota
                   * rc = -EBUSY, it means recovery is happening
                   * other rc < 0, it means real errors, functions who call
                   * schedule_dqacq should take care of this */
                  spin_lock(&qunit->lq_lock);
-                if (qunit->lq_rc == 0)
-                        rc = -EAGAIN;
-                else
-                        rc = qunit->lq_rc;
+                rc = qunit->lq_rc;
                  spin_unlock(&qunit->lq_lock);
                  CDEBUG(D_QUOTA, "qunit(%p) finishes waiting. (rc:%d)\n",
                         qunit, rc);
@@ -1057,10 +1088,7 @@ qctxt_wait_pending_dqacq(struct lustre_quota_ctxt *qctxt, unsigned int id,
                         qunit, qunit->lq_rc);
                  /* keep same as schedule_dqacq() b=17030 */
                  spin_lock(&qunit->lq_lock);
-                if (qunit->lq_rc == 0)
-                        rc = -EAGAIN;
-                else
-                        rc = qunit->lq_rc;
+                rc = qunit->lq_rc;
                  spin_unlock(&qunit->lq_lock);
                  /* this is for dqacq_in_flight() */
                  qunit_put(qunit);
diff --git a/lustre/quota/quota_interface.c b/lustre/quota/quota_interface.c

index 71bed54..2ef1b67 100644 (file)
--- a/lustre/quota/quota_interface.c
+++ b/lustre/quota/quota_interface.c
@@ -137,6 +137,7 @@ static int filter_quota_clearinfo(struct obd_export *exp, struct obd_device *obd
                  spin_lock(&qctxt->lqc_lock);
                  qctxt->lqc_import = NULL;
                  spin_unlock(&qctxt->lqc_lock);
+                dqacq_interrupt(qctxt);
                  CDEBUG(D_QUOTA, "%s: lqc_import of obd(%p) is invalid now.\n",
                         obd->obd_name, obd);
          }
@@ -380,7 +381,7 @@ static int quota_chk_acq_common(struct obd_device *obd, unsigned int uid,
  
                  /* please reference to dqacq_completion for the below */
                  /* a new request is finished, try again */
-                if (rc == -EAGAIN) {
+                if (rc == QUOTA_REQ_RETURNED) {
                          CDEBUG(D_QUOTA, "finish a quota req, try again\n");
                          continue;
                  }
diff --git a/lustre/quota/quota_internal.h b/lustre/quota/quota_internal.h

index e9073be..8856af3 100644 (file)
--- a/lustre/quota/quota_internal.h
+++ b/lustre/quota/quota_internal.h
@@ -113,6 +113,7 @@ int compute_remquota(struct obd_device *obd,
                       struct lustre_quota_ctxt *qctxt, struct qunit_data *qdata,
                       int isblk);
  int check_qm(struct lustre_quota_ctxt *qctxt);
+void dqacq_interrupt(struct lustre_quota_ctxt *qctxt);
  /* quota_master.c */
  int lustre_dquot_init(void);
  void lustre_dquot_exit(void);
@@ -186,6 +187,8 @@ extern cfs_proc_dir_entry_t *lquota_type_proc_dir;
  #define LQS_INO_DECREASE 4
  #define LQS_INO_INCREASE 8
  
+/* the return status of quota operation */
+#define QUOTA_REQ_RETURNED 1
  
  #endif
  int client_quota_adjust_qunit(struct obd_export *exp,
diff --git a/lustre/quota/quota_master.c b/lustre/quota/quota_master.c

index 9629357..62b7127 100644 (file)
--- a/lustre/quota/quota_master.c
+++ b/lustre/quota/quota_master.c
@@ -552,8 +552,9 @@ int mds_quota_adjust(struct obd_device *obd, unsigned int qcids[],
          }
  
          if (rc2)
-                CDEBUG(rc2 == -EAGAIN ? D_QUOTA: D_ERROR,
-                       "mds adjust qunit failed! (opc:%d rc:%d)\n", opc, rc2);
+                CDEBUG(rc2 == QUOTA_REQ_RETURNED ? D_QUOTA: D_ERROR,
+                       "mds adjust qunit %ssuccessfully! (opc:%d rc:%d)\n",
+                       rc2 == QUOTA_REQ_RETURNED ? "" : "un", opc, rc2);
          RETURN(0);
  }
  
@@ -590,9 +591,9 @@ int filter_quota_adjust(struct obd_device *obd, unsigned int qcids[],
          if (rc || rc2) {
                  if (!rc)
                          rc = rc2;
-                CDEBUG(rc == -EAGAIN ? D_QUOTA: D_ERROR,
-                       "filter adjust qunit failed! (opc:%d rc%d)\n",
-                       opc, rc);
+                CDEBUG(rc == QUOTA_REQ_RETURNED ? D_QUOTA: D_ERROR,
+                       "filter adjust qunit %ssuccessfully! (opc:%d rc%d)\n",
+                       rc == QUOTA_REQ_RETURNED ? "" : "un", opc, rc);
          }
  
          RETURN(0);
diff --git a/lustre/tests/acceptance-small.sh b/lustre/tests/acceptance-small.sh

index fa4a14b..912e184 100755 (executable)
--- a/lustre/tests/acceptance-small.sh
+++ b/lustre/tests/acceptance-small.sh
@@ -61,7 +61,7 @@ setup_if_needed() {
      local MOUNTED=$(mounted_lustre_filesystems)
      if $(echo $MOUNTED | grep -w -q $MOUNT); then
          check_config $MOUNT
-        init_versions_vars
+        init_param_vars
          return
      fi
  
diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh

index 443348e..60f880a 100644 (file)
--- a/lustre/tests/conf-sanity.sh
+++ b/lustre/tests/conf-sanity.sh
@@ -501,7 +501,7 @@ test_18() {
          check_mount || return 41
  
          echo "check journal size..."
-        local FOUNDSIZE=`do_facet mds "$$DEBUGFS -c -R 'stat <8>' $MDSDEV" | awk '/Size: / { print $NF; exit;}'`
+        local FOUNDSIZE=`do_facet mds "$DEBUGFS -c -R 'stat <8>' $MDSDEV" | awk '/Size: / { print $NF; exit;}'`
          if [ $FOUNDSIZE -gt $((32 * 1024 * 1024)) ]; then
                  log "Success: mkfs creates large journals. Size: $((FOUNDSIZE >> 20))M"
          else
@@ -771,29 +771,6 @@ test_26() {
  }
  run_test 26 "MDT startup failure cleans LOV (should return errs)"
  
-wait_update () {
-       local node=$1
-       local TEST=$2
-       local FINAL=$3
-
-       local RESULT
-       local MAX=90
-       local WAIT=0
-       local sleep=5
-       while [ $WAIT -lt $MAX ]; do
-           RESULT=$(do_node $node "$TEST") 
-           if [ $RESULT -eq $FINAL ]; then
-               echo "Updated config after $WAIT sec: wanted $FINAL got $RESULT"
-               return 0
-           fi
-           WAIT=$((WAIT + sleep))
-           echo "Waiting $((MAX - WAIT)) secs for config update" 
-           sleep $sleep
-       done
-       echo "Config update not seen after $MAX sec: wanted $FINAL got $RESULT"
-       return 3
-}
-
  set_and_check() {
         local myfacet=$1
         local TEST=$2
diff --git a/lustre/tests/mdsrate-create-large.sh b/lustre/tests/mdsrate-create-large.sh

index 00ad399..b45dae1 100644 (file)
--- a/lustre/tests/mdsrate-create-large.sh
+++ b/lustre/tests/mdsrate-create-large.sh
@@ -13,9 +13,9 @@ MACHINEFILE=${MACHINEFILE:-$TMP/$(basename $0 .sh).machines}
  TESTDIR=$MOUNT
  
  # Requirements
+# set NUM_FILES=0 to force TIME_PERIOD to take effect
+NUM_FILES=${NUM_FILES:-1000000}
  TIME_PERIOD=${TIME_PERIOD:-600}                        # seconds
-SINGLE_TARGET_RATE=$((1300 / OSTCOUNT))     # ops/sec
-AGGREGATE_TARGET_RATE=$((7000 / OSTCOUNT))  # ops/sec
  
  # Local test variables
  TESTDIR_SINGLE="${TESTDIR}/single"
@@ -32,6 +32,11 @@ log "===== $0 ====== "
  
  check_and_setup_lustre
  
+IFree=$(inodes_available)
+if [ $IFree -lt $NUM_FILES ]; then
+    NUM_FILES=$IFree
+fi
+
  generate_machine_file $NODES_TO_USE $MACHINEFILE || error "can not generate machinefile"
  
  $LFS setstripe $TESTDIR -c -1
@@ -47,7 +52,7 @@ else
      echo "Running creates on 1 node(s)."
  
      COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --create --time ${TIME_PERIOD}
-                        --dir ${TESTDIR_SINGLE} --filefmt 'f%%d'"
+                --nfiles ${NUM_FILES} --dir ${TESTDIR_SINGLE} --filefmt 'f%%d'"
      echo "+ ${COMMAND}"
      mpi_run -np 1 -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
  
@@ -56,14 +61,11 @@ else
         error "mpirun ... mdsrate ... failed, aborting"
      fi
      
-    check_rate create ${SINGLE_TARGET_RATE} 1 ${LOG} || true
-
      log "===== $0 ### 1 NODE UNLINK ###"
      echo "Running unlinks on 1 node(s)."
  
-    let NUM_FILES=${SINGLE_TARGET_RATE}\*${TIME_PERIOD}
      COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --unlink --time ${TIME_PERIOD}
-                 --nfiles ${NUM_FILES} --dir ${TESTDIR_SINGLE} --filefmt 'f%%d'"
+                --nfiles ${NUM_FILES} --dir ${TESTDIR_SINGLE} --filefmt 'f%%d'"
      echo "+ ${COMMAND}"
      mpi_run -np 1 -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
   
@@ -71,8 +73,11 @@ else
         [ -f $LOG ] && cat $LOG
         error "mpirun ... mdsrate ... failed, aborting"
      fi
+fi
  
-    check_rate unlink ${SINGLE_TARGET_RATE} 1 ${LOG} || true
+IFree=$(inodes_available)
+if [ $IFree -lt $NUM_FILES ]; then
+    NUM_FILES=$IFree
  fi
  
  if [ -n "$NOMULTI" ]; then
@@ -83,7 +88,7 @@ else
      echo "Running creates on ${NUM_CLIENTS} node(s)."
  
      COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --create --time ${TIME_PERIOD}
-                        --dir ${TESTDIR_MULTI} --filefmt 'f%%d'"
+                --nfiles $NUM_FILES --dir ${TESTDIR_MULTI} --filefmt 'f%%d'"
      echo "+ ${COMMAND}"
      mpi_run -np ${NUM_CLIENTS} -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
  
@@ -92,13 +97,10 @@ else
         error "mpirun ... mdsrate ... failed, aborting"
      fi
  
-    check_rate create ${AGGREGATE_TARGET_RATE} ${NUM_CLIENTS} ${LOG} || true
-
      echo "Running unlinks on ${NUM_CLIENTS} node(s)."
  
-    let NUM_FILES=${AGGREGATE_TARGET_RATE}\*${TIME_PERIOD}
      COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --unlink --time ${TIME_PERIOD}
-                  --nfiles ${NUM_FILES} --dir ${TESTDIR_MULTI} --filefmt 'f%%d'"
+                --nfiles ${NUM_FILES} --dir ${TESTDIR_MULTI} --filefmt 'f%%d'"
      echo "+ ${COMMAND}"
      mpi_run -np ${NUM_CLIENTS} -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
  
@@ -107,12 +109,10 @@ else
         error "mpirun ... mdsrate ... failed, aborting"
      fi
  
-    check_rate unlink ${AGGREGATE_TARGET_RATE} ${NUM_CLIENTS} ${LOG} || true
  fi
  
  equals_msg `basename $0`: test complete, cleaning up
  rm -f $MACHINEFILE
-zconf_umount_clients $NODES_TO_USE $MOUNT
  check_and_cleanup_lustre
  #rm -f $LOG
  
diff --git a/lustre/tests/mdsrate-create-small.sh b/lustre/tests/mdsrate-create-small.sh

index 0f42e5d..5455796 100644 (file)
--- a/lustre/tests/mdsrate-create-small.sh
+++ b/lustre/tests/mdsrate-create-small.sh
@@ -13,10 +13,8 @@ MACHINEFILE=${MACHINEFILE:-$TMP/$(basename $0 .sh).machines}
  TESTDIR=$MOUNT
  
  # Requirements
-# The default number of stripes per file is set to 1 in test3/run_test.sh.
+NUM_FILES=${NUM_FILES:-1000000}
  TIME_PERIOD=${TIME_PERIOD:-600}                        # seconds
-SINGLE_TARGET_RATE=1400                # ops/sec
-AGGREGATE_TARGET_RATE=10000            # ops/sec
  
  # Local test variables
  TESTDIR_SINGLE="${TESTDIR}/single"
@@ -42,6 +40,11 @@ log "===== $0 ====== "
  
  check_and_setup_lustre
  
+IFree=$(inodes_available)
+if [ $IFree -lt $NUM_FILES ]; then
+    NUM_FILES=$IFree
+fi
+  
  generate_machine_file $NODES_TO_USE $MACHINEFILE || error "can not generate machinefile"
  
  $LFS setstripe $TESTDIR -i 0 -c 1
@@ -59,7 +62,7 @@ else
          echo "Running creates on 1 node(s)."
  
          COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --create --time ${TIME_PERIOD}
-                            --dir ${TESTDIR_SINGLE} --filefmt 'f%%d'"
+                    --nfiles $NUM_FILES --dir ${TESTDIR_SINGLE} --filefmt 'f%%d'"
          echo "+ ${COMMAND}"
          mpi_run -np 1 -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
  
@@ -67,7 +70,6 @@ else
          [ -f $LOG ] && cat $LOG
              error "mpirun ... mdsrate ... failed, aborting"
          fi
-        check_rate create ${SINGLE_TARGET_RATE} 1 ${LOG} || true
      fi
  
      if [ -n "$NOUNLINK" ]; then
@@ -76,7 +78,6 @@ else
          log "===== $0 ### 1 NODE UNLINK ###"
          echo "Running unlinks on 1 node(s)."
  
-        let NUM_FILES=${SINGLE_TARGET_RATE}\*${TIME_PERIOD}
          COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --unlink --time ${TIME_PERIOD}
                       --nfiles ${NUM_FILES} --dir ${TESTDIR_SINGLE} --filefmt 'f%%d'"
          echo "+ ${COMMAND}"
@@ -86,10 +87,14 @@ else
          [ -f $LOG ] && cat $LOG
              error "mpirun ... mdsrate ... failed, aborting"
          fi
-        check_rate unlink ${SINGLE_TARGET_RATE} 1 ${LOG} || true
      fi
  fi
  
+IFree=$(inodes_available)
+if [ $IFree -lt $NUM_FILES ]; then
+    NUM_FILES=$IFree
+fi
+
  if [ -n "$NOMULTI" ]; then
      echo "NO tests on multiple nodes."
  else
@@ -102,7 +107,7 @@ else
          echo "Running creates on ${NUM_CLIENTS} node(s) with $THREADS_PER_CLIENT threads per client."
  
          COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --create --time ${TIME_PERIOD}
-                            --dir ${TESTDIR_MULTI} --filefmt 'f%%d'"
+                    --nfiles $NUM_FILES --dir ${TESTDIR_MULTI} --filefmt 'f%%d'"
          echo "+ ${COMMAND}"
          mpi_run -np $((NUM_CLIENTS * THREADS_PER_CLIENT)) -machinefile ${MACHINEFILE} \
              ${COMMAND} | tee ${LOG}
@@ -110,7 +115,6 @@ else
              [ -f $LOG ] && cat $LOG
              error "mpirun ... mdsrate ... failed, aborting"
          fi
-        check_rate create ${AGGREGATE_TARGET_RATE} ${NUM_CLIENTS} ${LOG} || true
      fi
  
      if [ -n "$NOUNLINK" ]; then
@@ -119,7 +123,6 @@ else
          log "===== $0 ### $NUM_CLIENTS NODES UNLINK ###"
          echo "Running unlinks on ${NUM_CLIENTS} node(s) with $THREADS_PER_CLIENT threads per client."
  
-        let NUM_FILES=${AGGREGATE_TARGET_RATE}\*${TIME_PERIOD}
          COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --unlink --time ${TIME_PERIOD}
                        --nfiles ${NUM_FILES} --dir ${TESTDIR_MULTI} --filefmt 'f%%d'"
          echo "+ ${COMMAND}"
@@ -129,13 +132,11 @@ else
              [ -f $LOG ] && cat $LOG
              error "mpirun ... mdsrate ... failed, aborting"
          fi
-        check_rate unlink ${AGGREGATE_TARGET_RATE} ${NUM_CLIENTS} ${LOG} || true
      fi
  fi
  
  equals_msg `basename $0`: test complete, cleaning up
  rm -f $MACHINEFILE 
-zconf_umount_clients $NODES_TO_USE $MOUNT
  check_and_cleanup_lustre
  #rm -f $LOG
  
diff --git a/lustre/tests/mdsrate-lookup-1dir.sh b/lustre/tests/mdsrate-lookup-1dir.sh

index eb5f497..3387a56 100644 (file)
--- a/lustre/tests/mdsrate-lookup-1dir.sh
+++ b/lustre/tests/mdsrate-lookup-1dir.sh
@@ -21,8 +21,6 @@ TESTDIR=$MOUNT
  # Requirements
  NUM_FILES=${NUM_FILES:-1000000}
  TIME_PERIOD=${TIME_PERIOD:-600}                        # seconds
-SINGLE_TARGET_RATE=5900                  # ops/sec
-AGGREGATE_TARGET_RATE=62000              # ops/sec
  
  LOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh).log}
  CLIENT=$SINGLECLIENT
@@ -37,6 +35,11 @@ log "===== $0 ====== "
  
  check_and_setup_lustre
  
+IFree=$(inodes_available)
+if [ $IFree -lt $NUM_FILES ]; then
+    NUM_FILES=$IFree
+fi
+
  generate_machine_file $NODES_TO_USE $MACHINEFILE || error "can not generate machinefile"
  
  $LFS setstripe $TESTDIR -c 1
@@ -78,7 +81,6 @@ else
          [ -f $LOG ] && cat $LOG
          error "mpirun ... mdsrate ... failed, aborting"
      fi
-    check_rate lookup ${SINGLE_TARGET_RATE} 1 ${LOG} || true
  fi
  
  # 2
@@ -94,12 +96,11 @@ else
          [ -f $LOG ] && cat $LOG
          error "mpirun ... mdsrate ... failed, aborting"
      fi
-    check_rate lookup ${AGGREGATE_TARGET_RATE} ${NUM_CLIENTS} ${LOG} || true
  fi
  
  equals_msg `basename $0`: test complete, cleaning up
+mdsrate_cleanup $NUM_CLIENTS $MACHINEFILE $NUM_FILES $TESTDIR 'f%%d'
  rm -f $MACHINEFILE
-zconf_umount_clients $NODES_TO_USE $MOUNT
  check_and_cleanup_lustre
  #rm -f $LOG
  
diff --git a/lustre/tests/mdsrate-stat-large.sh b/lustre/tests/mdsrate-stat-large.sh

index daadc40..a26ffc8 100644 (file)
--- a/lustre/tests/mdsrate-stat-large.sh
+++ b/lustre/tests/mdsrate-stat-large.sh
@@ -23,8 +23,6 @@ TESTDIR=$MOUNT
  # Requirements
  NUM_FILES=${NUM_FILES:-1000000}
  TIME_PERIOD=${TIME_PERIOD:-600}                        # seconds
-SINGLE_TARGET_RATE=$((3300 / OSTCOUNT))      # ops/sec
-AGGREGATE_TARGET_RATE=$((28500 / OSTCOUNT))  # ops/sec
  
  # --random_order (default) -OR- --readdir_order
  DIR_ORDER=${DIR_ORDER:-"--readdir_order"}
@@ -42,6 +40,11 @@ log "===== $0 ====== "
  
  check_and_setup_lustre
  
+IFree=$(inodes_available)
+if [ $IFree -lt $NUM_FILES ]; then
+    NUM_FILES=$IFree
+fi
+
  generate_machine_file $NODES_TO_USE $MACHINEFILE || error "can not generate machinefile"
  
  $LFS setstripe $TESTDIR -c -1
@@ -86,7 +89,6 @@ else
          [ -f $LOG ] && cat $LOG
          error "mpirun ... mdsrate ... failed, aborting"
      fi
-    check_rate stat ${SINGLE_TARGET_RATE} 1 ${LOG} || true
  fi
  
  # 2
@@ -104,12 +106,11 @@ else
          [ -f $LOG ] && cat $LOG
          error "mpirun ... mdsrate ... failed, aborting"
      fi
-    check_rate stat ${AGGREGATE_TARGET_RATE} ${NUM_CLIENTS} ${LOG} || true
  fi
  
  equals_msg `basename $0`: test complete, cleaning up
+mdsrate_cleanup $NUM_CLIENTS $MACHINEFILE $NUM_FILES $TESTDIR 'f%%d'
  rm -f $MACHINEFILE
-zconf_umount_clients $NODES_TO_USE $MOUNT
  check_and_cleanup_lustre
  #rm -f $LOG
  
diff --git a/lustre/tests/mdsrate-stat-small.sh b/lustre/tests/mdsrate-stat-small.sh

index 1503416..f667ee6 100644 (file)
--- a/lustre/tests/mdsrate-stat-small.sh
+++ b/lustre/tests/mdsrate-stat-small.sh
@@ -23,8 +23,6 @@ TESTDIR=$MOUNT
  # Requirements
  NUM_FILES=${NUM_FILES:-1000000}
  TIME_PERIOD=${TIME_PERIOD:-600}                        # seconds
-SINGLE_TARGET_RATE=3200                     # ops/sec
-AGGREGATE_TARGET_RATE=29000                 # ops/sec
  
  # --random_order (default) -OR- --readdir_order
  DIR_ORDER=${DIR_ORDER:-"--readdir_order"}
@@ -42,6 +40,11 @@ log "===== $0 ====== "
  
  check_and_setup_lustre
  
+IFree=$(inodes_available)
+if [ $IFree -lt $NUM_FILES ]; then
+    NUM_FILES=$IFree
+fi
+
  generate_machine_file $NODES_TO_USE $MACHINEFILE || error "can not generate machinefile"
  
  $LFS setstripe $TESTDIR -i 0 -c 1
@@ -86,7 +89,6 @@ else
          [ -f $LOG ] && cat $LOG
          error "mpirun ... mdsrate ... failed, aborting"
      fi
-    check_rate stat ${SINGLE_TARGET_RATE} 1 ${LOG} || true
  fi
  
  # 2
@@ -103,12 +105,11 @@ else
          [ -f $LOG ] && cat $LOG
          error "mpirun ... mdsrate ... failed, aborting"
      fi
-    check_rate stat ${AGGREGATE_TARGET_RATE} ${NUM_CLIENTS} ${LOG} || true
  fi
  
  equals_msg `basename $0`: test complete, cleaning up
+mdsrate_cleanup $NUM_CLIENTS $MACHINEFILE $NUM_FILES $TESTDIR 'f%%d'
  rm -f $MACHINEFILE
-zconf_umount_clients $NODES_TO_USE $MOUNT
  check_and_cleanup_lustre
  #rm -f $LOG
  
diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh

index 9efed77..41087cf 100755 (executable)
--- a/lustre/tests/recovery-small.sh
+++ b/lustre/tests/recovery-small.sh
@@ -644,54 +644,65 @@ test_24() { # bug 11710 details correct fsync() behavior
  }
  run_test 24 "fsync error (should return error)"
  
+# Wait (via wait_update, $3 = its last argument, presumably a max wait) until
+# num_exports for facet $1 reads ($2 - 1), i.e. one export fewer than before.
+wait_client_evicted () {
+       local facet=$1 exports=$2
+       local varsvc=${facet}_svc       # name of the variable holding the service name
+       wait_update $(facet_host $facet) "lctl get_param -n *.${!varsvc}.num_exports | cut -d' ' -f2" $((exports - 1)) $3
+}
+
  test_26a() {      # was test_26 bug 5921 - evict dead exports by pinger
  # this test can only run from a client on a separate node.
         remote_ost || { skip "local OST" && return 0; }
         remote_ost_nodsh && skip "remote OST with nodsh" && return 0
         remote_mds || { skip "local MDS" && return 0; }
-       OST_FILE=obdfilter.${ost1_svc}.num_exports
-        OST_EXP="`do_facet ost1 lctl get_param -n $OST_FILE`"
-       OST_NEXP1=`echo $OST_EXP | cut -d' ' -f2`
-       echo starting with $OST_NEXP1 OST exports
+
+       check_timeout || return 1
+
+       local OST_NEXP=$(do_facet ost1 lctl get_param -n obdfilter.${ost1_svc}.num_exports | cut -d' ' -f2)
+
+       echo starting with $OST_NEXP OST exports
  # OBD_FAIL_PTLRPC_DROP_RPC 0x505
         do_facet client lctl set_param fail_loc=0x505
          # evictor takes PING_EVICT_TIMEOUT + 3 * PING_INTERVAL to evict.
          # But if there's a race to start the evictor from various obds,
          # the loser might have to wait for the next ping.
-       echo Waiting for $(($TIMEOUT * 8)) secs
-       sleep $(($TIMEOUT * 8))
-        OST_EXP="`do_facet ost1 lctl get_param -n $OST_FILE`"
-       OST_NEXP2=`echo $OST_EXP | cut -d' ' -f2`
-       echo ending with $OST_NEXP2 OST exports
+
+       local rc=0
+       wait_client_evicted ost1 $OST_NEXP $((TIMEOUT * 2 + TIMEOUT * 3 / 4))
+       rc=$?
         do_facet client lctl set_param fail_loc=0x0
-        [ $OST_NEXP1 -le $OST_NEXP2 ] && error "client not evicted"
-       return 0
+        [ $rc -eq 0 ] || error "client not evicted from OST"
  }
  run_test 26a "evict dead exports"
  
  test_26b() {      # bug 10140 - evict dead exports by pinger
         remote_ost_nodsh && skip "remote OST with nodsh" && return 0
  
+       check_timeout || return 1
         client_df
-        zconf_mount `hostname` $MOUNT2 || error "Failed to mount $MOUNT2"
-        sleep 1 # wait connections being established
-       MDS_FILE=mdt.${mds1_svc}.num_exports
-        MDS_NEXP1="`do_facet $SINGLEMDS lctl get_param -n $MDS_FILE | cut -d' ' -f2`"
-        OST_FILE=obdfilter.${ost1_svc}.num_exports
-        OST_NEXP1="`do_facet ost1 lctl get_param -n $OST_FILE | cut -d' ' -f2`"
-        echo starting with $OST_NEXP1 OST and $MDS_NEXP1 MDS exports
-        zconf_umount `hostname` $MOUNT2 -f
-        # evictor takes PING_EVICT_TIMEOUT + 3 * PING_INTERVAL to evict.  
-        # But if there's a race to start the evictor from various obds, 
-        # the loser might have to wait for the next ping.
-        echo Waiting for $(($TIMEOUT * 3)) secs
-        sleep $(($TIMEOUT * 3))
-        OST_NEXP2="`do_facet ost1 lctl get_param -n $OST_FILE | cut -d' ' -f2`"
-        MDS_NEXP2="`do_facet $SINGLEMDS lctl get_param -n $MDS_FILE | cut -d' ' -f2`"
-        echo ending with $OST_NEXP2 OST and $MDS_NEXP2 MDS exports
-        [ $OST_NEXP1 -le $OST_NEXP2 ] && error "client not evicted from OST"
-        [ $MDS_NEXP1 -le $MDS_NEXP2 ] && error "client not evicted from MDS"
-       return 0
+       zconf_mount `hostname` $MOUNT2 || error "Failed to mount $MOUNT2"
+       sleep 1 # wait connections being established
+
+       local MDS_NEXP=$(do_facet $SINGLEMDS lctl get_param -n mdt.${mds1_svc}.num_exports | cut -d' ' -f2)
+       local OST_NEXP=$(do_facet ost1 lctl get_param -n obdfilter.${ost1_svc}.num_exports | cut -d' ' -f2)
+
+       echo starting with $OST_NEXP OST and $MDS_NEXP MDS exports
+
+       zconf_umount `hostname` $MOUNT2 -f
+
+       # evictor takes PING_EVICT_TIMEOUT + 3 * PING_INTERVAL to evict.  
+       # But if there's a race to start the evictor from various obds, 
+       # the loser might have to wait for the next ping.
+       # PING_INTERVAL max(obd_timeout / 4, 1U)
+       # sleep (2*PING_INTERVAL) 
+
+        local rc=0
+        wait_client_evicted ost1 $OST_NEXP $((TIMEOUT * 2 + TIMEOUT * 3 / 4)) || \
+               error "Client was not evicted by ost" rc=1
+       wait_client_evicted $SINGLEMDS $MDS_NEXP $((TIMEOUT * 2 + TIMEOUT * 3 / 4)) || \
+               error "Client was not evicted by mds"
  }
  run_test 26b "evict dead exports"
  
diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh

index 925a089..71d83a3 100755 (executable)
--- a/lustre/tests/replay-single.sh
+++ b/lustre/tests/replay-single.sh
@@ -19,8 +19,8 @@ GRANT_CHECK_LIST=${GRANT_CHECK_LIST:-""}
  remote_mds_nodsh && log "SKIP: remote MDS with nodsh" && exit 0
  
  # Skip these tests
-# bug number:  17466
-ALWAYS_EXCEPT="61d   $REPLAY_SINGLE_EXCEPT"
+# bug number:  17466 15962
+ALWAYS_EXCEPT="61d   33b $REPLAY_SINGLE_EXCEPT"
  
  if [ "$FAILURE_MODE" = "HARD" ] && mixed_ost_devs; then
      CONFIG_EXCEPTIONS="0b 42 47 61a 61c"
@@ -730,7 +730,7 @@ test_33a() {        # was test_33
  }
  run_test 33a "abort recovery before client does replay"
  
-# Stale FID sequence
+# Stale FID sequence bug 15962
  test_33b() {   # was test_33a
      replay_barrier $SINGLEMDS
      createmany -o $DIR/$tfile-%d 10
@@ -1112,6 +1112,8 @@ test_53a() {
  run_test 53a "|X| close request while two MDC requests in flight"
  
  test_53b() {
+        rm -rf $DIR/${tdir}-1 $DIR/${tdir}-2
+
          mkdir -p $DIR/${tdir}-1
          mkdir -p $DIR/${tdir}-2
          multiop $DIR/${tdir}-1/f O_c &
@@ -1141,6 +1143,8 @@ test_53b() {
  run_test 53b "|X| open request while two MDC requests in flight"
  
  test_53c() {
+        rm -rf $DIR/${tdir}-1 $DIR/${tdir}-2
+
          mkdir -p $DIR/${tdir}-1
          mkdir -p $DIR/${tdir}-2
          multiop $DIR/${tdir}-1/f O_c &
@@ -1172,6 +1176,8 @@ test_53c() {
  run_test 53c "|X| open request and close request while two MDC requests in flight"
  
  test_53d() {
+        rm -rf $DIR/${tdir}-1 $DIR/${tdir}-2
+
          mkdir -p $DIR/${tdir}-1
          mkdir -p $DIR/${tdir}-2
          multiop $DIR/${tdir}-1/f O_c &
@@ -1198,6 +1204,8 @@ test_53d() {
  run_test 53d "|X| close reply while two MDC requests in flight"
  
  test_53e() {
+        rm -rf $DIR/${tdir}-1 $DIR/${tdir}-2
+
          mkdir -p $DIR/${tdir}-1
          mkdir -p $DIR/${tdir}-2
          multiop $DIR/${tdir}-1/f O_c &
@@ -1227,6 +1235,8 @@ test_53e() {
  run_test 53e "|X| open reply while two MDC requests in flight"
  
  test_53f() {
+        rm -rf $DIR/${tdir}-1 $DIR/${tdir}-2
+
          mkdir -p $DIR/${tdir}-1
          mkdir -p $DIR/${tdir}-2
          multiop $DIR/${tdir}-1/f O_c &
@@ -1258,6 +1268,8 @@ test_53f() {
  run_test 53f "|X| open reply and close reply while two MDC requests in flight"
  
  test_53g() {
+        rm -rf $DIR/${tdir}-1 $DIR/${tdir}-2
+
          mkdir -p $DIR/${tdir}-1
          mkdir -p $DIR/${tdir}-2
          multiop $DIR/${tdir}-1/f O_c &
@@ -1289,6 +1301,8 @@ test_53g() {
  run_test 53g "|X| drop open reply and close request while close and open are both in flight"
  
  test_53h() {
+        rm -rf $DIR/${tdir}-1 $DIR/${tdir}-2
+
          mkdir -p $DIR/${tdir}-1
          mkdir -p $DIR/${tdir}-2
          multiop $DIR/${tdir}-1/f O_c &
@@ -1513,10 +1527,31 @@ run_test 62 "don't mis-drop resent replay"
  
  #Adaptive Timeouts (bug 3055)
  AT_MAX_SET=0
-# Suppose that all osts have the same at_max
-for facet in mds client ost; do
-    eval AT_MAX_SAVE_${facet}=$(at_max_get $facet)
-done
+
+# Undo the Adaptive Timeout changes made by the AT tests: write $ATOLDBASE
+# back to at_history (when set) and restore the saved at_max values.
+at_cleanup () {
+    local facet saved at_now
+
+    echo "Cleaning up AT ..."
+    if [ -n "$ATOLDBASE" ]; then
+        # the path found on the mds is reused unchanged for ost1
+        local at_history=$(do_facet mds "find /sys/ -name at_history")
+        do_facet mds "echo $ATOLDBASE >> $at_history" || true
+        do_facet ost1 "echo $ATOLDBASE >> $at_history" || true
+    fi
+
+    # AT_MAX_SET == 0: skip the at_max restore entirely
+    [ $AT_MAX_SET -ne 0 ] || return 0
+    for facet in mds client ost; do
+        saved=AT_MAX_SAVE_${facet}
+        echo restore AT on $facet to saved value ${!saved}
+        at_max_set ${!saved} $facet
+        at_now=$(at_max_get $facet)
+        echo Restored AT value on $facet $at_now
+        [ $at_now -eq ${!saved} ] || \
+        error "$facet : AT value was not restored SAVED ${!saved} NEW $at_now"
+    done
+}
  
  at_start()
  {
@@ -1526,8 +1561,15 @@ at_start()
          return 1
      fi
  
+    # Save at_max original values
+    local facet
+    if [ $AT_MAX_SET -eq 0 ]; then
+        # Suppose that all osts have the same at_max
+        for facet in mds client ost; do
+            eval AT_MAX_SAVE_${facet}=$(at_max_get $facet)
+        done
+    fi
      local at_max
-
      for facet in mds client ost; do
          at_max=$(at_max_get $facet)
          if [ $at_max -ne $at_max_new ]; then
@@ -1736,24 +1778,7 @@ test_68 () #bug 13813
  }
  run_test 68 "AT: verify slowing locks"
  
-if [ -n "$ATOLDBASE" ]; then
-    at_history=$(do_facet mds "find /sys/ -name at_history")
-    do_facet mds "echo $ATOLDBASE >> $at_history" || true
-    do_facet ost1 "echo $ATOLDBASE >> $at_history" || true
-fi
-
-if [ $AT_MAX_SET -ne 0 ]; then
-    for facet in mds client ost; do
-        var=AT_MAX_SAVE_${facet}
-        echo restore AT on $facet to saved value ${!var}
-        at_max_set ${!var} $facet
-        AT_NEW=$(at_max_get $facet)
-        echo Restored AT value on $facet $AT_NEW 
-        [ $AT_NEW -ne ${!var} ] && \
-            error "$facet : AT value was not restored SAVED ${!var} NEW $AT_NEW"
-    done
-fi
-
+at_cleanup
  # end of AT tests includes above lines
  
  
diff --git a/lustre/tests/sanity-gss.sh b/lustre/tests/sanity-gss.sh

index 018c242..478f872 100644 (file)
--- a/lustre/tests/sanity-gss.sh
+++ b/lustre/tests/sanity-gss.sh
@@ -83,7 +83,7 @@ check_and_setup_lustre
  
  rm -rf $DIR/[df][0-9]*
  
-check_runas_id $RUNAS_ID $RUNAS
+check_runas_id $RUNAS_ID $RUNAS_ID $RUNAS
  
  build_test_filter
  
@@ -647,27 +647,39 @@ run_test 7 "exercise enlarge_reqbuf()"
  
  test_8()
  {
-    sleep $TIMEOUT
+    local ATHISTORY=$(do_facet mds "find /sys/ -name at_history")
+    local ATOLDBASE=$(do_facet mds "cat $ATHISTORY")
+    do_facet mds "echo 8 >> $ATHISTORY"
+
      $LCTL dk > /dev/null
      debugsave
      sysctl -w lnet.debug="+other"
  
+    mkdir -p $DIR/d8
+    chmod a+w $DIR/d8
+
+    REQ_DELAY=`lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts |
+               awk '/portal 12/ {print $5}' | tail -1`
+    REQ_DELAY=$((${REQ_DELAY} + ${REQ_DELAY} / 4 + 5))
+
      # sleep sometime in ctx handle
-    do_facet mds lctl set_param fail_val=30
+    do_facet mds lctl set_param fail_val=$REQ_DELAY
  #define OBD_FAIL_SEC_CTX_HDL_PAUSE       0x1204
      do_facet mds lctl set_param fail_loc=0x1204
  
      $RUNAS $LFS flushctx || error "can't flush ctx"
  
-    $RUNAS df $DIR &
-    DFPID=$!
-    echo "waiting df (pid $TOUCHPID) to finish..."
-    sleep 2 # give df a chance to really trigger context init rpc
+    $RUNAS touch $DIR/d8/f &
+    TOUCHPID=$!
+    echo "waiting for touch (pid $TOUCHPID) to finish..."
+    sleep 2 # give it a chance to really trigger context init rpc
      do_facet mds sysctl -w lustre.fail_loc=0
-    wait $DFPID || error "df should have succeeded"
+    wait $TOUCHPID || error "touch should have succeeded"
  
      $LCTL dk | grep "Early reply #" || error "No early reply"
+
      debugrestore
+    do_facet mds "echo $ATOLDBASE >> $ATHISTORY" || true
  }
  run_test 8 "Early reply sent for slow gss context negotiation"
  
@@ -676,98 +688,6 @@ run_test 8 "Early reply sent for slow gss context negotiation"
  # so each test should not assume any start flavor.
  #
  
-test_50() {
-    local sample=$TMP/sanity-gss-8
-    local tdir=$MOUNT/dir8
-    local iosize="256K"
-    local hash_algs="adler32 crc32 md5 sha1 sha256 sha384 sha512 wp256 wp384 wp512"
-
-    # create sample file with aligned size for direct i/o
-    dd if=/dev/zero of=$sample bs=$iosize count=1 || error
-    dd conv=notrunc if=/etc/termcap of=$sample bs=$iosize count=1 || error
-
-    rm -rf $tdir
-    mkdir $tdir || error "create dir $tdir"
-
-    restore_to_default_flavor
-
-    for alg in $hash_algs; do
-        echo "Testing $alg..."
-        flavor=krb5i-bulki:$alg/null
-        set_rule $FSNAME any cli2ost $flavor
-        wait_flavor cli2ost $flavor $cnt_cli2ost
-
-        dd if=$sample of=$tdir/$alg oflag=direct,dsync bs=$iosize || error "$alg write"
-        diff $sample $tdir/$alg || error "$alg read"
-    done
-
-    rm -rf $tdir
-    rm -f $sample
-}
-run_test 50 "verify bulk hash algorithms works"
-
-test_51() {
-    local s1=$TMP/sanity-gss-9.1
-    local s2=$TMP/sanity-gss-9.2
-    local s3=$TMP/sanity-gss-9.3
-    local s4=$TMP/sanity-gss-9.4
-    local tdir=$MOUNT/dir9
-    local s1_size=4194304   # n * pagesize (4M)
-    local s2_size=512       # n * blksize
-    local s3_size=111       # n * blksize + m
-    local s4_size=5         # m
-    local cipher_algs="arc4 aes128 aes192 aes256 cast128 cast256 twofish128 twofish256"
-
-    # create sample files for each situation
-    rm -f $s1 $s2 $s2 $s4
-    dd if=/dev/urandom of=$s1 bs=1M count=4 || error
-    dd if=/dev/urandom of=$s2 bs=$s2_size count=1 || error
-    dd if=/dev/urandom of=$s3 bs=$s3_size count=1 || error
-    dd if=/dev/urandom of=$s4 bs=$s4_size count=1 || error
-
-    rm -rf $tdir
-    mkdir $tdir || error "create dir $tdir"
-
-    restore_to_default_flavor
-
-    #
-    # different bulk data alignment will lead to different behavior of
-    # the implementation: (n > 0; 0 < m < encryption_block_size)
-    #  - full page i/o
-    #  - partial page, size = n * encryption_block_size
-    #  - partial page, size = n * encryption_block_size + m
-    #  - partial page, size = m
-    #
-    for alg in $cipher_algs; do
-        echo "Testing $alg..."
-        flavor=krb5p-bulkp:sha1/$alg
-        set_rule $FSNAME any cli2ost $flavor
-        wait_flavor cli2ost $flavor $cnt_cli2ost
-
-        # sync write
-        dd if=$s1 of=$tdir/$alg.1 oflag=dsync bs=1M || error "write $alg.1"
-        dd if=$s2 of=$tdir/$alg.2 oflag=dsync || error "write $alg.2"
-        dd if=$s3 of=$tdir/$alg.3 oflag=dsync || error "write $alg.3"
-        dd if=$s4 of=$tdir/$alg.4 oflag=dsync || error "write $alg.4"
-
-        # remount client
-        umount_client $MOUNT
-        umount_client $MOUNT2
-        mount_client $MOUNT
-        mount_client $MOUNT2
-
-        # read & compare
-        diff $tdir/$alg.1 $s1 || error "read $alg.1"
-        diff $tdir/$alg.2 $s2 || error "read $alg.2"
-        diff $tdir/$alg.3 $s3 || error "read $alg.3"
-        diff $tdir/$alg.4 $s4 || error "read $alg.4"
-    done
-
-    rm -rf $tdir
-    rm -f $sample
-}
-run_test 51 "bulk data alignment test under encryption mode"
-
  test_90() {
      if [ "$SLOW" = "no" ]; then
          total=10
diff --git a/lustre/tests/sanity-quota.sh b/lustre/tests/sanity-quota.sh

index e1cb070..5e56c85 100644 (file)
--- a/lustre/tests/sanity-quota.sh
+++ b/lustre/tests/sanity-quota.sh
@@ -121,22 +121,24 @@ set_file_unitsz() {
  lustre_fail() {
         local fail_node=$1
         local fail_loc=$2
-
-       case $fail_node in
-           "mds" )
-               do_facet $SINGLEMDS "lctl set_param fail_loc=$fail_loc" ;;
-           "ost" )
-               for num in `seq $OSTCOUNT`; do
-                   do_facet ost$num "lctl set_param fail_loc=$fail_loc"
-               done ;;
-           "mds_ost" )
-               do_facet $SINGLEMDS "lctl set_param fail_loc=$fail_loc" ;
-               for num in `seq $OSTCOUNT`; do
-                   do_facet ost$num "lctl set_param fail_loc=$fail_loc"
-               done ;;
-           * ) echo "usage: lustre_fail fail_node fail_loc" ;
-               return 1 ;;
-       esac
+       local fail_val=${3:-0}
+
+       if [ $fail_node == "mds" ] || [ $fail_node == "mds_ost" ]; then
+           if [ $((fail_loc & 0x10000000)) -ne 0  -a $fail_val -gt 0 ] || \
+               [ $((fail_loc)) -eq 0 ]; then
+               do_facet $SINGLEMDS "lctl set_param fail_val=$fail_val"
+           fi
+           do_facet $SINGLEMDS "lctl set_param fail_loc=$fail_loc"
+       fi
+       if [ $fail_node == "ost" ] || [ $fail_node == "mds_ost" ]; then
+           for num in `seq $OSTCOUNT`; do
+               if [ $((fail_loc & 0x10000000)) -ne 0 -a $fail_val -gt 0 ] || \
+                   [ $((fail_loc)) -eq 0 ]; then
+                   do_facet ost$num "lctl set_param fail_val=$fail_val"
+               fi
+               do_facet ost$num "lctl set_param fail_loc=$fail_loc"
+           done
+       fi
  }
  
  RUNAS="runas -u $TSTID -g $TSTID"
@@ -968,11 +970,6 @@ test_12() {
         [ "$(grep $DIR2 /proc/mounts)" ] || mount_client $DIR2 || \
                 { skip "Need lustre mounted on $MOUNT2 " && retutn 0; }
  
-       if [ $OSTCOUNT -lt 2 ]; then
-               skip "$OSTCOUNT < 2, too few osts"
-               return 0;
-       fi
-
         LIMIT=$(( $BUNIT_SZ * $(($OSTCOUNT + 1)) * 10)) # 10 bunits each sever
         TESTFILE="$DIR/$tdir/$tfile-0"
         TESTFILE2="$DIR2/$tdir/$tfile-1"
@@ -984,11 +981,12 @@ test_12() {
  
         $LFS setstripe $TESTFILE -i 0 -c 1
         chown $TSTUSR.$TSTUSR $TESTFILE
-       $LFS setstripe $TESTFILE2 -i 1 -c 1
+       $LFS setstripe $TESTFILE2 -i 0 -c 1
         chown $TSTUSR2.$TSTUSR2 $TESTFILE2
  
         #define OBD_FAIL_OST_HOLD_WRITE_RPC      0x21f
-       lustre_fail ost 0x0000021f
+       #define OBD_FAIL_SOME        0x10000000 /* fail N times */
+       lustre_fail ost $((0x0000021f | 0x10000000)) 1
  
         echo "   step1: write out of block quota ..."
         $RUNAS2 dd if=/dev/zero of=$TESTFILE2 bs=$BLK_SZ count=102400 &
@@ -1785,24 +1783,25 @@ test_25_sub() {
         chmod 0777 $DIR/$tdir
         TESTFILE="$DIR/$tdir/$tfile-0"
         rm -f $TESTFILE
+       LIMIT=$(( $BUNIT_SZ * ($OSTCOUNT + 1) + 4096 ))
  
         wait_delete_completed
  
          # set quota for $TSTUSR
          log "setquota for $TSTUSR"
-       $LFS setquota $1 $TSTUSR -b 10240 -B 10240 -i 10 -I 10 $DIR
+       $LFS setquota $1 $TSTUSR -b $LIMIT -B $LIMIT -i 10 -I 10 $DIR
         sleep 3
          show_quota $1 $TSTUSR
  
          # set quota for $TSTUSR2
          log "setquota for $TSTUSR2"
-       $LFS setquota $1 $TSTUSR2 -b 10240 -B 10240 -i 10 -I 10 $DIR
+       $LFS setquota $1 $TSTUSR2 -b $LIMIT -B $LIMIT -i 10 -I 10 $DIR
         sleep 3
          show_quota $1 $TSTUSR2
  
          # set stripe index to 0
          log "setstripe for $DIR/$tdir to 0"
-       $LFS setstripe $DIR/$tdir -i 0
+       $LFS setstripe $DIR/$tdir -c 1 -i 0
         MDS_UUID=`do_facet $SINGLEMDS $LCTL dl | grep -m1 " mdt " | awk '{print $((NF-1))}'`
         OST0_UUID=`do_facet ost1 $LCTL dl | grep -m1 obdfilter | awk '{print $((NF-1))}'`
         MDS_QUOTA_USED_OLD=`$LFS quota -o $MDS_UUID $1 $TSTUSR $DIR | awk '/^.*[[:digit:]+][[:space:]+]/ { print $4 }'`
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh

index fd20f97..568620d 100644 (file)
--- a/lustre/tests/sanity.sh
+++ b/lustre/tests/sanity.sh
@@ -8,7 +8,7 @@ set -e
  
  ONLY=${ONLY:-"$*"}
  # bug number for skipped test: 13297 2108 9789 3637 9789 3561 12622 12653 12653 5188 10764 16260
-ALWAYS_EXCEPT="                27u   42a  42b  42c  42d  45   51d   65a   65e   68   75    119d  $SANITY_EXCEPT"
+ALWAYS_EXCEPT="                27u   42a  42b  42c  42d  45   51d   65a   65e   68b   75    119d  $SANITY_EXCEPT"
  # bug number for skipped test: 2108 9789 3637 9789 3561 5188/5749 1443
  #ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"27m 42a 42b 42c 42d 45 68 76"}
  # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
@@ -3075,12 +3075,15 @@ LLOOP=
  cleanup_68() {
         trap 0
         if [ ! -z "$LLOOP" ]; then
-               swapoff $LLOOP || error "swapoff failed"
+               if swapon -s | grep -q $LLOOP; then
+                       swapoff $LLOOP || error "swapoff failed"
+               fi
+
                 $LCTL blockdev_detach $LLOOP || error "detach failed"
                 rm -f $LLOOP
                 unset LLOOP
         fi
-       rm -f $DIR/f68
+       rm -f $DIR/f68*
  }
  
  meminfo() {
@@ -3091,10 +3094,29 @@ swap_used() {
         swapon -s | awk '($1 == "'$1'") { print $4 }'
  }
  
+# test case for lloop driver, basic function
+test_68a() {
+       [ "$UID" != 0 ] && skip "must run as root" && return
+
+       # probe inside a condition: a missing module must skip, not trip "set -e"
+       grep -q llite_lloop /proc/modules ||
+               { skip "can't find module llite_lloop" && return; }
+       LLOOP=$TMP/lloop.`date +%s`.`date +%N`
+       # backing file is 1024 x 4k; the second directio call (1025) must fail
+       dd if=/dev/zero of=$DIR/f68a bs=4k count=1024
+       $LCTL blockdev_attach $DIR/f68a $LLOOP || error "attach failed"
+
+       trap cleanup_68 EXIT    # cleanup_68 detaches $LLOOP if the script exits early
+
+       directio rdwr $LLOOP 0 1024 4096 || error "direct write failed"
+       directio rdwr $LLOOP 0 1025 4096 && error "direct write should fail"
+       cleanup_68
+}
+run_test 68a "lloop driver - basic test ========================"
  
  # excercise swapping to lustre by adding a high priority swapfile entry
  # and then consuming memory until it is used.
-test_68() {
+test_68b() {  # was test_68
         [ "$UID" != 0 ] && skip "must run as root" && return
         lctl get_param -n devices | grep -q obdfilter && \
                 skip "local OST" && return
@@ -3110,10 +3132,10 @@ test_68() {
         [[ $NR_BLOCKS -le 2048 ]] && NR_BLOCKS=2048
  
         LLOOP=$TMP/lloop.`date +%s`.`date +%N`
-       dd if=/dev/zero of=$DIR/f68 bs=64k seek=$NR_BLOCKS count=1
-       mkswap $DIR/f68
+       dd if=/dev/zero of=$DIR/f68b bs=64k seek=$NR_BLOCKS count=1
+       mkswap $DIR/f68b
  
-       $LCTL blockdev_attach $DIR/f68 $LLOOP || error "attach failed"
+       $LCTL blockdev_attach $DIR/f68b $LLOOP || error "attach failed"
  
         trap cleanup_68 EXIT
  
@@ -3128,7 +3150,7 @@ test_68() {
  
         [ $SWAPUSED -eq 0 ] && echo "no swap used???" || true
  }
-run_test 68 "support swapping to Lustre ========================"
+run_test 68b "support swapping to Lustre ========================"
  
  # bug5265, obdfilter oa2dentry return -ENOENT
  # #define OBD_FAIL_OST_ENOENT 0x217
@@ -3410,6 +3432,7 @@ setup_f77() {
  }
  
  test_77a() { # bug 10889
+       $GSS && skip "could not run with gss" && return
         [ ! -f $F77_TMP ] && setup_f77
         set_checksums 1
         dd if=$F77_TMP of=$DIR/$tfile bs=1M count=$F77SZ || error "dd error"
@@ -3419,6 +3442,7 @@ test_77a() { # bug 10889
  run_test 77a "normal checksum read/write operation ============="
  
  test_77b() { # bug 10889
+       $GSS && skip "could not run with gss" && return
         [ ! -f $F77_TMP ] && setup_f77
         #define OBD_FAIL_OSC_CHECKSUM_SEND       0x409
         lctl set_param fail_loc=0x80000409
@@ -3432,6 +3456,7 @@ test_77b() { # bug 10889
  run_test 77b "checksum error on client write ===================="
  
  test_77c() { # bug 10889
+       $GSS && skip "could not run with gss" && return
         [ ! -f $DIR/f77b ] && skip "requires 77b - skipping" && return
         set_checksums 1
         for algo in $CKSUM_TYPES; do
@@ -3448,6 +3473,7 @@ test_77c() { # bug 10889
  run_test 77c "checksum error on client read ==================="
  
  test_77d() { # bug 10889
+       $GSS && skip "could not run with gss" && return
         #define OBD_FAIL_OSC_CHECKSUM_SEND       0x409
         lctl set_param fail_loc=0x80000409
         set_checksums 1
@@ -3459,6 +3485,7 @@ test_77d() { # bug 10889
  run_test 77d "checksum error on OST direct write ==============="
  
  test_77e() { # bug 10889
+       $GSS && skip "could not run with gss" && return
         [ ! -f $DIR/f77 ] && skip "requires 77d - skipping" && return
         #define OBD_FAIL_OSC_CHECKSUM_RECEIVE    0x408
         lctl set_param fail_loc=0x80000408
@@ -3472,6 +3499,7 @@ test_77e() { # bug 10889
  run_test 77e "checksum error on OST direct read ================"
  
  test_77f() { # bug 10889
+       $GSS && skip "could not run with gss" && return
         set_checksums 1
         for algo in $CKSUM_TYPES; do
                 cancel_lru_locks osc
@@ -3488,6 +3516,7 @@ test_77f() { # bug 10889
  run_test 77f "repeat checksum error on write (expect error) ===="
  
  test_77g() { # bug 10889
+       $GSS && skip "could not run with gss" && return
         remote_ost_nodsh && skip "remote OST with nodsh" && return
  
         [ ! -f $F77_TMP ] && setup_f77
@@ -3504,6 +3533,7 @@ test_77g() { # bug 10889
  run_test 77g "checksum error on OST write ======================"
  
  test_77h() { # bug 10889
+       $GSS && skip "could not run with gss" && return
         remote_ost_nodsh && skip "remote OST with nodsh" && return
  
         [ ! -f $DIR/f77g ] && skip "requires 77g - skipping" && return
@@ -3518,6 +3548,7 @@ test_77h() { # bug 10889
  run_test 77h "checksum error on OST read ======================="
  
  test_77i() { # bug 13805
+       $GSS && skip "could not run with gss" && return
         #define OBD_FAIL_OSC_CONNECT_CKSUM       0x40b
         lctl set_param fail_loc=0x40b
         remount_client $MOUNT
@@ -3532,6 +3563,7 @@ test_77i() { # bug 13805
  run_test 77i "client not supporting OSD_CONNECT_CKSUM =========="
  
  test_77j() { # bug 13805
+       $GSS && skip "could not run with gss" && return
         #define OBD_FAIL_OSC_CKSUM_ADLER_ONLY    0x40c
         lctl set_param fail_loc=0x40c
         remount_client $MOUNT
@@ -3866,7 +3898,6 @@ setup_test102() {
  
         trap cleanup_test102 EXIT
         cd $DIR
-       # $1 = runas
         $1 $SETSTRIPE $tdir -s $STRIPE_SIZE -i $STRIPE_OFFSET -c $STRIPE_COUNT
         cd $DIR/$tdir
         for num in 1 2 3 4
@@ -3883,10 +3914,7 @@ setup_test102() {
         done
  
         cd $DIR
-       if [ "$TAR" == "tar" ]; then
-               TAR_OPTS="--xattrs"
-       fi
-       $1 $TAR cf $TMP/f102.tar $tdir $TAR_OPTS
+       $1 $TAR cf $TMP/f102.tar $tdir --xattrs
         SETUP_TEST102=yes
  }
  
@@ -4047,70 +4075,35 @@ compare_stripe_info2() {
  }
  
  find_lustre_tar() {
-       [ -n "$(which star 2>/dev/null)" ] && strings $(which star) | grep -q lustre && echo star && return
         [ -n "$(which tar 2>/dev/null)" ] && strings $(which tar) | grep -q lustre && echo tar
  }
  
  test_102d() {
-       # b10930: (s)tar test for trusted.lov xattr
+       # b10930: tar test for trusted.lov xattr
         TAR=$(find_lustre_tar)
-       [ -z "$TAR" ] && skip "lustre-aware (s)tar is not installed" && return
+       [ -z "$TAR" ] && skip "lustre-aware tar is not installed" && return
         [ "$OSTCOUNT" -lt "2" ] && skip "skipping N-stripe test" && return
         setup_test102
         mkdir -p $DIR/d102d
-       if [ "$TAR" == "tar" ]; then
-               TAR_OPTS="--xattrs"
-       fi
-       $TAR xf $TMP/f102.tar -C $DIR/d102d $TAR_OPTS
+       $TAR xf $TMP/f102.tar -C $DIR/d102d --xattrs
         cd $DIR/d102d/$tdir
         compare_stripe_info1
  }
-run_test 102d "(s)tar restore stripe info from tarfile,not keep osts ==========="
-
-test_102e() {
-       # b10930: star test for trusted.lov xattr
-       TAR=$(find_lustre_tar)
-       [ "$TAR" != star ] && skip "lustre-aware star is not installed" && return
-       [ "$OSTCOUNT" -lt "2" ] && skip "skipping N-stripe test" && return
-       setup_test102
-       mkdir -p $DIR/d102e
-       star -x  -preserve-osts f=$TMP/f102.tar -C $DIR/d102e
-       cd $DIR/d102e/$tdir
-       compare_stripe_info2
-}
-run_test 102e "star restore stripe info from tarfile, keep osts ==========="
+run_test 102d "tar restore stripe info from tarfile,not keep osts ==========="
  
  test_102f() {
-       # b10930: (s)tar test for trusted.lov xattr
+       # b10930: tar test for trusted.lov xattr
         TAR=$(find_lustre_tar)
-       [ -z "$TAR" ] && skip "lustre-aware (s)tar is not installed" && return
+       [ -z "$TAR" ] && skip "lustre-aware tar is not installed" && return
         [ "$OSTCOUNT" -lt "2" ] && skip "skipping N-stripe test" && return
         setup_test102
         mkdir -p $DIR/d102f
         cd $DIR
-       if [ "$TAR" == "tar" ]; then
-               TAR_OPTS="--xattrs"
-       fi
-       $TAR cf - $TAR_OPTS . | $TAR xf - $TAR_OPTS -C $DIR/d102f
+       $TAR cf - --xattrs $tdir | $TAR xf - --xattrs -C $DIR/d102f
         cd $DIR/d102f/$tdir
         compare_stripe_info1
  }
-run_test 102f "(s)tar copy files, not keep osts ==========="
-
-test_102g() {
-       # b10930: star test for trusted.lov xattr
-       TAR=$(find_lustre_tar)
-       [ "$TAR" != star ] && skip "lustre-aware star is not installed" && return
-       [ "$OSTCOUNT" -lt "2" ] && skip "skipping N-stripe test" && return
-       setup_test102
-       mkdir -p $DIR/d102g
-       cd $DIR
-       star -copy -preserve-osts $tdir $DIR/d102g
-       cd $DIR/d102g/$tdir
-       compare_stripe_info2
-       cleanup_test102
-}
-run_test 102g "star copy files, keep osts ==========="
+run_test 102f "tar copy files, not keep osts ==========="
  
  test_102h() { # bug 15777
         [ -z $(lctl get_param -n mdc.*.connect_flags | grep xattr) ] &&
@@ -4168,19 +4161,16 @@ run_test 102i "lgetxattr test on symbolic link ============"
  
  test_102j() {
         TAR=$(find_lustre_tar)
-       [ -z "$TAR" ] && skip "lustre-aware (s)tar is not installed" && return
+       [ -z "$TAR" ] && skip "lustre-aware tar is not installed" && return
         [ "$OSTCOUNT" -lt "2" ] && skip "skipping N-stripe test" && return
         setup_test102 "$RUNAS"
         mkdir -p $DIR/d102j
         chown $RUNAS_ID $DIR/d102j
-       if [ "$TAR" == "tar" ]; then
-               TAR_OPTS="--xattrs"
-       fi
-       $RUNAS $TAR xf $TMP/f102.tar -C $DIR/d102j $TAR_OPTS
+       $RUNAS $TAR xf $TMP/f102.tar -C $DIR/d102j --xattrs
         cd $DIR/d102j/$tdir
         compare_stripe_info1 "$RUNAS"
  }
-run_test 102j "non-root (s)tar restore stripe info from tarfile,not keep osts ="
+run_test 102j "non-root tar restore stripe info from tarfile, not keep osts ==="
  
  run_acl_subtest()
  {
@@ -5127,7 +5117,7 @@ test_123a() { # was test 123, statahead(bug 11401)
                  cancel_lru_locks mdc
                  cancel_lru_locks osc
                  stime=`date +%s`
-                time ls -l $DIR/$tdir > /dev/null
+                time ls -l $DIR/$tdir | wc -l
                  etime=`date +%s`
                  delta=$((etime - stime))
                  log "ls $i files without statahead: $delta sec"
@@ -5138,10 +5128,10 @@ test_123a() { # was test 123, statahead(bug 11401)
                  cancel_lru_locks mdc
                  cancel_lru_locks osc
                  stime=`date +%s`
-                time ls -l $DIR/$tdir > /dev/null
+                time ls -l $DIR/$tdir | wc -l
                  etime=`date +%s`
                  delta_sa=$((etime - stime))
-                log "ls $i files with statahead:    $delta_sa sec"
+                log "ls $i files with statahead: $delta_sa sec"
                 lctl get_param -n llite.*.statahead_stats
                  ewrong=`lctl get_param -n llite.*.statahead_stats | grep "statahead wrong:" | awk '{print $3}'`
  
@@ -5149,13 +5139,41 @@ test_123a() { # was test 123, statahead(bug 11401)
                          log "statahead was stopped, maybe too many locks held!"
                  fi
  
+                [ $delta -eq 0 ] && continue
+
                  if [ $((delta_sa * 100)) -gt $((delta * 105)) ]; then
                          if [  $SLOWOK -eq 0 ]; then
                                  error "ls $i files is slower with statahead!"
+
+                                max=`lctl get_param -n llite.*.statahead_max | head -n 1`
+                                lctl set_param -n llite.*.statahead_max 0
+                                lctl get_param llite.*.statahead_max
+                                cancel_lru_locks mdc
+                                cancel_lru_locks osc
+                                $LCTL dk > /dev/null
+                                stime=`date +%s`
+                                time ls -l $DIR/$tdir | wc -l
+                                etime=`date +%s`
+                                $LCTL dk > $TMP/sanity_test_123a_${i}_disable_${etime}.log
+                                delta=$((etime - stime))
+                                log "ls $i files without statahead: $delta sec, dump to $TMP/sanity_test_123a_${i}_disable_${etime}.log"
+                                lctl set_param llite.*.statahead_max=$max
+
+                                lctl get_param -n llite.*.statahead_max | grep '[0-9]'
+                                cancel_lru_locks mdc
+                                cancel_lru_locks osc
+                                $LCTL dk > /dev/null
+                                stime=`date +%s`
+                                time ls -l $DIR/$tdir | wc -l
+                                etime=`date +%s`
+                                $LCTL dk > $TMP/sanity_test_123a_${i}_enable_${etime}.log
+                                delta_sa=$((etime - stime))
+                                log "ls $i files with statahead: $delta_sa sec, dump to $TMP/sanity_test_123a_${i}_enable_${etime}.log"
+                               lctl get_param -n llite.*.statahead_stats
                          else
                                  log "ls $i files is slower with statahead!"
                          fi
-                        break;
+                        break
                  fi
  
                  [ $delta -gt 20 ] && break
@@ -5930,6 +5948,19 @@ err17935 () {
      fi
  }
  
+test_154() {
+       cp /etc/hosts $DIR/$tfile
+
+       fid=`$LFS path2fid $DIR/$tfile`
+       rc=$?
+       [ $rc -ne 0 ] && error "error: could not get fid for $DIR/$tfile."
+
+       diff $DIR/$tfile $DIR/.lustre/fid/$fid || error "open by fid failed: did not find expected data in file."
+
+       echo "Opening a file by FID succeeded"
+}
+run_test 154 "Opening a file by FID"
+
  #Changelogs
  test_160() {
      remote_mds && skip "remote MDS" && return
diff --git a/lustre/tests/sanityN.sh b/lustre/tests/sanityN.sh

index 983dc80..9008dce 100644 (file)
--- a/lustre/tests/sanityN.sh
+++ b/lustre/tests/sanityN.sh
@@ -804,8 +804,6 @@ test_34() { #16129
                  echo writing on client1
                  dd if=/dev/zero of=$DIR1/$tfile count=100 conv=notrunc > /dev/null 2>&1
                  sync &
-                # wait for the flush
-                sleep 1
                  echo reading on client2
                  dd of=/dev/null if=$DIR2/$tfile > /dev/null 2>&1
                  # wait for a lock timeout
@@ -884,6 +882,32 @@ test_35() { # bug 17645
  }
  run_test 35 "-EINTR cp_ast vs. bl_ast race does not evict client"
  
+test_36() { #bug 16417: read file000 via $MOUNT2 while $MOUNT1 unlinks it, 11 rounds
+    local SIZE before after read_pid   # keep scratch state out of the caller's scope
+    mkdir -p $MOUNT1/$tdir
+    lfs setstripe -c -1 $MOUNT1/$tdir
+    local i=0
+    SIZE=100
+
+    while [ $i -le 10 ]; do
+       lctl mark "start test"
+       before=$($LFS df | awk '{if ($1 ~/^filesystem/) {print $5; exit} }')
+       dd if=/dev/zero of=$MOUNT1/$tdir/file000 bs=1M count=$SIZE
+       dd if=$MOUNT2/$tdir/file000 of=/dev/null bs=1M count=$SIZE &
+       read_pid=$!
+       sleep 0.1       # give the background read a head start before the unlink
+       rm -f $MOUNT1/$tdir/file000
+       wait $read_pid
+       after=$($LFS df | awk '{if ($1 ~/^filesystem/) {print $5; exit} }')
+       if [ "$before" -gt "$after" ]; then   # field 5 of the summary line must not drop
+           error "space leaked"
+           return 1    # not "exit": that would end the whole calling shell, not just this test
+       fi
+       let i=i+1
+    done
+}
+run_test 36 "handle ESTALE/open-unlink corectly"
+
  log "cleanup: ======================================================"
  
  check_and_cleanup_lustre
diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh

index 32516c1..07430ff 100644 (file)
--- a/lustre/tests/test-framework.sh
+++ b/lustre/tests/test-framework.sh
@@ -128,7 +128,7 @@ init_test_env() {
      export TUNEFS=${TUNEFS:-"$LUSTRE/utils/tunefs.lustre"}
      [ ! -f "$TUNEFS" ] && export TUNEFS=$(which tunefs.lustre)
      export CHECKSTAT="${CHECKSTAT:-"checkstat -v"} "
-    export FSYTPE=${FSTYPE:-"ldiskfs"}
+    export FSTYPE=${FSTYPE:-"ldiskfs"}
      export NAME=${NAME:-local}
      export LGSSD=${LGSSD:-"$LUSTRE/utils/gss/lgssd"}
      [ "$GSS_PIPEFS" = "true" ] && [ ! -f "$LGSSD" ] && \
@@ -252,6 +252,7 @@ load_modules() {
      load_module mgc/mgc
      if [ -z "$CLIENTONLY" ] && [ -z "$CLIENTMODSONLY" ]; then
          grep -q crc16 /proc/kallsyms || { modprobe crc16 2>/dev/null || true; }
+        grep -q jbd /proc/kallsyms || { modprobe jbd 2>/dev/null || true; }
          [ "$FSTYPE" = "ldiskfs" ] && load_module ../ldiskfs/ldiskfs/ldiskfs
          load_module mgs/mgs
          load_module mds/mds
@@ -427,6 +428,10 @@ stop_gss_daemons() {
  init_gss() {
      if $GSS; then
          start_gss_daemons
+
+        if [ -n "$LGSS_KEYRING_DEBUG" ]; then
+            echo $LGSS_KEYRING_DEBUG > /proc/fs/lustre/sptlrpc/gss/lgss_keyring/debug_level
+        fi
      fi
  }
  
@@ -753,6 +758,34 @@ cleanup_check() {
      return 0
  }
  
+wait_update () { # args: node TEST FINAL [MAX=90]; returns 0 on match, 3 after MAX seconds
+    local node=$1
+    local TEST=$2
+    local FINAL=$3
+    local MAX=${4:-90}
+
+        local RESULT
+        local WAIT=0
+        local sleep=5
+        while [ $WAIT -lt $MAX ]; do
+            sleep $sleep
+            WAIT=$((WAIT + sleep))  # count the sleep first so the elapsed time reported below is exact
+            RESULT=$(do_node $node "$TEST")
+            if [ "$RESULT" -eq "$FINAL" ]; then  # quoted: empty/multi-word output must not break [
+                echo "Updated after $WAIT sec: wanted $FINAL got $RESULT"
+                return 0
+            fi
+            echo "Waiting $((MAX - WAIT)) secs for update"
+        done
+        echo "Update not seen after $MAX sec: wanted $FINAL got $RESULT"
+        return 3
+}
+
+wait_update_facet () { # args: facet TEST FINAL [MAX]; wait_update on the host of <facet>
+    local facet=$1; shift   # drop the facet so "$@" is exactly TEST FINAL [MAX]
+    wait_update  $(facet_host $facet) "$@"   # quoted so a multi-word TEST stays one argument
+}
+
  wait_delete_completed () {
      local TOTALPREV=`lctl get_param -n osc.*.kbytesavail | \
                       awk 'BEGIN{total=0}; {total+=$1}; END{print total}'`
@@ -772,14 +805,14 @@ wait_delete_completed () {
  }
  
  wait_for_host() {
-    HOST=$1
+    local HOST=$1
      check_network "$HOST" 900
      while ! do_node $HOST "ls -d $LUSTRE " > /dev/null; do sleep 5; done
  }
  
  wait_for() {
-    facet=$1
-    HOST=`facet_active_host $facet`
+    local facet=$1
+    local HOST=`facet_active_host $facet`
      wait_for_host $HOST
  }
  
@@ -788,8 +821,8 @@ wait_mds_recovery_done () {
  #define OBD_RECOVERY_TIMEOUT (obd_timeout * 5 / 2)
  # as we are in process of changing obd_timeout in different ways
  # let's set MAX longer than that
-    MAX=$(( timeout * 4 ))
-    WAIT=0
+    local MAX=$(( timeout * 4 ))
+    local WAIT=0
      while [ $WAIT -lt $MAX ]; do
          STATUS=`do_facet $SINGLEMDS "lctl get_param -n mdt.*-MDT0000.recovery_status | grep status"`
          echo $STATUS | grep COMPLETE && return 0
@@ -876,8 +909,8 @@ client_reconnect() {
  }
  
  facet_failover() {
-    facet=$1
-    sleep_time=$2
+    local facet=$1
+    local sleep_time=$2
      echo "Failing $facet on node `facet_active_host $facet`"
      shutdown_facet $facet
      [ -n "$sleep_time" ] && sleep $sleep_time
@@ -1292,16 +1325,6 @@ remount_client()
         zconf_mount `hostname` $1 || error "mount failed"
  }
  
-set_obd_timeout() {
-    local facet=$1
-    local timeout=$2
-
-    do_facet $facet lsmod | grep -q obdclass || \
-        do_facet $facet "modprobe obdclass"
-
-    do_facet $facet "lctl set_param timeout=$timeout"
-}
-
  writeconf_facet () {
      local facet=$1
      local dev=$2
@@ -1330,7 +1353,6 @@ setupall() {
              writeconf_all
          for num in `seq $MDSCOUNT`; do
              DEVNAME=$(mdsdevname $num)
-            set_obd_timeout mds$num $TIMEOUT
              start mds$num $DEVNAME $MDS_MOUNT_OPTS
  
              # We started mds, now we should set failover variables properly.
@@ -1346,7 +1368,6 @@ setupall() {
          done
          for num in `seq $OSTCOUNT`; do
              DEVNAME=$(ostdevname $num)
-            set_obd_timeout ost$num $TIMEOUT
              start ost$num $DEVNAME $OST_MOUNT_OPTS
  
              # We started ost$num, now we should set ost${num}failover variable properly.
@@ -1371,7 +1392,7 @@ setupall() {
          [ -n "$CLIENTS" ] && zconf_mount_clients $CLIENTS $MOUNT2
      fi
  
-    init_versions_vars
+    init_param_vars
  
      # by remounting mdt before ost, initial connect from mdt to ost might
      # timeout because ost is not ready yet. wait some time to its fully
@@ -1425,10 +1446,13 @@ init_facets_vars () {
      done
  }
  
-init_versions_vars () {
+init_param_vars () {
      export MDSVER=$(do_facet $SINGLEMDS "lctl get_param version" | cut -d. -f1,2)
      export OSTVER=$(do_facet ost1 "lctl get_param version" | cut -d. -f1,2)
      export CLIVER=$(lctl get_param version | cut -d. -f 1,2)
+
+    TIMEOUT=$(do_facet $SINGLEMDS "lctl get_param -n timeout")
+    log "Using TIMEOUT=$TIMEOUT"
  }
  
  check_config () {
@@ -1449,6 +1473,15 @@ check_config () {
      fi
  }
  
+check_timeout () {
+    local mdstimeout=$(do_facet $SINGLEMDS "lctl get_param -n timeout")
+    local cltimeout=$(lctl get_param -n timeout)
+    if [ $mdstimeout -ne $TIMEOUT ] || [ $mdstimeout -ne $cltimeout ]; then
+        error "timeouts are wrong! mds: $mdstimeout, client: $cltimeout, TIMEOUT=$TIMEOUT"
+        return 1
+    fi
+}
+
  check_and_setup_lustre() {
      local MOUNTED=$(mounted_lustre_filesystems)
      if [ -z "$MOUNTED" ] || ! $(echo $MOUNTED | grep -w -q $MOUNT); then
@@ -1460,7 +1493,7 @@ check_and_setup_lustre() {
      else
          check_config $MOUNT
          init_facets_vars
-        init_versions_vars
+        init_param_vars
      fi
      if [ "$ONLY" == "setup" ]; then
          exit 0
@@ -2231,25 +2264,9 @@ multiop_bg_pause() {
      return 0
  }
  
-check_rate() {
-    local OP=$1
-    local TARGET_RATE=$2
-    local NUM_CLIENTS=$3
-    local LOG=$4
-
-    local RATE=$(awk '/^Rate: [0-9\.]+ '"${OP}"'s\/sec/ { print $2}' ${LOG})
-
-    # We need to use bc since the rate is a floating point number
-    local RES=$(echo "${RATE} < ${TARGET_RATE}" | bc -l )
-    if [ "${RES}" = 0 ]; then
-        echo "Success: ${RATE} ${OP}s/sec met target rate" \
-             "${TARGET_RATE} ${OP}s/sec for ${NUM_CLIENTS} client(s)."
-        return 0
-    else
-        echo "Failure: ${RATE} ${OP}s/sec did not meet target rate" \
-             "${TARGET_RATE} ${OP}s/sec for ${NUM_CLIENTS} client(s)."
-        return 1
-    fi
+inodes_available () {
+    local IFree=$($LFS df -i $MOUNT | grep ^$FSNAME | awk '{print $4}' | sort -un | head -1) || return 1
+    echo $IFree
  }
  
  # reset llite stat counters
@@ -2369,3 +2386,7 @@ mpi_run () {
      eval $command
  }
  
+mdsrate_cleanup () {
+    mpi_run -np $1 -machinefile $2 ${MDSRATE} --unlink --nfiles $3 --dir $4 --filefmt $5
+}
+
diff --git a/lustre/utils/liblustreapi.c b/lustre/utils/liblustreapi.c

index 3927ca6..e0d8e1d 100644 (file)
--- a/lustre/utils/liblustreapi.c
+++ b/lustre/utils/liblustreapi.c
@@ -1525,8 +1525,8 @@ static int cb_find_init(char *path, DIR *parent, DIR *dir,
          }
  
  obd_matches:
-        /* If file still fits the request, ask osd for updated info.
-           The regulat stat is almost of the same speed as some new
+        /* If file still fits the request, ask ost for updated info.
+           The regular stat is almost of the same speed as some new
             'glimpse-size-ioctl'. */
          if (!decision && S_ISREG(st->st_mode) &&
              (param->lmd->lmd_lmm.lmm_stripe_count || param->size)) {
diff --git a/lustre/utils/obd.c b/lustre/utils/obd.c

index a408a9d..8945f11 100644 (file)
--- a/lustre/utils/obd.c
+++ b/lustre/utils/obd.c
@@ -103,9 +103,7 @@ const int thread = 0;
  const int nthreads = 1;
  #endif
  
-static char rawbuf[8192];
-static char *buf = rawbuf;
-static int max = sizeof(rawbuf);
+#define MAX_IOC_BUFLEN 8192
  
  static int cur_device = -1;
  
@@ -122,42 +120,25 @@ static int l2_ioctl(int dev_id, int opc, void *buf)
          return l_ioctl(dev_id, opc, buf);
  }
  
-#define IOC_INIT(data)                                                  \
-do {                                                                    \
-        memset(&data, 0, sizeof(data));                                 \
-        data.ioc_dev = cur_device;                                      \
-} while (0)
-
-#define IOC_PACK(func, data)                                            \
-do {                                                                    \
-        memset(buf, 0, sizeof(rawbuf));                                 \
-        if (obd_ioctl_pack(&data, &buf, max)) {                         \
-                fprintf(stderr, "error: %s: invalid ioctl\n",           \
-                        jt_cmdname(func));                                 \
-                return -2;                                              \
-        }                                                               \
-} while (0)
-
-#define IOC_UNPACK(func, data)                                          \
-do {                                                                    \
-        if (obd_ioctl_unpack(&data, buf, max)) {                        \
-                fprintf(stderr, "error: %s: invalid reply\n",           \
-                        jt_cmdname(func));                                 \
-                return -2;                                              \
-        }                                                               \
-} while (0)
-
  int lcfg_ioctl(char * func, int dev_id, struct lustre_cfg *lcfg)
  {
          struct obd_ioctl_data data;
+        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
          int rc;
  
-        IOC_INIT(data);
+        memset(&data, 0x00, sizeof(data));
+        data.ioc_dev = cur_device;
          data.ioc_type = LUSTRE_CFG_TYPE;
          data.ioc_plen1 = lustre_cfg_len(lcfg->lcfg_bufcount,
                                          lcfg->lcfg_buflens);
          data.ioc_pbuf1 = (void *)lcfg;
-        IOC_PACK(func, data);
+        memset(buf, 0, sizeof(rawbuf));
+        rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
+        if (rc) {
+                fprintf(stderr, "error: %s: invalid ioctl\n",
+                        jt_cmdname(func));
+                return rc;
+        }
  
          rc =  l_ioctl(dev_id, OBD_IOC_PROCESS_CFG, buf);
  
@@ -190,9 +171,10 @@ static int get_mgs_device()
  int lcfg_mgs_ioctl(char *func, int dev_id, struct lustre_cfg *lcfg)
  {
          struct obd_ioctl_data data;
+        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
          int rc;
  
-        IOC_INIT(data);
+        memset(&data, 0x00, sizeof(data));
          rc = data.ioc_dev = get_mgs_device();
          if (rc < 0)
                  goto out;
@@ -200,7 +182,13 @@ int lcfg_mgs_ioctl(char *func, int dev_id, struct lustre_cfg *lcfg)
          data.ioc_plen1 = lustre_cfg_len(lcfg->lcfg_bufcount,
                                          lcfg->lcfg_buflens);
          data.ioc_pbuf1 = (void *)lcfg;
-        IOC_PACK(func, data);
+        memset(buf, 0, sizeof(rawbuf));
+        rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
+        if (rc) {
+                fprintf(stderr, "error: %s: invalid ioctl\n",
+                        jt_cmdname(func));
+                return rc;
+        }
  
          rc = l_ioctl(dev_id, OBD_IOC_PARAM, buf);
  out:
@@ -234,18 +222,30 @@ char *obdo_print(struct obdo *obd)
  static int do_name2dev(char *func, char *name)
  {
          struct obd_ioctl_data data;
+        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
          int rc;
  
-        IOC_INIT(data);
-
+        memset(&data, 0x00, sizeof(data));
+        data.ioc_dev = cur_device;
          data.ioc_inllen1 = strlen(name) + 1;
          data.ioc_inlbuf1 = name;
  
-        IOC_PACK(func, data);
+        memset(buf, 0, sizeof(rawbuf));
+        rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
+        if (rc) {
+                fprintf(stderr, "error: %s: invalid ioctl\n",
+                        jt_cmdname(func));
+                return rc;
+        }
          rc = l2_ioctl(OBD_DEV_ID, OBD_IOC_NAME2DEV, buf);
          if (rc < 0)
                  return errno;
-        IOC_UNPACK(func, data);
+        rc = obd_ioctl_unpack(&data, buf, sizeof(rawbuf));
+        if (rc) {
+                fprintf(stderr, "error: %s: invalid reply\n",
+                        jt_cmdname(func));
+                return rc;
+        }
  
          return data.ioc_dev + N2D_OFF;
  }
@@ -272,7 +272,7 @@ int parse_devname(char *func, char *name)
                          // printf("Name %s is device %d\n", name, ret);
                  } else {
                          fprintf(stderr, "No device found for name %s: %s\n",
-                               name, strerror(rc));
+                                name, strerror(rc));
                  }
          }
          return ret;
@@ -820,14 +820,22 @@ int jt_opt_net(int argc, char **argv)
  int jt_obd_no_transno(int argc, char **argv)
  {
          struct obd_ioctl_data data;
+        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
          int rc;
  
-        IOC_INIT(data);
+        memset(&data, 0x00, sizeof(data));
+        data.ioc_dev = cur_device;
  
          if (argc != 1)
                  return CMD_HELP;
  
-        IOC_PACK(argv[0], data);
+        memset(buf, 0, sizeof(rawbuf));
+        rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
+        if (rc) {
+                fprintf(stderr, "error: %s: invalid ioctl\n",
+                        jt_cmdname(argv[0]));
+                return rc;
+        }
          rc = l2_ioctl(OBD_DEV_ID, OBD_IOC_NO_TRANSNO, buf);
          if (rc < 0)
                  fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]),
@@ -839,14 +847,22 @@ int jt_obd_no_transno(int argc, char **argv)
  int jt_obd_set_readonly(int argc, char **argv)
  {
          struct obd_ioctl_data data;
+        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
          int rc;
  
-        IOC_INIT(data);
+        memset(&data, 0x00, sizeof(data));
+        data.ioc_dev = cur_device;
  
          if (argc != 1)
                  return CMD_HELP;
  
-        IOC_PACK(argv[0], data);
+        memset(buf, 0, sizeof(rawbuf));
+        rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
+        if (rc) {
+                fprintf(stderr, "error: %s: invalid ioctl\n",
+                        jt_cmdname(argv[0]));
+                return rc;
+        }
          rc = l2_ioctl(OBD_DEV_ID, OBD_IOC_SET_READONLY, buf);
          if (rc < 0)
                  fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]),
@@ -858,14 +874,22 @@ int jt_obd_set_readonly(int argc, char **argv)
  int jt_obd_abort_recovery(int argc, char **argv)
  {
          struct obd_ioctl_data data;
+        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
          int rc;
  
-        IOC_INIT(data);
+        memset(&data, 0x00, sizeof(data));
+        data.ioc_dev = cur_device;
  
          if (argc != 1)
                  return CMD_HELP;
  
-        IOC_PACK(argv[0], data);
+        memset(buf, 0, sizeof(rawbuf));
+        rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
+        if (rc) {
+                fprintf(stderr, "error: %s: invalid ioctl\n",
+                        jt_cmdname(argv[0]));
+                return rc;
+        }
          rc = l2_ioctl(OBD_DEV_ID, OBD_IOC_ABORT_RECOVERY, buf);
          if (rc < 0)
                  fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]),
@@ -877,15 +901,15 @@ int jt_obd_abort_recovery(int argc, char **argv)
  int jt_get_version(int argc, char **argv)
  {
          int rc;
-        char buf[8192];
+        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
          struct obd_ioctl_data *data = (struct obd_ioctl_data *)buf;
  
          if (argc != 1)
                  return CMD_HELP;
  
-        memset(buf, 0, sizeof(buf));
+        memset(buf, 0, sizeof(rawbuf));
          data->ioc_version = OBD_IOCTL_VERSION;
-        data->ioc_inllen1 = sizeof(buf) - size_round(sizeof(*data));
+        data->ioc_inllen1 = sizeof(rawbuf) - size_round(sizeof(*data));
          data->ioc_inlbuf1 = buf + size_round(sizeof(*data));
          data->ioc_len = obd_ioctl_packlen(data);
  
@@ -950,7 +974,7 @@ fail:
  int jt_obd_list_ioctl(int argc, char **argv)
  {
          int rc, index;
-        char buf[8192];
+        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
          struct obd_ioctl_data *data = (struct obd_ioctl_data *)buf;
  
          if (argc > 2)
@@ -960,9 +984,9 @@ int jt_obd_list_ioctl(int argc, char **argv)
                  return CMD_HELP;
  
          for (index = 0;; index++) {
-                memset(buf, 0, sizeof(buf));
+                memset(buf, 0, sizeof(rawbuf));
                  data->ioc_version = OBD_IOCTL_VERSION;
-                data->ioc_inllen1 = sizeof(buf) - size_round(sizeof(*data));
+                data->ioc_inllen1 = sizeof(rawbuf) - size_round(sizeof(*data));
                  data->ioc_inlbuf1 = buf + size_round(sizeof(*data));
                  data->ioc_len = obd_ioctl_packlen(data);
                  data->ioc_count = index;
@@ -978,8 +1002,7 @@ int jt_obd_list_ioctl(int argc, char **argv)
                          rc = 0;
                  else
                          fprintf(stderr, "Error getting device list: %s: "
-                                        "check dmesg.\n",
-                                        strerror(errno));
+                                "check dmesg.\n", strerror(errno));
          }
          return rc;
  }
@@ -1017,9 +1040,6 @@ int jt_obd_list(int argc, char **argv)
          return 0;
  }
  
-
-
-
  /* Create one or more objects, arg[4] may describe stripe meta-data.  If
   * not, defaults assumed.  This echo-client instance stashes the stripe
   * object ids.  Use get_stripe on this node to print full lsm and
@@ -1028,13 +1048,15 @@ int jt_obd_list(int argc, char **argv)
  /* create <count> [<file_create_mode>] [q|v|# verbosity] [striping] */
  int jt_obd_create(int argc, char **argv)
  {
+        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
          struct obd_ioctl_data data;
          struct timeval next_time;
          __u64 count = 1, next_count, base_id = 0;
          int verbose = 1, mode = 0100644, rc = 0, i, valid_lsm = 0;
          char *end;
  
-        IOC_INIT(data);
+        memset(&data, 0x00, sizeof(data));
+        data.ioc_dev = cur_device;
          if (argc < 2 || argc > 5)
                  return CMD_HELP;
  
@@ -1092,9 +1114,15 @@ int jt_obd_create(int argc, char **argv)
                          data.ioc_pbuf1 = (char *)&lsm_buffer;
                  }
  
-                IOC_PACK(argv[0], data);
+                memset(buf, 0, sizeof(rawbuf));
+                rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
+                if (rc) {
+                        fprintf(stderr, "error: %s: invalid ioctl\n",
+                                jt_cmdname(argv[0]));
+                        return rc;
+                }
                  rc = l2_ioctl(OBD_DEV_ID, OBD_IOC_CREATE, buf);
-                IOC_UNPACK(argv[0], data);
+                obd_ioctl_unpack(&data, buf, sizeof(rawbuf));
                  shmem_bump();
                  if (rc < 0) {
                          fprintf(stderr, "error: %s: #%d - %s\n",
@@ -1118,10 +1146,12 @@ int jt_obd_create(int argc, char **argv)
  int jt_obd_setattr(int argc, char **argv)
  {
          struct obd_ioctl_data data;
+        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
          char *end;
          int rc;
  
-        IOC_INIT(data);
+        memset(&data, 0x00, sizeof(data));
+        data.ioc_dev = cur_device;
          if (argc != 2)
                  return CMD_HELP;
  
@@ -1139,7 +1169,13 @@ int jt_obd_setattr(int argc, char **argv)
          }
          data.ioc_obdo1.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMODE;
  
-        IOC_PACK(argv[0], data);
+        memset(buf, 0, sizeof(rawbuf));
+        rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
+        if (rc) {
+                fprintf(stderr, "error: %s: invalid ioctl\n",
+                        jt_cmdname(argv[0]));
+                return rc;
+        }
          rc = l2_ioctl(OBD_DEV_ID, OBD_IOC_SETATTR, buf);
          if (rc < 0)
                  fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]),
@@ -1153,6 +1189,7 @@ int jt_obd_test_setattr(int argc, char **argv)
          struct obd_ioctl_data data;
          struct timeval start, next_time;
          __u64 i, count, next_count;
+        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
          int verbose = 1;
          obd_id objid = 3;
          char *end;
@@ -1161,7 +1198,8 @@ int jt_obd_test_setattr(int argc, char **argv)
          if (argc < 2 || argc > 4)
                  return CMD_HELP;
  
-        IOC_INIT(data);
+        memset(&data, 0x00, sizeof(data));
+        data.ioc_dev = cur_device;
          count = strtoull(argv[1], &end, 0);
          if (*end) {
                  fprintf(stderr, "error: %s: invalid iteration count '%s'\n",
@@ -1200,7 +1238,13 @@ int jt_obd_test_setattr(int argc, char **argv)
                  data.ioc_obdo1.o_id = objid;
                  data.ioc_obdo1.o_mode = S_IFREG;
                  data.ioc_obdo1.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMODE;
-                IOC_PACK(argv[0], data);
+                memset(buf, 0x00, sizeof(rawbuf));
+                rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
+                if (rc) {
+                        fprintf(stderr, "error: %s: invalid ioctl\n",
+                                jt_cmdname(argv[0]));
+                        return rc;
+                }
                  rc = l2_ioctl(OBD_DEV_ID, OBD_IOC_SETATTR, &data);
                  shmem_bump();
                  if (rc < 0) {
@@ -1236,13 +1280,15 @@ int jt_obd_destroy(int argc, char **argv)
  {
          struct obd_ioctl_data data;
          struct timeval next_time;
+        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
          __u64 count = 1, next_count;
          int verbose = 1;
          __u64 id;
          char *end;
          int rc = 0, i;
  
-        IOC_INIT(data);
+        memset(&data, 0x00, sizeof(data));
+        data.ioc_dev = cur_device;
          if (argc < 2 || argc > 4)
                  return CMD_HELP;
  
@@ -1277,9 +1323,15 @@ int jt_obd_destroy(int argc, char **argv)
                  data.ioc_obdo1.o_mode = S_IFREG | 0644;
                  data.ioc_obdo1.o_valid = OBD_MD_FLID | OBD_MD_FLMODE;
  
-                IOC_PACK(argv[0], data);
+                memset(buf, 0, sizeof(rawbuf));
+                rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
+                if (rc) {
+                        fprintf(stderr, "error: %s: invalid ioctl\n",
+                                jt_cmdname(argv[0]));
+                        return rc;
+                }
                  rc = l2_ioctl(OBD_DEV_ID, OBD_IOC_DESTROY, buf);
-                IOC_UNPACK(argv[0], data);
+                obd_ioctl_unpack(&data, buf, sizeof(rawbuf));
                  shmem_bump();
                  if (rc < 0) {
                          fprintf(stderr, "error: %s: objid "LPX64": %s\n",
@@ -1298,13 +1350,15 @@ int jt_obd_destroy(int argc, char **argv)
  int jt_obd_getattr(int argc, char **argv)
  {
          struct obd_ioctl_data data;
+        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
          char *end;
          int rc;
  
          if (argc != 2)
                  return CMD_HELP;
  
-        IOC_INIT(data);
+        memset(&data, 0x00, sizeof(data));
+        data.ioc_dev = cur_device;
          data.ioc_obdo1.o_id = strtoull(argv[1], &end, 0);
          if (*end) {
                  fprintf(stderr, "error: %s: invalid objid '%s'\n",
@@ -1316,9 +1370,15 @@ int jt_obd_getattr(int argc, char **argv)
          data.ioc_obdo1.o_valid = 0xffffffff;
          printf("%s: object id "LPX64"\n", jt_cmdname(argv[0]),data.ioc_obdo1.o_id);
  
-        IOC_PACK(argv[0], data);
+        memset(buf, 0, sizeof(rawbuf));
+        rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
+        if (rc) {
+                fprintf(stderr, "error: %s: invalid ioctl\n",
+                        jt_cmdname(argv[0]));
+                return rc;
+        }
          rc = l2_ioctl(OBD_DEV_ID, OBD_IOC_GETATTR, buf);
-        IOC_UNPACK(argv[0], data);
+        obd_ioctl_unpack(&data, buf, sizeof(rawbuf));
          if (rc) {
                  fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]),
                          strerror(rc = errno));
@@ -1333,6 +1393,7 @@ int jt_obd_test_getattr(int argc, char **argv)
  {
          struct obd_ioctl_data data;
          struct timeval start, next_time;
+        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
          __u64 i, count, next_count;
          int verbose = 1;
          obd_id objid = 3;
@@ -1342,7 +1403,8 @@ int jt_obd_test_getattr(int argc, char **argv)
          if (argc < 2 || argc > 4)
                  return CMD_HELP;
  
-        IOC_INIT(data);
+        memset(&data, 0x00, sizeof(data));
+        data.ioc_dev = cur_device;
          count = strtoull(argv[1], &end, 0);
          if (*end) {
                  fprintf(stderr, "error: %s: invalid iteration count '%s'\n",
@@ -1381,7 +1443,13 @@ int jt_obd_test_getattr(int argc, char **argv)
                  data.ioc_obdo1.o_id = objid;
                  data.ioc_obdo1.o_mode = S_IFREG;
                  data.ioc_obdo1.o_valid = 0xffffffff;
-                IOC_PACK(argv[0], data);
+                memset(buf, 0x00, sizeof(rawbuf));
+                rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
+                if (rc) {
+                        fprintf(stderr, "error: %s: invalid ioctl\n",
+                                jt_cmdname(argv[0]));
+                        return rc;
+                }
                  rc = l2_ioctl(OBD_DEV_ID, OBD_IOC_GETATTR, &data);
                  shmem_bump();
                  if (rc < 0) {
@@ -1423,6 +1491,7 @@ int jt_obd_test_brw(int argc, char **argv)
  {
          struct obd_ioctl_data data;
          struct timeval start, next_time;
+        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
          __u64 count, next_count, len, stride, thr_offset = 0, objid = 3;
          int write = 0, verbose = 1, cmd, i, rc = 0, pages = 1;
          int offset_pages = 0;
@@ -1513,7 +1582,8 @@ int jt_obd_test_brw(int argc, char **argv)
                  }
          }
  
-        IOC_INIT(data);
+        memset(&data, 0x00, sizeof(data));
+        data.ioc_dev = cur_device;
  
          /* communicate the 'type' of brw test and batching to echo_client.
           * don't start.  we'd love to refactor this lctl->echo_client
@@ -1598,7 +1668,13 @@ int jt_obd_test_brw(int argc, char **argv)
          cmd = write ? OBD_IOC_BRW_WRITE : OBD_IOC_BRW_READ;
          for (i = 1, next_count = verbose; i <= count && shmem_running(); i++) {
                  data.ioc_obdo1.o_valid &= ~(OBD_MD_FLBLOCKS|OBD_MD_FLGRANT);
-                IOC_PACK(argv[0], data);
+                memset(buf, 0x00, sizeof(rawbuf));
+                rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
+                if (rc) {
+                        fprintf(stderr, "error: %s: invalid ioctl\n",
+                                jt_cmdname(argv[0]));
+                        return rc;
+                }
                  rc = l2_ioctl(OBD_DEV_ID, cmd, buf);
                  shmem_bump();
                  if (rc) {
@@ -1658,11 +1734,13 @@ int jt_obd_lov_getconfig(int argc, char **argv)
          struct obd_ioctl_data data;
          struct lov_desc desc;
          struct obd_uuid *uuidarray;
+        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
          __u32 *obdgens;
          char *path;
          int rc, fd;
  
-        IOC_INIT(data);
+        memset(&data, 0x00, sizeof(data));
+        data.ioc_dev = cur_device;
  
          if (argc != 2)
                  return CMD_HELP;
@@ -1680,7 +1758,6 @@ int jt_obd_lov_getconfig(int argc, char **argv)
          desc.ld_tgt_count = ((OBD_MAX_IOCTL_BUFFER-sizeof(data)-sizeof(desc)) /
                               (sizeof(*uuidarray) + sizeof(*obdgens)));
  
-
  repeat:
          uuidarray = calloc(desc.ld_tgt_count, sizeof(*uuidarray));
          if (!uuidarray) {
@@ -1697,6 +1774,7 @@ repeat:
                  goto out_uuidarray;
          }
  
+        memset(buf, 0x00, sizeof(rawbuf));
          data.ioc_inllen1 = sizeof(desc);
          data.ioc_inlbuf1 = (char *)&desc;
          data.ioc_inllen2 = desc.ld_tgt_count * sizeof(*uuidarray);
@@ -1704,7 +1782,7 @@ repeat:
          data.ioc_inllen3 = desc.ld_tgt_count * sizeof(*obdgens);
          data.ioc_inlbuf3 = (char *)obdgens;
  
-        if (obd_ioctl_pack(&data, &buf, max)) {
+        if (obd_ioctl_pack(&data, &buf, sizeof(rawbuf))) {
                  fprintf(stderr, "error: %s: invalid ioctl\n",
                          jt_cmdname(argv[0]));
                  rc = -EINVAL;
@@ -1723,7 +1801,7 @@ repeat:
                  __u32 *genp;
                  int i;
  
-                if (obd_ioctl_unpack(&data, buf, max)) {
+                if (obd_ioctl_unpack(&data, buf, sizeof(rawbuf))) {
                          fprintf(stderr, "error: %s: invalid reply\n",
                                  jt_cmdname(argv[0]));
                          rc = -EINVAL;
@@ -1759,10 +1837,12 @@ int jt_obd_ldlm_regress_start(int argc, char **argv)
  {
          int rc;
          struct obd_ioctl_data data;
+        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
          char argstring[200];
          int i, count = sizeof(argstring) - 1;
  
-        IOC_INIT(data);
+        memset(&data, 0x00, sizeof(data));
+        data.ioc_dev = cur_device;
          if (argc > 5)
                  return CMD_HELP;
  
@@ -1779,7 +1859,13 @@ int jt_obd_ldlm_regress_start(int argc, char **argv)
                  data.ioc_inllen1 = strlen(argstring) + 1;
          }
  
-        IOC_PACK(argv[0], data);
+        memset(buf, 0, sizeof(rawbuf));
+        rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
+        if (rc) {
+                fprintf(stderr, "error: %s: invalid ioctl\n",
+                        jt_cmdname(argv[0]));
+                return rc;
+        }
          rc = l2_ioctl(OBD_DEV_ID, IOC_LDLM_REGRESS_START, buf);
          if (rc)
                  fprintf(stderr, "error: %s: test failed: %s\n",
@@ -1791,13 +1877,22 @@ int jt_obd_ldlm_regress_start(int argc, char **argv)
  int jt_obd_ldlm_regress_stop(int argc, char **argv)
  {
          int rc;
+        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
          struct obd_ioctl_data data;
-        IOC_INIT(data);
+
+        memset(&data, 0x00, sizeof(data));
+        data.ioc_dev = cur_device;
  
          if (argc != 1)
                  return CMD_HELP;
  
-        IOC_PACK(argv[0], data);
+        memset(buf, 0, sizeof(rawbuf));
+        rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
+        if (rc) {
+                fprintf(stderr, "error: %s: invalid ioctl\n",
+                        jt_cmdname(argv[0]));
+                return rc;
+        }
          rc = l2_ioctl(OBD_DEV_ID, IOC_LDLM_REGRESS_STOP, buf);
  
          if (rc)
@@ -1809,16 +1904,24 @@ int jt_obd_ldlm_regress_stop(int argc, char **argv)
  static int do_activate(int argc, char **argv, int flag)
  {
          struct obd_ioctl_data data;
+        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
          int rc;
  
-        IOC_INIT(data);
+        memset(&data, 0x00, sizeof(data));
+        data.ioc_dev = cur_device;
          if (argc != 1)
                  return CMD_HELP;
  
          /* reuse offset for 'active' */
          data.ioc_offset = flag;
  
-        IOC_PACK(argv[0], data);
+        memset(buf, 0, sizeof(rawbuf));
+        rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
+        if (rc) {
+                fprintf(stderr, "error: %s: invalid ioctl\n",
+                        jt_cmdname(argv[0]));
+                return rc;
+        }
          rc = l2_ioctl(OBD_DEV_ID, IOC_OSC_SET_ACTIVE, buf);
          if (rc)
                  fprintf(stderr, "error: %s: failed: %s\n",
@@ -1840,9 +1943,11 @@ int jt_obd_activate(int argc, char **argv)
  int jt_obd_recover(int argc, char **argv)
  {
          int rc;
+        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
          struct obd_ioctl_data data;
  
-        IOC_INIT(data);
+        memset(&data, 0x00, sizeof(data));
+        data.ioc_dev = cur_device;
          if (argc > 2)
                  return CMD_HELP;
  
@@ -1851,7 +1956,13 @@ int jt_obd_recover(int argc, char **argv)
                  data.ioc_inlbuf1 = argv[1];
          }
  
-        IOC_PACK(argv[0], data);
+        memset(buf, 0, sizeof(rawbuf));
+        rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
+        if (rc) {
+                fprintf(stderr, "error: %s: invalid ioctl\n",
+                        jt_cmdname(argv[0]));
+                return rc;
+        }
          rc = l2_ioctl(OBD_DEV_ID, OBD_IOC_CLIENT_RECOVER, buf);
          if (rc < 0) {
                  fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]),
@@ -1864,6 +1975,7 @@ int jt_obd_recover(int argc, char **argv)
  int jt_obd_mdc_lookup(int argc, char **argv)
  {
          struct obd_ioctl_data data;
+        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
          char *parent, *child;
          int rc, fd, verbose = 1;
  
@@ -1875,12 +1987,19 @@ int jt_obd_mdc_lookup(int argc, char **argv)
          if (argc == 4)
                  verbose = get_verbose(argv[0], argv[3]);
  
-        IOC_INIT(data);
+        memset(&data, 0x00, sizeof(data));
+        data.ioc_dev = cur_device;
  
          data.ioc_inllen1 = strlen(child) + 1;
          data.ioc_inlbuf1 = child;
  
-        IOC_PACK(argv[0], data);
+        memset(buf, 0, sizeof(rawbuf));
+        rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
+        if (rc) {
+                fprintf(stderr, "error: %s: invalid ioctl\n",
+                        jt_cmdname(argv[0]));
+                return rc;
+        }
  
          fd = open(parent, O_RDONLY);
          if (fd < 0) {
@@ -1897,7 +2016,12 @@ int jt_obd_mdc_lookup(int argc, char **argv)
          close(fd);
  
          if (verbose) {
-                IOC_UNPACK(argv[0], data);
+                rc = obd_ioctl_unpack(&data, buf, sizeof(rawbuf));
+                if (rc) {
+                        fprintf(stderr, "error: %s: invalid reply\n",
+                                jt_cmdname(argv[0]));
+                        return rc;
+                }
                  printf("%s: mode %o uid %d gid %d\n", child,
                         data.ioc_obdo1.o_mode, data.ioc_obdo1.o_uid,
                         data.ioc_obdo1.o_gid);
@@ -1909,16 +2033,24 @@ int jt_obd_mdc_lookup(int argc, char **argv)
  int jt_cfg_dump_log(int argc, char **argv)
  {
          struct obd_ioctl_data data;
+        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
          int rc;
  
          if (argc != 2)
                  return CMD_HELP;
  
-        IOC_INIT(data);
+        memset(&data, 0x00, sizeof(data));
+        data.ioc_dev = cur_device;
          data.ioc_inllen1 = strlen(argv[1]) + 1;
          data.ioc_inlbuf1 = argv[1];
  
-        IOC_PACK(argv[0], data);
+        memset(buf, 0, sizeof(rawbuf));
+        rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
+        if (rc) {
+                fprintf(stderr, "error: %s: invalid ioctl\n",
+                        jt_cmdname(argv[0]));
+                return rc;
+        }
          rc = l_ioctl(OBD_DEV_ID, OBD_IOC_DUMP_LOG, buf);
          if (rc < 0)
                  fprintf(stderr, "OBD_IOC_DUMP_LOG failed: %s\n",
@@ -1930,15 +2062,22 @@ int jt_cfg_dump_log(int argc, char **argv)
  int jt_llog_catlist(int argc, char **argv)
  {
          struct obd_ioctl_data data;
+        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
          int rc;
  
          if (argc != 1)
                  return CMD_HELP;
  
-        IOC_INIT(data);
-        data.ioc_inllen1 = max - size_round(sizeof(data));
-        IOC_PACK(argv[0], data);
-
+        memset(&data, 0x00, sizeof(data));
+        data.ioc_dev = cur_device;
+        data.ioc_inllen1 = sizeof(rawbuf) - size_round(sizeof(data));
+        memset(buf, 0, sizeof(rawbuf));
+        rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
+        if (rc) {
+                fprintf(stderr, "error: %s: invalid ioctl\n",
+                        jt_cmdname(argv[0]));
+                return rc;
+        }
          rc = l_ioctl(OBD_DEV_ID, OBD_IOC_CATLOGLIST, buf);
          if (rc == 0)
                  fprintf(stdout, "%s", ((struct obd_ioctl_data*)buf)->ioc_bulk);
@@ -1952,17 +2091,25 @@ int jt_llog_catlist(int argc, char **argv)
  int jt_llog_info(int argc, char **argv)
  {
          struct obd_ioctl_data data;
+        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
          int rc;
  
          if (argc != 2)
                  return CMD_HELP;
  
-        IOC_INIT(data);
+        memset(&data, 0x00, sizeof(data));
+        data.ioc_dev = cur_device;
          data.ioc_inllen1 = strlen(argv[1]) + 1;
          data.ioc_inlbuf1 = argv[1];
-        data.ioc_inllen2 = max - size_round(sizeof(data)) -
+        data.ioc_inllen2 = sizeof(rawbuf) - size_round(sizeof(data)) -
                  size_round(data.ioc_inllen1);
-        IOC_PACK(argv[0], data);
+        memset(buf, 0, sizeof(rawbuf));
+        rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
+        if (rc) {
+                fprintf(stderr, "error: %s: invalid ioctl\n",
+                        jt_cmdname(argv[0]));
+                return rc;
+        }
  
          rc = l_ioctl(OBD_DEV_ID, OBD_IOC_LLOG_INFO, buf);
          if (rc == 0)
@@ -1977,12 +2124,14 @@ int jt_llog_info(int argc, char **argv)
  int jt_llog_print(int argc, char **argv)
  {
          struct obd_ioctl_data data;
+        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
          int rc;
  
          if (argc != 2 && argc != 4)
                  return CMD_HELP;
  
-        IOC_INIT(data);
+        memset(&data, 0x00, sizeof(data));
+        data.ioc_dev = cur_device;
          data.ioc_inllen1 = strlen(argv[1]) + 1;
          data.ioc_inlbuf1 = argv[1];
          if (argc == 4) {
@@ -1997,11 +2146,17 @@ int jt_llog_print(int argc, char **argv)
                  data.ioc_inllen3 = strlen(to) + 1;
                  data.ioc_inlbuf3 = to;
          }
-        data.ioc_inllen4 = max - size_round(sizeof(data)) -
+        data.ioc_inllen4 = sizeof(rawbuf) - size_round(sizeof(data)) -
                  size_round(data.ioc_inllen1) -
                  size_round(data.ioc_inllen2) -
                  size_round(data.ioc_inllen3);
-        IOC_PACK(argv[0], data);
+        memset(buf, 0, sizeof(rawbuf));
+        rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
+        if (rc) {
+                fprintf(stderr, "error: %s: invalid ioctl\n",
+                        jt_cmdname(argv[0]));
+                return rc;
+        }
  
          rc = l_ioctl(OBD_DEV_ID, OBD_IOC_LLOG_PRINT, buf);
          if (rc == 0)
@@ -2016,19 +2171,27 @@ int jt_llog_print(int argc, char **argv)
  int jt_llog_cancel(int argc, char **argv)
  {
          struct obd_ioctl_data data;
+        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
          int rc;
  
          if (argc != 4)
                  return CMD_HELP;
  
-        IOC_INIT(data);
+        memset(&data, 0x00, sizeof(data));
+        data.ioc_dev = cur_device;
          data.ioc_inllen1 = strlen(argv[1]) + 1;
          data.ioc_inlbuf1 = argv[1];
          data.ioc_inllen2 = strlen(argv[2]) + 1;
          data.ioc_inlbuf2 = argv[2];
          data.ioc_inllen3 = strlen(argv[3]) + 1;
          data.ioc_inlbuf3 = argv[3];
-        IOC_PACK(argv[0], data);
+        memset(buf, 0, sizeof(rawbuf));
+        rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
+        if (rc) {
+                fprintf(stderr, "error: %s: invalid ioctl\n",
+                        jt_cmdname(argv[0]));
+                return rc;
+        }
  
          rc = l_ioctl(OBD_DEV_ID, OBD_IOC_LLOG_CANCEL, buf);
          if (rc == 0)
@@ -2043,12 +2206,14 @@ int jt_llog_cancel(int argc, char **argv)
  int jt_llog_check(int argc, char **argv)
  {
          struct obd_ioctl_data data;
+        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
          int rc;
  
          if (argc != 2 && argc != 4)
                  return CMD_HELP;
  
-        IOC_INIT(data);
+        memset(&data, 0x00, sizeof(data));
+        data.ioc_dev = cur_device;
          data.ioc_inllen1 = strlen(argv[1]) + 1;
          data.ioc_inlbuf1 = argv[1];
          if (argc == 4) {
@@ -2063,11 +2228,17 @@ int jt_llog_check(int argc, char **argv)
                  data.ioc_inllen3 = strlen(to) + 1;
                  data.ioc_inlbuf3 = to;
          }
-        data.ioc_inllen4 = max - size_round(sizeof(data)) -
+        data.ioc_inllen4 = sizeof(rawbuf) - size_round(sizeof(data)) -
                  size_round(data.ioc_inllen1) -
                  size_round(data.ioc_inllen2) -
                  size_round(data.ioc_inllen3);
-        IOC_PACK(argv[0], data);
+        memset(buf, 0, sizeof(rawbuf));
+        rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
+        if (rc) {
+                fprintf(stderr, "error: %s: invalid ioctl\n",
+                        jt_cmdname(argv[0]));
+                return rc;
+        }
  
          rc = l_ioctl(OBD_DEV_ID, OBD_IOC_LLOG_CHECK, buf);
          if (rc == 0)
@@ -2081,19 +2252,27 @@ int jt_llog_check(int argc, char **argv)
  int jt_llog_remove(int argc, char **argv)
  {
          struct obd_ioctl_data data;
+        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
          int rc;
  
          if (argc != 3 && argc != 2)
                  return CMD_HELP;
  
-        IOC_INIT(data);
+        memset(&data, 0x00, sizeof(data));
+        data.ioc_dev = cur_device;
          data.ioc_inllen1 = strlen(argv[1]) + 1;
          data.ioc_inlbuf1 = argv[1];
          if (argc == 3){
                  data.ioc_inllen2 = strlen(argv[2]) + 1;
                  data.ioc_inlbuf2 = argv[2];
          }
-        IOC_PACK(argv[0], data);
+        memset(buf, 0, sizeof(rawbuf));
+        rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
+        if (rc) {
+                fprintf(stderr, "error: %s: invalid ioctl\n",
+                        jt_cmdname(argv[0]));
+                return rc;
+        }
  
          rc = l_ioctl(OBD_DEV_ID, OBD_IOC_LLOG_REMOVE, buf);
          if (rc == 0) {
@@ -2142,14 +2321,15 @@ static int jt_blockdev_find_module(const char *module)
  {
          FILE *fp;
          int found = 0;
-        char modname[256];
+        char buf[1024];
  
          fp = fopen("/proc/modules", "r");
          if (fp == NULL)
                  return -1;
  
-        while (fscanf(fp, "%s %*s %*s %*s %*s %*s", modname) == 1) {
-                if (strcmp(module, modname) == 0) {
+        while (fgets(buf, sizeof(buf), fp) != NULL) {
+                buf[strcspn(buf, " ")] = 0; /* cut at 1st space, if any */
+                if (strcmp(module, buf) == 0) {
                          found = 1;
                          break;
                  }
@@ -2635,6 +2815,7 @@ static int pool_cmd(enum lcfg_command_type cmd,
          struct obd_ioctl_data data;
          struct lustre_cfg_bufs bufs;
          struct lustre_cfg *lcfg;
+        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
  
          rc = check_pool_cmd(cmd, fsname, poolname, ostname);
          if (rc)
@@ -2652,7 +2833,7 @@ static int pool_cmd(enum lcfg_command_type cmd,
                  return rc;
          }
  
-        IOC_INIT(data);
+        memset(&data, 0x00, sizeof(data));
          rc = data.ioc_dev = get_mgs_device();
          if (rc < 0)
                  goto out;
@@ -2661,8 +2842,14 @@ static int pool_cmd(enum lcfg_command_type cmd,
          data.ioc_plen1 = lustre_cfg_len(lcfg->lcfg_bufcount,
                                          lcfg->lcfg_buflens);
          data.ioc_pbuf1 = (void *)lcfg;
-        IOC_PACK(cmdname, data);
  
+        memset(buf, 0, sizeof(rawbuf));
+        rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
+        if (rc) {
+                fprintf(stderr, "error: %s: invalid ioctl\n",
+                        jt_cmdname(cmdname));
+                return rc;
+        }
          rc = l_ioctl(OBD_DEV_ID, OBD_IOC_POOL, buf);
  out:
          if (rc)
@@ -2935,13 +3122,14 @@ void  llapi_ping_target(char *obd_type, char *obd_name,
  {
          int  rc;
          struct obd_ioctl_data data;
+        char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf;
  
          memset(&data, 0, sizeof(data));
          data.ioc_inlbuf4 = obd_name;
          data.ioc_inllen4 = strlen(obd_name) + 1;
          data.ioc_dev = OBD_DEV_BY_DEVNAME;
          memset(buf, 0, sizeof(rawbuf));
-        if (obd_ioctl_pack(&data, &buf, max)) {
+        if (obd_ioctl_pack(&data, &buf, sizeof(rawbuf))) {
                  fprintf(stderr, "error: invalid ioctl\n");
                  return;
          }
diff --git a/lustre/utils/wirecheck.c b/lustre/utils/wirecheck.c

index b8e5d06..956af80 100644 (file)
--- a/lustre/utils/wirecheck.c
+++ b/lustre/utils/wirecheck.c
@@ -1182,16 +1182,15 @@ check_ll_fiemap_extent(void)
          CHECK_CDEFINE(FIEMAP_EXTENT_LAST);
          CHECK_CDEFINE(FIEMAP_EXTENT_UNKNOWN);
          CHECK_CDEFINE(FIEMAP_EXTENT_DELALLOC);
-        CHECK_CDEFINE(FIEMAP_EXTENT_NO_DIRECT);
-        CHECK_CDEFINE(FIEMAP_EXTENT_SECONDARY);
-        CHECK_CDEFINE(FIEMAP_EXTENT_NET);
-        CHECK_CDEFINE(FIEMAP_EXTENT_DATA_COMPRESSED);
+        CHECK_CDEFINE(FIEMAP_EXTENT_ENCODED);
          CHECK_CDEFINE(FIEMAP_EXTENT_DATA_ENCRYPTED);
          CHECK_CDEFINE(FIEMAP_EXTENT_NOT_ALIGNED);
          CHECK_CDEFINE(FIEMAP_EXTENT_DATA_INLINE);
          CHECK_CDEFINE(FIEMAP_EXTENT_DATA_TAIL);
          CHECK_CDEFINE(FIEMAP_EXTENT_UNWRITTEN);
          CHECK_CDEFINE(FIEMAP_EXTENT_MERGED);
+        CHECK_CDEFINE(FIEMAP_EXTENT_NO_DIRECT);
+        CHECK_CDEFINE(FIEMAP_EXTENT_NET);
  }
  
  static void
diff --git a/lustre/utils/wiretest.c b/lustre/utils/wiretest.c

index 344ce94..1a1fa03 100644 (file)
--- a/lustre/utils/wiretest.c
+++ b/lustre/utils/wiretest.c
@@ -62,8 +62,8 @@ void lustre_assert_wire_constants(void)
  {
          /* Wire protocol assertions generated by 'wirecheck'
           * (make -C lustre/utils newwiretest)
-         * running on Linux lin2 2.6.18-92.1.17-prep #3 Sun Nov 23 14:29:36 IST 2008 i686 i686 i386 G
-         * with gcc version 3.4.6 20060404 (Red Hat 3.4.6-10) */
+         * running on Linux localhost.localdomain 2.6.18-prep #3 SMP Sun Nov 23 08:04:44 EST 2008 i68
+         * with gcc version 4.1.1 20061011 (Red Hat 4.1.1-30) */
  
  
          /* Constants... */
@@ -251,9 +251,9 @@ void lustre_assert_wire_constants(void)
                   (long long)OBD_QC_CALLBACK);
          LASSERTF(OBD_LAST_OPC == 403, " found %lld\n",
                   (long long)OBD_LAST_OPC);
-        LASSERTF(QUOTA_DQACQ == 901, " found %lld\n",
+        LASSERTF(QUOTA_DQACQ == 601, " found %lld\n",
                   (long long)QUOTA_DQACQ);
-        LASSERTF(QUOTA_DQREL == 902, " found %lld\n",
+        LASSERTF(QUOTA_DQREL == 602, " found %lld\n",
                   (long long)QUOTA_DQREL);
          LASSERTF(MGS_CONNECT == 250, " found %lld\n",
                   (long long)MGS_CONNECT);
@@ -444,31 +444,31 @@ void lustre_assert_wire_constants(void)
                   (long long)(int)offsetof(struct obd_connect_data, padding2));
          LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding2) == 8, " found %lld\n",
                   (long long)(int)sizeof(((struct obd_connect_data *)0)->padding2));
-        CLASSERT(OBD_CONNECT_RDONLY == 0x00000001ULL);
-        CLASSERT(OBD_CONNECT_INDEX == 0x00000002ULL);
-        CLASSERT(OBD_CONNECT_GRANT == 0x00000008ULL);
-        CLASSERT(OBD_CONNECT_SRVLOCK == 0x00000010ULL);
-        CLASSERT(OBD_CONNECT_VERSION == 0x00000020ULL);
-        CLASSERT(OBD_CONNECT_REQPORTAL == 0x00000040ULL);
-        CLASSERT(OBD_CONNECT_ACL == 0x00000080ULL);
-        CLASSERT(OBD_CONNECT_XATTR == 0x00000100ULL);
+        CLASSERT(OBD_CONNECT_RDONLY == 0x1ULL);
+        CLASSERT(OBD_CONNECT_INDEX == 0x2ULL);
+        CLASSERT(OBD_CONNECT_GRANT == 0x8ULL);
+        CLASSERT(OBD_CONNECT_SRVLOCK == 0x10ULL);
+        CLASSERT(OBD_CONNECT_VERSION == 0x20ULL);
+        CLASSERT(OBD_CONNECT_REQPORTAL == 0x40ULL);
+        CLASSERT(OBD_CONNECT_ACL == 0x80ULL);
+        CLASSERT(OBD_CONNECT_XATTR == 0x100ULL);
          CLASSERT(OBD_CONNECT_REAL == 0x08000000ULL);
          CLASSERT(OBD_CONNECT_CKSUM == 0x20000000ULL);
-        CLASSERT(OBD_CONNECT_TRUNCLOCK == 0x00000400ULL);
-        CLASSERT(OBD_CONNECT_IBITS == 0x00001000ULL);
-        CLASSERT(OBD_CONNECT_JOIN == 0x00002000ULL);
-        CLASSERT(OBD_CONNECT_ATTRFID == 0x00004000ULL);
-        CLASSERT(OBD_CONNECT_NODEVOH == 0x00008000ULL);
+        CLASSERT(OBD_CONNECT_TRUNCLOCK == 0x400ULL);
+        CLASSERT(OBD_CONNECT_IBITS == 0x1000ULL);
+        CLASSERT(OBD_CONNECT_JOIN == 0x2000ULL);
+        CLASSERT(OBD_CONNECT_ATTRFID == 0x4000ULL);
+        CLASSERT(OBD_CONNECT_NODEVOH == 0x8000ULL);
          CLASSERT(OBD_CONNECT_RMT_CLIENT == 0x00010000ULL);
          CLASSERT(OBD_CONNECT_RMT_CLIENT_FORCE == 0x00020000ULL);
-        CLASSERT(OBD_CONNECT_BRW_SIZE == 0x00040000ULL);
-        CLASSERT(OBD_CONNECT_QUOTA64 == 0x00080000ULL);
-        CLASSERT(OBD_CONNECT_MDS_CAPA == 0x00100000ULL);
-        CLASSERT(OBD_CONNECT_OSS_CAPA == 0x00200000ULL);
+        CLASSERT(OBD_CONNECT_BRW_SIZE == 0x40000ULL);
+        CLASSERT(OBD_CONNECT_QUOTA64 == 0x80000ULL);
+        CLASSERT(OBD_CONNECT_MDS_CAPA == 0x100000ULL);
+        CLASSERT(OBD_CONNECT_OSS_CAPA == 0x200000ULL);
          CLASSERT(OBD_CONNECT_MDS_MDS == 0x04000000ULL);
          CLASSERT(OBD_CONNECT_SOM == 0x00800000ULL);
          CLASSERT(OBD_CONNECT_AT == 0x01000000ULL);
-        CLASSERT(OBD_CONNECT_CANCELSET == 0x00400000ULL);
+        CLASSERT(OBD_CONNECT_CANCELSET == 0x400000ULL);
          CLASSERT(OBD_CONNECT_LRU_RESIZE == 0x02000000ULL);
  
          /* Checks for struct obdo */
@@ -2386,7 +2386,7 @@ void lustre_assert_wire_constants(void)
          CLASSERT(FIEMAP_FLAG_DEVICE_ORDER == 0x40000000);
  
          /* Checks for struct ll_fiemap_extent */
-        LASSERTF((int)sizeof(struct ll_fiemap_extent) == 32, " found %lld\n",
+        LASSERTF((int)sizeof(struct ll_fiemap_extent) == 56, " found %lld\n",
                   (long long)(int)sizeof(struct ll_fiemap_extent));
          LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_logical) == 0, " found %lld\n",
                   (long long)(int)offsetof(struct ll_fiemap_extent, fe_logical));
@@ -2400,28 +2400,27 @@ void lustre_assert_wire_constants(void)
                   (long long)(int)offsetof(struct ll_fiemap_extent, fe_length));
          LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_length) == 8, " found %lld\n",
                   (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_length));
-        LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_flags) == 24, " found %lld\n",
+        LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_flags) == 40, " found %lld\n",
                   (long long)(int)offsetof(struct ll_fiemap_extent, fe_flags));
          LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_flags) == 4, " found %lld\n",
                   (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_flags));
-        LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_device) == 28, " found %lld\n",
+        LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_device) == 44, " found %lld\n",
                   (long long)(int)offsetof(struct ll_fiemap_extent, fe_device));
          LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_device) == 4, " found %lld\n",
                   (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_device));
          CLASSERT(FIEMAP_EXTENT_LAST == 0x00000001);
          CLASSERT(FIEMAP_EXTENT_UNKNOWN == 0x00000002);
          CLASSERT(FIEMAP_EXTENT_DELALLOC == 0x00000004);
-        CLASSERT(FIEMAP_EXTENT_NO_DIRECT == 0x00000008);
-        CLASSERT(FIEMAP_EXTENT_SECONDARY == 0x00000010);
-        CLASSERT(FIEMAP_EXTENT_NET == 0x00000020);
-        CLASSERT(FIEMAP_EXTENT_DATA_COMPRESSED == 0x00000040);
+        CLASSERT(FIEMAP_EXTENT_ENCODED == 0x00000008);
          CLASSERT(FIEMAP_EXTENT_DATA_ENCRYPTED == 0x00000080);
          CLASSERT(FIEMAP_EXTENT_NOT_ALIGNED == 0x00000100);
          CLASSERT(FIEMAP_EXTENT_DATA_INLINE == 0x00000200);
          CLASSERT(FIEMAP_EXTENT_DATA_TAIL == 0x00000400);
          CLASSERT(FIEMAP_EXTENT_UNWRITTEN == 0x00000800);
          CLASSERT(FIEMAP_EXTENT_MERGED == 0x00001000);
-#if defined(LIBLUSTRE_POSIX_ACL) && defined(CONFIG_FS_POSIX_ACL)
+        CLASSERT(FIEMAP_EXTENT_NO_DIRECT == 0x40000000);
+        CLASSERT(FIEMAP_EXTENT_NET == 0x80000000);
+#ifdef LIBLUSTRE_POSIX_ACL
  
          /* Checks for type posix_acl_xattr_entry */
          LASSERTF((int)sizeof(xattr_acl_entry) == 8, " found %lld\n",
author	alex <alex>
	Fri, 6 Feb 2009 21:14:37 +0000 (21:14 +0000)
committer	alex <alex>
	Fri, 6 Feb 2009 21:14:37 +0000 (21:14 +0000)
lustre/ChangeLog		patch \| blob \| history
lustre/autoconf/lustre-core.m4		patch \| blob \| history
lustre/autoconf/lustre-version.ac		patch \| blob \| history
lustre/cmm/cmm_device.c		patch \| blob \| history
lustre/cmm/mdc_device.c		patch \| blob \| history
lustre/cmm/mdc_internal.h		patch \| blob \| history
lustre/fid/fid_lib.c		patch \| blob \| history
lustre/fld/fld_handler.c		patch \| blob \| history
lustre/include/cl_object.h		patch \| blob \| history
lustre/include/class_hash.h		patch \| blob \| history
lustre/include/linux/lvfs.h		patch \| blob \| history
lustre/include/lprocfs_status.h		patch \| blob \| history
lustre/include/lustre/ll_fiemap.h		patch \| blob \| history
lustre/include/lustre/lustre_idl.h		patch \| blob \| history
lustre/include/lustre/lustre_user.h		patch \| blob \| history
lustre/include/lustre_dlm.h		patch \| blob \| history
lustre/include/lustre_export.h		patch \| blob \| history
lustre/include/lustre_fid.h		patch \| blob \| history
lustre/include/lustre_lib.h		patch \| blob \| history
lustre/include/lustre_net.h		patch \| blob \| history
lustre/include/lustre_sec.h		patch \| blob \| history
lustre/include/obd.h		patch \| blob \| history
lustre/include/obd_class.h		patch \| blob \| history
lustre/kernel_patches/patches/md-mmp-unplug-dev-sles10.patch	[new file with mode: 0644]	patch \| blob
lustre/kernel_patches/patches/md-mmp-unplug-dev.patch	[new file with mode: 0644]	patch \| blob
lustre/kernel_patches/series/2.6-rhel5.series		patch \| blob \| history
lustre/kernel_patches/series/2.6-sles10.series		patch \| blob \| history
lustre/kernel_patches/series/2.6.22-vanilla.series		patch \| blob \| history
lustre/lclient/glimpse.c		patch \| blob \| history
lustre/lclient/lcommon_cl.c		patch \| blob \| history
lustre/ldlm/Makefile.am		patch \| blob \| history
lustre/ldlm/ldlm_internal.h		patch \| blob \| history
lustre/ldlm/ldlm_lib.c		patch \| blob \| history
lustre/ldlm/ldlm_lock.c		patch \| blob \| history
lustre/ldlm/ldlm_lockd.c		patch \| blob \| history
lustre/ldlm/ldlm_pool.c		patch \| blob \| history
lustre/ldlm/ldlm_request.c		patch \| blob \| history
lustre/ldlm/ldlm_resource.c		patch \| blob \| history
lustre/liblustre/llite_lib.c		patch \| blob \| history
lustre/liblustre/super.c		patch \| blob \| history
lustre/llite/Makefile.in		patch \| blob \| history
lustre/llite/autoMakefile.am		patch \| blob \| history
lustre/llite/dcache.c		patch \| blob \| history
lustre/llite/file.c		patch \| blob \| history
lustre/llite/llite_internal.h		patch \| blob \| history
lustre/llite/llite_lib.c		patch \| blob \| history
lustre/llite/lloop.c		patch \| blob \| history
lustre/llite/rw26.c		patch \| blob \| history
lustre/llite/vvp_page.c		patch \| blob \| history
lustre/lmv/lmv_obd.c		patch \| blob \| history
lustre/lov/Makefile.in		patch \| blob \| history
lustre/lov/autoMakefile.am		patch \| blob \| history
lustre/lov/lov_cl_internal.h		patch \| blob \| history
lustre/lov/lov_internal.h		patch \| blob \| history
lustre/lov/lov_obd.c		patch \| blob \| history
lustre/lov/lov_pack.c		patch \| blob \| history
lustre/lov/lov_pool.c		patch \| blob \| history
lustre/lov/lov_qos.c		patch \| blob \| history
lustre/lvfs/Makefile.in		patch \| blob \| history
lustre/lvfs/autoMakefile.am		patch \| blob \| history
lustre/lvfs/lvfs_linux.c		patch \| blob \| history
lustre/mdc/Makefile.in		patch \| blob \| history
lustre/mdc/autoMakefile.am		patch \| blob \| history
lustre/mdc/mdc_request.c		patch \| blob \| history
lustre/mdd/mdd_device.c		patch \| blob \| history
lustre/mdd/mdd_internal.h		patch \| blob \| history
lustre/mdd/mdd_object.c		patch \| blob \| history
lustre/mds/Makefile.in		patch \| blob \| history
lustre/mds/autoMakefile.am		patch \| blob \| history
lustre/mds/mds_lov.c		patch \| blob \| history
lustre/mdt/mdt_handler.c		patch \| blob \| history
lustre/mdt/mdt_open.c		patch \| blob \| history
lustre/mdt/mdt_recovery.c		patch \| blob \| history
lustre/mgc/Makefile.in		patch \| blob \| history
lustre/mgc/autoMakefile.am		patch \| blob \| history
lustre/mgs/Makefile.in		patch \| blob \| history
lustre/mgs/autoMakefile.am		patch \| blob \| history
lustre/mgs/mgs_handler.c		patch \| blob \| history
lustre/mgs/mgs_llog.c		patch \| blob \| history
lustre/obdclass/Makefile.in		patch \| blob \| history
lustre/obdclass/autoMakefile.am		patch \| blob \| history
lustre/obdclass/cl_page.c		patch \| blob \| history
lustre/obdclass/genops.c		patch \| blob \| history
lustre/obdclass/llog_test.c		patch \| blob \| history
lustre/obdclass/lu_object.c		patch \| blob \| history
lustre/obdclass/obd_config.c		patch \| blob \| history
lustre/obdclass/obd_mount.c		patch \| blob \| history
lustre/obdecho/Makefile.in		patch \| blob \| history
lustre/obdecho/autoMakefile.am		patch \| blob \| history
lustre/obdecho/echo.c		patch \| blob \| history
lustre/obdecho/echo_client.c		patch \| blob \| history
lustre/obdfilter/Makefile.in		patch \| blob \| history
lustre/obdfilter/autoMakefile.am		patch \| blob \| history
lustre/obdfilter/filter.c		patch \| blob \| history
lustre/obdfilter/filter_io_26.c		patch \| blob \| history
lustre/obdfilter/lproc_obdfilter.c		patch \| blob \| history
lustre/osc/Makefile.in		patch \| blob \| history
lustre/osc/autoMakefile.am		patch \| blob \| history
lustre/osc/osc_lock.c		patch \| blob \| history
lustre/osc/osc_request.c		patch \| blob \| history
lustre/osd/osd_handler.c		patch \| blob \| history
lustre/ost/Makefile.in		patch \| blob \| history
lustre/ost/autoMakefile.am		patch \| blob \| history
lustre/ost/ost_handler.c		patch \| blob \| history
lustre/ptlrpc/Makefile.in		patch \| blob \| history
lustre/ptlrpc/autoMakefile.am		patch \| blob \| history
lustre/ptlrpc/client.c		patch \| blob \| history
lustre/ptlrpc/events.c		patch \| blob \| history
lustre/ptlrpc/gss/gss_api.h		patch \| blob \| history
lustre/ptlrpc/gss/gss_bulk.c		patch \| blob \| history
lustre/ptlrpc/gss/gss_internal.h		patch \| blob \| history
lustre/ptlrpc/gss/gss_keyring.c		patch \| blob \| history
lustre/ptlrpc/gss/gss_krb5_mech.c		patch \| blob \| history
lustre/ptlrpc/gss/gss_mech_switch.c		patch \| blob \| history
lustre/ptlrpc/gss/sec_gss.c		patch \| blob \| history
lustre/ptlrpc/import.c		patch \| blob \| history
lustre/ptlrpc/lproc_ptlrpc.c		patch \| blob \| history
lustre/ptlrpc/niobuf.c		patch \| blob \| history
lustre/ptlrpc/pack_generic.c		patch \| blob \| history
lustre/ptlrpc/pers.c		patch \| blob \| history
lustre/ptlrpc/ptlrpc_module.c		patch \| blob \| history
lustre/ptlrpc/recov_thread.c		patch \| blob \| history
lustre/ptlrpc/sec.c		patch \| blob \| history
lustre/ptlrpc/sec_bulk.c		patch \| blob \| history
lustre/ptlrpc/sec_config.c		patch \| blob \| history
lustre/ptlrpc/sec_lproc.c		patch \| blob \| history
lustre/ptlrpc/sec_null.c		patch \| blob \| history
lustre/ptlrpc/sec_plain.c		patch \| blob \| history
lustre/ptlrpc/service.c		patch \| blob \| history
lustre/ptlrpc/wiretest.c		patch \| blob \| history
lustre/quota/Makefile.in		patch \| blob \| history
lustre/quota/autoMakefile.am		patch \| blob \| history
lustre/quota/quota_adjust_qunit.c		patch \| blob \| history
lustre/quota/quota_check.c		patch \| blob \| history
lustre/quota/quota_context.c		patch \| blob \| history
lustre/quota/quota_interface.c		patch \| blob \| history
lustre/quota/quota_internal.h		patch \| blob \| history
lustre/quota/quota_master.c		patch \| blob \| history
lustre/tests/acceptance-small.sh		patch \| blob \| history
lustre/tests/conf-sanity.sh		patch \| blob \| history
lustre/tests/mdsrate-create-large.sh		patch \| blob \| history
lustre/tests/mdsrate-create-small.sh		patch \| blob \| history
lustre/tests/mdsrate-lookup-1dir.sh		patch \| blob \| history
lustre/tests/mdsrate-stat-large.sh		patch \| blob \| history
lustre/tests/mdsrate-stat-small.sh		patch \| blob \| history
lustre/tests/recovery-small.sh		patch \| blob \| history
lustre/tests/replay-single.sh		patch \| blob \| history
lustre/tests/sanity-gss.sh		patch \| blob \| history
lustre/tests/sanity-quota.sh		patch \| blob \| history
lustre/tests/sanity.sh		patch \| blob \| history
lustre/tests/sanityN.sh		patch \| blob \| history
lustre/tests/test-framework.sh		patch \| blob \| history
lustre/utils/liblustreapi.c		patch \| blob \| history
lustre/utils/obd.c		patch \| blob \| history
lustre/utils/wirecheck.c		patch \| blob \| history
lustre/utils/wiretest.c		patch \| blob \| history