Whamcloud - gitweb
branch: b1_5
author ericm <ericm>
Mon, 27 Feb 2006 20:53:36 +0000 (20:53 +0000)
committer ericm <ericm>
Mon, 27 Feb 2006 20:53:36 +0000 (20:53 +0000)
Fix massive conflicts during merge from b1_4.

67 files changed:
lustre/include/linux/lustre_user.h
lustre/include/lprocfs_status.h
lustre/include/lustre/lustre_user.h
lustre/include/lustre_cfg.h
lustre/include/lustre_dlm.h
lustre/include/lustre_export.h
lustre/include/lustre_idl.h
lustre/include/lustre_import.h
lustre/include/lustre_lib.h
lustre/include/lustre_lite.h
lustre/include/lustre_log.h
lustre/include/lustre_mds.h
lustre/include/lustre_net.h
lustre/include/lustre_quota.h
lustre/include/obd.h
lustre/include/obd_class.h
lustre/include/obd_support.h
lustre/ldlm/ldlm_lib.c
lustre/ldlm/ldlm_request.c
lustre/llite/dcache.c
lustre/llite/llite_internal.h
lustre/llite/llite_lib.c
lustre/lov/lov_ea.c
lustre/lov/lov_internal.h
lustre/lov/lov_obd.c
lustre/lov/lov_qos.c
lustre/lov/lov_request.c
lustre/lov/lproc_lov.c
lustre/mdc/mdc_internal.h
lustre/mdc/mdc_request.c
lustre/mds/handler.c
lustre/mds/mds_internal.h
lustre/mds/mds_join.c
lustre/mds/mds_log.c
lustre/mds/mds_lov.c
lustre/mds/mds_unlink_open.c
lustre/obdclass/Makefile.in
lustre/obdclass/class_obd.c
lustre/obdclass/genops.c
lustre/obdclass/llog.c
lustre/obdclass/llog_cat.c
lustre/obdclass/llog_ioctl.c
lustre/obdclass/llog_lvfs.c
lustre/obdclass/llog_obd.c
lustre/obdclass/llog_swab.c
lustre/obdclass/llog_test.c
lustre/obdclass/lustre_handles.c
lustre/obdclass/obd_config.c
lustre/obdecho/echo_client.c
lustre/obdfilter/filter.c
lustre/obdfilter/filter_internal.h
lustre/obdfilter/filter_log.c
lustre/osc/osc_create.c
lustre/osc/osc_request.c
lustre/ost/ost_handler.c
lustre/ptlrpc/import.c
lustre/ptlrpc/llog_client.c
lustre/ptlrpc/llog_net.c
lustre/ptlrpc/llog_server.c
lustre/ptlrpc/pack_generic.c
lustre/ptlrpc/pinger.c
lustre/ptlrpc/ptlrpcd.c
lustre/ptlrpc/recov_thread.c
lustre/ptlrpc/service.c
lustre/utils/lustre_cfg.c
lustre/utils/wirecheck.c
lustre/utils/wiretest.c

index dd4723d..7bbcca7 100644 (file)
@@ -45,7 +45,8 @@
 #include <sys/stat.h>
 #endif
 
-#if defined(__x86_64__) || defined(__ia64__) || defined(__ppc64__)
+#if defined(__x86_64__) || defined(__ia64__) || defined(__ppc64__) || \
+    defined(__craynv)
 typedef struct stat     lstat_t;
 #define HAVE_LOV_USER_MDS_DATA
 #elif defined(__USE_LARGEFILE64) || defined(__KERNEL__)
index 34b9d1b..c6b8005 100644 (file)
@@ -219,6 +219,8 @@ extern int lprocfs_rd_server_uuid(char *page, char **start, off_t off,
                                   int count, int *eof, void *data);
 extern int lprocfs_rd_conn_uuid(char *page, char **start, off_t off,
                                 int count, int *eof, void *data);
+extern int lprocfs_rd_connect_flags(char *page, char **start, off_t off,
+                                    int count, int *eof, void *data);
 extern int lprocfs_rd_num_exports(char *page, char **start, off_t off,
                                   int count, int *eof, void *data);
 extern int lprocfs_rd_numrefs(char *page, char **start, off_t off,
@@ -316,6 +318,9 @@ static inline int lprocfs_rd_server_uuid(char *page, char **start, off_t off,
 static inline int lprocfs_rd_conn_uuid(char *page, char **start, off_t off,
                                        int count, int *eof, void *data)
 { return 0; }
+static inline int lprocfs_rd_connect_flags(char *page, char **start, off_t off,
+                                           int count, int *eof, void *data)
+{ return 0; }
 static inline int lprocfs_rd_num_exports(char *page, char **start, off_t off,
                                          int count, int *eof, void *data)
 { return 0; }
index 15abb53..85a0268 100644 (file)
 #define LL_IOC_QUOTACHECK               _IOW ('f', 160, int)
 #define LL_IOC_POLL_QUOTACHECK          _IOR ('f', 161, struct if_quotacheck *)
 #define LL_IOC_QUOTACTL                 _IOWR('f', 162, struct if_quotactl *)
+#define LL_IOC_JOIN                     _IOW ('f', 163, long)
+#define LL_IOC_OBD_STATFS               _IOWR('f', 164, struct obd_statfs *)
+
+#define LL_STATFS_MDC           1
+#define LL_STATFS_LOV           2
 
 #define IOC_MDC_TYPE            'i'
 #define IOC_MDC_GETSTRIPE       _IOWR(IOC_MDC_TYPE, 21, struct lov_mds_md *)
 #define IOC_MDC_GETFILEINFO     _IOWR(IOC_MDC_TYPE, 22, struct lov_mds_data *)
 
 #define O_LOV_DELAY_CREATE 0100000000  /* hopefully this does not conflict */
+#define O_JOIN_FILE        0400000000  /* hopefully this does not conflict */
 
 #define LL_FILE_IGNORE_LOCK             0x00000001
 #define LL_FILE_GROUP_LOCKED            0x00000002
@@ -57,6 +63,8 @@
 #define LOV_USER_MAGIC_V1 0x0BD10BD0
 #define LOV_USER_MAGIC    LOV_USER_MAGIC_V1
 
+#define LOV_USER_MAGIC_JOIN 0x0BD20BD0
+
 #define LOV_PATTERN_RAID0 0x001
 #define LOV_PATTERN_RAID1 0x002
 #define LOV_PATTERN_FIRST 0x100
index 1290cde..cd13b97 100644 (file)
@@ -47,7 +47,9 @@ enum lcfg_command_type {
         LCFG_ADD_CONN       = 0x00cf00b,
         LCFG_DEL_CONN       = 0x00cf00c,
         LCFG_LOV_ADD_OBD    = 0x00cf00d,
-        LCFG_LOV_DEL_OBD    = 0x00cf00e
+        LCFG_LOV_DEL_OBD    = 0x00cf00e,
+        LCFG_PARAM          = 0x00cf00f,
+        LCFG_MARKER         = 0x00cf010
 };
 
 struct lustre_cfg_bufs {
@@ -56,6 +58,9 @@ struct lustre_cfg_bufs {
         uint32_t lcfg_bufcount;
 };
 
+/* Mountconf transitional hack, should go away after 1.6 */
+#define LCFG_FLG_MOUNTCONF 0x400
+
 struct lustre_cfg {
         uint32_t lcfg_version;
         uint32_t lcfg_command;
@@ -198,7 +203,6 @@ static inline void lustre_cfg_free(struct lustre_cfg *lcfg)
 {
         int len;
 
-        ENTRY;
         len = lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens);
 
         OBD_FREE(lcfg, len);
@@ -233,13 +237,46 @@ static inline int lustre_cfg_sanity_check(void *buf, int len)
         RETURN(0);
 }
 
+
+#define LMD_MAGIC       0xbdacbd03
+#define LMD_MAGIC_MASK (0xffffff00 & LMD_MAGIC)
+
+#define lmd_bad_magic(LMDP)                                             \
+({                                                                      \
+        struct lustre_mount_data *_lmd__ = (LMDP);                      \
+        int _ret__ = 0;                                                 \
+        if (!_lmd__) {                                                  \
+                LCONSOLE_ERROR("Missing mount data: "                   \
+                       "check that /sbin/mount.lustre is installed.\n");\
+                _ret__ = 1;                                             \
+        } else if (_lmd__->lmd_magic == LMD_MAGIC) {                    \
+                _ret__ = 0;                                             \
+        } else if ((_lmd__->lmd_magic & LMD_MAGIC_MASK) == LMD_MAGIC_MASK) { \
+                LCONSOLE_ERROR("You're using an old version of "        \
+                       "/sbin/mount.lustre.  Please install version "   \
+                       "1.%d\n", LMD_MAGIC & 0xFF);                     \
+                _ret__ = 1;                                             \
+        } else {                                                        \
+                LCONSOLE_ERROR("Invalid mount data (%#x != %#x): "      \
+                       "check that /sbin/mount.lustre is installed\n",  \
+                       _lmd__->lmd_magic, LMD_MAGIC);                   \
+                _ret__ = 1;                                             \
+        }                                                               \
+        _ret__;                                                         \
+})
+
+#define MAX_FAILOVER_NIDS 10
+
 /* Passed by mount */
+/* Any changes in the alignment of elements in this struct require a change to
+   LMD_MAGIC */
 struct lustre_mount_data {
-        uint32_t lmd_magic;
-        uint32_t lmd_flags;
-        uint64_t lmd_nid;
-        char     lmd_mds[64];
-        char     lmd_profile[64];
+        uint32_t   lmd_magic;
+        uint32_t   lmd_flags;
+        uint16_t   lmd_nid_count; /* how many failover nids we have for the MDS */
+        lnet_nid_t lmd_nid[MAX_FAILOVER_NIDS];
+        char       lmd_mds[64];
+        char       lmd_profile[64];
 };
 
 #define LMD_FLG_FLOCK           0x0001
index 7dbfb5f..d38e4a5 100644 (file)
@@ -27,7 +27,7 @@ struct obd_device;
 
 #define OBD_LDLM_DEVICENAME  "ldlm"
 
-#define LDLM_DEFAULT_LRU_SIZE 100
+#define LDLM_DEFAULT_LRU_SIZE (100 * smp_num_cpus)
 
 typedef enum {
         ELDLM_OK = 0,
@@ -128,15 +128,7 @@ typedef enum {
 #define LCK_COMPAT_NL  (LCK_COMPAT_CR | LCK_EX)
 #define LCK_COMPAT_GROUP  (LCK_GROUP | LCK_NL)
 
-static ldlm_mode_t lck_compat_array[] = {
-        [LCK_EX] LCK_COMPAT_EX,
-        [LCK_PW] LCK_COMPAT_PW,
-        [LCK_PR] LCK_COMPAT_PR,
-        [LCK_CW] LCK_COMPAT_CW,
-        [LCK_CR] LCK_COMPAT_CR,
-        [LCK_NL] LCK_COMPAT_NL,
-        [LCK_GROUP] LCK_COMPAT_GROUP
-};
+extern ldlm_mode_t lck_compat_array[];
 
 static inline void lockmode_verify(ldlm_mode_t mode)
 {
@@ -510,7 +502,7 @@ void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode);
 void ldlm_lock_decref_and_cancel(struct lustre_handle *lockh, __u32 mode);
 void ldlm_lock_allow_match(struct ldlm_lock *lock);
 int ldlm_lock_match(struct ldlm_namespace *ns, int flags, struct ldlm_res_id *,
-                    __u32 type, ldlm_policy_data_t *, ldlm_mode_t mode,
+                    ldlm_type_t type, ldlm_policy_data_t *, ldlm_mode_t mode,
                     struct lustre_handle *);
 struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode,
                                         int *flags);
@@ -535,7 +527,7 @@ static inline void ldlm_proc_cleanup(void) {}
 /* resource.c - internal */
 struct ldlm_resource *ldlm_resource_get(struct ldlm_namespace *ns,
                                         struct ldlm_resource *parent,
-                                        struct ldlm_res_id, __u32 type,
+                                        struct ldlm_res_id, ldlm_type_t type,
                                         int create);
 struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res);
 int ldlm_resource_putref(struct ldlm_resource *res);
@@ -559,7 +551,7 @@ int ldlm_cli_enqueue(struct obd_export *exp,
                      struct ptlrpc_request *req,
                      struct ldlm_namespace *ns,
                      struct ldlm_res_id,
-                     __u32 type,
+                     ldlm_type_t type,
                      ldlm_policy_data_t *,
                      ldlm_mode_t mode,
                      int *flags,
index 2e1ce6e..32f93ce 100644 (file)
@@ -14,16 +14,21 @@ struct mds_export_data {
         struct list_head        med_open_head;
         spinlock_t              med_open_lock; /* lock med_open_head, mfd_list*/
         struct mds_client_data *med_mcd;
+        __u64                   med_ibits_known;
         loff_t                  med_lr_off;
         int                     med_lr_idx;
 };
 
 struct osc_creator {
         spinlock_t              oscc_lock;
+        struct list_head        oscc_list;
         struct obd_device       *oscc_obd;
+        obd_id                  oscc_last_id;//last available pre-created object
+        obd_id                  oscc_next_id;// what object id to give out next
+        int                     oscc_grow_count;
+        struct obdo             oscc_oa;
         int                     oscc_flags;
-        obd_id                  oscc_next_id;
-        cfs_waitq_t             oscc_waitq;
+        cfs_waitq_t             oscc_waitq; /* creating procs wait on this */
 };
 
 struct ldlm_export_data {
index 3c4882f..2f85102 100644 (file)
@@ -67,7 +67,7 @@
 //#define OSC_REQUEST_PORTAL            3
 #define OSC_REPLY_PORTAL                4
 //#define OSC_BULK_PORTAL               5
-#define OST_REQUEST_PORTAL              6
+#define OST_IO_PORTAL                   6
 #define OST_CREATE_PORTAL               7
 #define OST_BULK_PORTAL                 8
 //#define MDC_REQUEST_PORTAL            9
 #define LDLM_CB_REPLY_PORTAL           16
 #define LDLM_CANCEL_REQUEST_PORTAL     17
 #define LDLM_CANCEL_REPLY_PORTAL       18
-#define PTLBD_REQUEST_PORTAL           19
-#define PTLBD_REPLY_PORTAL             20
-#define PTLBD_BULK_PORTAL              21
+//#define PTLBD_REQUEST_PORTAL           19
+//#define PTLBD_REPLY_PORTAL             20
+//#define PTLBD_BULK_PORTAL              21
 #define MDS_SETATTR_PORTAL             22
 #define MDS_READPAGE_PORTAL            23
-#define MGMT_REQUEST_PORTAL            24
-#define MGMT_REPLY_PORTAL              25
-#define MGMT_CLI_REQUEST_PORTAL        26
-#define MGMT_CLI_REPLY_PORTAL          27
+
+#define OST_REQUEST_PORTAL             28
 
 #define SVC_KILLED               1
 #define SVC_EVENT                2
 #define LUSTRE_OST_VERSION  0x00030000
 #define LUSTRE_DLM_VERSION  0x00040000
 #define LUSTRE_LOG_VERSION  0x00050000
-#define LUSTRE_PBD_VERSION  0x00060000
 
 struct lustre_handle {
         __u64 cookie;
 };
 #define DEAD_HANDLE_MAGIC 0xdeadbeefcafebabeULL
 
+static inline int lustre_handle_is_used(struct lustre_handle *lh)
+{
+        return lh->cookie != 0ull;
+}
+
+static inline int lustre_handle_equal(struct lustre_handle *lh1,
+                                      struct lustre_handle *lh2)
+{
+        return lh1->cookie == lh2->cookie;
+}
+
+static inline void lustre_handle_copy(struct lustre_handle *tgt,
+                                      struct lustre_handle *src)
+{
+        tgt->cookie = src->cookie;
+}
+
 /* we depend on this structure to be 8-byte aligned */
 /* this type is only endian-adjusted in lustre_unpack_msg() */
 struct lustre_msg {
@@ -197,27 +211,52 @@ static inline void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags)
 #define MSG_CONNECT_ASYNC       0x40
 
 /* Connect flags */
-#define OBD_CONNECT_RDONLY      0x0001ULL
-#define OBD_CONNECT_SRVLOCK     0x0010ULL /* server takes locks for client */
-#define OBD_CONNECT_ACL         0x0080ULL
-#define OBD_CONNECT_USER_XATTR  0x0100ULL
-#define OBD_CONNECT_CROW        0x0200ULL /* OST is CROW able */
-
-#define MDS_CONNECT_SUPPORTED  (OBD_CONNECT_RDONLY |            \
-                                OBD_CONNECT_ACL |               \
-                                OBD_CONNECT_USER_XATTR)
-#define OST_CONNECT_SUPPORTED  (OBD_CONNECT_SRVLOCK | OBD_CONNECT_CROW)
+#define OBD_CONNECT_RDONLY       0x1ULL /* client allowed read-only access */
+#define OBD_CONNECT_INDEX        0x2ULL /* connect to specific LOV idx */
+#define OBD_CONNECT_GRANT        0x8ULL /* OSC acquires grant at connect */
+#define OBD_CONNECT_SRVLOCK     0x10ULL /* server takes locks for client */
+#define OBD_CONNECT_VERSION     0x20ULL /* Server supports versions in ocd */
+#define OBD_CONNECT_REQPORTAL   0x40ULL /* Separate portal for non-IO reqs */
+#define OBD_CONNECT_ACL         0x80ULL /* client using access control lists */
+#define OBD_CONNECT_XATTR      0x100ULL /* client using extended attributes*/
+#define OBD_CONNECT_CROW       0x200ULL /* MDS+OST do object create-on-write */
+#define OBD_CONNECT_TRUNCLOCK  0x400ULL /* server gets locks for punch b=9528 */
+#define OBD_CONNECT_TRANSNO    0x800ULL /* replay is sending initial transno */
+#define OBD_CONNECT_IBITS     0x1000ULL /* support for inodebits locks */
+#define OBD_CONNECT_JOIN      0x2000ULL /* files can be concatenated */
+/* also update obd_connect_names[] for lprocfs_rd_connect_flags() */
+
+#define MDS_CONNECT_SUPPORTED  (OBD_CONNECT_RDONLY | OBD_CONNECT_VERSION | \
+                                OBD_CONNECT_ACL | OBD_CONNECT_XATTR | \
+                                OBD_CONNECT_IBITS | OBD_CONNECT_JOIN)
+#define OST_CONNECT_SUPPORTED  (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \
+                                OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \
+                                OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX)
 #define ECHO_CONNECT_SUPPORTED (0)
 
+#define OBD_OCD_VERSION(major,minor,patch,fix) (((major)<<24) + ((minor)<<16) +\
+                                                ((patch)<<8) + (fix))
+#define OBD_OCD_VERSION_MAJOR(version) ((int)((version)>>24)&255)
+#define OBD_OCD_VERSION_MINOR(version) ((int)((version)>>16)&255)
+#define OBD_OCD_VERSION_PATCH(version) ((int)((version)>>8)&255)
+#define OBD_OCD_VERSION_FIX(version)   ((int)(version)&255)
+
 /* This structure is used for both request and reply.
  *
  * If we eventually have separate connect data for different types, which we
  * almost certainly will, then perhaps we stick a union in here. */
 struct obd_connect_data {
-        __u64 ocd_connect_flags;    /* connection flags, server should return
-                                     * subset of what is asked for. */
-        
-        __u64 padding[8];
+        __u64 ocd_connect_flags;        /* OBD_CONNECT_* per above */
+        __u32 ocd_version;              /* lustre release version number */
+        __u32 ocd_grant;                /* initial cache grant amount (bytes) */
+        __u32 ocd_index;                /* LOV index to connect to */
+        __u32 ocd_unused;
+        __u64 ocd_ibits_known;          /* inode bits this client understands */
+        __u64 padding2;                 /* also fix lustre_swab_connect */
+        __u64 padding3;                 /* also fix lustre_swab_connect */
+        __u64 padding4;                 /* also fix lustre_swab_connect */
+        __u64 padding5;                 /* also fix lustre_swab_connect */
+        __u64 padding6;                 /* also fix lustre_swab_connect */
 };
 
 extern void lustre_swab_connect(struct obd_connect_data *ocd);
@@ -275,7 +314,14 @@ typedef uint32_t        obd_count;
 #define OBD_FL_DEBUG_CHECK   (0x00000040) /* echo client/server debug check */
 #define OBD_FL_NO_USRQUOTA   (0x00000100) /* the object's owner is over quota */
 #define OBD_FL_NO_GRPQUOTA   (0x00000200) /* the object's group is over quota */
-#define OBD_FL_CREATE_CROW   (0x00000400) /* object should be created with crow */
+#define OBD_FL_CREATE_CROW   (0x00000400) /* object should be created on write */
+
+/*
+ * set this to delegate DLM locking during obd_punch() to the OSTs. Only OSTs
+ * that declared OBD_CONNECT_TRUNCLOCK in their connect flags support this
+ * functionality.
+ */
+#define OBD_FL_TRUNCLOCK     (0x00000800)
 
 /* this should be not smaller than sizeof(struct lustre_handle) + sizeof(struct
  * llog_cookie) + sizeof(ll_fid). Nevertheless struct ll_fid is not longer
@@ -294,7 +340,7 @@ struct obdo {
         obd_time                o_ctime;
         obd_blocks              o_blocks;       /* brw: cli sent cached bytes */
         obd_size                o_grant;
-        
+
         /* 32-bit fields start here: keep an even number of them via padding */
         obd_blksize             o_blksize;      /* optimal IO blocksize */
         obd_mode                o_mode;         /* brw: cli sent cache remain */
@@ -316,15 +362,12 @@ struct obdo {
 #define o_dropped o_misc
 #define o_cksum   o_nlink
 
-#define OBDO_URGENT_CREATE(oa)                      \
-        (!((oa)->o_valid & OBD_MD_FLFLAGS) ||       \
-         !((oa)->o_flags & OBD_FL_CREATE_CROW) ||   \
-         ((oa)->o_flags & OBD_FL_RECREATE_OBJS))
-
 extern void lustre_swab_obdo (struct obdo *o);
 
+
 #define LOV_MAGIC_V1      0x0BD10BD0
 #define LOV_MAGIC         LOV_MAGIC_V1
+#define LOV_MAGIC_JOIN    0x0BD20BD0
 
 #define LOV_PATTERN_RAID0 0x001   /* stripes are used round-robin */
 #define LOV_PATTERN_RAID1 0x002   /* stripes are mirrors of each other */
@@ -350,6 +393,7 @@ struct lov_mds_md_v1 {            /* LOV EA mds/wire data (little-endian) */
         struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */
 };
 
+
 #define OBD_MD_FLID        (0x00000001ULL) /* object ID */
 #define OBD_MD_FLATIME     (0x00000002ULL) /* access time */
 #define OBD_MD_FLMTIME     (0x00000004ULL) /* data modification time */
@@ -380,6 +424,7 @@ struct lov_mds_md_v1 {            /* LOV EA mds/wire data (little-endian) */
 #define OBD_MD_FLDIREA     (0x10000000ULL) /* dir's extended attribute data */
 #define OBD_MD_FLUSRQUOTA  (0x20000000ULL) /* over quota flags sent from ost */
 #define OBD_MD_FLGRPQUOTA  (0x40000000ULL) /* over quota flags sent from ost */
+#define OBD_MD_FLMODEASIZE (0x80000000ULL) /* EA size will be changed */
 
 #define OBD_MD_MDS         (0x0000000100000000ULL) /* where an inode lives on */
 #define OBD_MD_REINT       (0x0000000200000000ULL) /* reintegrate oa */
@@ -419,7 +464,7 @@ struct obd_statfs {
         __u32           os_bsize;
         __u32           os_namelen;
         __u64           os_maxbytes;
-        __u32           os_state;
+        __u32           os_state;       /* positive error code on server */
         __u32           os_spare1;
         __u32           os_spare2;
         __u32           os_spare3;
@@ -494,6 +539,14 @@ extern void lustre_swab_ost_lvb(struct ost_lvb *);
  *   MDS REQ RECORDS
  */
 
+/* FIXME: this is different from HEAD; adjust it
+ * when merging GSS */
+#define MDS_REQ_REC_OFF                 0
+
+#define MDS_REQ_INTENT_LOCKREQ_OFF      0
+#define MDS_REQ_INTENT_IT_OFF           1
+#define MDS_REQ_INTENT_REC_OFF          2
+
 /* opcodes */
 typedef enum {
         MDS_GETATTR      = 33,
@@ -557,7 +610,7 @@ typedef enum {
 struct ll_fid {
         __u64 id;         /* holds object id */
         __u32 generation; /* holds object generation */
-        
+
         __u32 f_type;     /* holds object type or stripe idx when passing it to
                            * OST for saving into EA. */
 };
@@ -601,8 +654,8 @@ struct mds_body {
         __u32          suppgid;
         __u32          eadatasize;
         __u32          aclsize;
-        __u32          padding_2; /* also fix lustre_swab_mds_body */
-        __u32          padding_3; /* also fix lustre_swab_mds_body */
+        __u32          max_mdsize;
+        __u32          max_cookiesize; /* also fix lustre_swab_mds_body */
         __u32          padding_4; /* also fix lustre_swab_mds_body */
 };
 
@@ -672,6 +725,7 @@ extern void lustre_swab_mds_rec_setattr (struct mds_rec_setattr *sa);
 
 #define MDS_OPEN_DELAY_CREATE  0100000000 /* delay initial object create */
 #define MDS_OPEN_OWNEROVERRIDE 0200000000 /* NFSD rw-reopen ro file for owner */
+#define MDS_OPEN_JOIN_FILE     0400000000 /* open for join file*/
 #define MDS_OPEN_HAS_EA      010000000000 /* specify object create pattern */
 #define MDS_OPEN_HAS_OBJS    020000000000 /* Just set the EA the obj exist */
 
@@ -696,6 +750,13 @@ struct mds_rec_create {
 
 extern void lustre_swab_mds_rec_create (struct mds_rec_create *cr);
 
+struct mds_rec_join {
+        struct ll_fid  jr_fid;
+        __u64          jr_headsize;
+};
+
+extern void lustre_swab_mds_rec_join (struct mds_rec_join *jr);
+
 struct mds_rec_link {
         __u32           lk_opcode;
         __u32           lk_fsuid;
@@ -771,10 +832,10 @@ struct lov_desc {
         __u32 ld_pattern;                  /* PATTERN_RAID0, PATTERN_RAID1 */
         __u64 ld_default_stripe_size;      /* in bytes */
         __u64 ld_default_stripe_offset;    /* in bytes */
-        __u32 ld_qos_threshold;            /* in MB */
-        __u32 ld_qos_maxage;               /* in second */
         __u32 ld_padding_1;                /* also fix lustre_swab_lov_desc */
         __u32 ld_padding_2;                /* also fix lustre_swab_lov_desc */
+        __u32 ld_padding_3;                /* also fix lustre_swab_lov_desc */
+        __u32 ld_padding_4;                /* also fix lustre_swab_lov_desc */
         struct obd_uuid ld_uuid;
 };
 
@@ -867,7 +928,7 @@ extern void lustre_swab_ldlm_intent (struct ldlm_intent *i);
 
 struct ldlm_resource_desc {
         ldlm_type_t lr_type;
-        __u32 lr_padding;
+        __u32 lr_padding;       /* also fix lustre_swab_ldlm_resource_desc */
         struct ldlm_res_id lr_name;
 };
 
@@ -884,7 +945,7 @@ extern void lustre_swab_ldlm_lock_desc (struct ldlm_lock_desc *l);
 
 struct ldlm_request {
         __u32 lock_flags;
-        __u32 lock_padding;
+        __u32 lock_padding;     /* also fix lustre_swab_ldlm_request */
         struct ldlm_lock_desc lock_desc;
         struct lustre_handle lock_handle1;
         struct lustre_handle lock_handle2;
@@ -894,7 +955,7 @@ extern void lustre_swab_ldlm_request (struct ldlm_request *rq);
 
 struct ldlm_reply {
         __u32 lock_flags;
-        __u32 lock_padding;
+        __u32 lock_padding;     /* also fix lustre_swab_ldlm_reply */
         struct ldlm_lock_desc lock_desc;
         struct lustre_handle lock_handle;
         __u64  lock_policy_res1;
@@ -904,57 +965,6 @@ struct ldlm_reply {
 extern void lustre_swab_ldlm_reply (struct ldlm_reply *r);
 
 /*
- * ptlbd, portal block device requests
- */
-typedef enum {
-        PTLBD_QUERY = 200,
-        PTLBD_READ = 201,
-        PTLBD_WRITE = 202,
-        PTLBD_FLUSH = 203,
-        PTLBD_CONNECT = 204,
-        PTLBD_DISCONNECT = 205,
-        PTLBD_LAST_OPC
-} ptlbd_cmd_t;
-#define PTLBD_FIRST_OPC PTLBD_QUERY
-
-struct ptlbd_op {
-        __u16 op_cmd;
-        __u16 op_lun;
-        __u16 op_niob_cnt;
-        __u16 op__padding;
-        __u32 op_block_cnt;
-};
-
-extern void lustre_swab_ptlbd_op (struct ptlbd_op *op);
-
-struct ptlbd_niob {
-        __u64 n_xid;
-        __u64 n_block_nr;
-        __u32 n_offset;
-        __u32 n_length;
-};
-
-extern void lustre_swab_ptlbd_niob (struct ptlbd_niob *n);
-
-struct ptlbd_rsp {
-        __u16 r_status;
-        __u16 r_error_cnt;
-};
-
-extern void lustre_swab_ptlbd_rsp (struct ptlbd_rsp *r);
-
-/*
- * Opcodes for management/monitoring node.
- */
-typedef enum {
-        MGMT_CONNECT = 250,
-        MGMT_DISCONNECT,
-        MGMT_EXCEPTION,         /* node died, etc. */
-        MGMT_LAST_OPC
-} mgmt_cmd_t;
-#define MGMT_FIRST_OPC MGMT_CONNECT
-
-/*
  * Opcodes for multiple servers.
  */
 
@@ -979,9 +989,19 @@ struct llog_logid {
 #define CATLIST "CATALOGS"
 struct llog_catid {
         struct llog_logid       lci_logid;
-        __u32                   lci_padding[3];
+        __u32                   lci_padding1;
+        __u32                   lci_padding2;
+        __u32                   lci_padding3;
 } __attribute__((packed));
 
+/*join file lov mds md*/
+struct lov_mds_md_join {
+        struct lov_mds_md lmmj_md;
+        /*join private info*/
+        struct llog_logid lmmj_array_id; /*array object id*/
+        __u32  lmmj_extent_count;        /*array extent count*/
+};
+
 /* Log data record types - there is no specific reason that these need to
  * be related to the RPC opcodes, but no reason not to (may be handy later?)
  */
@@ -997,6 +1017,7 @@ typedef enum {
         OBD_CFG_REC      = LLOG_OP_MAGIC | 0x20000,
         PTL_CFG_REC      = LLOG_OP_MAGIC | 0x30000, /* obsolete */
         LLOG_GEN_REC     = LLOG_OP_MAGIC | 0x40000,
+        LLOG_JOIN_REC    = LLOG_OP_MAGIC | 0x50000,
         LLOG_HDR_MAGIC   = LLOG_OP_MAGIC | 0x45539,
         LLOG_LOGID_MAGIC = LLOG_OP_MAGIC | 0x4553b,
 } llog_op_type;
@@ -1029,10 +1050,30 @@ struct llog_rec_tail {
 struct llog_logid_rec {
         struct llog_rec_hdr     lid_hdr;
         struct llog_logid       lid_id;
-        __u32                   padding[5];
+        __u32                   padding1;
+        __u32                   padding2;
+        __u32                   padding3;
+        __u32                   padding4;
+        __u32                   padding5;
         struct llog_rec_tail    lid_tail;
 } __attribute__((packed));
 
+/* MDS extent description
+ * Describes one extent of a joined file; each extent is recorded
+ * as (start, length, lmm).
+ */
+struct mds_extent_desc {
+        __u64                   med_start; /* extent start */
+        __u64                   med_len;   /* extent length */
+        struct lov_mds_md       med_lmm;   /* extent's lmm  */
+};
+/*Joined file array extent log record*/
+struct llog_array_rec {
+        struct llog_rec_hdr     lmr_hdr;
+        struct mds_extent_desc  lmr_med;
+        struct llog_rec_tail    lmr_tail;
+};
+
 struct llog_create_rec {
         struct llog_rec_hdr     lcr_hdr;
         struct ll_fid           lcr_fid;
@@ -1134,6 +1175,8 @@ enum llogd_rpc_ops {
         LLOG_ORIGIN_HANDLE_CLOSE        = 505,
         LLOG_ORIGIN_CONNECT             = 506,
         LLOG_CATINFO                    = 507,  /* for lfs catinfo */
+        LLOG_ORIGIN_HANDLE_PREV_BLOCK   = 508,
+        LLOG_ORIGIN_HANDLE_DESTROY      = 509,  /* for destroy llog object*/
 };
 
 struct llogd_body {
@@ -1152,8 +1195,32 @@ struct llogd_conn_body {
         __u32                   lgdc_ctxt_idx;
 } __attribute__((packed));
 
+struct lov_user_ost_data_join {   /* per-stripe data structure */
+        __u64 l_extent_start;     /* extent start*/
+        __u64 l_extent_end;       /* extent end*/
+        __u64 l_object_id;        /* OST object ID */
+        __u64 l_object_gr;        /* OST object group (creating MDS number) */
+        __u32 l_ost_gen;          /* generation of this OST index */
+        __u32 l_ost_idx;          /* OST index in LOV */
+} __attribute__((packed));
+
+struct lov_user_md_join {         /* LOV EA user data (host-endian) */
+        __u32 lmm_magic;          /* magic number = LOV_MAGIC_JOIN */
+        __u32 lmm_pattern;        /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+        __u64 lmm_object_id;      /* LOV object ID */
+        __u64 lmm_object_gr;      /* LOV object group */
+        __u32 lmm_stripe_size;    /* size of stripe in bytes */
+        __u32 lmm_stripe_count;   /* num stripes in use for this object */
+        __u32 lmm_extent_count;   /* extent count of lmm*/
+        __u64 lmm_tree_id;        /* mds tree object id */
+        __u64 lmm_tree_gen;       /* mds tree object gen */
+        struct llog_logid lmm_array_id; /* mds extent desc llog object id */
+        struct lov_user_ost_data_join lmm_objects[0]; /* per-stripe data */
+} __attribute__((packed));
+
 extern void lustre_swab_lov_user_md(struct lov_user_md *lum);
 extern void lustre_swab_lov_user_md_objects(struct lov_user_md *lum);
+extern void lustre_swab_lov_user_md_join(struct lov_user_md_join *lumj);
 
 /* llog_swab.c */
 extern void lustre_swab_llogd_body (struct llogd_body *d);
@@ -1179,4 +1246,5 @@ typedef enum {
         QUOTA_DQREL     = 602,
 } quota_cmd_t;
 
+#define JOIN_FILE_ALIGN 4096
 #endif
index 5cd78ea..315dc01 100644 (file)
@@ -38,6 +38,7 @@ enum obd_import_event {
         IMP_EVENT_INACTIVE   = 0x808002,
         IMP_EVENT_INVALIDATE = 0x808003,
         IMP_EVENT_ACTIVE     = 0x808004,
+        IMP_EVENT_OCD        = 0x808005,
 };
 
 struct obd_import_conn {
@@ -53,7 +54,6 @@ struct obd_import {
         struct lustre_handle      imp_dlm_handle; /* client's ldlm export */
         struct ptlrpc_connection *imp_connection;
         struct ptlrpc_client     *imp_client;
-        struct list_head          imp_observers;
         struct list_head          imp_pinger_chain;
 
         /* Lists of requests that are retained for replay, waiting for a reply,
@@ -87,16 +87,14 @@ struct obd_import {
         /* flags */
         unsigned int              imp_invalid:1, imp_replayable:1,
                                   imp_dlm_fake:1, imp_server_timeout:1,
-                                  imp_initial_recov:1, imp_force_verify:1,
-                                  imp_pingable:1, imp_resend_replay:1,
-                                  imp_deactive:1;
+                                  imp_initial_recov:1, imp_initial_recov_bk:1,
+                                  imp_force_verify:1, imp_pingable:1,
+                                  imp_resend_replay:1, imp_deactive:1;
         __u32                     imp_connect_op;
         struct obd_connect_data   imp_connect_data;
+        __u64                     imp_connect_flags_orig;
 };
 
-#define IMP_CROW_ABLE(imp) \
-        ((imp)->imp_connect_data.ocd_connect_flags & OBD_CONNECT_CROW)
-
 typedef void (*obd_import_callback)(struct obd_import *imp, void *closure,
                                     int event, void *event_arg, void *cb_data);
 
index 70b6d04..bf2093c 100644 (file)
 #error Unsupported operating system.
 #endif
 
+/* prng.c */
+unsigned int ll_rand(void);        /* returns a random 32-bit integer */
+void ll_srand(unsigned int, unsigned int);     /* seed the generator */
+
 /* target.c */
 struct ptlrpc_request;
 struct recovd_data;
@@ -66,7 +70,7 @@ int target_handle_dqacq_callback(struct ptlrpc_request *req);
 
 void target_cancel_recovery_timer(struct obd_device *obd);
 
-#define OBD_RECOVERY_TIMEOUT (obd_timeout * 5 / 2) /* *waves hands* */
+#define OBD_RECOVERY_TIMEOUT (obd_timeout * 5 * HZ / 2) /* *waves hands* */
 void target_start_recovery_timer(struct obd_device *obd, svc_handler_t handler);
 void target_abort_recovery(void *data);
 void target_cleanup_recovery(struct obd_device *obd);
@@ -312,7 +316,7 @@ static inline int obd_ioctl_getdata(char **buf, int *len, void *arg)
         ENTRY;
 
         err = copy_from_user(&hdr, (void *)arg, sizeof(hdr));
-        if ( err 
+        if (err)
                 RETURN(err);
 
         if (hdr.ioc_version != OBD_IOCTL_VERSION) {
@@ -343,7 +347,7 @@ static inline int obd_ioctl_getdata(char **buf, int *len, void *arg)
         data = (struct obd_ioctl_data *)*buf;
 
         err = copy_from_user(*buf, (void *)arg, hdr.ioc_len);
-        if ( err ) {
+        if (err) {
                 OBD_VFREE(*buf, hdr.ioc_len);
                 RETURN(err);
         }
@@ -373,8 +377,7 @@ static inline int obd_ioctl_getdata(char **buf, int *len, void *arg)
                 data->ioc_inlbuf4 = &data->ioc_bulk[0] + offset;
         }
 
-        EXIT;
-        return 0;
+        RETURN(0);
 }
 
 static inline int obd_ioctl_popdata(void *arg, void *data, int len)
@@ -395,77 +398,76 @@ static inline void obd_ioctl_freedata(char *buf, int len)
         return;
 }
 
-#define OBD_IOC_CREATE                 _IOR ('f', 101, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_DESTROY                _IOW ('f', 104, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_PREALLOCATE            _IOWR('f', 105, OBD_IOC_DATA_TYPE)
-
-#define OBD_IOC_SETATTR                _IOW ('f', 107, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_GETATTR                _IOR ('f', 108, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_READ                   _IOWR('f', 109, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_WRITE                  _IOWR('f', 110, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_CREATE                 _IOR ('f', 101, long)
+#define OBD_IOC_DESTROY                _IOW ('f', 104, long)
+#define OBD_IOC_PREALLOCATE            _IOWR('f', 105, long)
 
+#define OBD_IOC_SETATTR                _IOW ('f', 107, long)
+#define OBD_IOC_GETATTR                _IOR ('f', 108, long)
+#define OBD_IOC_READ                   _IOWR('f', 109, long)
+#define OBD_IOC_WRITE                  _IOWR('f', 110, long)
 
-#define OBD_IOC_STATFS                 _IOWR('f', 113, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_SYNC                   _IOW ('f', 114, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_READ2                  _IOWR('f', 115, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_FORMAT                 _IOWR('f', 116, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_PARTITION              _IOWR('f', 117, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_COPY                   _IOWR('f', 120, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_MIGR                   _IOWR('f', 121, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_PUNCH                  _IOWR('f', 122, OBD_IOC_DATA_TYPE)
 
-#define OBD_IOC_MODULE_DEBUG           _IOWR('f', 124, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_BRW_READ               _IOWR('f', 125, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_BRW_WRITE              _IOWR('f', 126, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_NAME2DEV               _IOWR('f', 127, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_UUID2DEV               _IOWR('f', 130, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_GETNAME                _IOR ('f', 131, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_STATFS                 _IOWR('f', 113, long)
+#define OBD_IOC_SYNC                   _IOW ('f', 114, long)
+#define OBD_IOC_READ2                  _IOWR('f', 115, long)
+#define OBD_IOC_FORMAT                 _IOWR('f', 116, long)
+#define OBD_IOC_PARTITION              _IOWR('f', 117, long)
+#define OBD_IOC_COPY                   _IOWR('f', 120, long)
+#define OBD_IOC_MIGR                   _IOWR('f', 121, long)
+#define OBD_IOC_PUNCH                  _IOWR('f', 122, long)
 
-#define OBD_IOC_LOV_GET_CONFIG         _IOWR('f', 132, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_CLIENT_RECOVER         _IOW ('f', 133, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_MODULE_DEBUG           _IOWR('f', 124, long)
+#define OBD_IOC_BRW_READ               _IOWR('f', 125, long)
+#define OBD_IOC_BRW_WRITE              _IOWR('f', 126, long)
+#define OBD_IOC_NAME2DEV               _IOWR('f', 127, long)
+#define OBD_IOC_UUID2DEV               _IOWR('f', 130, long)
+#define OBD_IOC_GETNAME                _IOR ('f', 131, long)
 
+#define OBD_IOC_LOV_GET_CONFIG         _IOWR('f', 132, long)
+#define OBD_IOC_CLIENT_RECOVER         _IOW ('f', 133, long)
 
 #define OBD_IOC_DEC_FS_USE_COUNT       _IO  ('f', 139      )
-#define OBD_IOC_NO_TRANSNO             _IOW ('f', 140, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_SET_READONLY           _IOW ('f', 141, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_ABORT_RECOVERY         _IOR ('f', 142, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_NO_TRANSNO             _IOW ('f', 140, long)
+#define OBD_IOC_SET_READONLY           _IOW ('f', 141, long)
+#define OBD_IOC_ABORT_RECOVERY         _IOR ('f', 142, long)
 
-#define OBD_GET_VERSION                _IOWR ('f', 144, OBD_IOC_DATA_TYPE)
+#define OBD_GET_VERSION                _IOWR ('f', 144, long)
 
-#define OBD_IOC_CLOSE_UUID             _IOWR ('f', 147, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_CLOSE_UUID             _IOWR ('f', 147, long)
 
-#define OBD_IOC_GETDEVICE              _IOWR ('f', 149, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_GETDEVICE              _IOWR ('f', 149, long)
 
-#define OBD_IOC_LOV_SETSTRIPE          _IOW ('f', 154, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_LOV_GETSTRIPE          _IOW ('f', 155, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_LOV_SETEA              _IOW ('f', 156, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LOV_SETSTRIPE          _IOW ('f', 154, long)
+#define OBD_IOC_LOV_GETSTRIPE          _IOW ('f', 155, long)
+#define OBD_IOC_LOV_SETEA              _IOW ('f', 156, long)
 
 #define OBD_IOC_QUOTACHECK             _IOW ('f', 160, int)
 #define OBD_IOC_POLL_QUOTACHECK        _IOR ('f', 161, struct if_quotacheck *)
 #define OBD_IOC_QUOTACTL               _IOWR('f', 162, struct if_quotactl *)
 
-#define OBD_IOC_MOUNTOPT               _IOWR('f', 170, OBD_IOC_DATA_TYPE)
-
-#define OBD_IOC_RECORD                 _IOWR('f', 180, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_ENDRECORD              _IOWR('f', 181, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_PARSE                  _IOWR('f', 182, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_DORECORD               _IOWR('f', 183, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_PROCESS_CFG            _IOWR('f', 184, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_DUMP_LOG               _IOWR('f', 185, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_CLEAR_LOG              _IOWR('f', 186, OBD_IOC_DATA_TYPE)
-
-#define OBD_IOC_CATLOGLIST             _IOWR('f', 190, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_LLOG_INFO              _IOWR('f', 191, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_LLOG_PRINT             _IOWR('f', 192, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_LLOG_CANCEL            _IOWR('f', 193, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_LLOG_REMOVE            _IOWR('f', 194, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_LLOG_CHECK             _IOWR('f', 195, OBD_IOC_DATA_TYPE)
-#define OBD_IOC_LLOG_CATINFO           _IOWR('f', 196, OBD_IOC_DATA_TYPE)
-
-#define ECHO_IOC_GET_STRIPE            _IOWR('f', 200, OBD_IOC_DATA_TYPE)
-#define ECHO_IOC_SET_STRIPE            _IOWR('f', 201, OBD_IOC_DATA_TYPE)
-#define ECHO_IOC_ENQUEUE               _IOWR('f', 202, OBD_IOC_DATA_TYPE)
-#define ECHO_IOC_CANCEL                _IOWR('f', 203, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_MOUNTOPT               _IOWR('f', 170, long)
+
+#define OBD_IOC_RECORD                 _IOWR('f', 180, long)
+#define OBD_IOC_ENDRECORD              _IOWR('f', 181, long)
+#define OBD_IOC_PARSE                  _IOWR('f', 182, long)
+#define OBD_IOC_DORECORD               _IOWR('f', 183, long)
+#define OBD_IOC_PROCESS_CFG            _IOWR('f', 184, long)
+#define OBD_IOC_DUMP_LOG               _IOWR('f', 185, long)
+#define OBD_IOC_CLEAR_LOG              _IOWR('f', 186, long)
+
+#define OBD_IOC_CATLOGLIST             _IOWR('f', 190, long)
+#define OBD_IOC_LLOG_INFO              _IOWR('f', 191, long)
+#define OBD_IOC_LLOG_PRINT             _IOWR('f', 192, long)
+#define OBD_IOC_LLOG_CANCEL            _IOWR('f', 193, long)
+#define OBD_IOC_LLOG_REMOVE            _IOWR('f', 194, long)
+#define OBD_IOC_LLOG_CHECK             _IOWR('f', 195, long)
+#define OBD_IOC_LLOG_CATINFO           _IOWR('f', 196, long)
+
+#define ECHO_IOC_GET_STRIPE            _IOWR('f', 200, long)
+#define ECHO_IOC_SET_STRIPE            _IOWR('f', 201, long)
+#define ECHO_IOC_ENQUEUE               _IOWR('f', 202, long)
+#define ECHO_IOC_CANCEL                _IOWR('f', 203, long)
 
 /* XXX _IOWR('f', 250, long) has been defined in
  * lnet/include/libcfs/kp30.h for debug, don't use it
@@ -477,237 +479,273 @@ static inline void obd_ioctl_freedata(char *buf, int len)
 
 #define POISON_BULK 0
 
-static inline int ll_insecure_random_int(void)
-{
-        struct timeval t;
-        do_gettimeofday(&t);
-        return (int)(t.tv_usec);
-}
-
 /*
  * l_wait_event is a flexible sleeping function, permitting simple caller
  * configuration of interrupt and timeout sensitivity along with actions to
  * be performed in the event of either exception.
  *
- * Common usage looks like this:
+ * The first form of usage looks like this:
  *
  * struct l_wait_info lwi = LWI_TIMEOUT_INTR(timeout, timeout_handler,
  *                                           intr_handler, callback_data);
  * rc = l_wait_event(waitq, condition, &lwi);
  *
- * (LWI_TIMEOUT and LWI_INTR macros are available for timeout- and
- * interrupt-only variants, respectively.)
+ * l_wait_event() makes the current process wait on 'waitq' until 'condition'
+ * is TRUE or a "killable" signal (SIGTERM, SIGKILL, SIGINT) is pending.  It
+ * returns 0 to signify 'condition' is TRUE, but if a signal wakes it before
+ * 'condition' becomes true, it optionally calls the specified 'intr_handler'
+ * if not NULL, and returns -EINTR.
+ *
+ * If a non-zero timeout is specified, signals are ignored until the timeout
+ * has expired.  At this time, if 'timeout_handler' is not NULL it is called.
+ * If it returns FALSE l_wait_event() continues to wait as described above with
+ * signals enabled.  Otherwise it returns -ETIMEDOUT.
+ *
+ * LWI_INTR(intr_handler, callback_data) is shorthand for
+ * LWI_TIMEOUT_INTR(0, NULL, intr_handler, callback_data)
+ *
+ * The second form of usage looks like this:
+ *
+ * struct l_wait_info lwi = LWI_TIMEOUT(timeout, timeout_handler, cb_data);
+ * rc = l_wait_event(waitq, condition, &lwi);
+ *
+ * This form is the same as the first except that it COMPLETELY IGNORES
+ * SIGNALS.  The caller must therefore beware that if 'timeout' is zero, or if
+ * 'timeout_handler' is not NULL and returns FALSE, then the ONLY thing that
+ * can unblock the current process is 'condition' becoming TRUE.
+ *
+ * Another form of usage is:
+ * struct l_wait_info lwi = LWI_TIMEOUT_INTERVAL(timeout, interval,
+ *                                               timeout_handler, cb_data);
+ * rc = l_wait_event(waitq, condition, &lwi);
+ * This is the same as the previous case, but 'condition' is checked once
+ * every 'interval' jiffies (if non-zero).
+ *
+ * Subtle synchronization point: this macro does *not* necessarily take the
+ * wait-queue spin-lock before returning, and, hence, the following idiom is
+ * safe ONLY when the caller provides some external locking:
+ *
+ *             Thread1                            Thread2
+ *
+ *   l_wait_event(&obj->wq, ....);                                       (1)
+ *
+ *                                    wake_up(&obj->wq):                 (2)
+ *                                         spin_lock(&q->lock);          (2.1)
+ *                                         __wake_up_common(q, ...);     (2.2)
+ *                                         spin_unlock(&q->lock, flags); (2.3)
+ *
+ *   OBD_FREE_PTR(obj);                                                  (3)
+ *
+ * As l_wait_event() may "short-cut" execution and return without taking
+ * wait-queue spin-lock, some additional synchronization is necessary to
+ * guarantee that step (3) can begin only after (2.3) finishes.
  *
- * If a timeout is specified, the timeout_handler will be invoked in the event
- * that the timeout expires before the process is awakened.  (Note that any
- * waking of the process will restart the timeout, even if the condition is
- * not satisfied and the process immediately returns to sleep.  This might be
- * considered a bug.)  If the timeout_handler returns non-zero, l_wait_event
- * will return -ETIMEDOUT and the caller will continue.  If the handler returns
- * zero instead, the process will go back to sleep until it is awakened by the
- * waitq or some similar mechanism, or an interrupt occurs (if the caller has
- * asked for interrupts to be detected).  The timeout will only fire once, so
- * callers should take care that a timeout_handler which returns zero will take
- * future steps to awaken the process.  N.B. that these steps must include
- * making the provided condition become true.
+ * XXX nikita: some ptlrpc daemon threads have races of that sort.
  *
- * If the interrupt flag (lwi_signals) is non-zero, then the process will be
- * interruptible, and will be awakened by any "killable" signal (SIGTERM,
- * SIGKILL or SIGINT).  If a timeout is also specified, then the process will
- * only become interruptible _after_ the timeout has expired, though it can be
- * awakened by a signal that was delivered before the timeout and is still
- * pending when the timeout expires.  If a timeout is not specified, the process
- * will be interruptible at all times during l_wait_event.
  */
 
+#define LWI_ON_SIGNAL_NOOP ((void (*)(void *))(-1))
+
 struct l_wait_info {
         cfs_duration_t lwi_timeout;
+        cfs_duration_t lwi_interval;
         int  (*lwi_on_timeout)(void *);
-        long   lwi_signals;
         void (*lwi_on_signal)(void *);
         void  *lwi_cb_data;
 };
 
-#define LWI_TIMEOUT(time, cb, data)                                            \
-((struct l_wait_info) {                                                        \
-        lwi_timeout:    time,                                                  \
-        lwi_on_timeout: cb,                                                    \
-        lwi_cb_data:    data                                                   \
+/* NB: LWI_TIMEOUT ignores signals completely */
+#define LWI_TIMEOUT(time, cb, data)             \
+((struct l_wait_info) {                         \
+        .lwi_timeout    = time,                 \
+        .lwi_on_timeout = cb,                   \
+        .lwi_cb_data    = data,                 \
+        .lwi_interval   = 0                     \
 })
 
-#define LWI_INTR(cb, data)                                                     \
-((struct l_wait_info) {                                                        \
-        lwi_signals:   1,                                                      \
-        lwi_on_signal: cb,                                                     \
-        lwi_cb_data:   data                                                    \
+#define LWI_TIMEOUT_INTERVAL(time, interval, cb, data)  \
+((struct l_wait_info) {                                 \
+        .lwi_timeout    = time,                         \
+        .lwi_on_timeout = cb,                           \
+        .lwi_cb_data    = data,                         \
+        .lwi_interval   = interval                      \
 })
 
 #define LWI_TIMEOUT_INTR(time, time_cb, sig_cb, data)                          \
 ((struct l_wait_info) {                                                        \
-        lwi_timeout:    time,                                                  \
-        lwi_on_timeout: time_cb,                                               \
-        lwi_signals:    1,                                                     \
-        lwi_on_signal:  sig_cb,                                                \
-        lwi_cb_data:    data                                                   \
+        .lwi_timeout    = time,                                                \
+        .lwi_on_timeout = time_cb,                                             \
+        .lwi_on_signal = (sig_cb == NULL) ? LWI_ON_SIGNAL_NOOP : sig_cb,       \
+        .lwi_cb_data    = data,                                                \
+        .lwi_interval    = 0                                                   \
 })
 
+#define LWI_INTR(cb, data)  LWI_TIMEOUT_INTR(0, NULL, cb, data)
+
+#define LUSTRE_FATAL_SIGS (sigmask(SIGKILL) | sigmask(SIGINT) |                \
+                           sigmask(SIGTERM) | sigmask(SIGQUIT) |               \
+                           sigmask(SIGALRM))
+
 #ifdef __KERNEL__
 
+/*
+ * wait for @condition to become true, but no longer than timeout, specified
+ * by @info.
+ */
 #define __l_wait_event(wq, condition, info, ret, excl)                         \
 do {                                                                           \
         cfs_waitlink_t __wait;                                                 \
-        cfs_duration_t __timed_out = 0;                                        \
-        cfs_sigset_t blocked;                                                  \
-        cfs_time_t timeout_remaining;                                          \
+        cfs_duration_t __timeout = info->lwi_timeout;                          \
+        unsigned long  __irqflags;                                             \
+        cfs_sigset_t   __blocked;                                              \
+                                                                               \
+        ret = 0;                                                               \
+        if (condition)                                                         \
+                break;                                                         \
                                                                                \
         cfs_waitlink_init(&__wait);                                            \
         if (excl)                                                              \
-            cfs_waitq_add_exclusive(&wq, &__wait);                             \
+                cfs_waitq_add_exclusive(&wq, &__wait);                         \
         else                                                                   \
-            cfs_waitq_add(&wq, &__wait);                                       \
+                cfs_waitq_add(&wq, &__wait);                                   \
                                                                                \
         /* Block all signals (just the non-fatal ones if no timeout). */       \
-        if (info->lwi_signals && !info->lwi_timeout)                           \
-            blocked = l_w_e_set_sigs(LUSTRE_FATAL_SIGS);                       \
+        if (info->lwi_on_signal != NULL && __timeout == 0)                     \
+                __blocked = l_w_e_set_sigs(LUSTRE_FATAL_SIGS);                 \
         else                                                                   \
-            blocked = l_w_e_set_sigs(0);                                       \
-                                                                               \
-        timeout_remaining = info->lwi_timeout;                                 \
+                __blocked = l_w_e_set_sigs(0);                                 \
                                                                                \
         for (;;) {                                                             \
-            set_current_state(TASK_INTERRUPTIBLE);                             \
-            if (condition)                                                     \
-                    break;                                                     \
-            if (info->lwi_timeout && !__timed_out) {                           \
-                timeout_remaining = cfs_waitq_timedwait(&__wait,               \
-                                                        CFS_TASK_INTERRUPTIBLE,\
-                                                        timeout_remaining);    \
-                if (timeout_remaining == 0) {                                  \
-                    __timed_out = 1;                                           \
-                    if (!info->lwi_on_timeout ||                               \
-                        info->lwi_on_timeout(info->lwi_cb_data)) {             \
-                        ret = -ETIMEDOUT;                                      \
+                set_current_state(TASK_INTERRUPTIBLE);                         \
+                                                                               \
+                if (condition)                                                 \
                         break;                                                 \
-                    }                                                          \
-                    /* We'll take signals after a timeout. */                  \
-                    if (info->lwi_signals)                                     \
-                        (void)l_w_e_set_sigs(LUSTRE_FATAL_SIGS);               \
-                }                                                              \
-            } else {                                                           \
-                cfs_waitq_wait(&__wait, CFS_TASK_INTERRUPTIBLE);;              \
-            }                                                                  \
-            if (condition)                                                     \
-                    break;                                                     \
-            if (cfs_signal_pending()) {                                        \
-                    if (!info->lwi_timeout || __timed_out) {                   \
-                            break;                                             \
-                    } else {                                                   \
-                            /* We have to do this here because some signals */ \
-                            /* are not blockable - ie from strace(1).       */ \
-                            /* In these cases we want to schedule_timeout() */ \
-                            /* again, because we don't want that to return  */ \
-                            /* -EINTR when the RPC actually succeeded.      */ \
-                            /* the RECALC_SIGPENDING below will deliver the */ \
-                            /* signal properly.                             */ \
-                            cfs_clear_sigpending();                            \
-                    }                                                          \
-            }                                                                  \
-        }                                                                      \
                                                                                \
-        cfs_block_sigs(blocked);                                               \
+                if (__timeout == 0) {                                          \
+                        schedule();                                            \
+                } else {                                                       \
+                        unsigned long interval = info->lwi_interval?           \
+                                             min_t(unsigned long,              \
+                                                 info->lwi_interval,__timeout):\
+                                             __timeout;                        \
+                        __timeout -= interval - schedule_timeout(interval);    \
+                        if (__timeout == 0) {                                  \
+                                if (info->lwi_on_timeout == NULL ||            \
+                                    info->lwi_on_timeout(info->lwi_cb_data)) { \
+                                        ret = -ETIMEDOUT;                      \
+                                        break;                                 \
+                                }                                              \
+                                /* Take signals after the timeout expires. */  \
+                                if (info->lwi_on_signal != NULL)               \
+                                    (void)l_w_e_set_sigs(LUSTRE_FATAL_SIGS);   \
+                        }                                                      \
+                }                                                              \
                                                                                \
-        if ((!info->lwi_timeout || __timed_out) &&                             \
-            cfs_signal_pending()) {                                            \
-                if (info->lwi_on_signal)                                       \
-                        info->lwi_on_signal(info->lwi_cb_data);                \
-                ret = -EINTR;                                                  \
+                if (condition)                                                 \
+                        break;                                                 \
+                if (cfs_signal_pending()) {                                    \
+                        if (info->lwi_on_signal != NULL && __timeout == 0) {   \
+                                if (info->lwi_on_signal != LWI_ON_SIGNAL_NOOP) \
+                                        info->lwi_on_signal(info->lwi_cb_data);\
+                                ret = -EINTR;                                  \
+                                break;                                         \
+                        }                                                      \
+                        /* We have to do this here because some signals */     \
+                        /* are not blockable - ie from strace(1).       */     \
+                        /* In these cases we want to schedule_timeout() */     \
+                        /* again, because we don't want that to return  */     \
+                        /* -EINTR when the RPC actually succeeded.      */     \
+                        /* the RECALC_SIGPENDING below will deliver the */     \
+                        /* signal properly.                             */     \
+                        cfs_sigmask_lock(__irqflags);                          \
+                        cfs_clear_sigpending();                                \
+                        cfs_sigmask_unlock(__irqflags);                        \
+                }                                                              \
         }                                                                      \
                                                                                \
+        cfs_sigmask_lock(__irqflags);                                          \
+        cfs_block_sigs(__blocked);                                             \
+        RECALC_SIGPENDING; /*XXX cfs_recalc_sigpending();*/                    \
+        cfs_sigmask_unlock(__irqflags);                                        \
+                                                                               \
         set_current_state(TASK_RUNNING);                                       \
         cfs_waitq_del(&wq, &__wait);                                           \
-} while(0)
+} while (0)
 
 #else /* !__KERNEL__ */
 #define __l_wait_event(wq, condition, info, ret, excl)                         \
-do {                                                                           \
-        long timeout = info->lwi_timeout, elapse, last = 0;                    \
-        int __timed_out = 0;                                                   \
-                                                                               \
-        if (info->lwi_timeout == 0)                                            \
-            timeout = 1000000000;                                              \
-        else                                                                   \
-            last = time(NULL);                                                 \
-                                                                               \
-        for (;;) {                                                             \
-            if (condition)                                                     \
-                break;                                                         \
-            if (liblustre_wait_event(timeout)) {                               \
-                if (timeout == 0 || info->lwi_timeout == 0)                    \
-                        continue;                                              \
-                elapse = time(NULL) - last;                                    \
-                if (elapse) {                                                  \
-                        last += elapse;                                        \
-                        timeout -= elapse;                                     \
-                        if (timeout < 0)                                       \
-                                timeout = 0;                                   \
-                }                                                              \
-                continue;                                                      \
-            }                                                                  \
-            if (info->lwi_timeout && !__timed_out) {                           \
-                __timed_out = 1;                                               \
-                if (info->lwi_on_timeout == NULL ||                            \
-                    info->lwi_on_timeout(info->lwi_cb_data)) {                 \
-                    ret = -ETIMEDOUT;                                          \
-                    break;                                                     \
-                }                                                              \
-            }                                                                  \
-        }                                                                      \
+do {                                                                    \
+        long __timeout = info->lwi_timeout;                             \
+        long __now;                                                     \
+        long __then = 0;                                                \
+        int  __timed_out = 0;                                           \
+                                                                        \
+        ret = 0;                                                        \
+        if (condition)                                                  \
+                break;                                                  \
+                                                                        \
+        if (__timeout == 0)                                             \
+                __timeout = 1000000000;                                 \
+        else                                                            \
+                __then = time(NULL);                                    \
+                                                                        \
+        while (!(condition)) {                                          \
+                if (liblustre_wait_event(info->lwi_interval?:__timeout) || \
+                    (info->lwi_interval && info->lwi_interval < __timeout)) {\
+                        if (__timeout != 0 && info->lwi_timeout != 0) { \
+                                __now = time(NULL);                     \
+                                __timeout -= __now - __then;            \
+                                if (__timeout < 0)                      \
+                                        __timeout = 0;                  \
+                                __then = __now;                         \
+                        }                                               \
+                        continue;                                       \
+                }                                                       \
+                                                                        \
+                if (info->lwi_timeout != 0 && !__timed_out) {           \
+                        __timed_out = 1;                                \
+                        if (info->lwi_on_timeout == NULL ||             \
+                            info->lwi_on_timeout(info->lwi_cb_data)) {  \
+                                ret = -ETIMEDOUT;                       \
+                                break;                                  \
+                        }                                               \
+                }                                                       \
+        }                                                               \
 } while (0)
 
 #endif /* __KERNEL__ */
 
-#define l_wait_event(wq, condition, info)                                      \
-({                                                                             \
-        int __ret = 0;                                                         \
-        struct l_wait_info *__info = (info);                                   \
-        if (!(condition))                                                      \
-                __l_wait_event(wq, condition, __info, __ret, 0);               \
-        __ret;                                                                 \
+#define l_wait_event(wq, condition, info)                       \
+({                                                              \
+        int                 __ret;                              \
+        struct l_wait_info *__info = (info);                    \
+                                                                \
+        __l_wait_event(wq, condition, __info, __ret, 0);        \
+        __ret;                                                  \
 })
 
-#define l_wait_event_exclusive(wq, condition, info)                            \
-({                                                                             \
-        int __ret = 0;                                                         \
-        struct l_wait_info *__info = (info);                                   \
-        if (!(condition))                                                      \
-                __l_wait_event(wq, condition, __info, __ret, 1);               \
-        __ret;                                                                 \
+#define l_wait_event_exclusive(wq, condition, info)             \
+({                                                              \
+        int                 __ret;                              \
+        struct l_wait_info *__info = (info);                    \
+                                                                \
+        __l_wait_event(wq, condition, __info, __ret, 1);        \
+        __ret;                                                  \
 })
 
-#define LMD_MAGIC_R1 0xbdacbdac
-#define LMD_MAGIC    0xbdacbd02
-
-#define lmd_bad_magic(LMDP)                                             \
-({                                                                      \
-        struct lustre_mount_data *_lmd__ = (LMDP);                      \
-        int _ret__ = 0;                                                 \
-        if (!_lmd__) {                                                  \
-                LCONSOLE_ERROR("Missing mount data: "                   \
-                       "check that /sbin/mount.lustre is installed.\n");\
-                _ret__ = 1;                                             \
-        } else if (_lmd__->lmd_magic == LMD_MAGIC_R1) {                 \
-                LCONSOLE_ERROR("You're using an old version of "        \
-                       "/sbin/mount.lustre.  Please install version "   \
-                       "1.%d\n", LMD_MAGIC & 0xFF);                     \
-                _ret__ = 1;                                             \
-        } else if (_lmd__->lmd_magic != LMD_MAGIC) {                    \
-                LCONSOLE_ERROR("Invalid mount data (%#x != %#x): "      \
-                       "check that /sbin/mount.lustre is installed\n",  \
-                       _lmd__->lmd_magic, LMD_MAGIC);                   \
-                _ret__ = 1;                                             \
-        }                                                               \
-        _ret__;                                                         \
-})
+#ifdef __KERNEL__
+/* Fill in an ost_lvb (size, block count and a/m/ctime) from an inode. */
+static inline void inode_init_lvb(struct inode *inode, struct ost_lvb *lvb)
+{
+        /* timestamps are passed through LTIME_S() */
+        lvb->lvb_atime = LTIME_S(inode->i_atime);
+        lvb->lvb_mtime = LTIME_S(inode->i_mtime);
+        lvb->lvb_ctime = LTIME_S(inode->i_ctime);
+        /* size and block count are copied as they are */
+        lvb->lvb_blocks = inode->i_blocks;
+        lvb->lvb_size = inode->i_size;
+}
+#else
+/* defined in liblustre/llite_lib.h */
+#endif
 
 #ifdef __KERNEL__
 #define LIBLUSTRE_CLIENT (0)
index f35b918..db7ad6c 100644 (file)
@@ -16,6 +16,7 @@
 #endif
 
 #include <obd_class.h>
+#include <obd_ost.h>
 #include <lustre_net.h>
 #include <lustre_mds.h>
 #include <lustre_ha.h>
@@ -82,4 +83,56 @@ static inline void lustre_build_lock_params(int cmd, unsigned long open_flags,
                 LDLM_FL_BLOCK_NOWAIT : 0;
 }
 
+/*
+ * This is embedded into liblustre and llite super-blocks to keep track of
+ * the connect flags (capabilities) supported by all imports the given mount
+ * is connected to.
+ */
+struct lustre_client_ocd {
+        /*
+         * This is the conjunction of connect_flags across all imports (LOVs)
+         * this mount is connected to. This field is updated by
+         * ll_ocd_update() under ->lco_lock.
+         */
+        __u64      lco_flags;
+        spinlock_t lco_lock;
+};
+
+/*
+ * This function is used as an upcall-callback hooked by liblustre and llite
+ * clients into the obd_notify() listeners chain to handle notifications
+ * about changes of import connect_flags. See llu_fsswop_mount() and
+ * lustre_common_fill_super().
+ *
+ * "owner" is the struct lustre_client_ocd to update; "host" and "ev" are not
+ * used here. Only notifications from OSC devices are accepted: the connect
+ * flags of the watched import are AND-ed into ->lco_flags under ->lco_lock
+ * and 0 is returned. A notification from any other device type is logged
+ * and rejected with -EINVAL.
+ *
+ * Again, it is dumped into this header for the lack of a better place.
+ */
+static inline int ll_ocd_update(struct obd_device *host,
+                                struct obd_device *watched,
+                                enum obd_notify_event ev, void *owner)
+{
+        struct lustre_client_ocd *lco = owner;
+        struct client_obd        *cli;
+        __u64 flags;
+        __u64 old_flags;
+
+        ENTRY;
+        if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME) != 0) {
+                CERROR("unexpected notification of %s %s!\n",
+                       watched->obd_type->typ_name,
+                       watched->obd_name);
+                RETURN(-EINVAL);
+        }
+
+        cli = &watched->u.cli;
+        flags = cli->cl_import->imp_connect_data.ocd_connect_flags;
+
+        /*
+         * ->lco_flags is protected by ->lco_lock, so sample the old value
+         * inside the critical section rather than reading it unlocked for
+         * the debug message.
+         */
+        spin_lock(&lco->lco_lock);
+        old_flags = lco->lco_flags;
+        lco->lco_flags &= flags;
+        spin_unlock(&lco->lco_lock);
+
+        /* log outside the lock to keep the critical section minimal */
+        CDEBUG(D_SUPER, "Changing connect_flags: "LPX64" -> "LPX64"\n",
+               old_flags, flags);
+        RETURN(0);
+}
+
 #endif
index c1184a0..d305fb4 100644 (file)
@@ -46,6 +46,7 @@
 #endif
 
 #include <obd.h>
+#include <obd_ost.h>
 #include <lustre_idl.h>
 
 #define LOG_NAME_LIMIT(logname, name)                   \
@@ -87,6 +88,8 @@ int llog_init_handle(struct llog_handle *handle, int flags,
 extern void llog_free_handle(struct llog_handle *handle);
 int llog_process(struct llog_handle *loghandle, llog_cb_t cb,
                  void *data, void *catdata);
+int llog_reverse_process(struct llog_handle *loghandle, llog_cb_t cb,
+                         void *data, void *catdata);
 extern int llog_cancel_rec(struct llog_handle *loghandle, int index);
 extern int llog_close(struct llog_handle *cathandle);
 
@@ -108,6 +111,7 @@ int llog_cat_add_rec(struct llog_handle *cathandle, struct llog_rec_hdr *rec,
 int llog_cat_cancel_records(struct llog_handle *cathandle, int count,
                             struct llog_cookie *cookies);
 int llog_cat_process(struct llog_handle *cat_llh, llog_cb_t cb, void *data);
+int llog_cat_reverse_process(struct llog_handle *cat_llh, llog_cb_t cb, void *data);
 int llog_cat_set_first_idx(struct llog_handle *cathandle, int index);
 
 /* llog_obd.c */
@@ -165,6 +169,8 @@ struct llog_operations {
         int (*lop_destroy)(struct llog_handle *handle);
         int (*lop_next_block)(struct llog_handle *h, int *curr_idx,
                               int next_idx, __u64 *offset, void *buf, int len);
+        int (*lop_prev_block)(struct llog_handle *h,
+                              int prev_idx, void *buf, int len);
         int (*lop_create)(struct llog_ctxt *ctxt, struct llog_handle **,
                           struct llog_logid *logid, char *name);
         int (*lop_close)(struct llog_handle *handle);
@@ -195,7 +201,7 @@ struct llog_ctxt {
         int                      loc_idx; /* my index the obd array of ctxt's */
         struct llog_gen          loc_gen;
         struct obd_device       *loc_obd; /* points back to the containing obd*/
-        struct obd_export       *loc_exp;
+        struct obd_export       *loc_exp; /* parent "disk" export (e.g. MDS) */
         struct obd_import       *loc_imp; /* to use in RPC's: can be backward
                                              pointing import */
         struct llog_operations  *loc_logops;
@@ -209,9 +215,9 @@ static inline void llog_gen_init(struct llog_ctxt *ctxt)
 {
         struct obd_device *obd = ctxt->loc_exp->exp_obd;
 
-        if (!strcmp(obd->obd_type->typ_name, "mds"))
+        if (!strcmp(obd->obd_type->typ_name, LUSTRE_MDS_NAME))
                 ctxt->loc_gen.mnt_cnt = obd->u.mds.mds_mount_count;
-        else if (!strstr(obd->obd_type->typ_name, "filter"))
+        else if (!strstr(obd->obd_type->typ_name, LUSTRE_FILTER_NAME))
                 ctxt->loc_gen.mnt_cnt = obd->u.filter.fo_mount_count;
         else
                 ctxt->loc_gen.mnt_cnt = 0;
@@ -226,8 +232,9 @@ static inline int llog_gen_lt(struct llog_gen a, struct llog_gen b)
         return(a.conn_cnt < b.conn_cnt ? 1 : 0);
 }
 
-#define LLOG_GEN_INC(gen)  ((gen).conn_cnt) ++
+#define LLOG_GEN_INC(gen)  ((gen).conn_cnt ++)
 #define LLOG_PROC_BREAK 0x0001
+#define LLOG_DEL_RECORD 0x0002
 
 static inline int llog_obd2ops(struct llog_ctxt *ctxt,
                                struct llog_operations **lop)
@@ -362,6 +369,23 @@ static inline int llog_next_block(struct llog_handle *loghandle, int *cur_idx,
         RETURN(rc);
 }
 
+/*
+ * Forward a "previous block" read to the ->lop_prev_block() method of the
+ * operations looked up for the log handle. Returns the llog_handle2ops()
+ * error if that lookup fails, -EOPNOTSUPP when the method is not provided,
+ * and otherwise whatever the method itself returns.
+ */
+static inline int llog_prev_block(struct llog_handle *loghandle,
+                                  int prev_idx, void *buf, int len)
+{
+        struct llog_operations *ops;
+        int result;
+        ENTRY;
+
+        result = llog_handle2ops(loghandle, &ops);
+        if (result == 0) {
+                if (ops->lop_prev_block != NULL)
+                        result = ops->lop_prev_block(loghandle, prev_idx,
+                                                     buf, len);
+                else
+                        result = -EOPNOTSUPP;
+        }
+        RETURN(result);
+}
+
 static inline int llog_create(struct llog_ctxt *ctxt, struct llog_handle **res,
                               struct llog_logid *logid, char *name)
 {
index 3c59755..e824f0e 100644 (file)
 #ifndef _LUSTRE_MDS_H
 #define _LUSTRE_MDS_H
 
+#define LUSTRE_MDS_NAME "mds"
+#define LUSTRE_MDT_NAME "mdt"
+#define LUSTRE_MDC_NAME "mdc"
+
 #include <lustre_handles.h>
 #include <libcfs/kp30.h>
 #include <lustre_idl.h>
@@ -37,14 +41,12 @@ struct ptlrpc_request;
 struct obd_device;
 struct ll_file_data;
 
-#define LUSTRE_MDS_NAME "mds"
-#define LUSTRE_MDT_NAME "mdt"
-#define LUSTRE_MDC_NAME "mdc"
-
 struct lustre_md {
         struct mds_body         *body;
         struct lov_stripe_md    *lsm;
+#ifdef CONFIG_FS_POSIX_ACL
         struct posix_acl        *posix_acl;
+#endif
 };
 
 struct mdc_op_data {
@@ -78,51 +80,6 @@ struct mds_update_record {
         struct lvfs_grp_hash_entry *ur_grp_entry;
 };
 
-#define MDS_LR_SERVER_SIZE    512
-
-#define MDS_LR_CLIENT_START  8192
-#define MDS_LR_CLIENT_SIZE    128
-#if MDS_LR_CLIENT_START < MDS_LR_SERVER_SIZE
-#error "Can't have MDS_LR_CLIENT_START < MDS_LR_SERVER_SIZE"
-#endif
-
-#define MDS_CLIENT_SLOTS 17
-
-#define MDS_ROCOMPAT_LOVOBJID   0x00000001
-#define MDS_ROCOMPAT_SUPP       (MDS_ROCOMPAT_LOVOBJID)
-
-#define MDS_INCOMPAT_SUPP       (0)
-
-/* Data stored per server at the head of the last_rcvd file.  In le32 order.
- * Try to keep this the same as fsd_server_data so we might one day merge. */
-struct mds_server_data {
-        __u8  msd_uuid[40];        /* server UUID */
-        __u64 msd_last_transno;    /* last completed transaction ID */
-        __u64 msd_mount_count;     /* MDS incarnation number */
-        __u64 msd_unused;
-        __u32 msd_feature_compat;  /* compatible feature flags */
-        __u32 msd_feature_rocompat;/* read-only compatible feature flags */
-        __u32 msd_feature_incompat;/* incompatible feature flags */
-        __u32 msd_server_size;     /* size of server data area */
-        __u32 msd_client_start;    /* start of per-client data area */
-        __u16 msd_client_size;     /* size of per-client data area */
-        __u16 msd_subdir_count;    /* number of subdirectories for objects */
-        __u64 msd_catalog_oid;     /* recovery catalog object id */
-        __u32 msd_catalog_ogen;    /* recovery catalog inode generation */
-        __u8  msd_peeruuid[40];    /* UUID of LOV/OSC associated with MDS */
-        __u8  msd_padding[MDS_LR_SERVER_SIZE - 140];
-};
-
-/* Data stored per client in the last_rcvd file.  In le32 order. */
-struct mds_client_data {
-        __u8 mcd_uuid[40];      /* client UUID */
-        __u64 mcd_last_transno; /* last completed transaction ID */
-        __u64 mcd_last_xid;     /* xid for the last transaction */
-        __u32 mcd_last_result;  /* result from last RPC */
-        __u32 mcd_last_data;    /* per-op data (disposition for open &c.) */
-        __u8 mcd_padding[MDS_LR_CLIENT_SIZE - 64];
-};
-
 /* file data for open files on MDS */
 struct mds_file_data {
         struct portals_handle mfd_handle; /* must be first */
@@ -134,9 +91,13 @@ struct mds_file_data {
 };
 
 /* ACL */
+#ifdef CONFIG_FS_POSIX_ACL
 #define LUSTRE_POSIX_ACL_MAX_ENTRIES    (32)
 #define LUSTRE_POSIX_ACL_MAX_SIZE       \
                 (xattr_acl_size(LUSTRE_POSIX_ACL_MAX_ENTRIES))
+#else
+#define LUSTRE_POSIX_ACL_MAX_SIZE       0
+#endif
 
 /* mds/mds_reint.c */
 int mds_reint_rec(struct mds_update_record *r, int offset,
@@ -149,7 +110,7 @@ int it_disposition(struct lookup_intent *it, int flag);
 void it_set_disposition(struct lookup_intent *it, int flag);
 int it_open_error(int phase, struct lookup_intent *it);
 void mdc_set_lock_data(__u64 *lockh, void *data);
-int mdc_change_cbdata(struct obd_export *exp, struct ll_fid *fid, 
+int mdc_change_cbdata(struct obd_export *exp, struct ll_fid *fid,
                       ldlm_iterator_t it, void *data);
 int mdc_intent_lock(struct obd_export *exp,
                     struct mdc_op_data *,
@@ -179,7 +140,7 @@ int mdc_getattr(struct obd_export *exp, struct ll_fid *fid,
                 obd_valid valid, unsigned int ea_size,
                 struct ptlrpc_request **request);
 int mdc_getattr_name(struct obd_export *exp, struct ll_fid *fid,
-                     char *filename, int namelen, unsigned long valid,
+                     const char *filename, int namelen, unsigned long valid,
                      unsigned int ea_size, struct ptlrpc_request **request);
 int mdc_setattr(struct obd_export *exp, struct mdc_op_data *data,
                 struct iattr *iattr, void *ea, int ealen, void *ea2, int ea2len,
index 86d9e81..bacef95 100644 (file)
@@ -44,7 +44,7 @@
 /* MD flags we _always_ use */
 #define PTLRPC_MD_OPTIONS  0
 
-/* Define maxima for bulk I/O 
+/* Define maxima for bulk I/O
  * CAVEAT EMPTOR, with multinet (i.e. routers forwarding between networks)
  * these limits are system wide and not interface-local. */
 #define PTLRPC_MAX_BRW_SIZE     LNET_MTU
@@ -84,7 +84,7 @@
  * considered full when less than ?_MAXREQSIZE is left in them.
  */
 
-#define LDLM_NUM_THREADS        min((int)(smp_num_cpus * smp_num_cpus * 8), 64)
+#define LDLM_NUM_THREADS min((int)(smp_num_cpus * smp_num_cpus * 8), 64)
 #define LDLM_NBUFS       64
 #define LDLM_BUFSIZE    (8 * 1024)
 #define LDLM_MAXREQSIZE (5 * 1024)
  *
  * MDS_MAXREQSIZE ~= 4736 bytes =
  * lustre_msg + ldlm_request + mds_body + mds_rec_create + FNAME_MAX + PATH_MAX
+ * MDS_MAXREPSIZE ~= 8300 bytes = lustre_msg + llog_header
+ * or, for mds_close() and mds_reint_unlink() on a many-OST filesystem:
+ *      = 9210 bytes = lustre_msg + mds_body + 160 * (easize + cookiesize)
  *
  * Realistic size is about 512 bytes (20 character name + 128 char symlink),
  * except in the open case where there are a large number of OSTs in a LOV.
  */
 #define MDS_MAXREQSIZE  (5 * 1024)
-#define MDS_MAXREPSIZE  (9 * 1024)
+#define MDS_MAXREPSIZE  max(9 * 1024, 280 + LOV_MAX_STRIPE_COUNT * 56)
 
-#define OST_MAX_THREADS 36UL
-#define OST_NUM_THREADS max(min_t(unsigned long, num_physpages / 8192, \
-                                  OST_MAX_THREADS), 2UL)
+#define OST_MAX_THREADS 512UL
+#define OST_DEF_THREADS max_t(unsigned long, 2, \
+                              (num_physpages >> (26-PAGE_SHIFT)) * smp_num_cpus)
 #define OST_NBUFS       (64 * smp_num_cpus)
 #define OST_BUFSIZE     (8 * 1024)
 /* OST_MAXREQSIZE ~= 4768 bytes =
 #define OST_MAXREQSIZE  (5 * 1024)
 #define OST_MAXREPSIZE  (9 * 1024)
 
-#define PTLBD_NUM_THREADS        4
-#define PTLBD_NBUFS      64
-#define PTLBD_BUFSIZE    (32 * 1024)
-#define PTLBD_MAXREQSIZE 1024
-
 struct ptlrpc_connection {
         struct list_head        c_link;
         lnet_nid_t              c_self;
@@ -281,7 +279,17 @@ struct ptlrpc_request {
         spinlock_t rq_lock;
         /* client-side flags */
         unsigned int rq_intr:1, rq_replied:1, rq_err:1,
-                rq_timedout:1, rq_resend:1, rq_restart:1, rq_replay:1,
+                rq_timedout:1, rq_resend:1, rq_restart:1,
+                /*
+                 * When ->rq_replay is set, the request is kept by the client
+                 * even after the server commits the corresponding transaction.
+                 * This is used for operations that require a sequence of
+                 * multiple requests to be replayed. The only example currently
+                 * is file open/close. When the last request in such a sequence
+                 * is committed, ->rq_replay is cleared on all requests in the
+                 * sequence.
+                 */
+                rq_replay:1,
                 rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1,
                 rq_no_delay:1, rq_net_err:1;
         enum rq_phase rq_phase; /* one of RQ_PHASE_* */
@@ -297,7 +305,7 @@ struct ptlrpc_request {
         int rq_reqlen;
         struct lustre_msg *rq_reqmsg;
 
-        int rq_timeout;                         /* seconds */
+        int rq_timeout;         /* time to wait for reply (seconds) */
         int rq_replen;
         struct lustre_msg *rq_repmsg;
         __u64 rq_transno;
@@ -321,7 +329,6 @@ struct ptlrpc_request {
         struct ptlrpc_reply_state *rq_reply_state;  /* separated reply state */
         struct ptlrpc_request_buffer_desc *rq_rqbd; /* incoming request buffer*/
 #if CRAY_XT3
-# error "Need to get the uid from the event?"
         __u32                rq_uid;            /* peer uid, used in MDS only */
 #endif
 
@@ -445,7 +452,7 @@ struct ptlrpc_bulk_desc {
 
         struct ptlrpc_cb_id    bd_cbid;         /* network callback info */
         lnet_handle_md_t        bd_md_h;         /* associated MD */
-        
+
 #if defined(__KERNEL__)
         lnet_kiov_t             bd_iov[0];
 #else
@@ -493,8 +500,8 @@ struct ptlrpc_service {
         int              srv_num_threads;       /* # threads to start/started */
         unsigned         srv_cpu_affinity:1;    /* bind threads to CPUs */
 
-        __u32 srv_req_portal;
-        __u32 srv_rep_portal;
+        __u32            srv_req_portal;
+        __u32            srv_rep_portal;
 
         int               srv_n_queued_reqs;    /* # reqs waiting to be served */
         struct list_head  srv_request_queue;    /* reqs waiting for service */
@@ -515,12 +522,14 @@ struct ptlrpc_service {
         struct list_head  srv_active_replies;   /* all the active replies */
         struct list_head  srv_reply_queue;      /* replies waiting for service */
 
-        cfs_waitq_t       srv_waitq; /* all threads sleep on this */
+        cfs_waitq_t       srv_waitq; /* all threads sleep on this. This
+                                      * wait-queue is signalled when a new
+                                      * incoming request arrives and when a
+                                      * difficult reply has to be handled. */
 
         struct list_head   srv_threads;
-        struct obd_device *srv_obddev;
         svc_handler_t      srv_handler;
-        
+
         char *srv_name;  /* only statically allocated strings here; we don't clean them */
 
         spinlock_t               srv_lock;
@@ -529,9 +538,9 @@ struct ptlrpc_service {
         struct lprocfs_stats    *srv_stats;
 
         /* List of free reply_states */
-        struct list_head srv_free_rs_list;
+        struct list_head         srv_free_rs_list;
         /* waitq to run, when adding stuff to srv_free_rs_list */
-        cfs_waitq_t srv_free_rs_waitq;
+        cfs_waitq_t              srv_free_rs_waitq;
         
         /*
          * if non-NULL called during thread creation (ptlrpc_start_thread())
@@ -549,7 +558,7 @@ struct ptlrpc_service {
 
 /* ptlrpc/events.c */
 extern lnet_handle_eq_t ptlrpc_eq_h;
-extern int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, 
+extern int ptlrpc_uuid_to_peer(struct obd_uuid *uuid,
                                lnet_process_id_t *peer, lnet_nid_t *self);
 extern void request_out_callback (lnet_event_t *ev);
 extern void reply_in_callback(lnet_event_t *ev);
@@ -575,7 +584,7 @@ void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc);
 int ptlrpc_register_bulk(struct ptlrpc_request *req);
 void ptlrpc_unregister_bulk (struct ptlrpc_request *req);
 
-static inline int ptlrpc_bulk_active (struct ptlrpc_bulk_desc *desc) 
+static inline int ptlrpc_bulk_active (struct ptlrpc_bulk_desc *desc)
 {
         unsigned long flags;
         int           rc;
@@ -590,8 +599,7 @@ int ptlrpc_send_reply(struct ptlrpc_request *req, int);
 int ptlrpc_reply(struct ptlrpc_request *req);
 int ptlrpc_error(struct ptlrpc_request *req);
 void ptlrpc_resend_req(struct ptlrpc_request *request);
-int ptl_send_rpc(struct ptlrpc_request *request);
-int ptl_send_rpc_nowait(struct ptlrpc_request *request);
+int ptl_send_rpc(struct ptlrpc_request *request, int noreply);
 int ptlrpc_register_rqbd (struct ptlrpc_request_buffer_desc *rqbd);
 
 /* ptlrpc/client.c */
@@ -605,7 +613,7 @@ ptlrpc_client_receiving_reply (struct ptlrpc_request *req)
 {
         unsigned long flags;
         int           rc;
-        
+
         spin_lock_irqsave(&req->rq_lock, flags);
         rc = req->rq_receiving_reply;
         spin_unlock_irqrestore(&req->rq_lock, flags);
@@ -617,7 +625,7 @@ ptlrpc_client_replied (struct ptlrpc_request *req)
 {
         unsigned long flags;
         int           rc;
-        
+
         spin_lock_irqsave(&req->rq_lock, flags);
         rc = req->rq_replied;
         spin_unlock_irqrestore(&req->rq_lock, flags);
@@ -655,10 +663,11 @@ void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool);
 void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq);
 struct ptlrpc_request_pool *ptlrpc_init_rq_pool(int, int,
                                                 void (*populate_pool)(struct ptlrpc_request_pool *, int));
-struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, int opcode,
-                                       int count, int *lengths, char **bufs);
-struct ptlrpc_request *ptlrpc_prep_req_pool(struct obd_import *imp, int opcode,
-                                            int count, int *lengths,
+struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, __u32 version,
+                                       int opcode, int count,
+                                       int *lengths, char **bufs);
+struct ptlrpc_request *ptlrpc_prep_req_pool(struct obd_import *imp, __u32 version,
+                                            int opcode, int count, int *lengths,
                                             char **bufs,
                                             struct ptlrpc_request_pool *pool);
 void ptlrpc_free_req(struct ptlrpc_request *request);
@@ -679,7 +688,7 @@ __u64 ptlrpc_sample_next_xid(void);
 __u64 ptlrpc_req_xid(struct ptlrpc_request *request);
 
 /* ptlrpc/service.c */
-void ptlrpc_save_lock (struct ptlrpc_request *req, 
+void ptlrpc_save_lock (struct ptlrpc_request *req,
                        struct lustre_handle *lock, int mode);
 void ptlrpc_commit_replies (struct obd_device *obd);
 void ptlrpc_schedule_difficult_reply (struct ptlrpc_reply_state *rs);
@@ -717,6 +726,7 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp);
 
 /* ptlrpc/pack_generic.c */
 int lustre_msg_swabbed(struct lustre_msg *msg);
+int lustre_msg_check_version(struct lustre_msg *msg, __u32 version);
 int lustre_pack_request(struct ptlrpc_request *, int count, int *lens,
                         char **bufs);
 int lustre_pack_reply(struct ptlrpc_request *, int count, int *lens,
@@ -782,6 +792,8 @@ static inline void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd) {}
 
 /* ptlrpc/llog_server.c */
 int llog_origin_handle_create(struct ptlrpc_request *req);
+int llog_origin_handle_destroy(struct ptlrpc_request *req);
+int llog_origin_handle_prev_block(struct ptlrpc_request *req);
 int llog_origin_handle_next_block(struct ptlrpc_request *req);
 int llog_origin_handle_read_header(struct ptlrpc_request *req);
 int llog_origin_handle_close(struct ptlrpc_request *req);
index 544d57d..7e2f3b3 100644 (file)
@@ -75,9 +75,8 @@ struct dquot_id {
 #define QFILE_RD_INFO           2
 #define QFILE_WR_INFO           3
 #define QFILE_INIT_INFO         4
-#define QFILE_GET_QIDS          5
-#define QFILE_RD_DQUOT          6
-#define QFILE_WR_DQUOT          7
+#define QFILE_RD_DQUOT          5
+#define QFILE_WR_DQUOT          6
 
 /* admin quotafile operations */
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
@@ -87,7 +86,7 @@ int lustre_write_quota_info(struct lustre_quota_info *lqi, int type);
 int lustre_read_dquot(struct lustre_dquot *dquot);
 int lustre_commit_dquot(struct lustre_dquot *dquot);
 int lustre_init_quota_info(struct lustre_quota_info *lqi, int type);
-int lustre_get_qids(struct lustre_quota_info *lqi, int type, 
+int lustre_get_qids(struct file *file, struct inode *inode, int type, 
                     struct list_head *list);
 #else
 
index 61f7a46..04aaf8d 100644 (file)
@@ -62,10 +62,7 @@ struct lov_oinfo {                 /* per-stripe data structure */
 
         unsigned loi_kms_valid:1;
         __u64 loi_kms;             /* known minimum size */
-        __u64 loi_rss;             /* recently seen size */
-        __u64 loi_mtime;           /* recently seen mtime */
-        __u64 loi_blocks;          /* recently seen blocks */
-
+        struct ost_lvb loi_lvb;
         struct osc_async_rc     loi_ar;
 };
 
@@ -82,30 +79,61 @@ static inline void loi_init(struct lov_oinfo *loi)
         CFS_INIT_LIST_HEAD(&loi->loi_read_item);
 }
 
+/* Extent array item describing the extent info of a joined file. */
+struct lov_extent {
+        __u64 le_start;            /* extent start */
+        __u64 le_len;              /* extent length */
+        int   le_loi_idx;          /* extent #1 loi's index in lsm loi array */
+        int   le_stripe_count;     /* extent stripe count */
+};
+
+/* LOV array info describing the array EA info of a joined file. */
+struct lov_array_info {
+        struct llog_logid    lai_array_id;    /* MDS med llog object id */
+        unsigned             lai_ext_count; /* number of extents */
+        struct lov_extent    *lai_ext_array; /* extent desc array */
+};
+
 struct lov_stripe_md {
         spinlock_t       lsm_lock;
         void            *lsm_lock_owner; /* debugging */
 
-        /* Public members. */
-        __u64 lsm_object_id;        /* lov object id */
-        __u64 lsm_object_gr;        /* lov object id */
-        __u64 lsm_maxbytes;         /* maximum possible file size */
-        unsigned long lsm_xfersize; /* optimal transfer size */
-
-        /* LOV-private members start here -- only for use in lov/. */
-        __u32 lsm_magic;
-        __u32 lsm_stripe_size;      /* size of the stripe */
-        __u32 lsm_pattern;          /* striping pattern (RAID0, RAID1) */
-        unsigned lsm_stripe_count;  /* number of objects being striped over */
+        struct {
+                /* Public members. */
+                __u64 lw_object_id;        /* lov object id */
+                __u64 lw_object_gr;        /* lov object group */
+                __u64 lw_maxbytes;         /* maximum possible file size */
+                unsigned long lw_xfersize; /* optimal transfer size */
+
+                /* LOV-private members start here -- only for use in lov/. */
+                __u32 lw_magic;
+                __u32 lw_stripe_size;      /* size of the stripe */
+                __u32 lw_pattern;          /* striping pattern (RAID0, RAID1) */
+                unsigned lw_stripe_count;  /* number of objects being striped over */
+        } lsm_wire;
+
+        struct lov_array_info *lsm_array; /*Only for joined file array info*/
         struct lov_oinfo lsm_oinfo[0];
 };
 
-/* compare all fields except for semaphore */
+#define lsm_object_id    lsm_wire.lw_object_id
+#define lsm_object_gr    lsm_wire.lw_object_gr
+#define lsm_maxbytes     lsm_wire.lw_maxbytes
+#define lsm_xfersize     lsm_wire.lw_xfersize
+#define lsm_magic        lsm_wire.lw_magic
+#define lsm_stripe_size  lsm_wire.lw_stripe_size
+#define lsm_pattern      lsm_wire.lw_pattern
+#define lsm_stripe_count lsm_wire.lw_stripe_count
+
+/* compare all relevant fields. */
 static inline int lov_stripe_md_cmp(struct lov_stripe_md *m1,
                                     struct lov_stripe_md *m2)
 {
-        return memcmp(&m1->lsm_object_id, &m2->lsm_object_id,
-                      (char *)&m2->lsm_oinfo[0] - (char *)&m2->lsm_object_id);
+        /*
+         * ->lsm_wire contains padding, but it should be zeroed out during
+         * allocation.
+         */
+        return memcmp(&m1->lsm_wire, &m2->lsm_wire, sizeof m1->lsm_wire);
 }
 
 void lov_stripe_lock(struct lov_stripe_md *md);
@@ -135,7 +163,7 @@ enum async_flags {
                                      or cancel the size of the io */
         ASYNC_GROUP_SYNC = 0x8,  /* ap_completion will not be called, instead
                                     the page is accounted for in the
-                                    obd_io_group given to 
+                                    obd_io_group given to
                                     obd_queue_group_io */
 };
 
@@ -163,7 +191,7 @@ struct obd_io_group {
 struct oig_callback_context {
         struct list_head occ_oig_item;
         /* called when the caller has received a signal while sleeping.
-         * callees of this method are encouraged to abort their state 
+         * callees of this method are encouraged to abort their state
          * in the oig.  This may be called multiple times. */
         void (*occ_interrupted)(struct oig_callback_context *occ);
         unsigned int interrupted:1;
@@ -203,22 +231,16 @@ struct filter_obd {
         cfs_dentry_t        *fo_dentry_O;
         cfs_dentry_t       **fo_dentry_O_groups;
         cfs_dentry_t       **fo_dentry_O_sub;
-        spinlock_t           fo_objidlock;      /* protect fo_lastobjid
-                                                 * increment */
-        
-        spinlock_t           fo_translock;      /* protect fsd_last_rcvd
-                                                 * increment */
-        
+        spinlock_t           fo_objidlock;      /* protect fo_lastobjid */
+        spinlock_t           fo_translock;      /* protect fsd_last_transno */
         struct file         *fo_rcvd_filp;
+        struct file         *fo_health_check_filp;
         struct filter_server_data *fo_fsd;
         unsigned long       *fo_last_rcvd_slots;
         __u64                fo_mount_count;
 
         int                  fo_destroy_in_progress;
-
-        struct file_operations *fo_fop;
-        struct inode_operations *fo_iop;
-        struct address_space_operations *fo_aops;
+        struct semaphore     fo_create_lock;
 
         struct list_head     fo_export_list;
         int                  fo_subdir_count;
@@ -232,11 +254,9 @@ struct filter_obd {
         struct obd_import   *fo_mdc_imp;
         struct obd_uuid      fo_mdc_uuid;
         struct lustre_handle fo_mdc_conn;
-#if 0
-        struct ptlrpc_client fo_mdc_client;
-#endif
         struct file        **fo_last_objid_files;
-        __u64               *fo_last_objids; /* last created objid for groups */
+        __u64               *fo_last_objids; /* last created objid for groups,
+                                              * protected by fo_objidlock */
 
         struct semaphore     fo_alloc_lock;
 
@@ -255,11 +275,8 @@ struct filter_obd {
          *
          * Locking: none, each OST thread uses only one element, determined by
          * its "ordinal number", ->t_id.
-         *
-         * This is (void *) array, because 2.4 and 2.6 use different iobuf
-         * structures.
          */
-        void                   **fo_iobuf_pool;
+        struct filter_iobuf    **fo_iobuf_pool;
         int                      fo_iobuf_count;
 
         struct obd_histogram     fo_r_pages;
@@ -278,19 +295,14 @@ struct filter_obd {
         struct lustre_quota_ctxt fo_quota_ctxt;
         spinlock_t               fo_quotacheck_lock;
         atomic_t                 fo_quotachecking;
-
-        /* objids black list stuff. See for detailed comment in
-         * filter_clear_orphans() */
-        struct filter_ext       *fo_blacklist;
-        spinlock_t               fo_blacklist_lock;
 };
 
 struct mds_server_data;
 
 #define OSC_MAX_RIF_DEFAULT       8
-#define OSC_MAX_RIF_MAX          64
-#define OSC_MAX_DIRTY_DEFAULT    32
-#define OSC_MAX_DIRTY_MB_MAX    512     /* totally arbitrary */
+#define OSC_MAX_RIF_MAX         256
+#define OSC_MAX_DIRTY_DEFAULT  (OSC_MAX_RIF_DEFAULT * 4)
+#define OSC_MAX_DIRTY_MB_MAX   2048     /* totally arbitrary */
 
 struct mdc_rpc_lock;
 struct client_obd {
@@ -399,6 +411,7 @@ struct mds_obd {
         obd_id                          *mds_lov_objids;
         int                              mds_lov_nextid_set;
         struct file                     *mds_lov_objid_filp;
+        struct file                     *mds_health_check_filp;
         unsigned long                   *mds_client_bitmap;
         struct semaphore                 mds_orphan_recovery_sem;
         struct upcall_cache             *mds_group_hash;
@@ -419,37 +432,10 @@ struct echo_obd {
         atomic_t             eo_prep;
 };
 
-/*
- * this struct does double-duty acting as either a client or
- * server instance .. maybe not wise.
- */
-struct ptlbd_obd {
-        /* server's */
-        struct ptlrpc_service *ptlbd_service;
-        struct file *filp;
-        /* client's */
-        struct ptlrpc_client    bd_client;
-        struct obd_import       *bd_import;
-        struct obd_uuid         bd_server_uuid;
-        struct obd_export       *bd_exp;
-        int refcount; /* XXX sigh */
-};
-
-struct recovd_obd {
-        spinlock_t            recovd_lock;
-        struct list_head      recovd_managed_items; /* items managed  */
-        struct list_head      recovd_troubled_items; /* items in recovery */
-
-        cfs_waitq_t           recovd_recovery_waitq;
-        cfs_waitq_t           recovd_ctl_waitq;
-        cfs_waitq_t           recovd_waitq;
-        cfs_task_t           *recovd_thread;
-        __u32                 recovd_state;
-};
-
 struct ost_obd {
         struct ptlrpc_service *ost_service;
         struct ptlrpc_service *ost_create_service;
+        struct ptlrpc_service *ost_io_service;
         struct semaphore       ost_health_sem;
 };
 
@@ -461,27 +447,23 @@ struct echo_client_obd {
         __u64                ec_unique;
 };
 
-struct cache_obd {
-        struct obd_export *cobd_target_exp;/* local connection to target obd */
-        struct obd_export *cobd_cache_exp; /* local connection to cache obd */
-};
-
 struct lov_tgt_desc {
         struct obd_uuid          uuid;
         __u32                    ltd_gen;
         struct obd_export       *ltd_exp;
-        int                      active; /* is this target up for requests */
-        int                      index;  /* index of target array in lov_obd */
-        struct list_head         qos_bavail_list; /* link entry to lov_obd */
+        unsigned int             active:1, /* is this target up for requests */
+                                 reap:1;   /* should this target be deleted */
 };
 
 struct lov_obd {
-        spinlock_t lov_lock;
+        struct semaphore lov_lock;
+        atomic_t refcount;
         struct lov_desc desc;
         int bufsize;
-        int refcount;
+        int connects;
+        int death_row;      /* Do we have tgts scheduled to be deleted?
+                               (Make this a linked list?) */
         unsigned int lo_catalog_loaded:1;
-        struct list_head qos_bavail_list; /* tgts list, sorted by available space, protected by lov_lock */
         struct lov_tgt_desc *tgts;
 };
 
@@ -513,9 +495,24 @@ struct obd_trans_info {
         int                      oti_numcookies;
 
         /* initial thread handling transaction */
-        struct ptlrpc_thread    *oti_thread; 
+        int                      oti_thread_id;
 };
 
+/*
+ * Reset @oti to all zeroes and, if a request is given, seed it from @req:
+ * the transno is copied from the reply message when both the reply and the
+ * request message are present, and the id of the service thread attached
+ * to the request is recorded (-1 when there is none).
+ * A NULL @oti is ignored; a NULL @req leaves @oti zeroed.
+ */
+static inline void oti_init(struct obd_trans_info *oti,
+                            struct ptlrpc_request *req)
+{
+        if (oti == NULL)
+                return;
+
+        memset(oti, 0, sizeof(*oti));
+        if (req == NULL)
+                return;
+
+        if (req->rq_repmsg != NULL && req->rq_reqmsg != NULL)
+                oti->oti_transno = req->rq_repmsg->transno;
+
+        if (req->rq_svc_thread != NULL)
+                oti->oti_thread_id = req->rq_svc_thread->t_id;
+        else
+                oti->oti_thread_id = -1;
+}
+
 static inline void oti_alloc_cookies(struct obd_trans_info *oti,int num_cookies)
 {
         if (!oti)
@@ -558,9 +555,33 @@ enum llog_ctxt_id {
         LLOG_RD1_REPL_CTXT     =  9,
         LLOG_TEST_ORIG_CTXT    = 10,
         LLOG_TEST_REPL_CTXT    = 11,
+        LLOG_LOVEA_ORIG_CTXT   = 12,
+        LLOG_LOVEA_REPL_CTXT   = 13,
         LLOG_MAX_CTXTS
 };
 
+/*
+ * Events signalled through obd_notify() upcall-chain.
+ */
+enum obd_notify_event {
+        /* Device activated */
+        OBD_NOTIFY_ACTIVE,
+        /* Device deactivated */
+        OBD_NOTIFY_INACTIVE,
+        /* Connect data for import were changed */
+        OBD_NOTIFY_OCD
+};
+
+/*
+ * Data structure used to pass obd_notify()-event to non-obd listeners (llite
+ * and liblustre being main examples).
+ */
+struct obd_notify_upcall {
+        int (*onu_upcall)(struct obd_device *host, struct obd_device *watched,
+                          enum obd_notify_event ev, void *owner);
+        /* Opaque datum supplied by upper layer listener */
+        void *onu_owner;
+};
 
 /* corresponds to one of the obd's */
 struct obd_device {
@@ -592,6 +613,7 @@ struct obd_device {
         struct lvfs_run_ctxt    obd_lvfs_ctxt;
         struct llog_ctxt        *obd_llog_ctxt[LLOG_MAX_CTXTS];
         struct obd_device       *obd_observer;
+        struct obd_notify_upcall obd_upcall;
         struct obd_export       *obd_self_export;
         /* list of exports in LRU order, for ping evictor, with obd_dev_lock */
         struct list_head        obd_exports_timed;
@@ -624,12 +646,9 @@ struct obd_device {
                 struct ost_obd ost;
                 struct echo_client_obd echo_client;
                 struct echo_obd echo;
-                struct recovd_obd recovd;
                 struct lov_obd lov;
-                struct cache_obd cobd;
-                struct ptlbd_obd ptlbd;
         } u;
-       /* Fields used by LProcFS */
+        /* Fields used by LProcFS */
         unsigned int           obd_cntr_base;
         struct lprocfs_stats  *obd_stats;
         cfs_proc_dir_entry_t  *obd_svc_procroot;
@@ -641,6 +660,18 @@ struct obd_device {
 
 #define OBD_LLOG_FL_SENDNOW     0x0001
 
+/* Special case hack for MDS LOVs */
+#define OBD_CLEANUP_EARLY       0
+/* Precleanup stage 1, we must make sure all exports (other than the
+   self-export) get destroyed. */
+#define OBD_CLEANUP_EXPORTS     1
+/* Precleanup stage 2,  do other type-specific cleanup requiring the
+   self-export. */
+#define OBD_CLEANUP_SELF_EXP    2
+/* FIXME we should eliminate the "precleanup" function and make them stages
+   of the "cleanup" function. */
+#define OBD_CLEANUP_OBD         3
+
 struct obd_ops {
         struct module *o_owner;
         int (*o_iocontrol)(unsigned int cmd, struct obd_export *exp, int len,
@@ -666,6 +697,9 @@ struct obd_ops {
          * asked for. If @ocd == NULL, use default parameters. */
         int (*o_connect)(struct lustre_handle *conn, struct obd_device *src,
                          struct obd_uuid *cluuid, struct obd_connect_data *ocd);
+        int (*o_reconnect)(struct obd_export *exp, struct obd_device *src,
+                           struct obd_uuid *cluuid,
+                           struct obd_connect_data *ocd);
         int (*o_disconnect)(struct obd_export *exp);
 
         int (*o_statfs)(struct obd_device *obd, struct obd_statfs *osfs,
@@ -674,12 +708,15 @@ struct obd_ops {
                         struct lov_stripe_md *mem_src);
         int (*o_unpackmd)(struct obd_export *exp,struct lov_stripe_md **mem_tgt,
                           struct lov_mds_md *disk_src, int disk_len);
+        int (*o_checkmd)(struct obd_export *exp, struct obd_export *md_exp,
+                         struct lov_stripe_md *mem_tgt);
         int (*o_preallocate)(struct lustre_handle *, obd_count *req,
                              obd_id *ids);
         int (*o_create)(struct obd_export *exp,  struct obdo *oa,
                         struct lov_stripe_md **ea, struct obd_trans_info *oti);
         int (*o_destroy)(struct obd_export *exp, struct obdo *oa,
-                         struct lov_stripe_md *ea, struct obd_trans_info *oti);
+                         struct lov_stripe_md *ea, struct obd_trans_info *oti,
+                         struct obd_export *md_exp);
         int (*o_setattr)(struct obd_export *exp, struct obdo *oa,
                          struct lov_stripe_md *ea, struct obd_trans_info *oti);
         int (*o_setattr_async)(struct obd_export *exp, struct obdo *oa,
@@ -696,26 +733,26 @@ struct obd_ops {
                            struct lov_stripe_md *ea, obd_count oa_bufs,
                            struct brw_page *pgarr, struct ptlrpc_request_set *,
                            struct obd_trans_info *oti);
-        int (*o_prep_async_page)(struct obd_export *exp, 
+        int (*o_prep_async_page)(struct obd_export *exp,
                                  struct lov_stripe_md *lsm,
-                                 struct lov_oinfo *loi, 
+                                 struct lov_oinfo *loi,
                                  cfs_page_t *page, obd_off offset, 
                                  struct obd_async_page_ops *ops, void *data,
                                  void **res);
-        int (*o_queue_async_io)(struct obd_export *exp, 
-                                struct lov_stripe_md *lsm, 
-                                struct lov_oinfo *loi, void *cookie, 
-                                int cmd, obd_off off, int count, 
+        int (*o_queue_async_io)(struct obd_export *exp,
+                                struct lov_stripe_md *lsm,
+                                struct lov_oinfo *loi, void *cookie,
+                                int cmd, obd_off off, int count,
                                 obd_flag brw_flags, obd_flag async_flags);
-        int (*o_queue_group_io)(struct obd_export *exp, 
-                                struct lov_stripe_md *lsm, 
-                                struct lov_oinfo *loi, 
-                                struct obd_io_group *oig, 
-                                void *cookie, int cmd, obd_off off, int count, 
+        int (*o_queue_group_io)(struct obd_export *exp,
+                                struct lov_stripe_md *lsm,
+                                struct lov_oinfo *loi,
+                                struct obd_io_group *oig,
+                                void *cookie, int cmd, obd_off off, int count,
                                 obd_flag brw_flags, obd_flag async_flags);
-        int (*o_trigger_group_io)(struct obd_export *exp, 
-                                  struct lov_stripe_md *lsm, 
-                                  struct lov_oinfo *loi, 
+        int (*o_trigger_group_io)(struct obd_export *exp,
+                                  struct lov_stripe_md *lsm,
+                                  struct lov_oinfo *loi,
                                   struct obd_io_group *oig);
         int (*o_set_async_flags)(struct obd_export *exp,
                                 struct lov_stripe_md *lsm,
@@ -724,6 +761,8 @@ struct obd_ops {
         int (*o_teardown_async_page)(struct obd_export *exp,
                                      struct lov_stripe_md *lsm,
                                      struct lov_oinfo *loi, void *cookie);
+        int (*o_merge_lvb)(struct obd_export *exp, struct lov_stripe_md *lsm,
+                           struct ost_lvb *lvb, int kms_only);
         int (*o_adjust_kms)(struct obd_export *exp, struct lov_stripe_md *lsm,
                             obd_off size, int shrink);
         int (*o_punch)(struct obd_export *exp, struct obdo *oa,
@@ -762,7 +801,7 @@ struct obd_ops {
                         __u32 mode, struct lustre_handle *);
         int (*o_cancel_unused)(struct obd_export *, struct lov_stripe_md *,
                                int flags, void *opaque);
-        int (*o_join_lru)(struct obd_export *, struct lov_stripe_md *, 
+        int (*o_join_lru)(struct obd_export *, struct lov_stripe_md *,
                          int join);
         int (*o_san_preprw)(int cmd, struct obd_export *exp,
                             struct obdo *oa, int objcount,
@@ -785,7 +824,7 @@ struct obd_ops {
                               enum obd_import_event);
 
         int (*o_notify)(struct obd_device *obd, struct obd_device *watched,
-                        int active);
+                        enum obd_notify_event ev);
 
         int (*o_health_check)(struct obd_device *);
 
@@ -793,7 +832,7 @@ struct obd_ops {
         int (*o_quotacheck)(struct obd_export *, struct obd_quotactl *);
         int (*o_quotactl)(struct obd_export *, struct obd_quotactl *);
 
-        /* 
+        /*
          * NOTE: If adding ops, add another LPROCFS_OBD_OP_INIT() line
          * to lprocfs_alloc_obd_stats() in obdclass/lprocfs_status.c.
          * Also, add a wrapper function in include/linux/obd_class.h.
@@ -804,6 +843,39 @@ struct obd_ops {
          */
 };
 
+struct lsm_operations { /* per-magic method table, see lsm_op_find() */
+        void (*lsm_free)(struct lov_stripe_md *);
+        int (*lsm_destroy)(struct lov_stripe_md *, struct obdo *oa,
+                           struct obd_export *md_exp);
+        void (*lsm_stripe_by_index)(struct lov_stripe_md *, int *, obd_off *,
+                                     unsigned long *);
+        void (*lsm_stripe_by_offset)(struct lov_stripe_md *, int *, obd_off *,
+                                     unsigned long *);
+        obd_off (*lsm_stripe_offset_by_index)(struct lov_stripe_md *, int);
+        int (*lsm_stripe_index_by_offset)(struct lov_stripe_md *, obd_off);
+        int (*lsm_revalidate) (struct lov_stripe_md *, struct obd_device *obd);
+        int (*lsm_lmm_verify) (struct lov_mds_md *lmm, int lmm_bytes,
+                               int *stripe_count);
+        int (*lsm_unpackmd) (struct lov_obd *lov, struct lov_stripe_md *lsm,
+                             struct lov_mds_md *lmm);
+};
+
+extern struct lsm_operations lsm_plain_ops;
+extern struct lsm_operations lsm_join_ops;
+/*
+ * Map a stripe-md magic to its method table.  Returns &lsm_plain_ops for
+ * LOV_MAGIC and &lsm_join_ops for LOV_MAGIC_JOIN; any other value is logged
+ * as an error and NULL is returned, so callers must check the result.
+ */
+static inline struct lsm_operations *lsm_op_find(int magic)
+{
+        switch(magic) {
+        case LOV_MAGIC:
+                return &lsm_plain_ops;
+        case LOV_MAGIC_JOIN:
+                return &lsm_join_ops;
+        default:
+                /* terminate the message so it does not run into the next
+                 * log record */
+                CERROR("Cannot recognize lsm_magic %d\n", magic);
+                return NULL;
+        }
+}
+
+int lvfs_check_io_health(struct obd_device *obd, struct file *file);
 
 static inline void obd_transno_commit_cb(struct obd_device *obd, __u64 transno,
                                          int error)
index 0692bf0..af2be21 100644 (file)
@@ -405,6 +405,20 @@ static inline int obd_free_memmd(struct obd_export *exp,
         return obd_unpackmd(exp, mem_tgt, NULL, 0);
 }
 
+static inline int obd_checkmd(struct obd_export *exp,
+                              struct obd_export *md_exp,
+                              struct lov_stripe_md *mem_tgt)
+{
+        int rc; /* result of the checkmd method of exp->exp_obd */
+        ENTRY;
+
+        EXP_CHECK_OP(exp, checkmd); /* TODO confirm: bails if op unset */
+        OBD_COUNTER_INCREMENT(exp->exp_obd, checkmd);
+
+        rc = OBP(exp->exp_obd, checkmd)(exp, md_exp, mem_tgt);
+        RETURN(rc);
+}
+
 static inline int obd_create(struct obd_export *exp, struct obdo *obdo,
                              struct lov_stripe_md **ea,
                              struct obd_trans_info *oti)
@@ -421,7 +435,8 @@ static inline int obd_create(struct obd_export *exp, struct obdo *obdo,
 
 static inline int obd_destroy(struct obd_export *exp, struct obdo *obdo,
                               struct lov_stripe_md *ea,
-                              struct obd_trans_info *oti)
+                              struct obd_trans_info *oti,
+                              struct obd_export *md_exp)
 {
         int rc;
         ENTRY;
@@ -429,7 +444,7 @@ static inline int obd_destroy(struct obd_export *exp, struct obdo *obdo,
         EXP_CHECK_OP(exp, destroy);
         OBD_COUNTER_INCREMENT(exp->exp_obd, destroy);
 
-        rc = OBP(exp->exp_obd, destroy)(exp, obdo, ea, oti);
+        rc = OBP(exp->exp_obd, destroy)(exp, obdo, ea, oti, md_exp);
         RETURN(rc);
 }
 
@@ -537,6 +552,26 @@ static inline int obd_connect(struct lustre_handle *conn, struct obd_device *obd
         RETURN(rc);
 }
 
+static inline int obd_reconnect(struct obd_export *exp,
+                                struct obd_device *obd,
+                                struct obd_uuid *cluuid,
+                                struct obd_connect_data *d)
+{
+        int rc; /* result of the reconnect method of @obd */
+        __u64 ocf = d ? d->ocd_connect_flags : 0; /* flags requested on entry */
+        ENTRY;
+
+        OBD_CHECK_DEV_ACTIVE(obd);
+        OBD_CHECK_OP(obd, reconnect, 0); /* TODO confirm: 0 if op unset */
+        OBD_COUNTER_INCREMENT(obd, reconnect);
+
+        rc = OBP(obd, reconnect)(exp, obd, cluuid, d);
+        /* post-condition: granted flags are a subset of requested (ocf) */
+        LASSERT(ergo(d != NULL,
+                     (d->ocd_connect_flags & ocf) == d->ocd_connect_flags));
+        RETURN(rc);
+}
+
 static inline int obd_disconnect(struct obd_export *exp)
 {
         int rc;
@@ -820,10 +855,19 @@ static inline int obd_commitrw(int cmd, struct obd_export *exp, struct obdo *oa,
         RETURN(rc);
 }
 
-/* b1_4_bug5047 has changes to make this an obd_merge_lvb() method */
-__u64 lov_merge_size(struct lov_stripe_md *lsm, int kms_only);
-__u64 lov_merge_blocks(struct lov_stripe_md *lsm);
-__u64 lov_merge_mtime(struct lov_stripe_md *lsm, __u64 current_time);
+static inline int obd_merge_lvb(struct obd_export *exp,
+                                struct lov_stripe_md *lsm,
+                                struct ost_lvb *lvb, int kms_only)
+{
+        int rc; /* result of the merge_lvb method of exp->exp_obd */
+        ENTRY;
+        /* NOTE(review): presumably yields -EOPNOTSUPP if the op is unset */
+        OBD_CHECK_OP(exp->exp_obd, merge_lvb, -EOPNOTSUPP);
+        OBD_COUNTER_INCREMENT(exp->exp_obd, merge_lvb);
+
+        rc = OBP(exp->exp_obd, merge_lvb)(exp, lsm, lvb, kms_only);
+        RETURN(rc);
+}
 
 static inline int obd_adjust_kms(struct obd_export *exp,
                                  struct lov_stripe_md *lsm, obd_off size,
@@ -1003,7 +1047,7 @@ static inline void obd_import_event(struct obd_device *obd,
 
 static inline int obd_notify(struct obd_device *obd,
                              struct obd_device *watched,
-                             int active)
+                             enum obd_notify_event ev)
 {
         ENTRY;
         OBD_CHECK_DEV(obd);
@@ -1018,9 +1062,34 @@ static inline int obd_notify(struct obd_device *obd,
         }
 
         OBD_COUNTER_INCREMENT(obd, notify);
-        RETURN(OBP(obd, notify)(obd, watched, active));
+        RETURN(OBP(obd, notify)(obd, watched, ev));
 }
 
+/*
+ * Deliver event @ev about @observed to everything listening on @observer:
+ * first its obd observer (through obd_notify()), then its non-obd upcall,
+ * if either is set.  Both listeners are always called; the observer's
+ * result is returned when it is non-zero, otherwise the upcall's result.
+ */
+static inline int obd_notify_observer(struct obd_device *observer,
+                                      struct obd_device *observed,
+                                      enum obd_notify_event ev)
+{
+        struct obd_notify_upcall *onu = &observer->obd_upcall;
+        int rc1 = 0;
+        int rc2 = 0;
+
+        if (observer->obd_observer)
+                rc1 = obd_notify(observer->obd_observer, observed, ev);
+
+        /*
+         * Also, call non-obd listener, if any
+         */
+        if (onu->onu_upcall != NULL)
+                rc2 = onu->onu_upcall(observer, observed, ev, onu->onu_owner);
+
+        return rc1 != 0 ? rc1 : rc2;
+}
+
 static inline int obd_quotacheck(struct obd_export *exp,
                                  struct obd_quotactl *oqctl)
 {
index 467dbf3..a938d31 100644 (file)
@@ -32,7 +32,7 @@ extern unsigned int obd_fail_loc;
 extern unsigned int obd_dump_on_timeout;
 extern unsigned int obd_timeout;          /* seconds */
 #define PING_INTERVAL max(obd_timeout / 4, 1U)
-#define STATFS_INTERVAL max(obd_timeout / 20, 1U)
+#define RECONNECT_INTERVAL max(obd_timeout / 10, 10U)
 extern unsigned int ldlm_timeout;
 extern unsigned int obd_health_check_timeout;
 extern char obd_lustre_upcall[128];
@@ -163,9 +163,6 @@ extern cfs_waitq_t obd_race_waitq;
 
 #define OBD_FAIL_MDC_REVALIDATE_PAUSE    0x800
 
-#define OBD_FAIL_OST_CROW_EIO            0x801
-#define OBD_FAIL_OST_CLEAR_ORPHANS_RACE  0x802
-
 /* preparation for a more advanced failure testbed (not functional yet) */
 #define OBD_FAIL_MASK_SYS    0x0000FF00
 #define OBD_FAIL_MASK_LOC    (0x000000FF | OBD_FAIL_MASK_SYS)
index 0357610..c7fd393 100644 (file)
@@ -37,6 +37,7 @@
 #include <lustre_mds.h> /* for LUSTRE_MDC_NAME */
 #include <lustre_dlm.h>
 #include <lustre_net.h>
+#include <linux/lustre_ver.h>
 
 /* @priority: if non-zero, move the selected to the list head
  * @create: if zero, only search in existed connections
@@ -200,7 +201,7 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf)
         /* In a more perfect world, we would hang a ptlrpc_client off of
          * obd_type and just use the values from there. */
         if (!strcmp(name, LUSTRE_OSC_NAME)) {
-                rq_portal = OST_REQUEST_PORTAL;
+                rq_portal = OST_IO_PORTAL;
                 rp_portal = OSC_REPLY_PORTAL;
                 connect_op = OST_CONNECT;
         } else if (!strcmp(name, LUSTRE_MDC_NAME)) {
@@ -261,7 +262,7 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf)
         if (num_physpages >> (20 - PAGE_SHIFT) <= 128) { /* <= 128 MB */
                 cli->cl_max_pages_per_rpc = PTLRPC_MAX_BRW_PAGES / 4;
                 cli->cl_max_rpcs_in_flight = OSC_MAX_RIF_DEFAULT / 4;
-        } else if (num_physpages >> (20 - PAGE_SHIFT) <= 512) { /* <= 512 MB */
+        } else if (num_physpages >> (20 - PAGE_SHIFT) <= 256) { /* <= 256 MB */
                 cli->cl_max_pages_per_rpc = PTLRPC_MAX_BRW_PAGES / 2;
                 cli->cl_max_rpcs_in_flight = OSC_MAX_RIF_DEFAULT / 2;
         } else {
@@ -376,8 +377,10 @@ int client_connect_import(struct lustre_handle *dlm_handle,
                 GOTO(out_ldlm, rc);
 
         ocd = &imp->imp_connect_data;
-        if (data)
+        if (data) {
                 *ocd = *data;
+                imp->imp_connect_flags_orig = data->ocd_connect_flags;
+        }
 
         rc = ptlrpc_connect_import(imp, NULL);
         if (rc != 0) {
@@ -413,8 +416,8 @@ out_sem:
 int client_disconnect_export(struct obd_export *exp)
 {
         struct obd_device *obd = class_exp2obd(exp);
-        struct client_obd *cli = &obd->u.cli;
-        struct obd_import *imp = cli->cl_import;
+        struct client_obd *cli;
+        struct obd_import *imp;
         int rc = 0, err;
         ENTRY;
 
@@ -424,6 +427,9 @@ int client_disconnect_export(struct obd_export *exp)
                 RETURN(-EINVAL);
         }
 
+        cli = &obd->u.cli;
+        imp = cli->cl_import;
+
         mutex_down(&cli->cl_sem);
         if (!cli->cl_conn_count) {
                 CERROR("disconnecting disconnected device (%s)\n",
@@ -480,7 +486,8 @@ int target_handle_reconnect(struct lustre_handle *conn, struct obd_export *exp,
                         CWARN("%s reconnecting\n", cluuid->uuid);
                         conn->cookie = exp->exp_handle.h_cookie;
                         /* target_handle_connect() treats EALREADY and
-                         * -EALREADY differently */
+                         * -EALREADY differently.  EALREADY means we are
+                         * doing a valid reconnect from the same client. */
                         RETURN(EALREADY);
                 } else {
                         CERROR("%s reconnecting from %s, "
@@ -490,15 +497,15 @@ int target_handle_reconnect(struct lustre_handle *conn, struct obd_export *exp,
                                hdl->cookie, conn->cookie);
                         memset(conn, 0, sizeof *conn);
                         /* target_handle_connect() treats EALREADY and
-                         * -EALREADY differently */
+                         * -EALREADY differently.  -EALREADY is an error
+                         * (same UUID, different handle). */
                         RETURN(-EALREADY);
                 }
         }
 
         conn->cookie = exp->exp_handle.h_cookie;
-        CDEBUG(D_INFO, "existing export for UUID '%s' at %p\n",
-               cluuid->uuid, exp);
-        CDEBUG(D_IOCTL, "connect: cookie "LPX64"\n", conn->cookie);
+        CDEBUG(D_HA, "connect export for UUID '%s' at %p, cookie "LPX64"\n",
+               cluuid->uuid, exp, conn->cookie);
         RETURN(0);
 }
 
@@ -530,9 +537,8 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
 
         obd_str2uuid (&tgtuuid, str);
         target = class_uuid2obd(&tgtuuid);
-        if (!target) {
+        if (!target)
                 target = class_name2obd(str);
-        }
 
         if (!target || target->obd_stopping || !target->obd_set_up) {
                 DEBUG_REQ(D_ERROR, req, "UUID '%s' is not available "
@@ -583,6 +589,31 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
         if (rc)
                 GOTO(out, rc);
 
+        if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) {
+                if (!data) {
+                        DEBUG_REQ(D_INFO, req, "Refusing old (unversioned) "
+                                  "libclient connection attempt\n");
+                        GOTO(out, rc = -EPROTO);
+                } else if (data->ocd_version < LUSTRE_VERSION_CODE -
+                                               LUSTRE_VERSION_ALLOWED_OFFSET) {
+                        DEBUG_REQ(D_INFO, req, "Refusing old (%d.%d.%d.%d) "
+                                  "libclient connection attempt\n",
+                                  OBD_OCD_VERSION_MAJOR(data->ocd_version),
+                                  OBD_OCD_VERSION_MINOR(data->ocd_version),
+                                  OBD_OCD_VERSION_PATCH(data->ocd_version),
+                                  OBD_OCD_VERSION_FIX(data->ocd_version));
+                        data = lustre_msg_buf(req->rq_repmsg, 0,
+                                              offsetof(typeof(*data),
+                                                       ocd_version) +
+                                              sizeof(data->ocd_version));
+                        if (data) {
+                                data->ocd_connect_flags = OBD_CONNECT_VERSION;
+                                data->ocd_version = LUSTRE_VERSION_CODE;
+                        }
+                        GOTO(out, rc = -EPROTO);
+                }
+        }
+
         /* lctl gets a backstage, all-access pass. */
         if (obd_uuid_equals(&cluuid, &target->obd_uuid))
                 goto dont_check_exports;
@@ -603,11 +634,18 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
         if (!export) {
                 spin_unlock(&target->obd_dev_lock);
         } else if (req->rq_reqmsg->conn_cnt == 1) {
-                CERROR("%s reconnected with 1 conn_cnt; cookies not random?\n",
-                       cluuid.uuid);
+                CERROR("%s: NID %s (%s) reconnected with 1 conn_cnt; "
+                       "cookies not random?\n", target->obd_name,
+                       libcfs_nid2str(req->rq_peer.nid), cluuid.uuid);
                 GOTO(out, rc = -EALREADY);
         }
 
+        /* We indicate the reconnection in a flag, not an error code. */
+        if (rc == EALREADY) {
+                lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECONNECT);
+                rc = 0;
+        }
+
         /* Tell the client if we're in recovery. */
         /* If this is the first client, start the recovery timer */
         if (target->obd_recovering) {
@@ -621,9 +659,10 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
 
         if (export == NULL) {
                 if (target->obd_recovering) {
-                        CERROR("%s: denying connection for new client %s: "
+                        CERROR("%s: denying connection for new client %s (%s): "
                                "%d clients in recovery for %lds\n",
-                               target->obd_name, cluuid.uuid,
+                               target->obd_name,
+                               libcfs_nid2str(req->rq_peer.nid), cluuid.uuid,
                                target->obd_recoverable_clients,
                                cfs_duration_sec(cfs_time_sub(cfs_timer_deadline(&target->obd_recovery_timer),
                                                              cfs_time_current())));
@@ -632,8 +671,15 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
  dont_check_exports:
                         rc = obd_connect(&conn, target, &cluuid, data);
                 }
+        } else {
+                rc = obd_reconnect(export, target, &cluuid, data);
         }
 
+        /* we want to handle EALREADY but *not* -EALREADY from
+         * target_handle_reconnect() */
+        if (rc && rc != EALREADY)
+                GOTO(out, rc);
+
         /* Return only the parts of obd_connect_data that we understand, so the
          * client knows that we don't understand the rest. */
         if (data)
@@ -643,13 +689,14 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
         /* If all else goes well, this is our RPC return code. */
         req->rq_status = 0;
 
-        /* we want to handle EALREADY but *not* -EALREADY from
-         * target_handle_reconnect() */
-        if (rc && rc != EALREADY)
-                GOTO(out, rc);
-
         req->rq_repmsg->handle = conn;
 
+        /* ownership of this export ref transfers to the request AFTER we
+         * drop any previous reference the request had, but we don't want
+         * that to go to zero before we get our new export reference. */
+        export = class_conn2export(&conn);
+        LASSERT(export != NULL);
+
         /* If the client and the server are the same node, we will already
          * have an export that really points to the client's DLM export,
          * because we have a shared handles table.
@@ -660,15 +707,13 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
         if (req->rq_export != NULL)
                 class_export_put(req->rq_export);
 
-        /* ownership of this export ref transfers to the request */
-        export = req->rq_export = class_conn2export(&conn);
-        LASSERT(export != NULL);
+        req->rq_export = export;
 
         spin_lock_irqsave(&export->exp_lock, flags);
         if (export->exp_conn_cnt >= req->rq_reqmsg->conn_cnt) {
-                CERROR("%s: already connected at a higher conn_cnt: %d > %d\n",
-                       cluuid.uuid, export->exp_conn_cnt,
-                       req->rq_reqmsg->conn_cnt);
+                CERROR("%s: %s already connected at higher conn_cnt: %d > %d\n",
+                       cluuid.uuid, libcfs_nid2str(req->rq_peer.nid),
+                       export->exp_conn_cnt, req->rq_reqmsg->conn_cnt);
                 spin_unlock_irqrestore(&export->exp_lock, flags);
                 GOTO(out, rc = -EALREADY);
         }
@@ -688,11 +733,9 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
         export->exp_connection = ptlrpc_get_connection(req->rq_peer,
                                                        req->rq_self,
                                                        &remote_uuid);
-        if (rc == EALREADY) {
-                /* We indicate the reconnection in a flag, not an error code. */
-                lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECONNECT);
+
+        if (lustre_msg_get_op_flags(req->rq_repmsg) & MSG_CONNECT_RECONNECT)
                 GOTO(out, rc = 0);
-        }
 
         if (target->obd_recovering)
                 target->obd_connected_clients++;
@@ -718,7 +761,6 @@ out:
 
 int target_handle_disconnect(struct ptlrpc_request *req)
 {
-        struct obd_export *exp;
         int rc;
         ENTRY;
 
@@ -727,8 +769,7 @@ int target_handle_disconnect(struct ptlrpc_request *req)
                 RETURN(rc);
 
         /* keep the rq_export around so we can send the reply */
-        exp = class_export_get(req->rq_export);
-        req->rq_status = obd_disconnect(exp);
+        req->rq_status = obd_disconnect(class_export_get(req->rq_export));
         RETURN(0);
 }
 
@@ -765,7 +806,6 @@ static void target_release_saved_req(struct ptlrpc_request *req)
 static void target_finish_recovery(struct obd_device *obd)
 {
         struct list_head *tmp, *n;
-        int rc;
 
         CWARN("%s: sending delayed replies to recovered clients\n",
               obd->obd_name);
@@ -774,12 +814,9 @@ static void target_finish_recovery(struct obd_device *obd)
 
         /* when recovery finished, cleanup orphans on mds and ost */
         if (OBT(obd) && OBP(obd, postrecov)) {
-                rc = OBP(obd, postrecov)(obd);
-                if (rc >= 0)
-                        CWARN("%s: all clients recovered, %d MDS "
-                              "orphans deleted\n", obd->obd_name, rc);
-                else
-                        CERROR("postrecov failed %d\n", rc);
+                int rc = OBP(obd, postrecov)(obd);
+                CWARN("%s: recovery %s: rc %d\n", obd->obd_name,
+                      rc < 0 ? "failed" : "complete", rc);
         }
 
         list_for_each_safe(tmp, n, &obd->obd_delayed_reply_queue) {
@@ -791,7 +828,6 @@ static void target_finish_recovery(struct obd_device *obd)
                 target_release_saved_req(req);
         }
         obd->obd_recovery_end = CURRENT_SECONDS;
-        return;
 }
 
 static void abort_recovery_queue(struct obd_device *obd)
@@ -1173,7 +1209,7 @@ int target_queue_final_reply(struct ptlrpc_request *req, int rc)
         OBD_ALLOC(reqmsg, req->rq_reqlen);
         if (!reqmsg)
                 LBUG();
-        memcpy(saved_req, req, sizeof *saved_req);
+        *saved_req = *req;
         memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen);
 
         /* Don't race cleanup */
@@ -1406,3 +1442,13 @@ int target_handle_dqacq_callback(struct ptlrpc_request *req)
 #endif /* !__KERNEL__ */
 }
 #endif /* HAVE_QUOTA_SUPPORT */
+
+/* Table indexed by LDLM lock mode (LCK_EX .. LCK_GROUP); each slot holds the
+ * LCK_COMPAT_* value for that mode.  Slots for indices not named below are
+ * implicitly zero-initialized.
+ * NOTE(review): the LCK_COMPAT_* values are presumably masks of compatible
+ * modes defined alongside ldlm_mode_t -- confirm in lustre_dlm.h.
+ *
+ * The standard C99 "[index] = value" designator form is used instead of the
+ * obsolete GNU "[index] value" spelling. */
+ldlm_mode_t lck_compat_array[] = {
+        [LCK_EX]    = LCK_COMPAT_EX,
+        [LCK_PW]    = LCK_COMPAT_PW,
+        [LCK_PR]    = LCK_COMPAT_PR,
+        [LCK_CW]    = LCK_COMPAT_CW,
+        [LCK_CR]    = LCK_COMPAT_CR,
+        [LCK_NL]    = LCK_COMPAT_NL,
+        [LCK_GROUP] = LCK_COMPAT_GROUP
+};
index 8505f42..6da730d 100644 (file)
@@ -255,22 +255,20 @@ static int ldlm_cli_enqueue_local(struct ldlm_namespace *ns,
         ldlm_lock_addref_internal(lock, mode);
         ldlm_lock2handle(lock, lockh);
         lock->l_flags |= LDLM_FL_LOCAL;
-        lock->l_flags |= *flags & LDLM_INHERIT_FLAGS;
         lock->l_lvb_swabber = lvb_swabber;
         if (policy != NULL)
-                memcpy(&lock->l_policy_data, policy, sizeof(*policy));
+                lock->l_policy_data = *policy;
         if (type == LDLM_EXTENT)
-                memcpy(&lock->l_req_extent, &policy->l_extent,
-                       sizeof(policy->l_extent));
+                lock->l_req_extent = policy->l_extent;
 
         err = ldlm_lock_enqueue(ns, &lock, policy, flags);
         if (err != ELDLM_OK)
                 GOTO(out, err);
 
         if (policy != NULL)
-                memcpy(policy, &lock->l_policy_data, sizeof(*policy));
+                *policy = lock->l_policy_data;
         if ((*flags) & LDLM_FL_LOCK_CHANGED)
-                memcpy(&res_id, &lock->l_resource->lr_name, sizeof(res_id));
+                res_id = lock->l_resource->lr_name;
 
         LDLM_DEBUG_NOLOCK("client-side local enqueue handler END (lock %p)",
                           lock);
@@ -325,7 +323,7 @@ int ldlm_cli_enqueue(struct obd_export *exp,
         struct ldlm_lock *lock;
         struct ldlm_request *body;
         struct ldlm_reply *reply;
-        int rc, size[2] = {sizeof(*body), lvb_len}, req_passed_in = 1;
+        int rc, size[] = {sizeof(*body), lvb_len}, req_passed_in = 1;
         int is_replay = *flags & LDLM_FL_REPLAY;
         int cleanup_phase = 0;
         ENTRY;
@@ -354,11 +352,23 @@ int ldlm_cli_enqueue(struct obd_export *exp,
                 ldlm_lock_addref_internal(lock, mode);
                 ldlm_lock2handle(lock, lockh);
                 lock->l_lvb_swabber = lvb_swabber;
-                if (policy != NULL)
-                        memcpy(&lock->l_policy_data, policy, sizeof(*policy));
+                if (policy != NULL) {
+                        /* INODEBITS_INTEROP: If the server does not support
+                         * inodebits, we will request a plain lock in the
+                         * descriptor (ldlm_lock2desc() below) but use an
+                         * inodebits lock internally with both bits set.
+                         */
+                        if (type == LDLM_IBITS && !(exp->exp_connect_flags &
+                                                    OBD_CONNECT_IBITS))
+                                lock->l_policy_data.l_inodebits.bits =
+                                        MDS_INODELOCK_LOOKUP |
+                                        MDS_INODELOCK_UPDATE;
+                        else
+                                lock->l_policy_data = *policy;
+                }
+
                 if (type == LDLM_EXTENT)
-                        memcpy(&lock->l_req_extent, &policy->l_extent,
-                               sizeof(policy->l_extent));
+                        lock->l_req_extent = policy->l_extent;
                 LDLM_DEBUG(lock, "client-side enqueue START");
         }
 
@@ -366,33 +376,36 @@ int ldlm_cli_enqueue(struct obd_export *exp,
         cleanup_phase = 2;
 
         if (req == NULL) {
-                req = ptlrpc_prep_req(class_exp2cliimp(exp), LDLM_ENQUEUE, 1,
-                                      size, NULL);
+                req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_DLM_VERSION,
+                                      LDLM_ENQUEUE, 1, size, NULL);
                 if (req == NULL)
                         GOTO(cleanup, rc = -ENOMEM);
                 req_passed_in = 0;
-        } else if (req->rq_reqmsg->buflens[0] != sizeof(*body))
-                LBUG();
+        } else {
+                LASSERTF(req->rq_reqmsg->buflens[MDS_REQ_INTENT_LOCKREQ_OFF] ==
+                         sizeof(*body), "buflen[%d] = %d, not %d\n",
+                         MDS_REQ_INTENT_LOCKREQ_OFF,
+                         req->rq_reqmsg->buflens[MDS_REQ_INTENT_LOCKREQ_OFF],
+                         (int)sizeof(*body));
+        }
+
+        lock->l_conn_export = exp;
+        lock->l_export = NULL;
+        lock->l_blocking_ast = blocking;
 
         /* Dump lock data into the request buffer */
-        body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
+        body = lustre_msg_buf(req->rq_reqmsg, MDS_REQ_INTENT_LOCKREQ_OFF,
+                              sizeof(*body));
         ldlm_lock2desc(lock, &body->lock_desc);
         body->lock_flags = *flags;
 
-        memcpy(&body->lock_handle1, lockh, sizeof(*lockh));
+        body->lock_handle1 = *lockh;
 
         /* Continue as normal. */
         if (!req_passed_in) {
-                int buffers = 1;
-                if (lvb_len > 0)
-                        buffers = 2;
                 size[0] = sizeof(*reply);
-                req->rq_replen = lustre_msg_size(buffers, size);
+                req->rq_replen = lustre_msg_size(1 + (lvb_len > 0), size);
         }
-        lock->l_conn_export = exp;
-        lock->l_export = NULL;
-        lock->l_blocking_ast = blocking;
-
         LDLM_DEBUG(lock, "sending request");
         rc = ptlrpc_queue_wait(req);
 
@@ -439,8 +452,7 @@ int ldlm_cli_enqueue(struct obd_export *exp,
         /* lock enqueued on the server */
         cleanup_phase = 1;
 
-        memcpy(&lock->l_remote_handle, &reply->lock_handle,
-               sizeof(lock->l_remote_handle));
+        lock->l_remote_handle = reply->lock_handle;
         *flags = reply->lock_flags;
         lock->l_flags |= reply->lock_flags & LDLM_INHERIT_FLAGS;
 
@@ -475,9 +487,10 @@ int ldlm_cli_enqueue(struct obd_export *exp,
                         LDLM_DEBUG(lock, "client-side enqueue, new resource");
                 }
                 if (policy != NULL)
-                        memcpy(&lock->l_policy_data,
-                               &reply->lock_desc.l_policy_data,
-                               sizeof(reply->lock_desc.l_policy_data));
+                        if (!(type == LDLM_IBITS && !(exp->exp_connect_flags &
+                                                    OBD_CONNECT_IBITS)))
+                                lock->l_policy_data =
+                                                 reply->lock_desc.l_policy_data;
                 if (type != LDLM_PLAIN)
                         LDLM_DEBUG(lock,"client-side enqueue, new policy data");
         }
@@ -587,13 +600,12 @@ int ldlm_cli_convert(struct lustre_handle *lockh, int new_mode, int *flags)
         LDLM_DEBUG(lock, "client-side convert");
 
         req = ptlrpc_prep_req(class_exp2cliimp(lock->l_conn_export),
-                              LDLM_CONVERT, 1, &size, NULL);
+                              LUSTRE_DLM_VERSION, LDLM_CONVERT, 1, &size, NULL);
         if (!req)
                 GOTO(out, rc = -ENOMEM);
 
         body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
-        memcpy(&body->lock_handle1, &lock->l_remote_handle,
-               sizeof(body->lock_handle1));
+        body->lock_handle1 = lock->l_remote_handle;
 
         body->lock_desc.l_req_mode = new_mode;
         body->lock_flags = *flags;
@@ -676,7 +688,8 @@ int ldlm_cli_cancel(struct lustre_handle *lockh)
                         goto local_cancel;
                 }
 
-                req = ptlrpc_prep_req(imp, LDLM_CANCEL, 1, &size, NULL);
+                req = ptlrpc_prep_req(imp, LUSTRE_DLM_VERSION, LDLM_CANCEL,
+                                      1, &size, NULL);
                 if (!req)
                         GOTO(out, rc = -ENOMEM);
                 req->rq_no_resend = 1;
@@ -686,19 +699,25 @@ int ldlm_cli_cancel(struct lustre_handle *lockh)
                 req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL;
 
                 body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
-                memcpy(&body->lock_handle1, &lock->l_remote_handle,
-                       sizeof(body->lock_handle1));
+                body->lock_handle1 = lock->l_remote_handle;
 
                 req->rq_replen = lustre_msg_size(0, NULL);
 
                 rc = ptlrpc_queue_wait(req);
 
                 if (rc == ESTALE) {
-                        CERROR("client/server (nid %s) out of sync"
-                               " -- not fatal, flags %d\n",
-                               libcfs_nid2str(req->rq_import->
-                                              imp_connection->c_peer.nid),
-                               lock->l_flags);
+                        /* For PLAIN (inodebits) locks on liblustre clients
+                           this is a valid race between us cancelling a lock
+                           from lru and sending notification and server
+                           cancelling our lock at the same time */
+#ifndef __KERNEL__
+                        if (lock->l_resource->lr_type != LDLM_PLAIN /* IBITS */)
+#endif
+                                CERROR("client/server (nid %s) out of sync"
+                                       " -- not fatal, flags %d\n",
+                                       libcfs_nid2str(req->rq_import->
+                                                    imp_connection->c_peer.nid),
+                                       lock->l_flags);
                 } else if (rc == -ETIMEDOUT) {
                         ptlrpc_req_finished(req);
                         GOTO(restart, rc);
@@ -753,6 +772,13 @@ int ldlm_cancel_lru(struct ldlm_namespace *ns, ldlm_sync_t sync)
         list_for_each_entry_safe(lock, next, &ns->ns_unused_list, l_lru) {
                 LASSERT(!lock->l_readers && !lock->l_writers);
 
+                /* If we have chosen to cancel this lock voluntarily, we better
+                   send cancel notification to server, so that it frees
+                   appropriate state. This might lead to a race where while
+                   we are doing cancel here, server is also silently
+                   cancelling this lock. */
+                lock->l_flags &= ~LDLM_FL_CANCEL_ON_BLOCK;
+
                 /* Setting the CBPENDING flag is a little misleading, but
                  * prevents an important race; namely, once CBPENDING is set,
                  * the lock can accumulate no more readers/writers.  Since
@@ -959,13 +985,14 @@ int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter,
         struct list_head *tmp, *next;
         struct ldlm_lock *lock;
         int rc = LDLM_ITER_CONTINUE;
-        struct ldlm_namespace *ns = res->lr_namespace;
+        struct ldlm_namespace *ns;
 
         ENTRY;
 
         if (!res)
                 RETURN(LDLM_ITER_CONTINUE);
 
+        ns = res->lr_namespace;
         l_lock(&ns->ns_lock);
         list_for_each_safe(tmp, next, &res->lr_granted) {
                 lock = list_entry(tmp, struct ldlm_lock, l_res_link);
@@ -1099,8 +1126,7 @@ static int replay_lock_interpret(struct ptlrpc_request *req,
                 GOTO (out, rc = -EPROTO);
         }
 
-        memcpy(&lock->l_remote_handle, &reply->lock_handle,
-               sizeof(lock->l_remote_handle));
+        lock->l_remote_handle = reply->lock_handle;
         LDLM_DEBUG(lock, "replayed lock:");
         ptlrpc_import_recovery_state_machine(req->rq_import);
  out:
@@ -1119,8 +1145,16 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock)
         int buffers = 1;
         int size[2];
         int flags;
-
         ENTRY;
+
+        /* If this is a reply-less callback lock, we cannot replay it, since
+         * the server might have long since dropped it while the notification
+         * of that event was lost by the network (and the server may already
+         * have granted a conflicting lock). */
+        if (lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK) {
+                LDLM_DEBUG(lock, "Not replaying reply-less lock:");
+                ldlm_lock_cancel(lock);
+                RETURN(0);
+        }
         /*
          * If granted mode matches the requested mode, this lock is granted.
          *
@@ -1145,7 +1179,8 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock)
                 flags = LDLM_FL_REPLAY;
 
         size[0] = sizeof(*body);
-        req = ptlrpc_prep_req(imp, LDLM_ENQUEUE, 1, size, NULL);
+        req = ptlrpc_prep_req(imp, LUSTRE_DLM_VERSION, LDLM_ENQUEUE,
+                              1, size, NULL);
         if (!req)
                 RETURN(-ENOMEM);
 
index b8500d4..5fc4197 100644 (file)
@@ -27,6 +27,7 @@
 #define DEBUG_SUBSYSTEM S_LLITE
 
 #include <obd_support.h>
+#include <lustre_mds.h>
 #include <lustre_lite.h>
 #include <lustre_idl.h>
 #include <lustre_dlm.h>
index 00a1f8c..3d904c7 100644 (file)
@@ -5,8 +5,13 @@
 #ifndef LLITE_INTERNAL_H
 #define LLITE_INTERNAL_H
 
+#ifdef CONFIG_FS_POSIX_ACL
+# include <linux/fs.h>
+# include <linux/xattr_acl.h>
+#endif
+
 #include <lustre_debug.h>
-#include <linux/lustre_version.h>
+#include <linux/lustre_ver.h>
 
 /*
 struct lustre_intent_data {
@@ -79,6 +84,8 @@ struct ll_inode_info {
 
         struct posix_acl       *lli_posix_acl;
 
+        struct list_head        lli_dead_list;
+
 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
         struct inode            lli_vfs_inode;
 #endif
@@ -136,6 +143,7 @@ struct ll_ra_info {
 #define LL_SBI_FLOCK            0x04
 #define LL_SBI_USER_XATTR       0x08 /* support user xattr */
 #define LL_SBI_ACL              0x10 /* support ACL */
+#define LL_SBI_JOIN             0x20 /* support JOIN */
 
 struct ll_sb_info {
         struct list_head          ll_list;
@@ -152,7 +160,7 @@ struct ll_sb_info {
 
         int                       ll_flags;
         struct list_head          ll_conn_chain; /* per-conn chain of SBs */
-        __u64                     ll_connect_flags;
+        struct lustre_client_ocd  ll_lco;
 
         struct hlist_head         ll_orphan_dentry_list; /*please don't ask -p*/
         struct ll_close_queue    *ll_lcq;
@@ -167,6 +175,9 @@ struct ll_sb_info {
         struct ll_ra_info         ll_ra_info;
         unsigned int              ll_namelen;
         struct file_operations   *ll_fop;
+
+        struct list_head          ll_deathrow; /* inodes to be destroyed (b1443) */
+        spinlock_t                ll_deathrow_lock;
 };
 
 struct ll_ra_read {
@@ -240,10 +251,6 @@ extern spinlock_t inode_lock;
 
 extern struct proc_dir_entry *proc_lustre_fs_root;
 
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-# define hlist_del_init list_del_init
-#endif
-
 static inline struct inode *ll_info2i(struct ll_inode_info *lli)
 {
 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
@@ -389,6 +396,7 @@ extern void ll_set_dd(struct dentry *de);
 void ll_unhash_aliases(struct inode *);
 void ll_frob_intent(struct lookup_intent **itp, struct lookup_intent *deft);
 void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry);
+int ll_dcompare(struct dentry *parent, struct qstr *d_name, struct qstr *name);
 
 /* llite/llite_lib.c */
 
@@ -420,6 +428,8 @@ void lustre_dump_dentry(struct dentry *, int recur);
 void lustre_dump_inode(struct inode *);
 struct ll_async_page *llite_pglist_next_llap(struct ll_sb_info *sbi,
                                              struct list_head *list);
+int ll_obd_statfs(struct inode *inode, void *arg);
+int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize);
 
 /* llite/llite_nfs.c */
 __u32 get_uuid2int(const char *name, int len);
index ca474c8..7dee781 100644 (file)
@@ -83,6 +83,7 @@ struct ll_sb_info *lustre_init_sbi(struct super_block *sb)
                 RETURN(NULL);
 
         spin_lock_init(&sbi->ll_lock);
+        spin_lock_init(&sbi->ll_lco.lco_lock);
         INIT_LIST_HEAD(&sbi->ll_pglist);
         sbi->ll_pglist_gen = 0;
         if (num_physpages >> (20 - PAGE_SHIFT) < 512)
@@ -103,6 +104,9 @@ struct ll_sb_info *lustre_init_sbi(struct super_block *sb)
         spin_lock(&ll_sb_lock);
         list_add_tail(&sbi->ll_list, &ll_super_blocks);
         spin_unlock(&ll_sb_lock);
+
+        INIT_LIST_HEAD(&sbi->ll_deathrow);
+        spin_lock_init(&sbi->ll_deathrow_lock);
         RETURN(sbi);
 }
 
@@ -121,6 +125,10 @@ void lustre_free_sbi(struct super_block *sb)
         EXIT;
 }
 
+static struct dentry_operations ll_d_root_ops = {
+        .d_compare = ll_dcompare, /* the only hook set in this table, which
+                                   * lustre_common_fill_super() installs as
+                                   * sb->s_root->d_op */
+};
+
 int lustre_common_fill_super(struct super_block *sb, char *mdc, char *osc)
 {
         struct inode *root = 0;
@@ -153,12 +161,15 @@ int lustre_common_fill_super(struct super_block *sb, char *mdc, char *osc)
                         CERROR("could not register mount in /proc/lustre");
         }
 
+        /* indicate that inodebits locking is supported by this client */
+        data->ocd_connect_flags |= OBD_CONNECT_IBITS;
+        data->ocd_ibits_known = MDS_INODELOCK_FULL;
+
         if (sb->s_flags & MS_RDONLY)
                 data->ocd_connect_flags |= OBD_CONNECT_RDONLY;
         if (sbi->ll_flags & LL_SBI_USER_XATTR)
-                data->ocd_connect_flags |= OBD_CONNECT_USER_XATTR;
-        if (sbi->ll_flags & LL_SBI_ACL)
-                data->ocd_connect_flags |= OBD_CONNECT_ACL;
+                data->ocd_connect_flags |= OBD_CONNECT_XATTR;
+        data->ocd_connect_flags |= OBD_CONNECT_ACL | OBD_CONNECT_JOIN;
 
         if (sbi->ll_flags & LL_SBI_FLOCK) {
                 sbi->ll_fop = &ll_file_operations_flock;
@@ -166,6 +177,9 @@ int lustre_common_fill_super(struct super_block *sb, char *mdc, char *osc)
                 sbi->ll_fop = &ll_file_operations;
         }
 
+        data->ocd_connect_flags |= OBD_CONNECT_VERSION;
+        data->ocd_version = LUSTRE_VERSION_CODE;
+
         err = obd_connect(&mdc_conn, obd, &sbi->ll_sb_uuid, data);
         if (err == -EBUSY) {
                 CERROR("An MDS (mdc %s) is performing recovery, of which this"
@@ -193,17 +207,22 @@ int lustre_common_fill_super(struct super_block *sb, char *mdc, char *osc)
         sbi->ll_namelen = osfs.os_namelen;
 
         if ((sbi->ll_flags & LL_SBI_USER_XATTR) &&
-            !(data->ocd_connect_flags & OBD_CONNECT_USER_XATTR)) {
+            !(data->ocd_connect_flags & OBD_CONNECT_XATTR)) {
                 LCONSOLE_INFO("Disabling user_xattr feature because "
                               "it is not supported on the server\n"); 
                 sbi->ll_flags &= ~LL_SBI_USER_XATTR;
         }
 
-        if (((sbi->ll_flags & LL_SBI_ACL) == 0) !=
-            ((data->ocd_connect_flags & OBD_CONNECT_ACL) == 0)) {
-                CERROR("Server return unexpected ACL flags\n");
-                GOTO(out_mdc, err = -EBADE);
-        }
+        if (data->ocd_connect_flags & OBD_CONNECT_ACL) {
+#ifdef MS_POSIXACL
+                sb->s_flags |= MS_POSIXACL;
+#endif
+                sbi->ll_flags |= LL_SBI_ACL;
+        } else
+                sbi->ll_flags &= ~LL_SBI_ACL;
+
+        if (data->ocd_connect_flags & OBD_CONNECT_JOIN)
+                sbi->ll_flags |= LL_SBI_JOIN;
 
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0))
         /* We set sb->s_dev equal on all lustre clients in order to support
@@ -221,6 +240,16 @@ int lustre_common_fill_super(struct super_block *sb, char *mdc, char *osc)
                 GOTO(out_mdc, err);
         }
 
+        data->ocd_connect_flags =
+                OBD_CONNECT_GRANT|OBD_CONNECT_VERSION|OBD_CONNECT_REQPORTAL;
+
+        CDEBUG(D_RPCTRACE, "ocd_connect_flags: "LPX64" ocd_version: %d "
+               "ocd_grant: %d\n", data->ocd_connect_flags,
+               data->ocd_version, data->ocd_grant);
+
+        obd->obd_upcall.onu_owner = &sbi->ll_lco;
+        obd->obd_upcall.onu_upcall = ll_ocd_update;
+
         err = obd_connect(&osc_conn, obd, &sbi->ll_sb_uuid, data);
         if (err == -EBUSY) {
                 CERROR("An OST (osc %s) is performing recovery, of which this"
@@ -232,7 +261,9 @@ int lustre_common_fill_super(struct super_block *sb, char *mdc, char *osc)
                 GOTO(out_mdc, err);
         }
         sbi->ll_osc_exp = class_conn2export(&osc_conn);
-        sbi->ll_connect_flags = data->ocd_connect_flags;
+        spin_lock(&sbi->ll_lco.lco_lock);
+        sbi->ll_lco.lco_flags = data->ocd_connect_flags;
+        spin_unlock(&sbi->ll_lco.lco_lock);
 
         mdc_init_ea_size(sbi->ll_mdc_exp, sbi->ll_osc_exp);
 
@@ -261,7 +292,8 @@ int lustre_common_fill_super(struct super_block *sb, char *mdc, char *osc)
         /* make root inode
          * XXX: move this to after cbd setup? */
         err = mdc_getattr(sbi->ll_mdc_exp, &rootfid,
-                          OBD_MD_FLGETATTR | OBD_MD_FLBLOCKS | OBD_MD_FLACL,
+                          OBD_MD_FLGETATTR | OBD_MD_FLBLOCKS |
+                          (sbi->ll_flags & LL_SBI_ACL ? OBD_MD_FLACL : 0),
                           0, &request);
         if (err) {
                 CERROR("mdc_getattr failed for root: rc = %d\n", err);
@@ -304,6 +336,7 @@ int lustre_common_fill_super(struct super_block *sb, char *mdc, char *osc)
         sb->s_root = d_alloc_root(root);
         if (data != NULL)
                 OBD_FREE(data, sizeof(*data));
+        sb->s_root->d_op = &ll_d_root_ops;
         RETURN(err);
 
 out_root:
@@ -320,6 +353,20 @@ out:
         RETURN(err);
 }
 
+/*
+ * Ask the MDC export for its "max_easize" value and store it in *lmmsize.
+ *
+ * *lmmsize is first seeded with obd_size_diskmd() on the OSC export and is
+ * then handed to obd_get_info() as the output buffer, with the buffer size
+ * given as sizeof(int).
+ *
+ * Returns the obd_get_info() result; a non-zero result is also logged with
+ * CERROR.
+ */
+int ll_get_max_mdsize(struct ll_sb_info *sbi, int *lmmsize)
+{
+        int size, rc;
+        ENTRY;  /* pairs with the RETURN() below */
+
+        *lmmsize = obd_size_diskmd(sbi->ll_osc_exp, NULL);
+        size = sizeof(int);
+        rc = obd_get_info(sbi->ll_mdc_exp, strlen("max_easize"), "max_easize",
+                          &size, lmmsize);
+        if (rc)
+                CERROR("Get max mdsize error rc %d \n", rc);
+
+        RETURN(rc);
+}
+
 void ll_dump_inode(struct inode *inode)
 {
         struct list_head *tmp;
@@ -363,14 +410,76 @@ void lustre_dump_dentry(struct dentry *dentry, int recur)
         }
 }
 
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+void lustre_throw_orphan_dentries(struct super_block *sb)
+{
+        struct hlist_node *tmp, *next;
+        struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+        /* Walk sbi->ll_orphan_dentry_list: warn about every orphaned dentry
+         * still present and dump it before and after calling
+         * shrink_dcache_parent() on it.  We do this to get rid of orphaned
+         * dentries; it is not really the right way (original note: "trw").
+         * The _safe iterator is used and 'next' is printed in the warning. */
+        hlist_for_each_safe(tmp, next, &sbi->ll_orphan_dentry_list) {
+                struct dentry *dentry = hlist_entry(tmp, struct dentry, d_hash);
+                CWARN("found orphan dentry %.*s (%p->%p) at unmount, dumping "
+                      "before and after shrink_dcache_parent\n",
+                      dentry->d_name.len, dentry->d_name.name, dentry, next);
+                lustre_dump_dentry(dentry, 1);
+                shrink_dcache_parent(dentry);
+                lustre_dump_dentry(dentry, 1);
+        }
+}
+#else /* kernels >= 2.5.0: the call expands to nothing */
+#define lustre_throw_orphan_dentries(sb)
+#endif
+
+/*
+ * Drain sbi->ll_deathrow: every inode queued there has its dentry aliases
+ * pruned and one reference dropped with iput().
+ *
+ * When 'try' is non-zero the deathrow lock is only attempted with
+ * spin_trylock() and the function returns without doing anything if the
+ * lock is not obtained; when 'try' is zero it waits in spin_lock().
+ *
+ * The whole list is spliced onto a private list head under the lock, and
+ * the per-inode work is done after the lock has been released.
+ */
+static void prune_deathrow(struct ll_sb_info *sbi, int try)
+{
+        LIST_HEAD(doomed);
+        struct list_head *pos;
+        int have_lock;
+        ENTRY;
+
+        if (!try) {
+                spin_lock(&sbi->ll_deathrow_lock);
+                have_lock = 1;
+        } else {
+                have_lock = spin_trylock(&sbi->ll_deathrow_lock);
+        }
+
+        if (have_lock == 0) {
+                EXIT;
+                return;
+        }
+
+        /* detach everything queued so far, then work without the lock */
+        list_splice_init(&sbi->ll_deathrow, &doomed);
+        spin_unlock(&sbi->ll_deathrow_lock);
+
+        /* always take the first entry until the private list is empty */
+        for (pos = doomed.next; pos != &doomed; pos = doomed.next) {
+                struct ll_inode_info *info;
+                struct inode *victim;
+
+                info = list_entry(pos, struct ll_inode_info, lli_dead_list);
+                list_del_init(&info->lli_dead_list);
+
+                victim = ll_info2i(info);
+                d_prune_aliases(victim);
+
+                CDEBUG(D_INODE, "prune duplicate inode %p inum %lu count %u\n",
+                       victim, victim->i_ino, atomic_read(&victim->i_count));
+                iput(victim);
+        }
+        EXIT;
+}
+
 void lustre_common_put_super(struct super_block *sb)
 {
         struct ll_sb_info *sbi = ll_s2sbi(sb);
-        struct hlist_node *tmp, *next;
         ENTRY;
 
         ll_close_thread_shutdown(sbi->ll_lcq);
 
+        /* destroy inodes in deathrow */
+        prune_deathrow(sbi, 0);
+
         list_del(&sbi->ll_conn_chain);
         obd_disconnect(sbi->ll_osc_exp);
 
@@ -382,16 +491,7 @@ void lustre_common_put_super(struct super_block *sb)
 
         obd_disconnect(sbi->ll_mdc_exp);
 
-        // We do this to get rid of orphaned dentries. That is not really trw.
-        hlist_for_each_safe(tmp, next, &sbi->ll_orphan_dentry_list) {
-                struct dentry *dentry = hlist_entry(tmp, struct dentry, d_hash);
-                CWARN("found orphan dentry %.*s (%p->%p) at unmount, dumping "
-                      "before and after shrink_dcache_parent\n",
-                      dentry->d_name.len, dentry->d_name.name, dentry, next);
-                lustre_dump_dentry(dentry, 1);
-                shrink_dcache_parent(dentry);
-                lustre_dump_dentry(dentry, 1);
-        }
+        lustre_throw_orphan_dentries(sb);
         EXIT;
 }
 
@@ -453,9 +553,9 @@ void ll_options(char *options, char **ost, char **mdc, int *flags)
 #endif
         {
                 CDEBUG(D_SUPER, "this_char %s\n", this_char);
-                if (!*ost && (*ost = ll_read_opt("osc", this_char)))
+                if (!*ost && (*ost = ll_read_opt(LUSTRE_OSC_NAME, this_char)))
                         continue;
-                if (!*mdc && (*mdc = ll_read_opt("mdc", this_char)))
+                if (!*mdc && (*mdc = ll_read_opt(LUSTRE_MDC_NAME, this_char)))
                         continue;
                 tmp = ll_set_opt("nolock", this_char, LL_SBI_NOLCK);
                 if (tmp) {
@@ -484,12 +584,13 @@ void ll_options(char *options, char **ost, char **mdc, int *flags)
                 }
                 tmp = ll_set_opt("acl", this_char, LL_SBI_ACL);
                 if (tmp) {
-                        *flags |= tmp;
+                        /* Ignore deprecated mount option.  The client will
+                         * always try to mount with ACL support, whether this
+                         * is used depends on whether server supports it. */
                         continue;
                 }
                 tmp = ll_set_opt("noacl", this_char, LL_SBI_ACL);
                 if (tmp) {
-                        *flags &= ~tmp;
                         continue;
                 }
         }
@@ -505,6 +606,7 @@ void ll_lli_init(struct ll_inode_info *lli)
         spin_lock_init(&lli->lli_lock);
         INIT_LIST_HEAD(&lli->lli_pending_write_llaps);
         lli->lli_inode_magic = LLI_INODE_MAGIC;
+        INIT_LIST_HEAD(&lli->lli_dead_list);
 }
 
 int ll_fill_super(struct super_block *sb, void *data, int silent)
@@ -546,12 +648,31 @@ out:
         RETURN(err);
 } /* ll_read_super */
 
-int lustre_process_log(struct lustre_mount_data *lmd, char * profile,
-                       struct config_llog_instance *cfg, int allow_recov)
+/* Build a one-shot lustre_cfg for the obd named @cfgname and run it.
+ * @cmd is the LCFG_* command, @nid is stored in lcfg_nid, and @s1 / @s2
+ * (either may be NULL) become string buffers 1 and 2 of the record.
+ * Returns the result of class_process_config(), or -ENOMEM when the
+ * config record cannot be allocated. */
+static int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd,
+                   char *s1, char *s2)
 {
-        struct lustre_cfg *lcfg = NULL;
         struct lustre_cfg_bufs bufs;
-        char * peer = "MDS_PEER_UUID";
+        struct lustre_cfg    * lcfg = NULL;
+        int err;
+
+        CDEBUG(D_TRACE, "lcfg %s %#x %s %s\n", cfgname, cmd, s1, s2);
+
+        lustre_cfg_bufs_reset(&bufs, cfgname);
+        if (s1)
+                lustre_cfg_bufs_set_string(&bufs, 1, s1);
+        if (s2)
+                lustre_cfg_bufs_set_string(&bufs, 2, s2);
+
+        lcfg = lustre_cfg_new(cmd, &bufs);
+        /* NOTE(review): lustre_cfg_new() is defined outside this view;
+         * presumably it returns NULL on allocation failure -- confirm.
+         * Without this check the lcfg_nid store below would oops. */
+        if (lcfg == NULL)
+                return(-ENOMEM);
+        lcfg->lcfg_nid = nid;
+        err = class_process_config(lcfg);
+        lustre_cfg_free(lcfg);
+        return(err);
+}
+
+static int lustre_process_log(struct lustre_mount_data *lmd, char * profile,
+                       struct config_llog_instance *cfg)
+{
         struct obd_device *obd;
         struct lustre_handle mdc_conn = {0, };
         struct obd_export *exp;
@@ -559,8 +680,9 @@ int lustre_process_log(struct lustre_mount_data *lmd, char * profile,
         class_uuid_t uuid;
         struct obd_uuid mdc_uuid;
         struct llog_ctxt *ctxt;
-        struct obd_connect_data *ocd = NULL;
-        int rc = 0;
+        struct obd_connect_data ocd = { 0 };
+        lnet_nid_t nid;
+        int i, rc = 0, recov_bk = 1;
         int err;
         ENTRY;
 
@@ -570,35 +692,18 @@ int lustre_process_log(struct lustre_mount_data *lmd, char * profile,
         lustre_generate_random_uuid(uuid);
         class_uuid_unparse(uuid, &mdc_uuid);
         CDEBUG(D_HA, "generated uuid: %s\n", mdc_uuid.uuid);
-
-        lustre_cfg_bufs_reset(&bufs, name);
-        lustre_cfg_bufs_set_string(&bufs, 1, peer);
-
-        lcfg = lustre_cfg_new(LCFG_ADD_UUID, &bufs);
-        lcfg->lcfg_nid = lmd->lmd_nid;
-        LASSERT(lcfg->lcfg_nid != LNET_NID_ANY);
-        rc = class_process_config(lcfg);
-        lustre_cfg_free(lcfg);
+        
+        nid = lmd->lmd_nid[0];
+        LASSERT(nid != LNET_NID_ANY);
+        rc = do_lcfg(name, nid, LCFG_ADD_UUID, libcfs_nid2str(nid), 0);
         if (rc < 0)
                 GOTO(out, rc);
 
-        lustre_cfg_bufs_reset(&bufs, name);
-        lustre_cfg_bufs_set_string(&bufs, 1, LUSTRE_MDC_NAME);
-        lustre_cfg_bufs_set_string(&bufs, 2, mdc_uuid.uuid);
-
-        lcfg = lustre_cfg_new(LCFG_ATTACH, &bufs);
-        rc = class_process_config(lcfg);
-        lustre_cfg_free(lcfg);
+        rc = do_lcfg(name, 0, LCFG_ATTACH, LUSTRE_MDC_NAME, mdc_uuid.uuid);
         if (rc < 0)
                 GOTO(out_del_uuid, rc);
 
-        lustre_cfg_bufs_reset(&bufs, name);
-        lustre_cfg_bufs_set_string(&bufs, 1, lmd->lmd_mds);
-        lustre_cfg_bufs_set_string(&bufs, 2, peer);
-
-        lcfg = lustre_cfg_new(LCFG_SETUP, &bufs);
-        rc = class_process_config(lcfg);
-        lustre_cfg_free(lcfg);
+        rc = do_lcfg(name, 0, LCFG_SETUP, lmd->lmd_mds, libcfs_nid2str(nid));
         if (rc < 0) {
                 LCONSOLE_ERROR("I couldn't establish a connection with the MDS."
                                " Check that the MDS host NID is correct and the"
@@ -610,21 +715,31 @@ int lustre_process_log(struct lustre_mount_data *lmd, char * profile,
         if (obd == NULL)
                 GOTO(out_cleanup, rc = -EINVAL);
 
-        /* Disable initial recovery on this import */
+        /* Add the redundant MDS nids */
+        for (i = 1; i < lmd->lmd_nid_count; i++) {
+                nid = lmd->lmd_nid[i];
+                rc = do_lcfg(name, nid, LCFG_ADD_UUID, libcfs_nid2str(nid), 0);
+                if (rc) {
+                        CERROR("Add uuid for %s failed %d\n", 
+                               libcfs_nid2str(nid), rc);
+                        continue;
+                }
+                rc = do_lcfg(name, 0, LCFG_ADD_CONN, libcfs_nid2str(nid), 0);
+                if (rc) 
+                        CERROR("Add conn for %s failed %d\n", 
+                               libcfs_nid2str(nid), rc);
+        }
+
+        /* Try all connections, but only once. */
         rc = obd_set_info(obd->obd_self_export,
-                          strlen("initial_recov"), "initial_recov",
-                          sizeof(allow_recov), &allow_recov);
+                          strlen("init_recov_bk"), "init_recov_bk",
+                          sizeof(recov_bk), &recov_bk);
         if (rc)
                 GOTO(out_cleanup, rc);
 
-        if (lmd->lmd_flags & LMD_FLG_ACL) {
-                OBD_ALLOC(ocd, sizeof(*ocd));
-                if (ocd == NULL)
-                        GOTO(out_cleanup, rc = -ENOMEM);
-                ocd->ocd_connect_flags |= OBD_CONNECT_ACL;
-        }
+        ocd.ocd_connect_flags = OBD_CONNECT_ACL;
 
-        rc = obd_connect(&mdc_conn, obd, &mdc_uuid, ocd);
+        rc = obd_connect(&mdc_conn, obd, &mdc_uuid, &ocd);
         if (rc) {
                 CERROR("cannot connect to %s: rc = %d\n", lmd->lmd_mds, rc);
                 GOTO(out_cleanup, rc);
@@ -663,33 +778,27 @@ int lustre_process_log(struct lustre_mount_data *lmd, char * profile,
                 CERROR("obd_disconnect failed: rc = %d\n", err);
 
 out_cleanup:
-        lustre_cfg_bufs_reset(&bufs, name);
-        lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs);
-        err = class_process_config(lcfg);
-        lustre_cfg_free(lcfg);
+        err = do_lcfg(name, 0, LCFG_CLEANUP, 0, 0);
         if (err)
                 CERROR("mdc_cleanup failed: rc = %d\n", err);
 
 out_detach:
-        lustre_cfg_bufs_reset(&bufs, name);
-        lcfg = lustre_cfg_new(LCFG_DETACH, &bufs);
-        err = class_process_config(lcfg);
-        lustre_cfg_free(lcfg);
+        err = do_lcfg(name, 0, LCFG_DETACH, 0, 0);
         if (err)
                 CERROR("mdc_detach failed: rc = %d\n", err);
 
 out_del_uuid:
-        lustre_cfg_bufs_reset(&bufs, name);
-        lustre_cfg_bufs_set_string(&bufs, 1, peer);
-        lcfg = lustre_cfg_new(LCFG_DEL_UUID, &bufs);
-        err = class_process_config(lcfg);
-        lustre_cfg_free(lcfg);
-        if (err)
-                CERROR("del MDC UUID failed: rc = %d\n", err);
-
+        /* class_add_uuid adds a nid even if the same uuid exists; we might
+           delete any copy here.  So they all better match. */
+        for (i = 0; i < lmd->lmd_nid_count; i++) {
+                nid = lmd->lmd_nid[i];
+                err = do_lcfg(name, nid, LCFG_DEL_UUID, libcfs_nid2str(nid), 0);
+                if (err)
+                        CERROR("del MDC UUID %s failed: rc = %d\n", 
+                               libcfs_nid2str(nid), err);
+        }
+        /* class_import_put will get rid of the additional connections */
 out:
-        if (ocd)
-                OBD_FREE(ocd, sizeof(*ocd));
         RETURN(rc);
 }
 
@@ -700,7 +809,7 @@ static void lustre_manual_cleanup(struct ll_sb_info *sbi)
 
         while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) !=NULL) {
                 class_manual_cleanup(obd);
-        }                       
+        }
 
         if (sbi->ll_lmd != NULL)
                 class_del_profile(sbi->ll_lmd->lmd_profile);
@@ -741,8 +850,6 @@ int lustre_fill_super(struct super_block *sb, void *data, int silent)
                         sbi->ll_flags |= LL_SBI_FLOCK;
                 if (lmd->lmd_flags & LMD_FLG_USER_XATTR)
                         sbi->ll_flags |= LL_SBI_USER_XATTR;
-                if (lmd->lmd_flags & LMD_FLG_ACL)
-                        sbi->ll_flags |= LL_SBI_ACL;
 
                 /* generate a string unique to this super, let's try
                  the address of the super itself.*/
@@ -750,7 +857,7 @@ int lustre_fill_super(struct super_block *sb, void *data, int silent)
 
                 cfg.cfg_instance = ll_instance;
                 cfg.cfg_uuid = sbi->ll_sb_uuid;
-                err = lustre_process_log(lmd, lmd->lmd_profile, &cfg, 0);
+                err = lustre_process_log(lmd, lmd->lmd_profile, &cfg);
                 if (err < 0) {
                         CERROR("Unable to process log: %s\n", lmd->lmd_profile);
                         GOTO(out_free, err);
@@ -818,13 +925,13 @@ void lustre_put_super(struct super_block *sb)
         obd = class_exp2obd(sbi->ll_mdc_exp);
         if (obd) {
                 int next = 0;
-                /* We need to set force before the lov_disconnect in 
+                /* We need to set force before the lov_disconnect in
                 lustre_common_put_super, since l_d cleans up osc's as well. */
                 force = obd->obd_no_recov;
-                while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) 
+                while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next))
                        !=NULL) {
                         obd->obd_force = force;
-                }                       
+                }
         }
 
         lustre_common_put_super(sb);
@@ -926,14 +1033,20 @@ void ll_clear_inode(struct inode *inode)
                 lli->lli_symlink_name = NULL;
         }
 
+#ifdef CONFIG_FS_POSIX_ACL
         if (lli->lli_posix_acl) {
                 LASSERT(atomic_read(&lli->lli_posix_acl->a_refcount) == 1);
                 posix_acl_release(lli->lli_posix_acl);
                 lli->lli_posix_acl = NULL;
         }
+#endif
 
         lli->lli_inode_magic = LLI_INODE_DEAD;
 
+        spin_lock(&sbi->ll_deathrow_lock);
+        list_del_init(&lli->lli_dead_list);
+        spin_unlock(&sbi->ll_deathrow_lock);
+
         EXIT;
 }
 
@@ -1034,14 +1147,14 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr)
                  * above to avoid invoking vmtruncate, otherwise it is important
                  * to call vmtruncate in inode_setattr to update inode->i_size
                  * (bug 6196) */
-                inode_setattr(inode, attr);
+                rc = inode_setattr(inode, attr);
 
                 ll_update_inode(inode, &md);
                 ptlrpc_req_finished(request);
 
                 if (!lsm || !S_ISREG(inode->i_mode)) {
                         CDEBUG(D_INODE, "no lsm: not setting attrs on OST\n");
-                        RETURN(0);
+                        RETURN(rc);
                 }
         } else {
                 /* The OST doesn't check permissions, but the alternative is
@@ -1063,7 +1176,7 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr)
                 }
 
                 /* Won't invoke vmtruncate, as we already cleared ATTR_SIZE */
-                inode_setattr(inode, attr);
+                rc = inode_setattr(inode, attr);
         }
 
         /* We really need to get our PW lock before we change inode->i_size.
@@ -1255,6 +1368,23 @@ void ll_inode_size_unlock(struct inode *inode, int unlock_lsm)
         up(&lli->lli_size_sem);
 }
 
+/* Swap the stripe metadata cached on @inode for @lsm.
+ * Asserts that @lsm is a joined-file layout (LOV_MAGIC_JOIN), releases the
+ * previously cached lli_smd, installs @lsm in its place and refreshes
+ * lli_maxbytes from it, capped at PAGE_CACHE_MAXBYTES. */
+static void ll_replace_lsm(struct inode *inode, struct lov_stripe_md *lsm)
+{
+        struct ll_inode_info *lli = ll_i2info(inode);
+
+        dump_lsm(D_INODE, lsm);
+        dump_lsm(D_INODE, lli->lli_smd);
+        LASSERTF(lsm->lsm_magic == LOV_MAGIC_JOIN,
+                 "lsm must be joined lsm %p\n", lsm);
+        /* Log before releasing: obd_free_memmd() is handed &lli->lli_smd, so
+         * afterwards the field no longer names the lsm being replaced. */
+        CDEBUG(D_INODE, "replace lsm %p to lli_smd %p for inode %lu/%u(%p)\n",
+               lsm, lli->lli_smd, inode->i_ino, inode->i_generation, inode);
+        obd_free_memmd(ll_i2obdexp(inode), &lli->lli_smd);
+        lli->lli_smd = lsm;
+        lli->lli_maxbytes = lsm->lsm_maxbytes;
+        if (lli->lli_maxbytes > PAGE_CACHE_MAXBYTES)
+                lli->lli_maxbytes = PAGE_CACHE_MAXBYTES;
+}
+
 void ll_update_inode(struct inode *inode, struct lustre_md *md)
 {
         struct ll_inode_info *lli = ll_i2info(inode);
@@ -1264,7 +1394,8 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md)
         LASSERT ((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0));
         if (lsm != NULL) {
                 if (lli->lli_smd == NULL) {
-                        if (lsm->lsm_magic != LOV_MAGIC) {
+                        if (lsm->lsm_magic != LOV_MAGIC && 
+                            lsm->lsm_magic != LOV_MAGIC_JOIN) {
                                 dump_lsm(D_ERROR, lsm);
                                 LBUG();
                         }
@@ -1278,15 +1409,20 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md)
                         if (lli->lli_maxbytes > PAGE_CACHE_MAXBYTES)
                                 lli->lli_maxbytes = PAGE_CACHE_MAXBYTES;
                 } else {
-                        if (lov_stripe_md_cmp(lli->lli_smd, lsm)) {
-                                CERROR("lsm mismatch for inode %ld\n",
-                                       inode->i_ino);
-                                CERROR("lli_smd:\n");
-                                dump_lsm(D_ERROR, lli->lli_smd);
-                                CERROR("lsm:\n");
-                                dump_lsm(D_ERROR, lsm);
-                                LBUG();
-                        }
+                        if (lli->lli_smd->lsm_magic == lsm->lsm_magic &&
+                             lli->lli_smd->lsm_stripe_count == 
+                                        lsm->lsm_stripe_count) {
+                                if (lov_stripe_md_cmp(lli->lli_smd, lsm)) {
+                                        CERROR("lsm mismatch for inode %ld\n",
+                                                inode->i_ino);
+                                        CERROR("lli_smd:\n");
+                                        dump_lsm(D_ERROR, lli->lli_smd);
+                                        CERROR("lsm:\n");
+                                        dump_lsm(D_ERROR, lsm);
+                                        LBUG();
+                                }
+                        } else 
+                                ll_replace_lsm(inode, lsm);
                 }
                 /* bug 2844 - limit i_blksize for broken user-space apps */
                 LASSERTF(lsm->lsm_xfersize != 0, "%lu\n", lsm->lsm_xfersize);
@@ -1298,6 +1434,7 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md)
                                        inode->i_sb->s_blocksize);
         }
 
+#ifdef CONFIG_FS_POSIX_ACL
         LASSERT(!md->posix_acl || (body->valid & OBD_MD_FLACL));
         if (body->valid & OBD_MD_FLACL) {
                 spin_lock(&lli->lli_lock);
@@ -1306,10 +1443,12 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md)
                 lli->lli_posix_acl = md->posix_acl;
                 spin_unlock(&lli->lli_lock);
         }
+#endif
 
         if (body->valid & OBD_MD_FLID)
                 inode->i_ino = body->ino;
-        if (body->valid & OBD_MD_FLATIME)
+        if (body->valid & OBD_MD_FLATIME &&
+            body->atime > LTIME_S(inode->i_atime))
                 LTIME_S(inode->i_atime) = body->atime;
         if (body->valid & OBD_MD_FLMTIME &&
             body->mtime > LTIME_S(inode->i_mtime)) {
@@ -1592,7 +1731,13 @@ int ll_prep_inode(struct obd_export *exp, struct inode **inode,
                   struct ptlrpc_request *req, int offset,struct super_block *sb)
 {
         struct lustre_md md;
+        struct ll_sb_info *sbi = NULL;
         int rc = 0;
+        ENTRY;
+
+        LASSERT(*inode || sb);
+        sbi = sb ? ll_s2sbi(sb) : ll_i2sbi(*inode);
+        prune_deathrow(sbi, 1);
 
         rc = mdc_req2lustre_md(req, offset, exp, &md);
         if (rc)
@@ -1607,9 +1752,13 @@ int ll_prep_inode(struct obd_export *exp, struct inode **inode,
                         mdc_free_lustre_md(exp, &md);
                         rc = -ENOMEM;
                         CERROR("new_inode -fatal: rc %d\n", rc);
+                        GOTO(out, rc);
                 }
         }
 
+        rc = obd_checkmd(exp, ll_i2mdcexp(*inode),
+                         ll_i2info(*inode)->lli_smd);
+out:
         RETURN(rc);
 }
 
@@ -1638,3 +1787,69 @@ struct ll_async_page *llite_pglist_next_llap(struct ll_sb_info *sbi,
         LBUG();
         return NULL;
 }
+
+/* ioctl helper: return statfs data and the target UUID for one client
+ * device (the MDC, or one LOV target) of the mount that owns @inode.
+ *
+ * @arg is the user ioctl buffer, unpacked with obd_ioctl_getdata():
+ *   ioc_inlbuf1          __u32 device type (LL_STATFS_MDC / LL_STATFS_LOV)
+ *   ioc_inlbuf2          __u32 device index within that type
+ *   ioc_pbuf1/ioc_plen1  user buffer that receives the struct obd_statfs
+ *   ioc_pbuf2/ioc_plen2  user buffer that receives the target obd_uuid
+ *
+ * Returns 0 on success; -EINVAL for a missing inode/sbi, missing buffers or
+ * an unresolvable device; -ENODEV for an out-of-range index; -ENODATA for an
+ * inactive LOV target (its UUID is still copied out when an import exists);
+ * -EFAULT if a copy to user space fails; otherwise the error returned by
+ * obd_ioctl_getdata() or obd_statfs(). */
+int ll_obd_statfs(struct inode *inode, void *arg)
+{
+        struct ll_sb_info *sbi = NULL;
+        struct obd_device *client_obd = NULL, *lov_obd = NULL;
+        struct lov_obd *lov = NULL;
+        struct obd_import *client_imp = NULL;
+        struct obd_statfs stat_buf = {0};
+        char *buf = NULL;
+        struct obd_ioctl_data *data = NULL;
+        __u32 type, index;
+        size_t copy_len;
+        int len, rc;
+
+        if (!inode || !(sbi = ll_i2sbi(inode)))
+                GOTO(out_statfs, rc = -EINVAL);
+
+        rc = obd_ioctl_getdata(&buf, &len, arg);
+        if (rc)
+                GOTO(out_statfs, rc);
+
+        data = (void*)buf;
+        if (!data->ioc_inlbuf1 || !data->ioc_inlbuf2 ||
+            !data->ioc_pbuf1 || !data->ioc_pbuf2)
+                GOTO(out_statfs, rc = -EINVAL);
+
+        memcpy(&type, data->ioc_inlbuf1, sizeof(__u32));
+        memcpy(&index, data->ioc_inlbuf2, sizeof(__u32));
+
+        if (type == LL_STATFS_MDC) {
+                if (index > 0)
+                        GOTO(out_statfs, rc = -ENODEV);
+                client_obd = class_exp2obd(sbi->ll_mdc_exp);
+                client_imp = class_exp2cliimp(sbi->ll_mdc_exp);
+        } else if (type == LL_STATFS_LOV) {
+                lov_obd = class_exp2obd(sbi->ll_osc_exp);
+                lov = &lov_obd->u.lov;
+
+                if (index >= lov->desc.ld_tgt_count)
+                        GOTO(out_statfs, rc = -ENODEV);
+
+                client_obd = class_exp2obd(lov->tgts[index].ltd_exp);
+                client_imp = class_exp2cliimp(lov->tgts[index].ltd_exp);
+                if (!lov->tgts[index].active)
+                        GOTO(out_uuid, rc = -ENODATA);
+        }
+
+        if (!client_obd || !client_imp)
+                GOTO(out_statfs, rc = -EINVAL);
+
+        rc = obd_statfs(client_obd, &stat_buf, jiffies - 1);
+        if (rc)
+                GOTO(out_statfs, rc);
+
+        /* ioc_plen1 comes from user space: never copy more than the object
+         * we actually hold, or adjacent kernel stack would be disclosed. */
+        copy_len = data->ioc_plen1;
+        if (copy_len > sizeof(stat_buf))
+                copy_len = sizeof(stat_buf);
+        if (copy_to_user(data->ioc_pbuf1, &stat_buf, copy_len))
+                GOTO(out_statfs, rc = -EFAULT);
+
+out_uuid:
+        /* The inactive-target path jumps here before client_imp has been
+         * validated, so it may still be NULL. */
+        if (client_imp != NULL) {
+                /* Same clamping for the user-supplied ioc_plen2. */
+                copy_len = data->ioc_plen2;
+                if (copy_len > sizeof(client_imp->imp_target_uuid))
+                        copy_len = sizeof(client_imp->imp_target_uuid);
+                if (copy_to_user(data->ioc_pbuf2,
+                                 &client_imp->imp_target_uuid, copy_len))
+                        rc = -EFAULT;
+        }
+
+out_statfs:
+        if (buf)
+                obd_ioctl_freedata(buf, len);
+        return rc;
+}
index c08020d..ffc71de 100755 (executable)
 
 #ifdef __KERNEL__
 #include <asm/div64.h>
+#include <libcfs/libcfs.h>
 #else
 #include <liblustre.h>
 #endif
 
-#include <linux/obd_class.h>
-#include <linux/obd_lov.h>
-#include <linux/lustre_idl.h>
-#include <linux/lustre_log.h>
+#include <obd_class.h>
+#include <obd_lov.h>
+#include <lustre_idl.h>
+#include <lustre_mds.h>
+#include <lustre_log.h>
 
 #include "lov_internal.h"
 
index 5829fa9..2af645e 100644 (file)
@@ -132,8 +132,8 @@ int lov_stripe_number(struct lov_stripe_md *lsm, obd_off lov_off);
 
 /* lov_qos.c */
 void qos_shrink_lsm(struct lov_request_set *set);
-int qos_prep_create(struct obd_export *exp, struct lov_request_set *set);
-void qos_update(struct lov_obd *lov, int idx, struct obd_statfs *osfs);
+int qos_prep_create(struct lov_obd *lov, struct lov_request_set *set,
+                    int newea);
 int qos_remedy_create(struct lov_request_set *set, struct lov_request *req);
 
 /* lov_request.c */
index 0ebda37..f0a409f 100644 (file)
 
 #include "lov_internal.h"
 
-/* obd methods */
+
+/* FIXME add lov_get/putrefs around every access to lov->tgts for on-line non-
+   quiescent ost removal */
+/* Keep a refcount of lov->tgt usage to prevent racing with deletion */
+/* Take one reference on the lov target table.  The increment is done with
+ * lov_lock held, so a caller waits here while lov_putref() holds the lock. */
+static void lov_getref(struct obd_device *obd)
+{
+        struct lov_obd *lov_dev = &obd->u.lov;
+
+        down(&lov_dev->lov_lock);
+        atomic_inc(&lov_dev->refcount);
+        up(&lov_dev->lov_lock);
+}
+
+static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt);
+
+/* Drop one reference on the lov target table.  When the count reaches zero
+ * and targets are queued on death_row, every target flagged ->reap is torn
+ * down through __lov_del_obd().  The whole sequence runs under lov_lock. */
+static void lov_putref(struct obd_device *obd)
+{
+        struct lov_obd *lov = &obd->u.lov;
+        struct lov_tgt_desc *victim;
+        int idx;
+
+        down(&lov->lov_lock);
+        /* ok to dec to 0 more than once -- ltd_exp's will be null */
+        if (!atomic_dec_and_test(&lov->refcount) || !lov->death_row)
+                goto out_unlock;
+
+        CDEBUG(D_CONFIG, "destroying %d lov targets\n", lov->death_row);
+        idx = 0;
+        victim = lov->tgts;
+        while (idx < lov->desc.ld_tgt_count) {
+                if (victim->reap) {
+                        /* Disconnect and delete from list */
+                        __lov_del_obd(obd, victim);
+                        lov->death_row--;
+                }
+                idx++;
+                victim++;
+        }
+
+out_unlock:
+        up(&lov->lov_lock);
+}
+
 #define MAX_STRING_SIZE 128
 static int lov_connect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt,
                            int activate, struct obd_connect_data *data)
@@ -118,6 +155,7 @@ static int lov_connect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt,
         }
 
         tgt->active = 1;
+        tgt->reap = 0;
         lov->desc.ld_active_tgt_count++;
 
 #ifdef __KERNEL__
@@ -167,8 +205,8 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd,
 
         /* We don't want to actually do the underlying connections more than
          * once, so keep track. */
-        lov->refcount++;
-        if (lov->refcount > 1) {
+        lov->connects++;
+        if (lov->connects > 1) {
                 class_export_put(exp);
                 RETURN(0);
         }
@@ -176,6 +214,8 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd,
         for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count; i++, tgt++) {
                 if (obd_uuid_empty(&tgt->uuid))
                         continue;
+                if (connect_flags & OBD_CONNECT_INDEX)
+                        data->ocd_index = i;
                 rc = lov_connect_obd(obd, tgt, 0, data);
                 if (rc)
                         GOTO(out_disc, rc);
@@ -259,43 +299,38 @@ static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt)
         RETURN(0);
 }
 
-static int
-lov_del_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen);
+static int lov_del_obd(struct obd_device *obd, struct obd_uuid *uuidp,
+                       int index, int gen);
 
 static int lov_disconnect(struct obd_export *exp)
 {
         struct obd_device *obd = class_exp2obd(exp);
-        struct obd_device *osc_obd;
         struct lov_obd *lov = &obd->u.lov;
         struct lov_tgt_desc *tgt;
-        int rc, i;
+        int i, rc;
         ENTRY;
 
-        rc = class_disconnect(exp);
-
         if (!lov->tgts)
-                RETURN(rc);
+                goto out;
 
         /* Only disconnect the underlying layers on the final disconnect. */
-        lov->refcount--;
-        if (lov->refcount != 0)
-                RETURN(rc);
+        lov->connects--;
+        if (lov->connects != 0)
+                goto out;
 
+        /* Let's hold another reference so lov_del_obd doesn't spin through
+           putref every time */
+        lov_getref(obd);
         for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count; i++, tgt++) {
                 if (tgt->ltd_exp) {
-                        osc_obd = class_exp2obd(tgt->ltd_exp);
-                        /* Disconnect and delete from list */
+                        /* Disconnection is the last we know about an obd */
                         lov_del_obd(obd, &tgt->uuid, i, tgt->ltd_gen);
-                        /* Cleanup the osc now - can't do it from 
-                           lov_cleanup because we just lost our only reference
-                           to it. */ 
-                        /* Use lov's force/fail flags. */
-                        osc_obd->obd_force = obd->obd_force;
-                        osc_obd->obd_fail = obd->obd_fail;
-                        class_manual_cleanup(osc_obd);
                 }
         }
+        lov_putref(obd);
 
+out:
+        rc = class_disconnect(exp);
         RETURN(rc);
 }
 
@@ -315,7 +350,6 @@ static int lov_set_osc_active(struct lov_obd *lov, struct obd_uuid *uuid,
         CDEBUG(D_INFO, "Searching in lov %p for uuid %s (activate=%d)\n",
                lov, uuid->uuid, activate);
 
-        spin_lock(&lov->lov_lock);
         for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count; i++, tgt++) {
                 if (tgt->ltd_exp == NULL)
                         continue;
@@ -346,18 +380,17 @@ static int lov_set_osc_active(struct lov_obd *lov, struct obd_uuid *uuid,
 
         EXIT;
  out:
-        spin_unlock(&lov->lov_lock);
         return rc;
 }
 
 static int lov_notify(struct obd_device *obd, struct obd_device *watched,
-                       int active)
+                      enum obd_notify_event ev)
 {
-        int rc;
         struct obd_uuid *uuid;
-
+        int rc;
         ENTRY;
-        if (strcmp(watched->obd_type->typ_name, "osc")) {
+
+        if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) {
                 CERROR("unexpected notification of %s %s!\n",
                        watched->obd_type->typ_name,
                        watched->obd_name);
@@ -365,19 +398,24 @@ static int lov_notify(struct obd_device *obd, struct obd_device *watched,
         }
         uuid = &watched->u.cli.cl_import->imp_target_uuid;
 
-        /* Set OSC as active before notifying the observer, so the
-         * observer can use the OSC normally.
-         */
-        rc = lov_set_osc_active(&obd->u.lov, uuid, active);
-        if (rc) {
-                CERROR("%sactivation of %s failed: %d\n",
-                       active ? "" : "de", uuid->uuid, rc);
-                RETURN(rc);
+        if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE) {
+                /* Set OSC as active before notifying the observer, so the
+                 * observer can use the OSC normally.
+                 */
+                lov_getref(obd);
+                rc = lov_set_osc_active(&obd->u.lov, uuid,
+                                        ev == OBD_NOTIFY_ACTIVE);
+                lov_putref(obd);
+                if (rc) {
+                        CERROR("%sactivation of %s failed: %d\n",
+                               (ev == OBD_NOTIFY_ACTIVE) ? "" : "de",
+                               uuid->uuid, rc);
+                        RETURN(rc);
+                }
         }
 
-        if (obd->obd_observer)
-                /* Pass the notification up the chain. */
-                rc = obd_notify(obd->obd_observer, watched, active);
+        /* Pass the notification up the chain. */
+        rc = obd_notify_observer(obd, watched, ev);
 
         RETURN(rc);
 }
@@ -387,8 +425,9 @@ lov_add_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen)
 {
         struct lov_obd *lov = &obd->u.lov;
         struct lov_tgt_desc *tgt;
+        obd_id params[2];
         int rc, old_count;
-        __u32 bufsize;
+        __u32 bufsize, size = 2;
         ENTRY;
 
         CDEBUG(D_CONFIG, "uuid: %s idx: %d gen: %d\n",
@@ -415,15 +454,9 @@ lov_add_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen)
                         RETURN(-ENOMEM);
                 }
 
+                memset(tgt, 0, bufsize);
                 if (lov->tgts) {
-                        int i;
                         memcpy(tgt, lov->tgts, lov->bufsize);
-                        LASSERT(index == lov->desc.ld_tgt_count);
-                        for (i = 0; i < index; i++) {
-                                INIT_LIST_HEAD(&tgt[i].qos_bavail_list);
-                                list_splice(&lov->tgts[i].qos_bavail_list, 
-                                            &tgt[i].qos_bavail_list);
-                        }
                         OBD_FREE(lov->tgts, lov->bufsize);
                 }
 
@@ -443,8 +476,6 @@ lov_add_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen)
         tgt->uuid = *uuidp;
         /* XXX - add a sanity check on the generation number. */
         tgt->ltd_gen = gen;
-        tgt->index = index;
-        INIT_LIST_HEAD(&tgt->qos_bavail_list);
 
         old_count = lov->desc.ld_tgt_count;
         if (index >= lov->desc.ld_tgt_count)
@@ -453,7 +484,7 @@ lov_add_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen)
         CDEBUG(D_CONFIG, "idx=%d ltd_gen=%d ld_tgt_count=%d\n",
                 index, tgt->ltd_gen, lov->desc.ld_tgt_count);
 
-        if (lov->refcount == 0)
+        if (lov->connects == 0)
                 /* lov_connect hasn't been called yet. So we'll do the
                    lov_connect_obd on this obd when that fn first runs. */
                 RETURN(0);
@@ -475,7 +506,18 @@ lov_add_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen)
         obd_llog_finish(obd->obd_observer, old_count);
         llog_cat_initialize(obd->obd_observer, lov->desc.ld_tgt_count);
 
-        rc = lov_notify(obd, tgt->ltd_exp->exp_obd, 1);
+        params[0] = index;
+        rc = obd_get_info(tgt->ltd_exp, strlen("last_id"), "last_id", &size,
+                          &params[1]);
+        if (rc)
+                GOTO(out, rc);
+
+        rc = obd_set_info(obd->obd_observer->obd_self_export,
+                          strlen("next_id"),"next_id", 2, params);
+        if (rc)
+                GOTO(out, rc);
+
+        rc = lov_notify(obd, tgt->ltd_exp->exp_obd, OBD_NOTIFY_ACTIVE);
         GOTO(out, rc);
  out:
         if (rc && tgt->ltd_exp != NULL)
@@ -483,6 +525,7 @@ lov_add_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen)
         return rc;
 }
 
+/* Schedule a target for deletion */
 static int
 lov_del_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen)
 {
@@ -492,9 +535,6 @@ lov_del_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen)
         int rc = 0;
         ENTRY;
 
-        CDEBUG(D_CONFIG, "uuid: %s idx: %d gen: %d\n",
-               uuidp->uuid, index, gen);
-
         if (index >= count) {
                 CERROR("LOV target index %d >= number of LOV OBDs %d.\n",
                        index, count);
@@ -514,6 +554,25 @@ lov_del_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen)
                 RETURN(-EINVAL);
         }
 
+        CDEBUG(D_CONFIG, "uuid: %s idx: %d gen: %d exp: %p active: %d\n",
+               tgt->uuid.uuid, index, tgt->ltd_gen, tgt->ltd_exp, tgt->active);
+
+        lov_getref(obd);
+        tgt->reap = 1;
+        lov->death_row++;
+        /* we really delete it from lov_putref */
+        lov_putref(obd);
+
+        RETURN(rc);
+}
+
+static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt)
+{
+        struct obd_device *osc_obd;
+
+        LASSERT(tgt->reap);
+        osc_obd = class_exp2obd(tgt->ltd_exp);
+
         if (tgt->ltd_exp)
                 lov_disconnect_obd(obd, tgt);
 
@@ -524,10 +583,15 @@ lov_del_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen)
         /* lt_gen = 0 will mean it will not match the gen of any valid loi */
         memset(tgt, 0, sizeof(*tgt));
 
-        CDEBUG(D_CONFIG, "uuid: %s idx: %d gen: %d exp: %p active: %d\n",
-               tgt->uuid.uuid, index, tgt->ltd_gen, tgt->ltd_exp, tgt->active);
-
-        RETURN(rc);
+        /* Manual cleanup - no cleanup logs to clean up the osc's.  We must
+           do it ourselves. And we can't do it from lov_cleanup,
+           because we just lost our only reference to it. */
+        if (osc_obd) {
+                /* Use lov's force/fail flags. */
+                osc_obd->obd_force = obd->obd_force;
+                osc_obd->obd_fail = obd->obd_fail;
+                class_manual_cleanup(osc_obd);
+        }
 }
 
 static int lov_setup(struct obd_device *obd, obd_count len, void *buf)
@@ -536,8 +600,7 @@ static int lov_setup(struct obd_device *obd, obd_count len, void *buf)
         struct lustre_cfg *lcfg = buf;
         struct lov_desc *desc;
         struct lov_obd *lov = &obd->u.lov;
-        struct lov_tgt_desc *tgts;
-        int count, i;
+        int count;
         ENTRY;
 
         if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
@@ -602,15 +665,12 @@ static int lov_setup(struct obd_device *obd, obd_count len, void *buf)
                 CERROR("Out of memory\n");
                 RETURN(-EINVAL);
         }
-        for (i = 0, tgts = lov->tgts; i < max(count, 1); i++, tgts++) {
-                tgts->index = i;
-                INIT_LIST_HEAD(&tgts->qos_bavail_list);
-        }
+        memset(lov->tgts, 0, lov->bufsize);
 
         desc->ld_active_tgt_count = 0;
         lov->desc = *desc;
-        spin_lock_init(&lov->lov_lock);
-        INIT_LIST_HEAD(&lov->qos_bavail_list);
+        sema_init(&lov->lov_lock, 1);
+        atomic_set(&lov->refcount, 0);
 
         lprocfs_init_vars(lov, &lvars);
         lprocfs_obd_setup(obd, lvars.obd_vars);
@@ -635,13 +695,23 @@ static int lov_precleanup(struct obd_device *obd, int stage)
         int rc = 0;
         ENTRY;
 
-        if (stage < 2)
-                RETURN(0);
-
-        rc = obd_llog_finish(obd, 0);
-        if (rc != 0)
-                CERROR("failed to cleanup llogging subsystems\n");
-
+        switch (stage) {
+        case OBD_CLEANUP_EARLY: {
+                struct lov_obd *lov = &obd->u.lov;
+                int i;
+                for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+                        if (!lov->tgts[i].active)
+                                continue;
+                        obd_precleanup(class_exp2obd(lov->tgts[i].ltd_exp),
+                                       OBD_CLEANUP_EARLY);
+                }
+                break;
+        }
+        case OBD_CLEANUP_SELF_EXP:
+                rc = obd_llog_finish(obd, 0);
+                if (rc != 0)
+                        CERROR("failed to cleanup llogging subsystems\n");
+        }
         RETURN(rc);
 }
 
@@ -649,15 +719,18 @@ static int lov_cleanup(struct obd_device *obd)
 {
         struct lov_obd *lov = &obd->u.lov;
 
-        ENTRY;
         lprocfs_obd_cleanup(obd);
         if (lov->tgts) {
                 int i;
                 struct lov_tgt_desc *tgt;
                 for (i = 0, tgt = lov->tgts;
                       i < lov->desc.ld_tgt_count; i++, tgt++) {
-                        if (!obd_uuid_empty(&tgt->uuid))
+                        /* We should never get here - these should have
+                           been removed in the disconnect. */
+                        if (!obd_uuid_empty(&tgt->uuid)) {
+                                CERROR("lov tgt %d not cleaned!\n", i);
                                 lov_del_obd(obd, &tgt->uuid, i, 0);
+                        }
                 }
                 OBD_FREE(lov->tgts, lov->bufsize);
         }
@@ -747,8 +820,6 @@ static int lov_clear_orphans(struct obd_export *export, struct obdo *src_oa,
                         continue;
 
                 memcpy(tmp_oa, src_oa, sizeof(*tmp_oa));
-                tmp_oa->o_valid |= OBD_MD_FLID;
-                tmp_oa->o_id = oti->oti_objid[i];
 
                 LASSERT(lov->tgts[i].ltd_exp);
                 /* XXX: LOV STACKING: use real "obj_mdp" sub-data */
@@ -766,16 +837,52 @@ static int lov_clear_orphans(struct obd_export *export, struct obdo *src_oa,
         RETURN(rc);
 }
 
+static int lov_recreate(struct obd_export *exp, struct obdo *src_oa,
+                        struct lov_stripe_md **ea, struct obd_trans_info *oti)
+{
+        struct lov_stripe_md *obj_mdp, *lsm;
+        struct lov_obd *lov = &exp->exp_obd->u.lov;
+        unsigned ost_idx;
+        int rc, i;
+        ENTRY;
+
+        LASSERT(src_oa->o_valid & OBD_MD_FLFLAGS &&
+                src_oa->o_flags & OBD_FL_RECREATE_OBJS);
+
+        OBD_ALLOC(obj_mdp, sizeof(*obj_mdp));
+        if (obj_mdp == NULL)
+                RETURN(-ENOMEM);
+
+        ost_idx = src_oa->o_nlink;
+        lsm = *ea;
+        if (lsm == NULL)
+                GOTO(out, rc = -EINVAL);
+        if (ost_idx >= lov->desc.ld_tgt_count)
+                GOTO(out, rc = -EINVAL);
+
+        for (i = 0; i < lsm->lsm_stripe_count; i++) {
+                if (lsm->lsm_oinfo[i].loi_ost_idx == ost_idx) {
+                        if (lsm->lsm_oinfo[i].loi_id != src_oa->o_id)
+                                GOTO(out, rc = -EINVAL);
+                        break;
+                }
+        }
+        if (i == lsm->lsm_stripe_count)
+                GOTO(out, rc = -EINVAL);
+
+        rc = obd_create(lov->tgts[ost_idx].ltd_exp, src_oa, &obj_mdp, oti);
+out:
+        OBD_FREE(obj_mdp, sizeof(*obj_mdp));
+        RETURN(rc);
+}
+
 /* the LOV expects oa->o_id to be set to the LOV object id */
-static int
-lov_create(struct obd_export *exp, struct obdo *src_oa,
-           struct lov_stripe_md **ea, struct obd_trans_info *oti)
+static int lov_create(struct obd_export *exp, struct obdo *src_oa,
+                      struct lov_stripe_md **ea, struct obd_trans_info *oti)
 {
-        struct lov_request_set *set = NULL;
         struct lov_obd *lov;
-        struct obd_statfs osfs;
-        cfs_time_t maxage;
-        struct lov_request *req;
+        struct lov_request_set *set = NULL;
+        struct list_head *pos;
         int rc = 0;
         ENTRY;
 
@@ -789,22 +896,25 @@ lov_create(struct obd_export *exp, struct obdo *src_oa,
                 RETURN(rc);
         }
 
-        LASSERT(ergo(src_oa->o_valid & OBD_MD_FLFLAGS,
-                     !!(src_oa->o_flags & OBD_FL_CREATE_CROW) !=
-                     !!(src_oa->o_flags & OBD_FL_RECREATE_OBJS)));
-        
         lov = &exp->exp_obd->u.lov;
         if (!lov->desc.ld_active_tgt_count)
                 RETURN(-EIO);
-        
-        maxage = cfs_time_shift(-lov->desc.ld_qos_maxage);
-        obd_statfs(exp->exp_obd, &osfs, maxage);                
+
+        /* Recreate a specific object id at the given OST index */
+        if ((src_oa->o_valid & OBD_MD_FLFLAGS) &&
+            (src_oa->o_flags & OBD_FL_RECREATE_OBJS)) {
+                 rc = lov_recreate(exp, src_oa, ea, oti);
+                 RETURN(rc);
+        }
 
         rc = lov_prep_create_set(exp, ea, src_oa, oti, &set);
         if (rc)
                 RETURN(rc);
 
-        list_for_each_entry(req, &set->set_list, rq_link) {
+        list_for_each (pos, &set->set_list) {
+                struct lov_request *req =
+                        list_entry(pos, struct lov_request, rq_link);
+
                 /* XXX: LOV STACKING: use real "obj_mdp" sub-data */
                 rc = obd_create(lov->tgts[req->rq_idx].ltd_exp,
                                 req->rq_oa, &req->rq_md, oti);
@@ -814,15 +924,17 @@ lov_create(struct obd_export *exp, struct obdo *src_oa,
         RETURN(rc);
 }
 
-#define ASSERT_LSM_MAGIC(lsmp)                                          \
-do {                                                                    \
-        LASSERT((lsmp) != NULL);                                        \
-        LASSERTF((lsmp)->lsm_magic == LOV_MAGIC, "%p->lsm_magic=%x\n",  \
-                 (lsmp), (lsmp)->lsm_magic);                            \
+#define ASSERT_LSM_MAGIC(lsmp)                                                  \
+do {                                                                            \
+        LASSERT((lsmp) != NULL);                                                \
+        LASSERTF(((lsmp)->lsm_magic == LOV_MAGIC ||                             \
+                 (lsmp)->lsm_magic == LOV_MAGIC_JOIN), "%p->lsm_magic=%x\n",    \
+                 (lsmp), (lsmp)->lsm_magic);                                    \
 } while (0)
 
 static int lov_destroy(struct obd_export *exp, struct obdo *oa,
-                       struct lov_stripe_md *lsm, struct obd_trans_info *oti)
+                       struct lov_stripe_md *lsm, struct obd_trans_info *oti,
+                       struct obd_export *md_exp)
 {
         struct lov_request_set *set;
         struct lov_request *req;
@@ -848,7 +960,7 @@ static int lov_destroy(struct obd_export *exp, struct obdo *oa,
                 /* XXX update the cookie position */
                 oti->oti_logcookies = set->set_cookies + req->rq_stripe;
                 rc = obd_destroy(lov->tgts[req->rq_idx].ltd_exp, req->rq_oa,
-                                 NULL, oti);
+                                 NULL, oti, NULL);
                 err = lov_update_common_set(set, req, rc);
                 if (rc) {
                         CERROR("error: destroying objid "LPX64" subobj "
@@ -860,6 +972,10 @@ static int lov_destroy(struct obd_export *exp, struct obdo *oa,
                 }
         }
         lov_fini_destroy_set(set);
+        if (rc == 0) {
+                LASSERT(lsm_op_find(lsm->lsm_magic) != NULL);
+                rc = lsm_op_find(lsm->lsm_magic)->lsm_destroy(lsm, oa, md_exp);
+        }
         RETURN(rc);
 }
 
@@ -1001,7 +1117,7 @@ static int lov_setattr(struct obd_export *exp, struct obdo *src_oa,
                                       OBD_MD_FLUID | OBD_MD_FLGID | OBD_MD_FLINLINE |
                                       OBD_MD_FLFID | OBD_MD_FLGENER)));
         lov = &exp->exp_obd->u.lov;
-        rc = lov_prep_setattr_set(exp, src_oa, lsm, NULL, &set);
+        rc = lov_prep_setattr_set(exp, src_oa, lsm, oti, &set);
         if (rc)
                 RETURN(rc);
 
@@ -1046,7 +1162,7 @@ static int lov_setattr_async(struct obd_export *exp, struct obdo *src_oa,
 
         LASSERT(!(src_oa->o_valid &  ~(OBD_MD_FLID | OBD_MD_FLUID |
                                        OBD_MD_FLGID| OBD_MD_FLCOOKIE |
-                                      OBD_MD_FLFID | OBD_MD_FLGENER)));
+                                       OBD_MD_FLFID | OBD_MD_FLGENER)));
         lov = &exp->exp_obd->u.lov;
 
         loi = lsm->lsm_oinfo;
@@ -1393,7 +1509,6 @@ static int lov_queue_async_io(struct obd_export *exp,
         struct lov_async_page *lap;
         int rc;
 
-        ENTRY;
         LASSERT(loi == NULL);
 
         ASSERT_LSM_MAGIC(lsm);
@@ -1417,7 +1532,6 @@ static int lov_set_async_flags(struct obd_export *exp,
         struct lov_async_page *lap;
         int rc;
 
-        ENTRY;
         LASSERT(loi == NULL);
 
         ASSERT_LSM_MAGIC(lsm);
@@ -1442,7 +1556,6 @@ static int lov_queue_group_io(struct obd_export *exp,
         struct lov_async_page *lap;
         int rc;
 
-        ENTRY;
         LASSERT(loi == NULL);
 
         ASSERT_LSM_MAGIC(lsm);
@@ -1468,7 +1581,6 @@ static int lov_trigger_group_io(struct obd_export *exp,
         struct lov_obd *lov = &exp->exp_obd->u.lov;
         int rc = 0, i, err;
 
-        ENTRY;
         LASSERT(loi == NULL);
 
         ASSERT_LSM_MAGIC(lsm);
@@ -1496,7 +1608,6 @@ static int lov_teardown_async_page(struct obd_export *exp,
         struct lov_async_page *lap;
         int rc;
 
-        ENTRY;
         LASSERT(loi == NULL);
 
         ASSERT_LSM_MAGIC(lsm);
@@ -1771,7 +1882,7 @@ static int lov_join_lru(struct obd_export *exp,
         } while(0)
 
 static int lov_statfs(struct obd_device *obd, struct obd_statfs *osfs,
-                      cfs_time_t max_age)
+                      unsigned long max_age)
 {
         struct lov_obd *lov = &obd->u.lov;
         struct obd_statfs lov_sfs;
@@ -1796,7 +1907,6 @@ static int lov_statfs(struct obd_device *obd, struct obd_statfs *osfs,
                                 rc = err;
                         continue;
                 }
-                qos_update(lov, i, &lov_sfs);
 
                 if (!set) {
                         memcpy(osfs, &lov_sfs, sizeof(lov_sfs));
@@ -1804,7 +1914,7 @@ static int lov_statfs(struct obd_device *obd, struct obd_statfs *osfs,
                 } else {
 #ifdef MIN_DF
                         /* Sandia requested that df (and so, statfs) only
-                           returned minimal available space on 
+                           returned minimal available space on
                            a single OST, so people would be able to
                            write this much data guaranteed. */
                         if (osfs->os_bavail > lov_sfs.os_bavail) {
@@ -1956,12 +2066,14 @@ static int lov_get_info(struct obd_export *exp, __u32 keylen,
 {
         struct obd_device *obddev = class_exp2obd(exp);
         struct lov_obd *lov = &obddev->u.lov;
-        int i;
+        int i, rc;
         ENTRY;
 
         if (!vallen || !val)
                 RETURN(-EFAULT);
 
+        lov_getref(obddev);
+
         if (keylen > strlen("lock_to_stripe") &&
             strcmp(key, "lock_to_stripe") == 0) {
                 struct {
@@ -1969,11 +2081,12 @@ static int lov_get_info(struct obd_export *exp, __u32 keylen,
                         struct ldlm_lock *lock;
                         struct lov_stripe_md *lsm;
                 } *data = key;
+                struct ldlm_res_id *res_id = &data->lock->l_resource->lr_name;
                 struct lov_oinfo *loi;
                 __u32 *stripe = val;
 
                 if (*vallen < sizeof(*stripe))
-                        RETURN(-EFAULT);
+                        GOTO(out, rc = -EFAULT);
                 *vallen = sizeof(*stripe);
 
                 /* XXX This is another one of those bits that will need to
@@ -1985,49 +2098,40 @@ static int lov_get_info(struct obd_export *exp, __u32 keylen,
                 for (i = 0, loi = data->lsm->lsm_oinfo;
                      i < data->lsm->lsm_stripe_count;
                      i++, loi++) {
-                        if (lov->tgts[loi->loi_ost_idx].ltd_exp ==
-                            data->lock->l_conn_export) {
+                         if (lov->tgts[loi->loi_ost_idx].ltd_exp ==
+                             data->lock->l_conn_export &&
+                             loi->loi_id == res_id->name[0] &&
+                             loi->loi_gr == res_id->name[2]) {
                                 *stripe = i;
-                                RETURN(0);
+                                GOTO(out, rc = 0);
                         }
                 }
-                LDLM_ERROR(data->lock, "lock on inode without such object\n");
+                LDLM_ERROR(data->lock, "lock on inode without such object");
                 dump_lsm(D_ERROR, data->lsm);
-                RETURN(-ENXIO);
-        } else if (keylen >= strlen("size_to_stripe") &&
-                   strcmp(key, "size_to_stripe") == 0) {
-                struct {
-                        int stripe_number;
-                        __u64 size;
-                        struct lov_stripe_md *lsm;
-                } *data = val;
-
-                if (*vallen < sizeof(*data))
-                        RETURN(-EFAULT);
-
-                data->size = lov_size_to_stripe(data->lsm, data->size,
-                                                data->stripe_number);
-                RETURN(0);
+                GOTO(out, rc = -ENXIO);
         } else if (keylen >= strlen("last_id") && strcmp(key, "last_id") == 0) {
                 obd_id *ids = val;
-                int rc, size = sizeof(obd_id);
+                int size = sizeof(obd_id);
                 for (i = 0; i < lov->desc.ld_tgt_count; i++) {
                         if (!lov->tgts[i].active)
                                 continue;
                         rc = obd_get_info(lov->tgts[i].ltd_exp,
                                           keylen, key, &size, &(ids[i]));
                         if (rc != 0)
-                                RETURN(rc);
+                                GOTO(out, rc);
                 }
-                RETURN(0);
+                GOTO(out, rc = 0);
         } else if (keylen >= strlen("lovdesc") && strcmp(key, "lovdesc") == 0) {
                 struct lov_desc *desc_ret = val;
                 *desc_ret = lov->desc;
 
-                RETURN(0);
+                GOTO(out, rc = 0);
         }
 
-        RETURN(-EINVAL);
+        rc = -EINVAL;
+out:
+        lov_putref(obddev);
+        RETURN(rc);
 }
 
 static int lov_set_info(struct obd_export *exp, obd_count keylen,
@@ -2038,7 +2142,15 @@ static int lov_set_info(struct obd_export *exp, obd_count keylen,
         int i, rc = 0, err;
         ENTRY;
 
-        if (KEY_IS("checksum")) {
+        if (KEY_IS("next_id")) {
+                if (vallen != lov->desc.ld_tgt_count)
+                        RETURN(-EINVAL);
+                vallen = sizeof(obd_id);
+        }
+
+        lov_getref(obddev);
+
+        if (KEY_IS("next_id") || KEY_IS("checksum")) {
                 for (i = 0; i < lov->desc.ld_tgt_count; i++) {
                         /* OST was disconnected */
                         if (!lov->tgts[i].ltd_exp)
@@ -2050,7 +2162,7 @@ static int lov_set_info(struct obd_export *exp, obd_count keylen,
                         if (!rc)
                                 rc = err;
                 }
-                RETURN(rc);
+                GOTO(out, rc);
         }
 
         if (KEY_IS("evict_by_nid")) {
@@ -2064,14 +2176,14 @@ static int lov_set_info(struct obd_export *exp, obd_count keylen,
                         if (!rc)
                                 rc = err;
                 }
-                RETURN(rc);
+                GOTO(out, rc);
         }
 
         if (KEY_IS("mds_conn") || KEY_IS("unlinked")) {
                 if (vallen != 0)
-                        RETURN(-EINVAL);
+                        GOTO(out, rc = -EINVAL);
         } else {
-                RETURN(-EINVAL);
+                GOTO(out, rc = -EINVAL);
         }
 
         for (i = 0; i < lov->desc.ld_tgt_count; i++) {
@@ -2090,6 +2202,23 @@ static int lov_set_info(struct obd_export *exp, obd_count keylen,
                 if (!rc)
                         rc = err;
         }
+out:
+        lov_putref(obddev);
+        RETURN(rc);
+}
+
+static int lov_checkmd(struct obd_export *exp, struct obd_export *md_exp,
+                       struct lov_stripe_md *lsm)
+{
+        int rc;
+        ENTRY;
+
+        if (!lsm)
+                RETURN(0);
+        LASSERT(md_exp);
+        LASSERT(lsm_op_find(lsm->lsm_magic) != NULL);
+        rc = lsm_op_find(lsm->lsm_magic)->lsm_revalidate(lsm, md_exp->exp_obd);
         RETURN(rc);
 }
 
@@ -2226,6 +2355,7 @@ struct obd_ops lov_obd_ops = {
         .o_statfs              = lov_statfs,
         .o_packmd              = lov_packmd,
         .o_unpackmd            = lov_unpackmd,
+        .o_checkmd             = lov_checkmd,
         .o_create              = lov_create,
         .o_destroy             = lov_destroy,
         .o_getattr             = lov_getattr,
@@ -2240,6 +2370,7 @@ struct obd_ops lov_obd_ops = {
         .o_queue_group_io      = lov_queue_group_io,
         .o_trigger_group_io    = lov_trigger_group_io,
         .o_teardown_async_page = lov_teardown_async_page,
+        .o_merge_lvb           = lov_merge_lvb,
         .o_adjust_kms          = lov_adjust_kms,
         .o_punch               = lov_punch,
         .o_sync                = lov_sync,
@@ -2257,7 +2388,7 @@ struct obd_ops lov_obd_ops = {
         .o_notify              = lov_notify,
 };
 
-static quota_interface_t *quota_interface = NULL;
+static quota_interface_t *quota_interface;
 extern quota_interface_t lov_quota_interface;
 
 int __init lov_init(void)
@@ -2270,7 +2401,7 @@ int __init lov_init(void)
 
         quota_interface = PORTAL_SYMBOL_GET(lov_quota_interface);
         init_obd_quota_ops(quota_interface, &lov_obd_ops);
-        
+
         rc = class_register_type(&lov_obd_ops, lvars.module_vars,
                                  OBD_LOV_DEVICENAME);
         if (rc && quota_interface)
@@ -2292,5 +2423,6 @@ MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
 MODULE_DESCRIPTION("Lustre Logical Object Volume OBD driver");
 MODULE_LICENSE("GPL");
 
-cfs_module(lov, "1.0.0", lov_init, lov_exit);
+module_init(lov_init);
+module_exit(lov_exit);
 #endif
index ab73a1b..a3b9b42 100644 (file)
@@ -84,21 +84,23 @@ int qos_remedy_create(struct lov_request_set *set, struct lov_request *req)
         int stripe, i, rc = -EIO;
         ENTRY;
 
-        ost_idx = (req->rq_idx + 1) % ost_count; 
+        ost_idx = (req->rq_idx + lsm->lsm_stripe_count) % ost_count;
         for (i = 0; i < ost_count; i++, ost_idx = (ost_idx + 1) % ost_count) {
                 if (lov->tgts[ost_idx].active == 0) {
                         CDEBUG(D_HA, "lov idx %d inactive\n", ost_idx);
                         continue;
                 }
                 /* check if objects has been created on this ost */
-                for (stripe = req->rq_stripe; stripe >= 0; stripe--) {
+                for (stripe = 0; stripe < lsm->lsm_stripe_count; stripe++) {
+                        if (stripe == req->rq_stripe)
+                                continue;
                         if (ost_idx == lsm->lsm_oinfo[stripe].loi_ost_idx)
                                 break;
                 }
 
-                if (stripe < 0) {
+                if (stripe >= lsm->lsm_stripe_count) {
                         req->rq_idx = ost_idx;
-                        rc = obd_create(lov->tgts[ost_idx].ltd_exp, req->rq_oa, 
+                        rc = obd_create(lov->tgts[ost_idx].ltd_exp, req->rq_oa,
                                         &req->rq_md, set->set_oti);
                         if (!rc)
                                 break;
@@ -109,343 +111,73 @@ int qos_remedy_create(struct lov_request_set *set, struct lov_request *req)
 
 #define LOV_CREATE_RESEED_MULT 4
 #define LOV_CREATE_RESEED_MIN  1000
-/* alloc objects on osts with round-robin algorithm */
-static int alloc_rr(struct lov_obd *lov, int *idx_arr, int *stripe_cnt)
+/* FIXME use real qos data to prepare the lov create request */
+int qos_prep_create(struct lov_obd *lov, struct lov_request_set *set, int newea)
 {
-        static int ost_start_count, ost_start_idx;
+        static int ost_start_idx, ost_start_count;
         unsigned ost_idx, ost_count = lov->desc.ld_tgt_count;
         unsigned ost_active_count = lov->desc.ld_active_tgt_count;
-        int i, *idx_pos = idx_arr;
-        ENTRY;
-        
-        if (--ost_start_count <= 0) {
-                ost_start_idx = ll_insecure_random_int();
-                ost_start_count = 
-                        (LOV_CREATE_RESEED_MIN / max(ost_active_count, 1U) +
-                         LOV_CREATE_RESEED_MULT) * max(ost_active_count, 1U);
-        } else if (*stripe_cnt >= lov->desc.ld_active_tgt_count) {
-                /* If we allocate from all of the stripes, make the
-                 * next file start on the next OST. */
-                ++ost_start_idx;
-        }
-        ost_idx = ost_start_idx % ost_count;
-
-        for (i = 0; i < ost_count; i++, ost_idx = (ost_idx + 1) % ost_count) {
-                ++ost_start_idx;
-                
-                if (lov->tgts[ost_idx].active == 0) {
-                        CDEBUG(D_HA, "lov idx %d inactive\n", ost_idx);
-                        continue;
-                }
-                
-                *idx_pos = ost_idx;
-                idx_pos++;
-                /* got enough ost */
-                if (idx_pos - idx_arr == *stripe_cnt)
-                        RETURN(0);
-        }
-        *stripe_cnt = idx_pos - idx_arr;
-        RETURN(0);
-}
-
-/* alloc objects on osts with specific stripe offset */
-static int alloc_specific(struct lov_obd *lov, struct lov_stripe_md *lsm,
-                          int *idx_arr)
-{
-        unsigned ost_idx, ost_count = lov->desc.ld_tgt_count;
-        int i, *idx_pos = idx_arr;
+        struct lov_stripe_md *lsm = set->set_md;
+        struct obdo *src_oa = set->set_oa;
+        int i, rc = 0;
         ENTRY;
 
-        ost_idx = lsm->lsm_oinfo[0].loi_ost_idx;
-        for (i = 0; i < ost_count; i++, ost_idx = (ost_idx + 1) % ost_count) {
-                if (lov->tgts[ost_idx].active == 0) {
-                        CDEBUG(D_HA, "lov idx %d inactive\n", ost_idx);
-                        continue;
-                }
-                *idx_pos = ost_idx;
-                idx_pos++;
-                /* got enough ost */
-                if (idx_pos - idx_arr == lsm->lsm_stripe_count)
-                        RETURN(0);
-        }
-        /* If we were passed specific striping params, then a failure to
-         * meet those requirements is an error, since we can't reallocate
-         * that memory (it might be part of a larger array or something).
-         *
-         * We can only get here if lsm_stripe_count was originally > 1.
-         */
-        CERROR("can't lstripe objid "LPX64": have %u want %u\n",
-               lsm->lsm_object_id, idx_pos - idx_arr, lsm->lsm_stripe_count);
-        RETURN(-EFBIG);
-}
-
-/* free space OST must have to be used for object allocation. */
-#define QOS_MIN                 (lov->desc.ld_qos_threshold << 20)
-
-#define TGT_BAVAIL(tgt)         (tgt->ltd_exp->exp_obd->obd_osfs.os_bavail * \
-                                 tgt->ltd_exp->exp_obd->obd_osfs.os_bsize) 
-#define TGT_FFREE(tgt)          (tgt->ltd_exp->exp_obd->obd_osfs.os_ffree)
+        LASSERT(src_oa->o_valid & OBD_MD_FLID);
 
-/* alloc objects on osts with free space weighted algorithm */
-static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt)
-{
-        struct lov_obd *lov = &exp->exp_obd->u.lov;
-        unsigned ost_count = lov->desc.ld_tgt_count;
-        __u64 cur_bavail, rand, *availspace, total_bavail = 0;
-        int *indexes, nfound, good_osts, i, warn = 0, rc = 0;
-        struct lov_tgt_desc *tgt;
-        int shift, require_stripes = *stripe_cnt;
-        static time_t last_warn = 0;
-        time_t now = cfs_time_current_sec();
-        ENTRY;
-        
-        availspace = NULL;
-        indexes = NULL;
-        OBD_ALLOC(availspace, sizeof(__u64) * ost_count);
-        OBD_ALLOC(indexes, sizeof(int) * require_stripes);
-        if (!availspace || !indexes)
-                GOTO(out_free, rc = -EAGAIN);
-        
-        spin_lock(&lov->lov_lock);
-        /* if free space is below some threshold, just go
-         * to do round-robin allocation */
-        total_bavail = (exp->exp_obd->obd_osfs.os_bavail * \
-                        exp->exp_obd->obd_osfs.os_bsize);
-        if (ost_count < 2 || total_bavail <= QOS_MIN) {
-                spin_unlock(&lov->lov_lock);
-                GOTO(out_free, rc = -EAGAIN);
+        lsm->lsm_object_id = src_oa->o_id;
+        if (!lsm->lsm_stripe_size)
+                lsm->lsm_stripe_size = lov->desc.ld_default_stripe_size;
+        if (!lsm->lsm_pattern) {
+                lsm->lsm_pattern = lov->desc.ld_pattern ?
+                        lov->desc.ld_pattern : LOV_PATTERN_RAID0;
         }
 
-        /* if each ost has almost same free space, go to 
-         * do rr allocation for better creation performance */
-        if (!list_empty(&lov->qos_bavail_list)) {
-                __u64 max, min, val;
-                tgt = list_entry(lov->qos_bavail_list.next, 
-                                 struct lov_tgt_desc, qos_bavail_list);
-                max = TGT_BAVAIL(tgt);
-                tgt = list_entry(lov->qos_bavail_list.prev,
-                                 struct lov_tgt_desc, qos_bavail_list);
-                min = TGT_BAVAIL(tgt);
-
-                val = (max >= min) ? (max - min) : (min - max);
-                min = (min * 13) >> 8;          /* less than 5% of gap */ 
-
-                if (val < min) {
-                        spin_unlock(&lov->lov_lock);
-                        GOTO(out_free, rc = -EAGAIN);
+        if (newea || lsm->lsm_oinfo[0].loi_ost_idx >= ost_count) {
+                if (--ost_start_count <= 0) {
+                        ost_start_idx = ll_rand();
+                        ost_start_count =
+                          (LOV_CREATE_RESEED_MIN / max(ost_active_count, 1U) +
+                           LOV_CREATE_RESEED_MULT) * max(ost_active_count, 1U);
+                } else if (lsm->lsm_stripe_count >= ost_active_count) {
+                        /* If we allocate from all of the stripes, make the
+                         * next file start on the next OST. */
+                        ++ost_start_idx;
                 }
+                ost_idx = ost_start_idx % ost_count;
         } else {
-                spin_unlock(&lov->lov_lock);
-                GOTO(out_free, rc = -EAGAIN);
-        }
-        
-        total_bavail = 0;
-        good_osts = 0;
-        /* warn zero available space/inode every 30 min */
-        if (cfs_time_sub(now, last_warn) > 60 * 30)
-                warn = 1;
-        list_for_each_entry(tgt, &lov->qos_bavail_list, qos_bavail_list) {
-                if (!tgt->active)
-                        continue;
-                if (!TGT_BAVAIL(tgt)) {
-                        if (warn) {
-                                CWARN("avail space on %s is zero\n", 
-                                      tgt->uuid.uuid);
-                                last_warn = now;
-                        }
-                        continue;
-                }
-                if (!TGT_FFREE(tgt)) {
-                        if (warn) {
-                                CWARN("free inode on %s is zero\n", 
-                                      tgt->uuid.uuid);
-                                last_warn = now;
-                        }
-                        continue;
-                }
-                if ((TGT_BAVAIL(tgt) <= QOS_MIN) && (good_osts >= *stripe_cnt))
-                        break;
-                availspace[good_osts] = TGT_BAVAIL(tgt);
-                indexes[good_osts] = tgt->index;
-                total_bavail += availspace[good_osts];
-                good_osts++;
-        }
-        
-        spin_unlock(&lov->lov_lock);
-        
-        if (!total_bavail)
-                GOTO(out_free, rc = -ENOSPC);
-       
-        /* if we don't have enough good OSTs, we reduce the stripe count. */
-        if (good_osts < *stripe_cnt)
-                *stripe_cnt = good_osts;
-
-        if (!*stripe_cnt) 
-                GOTO(out_free, rc = -EAGAIN);
-        
-        nfound = shift = 0;
-        while ((total_bavail >> shift) > 0)
-                shift++;
-        shift++;
-        /* search enough OSTs with free space weighted random allocation */
-        while (nfound < *stripe_cnt) {
-                cur_bavail = 0;
-                
-                get_random_bytes(&rand, sizeof(rand));
-                if (shift < 64)
-                        rand &= ((1 << shift) - 1);
-                while (rand > total_bavail)
-                        rand -= total_bavail;
-                
-                for (i = 0; i < good_osts; i++) {
-                        cur_bavail += availspace[i];
-                        if (cur_bavail >= rand) {
-                                total_bavail -= availspace[i];
-                                availspace[i] = 0;
-                                idx_arr[nfound] = indexes[i];
-                                nfound++;
-                                break;
-                        }
-                }
-                /* should never satisfy below condition */
-                if (cur_bavail == 0)
-                        break;
+                ost_idx = lsm->lsm_oinfo[0].loi_ost_idx;
         }
-        LASSERT(nfound == *stripe_cnt);
-        
-out_free:
-        if (availspace)
-                OBD_FREE(availspace, sizeof(__u64) * ost_count);
-        if (indexes)
-                OBD_FREE(indexes, sizeof(int) * require_stripes);
-        if (rc != -EAGAIN)
-                RETURN(rc);
-
-        rc = alloc_rr(lov, idx_arr, stripe_cnt);
-        RETURN(rc);
-}
-
-/* return new alloced stripe count in success */
-static int alloc_idx_array(struct obd_export *exp, struct lov_stripe_md *lsm, 
-                           int newea, int **idx_arr, int *arr_cnt)
-{
-        struct lov_obd *lov = &exp->exp_obd->u.lov;
-        int stripe_cnt = lsm->lsm_stripe_count;
-        int i, rc = 0;
-        int *tmp_arr = NULL;
-        ENTRY;
-
-        *arr_cnt = stripe_cnt;
-        OBD_ALLOC(tmp_arr, *arr_cnt * sizeof(int));
-        if (tmp_arr == NULL)
-                RETURN(-ENOMEM);
-        for (i = 0; i < *arr_cnt; i++)
-                tmp_arr[i] = -1;
-
-        if (newea || 
-            lsm->lsm_oinfo[0].loi_ost_idx >= lov->desc.ld_tgt_count) 
-                rc = alloc_qos(exp, tmp_arr, &stripe_cnt);
-        else
-                rc = alloc_specific(lov, lsm, tmp_arr);
 
-        if (rc)
-                GOTO(out_arr, rc);
+        CDEBUG(D_INODE, "allocating %d subobjs for objid "LPX64" at idx %d\n",
+               lsm->lsm_stripe_count, lsm->lsm_object_id, ost_idx);
 
-        *idx_arr = tmp_arr;
-        RETURN(stripe_cnt);
-out_arr:
-        OBD_FREE(tmp_arr, *arr_cnt * sizeof(int));
-        *arr_cnt = 0;
-        RETURN(rc);
-}
-
-static void free_idx_array(int *idx_arr, int arr_cnt)
-{
-        if (arr_cnt)
-                OBD_FREE(idx_arr, arr_cnt * sizeof(int));
-}
-
-int qos_prep_create(struct obd_export *exp, struct lov_request_set *set)
-{
-        struct lov_obd *lov = &exp->exp_obd->u.lov;
-        struct lov_stripe_md *lsm;
-        struct obdo *src_oa = set->set_oa;
-        struct obd_trans_info *oti = set->set_oti;
-        int i, stripes, rc = 0, newea = 0;
-        int *idx_arr, idx_cnt = 0;
-        ENTRY;
-
-        LASSERT(src_oa->o_valid & OBD_MD_FLID);
-        if (set->set_md == NULL) {
-                int stripe_cnt = lov_get_stripecnt(lov, 0);
-
-                /* If the MDS file was truncated up to some size, stripe over
-                 * enough OSTs to allow the file to be created at that size. */
-                if (src_oa->o_valid & OBD_MD_FLSIZE) {
-                        struct lov_tgt_desc *tgt;
-                        stripes = 1;
-                        
-                        spin_lock(&lov->lov_lock);
-                        list_for_each_entry(tgt, &lov->qos_bavail_list, 
-                                            qos_bavail_list) {
-                                if (!tgt->active)
-                                        continue;
-                                if (TGT_BAVAIL(tgt) * stripes > src_oa->o_size)
-                                        break;
-                                stripes++;
-                        }
-                        spin_unlock(&lov->lov_lock);
+        for (i = 0; i < ost_count; i++, ost_idx = (ost_idx + 1) % ost_count) {
+                struct lov_request *req;
 
-                        if (stripes < stripe_cnt)
-                                stripes = stripe_cnt;
-                } else {
-                        stripes = stripe_cnt;
+                ++ost_start_idx;
+                if (lov->tgts[ost_idx].active == 0) {
+                        CDEBUG(D_HA, "lov idx %d inactive\n", ost_idx);
+                        continue;
                 }
 
-                rc = lov_alloc_memmd(&set->set_md, stripes, 
-                                     lov->desc.ld_pattern ?
-                                     lov->desc.ld_pattern : LOV_PATTERN_RAID0);
-                if (rc < 0)
-                        GOTO(out_err, rc);
-                rc = 0;
-                newea = 1;
-        }
-        lsm = set->set_md;
-       
-        lsm->lsm_object_id = src_oa->o_id;
-        if (!lsm->lsm_stripe_size)
-                lsm->lsm_stripe_size = lov->desc.ld_default_stripe_size;
-        if (!lsm->lsm_pattern) {
-                LASSERT(lov->desc.ld_pattern);
-                lsm->lsm_pattern = lov->desc.ld_pattern;
-        }
-
-        stripes = alloc_idx_array(exp, lsm, newea, &idx_arr, &idx_cnt);
-        LASSERT(stripes <= lsm->lsm_stripe_count);
-        if (stripes <= 0)
-                GOTO(out_err, rc = stripes ? stripes : -EIO);
-        
-        for (i = 0; i < stripes; i++) {
-                struct lov_request *req;
-                int ost_idx = idx_arr[i];
-                LASSERT(ost_idx >= 0);
-                
                 OBD_ALLOC(req, sizeof(*req));
                 if (req == NULL)
-                        GOTO(out_err, rc = -ENOMEM);
-                lov_set_add_req(req, set);
+                        GOTO(out, rc = -ENOMEM);
 
                 req->rq_buflen = sizeof(*req->rq_md);
                 OBD_ALLOC(req->rq_md, req->rq_buflen);
-                if (req->rq_md == NULL)
-                        GOTO(out_err, rc = -ENOMEM);
-                
+                if (req->rq_md == NULL) {
+                        OBD_FREE_PTR(req);
+                        GOTO(out, rc = -ENOMEM);
+                }
+
                 req->rq_oa = obdo_alloc();
-                if (req->rq_oa == NULL)
-                        GOTO(out_err, rc = -ENOMEM);
-                
+                if (req->rq_oa == NULL) {
+                        OBD_FREE_PTR(req->rq_md);
+                        OBD_FREE_PTR(req);
+                        GOTO(out, rc = -ENOMEM);
+                }
+
                 req->rq_idx = ost_idx;
                 req->rq_stripe = i;
                 /* create data objects with "parent" OA */
@@ -456,74 +188,41 @@ int qos_prep_create(struct obd_export *exp, struct lov_request_set *set)
                  *     stripe which holds the existing file size.
                  */
                 if (src_oa->o_valid & OBD_MD_FLSIZE) {
-                        req->rq_oa->o_size = 
-                                lov_size_to_stripe(lsm, src_oa->o_size, i);
+                        if (lov_stripe_offset(lsm, src_oa->o_size, i,
+                                              &req->rq_oa->o_size) < 0 &&
+                            req->rq_oa->o_size)
+                                req->rq_oa->o_size--;
 
                         CDEBUG(D_INODE, "stripe %d has size "LPU64"/"LPU64"\n",
                                i, req->rq_oa->o_size, src_oa->o_size);
                 }
 
+                lov_set_add_req(req, set);
+
+                /* If we have allocated enough objects, we are OK */
+                if (set->set_count == lsm->lsm_stripe_count)
+                        GOTO(out, rc = 0);
         }
-        LASSERT(set->set_count == stripes);
 
-        if (stripes < lsm->lsm_stripe_count)
-                qos_shrink_lsm(set);
+        if (set->set_count == 0)
+                GOTO(out, rc = -EIO);
 
-        if (oti && (src_oa->o_valid & OBD_MD_FLCOOKIE)) {
-                oti_alloc_cookies(oti, set->set_count);
-                if (!oti->oti_logcookies)
-                        GOTO(out_err, rc = -ENOMEM);
-                set->set_cookies = oti->oti_logcookies;
+        /* If we were passed specific striping params, then a failure to
+         * meet those requirements is an error, since we can't reallocate
+         * that memory (it might be part of a larger array or something).
+         *
+         * We can only get here if lsm_stripe_count was originally > 1.
+         */
+        if (!newea) {
+                CERROR("can't lstripe objid "LPX64": have %u want %u, rc %d\n",
+                       lsm->lsm_object_id, set->set_count,
+                       lsm->lsm_stripe_count, rc);
+                rc = rc ? rc : -EFBIG;
+        } else {
+                qos_shrink_lsm(set);
+                rc = 0;
         }
-out_err:
-        if (newea && rc)
-                obd_free_memmd(exp, &set->set_md);
-        free_idx_array(idx_arr, idx_cnt);
-        EXIT;
-        return rc;
-}
-
-/* An caveat here is don't use list_move() on same list */
-#define list_adjust(tgt, lov, list_name, value) \
-{ \
-        struct list_head *element; \
-        struct lov_tgt_desc *tmp;  \
-        if (list_empty(&(tgt)->list_name)) \
-                list_add(&(tgt)->list_name, &(lov)->list_name); \
-        element = (tgt)->list_name.next; \
-        while((element != &(lov)->list_name) && \
-              (tmp = list_entry(element, struct lov_tgt_desc, list_name)) && \
-              (value(tgt) < value(tmp))) \
-                element = element->next; \
-        if (element != (tgt)->list_name.next) { \
-                list_del_init(&(tgt)->list_name); \
-                list_add(&(tgt)->list_name, element->prev); \
-        } \
-        element = (tgt)->list_name.prev; \
-        while ((element != &(lov)->list_name) && \
-               (tmp = list_entry(element, struct lov_tgt_desc, list_name)) && \
-               (value(tgt) > value(tmp))) \
-                element = element->prev; \
-        if (element != (tgt)->list_name.prev) { \
-                list_del_init(&(tgt)->list_name); \
-                list_add_tail(&(tgt)->list_name, element->prev); \
-        } \
-}
+out:
 
-void qos_update(struct lov_obd *lov, int idx, struct obd_statfs *osfs)
-{
-        struct lov_tgt_desc *tgt = &lov->tgts[idx];
-        __u64 bavail;
-        ENTRY;
-        
-        bavail = osfs->os_bavail * osfs->os_bsize;
-        if (!bavail) 
-                CWARN("ost %d has zero avail space!\n", idx);
-        
-        CDEBUG(D_OTHER, "QOS: bfree now "LPU64"\n", bavail);
-        
-        spin_lock(&lov->lov_lock);
-        list_adjust(tgt, lov, qos_bavail_list, TGT_BAVAIL);
-        spin_unlock(&lov->lov_lock);
+        RETURN(rc);
 }
-
index 975bb9c..4403eda 100644 (file)
@@ -129,26 +129,24 @@ int lov_update_enqueue_set(struct lov_request_set *set,
          * can be addressed then. */
         if (rc == ELDLM_OK) {
                 struct ldlm_lock *lock = ldlm_handle2lock(lov_lockhp);
-                __u64 tmp = req->rq_md->lsm_oinfo->loi_rss;
+                __u64 tmp = req->rq_md->lsm_oinfo->loi_lvb.lvb_size;
 
                 LASSERT(lock != NULL);
                 lov_stripe_lock(set->set_md);
-                loi->loi_rss = tmp;
-                loi->loi_mtime = req->rq_md->lsm_oinfo->loi_mtime;
-                loi->loi_blocks = req->rq_md->lsm_oinfo->loi_blocks;
+                loi->loi_lvb = req->rq_md->lsm_oinfo->loi_lvb;
                 /* Extend KMS up to the end of this lock and no further
                  * A lock on [x,y] means a KMS of up to y + 1 bytes! */
                 if (tmp > lock->l_policy_data.l_extent.end)
                         tmp = lock->l_policy_data.l_extent.end + 1;
                 if (tmp >= loi->loi_kms) {
-                        LDLM_DEBUG(lock, "lock acquired, setting rss="
-                                   LPU64", kms="LPU64, loi->loi_rss, tmp);
+                        LDLM_DEBUG(lock, "lock acquired, setting rss="LPU64
+                                   ", kms="LPU64, loi->loi_lvb.lvb_size, tmp);
                         loi->loi_kms = tmp;
                         loi->loi_kms_valid = 1;
                 } else {
                         LDLM_DEBUG(lock, "lock acquired, setting rss="
                                    LPU64"; leaving kms="LPU64", end="LPU64,
-                                   loi->loi_rss, loi->loi_kms,
+                                   loi->loi_lvb.lvb_size, loi->loi_kms,
                                    lock->l_policy_data.l_extent.end);
                 }
                 lov_stripe_unlock(set->set_md);
@@ -157,12 +155,10 @@ int lov_update_enqueue_set(struct lov_request_set *set,
         } else if (rc == ELDLM_LOCK_ABORTED && flags & LDLM_FL_HAS_INTENT) {
                 memset(lov_lockhp, 0, sizeof(*lov_lockhp));
                 lov_stripe_lock(set->set_md);
-                loi->loi_rss = req->rq_md->lsm_oinfo->loi_rss;
-                loi->loi_mtime = req->rq_md->lsm_oinfo->loi_mtime;
-                loi->loi_blocks = req->rq_md->lsm_oinfo->loi_blocks;
+                loi->loi_lvb = req->rq_md->lsm_oinfo->loi_lvb;
                 lov_stripe_unlock(set->set_md);
                 CDEBUG(D_INODE, "glimpsed, setting rss="LPU64"; leaving"
-                       " kms="LPU64"\n", loi->loi_rss, loi->loi_kms);
+                       " kms="LPU64"\n", loi->loi_lvb.lvb_size, loi->loi_kms);
                 rc = ELDLM_OK;
         } else {
                 struct obd_export *exp = set->set_exp;
@@ -202,7 +198,7 @@ static int enqueue_done(struct lov_request_set *set, __u32 mode)
 
                 lov_lockhp = set->set_lockh->llh_handles + req->rq_stripe;
                 LASSERT(lov_lockhp);
-                if (lov_lockhp->cookie == 0)
+                if (!lustre_handle_is_used(lov_lockhp))
                         continue;
 
                 rc = obd_cancel(lov->tgts[req->rq_idx].ltd_exp, req->rq_md,
@@ -293,10 +289,8 @@ int lov_prep_enqueue_set(struct obd_export *exp, struct lov_stripe_md *lsm,
                 req->rq_md->lsm_object_id = loi->loi_id;
                 req->rq_md->lsm_stripe_count = 0;
                 req->rq_md->lsm_oinfo->loi_kms_valid = loi->loi_kms_valid;
-                req->rq_md->lsm_oinfo->loi_rss = loi->loi_rss;
                 req->rq_md->lsm_oinfo->loi_kms = loi->loi_kms;
-                req->rq_md->lsm_oinfo->loi_blocks = loi->loi_blocks;
-                req->rq_md->lsm_oinfo->loi_mtime = loi->loi_mtime;
+                req->rq_md->lsm_oinfo->loi_lvb = loi->loi_lvb;
 
                 lov_set_add_req(req, set);
         }
@@ -417,10 +411,10 @@ int lov_fini_cancel_set(struct lov_request_set *set)
         int rc = 0;
         ENTRY;
 
-        LASSERT(set->set_exp);
         if (set == NULL)
                 RETURN(0);
 
+        LASSERT(set->set_exp);
         if (set->set_lockh)
                 lov_llh_put(set->set_lockh);
 
@@ -458,7 +452,7 @@ int lov_prep_cancel_set(struct obd_export *exp, struct lov_stripe_md *lsm,
                 struct lustre_handle *lov_lockhp;
 
                 lov_lockhp = set->set_lockh->llh_handles + i;
-                if (lov_lockhp->cookie == 0) {
+                if (!lustre_handle_is_used(lov_lockhp)) {
                         CDEBUG(D_HA, "lov idx %d subobj "LPX64" no lock?\n",
                                loi->loi_ost_idx, loi->loi_id);
                         continue;
@@ -567,7 +561,7 @@ cleanup:
                         continue;
 
                 sub_exp = lov->tgts[req->rq_idx].ltd_exp;
-                err = obd_destroy(sub_exp, req->rq_oa, NULL, oti);
+                err = obd_destroy(sub_exp, req->rq_oa, NULL, oti, NULL);
                 if (err)
                         CERROR("Failed to uncreate objid "LPX64" subobj "
                                LPX64" on OST idx %d: rc = %d\n",
@@ -594,11 +588,13 @@ int lov_fini_create_set(struct lov_request_set *set,struct lov_stripe_md **lsmp)
         int rc = 0;
         ENTRY;
 
-        LASSERT(set->set_exp);
         if (set == NULL)
                 RETURN(0);
-        if (set->set_completes) 
+        LASSERT(set->set_exp);
+        if (set->set_completes) {
                 rc = create_done(set->set_exp, set, lsmp);
+                /* FIXME update qos data here */
+        }
 
         if (atomic_dec_and_test(&set->set_refcount))
                 lov_finish_set(set);
@@ -653,8 +649,9 @@ int lov_prep_create_set(struct obd_export *exp, struct lov_stripe_md **lsmp,
                         struct obdo *src_oa, struct obd_trans_info *oti,
                         struct lov_request_set **reqset)
 {
+        struct lov_obd *lov = &exp->exp_obd->u.lov;
         struct lov_request_set *set;
-        int rc = 0;
+        int rc = 0, newea = 0;
         ENTRY;
 
         OBD_ALLOC(set, sizeof(*set));
@@ -666,14 +663,54 @@ int lov_prep_create_set(struct obd_export *exp, struct lov_stripe_md **lsmp,
         set->set_md = *lsmp;
         set->set_oa = src_oa;
         set->set_oti = oti;
-        
-        rc = qos_prep_create(exp, set);
+
+        if (set->set_md == NULL) {
+                int stripes, stripe_cnt;
+                stripe_cnt = lov_get_stripecnt(lov, 0);
+
+                /* If the MDS file was truncated up to some size, stripe over
+                 * enough OSTs to allow the file to be created at that size. */
+                if (src_oa->o_valid & OBD_MD_FLSIZE) {
+                        stripes=((src_oa->o_size+LUSTRE_STRIPE_MAXBYTES)>>12)-1;
+                        do_div(stripes, (__u32)(LUSTRE_STRIPE_MAXBYTES >> 12));
+
+                        if (stripes > lov->desc.ld_active_tgt_count)
+                                GOTO(out_set, rc = -EFBIG);
+                        if (stripes < stripe_cnt)
+                                stripes = stripe_cnt;
+                } else {
+                        stripes = stripe_cnt;
+                }
+
+                rc = lov_alloc_memmd(&set->set_md, stripes,
+                                     lov->desc.ld_pattern ?
+                                     lov->desc.ld_pattern : LOV_PATTERN_RAID0, 
+                                     LOV_MAGIC);
+                if (rc < 0)
+                        goto out_set;
+                newea = 1;
+        }
+
+        rc = qos_prep_create(lov, set, newea);
         if (rc)
-                lov_fini_create_set(set, lsmp);
-        else
-                *reqset = set;
+                goto out_lsm;
+
+        if (oti && (src_oa->o_valid & OBD_MD_FLCOOKIE)) {
+                oti_alloc_cookies(oti, set->set_count);
+                if (!oti->oti_logcookies) /* allocation failed */
+                        GOTO(out_lsm, rc = -ENOMEM);
+                set->set_cookies = oti->oti_logcookies;
+        }
+        *reqset = set;
+        RETURN(rc);
+
+out_lsm:
+        if (*lsmp == NULL)
+                obd_free_memmd(exp, &set->set_md);
+out_set:
+        lov_fini_create_set(set, lsmp);
         RETURN(rc);
-}                                                
+}
 
 static int common_attr_done(struct lov_request_set *set)
 {
@@ -733,7 +770,7 @@ static int brw_done(struct lov_request_set *set)
                 loi = &lsm->lsm_oinfo[req->rq_stripe];
 
                 if (req->rq_oa->o_valid & OBD_MD_FLBLOCKS)
-                        loi->loi_blocks = req->rq_oa->o_blocks;
+                        loi->loi_lvb.lvb_blocks = req->rq_oa->o_blocks;
         }
 
         RETURN(0);
@@ -744,9 +781,9 @@ int lov_fini_brw_set(struct lov_request_set *set)
         int rc = 0;
         ENTRY;
 
-        LASSERT(set->set_exp);
         if (set == NULL)
                 RETURN(0);
+        LASSERT(set->set_exp);
         if (set->set_completes) {
                 rc = brw_done(set);
                 /* FIXME update qos data here */
@@ -874,9 +911,9 @@ int lov_fini_getattr_set(struct lov_request_set *set)
         int rc = 0;
         ENTRY;
 
-        LASSERT(set->set_exp);
         if (set == NULL)
                 RETURN(0);
+        LASSERT(set->set_exp);
         if (set->set_completes)
                 rc = common_attr_done(set);
 
@@ -942,9 +979,9 @@ int lov_fini_destroy_set(struct lov_request_set *set)
 {
         ENTRY;
 
-        LASSERT(set->set_exp);
         if (set == NULL)
                 RETURN(0);
+        LASSERT(set->set_exp);
         if (set->set_completes) {
                 /* FIXME update qos data here */
         }
@@ -1021,9 +1058,9 @@ int lov_fini_setattr_set(struct lov_request_set *set)
         int rc = 0;
         ENTRY;
 
-        LASSERT(set->set_exp);
         if (set == NULL)
                 RETURN(0);
+        LASSERT(set->set_exp);
         if (set->set_completes) {
                 rc = common_attr_done(set);
                 /* FIXME update qos data here */
@@ -1098,6 +1135,7 @@ int lov_update_setattr_set(struct lov_request_set *set,
                            struct lov_request *req, int rc)
 {
         struct lov_obd *lov = &set->set_exp->exp_obd->u.lov;
+        struct lov_stripe_md *lsm = set->set_md;
         ENTRY;
 
         lov_update_set(set, req, rc);
@@ -1108,10 +1146,17 @@ int lov_update_setattr_set(struct lov_request_set *set,
 
         /* FIXME: LOV STACKING update loi data should be done by OSC *
          * when this is gone we can go back to using lov_update_common_set() */
-        if (rc == 0 && req->rq_oa->o_valid & OBD_MD_FLMTIME)
-                set->set_md->lsm_oinfo[req->rq_stripe].loi_mtime =
-                        req->rq_oa->o_mtime;
-        /* ditto loi_atime, loi_ctime when available */
+        if (rc == 0) { /* copy only the times flagged in o_valid */
+                if (req->rq_oa->o_valid & OBD_MD_FLCTIME)
+                        lsm->lsm_oinfo[req->rq_stripe].loi_lvb.lvb_ctime =
+                                req->rq_oa->o_ctime;
+                if (req->rq_oa->o_valid & OBD_MD_FLMTIME)
+                        lsm->lsm_oinfo[req->rq_stripe].loi_lvb.lvb_mtime =
+                                req->rq_oa->o_mtime;
+                if (req->rq_oa->o_valid & OBD_MD_FLATIME)
+                        lsm->lsm_oinfo[req->rq_stripe].loi_lvb.lvb_atime =
+                                req->rq_oa->o_atime;
+        }
 
         RETURN(rc);
 }
@@ -1134,9 +1179,9 @@ int lov_fini_punch_set(struct lov_request_set *set)
         int rc = 0;
         ENTRY;
 
-        LASSERT(set->set_exp);
         if (set == NULL)
                 RETURN(0);
+        LASSERT(set->set_exp);
         if (set->set_completes) {
                 if (!set->set_success)
                         rc = -EIO;
@@ -1215,9 +1260,9 @@ int lov_fini_sync_set(struct lov_request_set *set)
         int rc = 0;
         ENTRY;
 
-        LASSERT(set->set_exp);
         if (set == NULL)
                 RETURN(0);
+        LASSERT(set->set_exp);
         if (set->set_completes) {
                 if (!set->set_success)
                         rc = -EIO;
index 5ae9f62..5fc85fe 100644 (file)
@@ -118,68 +118,6 @@ static int lov_rd_desc_uuid(char *page, char **start, off_t off, int count,
         return snprintf(page, count, "%s\n", lov->desc.ld_uuid.uuid);
 }
 
-static int lov_rd_qos_threshold(char *page, char **start, off_t off, int count,
-                                int *eof, void *data)
-{
-        struct obd_device *dev = (struct obd_device*) data;
-        struct lov_obd *lov;
-
-        LASSERT(dev != NULL);
-        lov = &dev->u.lov;
-        *eof = 1;
-        return snprintf(page, count, "%u MB\n", lov->desc.ld_qos_threshold);
-}
-
-static int lov_wr_qos_threshold(struct file *file, const char *buffer,
-                                unsigned long count, void *data)
-{
-        struct obd_device *dev = (struct obd_device *)data;
-        struct lov_obd *lov;
-        int val, rc;
-        LASSERT(dev != NULL);
-
-        lov = &dev->u.lov;
-        rc = lprocfs_write_helper(buffer, count, &val);
-        if (rc)
-                return rc;
-
-        if (val <= 0)
-                return -EINVAL;
-        lov->desc.ld_qos_threshold = val;
-        return count;
-}
-
-static int lov_rd_qos_maxage(char *page, char **start, off_t off, int count,
-                             int *eof, void *data)
-{
-        struct obd_device *dev = (struct obd_device*) data;
-        struct lov_obd *lov;
-
-        LASSERT(dev != NULL);
-        lov = &dev->u.lov;
-        *eof = 1;
-        return snprintf(page, count, "%u Sec\n", lov->desc.ld_qos_maxage);
-}
-
-static int lov_wr_qos_maxage(struct file *file, const char *buffer,
-                             unsigned long count, void *data)
-{
-        struct obd_device *dev = (struct obd_device *)data;
-        struct lov_obd *lov;
-        int val, rc;
-        LASSERT(dev != NULL);
-
-        lov = &dev->u.lov;
-        rc = lprocfs_write_helper(buffer, count, &val);
-        if (rc)
-                return rc;
-
-        if (val <= 0)
-                return -EINVAL;
-        lov->desc.ld_qos_maxage = val;
-        return count;
-}
-
 static void *lov_tgt_seq_start(struct seq_file *p, loff_t *pos)
 {
         struct obd_device *dev = p->private;
@@ -250,8 +188,6 @@ struct lprocfs_vars lprocfs_obd_vars[] = {
         { "kbytesfree",   lprocfs_rd_kbytesfree,  0, 0 },
         { "kbytesavail",  lprocfs_rd_kbytesavail, 0, 0 },
         { "desc_uuid",    lov_rd_desc_uuid,       0, 0 },
-        { "qos_threshold",lov_rd_qos_threshold, lov_wr_qos_threshold, 0 },
-        { "qos_maxage",   lov_rd_qos_maxage, lov_wr_qos_maxage, 0 },
         { 0 }
 };
 
index 4d1f154..41162e6 100644 (file)
@@ -1,11 +1,12 @@
 #include <lustre_mds.h>
-void mdc_pack_req_body(struct ptlrpc_request *);
+void mdc_pack_req_body(struct ptlrpc_request *req, int offset,
+                       __u64 valid, struct ll_fid *fid, int ea_size);
 void mdc_pack_rep_body(struct ptlrpc_request *);
-void mdc_readdir_pack(struct ptlrpc_request *req, __u64 offset, __u32 size,
-                      struct ll_fid *mdc_fid);
+void mdc_readdir_pack(struct ptlrpc_request *req, int pos, __u64 offset,
+                     __u32 size, struct ll_fid *mdc_fid);
 void mdc_getattr_pack(struct ptlrpc_request *req, int valid, int offset,
                       int flags, struct mdc_op_data *data);
-void mdc_setattr_pack(struct ptlrpc_request *req,
+void mdc_setattr_pack(struct ptlrpc_request *req, int offset,
                       struct mdc_op_data *data,
                       struct iattr *iattr, void *ea, int ealen,
                      void *ea2, int ea2len);
@@ -16,6 +17,8 @@ void mdc_create_pack(struct ptlrpc_request *req, int offset,
 void mdc_open_pack(struct ptlrpc_request *req, int offset,
                    struct mdc_op_data *op_data, __u32 mode, __u64 rdev,
                    __u32 flags, const void *data, int datalen);
+void mdc_join_pack(struct ptlrpc_request *req, int offset, 
+                   struct mdc_op_data *op_data, __u64 head_size);
 void mdc_unlink_pack(struct ptlrpc_request *req, int offset,
                      struct mdc_op_data *data);
 void mdc_link_pack(struct ptlrpc_request *req, int offset,
index 72f8a7c..4b4eeba 100644 (file)
@@ -53,25 +53,26 @@ static int send_getstatus(struct obd_import *imp, struct ll_fid *rootfid,
                           int level, int msg_flags)
 {
         struct ptlrpc_request *req;
-        struct mds_body *body;
-        int rc, size = sizeof(*body);
+        int rc, size[] = { [MDS_REQ_REC_OFF] = sizeof(struct mds_body) };
         ENTRY;
 
-        req = ptlrpc_prep_req(imp, MDS_GETSTATUS, 1, &size, NULL);
+        req = ptlrpc_prep_req(imp, LUSTRE_MDS_VERSION, MDS_GETSTATUS,
+                              1, size, NULL);
         if (!req)
                 GOTO(out, rc = -ENOMEM);
 
-        body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
         req->rq_send_state = level;
-        req->rq_replen = lustre_msg_size(1, &size);
+        req->rq_replen = lustre_msg_size(1, size);
 
-        mdc_pack_req_body(req);
+        mdc_pack_req_body(req, MDS_REQ_REC_OFF, 0, NULL, 0);
         req->rq_reqmsg->flags |= msg_flags;
         rc = ptlrpc_queue_wait(req);
 
         if (!rc) {
-                body = lustre_swab_repbuf (req, 0, sizeof (*body),
-                                           lustre_swab_mds_body);
+                struct mds_body *body;
+
+                body = lustre_swab_repbuf(req, 0, sizeof(*body),
+                                          lustre_swab_mds_body);
                 if (body == NULL) {
                         CERROR ("Can't extract mds_body\n");
                         GOTO (out, rc = -EPROTO);
@@ -147,6 +148,16 @@ int mdc_getattr_common(struct obd_export *exp, unsigned int ea_size,
                         RETURN (-EPROTO);
                 }
         }
+        
+        if (body->valid & OBD_MD_FLMODEASIZE) {
+                if (exp->exp_obd->u.cli.cl_max_mds_easize < body->max_mdsize) 
+                        exp->exp_obd->u.cli.cl_max_mds_easize = 
+                                                body->max_mdsize;
+                if (exp->exp_obd->u.cli.cl_max_mds_cookiesize < 
+                                                body->max_cookiesize)
+                        exp->exp_obd->u.cli.cl_max_mds_cookiesize = 
+                                                body->max_cookiesize;
+        }
 
         RETURN (0);
 }
@@ -164,16 +175,12 @@ int mdc_getattr(struct obd_export *exp, struct ll_fid *fid,
         /* XXX do we need to make another request here?  We just did a getattr
          *     to do the lookup in the first place.
          */
-        req = ptlrpc_prep_req(class_exp2cliimp(exp), MDS_GETATTR, 1, &size,
-                              NULL);
+        req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION,
+                              MDS_GETATTR, 1, &size, NULL);
         if (!req)
                 GOTO(out, rc = -ENOMEM);
 
-        body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
-        memcpy(&body->fid1, fid, sizeof(*fid));
-        body->valid = valid;
-        body->eadatasize = ea_size;
-        mdc_pack_req_body(req);
+        mdc_pack_req_body(req, MDS_REQ_REC_OFF, valid, fid, ea_size);
 
         /* currently only root inode will call us with FLACL */
         if (valid & OBD_MD_FLACL)
@@ -190,29 +197,24 @@ int mdc_getattr(struct obd_export *exp, struct ll_fid *fid,
 }
 
 int mdc_getattr_name(struct obd_export *exp, struct ll_fid *fid,
-                     char *filename, int namelen, unsigned long valid,
-                     unsigned int ea_size, struct ptlrpc_request **request)
+                     const char *filename, int namelen, unsigned long valid,
+                     unsigned int ea_len, struct ptlrpc_request **request)
 {
         struct ptlrpc_request *req;
-        struct mds_body *body;
-        int rc, size[2] = {sizeof(*body), namelen};
+        int rc, size[] = { sizeof(struct mds_body), namelen };
         ENTRY;
 
-        req = ptlrpc_prep_req(class_exp2cliimp(exp), MDS_GETATTR_NAME, 2,
-                              size, NULL);
+        req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION,
+                              MDS_GETATTR_NAME, 2, size, NULL);
         if (!req)
                 GOTO(out, rc = -ENOMEM);
 
-        body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
-        memcpy(&body->fid1, fid, sizeof(*fid));
-        body->valid = valid;
-        body->eadatasize = ea_size;
-        mdc_pack_req_body(req);
-
+        mdc_pack_req_body(req, MDS_REQ_REC_OFF, valid, fid, ea_len);
         LASSERT (strnlen (filename, namelen) == namelen - 1);
         memcpy(lustre_msg_buf(req->rq_reqmsg, 1, namelen), filename, namelen);
 
-        rc = mdc_getattr_common(exp, ea_size, 0, req);
+        rc = mdc_getattr_common(exp, ea_len, 0, req);
         if (rc != 0) {
                 ptlrpc_req_finished (req);
                 req = NULL;
@@ -244,18 +246,15 @@ int mdc_xattr_common(struct obd_export *exp, struct ll_fid *fid,
                 size[bufcnt++] = input_size;
         }
 
-        req = ptlrpc_prep_req(class_exp2cliimp(exp), opcode,
+        req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION, opcode,
                               bufcnt, size, NULL);
         if (!req)
                 GOTO(out, rc = -ENOMEM);
 
         /* request data */
-        body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
-        memcpy(&body->fid1, fid, sizeof(*fid));
-        body->valid = valid;
-        body->eadatasize = output_size;
+        body = lustre_msg_buf(req->rq_reqmsg, MDS_REQ_REC_OFF, sizeof(*body));
+        mdc_pack_req_body(req, MDS_REQ_REC_OFF, valid, fid, output_size);
         body->flags = flags;
-        mdc_pack_req_body(req);
 
         if (xattr_name) {
                 tmp = lustre_msg_buf(req->rq_reqmsg, 1, xattr_namelen);
@@ -330,10 +329,10 @@ int mdc_getxattr(struct obd_export *exp, struct ll_fid *fid,
 void mdc_store_inode_generation(struct ptlrpc_request *req, int reqoff,
                                 int repoff)
 {
-        struct mds_rec_create *rec =
-                lustre_msg_buf(req->rq_reqmsg, reqoff, sizeof(*rec));
-        struct mds_body *body =
-                lustre_msg_buf(req->rq_repmsg, repoff, sizeof(*body));
+        struct mds_rec_create *rec = lustre_msg_buf(req->rq_reqmsg, reqoff,
+                                                    sizeof(*rec));
+        struct mds_body *body = lustre_msg_buf(req->rq_repmsg, repoff,
+                                               sizeof(*body));
 
         LASSERT (rec != NULL);
         LASSERT (body != NULL);
@@ -349,6 +348,7 @@ void mdc_store_inode_generation(struct ptlrpc_request *req, int reqoff,
                   rec->cr_replayfid.generation, rec->cr_replayfid.id);
 }
 
+#ifdef CONFIG_FS_POSIX_ACL
 static
 int mdc_unpack_acl(struct obd_export *exp, struct ptlrpc_request *req,
                    struct lustre_md *md, unsigned int offset)
@@ -387,6 +387,9 @@ int mdc_unpack_acl(struct obd_export *exp, struct ptlrpc_request *req,
         md->posix_acl = acl;
         return 0;
 }
+#else
+#define mdc_unpack_acl(exp, req, md, offset) 0
+#endif
 
 int mdc_req2lustre_md(struct ptlrpc_request *req, int offset,
                       struct obd_export *exp,
@@ -452,10 +455,12 @@ void mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md)
         if (md->lsm)
                 obd_free_memmd(exp, &md->lsm);
 
+#ifdef CONFIG_FS_POSIX_ACL
         if (md->posix_acl) {
                 posix_acl_release(md->posix_acl);
                 md->posix_acl = NULL;
         }
+#endif
 }
 
 static void mdc_commit_open(struct ptlrpc_request *req)
@@ -479,7 +484,7 @@ static void mdc_replay_open(struct ptlrpc_request *req)
         struct mdc_open_data *mod = req->rq_cb_data;
         struct obd_client_handle *och;
         struct ptlrpc_request *close_req;
-        struct lustre_handle old; 
+        struct lustre_handle old;
         struct mds_body *body;
         ENTRY;
 
@@ -495,20 +500,21 @@ static void mdc_replay_open(struct ptlrpc_request *req)
 
         och = mod->mod_och;
         if (och != NULL) {
-                struct lustre_handle *file_fh; 
+                struct lustre_handle *file_fh;
                 LASSERT(och->och_magic == OBD_CLIENT_HANDLE_MAGIC);
                 file_fh = &och->och_fh;
                 CDEBUG(D_HA, "updating handle from "LPX64" to "LPX64"\n",
                        file_fh->cookie, body->handle.cookie);
                 memcpy(&old, file_fh, sizeof(old));
                 memcpy(file_fh, &body->handle, sizeof(*file_fh));
-        } 
+        }
 
         close_req = mod->mod_close_req;
         if (close_req != NULL) {
                 struct mds_body *close_body;
                 LASSERT(close_req->rq_reqmsg->opc == MDS_CLOSE);
-                close_body = lustre_msg_buf(close_req->rq_reqmsg, 0,
+                close_body = lustre_msg_buf(close_req->rq_reqmsg,
+                                            MDS_REQ_REC_OFF,
                                             sizeof(*close_body));
                 if (och != NULL)
                         LASSERT(!memcmp(&old, &close_body->handle, sizeof old));
@@ -524,15 +530,16 @@ void mdc_set_open_replay_data(struct obd_client_handle *och,
                               struct ptlrpc_request *open_req)
 {
         struct mdc_open_data *mod;
-        struct mds_rec_create *rec =
-                lustre_msg_buf(open_req->rq_reqmsg, 2, sizeof(*rec));
-        struct mds_body *body =
-                lustre_msg_buf(open_req->rq_repmsg, 1, sizeof(*body));
+        struct mds_rec_create *rec = lustre_msg_buf(open_req->rq_reqmsg,
+                                                    MDS_REQ_INTENT_REC_OFF,
+                                                    sizeof(*rec));
+        struct mds_body *body = lustre_msg_buf(open_req->rq_repmsg, 1,
+                                               sizeof(*body));
 
-        LASSERT(rec != NULL);
-        /* outgoing messages always in my byte order */
         LASSERT(body != NULL);
         /* incoming message in my byte order (it's been swabbed) */
+        LASSERT(rec != NULL);
+        /* outgoing messages always in my byte order */
         LASSERT_REPSWABBED(open_req, 1);
 
         OBD_ALLOC(mod, sizeof(*mod));
@@ -602,58 +609,20 @@ static void mdc_commit_close(struct ptlrpc_request *req)
         spin_unlock(&open_req->rq_lock);
 }
 
-static int mdc_close_interpret(struct ptlrpc_request *req, void *data, int rc)
-{
-        union ptlrpc_async_args *aa = data;
-        struct mdc_rpc_lock *rpc_lock;
-        struct obd_device *obd = aa->pointer_arg[1];
-        unsigned long flags;
-
-        spin_lock_irqsave(&req->rq_lock, flags);
-        rpc_lock = aa->pointer_arg[0];
-        aa->pointer_arg[0] = NULL;
-        spin_unlock_irqrestore(&req->rq_lock, flags);
-
-        if (rpc_lock == NULL) {
-                CERROR("called with NULL rpc_lock\n");
-        } else {
-                LASSERTF(rpc_lock == obd->u.cli.cl_rpc_lock, "%p != %p\n",
-                         rpc_lock, obd->u.cli.cl_rpc_lock);
-                mdc_put_rpc_lock(rpc_lock, NULL);
-        }
-        cfs_waitq_signal(&req->rq_reply_waitq);
-        RETURN(rc);
-}
-
-/* We can't use ptlrpc_check_reply, because we don't want to wake up for
- * anything but a reply or an error. */
-static int mdc_close_check_reply(struct ptlrpc_request *req)
-{
-        int rc = 0;
-        unsigned long flags;
-
-        spin_lock_irqsave(&req->rq_lock, flags);
-        if (req->rq_async_args.pointer_arg[0] == NULL)
-                rc = 1;
-        spin_unlock_irqrestore (&req->rq_lock, flags);
-        return rc;
-}
-
 int mdc_close(struct obd_export *exp, struct obdo *oa,
               struct obd_client_handle *och, struct ptlrpc_request **request)
 {
         struct obd_device *obd = class_exp2obd(exp);
-        int reqsize = sizeof(struct mds_body);
-        int rc, repsize[3] = {sizeof(struct mds_body),
+        int size[] = { sizeof(struct mds_body) };
+        int rc, repsize[] = { sizeof(struct mds_body),
                               obd->u.cli.cl_max_mds_easize,
                               obd->u.cli.cl_max_mds_cookiesize};
         struct ptlrpc_request *req;
         struct mdc_open_data *mod;
-        struct l_wait_info lwi;
         ENTRY;
 
-        req = ptlrpc_prep_req(class_exp2cliimp(exp), MDS_CLOSE, 1, &reqsize,
-                              NULL);
+        req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION,
+                              MDS_CLOSE, 1, size, NULL);
         if (req == NULL)
                 GOTO(out, rc = -ENOMEM);
 
@@ -665,7 +634,7 @@ int mdc_close(struct obd_export *exp, struct obdo *oa,
                 mod->mod_close_req = req;
                 if (mod->mod_open_req->rq_type == LI_POISON) {
                         /* FIXME This should be an ASSERT, but until we
-                           figure out why it can be poisoned here, give 
+                           figure out why it can be poisoned here, give
                            a reasonable return. bug 6155 */
                         CERROR("LBUG POISONED open %p!\n", mod->mod_open_req);
                         ptlrpc_req_finished(req);
@@ -676,24 +645,17 @@ int mdc_close(struct obd_export *exp, struct obdo *oa,
                 CDEBUG(D_HA, "couldn't find open req; expecting close error\n");
         }
 
-        mdc_close_pack(req, 0, oa, oa->o_valid, och);
+        mdc_close_pack(req, MDS_REQ_REC_OFF, oa, oa->o_valid, och);
 
         req->rq_replen = lustre_msg_size(3, repsize);
         req->rq_commit_cb = mdc_commit_close;
         LASSERT(req->rq_cb_data == NULL);
         req->rq_cb_data = mod;
 
-        /* We hand a ref to the rpcd here, so we need another one of our own. */
-        ptlrpc_request_addref(req);
-
         mdc_get_rpc_lock(obd->u.cli.cl_rpc_lock, NULL);
-        req->rq_interpret_reply = mdc_close_interpret;
-        req->rq_async_args.pointer_arg[0] = obd->u.cli.cl_rpc_lock;
-        req->rq_async_args.pointer_arg[1] = obd;
-        ptlrpcd_add_req(req);
-        lwi = LWI_TIMEOUT_INTR(MAX(req->rq_timeout * HZ, 1), NULL, NULL, NULL);
-        rc = l_wait_event(req->rq_reply_waitq, mdc_close_check_reply(req),
-                          &lwi);
+        rc = ptlrpc_queue_wait(req);
+        mdc_put_rpc_lock(obd->u.cli.cl_rpc_lock, NULL);
+
         if (req->rq_repmsg == NULL) {
                 CDEBUG(D_HA, "request failed to send: %p, %d\n", req,
                        req->rq_status);
@@ -716,14 +678,10 @@ int mdc_close(struct obd_export *exp, struct obdo *oa,
                         rc = -EPROTO;
                 }
         }
-        if (req->rq_async_args.pointer_arg[0] != NULL) {
-                CERROR("returned without dropping rpc_lock: rc %d\n", rc);
-                mdc_close_interpret(req, &req->rq_async_args, rc);
-        }
 
         EXIT;
- out:
         *request = req;
+ out:
         return rc;
 }
 
@@ -731,15 +689,15 @@ int mdc_done_writing(struct obd_export *exp, struct obdo *obdo)
 {
         struct ptlrpc_request *req;
         struct mds_body *body;
-        int rc, size = sizeof(*body);
+        int rc, size[] = { [MDS_REQ_REC_OFF] = sizeof(*body) };
         ENTRY;
 
-        req = ptlrpc_prep_req(class_exp2cliimp(exp), MDS_DONE_WRITING, 1,
-                              &size, NULL);
+        req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION,
+                              MDS_DONE_WRITING, 1, size, NULL);
         if (req == NULL)
                 RETURN(-ENOMEM);
 
-        body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof(*body));
+        body = lustre_msg_buf(req->rq_reqmsg, MDS_REQ_REC_OFF, sizeof(*body));
         mdc_pack_fid(&body->fid1, obdo->o_id, 0, obdo->o_mode);
         body->size = obdo->o_size;
         body->blocks = obdo->o_blocks;
@@ -747,28 +705,30 @@ int mdc_done_writing(struct obd_export *exp, struct obdo *obdo)
         body->valid = obdo->o_valid;
 //        memcpy(&body->handle, &och->och_fh, sizeof(body->handle));
 
-        req->rq_replen = lustre_msg_size(1, &size);
+        req->rq_replen = lustre_msg_size(1, size);
 
         rc = ptlrpc_queue_wait(req);
         ptlrpc_req_finished(req);
         RETURN(rc);
 }
 
-int mdc_readpage(struct obd_export *exp, struct ll_fid *mdc_fid, __u64 offset,
+int mdc_readpage(struct obd_export *exp, struct ll_fid *fid, __u64 offset,
                  struct page *page, struct ptlrpc_request **request)
 {
         struct obd_import *imp = class_exp2cliimp(exp);
         struct ptlrpc_request *req = NULL;
         struct ptlrpc_bulk_desc *desc = NULL;
         struct mds_body *body;
-        int rc, size = sizeof(*body);
+        int rc, size[] = { sizeof(*body) };
         ENTRY;
 
-        CDEBUG(D_INODE, "inode: %ld\n", (long)mdc_fid->id);
+        CDEBUG(D_INODE, "inode: "LPU64"\n", fid->id);
 
-        req = ptlrpc_prep_req(imp, MDS_READPAGE, 1, &size, NULL);
+        req = ptlrpc_prep_req(imp, LUSTRE_MDS_VERSION, MDS_READPAGE,
+                              1, size, NULL);
         if (req == NULL)
                 GOTO(out, rc = -ENOMEM);
+
         /* XXX FIXME bug 249 */
         req->rq_request_portal = MDS_READPAGE_PORTAL;
 
@@ -779,9 +739,9 @@ int mdc_readpage(struct obd_export *exp, struct ll_fid *mdc_fid, __u64 offset,
 
         ptlrpc_prep_bulk_page(desc, page, 0, PAGE_CACHE_SIZE);
 
-        mdc_readdir_pack(req, offset, PAGE_CACHE_SIZE, mdc_fid);
+        mdc_readdir_pack(req, MDS_REQ_REC_OFF, offset, PAGE_CACHE_SIZE, fid);
 
-        req->rq_replen = lustre_msg_size(1, &size);
+        req->rq_replen = lustre_msg_size(1, size);
         rc = ptlrpc_queue_wait(req);
 
         if (rc == 0) {
@@ -873,8 +833,7 @@ int mdc_set_info(struct obd_export *exp, obd_count keylen,
         struct obd_import *imp = class_exp2cliimp(exp);
         int rc = -EINVAL;
 
-        if (keylen == strlen("initial_recov") &&
-            memcmp(key, "initial_recov", strlen("initial_recov")) == 0) {
+        if (KEY_IS("initial_recov")) {
                 if (vallen != sizeof(int))
                         RETURN(-EINVAL);
                 imp->imp_initial_recov = *(int *)val;
@@ -882,8 +841,18 @@ int mdc_set_info(struct obd_export *exp, obd_count keylen,
                        exp->exp_obd->obd_name, imp->imp_initial_recov);
                 RETURN(0);
         }
-        if (keylen == strlen("read-only") &&
-            memcmp(key, "read-only", strlen("read-only")) == 0) {
+        /* Turn off initial_recov after we try all backup servers once */
+        if (KEY_IS("init_recov_bk")) {
+                if (vallen != sizeof(int))
+                        RETURN(-EINVAL);
+                imp->imp_initial_recov_bk = *(int *)val;
+                if (imp->imp_initial_recov_bk)
+                        imp->imp_initial_recov = 1;
+                CDEBUG(D_HA, "%s: set imp_initial_recov_bk = %d\n",
+                       exp->exp_obd->obd_name, imp->imp_initial_recov_bk);
+                RETURN(0);
+        }
+        if (KEY_IS("read-only")) {
                 struct ptlrpc_request *req;
                 int size[2] = {keylen, vallen};
                 char *bufs[2] = {key, val};
@@ -899,7 +868,8 @@ int mdc_set_info(struct obd_export *exp, obd_count keylen,
                                 ~OBD_CONNECT_RDONLY;
                 }
 
-                req = ptlrpc_prep_req(imp, MDS_SET_INFO, 2, size, bufs);
+                req = ptlrpc_prep_req(imp, LUSTRE_MDS_VERSION,
+                                      MDS_SET_INFO, 2, size, bufs);
                 if (req == NULL)
                         RETURN(-ENOMEM);
 
@@ -908,7 +878,28 @@ int mdc_set_info(struct obd_export *exp, obd_count keylen,
                 ptlrpc_req_finished(req);
                 RETURN(rc);
         }
-        
+
+        RETURN(rc);
+}
+
+/*
+ * Key/value query handler for an MDC export (installed as .o_get_info in
+ * mdc_obd_ops).
+ *
+ * Only the "max_easize" key is recognised.  For that key @val must point to
+ * an int and *@vallen must equal sizeof(int): cl_max_mds_easize of the
+ * export's client obd is raised to the value passed in when that value is
+ * larger, and the resulting cl_max_mds_easize is written back through @val.
+ *
+ * Returns 0 on success, -EINVAL for any other key or a mismatched *@vallen.
+ */
+int mdc_get_info(struct obd_export *exp, __u32 keylen, void *key,
+                 __u32 *vallen, void *val)
+{
+        int rc = -EINVAL;
+
+        if (keylen == strlen("max_easize") &&
+            memcmp(key, "max_easize", strlen("max_easize")) == 0) {
+                int *max_easize = val;
+                int mdsize;
+
+                if (*vallen != sizeof(int))
+                        RETURN(-EINVAL);
+
+                /* the stored maximum only ever grows, it is never lowered */
+                mdsize = *max_easize;
+                if (exp->exp_obd->u.cli.cl_max_mds_easize < mdsize)
+                        exp->exp_obd->u.cli.cl_max_mds_easize = mdsize;
+
+                /* hand the value now in effect back to the caller */
+                *max_easize = exp->exp_obd->u.cli.cl_max_mds_easize;
+                RETURN(0);
+        }
         RETURN(rc);
 }
 
@@ -926,7 +917,8 @@ static int mdc_statfs(struct obd_device *obd, struct obd_statfs *osfs,
          * during mount that would help a bit).  Having relative timestamps
          * is not so great if request processing is slow, while absolute
          * timestamps are not ideal because they need time synchronization. */
-        req = ptlrpc_prep_req(obd->u.cli.cl_import, MDS_STATFS, 0, NULL, NULL);
+        req = ptlrpc_prep_req(obd->u.cli.cl_import, LUSTRE_MDS_VERSION,
+                              MDS_STATFS, 0, NULL, NULL);
         if (!req)
                 RETURN(-ENOMEM);
 
@@ -958,18 +950,19 @@ static int mdc_pin(struct obd_export *exp, obd_id ino, __u32 gen, int type,
 {
         struct ptlrpc_request *req;
         struct mds_body *body;
-        int rc, size = sizeof(*body);
+        int rc, size[] = { [MDS_REQ_REC_OFF] = sizeof(struct mds_body) };
         ENTRY;
 
-        req = ptlrpc_prep_req(class_exp2cliimp(exp), MDS_PIN, 1, &size, NULL);
+        req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION,
+                              MDS_PIN, 1, size, NULL);
         if (req == NULL)
                 RETURN(-ENOMEM);
 
-        body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
+        body = lustre_msg_buf(req->rq_reqmsg, MDS_REQ_REC_OFF, sizeof (*body));
         mdc_pack_fid(&body->fid1, ino, gen, type);
         body->flags = flag;
 
-        req->rq_replen = lustre_msg_size(1, &size);
+        req->rq_replen = lustre_msg_size(1, size);
 
         mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
         rc = ptlrpc_queue_wait(req);
@@ -1004,17 +997,18 @@ static int mdc_unpin(struct obd_export *exp,
 {
         struct ptlrpc_request *req;
         struct mds_body *body;
-        int rc, size = sizeof(*body);
+        int rc, size[] = { [MDS_REQ_REC_OFF] = sizeof(struct mds_body) };
         ENTRY;
 
         if (handle->och_magic != OBD_CLIENT_HANDLE_MAGIC)
                 RETURN(0);
 
-        req = ptlrpc_prep_req(class_exp2cliimp(exp), MDS_CLOSE, 1, &size, NULL);
+        req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION,
+                              MDS_CLOSE, 1, size, NULL);
         if (req == NULL)
                 RETURN(-ENOMEM);
 
-        body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof(*body));
+        body = lustre_msg_buf(req->rq_reqmsg, 1, sizeof(*body));
         memcpy(&body->handle, &handle->och_fh, sizeof(body->handle));
         body->flags = flag;
 
@@ -1036,22 +1030,17 @@ int mdc_sync(struct obd_export *exp, struct ll_fid *fid,
              struct ptlrpc_request **request)
 {
         struct ptlrpc_request *req;
-        struct mds_body *body;
-        int size = sizeof(*body);
-        int rc;
+        int rc, size[] = { [MDS_REQ_REC_OFF] = sizeof(struct mds_body) };
         ENTRY;
 
-        req = ptlrpc_prep_req(class_exp2cliimp(exp), MDS_SYNC, 1,&size,NULL);
+        req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION,
+                              MDS_SYNC, 1, size, NULL);
         if (!req)
                 RETURN(rc = -ENOMEM);
 
-        if (fid) {
-                body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
-                memcpy(&body->fid1, fid, sizeof(*fid));
-                mdc_pack_req_body(req);
-        }
+        mdc_pack_req_body(req, MDS_REQ_REC_OFF, 0, fid, 0);
 
-        req->rq_replen = lustre_msg_size(1, &size);
+        req->rq_replen = lustre_msg_size(1, size);
 
         rc = ptlrpc_queue_wait(req);
         if (rc || request == NULL)
@@ -1062,8 +1051,7 @@ int mdc_sync(struct obd_export *exp, struct ll_fid *fid,
         RETURN(rc);
 }
 
-static int mdc_import_event(struct obd_device *obd,
-                            struct obd_import *imp, 
+static int mdc_import_event(struct obd_device *obd, struct obd_import *imp,
                             enum obd_import_event event)
 {
         int rc = 0;
@@ -1075,8 +1063,7 @@ static int mdc_import_event(struct obd_device *obd,
                 break;
         }
         case IMP_EVENT_INACTIVE: {
-                if (obd->obd_observer)
-                        rc = obd_notify(obd->obd_observer, obd, 0);
+                rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE);
                 break;
         }
         case IMP_EVENT_INVALIDATE: {
@@ -1087,12 +1074,14 @@ static int mdc_import_event(struct obd_device *obd,
                 break;
         }
         case IMP_EVENT_ACTIVE: {
-                if (obd->obd_observer)
-                        rc = obd_notify(obd->obd_observer, obd, 1);
+                rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE);
                 break;
         }
+        case IMP_EVENT_OCD:
+                break;
+
         default:
-                CERROR("Unknown import event %d\n", event);
+                CERROR("Unknown import event %x\n", event);
                 LBUG();
         }
         RETURN(rc);
@@ -1150,28 +1139,35 @@ int mdc_init_ea_size(struct obd_export *mdc_exp, struct obd_export *lov_exp)
         struct lov_stripe_md lsm = { .lsm_magic = LOV_MAGIC };
         struct lov_desc desc;
         __u32 valsize = sizeof(desc);
+        __u32 stripes;
         int rc, size;
         ENTRY;
 
-        size = obd_size_diskmd(lov_exp, NULL);
-        if (cli->cl_max_mds_easize < size)
-                cli->cl_max_mds_easize = size;
-
         rc = obd_get_info(lov_exp, strlen("lovdesc") + 1, "lovdesc",
                           &valsize, &desc);
         if (rc)
                 RETURN(rc);
 
+        stripes = min(desc.ld_tgt_count, (__u32)LOV_MAX_STRIPE_COUNT);
+        lsm.lsm_stripe_count = stripes;
+        size = obd_size_diskmd(lov_exp, &lsm);
+        
+        if (cli->cl_max_mds_easize < size)
+                cli->cl_max_mds_easize = size;
+
         lsm.lsm_stripe_count = desc.ld_default_stripe_count;
         size = obd_size_diskmd(lov_exp, &lsm);
 
         if (cli->cl_default_mds_easize < size)
                 cli->cl_default_mds_easize = size;
 
-        size = desc.ld_tgt_count * sizeof(struct llog_cookie);
+        size = stripes * sizeof(struct llog_cookie);
         if (cli->cl_max_mds_cookiesize < size)
                 cli->cl_max_mds_cookiesize = size;
 
+        CDEBUG(D_HA, "updating max_mdsize/max_cookiesize: %d/%d\n",
+               cli->cl_max_mds_easize, cli->cl_max_mds_cookiesize);
+        
         RETURN(0);
 }
 
@@ -1179,8 +1175,8 @@ static int mdc_precleanup(struct obd_device *obd, int stage)
 {
         int rc = 0;
         ENTRY;
-        
-        if (stage < 2) 
+
+        if (stage < OBD_CLEANUP_SELF_EXP)
                 RETURN(0);
 
         rc = obd_llog_finish(obd, 0);
@@ -1218,6 +1214,13 @@ static int mdc_llog_init(struct obd_device *obd, struct obd_device *tgt,
                 ctxt->loc_imp = obd->u.cli.cl_import;
         }
 
+        rc = llog_setup(obd, LLOG_LOVEA_REPL_CTXT, tgt, 0, NULL,
+                       &llog_client_ops);
+        if (rc == 0) {
+                ctxt = llog_get_context(obd, LLOG_LOVEA_REPL_CTXT);
+                ctxt->loc_imp = obd->u.cli.cl_import;
+        }
+
         RETURN(rc);
 }
 
@@ -1226,6 +1229,10 @@ static int mdc_llog_finish(struct obd_device *obd, int count)
         int rc;
         ENTRY;
 
+        rc = llog_cleanup(llog_get_context(obd, LLOG_LOVEA_REPL_CTXT));
+        if (rc) {
+                CERROR("can not cleanup LLOG_CONFIG_REPL_CTXT rc %d\n", rc);
+        }
         rc = llog_cleanup(llog_get_context(obd, LLOG_CONFIG_REPL_CTXT));
         RETURN(rc);
 }
@@ -1241,6 +1248,7 @@ struct obd_ops mdc_obd_ops = {
         .o_disconnect   = client_disconnect_export,
         .o_iocontrol    = mdc_iocontrol,
         .o_set_info     = mdc_set_info,
+        .o_get_info     = mdc_get_info,
         .o_statfs       = mdc_statfs,
         .o_pin          = mdc_pin,
         .o_unpin        = mdc_unpin,
@@ -1249,7 +1257,7 @@ struct obd_ops mdc_obd_ops = {
         .o_llog_finish  = mdc_llog_finish,
 };
 
-static quota_interface_t *quota_interface = NULL;
+static quota_interface_t *quota_interface;
 extern quota_interface_t mdc_quota_interface;
 
 int __init mdc_init(void)
@@ -1257,14 +1265,14 @@ int __init mdc_init(void)
         int rc;
         struct lprocfs_static_vars lvars;
         lprocfs_init_vars(mdc, &lvars);
+
         quota_interface = PORTAL_SYMBOL_GET(mdc_quota_interface);
         init_obd_quota_ops(quota_interface, &mdc_obd_ops);
-        
+
         rc = class_register_type(&mdc_obd_ops, lvars.module_vars,
                                  LUSTRE_MDC_NAME);
         if (rc && quota_interface)
-                PORTAL_SYMBOL_PUT(osc_quota_interface);
+                PORTAL_SYMBOL_PUT(mdc_quota_interface);
 
         RETURN(rc);
 }
index a1a287d..759a836 100644 (file)
@@ -57,6 +57,7 @@
 #include <lprocfs_status.h>
 #include <lustre_commit_confd.h>
 #include <lustre_quota.h>
+#include <linux/lustre_ver.h>
 
 #include "mds_internal.h"
 
@@ -248,6 +249,55 @@ struct dentry *mds_fid2dentry(struct mds_obd *mds, struct ll_fid *fid,
         RETURN(result);
 }
 
+/*
+ * Negotiate connection parameters for @exp.
+ *
+ * When @data is supplied, its connect flags and known inode-lock bits are
+ * masked with MDS_CONNECT_SUPPORTED / MDS_INODELOCK_FULL, the ACL and XATTR
+ * flags are dropped if the matching mds_fl_* setting is off, ocd_version is
+ * set to LUSTRE_VERSION_CODE, and the resulting flags and ibits are stored
+ * in the export.  @data may be NULL; then only the final ACL check runs,
+ * against whatever connect flags the export already carries.
+ *
+ * Returns 0, or -EBADE when mds_fl_acl is set but the export's connect
+ * flags lack OBD_CONNECT_ACL.
+ */
+static int mds_connect_internal(struct obd_export *exp,
+                                struct obd_connect_data *data)
+{
+        struct mds_obd *mds = &exp->exp_obd->u.mds;
+
+        if (data != NULL) {
+                data->ocd_version = LUSTRE_VERSION_CODE;
+                data->ocd_connect_flags &= MDS_CONNECT_SUPPORTED;
+                data->ocd_ibits_known &= MDS_INODELOCK_FULL;
+
+                /* No inode-lock bits left in common (not really expected,
+                 * since LOOKUP and UPDATE should be supported by everyone):
+                 * withdraw the IBITS flag too and fall back to compat mode
+                 * with plain locks. */
+                if (data->ocd_ibits_known == 0 &&
+                    (data->ocd_connect_flags & OBD_CONNECT_IBITS) != 0)
+                        data->ocd_connect_flags &= ~OBD_CONNECT_IBITS;
+
+                if (!mds->mds_fl_acl)
+                        data->ocd_connect_flags &= ~OBD_CONNECT_ACL;
+                if (!mds->mds_fl_user_xattr)
+                        data->ocd_connect_flags &= ~OBD_CONNECT_XATTR;
+
+                /* remember the negotiated result on the export */
+                exp->exp_mds_data.med_ibits_known = data->ocd_ibits_known;
+                exp->exp_connect_flags = data->ocd_connect_flags;
+        }
+
+        /* fine unless ACLs are enabled here but were not negotiated */
+        if (!mds->mds_fl_acl || (exp->exp_connect_flags & OBD_CONNECT_ACL))
+                return 0;
+
+        CWARN("%s: MDS requires ACL support but client does not\n",
+              exp->exp_obd->obd_name);
+        return -EBADE;
+}
+
+/*
+ * Reconnect path: repeat the connect-data negotiation for an existing
+ * export by calling mds_connect_internal().
+ *
+ * @obd and @cluuid are only checked for NULL here.  Returns -EINVAL when
+ * @exp, @obd or @cluuid is NULL, otherwise whatever mds_connect_internal()
+ * returns.
+ */
+static int mds_reconnect(struct obd_export *exp, struct obd_device *obd,
+                         struct obd_uuid *cluuid,
+                         struct obd_connect_data *data)
+{
+        int rc = -EINVAL;
+        ENTRY;
+
+        if (exp != NULL && obd != NULL && cluuid != NULL)
+                rc = mds_connect_internal(exp, data);
+
+        RETURN(rc);
+}
 
 /* Establish a connection to the MDS.
  *
@@ -291,28 +341,13 @@ static int mds_connect(struct lustre_handle *conn, struct obd_device *obd,
         LASSERT(exp);
         med = &exp->exp_mds_data;
 
-        if (data != NULL) {
-                data->ocd_connect_flags &= MDS_CONNECT_SUPPORTED;
-
-                if (!obd->u.mds.mds_fl_user_xattr)
-                        data->ocd_connect_flags &= ~OBD_CONNECT_USER_XATTR;
-
-                exp->exp_connect_flags = data->ocd_connect_flags;
-        }
-
-        if ((obd->u.mds.mds_fl_acl == 0) !=
-            ((exp->exp_connect_flags & OBD_CONNECT_ACL) == 0)) {
-                CWARN("%s require ACL support but %s doesn't\n",
-                      obd->u.mds.mds_fl_acl ? "MDS" : "client",
-                      obd->u.mds.mds_fl_acl ? "client" : "MDS");
-                GOTO(out, rc = -EBADE);
-        }
+        rc = mds_connect_internal(exp, data);
+        if (rc)
+                GOTO(out, rc);
 
         OBD_ALLOC(mcd, sizeof(*mcd));
-        if (!mcd) {
-                CERROR("mds: out of memory for client data\n");
+        if (!mcd)
                 GOTO(out, rc = -ENOMEM);
-        }
 
         memcpy(mcd->mcd_uuid, cluuid, sizeof(mcd->mcd_uuid));
         med->med_mcd = mcd;
@@ -334,7 +369,7 @@ out:
         RETURN(rc);
 }
 
-static int mds_init_export(struct obd_export *exp) 
+static int mds_init_export(struct obd_export *exp)
 {
         struct mds_export_data *med = &exp->exp_mds_data;
 
@@ -379,7 +414,7 @@ static int mds_destroy_export(struct obd_export *export)
                 /* child orphan sem protects orphan_dec_test and
                  * is_orphan race, mds_mfd_close drops it */
                 MDS_DOWN_WRITE_ORPHAN_SEM(dentry->d_inode);
-                rc = mds_mfd_close(NULL, obd, mfd,
+                rc = mds_mfd_close(NULL, MDS_REQ_REC_OFF, obd, mfd,
                                    !(export->exp_flags & OBD_OPT_FAILOVER));
 
                 if (rc)
@@ -457,7 +492,7 @@ int mds_get_md(struct obd_device *obd, struct inode *inode, void *md,
 
         if (lock)
                 down(&inode->i_sem);
-        rc = fsfilt_get_md(obd, inode, md, *size);
+        rc = fsfilt_get_md(obd, inode, md, *size, "lov");
 
         if (rc < 0) {
                 CERROR("Error %d reading eadata for ino %lu\n",
@@ -472,6 +507,8 @@ int mds_get_md(struct obd_device *obd, struct inode *inode, void *md,
                 } else if (rc > 0) {
                         *size = rc;
                 }
+        } else {
+                *size = 0;
         }
         if (lock)
                 up(&inode->i_sem);
@@ -510,7 +547,7 @@ int mds_pack_md(struct obd_device *obd, struct lustre_msg *msg, int offset,
                        inode->i_ino, lmm_size, mds->mds_max_mdsize);
                 // RETURN(-EINVAL);
         }
-        
+
         rc = mds_get_md(obd, inode, lmm, &lmm_size, lock);
         if (rc > 0) {
                 if (S_ISDIR(inode->i_mode))
@@ -524,6 +561,7 @@ int mds_pack_md(struct obd_device *obd, struct lustre_msg *msg, int offset,
         RETURN(rc);
 }
 
+#ifdef CONFIG_FS_POSIX_ACL
 static
 int mds_pack_posix_acl(struct inode *inode, struct lustre_msg *repmsg,
                        struct mds_body *repbody, int repoff)
@@ -559,6 +597,9 @@ out:
         repbody->valid |= OBD_MD_FLACL;
         return 0;
 }
+#else
+#define mds_pack_posix_acl(inode, repmsg, repbody, repoff) 0
+#endif
 
 int mds_pack_acl(struct mds_export_data *med, struct inode *inode,
                  struct lustre_msg *repmsg, struct mds_body *repbody,
@@ -592,7 +633,7 @@ static int mds_getattr_internal(struct obd_device *obd, struct dentry *dentry,
                                  inode, 1);
 
                 /* If we have LOV EA data, the OST holds size, atime, mtime */
-                if (!(body->valid & OBD_MD_FLEASIZE) && 
+                if (!(body->valid & OBD_MD_FLEASIZE) &&
                     !(body->valid & OBD_MD_FLDIREA))
                         body->valid |= (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
                                         OBD_MD_FLATIME | OBD_MD_FLMTIME);
@@ -625,9 +666,17 @@ static int mds_getattr_internal(struct obd_device *obd, struct dentry *dentry,
                 reply_off++;
         }
 
+        if (reqbody->valid & OBD_MD_FLMODEASIZE) {
+                struct mds_obd *mds = mds_req2mds(req);
+                body->max_cookiesize = mds->mds_max_cookiesize;
+                body->max_mdsize = mds->mds_max_mdsize;
+                body->valid |= OBD_MD_FLMODEASIZE;
+        }
+
         if (rc)
                 RETURN(rc);
 
+#ifdef CONFIG_FS_POSIX_ACL
         if ((req->rq_export->exp_connect_flags & OBD_CONNECT_ACL) &&
             (reqbody->valid & OBD_MD_FLACL)) {
                 rc = mds_pack_acl(&req->rq_export->exp_mds_data,
@@ -638,6 +687,7 @@ static int mds_getattr_internal(struct obd_device *obd, struct dentry *dentry,
                 if (body->aclsize)
                         reply_off++;
         }
+#endif
 
         RETURN(rc);
 }
@@ -657,7 +707,8 @@ static int mds_getattr_pack_msg(struct ptlrpc_request *req, struct inode *inode,
         if ((S_ISREG(inode->i_mode) && (body->valid & OBD_MD_FLEASIZE)) ||
             (S_ISDIR(inode->i_mode) && (body->valid & OBD_MD_FLDIREA))) {
                 down(&inode->i_sem);
-                rc = fsfilt_get_md(req->rq_export->exp_obd, inode, NULL, 0);
+                rc = fsfilt_get_md(req->rq_export->exp_obd, inode, NULL, 0,
+                                   "lov");
                 up(&inode->i_sem);
                 CDEBUG(D_INODE, "got %d bytes MD data for inode %lu\n",
                        rc, inode->i_ino);
@@ -686,6 +737,7 @@ static int mds_getattr_pack_msg(struct ptlrpc_request *req, struct inode *inode,
                        inode->i_size + 1, body->eadatasize);
         }
 
+#ifdef CONFIG_FS_POSIX_ACL
         if ((req->rq_export->exp_connect_flags & OBD_CONNECT_ACL) &&
             (body->valid & OBD_MD_FLACL)) {
                 struct dentry de = { .d_inode = inode };
@@ -707,6 +759,7 @@ static int mds_getattr_pack_msg(struct ptlrpc_request *req, struct inode *inode,
                 }
                 bufcount++;
         }
+#endif
 
         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_GETATTR_PACK)) {
                 CERROR("failed MDS_GETATTR_PACK test\n");
@@ -740,7 +793,7 @@ static int mds_getattr_name(int offset, struct ptlrpc_request *req,
         char *name;
         ENTRY;
 
-        LASSERT(!strcmp(obd->obd_type->typ_name, "mds"));
+        LASSERT(!strcmp(obd->obd_type->typ_name, LUSTRE_MDS_NAME));
 
         /* Swab now, before anyone looks inside the request */
 
@@ -763,9 +816,9 @@ static int mds_getattr_name(int offset, struct ptlrpc_request *req,
         if (rc)
                 GOTO(cleanup, rc);
 
-        LASSERT (offset == 0 || offset == 2);
+        LASSERT (offset == MDS_REQ_REC_OFF || offset == MDS_REQ_INTENT_REC_OFF);
         /* if requests were at offset 2, the getattr reply goes back at 1 */
-        if (offset) {
+        if (offset == MDS_REQ_INTENT_REC_OFF) {
                 rep = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*rep));
                 offset = 1;
         }
@@ -797,7 +850,7 @@ static int mds_getattr_name(int offset, struct ptlrpc_request *req,
         }
 #endif
 
-        if (child_lockh->cookie != 0) {
+        if (lustre_handle_is_used(child_lockh)) {
                 LASSERT(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT);
                 resent_req = 1;
         }
@@ -886,7 +939,7 @@ static int mds_getattr_name(int offset, struct ptlrpc_request *req,
         return rc;
 }
 
-static int mds_getattr(int offset, struct ptlrpc_request *req)
+static int mds_getattr(struct ptlrpc_request *req, int offset)
 {
         struct mds_obd *mds = mds_req2mds(req);
         struct obd_device *obd = req->rq_export->exp_obd;
@@ -979,7 +1032,7 @@ out:
         return 0;
 }
 
-static int mds_sync(struct ptlrpc_request *req)
+static int mds_sync(struct ptlrpc_request *req, int offset)
 {
         struct obd_device *obd = req->rq_export->exp_obd;
         struct mds_obd *mds = &obd->u.mds;
@@ -1030,7 +1083,7 @@ out:
  *
  * If we were to take another one here, a deadlock will result, if another
  * thread is already waiting for a PW lock. */
-static int mds_readpage(struct ptlrpc_request *req)
+static int mds_readpage(struct ptlrpc_request *req, int offset)
 {
         struct obd_device *obd = req->rq_export->exp_obd;
         struct mds_obd *mds = &obd->u.mds;
@@ -1052,7 +1105,8 @@ static int mds_readpage(struct ptlrpc_request *req)
                 GOTO(out, rc);
         }
 
-        body = lustre_swab_reqbuf(req, 0, sizeof(*body), lustre_swab_mds_body);
+        body = lustre_swab_reqbuf(req, offset, sizeof(*body),
+                                  lustre_swab_mds_body);
         if (body == NULL)
                 GOTO (out, rc = -EFAULT);
 
@@ -1246,6 +1300,74 @@ static int mds_handle_quotactl(struct ptlrpc_request *req)
         RETURN(0);
 }
 
+static int mds_msg_check_version(struct lustre_msg *msg)
+{       /* Check msg's version against the one expected for msg->opc. */
+        int rc;
+
+        /* TODO: enable the check below once message versions are really
+         * introduced.  It is disabled because it would break compatibility
+         * with b1_4, so for now this always returns 0 and the switch is dead. */
+        return (0);
+
+        switch (msg->opc) {
+        case MDS_CONNECT:
+        case MDS_DISCONNECT:
+        case OBD_PING:
+                rc = lustre_msg_check_version(msg, LUSTRE_OBD_VERSION);
+                if (rc)
+                        CERROR("bad opc %u version %08x, expecting %08x\n",
+                               msg->opc, msg->version, LUSTRE_OBD_VERSION);
+                break;
+        case MDS_GETSTATUS:
+        case MDS_GETATTR:
+        case MDS_GETATTR_NAME:
+        case MDS_STATFS:
+        case MDS_READPAGE:
+        case MDS_REINT:
+        case MDS_CLOSE:
+        case MDS_DONE_WRITING:
+        case MDS_PIN:
+        case MDS_SYNC:
+        case MDS_GETXATTR:
+        case MDS_SETXATTR:
+        case MDS_SET_INFO:
+        case MDS_QUOTACHECK:
+        case MDS_QUOTACTL:
+        case QUOTA_DQACQ:
+        case QUOTA_DQREL:
+                rc = lustre_msg_check_version(msg, LUSTRE_MDS_VERSION);
+                if (rc)
+                        CERROR("bad opc %u version %08x, expecting %08x\n",
+                               msg->opc, msg->version, LUSTRE_MDS_VERSION);
+                break;
+        case LDLM_ENQUEUE:
+        case LDLM_CONVERT:
+        case LDLM_BL_CALLBACK:
+        case LDLM_CP_CALLBACK:
+                rc = lustre_msg_check_version(msg, LUSTRE_DLM_VERSION);
+                if (rc)
+                        CERROR("bad opc %u version %08x, expecting %08x\n",
+                               msg->opc, msg->version, LUSTRE_DLM_VERSION);
+                break;
+        case OBD_LOG_CANCEL:
+        case LLOG_ORIGIN_HANDLE_CREATE:
+        case LLOG_ORIGIN_HANDLE_NEXT_BLOCK:
+        case LLOG_ORIGIN_HANDLE_PREV_BLOCK:
+        case LLOG_ORIGIN_HANDLE_READ_HEADER:
+        case LLOG_ORIGIN_HANDLE_CLOSE:
+        case LLOG_CATINFO:
+                rc = lustre_msg_check_version(msg, LUSTRE_LOG_VERSION);
+                if (rc)
+                        CERROR("bad opc %u version %08x, expecting %08x\n",
+                               msg->opc, msg->version, LUSTRE_LOG_VERSION);
+                break;
+        default:        /* any opcode not listed in the groups above */
+                CERROR("MDS unknown opcode %d\n", msg->opc);
+                rc = -ENOTSUPP; /* NOTE(review): LLOG_ORIGIN_HANDLE_DESTROY, dispatched by mds_handle(), would land here */
+        }
+        return rc;
+}
+
 int mds_handle(struct ptlrpc_request *req)
 {
         int should_process, fail = OBD_FAIL_MDS_ALL_REPLY_NET;
@@ -1257,6 +1379,13 @@ int mds_handle(struct ptlrpc_request *req)
         OBD_FAIL_RETURN(OBD_FAIL_MDS_ALL_REQUEST_NET | OBD_FAIL_ONCE, 0);
 
         LASSERT(current->journal_info == NULL);
+
+        rc = mds_msg_check_version(req->rq_reqmsg);
+        if (rc) {
+                CERROR("MDS drop mal-formed request\n");
+                RETURN(rc);
+        }
+
         /* XXX identical to OST */
         if (req->rq_reqmsg->opc != MDS_CONNECT) {
                 struct mds_export_data *med;
@@ -1332,7 +1461,7 @@ int mds_handle(struct ptlrpc_request *req)
         case MDS_GETATTR:
                 DEBUG_REQ(D_INODE, req, "getattr");
                 OBD_FAIL_RETURN(OBD_FAIL_MDS_GETATTR_NET, 0);
-                rc = mds_getattr(0, req);
+                rc = mds_getattr(req, MDS_REQ_REC_OFF);
                 break;
 
         case MDS_SETXATTR:
@@ -1348,7 +1477,7 @@ int mds_handle(struct ptlrpc_request *req)
                 break;
 
         case MDS_GETATTR_NAME: {
-                struct lustre_handle lockh;
+                struct lustre_handle lockh = { 0 };
                 DEBUG_REQ(D_INODE, req, "getattr_name");
                 OBD_FAIL_RETURN(OBD_FAIL_MDS_GETATTR_NAME_NET, 0);
 
@@ -1356,11 +1485,11 @@ int mds_handle(struct ptlrpc_request *req)
                  * acquiring any new locks in mds_getattr_name, so we don't
                  * want to cancel.
                  */
-                lockh.cookie = 0;
-                rc = mds_getattr_name(0, req, MDS_INODELOCK_UPDATE, &lockh);
+                rc = mds_getattr_name(MDS_REQ_REC_OFF, req,
+                                      MDS_INODELOCK_UPDATE, &lockh);
                 /* this non-intent call (from an ioctl) is special */
                 req->rq_status = rc;
-                if (rc == 0 && lockh.cookie)
+                if (rc == 0 && lustre_handle_is_used(&lockh))
                         ldlm_lock_decref(&lockh, LCK_CR);
                 break;
         }
@@ -1373,7 +1502,7 @@ int mds_handle(struct ptlrpc_request *req)
         case MDS_READPAGE:
                 DEBUG_REQ(D_INODE, req, "readpage");
                 OBD_FAIL_RETURN(OBD_FAIL_MDS_READPAGE_NET, 0);
-                rc = mds_readpage(req);
+                rc = mds_readpage(req, MDS_REQ_REC_OFF);
 
                 if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_MDS_SENDPAGE)) {
                         RETURN(0);
@@ -1382,9 +1511,10 @@ int mds_handle(struct ptlrpc_request *req)
                 break;
 
         case MDS_REINT: {
-                __u32 *opcp = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*opcp));
+                __u32 *opcp = lustre_msg_buf(req->rq_reqmsg, MDS_REQ_REC_OFF,
+                                             sizeof (*opcp));
                 __u32  opc;
-                int size[3] = {sizeof(struct mds_body), mds->mds_max_mdsize,
+                int size[] = { sizeof(struct mds_body), mds->mds_max_mdsize,
                                mds->mds_max_cookiesize};
                 int bufcount;
 
@@ -1416,7 +1546,7 @@ int mds_handle(struct ptlrpc_request *req)
                 if (rc)
                         break;
 
-                rc = mds_reint(req, 0, NULL);
+                rc = mds_reint(req, MDS_REQ_REC_OFF, NULL);
                 fail = OBD_FAIL_MDS_REINT_NET_REP;
                 break;
         }
@@ -1424,25 +1554,25 @@ int mds_handle(struct ptlrpc_request *req)
         case MDS_CLOSE:
                 DEBUG_REQ(D_INODE, req, "close");
                 OBD_FAIL_RETURN(OBD_FAIL_MDS_CLOSE_NET, 0);
-                rc = mds_close(req);
+                rc = mds_close(req, MDS_REQ_REC_OFF);
                 break;
 
         case MDS_DONE_WRITING:
                 DEBUG_REQ(D_INODE, req, "done_writing");
                 OBD_FAIL_RETURN(OBD_FAIL_MDS_DONE_WRITING_NET, 0);
-                rc = mds_done_writing(req);
+                rc = mds_done_writing(req, MDS_REQ_REC_OFF);
                 break;
 
         case MDS_PIN:
                 DEBUG_REQ(D_INODE, req, "pin");
                 OBD_FAIL_RETURN(OBD_FAIL_MDS_PIN_NET, 0);
-                rc = mds_pin(req);
+                rc = mds_pin(req, MDS_REQ_REC_OFF);
                 break;
 
         case MDS_SYNC:
                 DEBUG_REQ(D_INODE, req, "sync");
                 OBD_FAIL_RETURN(OBD_FAIL_MDS_SYNC_NET, 0);
-                rc = mds_sync(req);
+                rc = mds_sync(req, MDS_REQ_REC_OFF);
                 break;
 
         case MDS_SET_INFO:
@@ -1497,11 +1627,21 @@ int mds_handle(struct ptlrpc_request *req)
                 OBD_FAIL_RETURN(OBD_FAIL_OBD_LOGD_NET, 0);
                 rc = llog_origin_handle_create(req);
                 break;
+        case LLOG_ORIGIN_HANDLE_DESTROY:
+                DEBUG_REQ(D_INODE, req, "llog_init");
+                OBD_FAIL_RETURN(OBD_FAIL_OBD_LOGD_NET, 0);
+                rc = llog_origin_handle_destroy(req);
+                break;
         case LLOG_ORIGIN_HANDLE_NEXT_BLOCK:
                 DEBUG_REQ(D_INODE, req, "llog next block");
                 OBD_FAIL_RETURN(OBD_FAIL_OBD_LOGD_NET, 0);
                 rc = llog_origin_handle_next_block(req);
                 break;
+        case LLOG_ORIGIN_HANDLE_PREV_BLOCK:
+                DEBUG_REQ(D_INODE, req, "llog prev block");
+                OBD_FAIL_RETURN(OBD_FAIL_OBD_LOGD_NET, 0);
+                rc = llog_origin_handle_prev_block(req);
+                break;
         case LLOG_ORIGIN_HANDLE_READ_HEADER:
                 DEBUG_REQ(D_INODE, req, "llog read header");
                 OBD_FAIL_RETURN(OBD_FAIL_OBD_LOGD_NET, 0);
@@ -1581,20 +1721,29 @@ int mds_update_server_data(struct obd_device *obd, int force_sync)
 }
 
 static
-void fsoptions_to_mds_flags(struct mds_obd *mds, const char *options)
+void fsoptions_to_mds_flags(struct mds_obd *mds, char *options)
 {
-        const char *p = options;
+        char *p = options;
 
         while (*options) {
+                int len;
+
                 while (*p && *p != ',')
                         p++;
 
-                if ((p - options == sizeof("user_xattr") - 1) &&
-                    !memcmp(options, "user_xattr", sizeof("user_xattr") - 1))
+                len = p - options;
+                if (len == sizeof("user_xattr") - 1 &&
+                    memcmp(options, "user_xattr", len) == 0) {
                         mds->mds_fl_user_xattr = 1;
-                else if ((p - options == sizeof("acl") - 1) &&
-                    !memcmp(options, "acl", sizeof("acl") - 1))
+                } else if (len == sizeof("acl") - 1 &&
+                         memcmp(options, "acl", len) == 0) {
+#ifdef CONFIG_FS_POSIX_ACL
                         mds->mds_fl_acl = 1;
+#else
+                        CWARN("ignoring unsupported acl mount option\n");
+                        memmove(options, p, strlen(p) + 1);
+#endif
+                }
 
                 options = ++p;
         }
@@ -1610,9 +1759,11 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf)
 {
         struct lprocfs_static_vars lvars;
         struct lustre_cfg* lcfg = buf;
-        char *options = NULL;
         struct mds_obd *mds = &obd->u.mds;
         struct vfsmount *mnt;
+        struct obd_uuid uuid;
+        __u8 *uuid_ptr;
+        char *options, *str, *label;
         char ns_name[48];
         unsigned long page;
         int rc = 0;
@@ -1641,7 +1792,7 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf)
         /* here we use "iopen_nopriv" hardcoded, because it affects MDS utility
          * and the rest of options are passed by mount options. Probably this
          * should be moved to somewhere else like startup scripts or lconf. */
-        sprintf(options, "iopen_nopriv");
+        strcpy(options, "iopen_nopriv");
 
         if (LUSTRE_CFG_BUFLEN(lcfg, 4) > 0 && lustre_cfg_buf(lcfg, 4)) {
                 sprintf(options + strlen(options), ",%s",
@@ -1662,7 +1813,7 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf)
         CDEBUG(D_SUPER, "%s: mnt = %p\n", lustre_cfg_string(lcfg, 1), mnt);
 
         LASSERT(!lvfs_check_rdonly(lvfs_sbdev(mnt->mnt_sb)));
-        
+
         sema_init(&mds->mds_orphan_recovery_sem, 1);
         sema_init(&mds->mds_epoch_sem, 1);
         spin_lock_init(&mds->mds_transno_lock);
@@ -1710,7 +1861,7 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf)
         rc = lquota_setup(quota_interface, obd, lcfg);
         if (rc)
                 GOTO(err_fs, rc);
-        
+
         mds->mds_group_hash = upcall_cache_init(obd->obd_name);
         if (IS_ERR(mds->mds_group_hash)) {
                 rc = PTR_ERR(mds->mds_group_hash);
@@ -1718,8 +1869,8 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf)
                 GOTO(err_qctxt, rc);
         }
 
-        /* Wait for mds_postrecov trying to clear orphans until 9439 is fixed */
-        obd->obd_async_recov = 0;
+        /* Don't wait for mds_postrecov trying to clear orphans */
+        obd->obd_async_recov = 1;
         rc = mds_postsetup(obd);
         if (rc)
                 GOTO(err_qctxt, rc);
@@ -1728,25 +1879,34 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf)
         lprocfs_init_vars(mds, &lvars);
         lprocfs_obd_setup(obd, lvars.obd_vars);
 
+        uuid_ptr = fsfilt_uuid(obd, obd->u.obt.obt_sb);
+        if (uuid_ptr != NULL) {
+                class_uuid_unparse(uuid_ptr, &uuid);
+                str = uuid.uuid;
+        } else {
+                str = "no UUID";
+        }
+
+        label = fsfilt_label(obd, obd->u.obt.obt_sb);
         if (obd->obd_recovering) {
-                LCONSOLE_WARN("MDT %s now serving %s, but will be in recovery "
-                              "until %d %s reconnect, or if no clients "
-                              "reconnect for %d:%.02d; during that time new "
+                LCONSOLE_WARN("MDT %s now serving %s (%s%s%s), but will be in "
+                              "recovery until %d %s reconnect, or if no clients"
+                              " reconnect for %d:%.02d; during that time new "
                               "clients will not be allowed to connect. "
                               "Recovery progress can be monitored by watching "
                               "/proc/fs/lustre/mds/%s/recovery_status.\n",
-                              obd->obd_name,
-                              lustre_cfg_string(lcfg, 1),
+                              obd->obd_name, lustre_cfg_string(lcfg, 1),
+                              label ?: "", label ? "/" : "", str,
                               obd->obd_recoverable_clients,
-                              (obd->obd_recoverable_clients == 1) 
+                              (obd->obd_recoverable_clients == 1)
                               ? "client" : "clients",
                               (int)(OBD_RECOVERY_TIMEOUT) / 60,
                               (int)(OBD_RECOVERY_TIMEOUT) % 60,
                               obd->obd_name);
         } else {
-                LCONSOLE_INFO("MDT %s now serving %s with recovery %s.\n",
-                              obd->obd_name,
-                              lustre_cfg_string(lcfg, 1),
+                LCONSOLE_INFO("MDT %s now serving %s (%s%s%s) with recovery "
+                              "%s\n", obd->obd_name, lustre_cfg_string(lcfg, 1),
+                              label ?: "", label ? "/" : "", str,
                               obd->obd_replayable ? "enabled" : "disabled");
         }
 
@@ -1756,7 +1916,7 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf)
         RETURN(0);
 
 err_qctxt:
-        lquota_cleanup(quota_interface, obd);       
+        lquota_cleanup(quota_interface, obd);
 err_fs:
         /* No extra cleanup needed for llog_init_commit_thread() */
         mds_fs_cleanup(obd);
@@ -1775,6 +1935,36 @@ err_ops:
         return rc;
 }
 
+static int mds_lov_clean(struct obd_device *obd)
+{       /* Drop the MDS profile, then disconnect and clean up the lov; both exits are RETURN(0). */
+        struct mds_obd *mds = &obd->u.mds;
+        struct obd_device *osc = mds->mds_osc_obd;
+        ENTRY;
+
+        if (mds->mds_profile) { /* delete the profile and free its name string */
+                class_del_profile(mds->mds_profile);
+                OBD_FREE(mds->mds_profile, strlen(mds->mds_profile) + 1);
+                mds->mds_profile = NULL;
+        }
+
+        /* There should be a lov; if there is none, nothing is left to do */
+        if (!osc)
+                RETURN(0);
+
+        obd_register_observer(osc, NULL); /* presumably detaches the current observer - confirm */
+
+        /* Give lov our same shutdown flags */
+        osc->obd_force = obd->obd_force;
+        osc->obd_fail = obd->obd_fail;
+
+        /* Cleanup the lov; the results of both calls are not checked */
+        obd_disconnect(mds->mds_osc_exp);
+        class_manual_cleanup(osc);
+        mds->mds_osc_exp = NULL;
+
+        RETURN(0);
+}
+
 static int mds_postsetup(struct obd_device *obd)
 {
         struct mds_obd *mds = &obd->u.mds;
@@ -1786,6 +1976,11 @@ static int mds_postsetup(struct obd_device *obd)
         if (rc)
                 RETURN(rc);
 
+        rc = llog_setup(obd, LLOG_LOVEA_ORIG_CTXT, obd, 0, NULL,
+                        &llog_lvfs_ops);
+        if (rc)
+                RETURN(rc);
+
         if (mds->mds_profile) {
                 struct lvfs_run_ctxt saved;
                 struct lustre_profile *lprof;
@@ -1830,27 +2025,33 @@ err_cleanup:
         mds_lov_clean(obd);
 err_llog:
         llog_cleanup(llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT));
+        llog_cleanup(llog_get_context(obd, LLOG_LOVEA_ORIG_CTXT));
         RETURN(rc);
 }
 
 int mds_postrecov(struct obd_device *obd)
 {
-        int rc, item = 0;
+        int rc;
         ENTRY;
 
-        if (obd->obd_fail) 
+        if (obd->obd_fail)
                 RETURN(0);
 
         LASSERT(!obd->obd_recovering);
         LASSERT(llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT) != NULL);
 
+        /* set nextid first, so we are sure it happens */
+        rc = mds_lov_set_nextid(obd);
+        if (rc) {
+                CERROR("%s: mds_lov_set_nextid failed\n",
+                       obd->obd_name);
+                GOTO(out, rc);
+        }
+
         /* clean PENDING dir */
         rc = mds_cleanup_pending(obd);
-        if (rc < 0) {
+        if (rc < 0)
                 GOTO(out, rc);
-        } else {
-                item = rc;
-        }
 
         /* Does anyone need this to be synchronous ever? */
         mds_lov_start_synchronize(obd, NULL, obd->obd_async_recov);
@@ -1859,37 +2060,20 @@ int mds_postrecov(struct obd_device *obd)
         lquota_recovery(quota_interface, obd);
 
 out:
-        RETURN(rc < 0 ? rc : item);
+        RETURN(rc);
 }
 
-int mds_lov_clean(struct obd_device *obd)
+/* We need to be able to stop an mds_lov_synchronize */
+static int mds_lov_early_clean(struct obd_device *obd)
 {
         struct mds_obd *mds = &obd->u.mds;
         struct obd_device *osc = mds->mds_osc_obd;
-        ENTRY;
-
-        if (mds->mds_profile) {
-                class_del_profile(mds->mds_profile);
-                OBD_FREE(mds->mds_profile, strlen(mds->mds_profile) + 1);
-                mds->mds_profile = NULL;
-        }
 
-        /* There better be a lov */
-        if (!osc)
-                RETURN(0);
-
-        obd_register_observer(osc, NULL);
-
-        /* Give lov our same shutdown flags */
-        osc->obd_force = obd->obd_force;
-        osc->obd_fail = obd->obd_fail;
-        
-        /* Cleanup the lov */
-        obd_disconnect(mds->mds_osc_exp);
-        class_manual_cleanup(osc);
-        mds->mds_osc_exp = NULL;
+        if (!osc || (!obd->obd_force && !obd->obd_fail))
+                return(0);
 
-        RETURN(0);
+        CDEBUG(D_HA, "abort inflight\n");
+        return (obd_precleanup(osc, OBD_CLEANUP_EARLY));
 }
 
 static int mds_precleanup(struct obd_device *obd, int stage)
@@ -1898,14 +2082,15 @@ static int mds_precleanup(struct obd_device *obd, int stage)
         ENTRY;
 
         switch (stage) {
-        case 1:
-                mds_lov_set_cleanup_flags(obd);
+        case OBD_CLEANUP_EXPORTS:
                 target_cleanup_recovery(obd);
+                mds_lov_early_clean(obd);
                 break;
-        case 2:
+        case OBD_CLEANUP_SELF_EXP:
                 mds_lov_disconnect(obd);
                 mds_lov_clean(obd);
                 llog_cleanup(llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT));
+                llog_cleanup(llog_get_context(obd, LLOG_LOVEA_ORIG_CTXT));
                 rc = obd_llog_finish(obd, 0);
         }
         RETURN(rc);
@@ -1951,7 +2136,7 @@ static int mds_cleanup(struct obd_device *obd)
 
         /* We can only unlock kernel if we are in the context of sys_ioctl,
            otherwise we never called lock_kernel */
-        if (kernel_locked()) {
+        if (ll_kernel_locked()) {
                 unlock_kernel();
                 must_relock++;
         }
@@ -1980,7 +2165,7 @@ static int mds_cleanup(struct obd_device *obd)
         RETURN(0);
 }
 
-static void fixup_handle_for_resent_req(struct ptlrpc_request *req,
+static void fixup_handle_for_resent_req(struct ptlrpc_request *req, int offset,
                                         struct ldlm_lock *new_lock,
                                         struct ldlm_lock **old_lock,
                                         struct lustre_handle *lockh)
@@ -1988,7 +2173,7 @@ static void fixup_handle_for_resent_req(struct ptlrpc_request *req,
         struct obd_export *exp = req->rq_export;
         struct obd_device *obd = exp->exp_obd;
         struct ldlm_request *dlmreq =
-                lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*dlmreq));
+                lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*dlmreq));
         struct lustre_handle remote_hdl = dlmreq->lock_handle1;
         struct list_head *iter;
 
@@ -2017,7 +2202,7 @@ static void fixup_handle_for_resent_req(struct ptlrpc_request *req,
         /* If the xid matches, then we know this is a resent request,
          * and allow it. (It's probably an OPEN, for which we don't
          * send a lock */
-        if (req->rq_xid == 
+        if (req->rq_xid ==
             le64_to_cpu(exp->exp_mds_data.med_mcd->mcd_last_xid))
                 return;
 
@@ -2057,15 +2242,16 @@ static int mds_intent_policy(struct ldlm_namespace *ns,
         struct lustre_handle lockh = { 0 };
         struct ldlm_lock *new_lock = NULL;
         int getattr_part = MDS_INODELOCK_UPDATE;
-        int rc, offset = 2;
-        int repbufcnt = 3, repsize[4] = {sizeof(struct ldlm_reply),
-                                         sizeof(struct mds_body),
-                                         mds->mds_max_mdsize};
+        int repsize[4] = {sizeof(*rep),
+                          sizeof(struct mds_body),
+                          mds->mds_max_mdsize};
+        int repbufcnt = 3, offset = MDS_REQ_INTENT_REC_OFF;
+        int rc;
         ENTRY;
 
         LASSERT(req != NULL);
 
-        if (req->rq_reqmsg->bufcount <= 1) {
+        if (req->rq_reqmsg->bufcount <= MDS_REQ_INTENT_IT_OFF) {
                 /* No intent was provided */
                 int size = sizeof(struct ldlm_reply);
                 rc = lustre_pack_reply(req, 1, &size, NULL);
@@ -2073,7 +2259,8 @@ static int mds_intent_policy(struct ldlm_namespace *ns,
                 RETURN(0);
         }
 
-        it = lustre_swab_reqbuf(req, 1, sizeof(*it), lustre_swab_ldlm_intent);
+        it = lustre_swab_reqbuf(req, MDS_REQ_INTENT_IT_OFF, sizeof(*it),
+                                lustre_swab_ldlm_intent);
         if (it == NULL) {
                 CERROR("Intent missing\n");
                 RETURN(req->rq_status = -EFAULT);
@@ -2083,6 +2270,7 @@ static int mds_intent_policy(struct ldlm_namespace *ns,
 
         if ((req->rq_export->exp_connect_flags & OBD_CONNECT_ACL) &&
             (it->opc & (IT_OPEN | IT_GETATTR | IT_LOOKUP)))
+                /* we should never allow OBD_CONNECT_ACL if not configured */
                 repsize[repbufcnt++] = LUSTRE_POSIX_ACL_MAX_SIZE;
         else if (it->opc & IT_UNLINK)
                 repsize[repbufcnt++] = mds->mds_max_cookiesize;
@@ -2099,7 +2287,8 @@ static int mds_intent_policy(struct ldlm_namespace *ns,
         switch ((long)it->opc) {
         case IT_OPEN:
         case IT_CREAT|IT_OPEN:
-                fixup_handle_for_resent_req(req, lock, NULL, &lockh);
+                fixup_handle_for_resent_req(req, MDS_REQ_INTENT_LOCKREQ_OFF,
+                                            lock, NULL, &lockh);
                 /* XXX swab here to assert that an mds_open reint
                  * packet is following */
                 rep->lock_policy_res2 = mds_reint(req, offset, &lockh);
@@ -2110,7 +2299,7 @@ static int mds_intent_policy(struct ldlm_namespace *ns,
                         RETURN(ELDLM_LOCK_ABORTED);
                 if (intent_disposition(rep, DISP_LOOKUP_NEG) &&
                     !intent_disposition(rep, DISP_OPEN_OPEN))
-#endif 
+#endif
                         RETURN(ELDLM_LOCK_ABORTED);
                 break;
         case IT_LOOKUP:
@@ -2118,10 +2307,20 @@ static int mds_intent_policy(struct ldlm_namespace *ns,
         case IT_GETATTR:
                         getattr_part |= MDS_INODELOCK_LOOKUP;
         case IT_READDIR:
-                fixup_handle_for_resent_req(req, lock, &new_lock, &lockh);
+                fixup_handle_for_resent_req(req, MDS_REQ_INTENT_LOCKREQ_OFF,
+                                            lock, &new_lock, &lockh);
+
+                /* INODEBITS_INTEROP: if this lock was converted from a
+                 * plain lock (client does not support inodebits), then
+                 * child lock must be taken with both lookup and update
+                 * bits set for all operations.
+                 */
+                if (!(req->rq_export->exp_connect_flags & OBD_CONNECT_IBITS))
+                        getattr_part = MDS_INODELOCK_LOOKUP |
+                                       MDS_INODELOCK_UPDATE;
+
                 rep->lock_policy_res2 = mds_getattr_name(offset, req,
                                                          getattr_part, &lockh);
-
                 /* FIXME: LDLM can set req->rq_status. MDS sets
                    policy_res{1,2} with disposition and status.
                    - replay: returns 0 & req->status is old status
@@ -2216,8 +2415,8 @@ static int mdt_setup(struct obd_device *obd, obd_count len, void *buf)
                 ptlrpc_init_svc(MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE,
                                 MDS_MAXREPSIZE, MDS_REQUEST_PORTAL,
                                 MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_TIMEOUT,
-                                mds_handle, "mds", obd->obd_proc_entry, NULL,
-                                MDT_NUM_THREADS);
+                                mds_handle, LUSTRE_MDS_NAME,
+                                obd->obd_proc_entry, NULL, MDT_NUM_THREADS);
 
         if (!mds->mds_service) {
                 CERROR("failed to start service\n");
@@ -2300,7 +2499,7 @@ static int mdt_health_check(struct obd_device *obd)
 {
         struct mds_obd *mds = &obd->u.mds;
         int rc = 0;
-        
+
         down(&mds->mds_health_sem);
         rc |= ptlrpc_service_health_check(mds->mds_readpage_service);
         rc |= ptlrpc_service_health_check(mds->mds_setattr_service);
@@ -2313,11 +2512,10 @@ static int mdt_health_check(struct obd_device *obd)
          */
         if(rc != 0)
                 rc = 1;
-        
+
         return rc;
 }
 
-
 static struct dentry *mds_lvfs_fid2dentry(__u64 id, __u32 gen, __u64 gr,
                                           void *data)
 {
@@ -2328,6 +2526,21 @@ static struct dentry *mds_lvfs_fid2dentry(__u64 id, __u32 gen, __u64 gr,
         return mds_fid2dentry(&obd->u.mds, &fid, NULL);
 }
 
+static int mds_health_check(struct obd_device *obd)
+{       /* Returns 0 when healthy, 1 when either check below fails. */
+        struct obd_device_target *odt = &obd->u.obt;
+        struct mds_obd *mds = &obd->u.mds;
+        int rc = 0;
+
+        if (odt->obt_sb->s_flags & MS_RDONLY) /* superblock is flagged read-only */
+                rc = 1;
+
+        LASSERT(mds->mds_health_check_filp != NULL);
+        rc |= !!lvfs_check_io_health(obd, mds->mds_health_check_filp); /* !! folds any non-zero result to 1 */
+
+        return rc;
+}
 struct lvfs_callback_ops mds_lvfs_ops = {
         l_fid2dentry:     mds_lvfs_fid2dentry,
 };
@@ -2336,6 +2549,7 @@ struct lvfs_callback_ops mds_lvfs_ops = {
 static struct obd_ops mds_obd_ops = {
         .o_owner           = THIS_MODULE,
         .o_connect         = mds_connect,
+        .o_reconnect       = mds_reconnect,
         .o_init_export     = mds_init_export,
         .o_destroy_export  = mds_destroy_export,
         .o_disconnect      = mds_disconnect,
@@ -2350,6 +2564,7 @@ static struct obd_ops mds_obd_ops = {
         .o_llog_init       = mds_llog_init,
         .o_llog_finish     = mds_llog_finish,
         .o_notify          = mds_notify,
+        .o_health_check    = mds_health_check,
 };
 
 static struct obd_ops mdt_obd_ops = {
@@ -2359,8 +2574,8 @@ static struct obd_ops mdt_obd_ops = {
         .o_health_check    = mdt_health_check,
 };
 
-quota_interface_t *quota_interface = NULL;
-extern quota_interface_t mds_quota_interface;
+quota_interface_t *quota_interface;
+quota_interface_t mds_quota_interface;
 
 static int __init mds_init(void)
 {
index 8d0565d..4cf39d6 100644 (file)
@@ -5,8 +5,45 @@
 #ifndef _MDS_INTERNAL_H
 #define _MDS_INTERNAL_H
 
+#include <linux/lustre_disk.h>  /* XXX */
 #include <lustre_mds.h>
 
+#define MDT_ROCOMPAT_SUPP       (OBD_ROCOMPAT_LOVOBJID)
+
+#define MDT_INCOMPAT_SUPP       (OBD_INCOMPAT_MDT)
+
+/* Data stored per server at the head of the last_rcvd file.  In le32 order.
+ * Try to keep this the same as fsd_server_data so we might one day merge. */
+struct mds_server_data {
+        __u8  msd_uuid[40];        /* server UUID */
+        __u64 msd_last_transno;    /* last completed transaction ID */
+        __u64 msd_mount_count;     /* MDS incarnation number */
+        __u64 msd_mount_count_new; /* future MDS incarnation number */
+        __u32 msd_feature_compat;  /* compatible feature flags */
+        __u32 msd_feature_rocompat;/* read-only compatible feature flags */
+        __u32 msd_feature_incompat;/* incompatible feature flags */
+        __u32 msd_server_size;     /* size of server data area */
+        __u32 msd_client_start;    /* start of per-client data area */
+        __u16 msd_client_size;     /* size of per-client data area */
+        __u16 msd_subdir_count;    /* number of subdirectories for objects */
+        __u64 msd_catalog_oid;     /* recovery catalog object id */
+        __u32 msd_catalog_ogen;    /* recovery catalog inode generation */
+        __u8  msd_peeruuid[40];    /* UUID of LOV/OSC associated with MDS */
+        __u32 msd_ost_index;       /* index number of OST in LOV */
+        __u32 msd_mds_index;       /* index number of MDS in LMV */
+        __u8  msd_padding[LR_SERVER_SIZE - 148]; /* 148 = summed size of the fields above */
+};
+
+/* Data stored per client in the last_rcvd file.  In le32 order. */
+struct mds_client_data {
+        __u8 mcd_uuid[40];      /* client UUID */
+        __u64 mcd_last_transno; /* last completed transaction ID */
+        __u64 mcd_last_xid;     /* xid for the last transaction */
+        __u32 mcd_last_result;  /* result from last RPC */
+        __u32 mcd_last_data;    /* per-op data (disposition for open &c.) */
+        __u8 mcd_padding[LR_CLIENT_SIZE - 64]; /* 64 = summed size of the fields above */
+};
+
 #define MDS_SERVICE_WATCHDOG_TIMEOUT (obd_timeout * 1000)
 
 #define MAX_ATIME_DIFF 60
@@ -127,6 +164,23 @@ int mds_osc_setattr_async(struct obd_device *obd, struct inode *inode,
                           struct lov_mds_md *lmm, int lmm_size,
                           struct llog_cookie *logcookies, struct ll_fid *fid);
 
+int mds_get_parents_children_locked(struct obd_device *obd,
+                                    struct mds_obd *mds,
+                                    struct ll_fid *p1_fid,
+                                    struct dentry **de_srcdirp,
+                                    struct ll_fid *p2_fid,
+                                    struct dentry **de_tgtdirp,
+                                    int parent_mode,
+                                    const char *old_name, int old_len,
+                                    struct dentry **de_oldp,
+                                    const char *new_name, int new_len,
+                                    struct dentry **de_newp,
+                                    struct lustre_handle *dlm_handles,
+                                    int child_mode);
+
+void mds_shrink_reply(struct obd_device *obd, struct ptlrpc_request *req,
+                      struct mds_body *body);
+int mds_get_cookie_size(struct obd_device *obd, struct lov_mds_md *lmm);
 /* mds/mds_lib.c */
 int mds_update_unpack(struct ptlrpc_request *, int offset,
                       struct mds_update_record *);
@@ -152,7 +206,6 @@ int mds_llog_finish(struct obd_device *obd, int count);
 /* mds/mds_lov.c */
 int mds_lov_connect(struct obd_device *obd, char * lov_name);
 int mds_lov_disconnect(struct obd_device *obd);
-void mds_lov_set_cleanup_flags(struct obd_device *);
 int mds_lov_write_objids(struct obd_device *obd);
 void mds_lov_update_objids(struct obd_device *obd, obd_id *ids);
 int mds_lov_clear_orphans(struct mds_obd *mds, struct obd_uuid *ost_uuid);
@@ -160,23 +213,28 @@ int mds_lov_set_nextid(struct obd_device *obd);
 int mds_lov_start_synchronize(struct obd_device *obd, struct obd_uuid *uuid,
                               int nonblock);
 int mds_post_mds_lovconf(struct obd_device *obd);
-int mds_notify(struct obd_device *obd, struct obd_device *watched, int active);
+int mds_notify(struct obd_device *obd, struct obd_device *watched,
+               enum obd_notify_event ev);
 int mds_convert_lov_ea(struct obd_device *obd, struct inode *inode,
                        struct lov_mds_md *lmm, int lmm_size);
 void mds_objids_from_lmm(obd_id *ids, struct lov_mds_md *lmm,
                          struct lov_desc *desc);
+int mds_init_lov_desc(struct obd_device *obd, struct obd_export *osc_exp);
 
 /* mds/mds_open.c */
 int mds_query_write_access(struct inode *inode);
 int mds_open(struct mds_update_record *rec, int offset,
              struct ptlrpc_request *req, struct lustre_handle *);
-int mds_pin(struct ptlrpc_request *req);
+int mds_pin(struct ptlrpc_request *req, int offset);
 void mds_mfd_unlink(struct mds_file_data *mfd, int decref);
-int mds_mfd_close(struct ptlrpc_request *req, struct obd_device *obd,
+int mds_mfd_close(struct ptlrpc_request *req, int offset, struct obd_device *obd,
                   struct mds_file_data *mfd, int unlink_orphan);
-int mds_close(struct ptlrpc_request *req);
-int mds_done_writing(struct ptlrpc_request *req);
+int mds_close(struct ptlrpc_request *req, int offset);
+int mds_done_writing(struct ptlrpc_request *req, int offset);
 
+/*mds/mds_join.c*/
+int mds_join_file(struct mds_update_record *rec, struct ptlrpc_request *req, 
+                  struct dentry *dchild, struct lustre_handle *lockh);
 
 /* mds/mds_fs.c */
 int mds_client_add(struct obd_device *obd, struct mds_obd *mds,
@@ -185,11 +243,11 @@ int mds_client_free(struct obd_export *exp);
 int mds_obd_create(struct obd_export *exp, struct obdo *oa,
                    struct lov_stripe_md **ea, struct obd_trans_info *oti);
 int mds_obd_destroy(struct obd_export *exp, struct obdo *oa,
-                    struct lov_stripe_md *ea, struct obd_trans_info *oti);
+                    struct lov_stripe_md *ea, struct obd_trans_info *oti,
+                    struct obd_export *md_exp);
 
 /* mds/handler.c */
 extern struct lvfs_callback_ops mds_lvfs_ops;
-int mds_lov_clean(struct obd_device *obd);
 extern int mds_iocontrol(unsigned int cmd, struct obd_export *exp,
                          int len, void *karg, void *uarg);
 int mds_postrecov(struct obd_device *obd);
@@ -206,6 +264,7 @@ int mds_pack_acl(struct mds_export_data *med, struct inode *inode,
                  int repoff);
 
 /* quota stuff */
+extern quota_interface_t mds_quota_interface;
 extern quota_interface_t *quota_interface;
 
 /* mds/mds_xattr.c */
index fdc3189..6432379 100644 (file)
 #include <linux/fs.h>
 #include <linux/jbd.h>
 #include <linux/ext3_fs.h>
-#include <linux/obd_support.h>
-#include <linux/obd_class.h>
-#include <linux/obd.h>
-#include <linux/lustre_lib.h>
-#include <linux/lustre_idl.h>
-#include <linux/lustre_mds.h>
-#include <linux/lustre_dlm.h>
-#include <linux/lustre_log.h>
-#include <linux/lustre_fsfilt.h>
-#include <linux/lustre_lite.h>
+#include <obd_support.h>
+#include <obd_class.h>
+#include <obd.h>
+#include <lustre_lib.h>
+#include <lustre_idl.h>
+#include <lustre_mds.h>
+#include <lustre_dlm.h>
+#include <lustre_log.h>
+#include <lustre_fsfilt.h>
+#include <lustre_lite.h>
+#include <obd_lov.h>
 #include "mds_internal.h"
-#include <linux/obd_lov.h>
 
 struct mdsea_cb_data {
     struct llog_handle     *mc_llh;
index b14ad93..a76be7d 100644 (file)
@@ -36,6 +36,7 @@
 #include <libcfs/list.h>
 #include <obd_class.h>
 #include <lustre_fsfilt.h>
+#include <lustre_mds.h>
 #include <lustre_commit_confd.h>
 #include <lustre_log.h>
 
index a39e33a..b392f14 100644 (file)
@@ -38,6 +38,7 @@
 #include <obd_lov.h>
 #include <lustre_lib.h>
 #include <lustre_fsfilt.h>
+#include <linux/lustre_ver.h>
 
 #include "mds_internal.h"
 
@@ -106,40 +107,82 @@ int mds_lov_write_objids(struct obd_device *obd)
 
 int mds_lov_clear_orphans(struct mds_obd *mds, struct obd_uuid *ost_uuid)
 {
+        int rc;
+        struct obdo oa; /* on-stack request object, zeroed below */
+        struct obd_trans_info oti = {0};
         struct lov_stripe_md  *empty_ea = NULL;
-        struct obd_trans_info oti = { 0 };
-        struct obdo *oa;
+        ENTRY;
+
+        LASSERT(mds->mds_lov_objids != NULL);
+
+        /* This create will in fact either create or destroy:  If the OST is
+         * missing objects below this ID, they will be created.  If it finds
+         * objects above this ID, they will be removed. */
+        memset(&oa, 0, sizeof(oa));
+        oa.o_valid = OBD_MD_FLFLAGS;
+        oa.o_flags = OBD_FL_DELORPHAN;
+        if (ost_uuid != NULL) { /* optional target UUID travels inline */
+                memcpy(&oa.o_inline, ost_uuid, sizeof(*ost_uuid));
+                oa.o_valid |= OBD_MD_FLINLINE;
+        }
+        rc = obd_create(mds->mds_osc_exp, &oa, &empty_ea, &oti);
+
+        RETURN(rc);
+}
+
+/* Update the LOV-OSC knowledge of the last used object ids ("next_id" key). */
+int mds_lov_set_nextid(struct obd_device *obd)
+{
+        struct mds_obd *mds = &obd->u.mds;
         int rc;
         ENTRY;
 
+        LASSERT(!obd->obd_recovering); /* must not run during recovery */
+
         LASSERT(mds->mds_lov_objids != NULL);
 
-        oa = obdo_alloc();
-        if (oa == NULL)
-                RETURN(-ENOMEM);
+        rc = obd_set_info(mds->mds_osc_exp, strlen("next_id"), "next_id",
+                          mds->mds_lov_desc.ld_tgt_count, mds->mds_lov_objids);
+        RETURN(rc);
+}
 
-        oa->o_valid = OBD_MD_FLFLAGS;
-        oa->o_flags = OBD_FL_DELORPHAN;
+int mds_init_lov_desc(struct obd_device *obd, struct obd_export *osc_exp)
+{
+        struct mds_obd *mds = &obd->u.mds; /* NB: osc_exp is unused below */
+        int valsize, rc, tgt_count;
+        __u32 stripes;
+        ENTRY;
 
-        if (ost_uuid != NULL) {
-                memcpy(&oa->o_inline, ost_uuid, sizeof(*ost_uuid));
-                oa->o_valid |= OBD_MD_FLINLINE;
+        mds->mds_has_lov_desc = 0; /* stays 0 if the fetch below fails */
+        valsize = sizeof(mds->mds_lov_desc);
+        rc = obd_get_info(mds->mds_osc_exp, strlen("lovdesc") + 1,
+                          "lovdesc", &valsize, &mds->mds_lov_desc);
+        if (rc) {
+                CERROR("can't get lov_desc, rc %d\n", rc);
+                RETURN(rc);
         }
 
-        oti.oti_objid = mds->mds_lov_objids;
-        rc = obd_create(mds->mds_osc_exp, oa, &empty_ea, &oti);
+        mds->mds_has_lov_desc = 1;
+        tgt_count = mds->mds_lov_desc.ld_tgt_count;
+        stripes = min(tgt_count, LOV_MAX_STRIPE_COUNT); /* cap stripe count */
 
-        obdo_free(oa);
-        RETURN(rc);
+        mds->mds_max_mdsize = lov_mds_md_size(stripes); /* EA size bound */
+        mds->mds_max_cookiesize = stripes * sizeof(struct llog_cookie);
+
+        CDEBUG(D_HA, "updated lov_desc, tgt_count: %d\n", tgt_count);
+
+        CDEBUG(D_HA, "updating max_mdsize/max_cookiesize: %d/%d\n",
+               mds->mds_max_mdsize, mds->mds_max_cookiesize);
+
+        RETURN(0);
 }
 
 /* update the LOV-OSC knowledge of the last used object id's */
 int mds_lov_connect(struct obd_device *obd, char * lov_name)
 {
-        struct obd_connect_data *data = NULL;
         struct mds_obd *mds = &obd->u.mds;
         struct lustre_handle conn = {0,};
-        int valsize;
+        struct obd_connect_data *data;
         int rc, i;
         ENTRY;
 
@@ -156,15 +199,14 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name)
                 RETURN(-ENOTCONN);
         }
 
-        OBD_ALLOC_PTR(data);
-        if (!data)
+        OBD_ALLOC(data, sizeof(*data));
+        if (data == NULL)
                 RETURN(-ENOMEM);
-        data->ocd_connect_flags = OBD_CONNECT_CROW;
-
-        rc = obd_connect(&conn, mds->mds_osc_obd, &obd->obd_uuid,
-                         data);
-        OBD_FREE_PTR(data);
-        
+        data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_INDEX;
+        data->ocd_version = LUSTRE_VERSION_CODE;
+        /* NB: lov_connect() needs to fill in .ocd_index for each OST */
+        rc = obd_connect(&conn, mds->mds_osc_obd, &obd->obd_uuid, data);
+        OBD_FREE(data, sizeof(*data));
         if (rc) {
                 CERROR("MDS cannot connect to LOV %s (%d)\n", lov_name, rc);
                 mds->mds_osc_obd = ERR_PTR(rc);
@@ -179,16 +221,11 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name)
                 GOTO(err_discon, rc);
         }
 
-        valsize = sizeof(mds->mds_lov_desc);
-        rc = obd_get_info(mds->mds_osc_exp, strlen("lovdesc") + 1, "lovdesc",
-                          &valsize, &mds->mds_lov_desc);
+        /* init lov_desc + easize */
+        rc = mds_init_lov_desc(obd, mds->mds_osc_exp);
         if (rc)
                 GOTO(err_reg, rc);
 
-        mds->mds_max_mdsize = lov_mds_md_size(mds->mds_lov_desc.ld_tgt_count);
-        mds->mds_max_cookiesize = mds->mds_lov_desc.ld_tgt_count*
-                sizeof(struct llog_cookie);
-        mds->mds_has_lov_desc = 1;
         rc = mds_lov_read_objids(obd);
         if (rc) {
                 CERROR("cannot read %s: rc = %d\n", "lov_objids", rc);
@@ -258,34 +295,6 @@ int mds_lov_disconnect(struct obd_device *obd)
         RETURN(rc);
 }
 
-/* for consistency, let's make the lov and the lov's
- * osc's see the same cleanup flags as our mds */
-void mds_lov_set_cleanup_flags(struct obd_device *obd)
-{
-        struct mds_obd *mds = &obd->u.mds;
-        struct lov_obd *lov;
-
-        if (IS_ERR(mds->mds_osc_obd) || (mds->mds_osc_exp == NULL))
-                return;
-
-        lov = &mds->mds_osc_obd->u.lov;
-        mds->mds_osc_obd->obd_force = obd->obd_force;
-        mds->mds_osc_obd->obd_fail = obd->obd_fail;
-        if (lov->tgts) {
-                struct obd_export *osc_exp;
-                int i;
-                spin_lock(&lov->lov_lock);
-                for (i = 0; i < lov->desc.ld_tgt_count; i++) {
-                        if (lov->tgts[i].ltd_exp != NULL) {
-                                osc_exp = lov->tgts[i].ltd_exp;
-                                osc_exp->exp_obd->obd_force = obd->obd_force;
-                                osc_exp->exp_obd->obd_fail = obd->obd_fail;
-                        }
-                }
-                spin_unlock(&lov->lov_lock);
-        }
-}
-
 int mds_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
                   void *karg, void *uarg)
 {
@@ -517,7 +526,10 @@ static int __mds_lov_syncronize(void *data)
 
         CWARN("MDS %s: %s now active, resetting orphans\n",
               obd->obd_name, uuid ? (char *)uuid->uuid : "All OSC's");
-        
+
+        if (obd->obd_stopping)
+                GOTO(out, rc = -ENODEV);
+
         rc = mds_lov_clear_orphans(&obd->u.mds, uuid);
         if (rc != 0) {
                 CERROR("%s: failed at mds_lov_clear_orphans: %d\n",
@@ -525,9 +537,10 @@ static int __mds_lov_syncronize(void *data)
                 GOTO(out, rc);
         }
 
+        EXIT;
 out:
-        class_export_put(obd->obd_self_export);
-        RETURN(rc);
+        class_decref(obd);
+        return rc;
 }
 
 int mds_lov_synchronize(void *data)
@@ -560,9 +573,16 @@ int mds_lov_start_synchronize(struct obd_device *obd, struct obd_uuid *uuid,
 
         mlsi->mlsi_obd = obd;
         mlsi->mlsi_uuid = uuid;
-        
-        /* We need to lock the mds in place for our new thread context. */
-        class_export_get(obd->obd_self_export);
+
+        /* Although class_export_get(obd->obd_self_export) would lock
+           the MDS in place, since it's only a self-export
+           it doesn't lock the LOV in place.  The LOV can be disconnected
+           during MDS precleanup, leaving nothing for __mds_lov_syncronize.
+           Simply taking an export ref on the LOV doesn't help, because it's
+           still disconnected. Taking an obd reference ensures that we don't
+           disconnect the LOV.  This of course means a cleanup won't
+           finish for as long as the sync is blocking. */
+        atomic_inc(&obd->obd_refcount); /* dropped via class_decref() */
 
         if (nonblock) {
                 /* Syncronize in the background */
@@ -570,7 +590,7 @@ int mds_lov_start_synchronize(struct obd_device *obd, struct obd_uuid *uuid,
                 if (rc < 0) {
                         CERROR("%s: error starting mds_lov_synchronize: %d\n",
                                obd->obd_name, rc);
-                        class_export_put(obd->obd_self_export);
+                        class_decref(obd);
                 } else {
                         CDEBUG(D_HA, "%s: mds_lov_synchronize thread: %d\n",
                                obd->obd_name, rc);
@@ -583,16 +603,18 @@ int mds_lov_start_synchronize(struct obd_device *obd, struct obd_uuid *uuid,
         RETURN(rc);
 }
 
-int mds_notify(struct obd_device *obd, struct obd_device *watched, int active)
+int mds_notify(struct obd_device *obd, struct obd_device *watched,
+               enum obd_notify_event ev)
 {
+        struct mds_obd *mds = &obd->u.mds;
         struct obd_uuid *uuid;
         int rc = 0;
         ENTRY;
 
-        if (!active)
+        if (ev != OBD_NOTIFY_ACTIVE)
                 RETURN(0);
 
-        if (strcmp(watched->obd_type->typ_name, "osc")) {
+        if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) {
                 CERROR("unexpected notification of %s %s!\n",
                        watched->obd_type->typ_name, watched->obd_name);
                 RETURN(-EINVAL);
@@ -600,14 +622,19 @@ int mds_notify(struct obd_device *obd, struct obd_device *watched, int active)
 
         uuid = &watched->u.cli.cl_import->imp_target_uuid;
         if (obd->obd_recovering) {
+                /* in the case OBD is in recovery we do not reinit desc and
+                 * easize, as that will be done in mds_lov_connect() after
+                 * recovery is finished. */
                 CWARN("MDS %s: in recovery, not resetting orphans on %s\n",
                       obd->obd_name, uuid->uuid);
         } else {
                 LASSERT(llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT) != NULL);
-                
-                rc = obd_set_info(obd->u.mds.mds_osc_exp, strlen("mds_conn"),
-                                  "mds_conn", 0, uuid);
-                if (rc != 0)
+
+                /* this may be called also in case of adding new OST, thus, we
+                 * have to update MDS lov_desc and re-init MDS easize. The same
+                 * should be done on clients. */
+                rc = mds_init_lov_desc(obd, mds->mds_osc_exp);
+                if (rc)
                         RETURN(rc);
 
                 rc = mds_lov_start_synchronize(obd, uuid, 1);
@@ -636,12 +663,14 @@ int mds_convert_lov_ea(struct obd_device *obd, struct inode *inode,
         int rc, err;
         ENTRY;
 
-        if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC)
+        if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC ||
+            le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_JOIN) /* swab first */
                 RETURN(0);
 
         CDEBUG(D_INODE, "converting LOV EA on %lu/%u from %#08x to %#08x\n",
                inode->i_ino, inode->i_generation, le32_to_cpu(lmm->lmm_magic),
                LOV_MAGIC);
+       
         rc = obd_unpackmd(obd->u.mds.mds_osc_exp, &lsm, lmm, lmm_size);
         if (rc < 0)
                 GOTO(conv_end, rc);
@@ -657,7 +686,7 @@ int mds_convert_lov_ea(struct obd_device *obd, struct inode *inode,
                 GOTO(conv_free, rc);
         }
 
-        rc = fsfilt_set_md(obd, inode, handle, lmm, lmm_size);
+        rc = fsfilt_set_md(obd, inode, handle, lmm, lmm_size, "lov");
 
         err = fsfilt_commit(obd, inode, handle, 0);
         if (!rc)
index 9e15740..b877e69 100644 (file)
@@ -38,6 +38,7 @@
 #include <libcfs/list.h>
 #include <obd_class.h>
 #include <lustre_fsfilt.h>
+#include <lustre_mds.h>
 #include <lustre_commit_confd.h>
 #include <lvfs.h>
 
index 6cdb442..ff70e59 100644 (file)
@@ -23,7 +23,7 @@ obdclass-all-objs := llog.o llog_cat.o llog_lvfs.o llog_obd.o llog_swab.o
 obdclass-all-objs += class_obd.o
 obdclass-all-objs += debug.o genops.o uuid.o llog_ioctl.o
 obdclass-all-objs += lprocfs_status.o lustre_handles.o lustre_peer.o
-obdclass-all-objs += statfs_pack.o obdo.o obd_config.o
+obdclass-all-objs += statfs_pack.o obdo.o obd_config.o prng.o
 
 obdclass-objs := $(obdclass-linux-objs) $(obdclass-all-objs)
 
index 391c339..f144fda 100644 (file)
@@ -41,6 +41,7 @@
 #include <lprocfs_status.h>
 #ifdef __KERNEL__
 #include <linux/lustre_build_version.h>
+#include <linux/lustre_ver.h>
 #endif
 #include <libcfs/list.h>
 #include "llog_internal.h"
@@ -323,9 +324,8 @@ int class_handle_ioctl(unsigned int cmd, unsigned long arg)
                         CERROR("Device %d not attached\n", obd->obd_minor);
                         GOTO(out, err = -ENODEV);
                 }
-                CDEBUG(D_IOCTL,
-                       "disabling committed-transno notifications on %d\n",
-                       obd->obd_minor);
+                CDEBUG(D_HA, "%s: disabling committed-transno notification\n",
+                       obd->obd_name);
                 obd->obd_no_transno = 1;
                 GOTO(out, err = 0);
         }
@@ -414,6 +414,7 @@ EXPORT_SYMBOL(class_handle_unhash);
 EXPORT_SYMBOL(class_handle2object);
 
 /* config.c */
+EXPORT_SYMBOL(class_decref);
 EXPORT_SYMBOL(class_get_profile);
 EXPORT_SYMBOL(class_del_profile);
 EXPORT_SYMBOL(class_process_config);
index 47f4bd8..aeefaf9 100644 (file)
@@ -119,12 +119,12 @@ int class_register_type(struct obd_ops *ops, struct lprocfs_vars *vars,
 #ifdef LPROCFS
         type->typ_procroot = lprocfs_register(type->typ_name, proc_lustre_root,
                                               vars, type);
-#endif
         if (IS_ERR(type->typ_procroot)) {
                 rc = PTR_ERR(type->typ_procroot);
                 type->typ_procroot = NULL;
                 GOTO (failed, rc);
         }
+#endif
 
         spin_lock(&obd_types_lock);
         list_add(&type->typ_chain, &obd_types);
@@ -1201,8 +1201,8 @@ search_again:
         list_for_each(p, &obd->obd_exports) {
                 doomed_exp[num_to_evict] = list_entry(p, struct obd_export,
                                                       exp_obd_chain);
-                if (strcmp(obd_export_nid2str(doomed_exp[num_to_evict]), nid)
-                    == 0) {
+                if (strcmp(obd_export_nid2str(doomed_exp[num_to_evict]),
+                           nid) == 0) {
                         class_export_get(doomed_exp[num_to_evict]);
                         if (++num_to_evict == EVICT_BATCH)
                                 break;
@@ -1212,8 +1212,8 @@ search_again:
 
         for (i = 0; i < num_to_evict; i++) {
                 exports_evicted++;
-                CERROR("evicting NID '%s' (%s) #%d at adminstrative request\n",
-                       nid, doomed_exp[i]->exp_client_uuid.uuid,
+                CWARN("%s: evict NID '%s' (%s) #%d at adminstrative request\n",
+                       obd->obd_name, nid, doomed_exp[i]->exp_client_uuid.uuid,
                        exports_evicted);
                 class_fail_export(doomed_exp[i]);
                 class_export_put(doomed_exp[i]);
@@ -1224,7 +1224,8 @@ search_again:
         }
 
         if (!exports_evicted)
-                CERROR("can't disconnect NID '%s': no exports found\n", nid);
+                CDEBUG(D_HA,"%s: can't disconnect NID '%s': no exports found\n",
+                       obd->obd_name, nid);
         return exports_evicted;
 }
 EXPORT_SYMBOL(obd_export_evict_by_nid);
@@ -1251,10 +1252,11 @@ int obd_export_evict_by_uuid(struct obd_device *obd, char *uuid)
         spin_unlock(&obd->obd_dev_lock);
 
         if (doomed_exp == NULL) {
-                CERROR("can't disconnect %s: no exports found\n", uuid);
+                CERROR("%s: can't disconnect %s: no exports found\n",
+                       obd->obd_name, uuid);
         } else {
-                CERROR("evicting %s at adminstrative request\n",
-                       doomed_exp->exp_client_uuid.uuid);
+                CWARN("%s: evicting %s at adminstrative request\n",
+                       obd->obd_name, doomed_exp->exp_client_uuid.uuid);
                 class_fail_export(doomed_exp);
                 class_export_put(doomed_exp);
                 exports_evicted++;
index a6edbb7..a364bec 100644 (file)
@@ -40,6 +40,7 @@
 #endif
 
 #include <obd_class.h>
+#include <lustre_mds.h>
 #include <lustre_log.h>
 #include <libcfs/list.h>
 
index 55039cc..28e47c1 100644 (file)
@@ -40,6 +40,7 @@
 #endif
 
 #include <obd_class.h>
+#include <lustre_mds.h>
 #include <lustre_log.h>
 #include <libcfs/list.h>
 
index f9c1ec0..0cafada 100644 (file)
@@ -29,6 +29,7 @@
 #endif
 
 #include <obd_class.h>
+#include <lustre_mds.h>
 #include <lustre_log.h>
 #include <libcfs/list.h>
 #include "llog_internal.h"
index 594a00f..7ea246d 100644 (file)
@@ -41,6 +41,7 @@
 
 #include <obd.h>
 #include <obd_class.h>
+#include <lustre_mds.h>
 #include <lustre_log.h>
 #include <obd_ost.h>
 #include <libcfs/list.h>
index c987642..c8c2cf1 100644 (file)
@@ -33,6 +33,7 @@
 #endif
 
 #include <obd_class.h>
+#include <lustre_mds.h>
 #include <lustre_log.h>
 #include <libcfs/list.h>
 #include "llog_internal.h"
index e12003f..303505b 100644 (file)
@@ -32,6 +32,7 @@
 #include <liblustre.h>
 #endif
 
+#include <lustre_mds.h>
 #include <lustre_log.h>
 
 static void print_llogd_body(struct llogd_body *d)
index 89dac0a..68f0d6a 100644 (file)
@@ -34,8 +34,8 @@
 #include <linux/init.h>
 
 #include <obd_class.h>
-#include <lustre_log.h>
 #include <lustre_mds.h> /* for LUSTRE_MDC_NAME */
+#include <lustre_log.h>
 
 static int llog_test_rand;
 static struct obd_uuid uuid = { .uuid = "test_uuid" };
index 1d4fa3a..21d6f50 100644 (file)
@@ -26,7 +26,7 @@
 #define DEBUG_SUBSYSTEM S_CLASS
 #ifndef __KERNEL__
 # include <liblustre.h>
-#endif 
+#endif
 
 #include <obd_support.h>
 #include <lustre_handles.h>
@@ -40,6 +40,10 @@ static int handle_count = 0;
 #define HANDLE_HASH_SIZE (1 << 14)
 #define HANDLE_HASH_MASK (HANDLE_HASH_SIZE - 1)
 
+/*
+ * Generate a unique 64bit cookie (hash) for a handle and insert it into
+ * global (per-node) hash-table.
+ */
 void class_handle_hash(struct portals_handle *h, portals_handle_addref_cb cb)
 {
         struct list_head *bucket;
@@ -49,19 +53,33 @@ void class_handle_hash(struct portals_handle *h, portals_handle_addref_cb cb)
         LASSERT(list_empty(&h->h_link));
 
         spin_lock(&handle_lock);
+
+        /*
+         * This is fast, but simplistic cookie generation algorithm, it will
+         * need a re-do at some point in the future for security.
+         */
         h->h_cookie = handle_base;
         handle_base += HANDLE_INCR;
-        spin_unlock(&handle_lock);
 
-        h->h_addref = cb;
         bucket = handle_hash + (h->h_cookie & HANDLE_HASH_MASK);
-        CDEBUG(D_INFO, "adding object %p with handle "LPX64" to hash\n",
-               h, h->h_cookie);
-
-        spin_lock(&handle_lock);
         list_add(&h->h_link, bucket);
         handle_count++;
+
+        if (unlikely(handle_base == 0)) {
+                /*
+                 * Cookie of zero is "dangerous", because in many places it's
+                 * assumed that 0 means "unassigned" handle, not bound to any
+                 * object.
+                 */
+                CWARN("The universe has been exhausted: cookie wrap-around.\n");
+                handle_base += HANDLE_INCR;
+        }
+
         spin_unlock(&handle_lock);
+
+        h->h_addref = cb; /* NOTE(review): set after h was hashed - confirm */
+        CDEBUG(D_INFO, "added object %p with handle "LPX64" to hash\n",
+               h, h->h_cookie);
         EXIT;
 }
 
index ca38953..67082af 100644 (file)
@@ -33,6 +33,7 @@
 #include <obd_class.h>
 #include <obd.h>
 #endif
+#include <lustre_mds.h>
 #include <lustre_log.h>
 #include <lprocfs_status.h>
 #include <libcfs/list.h>
index 0dac6d4..a95eb77 100644 (file)
@@ -194,7 +194,7 @@ static int echo_create_object(struct obd_device *obd, int on_target,
                 if (lsm->lsm_stripe_size == 0)
                         lsm->lsm_stripe_size = CFS_PAGE_SIZE;
 
-                idx = ll_insecure_random_int();
+                idx = ll_rand();
 
                 /* setup stripes: indices + default ids if required */
                 for (i = 0; i < lsm->lsm_stripe_count; i++) {
@@ -239,7 +239,7 @@ static int echo_create_object(struct obd_device *obd, int on_target,
                         oa->o_id, on_target ? " (undoing create)" : "");
 
                 if (on_target)
-                        obd_destroy(ec->ec_exp, oa, lsm, oti);
+                        obd_destroy(ec->ec_exp, oa, lsm, oti, NULL);
 
                 rc = -EEXIST;
                 goto failed;
@@ -280,11 +280,11 @@ echo_get_object (struct ec_object **ecop, struct obd_device *obd,
         spin_lock (&ec->ec_lock);
         eco = echo_find_object_locked (obd, oa->o_id);
         if (eco != NULL) {
-                if (eco->eco_deleted) {          /* being deleted */
-                        spin_unlock (&ec->ec_lock);
-                        return (-EAGAIN);       /* (see comment in cleanup) */
+                if (eco->eco_deleted) {            /* being deleted */
+                        spin_unlock(&ec->ec_lock); /* (see comment in cleanup) */
+                        return (-EAGAIN);
                 }
-
+                
                 eco->eco_refcount++;
                 spin_unlock (&ec->ec_lock);
                 *ecop = eco;
@@ -509,11 +509,11 @@ static int echo_client_kbrw(struct obd_device *obd, int rw, struct obdo *oa,
         gfp_mask = ((oa->o_id & 2) == 0) ? CFS_ALLOC_STD : CFS_ALLOC_HIGHUSER;
 
         LASSERT(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ);
+        LASSERT(lsm != NULL);
+        LASSERT(lsm->lsm_object_id == oa->o_id);
 
         if (count <= 0 ||
-            (count & (CFS_PAGE_SIZE - 1)) != 0 ||
-            (lsm != NULL &&
-             lsm->lsm_object_id != oa->o_id))
+            (count & (CFS_PAGE_SIZE - 1)) != 0)
                 return (-EINVAL);
 
         /* XXX think again with misaligned I/O */
@@ -936,9 +936,8 @@ static int echo_client_prep_commit(struct obd_export *exp, int rw,
                         rnb[i].len = CFS_PAGE_SIZE;
                 }
 
-                /* XXX this can't be the best.. */
-                memset(oti, 0, sizeof(*oti));
                 ioo.ioo_bufcnt = npages;
+                oti->oti_transno = 0;
 
                 ret = obd_preprw(rw, exp, oa, 1, &ioo, npages, rnb, lnb, oti);
                 if (ret != 0)
@@ -986,7 +985,7 @@ int echo_client_brw_ioctl(int rw, struct obd_export *exp,
 {
         struct obd_device *obd = class_exp2obd(exp);
         struct echo_client_obd *ec = &obd->u.echo_client;
-        struct obd_trans_info dummy_oti;
+        struct obd_trans_info dummy_oti = { .oti_thread_id = -1 };
         struct ec_object *eco;
         int rc;
         ENTRY;
@@ -995,8 +994,6 @@ int echo_client_brw_ioctl(int rw, struct obd_export *exp,
         if (rc)
                 RETURN(rc);
 
-        memset(&dummy_oti, 0, sizeof(dummy_oti));
-
         data->ioc_obdo1.o_valid &= ~OBD_MD_FLHANDLE;
         data->ioc_obdo1.o_valid |= OBD_MD_FLGROUP;
         data->ioc_obdo1.o_gr = FILTER_GROUP_ECHO;
@@ -1223,7 +1220,7 @@ echo_client_iocontrol(unsigned int cmd, struct obd_export *exp,
                         oa->o_gr = FILTER_GROUP_ECHO;
                         oa->o_valid |= OBD_MD_FLGROUP;
                         rc = obd_destroy(ec->ec_exp, oa, eco->eco_lsm, 
-                                         &dummy_oti);
+                                         &dummy_oti, NULL);
                         if (rc == 0)
                                 eco->eco_deleted = 1;
                         echo_put_object(eco);
index 6c80076..214b1ad 100644 (file)
 #include <lustre_dlm.h>
 #include <lustre_fsfilt.h>
 #include <lprocfs_status.h>
+#include <lustre_mds.h>
 #include <lustre_log.h>
 #include <lustre_commit_confd.h>
 #include <libcfs/list.h>
 #include <lustre_quota.h>
+#include <linux/lustre_ver.h>
 
 #include "filter_internal.h"
 
@@ -160,15 +162,14 @@ static int filter_client_add(struct obd_device *obd, struct filter_obd *filter,
          * there's no need for extra complication here
          */
         if (new_client) {
-                cl_idx = find_first_zero_bit(bitmap, FILTER_LR_MAX_CLIENTS);
+                cl_idx = find_first_zero_bit(bitmap, LR_MAX_CLIENTS);
         repeat:
-                if (cl_idx >= FILTER_LR_MAX_CLIENTS) {
-                        CERROR("no client slots - fix FILTER_LR_MAX_CLIENTS\n");
+                if (cl_idx >= LR_MAX_CLIENTS) {
+                        CERROR("no client slots - fix LR_MAX_CLIENTS\n");
                         RETURN(-EOVERFLOW);
                 }
                 if (test_and_set_bit(cl_idx, bitmap)) {
-                        cl_idx = find_next_zero_bit(bitmap,
-                                                    FILTER_LR_MAX_CLIENTS,
+                        cl_idx = find_next_zero_bit(bitmap, LR_MAX_CLIENTS,
                                                     cl_idx);
                         goto repeat;
                 }
@@ -302,7 +303,7 @@ static int filter_free_server_data(struct filter_obd *filter)
 {
         OBD_FREE(filter->fo_fsd, sizeof(*filter->fo_fsd));
         filter->fo_fsd = NULL;
-        OBD_FREE(filter->fo_last_rcvd_slots, FILTER_LR_MAX_CLIENTS / 8);
+        OBD_FREE(filter->fo_last_rcvd_slots, LR_MAX_CLIENTS / 8);
         filter->fo_last_rcvd_slots = NULL;
         return 0;
 }
@@ -369,17 +370,17 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp)
         int rc;
 
         /* ensure padding in the struct is the correct size */
-        LASSERT (offsetof(struct filter_server_data, fsd_padding) +
-                 sizeof(fsd->fsd_padding) == FILTER_LR_SERVER_SIZE);
-        LASSERT (offsetof(struct filter_client_data, fcd_padding) +
-                 sizeof(fcd->fcd_padding) == FILTER_LR_CLIENT_SIZE);
+        CLASSERT(offsetof(struct filter_server_data, fsd_padding) +
+                 sizeof(fsd->fsd_padding) == LR_SERVER_SIZE);
+        CLASSERT(offsetof(struct filter_client_data, fcd_padding) +
+                 sizeof(fcd->fcd_padding) == LR_CLIENT_SIZE);
 
         OBD_ALLOC(fsd, sizeof(*fsd));
         if (!fsd)
                 RETURN(-ENOMEM);
         filter->fo_fsd = fsd;
 
-        OBD_ALLOC(filter->fo_last_rcvd_slots, FILTER_LR_MAX_CLIENTS / 8);
+        OBD_ALLOC(filter->fo_last_rcvd_slots, LR_MAX_CLIENTS / 8);
         if (filter->fo_last_rcvd_slots == NULL) {
                 OBD_FREE(fsd, sizeof(*fsd));
                 RETURN(-ENOMEM);
@@ -391,9 +392,9 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp)
                 memcpy(fsd->fsd_uuid, obd->obd_uuid.uuid,sizeof(fsd->fsd_uuid));
                 fsd->fsd_last_transno = 0;
                 mount_count = fsd->fsd_mount_count = 0;
-                fsd->fsd_server_size = cpu_to_le32(FILTER_LR_SERVER_SIZE);
-                fsd->fsd_client_start = cpu_to_le32(FILTER_LR_CLIENT_START);
-                fsd->fsd_client_size = cpu_to_le16(FILTER_LR_CLIENT_SIZE);
+                fsd->fsd_server_size = cpu_to_le32(LR_SERVER_SIZE);
+                fsd->fsd_client_start = cpu_to_le32(LR_CLIENT_START);
+                fsd->fsd_client_size = cpu_to_le16(LR_CLIENT_SIZE);
                 fsd->fsd_subdir_count = cpu_to_le16(FILTER_SUBDIR_COUNT);
                 filter->fo_subdir_count = FILTER_SUBDIR_COUNT;
         } else {
@@ -413,14 +414,14 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp)
         }
 
         if (fsd->fsd_feature_incompat & ~cpu_to_le32(FILTER_INCOMPAT_SUPP)) {
-                CERROR("unsupported feature %x\n",
-                       le32_to_cpu(fsd->fsd_feature_incompat) &
+                CERROR("%s: unsupported incompat filesystem feature(s) %x\n",
+                       obd->obd_name, le32_to_cpu(fsd->fsd_feature_incompat) &
                        ~FILTER_INCOMPAT_SUPP);
                 GOTO(err_fsd, rc = -EINVAL);
         }
         if (fsd->fsd_feature_rocompat & ~cpu_to_le32(FILTER_ROCOMPAT_SUPP)) {
-                CERROR("read-only feature %x\n",
-                       le32_to_cpu(fsd->fsd_feature_rocompat) &
+                CERROR("%s: unsupported read-only filesystem feature(s) %x\n",
+                       obd->obd_name, le32_to_cpu(fsd->fsd_feature_rocompat) &
                        ~FILTER_ROCOMPAT_SUPP);
                 /* Do something like remount filesystem read-only */
                 GOTO(err_fsd, rc = -EINVAL);
@@ -529,7 +530,7 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp)
                 obd->obd_recovery_start = CURRENT_SECONDS;
                 /* Only used for lprocfs_status */
                 obd->obd_recovery_end = obd->obd_recovery_start +
-                        OBD_RECOVERY_TIMEOUT;
+                        OBD_RECOVERY_TIMEOUT / HZ;
         }
 
 out:
@@ -558,12 +559,6 @@ static int filter_cleanup_groups(struct obd_device *obd)
         int i;
         ENTRY;
 
-        if (filter->fo_blacklist != NULL) {
-                OBD_FREE(filter->fo_blacklist,
-                         FILTER_GROUPS * sizeof(struct filter_ext));
-                filter->fo_blacklist = NULL;
-        }
-        
         if (filter->fo_dentry_O_groups != NULL) {
                 for (i = 0; i < FILTER_GROUPS; i++) {
                         dentry = filter->fo_dentry_O_groups[i];
@@ -616,11 +611,6 @@ static int filter_prep_groups(struct obd_device *obd)
         int i, rc = 0, cleanup_phase = 0;
         ENTRY;
 
-        OBD_ALLOC(filter->fo_blacklist,
-                  FILTER_GROUPS * sizeof(struct filter_ext));
-        if (!filter->fo_blacklist)
-                GOTO(cleanup, rc = -ENOMEM);
-        
         O_dentry = simple_mkdir(current->fs->pwd, "O", 0700, 1);
         CDEBUG(D_INODE, "got/created O: %p\n", O_dentry);
         if (IS_ERR(O_dentry)) {
@@ -664,7 +654,7 @@ static int filter_prep_groups(struct obd_device *obd)
                         GOTO(cleanup_O0, rc);
                 }
                 filter->fo_fsd->fsd_feature_incompat |=
-                        cpu_to_le32(FILTER_INCOMPAT_GROUPS);
+                        cpu_to_le32(OBD_INCOMPAT_GROUPS);
                 rc = filter_update_server_data(obd, filter->fo_rcvd_filp,
                                                filter->fo_fsd, 1);
                 GOTO(cleanup_O0, rc);
@@ -716,15 +706,7 @@ static int filter_prep_groups(struct obd_device *obd)
                 filter->fo_last_objid_files[i] = filp;
 
                 if (filp->f_dentry->d_inode->i_size == 0) {
-                        if (i == 0 && filter->fo_fsd->fsd_unused != 0) {
-                                /* OST conversion, remove sometime post 1.0 */
-                                filter->fo_last_objids[0] =
-                                        le64_to_cpu(filter->fo_fsd->fsd_unused);
-                                CWARN("saving old objid "LPU64" to LAST_ID\n",
-                                      filter->fo_last_objids[0]);
-                        } else {
-                                filter->fo_last_objids[i] = FILTER_INIT_OBJID;
-                        }
+                        filter->fo_last_objids[i] = FILTER_INIT_OBJID;
                         rc = filter_update_last_objid(obd, i, 1);
                         if (rc)
                                 GOTO(cleanup, rc);
@@ -791,30 +773,47 @@ static int filter_prep(struct obd_device *obd)
                        LAST_RCVD, rc);
                 GOTO(out, rc);
         }
-
+        filter->fo_rcvd_filp = file;
         if (!S_ISREG(file->f_dentry->d_inode->i_mode)) {
                 CERROR("%s is not a regular file!: mode = %o\n", LAST_RCVD,
                        file->f_dentry->d_inode->i_mode);
                 GOTO(err_filp, rc = -ENOENT);
         }
 
-        /* steal operations */
-        inode = file->f_dentry->d_inode;
-        filter->fo_fop = file->f_op;
-        filter->fo_iop = inode->i_op;
-        filter->fo_aops = inode->i_mapping->a_ops;
+        inode = file->f_dentry->d_parent->d_inode;
+        /* We use i_op->unlink directly in filter_vfs_unlink() */
+        if (!inode->i_op || !inode->i_op->create || !inode->i_op->unlink) {
+                CERROR("%s: filesystem does not support create/unlink ops\n",
+                       obd->obd_name);
+                GOTO(err_filp, rc = -EOPNOTSUPP);
+        }
 
         rc = filter_init_server_data(obd, file);
         if (rc) {
                 CERROR("cannot read %s: rc = %d\n", LAST_RCVD, rc);
                 GOTO(err_filp, rc);
         }
-        filter->fo_rcvd_filp = file;
+        /* open and create health check io file*/
+        file = filp_open(HEALTH_CHECK, O_RDWR | O_CREAT, 0644);
+        if (IS_ERR(file)) {
+                rc = PTR_ERR(file);
+                CERROR("OBD filter: cannot open/create %s rc = %d\n",
+                       HEALTH_CHECK, rc);
+                GOTO(err_filp, rc);
+        }
+        filter->fo_health_check_filp = file;
+        if (!S_ISREG(file->f_dentry->d_inode->i_mode)) {
+                CERROR("%s is not a regular file!: mode = %o\n", HEALTH_CHECK,
+                       file->f_dentry->d_inode->i_mode);
+                GOTO(err_health_check, rc = -ENOENT);
+        }
+        rc = lvfs_check_io_health(obd, file);
+        if (rc)
+                GOTO(err_health_check, rc);
 
         rc = filter_prep_groups(obd);
         if (rc)
                 GOTO(err_server_data, rc);
-
  out:
         pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
 
@@ -823,8 +822,12 @@ static int filter_prep(struct obd_device *obd)
  err_server_data:
         //class_disconnect_exports(obd, 0);
         filter_free_server_data(filter);
+ err_health_check:
+        if (filp_close(filter->fo_health_check_filp, 0))
+                CERROR("can't close %s after error\n", HEALTH_CHECK);
+        filter->fo_health_check_filp = NULL;
  err_filp:
-        if (filp_close(file, 0))
+        if (filp_close(filter->fo_rcvd_filp, 0))
                 CERROR("can't close %s after error\n", LAST_RCVD);
         filter->fo_rcvd_filp = NULL;
         goto out;
@@ -859,40 +862,44 @@ static void filter_post(struct obd_device *obd)
         if (rc)
                 CERROR("error closing %s: rc = %d\n", LAST_RCVD, rc);
 
+        rc = filp_close(filter->fo_health_check_filp, 0);
+        filter->fo_health_check_filp = NULL;
+        if (rc)
+                CERROR("error closing %s: rc = %d\n", HEALTH_CHECK, rc);
+
         filter_cleanup_groups(obd);
         filter_free_server_data(filter);
         pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
 }
 
-static void filter_set_last_id(struct filter_obd *filter, 
-                              int group, obd_id id)
+static void filter_set_last_id(struct filter_obd *filter, struct obdo *oa,
+                               obd_id id)
 {
+        obd_gr group = 0;
         LASSERT(filter->fo_fsd != NULL);
-        LASSERT(group <= FILTER_GROUPS);
 
-        spin_lock(&filter->fo_objidlock);
-        filter->fo_last_objids[group] = id;
-        spin_unlock(&filter->fo_objidlock);
-}
-
-static void filter_grow_last_id(struct filter_obd *filter, 
-                                int group, obd_id id)
-{
-        LASSERT(filter->fo_fsd != NULL);
-        LASSERT(group <= FILTER_GROUPS);
+        if (oa != NULL) {
+                LASSERT(oa->o_gr <= FILTER_GROUPS);
+                group = oa->o_gr;
+        }
 
         spin_lock(&filter->fo_objidlock);
-        if (id > filter->fo_last_objids[group])
         filter->fo_last_objids[group] = id;
         spin_unlock(&filter->fo_objidlock);
 }
 
-__u64 filter_last_id(struct filter_obd *filter, int group)
+__u64 filter_last_id(struct filter_obd *filter, struct obdo *oa)
 {
         obd_id id;
+        obd_gr group = 0;
         LASSERT(filter->fo_fsd != NULL);
-        LASSERT(group < FILTER_GROUPS);
 
+        if (oa != NULL) {
+                LASSERT(oa->o_gr <= FILTER_GROUPS);
+                group = oa->o_gr;
+        }
+
+        /* FIXME: object groups */
         spin_lock(&filter->fo_objidlock);
         id = filter->fo_last_objids[group];
         spin_unlock(&filter->fo_objidlock);
@@ -900,46 +907,12 @@ __u64 filter_last_id(struct filter_obd *filter, int group)
         return id;
 }
 
-static void filter_lock_dentry(struct obd_device *obd,
-                               struct dentry *dparent)
+static int filter_lock_dentry(struct obd_device *obd, struct dentry *dparent)
 {
         down(&dparent->d_inode->i_sem);
+        return 0;
 }
 
-static void filter_unlock_dentry(struct obd_device *obd,
-                                 struct dentry *dparent)
-{
-        up(&dparent->d_inode->i_sem);
-}
-
-static void filter_parents_access(struct obd_device *obd,
-                                  obd_gr group, int lock)
-{
-        void (*access_func) (struct obd_device *, struct dentry *);
-        struct filter_obd *filter = &obd->u.filter;
-        struct dentry *dparent;
-        int i = 0;
-
-        access_func = lock ? filter_lock_dentry :
-                filter_unlock_dentry;
-        
-        if (group > 0 || filter->fo_subdir_count == 0) {
-                dparent = filter->fo_dentry_O_groups[group];
-                access_func(obd, dparent);
-        } else {
-                for (i = 0; i < filter->fo_subdir_count; i++) {
-                        dparent = filter->fo_dentry_O_sub[i];
-                        access_func(obd, dparent);
-                }
-        }
-}
-
-#define LOCK_PARENTS(obd, group)   \
-        filter_parents_access(obd, group, 1)
-
-#define UNLOCK_PARENTS(obd, group) \
-        filter_parents_access(obd, group, 0)
-
 /* We never dget the object parent, so DON'T dput it either */
 struct dentry *filter_parent(struct obd_device *obd, obd_gr group, obd_id objid)
 {
@@ -956,22 +929,22 @@ struct dentry *filter_parent(struct obd_device *obd, obd_gr group, obd_id objid)
 struct dentry *filter_parent_lock(struct obd_device *obd, obd_gr group,
                                   obd_id objid)
 {
-        struct dentry *dparent = filter_parent(obd, group, objid);
         unsigned long now = jiffies;
+        struct dentry *dparent = filter_parent(obd, group, objid);
+        int rc;
 
         if (IS_ERR(dparent))
                 return dparent;
 
-        filter_lock_dentry(obd, dparent);
+        rc = filter_lock_dentry(obd, dparent);
         fsfilt_check_slow(now, obd_timeout, "parent lock");
-        return dparent;
+        return rc ? ERR_PTR(rc) : dparent;
 }
 
-/* we never dget the object parent, so DON'T dput it either */
-static void filter_parent_unlock(struct obd_device *obd,
-                                 struct dentry *dparent)
+/* We never dget the object parent, so DON'T dput it either */
+static void filter_parent_unlock(struct dentry *dparent)
 {
-        filter_unlock_dentry(obd, dparent);
+        up(&dparent->d_inode->i_sem);
 }
 
 /* How to get files, dentries, inodes from object id's.
@@ -991,10 +964,8 @@ struct dentry *filter_fid2dentry(struct obd_device *obd,
         int len;
         ENTRY;
 
-        if (OBD_FAIL_CHECK(OBD_FAIL_OST_ENOENT)) {
-                CERROR("test case OBD_FAIL_OST_ENOENT\n");
+        if (OBD_FAIL_CHECK(OBD_FAIL_OST_ENOENT))
                 RETURN(ERR_PTR(-ENOENT));
-        }
 
         if (id == 0) {
                 CERROR("fatal: invalid object id 0\n");
@@ -1015,7 +986,7 @@ struct dentry *filter_fid2dentry(struct obd_device *obd,
                dparent->d_name.len, dparent->d_name.name, name);
         dchild = /*ll_*/lookup_one_len(name, dparent, len);
         if (dir_dentry == NULL)
-                filter_parent_unlock(obd, dparent);
+                filter_parent_unlock(dparent);
         if (IS_ERR(dchild)) {
                 CERROR("%s: child lookup error %ld\n", obd->obd_name,
                        PTR_ERR(dchild));
@@ -1059,14 +1030,67 @@ static int filter_prepare_destroy(struct obd_device *obd, obd_id objid)
         RETURN(rc);
 }
 
+/* This is vfs_unlink() without down(i_sem).  If we call regular vfs_unlink()
+ * we have 2.6 lock ordering issues with filter_commitrw_write() as it takes
+ * i_sem before starting a handle, while filter_destroy() + vfs_unlink do the
+ * reverse.  Caller must take i_sem before starting the transaction and we
+ * drop it here before the inode is removed from the dentry.  bug 4180/6984
+ *
+ * Both dir->i_sem and the child inode's i_sem are expected to be held on
+ * entry (checked with LASSERT).  Only the child's i_sem is released here;
+ * dir->i_sem is never released by this function.
+ *
+ * Returns 0 on success or a negative errno.  A negative dentry (no inode)
+ * returns -ENOENT before any semaphore is looked at. */
+int filter_vfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+        struct inode *inode = dentry->d_inode;
+        int rc;
+        ENTRY;
+
+        /* Must be tested before the child inode is dereferenced below; with
+         * no inode there is no child i_sem to assert on or to release. */
+        if (inode == NULL)
+                RETURN(-ENOENT);
+
+        /* don't need dir->i_zombie for 2.4, it is for rename/unlink of dir
+         * itself we already hold dir->i_sem for child create/unlink ops */
+        LASSERT(down_trylock(&dir->i_sem) != 0);
+        LASSERT(down_trylock(&inode->i_sem) != 0);
+
+        /* may_delete() */
+        if (dentry->d_parent->d_inode != dir)
+                GOTO(out, rc = -ENOENT);
+
+        rc = ll_permission(dir, MAY_WRITE | MAY_EXEC, NULL);
+        if (rc)
+                GOTO(out, rc);
+
+        if (IS_APPEND(dir))
+                GOTO(out, rc = -EPERM);
+
+        /* check_sticky() */
+        if ((inode->i_uid != current->fsuid && !capable(CAP_FOWNER)) ||
+            IS_APPEND(inode) || IS_IMMUTABLE(inode))
+                GOTO(out, rc = -EPERM);
+
+        /* NOTE: This might need to go outside i_sem, though it isn't clear if
+         *       that was done because of journal_start (which is already done
+         *       here) or some other ordering issue. */
+        DQUOT_INIT(dir);
+
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+        rc = security_inode_unlink(dir, dentry);
+        if (rc)
+                GOTO(out, rc);
+#endif
+
+        rc = dir->i_op->unlink(dir, dentry);
+out:
+        /* need to drop i_sem before we lose inode reference */
+        up(&inode->i_sem);
+        if (rc == 0)
+                d_delete(dentry);
+
+        RETURN(rc);
+}
+
 /* Caller must hold LCK_PW on parent and push us into kernel context.
+ * Caller must hold child i_sem, we drop it always.
  * Caller is also required to ensure that dchild->d_inode exists. */
-static int filter_unlink(struct obd_device *obd, obd_id objid,
-                         struct dentry *dparent, struct dentry *dchild)
+static int filter_destroy_internal(struct obd_device *obd, obd_id objid,
+                                   struct dentry *dparent,
+                                   struct dentry *dchild)
 {
         struct inode *inode = dchild->d_inode;
         int rc;
-        ENTRY;
 
         if (inode->i_nlink != 1 || atomic_read(&inode->i_count) != 1) {
                 CERROR("destroying objid %.*s ino %lu nlink %lu count %d\n",
@@ -1075,11 +1099,11 @@ static int filter_unlink(struct obd_device *obd, obd_id objid,
                        atomic_read(&inode->i_count));
         }
 
-        rc = vfs_unlink(dparent->d_inode, dchild);
+        rc = filter_vfs_unlink(dparent->d_inode, dchild);
         if (rc)
                 CERROR("error unlinking objid %.*s: rc %d\n",
                        dchild->d_name.len, dchild->d_name.name, rc);
-        RETURN(rc);
+        return(rc);
 }
 
 static int filter_intent_policy(struct ldlm_namespace *ns,
@@ -1220,6 +1244,7 @@ static int filter_intent_policy(struct ldlm_namespace *ns,
                 }
                 RETURN(ELDLM_LOCK_ABORTED);
         }
+
         /*
          * This check is for lock taken in filter_prepare_destroy() that does
          * not have l_glimpse_ast set. So the logic is: if there is a lock
@@ -1268,7 +1293,7 @@ static int filter_intent_policy(struct ldlm_namespace *ns,
  * unknown at the time of OST thread creation.
  *
  * Instead array of iobuf's is attached to struct filter_obd (->fo_iobuf_pool
- * field). This array has size OST_NUM_THREADS, so that each OST thread uses
+ * field). This array has size OST_MAX_THREADS, so that each OST thread uses
  * it's very own iobuf.
  *
  * Functions below
@@ -1288,18 +1313,18 @@ static int filter_intent_policy(struct ldlm_namespace *ns,
  */
 static void filter_iobuf_pool_done(struct filter_obd *filter)
 {
-        void **pool;
+        struct filter_iobuf **pool;
         int i;
 
         ENTRY;
 
         pool = filter->fo_iobuf_pool;
         if (pool != NULL) {
-                for (i = 0; i < OST_NUM_THREADS; ++ i) {
+                for (i = 0; i < filter->fo_iobuf_count; ++ i) {
                         if (pool[i] != NULL)
                                 filter_free_iobuf(pool[i]);
                 }
-                OBD_FREE(pool, OST_NUM_THREADS * sizeof pool[0]);
+                OBD_FREE(pool, filter->fo_iobuf_count * sizeof pool[0]);
                 filter->fo_iobuf_pool = NULL;
         }
         EXIT;
@@ -1308,48 +1333,45 @@ static void filter_iobuf_pool_done(struct filter_obd *filter)
 /*
  * pre-allocate pool of iobuf's to be used by filter_{prep,commit}rw_write().
  */
-static int filter_iobuf_pool_init(struct filter_obd *filter, int count)
+static int filter_iobuf_pool_init(struct filter_obd *filter)
 {
         void **pool;
-        int i;
-        int result = 0;
 
         ENTRY;
 
-        LASSERT(count <= OST_NUM_THREADS);
-
-        OBD_ALLOC_GFP(pool, OST_NUM_THREADS * sizeof pool[0], CFS_ALLOC_STD);
-        if (pool == NULL)
+        OBD_ALLOC_GFP(filter->fo_iobuf_pool, OST_MAX_THREADS * sizeof(*pool),
+                      GFP_KERNEL);
+        if (filter->fo_iobuf_pool == NULL)
                 RETURN(-ENOMEM);
 
-        filter->fo_iobuf_pool = pool;
-        filter->fo_iobuf_count = count;
-        for (i = 0; i < count; ++ i) {
-                /*
-                 * allocate kiobuf to be used by i-th OST thread.
-                 */
-                result = filter_alloc_iobuf(filter, OBD_BRW_WRITE,
-                                            PTLRPC_MAX_BRW_PAGES,
-                                            &pool[i]);
-                if (result != 0) {
-                        filter_iobuf_pool_done(filter);
-                        break;
-                }
-        }
-        RETURN(result);
+        filter->fo_iobuf_count = OST_MAX_THREADS;
+
+        RETURN(0);
 }
 
-/*
- * return iobuf preallocated by filter_iobuf_pool_init() for @thread.
- */
-void *filter_iobuf_get(struct ptlrpc_thread *thread, struct filter_obd *filter)
+/* Return iobuf allocated for @thread_id.  We don't know in advance how
+ * many threads there will be so we allocate a large empty array and only
+ * fill in those slots that are actually in use.
+ * If we haven't allocated a pool entry for this thread before, do so now. */
+void *filter_iobuf_get(struct filter_obd *filter, struct obd_trans_info *oti)
 {
-        void *kio;
+        int thread_id                    = oti ? oti->oti_thread_id : -1;
+        struct filter_iobuf  *pool       = NULL;
+        struct filter_iobuf **pool_place = NULL;
+
+        if (thread_id >= 0) {
+                LASSERT(thread_id < filter->fo_iobuf_count);
+                pool = *(pool_place = &filter->fo_iobuf_pool[thread_id]);
+        }
 
-        LASSERT(thread->t_id < filter->fo_iobuf_count);
-        kio = filter->fo_iobuf_pool[thread->t_id];
-        LASSERT(kio != NULL);
-        return kio;
+        if (unlikely(pool == NULL)) {
+                pool = filter_alloc_iobuf(filter, OBD_BRW_WRITE,
+                                          PTLRPC_MAX_BRW_PAGES);
+                if (pool_place != NULL)
+                        *pool_place = pool;
+        }
+
+        return pool;
 }
 
 /* mount the file system (secretly).  lustre_cfg parameters are:
@@ -1364,7 +1386,9 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf,
         struct lustre_cfg* lcfg = buf;
         struct filter_obd *filter = &obd->u.filter;
         struct vfsmount *mnt;
-        char *str;
+        struct obd_uuid uuid;
+        __u8 *uuid_ptr;
+        char *str, *label;
         char ns_name[48];
         int rc;
         ENTRY;
@@ -1378,7 +1402,7 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf,
         if (IS_ERR(obd->obd_fsops))
                 RETURN(PTR_ERR(obd->obd_fsops));
 
-        rc = filter_iobuf_pool_init(filter, OST_NUM_THREADS);
+        rc = filter_iobuf_pool_init(filter);
         if (rc != 0)
                 GOTO(err_ops, rc);
 
@@ -1421,8 +1445,7 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf,
                 GOTO(err_mntput, rc);
 
         filter->fo_destroy_in_progress = 0;
-
-        spin_lock_init(&filter->fo_blacklist_lock);
+        sema_init(&filter->fo_create_lock, 1);
         spin_lock_init(&filter->fo_translock);
         spin_lock_init(&filter->fo_objidlock);
         spin_lock_init(&filter->fo_stats_lock);
@@ -1463,25 +1486,34 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf,
         if (rc)
                 GOTO(err_post, rc);
 
+        uuid_ptr = fsfilt_uuid(obd, obd->u.obt.obt_sb);
+        if (uuid_ptr != NULL) {
+                class_uuid_unparse(uuid_ptr, &uuid);
+                str = uuid.uuid;
+        } else {
+                str = "no UUID";
+        }
+        label = fsfilt_label(obd, obd->u.obt.obt_sb);
+
         if (obd->obd_recovering) {
-                LCONSOLE_WARN("OST %s now serving %s, but will be in recovery "
-                              "until %d %s reconnect, or if no clients "
-                              "reconnect for %d:%.02d; during that time new "
+                LCONSOLE_WARN("OST %s now serving %s (%s%s%s), but will be in"
+                              "recovery until %d %s reconnect, or if no clients"
+                              " reconnect for %d:%.02d; during that time new "
                               "clients will not be allowed to connect. "
                               "Recovery progress can be monitored by watching "
                               "/proc/fs/lustre/obdfilter/%s/recovery_status.\n",
-                              obd->obd_name,
-                              lustre_cfg_string(lcfg, 1),
+                              obd->obd_name, lustre_cfg_string(lcfg, 1),
+                              label ?: "", label ? "/" : "", str,
                               obd->obd_recoverable_clients,
                               (obd->obd_recoverable_clients == 1)
                               ? "client" : "clients",
-                              (int)(OBD_RECOVERY_TIMEOUT) / 60,
-                              (int)(OBD_RECOVERY_TIMEOUT) % 60,
+                              (int)(OBD_RECOVERY_TIMEOUT / HZ) / 60,
+                              (int)(OBD_RECOVERY_TIMEOUT / HZ) % 60,
                               obd->obd_name);
         } else {
-                LCONSOLE_INFO("OST %s now serving %s with recovery %s.\n",
-                              obd->obd_name,
-                              lustre_cfg_string(lcfg, 1),
+                LCONSOLE_INFO("OST %s now serving %s (%s%s%s) with recovery "
+                              "%s\n", obd->obd_name, lustre_cfg_string(lcfg, 1),
+                              label ?: "", label ? "/" : "", str,
                               obd->obd_replayable ? "enabled" : "disabled");
         }
 
@@ -1600,10 +1632,10 @@ static int filter_precleanup(struct obd_device *obd, int stage)
         ENTRY;
 
         switch(stage) {
-        case 1:
+        case OBD_CLEANUP_EXPORTS:
                 target_cleanup_recovery(obd);
                 break;
-        case 2:
+        case OBD_CLEANUP_SELF_EXP:
                 rc = filter_llog_finish(obd, 0);
         }
         RETURN(rc);
@@ -1655,7 +1687,7 @@ static int filter_cleanup(struct obd_device *obd)
 
         /* We can only unlock kernel if we are in the context of sys_ioctl,
            otherwise we never called lock_kernel */
-        if (kernel_locked()) {
+        if (ll_kernel_locked()) {
                 unlock_kernel();
                 must_relock++;
         }
@@ -1678,9 +1710,85 @@ static int filter_cleanup(struct obd_device *obd)
         RETURN(0);
 }
 
+/* Negotiate connection parameters for an export on connect or reconnect.
+ *
+ * Masks the requested ocd_connect_flags with OST_CONNECT_SUPPORTED and stores
+ * the result in the export, replies with LUSTRE_VERSION_CODE, computes the
+ * grant under obd_osfs_lock when OBD_CONNECT_GRANT is set, and for
+ * OBD_CONNECT_INDEX either records the index in the server data (first
+ * connect, OBD_COMPAT_OST not yet set) or checks it against the recorded one.
+ *
+ * A NULL data is accepted and means there is nothing to negotiate.
+ * Returns 0 on success, -EBADF if the requested index does not match the
+ * recorded OST index. */
+static int filter_connect_internal(struct obd_export *exp,
+                                   struct obd_connect_data *data)
+{
+        ENTRY;
+
+        if (!data)
+                RETURN(0);
+
+        CDEBUG(D_RPCTRACE, "%s: cli %s/%p ocd_connect_flags: "LPX64
+               " ocd_version: %x ocd_grant: %d\n",
+               exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp,
+               data->ocd_connect_flags, data->ocd_version,
+               data->ocd_grant);
+
+        data->ocd_connect_flags &= OST_CONNECT_SUPPORTED;
+        exp->exp_connect_flags = data->ocd_connect_flags;
+        data->ocd_version = LUSTRE_VERSION_CODE;
+
+        if (exp->exp_connect_flags & OBD_CONNECT_GRANT) {
+                obd_size left, want;
+
+                spin_lock(&exp->exp_obd->obd_osfs_lock);
+                left = filter_grant_space_left(exp);
+                want = data->ocd_grant;
+                data->ocd_grant = filter_grant(exp, 0, want, left);
+                spin_unlock(&exp->exp_obd->obd_osfs_lock);
+
+                CDEBUG(D_CACHE, "%s: cli %s/%p ocd_grant: %d want: "
+                       "%lld left: %lld\n", exp->exp_obd->obd_name,
+                       exp->exp_client_uuid.uuid, exp,
+                       data->ocd_grant, want, left);
+        }
+
+        if (data->ocd_connect_flags & OBD_CONNECT_INDEX) {
+                struct filter_obd *filter = &exp->exp_obd->u.filter;
+                struct filter_server_data *fsd = filter->fo_fsd;
+                __u32 index = le32_to_cpu(fsd->fsd_ost_index);
+
+                if (!(fsd->fsd_feature_compat &
+                      cpu_to_le32(OBD_COMPAT_OST))) {
+                        int rc;
+
+                        /* this will only happen on the first connect */
+                        /* fsd fields are little-endian: CPU -> LE here */
+                        fsd->fsd_ost_index = cpu_to_le32(data->ocd_index);
+                        fsd->fsd_feature_compat |= cpu_to_le32(OBD_COMPAT_OST);
+                        rc = filter_update_server_data(exp->exp_obd,
+                                                       filter->fo_rcvd_filp,
+                                                       fsd, 1);
+                        /* not fatal for this connection, but don't hide it */
+                        if (rc)
+                                CERROR("%s: can't save OST index %u: rc = %d\n",
+                                       exp->exp_obd->obd_name,
+                                       data->ocd_index, rc);
+                } else if (index != data->ocd_index) {
+                        /* requested index first, recorded index second */
+                        LCONSOLE_ERROR("Connection from %s to index "
+                                       "%u doesn't match actual OST "
+                                       "index %u, bad configuration?\n",
+                                       obd_export_nid2str(exp),
+                                       data->ocd_index, index);
+                        RETURN(-EBADF);
+                }
+        }
+        /* FIXME: Do the same with the MDS UUID and fsd_peeruuid.
+         * FIXME: We don't strictly need the COMPAT flag for that,
+         * FIXME: as fsd_peeruuid[0] will tell us if that is set.
+         * FIXME: We needed it for the index, as index 0 is valid. */
+
+        RETURN(0);
+}
+
+/* Reconnect handler: run the connect-data negotiation again for an export.
+ * Returns -EINVAL when exp, obd or cluuid is NULL, otherwise the result of
+ * filter_connect_internal(). */
+static int filter_reconnect(struct obd_export *exp, struct obd_device *obd,
+                            struct obd_uuid *cluuid,
+                            struct obd_connect_data *data)
+{
+        int rc = -EINVAL;
+        ENTRY;
+
+        if (exp != NULL && obd != NULL && cluuid != NULL)
+                rc = filter_connect_internal(exp, data);
+
+        RETURN(rc);
+}
+
 /* nearly identical to mds_connect */
 static int filter_connect(struct lustre_handle *conn, struct obd_device *obd,
-                          struct obd_uuid *cluuid, struct obd_connect_data *data)
+                          struct obd_uuid *cluuid,
+                          struct obd_connect_data *data)
 {
         struct obd_export *exp;
         struct filter_export_data *fed;
@@ -1700,13 +1808,12 @@ static int filter_connect(struct lustre_handle *conn, struct obd_device *obd,
 
         fed = &exp->exp_filter_data;
 
-        if (data != NULL) {
-                data->ocd_connect_flags &= OST_CONNECT_SUPPORTED;
-                exp->exp_connect_flags = data->ocd_connect_flags;
-        }
-
         spin_lock_init(&fed->fed_lock);
 
+        rc = filter_connect_internal(exp, data);
+        if (rc)
+                GOTO(cleanup, rc);
+
         if (!obd->obd_replayable)
                 GOTO(cleanup, rc = 0);
 
@@ -1720,6 +1827,7 @@ static int filter_connect(struct lustre_handle *conn, struct obd_device *obd,
         fed->fed_fcd = fcd;
 
         rc = filter_client_add(obd, filter, fed, -1);
+
         GOTO(cleanup, rc);
 
 cleanup:
@@ -1831,6 +1939,7 @@ static void filter_grant_discard(struct obd_export *exp)
                  "%s: tot_pending "LPU64" cli %s/%p fed_pending %ld\n",
                  obd->obd_name, filter->fo_tot_pending,
                  exp->exp_client_uuid.uuid, exp, fed->fed_pending);
+        /* fo_tot_pending is handled in filter_grant_commit as bulk finishes */
         LASSERTF(filter->fo_tot_dirty >= fed->fed_dirty,
                  "%s: tot_dirty "LPU64" cli %s/%p fed_dirty %ld\n",
                  obd->obd_name, filter->fo_tot_dirty,
@@ -1855,6 +1964,8 @@ static int filter_destroy_export(struct obd_export *exp)
 
         if (exp->exp_obd->obd_replayable)
                 filter_client_free(exp);
+        else
+                fsfilt_sync(exp->exp_obd, exp->exp_obd->u.obt.obt_sb);
 
         filter_grant_discard(exp);
 
@@ -1949,6 +2060,45 @@ static int filter_getattr(struct obd_export *exp, struct obdo *oa,
         RETURN(rc);
 }
 
+/* Store the identity of the file an OST object belongs to in the object's
+ * "fid" EA.
+ *
+ * When @oa carries OBD_MD_FLFID, a struct filter_fid is packed from
+ * oa->o_fid, oa->o_stripe_idx, oa->o_generation, oa->o_id and the object
+ * group (oa->o_gr when OBD_MD_FLGROUP is set, otherwise 0), every field is
+ * converted to little-endian, and the result is written to @inode with
+ * fsfilt_set_md(), to which @handle is passed through.  Without
+ * OBD_MD_FLFID nothing is written and 0 is returned.
+ *
+ * Returns 0 on success or the error returned by fsfilt_set_md().
+ *
+ * This should be enabled/disabled depending on whether large inodes
+ * (fast EAs) are enabled/disabled in the backing store FS. */
+int filter_update_fidea(struct obd_export *exp, struct inode *inode,
+                        void *handle, struct obdo *oa)
+{
+        struct obd_device *obd = exp->exp_obd;
+        int rc = 0;
+        ENTRY;
+
+        if (oa->o_valid & OBD_MD_FLFID) {
+                struct filter_fid ff;
+                obd_gr group = 0;
+
+                if (oa->o_valid & OBD_MD_FLGROUP)
+                        group = oa->o_gr;
+
+                /* packing fid and converting it to LE for storing into EA.
+                 * Here ->o_stripe_idx should be filled by LOV and rest of
+                 * fields - by client. */
+                ff.ff_fid.id = cpu_to_le64(oa->o_fid);
+                ff.ff_fid.f_type = cpu_to_le32(oa->o_stripe_idx);
+                ff.ff_fid.generation = cpu_to_le32(oa->o_generation);
+                ff.ff_objid = cpu_to_le64(oa->o_id);
+                ff.ff_group = cpu_to_le64(group);
+
+                /* every field is separated by '/' so that the generation and
+                 * the object id do not run together in the log */
+                CDEBUG(D_INODE, "storing filter fid EA ("LPU64"/%u/%u/"
+                       LPU64"/"LPU64")\n", oa->o_fid, oa->o_stripe_idx,
+                       oa->o_generation, oa->o_id, group);
+
+                rc = fsfilt_set_md(obd, inode, handle, &ff, sizeof(ff), "fid");
+                if (rc)
+                        CERROR("store fid in object failed! rc: %d\n", rc);
+        } else {
+                /* nothing to store; this is not treated as an error */
+                CDEBUG(D_HA, "OSS object without fid info!\n");
+        }
+
+        RETURN(rc);
+}
+
 /* this is called from filter_truncate() until we have filter_punch() */
 int filter_setattr_internal(struct obd_export *exp, struct dentry *dentry,
                             struct obdo *oa, struct obd_trans_info *oti)
@@ -1956,17 +2106,22 @@ int filter_setattr_internal(struct obd_export *exp, struct dentry *dentry,
         unsigned int orig_ids[MAXQUOTAS] = {0, 0};
         struct llog_cookie *fcc = NULL;
         struct filter_obd *filter;
+        int rc, err, locked = 0;
+        unsigned int ia_valid;
+        struct inode *inode;
         struct iattr iattr;
         void *handle;
-        int rc, err;
         ENTRY;
 
         LASSERT(dentry != NULL);
         LASSERT(!IS_ERR(dentry));
-        LASSERT(dentry->d_inode != NULL);
+
+        inode = dentry->d_inode;
+        LASSERT(inode != NULL);
 
         filter = &exp->exp_obd->u.filter;
         iattr_from_obdo(&iattr, oa, oa->o_valid);
+        ia_valid = iattr.ia_valid;
 
         if (oa->o_valid & OBD_MD_FLCOOKIE) {
                 OBD_ALLOC(fcc, sizeof(*fcc));
@@ -1974,16 +2129,48 @@ int filter_setattr_internal(struct obd_export *exp, struct dentry *dentry,
                         memcpy(fcc, obdo_logcookie(oa), sizeof(*fcc));
         }
 
-        if (iattr.ia_valid & ATTR_SIZE)
-                down(&dentry->d_inode->i_sem);
+        if (ia_valid & ATTR_SIZE || ia_valid & (ATTR_UID | ATTR_GID)) {
+                down(&inode->i_sem);
+                locked = 1;
+        }
+
+        /* If the inode still has SUID+SGID bits set (see filter_precreate())
+         * then we will accept the UID+GID sent by the client during write for
+         * initializing the ownership of this inode.  We only allow this to
+         * happen once so clear these bits in setattr. In 2.6 kernels it is
+         * possible to get ATTR_UID and ATTR_GID separately, so we only clear
+         * the flags that are actually being set. */
+        if (ia_valid & (ATTR_UID | ATTR_GID)) {
+                CDEBUG(D_INODE, "update UID/GID to %lu/%lu\n",
+                       (unsigned long)oa->o_uid, (unsigned long)oa->o_gid);
+
+                if ((inode->i_mode & S_ISUID) && (ia_valid & ATTR_UID)) {
+                        if (!(ia_valid & ATTR_MODE)) {
+                                iattr.ia_mode = inode->i_mode;
+                                iattr.ia_valid |= ATTR_MODE;
+                        }
+                        iattr.ia_mode &= ~S_ISUID;
+                }
+                if ((inode->i_mode & S_ISGID) && (ia_valid & ATTR_GID)) {
+                        if (!(iattr.ia_valid & ATTR_MODE)) {
+                                iattr.ia_mode = inode->i_mode;
+                                iattr.ia_valid |= ATTR_MODE;
+                        }
+                        iattr.ia_mode &= ~S_ISGID;
+                }
 
-        if (iattr.ia_valid & (ATTR_UID | ATTR_GID)) {
-                orig_ids[USRQUOTA] = dentry->d_inode->i_uid;
-                orig_ids[GRPQUOTA] = dentry->d_inode->i_gid;
-                handle = fsfilt_start_log(exp->exp_obd, dentry->d_inode,
+                orig_ids[USRQUOTA] = inode->i_uid;
+                orig_ids[GRPQUOTA] = inode->i_gid;
+                handle = fsfilt_start_log(exp->exp_obd, inode,
                                           FSFILT_OP_SETATTR, oti, 1);
+
+                /* update inode EA only once when inode is suid bit marked. As
+                 * on 2.6.x UID and GID may be set separately, we check here
+                 * only one of them to avoid double setting. */
+                if (inode->i_mode & S_ISUID)
+                        filter_update_fidea(exp, inode, handle, oa);
         } else {
-                handle = fsfilt_start(exp->exp_obd, dentry->d_inode,
+                handle = fsfilt_start(exp->exp_obd, inode,
                                       FSFILT_OP_SETATTR, oti);
         }
 
@@ -1991,9 +2178,8 @@ int filter_setattr_internal(struct obd_export *exp, struct dentry *dentry,
                 GOTO(out_unlock, rc = PTR_ERR(handle));
 
         if (oa->o_valid & OBD_MD_FLFLAGS) {
-                rc = fsfilt_iocontrol(exp->exp_obd, dentry->d_inode,
-                                      NULL, EXT3_IOC_SETFLAGS,
-                                      (long)&iattr.ia_attr_flags);
+                rc = fsfilt_iocontrol(exp->exp_obd, inode, NULL,
+                                      EXT3_IOC_SETFLAGS, (long)&oa->o_flags);
         } else {
                 rc = fsfilt_setattr(exp->exp_obd, dentry, handle, &iattr, 1);
                 if (fcc != NULL)
@@ -2004,9 +2190,14 @@ int filter_setattr_internal(struct obd_export *exp, struct dentry *dentry,
                                               fcc);
         }
 
+        if (locked) {
+                up(&inode->i_sem);
+                locked = 0;
+        }
+
         rc = filter_finish_transno(exp, oti, rc);
-        
-        err = fsfilt_commit(exp->exp_obd, dentry->d_inode, handle, 0);
+
+        err = fsfilt_commit(exp->exp_obd, inode, handle, 0);
         if (err) {
                 CERROR("error on commit, err = %d\n", err);
                 if (!rc)
@@ -2014,14 +2205,14 @@ int filter_setattr_internal(struct obd_export *exp, struct dentry *dentry,
         }
         EXIT;
 out_unlock:
-        if (iattr.ia_valid & ATTR_SIZE)
-                up(&dentry->d_inode->i_sem);
+        if (locked)
+                up(&inode->i_sem);
 
         /* trigger quota release */
-        if (iattr.ia_valid & (ATTR_SIZE | ATTR_UID | ATTR_GID)) {
+        if (ia_valid & (ATTR_SIZE | ATTR_UID | ATTR_GID)) {
                 unsigned int cur_ids[MAXQUOTAS] = {oa->o_uid, oa->o_gid};
-                int rc2= lquota_adjust(quota_interface, exp->exp_obd, cur_ids,
-                                       orig_ids, rc, FSFILT_OP_SETATTR);
+                int rc2 = lquota_adjust(quota_interface, exp->exp_obd, cur_ids,
+                                        orig_ids, rc, FSFILT_OP_SETATTR);
                 CDEBUG(rc2 ? D_ERROR : D_QUOTA, 
                        "filter adjust qunit. (rc:%d)\n", rc2);
         }
@@ -2041,16 +2232,13 @@ int filter_setattr(struct obd_export *exp, struct obdo *oa,
         int rc;
         ENTRY;
 
-        LASSERT(oti != NULL);
+        dentry = __filter_oa2dentry(exp->exp_obd, oa,
+                                    __FUNCTION__, 1);
+        if (IS_ERR(dentry))
+                RETURN(PTR_ERR(dentry));
 
         filter = &exp->exp_obd->u.filter;
         push_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL);
-    
-        /* make sure that object is allocated. */
-        dentry = filter_crow_object(exp->exp_obd, oa);
-        if (IS_ERR(dentry))
-                GOTO(out_pop, rc = PTR_ERR(dentry));
-
         lock_kernel();
 
         /* setting objects attributes (including owner/group) */
@@ -2060,7 +2248,7 @@ int filter_setattr(struct obd_export *exp, struct obdo *oa,
 
         res = ldlm_resource_get(exp->exp_obd->obd_namespace, NULL,
                                 res_id, LDLM_EXTENT, 0);
-        
+
         if (res != NULL) {
                 ns_lvbo = res->lr_namespace->ns_lvbo;
                 if (ns_lvbo && ns_lvbo->lvbo_update)
@@ -2069,7 +2257,7 @@ int filter_setattr(struct obd_export *exp, struct obdo *oa,
         }
 
         oa->o_valid = OBD_MD_FLID;
-        
+
         /* Quota release need uid/gid info */
         obdo_from_inode(oa, dentry->d_inode,
                         FILTER_VALID_FLAGS | OBD_MD_FLUID | OBD_MD_FLGID);
@@ -2078,7 +2266,6 @@ int filter_setattr(struct obd_export *exp, struct obdo *oa,
 out_unlock:
         unlock_kernel();
         f_dput(dentry);
-out_pop:
         pop_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL);
         return rc;
 }
@@ -2133,6 +2320,96 @@ static int filter_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp,
         RETURN(lsm_size);
 }
 
+/* Remove orphaned precreated objects: every object in the group of @oa whose
+ * id lies above oa->o_id, up to and including filter_last_id(), is destroyed
+ * and the in-memory last object id of that group is then reset to oa->o_id.
+ *
+ * fo_destroy_in_progress is raised before fo_create_lock is taken and is
+ * checked again once the lock is held; if it has been cleared in the
+ * meantime an error is logged and nothing is destroyed. */
+static void filter_destroy_precreated(struct obd_export *exp, struct obdo *oa,
+                                      struct filter_obd *filter)
+{
+        struct obdo victim; /* XXX obdo on stack */
+        __u64 last_id, cur;
+        ENTRY;
+        LASSERT(oa);
+
+        /* zeroing also leaves o_gr == 0 when no group was supplied */
+        memset(&victim, 0, sizeof(victim));
+        victim.o_mode = S_IFREG;
+        if (oa->o_valid & OBD_MD_FLGROUP) {
+                victim.o_valid |= OBD_MD_FLGROUP;
+                victim.o_gr = oa->o_gr;
+        }
+
+        filter->fo_destroy_in_progress = 1;
+        down(&filter->fo_create_lock);
+
+        if (filter->fo_destroy_in_progress) {
+                last_id = filter_last_id(filter, &victim);
+                CWARN("%s: deleting orphan objects from "LPU64" to "LPU64"\n",
+                      exp->exp_obd->obd_name, oa->o_id + 1, last_id);
+
+                cur = oa->o_id + 1;
+                while (cur <= last_id) {
+                        victim.o_id = cur;
+                        filter_destroy(exp, &victim, NULL, NULL, NULL);
+                        cur++;
+                }
+
+                CDEBUG(D_HA,
+                       "%s: after destroy: set last_objids["LPU64"] = "LPU64"\n",
+                       exp->exp_obd->obd_name, victim.o_gr, oa->o_id);
+
+                spin_lock(&filter->fo_objidlock);
+                filter->fo_last_objids[victim.o_gr] = oa->o_id;
+                spin_unlock(&filter->fo_objidlock);
+
+                filter->fo_destroy_in_progress = 0;
+        } else {
+                CERROR("%s: destroy_in_progress already cleared\n",
+                       exp->exp_obd->obd_name);
+        }
+
+        up(&filter->fo_create_lock);
+        EXIT;
+}
+
+/* Work out how many objects have to be precreated for the request in @oa.
+ *
+ * diff is oa->o_id minus the current filter_last_id() for @oa.
+ *
+ * Delete-orphans request (OBD_MD_FLFLAGS valid and OBD_FL_DELORPHAN set):
+ *  - diff >= 0: nothing to delete, diff is returned;
+ *  - -diff > OST_MAX_PRECREATE: the request is rejected with -EINVAL;
+ *  - otherwise the objects above oa->o_id are destroyed, the last object id
+ *    is written out with filter_update_last_objid() and 0 is returned (a
+ *    failure of that write is only logged).
+ *
+ * Any other request: 1 is returned when group != 0 or oa->o_id == 0 (see the
+ * review note in the body), otherwise diff, which is asserted to be >= 0.
+ *
+ * returns a negative error or a nonnegative number of files to create */
+static int filter_should_precreate(struct obd_export *exp, struct obdo *oa,
+                                   obd_gr group)
+{
+        struct obd_device *obd = exp->exp_obd;
+        struct filter_obd *filter = &obd->u.filter;
+        int diff, rc;
+        ENTRY;
+
+        diff = oa->o_id - filter_last_id(filter, oa);
+        CDEBUG(D_INFO, "filter_last_id() = "LPU64" -> diff = %d\n",
+               filter_last_id(filter, oa), diff);
+
+        /* delete orphans request */
+        if ((oa->o_valid & OBD_MD_FLFLAGS) &&
+            (oa->o_flags & OBD_FL_DELORPHAN)) {
+                if (diff >= 0)
+                        RETURN(diff);
+                if (-diff > OST_MAX_PRECREATE) {
+                        CERROR("%s: ignoring bogus orphan destroy request: "
+                               "obdid "LPU64" last_id "LPU64"\n", obd->obd_name,
+                               oa->o_id, filter_last_id(filter, oa));
+                        RETURN(-EINVAL);
+                }
+                filter_destroy_precreated(exp, oa, filter);
+                /* the orphans are already gone, so a failed write of the
+                 * last object id is reported but does not fail the request */
+                rc = filter_update_last_objid(obd, group, 0);
+                if (rc)
+                        CERROR("%s: unable to write lastobjid, but orphans "
+                               "were deleted\n", obd->obd_name);
+                RETURN(0);
+        } else {
+                /* only precreate if group == 0 and o_id is specified
+                 *
+                 * NOTE(review): OBD_FL_DELORPHAN is tested against o_valid
+                 * here although it is tested against o_flags above - this
+                 * looks like it was meant to be o_flags; confirm before
+                 * changing. */
+                if (!(oa->o_valid & OBD_FL_DELORPHAN) &&
+                    (group != 0 || oa->o_id == 0))
+                        RETURN(1);
+
+                LASSERTF(diff >= 0,"%s: "LPU64" - "LPU64" = %d\n",obd->obd_name,
+                         oa->o_id, filter_last_id(filter, oa), diff);
+                RETURN(diff);
+        }
+}
+
 static int filter_statfs(struct obd_device *obd, struct obd_statfs *osfs,
                          unsigned long max_age)
 {
@@ -2157,9 +2434,9 @@ static int filter_statfs(struct obd_device *obd, struct obd_statfs *osfs,
 
         filter_grant_sanity_check(obd, __FUNCTION__);
 
-        osfs->os_bavail -= min(osfs->os_bavail,
-                               (filter->fo_tot_dirty + filter->fo_tot_pending +
-                                osfs->os_bsize - 1) >> blockbits);
+        osfs->os_bavail -= min(osfs->os_bavail, GRANT_FOR_LLOG(obd) +
+                               ((filter->fo_tot_dirty + filter->fo_tot_pending +
+                                 osfs->os_bsize - 1) >> blockbits));
 
         /* set EROFS to state field if FS is mounted as RDONLY. The goal is to
          * stop creating files on MDS if OST is not good shape to create
@@ -2169,187 +2446,243 @@ static int filter_statfs(struct obd_device *obd, struct obd_statfs *osfs,
         RETURN(rc);
 }
 
-struct dentry *
-filter_create_object(struct obd_device *obd, struct obdo *oa)
+/* We rely on the fact that only one thread will be creating files in a given
+ * group at a time, which is why we don't need an atomic filter_get_new_id.
+ * Even if we had that atomic function, the following race would exist:
+ *
+ * thread 1: gets id x from filter_next_id
+ * thread 2: gets id (x + 1) from filter_next_id
+ * thread 2: creates object (x + 1)
+ * thread 1: tries to create object x, gets -ENOSPC
+ */
+static int filter_precreate(struct obd_device *obd, struct obdo *oa,
+                            obd_gr group, int *num)
 {
-        struct dentry *dparent = NULL;
-        struct dentry *dchild = NULL;
-        struct lvfs_ucred uc = {0,};
-        struct lvfs_run_ctxt saved;
+        struct dentry *dchild = NULL, *dparent = NULL;
         struct filter_obd *filter;
-        int cleanup_phase = 0;
-        int err = 0, rc = 0;
+        struct obd_statfs *osfs;
+        int err = 0, rc = 0, recreate_obj = 0, i;
+        unsigned long enough_time = jiffies + (obd_timeout * HZ) / 4;
+        __u64 next_id;
         void *handle = NULL;
-        obd_gr group = 0;
         ENTRY;
 
         filter = &obd->u.filter;
 
-        CDEBUG(D_INFO, "create objid "LPU64"\n", oa->o_id);
-
-        if (oa->o_valid & OBD_MD_FLGROUP)
-                group = oa->o_gr;
-
-        dparent = filter_parent_lock(obd, group, oa->o_id);
-        if (IS_ERR(dparent))
-                GOTO(cleanup, dchild = dparent);
-        cleanup_phase = 1;
-
-        /* check if object is in blacklist. This should be done under parent
-         * lock. */
-        spin_lock(&filter->fo_blacklist_lock);
-        if (oa->o_id > filter->fo_blacklist[group].fe_start &&
-            oa->o_id <= filter->fo_blacklist[group].fe_end) {
-                spin_unlock(&filter->fo_blacklist_lock);
-                GOTO(cleanup, dchild = ERR_PTR(-ENOENT));
+        if ((oa->o_valid & OBD_MD_FLFLAGS) &&
+            (oa->o_flags & OBD_FL_RECREATE_OBJS)) {
+                recreate_obj = 1;
+        } else {
+                OBD_ALLOC(osfs, sizeof(*osfs));
+                if (osfs == NULL)
+                        RETURN(-ENOMEM);
+                rc = filter_statfs(obd, osfs, jiffies - HZ);
+                if (rc == 0 && osfs->os_bavail < (osfs->os_blocks >> 10)) {
+                        CDEBUG(D_HA, "OST out of space! avail "LPU64"\n",
+                               osfs->os_bavail <<
+                                       filter->fo_obt.obt_sb->s_blocksize_bits);
+                        *num = 0;
+                        rc = -ENOSPC;
+                }
+                OBD_FREE(osfs, sizeof(*osfs));
+                if (rc) {
+                        RETURN(rc);
+                }
         }
-        spin_unlock(&filter->fo_blacklist_lock);
 
-        /* check if object is already allocated */
-        dchild = filter_fid2dentry(obd, dparent, 
-                                  group, oa->o_id);
-        if (IS_ERR(dchild))
-                GOTO(cleanup, dchild);
+        CDEBUG(D_HA, "%s: precreating %d objects in group "LPU64" at "LPU64"\n",
+               obd->obd_name, *num, group, oa->o_id);
 
-        if (dchild->d_inode)
-                GOTO(cleanup, dchild);
+        down(&filter->fo_create_lock);
 
-        /* create new object */
-        handle = fsfilt_start_log(obd, dparent->d_inode,
-                                  FSFILT_OP_CREATE, NULL, 1);
-        if (IS_ERR(handle))
-                GOTO(cleanup, dchild = handle);
-        cleanup_phase = 2;
+        for (i = 0; i < *num && err == 0; i++) {
+                int cleanup_phase = 0;
 
-        uc.luc_fsuid = oa->o_valid & OBD_MD_FLUID ?
-                oa->o_uid : 0;
-        uc.luc_fsgid = oa->o_valid & OBD_MD_FLGID ?
-                oa->o_gid : 0;
-        uc.luc_cap = current->cap_effective;
-
-        cap_raise(uc.luc_cap, CAP_SYS_RESOURCE);
+                if (filter->fo_destroy_in_progress) {
+                        CWARN("%s: precreate aborted by destroy\n",
+                              obd->obd_name);
+                        break;
+                }
 
-        push_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
-        rc = ll_vfs_create(dparent->d_inode, dchild, S_IFREG, NULL);
-        pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
+                if (recreate_obj) {
+                        __u64 last_id;
+                        next_id = oa->o_id;
+                        last_id = filter_last_id(filter, oa);
+                        if (next_id > last_id) {
+                                CERROR("Error: Trying to recreate obj greater"
+                                       "than last id "LPD64" > "LPD64"\n",
+                                       next_id, last_id);
+                                GOTO(cleanup, rc = -EINVAL);
+                        }
+                } else
+                        next_id = filter_last_id(filter, oa) + 1;
+
+                CDEBUG(D_INFO, "precreate objid "LPU64"\n", next_id);
+
+                dparent = filter_parent_lock(obd, group, next_id);
+                if (IS_ERR(dparent))
+                        GOTO(cleanup, rc = PTR_ERR(dparent));
+                cleanup_phase = 1;
+
+                dchild = filter_fid2dentry(obd, dparent, group, next_id);
+                if (IS_ERR(dchild))
+                        GOTO(cleanup, rc = PTR_ERR(dchild));
+                cleanup_phase = 2;
+
+                if (dchild->d_inode != NULL) {
+                        /* This would only happen if lastobjid was bad on disk*/
+                        /* Could also happen if recreating missing obj but
+                         * already exists
+                         */
+                        if (recreate_obj) {
+                                CERROR("%s: recreating existing object %.*s?\n",
+                                       obd->obd_name, dchild->d_name.len,
+                                       dchild->d_name.name);
+                        } else {
+                                CERROR("%s: Serious error: objid %.*s already "
+                                       "exists; is this filesystem corrupt?\n",
+                                       obd->obd_name, dchild->d_name.len,
+                                       dchild->d_name.name);
+                                LBUG();
+                        }
+                        GOTO(cleanup, rc = -EEXIST);
+                }
 
-        if (rc) {
-                CERROR("create failed rc = %d\n", rc);
-                f_dput(dchild);
-                GOTO(cleanup, dchild = ERR_PTR(rc));
-        }
+                handle = fsfilt_start_log(obd, dparent->d_inode,
+                                          FSFILT_OP_CREATE, NULL, 1);
+                if (IS_ERR(handle))
+                        GOTO(cleanup, rc = PTR_ERR(handle));
+                cleanup_phase = 3;
+
+                /* We mark object SUID+SGID to flag it for accepting UID+GID
+                 * from client on first write.  Currently the permission bits
+                 * on the OST are never used, so this is OK. */
+                rc = ll_vfs_create(dparent->d_inode, dchild,
+                                   S_IFREG |  S_ISUID | S_ISGID | 0666, NULL);
+                if (rc) {
+                        CERROR("create failed rc = %d\n", rc);
+                        GOTO(cleanup, rc);
+                }
 
-        /* grow last created object id. */
-        filter_grow_last_id(filter, group, oa->o_id);
-        rc = filter_update_last_objid(obd, group, 0);
-        if (rc) {
-                CERROR("unable to write lastobjid, but "
-                       "object is created, err = %d\n",
-                       rc);
-                rc = 0;
-        }
+                if (!recreate_obj) {
+                        filter_set_last_id(filter, oa, next_id);
+                        err = filter_update_last_objid(obd, group, 0);
+                        if (err)
+                                CERROR("unable to write lastobjid "
+                                       "but file created\n");
+                }
 
-        /* nobody else is touching this newly created object */
-        LASSERT(dchild->d_inode);
-        
-        if (oa->o_valid & OBD_MD_FLFID) {
-                struct ll_fid fid;
-
-                /* packing fid and converting it to LE for storing into EA. Here
-                 * oa->o_stripe_idx should be filled by LOV and rest of fields -
-                 * by client. */
-                fid.id = cpu_to_le64(oa->o_fid);
-                fid.f_type = cpu_to_le32(oa->o_stripe_idx);
-                fid.generation = cpu_to_le32(oa->o_generation);
-
-                down(&dchild->d_inode->i_sem);
-                rc = fsfilt_set_md(obd, dchild->d_inode, handle,
-                                   &fid, sizeof(struct ll_fid));
-                up(&dchild->d_inode->i_sem);
-                if (rc) {
-                        CERROR("store fid in object failed! rc:%d\n", rc);
+        cleanup:
+                switch(cleanup_phase) {
+                case 3:
+                        err = fsfilt_commit(obd, dparent->d_inode, handle, 0);
+                        if (err) {
+                                CERROR("error on commit, err = %d\n", err);
+                                if (!rc)
+                                        rc = err;
+                        }
+                case 2:
                         f_dput(dchild);
-                        GOTO(cleanup, dchild = ERR_PTR(rc));
+                case 1:
+                        filter_parent_unlock(dparent);
+                case 0:
+                        break;
                 }
-        } else {
-                CDEBUG(D_HA, "create OSS object without fid!\n");
-        }
 
-cleanup:
-        switch(cleanup_phase) {
-        case 2:
-                err = fsfilt_commit(obd, dparent->d_inode, handle, 0);
-                if (err) {
-                        CERROR("error on commit, err = %d\n", err);
-                        if (!rc) {
-                                rc = err;
-                                f_dput(dchild);
-                                dchild = ERR_PTR(rc);
-                        }
+                if (rc)
+                        break;
+                if (time_after(jiffies, enough_time)) {
+                        CDEBUG(D_HA, "%s: precreate slow - want %d got %d \n",
+                               obd->obd_name, *num, i);
+                        break;
                 }
-        case 1:
-                filter_parent_unlock(obd, dparent);
-        case 0:
-                break;
         }
+        *num = i;
 
-        RETURN(dchild);
+        up(&filter->fo_create_lock);
+
+        CDEBUG(D_HA, "%s: created %d objects for group "LPU64": "LPU64"\n",
+               obd->obd_name, i, group, filter->fo_last_objids[group]);
+
+        RETURN(rc);
 }
 
-struct dentry *
-filter_crow_object(struct obd_device *obd, struct obdo *oa)
+static int filter_create(struct obd_export *exp, struct obdo *oa,
+                         struct lov_stripe_md **ea, struct obd_trans_info *oti)
 {
-        struct filter_obd *filter;
-        struct dentry *dentry;
+        struct obd_device *obd = NULL;
+        struct lvfs_run_ctxt saved;
+        struct lov_stripe_md *lsm = NULL;
         obd_gr group = 0;
+        int rc = 0, diff;
         ENTRY;
 
-        if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_OST_CROW_EIO))
-                RETURN(ERR_PTR(-EIO));
-        
-        filter = &obd->u.filter;
-
         if (oa->o_valid & OBD_MD_FLGROUP)
                 group = oa->o_gr;
 
-        /* try to create new object (if it is not yet) */
-        dentry = filter_create_object(obd, oa);
-        if (IS_ERR(dentry)) {
-                CERROR("cannot create OSS object "LPU64"/"LPU64
-                       ", err = %d\n", oa->o_id, group,
-                       (int)PTR_ERR(dentry));
-                RETURN(dentry);
+        CDEBUG(D_INFO, "filter_create(od->o_gr="LPU64",od->o_id="LPU64")\n",
+               group, oa->o_id);
+        if (ea != NULL) {
+                lsm = *ea;
+                if (lsm == NULL) {
+                        rc = obd_alloc_memmd(exp, &lsm);
+                        if (rc < 0)
+                                RETURN(rc);
+                }
+        }
+
+        obd = exp->exp_obd;
+        push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+
+        if ((oa->o_valid & OBD_MD_FLFLAGS) &&
+            (oa->o_flags & OBD_FL_RECREATE_OBJS)) {
+                if (oa->o_id > filter_last_id(&obd->u.filter, oa)) {
+                        CERROR("recreate objid "LPU64" > last id "LPU64"\n",
+                               oa->o_id, filter_last_id(&obd->u.filter, oa));
+                        rc = -EINVAL;
+                } else {
+                        diff = 1;
+                        rc = filter_precreate(obd, oa, group, &diff);
+                }
+        } else {
+                diff = filter_should_precreate(exp, oa, group);
+                if (diff > 0) {
+                        oa->o_id = filter_last_id(&obd->u.filter, oa);
+                        rc = filter_precreate(obd, oa, group, &diff);
+                        oa->o_id = filter_last_id(&obd->u.filter, oa);
+                        oa->o_valid = OBD_MD_FLID;
+                }
+        }
+
+        pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        if (rc && ea != NULL && *ea != lsm) {
+                obd_free_memmd(exp, &lsm);
+        } else if (rc == 0 && ea != NULL) {
+                /* XXX LOV STACKING: the lsm that is passed to us from
+                 * LOV does not have valid lsm_oinfo data structs, so
+                 * don't go touching that.  This needs to be fixed in a
+                 * big way. */
+                lsm->lsm_object_id = oa->o_id;
+                *ea = lsm;
         }
 
-        RETURN(dentry);
+        RETURN(rc);
 }
 
-/* destroys object @oa. Takes care of locking if @lock says that parent is not
- * yet locked. Also drops parent lock before taking ldlm PW lock to avoid
- * deadlocks in lock retraction related paths.
- *
- * This function does not change locking and does not imply hiden locking
- * knowladge. After this fucntion is finished, all parents stay at the same
- * locking state.
-
- * If @lock == 1, this means that parent of @oa is not locked and should be
- * locked for destroy operation. However, after operation is finished, parent
- * will be unlocked. The same is true about opposite case, when parent is
- * already locked and filter_destroy_internal() does not need to lock it. */
-static int
-filter_destroy_internal(struct obd_export *exp, struct obdo *oa,
-                        struct lov_stripe_md *md, struct obd_trans_info *oti,
-                        int lock)
+int filter_destroy(struct obd_export *exp, struct obdo *oa,
+                   struct lov_stripe_md *md, struct obd_trans_info *oti,
+                   struct obd_export *md_exp)
 {
+        unsigned int qcids[MAXQUOTAS] = {0, 0};
         struct obd_device *obd;
         struct filter_obd *filter;
-        struct dentry *dchild = NULL, *dparent = NULL;
+        struct dentry *dchild = NULL, *dparent;
         struct lvfs_run_ctxt saved;
         void *handle = NULL;
         struct llog_cookie *fcc = NULL;
-        int rc, rc2, cleanup_phase = 0, have_prepared = 0;
-        unsigned int qcids[MAXQUOTAS] = {0, 0};
+        int rc, rc2, cleanup_phase = 0;
         obd_gr group = 0;
+        struct iattr iattr;
         ENTRY;
 
         if (oa->o_valid & OBD_MD_FLGROUP)
@@ -2359,16 +2692,9 @@ filter_destroy_internal(struct obd_export *exp, struct obdo *oa,
         filter = &obd->u.filter;
 
         push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
-
- acquire_locks:
-        dparent = lock ?
-                filter_parent_lock(obd, group, oa->o_id):
-                filter_parent(obd, group, oa->o_id);
-        if (IS_ERR(dparent))
-                GOTO(cleanup, rc = PTR_ERR(dparent));
         cleanup_phase = 1;
 
-        dchild = filter_fid2dentry(obd, dparent, group, oa->o_id);
+        dchild = filter_fid2dentry(obd, NULL, group, oa->o_id);
         if (IS_ERR(dchild))
                 GOTO(cleanup, rc = PTR_ERR(dchild));
         cleanup_phase = 2;
@@ -2385,37 +2711,7 @@ filter_destroy_internal(struct obd_export *exp, struct obdo *oa,
                 GOTO(cleanup, rc = -ENOENT);
         }
 
-        if (!have_prepared) {
-                /* If we're really going to destroy the object, get ready by
-                 * getting the clients to discard their cached data.
-                 *
-                 * We have to drop the parent lock, because
-                 * filter_prepare_destroy() will acquire a PW on the object, and
-                 * we don't want to deadlock with an incoming write to the
-                 * object, which has the extent PW and then wants to get the
-                 * parent dentry to do the lookup.
-                 *
-                 * We dput the child because it's not worth the extra
-                 * complication of condition the above code to skip it on the
-                 * second time through. */
-                f_dput(dchild);
-
-                filter_unlock_dentry(obd, dparent);
-                filter_prepare_destroy(obd, oa->o_id);
-
-                /* lock parent dentry again, to keep locking state the same as
-                 * before calling this function. */
-                if (!lock)
-                        filter_lock_dentry(obd, dparent);
-
-                have_prepared = 1;
-                goto acquire_locks;
-        }
-
-        handle = fsfilt_start_log(obd, dparent->d_inode,FSFILT_OP_UNLINK,oti,1);
-        if (IS_ERR(handle))
-                GOTO(cleanup, rc = PTR_ERR(handle));
-        cleanup_phase = 3;
+        filter_prepare_destroy(obd, oa->o_id);
 
         /* Our MDC connection is established by the MDS to us */
         if (oa->o_valid & OBD_MD_FLCOOKIE) {
@@ -2424,13 +2720,58 @@ filter_destroy_internal(struct obd_export *exp, struct obdo *oa,
                         memcpy(fcc, obdo_logcookie(oa), sizeof(*fcc));
         }
 
+        /* we're gonna truncate it first in order to avoid possible deadlock:
+         *      P1                      P2
+         * open transaction     open transaction
+         * down(i_zombie)       down(i_zombie)
+         *                      restart transaction
+         * (see BUG 4180) -bzzz
+         */
+        down(&dchild->d_inode->i_sem);
+        handle = fsfilt_start_log(obd, dchild->d_inode, FSFILT_OP_SETATTR,
+                                  NULL, 1);
+        if (IS_ERR(handle)) {
+                up(&dchild->d_inode->i_sem);
+                GOTO(cleanup, rc = PTR_ERR(handle));
+        }
+
+        iattr.ia_valid = ATTR_SIZE;
+        iattr.ia_size = 0;
+        rc = fsfilt_setattr(obd, dchild, handle, &iattr, 1);
+        rc2 = fsfilt_commit(obd, dchild->d_inode, handle, 0);
+        up(&dchild->d_inode->i_sem);
+        if (rc)
+                GOTO(cleanup, rc);
+        if (rc2)
+                GOTO(cleanup, rc = rc2);
+
+        /* We don't actually need to lock the parent until we are unlinking
+         * here, and not while truncating above.  That avoids holding the
+         * parent lock for a long time during truncate, which can block other
+         * threads from doing anything to objects in that directory. bug 7171 */
+        dparent = filter_parent_lock(obd, group, oa->o_id);
+        if (IS_ERR(dparent))
+                GOTO(cleanup, rc = PTR_ERR(dparent));
+        cleanup_phase = 3; /* filter_parent_unlock */
+
+        down(&dchild->d_inode->i_sem);
+        handle = fsfilt_start_log(obd, dparent->d_inode,FSFILT_OP_UNLINK,oti,1);
+        if (IS_ERR(handle)) {
+                up(&dchild->d_inode->i_sem);
+                GOTO(cleanup, rc = PTR_ERR(handle));
+        }
+        cleanup_phase = 4; /* fsfilt_commit */
+
         /* Quota release need uid/gid of inode */
         obdo_from_inode(oa, dchild->d_inode, OBD_MD_FLUID|OBD_MD_FLGID);
-        rc = filter_unlink(obd, oa->o_id, dparent, dchild);
 
+        /* this drops dchild->d_inode->i_sem unconditionally */
+        rc = filter_destroy_internal(obd, oa->o_id, dparent, dchild);
+
+        EXIT;
 cleanup:
         switch(cleanup_phase) {
-        case 3:
+        case 4:
                 if (fcc != NULL) {
                         fsfilt_add_journal_cb(obd, 0,
                                               oti ? oti->oti_handle : handle,
@@ -2443,12 +2784,11 @@ cleanup:
                         if (!rc)
                                 rc = rc2;
                 }
+        case 3:
+                filter_parent_unlock(dparent);
         case 2:
                 f_dput(dchild);
         case 1:
-                if (lock)
-                        filter_parent_unlock(obd, dparent);
-        case 0:
                 pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
                 break;
         default:
@@ -2463,206 +2803,7 @@ cleanup:
                             FSFILT_OP_UNLINK); 
         CDEBUG(rc2 ? D_ERROR : D_QUOTA, 
                "filter adjust qunit! (rc:%d)\n", rc2);
-
-        RETURN(rc);
-}
-
-/* destroy oject with taking lock on parent first. */
-int filter_destroy(struct obd_export *exp, struct obdo *oa,
-                   struct lov_stripe_md *md, struct obd_trans_info *oti)
-{
-        int rc;
-
-        ENTRY;
-        rc = filter_destroy_internal(exp, oa, md, oti, 1);
-        RETURN(rc);
-}
-
-static int
-filter_clear_orphans(struct obd_export *exp, struct obdo *oa)
-{
-        struct filter_obd *filter;
-        struct obd_device *obd;
-        struct obdo *doa;
-        obd_gr group = 0;
-        int rc, orphans;
-        __u64 last, id;
-        ENTRY;
-
-        LASSERT(oa);
-
-        OBD_RACE(OBD_FAIL_OST_CLEAR_ORPHANS_RACE);
-
-        obd = exp->exp_obd;
-        filter = &obd->u.filter;
-
-        if (oa->o_valid & OBD_MD_FLGROUP)
-                group = oa->o_gr;
-
-        filter->fo_destroy_in_progress = 1;
-        
-        LOCK_PARENTS(obd, group);
-        if (!filter->fo_destroy_in_progress) {
-                UNLOCK_PARENTS(obd, group);
-                CDEBUG(D_HA, "cleanup orphans is already canceled\n");
-                RETURN(0);
-        }
-
-        last = filter_last_id(filter, group);
-        orphans = last - oa->o_id;
-        
-        if (orphans <= 0) {
-                filter->fo_destroy_in_progress = 0;
-                UNLOCK_PARENTS(obd, group);
-                CDEBUG(D_HA, "nothing to cleanup, MDS objid "LPU64
-                       " is not bigger than OST one "LPU64"\n",
-                       oa->o_id, last);
-                RETURN(0);
-        }
-
-        CDEBUG(D_HA, "adding orphans extent "LPU64":"LPU64"-"LPU64
-               " to blacklist\n", group, oa->o_id, last);
-
-        /* making all orphans entries in blacklist, that will deny to re-create
-         * them by CROW in filter_create_object(). This is done for case when
-         * orphans already exist on client and will be tried to write something
-         * and we want to stop them.
-         *
-         * In fact the issue is even worse, as we want to put in blacklist not
-         * only the objects which we just destroed, but also those which not yet
-         * created on OST (and OST has no idea about) but possibly existing on
-         * clients. */
-        spin_lock(&filter->fo_blacklist_lock);
-        filter->fo_blacklist[group].fe_start = oa->o_id;
-        filter->fo_blacklist[group].fe_end = last;
-        spin_unlock(&filter->fo_blacklist_lock);
-        
-       doa = obdo_alloc();
-        if (doa == NULL) {
-                filter->fo_destroy_in_progress = 0;
-                UNLOCK_PARENTS(obd, group);
-                RETURN(-ENOMEM);
-        }
-
-        doa->o_gr = group;
-        doa->o_mode = S_IFREG;
-        doa->o_valid = oa->o_valid & (OBD_MD_FLGROUP | OBD_MD_FLID);
-
-        CDEBUG(D_ERROR, "%s:["LPU64"] deleting orphan objects from "LPU64" to "
-              LPU64"\n", exp->exp_obd->obd_name, doa->o_gr, oa->o_id, last);
-
-        for (id = last; id > oa->o_id; id--) {
-                doa->o_id = id;
-
-                /* remove object @doa. It will not lock parent as parents
-                 * already locked. */
-                filter_destroy_internal(exp, doa, NULL, NULL, 0);
-
-                /* update last id just for case when OST will down in cleanup
-                 * orphans time. */
-                filter_set_last_id(filter, group, id);
-
-                /* update last_id on disk periodicaly */
-                if ((id & 1023) == 0)
-                        filter_update_last_objid(obd, group, 0);
-        }
-
-        UNLOCK_PARENTS(obd, group);
-
-        /* return next free id to be used as a new start of sequence. As we
-         * return last id from OST, this will make sure that MDS will start new
-         * sequence from object id which is far from existing and there will not
-         * be object id sharing. */
-        oa->o_id = last + 1;
-        filter_set_last_id(filter, group, oa->o_id);
-
-        CDEBUG(D_ERROR, "%s:["LPU64"] after destroy: set last_objids = "
-               LPU64"\n", exp->exp_obd->obd_name, doa->o_gr, oa->o_id);
-
-        rc = filter_update_last_objid(obd, group, 1);
-        filter->fo_destroy_in_progress = 0;
-
-        obdo_free(doa);
-        RETURN(rc);
-}
-
-static int filter_create(struct obd_export *exp, struct obdo *oa,
-                         struct lov_stripe_md **ea, struct obd_trans_info *oti)
-{
-        struct filter_export_data *fed;
-        struct lvfs_run_ctxt saved;
-        struct filter_obd *filter;
-        obd_gr group = oa->o_gr;
-        struct obd_device *obd;
-        int rc = 0;
-        ENTRY;
-
-        obd = exp->exp_obd;
-        fed = &exp->exp_filter_data;
-        filter = &obd->u.filter;
-
-        CDEBUG(D_INFO, "filter_create(od->o_gr="LPU64",od->o_id="LPU64")\n",
-               group, oa->o_id);
-
-        if (oa->o_valid & OBD_MD_FLFLAGS && oa->o_flags == OBD_FL_DELORPHAN) {
-                push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
-
-                rc = filter_clear_orphans(exp, oa);
-                if (rc) {
-                        CERROR("cannot clear orphans starting from "
-                               LPU64", err = %d\n", oa->o_id, rc);
-                }
-                pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
-                RETURN(rc);
-        }
-
-        LASSERT(ergo(oa->o_valid & OBD_MD_FLFLAGS,
-                     !!(oa->o_flags & OBD_FL_CREATE_CROW) !=
-                     !!(oa->o_flags & OBD_FL_RECREATE_OBJS)));
-
-        /* echo, llog and other "create asap" cases. */
-        if (OBDO_URGENT_CREATE(oa)) {
-                struct obd_statfs *osfs;
-                struct dentry *dentry;
-                
-                /* check space first. As this is real create and client does not
-                 * have yet file created, this is good place to check space. */
-                OBD_ALLOC_PTR(osfs);
-                if (!osfs)
-                        RETURN(-ENOMEM);
-
-                rc = filter_statfs(obd, osfs, jiffies - HZ);
-                if (rc == 0 && osfs->os_bavail < (osfs->os_blocks >> 10)) {
-                        CDEBUG(D_HA, "OST out of space! avail "LPU64"\n",
-                               osfs->os_bavail << filter->fo_obt.obt_sb->s_blocksize_bits);
-                        rc = -ENOSPC;
-                }
-
-                OBD_FREE_PTR(osfs);
-                if (rc)
-                        RETURN(rc);
-
-                dentry = filter_create_object(obd, oa);
-                if (!IS_ERR(dentry)) {
-                        f_dput(dentry);
-                        if (ea != NULL) {
-                                struct lov_stripe_md *lsm = *ea;
-                                if (lsm == NULL) {
-                                        rc = obd_alloc_memmd(exp, &lsm);
-                                        if (rc)
-                                                RETURN(rc);
-                                }
-                                lsm->lsm_object_id = oa->o_id;
-                                *ea = lsm;
-                                rc = 0;
-                        }
-                }
-        } else {
-                CERROR("wrong @oa flags detected 0x%lx. Not an urgent "
-                       "create and not recovery.\n", (unsigned long)oa->o_flags);
-                LBUG();
-        }
-        RETURN(rc);
+        return rc;
 }
 
 /* NB start and end are used for punch, but not truncate */
@@ -2673,9 +2814,11 @@ static int filter_truncate(struct obd_export *exp, struct obdo *oa,
         int rc;
         ENTRY;
 
-        if (end != OBD_OBJECT_EOF)
+        if (end != OBD_OBJECT_EOF) {
                 CERROR("PUNCH not supported, only truncate: end = "LPX64"\n",
                        end);
+                RETURN(-EFAULT);
+        }
 
         CDEBUG(D_INODE, "calling truncate for object "LPU64", valid = "LPX64
                ", o_size = "LPD64"\n", oa->o_id, oa->o_valid, start);
@@ -2875,15 +3018,19 @@ int filter_iocontrol(unsigned int cmd, struct obd_export *exp,
 
 static int filter_health_check(struct obd_device *obd)
 {
+        struct filter_obd *filter = &obd->u.filter;
         int rc = 0;
 
         /*
          * health_check to return 0 on healthy
          * and 1 on unhealthy.
          */
-        if(obd->u.obt.obt_sb->s_flags & MS_RDONLY)
+        if (obd->u.obt.obt_sb->s_flags & MS_RDONLY)
                 rc = 1;
 
+        LASSERT(filter->fo_health_check_filp != NULL);
+        rc |= !!lvfs_check_io_health(obd, filter->fo_health_check_filp);
+
         return rc;
 }
 
@@ -2905,6 +3052,7 @@ static struct obd_ops filter_obd_ops = {
         .o_precleanup     = filter_precleanup,
         .o_cleanup        = filter_cleanup,
         .o_connect        = filter_connect,
+        .o_reconnect      = filter_reconnect,
         .o_disconnect     = filter_disconnect,
         .o_statfs         = filter_statfs,
         .o_getattr        = filter_getattr,
@@ -2932,6 +3080,7 @@ static struct obd_ops filter_sanobd_ops = {
         .o_precleanup     = filter_precleanup,
         .o_cleanup        = filter_cleanup,
         .o_connect        = filter_connect,
+        .o_reconnect      = filter_reconnect,
         .o_disconnect     = filter_disconnect,
         .o_statfs         = filter_statfs,
         .o_getattr        = filter_getattr,
@@ -2951,7 +3100,7 @@ static struct obd_ops filter_sanobd_ops = {
         .o_iocontrol      = filter_iocontrol,
 };
 
-quota_interface_t *quota_interface = NULL;
+quota_interface_t *quota_interface;
 extern quota_interface_t filter_quota_interface;
 
 static int __init obdfilter_init(void)
index b562df5..8de82d1 100644 (file)
@@ -10,6 +10,7 @@
 #endif
 #include <lustre_handles.h>
 #include <lustre_debug.h>
+#include <linux/lustre_disk.h>
 #include <obd.h>
 
 #define FILTER_LAYOUT_VERSION "2"
 # define OBD_FILTER_SAN_DEVICENAME "sanobdfilter"
 #endif
 
-#define LAST_RCVD "last_rcvd"
+#define HEALTH_CHECK "health_check"
 #define FILTER_INIT_OBJID 0
 
-#define FILTER_LR_SERVER_SIZE    512
-
-#define FILTER_LR_CLIENT_START   8192
-#define FILTER_LR_CLIENT_SIZE    128
-
-/* This limit is arbitrary, but for now we fit it in 1 page (32k clients) */
-#define FILTER_LR_MAX_CLIENTS (PAGE_SIZE * 8)
-
 #define FILTER_SUBDIR_COUNT      32            /* set to zero for no subdirs */
 #define FILTER_GROUPS 3 /* must be at least 3; not dynamic yet */
 
-#define FILTER_RECOVERY_TIMEOUT (obd_timeout * 5 * HZ / 2) /* *waves hands* */
+#define FILTER_ROCOMPAT_SUPP (0)
 
-#define FILTER_ROCOMPAT_SUPP   (0)
+#define FILTER_RECOVERY_TIMEOUT (obd_timeout * 5 * HZ / 2) /* *waves hands* */
 
-#define FILTER_INCOMPAT_GROUPS 0x00000001
-#define FILTER_INCOMPAT_SUPP   (FILTER_INCOMPAT_GROUPS)
+#define FILTER_INCOMPAT_SUPP   (OBD_INCOMPAT_GROUPS)
 
 #define FILTER_GRANT_CHUNK (2ULL * PTLRPC_MAX_BRW_SIZE)
+#define GRANT_FOR_LLOG(obd) 16
 
 /* Data stored per server at the head of the last_rcvd file.  In le32 order.
  * Try to keep this the same as mds_server_data so we might one day merge. */
 struct filter_server_data {
         __u8  fsd_uuid[40];        /* server UUID */
-        __u64 fsd_unused;          /* was fsd_last_objid - don't use for now */
+        __u64 fsd_last_transno_new;/* future last completed transaction ID */
         __u64 fsd_last_transno;    /* last completed transaction ID */
         __u64 fsd_mount_count;     /* FILTER incarnation number */
         __u32 fsd_feature_compat;  /* compatible feature flags */
@@ -62,7 +55,9 @@ struct filter_server_data {
         __u64 fsd_catalog_oid;     /* recovery catalog object id */
         __u32 fsd_catalog_ogen;    /* recovery catalog inode generation */
         __u8  fsd_peeruuid[40];    /* UUID of MDS associated with this OST */
-        __u8  fsd_padding[FILTER_LR_SERVER_SIZE - 140];
+        __u32 fsd_ost_index;       /* index number of OST in LOV */
+        __u32 fsd_mds_index;       /* index number of MDS in LMV */
+        __u8  fsd_padding[LR_SERVER_SIZE - 148];
 };
 
 /* Data stored per client in the last_rcvd file.  In le32 order. */
@@ -70,17 +65,20 @@ struct filter_client_data {
         __u8  fcd_uuid[40];        /* client UUID */
         __u64 fcd_last_rcvd;       /* last completed transaction ID */
         __u64 fcd_last_xid;        /* client RPC xid for the last transaction */
-        __u8  fcd_padding[FILTER_LR_CLIENT_SIZE - 56];
+        __u8  fcd_padding[LR_CLIENT_SIZE - 56];
 };
 
-#define FILTER_DENTRY_MAGIC 0x9efba101
-#define FILTER_FLAG_DESTROY 0x0001      /* destroy dentry on last file close */
-
 /* Limit the returned fields marked valid to those that we actually might set */
 #define FILTER_VALID_FLAGS (OBD_MD_FLTYPE | OBD_MD_FLMODE | OBD_MD_FLGENER  |\
                             OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ|\
                             OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME)
 
+struct filter_fid {
+        struct ll_fid   ff_fid;
+        __u64           ff_objid;
+        __u64           ff_group;
+};
+
 enum {
         LPROC_FILTER_READ_BYTES = 0,
         LPROC_FILTER_WRITE_BYTES = 1,
@@ -102,19 +100,20 @@ struct dentry *filter_fid2dentry(struct obd_device *, struct dentry *dir,
 struct dentry *__filter_oa2dentry(struct obd_device *obd, struct obdo *oa,
                                   const char *what, int quiet);
 #define filter_oa2dentry(obd, oa) __filter_oa2dentry(obd, oa, __FUNCTION__, 0)
-#define filter_oa2dentry_quiet(obd, oa) __filter_oa2dentry(obd, oa, __FUNCTION__, 1)
 
 int filter_finish_transno(struct obd_export *, struct obd_trans_info *, int rc);
-__u64 filter_last_id(struct filter_obd *, int group);
+__u64 filter_next_id(struct filter_obd *, struct obdo *);
+__u64 filter_last_id(struct filter_obd *, struct obdo *);
+int filter_update_fidea(struct obd_export *exp, struct inode *inode,
+                        void *handle, struct obdo *oa);
 int filter_update_server_data(struct obd_device *, struct file *,
                               struct filter_server_data *, int force_sync);
 int filter_update_last_objid(struct obd_device *, obd_gr, int force_sync);
 int filter_common_setup(struct obd_device *, obd_count len, void *buf,
                         void *option);
 int filter_destroy(struct obd_export *exp, struct obdo *oa,
-                   struct lov_stripe_md *md, struct obd_trans_info *);
-struct dentry *filter_crow_object(struct obd_device *obd, struct obdo *oa);
-
+                   struct lov_stripe_md *md, struct obd_trans_info *,
+                   struct obd_export *);
 int filter_setattr_internal(struct obd_export *exp, struct dentry *dentry,
                             struct obdo *oa, struct obd_trans_info *oti);
 int filter_setattr(struct obd_export *exp, struct obdo *oa,
@@ -139,6 +138,7 @@ int filter_brw(int cmd, struct obd_export *, struct obdo *,
 void flip_into_page_cache(struct inode *inode, struct page *new_page);
 
 /* filter_io_*.c */
+struct filter_iobuf;
 int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, int objcount,
                           struct obd_ioobj *obj, int niocount,
                           struct niobuf_local *res, struct obd_trans_info *oti,
@@ -148,13 +148,15 @@ long filter_grant(struct obd_export *exp, obd_size current_grant,
                   obd_size want, obd_size fs_space_left);
 void filter_grant_commit(struct obd_export *exp, int niocount,
                          struct niobuf_local *res);
-int filter_alloc_iobuf(struct filter_obd *, int rw, int num_pages, void **ret);
-void filter_free_iobuf(void *iobuf);
-int filter_iobuf_add_page(struct obd_device *obd, void *iobuf,
+struct filter_iobuf *filter_alloc_iobuf(struct filter_obd *, int rw,
+                                        int num_pages);
+void filter_free_iobuf(struct filter_iobuf *iobuf);
+int filter_iobuf_add_page(struct obd_device *obd, struct filter_iobuf *iobuf,
                           struct inode *inode, struct page *page);
-void *filter_iobuf_get(struct ptlrpc_thread *thread, struct filter_obd *filter);
-void filter_iobuf_put(void *iobuf);
-int filter_direct_io(int rw, struct dentry *dchild, void *iobuf,
+void *filter_iobuf_get(struct filter_obd *filter, struct obd_trans_info *oti);
+void filter_iobuf_put(struct filter_obd *filter, struct filter_iobuf *iobuf,
+                      struct obd_trans_info *oti);
+int filter_direct_io(int rw, struct dentry *dchild, struct filter_iobuf *iobuf,
                      struct obd_export *exp, struct iattr *attr,
                      struct obd_trans_info *oti, void **wait_handle);
 
index 4a797c9..5421993 100644 (file)
@@ -36,6 +36,7 @@
 #include <libcfs/list.h>
 #include <obd_class.h>
 #include <lustre_fsfilt.h>
+#include <lustre_mds.h>
 #include <lustre_commit_confd.h>
 
 #include "filter_internal.h"
index 557e036..a823d98 100644 (file)
 #include <obd_class.h>
 #include "osc_internal.h"
 
-int oscc_recovering(struct osc_creator *oscc)
+static int osc_interpret_create(struct ptlrpc_request *req, void *data, int rc)
 {
-        int recov = 0;
+        struct osc_creator *oscc;
+        struct ost_body *body = NULL;
+        ENTRY;
 
+        if (req->rq_repmsg) {
+                body = lustre_swab_repbuf(req, 0, sizeof(*body),
+                                          lustre_swab_ost_body);
+                if (body == NULL && rc == 0)
+                        rc = -EPROTO;
+        }
+
+        oscc = req->rq_async_args.pointer_arg[0];
+        LASSERT(oscc && (oscc->oscc_obd != LP_POISON));
+        
         spin_lock(&oscc->oscc_lock);
-        recov = oscc->oscc_flags & OSCC_FLAG_RECOVERING;
-        spin_unlock(&oscc->oscc_lock);
+        oscc->oscc_flags &= ~OSCC_FLAG_CREATING;
+        if (rc == -ENOSPC || rc == -EROFS) {
+                oscc->oscc_flags |= OSCC_FLAG_NOSPC;
+                if (body && rc == -ENOSPC) {
+                        oscc->oscc_grow_count = OST_MIN_PRECREATE;
+                        oscc->oscc_last_id = body->oa.o_id;
+                }
+                spin_unlock(&oscc->oscc_lock);
+                DEBUG_REQ(D_INODE, req, "OST out of space, flagging");
+        } else if (rc != 0 && rc != -EIO) {
+                oscc->oscc_flags |= OSCC_FLAG_RECOVERING;
+                oscc->oscc_grow_count = OST_MIN_PRECREATE;
+                spin_unlock(&oscc->oscc_lock);
+                DEBUG_REQ(D_ERROR, req,
+                          "unknown rc %d from async create: failing oscc", rc);
+                ptlrpc_fail_import(req->rq_import, req->rq_import_generation);
+        } else {
+                if (rc == 0) {
+                        oscc->oscc_flags &= ~OSCC_FLAG_LOW;
+                        if (body) {
+                                int diff = body->oa.o_id - oscc->oscc_last_id;
+                                if (diff != oscc->oscc_grow_count)
+                                        oscc->oscc_grow_count =
+                                                max(diff/3, OST_MIN_PRECREATE);
+                                oscc->oscc_last_id = body->oa.o_id;
+                        }
+                }
+                spin_unlock(&oscc->oscc_lock);
+        }
 
-        return recov;
+        CDEBUG(D_HA, "preallocated through id "LPU64" (last used "LPU64")\n",
+               oscc->oscc_last_id, oscc->oscc_next_id);
+
+        wake_up(&oscc->oscc_waitq);
+        RETURN(rc);
 }
 
-static int osc_check_state(struct obd_export *exp)
+static int oscc_internal_create(struct osc_creator *oscc)
 {
-        int rc;
+        struct ptlrpc_request *request;
+        struct ost_body *body;
+        int size = sizeof(*body);
         ENTRY;
 
-        /* ->os_state contains positive error code on remote OST. To convert it
-         * to usual errno form we have to make an sign inversion. */
-        spin_lock(&exp->exp_obd->obd_osfs_lock);
-        rc = -exp->exp_obd->obd_osfs.os_state;
-        spin_unlock(&exp->exp_obd->obd_osfs_lock);
-        
-        RETURN(rc);
+        spin_lock(&oscc->oscc_lock);
+        if (oscc->oscc_grow_count < OST_MAX_PRECREATE &&
+            !(oscc->oscc_flags & (OSCC_FLAG_LOW | OSCC_FLAG_RECOVERING)) &&
+            (__s64)(oscc->oscc_last_id - oscc->oscc_next_id) <=
+                   (oscc->oscc_grow_count / 4 + 1)) {
+                oscc->oscc_flags |= OSCC_FLAG_LOW;
+                oscc->oscc_grow_count *= 2;
+        }
+
+        if (oscc->oscc_grow_count > OST_MAX_PRECREATE / 2)
+                oscc->oscc_grow_count = OST_MAX_PRECREATE / 2;
+
+        if (oscc->oscc_flags & OSCC_FLAG_CREATING ||
+            oscc->oscc_flags & OSCC_FLAG_RECOVERING) {
+                spin_unlock(&oscc->oscc_lock);
+                RETURN(0);
+        }
+        oscc->oscc_flags |= OSCC_FLAG_CREATING;
+        spin_unlock(&oscc->oscc_lock);
+
+        request = ptlrpc_prep_req(oscc->oscc_obd->u.cli.cl_import,
+                                  LUSTRE_OST_VERSION, OST_CREATE, 1,
+                                  &size, NULL);
+        if (request == NULL) {
+                spin_lock(&oscc->oscc_lock);
+                oscc->oscc_flags &= ~OSCC_FLAG_CREATING;
+                spin_unlock(&oscc->oscc_lock);
+                RETURN(-ENOMEM);
+        }
+
+        request->rq_request_portal = OST_CREATE_PORTAL; //XXX FIXME bug 249
+        body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof(*body));
+
+        spin_lock(&oscc->oscc_lock);
+        body->oa.o_id = oscc->oscc_last_id + oscc->oscc_grow_count;
+        body->oa.o_valid |= OBD_MD_FLID;
+        spin_unlock(&oscc->oscc_lock);
+        CDEBUG(D_HA, "preallocating through id "LPU64" (last used "LPU64")\n",
+               body->oa.o_id, oscc->oscc_next_id);
+
+        request->rq_replen = lustre_msg_size(1, &size);
+
+        request->rq_async_args.pointer_arg[0] = oscc;
+        request->rq_interpret_reply = osc_interpret_create;
+        ptlrpcd_add_req(request);
+
+        RETURN(0);
+}
+
+static int oscc_has_objects(struct osc_creator *oscc, int count)
+{
+        int have_objs;
+        spin_lock(&oscc->oscc_lock);
+        have_objs = ((__s64)(oscc->oscc_last_id - oscc->oscc_next_id) >= count);
+        spin_unlock(&oscc->oscc_lock);
+
+        if (!have_objs)
+                oscc_internal_create(oscc);
+
+        return have_objs;
 }
 
-static int osc_check_nospc(struct obd_export *exp)
+static int oscc_wait_for_objects(struct osc_creator *oscc, int count)
 {
-        __u64 blocks, bavail;
+        int have_objs;
+        int ost_full;
+        int osc_invalid;
+
+        have_objs = oscc_has_objects(oscc, count);
+
+        spin_lock(&oscc->oscc_lock);
+        ost_full = (oscc->oscc_flags & OSCC_FLAG_NOSPC);
+        spin_unlock(&oscc->oscc_lock);
+
+        osc_invalid = oscc->oscc_obd->u.cli.cl_import->imp_invalid;
+
+        return have_objs || ost_full || osc_invalid;
+}
+
+static int oscc_precreate(struct osc_creator *oscc, int wait)
+{
+        struct l_wait_info lwi = { 0 };
         int rc = 0;
         ENTRY;
 
-        spin_lock(&exp->exp_obd->obd_osfs_lock);
-        blocks = exp->exp_obd->obd_osfs.os_blocks;
-        bavail = exp->exp_obd->obd_osfs.os_bavail;
-        spin_unlock(&exp->exp_obd->obd_osfs_lock);
-        
-        /* return 1 if available space smaller then (blocks >> 10) of all space
-         * on OST. The main point of this water mark is to stop create files at
-         * some point, to let all created and opened files finish possible
-         * writes. */
-        if (blocks > 0 && bavail < (blocks >> 10))
-                rc = 1;
+        if (oscc_has_objects(oscc, oscc->oscc_grow_count / 2))
+                RETURN(0);
+
+        if (!wait)
+                RETURN(0);
+
+        /* no rc check -- a no-INTR, no-TIMEOUT wait can't fail */
+        l_wait_event(oscc->oscc_waitq, oscc_wait_for_objects(oscc, 1), &lwi);
+
+        if (!oscc_has_objects(oscc, 1) && (oscc->oscc_flags & OSCC_FLAG_NOSPC))
+                rc = -ENOSPC;
+
+        if (oscc->oscc_obd->u.cli.cl_import->imp_invalid)
+                rc = -EIO;
 
         RETURN(rc);
 }
 
+int oscc_recovering(struct osc_creator *oscc)
+{
+        int recov = 0;
+
+        spin_lock(&oscc->oscc_lock);
+        recov = oscc->oscc_flags & OSCC_FLAG_RECOVERING;
+        spin_unlock(&oscc->oscc_lock);
+
+        return recov;
+}
+
 int osc_create(struct obd_export *exp, struct obdo *oa,
                struct lov_stripe_md **ea, struct obd_trans_info *oti)
 {
+        struct lov_stripe_md *lsm;
         struct osc_creator *oscc = &exp->exp_obd->u.cli.cl_oscc;
         int try_again = 1, rc = 0;
         ENTRY;
+        LASSERT(oa);
+        LASSERT(ea);
+
+        if ((oa->o_valid & OBD_MD_FLGROUP) && (oa->o_gr != 0))
+                RETURN(osc_real_create(exp, oa, ea, oti));
+
+        if ((oa->o_valid & OBD_MD_FLFLAGS) &&
+            oa->o_flags == OBD_FL_RECREATE_OBJS) {
+                RETURN(osc_real_create(exp, oa, ea, oti));
+        }
 
-        LASSERT(oa != NULL);
-        LASSERT(ea != NULL);
-        
         /* this is the special case where create removes orphans */
-        if (oa->o_valid & OBD_MD_FLFLAGS && oa->o_flags == OBD_FL_DELORPHAN) {
+        if ((oa->o_valid & OBD_MD_FLFLAGS) &&
+            oa->o_flags == OBD_FL_DELORPHAN) {
                 spin_lock(&oscc->oscc_lock);
                 if (oscc->oscc_flags & OSCC_FLAG_SYNC_IN_PROGRESS) {
                         spin_unlock(&oscc->oscc_lock);
-                        return -EBUSY;
+                        RETURN(-EBUSY);
                 }
                 if (!(oscc->oscc_flags & OSCC_FLAG_RECOVERING)) {
                         spin_unlock(&oscc->oscc_lock);
-                        return 0;
+                        RETURN(0);
                 }
                 oscc->oscc_flags |= OSCC_FLAG_SYNC_IN_PROGRESS;
                 spin_unlock(&oscc->oscc_lock);
                 CDEBUG(D_HA, "%s: oscc recovery started\n",
                        oscc->oscc_obd->obd_name);
-                LASSERT(oscc->oscc_flags & OSCC_FLAG_RECOVERING);
+
+                /* delete from next_id on up */
+                oa->o_valid |= OBD_MD_FLID;
+                oa->o_id = oscc->oscc_next_id - 1;
 
                 CDEBUG(D_HA, "%s: deleting to next_id: "LPU64"\n",
                        oscc->oscc_obd->obd_name, oa->o_id);
 
                 rc = osc_real_create(exp, oa, ea, NULL);
-                if (oscc->oscc_obd == NULL) {
-                        CWARN("the obd for oscc %p has been freed\n", oscc);
-                        RETURN(rc);
-                }
 
                 spin_lock(&oscc->oscc_lock);
                 oscc->oscc_flags &= ~OSCC_FLAG_SYNC_IN_PROGRESS;
@@ -136,44 +273,31 @@ int osc_create(struct obd_export *exp, struct obdo *oa,
                         if (rc == -ENOSPC)
                                 oscc->oscc_flags |= OSCC_FLAG_NOSPC;
                         oscc->oscc_flags &= ~OSCC_FLAG_RECOVERING;
-                        CDEBUG(D_HA, "%s: oscc recovery finished: %d\n",
-                               oscc->oscc_obd->obd_name, rc);
+                        oscc->oscc_last_id = oa->o_id;
+                        CDEBUG(D_HA, "%s: oscc recovery finished, last_id: "
+                               LPU64", rc: %d\n", oscc->oscc_obd->obd_name,
+                               oscc->oscc_last_id, rc);
                         cfs_waitq_signal(&oscc->oscc_waitq);
                 } else {
                         CDEBUG(D_ERROR, "%s: oscc recovery failed: %d\n",
                                oscc->oscc_obd->obd_name, rc);
                 }
                 spin_unlock(&oscc->oscc_lock);
-                RETURN(rc);
-        }
 
-        LASSERT(ergo(oa->o_valid & OBD_MD_FLFLAGS,
-                     !!(oa->o_flags & OBD_FL_CREATE_CROW) !=
-                     !!(oa->o_flags & OBD_FL_RECREATE_OBJS)));
-
-        /* perform urgent create if asked or import is not crow capable or
-         * ENOSPC case if detected. */
-        if (OBDO_URGENT_CREATE(oa) || !IMP_CROW_ABLE(class_exp2cliimp(exp)) ||
-            osc_check_nospc(exp)) {
-                CDEBUG(D_HA, "perform urgent create\n");
-                oa->o_flags &= ~OBD_FL_CREATE_CROW;
-                if (!oa->o_flags)
-                        oa->o_valid &= ~OBD_MD_FLFLAGS;
-                rc = osc_real_create(exp, oa, ea, oti);
+
                 RETURN(rc);
         }
 
-        /* check OST fs state. */
-        rc = osc_check_state(exp);
-        if (rc) { 
-                CDEBUG(D_HA,"OST is in bad shape to create objects, err %d\n",
-                       rc);
-                RETURN(rc);
+        lsm = *ea;
+        if (lsm == NULL) {
+                rc = obd_alloc_memmd(exp, &lsm);
+                if (rc < 0)
+                        RETURN(rc);
         }
-        
+
         while (try_again) {
-                /* if orphans are being recovered, then we must wait until it is
-                 * finished before we can continue with create. */
+                /* If orphans are being recovered, then we must wait until
+                   it is finished before we can continue with create. */
                 if (oscc_recovering(oscc)) {
                         struct l_wait_info lwi;
 
@@ -186,7 +310,7 @@ int osc_create(struct obd_export *exp, struct obdo *oa,
                                           !oscc_recovering(oscc), &lwi);
                         LASSERT(rc == 0 || rc == -ETIMEDOUT);
                         if (rc == -ETIMEDOUT) {
-                                CDEBUG(D_HA, "%p: timeout waiting on recovery\n",
+                                CDEBUG(D_HA,"%p: timeout waiting on recovery\n",
                                        oscc);
                                 RETURN(rc);
                         }
@@ -200,22 +324,33 @@ int osc_create(struct obd_export *exp, struct obdo *oa,
                         break;
                 }
 
-                if (oscc->oscc_flags & OSCC_FLAG_NOSPC) {
+                if (oscc->oscc_last_id >= oscc->oscc_next_id) {
+                        memcpy(oa, &oscc->oscc_oa, sizeof(*oa));
+                        oa->o_id = oscc->oscc_next_id;
+                        lsm->lsm_object_id = oscc->oscc_next_id;
+                        *ea = lsm;
+                        oscc->oscc_next_id++;
+                        try_again = 0;
+
+                        CDEBUG(D_HA, "%s: set oscc_next_id = "LPU64"\n",
+                               exp->exp_obd->obd_name, oscc->oscc_next_id);
+                } else if (oscc->oscc_flags & OSCC_FLAG_NOSPC) {
                         rc = -ENOSPC;
                         spin_unlock(&oscc->oscc_lock);
                         break;
                 }
-
-                oscc->oscc_next_id++;
-                oa->o_id = oscc->oscc_next_id;
-                try_again = 0;
                 spin_unlock(&oscc->oscc_lock);
+                rc = oscc_precreate(oscc, try_again);
+                if (rc)
+                        break;
+        }
 
+        if (rc == 0)
                 CDEBUG(D_HA, "%s: returning objid "LPU64"\n",
                        oscc->oscc_obd->u.cli.cl_import->imp_target_uuid.uuid,
-                       oa->o_id);
-        }
-
+                       lsm->lsm_object_id);
+        else if (*ea == NULL)
+                obd_free_memmd(exp, &lsm);
         RETURN(rc);
 }
 
@@ -227,10 +362,19 @@ void oscc_init(struct obd_device *obd)
                 return;
 
         oscc = &obd->u.cli.cl_oscc;
-        memset(oscc, 0, sizeof(*oscc));
 
-        oscc->oscc_obd = obd;
+        memset(oscc, 0, sizeof(*oscc));
+        INIT_LIST_HEAD(&oscc->oscc_list);
         spin_lock_init(&oscc->oscc_lock);
+        oscc->oscc_obd = obd;
+        oscc->oscc_grow_count = OST_MIN_PRECREATE;
+
+        /* initially last_id < next_id, i.e. no precreated ids are available
+         * (osc_create() only hands one out while last_id >= next_id) */
+        oscc->oscc_next_id = 2;
+        oscc->oscc_last_id = 1;
         oscc->oscc_flags |= OSCC_FLAG_RECOVERING;
         cfs_waitq_init(&oscc->oscc_waitq);
+        /* XXX the export handle should give the oscc the last object */
+        /* oed->oed_oscc.oscc_last_id = exph->....; */
 }
index 0f80243..23f535d 100644 (file)
@@ -53,6 +53,7 @@
 
 #include <lustre_ha.h>
 #include <lprocfs_status.h>
+#include <lustre_mds.h>
 #include <lustre_log.h>
 #include <lustre_debug.h>
 #include "osc_internal.h"
@@ -176,8 +177,8 @@ static int osc_getattr_async(struct obd_export *exp, struct obdo *oa,
         struct osc_getattr_async_args *aa;
         ENTRY;
 
-        request = ptlrpc_prep_req(class_exp2cliimp(exp), OST_GETATTR, 1,
-                                  &size, NULL);
+        request = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION,
+                                  OST_GETATTR, 1, &size, NULL);
         if (!request)
                 RETURN(-ENOMEM);
 
@@ -203,8 +204,8 @@ static int osc_getattr(struct obd_export *exp, struct obdo *oa,
         int rc, size = sizeof(*body);
         ENTRY;
 
-        request = ptlrpc_prep_req(class_exp2cliimp(exp), OST_GETATTR, 1,
-                                  &size, NULL);
+        request = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION,
+                                  OST_GETATTR, 1, &size, NULL);
         if (!request)
                 RETURN(-ENOMEM);
 
@@ -247,8 +248,8 @@ static int osc_setattr(struct obd_export *exp, struct obdo *oa,
         int rc, size = sizeof(*body);
         ENTRY;
 
-        request = ptlrpc_prep_req(class_exp2cliimp(exp), OST_SETATTR, 1, &size,
-                                  NULL);
+        request = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION,
+                                  OST_SETATTR, 1, &size, NULL);
         if (!request)
                 RETURN(-ENOMEM);
 
@@ -285,8 +286,8 @@ static int osc_setattr_async(struct obd_export *exp, struct obdo *oa,
 
         LASSERT(oti);
 
-        request = ptlrpc_prep_req(class_exp2cliimp(exp), OST_SETATTR, 1,
-                                  &size, NULL);
+        request = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION,
+                                  OST_SETATTR, 1, &size, NULL);
         if (!request)
                 RETURN(-ENOMEM);
 
@@ -307,7 +308,6 @@ static int osc_setattr_async(struct obd_export *exp, struct obdo *oa,
 int osc_real_create(struct obd_export *exp, struct obdo *oa,
                     struct lov_stripe_md **ea, struct obd_trans_info *oti)
 {
-        struct osc_creator *oscc = &exp->exp_obd->u.cli.cl_oscc;
         struct ptlrpc_request *request;
         struct ost_body *body;
         struct lov_stripe_md *lsm;
@@ -324,8 +324,8 @@ int osc_real_create(struct obd_export *exp, struct obdo *oa,
                         RETURN(rc);
         }
 
-        request = ptlrpc_prep_req(class_exp2cliimp(exp), OST_CREATE,
-                                  1, &size, NULL);
+        request = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION,
+                                  OST_CREATE, 1, &size, NULL);
         if (!request)
                 GOTO(out, rc = -ENOMEM);
 
@@ -353,16 +353,6 @@ int osc_real_create(struct obd_export *exp, struct obdo *oa,
                 GOTO (out_req, rc = -EPROTO);
         }
 
-        if ((oa->o_valid & OBD_MD_FLFLAGS) && oa->o_flags == OBD_FL_DELORPHAN) {
-                struct obd_import *imp = class_exp2cliimp(exp);
-                /* MDS declares last known object, OSS responses
-                 * with next possible object -bzzz */
-                spin_lock(&oscc->oscc_lock);
-                oscc->oscc_next_id = body->oa.o_id;
-                spin_unlock(&oscc->oscc_lock);
-                CDEBUG(D_HA, "%s: set nextid "LPD64" after recovery\n",
-                       imp->imp_target_uuid.uuid, oa->o_id);
-        }
         memcpy(oa, &body->oa, sizeof(*oa));
 
         /* This should really be sent by the OST */
@@ -411,8 +401,8 @@ static int osc_punch(struct obd_export *exp, struct obdo *oa,
                 RETURN(-EINVAL);
         }
 
-        request = ptlrpc_prep_req(class_exp2cliimp(exp), OST_PUNCH, 1, &size,
-                                  NULL);
+        request = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION,
+                                  OST_PUNCH, 1, &size, NULL);
         if (!request)
                 RETURN(-ENOMEM);
 
@@ -458,8 +448,8 @@ static int osc_sync(struct obd_export *exp, struct obdo *oa,
                 RETURN(-EINVAL);
         }
 
-        request = ptlrpc_prep_req(class_exp2cliimp(exp), OST_SYNC, 1, &size,
-                                  NULL);
+        request = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION,
+                                  OST_SYNC, 1, &size, NULL);
         if (!request)
                 RETURN(-ENOMEM);
 
@@ -493,7 +483,8 @@ static int osc_sync(struct obd_export *exp, struct obdo *oa,
 }
 
 static int osc_destroy(struct obd_export *exp, struct obdo *oa,
-                       struct lov_stripe_md *ea, struct obd_trans_info *oti)
+                       struct lov_stripe_md *ea, struct obd_trans_info *oti,
+                       struct obd_export *md_export)
 {
         struct ptlrpc_request *request;
         struct ost_body *body;
@@ -505,8 +496,8 @@ static int osc_destroy(struct obd_export *exp, struct obdo *oa,
                 RETURN(-EINVAL);
         }
 
-        request = ptlrpc_prep_req(class_exp2cliimp(exp), OST_DESTROY, 1,
-                                  &size, NULL);
+        request = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION,
+                                  OST_DESTROY, 1, &size, NULL);
         if (!request)
                 RETURN(-ENOMEM);
 
@@ -628,6 +619,23 @@ void osc_wake_cache_waiters(struct client_obd *cli)
         EXIT;
 }
 
+/* Seed the client's available grant from the connect data returned by the
+ * server.  cl_avail_grant is written under cl_loi_list_lock, taken through
+ * the same client_obd_list_lock() wrapper that osc_update_grant() uses, so
+ * every user of this lock goes through one locking primitive. */
+static void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd)
+{
+        client_obd_list_lock(&cli->cl_loi_list_lock);
+        cli->cl_avail_grant = ocd->ocd_grant;
+        client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+        /* the log message and assertion read the counters after the lock
+         * has been dropped, so they only see a best-effort snapshot */
+        CDEBUG(D_CACHE, "setting cl_avail_grant: %ld cl_lost_grant: %ld\n",
+               cli->cl_avail_grant, cli->cl_lost_grant);
+        LASSERT(cli->cl_avail_grant >= 0);
+}
+
 static void osc_update_grant(struct client_obd *cli, struct ost_body *body)
 {
         client_obd_list_lock(&cli->cl_loi_list_lock);
@@ -783,10 +785,15 @@ static int osc_brw_prep_request(int cmd, struct obd_import *imp,struct obdo *oa,
         size[2] = niocount * sizeof(*niobuf);
 
         OBD_FAIL_RETURN(OBD_FAIL_OSC_BRW_PREP_REQ, -ENOMEM);
-        req = ptlrpc_prep_req_pool(imp, opc, 3, size, NULL, pool);
+        req = ptlrpc_prep_req_pool(imp, LUSTRE_OST_VERSION, opc, 3,
+                                   size, NULL, pool);
         if (req == NULL)
                 RETURN (-ENOMEM);
 
+        /* FIXME bug 249. Also see bug 7198 */
+        if (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_REQPORTAL)
+                req->rq_request_portal = OST_IO_PORTAL;
+
         if (opc == OST_WRITE)
                 desc = ptlrpc_prep_bulk_imp (req, page_count,
                                              BULK_GET_SOURCE, OST_BULK_PORTAL);
@@ -1141,9 +1148,9 @@ static obd_count max_unfragmented_pages(struct brw_page *pg, obd_count pages)
        LASSERT (pages > 0);
         offset = pg->off & (CFS_PAGE_SIZE - 1);
 
-       for (;;) {
-               pages--;
-               if (pages == 0)         /* that's all */
+        for (;;) {
+                pages--;
+                if (pages == 0)         /* that's all */
                         return count;
 
                 if (offset + pg->count < CFS_PAGE_SIZE)
@@ -1154,14 +1161,16 @@ static obd_count max_unfragmented_pages(struct brw_page *pg, obd_count pages)
                if (offset != 0)        /* doesn't start on page boundary */
                        return count;
 
-               count++;
-       }
+                count++;
+        }
 }
 
 static int osc_brw(int cmd, struct obd_export *exp, struct obdo *oa,
                    struct lov_stripe_md *md, obd_count page_count,
                    struct brw_page *pga, struct obd_trans_info *oti)
 {
+        struct obdo *saved_oa = NULL;
+        int          rc;
         ENTRY;
 
         if (cmd & OBD_BRW_CHECK) {
@@ -1174,9 +1183,10 @@ static int osc_brw(int cmd, struct obd_export *exp, struct obdo *oa,
                 RETURN(0);
         }
 
+        rc = 0;
+
         while (page_count) {
                 obd_count pages_per_brw;
-                int rc;
 
                 if (page_count > PTLRPC_MAX_BRW_PAGES)
                         pages_per_brw = PTLRPC_MAX_BRW_PAGES;
@@ -1186,15 +1196,32 @@ static int osc_brw(int cmd, struct obd_export *exp, struct obdo *oa,
                 sort_brw_pages(pga, pages_per_brw);
                 pages_per_brw = max_unfragmented_pages(pga, pages_per_brw);
 
+                if (saved_oa != NULL) {
+                        /* restore previously saved oa */
+                        *oa = *saved_oa;
+                } else if (page_count > pages_per_brw) {
+                        /* save a copy of oa (brw will clobber it) */
+                        OBD_ALLOC(saved_oa, sizeof(*saved_oa));
+                        if (saved_oa == NULL) {
+                                CERROR("Can't save oa (ENOMEM)\n");
+                                RETURN(-ENOMEM);
+                        }
+                        *saved_oa = *oa;
+                }
+                
                 rc = osc_brw_internal(cmd, exp, oa, md, pages_per_brw, pga);
 
                 if (rc != 0)
-                        RETURN(rc);
+                        break;
 
                 page_count -= pages_per_brw;
                 pga += pages_per_brw;
         }
-        RETURN(0);
+
+        if (saved_oa != NULL)
+                OBD_FREE(saved_oa, sizeof(*saved_oa));
+
+        RETURN(rc);
 }
 
 static int osc_brw_async(int cmd, struct obd_export *exp, struct obdo *oa,
@@ -1241,6 +1268,9 @@ static void osc_check_rpcs(struct client_obd *cli);
 static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap,
                            int sent);
 
+/* This maintains the lists of pending pages to read/write for a given object
+ * (lop).  This is used by osc_check_rpcs->osc_next_loi() and loi_list_maint()
+ * to quickly find objects that are ready to send an RPC. */
 static int lop_makes_rpc(struct client_obd *cli, struct loi_oap_pages *lop,
                          int cmd)
 {
@@ -1410,9 +1440,13 @@ static void osc_ap_completion(struct client_obd *cli, struct obdo *oa,
 
         if (rc == 0 && oa != NULL) {
                 if (oa->o_valid & OBD_MD_FLBLOCKS)
-                        oap->oap_loi->loi_blocks = oa->o_blocks;
+                        oap->oap_loi->loi_lvb.lvb_blocks = oa->o_blocks;
                 if (oa->o_valid & OBD_MD_FLMTIME)
-                        oap->oap_loi->loi_mtime = oa->o_mtime;
+                        oap->oap_loi->loi_lvb.lvb_mtime = oa->o_mtime;
+                if (oa->o_valid & OBD_MD_FLATIME)
+                        oap->oap_loi->loi_lvb.lvb_atime = oa->o_atime;
+                if (oa->o_valid & OBD_MD_FLCTIME)
+                        oap->oap_loi->loi_lvb.lvb_ctime = oa->o_ctime;
         }
 
         if (oap->oap_oig) {
@@ -1771,6 +1805,8 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi,
                !list_empty(&(LOI)->loi_read_lop.lop_urgent),             \
                args)                                                     \
 
+/* This is called by osc_check_rpcs() to find which objects have pages that
+ * we could be sending.  These lists are maintained by lop_makes_rpc(). */
 struct lov_oinfo *osc_next_loi(struct client_obd *cli)
 {
         ENTRY;
@@ -2042,7 +2078,7 @@ static int osc_queue_async_io(struct obd_export *exp, struct lov_stripe_md *lsm,
 #ifdef HAVE_QUOTA_SUPPORT
         if ((cmd & OBD_BRW_WRITE) && !(cmd & OBD_BRW_NOQUOTA)){
                 struct obd_async_page_ops *ops;
-                struct obdo *oa = NULL;
+                struct obdo *oa;
 
                 oa = obdo_alloc();
                 if (oa == NULL)
@@ -2313,6 +2349,7 @@ static int sanosc_brw_read(struct obd_export *exp, struct obdo *oa,
         struct niobuf_remote *nioptr;
         struct obd_ioobj *iooptr;
         int rc, size[3] = {sizeof(*body)}, mapped = 0;
+        struct obd_import *imp = class_exp2cliimp(exp);
         int swab;
         ENTRY;
 
@@ -2321,11 +2358,16 @@ static int sanosc_brw_read(struct obd_export *exp, struct obdo *oa,
         size[1] = sizeof(struct obd_ioobj);
         size[2] = page_count * sizeof(*nioptr);
 
-        request = ptlrpc_prep_req(class_exp2cliimp(exp), OST_SAN_READ, 3,
-                                  size, NULL);
+        request = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION,
+                                  OST_SAN_READ, 3, size, NULL);
         if (!request)
                 RETURN(-ENOMEM);
 
+        /* FIXME bug 249 */
+        /* See bug 7198 */
+        if (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_REQPORTAL)
+                request->rq_request_portal = OST_IO_PORTAL;
+
         body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof(*body));
         iooptr = lustre_msg_buf(request->rq_reqmsg, 1, sizeof(*iooptr));
         nioptr = lustre_msg_buf(request->rq_reqmsg, 2,
@@ -2443,6 +2485,7 @@ static int sanosc_brw_write(struct obd_export *exp, struct obdo *oa,
         struct ost_body *body;
         struct niobuf_remote *nioptr;
         struct obd_ioobj *iooptr;
+        struct obd_import *imp = class_exp2cliimp(exp);
         int rc, size[3] = {sizeof(*body)}, mapped = 0;
         int swab;
         ENTRY;
@@ -2450,11 +2493,17 @@ static int sanosc_brw_write(struct obd_export *exp, struct obdo *oa,
         size[1] = sizeof(struct obd_ioobj);
         size[2] = page_count * sizeof(*nioptr);
 
-        request = ptlrpc_prep_req_pool(class_exp2cliimp(exp), OST_SAN_WRITE,
+        request = ptlrpc_prep_req_pool(class_exp2cliimp(exp),
+                                       LUSTRE_OST_VERSION, OST_SAN_WRITE,
                                        3, size, NULL, cli->cl_rq_pool);
         if (!request)
                 RETURN(-ENOMEM);
 
+        /* FIXME bug 249 */
+        /* See bug 7198 */
+        if (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_REQPORTAL)
+                request->rq_request_portal = OST_IO_PORTAL;
+
         body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*body));
         iooptr = lustre_msg_buf(request->rq_reqmsg, 1, sizeof (*iooptr));
         nioptr = lustre_msg_buf(request->rq_reqmsg, 2,
@@ -2687,7 +2736,8 @@ static int osc_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm,
         if (*flags & LDLM_FL_HAS_INTENT) {
                 int size[2] = {sizeof(struct ldlm_request), sizeof(lvb)};
 
-                req = ptlrpc_prep_req(class_exp2cliimp(exp), LDLM_ENQUEUE, 1,
+                req = ptlrpc_prep_req(class_exp2cliimp(exp),
+                                      LUSTRE_DLM_VERSION, LDLM_ENQUEUE, 1,
                                       size, NULL);
                 if (req == NULL)
                         RETURN(-ENOMEM);
@@ -2715,9 +2765,7 @@ static int osc_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm,
         if ((*flags & LDLM_FL_HAS_INTENT && rc == ELDLM_LOCK_ABORTED) || !rc) {
                 CDEBUG(D_INODE,"got kms "LPU64" blocks "LPU64" mtime "LPU64"\n",
                        lvb.lvb_size, lvb.lvb_blocks, lvb.lvb_mtime);
-                lsm->lsm_oinfo->loi_rss = lvb.lvb_size;
-                lsm->lsm_oinfo->loi_mtime = lvb.lvb_mtime;
-                lsm->lsm_oinfo->loi_blocks = lvb.lvb_blocks;
+                lsm->lsm_oinfo->loi_lvb = lvb;
         }
 
         RETURN(rc);
@@ -2811,8 +2859,8 @@ static int osc_statfs(struct obd_device *obd, struct obd_statfs *osfs,
          * during mount that would help a bit).  Having relative timestamps
          * is not so great if request processing is slow, while absolute
          * timestamps are not ideal because they need time synchronization. */
-        request = ptlrpc_prep_req(obd->u.cli.cl_import, OST_STATFS, 0,
-                                  NULL, NULL);
+        request = ptlrpc_prep_req(obd->u.cli.cl_import, LUSTRE_OST_VERSION,
+                                  OST_STATFS,0,NULL,NULL);
         if (!request)
                 RETURN(-ENOMEM);
 
@@ -2994,8 +3042,8 @@ static int osc_get_info(struct obd_export *exp, obd_count keylen,
                 obd_id *reply;
                 char *bufs[1] = {key};
                 int rc;
-                req = ptlrpc_prep_req(class_exp2cliimp(exp), OST_GET_INFO, 1,
-                                      &keylen, bufs);
+                req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION,
+                                      OST_GET_INFO, 1, &keylen, bufs);
                 if (req == NULL)
                         RETURN(-ENOMEM);
 
@@ -3031,6 +3079,17 @@ static int osc_set_info(struct obd_export *exp, obd_count keylen,
 
         OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_SHUTDOWN, 10);
 
+        if (KEY_IS("next_id")) {
+                if (vallen != sizeof(obd_id))
+                        RETURN(-EINVAL);
+                obd->u.cli.cl_oscc.oscc_next_id = *((obd_id*)val) + 1;
+                CDEBUG(D_HA, "%s: set oscc_next_id = "LPU64"\n",
+                       exp->exp_obd->obd_name,
+                       obd->u.cli.cl_oscc.oscc_next_id);
+
+                RETURN(0);
+        }
+        
         if (KEY_IS("unlinked")) {
                 struct osc_creator *oscc = &obd->u.cli.cl_oscc;
                 spin_lock(&oscc->oscc_lock);
@@ -3039,7 +3098,6 @@ static int osc_set_info(struct obd_export *exp, obd_count keylen,
                 RETURN(0);
         }
 
-
         if (KEY_IS("initial_recov")) {
                 struct obd_import *imp = exp->exp_obd->u.cli.cl_import;
                 if (vallen != sizeof(int))
@@ -3062,7 +3120,8 @@ static int osc_set_info(struct obd_export *exp, obd_count keylen,
                 RETURN(-EINVAL);
 
 
-        req = ptlrpc_prep_req(imp, OST_SET_INFO, 2, size, bufs);
+        req = ptlrpc_prep_req(imp, LUSTRE_OST_VERSION, OST_SET_INFO,
+                              2, size, bufs);
         if (req == NULL)
                 RETURN(-ENOMEM);
 
@@ -3133,6 +3192,40 @@ static int osc_llog_finish(struct obd_device *obd, int count)
         RETURN(rc);
 }
 
+/* Fill in the connect data used for a (re)connect to the OST.
+ *
+ * When @data is supplied and carries OBD_CONNECT_GRANT, request the grant
+ * this client currently holds (cl_avail_grant) or, if that is zero, two
+ * maximum-sized RPCs worth of bytes, and reset cl_lost_grant to zero.
+ * @exp and @cluuid are not used here.  Always returns 0. */
+static int osc_reconnect(struct obd_export *exp, struct obd_device *obd,
+                         struct obd_uuid *cluuid,
+                         struct obd_connect_data *data)
+{
+        struct client_obd *cli = &obd->u.cli;
+        ENTRY;
+
+        if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) {
+                long lost_grant;
+
+                client_obd_list_lock(&cli->cl_loi_list_lock);
+                data->ocd_grant = cli->cl_avail_grant ?:
+                                2 * cli->cl_max_pages_per_rpc << PAGE_SHIFT;
+                lost_grant = cli->cl_lost_grant;
+                cli->cl_lost_grant = 0;
+                client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+                CDEBUG(D_CACHE, "request ocd_grant: %d cl_avail_grant: %ld "
+                       "cl_lost_grant: %ld\n", data->ocd_grant,
+                       cli->cl_avail_grant, lost_grant);
+                CDEBUG(D_RPCTRACE, "ocd_connect_flags: "LPX64" ocd_version: %d"
+                       " ocd_grant: %d\n", data->ocd_connect_flags,
+                       data->ocd_version, data->ocd_grant);
+        }
+
+        RETURN(0);
+}
+
 static int osc_disconnect(struct obd_export *exp)
 {
         struct obd_device *obd = class_exp2obd(exp);
@@ -3171,8 +3257,7 @@ static int osc_import_event(struct obd_device *obd,
                 break;
         }
         case IMP_EVENT_INACTIVE: {
-                if (obd->obd_observer)
-                        rc = obd_notify(obd->obd_observer, obd, 0);
+                rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE);
                 break;
         }
         case IMP_EVENT_INVALIDATE: {
@@ -3200,8 +3285,20 @@ static int osc_import_event(struct obd_device *obd,
                         oscc->oscc_flags &= ~OSCC_FLAG_NOSPC;
                         spin_unlock(&oscc->oscc_lock);
                 }
-                if (obd->obd_observer)
-                        rc = obd_notify(obd->obd_observer, obd, 1);
+                rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE);
+                break;
+        }
+        case IMP_EVENT_OCD: {
+                struct obd_connect_data *ocd = &imp->imp_connect_data;
+
+                if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT)
+                        osc_init_grant(&obd->u.cli, ocd);
+
+                /* See bug 7198 */
+                if (ocd->ocd_connect_flags & OBD_CONNECT_REQPORTAL)
+                        imp->imp_client->cli_request_portal =OST_REQUEST_PORTAL;
+
+                rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD);
                 break;
         }
         default:
@@ -3252,13 +3349,20 @@ static int osc_precleanup(struct obd_device *obd, int stage)
         int rc = 0;
         ENTRY;
 
-        if (stage < 2)
-                RETURN(0);
-
-        rc = obd_llog_finish(obd, 0);
-        if (rc != 0)
-                CERROR("failed to cleanup llogging subsystems\n");
-
+        switch (stage) {
+        case OBD_CLEANUP_EARLY: {
+                struct obd_import *imp;
+                imp = obd->u.cli.cl_import;
+                CDEBUG(D_HA, "Deactivating import %s\n", obd->obd_name);
+                /* ptlrpc_abort_inflight to stop an mds_lov_synchronize */
+                ptlrpc_deactivate_import(imp);
+                break;
+        }
+        case OBD_CLEANUP_SELF_EXP:
+                rc = obd_llog_finish(obd, 0);
+                if (rc != 0)
+                        CERROR("failed to cleanup llogging subsystems\n");
+        }
         RETURN(rc);
 }
 
@@ -3280,9 +3384,10 @@ int osc_cleanup(struct obd_device *obd)
         /* free memory of osc quota cache */
         lquota_cleanup(quota_interface, obd);
 
+        rc = client_obd_cleanup(obd);
+
         ptlrpc_free_rq_pool(cli->cl_rq_pool);
 
-        rc = client_obd_cleanup(obd);
         ptlrpcd_decref();
         RETURN(rc);
 }
@@ -3296,6 +3401,7 @@ struct obd_ops osc_obd_ops = {
         .o_add_conn             = client_import_add_conn,
         .o_del_conn             = client_import_del_conn,
         .o_connect              = client_connect_import,
+        .o_reconnect            = osc_reconnect,
         .o_disconnect           = osc_disconnect,
         .o_statfs               = osc_statfs,
         .o_packmd               = osc_packmd,
@@ -3337,6 +3443,7 @@ struct obd_ops sanosc_obd_ops = {
         .o_add_conn             = client_import_add_conn,
         .o_del_conn             = client_import_del_conn,
         .o_connect              = client_connect_import,
+        .o_reconnect            = osc_reconnect,
         .o_disconnect           = client_disconnect_export,
         .o_statfs               = osc_statfs,
         .o_packmd               = osc_packmd,
@@ -3363,6 +3470,9 @@ struct obd_ops sanosc_obd_ops = {
 };
 #endif
 
+static quota_interface_t *quota_interface;
+extern quota_interface_t osc_quota_interface;
+
 int __init osc_init(void)
 {
         struct lprocfs_static_vars lvars;
index 562eb09..c81868d 100644 (file)
@@ -46,6 +46,7 @@
 #include <lustre_debug.h>
 #include <linux/init.h>
 #include <lprocfs_status.h>
+#include <lustre_mds.h>
 #include <lustre_commit_confd.h>
 #include <libcfs/list.h>
 #include <lustre_quota.h>
index 0c8ce7b..45abef3 100644 (file)
@@ -35,6 +35,7 @@
 #include <lustre_export.h>
 #include <obd.h>
 #include <obd_class.h>
+#include <linux/lustre_ver.h>
 
 #include "ptlrpc_internal.h"
 
@@ -126,14 +127,13 @@ int ptlrpc_set_import_discon(struct obd_import *imp)
                                "service will %s.\n",
                                target_len, target_start,
                                libcfs_nid2str(imp->imp_connection->c_peer.nid),
-                               imp->imp_replayable 
-                               ? "wait for recovery to complete"
-                               : "fail");
+                               imp->imp_replayable ?
+                               "wait for recovery to complete" : "fail");
 
                 if (obd_dump_on_timeout)
                         libcfs_debug_dumplog();
 
-                CWARN("%s: connection lost to %s@%s\n",
+                CDEBUG(D_HA, "%s: connection lost to %s@%s\n",
                       imp->imp_obd->obd_name,
                       imp->imp_target_uuid.uuid,
                       imp->imp_connection->c_remote_uuid.uuid);
@@ -330,11 +330,10 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid)
         imp->imp_conn_cnt++;
         imp->imp_resend_replay = 0;
 
-        if (imp->imp_remote_handle.cookie == 0) {
+        if (!lustre_handle_is_used(&imp->imp_remote_handle))
                 initial_connect = 1;
-        } else {
+        else
                 committed_before_reconnect = imp->imp_peer_committed_transno;
-        }
 
         spin_unlock_irqrestore(&imp->imp_lock, flags);
 
@@ -351,7 +350,25 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid)
         if (rc)
                 GOTO(out, rc);
 
-        request = ptlrpc_prep_req(imp, imp->imp_connect_op, 4, size, tmp);
+        if (imp->imp_initial_recov_bk && initial_connect &&
+            /* last in list */
+            (imp->imp_conn_current->oic_item.next == &imp->imp_conn_list)) {
+                CDEBUG(D_HA, "Last connection attempt (%d) for %s\n",
+                       imp->imp_conn_cnt, imp->imp_target_uuid.uuid);
+                /* Don't retry if connect fails */
+                rc = 0;
+                obd_set_info(obd->obd_self_export,
+                             strlen("initial_recov"), "initial_recov",
+                             sizeof(rc), &rc);
+        }
+
+        rc = obd_reconnect(imp->imp_obd->obd_self_export, obd,
+                           &obd->obd_uuid, &imp->imp_connect_data);
+        if (rc)
+                GOTO(out, rc);
+
+        request = ptlrpc_prep_req(imp, LUSTRE_OBD_VERSION, imp->imp_connect_op,
+                                  4, size, tmp);
         if (!request)
                 GOTO(out, rc = -ENOMEM);
 
@@ -365,7 +382,7 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid)
         request->rq_replen = lustre_msg_size(1, size);
         request->rq_interpret_reply = ptlrpc_connect_interpret;
 
-        LASSERT (sizeof (*aa) <= sizeof (request->rq_async_args));
+        CLASSERT(sizeof (*aa) <= sizeof (request->rq_async_args));
         aa = (struct ptlrpc_connect_async_args *)&request->rq_async_args;
         memset(aa, 0, sizeof *aa);
 
@@ -374,7 +391,7 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid)
 
         if (aa->pcaa_initial_connect) {
                 imp->imp_replayable = 1;
-                /* On an initial connect, we don't know which one of a 
+                /* On an initial connect, we don't know which one of a
                    failover server pair is up.  Don't wait long. */
                 request->rq_timeout = max((int)(obd_timeout / 20), 5);
         }
@@ -537,6 +554,7 @@ finish:
                 }
         } else {
                 struct obd_connect_data *ocd;
+                struct obd_export *exp;
 
                 ocd = lustre_swab_repbuf(request, 0,
                                          sizeof *ocd, lustre_swab_connect);
@@ -555,11 +573,39 @@ finish:
                         ocd->ocd_connect_flags);
 
                 imp->imp_connect_data = *ocd;
-                
-                if (IMP_CROW_ABLE(imp)) {
-                        CDEBUG(D_HA, "connected to CROW capable target: %s\n",
-                               imp->imp_target_uuid.uuid);
+                if (!ocd->ocd_ibits_known &&
+                    ocd->ocd_connect_flags & OBD_CONNECT_IBITS)
+                        CERROR("Inodebits aware server returned zero compatible"
+                               " bits?\n");
+
+                exp = class_conn2export(&imp->imp_dlm_handle);
+                LASSERT(exp);
+                exp->exp_connect_flags = ocd->ocd_connect_flags;
+                class_export_put(exp);
+
+                obd_import_event(imp->imp_obd, imp, IMP_EVENT_OCD);
+
+                if ((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
+                    (ocd->ocd_version > LUSTRE_VERSION_CODE +
+                    LUSTRE_VERSION_OFFSET_WARN)) {
+                        /* Sigh, some compilers do not like #ifdef in the middle
+                           of macro arguments */
+#ifdef __KERNEL__
+                        char *action = "upgrading this client";
+#else
+                        char *action = "recompiling this application";
+#endif
+
+                        CWARN("Server %s version (%d.%d.%d.%d) is much newer. "
+                              "Consider %s (%s).\n",
+                              imp->imp_target_uuid.uuid,
+                              OBD_OCD_VERSION_MAJOR(ocd->ocd_version),
+                              OBD_OCD_VERSION_MINOR(ocd->ocd_version),
+                              OBD_OCD_VERSION_PATCH(ocd->ocd_version),
+                              OBD_OCD_VERSION_FIX(ocd->ocd_version),
+                              action, LUSTRE_VERSION_STRING);
                 }
+
                 if (imp->imp_conn_current != NULL) {
                         list_del(&imp->imp_conn_current->oic_item);
                         list_add(&imp->imp_conn_current->oic_item,
@@ -579,8 +625,33 @@ finish:
  out:
         if (rc != 0) {
                 IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON);
-                if (aa->pcaa_initial_connect && !imp->imp_initial_recov) {
+                if (aa->pcaa_initial_connect && !imp->imp_initial_recov)
                         ptlrpc_deactivate_import(imp);
+
+                if (rc == -EPROTO) {
+                        struct obd_connect_data *ocd;
+                        ocd = lustre_swab_repbuf(request, 0,
+                                                 sizeof *ocd,
+                                                 lustre_swab_connect);
+                        if (ocd &&
+                            (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
+                            (ocd->ocd_version != LUSTRE_VERSION_CODE)) {
+                           /* Actually servers are only supposed to refuse
+                              connection from liblustre clients, so we should
+                              never see this from VFS context */
+                                CERROR("Server %s version (%d.%d.%d.%d) "
+                                       "refused connection from this client "
+                                       "as too old version (%s).  Client must "
+                                       "be recompiled\n",
+                                      imp->imp_target_uuid.uuid,
+                                      OBD_OCD_VERSION_MAJOR(ocd->ocd_version),
+                                      OBD_OCD_VERSION_MINOR(ocd->ocd_version),
+                                      OBD_OCD_VERSION_PATCH(ocd->ocd_version),
+                                      OBD_OCD_VERSION_FIX(ocd->ocd_version),
+                                      LUSTRE_VERSION_STRING);
+                                IMPORT_SET_STATE(imp, LUSTRE_IMP_CLOSED);
+                        }
+                        RETURN(-EPROTO);
                 }
 
                 ptlrpc_maybe_ping_import_soon(imp);
@@ -619,7 +690,8 @@ static int signal_completed_replay(struct obd_import *imp)
         LASSERT(atomic_read(&imp->imp_replay_inflight) == 0);
         atomic_inc(&imp->imp_replay_inflight);
 
-        req = ptlrpc_prep_req(imp, OBD_PING, 0, NULL, NULL);
+        req = ptlrpc_prep_req(imp, LUSTRE_OBD_VERSION, OBD_PING,
+                              0, NULL, NULL);
         if (!req) {
                 atomic_dec(&imp->imp_replay_inflight);
                 RETURN(-ENOMEM);
@@ -726,8 +798,6 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp)
         }
 
         if (imp->imp_state == LUSTRE_IMP_RECOVER) {
-                char   *nidstr;
-
                 CDEBUG(D_HA, "reconnected to %s@%s\n",
                        imp->imp_target_uuid.uuid,
                        imp->imp_connection->c_remote_uuid.uuid);
@@ -740,15 +810,10 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp)
 
                 deuuidify(imp->imp_target_uuid.uuid, NULL,
                           &target_start, &target_len);
-                nidstr = libcfs_nid2str(imp->imp_connection->c_peer.nid);
-
-                LCONSOLE_INFO("Connection restored to service %.*s using nid "
-                              "%s.\n", target_len, target_start, nidstr);
-
-                CWARN("%s: connection restored to %s@%s\n",
-                      imp->imp_obd->obd_name,
-                      imp->imp_target_uuid.uuid,
-                      imp->imp_connection->c_remote_uuid.uuid);
+                LCONSOLE_INFO("%s: Connection restored to service %.*s "
+                              "using nid %s.\n", imp->imp_obd->obd_name,
+                              target_len, target_start,
+                              libcfs_nid2str(imp->imp_connection->c_peer.nid));
         }
 
         if (imp->imp_state == LUSTRE_IMP_FULL) {
@@ -797,7 +862,8 @@ int ptlrpc_disconnect_import(struct obd_import *imp)
 
         spin_unlock_irqrestore(&imp->imp_lock, flags);
 
-        request = ptlrpc_prep_req(imp, rq_opc, 0, NULL, NULL);
+        request = ptlrpc_prep_req(imp, LUSTRE_OBD_VERSION, rq_opc,
+                                  0, NULL, NULL);
         if (request) {
                 /* We are disconnecting, do not retry a failed DISCONNECT rpc if
                  * it fails.  We can get through the above with a down server
index d714a84..b250f0c 100644 (file)
@@ -39,6 +39,7 @@
 #endif
 
 #include <obd_class.h>
+#include <lustre_mds.h>
 #include <lustre_log.h>
 #include <lustre_net.h>
 #include <libcfs/list.h>
index 735ed31..0d21734 100644 (file)
@@ -42,6 +42,7 @@
 #endif
 
 #include <obd_class.h>
+#include <lustre_mds.h>
 #include <lustre_log.h>
 #include <libcfs/list.h>
 #include <lvfs.h>
index cf588d3..1d81f2e 100644 (file)
@@ -37,6 +37,7 @@
 #endif
 
 #include <obd_class.h>
+#include <lustre_mds.h>
 #include <lustre_log.h>
 #include <lustre_net.h>
 #include <libcfs/list.h>
index dd4063f..774021b 100644 (file)
@@ -46,6 +46,14 @@ int lustre_msg_swabbed(struct lustre_msg *msg)
         return (msg->magic == __swab32(PTLRPC_MSG_MAGIC));
 }
 
+/* Non-zero if msg's version (swabbed if needed, masked) != version. */
+int lustre_msg_check_version(struct lustre_msg *msg, __u32 version)
+{
+        if (!lustre_msg_swabbed(msg))
+                return (msg->version & LUSTRE_VERSION_MASK) != version;
+        return (__swab32(msg->version) & LUSTRE_VERSION_MASK) != version;
+}
+
 static void
 lustre_init_msg (struct lustre_msg *msg, int count, int *lens, char **bufs)
 {
@@ -321,7 +329,7 @@ int lustre_unpack_msg(struct lustre_msg *m, int len)
                 RETURN (-EINVAL);
         }
 
-        if (m->version != PTLRPC_MSG_VERSION) {
+        if ((m->version & ~LUSTRE_VERSION_MASK) != PTLRPC_MSG_VERSION) {
                 CERROR("wrong lustre_msg version %#08x\n", m->version);
                 RETURN (-EINVAL);
         }
@@ -494,6 +502,16 @@ void *lustre_swab_repbuf(struct ptlrpc_request *req, int index, int min_size,
 void lustre_swab_connect(struct obd_connect_data *ocd)
 {
         __swab64s (&ocd->ocd_connect_flags);
+        __swab32s (&ocd->ocd_version);
+        __swab32s (&ocd->ocd_grant);
+        __swab32s (&ocd->ocd_index);
+        __swab32s (&ocd->ocd_unused);    /* swabbed despite its name */
+        __swab64s (&ocd->ocd_ibits_known);
+        CLASSERT(offsetof(typeof(*ocd), padding2) != 0); /* pad: not swabbed */
+        CLASSERT(offsetof(typeof(*ocd), padding3) != 0);
+        CLASSERT(offsetof(typeof(*ocd), padding4) != 0);
+        CLASSERT(offsetof(typeof(*ocd), padding5) != 0);
+        CLASSERT(offsetof(typeof(*ocd), padding6) != 0);
 }
 
 void lustre_swab_obdo (struct obdo  *o)
@@ -531,7 +549,7 @@ void lustre_swab_obd_statfs (struct obd_statfs *os)
         __swab64s (&os->os_bavail);
         __swab64s (&os->os_files);
         __swab64s (&os->os_ffree);
-        /* no need to swap os_fsid */
+        /* no need to swab os_fsid */
         __swab32s (&os->os_bsize);
         __swab32s (&os->os_namelen);
         __swab64s (&os->os_maxbytes);
@@ -605,8 +623,8 @@ void lustre_swab_mds_body (struct mds_body *b)
         __swab32s (&b->suppgid);
         __swab32s (&b->eadatasize);
         __swab32s (&b->aclsize);
-        __swab32s (&b->padding_2);
-        __swab32s (&b->padding_3);
+        __swab32s (&b->max_mdsize);
+        __swab32s (&b->max_cookiesize);
         __swab32s (&b->padding_4);
 }
 
@@ -629,7 +647,7 @@ static void lustre_swab_obd_dqblk (struct obd_dqblk *b)
         __swab64s (&b->dqb_btime);
         __swab64s (&b->dqb_itime);
         __swab32s (&b->dqb_valid);
-        __swab32s (&b->padding);
+        CLASSERT(offsetof(typeof(*b), padding) != 0);
 }
 
 void lustre_swab_obd_quotactl (struct obd_quotactl *q)
@@ -659,7 +677,13 @@ void lustre_swab_mds_rec_setattr (struct mds_rec_setattr *sa)
         __swab32s (&sa->sa_uid);
         __swab32s (&sa->sa_gid);
         __swab32s (&sa->sa_attr_flags);
-        __swab32s (&sa->sa_padding);
+        CLASSERT(offsetof(typeof(*sa), sa_padding) != 0);
+}
+
+void lustre_swab_mds_rec_join (struct mds_rec_join *jr)
+{       /* Byte-swap jr_headsize and jr_fid of a join record. */
+        __swab64s(&jr->jr_headsize);            /* 64-bit field */
+        lustre_swab_ll_fid(&jr->jr_fid);        /* handed to the fid swabber */
 }
 
 void lustre_swab_mds_rec_create (struct mds_rec_create *cr)
@@ -675,11 +699,11 @@ void lustre_swab_mds_rec_create (struct mds_rec_create *cr)
         __swab64s (&cr->cr_time);
         __swab64s (&cr->cr_rdev);
         __swab32s (&cr->cr_suppgid);
-        __swab32s (&cr->cr_padding_1);
-        __swab32s (&cr->cr_padding_2);
-        __swab32s (&cr->cr_padding_3);
-        __swab32s (&cr->cr_padding_4);
-        __swab32s (&cr->cr_padding_5);
+        CLASSERT(offsetof(typeof(*cr), cr_padding_1) != 0);
+        CLASSERT(offsetof(typeof(*cr), cr_padding_2) != 0);
+        CLASSERT(offsetof(typeof(*cr), cr_padding_3) != 0);
+        CLASSERT(offsetof(typeof(*cr), cr_padding_4) != 0);
+        CLASSERT(offsetof(typeof(*cr), cr_padding_5) != 0);
 }
 
 void lustre_swab_mds_rec_link (struct mds_rec_link *lk)
@@ -693,10 +717,10 @@ void lustre_swab_mds_rec_link (struct mds_rec_link *lk)
         lustre_swab_ll_fid (&lk->lk_fid1);
         lustre_swab_ll_fid (&lk->lk_fid2);
         __swab64s (&lk->lk_time);
-        __swab32s (&lk->lk_padding_1);
-        __swab32s (&lk->lk_padding_2);
-        __swab32s (&lk->lk_padding_3);
-        __swab32s (&lk->lk_padding_4);
+        CLASSERT(offsetof(typeof(*lk), lk_padding_1) != 0);
+        CLASSERT(offsetof(typeof(*lk), lk_padding_2) != 0);
+        CLASSERT(offsetof(typeof(*lk), lk_padding_3) != 0);
+        CLASSERT(offsetof(typeof(*lk), lk_padding_4) != 0);
 }
 
 void lustre_swab_mds_rec_unlink (struct mds_rec_unlink *ul)
@@ -710,10 +734,10 @@ void lustre_swab_mds_rec_unlink (struct mds_rec_unlink *ul)
         lustre_swab_ll_fid (&ul->ul_fid1);
         lustre_swab_ll_fid (&ul->ul_fid2);
         __swab64s (&ul->ul_time);
-        __swab32s (&ul->ul_padding_1);
-        __swab32s (&ul->ul_padding_2);
-        __swab32s (&ul->ul_padding_3);
-        __swab32s (&ul->ul_padding_4);
+        CLASSERT(offsetof(typeof(*ul), ul_padding_1) != 0);
+        CLASSERT(offsetof(typeof(*ul), ul_padding_2) != 0);
+        CLASSERT(offsetof(typeof(*ul), ul_padding_3) != 0);
+        CLASSERT(offsetof(typeof(*ul), ul_padding_4) != 0);
 }
 
 void lustre_swab_mds_rec_rename (struct mds_rec_rename *rn)
@@ -727,10 +751,10 @@ void lustre_swab_mds_rec_rename (struct mds_rec_rename *rn)
         lustre_swab_ll_fid (&rn->rn_fid1);
         lustre_swab_ll_fid (&rn->rn_fid2);
         __swab64s (&rn->rn_time);
-        __swab32s (&rn->rn_padding_1);
-        __swab32s (&rn->rn_padding_2);
-        __swab32s (&rn->rn_padding_3);
-        __swab32s (&rn->rn_padding_4);
+        CLASSERT(offsetof(typeof(*rn), rn_padding_1) != 0);
+        CLASSERT(offsetof(typeof(*rn), rn_padding_2) != 0);
+        CLASSERT(offsetof(typeof(*rn), rn_padding_3) != 0);
+        CLASSERT(offsetof(typeof(*rn), rn_padding_4) != 0);
 }
 
 void lustre_swab_lov_desc (struct lov_desc *ld)
@@ -741,8 +765,6 @@ void lustre_swab_lov_desc (struct lov_desc *ld)
         __swab64s (&ld->ld_default_stripe_size);
         __swab64s (&ld->ld_default_stripe_offset);
         __swab32s (&ld->ld_pattern);
-        __swab32s (&ld->ld_qos_threshold);
-        __swab32s (&ld->ld_qos_maxage);
         /* uuid endian insensitive */
 }
 
@@ -773,6 +795,33 @@ void lustre_swab_lov_user_md(struct lov_user_md *lum)
         EXIT;
 }
 
+static void print_lumj (struct lov_user_md_join *lumj)
+{       /* Dump selected fields of *lumj through CDEBUG(D_OTHER). */
+        CDEBUG(D_OTHER, "lov_user_md %p:\n", lumj); /* label lacks "_join"? */
+        CDEBUG(D_OTHER, "\tlmm_magic: %#x\n", lumj->lmm_magic);
+        CDEBUG(D_OTHER, "\tlmm_pattern: %#x\n", lumj->lmm_pattern);
+        CDEBUG(D_OTHER, "\tlmm_object_id: "LPU64"\n", lumj->lmm_object_id);
+        CDEBUG(D_OTHER, "\tlmm_object_gr: "LPU64"\n", lumj->lmm_object_gr);
+        CDEBUG(D_OTHER, "\tlmm_stripe_size: %#x\n", lumj->lmm_stripe_size);
+        CDEBUG(D_OTHER, "\tlmm_stripe_count: %#x\n", lumj->lmm_stripe_count);
+        CDEBUG(D_OTHER, "\tlmm_extent_count: %#x\n", lumj->lmm_extent_count);
+}
+
+void lustre_swab_lov_user_md_join(struct lov_user_md_join *lumj)
+{       /* Byte-swap the lmm_* fields listed below, then log them. */
+        ENTRY;
+        CDEBUG(D_IOCTL, "swabbing lov_user_md_join\n");
+        __swab32s(&lumj->lmm_magic);
+        __swab32s(&lumj->lmm_pattern);
+        __swab64s(&lumj->lmm_object_id);
+        __swab64s(&lumj->lmm_object_gr);
+        __swab32s(&lumj->lmm_stripe_size);
+        __swab32s(&lumj->lmm_stripe_count);
+        __swab32s(&lumj->lmm_extent_count);
+        print_lumj(lumj);       /* logs the post-swab values */
+        EXIT;
+}
+
 static void print_lum_objs(struct lov_user_md *lum)
 {
         struct lov_user_ost_data *lod;
@@ -834,6 +883,7 @@ void lustre_swab_ldlm_intent (struct ldlm_intent *i)
 void lustre_swab_ldlm_resource_desc (struct ldlm_resource_desc *r)
 {
         __swab32s (&r->lr_type);
+        CLASSERT(offsetof(typeof(*r), lr_padding) != 0); /* pad: not swabbed */
         lustre_swab_ldlm_res_id (&r->lr_name);
 }
 
@@ -848,6 +898,7 @@ void lustre_swab_ldlm_lock_desc (struct ldlm_lock_desc *l)
 void lustre_swab_ldlm_request (struct ldlm_request *rq)
 {
         __swab32s (&rq->lock_flags);
+        CLASSERT(offsetof(typeof(*rq), lock_padding) != 0);
         lustre_swab_ldlm_lock_desc (&rq->lock_desc);
         /* lock_handle1 opaque */
         /* lock_handle2 opaque */
@@ -856,35 +907,13 @@ void lustre_swab_ldlm_request (struct ldlm_request *rq)
 void lustre_swab_ldlm_reply (struct ldlm_reply *r)
 {
         __swab32s (&r->lock_flags);
+        CLASSERT(offsetof(typeof(*r), lock_padding) != 0); /* pad: not swabbed */
         lustre_swab_ldlm_lock_desc (&r->lock_desc);
         /* lock_handle opaque */
         __swab64s (&r->lock_policy_res1);
         __swab64s (&r->lock_policy_res2);
 }
 
-void lustre_swab_ptlbd_op (struct ptlbd_op *op)
-{
-        __swab16s (&op->op_cmd);
-        __swab16s (&op->op_lun);
-        __swab16s (&op->op_niob_cnt);
-        /* ignore op__padding */
-        __swab32s (&op->op_block_cnt);
-}
-
-void lustre_swab_ptlbd_niob (struct ptlbd_niob *n)
-{
-        __swab64s (&n->n_xid);
-        __swab64s (&n->n_block_nr);
-        __swab32s (&n->n_offset);
-        __swab32s (&n->n_length);
-}
-
-void lustre_swab_ptlbd_rsp (struct ptlbd_rsp *r)
-{
-        __swab16s (&r->r_status);
-        __swab16s (&r->r_error_cnt);
-}
-
 /* no one calls this */
 int llog_log_swabbed(struct llog_log_hdr *hdr)
 {
@@ -906,8 +935,8 @@ void lustre_swab_qdata(struct qunit_data *d)
 void lustre_assert_wire_constants(void)
 {
         /* Wire protocol assertions generated by 'wirecheck'
-         * running on Linux mustang 2.6.12-1.1456_FC4smp #1 SMP Thu Sep 22 02:22:14 EDT 2005 i686 i68
-         * with gcc version 4.0.1 20050727 (Red Hat 4.0.1-5) */
+         * running on Linux schatzie.adilger.int 2.6.12-1.1381_FC3 #1 Fri Oct 21 03:46:55 EDT 2005 i6
+         * with gcc version 3.3.4 20040817 (Red Hat Linux 3.3.4-2) */
 
 
         /* Constants... */
@@ -1039,8 +1068,6 @@ void lustre_assert_wire_constants(void)
                  (long long)MDS_STATUS_CONN);
         LASSERTF(MDS_STATUS_LOV == 2, " found %lld\n",
                  (long long)MDS_STATUS_LOV);
-        LASSERTF(MDS_OPEN_HAS_EA == 1073741824, " found %lld\n",
-                 (long long)MDS_OPEN_HAS_EA);
         LASSERTF(LDLM_ENQUEUE == 101, " found %lld\n",
                  (long long)LDLM_ENQUEUE);
         LASSERTF(LDLM_CONVERT == 102, " found %lld\n",
@@ -1051,6 +1078,8 @@ void lustre_assert_wire_constants(void)
                  (long long)LDLM_BL_CALLBACK);
         LASSERTF(LDLM_CP_CALLBACK == 105, " found %lld\n",
                  (long long)LDLM_CP_CALLBACK);
+        LASSERTF(LDLM_GL_CALLBACK == 106, " found %lld\n",
+                 (long long)LDLM_GL_CALLBACK);
         LASSERTF(LDLM_LAST_OPC == 107, " found %lld\n",
                  (long long)LDLM_LAST_OPC);
         LASSERTF(LCK_EX == 1, " found %lld\n",
@@ -1065,26 +1094,14 @@ void lustre_assert_wire_constants(void)
                  (long long)LCK_CR);
         LASSERTF(LCK_NL == 32, " found %lld\n",
                  (long long)LCK_NL);
-        LASSERTF(PTLBD_QUERY == 200, " found %lld\n",
-                 (long long)PTLBD_QUERY);
-        LASSERTF(PTLBD_READ == 201, " found %lld\n",
-                 (long long)PTLBD_READ);
-        LASSERTF(PTLBD_WRITE == 202, " found %lld\n",
-                 (long long)PTLBD_WRITE);
-        LASSERTF(PTLBD_FLUSH == 203, " found %lld\n",
-                 (long long)PTLBD_FLUSH);
-        LASSERTF(PTLBD_CONNECT == 204, " found %lld\n",
-                 (long long)PTLBD_CONNECT);
-        LASSERTF(PTLBD_DISCONNECT == 205, " found %lld\n",
-                 (long long)PTLBD_DISCONNECT);
-        LASSERTF(PTLBD_LAST_OPC == 206, " found %lld\n",
-                 (long long)PTLBD_LAST_OPC);
-        LASSERTF(MGMT_CONNECT == 250, " found %lld\n",
-                 (long long)MGMT_CONNECT);
-        LASSERTF(MGMT_DISCONNECT == 251, " found %lld\n",
-                 (long long)MGMT_DISCONNECT);
-        LASSERTF(MGMT_EXCEPTION == 252, " found %lld\n",
-                 (long long)MGMT_EXCEPTION);
+        LASSERTF(LCK_GROUP == 64, " found %lld\n",
+                 (long long)LCK_GROUP);
+        LASSERTF(LCK_MAXMODE == 65, " found %lld\n",
+                 (long long)LCK_MAXMODE);
+        CLASSERT(LDLM_PLAIN == 10);
+        CLASSERT(LDLM_EXTENT == 11);
+        CLASSERT(LDLM_FLOCK == 12);
+        CLASSERT(LDLM_IBITS == 13);
         LASSERTF(OBD_PING == 400, " found %lld\n",
                  (long long)OBD_PING);
         LASSERTF(OBD_LOG_CANCEL == 401, " found %lld\n",
@@ -1097,6 +1114,19 @@ void lustre_assert_wire_constants(void)
                  (long long)QUOTA_DQACQ);
         LASSERTF(QUOTA_DQREL == 602, " found %lld\n",
                  (long long)QUOTA_DQREL);
+        CLASSERT(OBD_CONNECT_RDONLY == 0x1ULL);
+        CLASSERT(OBD_CONNECT_INDEX == 0x2ULL);
+        CLASSERT(OBD_CONNECT_GRANT == 0x8ULL);
+        CLASSERT(OBD_CONNECT_SRVLOCK == 0x10ULL);
+        CLASSERT(OBD_CONNECT_VERSION == 0x20ULL);
+        CLASSERT(OBD_CONNECT_REQPORTAL == 0x40ULL);
+        CLASSERT(OBD_CONNECT_ACL == 0x80ULL);
+        CLASSERT(OBD_CONNECT_XATTR == 0x100ULL);
+        CLASSERT(OBD_CONNECT_CROW == 0x200ULL);
+        CLASSERT(OBD_CONNECT_TRUNCLOCK == 0x400ULL);
+        CLASSERT(OBD_CONNECT_TRANSNO == 0x800ULL);
+        CLASSERT(OBD_CONNECT_IBITS == 0x1000ULL);
+        CLASSERT(OBD_CONNECT_JOIN == 0x2000ULL);
         /* Sizes and Offsets */
 
 
@@ -1243,92 +1273,66 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct obdo, o_mds));
         LASSERTF((int)sizeof(((struct obdo *)0)->o_mds) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct obdo *)0)->o_mds));
+        LASSERTF((int)offsetof(struct obdo, o_stripe_idx) == 120, " found %lld\n",
+                 (long long)(int)offsetof(struct obdo, o_stripe_idx));
+        LASSERTF((int)sizeof(((struct obdo *)0)->o_stripe_idx) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct obdo *)0)->o_stripe_idx));
+        LASSERTF((int)offsetof(struct obdo, o_padding_1) == 124, " found %lld\n",
+                 (long long)(int)offsetof(struct obdo, o_padding_1));
+        LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_1) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct obdo *)0)->o_padding_1));
         LASSERTF((int)offsetof(struct obdo, o_inline) == 128, " found %lld\n",
                  (long long)(int)offsetof(struct obdo, o_inline));
         LASSERTF((int)sizeof(((struct obdo *)0)->o_inline) == 80, " found %lld\n",
                  (long long)(int)sizeof(((struct obdo *)0)->o_inline));
         LASSERTF(OBD_INLINESZ == 80, " found %lld\n",
                  (long long)OBD_INLINESZ);
-        LASSERTF(OBD_MD_FLID == 1, " found %lld\n",
-                 (long long)OBD_MD_FLID);
-        LASSERTF(OBD_MD_FLATIME == 2, " found %lld\n",
-                 (long long)OBD_MD_FLATIME);
-        LASSERTF(OBD_MD_FLMTIME == 4, " found %lld\n",
-                 (long long)OBD_MD_FLMTIME);
-        LASSERTF(OBD_MD_FLCTIME == 8, " found %lld\n",
-                 (long long)OBD_MD_FLCTIME);
-        LASSERTF(OBD_MD_FLSIZE == 16, " found %lld\n",
-                 (long long)OBD_MD_FLSIZE);
-        LASSERTF(OBD_MD_FLBLOCKS == 32, " found %lld\n",
-                 (long long)OBD_MD_FLBLOCKS);
-        LASSERTF(OBD_MD_FLBLKSZ == 64, " found %lld\n",
-                 (long long)OBD_MD_FLBLKSZ);
-        LASSERTF(OBD_MD_FLMODE == 128, " found %lld\n",
-                 (long long)OBD_MD_FLMODE);
-        LASSERTF(OBD_MD_FLTYPE == 256, " found %lld\n",
-                 (long long)OBD_MD_FLTYPE);
-        LASSERTF(OBD_MD_FLUID == 512, " found %lld\n",
-                 (long long)OBD_MD_FLUID);
-        LASSERTF(OBD_MD_FLGID == 1024, " found %lld\n",
-                 (long long)OBD_MD_FLGID);
-        LASSERTF(OBD_MD_FLFLAGS == 2048, " found %lld\n",
-                 (long long)OBD_MD_FLFLAGS);
-        LASSERTF(OBD_MD_FLNLINK == 8192, " found %lld\n",
-                 (long long)OBD_MD_FLNLINK);
-        LASSERTF(OBD_MD_FLGENER == 16384, " found %lld\n",
-                 (long long)OBD_MD_FLGENER);
-        LASSERTF(OBD_MD_FLINLINE == 32768, " found %lld\n",
-                 (long long)OBD_MD_FLINLINE);
-        LASSERTF(OBD_MD_FLRDEV == 65536, " found %lld\n",
-                 (long long)OBD_MD_FLRDEV);
-        LASSERTF(OBD_MD_FLEASIZE == 131072, " found %lld\n",
-                 (long long)OBD_MD_FLEASIZE);
-        LASSERTF(OBD_MD_LINKNAME == 262144, " found %lld\n",
-                 (long long)OBD_MD_LINKNAME);
-        LASSERTF(OBD_MD_FLHANDLE == 524288, " found %lld\n",
-                 (long long)OBD_MD_FLHANDLE);
-        LASSERTF(OBD_MD_FLCKSUM == 1048576, " found %lld\n",
-                 (long long)OBD_MD_FLCKSUM);
-        LASSERTF(OBD_MD_FLQOS == 2097152, " found %lld\n",
-                 (long long)OBD_MD_FLQOS);
-        LASSERTF(OBD_MD_FLCOOKIE == 8388608, " found %lld\n",
-                 (long long)OBD_MD_FLCOOKIE);
-        LASSERTF(OBD_MD_FLGROUP == 16777216, " found %lld\n",
-                 (long long)OBD_MD_FLGROUP);
-        LASSERTF(OBD_MD_FLFID == 33554432, " found %lld\n",
-                 (long long)OBD_MD_FLFID);
-        LASSERTF(OBD_MD_FLEPOCH == 67108864, " found %lld\n",
-                 (long long)OBD_MD_FLEPOCH);
-        LASSERTF(OBD_MD_FLGRANT == 134217728, " found %lld\n",
-                 (long long)OBD_MD_FLGRANT);
-        LASSERTF(OBD_MD_FLDIREA == 268435456, " found %lld\n",
-                 (long long)OBD_MD_FLDIREA);
-        LASSERTF(OBD_MD_FLUSRQUOTA == 536870912, " found %lld\n",
-                 (long long)OBD_MD_FLUSRQUOTA);
-        LASSERTF(OBD_MD_FLGRPQUOTA == 1073741824, " found %lld\n",
-                 (long long)OBD_MD_FLGRPQUOTA);
-        LASSERTF(OBD_MD_MDS == 4294967296ULL, " found %lld\n",
-                 (long long)OBD_MD_MDS);
-        LASSERTF(OBD_MD_REINT == 8589934592ULL, " found %lld\n",
-                 (long long)OBD_MD_REINT);
-        LASSERTF(OBD_FL_INLINEDATA == 1, " found %lld\n",
-                 (long long)OBD_FL_INLINEDATA);
-        LASSERTF(OBD_FL_OBDMDEXISTS == 2, " found %lld\n",
-                 (long long)OBD_FL_OBDMDEXISTS);
-        LASSERTF(OBD_FL_DELORPHAN == 4, " found %lld\n",
-                 (long long)OBD_FL_DELORPHAN);
-        LASSERTF(OBD_FL_NORPC == 8, " found %lld\n",
-                 (long long)OBD_FL_NORPC);
-        LASSERTF(OBD_FL_IDONLY == 16, " found %lld\n",
-                 (long long)OBD_FL_IDONLY);
-        LASSERTF(OBD_FL_RECREATE_OBJS == 32, " found %lld\n",
-                 (long long)OBD_FL_RECREATE_OBJS);
-        LASSERTF(OBD_FL_DEBUG_CHECK == 64, " found %lld\n",
-                 (long long)OBD_FL_DEBUG_CHECK);
-        LASSERTF(OBD_FL_NO_USRQUOTA == 256, " found %lld\n",
-                 (long long)OBD_FL_NO_USRQUOTA);
-        LASSERTF(OBD_FL_NO_GRPQUOTA == 512, " found %lld\n",
-                 (long long)OBD_FL_NO_GRPQUOTA);
+        CLASSERT(OBD_MD_FLID == (0x00000001ULL));
+        CLASSERT(OBD_MD_FLATIME == (0x00000002ULL));
+        CLASSERT(OBD_MD_FLMTIME == (0x00000004ULL));
+        CLASSERT(OBD_MD_FLCTIME == (0x00000008ULL));
+        CLASSERT(OBD_MD_FLSIZE == (0x00000010ULL));
+        CLASSERT(OBD_MD_FLBLOCKS == (0x00000020ULL));
+        CLASSERT(OBD_MD_FLBLKSZ == (0x00000040ULL));
+        CLASSERT(OBD_MD_FLMODE == (0x00000080ULL));
+        CLASSERT(OBD_MD_FLTYPE == (0x00000100ULL));
+        CLASSERT(OBD_MD_FLUID == (0x00000200ULL));
+        CLASSERT(OBD_MD_FLGID == (0x00000400ULL));
+        CLASSERT(OBD_MD_FLFLAGS == (0x00000800ULL));
+        CLASSERT(OBD_MD_FLNLINK == (0x00002000ULL));
+        CLASSERT(OBD_MD_FLGENER == (0x00004000ULL));
+        CLASSERT(OBD_MD_FLINLINE == (0x00008000ULL));
+        CLASSERT(OBD_MD_FLRDEV == (0x00010000ULL));
+        CLASSERT(OBD_MD_FLEASIZE == (0x00020000ULL));
+        CLASSERT(OBD_MD_LINKNAME == (0x00040000ULL));
+        CLASSERT(OBD_MD_FLHANDLE == (0x00080000ULL));
+        CLASSERT(OBD_MD_FLCKSUM == (0x00100000ULL));
+        CLASSERT(OBD_MD_FLQOS == (0x00200000ULL));
+        CLASSERT(OBD_MD_FLCOOKIE == (0x00800000ULL));
+        CLASSERT(OBD_MD_FLGROUP == (0x01000000ULL));
+        CLASSERT(OBD_MD_FLFID == (0x02000000ULL));
+        CLASSERT(OBD_MD_FLEPOCH == (0x04000000ULL));
+        CLASSERT(OBD_MD_FLGRANT == (0x08000000ULL));
+        CLASSERT(OBD_MD_FLDIREA == (0x10000000ULL));
+        CLASSERT(OBD_MD_FLUSRQUOTA == (0x20000000ULL));
+        CLASSERT(OBD_MD_FLGRPQUOTA == (0x40000000ULL));
+        CLASSERT(OBD_MD_FLMODEASIZE == (0x80000000ULL));
+        CLASSERT(OBD_MD_MDS == (0x0000000100000000ULL));
+        CLASSERT(OBD_MD_REINT == (0x0000000200000000ULL));
+        CLASSERT(OBD_MD_FLXATTR == (0x0000001000000000ULL));
+        CLASSERT(OBD_MD_FLXATTRLS == (0x0000002000000000ULL));
+        CLASSERT(OBD_MD_FLXATTRRM == (0x0000004000000000ULL));
+        CLASSERT(OBD_MD_FLACL == (0x0000008000000000ULL));
+        CLASSERT(OBD_FL_INLINEDATA == (0x00000001));
+        CLASSERT(OBD_FL_OBDMDEXISTS == (0x00000002));
+        CLASSERT(OBD_FL_DELORPHAN == (0x00000004));
+        CLASSERT(OBD_FL_NORPC == (0x00000008));
+        CLASSERT(OBD_FL_IDONLY == (0x00000010));
+        CLASSERT(OBD_FL_RECREATE_OBJS == (0x00000020));
+        CLASSERT(OBD_FL_DEBUG_CHECK == (0x00000040));
+        CLASSERT(OBD_FL_NO_USRQUOTA == (0x00000100));
+        CLASSERT(OBD_FL_NO_GRPQUOTA == (0x00000200));
+        CLASSERT(OBD_FL_CREATE_CROW == (0x00000400));
 
         /* Checks for struct lov_mds_md_v1 */
         LASSERTF((int)sizeof(struct lov_mds_md_v1) == 32, " found %lld\n",
@@ -1381,13 +1385,29 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_idx));
         LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx));
-        LASSERTF(LOV_MAGIC_V1 == 198249424, " found %lld\n",
-                 (long long)LOV_MAGIC_V1);
+        CLASSERT(LOV_MAGIC_V1 == 0x0BD10BD0);
+        CLASSERT(LOV_MAGIC_JOIN == 0x0BD20BD0);
         LASSERTF(LOV_PATTERN_RAID0 == 1, " found %lld\n",
                  (long long)LOV_PATTERN_RAID0);
         LASSERTF(LOV_PATTERN_RAID1 == 2, " found %lld\n",
                  (long long)LOV_PATTERN_RAID1);
 
+        /* Checks for struct lov_mds_md_join */
+        LASSERTF((int)sizeof(struct lov_mds_md_join) == 56, " found %lld\n",
+                 (long long)(int)sizeof(struct lov_mds_md_join));
+        LASSERTF((int)offsetof(struct lov_mds_md_join, lmmj_md) == 0, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_mds_md_join, lmmj_md));
+        LASSERTF((int)sizeof(((struct lov_mds_md_join *)0)->lmmj_md) == 32, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_mds_md_join *)0)->lmmj_md));
+        LASSERTF((int)offsetof(struct lov_mds_md_join, lmmj_array_id) == 32, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_mds_md_join, lmmj_array_id));
+        LASSERTF((int)sizeof(((struct lov_mds_md_join *)0)->lmmj_array_id) == 20, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_mds_md_join *)0)->lmmj_array_id));
+        LASSERTF((int)offsetof(struct lov_mds_md_join, lmmj_extent_count) == 52, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_mds_md_join, lmmj_extent_count));
+        LASSERTF((int)sizeof(((struct lov_mds_md_join *)0)->lmmj_extent_count) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_mds_md_join *)0)->lmmj_extent_count));
+
         /* Checks for struct obd_statfs */
         LASSERTF((int)sizeof(struct obd_statfs) == 144, " found %lld\n",
                  (long long)(int)sizeof(struct obd_statfs));
@@ -1427,6 +1447,42 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct obd_statfs, os_state));
         LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_state) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct obd_statfs *)0)->os_state));
+        LASSERTF((int)offsetof(struct obd_statfs, os_spare1) == 108, " found %lld\n",
+                 (long long)(int)offsetof(struct obd_statfs, os_spare1));
+        LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare1) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare1));
+        LASSERTF((int)offsetof(struct obd_statfs, os_spare2) == 112, " found %lld\n",
+                 (long long)(int)offsetof(struct obd_statfs, os_spare2));
+        LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare2) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare2));
+        LASSERTF((int)offsetof(struct obd_statfs, os_spare3) == 116, " found %lld\n",
+                 (long long)(int)offsetof(struct obd_statfs, os_spare3));
+        LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare3) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare3));
+        LASSERTF((int)offsetof(struct obd_statfs, os_spare4) == 120, " found %lld\n",
+                 (long long)(int)offsetof(struct obd_statfs, os_spare4));
+        LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare4) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare4));
+        LASSERTF((int)offsetof(struct obd_statfs, os_spare5) == 124, " found %lld\n",
+                 (long long)(int)offsetof(struct obd_statfs, os_spare5));
+        LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare5) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare5));
+        LASSERTF((int)offsetof(struct obd_statfs, os_spare6) == 128, " found %lld\n",
+                 (long long)(int)offsetof(struct obd_statfs, os_spare6));
+        LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare6) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare6));
+        LASSERTF((int)offsetof(struct obd_statfs, os_spare7) == 132, " found %lld\n",
+                 (long long)(int)offsetof(struct obd_statfs, os_spare7));
+        LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare7) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare7));
+        LASSERTF((int)offsetof(struct obd_statfs, os_spare8) == 136, " found %lld\n",
+                 (long long)(int)offsetof(struct obd_statfs, os_spare8));
+        LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare8) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare8));
+        LASSERTF((int)offsetof(struct obd_statfs, os_spare9) == 140, " found %lld\n",
+                 (long long)(int)offsetof(struct obd_statfs, os_spare9));
+        LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare9) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare9));
 
         /* Checks for struct obd_ioobj */
         LASSERTF((int)sizeof(struct obd_ioobj) == 24, " found %lld\n",
@@ -1535,6 +1591,18 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct obd_dqblk, dqb_valid));
         LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_valid) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_valid));
+        LASSERTF((int)offsetof(struct obd_dqblk, padding) == 68, " found %lld\n",
+                 (long long)(int)offsetof(struct obd_dqblk, padding));
+        LASSERTF((int)sizeof(((struct obd_dqblk *)0)->padding) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct obd_dqblk *)0)->padding));
+        LASSERTF(Q_QUOTACHECK == 0x800100," found %lld\n",
+                 (long long)Q_QUOTACHECK);
+        LASSERTF(Q_INITQUOTA == 0x800101," found %lld\n",
+                 (long long)Q_INITQUOTA);
+        LASSERTF(Q_GETOINFO == 0x800102," found %lld\n",
+                 (long long)Q_GETOINFO);
+        LASSERTF(Q_GETOQUOTA == 0x800103," found %lld\n",
+                 (long long)Q_GETOQUOTA);
 
         /* Checks for struct niobuf_remote */
         LASSERTF((int)sizeof(struct niobuf_remote) == 16, " found %lld\n",
@@ -1697,14 +1765,14 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct mds_body, aclsize));
         LASSERTF((int)sizeof(((struct mds_body *)0)->aclsize) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct mds_body *)0)->aclsize));
-        LASSERTF((int)offsetof(struct mds_body, padding_2) == 156, " found %lld\n",
-                 (long long)(int)offsetof(struct mds_body, padding_2));
-        LASSERTF((int)sizeof(((struct mds_body *)0)->padding_2) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct mds_body *)0)->padding_2));
-        LASSERTF((int)offsetof(struct mds_body, padding_3) == 160, " found %lld\n",
-                 (long long)(int)offsetof(struct mds_body, padding_3));
-        LASSERTF((int)sizeof(((struct mds_body *)0)->padding_3) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct mds_body *)0)->padding_3));
+        LASSERTF((int)offsetof(struct mds_body, max_mdsize) == 156, " found %lld\n",
+                 (long long)(int)offsetof(struct mds_body, max_mdsize));
+        LASSERTF((int)sizeof(((struct mds_body *)0)->max_mdsize) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct mds_body *)0)->max_mdsize));
+        LASSERTF((int)offsetof(struct mds_body, max_cookiesize) == 160, " found %lld\n",
+                 (long long)(int)offsetof(struct mds_body, max_cookiesize));
+        LASSERTF((int)sizeof(((struct mds_body *)0)->max_cookiesize) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct mds_body *)0)->max_cookiesize));
         LASSERTF((int)offsetof(struct mds_body, padding_4) == 164, " found %lld\n",
                  (long long)(int)offsetof(struct mds_body, padding_4));
         LASSERTF((int)sizeof(((struct mds_body *)0)->padding_4) == 4, " found %lld\n",
@@ -1715,22 +1783,20 @@ void lustre_assert_wire_constants(void)
                  (long long)FMODE_WRITE);
         LASSERTF(FMODE_EXEC == 4, " found %lld\n",
                  (long long)FMODE_EXEC);
-        LASSERTF(MDS_OPEN_CREAT == 64, " found %lld\n",
-                 (long long)MDS_OPEN_CREAT);
-        LASSERTF(MDS_OPEN_EXCL == 128, " found %lld\n",
-                 (long long)MDS_OPEN_EXCL);
-        LASSERTF(MDS_OPEN_TRUNC == 512, " found %lld\n",
-                 (long long)MDS_OPEN_TRUNC);
-        LASSERTF(MDS_OPEN_APPEND == 1024, " found %lld\n",
-                 (long long)MDS_OPEN_APPEND);
-        LASSERTF(MDS_OPEN_SYNC == 4096, " found %lld\n",
-                 (long long)MDS_OPEN_SYNC);
-        LASSERTF(MDS_OPEN_DIRECTORY == 65536, " found %lld\n",
-                 (long long)MDS_OPEN_DIRECTORY);
-        LASSERTF(MDS_OPEN_DELAY_CREATE == 16777216, " found %lld\n",
-                 (long long)MDS_OPEN_DELAY_CREATE);
-        LASSERTF(MDS_OPEN_HAS_EA == 1073741824, " found %lld\n",
-                 (long long)MDS_OPEN_HAS_EA);
+        CLASSERT(MDS_OPEN_CREAT == 00000100);
+        CLASSERT(MDS_OPEN_EXCL == 00000200);
+        CLASSERT(MDS_OPEN_TRUNC == 00001000);
+        CLASSERT(MDS_OPEN_APPEND == 00002000);
+        CLASSERT(MDS_OPEN_SYNC == 00010000);
+        CLASSERT(MDS_OPEN_DIRECTORY == 00200000);
+        CLASSERT(MDS_OPEN_DELAY_CREATE == 0100000000);
+        CLASSERT(MDS_OPEN_OWNEROVERRIDE == 0200000000);
+        CLASSERT(MDS_OPEN_JOIN_FILE == 0400000000);
+        CLASSERT(MDS_OPEN_HAS_EA == 010000000000);
+        CLASSERT(MDS_OPEN_HAS_OBJS == 020000000000);
+        CLASSERT(MDS_INODELOCK_LOOKUP == 0x000001);
+        CLASSERT(MDS_INODELOCK_UPDATE == 0x000002);
+        CLASSERT(MDS_INODELOCK_OPEN == 0x000004);
 
         /* Checks for struct mds_rec_setattr */
         LASSERTF((int)sizeof(struct mds_rec_setattr) == 96, " found %lld\n",
@@ -1964,6 +2030,18 @@ void lustre_assert_wire_constants(void)
         LASSERTF((int)sizeof(((struct mds_rec_rename *)0)->rn_time) == 8, " found %lld\n",
                  (long long)(int)sizeof(((struct mds_rec_rename *)0)->rn_time));
 
+        /* Checks for struct mds_rec_join */
+        LASSERTF((int)sizeof(struct mds_rec_join) == 24, " found %lld\n",
+                 (long long)(int)sizeof(struct mds_rec_join));
+        LASSERTF((int)offsetof(struct mds_rec_join, jr_fid) == 0, " found %lld\n",
+                 (long long)(int)offsetof(struct mds_rec_join, jr_fid));
+        LASSERTF((int)sizeof(((struct mds_rec_join *)0)->jr_fid) == 16, " found %lld\n",
+                 (long long)(int)sizeof(((struct mds_rec_join *)0)->jr_fid));
+        LASSERTF((int)offsetof(struct mds_rec_join, jr_headsize) == 16, " found %lld\n",
+                 (long long)(int)offsetof(struct mds_rec_join, jr_headsize));
+        LASSERTF((int)sizeof(((struct mds_rec_join *)0)->jr_headsize) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct mds_rec_join *)0)->jr_headsize));
+
         /* Checks for struct lov_desc */
         LASSERTF((int)sizeof(struct lov_desc) == 88, " found %lld\n",
                  (long long)(int)sizeof(struct lov_desc));
@@ -1991,14 +2069,26 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct lov_desc, ld_default_stripe_offset));
         LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset) == 8, " found %lld\n",
                  (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset));
-        LASSERTF((int)offsetof(struct lov_desc, ld_qos_threshold) == 32, " found %lld\n",
-                 (long long)(int)offsetof(struct lov_desc, ld_qos_threshold));
-        LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_qos_threshold) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct lov_desc *)0)->ld_qos_threshold));
-        LASSERTF((int)offsetof(struct lov_desc, ld_qos_maxage) == 36, " found %lld\n",
-                 (long long)(int)offsetof(struct lov_desc, ld_qos_maxage));
-        LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_qos_maxage) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct lov_desc *)0)->ld_qos_maxage));
+        LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_offset) == 24, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_desc, ld_default_stripe_offset));
+        LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset));
+        LASSERTF((int)offsetof(struct lov_desc, ld_padding_1) == 32, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_desc, ld_padding_1));
+        LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_1) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_1));
+        LASSERTF((int)offsetof(struct lov_desc, ld_padding_2) == 36, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_desc, ld_padding_2));
+        LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_2) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_2));
+        LASSERTF((int)offsetof(struct lov_desc, ld_padding_3) == 40, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_desc, ld_padding_3));
+        LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_3) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_3));
+        LASSERTF((int)offsetof(struct lov_desc, ld_padding_4) == 44, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_desc, ld_padding_4));
+        LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_4) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_4));
         LASSERTF((int)offsetof(struct lov_desc, ld_uuid) == 48, " found %lld\n",
                  (long long)(int)offsetof(struct lov_desc, ld_uuid));
         LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_uuid) == 40, " found %lld\n",
@@ -2048,6 +2138,14 @@ void lustre_assert_wire_constants(void)
         LASSERTF((int)sizeof(((struct ldlm_flock *)0)->pid) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct ldlm_flock *)0)->pid));
 
+        /* Checks for struct ldlm_inodebits */
+        LASSERTF((int)sizeof(struct ldlm_inodebits) == 8, " found %lld\n",
+                 (long long)(int)sizeof(struct ldlm_inodebits));
+        LASSERTF((int)offsetof(struct ldlm_inodebits, bits) == 0, " found %lld\n",
+                 (long long)(int)offsetof(struct ldlm_inodebits, bits));
+        LASSERTF((int)sizeof(((struct ldlm_inodebits *)0)->bits) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct ldlm_inodebits *)0)->bits));
+
         /* Checks for struct ldlm_intent */
         LASSERTF((int)sizeof(struct ldlm_intent) == 8, " found %lld\n",
                  (long long)(int)sizeof(struct ldlm_intent));
@@ -2063,6 +2161,10 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct ldlm_resource_desc, lr_type));
         LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_type) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_type));
+        LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_padding) == 4, " found %lld\n",
+                 (long long)(int)offsetof(struct ldlm_resource_desc, lr_padding));
+        LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_padding) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_padding));
         LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_name) == 8, " found %lld\n",
                  (long long)(int)offsetof(struct ldlm_resource_desc, lr_name));
         LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_name) == 32, " found %lld\n",
@@ -2095,6 +2197,10 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct ldlm_request, lock_flags));
         LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_flags) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct ldlm_request *)0)->lock_flags));
+        LASSERTF((int)offsetof(struct ldlm_request, lock_padding) == 4, " found %lld\n",
+                 (long long)(int)offsetof(struct ldlm_request, lock_padding));
+        LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_padding) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct ldlm_request *)0)->lock_padding));
         LASSERTF((int)offsetof(struct ldlm_request, lock_desc) == 8, " found %lld\n",
                  (long long)(int)offsetof(struct ldlm_request, lock_desc));
         LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_desc) == 80, " found %lld\n",
@@ -2115,6 +2221,10 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct ldlm_reply, lock_flags));
         LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_flags) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_flags));
+        LASSERTF((int)offsetof(struct ldlm_request, lock_padding) == 4, " found %lld\n",
+                 (long long)(int)offsetof(struct ldlm_request, lock_padding));
+        LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_padding) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct ldlm_request *)0)->lock_padding));
         LASSERTF((int)offsetof(struct ldlm_request, lock_desc) == 8, " found %lld\n",
                  (long long)(int)offsetof(struct ldlm_request, lock_desc));
         LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_desc) == 80, " found %lld\n",
@@ -2156,62 +2266,6 @@ void lustre_assert_wire_constants(void)
         LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_blocks) == 8, " found %lld\n",
                  (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_blocks));
 
-        /* Checks for struct ptlbd_op */
-        LASSERTF((int)sizeof(struct ptlbd_op) == 12, " found %lld\n",
-                 (long long)(int)sizeof(struct ptlbd_op));
-        LASSERTF((int)offsetof(struct ptlbd_op, op_cmd) == 0, " found %lld\n",
-                 (long long)(int)offsetof(struct ptlbd_op, op_cmd));
-        LASSERTF((int)sizeof(((struct ptlbd_op *)0)->op_cmd) == 2, " found %lld\n",
-                 (long long)(int)sizeof(((struct ptlbd_op *)0)->op_cmd));
-        LASSERTF((int)offsetof(struct ptlbd_op, op_lun) == 2, " found %lld\n",
-                 (long long)(int)offsetof(struct ptlbd_op, op_lun));
-        LASSERTF((int)sizeof(((struct ptlbd_op *)0)->op_lun) == 2, " found %lld\n",
-                 (long long)(int)sizeof(((struct ptlbd_op *)0)->op_lun));
-        LASSERTF((int)offsetof(struct ptlbd_op, op_niob_cnt) == 4, " found %lld\n",
-                 (long long)(int)offsetof(struct ptlbd_op, op_niob_cnt));
-        LASSERTF((int)sizeof(((struct ptlbd_op *)0)->op_niob_cnt) == 2, " found %lld\n",
-                 (long long)(int)sizeof(((struct ptlbd_op *)0)->op_niob_cnt));
-        LASSERTF((int)offsetof(struct ptlbd_op, op__padding) == 6, " found %lld\n",
-                 (long long)(int)offsetof(struct ptlbd_op, op__padding));
-        LASSERTF((int)sizeof(((struct ptlbd_op *)0)->op__padding) == 2, " found %lld\n",
-                 (long long)(int)sizeof(((struct ptlbd_op *)0)->op__padding));
-        LASSERTF((int)offsetof(struct ptlbd_op, op_block_cnt) == 8, " found %lld\n",
-                 (long long)(int)offsetof(struct ptlbd_op, op_block_cnt));
-        LASSERTF((int)sizeof(((struct ptlbd_op *)0)->op_block_cnt) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct ptlbd_op *)0)->op_block_cnt));
-
-        /* Checks for struct ptlbd_niob */
-        LASSERTF((int)sizeof(struct ptlbd_niob) == 24, " found %lld\n",
-                 (long long)(int)sizeof(struct ptlbd_niob));
-        LASSERTF((int)offsetof(struct ptlbd_niob, n_xid) == 0, " found %lld\n",
-                 (long long)(int)offsetof(struct ptlbd_niob, n_xid));
-        LASSERTF((int)sizeof(((struct ptlbd_niob *)0)->n_xid) == 8, " found %lld\n",
-                 (long long)(int)sizeof(((struct ptlbd_niob *)0)->n_xid));
-        LASSERTF((int)offsetof(struct ptlbd_niob, n_block_nr) == 8, " found %lld\n",
-                 (long long)(int)offsetof(struct ptlbd_niob, n_block_nr));
-        LASSERTF((int)sizeof(((struct ptlbd_niob *)0)->n_block_nr) == 8, " found %lld\n",
-                 (long long)(int)sizeof(((struct ptlbd_niob *)0)->n_block_nr));
-        LASSERTF((int)offsetof(struct ptlbd_niob, n_offset) == 16, " found %lld\n",
-                 (long long)(int)offsetof(struct ptlbd_niob, n_offset));
-        LASSERTF((int)sizeof(((struct ptlbd_niob *)0)->n_offset) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct ptlbd_niob *)0)->n_offset));
-        LASSERTF((int)offsetof(struct ptlbd_niob, n_length) == 20, " found %lld\n",
-                 (long long)(int)offsetof(struct ptlbd_niob, n_length));
-        LASSERTF((int)sizeof(((struct ptlbd_niob *)0)->n_length) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct ptlbd_niob *)0)->n_length));
-
-        /* Checks for struct ptlbd_rsp */
-        LASSERTF((int)sizeof(struct ptlbd_rsp) == 4, " found %lld\n",
-                 (long long)(int)sizeof(struct ptlbd_rsp));
-        LASSERTF((int)offsetof(struct ptlbd_rsp, r_status) == 0, " found %lld\n",
-                 (long long)(int)offsetof(struct ptlbd_rsp, r_status));
-        LASSERTF((int)sizeof(((struct ptlbd_rsp *)0)->r_status) == 2, " found %lld\n",
-                 (long long)(int)sizeof(((struct ptlbd_rsp *)0)->r_status));
-        LASSERTF((int)offsetof(struct ptlbd_rsp, r_error_cnt) == 2, " found %lld\n",
-                 (long long)(int)offsetof(struct ptlbd_rsp, r_error_cnt));
-        LASSERTF((int)sizeof(((struct ptlbd_rsp *)0)->r_error_cnt) == 2, " found %lld\n",
-                 (long long)(int)sizeof(((struct ptlbd_rsp *)0)->r_error_cnt));
-
         /* Checks for struct llog_logid */
         LASSERTF((int)sizeof(struct llog_logid) == 20, " found %lld\n",
                  (long long)(int)sizeof(struct llog_logid));
@@ -2227,22 +2281,16 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct llog_logid, lgl_ogen));
         LASSERTF((int)sizeof(((struct llog_logid *)0)->lgl_ogen) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct llog_logid *)0)->lgl_ogen));
-        LASSERTF(OST_SZ_REC == 274730752, " found %lld\n",
-                 (long long)OST_SZ_REC);
-        LASSERTF(OST_RAID1_REC == 274731008, " found %lld\n",
-                 (long long)OST_RAID1_REC);
-        LASSERTF(MDS_UNLINK_REC == 274801668, " found %lld\n",
-                 (long long)MDS_UNLINK_REC);
-        LASSERTF(MDS_SETATTR_REC == 274801665, " found %lld\n",
-                 (long long)MDS_SETATTR_REC);
-        LASSERTF(OBD_CFG_REC == 274857984, " found %lld\n",
-                 (long long)OBD_CFG_REC);
-        LASSERTF(LLOG_GEN_REC == 274989056, " found %lld\n",
-                 (long long)LLOG_GEN_REC);
-        LASSERTF(LLOG_HDR_MAGIC == 275010873, " found %lld\n",
-                 (long long)LLOG_HDR_MAGIC);
-        LASSERTF(LLOG_LOGID_MAGIC == 275010875, " found %lld\n",
-                 (long long)LLOG_LOGID_MAGIC);
+        CLASSERT(OST_SZ_REC == 274730752);
+        CLASSERT(OST_RAID1_REC == 274731008);
+        CLASSERT(MDS_UNLINK_REC == 274801668);
+        CLASSERT(MDS_SETATTR_REC == 274801665);
+        CLASSERT(OBD_CFG_REC == 274857984);
+        CLASSERT(PTL_CFG_REC == 274923520);
+        CLASSERT(LLOG_GEN_REC == 274989056);
+        CLASSERT(LLOG_JOIN_REC == 275054592);
+        CLASSERT(LLOG_HDR_MAGIC == 275010873);
+        CLASSERT(LLOG_LOGID_MAGIC == 275010875);
 
         /* Checks for struct llog_catid */
         LASSERTF((int)sizeof(struct llog_catid) == 32, " found %lld\n",
@@ -2251,6 +2299,18 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct llog_catid, lci_logid));
         LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_logid) == 20, " found %lld\n",
                  (long long)(int)sizeof(((struct llog_catid *)0)->lci_logid));
+        LASSERTF((int)offsetof(struct llog_catid, lci_padding1) == 20, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_catid, lci_padding1));
+        LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding1) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding1));
+        LASSERTF((int)offsetof(struct llog_catid, lci_padding2) == 24, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_catid, lci_padding2));
+        LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding2) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding2));
+        LASSERTF((int)offsetof(struct llog_catid, lci_padding3) == 28, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_catid, lci_padding3));
+        LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding3) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding3));
 
         /* Checks for struct llog_rec_hdr */
         LASSERTF((int)sizeof(struct llog_rec_hdr) == 16, " found %lld\n",
@@ -2267,6 +2327,10 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct llog_rec_hdr, lrh_type));
         LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_type) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_type));
+        LASSERTF((int)offsetof(struct llog_rec_hdr, padding) == 12, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_rec_hdr, padding));
+        LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->padding) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_rec_hdr *)0)->padding));
 
         /* Checks for struct llog_rec_tail */
         LASSERTF((int)sizeof(struct llog_rec_tail) == 8, " found %lld\n",
@@ -2291,6 +2355,26 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct llog_logid_rec, lid_id));
         LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_id) == 20, " found %lld\n",
                  (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_id));
+        LASSERTF((int)offsetof(struct llog_logid_rec, padding1) == 36, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_logid_rec, padding1));
+        LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->padding1) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_logid_rec *)0)->padding1));
+        LASSERTF((int)offsetof(struct llog_logid_rec, padding2) == 40, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_logid_rec, padding2));
+        LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->padding2) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_logid_rec *)0)->padding2));
+        LASSERTF((int)offsetof(struct llog_logid_rec, padding3) == 44, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_logid_rec, padding3));
+        LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->padding3) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_logid_rec *)0)->padding3));
+        LASSERTF((int)offsetof(struct llog_logid_rec, padding4) == 48, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_logid_rec, padding4));
+        LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->padding4) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_logid_rec *)0)->padding4));
+        LASSERTF((int)offsetof(struct llog_logid_rec, padding5) == 52, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_logid_rec, padding5));
+        LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->padding5) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_logid_rec *)0)->padding5));
         LASSERTF((int)offsetof(struct llog_logid_rec, lid_tail) == 56, " found %lld\n",
                  (long long)(int)offsetof(struct llog_logid_rec, lid_tail));
         LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_tail) == 8, " found %lld\n",
@@ -2315,6 +2399,10 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct llog_create_rec, lcr_ogen));
         LASSERTF((int)sizeof(((struct llog_create_rec *)0)->lcr_ogen) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct llog_create_rec *)0)->lcr_ogen));
+        LASSERTF((int)offsetof(struct llog_create_rec, padding) == 44, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_create_rec, padding));
+        LASSERTF((int)sizeof(((struct llog_create_rec *)0)->padding) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_create_rec *)0)->padding));
 
         /* Checks for struct llog_orphan_rec */
         LASSERTF((int)sizeof(struct llog_orphan_rec) == 40, " found %lld\n",
@@ -2331,6 +2419,10 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct llog_orphan_rec, lor_ogen));
         LASSERTF((int)sizeof(((struct llog_orphan_rec *)0)->lor_ogen) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct llog_orphan_rec *)0)->lor_ogen));
+        LASSERTF((int)offsetof(struct llog_orphan_rec, padding) == 28, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_orphan_rec, padding));
+        LASSERTF((int)sizeof(((struct llog_orphan_rec *)0)->padding) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_orphan_rec *)0)->padding));
         LASSERTF((int)offsetof(struct llog_orphan_rec, lor_tail) == 32, " found %lld\n",
                  (long long)(int)offsetof(struct llog_orphan_rec, lor_tail));
         LASSERTF((int)sizeof(((struct llog_orphan_rec *)0)->lor_tail) == 8, " found %lld\n",
@@ -2351,11 +2443,47 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct llog_unlink_rec, lur_ogen));
         LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_ogen) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_ogen));
+        LASSERTF((int)offsetof(struct llog_unlink_rec, padding) == 28, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_unlink_rec, padding));
+        LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->padding) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_unlink_rec *)0)->padding));
         LASSERTF((int)offsetof(struct llog_unlink_rec, lur_tail) == 32, " found %lld\n",
                  (long long)(int)offsetof(struct llog_unlink_rec, lur_tail));
         LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_tail) == 8, " found %lld\n",
                  (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_tail));
 
+        /* Checks for struct llog_setattr_rec */
+        LASSERTF((int)sizeof(struct llog_setattr_rec) == 48, " found %lld\n",
+                 (long long)(int)sizeof(struct llog_setattr_rec));
+        LASSERTF((int)offsetof(struct llog_setattr_rec, lsr_hdr) == 0, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr_rec, lsr_hdr));
+        LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_hdr) == 16, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_hdr));
+        LASSERTF((int)offsetof(struct llog_setattr_rec, lsr_oid) == 16, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr_rec, lsr_oid));
+        LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_oid) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_oid));
+        LASSERTF((int)offsetof(struct llog_setattr_rec, lsr_ogen) == 24, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr_rec, lsr_ogen));
+        LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_ogen) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_ogen));
+        LASSERTF((int)offsetof(struct llog_setattr_rec, lsr_uid) == 28, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr_rec, lsr_uid));
+        LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_uid) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_uid));
+        LASSERTF((int)offsetof(struct llog_setattr_rec, lsr_gid) == 32, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr_rec, lsr_gid));
+        LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_gid) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_gid));
+        LASSERTF((int)offsetof(struct llog_setattr_rec, padding) == 36, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr_rec, padding));
+        LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->padding) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr_rec *)0)->padding));
+        LASSERTF((int)offsetof(struct llog_setattr_rec, lsr_tail) == 40, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr_rec, lsr_tail));
+        LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_tail) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_tail));
+
         /* Checks for struct llog_size_change_rec */
         LASSERTF((int)sizeof(struct llog_size_change_rec) == 48, " found %lld\n",
                  (long long)(int)sizeof(struct llog_size_change_rec));
@@ -2371,6 +2499,10 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct llog_size_change_rec, lsc_io_epoch));
         LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_io_epoch) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_io_epoch));
+        LASSERTF((int)offsetof(struct llog_size_change_rec, padding) == 36, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_size_change_rec, padding));
+        LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->padding) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_size_change_rec *)0)->padding));
         LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_tail) == 40, " found %lld\n",
                  (long long)(int)offsetof(struct llog_size_change_rec, lsc_tail));
         LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_tail) == 8, " found %lld\n",
@@ -2467,6 +2599,10 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct llog_cookie, lgc_index));
         LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_index) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_index));
+        LASSERTF((int)offsetof(struct llog_cookie, lgc_padding) == 28, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_cookie, lgc_padding));
+        LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_padding) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_padding));
 
         /* Checks for struct llogd_body */
         LASSERTF((int)sizeof(struct llogd_body) == 48, " found %lld\n",
@@ -2499,20 +2635,15 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct llogd_body, lgd_cur_offset));
         LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_cur_offset) == 8, " found %lld\n",
                  (long long)(int)sizeof(((struct llogd_body *)0)->lgd_cur_offset));
-        LASSERTF(LLOG_ORIGIN_HANDLE_CREATE == 501, " found %lld\n",
-                 (long long)LLOG_ORIGIN_HANDLE_CREATE);
-        LASSERTF(LLOG_ORIGIN_HANDLE_NEXT_BLOCK == 502, " found %lld\n",
-                 (long long)LLOG_ORIGIN_HANDLE_NEXT_BLOCK);
-        LASSERTF(LLOG_ORIGIN_HANDLE_READ_HEADER == 503, " found %lld\n",
-                 (long long)LLOG_ORIGIN_HANDLE_READ_HEADER);
-        LASSERTF(LLOG_ORIGIN_HANDLE_WRITE_REC == 504, " found %lld\n",
-                 (long long)LLOG_ORIGIN_HANDLE_WRITE_REC);
-        LASSERTF(LLOG_ORIGIN_HANDLE_CLOSE == 505, " found %lld\n",
-                 (long long)LLOG_ORIGIN_HANDLE_CLOSE);
-        LASSERTF(LLOG_ORIGIN_CONNECT == 506, " found %lld\n",
-                 (long long)LLOG_ORIGIN_CONNECT);
-        LASSERTF(LLOG_CATINFO == 507, " found %lld\n",
-                 (long long)LLOG_CATINFO);
+        CLASSERT(LLOG_ORIGIN_HANDLE_CREATE == 501);
+        CLASSERT(LLOG_ORIGIN_HANDLE_NEXT_BLOCK == 502);
+        CLASSERT(LLOG_ORIGIN_HANDLE_READ_HEADER == 503);
+        CLASSERT(LLOG_ORIGIN_HANDLE_WRITE_REC == 504);
+        CLASSERT(LLOG_ORIGIN_HANDLE_CLOSE == 505);
+        CLASSERT(LLOG_ORIGIN_CONNECT == 506);
+        CLASSERT(LLOG_CATINFO == 507);
+        CLASSERT(LLOG_ORIGIN_HANDLE_PREV_BLOCK == 508);
+        CLASSERT(LLOG_ORIGIN_HANDLE_DESTROY == 509);
 
         /* Checks for struct llogd_conn_body */
         LASSERTF((int)sizeof(struct llogd_conn_body) == 40, " found %lld\n",
@@ -2530,6 +2661,38 @@ void lustre_assert_wire_constants(void)
         LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx));
 
+        /* Checks for struct llog_array_rec */
+        LASSERTF((int)sizeof(struct llog_array_rec) == 72, " found %lld\n",
+                 (long long)(int)sizeof(struct llog_array_rec));
+        LASSERTF((int)offsetof(struct llog_array_rec, lmr_hdr) == 0, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_array_rec, lmr_hdr));
+        LASSERTF((int)sizeof(((struct llog_array_rec *)0)->lmr_hdr) == 16, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_array_rec *)0)->lmr_hdr));
+        LASSERTF((int)offsetof(struct llog_array_rec, lmr_med) == 16, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_array_rec, lmr_med));
+        LASSERTF((int)sizeof(((struct llog_array_rec *)0)->lmr_med) == 48, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_array_rec *)0)->lmr_med));
+        LASSERTF((int)offsetof(struct llog_array_rec, lmr_tail) == 64, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_array_rec, lmr_tail));
+        LASSERTF((int)sizeof(((struct llog_array_rec *)0)->lmr_tail) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_array_rec *)0)->lmr_tail));
+
+        /* Checks for struct mds_extent_desc */
+        LASSERTF((int)sizeof(struct mds_extent_desc) == 48, " found %lld\n",
+                 (long long)(int)sizeof(struct mds_extent_desc));
+        LASSERTF((int)offsetof(struct mds_extent_desc, med_start) == 0, " found %lld\n",
+                 (long long)(int)offsetof(struct mds_extent_desc, med_start));
+        LASSERTF((int)sizeof(((struct mds_extent_desc *)0)->med_start) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct mds_extent_desc *)0)->med_start));
+        LASSERTF((int)offsetof(struct mds_extent_desc, med_len) == 8, " found %lld\n",
+                 (long long)(int)offsetof(struct mds_extent_desc, med_len));
+        LASSERTF((int)sizeof(((struct mds_extent_desc *)0)->med_len) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct mds_extent_desc *)0)->med_len));
+        LASSERTF((int)offsetof(struct mds_extent_desc, med_lmm) == 16, " found %lld\n",
+                 (long long)(int)offsetof(struct mds_extent_desc, med_lmm));
+        LASSERTF((int)sizeof(((struct mds_extent_desc *)0)->med_lmm) == 32, " found %lld\n",
+                 (long long)(int)sizeof(((struct mds_extent_desc *)0)->med_lmm));
+
         /* Checks for struct qunit_data */
         LASSERTF((int)sizeof(struct qunit_data) == 16, " found %lld\n",
                  (long long)(int)sizeof(struct qunit_data));
index 05c4a96..c897628 100644 (file)
@@ -45,8 +45,7 @@ int ptlrpc_ping(struct obd_import *imp)
         int rc = 0;
         ENTRY;
 
-        req = ptlrpc_prep_req(imp, OBD_PING, 0, NULL,
-                              NULL);
+        req = ptlrpc_prep_req(imp, LUSTRE_OBD_VERSION, OBD_PING, 0, NULL, NULL);
         if (req) {
                 DEBUG_REQ(D_INFO, req, "pinging %s->%s",
                           imp->imp_obd->obd_uuid.uuid,
@@ -64,70 +63,11 @@ int ptlrpc_ping(struct obd_import *imp)
         RETURN(rc);
 }
 
-static int ptlrpc_statfs_interpret(struct ptlrpc_request *req,
-                                   void *data, int rc)
-{
-        struct obd_statfs *msfs;
-        struct obd_device *obd;
-        ENTRY;
-
-        if (rc)
-                RETURN(rc);
-        
-        if (!req->rq_repmsg)
-                RETURN(-EPROTO);
-        
-        msfs = lustre_swab_repbuf(req, 0, sizeof(*msfs),
-                                  lustre_swab_obd_statfs);
-        if (msfs == NULL)
-                RETURN(-EPROTO);
-
-        obd = req->rq_import->imp_obd;
-        
-        spin_lock(&obd->obd_osfs_lock);
-        obd->obd_osfs = *msfs;
-        obd->obd_osfs_age = cfs_time_current();
-        spin_unlock(&obd->obd_osfs_lock);
-        
-        RETURN(0);
-}
-
-int ptlrpc_statfs(struct obd_import *imp)
-{
-        int size = sizeof(struct obd_statfs);
-        struct ptlrpc_request *req;
-        ENTRY;
-
-        req = ptlrpc_prep_req(imp, OST_STATFS, 0,
-                              NULL, NULL);
-        if (!req) {
-                CERROR("OOM trying to ping %s->%s\n",
-                       imp->imp_obd->obd_uuid.uuid,
-                       imp->imp_target_uuid.uuid);
-                RETURN(-ENOMEM);
-        }
-
-        DEBUG_REQ(D_INFO, req, "pinging %s->%s",
-                  imp->imp_obd->obd_uuid.uuid,
-                  imp->imp_target_uuid.uuid);
-
-        req->rq_interpret_reply = ptlrpc_statfs_interpret;
-        req->rq_replen = lustre_msg_size(1, &size);
-        req->rq_no_resend = req->rq_no_delay = 1;
-        ptlrpcd_add_req(req);
-
-        RETURN(0);
-}
-
 static void ptlrpc_update_next_ping(struct obd_import *imp)
 {
-        cfs_duration_t interval;
-
-        interval = IMP_CROW_ABLE(imp) ?
-                STATFS_INTERVAL : PING_INTERVAL;
-
         imp->imp_next_ping = cfs_time_shift(cfs_time_seconds(
-                (imp->imp_state == LUSTRE_IMP_DISCON ? 10 : interval)));
+                (imp->imp_state == LUSTRE_IMP_DISCON ? RECONNECT_INTERVAL :
+                                                       PING_INTERVAL)));
 }
 
 void ptlrpc_ping_import_soon(struct obd_import *imp)
@@ -159,8 +99,6 @@ static int ptlrpc_pinger_main(void *arg)
 
         /* And now, loop forever, pinging as needed. */
         while (1) {
-                unsigned long sleep_interval = PING_INTERVAL;
-                unsigned long update_interval = 0;
                 cfs_time_t this_ping = cfs_time_current();
                 struct l_wait_info lwi;
                 cfs_duration_t time_to_next_ping;
@@ -174,9 +112,6 @@ static int ptlrpc_pinger_main(void *arg)
                         int force, level;
                         unsigned long flags;
 
-                        if (IMP_CROW_ABLE(imp))
-                                sleep_interval = STATFS_INTERVAL;
-                        
                         spin_lock_irqsave(&imp->imp_lock, flags);
                         level = imp->imp_state;
                         force = imp->imp_force_verify;
@@ -210,10 +145,7 @@ static int ptlrpc_pinger_main(void *arg)
                                                imp->imp_deactive,
                                                imp->imp_obd->obd_no_recov);
                                 } else if (imp->imp_pingable || force) {
-                                        if (IMP_CROW_ABLE(imp))
-                                                ptlrpc_statfs(imp);
-                                        else
-                                                ptlrpc_ping(imp);
+                                        ptlrpc_ping(imp);
                                 }
                         } else {
                                 if (!imp->imp_pingable)
@@ -225,37 +157,28 @@ static int ptlrpc_pinger_main(void *arg)
                                        imp->imp_next_ping, this_ping);
                         }
 
-                        /* using here new calculated @update_interval, as
-                         * sleep_interval holds minimal of possible intervals
-                         * over pingable imports. */
-                        update_interval = IMP_CROW_ABLE(imp) ?
-                                STATFS_INTERVAL : PING_INTERVAL;
-                        
                         /* obd_timeout might have changed */
                         if (cfs_time_after(imp->imp_next_ping,
                                            cfs_time_add(this_ping, 
-                                                        cfs_time_seconds(update_interval))))
+                                                        cfs_time_seconds(PING_INTERVAL))))
                                 ptlrpc_update_next_ping(imp);
                 }
                 mutex_up(&pinger_sem);
 
-                /* Wait until the next ping time, or until we're stopped. We
-                 * sleep here smaller interval of two possible (ping or
-                 * statfs). If one of imports is CROW capable we'll sleep
-                 * STATFS_INTERVAL and PING_INTERVAL otherwise. */
+                /* Wait until the next ping time, or until we're stopped. */
                 time_to_next_ping = cfs_time_sub(cfs_time_add(this_ping, 
-                                                              cfs_time_seconds(sleep_interval)), 
+                                                              cfs_time_seconds(PING_INTERVAL)), 
                                                  cfs_time_current());
                 
                 /* The ping sent by ptlrpc_send_rpc may get sent out
                    say .01 second after this.
-                   ptlrpc_pinger_sending_on_import will then set the
+                   ptlrpc_pinger_sending_on_import will then set the
                    next ping time to next_ping + .01 sec, which means
                    we will SKIP the next ping at next_ping, and the
                    ping will get sent 2 timeouts from now!  Beware. */
                 CDEBUG(D_INFO, "next ping in "CFS_DURATION_T" ("CFS_TIME_T")\n", 
                                time_to_next_ping, 
-                               cfs_time_add(this_ping, cfs_time_seconds(sleep_interval)));
+                               cfs_time_add(this_ping, cfs_time_seconds(PING_INTERVAL)));
                 if (time_to_next_ping > 0) {
                         lwi = LWI_TIMEOUT(max_t(cfs_duration_t, time_to_next_ping, cfs_time_seconds(1)),
                                           NULL, NULL);
@@ -309,12 +232,13 @@ int ptlrpc_start_pinger(void)
         if (rc < 0) {
                 CERROR("cannot start thread: %d\n", rc);
                 OBD_FREE(pinger_thread, sizeof(*pinger_thread));
+                pinger_thread = NULL;
                 RETURN(rc);
         }
         l_wait_event(pinger_thread->t_ctl_waitq,
                      pinger_thread->t_flags & SVC_RUNNING, &lwi);
 
-        RETURN(rc);
+        RETURN(0);
 }
 
 int ptlrpc_stop_pinger(void)
@@ -395,6 +319,7 @@ void ptlrpc_pinger_wake_up()
  * the current implementation of pinger in liblustre is not optimized
  */
 
+#ifdef ENABLE_PINGER
 static struct pinger_data {
         int             pd_recursion;
         cfs_time_t      pd_this_ping;   /* jiffies */
@@ -461,8 +386,8 @@ static int pinger_check_rpcs(void *arg)
                                 continue;
                         }
 
-                        req = ptlrpc_prep_req(imp, OBD_PING, 0, NULL,
-                                              NULL);
+                        req = ptlrpc_prep_req(imp, LUSTRE_OBD_VERSION, OBD_PING,
+                                              0, NULL, NULL);
                         if (!req) {
                                 CERROR("out of memory\n");
                                 break;
@@ -493,7 +418,7 @@ static int pinger_check_rpcs(void *arg)
                 DEBUG_REQ(D_HA, req, "pinging %s->%s",
                           req->rq_import->imp_obd->obd_uuid.uuid,
                           req->rq_import->imp_target_uuid.uuid);
-                (void)ptl_send_rpc(req);
+                (void)ptl_send_rpc(req, 0);
         }
 
 do_check_set:
@@ -544,13 +469,14 @@ out:
 }
 
 static void *pinger_callback = NULL;
+#endif /* ENABLE_PINGER */
 
 int ptlrpc_start_pinger(void)
 {
-        memset(&pinger_args, 0, sizeof(pinger_args));
 #ifdef ENABLE_PINGER
-        pinger_callback =
-                liblustre_register_wait_callback(&pinger_check_rpcs, &pinger_args);
+        memset(&pinger_args, 0, sizeof(pinger_args));
+        pinger_callback = liblustre_register_wait_callback(&pinger_check_rpcs,
+                                                           &pinger_args);
 #endif
         return 0;
 }
@@ -566,6 +492,7 @@ int ptlrpc_stop_pinger(void)
 
 void ptlrpc_pinger_sending_on_import(struct obd_import *imp)
 {
+#ifdef ENABLE_PINGER
         mutex_down(&pinger_sem);
         ptlrpc_update_next_ping(imp);
         if (pinger_args.pd_set == NULL &&
@@ -575,6 +502,7 @@ void ptlrpc_pinger_sending_on_import(struct obd_import *imp)
                 pinger_args.pd_next_ping = imp->imp_next_ping;
         }
         mutex_up(&pinger_sem);
+#endif
 }
 
 int ptlrpc_pinger_add_import(struct obd_import *imp)
index 7a0419b..eef6d39 100644 (file)
@@ -72,6 +72,8 @@ void ptlrpcd_wake(struct ptlrpc_request *req)
         cfs_waitq_signal(&pc->pc_waitq);
 }
 
+/* requests that are added to the ptlrpcd queue are sent via
+ * ptlrpcd_check->ptlrpc_check_set() */
 void ptlrpcd_add_req(struct ptlrpc_request *req)
 {
         struct ptlrpcd_ctl *pc;
@@ -187,8 +189,11 @@ int ptlrpcd_check_async_rpcs(void *arg)
         /* single threaded!! */
         pc->pc_recurred++;
 
-        if (pc->pc_recurred == 1)
+        if (pc->pc_recurred == 1) {
                 rc = ptlrpcd_check(pc);
+                if (!rc)
+                        ptlrpc_expired_set(pc->pc_set);
+        }
 
         pc->pc_recurred--;
         return rc;
@@ -197,7 +202,7 @@ int ptlrpcd_check_async_rpcs(void *arg)
 
 static int ptlrpcd_start(char *name, struct ptlrpcd_ctl *pc)
 {
-        int rc = 0;
+        int rc;
 
         ENTRY;
         memset(pc, 0, sizeof(*pc));
@@ -211,21 +216,22 @@ static int ptlrpcd_start(char *name, struct ptlrpcd_ctl *pc)
 
         pc->pc_set = ptlrpc_prep_set();
         if (pc->pc_set == NULL)
-                GOTO(out, rc = -ENOMEM);
+                RETURN(-ENOMEM);
 
 #ifdef __KERNEL__
-        if (cfs_kernel_thread(ptlrpcd, pc, 0) < 0)  {
+        rc = cfs_kernel_thread(ptlrpcd, pc, 0);
+        if (rc < 0)  {
                 ptlrpc_set_destroy(pc->pc_set);
-                GOTO(out, rc = -ECHILD);
+                RETURN(rc);
         }
 
         wait_for_completion(&pc->pc_starting);
 #else
         pc->pc_callback =
                 liblustre_register_wait_callback(&ptlrpcd_check_async_rpcs, pc);
+        (void)rc;
 #endif
-out:
-        RETURN(rc);
+        RETURN(0);
 }
 
 static void ptlrpcd_stop(struct ptlrpcd_ctl *pc)
index 2a1164c..58f253c 100644 (file)
@@ -44,6 +44,7 @@
 
 #include <libcfs/kp30.h>
 #include <obd_class.h>
+#include <lustre_mds.h>
 #include <lustre_commit_confd.h>
 #include <obd_support.h>
 #include <obd_class.h>
index 10ea8ae..e258b20 100644 (file)
@@ -324,7 +324,7 @@ ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size, int max_reply_size,
         /* Now allocate pool of reply buffers */
         /* Increase max reply size to next power of two */
         service->srv_max_reply_size = 1;
-        while(service->srv_max_reply_size < max_reply_size)
+        while (service->srv_max_reply_size < max_reply_size)
                 service->srv_max_reply_size <<= 1;
 
         if (proc_entry != NULL)
@@ -570,17 +570,23 @@ put_conn:
         timediff = timeval_sub(&work_end, &work_start);
 
         if (timediff / 1000000 > (long)obd_timeout)
-                CERROR("request "LPU64" opc %u from %s processed in %lds\n",
+                CERROR("request "LPU64" opc %u from %s processed in %lds "
+                       "trans "LPU64" rc %d/%d\n",
                        request->rq_xid, request->rq_reqmsg->opc,
                        libcfs_id2str(request->rq_peer),
                        timeval_sub(&work_end,
-                                   &request->rq_arrival_time) / 1000000);
+                                   &request->rq_arrival_time) / 1000000,
+                       request->rq_repmsg ? request->rq_repmsg->transno :
+                       request->rq_transno, request->rq_status,
+                       request->rq_repmsg ? request->rq_repmsg->status : -999);
         else
-                CDEBUG(D_HA,"request "LPU64" opc %u from %s processed in %ldus"
-                       " (%ldus total)\n", request->rq_xid,
-                       request->rq_reqmsg->opc,
+                CDEBUG(D_HA, "request "LPU64" opc %u from %s processed in "
+                       "%ldus (%ldus total) trans "LPU64" rc %d/%d\n",
+                       request->rq_xid, request->rq_reqmsg->opc,
                        libcfs_id2str(request->rq_peer), timediff,
-                       timeval_sub(&work_end, &request->rq_arrival_time));
+                       timeval_sub(&work_end, &request->rq_arrival_time),
+                       request->rq_transno, request->rq_status,
+                       request->rq_repmsg ? request->rq_repmsg->status : -999);
 
         if (svc->srv_stats != NULL) {
                 int opc = opcode_offset(request->rq_reqmsg->opc);
@@ -625,7 +631,7 @@ ptlrpc_server_handle_reply (struct ptlrpc_service *svc)
 
         list_del_init (&rs->rs_list);
 
-        /* Disengage from notifiers carefully (lock ordering!) */
+        /* Disengage from notifiers carefully (lock order - irqrestore below!)*/
         spin_unlock(&svc->srv_lock);
 
         spin_lock (&obd->obd_uncommitted_replies_lock);
@@ -917,17 +923,16 @@ out_srv_init:
                 svc->srv_done(thread);
 
 out:
-        spin_lock_irqsave(&svc->srv_lock, flags);
+        CDEBUG(D_NET, "service thread %d exiting: rc %d\n", thread->t_id, rc);
 
+        spin_lock_irqsave(&svc->srv_lock, flags);
         svc->srv_nthreads--;                    /* must know immediately */
+        thread->t_id = rc;
         thread->t_flags = SVC_STOPPED;
-        cfs_waitq_signal(&thread->t_ctl_waitq);
 
+        cfs_waitq_signal(&thread->t_ctl_waitq);
         spin_unlock_irqrestore(&svc->srv_lock, flags);
 
-        CDEBUG(D_NET, "service thread %d exiting: rc %d\n", thread->t_id, rc);
-        thread->t_id = rc;
-
         return rc;
 }
 
@@ -970,13 +975,14 @@ void ptlrpc_stop_all_threads(struct ptlrpc_service *svc)
         spin_unlock_irqrestore(&svc->srv_lock, flags);
 }
 
-/* @base_name should be 12 characters or less - 3 will be added on */
+/* @base_name should be 11 characters or less - 3 will be added on */
 int ptlrpc_start_threads(struct obd_device *dev, struct ptlrpc_service *svc,
                          char *base_name)
 {
         int i, rc = 0;
         ENTRY;
 
+        LASSERT(svc->srv_num_threads > 0);
         for (i = 0; i < svc->srv_num_threads; i++) {
                 char name[32];
                 sprintf(name, "%s_%02d", base_name, i);
index cb717b5..3c5544e 100644 (file)
@@ -398,8 +398,6 @@ int jt_lcfg_lov_setup(int argc, char **argv)
                         jt_cmdname(argv[0]), argv[5]);
                 return CMD_HELP;
         }
-        desc.ld_qos_threshold = QOS_DEFAULT_THRESHOLD;
-        desc.ld_qos_maxage = QOS_DEFAULT_MAXAGE;
 
         if (argc == 7) {
                 desc.ld_tgt_count = strtoul(argv[6], &end, 0);
index 4409bc4..2b56b83 100644 (file)
@@ -20,6 +20,13 @@ do {                                                            \
 
 #define STRINGIFY(a) #a
 
+
+#define CHECK_CDEFINE(a)                                        \
+        printf("        CLASSERT("#a" == "STRINGIFY(a) ");\n")
+
+#define CHECK_CVALUE(a)                                         \
+        printf("        CLASSERT("#a" == %lld);\n", (long long)a)
+
 #define CHECK_DEFINE(a)                                         \
 do {                                                            \
         printf("        LASSERTF("#a" == "STRINGIFY(a)          \
@@ -64,16 +71,7 @@ do {                                                            \
 } while(0)
 
 
-
-void check1(void)
-{
-#define VALUE 1234567
-
-        CHECK_VALUE(VALUE);
-        CHECK_DEFINE(VALUE);
-}
-
-void
+static void
 check_lustre_handle(void)
 {
         BLANK_LINE();
@@ -81,7 +79,7 @@ check_lustre_handle(void)
         CHECK_MEMBER(lustre_handle, cookie);
 }
 
-void
+static void
 check_lustre_msg(void)
 {
         BLANK_LINE();
@@ -100,7 +98,7 @@ check_lustre_msg(void)
         CHECK_MEMBER(lustre_msg, buflens[7]);
 }
 
-void
+static void
 check_obdo(void)
 {
         BLANK_LINE();
@@ -125,54 +123,62 @@ check_obdo(void)
         CHECK_MEMBER(obdo, o_misc);
         CHECK_MEMBER(obdo, o_easize);
         CHECK_MEMBER(obdo, o_mds);
+        CHECK_MEMBER(obdo, o_stripe_idx);
+        CHECK_MEMBER(obdo, o_padding_1);
         CHECK_MEMBER(obdo, o_inline);
 
         CHECK_VALUE(OBD_INLINESZ);
 
-        CHECK_VALUE(OBD_MD_FLID);
-        CHECK_VALUE(OBD_MD_FLATIME);
-        CHECK_VALUE(OBD_MD_FLMTIME);
-        CHECK_VALUE(OBD_MD_FLCTIME);
-        CHECK_VALUE(OBD_MD_FLSIZE);
-        CHECK_VALUE(OBD_MD_FLBLOCKS);
-        CHECK_VALUE(OBD_MD_FLBLKSZ);
-        CHECK_VALUE(OBD_MD_FLMODE);
-        CHECK_VALUE(OBD_MD_FLTYPE);
-        CHECK_VALUE(OBD_MD_FLUID);
-        CHECK_VALUE(OBD_MD_FLGID);
-        CHECK_VALUE(OBD_MD_FLFLAGS);
-        CHECK_VALUE(OBD_MD_FLNLINK);
-        CHECK_VALUE(OBD_MD_FLGENER);
-        CHECK_VALUE(OBD_MD_FLINLINE);
-        CHECK_VALUE(OBD_MD_FLRDEV);
-        CHECK_VALUE(OBD_MD_FLEASIZE);
-        CHECK_VALUE(OBD_MD_LINKNAME);
-        CHECK_VALUE(OBD_MD_FLHANDLE);
-        CHECK_VALUE(OBD_MD_FLCKSUM);
-        CHECK_VALUE(OBD_MD_FLQOS);
-        CHECK_VALUE(OBD_MD_FLCOOKIE);
-        CHECK_VALUE(OBD_MD_FLGROUP);
-        CHECK_VALUE(OBD_MD_FLFID);
-        CHECK_VALUE(OBD_MD_FLEPOCH);
-        CHECK_VALUE(OBD_MD_FLGRANT);
-        CHECK_VALUE(OBD_MD_FLDIREA);
-        CHECK_VALUE(OBD_MD_FLUSRQUOTA);
-        CHECK_VALUE(OBD_MD_FLGRPQUOTA);
-        CHECK_VALUE_64(OBD_MD_MDS);
-        CHECK_VALUE_64(OBD_MD_REINT);
-
-        CHECK_VALUE(OBD_FL_INLINEDATA);
-        CHECK_VALUE(OBD_FL_OBDMDEXISTS);
-        CHECK_VALUE(OBD_FL_DELORPHAN);
-        CHECK_VALUE(OBD_FL_NORPC);
-        CHECK_VALUE(OBD_FL_IDONLY);
-        CHECK_VALUE(OBD_FL_RECREATE_OBJS);
-        CHECK_VALUE(OBD_FL_DEBUG_CHECK);
-        CHECK_VALUE(OBD_FL_NO_USRQUOTA);
-        CHECK_VALUE(OBD_FL_NO_GRPQUOTA);
-}
-
-void
+        CHECK_CDEFINE(OBD_MD_FLID);
+        CHECK_CDEFINE(OBD_MD_FLATIME);
+        CHECK_CDEFINE(OBD_MD_FLMTIME);
+        CHECK_CDEFINE(OBD_MD_FLCTIME);
+        CHECK_CDEFINE(OBD_MD_FLSIZE);
+        CHECK_CDEFINE(OBD_MD_FLBLOCKS);
+        CHECK_CDEFINE(OBD_MD_FLBLKSZ);
+        CHECK_CDEFINE(OBD_MD_FLMODE);
+        CHECK_CDEFINE(OBD_MD_FLTYPE);
+        CHECK_CDEFINE(OBD_MD_FLUID);
+        CHECK_CDEFINE(OBD_MD_FLGID);
+        CHECK_CDEFINE(OBD_MD_FLFLAGS);
+        CHECK_CDEFINE(OBD_MD_FLNLINK);
+        CHECK_CDEFINE(OBD_MD_FLGENER);
+        CHECK_CDEFINE(OBD_MD_FLINLINE);
+        CHECK_CDEFINE(OBD_MD_FLRDEV);
+        CHECK_CDEFINE(OBD_MD_FLEASIZE);
+        CHECK_CDEFINE(OBD_MD_LINKNAME);
+        CHECK_CDEFINE(OBD_MD_FLHANDLE);
+        CHECK_CDEFINE(OBD_MD_FLCKSUM);
+        CHECK_CDEFINE(OBD_MD_FLQOS);
+        CHECK_CDEFINE(OBD_MD_FLCOOKIE);
+        CHECK_CDEFINE(OBD_MD_FLGROUP);
+        CHECK_CDEFINE(OBD_MD_FLFID);
+        CHECK_CDEFINE(OBD_MD_FLEPOCH);
+        CHECK_CDEFINE(OBD_MD_FLGRANT);
+        CHECK_CDEFINE(OBD_MD_FLDIREA);
+        CHECK_CDEFINE(OBD_MD_FLUSRQUOTA);
+        CHECK_CDEFINE(OBD_MD_FLGRPQUOTA);
+        CHECK_CDEFINE(OBD_MD_FLMODEASIZE);
+        CHECK_CDEFINE(OBD_MD_MDS);
+        CHECK_CDEFINE(OBD_MD_REINT);
+        CHECK_CDEFINE(OBD_MD_FLXATTR);
+        CHECK_CDEFINE(OBD_MD_FLXATTRLS);
+        CHECK_CDEFINE(OBD_MD_FLXATTRRM);
+        CHECK_CDEFINE(OBD_MD_FLACL);
+
+        CHECK_CDEFINE(OBD_FL_INLINEDATA);
+        CHECK_CDEFINE(OBD_FL_OBDMDEXISTS);
+        CHECK_CDEFINE(OBD_FL_DELORPHAN);
+        CHECK_CDEFINE(OBD_FL_NORPC);
+        CHECK_CDEFINE(OBD_FL_IDONLY);
+        CHECK_CDEFINE(OBD_FL_RECREATE_OBJS);
+        CHECK_CDEFINE(OBD_FL_DEBUG_CHECK);
+        CHECK_CDEFINE(OBD_FL_NO_USRQUOTA);
+        CHECK_CDEFINE(OBD_FL_NO_GRPQUOTA);
+        CHECK_CDEFINE(OBD_FL_CREATE_CROW);
+}
+
+static void
 check_lov_mds_md_v1(void)
 {
         BLANK_LINE();
@@ -192,13 +198,24 @@ check_lov_mds_md_v1(void)
         CHECK_MEMBER(lov_ost_data_v1, l_ost_gen);
         CHECK_MEMBER(lov_ost_data_v1, l_ost_idx);
 
-        CHECK_VALUE(LOV_MAGIC_V1);
+        CHECK_CDEFINE(LOV_MAGIC_V1);
+        CHECK_CDEFINE(LOV_MAGIC_JOIN);
 
         CHECK_VALUE(LOV_PATTERN_RAID0);
         CHECK_VALUE(LOV_PATTERN_RAID1);
 }
 
-void
+static void
+check_lov_mds_md_join(void)
+{
+        BLANK_LINE();
+        CHECK_STRUCT(lov_mds_md_join);
+        CHECK_MEMBER(lov_mds_md_join, lmmj_md);
+        CHECK_MEMBER(lov_mds_md_join, lmmj_array_id);
+        CHECK_MEMBER(lov_mds_md_join, lmmj_extent_count);
+}
+
+static void
 check_obd_statfs(void)
 {
         BLANK_LINE();
@@ -212,9 +229,18 @@ check_obd_statfs(void)
         CHECK_MEMBER(obd_statfs, os_bsize);
         CHECK_MEMBER(obd_statfs, os_namelen);
         CHECK_MEMBER(obd_statfs, os_state);
+        CHECK_MEMBER(obd_statfs, os_spare1);
+        CHECK_MEMBER(obd_statfs, os_spare2);
+        CHECK_MEMBER(obd_statfs, os_spare3);
+        CHECK_MEMBER(obd_statfs, os_spare4);
+        CHECK_MEMBER(obd_statfs, os_spare5);
+        CHECK_MEMBER(obd_statfs, os_spare6);
+        CHECK_MEMBER(obd_statfs, os_spare7);
+        CHECK_MEMBER(obd_statfs, os_spare8);
+        CHECK_MEMBER(obd_statfs, os_spare9);
 }
 
-void
+static void
 check_obd_ioobj(void)
 {
         BLANK_LINE();
@@ -225,7 +251,7 @@ check_obd_ioobj(void)
         CHECK_MEMBER(obd_ioobj, ioo_bufcnt);
 }
 
-void
+static void
 check_obd_quotactl(void)
 {
         BLANK_LINE();
@@ -255,9 +281,15 @@ check_obd_quotactl(void)
         CHECK_MEMBER(obd_dqblk, dqb_btime);
         CHECK_MEMBER(obd_dqblk, dqb_itime);
         CHECK_MEMBER(obd_dqblk, dqb_valid);
+        CHECK_MEMBER(obd_dqblk, padding);
+
+        CHECK_DEFINE(Q_QUOTACHECK);
+        CHECK_DEFINE(Q_INITQUOTA);
+        CHECK_DEFINE(Q_GETOINFO);
+        CHECK_DEFINE(Q_GETOQUOTA);
 }
 
-void
+static void
 check_niobuf_remote(void)
 {
         BLANK_LINE();
@@ -273,7 +305,7 @@ check_niobuf_remote(void)
         CHECK_VALUE(OBD_BRW_NOQUOTA);
 }
 
-void
+static void
 check_ost_body(void)
 {
         BLANK_LINE();
@@ -281,7 +313,7 @@ check_ost_body(void)
         CHECK_MEMBER(ost_body, oa);
 }
 
-void
+static void
 check_ll_fid(void)
 {
         BLANK_LINE();
@@ -291,7 +323,7 @@ check_ll_fid(void)
         CHECK_MEMBER(ll_fid, f_type);
 }
 
-void
+static void
 check_mds_status_req(void)
 {
         BLANK_LINE();
@@ -300,7 +332,7 @@ check_mds_status_req(void)
         CHECK_MEMBER(mds_status_req, repbuf);
 }
 
-void
+static void
 check_mds_body(void)
 {
         BLANK_LINE();
@@ -329,24 +361,32 @@ check_mds_body(void)
         CHECK_MEMBER(mds_body, suppgid);
         CHECK_MEMBER(mds_body, eadatasize);
         CHECK_MEMBER(mds_body, aclsize);
-        CHECK_MEMBER(mds_body, padding_2);
-        CHECK_MEMBER(mds_body, padding_3);
+        CHECK_MEMBER(mds_body, max_mdsize);
+        CHECK_MEMBER(mds_body, max_cookiesize);
         CHECK_MEMBER(mds_body, padding_4);
 
         CHECK_VALUE(FMODE_READ);
         CHECK_VALUE(FMODE_WRITE);
         CHECK_VALUE(FMODE_EXEC);
-        CHECK_VALUE(MDS_OPEN_CREAT);
-        CHECK_VALUE(MDS_OPEN_EXCL);
-        CHECK_VALUE(MDS_OPEN_TRUNC);
-        CHECK_VALUE(MDS_OPEN_APPEND);
-        CHECK_VALUE(MDS_OPEN_SYNC);
-        CHECK_VALUE(MDS_OPEN_DIRECTORY);
-        CHECK_VALUE(MDS_OPEN_DELAY_CREATE);
-        CHECK_VALUE(MDS_OPEN_HAS_EA);
+
+        CHECK_CDEFINE(MDS_OPEN_CREAT);
+        CHECK_CDEFINE(MDS_OPEN_EXCL);
+        CHECK_CDEFINE(MDS_OPEN_TRUNC);
+        CHECK_CDEFINE(MDS_OPEN_APPEND);
+        CHECK_CDEFINE(MDS_OPEN_SYNC);
+        CHECK_CDEFINE(MDS_OPEN_DIRECTORY);
+        CHECK_CDEFINE(MDS_OPEN_DELAY_CREATE);
+        CHECK_CDEFINE(MDS_OPEN_OWNEROVERRIDE);
+        CHECK_CDEFINE(MDS_OPEN_JOIN_FILE);
+        CHECK_CDEFINE(MDS_OPEN_HAS_EA);
+        CHECK_CDEFINE(MDS_OPEN_HAS_OBJS);
+
+        CHECK_CDEFINE(MDS_INODELOCK_LOOKUP);
+        CHECK_CDEFINE(MDS_INODELOCK_UPDATE);
+        CHECK_CDEFINE(MDS_INODELOCK_OPEN);
 }
 
-void
+static void
 check_mds_rec_setattr(void)
 {
         BLANK_LINE();
@@ -368,7 +408,7 @@ check_mds_rec_setattr(void)
         CHECK_MEMBER(mds_rec_setattr, sa_attr_flags);
 }
 
-void
+static void
 check_mds_rec_create(void)
 {
         BLANK_LINE();
@@ -386,7 +426,7 @@ check_mds_rec_create(void)
         CHECK_MEMBER(mds_rec_create, cr_suppgid);
 }
 
-void
+static void
 check_mds_rec_link(void)
 {
         BLANK_LINE();
@@ -402,7 +442,7 @@ check_mds_rec_link(void)
         CHECK_MEMBER(mds_rec_link, lk_time);
 }
 
-void
+static void
 check_mds_rec_unlink(void)
 {
         BLANK_LINE();
@@ -418,7 +458,7 @@ check_mds_rec_unlink(void)
         CHECK_MEMBER(mds_rec_unlink, ul_time);
 }
 
-void
+static void
 check_mds_rec_rename(void)
 {
         BLANK_LINE();
@@ -434,7 +474,16 @@ check_mds_rec_rename(void)
         CHECK_MEMBER(mds_rec_rename, rn_time);
 }
 
-void
+static void
+check_mds_rec_join(void)
+{
+        BLANK_LINE();
+        CHECK_STRUCT(mds_rec_join);
+        CHECK_MEMBER(mds_rec_join, jr_fid);
+        CHECK_MEMBER(mds_rec_join, jr_headsize);
+}
+
+static void
 check_lov_desc(void)
 {
         BLANK_LINE();
@@ -445,12 +494,14 @@ check_lov_desc(void)
         CHECK_MEMBER(lov_desc, ld_pattern);
         CHECK_MEMBER(lov_desc, ld_default_stripe_size);
         CHECK_MEMBER(lov_desc, ld_default_stripe_offset);
-        CHECK_MEMBER(lov_desc, ld_qos_threshold);
-        CHECK_MEMBER(lov_desc, ld_qos_maxage);
+        CHECK_MEMBER(lov_desc, ld_padding_1);
+        CHECK_MEMBER(lov_desc, ld_padding_2);
+        CHECK_MEMBER(lov_desc, ld_padding_3);
+        CHECK_MEMBER(lov_desc, ld_padding_4);
         CHECK_MEMBER(lov_desc, ld_uuid);
 }
 
-void
+static void
 check_ldlm_res_id(void)
 {
         BLANK_LINE();
@@ -458,7 +509,7 @@ check_ldlm_res_id(void)
         CHECK_MEMBER(ldlm_res_id, name[RES_NAME_SIZE]);
 }
 
-void
+static void
 check_ldlm_extent(void)
 {
         BLANK_LINE();
@@ -468,7 +519,15 @@ check_ldlm_extent(void)
         CHECK_MEMBER(ldlm_extent, gid);
 }
 
-void
+static void
+check_ldlm_inodebits(void)
+{
+        BLANK_LINE();
+        CHECK_STRUCT(ldlm_inodebits);
+        CHECK_MEMBER(ldlm_inodebits, bits);
+}
+
+static void
 check_ldlm_flock(void)
 {
         BLANK_LINE();
@@ -479,7 +538,7 @@ check_ldlm_flock(void)
         CHECK_MEMBER(ldlm_flock, pid);
 }
 
-void
+static void
 check_ldlm_intent(void)
 {
         BLANK_LINE();
@@ -487,16 +546,17 @@ check_ldlm_intent(void)
         CHECK_MEMBER(ldlm_intent, opc);
 }
 
-void
+static void
 check_ldlm_resource_desc(void)
 {
         BLANK_LINE();
         CHECK_STRUCT(ldlm_resource_desc);
         CHECK_MEMBER(ldlm_resource_desc, lr_type);
+        CHECK_MEMBER(ldlm_resource_desc, lr_padding);
         CHECK_MEMBER(ldlm_resource_desc, lr_name);
 }
 
-void
+static void
 check_ldlm_lock_desc(void)
 {
         BLANK_LINE();
@@ -507,30 +567,32 @@ check_ldlm_lock_desc(void)
         CHECK_MEMBER(ldlm_lock_desc, l_policy_data);
 }
 
-void
+static void
 check_ldlm_request(void)
 {
         BLANK_LINE();
         CHECK_STRUCT(ldlm_request);
         CHECK_MEMBER(ldlm_request, lock_flags);
+        CHECK_MEMBER(ldlm_request, lock_padding);
         CHECK_MEMBER(ldlm_request, lock_desc);
         CHECK_MEMBER(ldlm_request, lock_handle1);
         CHECK_MEMBER(ldlm_request, lock_handle2);
 }
 
-void
+static void
 check_ldlm_reply(void)
 {
         BLANK_LINE();
         CHECK_STRUCT(ldlm_reply);
         CHECK_MEMBER(ldlm_reply, lock_flags);
-        CHECK_MEMBER(ldlm_request, lock_desc);
+        CHECK_MEMBER(ldlm_reply, lock_padding);
+        CHECK_MEMBER(ldlm_reply, lock_desc);
         CHECK_MEMBER(ldlm_reply, lock_handle);
         CHECK_MEMBER(ldlm_reply, lock_policy_res1);
         CHECK_MEMBER(ldlm_reply, lock_policy_res2);
 }
 
-void
+static void
 check_ldlm_lvb(void)
 {
         BLANK_LINE();
@@ -542,39 +604,8 @@ check_ldlm_lvb(void)
         CHECK_MEMBER(ost_lvb, lvb_blocks);
 }
 
-void
-check_ptlbd_op(void)
-{
-        BLANK_LINE();
-        CHECK_STRUCT(ptlbd_op);
-        CHECK_MEMBER(ptlbd_op, op_cmd);
-        CHECK_MEMBER(ptlbd_op, op_lun);
-        CHECK_MEMBER(ptlbd_op, op_niob_cnt);
-        CHECK_MEMBER(ptlbd_op, op__padding);
-        CHECK_MEMBER(ptlbd_op, op_block_cnt);
-}
-
-void
-check_ptlbd_niob(void)
-{
-        BLANK_LINE();
-        CHECK_STRUCT(ptlbd_niob);
-        CHECK_MEMBER(ptlbd_niob, n_xid);
-        CHECK_MEMBER(ptlbd_niob, n_block_nr);
-        CHECK_MEMBER(ptlbd_niob, n_offset);
-        CHECK_MEMBER(ptlbd_niob, n_length);
-}
 
-void
-check_ptlbd_rsp(void)
-{
-        BLANK_LINE();
-        CHECK_STRUCT(ptlbd_rsp);
-        CHECK_MEMBER(ptlbd_rsp, r_status);
-        CHECK_MEMBER(ptlbd_rsp, r_error_cnt);
-}
-
-void
+static void
 check_llog_logid(void)
 {
         BLANK_LINE();
@@ -583,26 +614,30 @@ check_llog_logid(void)
         CHECK_MEMBER(llog_logid, lgl_ogr);
         CHECK_MEMBER(llog_logid, lgl_ogen);
 
-        CHECK_VALUE(OST_SZ_REC);
-        CHECK_VALUE(OST_RAID1_REC);
-        CHECK_VALUE(MDS_UNLINK_REC);
-        CHECK_VALUE(MDS_SETATTR_REC);
-        CHECK_VALUE(OBD_CFG_REC);
-        CHECK_VALUE(PTL_CFG_REC);
-        CHECK_VALUE(LLOG_GEN_REC);
-        CHECK_VALUE(LLOG_HDR_MAGIC);
-        CHECK_VALUE(LLOG_LOGID_MAGIC);
+        CHECK_CVALUE(OST_SZ_REC);
+        CHECK_CVALUE(OST_RAID1_REC);
+        CHECK_CVALUE(MDS_UNLINK_REC);
+        CHECK_CVALUE(MDS_SETATTR_REC);
+        CHECK_CVALUE(OBD_CFG_REC);
+        CHECK_CVALUE(PTL_CFG_REC);
+        CHECK_CVALUE(LLOG_GEN_REC);
+        CHECK_CVALUE(LLOG_JOIN_REC);
+        CHECK_CVALUE(LLOG_HDR_MAGIC);
+        CHECK_CVALUE(LLOG_LOGID_MAGIC);
 }
 
-void
+static void
 check_llog_catid(void)
 {
         BLANK_LINE();
         CHECK_STRUCT(llog_catid);
         CHECK_MEMBER(llog_catid, lci_logid);
+        CHECK_MEMBER(llog_catid, lci_padding1);
+        CHECK_MEMBER(llog_catid, lci_padding2);
+        CHECK_MEMBER(llog_catid, lci_padding3);
 }
 
-void
+static void
 check_llog_rec_hdr(void)
 {
         BLANK_LINE();
@@ -610,9 +645,10 @@ check_llog_rec_hdr(void)
         CHECK_MEMBER(llog_rec_hdr, lrh_len);
         CHECK_MEMBER(llog_rec_hdr, lrh_index);
         CHECK_MEMBER(llog_rec_hdr, lrh_type);
+        CHECK_MEMBER(llog_rec_hdr, padding);
 }
 
-void
+static void
 check_llog_rec_tail(void)
 {
         BLANK_LINE();
@@ -621,17 +657,22 @@ check_llog_rec_tail(void)
         CHECK_MEMBER(llog_rec_tail, lrt_index);
 }
 
-void
+static void
 check_llog_logid_rec(void)
 {
         BLANK_LINE();
         CHECK_STRUCT(llog_logid_rec);
         CHECK_MEMBER(llog_logid_rec, lid_hdr);
         CHECK_MEMBER(llog_logid_rec, lid_id);
+        CHECK_MEMBER(llog_logid_rec, padding1);
+        CHECK_MEMBER(llog_logid_rec, padding2);
+        CHECK_MEMBER(llog_logid_rec, padding3);
+        CHECK_MEMBER(llog_logid_rec, padding4);
+        CHECK_MEMBER(llog_logid_rec, padding5);
         CHECK_MEMBER(llog_logid_rec, lid_tail);
 }
 
-void
+static void
 check_llog_create_rec(void)
 {
         BLANK_LINE();
@@ -640,9 +681,10 @@ check_llog_create_rec(void)
         CHECK_MEMBER(llog_create_rec, lcr_fid);
         CHECK_MEMBER(llog_create_rec, lcr_oid);
         CHECK_MEMBER(llog_create_rec, lcr_ogen);
+        CHECK_MEMBER(llog_create_rec, padding);
 }
 
-void
+static void
 check_llog_orphan_rec(void)
 {
         BLANK_LINE();
@@ -650,10 +692,11 @@ check_llog_orphan_rec(void)
         CHECK_MEMBER(llog_orphan_rec, lor_hdr);
         CHECK_MEMBER(llog_orphan_rec, lor_oid);
         CHECK_MEMBER(llog_orphan_rec, lor_ogen);
+        CHECK_MEMBER(llog_orphan_rec, padding);
         CHECK_MEMBER(llog_orphan_rec, lor_tail);
 }
 
-void
+static void
 check_llog_unlink_rec(void)
 {
         BLANK_LINE();
@@ -661,10 +704,25 @@ check_llog_unlink_rec(void)
         CHECK_MEMBER(llog_unlink_rec, lur_hdr);
         CHECK_MEMBER(llog_unlink_rec, lur_oid);
         CHECK_MEMBER(llog_unlink_rec, lur_ogen);
+        CHECK_MEMBER(llog_unlink_rec, padding);
         CHECK_MEMBER(llog_unlink_rec, lur_tail);
 }
 
-void
+static void
+check_llog_setattr_rec(void)
+{
+        BLANK_LINE();
+        CHECK_STRUCT(llog_setattr_rec);
+        CHECK_MEMBER(llog_setattr_rec, lsr_hdr);
+        CHECK_MEMBER(llog_setattr_rec, lsr_oid);
+        CHECK_MEMBER(llog_setattr_rec, lsr_ogen);
+        CHECK_MEMBER(llog_setattr_rec, lsr_uid);
+        CHECK_MEMBER(llog_setattr_rec, lsr_gid);
+        CHECK_MEMBER(llog_setattr_rec, padding);
+        CHECK_MEMBER(llog_setattr_rec, lsr_tail);
+}
+
+static void
 check_llog_size_change_rec(void)
 {
         BLANK_LINE();
@@ -672,10 +730,11 @@ check_llog_size_change_rec(void)
         CHECK_MEMBER(llog_size_change_rec, lsc_hdr);
         CHECK_MEMBER(llog_size_change_rec, lsc_fid);
         CHECK_MEMBER(llog_size_change_rec, lsc_io_epoch);
+        CHECK_MEMBER(llog_size_change_rec, padding);
         CHECK_MEMBER(llog_size_change_rec, lsc_tail);
 }
 
-void
+static void
 check_llog_gen(void)
 {
         BLANK_LINE();
@@ -684,7 +743,7 @@ check_llog_gen(void)
         CHECK_MEMBER(llog_gen, conn_cnt);
 }
 
-void
+static void
 check_llog_gen_rec(void)
 {
         BLANK_LINE();
@@ -694,7 +753,7 @@ check_llog_gen_rec(void)
         CHECK_MEMBER(llog_gen_rec, lgr_tail);
 }
 
-void
+static void
 check_llog_log_hdr(void)
 {
         BLANK_LINE();
@@ -712,7 +771,7 @@ check_llog_log_hdr(void)
         CHECK_MEMBER(llog_log_hdr, llh_tail);
 }
 
-void
+static void
 check_llog_cookie(void)
 {
         BLANK_LINE();
@@ -720,9 +779,10 @@ check_llog_cookie(void)
         CHECK_MEMBER(llog_cookie, lgc_lgl);
         CHECK_MEMBER(llog_cookie, lgc_subsys);
         CHECK_MEMBER(llog_cookie, lgc_index);
+        CHECK_MEMBER(llog_cookie, lgc_padding);
 }
 
-void
+static void
 check_llogd_body(void)
 {
         BLANK_LINE();
@@ -735,16 +795,18 @@ check_llogd_body(void)
         CHECK_MEMBER(llogd_body, lgd_len);
         CHECK_MEMBER(llogd_body, lgd_cur_offset);
 
-        CHECK_VALUE(LLOG_ORIGIN_HANDLE_CREATE);
-        CHECK_VALUE(LLOG_ORIGIN_HANDLE_NEXT_BLOCK);
-        CHECK_VALUE(LLOG_ORIGIN_HANDLE_READ_HEADER);
-        CHECK_VALUE(LLOG_ORIGIN_HANDLE_WRITE_REC);
-        CHECK_VALUE(LLOG_ORIGIN_HANDLE_CLOSE);
-        CHECK_VALUE(LLOG_ORIGIN_CONNECT);
-        CHECK_VALUE(LLOG_CATINFO);
+        CHECK_CVALUE(LLOG_ORIGIN_HANDLE_CREATE);
+        CHECK_CVALUE(LLOG_ORIGIN_HANDLE_NEXT_BLOCK);
+        CHECK_CVALUE(LLOG_ORIGIN_HANDLE_READ_HEADER);
+        CHECK_CVALUE(LLOG_ORIGIN_HANDLE_WRITE_REC);
+        CHECK_CVALUE(LLOG_ORIGIN_HANDLE_CLOSE);
+        CHECK_CVALUE(LLOG_ORIGIN_CONNECT);
+        CHECK_CVALUE(LLOG_CATINFO);
+        CHECK_CVALUE(LLOG_ORIGIN_HANDLE_PREV_BLOCK);
+        CHECK_CVALUE(LLOG_ORIGIN_HANDLE_DESTROY);
 }
 
-void
+static void
 check_llogd_conn_body(void)
 {
         BLANK_LINE();
@@ -754,7 +816,27 @@ check_llogd_conn_body(void)
         CHECK_MEMBER(llogd_conn_body, lgdc_ctxt_idx);
 }
 
-void
+static void
+check_mds_extent_desc(void)
+{
+        BLANK_LINE();
+        CHECK_STRUCT(mds_extent_desc);
+        CHECK_MEMBER(mds_extent_desc, med_start);
+        CHECK_MEMBER(mds_extent_desc, med_len);
+        CHECK_MEMBER(mds_extent_desc, med_lmm);
+}
+
+static void
+check_llog_array_rec(void)
+{
+        BLANK_LINE();
+        CHECK_STRUCT(llog_array_rec);
+        CHECK_MEMBER(llog_array_rec, lmr_hdr);
+        CHECK_MEMBER(llog_array_rec, lmr_med);
+        CHECK_MEMBER(llog_array_rec, lmr_tail);
+}
+
+static void
 check_qunit_data(void)
 {
         BLANK_LINE();
@@ -765,7 +847,7 @@ check_qunit_data(void)
         CHECK_MEMBER(qunit_data, qd_isblk);
 }
 
-void
+static void
 system_string (char *cmdline, char *str, int len)
 {
         int   fds[2];
@@ -909,13 +991,12 @@ main(int argc, char **argv)
         CHECK_VALUE(MDS_STATUS_CONN);
         CHECK_VALUE(MDS_STATUS_LOV);
 
-        CHECK_VALUE(MDS_OPEN_HAS_EA);
-
         CHECK_VALUE(LDLM_ENQUEUE);
         CHECK_VALUE(LDLM_CONVERT);
         CHECK_VALUE(LDLM_CANCEL);
         CHECK_VALUE(LDLM_BL_CALLBACK);
         CHECK_VALUE(LDLM_CP_CALLBACK);
+        CHECK_VALUE(LDLM_GL_CALLBACK);
         CHECK_VALUE(LDLM_LAST_OPC);
 
         CHECK_VALUE(LCK_EX);
@@ -924,18 +1005,13 @@ main(int argc, char **argv)
         CHECK_VALUE(LCK_CW);
         CHECK_VALUE(LCK_CR);
         CHECK_VALUE(LCK_NL);
+        CHECK_VALUE(LCK_GROUP);
+        CHECK_VALUE(LCK_MAXMODE);
 
-        CHECK_VALUE(PTLBD_QUERY);
-        CHECK_VALUE(PTLBD_READ);
-        CHECK_VALUE(PTLBD_WRITE);
-        CHECK_VALUE(PTLBD_FLUSH);
-        CHECK_VALUE(PTLBD_CONNECT);
-        CHECK_VALUE(PTLBD_DISCONNECT);
-        CHECK_VALUE(PTLBD_LAST_OPC);
-
-        CHECK_VALUE(MGMT_CONNECT);
-        CHECK_VALUE(MGMT_DISCONNECT);
-        CHECK_VALUE(MGMT_EXCEPTION);
+        CHECK_CVALUE(LDLM_PLAIN);
+        CHECK_CVALUE(LDLM_EXTENT);
+        CHECK_CVALUE(LDLM_FLOCK);
+        CHECK_CVALUE(LDLM_IBITS);
 
         CHECK_VALUE(OBD_PING);
         CHECK_VALUE(OBD_LOG_CANCEL);
@@ -945,12 +1021,27 @@ main(int argc, char **argv)
         CHECK_VALUE(QUOTA_DQACQ);
         CHECK_VALUE(QUOTA_DQREL);
 
+        CHECK_CDEFINE(OBD_CONNECT_RDONLY);
+        CHECK_CDEFINE(OBD_CONNECT_INDEX);
+        CHECK_CDEFINE(OBD_CONNECT_GRANT);
+        CHECK_CDEFINE(OBD_CONNECT_SRVLOCK);
+        CHECK_CDEFINE(OBD_CONNECT_VERSION);
+        CHECK_CDEFINE(OBD_CONNECT_REQPORTAL);
+        CHECK_CDEFINE(OBD_CONNECT_ACL);
+        CHECK_CDEFINE(OBD_CONNECT_XATTR);
+        CHECK_CDEFINE(OBD_CONNECT_CROW);
+        CHECK_CDEFINE(OBD_CONNECT_TRUNCLOCK);
+        CHECK_CDEFINE(OBD_CONNECT_TRANSNO);
+        CHECK_CDEFINE(OBD_CONNECT_IBITS);
+        CHECK_CDEFINE(OBD_CONNECT_JOIN);
+
         COMMENT("Sizes and Offsets");
         BLANK_LINE();
         check_lustre_handle();
         check_lustre_msg();
         check_obdo();
         check_lov_mds_md_v1();
+        check_lov_mds_md_join();
         check_obd_statfs();
         check_obd_ioobj();
         check_obd_quotactl();
@@ -964,19 +1055,18 @@ main(int argc, char **argv)
         check_mds_rec_link();
         check_mds_rec_unlink();
         check_mds_rec_rename();
+        check_mds_rec_join();
         check_lov_desc();
         check_ldlm_res_id();
         check_ldlm_extent();
         check_ldlm_flock();
+        check_ldlm_inodebits();
         check_ldlm_intent();
         check_ldlm_resource_desc();
         check_ldlm_lock_desc();
         check_ldlm_request();
         check_ldlm_reply();
         check_ldlm_lvb();
-        check_ptlbd_op();
-        check_ptlbd_niob();
-        check_ptlbd_rsp();
         check_llog_logid();
         check_llog_catid();
         check_llog_rec_hdr();
@@ -985,6 +1075,7 @@ main(int argc, char **argv)
         check_llog_create_rec();
         check_llog_orphan_rec();
         check_llog_unlink_rec();
+        check_llog_setattr_rec();
         check_llog_size_change_rec();
         check_llog_gen();
         check_llog_gen_rec();
@@ -992,6 +1083,8 @@ main(int argc, char **argv)
         check_llog_cookie();
         check_llogd_body();
         check_llogd_conn_body();
+        check_llog_array_rec();
+        check_mds_extent_desc();
         check_qunit_data();
 
         printf("}\n\n");
index 345bf82..5845795 100644 (file)
@@ -5,6 +5,7 @@
 
 #undef LASSERT
 #undef LASSERTF
+#define CLASSERT(cond) ({ switch(42) { case (cond): case 0: break; } })
 #define LASSERT(cond) if (!(cond)) { printf("failed " #cond "\n"); ret = 1; }
 #define LASSERTF(cond, fmt, arg) if (!(cond)) { printf("failed '" #cond "'" fmt, arg);ret = 1;}
 
@@ -25,8 +26,8 @@ int main()
 void lustre_assert_wire_constants(void)
 {
         /* Wire protocol assertions generated by 'wirecheck'
-         * running on Linux localhost.localdomain 2.6.9-1.667 #1 Tue Nov 2 14:41:25 EST 2004 i686 i68
-         * with gcc version 3.4.3 20050227 (Red Hat 3.4.3-22.fc3) */
+         * running on Linux schatzie.adilger.int 2.6.12-1.1381_FC3 #1 Fri Oct 21 03:46:55 EDT 2005 i6
+         * with gcc version 3.3.4 20040817 (Red Hat Linux 3.3.4-2) */
 
 
         /* Constants... */
@@ -158,8 +159,6 @@ void lustre_assert_wire_constants(void)
                  (long long)MDS_STATUS_CONN);
         LASSERTF(MDS_STATUS_LOV == 2, " found %lld\n",
                  (long long)MDS_STATUS_LOV);
-        LASSERTF(MDS_OPEN_HAS_EA == 1073741824, " found %lld\n",
-                 (long long)MDS_OPEN_HAS_EA);
         LASSERTF(LDLM_ENQUEUE == 101, " found %lld\n",
                  (long long)LDLM_ENQUEUE);
         LASSERTF(LDLM_CONVERT == 102, " found %lld\n",
@@ -170,6 +169,8 @@ void lustre_assert_wire_constants(void)
                  (long long)LDLM_BL_CALLBACK);
         LASSERTF(LDLM_CP_CALLBACK == 105, " found %lld\n",
                  (long long)LDLM_CP_CALLBACK);
+        LASSERTF(LDLM_GL_CALLBACK == 106, " found %lld\n",
+                 (long long)LDLM_GL_CALLBACK);
         LASSERTF(LDLM_LAST_OPC == 107, " found %lld\n",
                  (long long)LDLM_LAST_OPC);
         LASSERTF(LCK_EX == 1, " found %lld\n",
@@ -184,26 +185,14 @@ void lustre_assert_wire_constants(void)
                  (long long)LCK_CR);
         LASSERTF(LCK_NL == 32, " found %lld\n",
                  (long long)LCK_NL);
-        LASSERTF(PTLBD_QUERY == 200, " found %lld\n",
-                 (long long)PTLBD_QUERY);
-        LASSERTF(PTLBD_READ == 201, " found %lld\n",
-                 (long long)PTLBD_READ);
-        LASSERTF(PTLBD_WRITE == 202, " found %lld\n",
-                 (long long)PTLBD_WRITE);
-        LASSERTF(PTLBD_FLUSH == 203, " found %lld\n",
-                 (long long)PTLBD_FLUSH);
-        LASSERTF(PTLBD_CONNECT == 204, " found %lld\n",
-                 (long long)PTLBD_CONNECT);
-        LASSERTF(PTLBD_DISCONNECT == 205, " found %lld\n",
-                 (long long)PTLBD_DISCONNECT);
-        LASSERTF(PTLBD_LAST_OPC == 206, " found %lld\n",
-                 (long long)PTLBD_LAST_OPC);
-        LASSERTF(MGMT_CONNECT == 250, " found %lld\n",
-                 (long long)MGMT_CONNECT);
-        LASSERTF(MGMT_DISCONNECT == 251, " found %lld\n",
-                 (long long)MGMT_DISCONNECT);
-        LASSERTF(MGMT_EXCEPTION == 252, " found %lld\n",
-                 (long long)MGMT_EXCEPTION);
+        LASSERTF(LCK_GROUP == 64, " found %lld\n",
+                 (long long)LCK_GROUP);
+        LASSERTF(LCK_MAXMODE == 65, " found %lld\n",
+                 (long long)LCK_MAXMODE);
+        CLASSERT(LDLM_PLAIN == 10);
+        CLASSERT(LDLM_EXTENT == 11);
+        CLASSERT(LDLM_FLOCK == 12);
+        CLASSERT(LDLM_IBITS == 13);
         LASSERTF(OBD_PING == 400, " found %lld\n",
                  (long long)OBD_PING);
         LASSERTF(OBD_LOG_CANCEL == 401, " found %lld\n",
@@ -216,6 +205,19 @@ void lustre_assert_wire_constants(void)
                  (long long)QUOTA_DQACQ);
         LASSERTF(QUOTA_DQREL == 602, " found %lld\n",
                  (long long)QUOTA_DQREL);
+        CLASSERT(OBD_CONNECT_RDONLY == 0x1ULL);
+        CLASSERT(OBD_CONNECT_INDEX == 0x2ULL);
+        CLASSERT(OBD_CONNECT_GRANT == 0x8ULL);
+        CLASSERT(OBD_CONNECT_SRVLOCK == 0x10ULL);
+        CLASSERT(OBD_CONNECT_VERSION == 0x20ULL);
+        CLASSERT(OBD_CONNECT_REQPORTAL == 0x40ULL);
+        CLASSERT(OBD_CONNECT_ACL == 0x80ULL);
+        CLASSERT(OBD_CONNECT_XATTR == 0x100ULL);
+        CLASSERT(OBD_CONNECT_CROW == 0x200ULL);
+        CLASSERT(OBD_CONNECT_TRUNCLOCK == 0x400ULL);
+        CLASSERT(OBD_CONNECT_TRANSNO == 0x800ULL);
+        CLASSERT(OBD_CONNECT_IBITS == 0x1000ULL);
+        CLASSERT(OBD_CONNECT_JOIN == 0x2000ULL);
         /* Sizes and Offsets */
 
 
@@ -362,92 +364,66 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct obdo, o_mds));
         LASSERTF((int)sizeof(((struct obdo *)0)->o_mds) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct obdo *)0)->o_mds));
+        LASSERTF((int)offsetof(struct obdo, o_stripe_idx) == 120, " found %lld\n",
+                 (long long)(int)offsetof(struct obdo, o_stripe_idx));
+        LASSERTF((int)sizeof(((struct obdo *)0)->o_stripe_idx) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct obdo *)0)->o_stripe_idx));
+        LASSERTF((int)offsetof(struct obdo, o_padding_1) == 124, " found %lld\n",
+                 (long long)(int)offsetof(struct obdo, o_padding_1));
+        LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_1) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct obdo *)0)->o_padding_1));
         LASSERTF((int)offsetof(struct obdo, o_inline) == 128, " found %lld\n",
                  (long long)(int)offsetof(struct obdo, o_inline));
         LASSERTF((int)sizeof(((struct obdo *)0)->o_inline) == 80, " found %lld\n",
                  (long long)(int)sizeof(((struct obdo *)0)->o_inline));
         LASSERTF(OBD_INLINESZ == 80, " found %lld\n",
                  (long long)OBD_INLINESZ);
-        LASSERTF(OBD_MD_FLID == 1, " found %lld\n",
-                 (long long)OBD_MD_FLID);
-        LASSERTF(OBD_MD_FLATIME == 2, " found %lld\n",
-                 (long long)OBD_MD_FLATIME);
-        LASSERTF(OBD_MD_FLMTIME == 4, " found %lld\n",
-                 (long long)OBD_MD_FLMTIME);
-        LASSERTF(OBD_MD_FLCTIME == 8, " found %lld\n",
-                 (long long)OBD_MD_FLCTIME);
-        LASSERTF(OBD_MD_FLSIZE == 16, " found %lld\n",
-                 (long long)OBD_MD_FLSIZE);
-        LASSERTF(OBD_MD_FLBLOCKS == 32, " found %lld\n",
-                 (long long)OBD_MD_FLBLOCKS);
-        LASSERTF(OBD_MD_FLBLKSZ == 64, " found %lld\n",
-                 (long long)OBD_MD_FLBLKSZ);
-        LASSERTF(OBD_MD_FLMODE == 128, " found %lld\n",
-                 (long long)OBD_MD_FLMODE);
-        LASSERTF(OBD_MD_FLTYPE == 256, " found %lld\n",
-                 (long long)OBD_MD_FLTYPE);
-        LASSERTF(OBD_MD_FLUID == 512, " found %lld\n",
-                 (long long)OBD_MD_FLUID);
-        LASSERTF(OBD_MD_FLGID == 1024, " found %lld\n",
-                 (long long)OBD_MD_FLGID);
-        LASSERTF(OBD_MD_FLFLAGS == 2048, " found %lld\n",
-                 (long long)OBD_MD_FLFLAGS);
-        LASSERTF(OBD_MD_FLNLINK == 8192, " found %lld\n",
-                 (long long)OBD_MD_FLNLINK);
-        LASSERTF(OBD_MD_FLGENER == 16384, " found %lld\n",
-                 (long long)OBD_MD_FLGENER);
-        LASSERTF(OBD_MD_FLINLINE == 32768, " found %lld\n",
-                 (long long)OBD_MD_FLINLINE);
-        LASSERTF(OBD_MD_FLRDEV == 65536, " found %lld\n",
-                 (long long)OBD_MD_FLRDEV);
-        LASSERTF(OBD_MD_FLEASIZE == 131072, " found %lld\n",
-                 (long long)OBD_MD_FLEASIZE);
-        LASSERTF(OBD_MD_LINKNAME == 262144, " found %lld\n",
-                 (long long)OBD_MD_LINKNAME);
-        LASSERTF(OBD_MD_FLHANDLE == 524288, " found %lld\n",
-                 (long long)OBD_MD_FLHANDLE);
-        LASSERTF(OBD_MD_FLCKSUM == 1048576, " found %lld\n",
-                 (long long)OBD_MD_FLCKSUM);
-        LASSERTF(OBD_MD_FLQOS == 2097152, " found %lld\n",
-                 (long long)OBD_MD_FLQOS);
-        LASSERTF(OBD_MD_FLCOOKIE == 8388608, " found %lld\n",
-                 (long long)OBD_MD_FLCOOKIE);
-        LASSERTF(OBD_MD_FLGROUP == 16777216, " found %lld\n",
-                 (long long)OBD_MD_FLGROUP);
-        LASSERTF(OBD_MD_FLFID == 33554432, " found %lld\n",
-                 (long long)OBD_MD_FLFID);
-        LASSERTF(OBD_MD_FLEPOCH == 67108864, " found %lld\n",
-                 (long long)OBD_MD_FLEPOCH);
-        LASSERTF(OBD_MD_FLGRANT == 134217728, " found %lld\n",
-                 (long long)OBD_MD_FLGRANT);
-        LASSERTF(OBD_MD_FLDIREA == 268435456, " found %lld\n",
-                 (long long)OBD_MD_FLDIREA);
-        LASSERTF(OBD_MD_FLUSRQUOTA == 536870912, " found %lld\n",
-                 (long long)OBD_MD_FLUSRQUOTA);
-        LASSERTF(OBD_MD_FLGRPQUOTA == 1073741824, " found %lld\n",
-                 (long long)OBD_MD_FLGRPQUOTA);
-        LASSERTF(OBD_MD_MDS == 4294967296ULL, " found %lld\n",
-                 (long long)OBD_MD_MDS);
-        LASSERTF(OBD_MD_REINT == 8589934592ULL, " found %lld\n",
-                 (long long)OBD_MD_REINT);
-        LASSERTF(OBD_FL_INLINEDATA == 1, " found %lld\n",
-                 (long long)OBD_FL_INLINEDATA);
-        LASSERTF(OBD_FL_OBDMDEXISTS == 2, " found %lld\n",
-                 (long long)OBD_FL_OBDMDEXISTS);
-        LASSERTF(OBD_FL_DELORPHAN == 4, " found %lld\n",
-                 (long long)OBD_FL_DELORPHAN);
-        LASSERTF(OBD_FL_NORPC == 8, " found %lld\n",
-                 (long long)OBD_FL_NORPC);
-        LASSERTF(OBD_FL_IDONLY == 16, " found %lld\n",
-                 (long long)OBD_FL_IDONLY);
-        LASSERTF(OBD_FL_RECREATE_OBJS == 32, " found %lld\n",
-                 (long long)OBD_FL_RECREATE_OBJS);
-        LASSERTF(OBD_FL_DEBUG_CHECK == 64, " found %lld\n",
-                 (long long)OBD_FL_DEBUG_CHECK);
-        LASSERTF(OBD_FL_NO_USRQUOTA == 256, " found %lld\n",
-                 (long long)OBD_FL_NO_USRQUOTA);
-        LASSERTF(OBD_FL_NO_GRPQUOTA == 512, " found %lld\n",
-                 (long long)OBD_FL_NO_GRPQUOTA);
+        CLASSERT(OBD_MD_FLID == (0x00000001ULL));
+        CLASSERT(OBD_MD_FLATIME == (0x00000002ULL));
+        CLASSERT(OBD_MD_FLMTIME == (0x00000004ULL));
+        CLASSERT(OBD_MD_FLCTIME == (0x00000008ULL));
+        CLASSERT(OBD_MD_FLSIZE == (0x00000010ULL));
+        CLASSERT(OBD_MD_FLBLOCKS == (0x00000020ULL));
+        CLASSERT(OBD_MD_FLBLKSZ == (0x00000040ULL));
+        CLASSERT(OBD_MD_FLMODE == (0x00000080ULL));
+        CLASSERT(OBD_MD_FLTYPE == (0x00000100ULL));
+        CLASSERT(OBD_MD_FLUID == (0x00000200ULL));
+        CLASSERT(OBD_MD_FLGID == (0x00000400ULL));
+        CLASSERT(OBD_MD_FLFLAGS == (0x00000800ULL));
+        CLASSERT(OBD_MD_FLNLINK == (0x00002000ULL));
+        CLASSERT(OBD_MD_FLGENER == (0x00004000ULL));
+        CLASSERT(OBD_MD_FLINLINE == (0x00008000ULL));
+        CLASSERT(OBD_MD_FLRDEV == (0x00010000ULL));
+        CLASSERT(OBD_MD_FLEASIZE == (0x00020000ULL));
+        CLASSERT(OBD_MD_LINKNAME == (0x00040000ULL));
+        CLASSERT(OBD_MD_FLHANDLE == (0x00080000ULL));
+        CLASSERT(OBD_MD_FLCKSUM == (0x00100000ULL));
+        CLASSERT(OBD_MD_FLQOS == (0x00200000ULL));
+        CLASSERT(OBD_MD_FLCOOKIE == (0x00800000ULL));
+        CLASSERT(OBD_MD_FLGROUP == (0x01000000ULL));
+        CLASSERT(OBD_MD_FLFID == (0x02000000ULL));
+        CLASSERT(OBD_MD_FLEPOCH == (0x04000000ULL));
+        CLASSERT(OBD_MD_FLGRANT == (0x08000000ULL));
+        CLASSERT(OBD_MD_FLDIREA == (0x10000000ULL));
+        CLASSERT(OBD_MD_FLUSRQUOTA == (0x20000000ULL));
+        CLASSERT(OBD_MD_FLGRPQUOTA == (0x40000000ULL));
+        CLASSERT(OBD_MD_FLMODEASIZE == (0x80000000ULL));
+        CLASSERT(OBD_MD_MDS == (0x0000000100000000ULL));
+        CLASSERT(OBD_MD_REINT == (0x0000000200000000ULL));
+        CLASSERT(OBD_MD_FLXATTR == (0x0000001000000000ULL));
+        CLASSERT(OBD_MD_FLXATTRLS == (0x0000002000000000ULL));
+        CLASSERT(OBD_MD_FLXATTRRM == (0x0000004000000000ULL));
+        CLASSERT(OBD_MD_FLACL == (0x0000008000000000ULL));
+        CLASSERT(OBD_FL_INLINEDATA == (0x00000001));
+        CLASSERT(OBD_FL_OBDMDEXISTS == (0x00000002));
+        CLASSERT(OBD_FL_DELORPHAN == (0x00000004));
+        CLASSERT(OBD_FL_NORPC == (0x00000008));
+        CLASSERT(OBD_FL_IDONLY == (0x00000010));
+        CLASSERT(OBD_FL_RECREATE_OBJS == (0x00000020));
+        CLASSERT(OBD_FL_DEBUG_CHECK == (0x00000040));
+        CLASSERT(OBD_FL_NO_USRQUOTA == (0x00000100));
+        CLASSERT(OBD_FL_NO_GRPQUOTA == (0x00000200));
+        CLASSERT(OBD_FL_CREATE_CROW == (0x00000400));
 
         /* Checks for struct lov_mds_md_v1 */
         LASSERTF((int)sizeof(struct lov_mds_md_v1) == 32, " found %lld\n",
@@ -500,13 +476,29 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_idx));
         LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx));
-        LASSERTF(LOV_MAGIC_V1 == 198249424, " found %lld\n",
-                 (long long)LOV_MAGIC_V1);
+        CLASSERT(LOV_MAGIC_V1 == 0x0BD10BD0);
+        CLASSERT(LOV_MAGIC_JOIN == 0x0BD20BD0);
         LASSERTF(LOV_PATTERN_RAID0 == 1, " found %lld\n",
                  (long long)LOV_PATTERN_RAID0);
         LASSERTF(LOV_PATTERN_RAID1 == 2, " found %lld\n",
                  (long long)LOV_PATTERN_RAID1);
 
+        /* Checks for struct lov_mds_md_join */
+        LASSERTF((int)sizeof(struct lov_mds_md_join) == 56, " found %lld\n",
+                 (long long)(int)sizeof(struct lov_mds_md_join));
+        LASSERTF((int)offsetof(struct lov_mds_md_join, lmmj_md) == 0, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_mds_md_join, lmmj_md));
+        LASSERTF((int)sizeof(((struct lov_mds_md_join *)0)->lmmj_md) == 32, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_mds_md_join *)0)->lmmj_md));
+        LASSERTF((int)offsetof(struct lov_mds_md_join, lmmj_array_id) == 32, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_mds_md_join, lmmj_array_id));
+        LASSERTF((int)sizeof(((struct lov_mds_md_join *)0)->lmmj_array_id) == 20, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_mds_md_join *)0)->lmmj_array_id));
+        LASSERTF((int)offsetof(struct lov_mds_md_join, lmmj_extent_count) == 52, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_mds_md_join, lmmj_extent_count));
+        LASSERTF((int)sizeof(((struct lov_mds_md_join *)0)->lmmj_extent_count) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_mds_md_join *)0)->lmmj_extent_count));
+
         /* Checks for struct obd_statfs */
         LASSERTF((int)sizeof(struct obd_statfs) == 144, " found %lld\n",
                  (long long)(int)sizeof(struct obd_statfs));
@@ -546,6 +538,42 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct obd_statfs, os_state));
         LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_state) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct obd_statfs *)0)->os_state));
+        LASSERTF((int)offsetof(struct obd_statfs, os_spare1) == 108, " found %lld\n",
+                 (long long)(int)offsetof(struct obd_statfs, os_spare1));
+        LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare1) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare1));
+        LASSERTF((int)offsetof(struct obd_statfs, os_spare2) == 112, " found %lld\n",
+                 (long long)(int)offsetof(struct obd_statfs, os_spare2));
+        LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare2) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare2));
+        LASSERTF((int)offsetof(struct obd_statfs, os_spare3) == 116, " found %lld\n",
+                 (long long)(int)offsetof(struct obd_statfs, os_spare3));
+        LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare3) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare3));
+        LASSERTF((int)offsetof(struct obd_statfs, os_spare4) == 120, " found %lld\n",
+                 (long long)(int)offsetof(struct obd_statfs, os_spare4));
+        LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare4) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare4));
+        LASSERTF((int)offsetof(struct obd_statfs, os_spare5) == 124, " found %lld\n",
+                 (long long)(int)offsetof(struct obd_statfs, os_spare5));
+        LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare5) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare5));
+        LASSERTF((int)offsetof(struct obd_statfs, os_spare6) == 128, " found %lld\n",
+                 (long long)(int)offsetof(struct obd_statfs, os_spare6));
+        LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare6) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare6));
+        LASSERTF((int)offsetof(struct obd_statfs, os_spare7) == 132, " found %lld\n",
+                 (long long)(int)offsetof(struct obd_statfs, os_spare7));
+        LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare7) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare7));
+        LASSERTF((int)offsetof(struct obd_statfs, os_spare8) == 136, " found %lld\n",
+                 (long long)(int)offsetof(struct obd_statfs, os_spare8));
+        LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare8) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare8));
+        LASSERTF((int)offsetof(struct obd_statfs, os_spare9) == 140, " found %lld\n",
+                 (long long)(int)offsetof(struct obd_statfs, os_spare9));
+        LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare9) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare9));
 
         /* Checks for struct obd_ioobj */
         LASSERTF((int)sizeof(struct obd_ioobj) == 24, " found %lld\n",
@@ -654,6 +682,18 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct obd_dqblk, dqb_valid));
         LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_valid) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_valid));
+        LASSERTF((int)offsetof(struct obd_dqblk, padding) == 68, " found %lld\n",
+                 (long long)(int)offsetof(struct obd_dqblk, padding));
+        LASSERTF((int)sizeof(((struct obd_dqblk *)0)->padding) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct obd_dqblk *)0)->padding));
+        LASSERTF(Q_QUOTACHECK == 0x800100," found %lld\n",
+                 (long long)Q_QUOTACHECK);
+        LASSERTF(Q_INITQUOTA == 0x800101," found %lld\n",
+                 (long long)Q_INITQUOTA);
+        LASSERTF(Q_GETOINFO == 0x800102," found %lld\n",
+                 (long long)Q_GETOINFO);
+        LASSERTF(Q_GETOQUOTA == 0x800103," found %lld\n",
+                 (long long)Q_GETOQUOTA);
 
         /* Checks for struct niobuf_remote */
         LASSERTF((int)sizeof(struct niobuf_remote) == 16, " found %lld\n",
@@ -816,14 +856,14 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct mds_body, aclsize));
         LASSERTF((int)sizeof(((struct mds_body *)0)->aclsize) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct mds_body *)0)->aclsize));
-        LASSERTF((int)offsetof(struct mds_body, padding_2) == 156, " found %lld\n",
-                 (long long)(int)offsetof(struct mds_body, padding_2));
-        LASSERTF((int)sizeof(((struct mds_body *)0)->padding_2) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct mds_body *)0)->padding_2));
-        LASSERTF((int)offsetof(struct mds_body, padding_3) == 160, " found %lld\n",
-                 (long long)(int)offsetof(struct mds_body, padding_3));
-        LASSERTF((int)sizeof(((struct mds_body *)0)->padding_3) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct mds_body *)0)->padding_3));
+        LASSERTF((int)offsetof(struct mds_body, max_mdsize) == 156, " found %lld\n",
+                 (long long)(int)offsetof(struct mds_body, max_mdsize));
+        LASSERTF((int)sizeof(((struct mds_body *)0)->max_mdsize) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct mds_body *)0)->max_mdsize));
+        LASSERTF((int)offsetof(struct mds_body, max_cookiesize) == 160, " found %lld\n",
+                 (long long)(int)offsetof(struct mds_body, max_cookiesize));
+        LASSERTF((int)sizeof(((struct mds_body *)0)->max_cookiesize) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct mds_body *)0)->max_cookiesize));
         LASSERTF((int)offsetof(struct mds_body, padding_4) == 164, " found %lld\n",
                  (long long)(int)offsetof(struct mds_body, padding_4));
         LASSERTF((int)sizeof(((struct mds_body *)0)->padding_4) == 4, " found %lld\n",
@@ -834,22 +874,20 @@ void lustre_assert_wire_constants(void)
                  (long long)FMODE_WRITE);
         LASSERTF(FMODE_EXEC == 4, " found %lld\n",
                  (long long)FMODE_EXEC);
-        LASSERTF(MDS_OPEN_CREAT == 64, " found %lld\n",
-                 (long long)MDS_OPEN_CREAT);
-        LASSERTF(MDS_OPEN_EXCL == 128, " found %lld\n",
-                 (long long)MDS_OPEN_EXCL);
-        LASSERTF(MDS_OPEN_TRUNC == 512, " found %lld\n",
-                 (long long)MDS_OPEN_TRUNC);
-        LASSERTF(MDS_OPEN_APPEND == 1024, " found %lld\n",
-                 (long long)MDS_OPEN_APPEND);
-        LASSERTF(MDS_OPEN_SYNC == 4096, " found %lld\n",
-                 (long long)MDS_OPEN_SYNC);
-        LASSERTF(MDS_OPEN_DIRECTORY == 65536, " found %lld\n",
-                 (long long)MDS_OPEN_DIRECTORY);
-        LASSERTF(MDS_OPEN_DELAY_CREATE == 16777216, " found %lld\n",
-                 (long long)MDS_OPEN_DELAY_CREATE);
-        LASSERTF(MDS_OPEN_HAS_EA == 1073741824, " found %lld\n",
-                 (long long)MDS_OPEN_HAS_EA);
+        CLASSERT(MDS_OPEN_CREAT == 00000100);
+        CLASSERT(MDS_OPEN_EXCL == 00000200);
+        CLASSERT(MDS_OPEN_TRUNC == 00001000);
+        CLASSERT(MDS_OPEN_APPEND == 00002000);
+        CLASSERT(MDS_OPEN_SYNC == 00010000);
+        CLASSERT(MDS_OPEN_DIRECTORY == 00200000);
+        CLASSERT(MDS_OPEN_DELAY_CREATE == 0100000000);
+        CLASSERT(MDS_OPEN_OWNEROVERRIDE == 0200000000);
+        CLASSERT(MDS_OPEN_JOIN_FILE == 0400000000);
+        CLASSERT(MDS_OPEN_HAS_EA == 010000000000);
+        CLASSERT(MDS_OPEN_HAS_OBJS == 020000000000);
+        CLASSERT(MDS_INODELOCK_LOOKUP == 0x000001);
+        CLASSERT(MDS_INODELOCK_UPDATE == 0x000002);
+        CLASSERT(MDS_INODELOCK_OPEN == 0x000004);
 
         /* Checks for struct mds_rec_setattr */
         LASSERTF((int)sizeof(struct mds_rec_setattr) == 96, " found %lld\n",
@@ -1083,6 +1121,18 @@ void lustre_assert_wire_constants(void)
         LASSERTF((int)sizeof(((struct mds_rec_rename *)0)->rn_time) == 8, " found %lld\n",
                  (long long)(int)sizeof(((struct mds_rec_rename *)0)->rn_time));
 
+        /* Checks for struct mds_rec_join */
+        LASSERTF((int)sizeof(struct mds_rec_join) == 24, " found %lld\n",
+                 (long long)(int)sizeof(struct mds_rec_join));
+        LASSERTF((int)offsetof(struct mds_rec_join, jr_fid) == 0, " found %lld\n",
+                 (long long)(int)offsetof(struct mds_rec_join, jr_fid));
+        LASSERTF((int)sizeof(((struct mds_rec_join *)0)->jr_fid) == 16, " found %lld\n",
+                 (long long)(int)sizeof(((struct mds_rec_join *)0)->jr_fid));
+        LASSERTF((int)offsetof(struct mds_rec_join, jr_headsize) == 16, " found %lld\n",
+                 (long long)(int)offsetof(struct mds_rec_join, jr_headsize));
+        LASSERTF((int)sizeof(((struct mds_rec_join *)0)->jr_headsize) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct mds_rec_join *)0)->jr_headsize));
+
         /* Checks for struct lov_desc */
         LASSERTF((int)sizeof(struct lov_desc) == 88, " found %lld\n",
                  (long long)(int)sizeof(struct lov_desc));
@@ -1110,14 +1160,26 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct lov_desc, ld_default_stripe_offset));
         LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset) == 8, " found %lld\n",
                  (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset));
-        LASSERTF((int)offsetof(struct lov_desc, ld_qos_threshold) == 32, " found %lld\n",
-                 (long long)(int)offsetof(struct lov_desc, ld_qos_threshold));
-        LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_qos_threshold) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct lov_desc *)0)->ld_qos_threshold));
-        LASSERTF((int)offsetof(struct lov_desc, ld_qos_maxage) == 36, " found %lld\n",
-                 (long long)(int)offsetof(struct lov_desc, ld_qos_maxage));
-        LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_qos_maxage) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct lov_desc *)0)->ld_qos_maxage));
+        LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_offset) == 24, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_desc, ld_default_stripe_offset));
+        LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset));
+        LASSERTF((int)offsetof(struct lov_desc, ld_padding_1) == 32, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_desc, ld_padding_1));
+        LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_1) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_1));
+        LASSERTF((int)offsetof(struct lov_desc, ld_padding_2) == 36, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_desc, ld_padding_2));
+        LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_2) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_2));
+        LASSERTF((int)offsetof(struct lov_desc, ld_padding_3) == 40, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_desc, ld_padding_3));
+        LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_3) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_3));
+        LASSERTF((int)offsetof(struct lov_desc, ld_padding_4) == 44, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_desc, ld_padding_4));
+        LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_4) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_4));
         LASSERTF((int)offsetof(struct lov_desc, ld_uuid) == 48, " found %lld\n",
                  (long long)(int)offsetof(struct lov_desc, ld_uuid));
         LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_uuid) == 40, " found %lld\n",
@@ -1167,6 +1229,14 @@ void lustre_assert_wire_constants(void)
         LASSERTF((int)sizeof(((struct ldlm_flock *)0)->pid) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct ldlm_flock *)0)->pid));
 
+        /* Checks for struct ldlm_inodebits */
+        LASSERTF((int)sizeof(struct ldlm_inodebits) == 8, " found %lld\n",
+                 (long long)(int)sizeof(struct ldlm_inodebits));
+        LASSERTF((int)offsetof(struct ldlm_inodebits, bits) == 0, " found %lld\n",
+                 (long long)(int)offsetof(struct ldlm_inodebits, bits));
+        LASSERTF((int)sizeof(((struct ldlm_inodebits *)0)->bits) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct ldlm_inodebits *)0)->bits));
+
         /* Checks for struct ldlm_intent */
         LASSERTF((int)sizeof(struct ldlm_intent) == 8, " found %lld\n",
                  (long long)(int)sizeof(struct ldlm_intent));
@@ -1182,6 +1252,10 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct ldlm_resource_desc, lr_type));
         LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_type) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_type));
+        LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_padding) == 4, " found %lld\n",
+                 (long long)(int)offsetof(struct ldlm_resource_desc, lr_padding));
+        LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_padding) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_padding));
         LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_name) == 8, " found %lld\n",
                  (long long)(int)offsetof(struct ldlm_resource_desc, lr_name));
         LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_name) == 32, " found %lld\n",
@@ -1214,6 +1288,10 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct ldlm_request, lock_flags));
         LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_flags) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct ldlm_request *)0)->lock_flags));
+        LASSERTF((int)offsetof(struct ldlm_request, lock_padding) == 4, " found %lld\n",
+                 (long long)(int)offsetof(struct ldlm_request, lock_padding));
+        LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_padding) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct ldlm_request *)0)->lock_padding));
         LASSERTF((int)offsetof(struct ldlm_request, lock_desc) == 8, " found %lld\n",
                  (long long)(int)offsetof(struct ldlm_request, lock_desc));
         LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_desc) == 80, " found %lld\n",
@@ -1234,6 +1312,10 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct ldlm_reply, lock_flags));
         LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_flags) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_flags));
+        LASSERTF((int)offsetof(struct ldlm_request, lock_padding) == 4, " found %lld\n",
+                 (long long)(int)offsetof(struct ldlm_request, lock_padding));
+        LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_padding) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct ldlm_request *)0)->lock_padding));
         LASSERTF((int)offsetof(struct ldlm_request, lock_desc) == 8, " found %lld\n",
                  (long long)(int)offsetof(struct ldlm_request, lock_desc));
         LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_desc) == 80, " found %lld\n",
@@ -1275,62 +1357,6 @@ void lustre_assert_wire_constants(void)
         LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_blocks) == 8, " found %lld\n",
                  (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_blocks));
 
-        /* Checks for struct ptlbd_op */
-        LASSERTF((int)sizeof(struct ptlbd_op) == 12, " found %lld\n",
-                 (long long)(int)sizeof(struct ptlbd_op));
-        LASSERTF((int)offsetof(struct ptlbd_op, op_cmd) == 0, " found %lld\n",
-                 (long long)(int)offsetof(struct ptlbd_op, op_cmd));
-        LASSERTF((int)sizeof(((struct ptlbd_op *)0)->op_cmd) == 2, " found %lld\n",
-                 (long long)(int)sizeof(((struct ptlbd_op *)0)->op_cmd));
-        LASSERTF((int)offsetof(struct ptlbd_op, op_lun) == 2, " found %lld\n",
-                 (long long)(int)offsetof(struct ptlbd_op, op_lun));
-        LASSERTF((int)sizeof(((struct ptlbd_op *)0)->op_lun) == 2, " found %lld\n",
-                 (long long)(int)sizeof(((struct ptlbd_op *)0)->op_lun));
-        LASSERTF((int)offsetof(struct ptlbd_op, op_niob_cnt) == 4, " found %lld\n",
-                 (long long)(int)offsetof(struct ptlbd_op, op_niob_cnt));
-        LASSERTF((int)sizeof(((struct ptlbd_op *)0)->op_niob_cnt) == 2, " found %lld\n",
-                 (long long)(int)sizeof(((struct ptlbd_op *)0)->op_niob_cnt));
-        LASSERTF((int)offsetof(struct ptlbd_op, op__padding) == 6, " found %lld\n",
-                 (long long)(int)offsetof(struct ptlbd_op, op__padding));
-        LASSERTF((int)sizeof(((struct ptlbd_op *)0)->op__padding) == 2, " found %lld\n",
-                 (long long)(int)sizeof(((struct ptlbd_op *)0)->op__padding));
-        LASSERTF((int)offsetof(struct ptlbd_op, op_block_cnt) == 8, " found %lld\n",
-                 (long long)(int)offsetof(struct ptlbd_op, op_block_cnt));
-        LASSERTF((int)sizeof(((struct ptlbd_op *)0)->op_block_cnt) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct ptlbd_op *)0)->op_block_cnt));
-
-        /* Checks for struct ptlbd_niob */
-        LASSERTF((int)sizeof(struct ptlbd_niob) == 24, " found %lld\n",
-                 (long long)(int)sizeof(struct ptlbd_niob));
-        LASSERTF((int)offsetof(struct ptlbd_niob, n_xid) == 0, " found %lld\n",
-                 (long long)(int)offsetof(struct ptlbd_niob, n_xid));
-        LASSERTF((int)sizeof(((struct ptlbd_niob *)0)->n_xid) == 8, " found %lld\n",
-                 (long long)(int)sizeof(((struct ptlbd_niob *)0)->n_xid));
-        LASSERTF((int)offsetof(struct ptlbd_niob, n_block_nr) == 8, " found %lld\n",
-                 (long long)(int)offsetof(struct ptlbd_niob, n_block_nr));
-        LASSERTF((int)sizeof(((struct ptlbd_niob *)0)->n_block_nr) == 8, " found %lld\n",
-                 (long long)(int)sizeof(((struct ptlbd_niob *)0)->n_block_nr));
-        LASSERTF((int)offsetof(struct ptlbd_niob, n_offset) == 16, " found %lld\n",
-                 (long long)(int)offsetof(struct ptlbd_niob, n_offset));
-        LASSERTF((int)sizeof(((struct ptlbd_niob *)0)->n_offset) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct ptlbd_niob *)0)->n_offset));
-        LASSERTF((int)offsetof(struct ptlbd_niob, n_length) == 20, " found %lld\n",
-                 (long long)(int)offsetof(struct ptlbd_niob, n_length));
-        LASSERTF((int)sizeof(((struct ptlbd_niob *)0)->n_length) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct ptlbd_niob *)0)->n_length));
-
-        /* Checks for struct ptlbd_rsp */
-        LASSERTF((int)sizeof(struct ptlbd_rsp) == 4, " found %lld\n",
-                 (long long)(int)sizeof(struct ptlbd_rsp));
-        LASSERTF((int)offsetof(struct ptlbd_rsp, r_status) == 0, " found %lld\n",
-                 (long long)(int)offsetof(struct ptlbd_rsp, r_status));
-        LASSERTF((int)sizeof(((struct ptlbd_rsp *)0)->r_status) == 2, " found %lld\n",
-                 (long long)(int)sizeof(((struct ptlbd_rsp *)0)->r_status));
-        LASSERTF((int)offsetof(struct ptlbd_rsp, r_error_cnt) == 2, " found %lld\n",
-                 (long long)(int)offsetof(struct ptlbd_rsp, r_error_cnt));
-        LASSERTF((int)sizeof(((struct ptlbd_rsp *)0)->r_error_cnt) == 2, " found %lld\n",
-                 (long long)(int)sizeof(((struct ptlbd_rsp *)0)->r_error_cnt));
-
         /* Checks for struct llog_logid */
         LASSERTF((int)sizeof(struct llog_logid) == 20, " found %lld\n",
                  (long long)(int)sizeof(struct llog_logid));
@@ -1346,24 +1372,16 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct llog_logid, lgl_ogen));
         LASSERTF((int)sizeof(((struct llog_logid *)0)->lgl_ogen) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct llog_logid *)0)->lgl_ogen));
-        LASSERTF(OST_SZ_REC == 274730752, " found %lld\n",
-                 (long long)OST_SZ_REC);
-        LASSERTF(OST_RAID1_REC == 274731008, " found %lld\n",
-                 (long long)OST_RAID1_REC);
-        LASSERTF(MDS_UNLINK_REC == 274801668, " found %lld\n",
-                 (long long)MDS_UNLINK_REC);
-        LASSERTF(MDS_SETATTR_REC == 274801665, " found %lld\n",
-                 (long long)MDS_SETATTR_REC);
-        LASSERTF(OBD_CFG_REC == 274857984, " found %lld\n",
-                 (long long)OBD_CFG_REC);
-        LASSERTF(PTL_CFG_REC == 274923520, " found %lld\n",
-                 (long long)PTL_CFG_REC);
-        LASSERTF(LLOG_GEN_REC == 274989056, " found %lld\n",
-                 (long long)LLOG_GEN_REC);
-        LASSERTF(LLOG_HDR_MAGIC == 275010873, " found %lld\n",
-                 (long long)LLOG_HDR_MAGIC);
-        LASSERTF(LLOG_LOGID_MAGIC == 275010875, " found %lld\n",
-                 (long long)LLOG_LOGID_MAGIC);
+        CLASSERT(OST_SZ_REC == 274730752);
+        CLASSERT(OST_RAID1_REC == 274731008);
+        CLASSERT(MDS_UNLINK_REC == 274801668);
+        CLASSERT(MDS_SETATTR_REC == 274801665);
+        CLASSERT(OBD_CFG_REC == 274857984);
+        CLASSERT(PTL_CFG_REC == 274923520);
+        CLASSERT(LLOG_GEN_REC == 274989056);
+        CLASSERT(LLOG_JOIN_REC == 275054592);
+        CLASSERT(LLOG_HDR_MAGIC == 275010873);
+        CLASSERT(LLOG_LOGID_MAGIC == 275010875);
 
         /* Checks for struct llog_catid */
         LASSERTF((int)sizeof(struct llog_catid) == 32, " found %lld\n",
@@ -1372,6 +1390,18 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct llog_catid, lci_logid));
         LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_logid) == 20, " found %lld\n",
                  (long long)(int)sizeof(((struct llog_catid *)0)->lci_logid));
+        LASSERTF((int)offsetof(struct llog_catid, lci_padding1) == 20, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_catid, lci_padding1));
+        LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding1) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding1));
+        LASSERTF((int)offsetof(struct llog_catid, lci_padding2) == 24, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_catid, lci_padding2));
+        LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding2) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding2));
+        LASSERTF((int)offsetof(struct llog_catid, lci_padding3) == 28, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_catid, lci_padding3));
+        LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding3) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding3));
 
         /* Checks for struct llog_rec_hdr */
         LASSERTF((int)sizeof(struct llog_rec_hdr) == 16, " found %lld\n",
@@ -1388,6 +1418,10 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct llog_rec_hdr, lrh_type));
         LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_type) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_type));
+        LASSERTF((int)offsetof(struct llog_rec_hdr, padding) == 12, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_rec_hdr, padding));
+        LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->padding) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_rec_hdr *)0)->padding));
 
         /* Checks for struct llog_rec_tail */
         LASSERTF((int)sizeof(struct llog_rec_tail) == 8, " found %lld\n",
@@ -1412,6 +1446,26 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct llog_logid_rec, lid_id));
         LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_id) == 20, " found %lld\n",
                  (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_id));
+        LASSERTF((int)offsetof(struct llog_logid_rec, padding1) == 36, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_logid_rec, padding1));
+        LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->padding1) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_logid_rec *)0)->padding1));
+        LASSERTF((int)offsetof(struct llog_logid_rec, padding2) == 40, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_logid_rec, padding2));
+        LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->padding2) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_logid_rec *)0)->padding2));
+        LASSERTF((int)offsetof(struct llog_logid_rec, padding3) == 44, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_logid_rec, padding3));
+        LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->padding3) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_logid_rec *)0)->padding3));
+        LASSERTF((int)offsetof(struct llog_logid_rec, padding4) == 48, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_logid_rec, padding4));
+        LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->padding4) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_logid_rec *)0)->padding4));
+        LASSERTF((int)offsetof(struct llog_logid_rec, padding5) == 52, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_logid_rec, padding5));
+        LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->padding5) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_logid_rec *)0)->padding5));
         LASSERTF((int)offsetof(struct llog_logid_rec, lid_tail) == 56, " found %lld\n",
                  (long long)(int)offsetof(struct llog_logid_rec, lid_tail));
         LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_tail) == 8, " found %lld\n",
@@ -1436,6 +1490,10 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct llog_create_rec, lcr_ogen));
         LASSERTF((int)sizeof(((struct llog_create_rec *)0)->lcr_ogen) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct llog_create_rec *)0)->lcr_ogen));
+        LASSERTF((int)offsetof(struct llog_create_rec, padding) == 44, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_create_rec, padding));
+        LASSERTF((int)sizeof(((struct llog_create_rec *)0)->padding) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_create_rec *)0)->padding));
 
         /* Checks for struct llog_orphan_rec */
         LASSERTF((int)sizeof(struct llog_orphan_rec) == 40, " found %lld\n",
@@ -1452,6 +1510,10 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct llog_orphan_rec, lor_ogen));
         LASSERTF((int)sizeof(((struct llog_orphan_rec *)0)->lor_ogen) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct llog_orphan_rec *)0)->lor_ogen));
+        LASSERTF((int)offsetof(struct llog_orphan_rec, padding) == 28, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_orphan_rec, padding));
+        LASSERTF((int)sizeof(((struct llog_orphan_rec *)0)->padding) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_orphan_rec *)0)->padding));
         LASSERTF((int)offsetof(struct llog_orphan_rec, lor_tail) == 32, " found %lld\n",
                  (long long)(int)offsetof(struct llog_orphan_rec, lor_tail));
         LASSERTF((int)sizeof(((struct llog_orphan_rec *)0)->lor_tail) == 8, " found %lld\n",
@@ -1472,11 +1534,47 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct llog_unlink_rec, lur_ogen));
         LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_ogen) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_ogen));
+        LASSERTF((int)offsetof(struct llog_unlink_rec, padding) == 28, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_unlink_rec, padding));
+        LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->padding) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_unlink_rec *)0)->padding));
         LASSERTF((int)offsetof(struct llog_unlink_rec, lur_tail) == 32, " found %lld\n",
                  (long long)(int)offsetof(struct llog_unlink_rec, lur_tail));
         LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_tail) == 8, " found %lld\n",
                  (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_tail));
 
+        /* Checks for struct llog_setattr_rec */
+        LASSERTF((int)sizeof(struct llog_setattr_rec) == 48, " found %lld\n",
+                 (long long)(int)sizeof(struct llog_setattr_rec));
+        LASSERTF((int)offsetof(struct llog_setattr_rec, lsr_hdr) == 0, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr_rec, lsr_hdr));
+        LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_hdr) == 16, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_hdr));
+        LASSERTF((int)offsetof(struct llog_setattr_rec, lsr_oid) == 16, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr_rec, lsr_oid));
+        LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_oid) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_oid));
+        LASSERTF((int)offsetof(struct llog_setattr_rec, lsr_ogen) == 24, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr_rec, lsr_ogen));
+        LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_ogen) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_ogen));
+        LASSERTF((int)offsetof(struct llog_setattr_rec, lsr_uid) == 28, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr_rec, lsr_uid));
+        LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_uid) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_uid));
+        LASSERTF((int)offsetof(struct llog_setattr_rec, lsr_gid) == 32, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr_rec, lsr_gid));
+        LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_gid) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_gid));
+        LASSERTF((int)offsetof(struct llog_setattr_rec, padding) == 36, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr_rec, padding));
+        LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->padding) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr_rec *)0)->padding));
+        LASSERTF((int)offsetof(struct llog_setattr_rec, lsr_tail) == 40, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr_rec, lsr_tail));
+        LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_tail) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_tail));
+
         /* Checks for struct llog_size_change_rec */
         LASSERTF((int)sizeof(struct llog_size_change_rec) == 48, " found %lld\n",
                  (long long)(int)sizeof(struct llog_size_change_rec));
@@ -1492,6 +1590,10 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct llog_size_change_rec, lsc_io_epoch));
         LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_io_epoch) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_io_epoch));
+        LASSERTF((int)offsetof(struct llog_size_change_rec, padding) == 36, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_size_change_rec, padding));
+        LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->padding) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_size_change_rec *)0)->padding));
         LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_tail) == 40, " found %lld\n",
                  (long long)(int)offsetof(struct llog_size_change_rec, lsc_tail));
         LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_tail) == 8, " found %lld\n",
@@ -1588,6 +1690,10 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct llog_cookie, lgc_index));
         LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_index) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_index));
+        LASSERTF((int)offsetof(struct llog_cookie, lgc_padding) == 28, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_cookie, lgc_padding));
+        LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_padding) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_padding));
 
         /* Checks for struct llogd_body */
         LASSERTF((int)sizeof(struct llogd_body) == 48, " found %lld\n",
@@ -1620,20 +1726,15 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct llogd_body, lgd_cur_offset));
         LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_cur_offset) == 8, " found %lld\n",
                  (long long)(int)sizeof(((struct llogd_body *)0)->lgd_cur_offset));
-        LASSERTF(LLOG_ORIGIN_HANDLE_CREATE == 501, " found %lld\n",
-                 (long long)LLOG_ORIGIN_HANDLE_CREATE);
-        LASSERTF(LLOG_ORIGIN_HANDLE_NEXT_BLOCK == 502, " found %lld\n",
-                 (long long)LLOG_ORIGIN_HANDLE_NEXT_BLOCK);
-        LASSERTF(LLOG_ORIGIN_HANDLE_READ_HEADER == 503, " found %lld\n",
-                 (long long)LLOG_ORIGIN_HANDLE_READ_HEADER);
-        LASSERTF(LLOG_ORIGIN_HANDLE_WRITE_REC == 504, " found %lld\n",
-                 (long long)LLOG_ORIGIN_HANDLE_WRITE_REC);
-        LASSERTF(LLOG_ORIGIN_HANDLE_CLOSE == 505, " found %lld\n",
-                 (long long)LLOG_ORIGIN_HANDLE_CLOSE);
-        LASSERTF(LLOG_ORIGIN_CONNECT == 506, " found %lld\n",
-                 (long long)LLOG_ORIGIN_CONNECT);
-        LASSERTF(LLOG_CATINFO == 507, " found %lld\n",
-                 (long long)LLOG_CATINFO);
+        CLASSERT(LLOG_ORIGIN_HANDLE_CREATE == 501);
+        CLASSERT(LLOG_ORIGIN_HANDLE_NEXT_BLOCK == 502);
+        CLASSERT(LLOG_ORIGIN_HANDLE_READ_HEADER == 503);
+        CLASSERT(LLOG_ORIGIN_HANDLE_WRITE_REC == 504);
+        CLASSERT(LLOG_ORIGIN_HANDLE_CLOSE == 505);
+        CLASSERT(LLOG_ORIGIN_CONNECT == 506);
+        CLASSERT(LLOG_CATINFO == 507);
+        CLASSERT(LLOG_ORIGIN_HANDLE_PREV_BLOCK == 508);
+        CLASSERT(LLOG_ORIGIN_HANDLE_DESTROY == 509);
 
         /* Checks for struct llogd_conn_body */
         LASSERTF((int)sizeof(struct llogd_conn_body) == 40, " found %lld\n",
@@ -1651,6 +1752,38 @@ void lustre_assert_wire_constants(void)
         LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx) == 4, " found %lld\n",
                  (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx));
 
+        /* Checks for struct llog_array_rec */
+        LASSERTF((int)sizeof(struct llog_array_rec) == 72, " found %lld\n",
+                 (long long)(int)sizeof(struct llog_array_rec));
+        LASSERTF((int)offsetof(struct llog_array_rec, lmr_hdr) == 0, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_array_rec, lmr_hdr));
+        LASSERTF((int)sizeof(((struct llog_array_rec *)0)->lmr_hdr) == 16, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_array_rec *)0)->lmr_hdr));
+        LASSERTF((int)offsetof(struct llog_array_rec, lmr_med) == 16, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_array_rec, lmr_med));
+        LASSERTF((int)sizeof(((struct llog_array_rec *)0)->lmr_med) == 48, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_array_rec *)0)->lmr_med));
+        LASSERTF((int)offsetof(struct llog_array_rec, lmr_tail) == 64, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_array_rec, lmr_tail));
+        LASSERTF((int)sizeof(((struct llog_array_rec *)0)->lmr_tail) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_array_rec *)0)->lmr_tail));
+
+        /* Checks for struct mds_extent_desc */
+        LASSERTF((int)sizeof(struct mds_extent_desc) == 48, " found %lld\n",
+                 (long long)(int)sizeof(struct mds_extent_desc));
+        LASSERTF((int)offsetof(struct mds_extent_desc, med_start) == 0, " found %lld\n",
+                 (long long)(int)offsetof(struct mds_extent_desc, med_start));
+        LASSERTF((int)sizeof(((struct mds_extent_desc *)0)->med_start) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct mds_extent_desc *)0)->med_start));
+        LASSERTF((int)offsetof(struct mds_extent_desc, med_len) == 8, " found %lld\n",
+                 (long long)(int)offsetof(struct mds_extent_desc, med_len));
+        LASSERTF((int)sizeof(((struct mds_extent_desc *)0)->med_len) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct mds_extent_desc *)0)->med_len));
+        LASSERTF((int)offsetof(struct mds_extent_desc, med_lmm) == 16, " found %lld\n",
+                 (long long)(int)offsetof(struct mds_extent_desc, med_lmm));
+        LASSERTF((int)sizeof(((struct mds_extent_desc *)0)->med_lmm) == 32, " found %lld\n",
+                 (long long)(int)sizeof(((struct mds_extent_desc *)0)->med_lmm));
+
         /* Checks for struct qunit_data */
         LASSERTF((int)sizeof(struct qunit_data) == 16, " found %lld\n",
                  (long long)(int)sizeof(struct qunit_data));