Whamcloud - gitweb
Lproc-snmp code drop
[fs/lustre-release.git] / lustre / mdc / mdc_request.c
index 55630b9..4b5f6b2 100644 (file)
  */
 
 #define EXPORT_SYMTAB
+#define DEBUG_SUBSYSTEM S_MDC
 
 #include <linux/module.h>
 #include <linux/miscdevice.h>
-
-#define DEBUG_SUBSYSTEM S_MDC
-
 #include <linux/lustre_mds.h>
+#include <linux/lustre_lite.h>
+#include <linux/lustre_dlm.h>
+#include <linux/init.h>
+#include <linux/obd_lov.h>
+#include <linux/lprocfs_status.h>
 
 #define REQUEST_MINOR 244
 
 extern int mds_queue_req(struct ptlrpc_request *);
+extern lprocfs_vars_t status_var_nm_1[];
+extern lprocfs_vars_t status_class_var[];
 
-int mdc_connect(struct ptlrpc_client *cl, struct lustre_peer *peer, ino_t ino,
-                int type, int valid, struct ptlrpc_request **request)
+/* Send an MDS_GETSTATUS request over the import behind conn and, when the
+ * reply checks out, copy fid1 of the reply body into *rootfid.
+ *
+ * The request is handed to ptlrpc_req_finished() before returning, on the
+ * success and the failure path alike, so nothing is passed back to the
+ * caller except rootfid.
+ *
+ * Returns -ENOMEM if the request cannot be prepared, otherwise the value
+ * produced by ptlrpc_check_status().
+ *
+ * should become mdc_getinfo() */
+int mdc_getstatus(struct lustre_handle *conn, struct ll_fid *rootfid)
 {
         struct ptlrpc_request *req;
         struct mds_body *body;
         int rc, size = sizeof(*body);
         ENTRY;
 
-        req = ptlrpc_prep_req(cl, peer, MDS_GETATTR, 1, &size, NULL);
+        req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_GETSTATUS, 1, &size,
+                              NULL);
         if (!req)
                 GOTO(out, rc = -ENOMEM);
 
         body = lustre_msg_buf(req->rq_reqmsg, 0);
-        ll_ino2fid(&body->fid1, ino, 0, type);
-        body->valid = valid;
-
+        req->rq_level = LUSTRE_CONN_CON;
         req->rq_replen = lustre_msg_size(1, &size); /* reply: one mds_body */
 
-        rc = ptlrpc_queue_wait(cl, req);
+        mds_pack_req_body(req);
+        rc = ptlrpc_queue_wait(req);
         rc = ptlrpc_check_status(req, rc);
 
         if (!rc) {
-                mds_unpack_body(req);
                 body = lustre_msg_buf(req->rq_repmsg, 0);
-                CDEBUG(D_NET, "mode: %o\n", body->mode);
+                mds_unpack_body(body);
+                memcpy(rootfid, &body->fid1, sizeof(*rootfid));
+
+                CDEBUG(D_NET, "root ino="LPU64", last_committed="LPU64
+                       ", last_xid="LPU64"\n",
+                       rootfid->id, req->rq_repmsg->last_committed,
+                       req->rq_repmsg->last_xid);
         }
 
         EXIT;
  out:
-        *request = req;
+        ptlrpc_req_finished(req); /* also reached with req == NULL on -ENOMEM */
         return rc;
 }
 
+/*
+ * Ask the MDS for its LOV description (MDS_GETLOVINFO).
+ *
+ * The request carries a single struct mds_status_req with flags set to
+ * MDS_STATUS_LOV and repbuf set to 8192; the reply is sized for two buffers
+ * of 512 and 8192 bytes.
+ *
+ * obd        not used by this function.
+ * mdc_connh  connection handle whose import the request is sent on.
+ * request    out: the request that was sent, or NULL when it could not be
+ *            allocated.  It is not released here.
+ *
+ * Returns -ENOMEM if the request cannot be prepared, otherwise the value
+ * produced by ptlrpc_check_status().
+ */
+int mdc_getlovinfo(struct obd_device *obd, struct lustre_handle *mdc_connh,
+                   struct ptlrpc_request **request)
+{
+        struct ptlrpc_request *req;
+        struct mds_status_req *streq;
+        int rc, size[2] = {sizeof(*streq)};
+        ENTRY;
+
+        req = ptlrpc_prep_req(class_conn2cliimp(mdc_connh), MDS_GETLOVINFO, 1,
+                              size, NULL);
+        if (!req)
+                GOTO(out, rc = -ENOMEM);
+
+        streq = lustre_msg_buf(req->rq_reqmsg, 0);
+        streq->flags = HTON__u32(MDS_STATUS_LOV);
+        streq->repbuf = HTON__u32(8192);
+
+        /* prepare for reply */
+        req->rq_level = LUSTRE_CONN_CON;
+        size[0] = 512;
+        size[1] = 8192;
+        req->rq_replen = lustre_msg_size(2, size);
+
+        rc = ptlrpc_queue_wait(req);
+        rc = ptlrpc_check_status(req, rc);
+
+ out:
+        /* Store the pointer on every path, as mdc_open() and mdc_readpage()
+         * do, so the caller never sees an unset *request after -ENOMEM. */
+        *request = req;
+        RETURN(rc);
+}
+
 
-int mdc_getattr(struct ptlrpc_client *cl, struct lustre_peer *peer, ino_t ino,
-                int type, int valid, struct ptlrpc_request **request)
+int mdc_getattr(struct lustre_handle *conn,
+                obd_id ino, int type, unsigned long valid, size_t ea_size,
+                struct ptlrpc_request **request)
 {
         struct ptlrpc_request *req;
         struct mds_body *body;
-        int rc, size = sizeof(*body);
+        int rc, size[2] = {sizeof(*body), 0}, bufcount = 1;
         ENTRY;
 
-        req = ptlrpc_prep_req(cl, peer, MDS_GETATTR, 1, &size, NULL);
+        req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_GETATTR, 1, size,
+                              NULL);
         if (!req)
                 GOTO(out, rc = -ENOMEM);
 
@@ -83,14 +126,26 @@ int mdc_getattr(struct ptlrpc_client *cl, struct lustre_peer *peer, ino_t ino,
         ll_ino2fid(&body->fid1, ino, 0, type);
         body->valid = valid;
 
-        req->rq_replen = lustre_msg_size(1, &size);
+        if (S_ISREG(type)) {
+                struct client_obd *mdc = &class_conn2obd(conn)->u.cli;
+                bufcount = 2;
+                size[1] = mdc->cl_max_mds_easize;
+        } else if (valid & OBD_MD_LINKNAME) {
+                bufcount = 2;
+                size[1] = ea_size;
+                body->size = ea_size;
+                CDEBUG(D_INODE, "allocating %d bytes for symlink in packet\n",
+                       ea_size);
+        }
+        req->rq_replen = lustre_msg_size(bufcount, size);
+        mds_pack_req_body(req);
 
-        rc = ptlrpc_queue_wait(cl, req);
+        rc = ptlrpc_queue_wait(req);
         rc = ptlrpc_check_status(req, rc);
 
         if (!rc) {
-                mds_unpack_body(req);
                 body = lustre_msg_buf(req->rq_repmsg, 0);
+                mds_unpack_body(body);
                 CDEBUG(D_NET, "mode: %o\n", body->mode);
         }
 
@@ -100,56 +155,535 @@ int mdc_getattr(struct ptlrpc_client *cl, struct lustre_peer *peer, ino_t ino,
         return rc;
 }
 
-int mdc_open(struct ptlrpc_client *cl, struct lustre_peer *peer, ino_t ino,
-             int type, int flags, __u64 *fh, struct ptlrpc_request **request)
+/*
+ * Delete the dentry alias of an inode, if it has one.
+ *
+ * The alias list is walked under dcache_lock; more than one alias trips the
+ * LASSERT below.  When an alias exists it is pinned with dget_locked() while
+ * the lock is still held, then d_delete()d and released after the lock has
+ * been dropped.
+ */
+static void d_delete_aliases(struct inode *inode)
+{
+        struct list_head *pos;
+        struct dentry *alias = NULL;
+        int nr_aliases = 0;
+        ENTRY;
+
+        spin_lock(&dcache_lock);
+        list_for_each(pos, &inode->i_dentry) {
+                alias = list_entry(pos, struct dentry, d_alias);
+                nr_aliases++;
+        }
+
+        /* XXX FIXME tell phil/peter that you see this -- unless you're playing
+         * with hard links, in which case, stop. */
+        LASSERT(nr_aliases <= 1);
+
+        if (nr_aliases != 0) {
+                CDEBUG(D_INODE, "d_deleting dentry %p\n", alias);
+                dget_locked(alias);
+                spin_unlock(&dcache_lock);
+                d_delete(alias);
+                dput(alias);
+        } else {
+                /* nothing hashed against this inode */
+                spin_unlock(&dcache_lock);
+        }
+        EXIT;
+}
+
+/*
+ * Callback passed to ldlm_cli_enqueue() by mdc_enqueue().
+ *
+ * flag == LDLM_CB_BLOCKING:  the lock is cancelled through its handle with
+ *                            ldlm_cli_cancel(); a negative result is logged
+ *                            and returned.
+ * flag == LDLM_CB_CANCELING: data must be a struct inode (data_len is
+ *                            asserted to be sizeof(struct inode)).  Pages of
+ *                            a directory inode are invalidated, then the
+ *                            inode's dentry alias is deleted.
+ * Any other flag is treated as a bug (LBUG).
+ *
+ * desc is not used.  Returns 0, or the negative ldlm_cli_cancel() error.
+ */
+static int mdc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+                            void *data, __u32 data_len, int flag)
+{
+        int rc;
+        struct lustre_handle lockh;
+        ENTRY;
+
+        switch (flag) {
+        case LDLM_CB_BLOCKING:
+                ldlm_lock2handle(lock, &lockh);
+                rc = ldlm_cli_cancel(&lockh);
+                if (rc < 0) {
+                        CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
+                        RETURN(rc);
+                }
+                break;
+        case LDLM_CB_CANCELING: {
+                /* Invalidate all dentries associated with this inode */
+                struct inode *inode = data;
+                struct inode *grabbed;
+
+                LASSERT(inode != NULL);
+                LASSERT(data_len == sizeof(*inode));
+
+                if (S_ISDIR(inode->i_mode)) {
+                        CDEBUG(D_INODE, "invalidating inode %ld\n",
+                               inode->i_ino);
+                        ll_invalidate_inode_pages(inode);
+                }
+
+                /* Take the reference outside the assertion: the iput()
+                 * below must stay balanced even if LASSERT() is ever built
+                 * as a no-op. */
+                grabbed = igrab(inode);
+                LASSERT(grabbed == inode);
+                d_delete_aliases(inode);
+                iput(inode);
+                break;
+        }
+        default:
+                LBUG();
+        }
+
+        RETURN(0);
+}
+/*
+ * State kept with a create request so the create can be replayed.
+ * mdc_store_create_replay_data() writes it into request buffer 5;
+ * mdc_replay_create() and create_replay_find_inode() read it back from the
+ * same buffer.
+ */
+struct create_replay_data {
+        struct super_block *sb;         /* superblock used for the inode lookup on replay */
+        u32                 generation; /* generation taken from the original reply body */
+};
+
+/*
+ * Inode-match callback handed to iget4() by mdc_replay_create(); opaque is
+ * the create request being replayed.
+ *
+ * An inode matches when its i_generation equals the generation that was
+ * saved in request buffer 5 when the create was first sent.  On a match the
+ * inode's generation is overwritten with the one found in reply buffer 1.
+ *
+ * Returns 1 for a match (after the update), 0 for a generation mismatch.
+ */
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+static int create_replay_find_inode(struct inode *inode, unsigned long ino,
+                                    void *opaque)
+#else
+static int create_replay_find_inode(struct inode *inode, void *opaque)
+#endif
 {
+        struct ptlrpc_request *req = opaque;
+        struct create_replay_data *saved;
         struct mds_body *body;
-        int rc, size = sizeof(*body);
+
+        saved = lustre_msg_buf(req->rq_reqmsg, 5); /* lock with intent */
+
+        if (saved->generation != inode->i_generation) {
+                /* i_ino is an unsigned long, so it needs %lu, not %u */
+                CDEBUG(D_HA,
+                       "generation mismatch for ino %lu: saved %u != inode %u\n",
+                       inode->i_ino, saved->generation, inode->i_generation);
+                return 0;
+        }
+
+        body = lustre_msg_buf(req->rq_repmsg, 1);
+
+        /* XXX do I need more out of ll_update_inode? */
+        CDEBUG(D_HA, "updating inode %lu generation %u to %u\n",
+               inode->i_ino, inode->i_generation, body->generation);
+
+        inode->i_generation = body->generation;
+
+        return 1;
+}
+
+/*
+ * Teach one queued request about the new generation of a recreated inode.
+ *
+ * fixreq  request that may still refer to the inode by its old generation.
+ * req     the replayed create; only requests on the same import are touched.
+ * inode   the recreated inode, already carrying its new i_generation.
+ *
+ * Handled requests:
+ *   LDLM_ENQUEUE  intent locks; the operation record is read from buffer 2.
+ *   MDS_REINT     the operation record is buffer 0.
+ *   MDS_OPEN      the mds_body in buffer 0 carries the fid.
+ * Everything else is skipped.  A fid is only rewritten when its id equals
+ * inode->i_ino.
+ */
+static void fixup_req_for_recreate(struct ptlrpc_request *fixreq,
+                                   struct ptlrpc_request *req,
+                                   struct inode *inode)
+{
+        struct ldlm_request *lockreq;
+        struct mds_rec_link *rec; /* representative, two-fid op structure */
+        int opc;
+
+        if (fixreq->rq_import != req->rq_import) {
+                DEBUG_REQ(D_HA, fixreq, "import mismatch, skipping");
+                return;
+        }
+
+        DEBUG_REQ(D_HA, fixreq, "fixing");
+
+        /* XXX check replay_state to see if we'll actually replay. */
+
+        /* We only care about LDLM_ENQUEUE and MDS_REINT requests. */
+        if (fixreq->rq_reqmsg->opc == LDLM_ENQUEUE) {
+                lockreq = lustre_msg_buf(fixreq->rq_reqmsg, 0);
+
+                if (lockreq->lock_desc.l_resource.lr_type != LDLM_PLAIN &&
+                    !(lockreq->lock_flags & LDLM_FL_HAS_INTENT)) {
+                        DEBUG_REQ(D_HA, fixreq, "non-intent lock, skipping");
+                        return;
+                }
+
+                /* The record lives in buffer index 2, so at least three
+                 * buffers must be present before it may be looked at. */
+                if (fixreq->rq_reqmsg->bufcount < 3) {
+                        DEBUG_REQ(D_HA, fixreq,
+                                  "short intent (probably readdir), skipping");
+                        return;
+                }
+
+                /* XXX endianness is probably very very wrong here. Very. */
+                rec = lustre_msg_buf(fixreq->rq_reqmsg, 2);
+        } else if (fixreq->rq_reqmsg->opc == MDS_REINT) {
+                rec = lustre_msg_buf(fixreq->rq_reqmsg, 0);
+        } else if (fixreq->rq_reqmsg->opc == MDS_OPEN) {
+                struct mds_body *body = lustre_msg_buf(fixreq->rq_reqmsg, 0);
+
+                /* Only opens of the recreated inode get the new generation,
+                 * matching the id checks done for fid1/fid2 below. */
+                if (body->fid1.id == inode->i_ino) {
+                        DEBUG_REQ(D_HA, fixreq, "fixing fid1: %u -> %u",
+                                  body->fid1.generation, inode->i_generation);
+                        body->fid1.generation = inode->i_generation;
+                }
+                return;
+        } else {
+                DEBUG_REQ(D_HA, fixreq, "not a replayable request, skipping");
+                return;
+        }
+
+        if (rec->lk_fid1.id == inode->i_ino) {
+                DEBUG_REQ(D_HA, fixreq, "fixing fid1: %u -> %u",
+                          rec->lk_fid1.generation, inode->i_generation);
+                rec->lk_fid1.generation = inode->i_generation;
+        }
+
+        /* Some ops have two FIDs. ZZZ We rely on the identical
+         * placement of that second FID in all such ops' messages.
+         */
+        opc = rec->lk_opcode & REINT_OPCODE_MASK;
+        if ((opc == REINT_LINK || opc == REINT_UNLINK ||
+             opc == REINT_RENAME) &&
+            rec->lk_fid2.id == inode->i_ino) {
+                DEBUG_REQ(D_HA, fixreq, "fixing fid2: %u -> %u",
+                          rec->lk_fid2.generation, inode->i_generation);
+                rec->lk_fid2.generation = inode->i_generation;
+        }
+}
+
+static void mdc_replay_create(struct ptlrpc_request *req)
+{
+        struct create_replay_data *saved;
+        struct mds_body *body;
+        struct inode *inode;
+        struct list_head *tmp;
+
+        if (req->rq_reqmsg->opc == MDS_REINT)
+                LBUG(); /* XXX don't handle the non-intent case yet */
+
+        body = lustre_msg_buf(req->rq_repmsg, 1);
+        saved = lustre_msg_buf(req->rq_reqmsg, 5); /* lock with intent */
+
+        CDEBUG(D_HA, "create of inode %d replayed; gen %u -> %u\n",
+               body->fid1.id, saved->generation, body->generation);
+        /* XXX cargo-culted right out of ll_iget */
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+        inode = iget4(saved->sb, body->fid1.id, create_replay_find_inode, req);
+#endif
+#if 0
+        {
+                extern int ll_read_inode2(struct inode *inode, void *opaque);
+                inode = iget5_locked(saved->sb, body->fid1.id,
+                                     create_replay_find_inode, 
+                                     ll_read_inode2, req);
+
+                if (!inode)
+                        LBUG(); /* XXX ick */
+                
+                if (inode->i_state & I_NEW)
+                        unlock_new_inode(inode);
+        }
+#endif
+
+        /* Now that we've updated the generation, we need to go and find all
+         * the other requests that refer to this file and will be replayed,
+         * and teach them about our new generation.
+         */
+        list_for_each(tmp, &req->rq_connection->c_sending_head) {
+                struct ptlrpc_request *fixreq =
+                        list_entry(tmp, struct ptlrpc_request, rq_list);
+
+                fixup_req_for_recreate(fixreq, req, inode);
+        }
+
+        list_for_each(tmp, &req->rq_connection->c_delayed_head) {
+                struct ptlrpc_request *fixreq =
+                        list_entry(tmp, struct ptlrpc_request, rq_list);
+
+                fixup_req_for_recreate(fixreq, req, inode);
+        }
+}
+
+/*
+ * Arm a create request for replay.
+ *
+ * Records the superblock and the generation found in reply buffer 1 inside
+ * request buffer 5 (a struct create_replay_data), and installs
+ * mdc_replay_create() as the request's replay callback.  Only the intent
+ * form is supported: an MDS_REINT request is a bug.
+ */
+void mdc_store_create_replay_data(struct ptlrpc_request *req,
+                                  struct super_block *sb)
+{
+        struct create_replay_data *replay;
+        struct mds_body *reply_body;
+
+        replay = lustre_msg_buf(req->rq_reqmsg, 5);
+        reply_body = lustre_msg_buf(req->rq_repmsg, 1);
+
+        if (req->rq_reqmsg->opc == MDS_REINT)
+                LBUG(); /* XXX don't handle the non-intent case yet */
+
+        replay->sb = sb; /* XXX is this safe? */
+        replay->generation = reply_body->generation;
+
+        req->rq_replay_cb = mdc_replay_create;
+}
+/*
+ * Enqueue a lock on directory dir together with an intent describing the
+ * operation the caller is about to perform.
+ *
+ * An LDLM_ENQUEUE request is built whose layout depends on it->it_op.
+ * Buffer 0 is sized for a struct ldlm_request and buffer 1 for a struct
+ * ldlm_intent; the remaining buffers are:
+ *   MKDIR/CREAT/SYMLINK/MKNOD    mds_rec_create, name, tgt, and a
+ *                                struct create_replay_data in buffer 5
+ *   IT_RENAME2                   mds_rec_rename, old name, new name
+ *   IT_LINK2                     mds_rec_link, name
+ *   IT_UNLINK / IT_RMDIR         mds_rec_unlink, name
+ *   GETATTR/RENAME/LINK/OPEN/
+ *   SETATTR/LOOKUP/READLINK      mds_body, name (IT_OPEN requests are also
+ *                                flagged PTL_RPC_FL_REPLAY)
+ *   IT_READDIR                   request prepared with a single buffer
+ * Any other operation is a bug (LBUG, then -EINVAL).
+ *
+ * The request is then passed to ldlm_cli_enqueue() with mdc_blocking_ast as
+ * the blocking callback and data/datalen as its callback data.
+ *
+ * On return of 0: it->it_disposition and it->it_status hold
+ * lock_policy_res1/res2 of the reply, it->it_lock_mode holds the lock mode
+ * (0 if the enqueue was aborted, in which case *lockh is zeroed), and
+ * it->it_data points at the request.
+ *
+ * Returns 0, -ENOMEM if the request cannot be prepared, -EINVAL for an
+ * unknown intent, or the non-zero result of ldlm_cli_enqueue().
+ *
+ * NOTE(review): the intent opcode is packed with NTOH__u64 although this is
+ * the sending side -- presumably equivalent to HTON__u64; confirm.
+ * NOTE(review): when ldlm_cli_enqueue() fails, req is not released in this
+ * function -- verify that the callee takes care of it.
+ */
+int mdc_enqueue(struct lustre_handle *conn, int lock_type,
+                struct lookup_intent *it, int lock_mode, struct inode *dir,
+                struct dentry *de, struct lustre_handle *lockh,
+                char *tgt, int tgtlen, void *data, int datalen)
+{
         struct ptlrpc_request *req;
+        struct obd_device *obddev = class_conn2obd(conn);
+        __u64 res_id[RES_NAME_SIZE] = {dir->i_ino, (__u64)dir->i_generation};
+        int size[6] = {sizeof(struct ldlm_request), sizeof(struct ldlm_intent)};
+        int rc, flags = LDLM_FL_HAS_INTENT;
+        int repsize[3] = {sizeof(struct ldlm_reply),
+                          sizeof(struct mds_body),
+                          obddev->u.cli.cl_max_mds_easize};
+        struct ldlm_reply *dlm_rep;
+        struct ldlm_intent *lit;
+        struct ldlm_request *lockreq;
+        ENTRY;
+
+        LDLM_DEBUG_NOLOCK("mdsintent %s dir %ld", ldlm_it2str(it->it_op),
+                          dir->i_ino);
 
-        req = ptlrpc_prep_req(cl, peer, MDS_OPEN, 1, &size, NULL);
+        if (it->it_op & (IT_MKDIR | IT_CREAT | IT_SYMLINK | IT_MKNOD)) {
+                switch (it->it_op) { /* add the file-type bits for the new object */
+                case IT_MKDIR:
+                        it->it_mode |= S_IFDIR;
+                        break;
+                case (IT_CREAT|IT_OPEN):
+                case IT_CREAT:
+                        it->it_mode |= S_IFREG;
+                        break;
+                case IT_SYMLINK:
+                        it->it_mode |= S_IFLNK;
+                        break;
+                }
+                it->it_mode &= ~current->fs->umask;
+
+                size[2] = sizeof(struct mds_rec_create);
+                size[3] = de->d_name.len + 1;
+                size[4] = tgtlen + 1;
+                size[5] = sizeof(struct create_replay_data);
+                req = ptlrpc_prep_req(class_conn2cliimp(conn), LDLM_ENQUEUE, 6,
+                                      size, NULL);
+                if (!req)
+                        RETURN(-ENOMEM);
+
+                /* pack the intent */
+                lit = lustre_msg_buf(req->rq_reqmsg, 1);
+                lit->opc = NTOH__u64((__u64)it->it_op);
+
+                /* pack the intended request */
+                mds_create_pack(req, 2, dir, it->it_mode, 0, current->fsuid,
+                                current->fsgid, CURRENT_TIME, de->d_name.name,
+                                de->d_name.len, tgt, tgtlen);
+                req->rq_replen = lustre_msg_size(3, repsize);
+        } else if (it->it_op == IT_RENAME2) {
+                struct dentry *old_de = it->it_data; /* source dentry comes in via it_data */
+
+                size[2] = sizeof(struct mds_rec_rename);
+                size[3] = old_de->d_name.len + 1;
+                size[4] = de->d_name.len + 1;
+                req = ptlrpc_prep_req(class_conn2cliimp(conn), LDLM_ENQUEUE, 5,
+                                      size, NULL);
+                if (!req)
+                        RETURN(-ENOMEM);
+
+                /* pack the intent */
+                lit = lustre_msg_buf(req->rq_reqmsg, 1);
+                lit->opc = NTOH__u64((__u64)it->it_op);
+
+                /* pack the intended request */
+                mds_rename_pack(req, 2, old_de->d_parent->d_inode, dir,
+                                old_de->d_name.name, old_de->d_name.len,
+                                de->d_name.name, de->d_name.len);
+                req->rq_replen = lustre_msg_size(3, repsize);
+        } else if (it->it_op == IT_LINK2) {
+                struct dentry *old_de = it->it_data; /* existing dentry comes in via it_data */
+
+                size[2] = sizeof(struct mds_rec_link);
+                size[3] = de->d_name.len + 1;
+                req = ptlrpc_prep_req(class_conn2cliimp(conn), LDLM_ENQUEUE, 4,
+                                      size, NULL);
+                if (!req)
+                        RETURN(-ENOMEM);
+
+                /* pack the intent */
+                lit = lustre_msg_buf(req->rq_reqmsg, 1);
+                lit->opc = NTOH__u64((__u64)it->it_op);
+
+                /* pack the intended request */
+                mds_link_pack(req, 2, old_de->d_inode, dir,
+                              de->d_name.name, de->d_name.len);
+                req->rq_replen = lustre_msg_size(3, repsize);
+        } else if (it->it_op == IT_UNLINK || it->it_op == IT_RMDIR) {
+                size[2] = sizeof(struct mds_rec_unlink);
+                size[3] = de->d_name.len + 1;
+                req = ptlrpc_prep_req(class_conn2cliimp(conn), LDLM_ENQUEUE, 4,
+                                      size, NULL);
+                if (!req)
+                        RETURN(-ENOMEM);
+
+                /* pack the intent */
+                lit = lustre_msg_buf(req->rq_reqmsg, 1);
+                lit->opc = NTOH__u64((__u64)it->it_op);
+
+                /* pack the intended request */
+                mds_unlink_pack(req, 2, dir, NULL,
+                                it->it_op == IT_UNLINK ? S_IFREG : S_IFDIR,
+                                de->d_name.name, de->d_name.len);
+
+                req->rq_replen = lustre_msg_size(3, repsize);
+        } else if (it->it_op  & (IT_GETATTR | IT_RENAME | IT_LINK | 
+                   IT_OPEN |  IT_SETATTR | IT_LOOKUP | IT_READLINK)) {
+                size[2] = sizeof(struct mds_body);
+                size[3] = de->d_name.len + 1;
+
+                req = ptlrpc_prep_req(class_conn2cliimp(conn), LDLM_ENQUEUE, 4,
+                                      size, NULL);
+                if (!req)
+                        RETURN(-ENOMEM);
+
+                /* pack the intent */
+                lit = lustre_msg_buf(req->rq_reqmsg, 1);
+                lit->opc = NTOH__u64((__u64)it->it_op);
+
+                /* pack the intended request */
+                mds_getattr_pack(req, 2, dir, de->d_name.name, de->d_name.len);
+
+                /* we need to replay opens */
+                if (it->it_op == IT_OPEN)
+                        req->rq_flags |= PTL_RPC_FL_REPLAY;
+
+                /* get ready for the reply */
+                req->rq_replen = lustre_msg_size(3, repsize);
+        } else if (it->it_op == IT_READDIR) {
+                req = ptlrpc_prep_req(class_conn2cliimp(conn), LDLM_ENQUEUE, 1,
+                                      size, NULL);
+                if (!req)
+                        RETURN(-ENOMEM);
+
+                /* get ready for the reply */
+                req->rq_replen = lustre_msg_size(1, repsize);
+        } else {
+                LBUG();
+                RETURN(-EINVAL);
+        }
+#warning FIXME: the data here needs to be different if a lock was granted for a different inode
+        rc = ldlm_cli_enqueue(conn, req, obddev->obd_namespace, NULL, res_id,
+                              lock_type, NULL, 0, lock_mode, &flags,
+                              ldlm_completion_ast, mdc_blocking_ast, data,
+                              datalen, lockh);
+        if (rc == -ENOENT) {
+                /* This can go when we're sure that this can never happen */
+                LBUG();
+        }
+        if (rc == ELDLM_LOCK_ABORTED) { /* not an error: carry on without a lock */
+                lock_mode = 0;
+                memset(lockh, 0, sizeof(*lockh));
+                /* rc = 0 */
+        } else if (rc != 0) {
+                CERROR("ldlm_cli_enqueue: %d\n", rc);
+                RETURN(rc);
+        }
+
+        /* On replay, we don't want the lock granted. */
+        lockreq = lustre_msg_buf(req->rq_reqmsg, 0);
+        lockreq->lock_flags |= LDLM_FL_INTENT_ONLY;
+
+        dlm_rep = lustre_msg_buf(req->rq_repmsg, 0);
+        it->it_disposition = (int) dlm_rep->lock_policy_res1;
+        it->it_status = (int) dlm_rep->lock_policy_res2;
+        it->it_lock_mode = lock_mode;
+        it->it_data = req; /* the request travels back to the caller in the intent */
+
+        RETURN(0);
+}
+
+/*
+ * Cancel the unused locks held on an inode's resource.
+ *
+ * The resource name is built from the inode number and generation, and the
+ * work is delegated to ldlm_cli_cancel_unused() on the namespace of the obd
+ * behind conn.  flags is passed through unchanged, and so is the result.
+ */
+int mdc_cancel_unused(struct lustre_handle *conn, struct inode *inode,
+                      int flags)
+{
+        struct obd_device *obd = class_conn2obd(conn);
+        __u64 res_id[RES_NAME_SIZE] = {inode->i_ino, inode->i_generation};
+        int rc;
+        ENTRY;
+
+        rc = ldlm_cli_cancel_unused(obd->obd_namespace, res_id, flags);
+        RETURN(rc);
+}
+/*
+ * Replay bookkeeping for an open.  mdc_open() stores it in the request
+ * buffer that follows the body (and the packed EA, when one is sent);
+ * mdc_replay_open() reads it back from there.
+ */
+struct replay_open_data {
+        struct lustre_handle *fh;       /* caller's file handle, overwritten when the open is replayed */
+};
+
+/*
+ * Replay callback for opens, installed by mdc_open().
+ *
+ * Copies the handle found in the replay reply body over the file handle
+ * whose address was stashed in the request.  The stash sits in request
+ * buffer 2 when the request is flagged MDS_OPEN_HAS_EA and in buffer 1
+ * otherwise.
+ */
+static void mdc_replay_open(struct ptlrpc_request *req)
+{
+        struct mds_body *reply = lustre_msg_buf(req->rq_repmsg, 0);
+        struct replay_open_data *stash;
+        int stash_idx;
+
+        stash_idx = (lustre_msg_get_op_flags(req->rq_reqmsg) &
+                     MDS_OPEN_HAS_EA) ? 2 : 1;
+        stash = lustre_msg_buf(req->rq_reqmsg, stash_idx);
+
+        mds_unpack_body(reply);
+        CDEBUG(D_HA, "updating from "LPD64"/"LPD64" to "LPD64"/"LPD64"\n",
+               stash->fh->addr, stash->fh->cookie,
+               reply->handle.addr, reply->handle.cookie);
+        memcpy(stash->fh, &reply->handle, sizeof(reply->handle));
+}
+/*
+ * Send an MDS_OPEN request for inode ino.
+ *
+ * Request buffers: 0 is the mds_body (fid, flags and the current contents
+ * of *fh); when lsm is given, buffer 1 carries the packed stripe metadata
+ * and the request is flagged MDS_OPEN_HAS_EA; the last buffer (index 1, or
+ * 2 with lsm) holds a struct replay_open_data.
+ *
+ * The request is marked PTL_RPC_FL_REPLAY and gets mdc_replay_open() as its
+ * replay callback.  When the reply checks out, the handle from the reply
+ * body is copied into *fh.
+ *
+ * request  out: the request, or NULL if it could not be prepared.  It is
+ *          not released here.
+ *
+ * Returns -ENOMEM if the request cannot be prepared, otherwise the value
+ * produced by ptlrpc_check_status().
+ *
+ * NOTE(review): the address of fh is stored in the request for a later
+ * replay, so fh presumably has to stay valid for as long as the request
+ * can be replayed -- confirm against the callers.
+ */
+int mdc_open(struct lustre_handle *conn, obd_id ino, int type, int flags,
+             struct lov_stripe_md *lsm, struct lustre_handle *fh,
+             struct ptlrpc_request **request)
+{
+        struct mds_body *body;
+        struct replay_open_data *replay_data;
+        int rc, size[3] = {sizeof(*body), sizeof(*replay_data)}, bufcount = 2;
+        struct ptlrpc_request *req;
+        ENTRY;
+
+        if (lsm) {
+                bufcount = 3;
+                size[2] = size[1]; /* shuffle the spare data along */
+
+                size[1] = lsm->lsm_mds_easize;
+        }
+
+        req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_OPEN, bufcount, size,
+                              NULL);
         if (!req)
                 GOTO(out, rc = -ENOMEM);
 
+        if (lsm)
+                lustre_msg_set_op_flags(req->rq_reqmsg, MDS_OPEN_HAS_EA);
+
+
+        req->rq_flags |= PTL_RPC_FL_REPLAY;
         body = lustre_msg_buf(req->rq_reqmsg, 0);
+
         ll_ino2fid(&body->fid1, ino, 0, type);
         body->flags = HTON__u32(flags);
+        memcpy(&body->handle, fh, sizeof(body->handle));
 
-        req->rq_replen = lustre_msg_size(1, &size);
+        if (lsm)
+                lov_packmd(lustre_msg_buf(req->rq_reqmsg, 1), lsm);
 
-        rc = ptlrpc_queue_wait(cl, req);
-        rc = ptlrpc_check_status(req, rc);
+        req->rq_replen = lustre_msg_size(1, size); /* reply: one mds_body (size[0]) */
 
+        rc = ptlrpc_queue_wait(req);
+        rc = ptlrpc_check_status(req, rc);
         if (!rc) {
-                mds_unpack_body(req);
                 body = lustre_msg_buf(req->rq_repmsg, 0);
-                *fh = body->objid;
+                mds_unpack_body(body);
+                memcpy(fh, &body->handle, sizeof(*fh));
         }
 
+        /* If open is replayed, we need to fix up the fh. */
+        req->rq_replay_cb = mdc_replay_open;
+        replay_data = lustre_msg_buf(req->rq_reqmsg, lsm ? 2 : 1);
+        replay_data->fh = fh; /* the pointer is kept, not a copy of the handle */
+        
         EXIT;
  out:
         *request = req;
         return rc;
 }
 
-int mdc_close(struct ptlrpc_client *cl, struct lustre_peer *peer, ino_t ino,
-              int type, __u64 fh, struct ptlrpc_request **request)
+int mdc_close(struct lustre_handle *conn, obd_id ino, int type,
+              struct lustre_handle *fh, struct ptlrpc_request **request)
 {
         struct mds_body *body;
         int rc, size = sizeof(*body);
         struct ptlrpc_request *req;
 
-        req = ptlrpc_prep_req(cl, peer, MDS_CLOSE, 1, &size, NULL);
+        req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_CLOSE, 1, &size,
+                              NULL);
         if (!req)
                 GOTO(out, rc = -ENOMEM);
 
         body = lustre_msg_buf(req->rq_reqmsg, 0);
         ll_ino2fid(&body->fid1, ino, 0, type);
-        body->objid = fh;
+        memcpy(&body->handle, fh, sizeof(body->handle));
 
-        req->rq_replen = lustre_msg_size(1, &size);
+        req->rq_replen = lustre_msg_size(0, NULL);
 
-        rc = ptlrpc_queue_wait(cl, req);
+        rc = ptlrpc_queue_wait(req);
         rc = ptlrpc_check_status(req, rc);
 
         EXIT;
@@ -158,41 +692,39 @@ int mdc_close(struct ptlrpc_client *cl, struct lustre_peer *peer, ino_t ino,
         return rc;
 }
 
-int mdc_readpage(struct ptlrpc_client *cl, struct lustre_peer *peer, ino_t ino,
-                 int type, __u64 offset, char *addr,
-                 struct ptlrpc_request **request)
+int mdc_readpage(struct lustre_handle *conn, obd_id ino, int type, __u64 offset,
+                 char *addr, struct ptlrpc_request **request)
 {
+        struct ptlrpc_connection *connection = 
+                client_conn2cli(conn)->cl_import.imp_connection;
         struct ptlrpc_request *req = NULL;
-        struct ptlrpc_bulk_desc *bulk = NULL;
-        struct niobuf niobuf;
+        struct ptlrpc_bulk_desc *desc = NULL;
+        struct ptlrpc_bulk_page *bulk = NULL;
         struct mds_body *body;
-        int rc, size[2] = {sizeof(*body), sizeof(struct niobuf)};
-        char *bufs[2] = {NULL, (char *)&niobuf};
-
-        niobuf.addr = (__u64) (long) addr;
+        int rc, size = sizeof(*body);
+        ENTRY;
 
-        CDEBUG(D_INODE, "inode: %ld\n", ino);
+        CDEBUG(D_INODE, "inode: %ld\n", (long)ino);
 
-        bulk = ptlrpc_prep_bulk(peer);
-        if (bulk == NULL) {
-                CERROR("%s: cannot init bulk desc\n", __FUNCTION__);
-                rc = -ENOMEM;
-                goto out;
-        }
+        desc = ptlrpc_prep_bulk(connection);
+        if (desc == NULL)
+                GOTO(out, rc = -ENOMEM);
 
-        req = ptlrpc_prep_req(cl, peer, MDS_READPAGE, 2, size, bufs);
+        req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_READPAGE, 1, &size,
+                              NULL);
         if (!req)
-                GOTO(out, rc = -ENOMEM);
+                GOTO(out2, rc = -ENOMEM);
 
-        bulk->b_buflen = PAGE_SIZE;
-        bulk->b_buf = (void *)(long)niobuf.addr;
-        bulk->b_portal = MDS_BULK_PORTAL;
-        bulk->b_xid = req->rq_xid;
+        bulk = ptlrpc_prep_bulk_page(desc);
+        bulk->bp_buflen = PAGE_SIZE;
+        bulk->bp_buf = addr;
+        bulk->bp_xid = req->rq_xid;
+        desc->bd_portal = MDS_BULK_PORTAL;
 
-        rc = ptlrpc_register_bulk(bulk);
+        rc = ptlrpc_register_bulk(desc);
         if (rc) {
                 CERROR("couldn't setup bulk sink: error %d.\n", rc);
-                GOTO(out, rc);
+                GOTO(out2, rc);
         }
 
         body = lustre_msg_buf(req->rq_reqmsg, 0);
@@ -200,177 +732,115 @@ int mdc_readpage(struct ptlrpc_client *cl, struct lustre_peer *peer, ino_t ino,
         body->fid1.f_type = type;
         body->size = offset;
 
-        req->rq_replen = lustre_msg_size(1, size);
-
-        rc = ptlrpc_queue_wait(cl, req);
+        req->rq_replen = lustre_msg_size(1, &size);
+        rc = ptlrpc_queue_wait(req);
+        rc = ptlrpc_check_status(req, rc);
         if (rc) {
-                CERROR("error in handling %d\n", rc);
-                ptlrpc_abort_bulk(bulk);
-                GOTO(out, rc);
+                ptlrpc_abort_bulk(desc);
+                GOTO(out2, rc);
+        } else {
+                body = lustre_msg_buf(req->rq_repmsg, 0);
+                mds_unpack_body(body);
         }
 
-        mds_unpack_body(req);
         EXIT;
-
+ out2:
+        ptlrpc_free_bulk(desc);
  out:
         *request = req;
-        if (bulk != NULL)
-                OBD_FREE(bulk, sizeof(*bulk));
         return rc;
 }
 
-static int request_ioctl(struct inode *inode, struct file *file,
-                         unsigned int cmd, unsigned long arg)
+static int mdc_statfs(struct lustre_handle *conn, struct obd_statfs *osfs) /* sends MDS_STATFS over conn, unpacks the reply into *osfs */
 {
-        int err;
-        struct ptlrpc_client cl;
-        struct lustre_peer peer;
-        struct ptlrpc_request *request;
-
+        struct ptlrpc_request *req;
+        int rc, size = sizeof(*osfs); /* size: length of the single reply buffer */
         ENTRY;
 
-        if (MINOR(inode->i_rdev) != REQUEST_MINOR)
-                RETURN(-EINVAL);
-
-        if (_IOC_TYPE(cmd) != IOC_REQUEST_TYPE ||
-            _IOC_NR(cmd) < IOC_REQUEST_MIN_NR  ||
-            _IOC_NR(cmd) > IOC_REQUEST_MAX_NR ) {
-                CDEBUG(D_IOCTL, "invalid ioctl ( type %d, nr %d, size %d )\n",
-                       _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd));
-                RETURN(-EINVAL);
-        }
-
-        ptlrpc_init_client(NULL, MDS_REQUEST_PORTAL, MDC_REPLY_PORTAL, &cl);
-        err = ptlrpc_connect_client("mds", &cl, &peer);
-        if (err) {
-                CERROR("cannot create client\n");
-                RETURN(-EINVAL);
-        }
-
-        switch (cmd) {
-        case IOC_REQUEST_GETATTR: {
-                CERROR("-- getting attr for ino %lu\n", arg);
-                err = mdc_getattr(&cl, &peer, arg, S_IFDIR, ~0, &request);
-                CERROR("-- done err %d\n", err);
-
-                GOTO(out, err);
-        }
-
-        case IOC_REQUEST_READPAGE: {
-                char *buf;
-                OBD_ALLOC(buf, PAGE_SIZE);
-                if (!buf) {
-                        err = -ENOMEM;
-                        break;
-                }
-                CERROR("-- readpage 0 for ino %lu\n", arg);
-                err = mdc_readpage(&cl, &peer, arg, S_IFDIR, 0, buf, &request);
-                CERROR("-- done err %d\n", err);
-                OBD_FREE(buf, PAGE_SIZE);
-
-                GOTO(out, err);
-        }
-
-        case IOC_REQUEST_SETATTR: {
-                struct inode inode;
-                struct iattr iattr;
-
-                inode.i_ino = arg;
-                inode.i_generation = 0;
-                iattr.ia_mode = 040777;
-                iattr.ia_atime = 0;
-                iattr.ia_valid = ATTR_MODE | ATTR_ATIME;
-
-                err = mdc_setattr(&cl, &peer, &inode, &iattr, &request);
-                CERROR("-- done err %d\n", err);
-
-                GOTO(out, err);
-        }
-
-        case IOC_REQUEST_CREATE: {
-                struct inode inode;
-                struct iattr iattr;
-
-                inode.i_ino = arg;
-                inode.i_generation = 0;
-                iattr.ia_mode = 040777;
-                iattr.ia_atime = 0;
-                iattr.ia_valid = ATTR_MODE | ATTR_ATIME;
-
-                err = mdc_create(&cl, &peer, &inode,
-                                 "foofile", strlen("foofile"),
-                                 NULL, 0, 0100707, 47114711,
-                                 11, 47, 0, &request);
-                CERROR("-- done err %d\n", err);
-
-                GOTO(out, err);
-        }
-
-        case IOC_REQUEST_OPEN: {
-                __u64 fh, ino;
-                copy_from_user(&ino, (__u64 *)arg, sizeof(ino));
-                CERROR("-- opening ino %llu\n", ino);
-                err = mdc_open(&cl, &peer, ino, S_IFDIR, O_RDONLY, &fh,
-                               &request);
-                copy_to_user((__u64 *)arg, &fh, sizeof(fh));
-                CERROR("-- done err %d (fh=%Lu)\n", err, fh);
+        req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_STATFS, 0, NULL, /* count 0: no request buffers are passed */
+                              NULL);
+        if (!req)
+                RETURN(-ENOMEM); /* req is NULL here, so there is nothing to release */
 
-                GOTO(out, err);
-        }
+        req->rq_replen = lustre_msg_size(1, &size); /* reply length computed for one buffer of sizeof(*osfs) */
 
-        case IOC_REQUEST_CLOSE: {
-                CERROR("-- closing ino 2, filehandle %lu\n", arg);
-                err = mdc_close(&cl, &peer, 2, S_IFDIR, arg, &request);
-                CERROR("-- done err %d\n", err);
+        rc = ptlrpc_queue_wait(req);
+        rc = ptlrpc_check_status(req, rc); /* the queue_wait result is fed into the status check */
 
-                GOTO(out, err);
-        }
+        if (rc)
+                GOTO(out, rc); /* error: *osfs is not written, req is still released at out: */
 
-        default:
-                RETURN(-EINVAL);
-        }
+        obd_statfs_unpack(osfs, lustre_msg_buf(req->rq_repmsg, 0)); /* reply buffer 0 -> *osfs */
 
- out:
-        ptlrpc_free_req(request);
+        EXIT;
+out:
+        ptlrpc_req_finished(req); /* reached on both the success and the error path */
 
-        RETURN(err);
+        return rc;
+}
+/* Attach hook for the MDC obd type: calls lprocfs_reg_obd() with the
+ * status_var_nm_1 table and the device itself as the third argument. */
+int mdc_attach(struct obd_device *dev, obd_count len, void *data)
+{
+        return lprocfs_reg_obd(dev, (lprocfs_vars_t *)status_var_nm_1,
+                               (void *)dev);
 }
 
+/* Detach hook for the MDC obd type: calls lprocfs_dereg_obd() on the
+ * device and returns that call's result unchanged. */
+int mdc_detach(struct obd_device *dev)
+{
+        return lprocfs_dereg_obd(dev);
 
-static struct file_operations requestdev_fops = {
-        ioctl: request_ioctl,
-};
-
-static struct miscdevice request_dev = {
-        REQUEST_MINOR,
-        "request",
-        &requestdev_fops
+}
+struct obd_ops mdc_obd_ops = {  /* method table passed to class_register_type() */
+        o_attach:     mdc_attach,
+        o_detach:     mdc_detach,
+        o_setup:      client_obd_setup,
+        o_cleanup:    client_obd_cleanup,
+        o_connect:    client_obd_connect,
+        o_disconnect: client_obd_disconnect,
+        o_statfs:     mdc_statfs,
 };
 
 static int __init ptlrpc_request_init(void)
 {
-        misc_register(&request_dev);
+        int rc;
+        rc = class_register_type(&mdc_obd_ops, 
+                                 (lprocfs_vars_t*)status_class_var, 
+                                 LUSTRE_MDC_NAME);
+        if(rc)
+                RETURN(rc);
         return 0;
+        
 }
 
 static void __exit ptlrpc_request_exit(void)
 {
-        misc_deregister(&request_dev);
+        
+        class_unregister_type(LUSTRE_MDC_NAME);
+        
 }
 
-MODULE_AUTHOR("Peter J. Braam <braam@clusterfs.com>");
-MODULE_DESCRIPTION("Lustre MDS Request Tester v1.0");
+MODULE_AUTHOR("Cluster File Systems <info@clusterfs.com>");
+MODULE_DESCRIPTION("Lustre Metadata Client v1.0");
 MODULE_LICENSE("GPL");
 
+EXPORT_SYMBOL(mdc_getstatus); /* exported MDC entry points follow */
+EXPORT_SYMBOL(mdc_getlovinfo);
+EXPORT_SYMBOL(mdc_enqueue);
+EXPORT_SYMBOL(mdc_cancel_unused);
+EXPORT_SYMBOL(mdc_getattr); /* moved up from its old position further down the list */
 EXPORT_SYMBOL(mdc_create);
 EXPORT_SYMBOL(mdc_unlink);
 EXPORT_SYMBOL(mdc_rename);
 EXPORT_SYMBOL(mdc_link);
-EXPORT_SYMBOL(mdc_getattr);
 EXPORT_SYMBOL(mdc_readpage);
 EXPORT_SYMBOL(mdc_setattr);
 EXPORT_SYMBOL(mdc_close);
 EXPORT_SYMBOL(mdc_open);
 
+EXPORT_SYMBOL(mdc_store_create_replay_data); /* NOTE(review): definition is outside this excerpt */
+
 module_init(ptlrpc_request_init);
 module_exit(ptlrpc_request_exit);