Whamcloud - gitweb
land minor fixes from b_hd_sec:
[fs/lustre-release.git] / lustre / mdc / mdc_locks.c
index 56f8188..3addfcf 100644 (file)
 #include <linux/obd_class.h>
 #include <linux/lustre_mds.h>
 #include <linux/lustre_dlm.h>
+#include <linux/lustre_sec.h>
 #include <linux/lprocfs_status.h>
+#include <linux/lustre_acl.h>
+#include <linux/lustre_lite.h>
 #include "mdc_internal.h"
 
 int it_disposition(struct lookup_intent *it, int flag)
 {
-        return it->d.lustre.it_disposition & flag;
+        return LUSTRE_IT(it)->it_disposition & flag;
 }
 EXPORT_SYMBOL(it_disposition);
 
 void it_set_disposition(struct lookup_intent *it, int flag)
 {
-        it->d.lustre.it_disposition |= flag;
+        LUSTRE_IT(it)->it_disposition |= flag;
 }
 EXPORT_SYMBOL(it_set_disposition);
 
@@ -88,33 +91,33 @@ int it_open_error(int phase, struct lookup_intent *it)
 {
         if (it_disposition(it, DISP_OPEN_OPEN)) {
                 if (phase == DISP_OPEN_OPEN)
-                        return it->d.lustre.it_status;
+                        return LUSTRE_IT(it)->it_status;
                 else
                         return 0;
         }
 
         if (it_disposition(it, DISP_OPEN_CREATE)) {
                 if (phase == DISP_OPEN_CREATE)
-                        return it->d.lustre.it_status;
+                        return LUSTRE_IT(it)->it_status;
                 else
                         return 0;
         }
 
         if (it_disposition(it, DISP_LOOKUP_EXECD)) {
                 if (phase == DISP_LOOKUP_EXECD)
-                        return it->d.lustre.it_status;
+                        return LUSTRE_IT(it)->it_status;
                 else
                         return 0;
         }
 
         if (it_disposition(it, DISP_IT_EXECD)) {
                 if (phase == DISP_IT_EXECD)
-                        return it->d.lustre.it_status;
+                        return LUSTRE_IT(it)->it_status;
                 else
                         return 0;
         }
-        CERROR("it disp: %X, status: %d\n", it->d.lustre.it_disposition,
-               it->d.lustre.it_status);
+        CERROR("it disp: %X, status: %d\n", LUSTRE_IT(it)->it_disposition,
+               LUSTRE_IT(it)->it_status);
         LBUG();
         return 0;
 }
@@ -135,7 +138,7 @@ int mdc_set_lock_data(struct obd_export *exp, __u64 *l, void *data)
         lock = ldlm_handle2lock(lockh);
 
         LASSERT(lock != NULL);
-        l_lock(&lock->l_resource->lr_namespace->ns_lock);
+        lock_res_and_lock(lock);
 #ifdef __KERNEL__
         if (lock->l_ast_data && lock->l_ast_data != data) {
                 struct inode *new_inode = data;
@@ -149,7 +152,7 @@ int mdc_set_lock_data(struct obd_export *exp, __u64 *l, void *data)
         }
 #endif
         lock->l_ast_data = data;
-        l_unlock(&lock->l_resource->lr_namespace->ns_lock);
+        unlock_res_and_lock(lock);
         LDLM_LOCK_PUT(lock);
 
         EXIT;
@@ -173,6 +176,22 @@ int mdc_change_cbdata(struct obd_export *exp, struct lustre_id *id,
         return 0;
 }
 
+static inline void
+mdc_clear_replay_flag(struct ptlrpc_request *req, int rc)
+{
+        /* Don't hold error requests for replay. */
+        if (req->rq_replay) {
+                unsigned long irqflags;
+                spin_lock_irqsave(&req->rq_lock, irqflags);
+                req->rq_replay = 0;
+                spin_unlock_irqrestore(&req->rq_lock, irqflags);
+        }
+        if (rc && req->rq_transno != 0) {
+                DEBUG_REQ(D_ERROR, req, "transno returned on error rc %d", rc);
+                LBUG();
+        }
+}
+
 /* We always reserve enough space in the reply packet for a stripe MD, because
  * we don't know in advance the file type. */
 int mdc_enqueue(struct obd_export *exp,
@@ -195,16 +214,15 @@ int mdc_enqueue(struct obd_export *exp,
         ldlm_policy_data_t policy = { .l_inodebits = { MDS_INODELOCK_LOOKUP } };
         struct ldlm_intent *lit;
         struct ldlm_request *lockreq;
-        struct ldlm_reply *dlm_rep;
         int reqsize[6] = {[MDS_REQ_SECDESC_OFF] = 0,
                           [MDS_REQ_INTENT_LOCKREQ_OFF] = sizeof(*lockreq),
                           [MDS_REQ_INTENT_IT_OFF] = sizeof(*lit)};
-        int repsize[4] = {sizeof(struct ldlm_reply),
+        int repsize[5] = {sizeof(struct ldlm_reply),
                           sizeof(struct mds_body),
-                          obddev->u.cli.cl_max_mds_easize,
-                          obddev->u.cli.cl_max_mds_cookiesize};
+                          obddev->u.cli.cl_max_mds_easize};
         int req_buffers = 3, reply_buffers = 0;
         int rc, flags = LDLM_FL_HAS_INTENT;
+        struct ldlm_reply *dlm_rep = NULL;
         void *eadata;
         unsigned long irqflags;
         ENTRY;
@@ -212,7 +230,7 @@ int mdc_enqueue(struct obd_export *exp,
 //        LDLM_DEBUG_NOLOCK("mdsintent=%s,name=%s,dir=%lu",
 //                          ldlm_it2str(it->it_op), it_name, it_inode->i_ino);
 
-        reqsize[0] = mdc_get_secdesc_size();
+        reqsize[0] = lustre_secdesc_size();
 
         if (it->it_op & IT_OPEN) {
                 it->it_create_mode |= S_IFREG;
@@ -221,6 +239,7 @@ int mdc_enqueue(struct obd_export *exp,
                 reqsize[req_buffers++] = sizeof(struct mds_rec_create);
                 reqsize[req_buffers++] = data->namelen + 1;
                 reqsize[req_buffers++] = obddev->u.cli.cl_max_mds_easize;
+
                 req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_DLM_VERSION,
                                       LDLM_ENQUEUE, req_buffers, reqsize, NULL);
                 if (!req)
@@ -237,13 +256,18 @@ int mdc_enqueue(struct obd_export *exp,
 
                 /* pack the intended request */
                 mdc_open_pack(req->rq_reqmsg, MDS_REQ_INTENT_REC_OFF, data,
-                              it->it_create_mode, 0, it->it_flags,
-                              lmm, lmmsize);
+                              it->it_create_mode, 0, it->it_flags, lmm, lmmsize);
                 /* get ready for the reply */
-                reply_buffers = 3;
-                req->rq_replen = lustre_msg_size(3, repsize);
+                repsize[3] = 4;
+                repsize[4] = xattr_acl_size(LL_ACL_MAX_ENTRIES);
+                reply_buffers = 5;
+                req->rq_replen = lustre_msg_size(5, repsize);
         } else if (it->it_op & (IT_GETATTR | IT_LOOKUP | IT_CHDIR)) {
-                int valid = data->valid | OBD_MD_FLNOTOBD | OBD_MD_FLEASIZE;
+                __u64 valid = data->valid | OBD_MD_FLNOTOBD | OBD_MD_FLEASIZE |
+                              OBD_MD_FLACL;
+
+                /* xattr retrieval is not expected to reach this path */
+                LASSERT(!(valid & (OBD_MD_FLXATTR | OBD_MD_FLXATTRLIST)));
 
                 reqsize[req_buffers++] = sizeof(struct mds_body);
                 reqsize[req_buffers++] = data->namelen + 1;
@@ -267,8 +291,10 @@ int mdc_enqueue(struct obd_export *exp,
                                  valid, it->it_flags, data);
                 
                 /* get ready for the reply */
-                reply_buffers = 3;
-                req->rq_replen = lustre_msg_size(3, repsize);
+                repsize[3] = 4;
+                repsize[4] = xattr_acl_size(LL_ACL_MAX_ENTRIES);
+                reply_buffers = 5;
+                req->rq_replen = lustre_msg_size(5, repsize);
         } else if (it->it_op == IT_READDIR) {
                 policy.l_inodebits.bits = MDS_INODELOCK_UPDATE;
                 req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_DLM_VERSION,
@@ -304,7 +330,7 @@ int mdc_enqueue(struct obd_export *exp,
                 RETURN(-EINVAL);
         }
 
-        mdc_pack_secdesc(req, reqsize[0]);
+        lustre_pack_secdesc(req, reqsize[0]);
 
         mdc_get_rpc_lock(obddev->u.cli.cl_rpc_lock, it);
         rc = ldlm_cli_enqueue(exp, req, obddev->obd_namespace, res_id,
@@ -324,13 +350,21 @@ int mdc_enqueue(struct obd_export *exp,
 
         /* This can go when we're sure that this can never happen */
         LASSERT(rc != -ENOENT);
+        /* dlm_rep must be assigned this early so that the mode of the
+           returned lock can be checked from the reply itself, avoiding a
+           possible race with lock conversion */
+        if (rc == ELDLM_LOCK_ABORTED || !rc) {
+                dlm_rep = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*dlm_rep));
+                LASSERT(dlm_rep != NULL);   /* checked by ldlm_cli_enqueue() */
+        }
         if (rc == ELDLM_LOCK_ABORTED) {
                 lock_mode = 0;
                 memset(lockh, 0, sizeof(*lockh));
                 rc = 0;
         } else if (rc != 0) {
                 CERROR("ldlm_cli_enqueue: %d\n", rc);
-                LASSERT (rc < 0);
+                LASSERTF(rc < 0, "rc = %d\n", rc);
+                mdc_clear_replay_flag(req, rc);
                 ptlrpc_req_finished(req);
                 RETURN(rc);
         } else { /* rc = 0 */
@@ -339,43 +373,37 @@ int mdc_enqueue(struct obd_export *exp,
 
                 /* If the server gave us back a different lock mode, we should
                  * fix up our variables. */
-                if (lock->l_req_mode != lock_mode) {
-                        ldlm_lock_addref(lockh, lock->l_req_mode);
+                if (dlm_rep->lock_desc.l_req_mode != lock_mode) {
+                        ldlm_lock_addref(lockh, dlm_rep->lock_desc.l_req_mode);
                         ldlm_lock_decref(lockh, lock_mode);
-                        lock_mode = lock->l_req_mode;
+                        lock_mode = dlm_rep->lock_desc.l_req_mode;
                 }
 
                 ldlm_lock_allow_match(lock);
                 LDLM_LOCK_PUT(lock);
         }
 
-        dlm_rep = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*dlm_rep));
-        LASSERT(dlm_rep != NULL);           /* checked by ldlm_cli_enqueue() */
         LASSERT_REPSWABBED(req, 0);         /* swabbed by ldlm_cli_enqueue() */
 
-        it->d.lustre.it_disposition = (int) dlm_rep->lock_policy_res1;
-        it->d.lustre.it_status = (int) dlm_rep->lock_policy_res2;
-        it->d.lustre.it_lock_mode = lock_mode;
-        it->d.lustre.it_data = req;
+        LUSTRE_IT(it)->it_disposition = (int) dlm_rep->lock_policy_res1;
+        LUSTRE_IT(it)->it_status = (int) dlm_rep->lock_policy_res2;
+        LUSTRE_IT(it)->it_lock_mode = lock_mode;
+        LUSTRE_IT(it)->it_data = req;
 
-        if (it->d.lustre.it_status < 0 && req->rq_replay) {
-                LASSERT(req->rq_transno == 0);
-                /* Don't hold error requests for replay. */
-                spin_lock(&req->rq_lock);
-                req->rq_replay = 0;
-                spin_unlock(&req->rq_lock);
-        }
+        if (LUSTRE_IT(it)->it_status < 0 && req->rq_replay)
+                mdc_clear_replay_flag(req, LUSTRE_IT(it)->it_status);
 
-        DEBUG_REQ(D_RPCTRACE, req, "disposition: %x, status: %d",
-                  it->d.lustre.it_disposition, it->d.lustre.it_status);
+        DEBUG_REQ(D_DLMTRACE, req, "disposition: %x, status: %d",
+                  LUSTRE_IT(it)->it_disposition, LUSTRE_IT(it)->it_status);
 
         /* We know what to expect, so we do any byte flipping required here */
-        LASSERT(reply_buffers == 4 || reply_buffers == 3 || reply_buffers == 1);
+        LASSERT(reply_buffers == 5 || reply_buffers == 4 || 
+                reply_buffers == 3 || reply_buffers == 1);
         if (reply_buffers >= 3) {
                 struct mds_body *body;
 
                 body = lustre_swab_repbuf(req, 1, sizeof (*body),
-                                           lustre_swab_mds_body);
+                                          lustre_swab_mds_body);
                 if (body == NULL) {
                         CERROR ("Can't swab mds_body\n");
                         RETURN (-EPROTO);
@@ -427,15 +455,15 @@ EXPORT_SYMBOL(mdc_enqueue);
  * ll_create/ll_open gets called.
  *
  * The server will return to us, in it_disposition, an indication of
- * exactly what d.lustre.it_status refers to.
+ * exactly what d.lustre->it_status refers to.
  *
- * If DISP_OPEN_OPEN is set, then d.lustre.it_status refers to the open() call,
+ * If DISP_OPEN_OPEN is set, then d.lustre->it_status refers to the open() call,
  * otherwise if DISP_OPEN_CREATE is set, then it status is the
  * creation failure mode.  In either case, one of DISP_LOOKUP_NEG or
  * DISP_LOOKUP_POS will be set, indicating whether the child lookup
  * was successful.
  *
- * Else, if DISP_LOOKUP_EXECD then d.lustre.it_status is the rc of the
+ * Else, if DISP_LOOKUP_EXECD then d.lustre->it_status is the rc of the
  * child lookup.
  */
 int mdc_intent_lock(struct obd_export *exp, struct lustre_id *pid, 
@@ -465,30 +493,51 @@ int mdc_intent_lock(struct obd_export *exp, struct lustre_id *pid,
                                                       id_group(cid)}};
                 struct lustre_handle lockh;
                 ldlm_policy_data_t policy;
-                int mode = LCK_PR;
+                int mode;
 
                 /* For the GETATTR case, ll_revalidate_it issues two separate
                    queries - for LOOKUP and for UPDATE lock because it cannot
                    check them together - we might have those two bits to be
                    present in two separate granted locks */
                 policy.l_inodebits.bits = (it->it_op == IT_GETATTR) ?
-                        MDS_INODELOCK_UPDATE: MDS_INODELOCK_LOOKUP;
+                        MDS_INODELOCK_UPDATE : MDS_INODELOCK_LOOKUP;
                 
                 mode = LCK_PR;
                 rc = ldlm_lock_match(exp->exp_obd->obd_namespace,
                                      LDLM_FL_BLOCK_GRANTED, &res_id,
-                                     LDLM_IBITS, &policy, LCK_PR, &lockh);
+                                     LDLM_IBITS, &policy, mode,
+                                     &lockh);
+
+                if (!rc) {
+                        mode = LCK_CR;
+                        rc = ldlm_lock_match(exp->exp_obd->obd_namespace,
+                                             LDLM_FL_BLOCK_GRANTED, &res_id,
+                                             LDLM_IBITS, &policy, mode,
+                                             &lockh);
+                }
                 if (!rc) {
                         mode = LCK_PW;
                         rc = ldlm_lock_match(exp->exp_obd->obd_namespace,
                                              LDLM_FL_BLOCK_GRANTED, &res_id,
-                                             LDLM_IBITS, &policy, LCK_PW,
+                                             LDLM_IBITS, &policy, mode,
+                                             &lockh);
+                }
+                if (!rc) {
+                        mode = LCK_CW;
+                        rc = ldlm_lock_match(exp->exp_obd->obd_namespace,
+                                             LDLM_FL_BLOCK_GRANTED, &res_id,
+                                             LDLM_IBITS, &policy, mode,
                                              &lockh);
                 }
                 if (rc) {
-                        memcpy(&it->d.lustre.it_lock_handle, &lockh,
+                        if (ptlrpcs_check_cred(exp->exp_obd->u.cli.cl_import)) {
+                                /* return immediately if no credential held */
+                                ldlm_lock_decref(&lockh, mode);
+                                RETURN(-EACCES);
+                        }
+                        memcpy(&LUSTRE_IT(it)->it_lock_handle, &lockh,
                                sizeof(lockh));
-                        it->d.lustre.it_lock_mode = mode;
+                        LUSTRE_IT(it)->it_lock_mode = mode;
                 }
 
                 /* Only return failure if it was not GETATTR by cid (from
@@ -506,40 +555,27 @@ int mdc_intent_lock(struct obd_export *exp, struct lustre_id *pid,
          * this and use the request from revalidate.  In this case, revalidate
          * never dropped its reference, so the refcounts are all OK */
         if (!it_disposition(it, DISP_ENQ_COMPLETE)) {
-                struct mdc_op_data op_data;
-
-                mdc_id2mdc_data(&op_data, pid, cid, name, len, 0);
-
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-                /* 
-                 * This is optimization. Now fid will not be obtained from
-                 * server if client inode already exists. This flag is set in
-                 * ll_revalidate_it() if it finds that passed dentry contains
-                 * inode.
-                 */
-                if (!(it->d.lustre.it_int_flags && LL_IT_EXIST)) {
-#endif
-                        /* 
-                         * if we get inode by name (ll_lookup_it() case), we
-                         * always should ask for fid, as we will not be able to
-                         * take locks, revalidate dentry, etc. later with
-                         * invalid fid in inode.
-                         */
-                        if (cid == NULL && name != NULL)
-                                op_data.valid |= OBD_MD_FID;
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-                }
-#endif
+                struct mdc_op_data *op_data;
+
+                OBD_ALLOC(op_data, sizeof(*op_data));
+                if (op_data == NULL)
+                        RETURN(-ENOMEM);
+
+                mdc_id2mdc_data(op_data, pid, cid, name, len, 0);
+
+                if (name != NULL)
+                        op_data->valid |= OBD_MD_FID;
 
                 rc = mdc_enqueue(exp, LDLM_IBITS, it, it_to_lock_mode(it),
-                                 &op_data, &lockh, lmm, lmmsize,
+                                 op_data, &lockh, lmm, lmmsize,
                                  ldlm_completion_ast, cb_blocking, NULL);
+                OBD_FREE(op_data, sizeof(*op_data));
                 if (rc < 0)
                         RETURN(rc);
                 
-                memcpy(&it->d.lustre.it_lock_handle, &lockh, sizeof(lockh));
+                memcpy(&LUSTRE_IT(it)->it_lock_handle, &lockh, sizeof(lockh));
         }
-        request = *reqp = it->d.lustre.it_data;
+        request = *reqp = LUSTRE_IT(it)->it_data;
         LASSERT(request != NULL);
         
         /* If we're doing an IT_OPEN which did not result in an actual
@@ -549,21 +585,15 @@ int mdc_intent_lock(struct obd_export *exp, struct lustre_id *pid,
          * It's important that we do this first!  Otherwise we might exit the
          * function without doing so, and try to replay a failed create (bug
          * 3440) */
-        if (it->it_op & IT_OPEN) {
-                if (!it_disposition(it, DISP_OPEN_OPEN) ||
-                    it->d.lustre.it_status != 0) {
-                        unsigned long irqflags;
-
-                        spin_lock_irqsave(&request->rq_lock, irqflags);
-                        request->rq_replay = 0;
-                        spin_unlock_irqrestore(&request->rq_lock, irqflags);
-                }
-        }
+        if (it->it_op & IT_OPEN && request->rq_replay &&
+            (!it_disposition(it, DISP_OPEN_OPEN) || LUSTRE_IT(it)->it_status != 0))
+                mdc_clear_replay_flag(request, LUSTRE_IT(it)->it_status);
         if (!it_disposition(it, DISP_IT_EXECD)) {
                 /* The server failed before it even started executing the
                  * intent, i.e. because it couldn't unpack the request. */
-                LASSERT(it->d.lustre.it_status != 0);
-                RETURN(it->d.lustre.it_status);
+                LASSERT(LUSTRE_IT(it)->it_status != 0);
+                RETURN(LUSTRE_IT(it)->it_status);
         }
         rc = it_open_error(DISP_IT_EXECD, it);
         if (rc)
@@ -581,7 +611,7 @@ int mdc_intent_lock(struct obd_export *exp, struct lustre_id *pid,
                 
                 /* we have to compare all the fields but type, because MDS can
                  * return fid/mds/ino/gen if inode lives on another MDS -bzzz */
-                if (!id_equal(cid, &mds_body->id1))
+                if (!(lookup_flags & LOOKUP_COBD) && !id_equal(cid, &mds_body->id1))
                         RETURN(-ESTALE);
         }
 
@@ -633,15 +663,15 @@ int mdc_intent_lock(struct obd_export *exp, struct lustre_id *pid,
                 if (ldlm_lock_match(NULL, LDLM_FL_BLOCK_GRANTED, NULL,
                                     LDLM_IBITS, &policy, LCK_NL, &old_lock)) {
                         ldlm_lock_decref_and_cancel(&lockh,
-                                                    it->d.lustre.it_lock_mode);
+                                                    LUSTRE_IT(it)->it_lock_mode);
                         memcpy(&lockh, &old_lock, sizeof(old_lock));
-                        memcpy(&it->d.lustre.it_lock_handle, &lockh,
+                        memcpy(&LUSTRE_IT(it)->it_lock_handle, &lockh,
                                sizeof(lockh));
                 }
         }
         CDEBUG(D_DENTRY, "D_IT dentry %*s intent: %s status %d disp %x rc %d\n",
-               len, name, ldlm_it2str(it->it_op), it->d.lustre.it_status,
-               it->d.lustre.it_disposition, rc);
+               len, name, ldlm_it2str(it->it_op), LUSTRE_IT(it)->it_status,
+               LUSTRE_IT(it)->it_disposition, rc);
 
         RETURN(rc);
 }