Whamcloud - gitweb
LU-11427 llite: optimize read on open pages
[fs/lustre-release.git] / lustre / llite / namei.c
index 5fc7c29..8abf542 100644 (file)
@@ -195,8 +195,12 @@ int ll_dom_lock_cancel(struct inode *inode, struct ldlm_lock *lock)
 
        ENTRY;
 
-       if (!lli->lli_clob)
+       if (!lli->lli_clob) {
+               /* due to DoM read on open, there may exist pages for Lustre
+                * regular file even though cl_object is not set up yet. */
+               truncate_inode_pages(inode->i_mapping, 0);
                RETURN(0);
+       }
 
        env = cl_env_get(&refcheck);
        if (IS_ERR(env))
@@ -223,179 +227,259 @@ int ll_dom_lock_cancel(struct inode *inode, struct ldlm_lock *lock)
        RETURN(rc);
 }
 
-int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
-                      void *data, int flag)
+void ll_lock_cancel_bits(struct ldlm_lock *lock, __u64 to_cancel)
 {
-       struct lustre_handle lockh;
+       struct inode *inode = ll_inode_from_resource_lock(lock);
+       __u64 bits = to_cancel;
        int rc;
-       ENTRY;
 
-       switch (flag) {
-       case LDLM_CB_BLOCKING:
-               ldlm_lock2handle(lock, &lockh);
-               rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
-               if (rc < 0) {
-                       CDEBUG(D_INODE, "ldlm_cli_cancel: rc = %d\n", rc);
-                       RETURN(rc);
-               }
-               break;
-       case LDLM_CB_CANCELING: {
-               struct inode *inode = ll_inode_from_resource_lock(lock);
-               __u64 bits = lock->l_policy_data.l_inodebits.bits;
+       if (inode == NULL)
+               return;
 
-               if (inode == NULL)
-                       break;
+       if (!fid_res_name_eq(ll_inode2fid(inode),
+                            &lock->l_resource->lr_name)) {
+               LDLM_ERROR(lock, "data mismatch with object "DFID"(%p)",
+                          PFID(ll_inode2fid(inode)), inode);
+               LBUG();
+       }
 
-               /* Invalidate all dentries associated with this inode */
-               LASSERT(ldlm_is_canceling(lock));
+       if (bits & MDS_INODELOCK_XATTR) {
+               if (S_ISDIR(inode->i_mode))
+                       ll_i2info(inode)->lli_def_stripe_offset = -1;
+               ll_xattr_cache_destroy(inode);
+               bits &= ~MDS_INODELOCK_XATTR;
+       }
 
-               if (!fid_res_name_eq(ll_inode2fid(inode),
-                                    &lock->l_resource->lr_name)) {
-                       LDLM_ERROR(lock, "data mismatch with object "DFID"(%p)",
-                                  PFID(ll_inode2fid(inode)), inode);
+       /* For OPEN locks we differentiate between lock modes
+        * LCK_CR, LCK_CW, LCK_PR - bug 22891 */
+       if (bits & MDS_INODELOCK_OPEN)
+               ll_have_md_lock(inode, &bits, lock->l_req_mode);
+
+       if (bits & MDS_INODELOCK_OPEN) {
+               fmode_t fmode;
+
+               switch (lock->l_req_mode) {
+               case LCK_CW:
+                       fmode = FMODE_WRITE;
+                       break;
+               case LCK_PR:
+                       fmode = FMODE_EXEC;
+                       break;
+               case LCK_CR:
+                       fmode = FMODE_READ;
+                       break;
+               default:
+                       LDLM_ERROR(lock, "bad lock mode for OPEN lock");
                        LBUG();
                }
 
-               if (bits & MDS_INODELOCK_XATTR) {
-                       if (S_ISDIR(inode->i_mode))
-                               ll_i2info(inode)->lli_def_stripe_offset = -1;
-                       ll_xattr_cache_destroy(inode);
-                       bits &= ~MDS_INODELOCK_XATTR;
-               }
+               ll_md_real_close(inode, fmode);
 
-               /* For OPEN locks we differentiate between lock modes
-                * LCK_CR, LCK_CW, LCK_PR - bug 22891 */
-               if (bits & MDS_INODELOCK_OPEN)
-                       ll_have_md_lock(inode, &bits, lock->l_req_mode);
-
-               if (bits & MDS_INODELOCK_OPEN) {
-                       fmode_t fmode;
-
-                       switch (lock->l_req_mode) {
-                       case LCK_CW:
-                               fmode = FMODE_WRITE;
-                               break;
-                       case LCK_PR:
-                               fmode = FMODE_EXEC;
-                               break;
-                       case LCK_CR:
-                               fmode = FMODE_READ;
-                               break;
-                       default:
-                               LDLM_ERROR(lock, "bad lock mode for OPEN lock");
-                               LBUG();
-                       }
+               bits &= ~MDS_INODELOCK_OPEN;
+       }
 
-                       ll_md_real_close(inode, fmode);
+       if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE |
+                   MDS_INODELOCK_LAYOUT | MDS_INODELOCK_PERM |
+                   MDS_INODELOCK_DOM))
+               ll_have_md_lock(inode, &bits, LCK_MINMODE);
 
-                       bits &= ~MDS_INODELOCK_OPEN;
-               }
+       if (bits & MDS_INODELOCK_DOM) {
+               rc =  ll_dom_lock_cancel(inode, lock);
+               if (rc < 0)
+                       CDEBUG(D_INODE, "cannot flush DoM data "
+                              DFID": rc = %d\n",
+                              PFID(ll_inode2fid(inode)), rc);
+               lock_res_and_lock(lock);
+               ldlm_set_kms_ignore(lock);
+               unlock_res_and_lock(lock);
+       }
 
-               if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE |
-                           MDS_INODELOCK_LAYOUT | MDS_INODELOCK_PERM |
-                           MDS_INODELOCK_DOM))
-                       ll_have_md_lock(inode, &bits, LCK_MINMODE);
-
-               if (bits & MDS_INODELOCK_DOM) {
-                       rc =  ll_dom_lock_cancel(inode, lock);
-                       if (rc < 0)
-                               CDEBUG(D_INODE, "cannot flush DoM data "
-                                      DFID": rc = %d\n",
-                                      PFID(ll_inode2fid(inode)), rc);
-                       lock_res_and_lock(lock);
-                       ldlm_set_kms_ignore(lock);
-                       unlock_res_and_lock(lock);
-                       bits &= ~MDS_INODELOCK_DOM;
-               }
+       if (bits & MDS_INODELOCK_LAYOUT) {
+               struct cl_object_conf conf = {
+                       .coc_opc = OBJECT_CONF_INVALIDATE,
+                       .coc_inode = inode,
+               };
 
-               if (bits & MDS_INODELOCK_LAYOUT) {
-                       struct cl_object_conf conf = {
-                               .coc_opc = OBJECT_CONF_INVALIDATE,
-                               .coc_inode = inode,
-                       };
-
-                       rc = ll_layout_conf(inode, &conf);
-                       if (rc < 0)
-                               CDEBUG(D_INODE, "cannot invalidate layout of "
-                                      DFID": rc = %d\n",
-                                      PFID(ll_inode2fid(inode)), rc);
-               }
+               rc = ll_layout_conf(inode, &conf);
+               if (rc < 0)
+                       CDEBUG(D_INODE, "cannot invalidate layout of "
+                              DFID": rc = %d\n",
+                              PFID(ll_inode2fid(inode)), rc);
+       }
 
-               if (bits & MDS_INODELOCK_UPDATE) {
-                       struct ll_inode_info *lli = ll_i2info(inode);
-                       lli->lli_update_atime = 1;
-               }
+       if (bits & MDS_INODELOCK_UPDATE) {
+               struct ll_inode_info *lli = ll_i2info(inode);
 
-               if ((bits & MDS_INODELOCK_UPDATE) && S_ISDIR(inode->i_mode)) {
-                       struct ll_inode_info *lli = ll_i2info(inode);
+               lli->lli_update_atime = 1;
+       }
+
+       if ((bits & MDS_INODELOCK_UPDATE) && S_ISDIR(inode->i_mode)) {
+               struct ll_inode_info *lli = ll_i2info(inode);
 
-                       CDEBUG(D_INODE, "invalidating inode "DFID" lli = %p, "
-                              "pfid  = "DFID"\n", PFID(ll_inode2fid(inode)),
-                              lli, PFID(&lli->lli_pfid));
-                       truncate_inode_pages(inode->i_mapping, 0);
+               CDEBUG(D_INODE, "invalidating inode "DFID" lli = %p, "
+                      "pfid  = "DFID"\n", PFID(ll_inode2fid(inode)),
+                      lli, PFID(&lli->lli_pfid));
+               truncate_inode_pages(inode->i_mapping, 0);
 
-                       if (unlikely(!fid_is_zero(&lli->lli_pfid))) {
-                               struct inode *master_inode = NULL;
-                               unsigned long hash;
+               if (unlikely(!fid_is_zero(&lli->lli_pfid))) {
+                       struct inode *master_inode = NULL;
+                       unsigned long hash;
 
-                               /* This is slave inode, since all of the child
-                                * dentry is connected on the master inode, so
-                                * we have to invalidate the negative children
-                                * on master inode */
-                               CDEBUG(D_INODE, "Invalidate s"DFID" m"DFID"\n",
-                                      PFID(ll_inode2fid(inode)),
-                                      PFID(&lli->lli_pfid));
+                       /* This is a slave inode. All child dentries are
+                        * connected to the master inode, so the negative
+                        * children must be invalidated on the master inode. */
+                       CDEBUG(D_INODE, "Invalidate s"DFID" m"DFID"\n",
+                              PFID(ll_inode2fid(inode)), PFID(&lli->lli_pfid));
 
-                               hash = cl_fid_build_ino(&lli->lli_pfid,
+                       hash = cl_fid_build_ino(&lli->lli_pfid,
                                        ll_need_32bit_api(ll_i2sbi(inode)));
 
-                               /* Do not lookup the inode with ilookup5,
-                                * otherwise it will cause dead lock,
-                                *
-                                * 1. Client1 send chmod req to the MDT0, then
-                                * on MDT0, it enqueues master and all of its
-                                * slaves lock, (mdt_attr_set() ->
-                                * mdt_lock_slaves()), after gets master and
-                                * stripe0 lock, it will send the enqueue req
-                                * (for stripe1) to MDT1, then MDT1 finds the
-                                * lock has been granted to client2. Then MDT1
-                                * sends blocking ast to client2.
-                                *
-                                * 2. At the same time, client2 tries to unlink
-                                * the striped dir (rm -rf striped_dir), and
-                                * during lookup, it will hold the master inode
-                                * of the striped directory, whose inode state
-                                * is NEW, then tries to revalidate all of its
-                                * slaves, (ll_prep_inode()->ll_iget()->
-                                * ll_read_inode2()-> ll_update_inode().). And
-                                * it will be blocked on the server side because
-                                * of 1.
-                                *
-                                * 3. Then the client get the blocking_ast req,
-                                * cancel the lock, but being blocked if using
-                                * ->ilookup5()), because master inode state is
-                                *  NEW. */
-                               master_inode = ilookup5_nowait(inode->i_sb,
-                                                   hash, ll_test_inode_by_fid,
+                       /* Do not look up the inode with ilookup5(), otherwise
+                        * it will cause a deadlock:
+                        * 1. Client1 sends a chmod request to MDT0. MDT0
+                        * enqueues the locks of the master and all of its
+                        * slaves (mdt_attr_set() -> mdt_lock_slaves()). After
+                        * getting the master and stripe0 locks, it sends the
+                        * enqueue request (for stripe1) to MDT1, which finds
+                        * that the lock has been granted to client2, so MDT1
+                        * sends a blocking AST to client2.
+                        * 2. At the same time, client2 tries to unlink
+                        * the striped dir (rm -rf striped_dir). During
+                        * lookup it holds the master inode of the striped
+                        * directory, whose inode state is NEW, and then tries
+                        * to revalidate all of its slaves (ll_prep_inode()->
+                        * ll_iget()->ll_read_inode2()->ll_update_inode()).
+                        * That is blocked on the server side because
+                        * of 1.
+                        * 3. The client then gets the blocking_ast request and
+                        * cancels the lock, but it would be blocked if it used
+                        * ilookup5(), because the master inode state is NEW. */
+                       master_inode = ilookup5_nowait(inode->i_sb, hash,
+                                                       ll_test_inode_by_fid,
                                                        (void *)&lli->lli_pfid);
-                               if (master_inode) {
-                                       ll_invalidate_negative_children(
-                                                               master_inode);
-                                       iput(master_inode);
-                               }
-                       } else {
-                               ll_invalidate_negative_children(inode);
+                       if (master_inode) {
+                               ll_invalidate_negative_children(master_inode);
+                               iput(master_inode);
                        }
+               } else {
+                       ll_invalidate_negative_children(inode);
                }
+       }
 
-               if ((bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM)) &&
-                   inode->i_sb->s_root != NULL &&
-                   inode != inode->i_sb->s_root->d_inode)
-                       ll_invalidate_aliases(inode);
+       if ((bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM)) &&
+           inode->i_sb->s_root != NULL &&
+           inode != inode->i_sb->s_root->d_inode)
+               ll_invalidate_aliases(inode);
 
-               iput(inode);
+       iput(inode);
+}
+
+/* Decide whether a blocking callback on @lock should be answered with a
+ * lock convert (downgrade) instead of a full cancel.
+ *
+ * Returns 0 (do not convert) when any of the following holds:
+ *  - the lock has no l_conn_export, or exp_connect_lock_convert() is false
+ *    for that export;
+ *  - no cancel_bits are requested, no inodebits would be left once the
+ *    cancel_bits are dropped, or ldlm_is_cancel() is true for the lock;
+ *  - MDS_INODELOCK_DOM is neither among the remaining bits nor among the
+ *    cancel_bits;
+ *  - l_req_mode is not one of LCK_PR, LCK_PW, LCK_CW, LCK_CR;
+ *  - the current time is later than l_last_used plus ns_dirty_age_limit
+ *    seconds.
+ * Otherwise ll_have_md_lock() is run over the remaining bits with the
+ * computed mode mask, and 1 is returned if any bit is still set afterwards
+ * (0 if none is).
+ *
+ * NOTE(review): the result of ll_inode_from_resource_lock() is passed to
+ * ll_have_md_lock() and iput() without a NULL check, although
+ * ll_lock_cancel_bits() does check it; confirm ll_have_md_lock() accepts a
+ * NULL inode. */
+int ll_md_need_convert(struct ldlm_lock *lock)
+{
+       struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+       struct inode *inode;
+       __u64 wanted = lock->l_policy_data.l_inodebits.cancel_bits;
+       __u64 bits = lock->l_policy_data.l_inodebits.bits & ~wanted;
+       enum ldlm_mode mode = LCK_MINMODE;
+
+       if (!lock->l_conn_export ||
+           !exp_connect_lock_convert(lock->l_conn_export))
+               return 0;
+
+       if (!wanted || !bits || ldlm_is_cancel(lock))
+               return 0;
+
+       /* do not convert locks other than DOM for now */
+       if (!((bits | wanted) & MDS_INODELOCK_DOM))
+               return 0;
+
+       /* The bits that would remain may already be covered by another lock,
+        * in which case a convert would only leave an extra lock for the
+        * same bits. Build the mode mask used below to check whether the
+        * client has another lock with the same bits and the same or lower
+        * mode; do not convert if it has one.
+        */
+       switch (lock->l_req_mode) {
+       case LCK_PR:
+               mode = LCK_PR; /* fallthrough: LCK_CR is added below */
+       case LCK_PW:
+               mode |= LCK_CR;
+               break;
+       case LCK_CW:
+               mode = LCK_CW; /* fallthrough: LCK_CR is added below */
+       case LCK_CR:
+               mode |= LCK_CR;
+               break;
+       default:
+               /* do not convert other modes */
+               return 0;
+       }
+
+       /* is the lock too old to be converted? l_last_used is read with the
+        * resource and lock held */
+       lock_res_and_lock(lock);
+       if (ktime_after(ktime_get(),
+                       ktime_add(lock->l_last_used,
+                                 ktime_set(ns->ns_dirty_age_limit, 0)))) {
+               unlock_res_and_lock(lock);
+               return 0;
+       }
+       unlock_res_and_lock(lock);
+
+       inode = ll_inode_from_resource_lock(lock);
+       ll_have_md_lock(inode, &bits, mode);
+       iput(inode);
+       return !!(bits);
+}
+
+int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+                      void *data, int flag)
+{
+       struct lustre_handle lockh;
+       __u64 bits = lock->l_policy_data.l_inodebits.bits;
+       int rc;
+
+       ENTRY;
+
+       switch (flag) {
+       case LDLM_CB_BLOCKING:
+       {
+               __u64 cancel_flags = LCF_ASYNC;
+
+               if (ll_md_need_convert(lock)) {
+                       cancel_flags |= LCF_CONVERT;
+                       /* For lock convert some cancel actions may require
+                        * this lock with non-dropped canceled bits, e.g. page
+                        * flush for DOM lock. So call ll_lock_cancel_bits()
+                        * here while canceled bits are still set.
+                        */
+                       bits = lock->l_policy_data.l_inodebits.cancel_bits;
+                       if (bits & MDS_INODELOCK_DOM)
+                               ll_lock_cancel_bits(lock, MDS_INODELOCK_DOM);
+               }
+               ldlm_lock2handle(lock, &lockh);
+               rc = ldlm_cli_cancel(&lockh, cancel_flags);
+               if (rc < 0) {
+                       CDEBUG(D_INODE, "ldlm_cli_cancel: rc = %d\n", rc);
+                       RETURN(rc);
+               }
                break;
        }
+       case LDLM_CB_CANCELING:
+               if (ldlm_is_converting(lock)) {
+                       /* This is called on an already converted lock, so
+                        * ibits holds only the remaining bits and cancel_bits
+                        * holds the bits that were dropped.
+                        * Note that the DOM lock is handled prior to lock
+                        * convert and is excluded here.
+                        */
+                       bits = lock->l_policy_data.l_inodebits.cancel_bits &
+                               ~MDS_INODELOCK_DOM;
+               } else {
+                       LASSERT(ldlm_is_canceling(lock));
+               }
+               ll_lock_cancel_bits(lock, bits);
+               break;
        default:
                LBUG();
        }
@@ -526,20 +610,24 @@ static int ll_lookup_it_finish(struct ptlrpc_request *request,
        CDEBUG(D_DENTRY, "it %p it_disposition %x\n", it,
               it->it_disposition);
        if (!it_disposition(it, DISP_LOOKUP_NEG)) {
-                rc = ll_prep_inode(&inode, request, (*de)->d_sb, it);
-                if (rc)
-                        RETURN(rc);
-
-                ll_set_lock_data(ll_i2sbi(parent)->ll_md_exp, inode, it, &bits);
-
-                /* We used to query real size from OSTs here, but actually
-                   this is not needed. For stat() calls size would be updated
-                   from subsequent do_revalidate()->ll_inode_revalidate_it() in
-                   2.4 and
-                   vfs_getattr_it->ll_getattr()->ll_inode_revalidate_it() in 2.6
-                   Everybody else who needs correct file size would call
-                   ll_glimpse_size or some equivalent themselves anyway.
-                   Also see bug 7198. */
+               rc = ll_prep_inode(&inode, request, (*de)->d_sb, it);
+               if (rc)
+                       RETURN(rc);
+
+               if (it->it_op & IT_OPEN)
+                       ll_dom_finish_open(inode, request, it);
+
+               ll_set_lock_data(ll_i2sbi(parent)->ll_md_exp, inode, it, &bits);
+
+               /* We used to query real size from OSTs here, but actually
+                * this is not needed. For stat() calls size would be updated
+                * from subsequent do_revalidate()->ll_inode_revalidate_it() in
+                * 2.4 and
+                * vfs_getattr_it->ll_getattr()->ll_inode_revalidate_it() in 2.6
+                * Everybody else who needs correct file size would call
+                * ll_glimpse_size or some equivalent themselves anyway.
+                * Also see bug 7198.
+                */
        }
 
        /* Only hash *de if it is unhashed (new dentry).