Whamcloud - gitweb
LU-14868 llite: revert 'simplify callback handling for async getattr'
[fs/lustre-release.git] / lustre / llite / statahead.c
index 1db80ae..775c488 100644 (file)
@@ -54,12 +54,13 @@ typedef enum {
 
 /*
  * sa_entry is not refcounted: statahead thread allocates it and do async stat,
- * and in async stat callback ll_statahead_interpret() will prepare the inode
- * and set lock data in the ptlrpcd context. Then the scanner process will be
- * woken up if this entry is the waiting one, can access and free it.
+ * and in async stat callback ll_statahead_interpret() will add it into
+ * sai_interim_entries. Later the statahead thread calls sa_handle_callback()
+ * to instantiate the entry and move it into sai_entries, after which only
+ * the scanner process can access and free it.
  */
 struct sa_entry {
-       /* link into sai_entries */
+       /* link into sai_interim_entries or sai_entries */
        struct list_head        se_list;
        /* link into sai hash table locally */
        struct list_head        se_hash;
@@ -71,6 +72,10 @@ struct sa_entry {
        se_state_t              se_state;
        /* entry size, contains name */
        int                     se_size;
+       /* pointer to async getattr enqueue info */
+       struct md_enqueue_info *se_minfo;
+       /* pointer to the async getattr request */
+       struct ptlrpc_request  *se_req;
        /* pointer to the target inode */
        struct inode           *se_inode;
        /* entry name */
@@ -142,6 +147,12 @@ static inline int sa_sent_full(struct ll_statahead_info *sai)
        return atomic_read(&sai->sai_cache_count) >= sai->sai_max;
 }
 
+/* return non-zero if async stat replies are queued in sai_interim_entries */
+static inline int sa_has_callback(struct ll_statahead_info *sai)
+{
+       return !list_empty(&sai->sai_interim_entries);
+}
+
 static inline int agl_list_empty(struct ll_statahead_info *sai)
 {
        return list_empty(&sai->sai_agls);
@@ -330,55 +341,55 @@ __sa_make_ready(struct ll_statahead_info *sai, struct sa_entry *entry, int ret)
 }
 
 /* finish async stat RPC arguments */
-static void sa_fini_data(struct md_op_item *item)
+static void sa_fini_data(struct md_enqueue_info *minfo)
 {
-       ll_unlock_md_op_lsm(&item->mop_data);
-       iput(item->mop_dir);
-       OBD_FREE_PTR(item);
+       ll_unlock_md_op_lsm(&minfo->mi_data);
+       iput(minfo->mi_dir);
+       OBD_FREE_PTR(minfo);
 }
 
-static int ll_statahead_interpret(struct req_capsule *pill,
-                                 struct md_op_item *item, int rc);
+static int ll_statahead_interpret(struct ptlrpc_request *req,
+                                 struct md_enqueue_info *minfo, int rc);
 
 /*
  * prepare arguments for async stat RPC.
  */
-static struct md_op_item *
+static struct md_enqueue_info *
 sa_prep_data(struct inode *dir, struct inode *child, struct sa_entry *entry)
 {
-       struct md_op_item *item;
+       struct md_enqueue_info   *minfo;
        struct ldlm_enqueue_info *einfo;
-       struct md_op_data *op_data;
+       struct md_op_data        *op_data;
 
-       OBD_ALLOC_PTR(item);
-       if (!item)
+       OBD_ALLOC_PTR(minfo);
+       if (!minfo)
                return ERR_PTR(-ENOMEM);
 
-       op_data = ll_prep_md_op_data(&item->mop_data, dir, child,
+       op_data = ll_prep_md_op_data(&minfo->mi_data, dir, child,
                                     entry->se_qstr.name, entry->se_qstr.len, 0,
                                     LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data)) {
-               OBD_FREE_PTR(item);
-               return (struct md_op_item *)op_data;
+               OBD_FREE_PTR(minfo);
+               return (struct md_enqueue_info *)op_data;
        }
 
        if (!child)
                op_data->op_fid2 = entry->se_fid;
 
-       item->mop_it.it_op = IT_GETATTR;
-       item->mop_dir = igrab(dir);
-       item->mop_cb = ll_statahead_interpret;
-       item->mop_cbdata = entry;
-
-       einfo = &item->mop_einfo;
-       einfo->ei_type = LDLM_IBITS;
-       einfo->ei_mode = it_to_lock_mode(&item->mop_it);
-       einfo->ei_cb_bl = ll_md_blocking_ast;
-       einfo->ei_cb_cp = ldlm_completion_ast;
-       einfo->ei_cb_gl = NULL;
+       minfo->mi_it.it_op = IT_GETATTR;
+       minfo->mi_dir = igrab(dir);
+       minfo->mi_cb = ll_statahead_interpret;
+       minfo->mi_cbdata = entry;
+
+       einfo = &minfo->mi_einfo;
+       einfo->ei_type   = LDLM_IBITS;
+       einfo->ei_mode   = it_to_lock_mode(&minfo->mi_it);
+       einfo->ei_cb_bl  = ll_md_blocking_ast;
+       einfo->ei_cb_cp  = ldlm_completion_ast;
+       einfo->ei_cb_gl  = NULL;
        einfo->ei_cbdata = NULL;
 
-       return item;
+       return minfo;
 }
 
 /*
@@ -389,8 +400,22 @@ static void
 sa_make_ready(struct ll_statahead_info *sai, struct sa_entry *entry, int ret)
 {
        struct ll_inode_info *lli = ll_i2info(sai->sai_dentry->d_inode);
+       struct md_enqueue_info *minfo = entry->se_minfo;
+       struct ptlrpc_request *req = entry->se_req;
        bool wakeup;
 
+       /* release resources used in RPC */
+       if (minfo) {
+               entry->se_minfo = NULL;
+               ll_intent_release(&minfo->mi_it);
+               sa_fini_data(minfo);
+       }
+
+       if (req) {
+               entry->se_req = NULL;
+               ptlrpc_req_finished(req);
+       }
+
        spin_lock(&lli->lli_sa_lock);
        wakeup = __sa_make_ready(sai, entry, ret);
        spin_unlock(&lli->lli_sa_lock);
@@ -447,6 +472,7 @@ static struct ll_statahead_info *ll_sai_alloc(struct dentry *dentry)
        sai->sai_index = 1;
        init_waitqueue_head(&sai->sai_waitq);
 
+       INIT_LIST_HEAD(&sai->sai_interim_entries);
        INIT_LIST_HEAD(&sai->sai_entries);
        INIT_LIST_HEAD(&sai->sai_agls);
 
@@ -509,6 +535,7 @@ static void ll_sai_put(struct ll_statahead_info *sai)
                LASSERT(!sai->sai_task);
                LASSERT(!sai->sai_agl_task);
                LASSERT(sai->sai_sent == sai->sai_replied);
+               LASSERT(!sa_has_callback(sai));
 
                list_for_each_entry_safe(entry, next, &sai->sai_entries,
                                         se_list)
@@ -604,71 +631,34 @@ static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai)
 }
 
 /*
- * Callback for async stat RPC, this is called in ptlrpcd context. It prepares
- * the inode and set lock data directly in the ptlrpcd context. It will wake up
- * the directory listing process if the dentry is the waiting one.
+ * Prepare inode for sa_entry and add it into the agl list; after this the
+ * sa_entry is ready to be used by the scanner process.
  */
-static int ll_statahead_interpret(struct req_capsule *pill,
-                                 struct md_op_item *item, int rc)
+static void sa_instantiate(struct ll_statahead_info *sai,
+                          struct sa_entry *entry)
 {
-       struct lookup_intent *it = &item->mop_it;
-       struct inode *dir = item->mop_dir;
-       struct ll_inode_info *lli = ll_i2info(dir);
-       struct ll_statahead_info *sai = lli->lli_sai;
-       struct sa_entry *entry = (struct sa_entry *)item->mop_cbdata;
-       struct mdt_body *body;
+       struct inode *dir = sai->sai_dentry->d_inode;
        struct inode *child;
-       __u64 handle = 0;
+       struct md_enqueue_info *minfo;
+       struct lookup_intent *it;
+       struct ptlrpc_request *req;
+       struct mdt_body *body;
+       int rc = 0;
 
        ENTRY;
 
-       if (it_disposition(it, DISP_LOOKUP_NEG))
-               rc = -ENOENT;
-
-       /*
-        * because statahead thread will wait for all inflight RPC to finish,
-        * sai should be always valid, no need to refcount
-        */
-       LASSERT(sai != NULL);
-       LASSERT(entry != NULL);
-
-       CDEBUG(D_READA, "sa_entry %.*s rc %d\n",
-              entry->se_qstr.len, entry->se_qstr.name, rc);
-
-       if (rc != 0) {
-               ll_intent_release(it);
-               sa_fini_data(item);
-       } else {
-               /*
-                * release ibits lock ASAP to avoid deadlock when statahead
-                * thread enqueues lock on parent in readdir and another
-                * process enqueues lock on child with parent lock held, eg.
-                * unlink.
-                */
-               handle = it->it_lock_handle;
-               ll_intent_drop_lock(it);
-               ll_unlock_md_op_lsm(&item->mop_data);
-       }
-
-       if (rc != 0) {
-               spin_lock(&lli->lli_sa_lock);
-               if (__sa_make_ready(sai, entry, rc))
-                       wake_up(&sai->sai_waitq);
-
-               sai->sai_replied++;
-               spin_unlock(&lli->lli_sa_lock);
-
-               RETURN(rc);
-       }
+       LASSERT(entry->se_handle != 0);
 
-       entry->se_handle = handle;
-       body = req_capsule_server_get(pill, &RMF_MDT_BODY);
+       minfo = entry->se_minfo;
+       it = &minfo->mi_it;
+       req = entry->se_req;
+       body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
        if (!body)
                GOTO(out, rc = -EFAULT);
 
        child = entry->se_inode;
        /* revalidate; unlinked and re-created with the same name */
-       if (unlikely(!lu_fid_eq(&item->mop_data.op_fid2, &body->mbo_fid1))) {
+       if (unlikely(!lu_fid_eq(&minfo->mi_data.op_fid2, &body->mbo_fid1))) {
                if (child) {
                        entry->se_inode = NULL;
                        iput(child);
@@ -682,7 +672,7 @@ static int ll_statahead_interpret(struct req_capsule *pill,
        if (rc != 1)
                GOTO(out, rc = -EAGAIN);
 
-       rc = ll_prep_inode(&child, pill, dir->i_sb, it);
+       rc = ll_prep_inode(&child, &req->rq_pill, dir->i_sb, it);
        if (rc)
                GOTO(out, rc);
 
@@ -696,18 +686,109 @@ static int ll_statahead_interpret(struct req_capsule *pill,
        if (agl_should_run(sai, child))
                ll_agl_add(sai, child, entry->se_index);
 
+       EXIT;
+
 out:
        /*
-        * First it will drop ldlm ibits lock refcount by calling
+        * sa_make_ready() will drop ldlm ibits lock refcount by calling
         * ll_intent_drop_lock() in spite of failures. Do not worry about
         * calling ll_intent_drop_lock() more than once.
         */
-       ll_intent_release(&item->mop_it);
-       sa_fini_data(item);
        sa_make_ready(sai, entry, rc);
+}
+
+/* drain sai_interim_entries: instantiate each entry with lli_sa_lock dropped */
+static void sa_handle_callback(struct ll_statahead_info *sai)
+{
+       struct ll_inode_info *lli;
+
+       lli = ll_i2info(sai->sai_dentry->d_inode);
+
+       spin_lock(&lli->lli_sa_lock);
+       while (sa_has_callback(sai)) {
+               struct sa_entry *entry;
+
+               entry = list_entry(sai->sai_interim_entries.next,
+                                  struct sa_entry, se_list);
+               list_del_init(&entry->se_list);
+               spin_unlock(&lli->lli_sa_lock);
+
+               sa_instantiate(sai, entry);
+               spin_lock(&lli->lli_sa_lock);
+       }
+       spin_unlock(&lli->lli_sa_lock);
+}
+
+/*
+ * Callback for async stat RPC. Because this is called in ptlrpcd context, we
+ * only put sa_entry in sai_interim_entries and wake up the statahead thread,
+ * which really prepares the inode and instantiates the sa_entry later.
+ */
+static int ll_statahead_interpret(struct ptlrpc_request *req,
+                                 struct md_enqueue_info *minfo, int rc)
+{
+       struct lookup_intent *it = &minfo->mi_it;
+       struct inode *dir = minfo->mi_dir;
+       struct ll_inode_info *lli = ll_i2info(dir);
+       struct ll_statahead_info *sai = lli->lli_sai;
+       struct sa_entry *entry = (struct sa_entry *)minfo->mi_cbdata;
+       __u64 handle = 0;
+
+       ENTRY;
+
+       if (it_disposition(it, DISP_LOOKUP_NEG))
+               rc = -ENOENT;
+
+       /*
+        * because statahead thread will wait for all inflight RPC to finish,
+        * sai should be always valid, no need to refcount
+        */
+       LASSERT(sai != NULL);
+       LASSERT(entry != NULL);
+
+       CDEBUG(D_READA, "sa_entry %.*s rc %d\n",
+              entry->se_qstr.len, entry->se_qstr.name, rc);
+
+       if (rc != 0) {
+               ll_intent_release(it);
+               sa_fini_data(minfo);
+       } else {
+               /*
+                * release ibits lock ASAP to avoid deadlock when statahead
+                * thread enqueues lock on parent in readdir and another
+                * process enqueues lock on child with parent lock held, eg.
+                * unlink.
+                */
+               handle = it->it_lock_handle;
+               ll_intent_drop_lock(it);
+               ll_unlock_md_op_lsm(&minfo->mi_data);
+       }
 
        spin_lock(&lli->lli_sa_lock);
+       if (rc != 0) {
+               if (__sa_make_ready(sai, entry, rc))
+                       wake_up(&sai->sai_waitq);
+       } else {
+               int first = 0;
+
+               entry->se_minfo = minfo;
+               entry->se_req = ptlrpc_request_addref(req);
+               /*
+                * Release the async ibits lock ASAP to avoid deadlock
+                * when statahead thread tries to enqueue lock on parent
+                * for readpage and another process tries to enqueue lock
+                * on child with parent's lock held, for example: unlink.
+                */
+               entry->se_handle = handle;
+               if (!sa_has_callback(sai))
+                       first = 1;
+
+               list_add_tail(&entry->se_list, &sai->sai_interim_entries);
+               if (first && sai->sai_task)
+                       wake_up_process(sai->sai_task);
+       }
        sai->sai_replied++;
+
        spin_unlock(&lli->lli_sa_lock);
 
        RETURN(rc);
@@ -716,18 +797,18 @@ out:
 /* async stat for file not found in dcache */
 static int sa_lookup(struct inode *dir, struct sa_entry *entry)
 {
-       struct md_op_item *item;
-       int rc;
+       struct md_enqueue_info   *minfo;
+       int                       rc;
 
        ENTRY;
 
-       item = sa_prep_data(dir, NULL, entry);
-       if (IS_ERR(item))
-               RETURN(PTR_ERR(item));
+       minfo = sa_prep_data(dir, NULL, entry);
+       if (IS_ERR(minfo))
+               RETURN(PTR_ERR(minfo));
 
-       rc = md_intent_getattr_async(ll_i2mdexp(dir), item);
+       rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo);
        if (rc < 0)
-               sa_fini_data(item);
+               sa_fini_data(minfo);
 
        RETURN(rc);
 }
@@ -745,7 +826,7 @@ static int sa_revalidate(struct inode *dir, struct sa_entry *entry,
        struct inode *inode = dentry->d_inode;
        struct lookup_intent it = { .it_op = IT_GETATTR,
                                    .it_lock_handle = 0 };
-       struct md_op_item *item;
+       struct md_enqueue_info *minfo;
        int rc;
 
        ENTRY;
@@ -756,9 +837,9 @@ static int sa_revalidate(struct inode *dir, struct sa_entry *entry,
        if (d_mountpoint(dentry))
                RETURN(1);
 
-       item = sa_prep_data(dir, inode, entry);
-       if (IS_ERR(item))
-               RETURN(PTR_ERR(item));
+       minfo = sa_prep_data(dir, inode, entry);
+       if (IS_ERR(minfo))
+               RETURN(PTR_ERR(minfo));
 
        entry->se_inode = igrab(inode);
        rc = md_revalidate_lock(ll_i2mdexp(dir), &it, ll_inode2fid(inode),
@@ -766,15 +847,15 @@ static int sa_revalidate(struct inode *dir, struct sa_entry *entry,
        if (rc == 1) {
                entry->se_handle = it.it_lock_handle;
                ll_intent_release(&it);
-               sa_fini_data(item);
+               sa_fini_data(minfo);
                RETURN(1);
        }
 
-       rc = md_intent_getattr_async(ll_i2mdexp(dir), item);
+       rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo);
        if (rc < 0) {
                entry->se_inode = NULL;
                iput(inode);
-               sa_fini_data(item);
+               sa_fini_data(minfo);
        }
 
        RETURN(rc);
@@ -952,8 +1033,10 @@ static int ll_statahead_thread(void *arg)
                        break;
                }
 
+               sai->sai_in_readpage = 1;
                page = ll_get_dir_page(dir, op_data, pos);
                ll_unlock_md_op_lsm(op_data);
+               sai->sai_in_readpage = 0;
                if (IS_ERR(page)) {
                        rc = PTR_ERR(page);
                        CDEBUG(D_READA,
@@ -1018,6 +1101,11 @@ static int ll_statahead_thread(void *arg)
 
                        while (({set_current_state(TASK_IDLE);
                                 sai->sai_task; })) {
+                               if (sa_has_callback(sai)) {
+                                       __set_current_state(TASK_RUNNING);
+                                       sa_handle_callback(sai);
+                               }
+
                                spin_lock(&lli->lli_agl_lock);
                                while (sa_sent_full(sai) &&
                                       !agl_list_empty(sai)) {
@@ -1070,11 +1158,16 @@ static int ll_statahead_thread(void *arg)
 
        /*
         * statahead is finished, but statahead entries need to be cached, wait
-        * for file release closedir() call to stop me.
+        * for file release to stop me.
         */
        while (({set_current_state(TASK_IDLE);
                 sai->sai_task; })) {
-               schedule();
+               if (sa_has_callback(sai)) {
+                       __set_current_state(TASK_RUNNING);
+                       sa_handle_callback(sai);
+               } else {
+                       schedule();
+               }
        }
        __set_current_state(TASK_RUNNING);
 
@@ -1090,6 +1183,9 @@ out:
                /* in case we're not woken up, timeout wait */
                msleep(125);
 
+       /* release resources held by statahead RPCs */
+       sa_handle_callback(sai);
+
        CDEBUG(D_READA, "%s: statahead thread stopped: sai %p, parent %pd\n",
               sbi->ll_fsname, sai, parent);
 
@@ -1350,6 +1446,10 @@ static int revalidate_statahead_dentry(struct inode *dir,
        if (!entry)
                GOTO(out, rc = -EAGAIN);
 
+       /* if statahead is busy in readdir, help it do post-work */
+       if (!sa_ready(entry) && sai->sai_in_readpage)
+               sa_handle_callback(sai);
+
        if (!sa_ready(entry)) {
                spin_lock(&lli->lli_sa_lock);
                sai->sai_index_wait = entry->se_index;