Whamcloud - gitweb
LU-7330 ldlm: fix race of starting bl threads 26/17026/2
authorNiu Yawei <yawei.niu@intel.com>
Tue, 3 Nov 2015 06:59:32 +0000 (01:59 -0500)
committerOleg Drokin <oleg.drokin@intel.com>
Wed, 11 Nov 2015 15:53:51 +0000 (15:53 +0000)
There is race in the code of starting bl threads which leads to
thread number exceeds the maximum number when race happened, it
can also lead to duplicated thread name.

This patch fixes the race and cleanup the code a bit.

Signed-off-by: Niu Yawei <yawei.niu@intel.com>
Change-Id: I9c9be125d1d76890b8c52476684976dad3cb3d87
Reviewed-on: http://review.whamcloud.com/17026
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Bobi Jam <bobijam@hotmail.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
lustre/ldlm/ldlm_lockd.c

index 651451a..72a6fb2 100644 (file)
@@ -2600,7 +2600,6 @@ static int ldlm_bl_get_work(struct ldlm_bl_pool *blp,
 
 /* This only contains temporary data until the thread starts */
 struct ldlm_bl_thread_data {
-       char                    bltd_name[CFS_CURPROC_COMM_MAX];
        struct ldlm_bl_pool     *bltd_blp;
        struct completion       bltd_comp;
        int                     bltd_num;
@@ -2608,19 +2607,32 @@ struct ldlm_bl_thread_data {
 
 static int ldlm_bl_thread_main(void *arg);
 
-static int ldlm_bl_thread_start(struct ldlm_bl_pool *blp)
+static int ldlm_bl_thread_start(struct ldlm_bl_pool *blp, bool check_busy)
 {
        struct ldlm_bl_thread_data bltd = { .bltd_blp = blp };
        struct task_struct *task;
 
        init_completion(&bltd.bltd_comp);
-       bltd.bltd_num = atomic_read(&blp->blp_num_threads);
-       snprintf(bltd.bltd_name, sizeof(bltd.bltd_name) - 1,
-               "ldlm_bl_%02d", bltd.bltd_num);
-       task = kthread_run(ldlm_bl_thread_main, &bltd, bltd.bltd_name);
+
+       bltd.bltd_num = atomic_inc_return(&blp->blp_num_threads);
+       if (bltd.bltd_num >= blp->blp_max_threads) {
+               atomic_dec(&blp->blp_num_threads);
+               return 0;
+       }
+
+       LASSERTF(bltd.bltd_num > 0, "thread num:%d\n", bltd.bltd_num);
+       if (check_busy &&
+           atomic_read(&blp->blp_busy_threads) < (bltd.bltd_num - 1)) {
+               atomic_dec(&blp->blp_num_threads);
+               return 0;
+       }
+
+       task = kthread_run(ldlm_bl_thread_main, &bltd, "ldlm_bl_%02d",
+                          bltd.bltd_num);
        if (IS_ERR(task)) {
                CERROR("cannot start LDLM thread ldlm_bl_%02d: rc %ld\n",
-                      atomic_read(&blp->blp_num_threads), PTR_ERR(task));
+                      bltd.bltd_num, PTR_ERR(task));
+               atomic_dec(&blp->blp_num_threads);
                return PTR_ERR(task);
        }
        wait_for_completion(&bltd.bltd_comp);
@@ -2632,12 +2644,11 @@ static int ldlm_bl_thread_start(struct ldlm_bl_pool *blp)
 static int ldlm_bl_thread_need_create(struct ldlm_bl_pool *blp,
                                      struct ldlm_bl_work_item *blwi)
 {
-       int busy = atomic_read(&blp->blp_busy_threads);
-
-       if (busy >= blp->blp_max_threads)
+       if (atomic_read(&blp->blp_num_threads) >= blp->blp_max_threads)
                return 0;
 
-       if (busy < atomic_read(&blp->blp_num_threads))
+       if (atomic_read(&blp->blp_busy_threads) <
+           atomic_read(&blp->blp_num_threads))
                return 0;
 
        if (blwi != NULL && (blwi->blwi_ns == NULL ||
@@ -2726,9 +2737,6 @@ static int ldlm_bl_thread_main(void *arg)
 
        blp = bltd->bltd_blp;
 
-       atomic_inc(&blp->blp_num_threads);
-       atomic_inc(&blp->blp_busy_threads);
-
        complete(&bltd->bltd_comp);
        /* cannot use bltd after this, it is only on caller's stack */
 
@@ -2740,29 +2748,28 @@ static int ldlm_bl_thread_main(void *arg)
 
                rc = ldlm_bl_get_work(blp, &blwi, &exp);
 
-               if (rc == 0) {
-                       atomic_dec(&blp->blp_busy_threads);
+               if (rc == 0)
                        l_wait_event_exclusive(blp->blp_waitq,
                                               ldlm_bl_get_work(blp, &blwi,
                                                                &exp),
                                               &lwi);
-                       atomic_inc(&blp->blp_busy_threads);
-               }
+               atomic_inc(&blp->blp_busy_threads);
 
                if (ldlm_bl_thread_need_create(blp, blwi))
                        /* discard the return value, we tried */
-                       ldlm_bl_thread_start(blp);
+                       ldlm_bl_thread_start(blp, true);
 
                if (exp)
                        rc = ldlm_bl_thread_exports(blp, exp);
                else if (blwi)
                        rc = ldlm_bl_thread_blwi(blp, blwi);
 
+               atomic_dec(&blp->blp_busy_threads);
+
                if (rc == LDLM_ITER_STOP)
                        break;
        }
 
-       atomic_dec(&blp->blp_busy_threads);
        atomic_dec(&blp->blp_num_threads);
        complete(&blp->blp_comp);
        RETURN(0);
@@ -3042,7 +3049,7 @@ static int ldlm_setup(void)
        }
 
        for (i = 0; i < blp->blp_min_threads; i++) {
-               rc = ldlm_bl_thread_start(blp);
+               rc = ldlm_bl_thread_start(blp, false);
                if (rc < 0)
                        GOTO(out, rc);
        }