Whamcloud - gitweb
LU-1622 lnet: hash MEs on wildcard portal
author Liang Zhen <liang@whamcloud.com>
Wed, 11 Jul 2012 16:47:01 +0000 (00:47 +0800)
committer Oleg Drokin <green@whamcloud.com>
Mon, 23 Jul 2012 15:03:18 +0000 (11:03 -0400)
This patch addresses a few more issues from BZ21619.

The first issue: an ME with ignore-bits should always be attached to a
list instead of the hash table, whether it is on a wildcard portal or
a unique portal. A message can match buffers with various match-bits
if those buffers also have ignore-bits. So if a user sets both
match-bits and ignore-bits for MEs on a unique portal, an incoming
message might never find them: the lookup only searches MEs hashed by
the message's match-bits, even though MEs with different match-bits
could also match once the ignored bits are masked out.

The reason nobody has complained about this is that Lustre has no
such use case (posting an ME with ignore-bits on a unique portal).

The second issue fixed by this patch: if multiple services share
one portal but use match-bits to identify different buffer types,
we still have a performance problem, because all buffers are attached
to one list and searching for a buffer of a given type can be long
and expensive. So we should hash buffers on wildcard portals as well.

This is why LNet selftest performance is poor when PING and BRW tests
are mixed: PING and BRW share one portal, so their request buffers
are attached to the same list. BRW requests are processed much more
slowly than PING requests, so finding a request buffer for PING
requires a long search.

Signed-off-by: Liang Zhen <liang@whamcloud.com>
Change-Id: I0d2c6330dd231d369e2a86ced2b8374c0c96dbf9
Reviewed-on: http://review.whamcloud.com/3376
Tested-by: Hudson
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Bobi Jam <bobijam@whamcloud.com>
Reviewed-by: Doug Oucharek <doug@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/include/lnet/lib-types.h
lnet/lnet/lib-me.c
lnet/lnet/lib-ptl.c

index f4ef8ff..4d3d614 100644 (file)
@@ -260,6 +260,7 @@ typedef struct lnet_me {
         lnet_libhandle_t       me_lh;
         lnet_process_id_t      me_match_id;
         unsigned int           me_portal;
+       unsigned int           me_pos;          /* hash offset in mt_hash */
         __u64                  me_match_bits;
         __u64                  me_ignore_bits;
         lnet_unlink_t          me_unlink;
@@ -601,6 +602,16 @@ struct lnet_match_info {
 /* ME hash of RDMA portal */
 #define LNET_MT_HASH_BITS              8
 #define LNET_MT_HASH_SIZE              (1 << LNET_MT_HASH_BITS)
+#define LNET_MT_HASH_MASK              (LNET_MT_HASH_SIZE - 1)
+/* we allocate (LNET_MT_HASH_SIZE + 1) entries for lnet_match_table::mt_hash,
+ * the last entry is reserved for MEs with ignore-bits */
+#define LNET_MT_HASH_IGNORE            LNET_MT_HASH_SIZE
+/* __u64 has 2^6 bits, so need 2^(LNET_MT_HASH_BITS - LNET_MT_BITS_U64) which
+ * is 4 __u64s as bit-map, and add an extra __u64 (only use one bit) for the
+ * ME-list with ignore-bits, which is mtable::mt_hash[LNET_MT_HASH_IGNORE] */
+#define LNET_MT_BITS_U64               6       /* 2^6 bits */
+#define LNET_MT_EXHAUSTED_BITS         (LNET_MT_HASH_BITS - LNET_MT_BITS_U64)
+#define LNET_MT_EXHAUSTED_BMAP         ((1 << LNET_MT_EXHAUSTED_BITS) + 1)
 
 /* portal match table */
 struct lnet_match_table {
@@ -608,9 +619,10 @@ struct lnet_match_table {
        unsigned int            mt_cpt;
        unsigned int            mt_portal;      /* portal index */
        /* match table is set as "enabled" if there's non-exhausted MD
-        * attached on mt_mlist, it's only valide for wildcard portal */
+        * attached on mt_mhash, it's only valide for wildcard portal */
        unsigned int            mt_enabled;
-       cfs_list_t              mt_mlist;       /* matching list */
+       /* bitmap to flag whether MEs on mt_hash are exhausted or not */
+       __u64                   mt_exhausted[LNET_MT_EXHAUSTED_BMAP];
        cfs_list_t              *mt_mhash;      /* matching hash */
 };
 
index f203e30..8df9917 100644 (file)
@@ -107,8 +107,12 @@ LNetMEAttach(unsigned int portal,
 
        lnet_res_lh_initialize(the_lnet.ln_me_containers[mtable->mt_cpt],
                               &me->me_lh);
-       head = lnet_mt_match_head(mtable, match_id, match_bits);
+       if (ignore_bits != 0)
+               head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE];
+       else
+               head = lnet_mt_match_head(mtable, match_id, match_bits);
 
+       me->me_pos = head - &mtable->mt_mhash[0];
        if (pos == LNET_INS_AFTER || pos == LNET_INS_LOCAL)
                cfs_list_add_tail(&me->me_list, head);
        else
@@ -182,6 +186,7 @@ LNetMEInsert(lnet_handle_me_t current_meh,
                return -EPERM;
         }
 
+       new_me->me_pos = current_me->me_pos;
         new_me->me_portal = current_me->me_portal;
         new_me->me_match_id = match_id;
         new_me->me_match_bits = match_bits;
index d68c4fc..527a28a 100644 (file)
@@ -311,6 +311,49 @@ lnet_mt_of_match(struct lnet_match_info *info, struct lnet_msg *msg)
        return ptl->ptl_mtables[cpt];
 }
 
+static int
+lnet_mt_test_exhausted(struct lnet_match_table *mtable, int pos)
+{
+       __u64   *bmap;
+       int     i;
+       /* Query the mt_exhausted bitmap of @mtable:
+        * - always 0 if lnet_ptl_is_wildcard() is false for its portal;
+        * - pos < 0: 1 only when every word of the bitmap is all-ones;
+        * - otherwise: 1 when the bit for mt_mhash[pos] is set, else 0. */
+       if (!lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal]))
+               return 0;
+
+       if (pos < 0) { /* check all bits */
+               for (i = 0; i < LNET_MT_EXHAUSTED_BMAP; i++) {
+                       if (mtable->mt_exhausted[i] != (__u64)(-1))
+                               return 0;
+               }
+               return 1;
+       }
+
+       LASSERT(pos <= LNET_MT_HASH_IGNORE);
+       /* select the bitmap word that holds the bit for mt_mhash[pos] */
+       bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64];
+       pos &= (1 << LNET_MT_BITS_U64) - 1; /* bit offset inside that word */
+
+       return ((*bmap) & (1ULL << pos)) != 0;
+}
+
+static void
+lnet_mt_set_exhausted(struct lnet_match_table *mtable, int pos, int exhausted)
+{
+       __u64   *bmap;
+       /* Set (@exhausted != 0) or clear (@exhausted == 0) the mt_exhausted
+        * bit for mt_mhash[pos]. Asserts that lnet_ptl_is_wildcard() holds
+        * for the portal of @mtable and that pos <= LNET_MT_HASH_IGNORE.
+        * NOTE(review): a negative @pos is not rejected here - confirm that
+        * callers never pass one. */
+       LASSERT(lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal]));
+       LASSERT(pos <= LNET_MT_HASH_IGNORE);
+
+       /* set mtable::mt_mhash[pos] as exhausted/non-exhausted */
+       bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64];
+       pos &= (1 << LNET_MT_BITS_U64) - 1; /* bit offset inside *bmap */
+
+       if (!exhausted)
+               *bmap &= ~(1ULL << pos);
+       else
+               *bmap |= 1ULL << pos;
+}
+
 cfs_list_t *
 lnet_mt_match_head(struct lnet_match_table *mtable,
                   lnet_process_id_t id, __u64 mbits)
@@ -318,8 +361,7 @@ lnet_mt_match_head(struct lnet_match_table *mtable,
        struct lnet_portal *ptl = the_lnet.ln_portals[mtable->mt_portal];
 
        if (lnet_ptl_is_wildcard(ptl)) {
-               return &mtable->mt_mlist;
-
+               return &mtable->mt_mhash[mbits & LNET_MT_HASH_MASK];
        } else {
                unsigned long hash = mbits + id.nid + id.pid;
 
@@ -339,12 +381,16 @@ lnet_mt_match_md(struct lnet_match_table *mtable,
        int                     exhausted = 0;
        int                     rc;
 
-       /* NB: only wildcard portal can return LNET_MATCHMD_EXHAUSTED */
+       /* any ME with ignore bits? */
+       if (!cfs_list_empty(&mtable->mt_mhash[LNET_MT_HASH_IGNORE]))
+               head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE];
+       else
+               head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits);
+ again:
+       /* NB: only wildcard portal needs to return LNET_MATCHMD_EXHAUSTED */
        if (lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal]))
                exhausted = LNET_MATCHMD_EXHAUSTED;
 
-       head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits);
-
        cfs_list_for_each_entry_safe(me, tmp, head, me_list) {
                /* ME attached but MD not attached yet */
                if (me->me_md == NULL)
@@ -363,6 +409,17 @@ lnet_mt_match_md(struct lnet_match_table *mtable,
                }
        }
 
+       if (exhausted == LNET_MATCHMD_EXHAUSTED) { /* @head is exhausted */
+               lnet_mt_set_exhausted(mtable, head - mtable->mt_mhash, 1);
+               if (!lnet_mt_test_exhausted(mtable, -1))
+                       exhausted = 0;
+       }
+
+       if (exhausted == 0 && head == &mtable->mt_mhash[LNET_MT_HASH_IGNORE]) {
+               head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits);
+               goto again; /* re-check MEs w/o ignore-bits */
+       }
+
        if (info->mi_opc == LNET_MD_OP_GET ||
            !lnet_ptl_is_lazy(the_lnet.ln_portals[info->mi_portal]))
                return LNET_MATCHMD_DROP | exhausted;
@@ -585,7 +642,7 @@ lnet_ptl_attach_md(lnet_me_t *me, lnet_libmd_t *md,
 
        if (cfs_list_empty(&ptl->ptl_msg_stealing) &&
            cfs_list_empty(&ptl->ptl_msg_delayed) &&
-           mtable->mt_enabled)
+           !lnet_mt_test_exhausted(mtable, me->me_pos))
                return;
 
        lnet_ptl_lock(ptl);
@@ -648,8 +705,11 @@ lnet_ptl_attach_md(lnet_me_t *me, lnet_libmd_t *md,
                goto again;
        }
 
-       if (lnet_ptl_is_wildcard(ptl) && !exhausted && !mtable->mt_enabled)
-               lnet_ptl_enable_mt(ptl, cpt);
+       if (lnet_ptl_is_wildcard(ptl) && !exhausted) {
+               lnet_mt_set_exhausted(mtable, me->me_pos, 0);
+               if (!mtable->mt_enabled)
+                       lnet_ptl_enable_mt(ptl, cpt);
+       }
 
        lnet_ptl_unlock(ptl);
 }
@@ -680,25 +740,17 @@ lnet_ptl_cleanup(struct lnet_portal *ptl)
 
                mhash = mtable->mt_mhash;
                /* cleanup ME */
-               while (!cfs_list_empty(&mtable->mt_mlist)) {
-                       me = cfs_list_entry(mtable->mt_mlist.next,
-                                           lnet_me_t, me_list);
-                       CERROR("Active wildcard ME %p on exit\n", me);
-                       cfs_list_del(&me->me_list);
-                       lnet_me_free(me);
-               }
-
-               for (j = 0; j < LNET_MT_HASH_SIZE; j++) {
+               for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++) {
                        while (!cfs_list_empty(&mhash[j])) {
                                me = cfs_list_entry(mhash[j].next,
                                                    lnet_me_t, me_list);
-                               CERROR("Active unique ME %p on exit\n", me);
+                               CERROR("Active ME %p on exit\n", me);
                                cfs_list_del(&me->me_list);
                                lnet_me_free(me);
                        }
                }
-
-               LIBCFS_FREE(mhash, sizeof(*mhash) * LNET_MT_HASH_SIZE);
+               /* the extra entry is for MEs with ignore bits */
+               LIBCFS_FREE(mhash, sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1));
        }
 
        cfs_percpt_free(ptl->ptl_mtables);
@@ -731,19 +783,22 @@ lnet_ptl_setup(struct lnet_portal *ptl, int index)
 # endif
 #endif
        cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) {
+               /* the extra entry is for MEs with ignore bits */
                LIBCFS_CPT_ALLOC(mhash, lnet_cpt_table(), i,
-                                sizeof(*mhash) * LNET_MT_HASH_SIZE);
+                                sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1));
                if (mhash == NULL) {
                        CERROR("Failed to create match hash for portal %d\n",
                               index);
                        goto failed;
                }
 
+               memset(&mtable->mt_exhausted[0], -1,
+                      sizeof(mtable->mt_exhausted[0]) *
+                      LNET_MT_EXHAUSTED_BMAP);
                mtable->mt_mhash = mhash;
-               for (j = 0; j < LNET_MT_HASH_SIZE; j++)
+               for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++)
                        CFS_INIT_LIST_HEAD(&mhash[j]);
 
-               CFS_INIT_LIST_HEAD(&mtable->mt_mlist);
                mtable->mt_portal = index;
                mtable->mt_cpt = i;
        }