Whamcloud - gitweb
lmv/iam: new "scalable" readdir.
author nikita <nikita>
Thu, 23 Nov 2006 18:44:45 +0000 (18:44 +0000)
committer nikita <nikita>
Thu, 23 Nov 2006 18:44:45 +0000 (18:44 +0000)
lustre/kernel_patches/patches/ext3-iam-separate.patch
lustre/lmv/lmv_obd.c
lustre/obdclass/mea.c

index 7064248..965c641 100644 (file)
@@ -2887,7 +2887,7 @@ Index: iam/fs/ext3/iam_lvar.c
 ===================================================================
 --- iam.orig/fs/ext3/iam_lvar.c
 +++ iam/fs/ext3/iam_lvar.c
-@@ -0,0 +1,1042 @@
+@@ -0,0 +1,1061 @@
 +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
 + * vim:expandtab:shiftwidth=8:tabstop=8:
 + *
@@ -3063,17 +3063,16 @@ Index: iam/fs/ext3/iam_lvar.c
 +#define LVAR_HASH_R5     (0)
 +#define LVAR_HASH_PREFIX (0)
 +
-+static inline lvar_hash_t get_hash(const struct iam_container *bag,
-+                                   const char *name, int namelen)
++static __u32 hash_build0(const char *name, int namelen)
 +{
-+        lvar_hash_t result;
++        __u32 result;
 +
 +        if (namelen == 0)
 +                return 0;
 +        if (strncmp(name, ".", 1) == 0 && namelen == 1)
-+                return 2;
++                return 1;
 +        if (strncmp(name, "..", 2) == 0 && namelen == 2)
-+                return 4;
++                return 2;
 +
 +        if (LVAR_HASH_PREFIX) {
 +                result = 0;
@@ -3084,14 +3083,34 @@ Index: iam/fs/ext3/iam_lvar.c
 +
 +                if (LVAR_HASH_TEA)
 +                        hinfo.hash_version = DX_HASH_TEA;
-+                else if (LVAR_HASH_R5)
++                else
 +                        hinfo.hash_version = DX_HASH_R5;
 +                hinfo.seed = 0;
 +                ext3fs_dirhash(name, namelen, &hinfo);
 +                result = hinfo.hash;
 +        }
++        return result;
++}
++
++enum {
++        HASH_GRAY_AREA = 1024,        /* width of the band just below
++                                       * MAX_HASH_SIZE that hash_build()
++                                       * special-cases */
++        MAX_HASH_SIZE  = 0x7fffffffUL /* mask hash_build() applies to the
++                                       * shifted hash_build0() value */
++};
 +
-+        return (result << 1) & 0x7fffffff;
++/*
++ * Build the hash of @name (@namelen bytes) that get_hash() hands out.
++ *
++ * The raw hash_build0() value is shifted left by one bit and masked with
++ * MAX_HASH_SIZE. A result that lands in the topmost HASH_GRAY_AREA values
++ * is folded back into [0, HASH_GRAY_AREA), so the returned hash never
++ * exceeds MAX_HASH_SIZE - HASH_GRAY_AREA. Adding HASH_GRAY_AREA instead
++ * would move such a hash past MAX_HASH_SIZE.
++ *
++ * The fold relies on HASH_GRAY_AREA being a power of two.
++ */
++static __u32 hash_build(const char *name, int namelen)
++{
++        __u32 hash;
++
++        hash = (hash_build0(name, namelen) << 1) & MAX_HASH_SIZE;
++        if (hash > MAX_HASH_SIZE - HASH_GRAY_AREA)
++                hash &= HASH_GRAY_AREA - 1;
++        return hash;
++}
++
++/*
++ * Return the hash of @name (@namelen bytes) as an lvar_hash_t.
++ *
++ * @bag is accepted but not consulted: the value comes entirely from
++ * hash_build().
++ */
++static inline lvar_hash_t get_hash(const struct iam_container *bag,
++                                   const char *name, int namelen)
++{
++        lvar_hash_t hash = hash_build(name, namelen);
++
++        return hash;
 +}
 +
 +static inline int e_eq(const struct lvar_leaf_entry *ent,
index 06ea1e0..3f6fd59 100644 (file)
@@ -36,6 +36,7 @@
 #else
 #include <liblustre.h>
 #endif
+
 #include <linux/ext2_fs.h>
 
 #include <lustre/lustre_idl.h>
@@ -185,7 +186,7 @@ static int lmv_notify(struct obd_device *obd, struct obd_device *watched,
         if (ev == OBD_NOTIFY_OCD) {
                 struct obd_connect_data *conn_data =
                         &watched->u.cli.cl_import->imp_connect_data;
-                
+
                 /* Set connect data to desired target, update
                  * exp_connect_flags. */
                 rc = lmv_set_mdc_data(lmv, uuid, conn_data);
@@ -745,21 +746,21 @@ static int lmv_placement_policy(struct obd_device *obd,
 
         LASSERT(mds != NULL);
 
-        /* 
+        /*
          * Allocate new fid on target according to operation type and parent
          * home mds.
          */
         obj = lmv_obj_grab(obd, &op_data->op_fid1);
         if (obj != NULL || op_data->op_name == NULL ||
             op_data->op_opc != LUSTRE_OPC_MKDIR) {
-                /* 
+                /*
                  * Allocate fid for non-dir or for null name or for case parent
                  * dir is split.
                  */
                 if (obj) {
                         lmv_obj_put(obj);
 
-                        /* 
+                        /*
                          * If we have this flag turned on, and we see that
                          * parent dir is split, this means, that caller did not
                          * notice split yet. This is race and we would like to
@@ -768,7 +769,7 @@ static int lmv_placement_policy(struct obd_device *obd,
                         if (op_data->op_bias & MDS_CHECK_SPLIT)
                                 RETURN(-ERESTART);
                 }
-                                
+
                 /*
                  * Allocate new fid on same mds where parent fid is located. In
                  * case of split dir, ->op_fid1 here will contain fid of slave
@@ -778,7 +779,7 @@ static int lmv_placement_policy(struct obd_device *obd,
                 if (rc)
                         GOTO(out, rc);
         } else {
-                /* 
+                /*
                  * Parent directory is not split and we want to create a
                  * directory in it. Let's calculate where to place it according
                  * to name.
@@ -1332,11 +1333,11 @@ repeat:
                 CDEBUG(D_OTHER, "created - "DFID"\n", PFID(&op_data->op_fid1));
         } else if (rc == -ERESTART) {
                 LASSERT(*request != NULL);
-                DEBUG_REQ(D_WARNING|D_RPCTRACE, *request, 
+                DEBUG_REQ(D_WARNING|D_RPCTRACE, *request,
                           "Got -ERESTART during create!\n");
                 ptlrpc_req_finished(*request);
                 *request = NULL;
-                
+
                 /*
                  * Directory got split. Time to update local object and repeat
                  * the request with proper MDS.
@@ -1399,7 +1400,7 @@ lmv_enqueue_slaves(struct obd_export *exp, int locktype,
                 memset(op_data2, 0, sizeof(*op_data2));
                 op_data2->op_fid1 = mea->mea_ids[i];
                 op_data2->op_bias = 0;
-        
+
                 tgt_exp = lmv_find_export(lmv, &op_data2->op_fid1);
                 if (IS_ERR(tgt_exp))
                         GOTO(cleanup, rc = PTR_ERR(tgt_exp));
@@ -1589,7 +1590,7 @@ repeat:
         obj = lmv_obj_grab(obd, &rid);
         if (obj) {
                 int mea_idx;
-                
+
                 /* Directory is split. Look for right mds for this name */
                 mea_idx = raw_name2idx(obj->lo_hashtype, obj->lo_objcount,
                                        filename, namelen - 1);
@@ -1635,11 +1636,11 @@ repeat:
                 }
         } else if (rc == -ERESTART) {
                 LASSERT(*request != NULL);
-                DEBUG_REQ(D_WARNING|D_RPCTRACE, *request, 
+                DEBUG_REQ(D_WARNING|D_RPCTRACE, *request,
                           "Got -ERESTART during getattr!\n");
                 ptlrpc_req_finished(*request);
                 *request = NULL;
-                
+
                 /*
                  * Directory got split. Time to update local object and repeat
                  * the request with proper MDS.
@@ -1673,7 +1674,7 @@ repeat:
         LASSERT(++loop <= 2);
         if (op_data->op_namelen != 0) {
                 int mea_idx;
-                
+
                 /* Usual link request */
                 obj = lmv_obj_grab(obd, &op_data->op_fid2);
                 if (obj) {
@@ -1713,11 +1714,11 @@ repeat:
         rc = md_link(lmv->tgts[mds].ltd_exp, op_data, request);
         if (rc == -ERESTART) {
                 LASSERT(*request != NULL);
-                DEBUG_REQ(D_WARNING|D_RPCTRACE, *request, 
+                DEBUG_REQ(D_WARNING|D_RPCTRACE, *request,
                           "Got -ERESTART during link!\n");
                 ptlrpc_req_finished(*request);
                 *request = NULL;
-                
+
                 /*
                  * Directory got split. Time to update local object and repeat
                  * the request with proper MDS.
@@ -1813,7 +1814,7 @@ repeat:
                 CDEBUG(D_OTHER, "Parent obj "DFID"\n", PFID(&op_data->op_fid2));
                 lmv_obj_put(obj);
         }
-        
+
 request:
         op_data->op_fsuid = current->fsuid;
         op_data->op_fsgid = current->fsgid;
@@ -1823,11 +1824,11 @@ request:
                        new, newlen, request);
         if (rc == -ERESTART) {
                 LASSERT(*request != NULL);
-                DEBUG_REQ(D_WARNING|D_RPCTRACE, *request, 
+                DEBUG_REQ(D_WARNING|D_RPCTRACE, *request,
                           "Got -ERESTART during rename!\n");
                 ptlrpc_req_finished(*request);
                 *request = NULL;
-                
+
                 /*
                  * Directory got split. Time to update local object and repeat
                  * the request with proper MDS.
@@ -1960,8 +1961,30 @@ int lmv_blocking_ast(struct ldlm_lock *lock,
         RETURN(0);
 }
 
+/*
+ * Subtract @hash_adj (host byte order) from the little-endian hash stored
+ * at @hash, writing the result back in little-endian form.
+ *
+ * The values 0 and DIR_END_OFF are left untouched.
+ */
+static void lmv_hash_adjust(__u32 *hash, __u32 hash_adj)
+{
+        __u32 cpu_hash = le32_to_cpu(*hash);
+
+        if (cpu_hash == 0 || cpu_hash == DIR_END_OFF)
+                return;
+
+        *hash = cpu_to_le32(cpu_hash - hash_adj);
+}
+
+/*
+ * Compute a 32-bit rank from @exp and @fid: the c_self value of the
+ * import's connection is added to fid_flatten(@fid), and the upper and
+ * lower halves of that 64-bit sum are XOR-ed together.
+ */
+static __u32 lmv_node_rank(struct obd_export *exp, const struct lu_fid *fid)
+{
+        struct obd_import *import;
+        __u64 seed;
+
+        /*
+         * XXX Hack: to get nid we assume that underlying obd device is mdc.
+         */
+        import = class_exp2cliimp(exp);
+        seed   = import->imp_connection->c_self + fid_flatten(fid);
+
+        /* Narrowing to __u32 keeps only the folded low half. */
+        return (__u32)(seed ^ (seed >> 32));
+}
+
 static int lmv_readpage(struct obd_export *exp, const struct lu_fid *fid,
-                        struct obd_capa *oc, __u64 offset, struct page *page,
+                        struct obd_capa *oc, __u64 offset64, struct page *page,
                         struct ptlrpc_request **request)
 {
         struct obd_device *obd = exp->exp_obd;
@@ -1969,35 +1992,67 @@ static int lmv_readpage(struct obd_export *exp, const struct lu_fid *fid,
         struct obd_export *tgt_exp;
         struct lu_fid rid = *fid;
         struct lmv_obj *obj;
-        int i = 0, rc;
+        __u32 offset;
+        __u32 hash_adj = 0;
+        __u32 rank = 0;
+        __u32 seg_size = 0;
+        int tgt = 0;
+        int tgt0 = 0;
+        int rc;
+        int nr = 0;
         ENTRY;
 
+        offset = offset64;
+        /*
+         * Check that offset is representable by 32bit number.
+         */
+        LASSERT((__u64)offset == offset64);
+
         rc = lmv_check_connect(obd);
        if (rc)
                RETURN(rc);
 
-        CDEBUG(D_INFO, "READPAGE at %llu from "DFID"\n", offset, PFID(&rid));
+        CDEBUG(D_INFO, "READPAGE at %x from "DFID"\n", offset, PFID(&rid));
 
         obj = lmv_obj_grab(obd, fid);
         if (obj) {
-                __u64 index = offset;
-                __u64 seg = MAX_HASH_SIZE;
+                struct lmv_inode *loi;
+
                 lmv_obj_lock(obj);
 
-                LASSERT(obj->lo_objcount > 0);
-                do_div(seg, obj->lo_objcount);
-                do_div(index, (__u32)seg);
-                i = (int)index;
-                rid = obj->lo_inodes[i].li_fid;
-                tgt_exp = lmv_get_export(lmv, obj->lo_inodes[i].li_mds);
+                nr       = obj->lo_objcount;
+                LASSERT(nr > 0);
+                seg_size = MAX_HASH_SIZE / nr;
+                loi      = obj->lo_inodes;
+                rank     = lmv_node_rank(lmv_get_export(lmv, loi[0].li_mds),
+                                         fid) % nr;
+                tgt0     = (offset / seg_size) % nr;
+                tgt      = (tgt0 + rank) % nr;
+
+                if (tgt < tgt0)
+                        /*
+                         * Wrap around.
+                         *
+                         * Last segment has unusual length due to division
+                         * rounding.
+                         */
+                        hash_adj = MAX_HASH_SIZE - seg_size * nr;
+                else
+                        hash_adj = 0;
+
+                hash_adj += rank * seg_size;
 
-                lmv_obj_unlock(obj);
+                CDEBUG(D_INFO, "hash_adj: %x %x %x/%x -> %x/%x\n",
+                       rank, hash_adj, offset, tgt0, offset + hash_adj, tgt);
+
+                offset = (offset + hash_adj) % MAX_HASH_SIZE;
+                rid = obj->lo_inodes[tgt].li_fid;
+                tgt_exp = lmv_get_export(lmv, loi[tgt].li_mds);
 
                 CDEBUG(D_INFO, "forward to "DFID" with offset %lu i %d\n",
-                       PFID(&rid), (unsigned long)offset, i);
-        } else {
+                       PFID(&rid), (unsigned long)offset, tgt);
+        } else
                 tgt_exp = lmv_find_export(lmv, &rid);
-        }
 
         if (IS_ERR(tgt_exp))
                 GOTO(cleanup, rc = PTR_ERR(tgt_exp));
@@ -2005,19 +2060,32 @@ static int lmv_readpage(struct obd_export *exp, const struct lu_fid *fid,
         rc = md_readpage(tgt_exp, &rid, oc, offset, page, request);
         if (rc)
                 GOTO(cleanup, rc);
-        if (obj && i < obj->lo_objcount - 1) {
+        if (obj) {
+                __u32 hend;
                 struct lu_dirpage *dp;
-                __u32 end;
-                (void)cfs_kmap(page);
-                dp = cfs_page_address(page);
-                end = le32_to_cpu(dp->ldp_hash_end);
-                if (end == DIR_END_OFF) {
-                        __u64 max_hash = MAX_HASH_SIZE;
-
-                        do_div(max_hash, obj->lo_objcount);
-                        dp->ldp_hash_end = (__u32)max_hash * (i + 1);
-                        CDEBUG(D_INFO, ""DFID" reset end %lu i %d\n", PFID(&rid),
-                                        (unsigned long)dp->ldp_hash_end, i);
+                struct lu_dirent  *ent;
+
+                dp = cfs_kmap(page);
+
+                hend = le32_to_cpu(dp->ldp_hash_end);
+                lmv_hash_adjust(&dp->ldp_hash_start, hash_adj);
+                lmv_hash_adjust(&dp->ldp_hash_end,   hash_adj);
+
+                for (ent = lu_dirent_start(dp); ent != NULL;
+                     ent = lu_dirent_next(ent))
+                        lmv_hash_adjust(&ent->lde_hash, hash_adj);
+
+                if (tgt0 != nr - 1) {
+                        __u32 end;
+
+                        end = le32_to_cpu(dp->ldp_hash_end);
+                        if (end == DIR_END_OFF) {
+                                dp->ldp_hash_end = cpu_to_le32(seg_size *
+                                                               (tgt0 + 1));
+                                CDEBUG(D_INFO, ""DFID" reset end %x tgt %d\n",
+                                       PFID(&rid),
+                                       le32_to_cpu(dp->ldp_hash_end), tgt);
+                        }
                 }
                 cfs_kunmap(page);
         }
@@ -2027,8 +2095,10 @@ static int lmv_readpage(struct obd_export *exp, const struct lu_fid *fid,
          */
         EXIT;
 cleanup:
-        if (obj)
+        if (obj) {
+                lmv_obj_unlock(obj);
                 lmv_obj_put(obj);
+        }
         return rc;
 }
 
@@ -2052,7 +2122,7 @@ static int lmv_unlink_slaves(struct obd_export *exp,
         op_data2->op_fsuid = current->fsuid;
         op_data2->op_fsgid = current->fsgid;
         op_data2->op_bias = 0;
-        
+
         LASSERT(mea != NULL);
         for (i = 0; i < mea->mea_count; i++) {
                 memset(op_data2, 0, sizeof(*op_data2));
@@ -2137,15 +2207,15 @@ repeat:
         op_data->op_fsuid = current->fsuid;
         op_data->op_fsgid = current->fsgid;
         op_data->op_cap   = current->cap_effective;
-        
+
         rc = md_unlink(tgt_exp, op_data, request);
         if (rc == -ERESTART) {
                 LASSERT(*request != NULL);
-                DEBUG_REQ(D_WARNING|D_RPCTRACE, *request, 
+                DEBUG_REQ(D_WARNING|D_RPCTRACE, *request,
                           "Got -ERESTART during unlink!\n");
                 ptlrpc_req_finished(*request);
                 *request = NULL;
-                
+
                 /*
                  * Directory got split. Time to update local object and repeat
                  * the request with proper MDS.
index 030cf61..cc83130 100644 (file)
@@ -68,16 +68,16 @@ static int mea_all_chars_hash(int count, char *name, int namelen)
 #define LVAR_HASH_R5     (0)
 #define LVAR_HASH_PREFIX (0)
 
-static __u32 hash_build(char *name, int namelen)
+static __u32 hash_build0(const char *name, int namelen)
 {
         __u32 result;
 
         if (namelen == 0)
                 return 0;
         if (strncmp(name, ".", 1) == 0 && namelen == 1)
-                return 2;
+                return 1;
         if (strncmp(name, "..", 2) == 0 && namelen == 2)
-                return 4;
+                return 2;
 
         if (LVAR_HASH_PREFIX) {
                 result = 0;
@@ -95,18 +95,31 @@ static __u32 hash_build(char *name, int namelen)
                 result = hinfo.hash;
         }
 
-        return (result << 1) & 0x7fffffff;
+        return result;
 }
 
-static int mea_hash_segment(int count, char *name, int namelen)
+enum {
+        HASH_GRAY_AREA = 1024 /* width of the band just below MAX_HASH_SIZE
+                               * that hash_build() special-cases */
+};
+
+/*
+ * Build the hash of @name (@namelen bytes) that mea_hash_segment() divides
+ * into segments.
+ *
+ * The raw hash_build0() value is shifted left by one bit and masked with
+ * MAX_HASH_SIZE. A result that lands in the topmost HASH_GRAY_AREA values
+ * is folded back into [0, HASH_GRAY_AREA), so the returned hash never
+ * exceeds MAX_HASH_SIZE - HASH_GRAY_AREA. Adding HASH_GRAY_AREA instead
+ * would move such a hash past MAX_HASH_SIZE.
+ *
+ * The fold relies on HASH_GRAY_AREA being a power of two.
+ */
+static __u32 hash_build(const char *name, int namelen)
+{
+        __u32 hash;
+
+        hash = (hash_build0(name, namelen) << 1) & MAX_HASH_SIZE;
+        if (hash > MAX_HASH_SIZE - HASH_GRAY_AREA)
+                hash &= HASH_GRAY_AREA - 1;
+        return hash;
+}
+
+/*
+ * Return the index of the hash segment that @name (@namelen bytes) falls
+ * into when the hash space is cut into @count segments of
+ * MAX_HASH_SIZE / @count values each.
+ *
+ * Integer division leaves a short tail of hash values past the last full
+ * segment. A hash landing there would yield an index equal to @count, one
+ * past the last valid segment, so the assertion requires a strictly smaller
+ * index instead of letting that value reach the caller.
+ */
+static int mea_hash_segment(int count, const char *name, int namelen)
 {
-        __u64 hash;
-        __u64 hash_segment = MAX_HASH_SIZE;
+        __u32 hash;
+
+        LASSERT(IS_PO2(MAX_HASH_SIZE + 1));
+        /* count is used as a divisor below. */
+        LASSERT(count > 0);
 
-        hash = hash_build(name, namelen);
-        do_div(hash_segment, count);
-        do_div(hash, hash_segment);
-        LASSERTF(hash <= count, "hash "LPU64" count %d \n", hash, count);
+        hash = hash_build(name, namelen) / (MAX_HASH_SIZE / count);
+        LASSERTF(hash < count, "hash %x count %d \n", hash, count);
 
         return hash;
 }