Whamcloud - gitweb
LU-11025 dne: introduce new directory hash type: "crush"
[fs/lustre-release.git] / lustre / include / lustre_lmv.h
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License version 2 for more details.  A copy is
14  * included in the COPYING file that accompanied this code.
15
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2014, 2016, Intel Corporation.
24  */
25 /*
26  * lustre/include/lustre_lmv.h
27  *
28  * Lustre LMV structures and functions.
29  *
30  * Author: Di Wang <di.wang@intel.com>
31  */
32
33 #ifndef _LUSTRE_LMV_H
34 #define _LUSTRE_LMV_H
35 #include <uapi/linux/lustre/lustre_idl.h>
36
37 struct lmv_oinfo {
38         struct lu_fid   lmo_fid;
39         u32             lmo_mds;
40         struct inode    *lmo_root;
41 };
42
43 struct lmv_stripe_md {
44         __u32   lsm_md_magic;
45         __u32   lsm_md_stripe_count;
46         __u32   lsm_md_master_mdt_index;
47         __u32   lsm_md_hash_type;
48         __u32   lsm_md_layout_version;
49         __u32   lsm_md_migrate_offset;
50         __u32   lsm_md_migrate_hash;
51         __u32   lsm_md_default_count;
52         __u32   lsm_md_default_index;
53         char    lsm_md_pool_name[LOV_MAXPOOLNAME + 1];
54         struct lmv_oinfo lsm_md_oinfo[0];
55 };
56
57 static inline bool lmv_dir_striped(const struct lmv_stripe_md *lsm)
58 {
59         return lsm && lsm->lsm_md_magic == LMV_MAGIC;
60 }
61
62 static inline bool lmv_dir_foreign(const struct lmv_stripe_md *lsm)
63 {
64         return lsm && lsm->lsm_md_magic == LMV_MAGIC_FOREIGN;
65 }
66
67 static inline bool lmv_dir_migrating(const struct lmv_stripe_md *lsm)
68 {
69         return lmv_dir_striped(lsm) &&
70                lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION;
71 }
72
73 static inline bool lmv_dir_bad_hash(const struct lmv_stripe_md *lsm)
74 {
75         if (!lmv_dir_striped(lsm))
76                 return false;
77
78         if (lmv_dir_migrating(lsm) &&
79             lsm->lsm_md_stripe_count - lsm->lsm_md_migrate_offset <= 1)
80                 return false;
81
82         if (lsm->lsm_md_hash_type & LMV_HASH_FLAG_BAD_TYPE)
83                 return true;
84
85         return !lmv_is_known_hash_type(lsm->lsm_md_hash_type);
86 }
87
88 static inline bool
89 lsm_md_eq(const struct lmv_stripe_md *lsm1, const struct lmv_stripe_md *lsm2)
90 {
91         __u32 idx;
92
93         if (lsm1->lsm_md_magic != lsm2->lsm_md_magic ||
94             lsm1->lsm_md_stripe_count != lsm2->lsm_md_stripe_count ||
95             lsm1->lsm_md_master_mdt_index !=
96                                 lsm2->lsm_md_master_mdt_index ||
97             lsm1->lsm_md_hash_type != lsm2->lsm_md_hash_type ||
98             lsm1->lsm_md_layout_version !=
99                                 lsm2->lsm_md_layout_version ||
100             lsm1->lsm_md_migrate_offset !=
101                                 lsm2->lsm_md_migrate_offset ||
102             lsm1->lsm_md_migrate_hash !=
103                                 lsm2->lsm_md_migrate_hash ||
104             strncmp(lsm1->lsm_md_pool_name, lsm2->lsm_md_pool_name,
105                     sizeof(lsm1->lsm_md_pool_name)) != 0)
106                 return false;
107
108         if (lmv_dir_striped(lsm1)) {
109                 for (idx = 0; idx < lsm1->lsm_md_stripe_count; idx++) {
110                         if (!lu_fid_eq(&lsm1->lsm_md_oinfo[idx].lmo_fid,
111                                        &lsm2->lsm_md_oinfo[idx].lmo_fid))
112                                 return false;
113                 }
114         }
115
116         return true;
117 }
118
119 static inline void lsm_md_dump(int mask, const struct lmv_stripe_md *lsm)
120 {
121         int i;
122
123         /* If lsm_md_magic == LMV_MAGIC_FOREIGN pool_name may not be a null
124          * terminated string so only print LOV_MAXPOOLNAME bytes.
125          */
126         CDEBUG(mask,
127                "magic %#x stripe count %d master mdt %d hash type %#x version %d migrate offset %d migrate hash %#x pool %.*s\n",
128                lsm->lsm_md_magic, lsm->lsm_md_stripe_count,
129                lsm->lsm_md_master_mdt_index, lsm->lsm_md_hash_type,
130                lsm->lsm_md_layout_version, lsm->lsm_md_migrate_offset,
131                lsm->lsm_md_migrate_hash,
132                LOV_MAXPOOLNAME, lsm->lsm_md_pool_name);
133
134         if (!lmv_dir_striped(lsm))
135                 return;
136
137         for (i = 0; i < lsm->lsm_md_stripe_count; i++)
138                 CDEBUG(mask, "stripe[%d] "DFID"\n",
139                        i, PFID(&lsm->lsm_md_oinfo[i].lmo_fid));
140 }
141
142 union lmv_mds_md;
143
144 void lmv_free_memmd(struct lmv_stripe_md *lsm);
145
146 static inline void lmv1_le_to_cpu(struct lmv_mds_md_v1 *lmv_dst,
147                                   const struct lmv_mds_md_v1 *lmv_src)
148 {
149         __u32 i;
150
151         lmv_dst->lmv_magic = le32_to_cpu(lmv_src->lmv_magic);
152         lmv_dst->lmv_stripe_count = le32_to_cpu(lmv_src->lmv_stripe_count);
153         lmv_dst->lmv_master_mdt_index =
154                                 le32_to_cpu(lmv_src->lmv_master_mdt_index);
155         lmv_dst->lmv_hash_type = le32_to_cpu(lmv_src->lmv_hash_type);
156         lmv_dst->lmv_layout_version = le32_to_cpu(lmv_src->lmv_layout_version);
157         for (i = 0; i < lmv_src->lmv_stripe_count; i++)
158                 fid_le_to_cpu(&lmv_dst->lmv_stripe_fids[i],
159                               &lmv_src->lmv_stripe_fids[i]);
160 }
161
162 static inline void lmv_le_to_cpu(union lmv_mds_md *lmv_dst,
163                                  const union lmv_mds_md *lmv_src)
164 {
165         switch (le32_to_cpu(lmv_src->lmv_magic)) {
166         case LMV_MAGIC_V1:
167                 lmv1_le_to_cpu(&lmv_dst->lmv_md_v1, &lmv_src->lmv_md_v1);
168                 break;
169         default:
170                 break;
171         }
172 }
173
174 /* This hash is only for testing purpose */
175 static inline unsigned int
176 lmv_hash_all_chars(unsigned int count, const char *name, int namelen)
177 {
178         unsigned int c = 0;
179         const unsigned char *p = (const unsigned char *)name;
180
181         while (--namelen >= 0)
182                 c += p[namelen];
183
184         c = c % count;
185
186         return c;
187 }
188
189 static inline unsigned int
190 lmv_hash_fnv1a(unsigned int count, const char *name, int namelen)
191 {
192         __u64 hash;
193
194         hash = lustre_hash_fnv_1a_64(name, namelen);
195
196         return do_div(hash, count);
197 }
198
199 /*
200  * Robert Jenkins' function for mixing 32-bit values
201  * http://burtleburtle.net/bob/hash/evahash.html
202  * a, b = random bits, c = input and output
203  *
204  * Mixing inputs to generate an evenly distributed hash.
205  */
206 #define crush_hashmix(a, b, c)                          \
207 do {                                                    \
208         a = a - b;  a = a - c;  a = a ^ (c >> 13);      \
209         b = b - c;  b = b - a;  b = b ^ (a << 8);       \
210         c = c - a;  c = c - b;  c = c ^ (b >> 13);      \
211         a = a - b;  a = a - c;  a = a ^ (c >> 12);      \
212         b = b - c;  b = b - a;  b = b ^ (a << 16);      \
213         c = c - a;  c = c - b;  c = c ^ (b >> 5);       \
214         a = a - b;  a = a - c;  a = a ^ (c >> 3);       \
215         b = b - c;  b = b - a;  b = b ^ (a << 10);      \
216         c = c - a;  c = c - b;  c = c ^ (b >> 15);      \
217 } while (0)
218
219 #define crush_hash_seed 1315423911
220
221 static inline __u32 crush_hash(__u32 a, __u32 b)
222 {
223         __u32 hash = crush_hash_seed ^ a ^ b;
224         __u32 x = 231232;
225         __u32 y = 1232;
226
227         crush_hashmix(a, b, hash);
228         crush_hashmix(x, a, hash);
229         crush_hashmix(b, y, hash);
230
231         return hash;
232 }
233
234 /* refer to https://github.com/ceph/ceph/blob/master/src/crush/hash.c and
235  * https://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf for details of CRUSH
236  * algorithm.
237  */
238 static inline unsigned int
239 lmv_hash_crush(unsigned int count, const char *name, int namelen)
240 {
241         unsigned long long straw;
242         unsigned long long highest_straw = 0;
243         unsigned int pg_id;
244         unsigned int idx = 0;
245         int i;
246
247         /* put temp and backup file on the same MDT where target is located.
248          * temporary file naming rule:
249          * 1. rsync: .<target>.XXXXXX
250          * 2. dstripe: <target>.XXXXXXXX
251          */
252         if (lu_name_is_temp_file(name, namelen, true, 6)) {
253                 name++;
254                 namelen -= 8;
255         } else if (lu_name_is_temp_file(name, namelen, false, 8)) {
256                 namelen -= 9;
257         } else if (lu_name_is_backup_file(name, namelen, &i)) {
258                 LASSERT(i < namelen);
259                 namelen -= i;
260         }
261
262         pg_id = lmv_hash_fnv1a(LMV_CRUSH_PG_COUNT, name, namelen);
263
264         /* distribute PG among all stripes pseudo-randomly, so they are almost
265          * evenly distributed, and when stripe count changes, only (delta /
266          * total) sub files need to be moved, herein 'delta' is added or removed
267          * stripe count, 'total' is total stripe count before change for
268          * removal, or count after change for addition.
269          */
270         for (i = 0; i < count; i++) {
271                 straw = crush_hash(pg_id, i);
272                 if (straw > highest_straw) {
273                         highest_straw = straw;
274                         idx = i;
275                 }
276         }
277         LASSERT(idx < count);
278
279         return idx;
280 }
281
282 static inline int lmv_name_to_stripe_index(__u32 hash_type,
283                                            unsigned int stripe_count,
284                                            const char *name, int namelen)
285 {
286         unsigned int idx;
287
288         LASSERT(namelen > 0);
289         LASSERT(stripe_count > 0);
290
291         if (stripe_count == 1)
292                 return 0;
293
294         switch (hash_type & LMV_HASH_TYPE_MASK) {
295         case LMV_HASH_TYPE_ALL_CHARS:
296                 idx = lmv_hash_all_chars(stripe_count, name, namelen);
297                 break;
298         case LMV_HASH_TYPE_FNV_1A_64:
299                 idx = lmv_hash_fnv1a(stripe_count, name, namelen);
300                 break;
301         case LMV_HASH_TYPE_CRUSH:
302                 idx = lmv_hash_crush(stripe_count, name, namelen);
303                 break;
304         default:
305                 return -EBADFD;
306         }
307
308         CDEBUG(D_INFO, "name %.*s hash_type %#x idx %d/%u\n", namelen, name,
309                hash_type, idx, stripe_count);
310
311         return idx;
312 }
313
314 static inline bool lmv_user_magic_supported(__u32 lum_magic)
315 {
316         return lum_magic == LMV_USER_MAGIC ||
317                lum_magic == LMV_USER_MAGIC_SPECIFIC ||
318                lum_magic == LMV_MAGIC_FOREIGN;
319 }
320
321 static inline bool lmv_is_sane(const struct lmv_mds_md_v1 *lmv)
322 {
323         if (le32_to_cpu(lmv->lmv_magic) != LMV_MAGIC_V1)
324                 return false;
325
326         if (le32_to_cpu(lmv->lmv_stripe_count) == 0)
327                 return false;
328
329         if (!lmv_is_known_hash_type(lmv->lmv_hash_type))
330                 return false;
331
332         return true;
333 }
334
335 #endif