4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; if not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2017, DDN Storage Corporation.
26 * lustre/utils/llsom_sync.c
28 * Tool for syncing the LSOM xattr.
30 * Author: Qian Yingjin <qian@ddn.com>
41 #include <sys/types.h>
43 #include <linux/unistd.h>
44 #include <linux/kernel.h>
45 #include <sys/sysinfo.h>
46 #include <linux/lustre/lustre_user.h>
47 #include <lustre/lustreapi.h>
48 #include <linux/lustre/lustre_idl.h>
49 #include <linux/lustre/lustre_fid.h>
50 #include <libcfs/util/hash.h>
51 #include <libcfs/util/list.h>
52 #include <libcfs/util/parser.h>
/*
 * Given a pointer to a member embedded in a structure, compute a pointer to
 * the enclosing structure.
 *   ptr    - address of the embedded member
 *   type   - type of the enclosing structure
 *   member - name of the member inside that type
 * The temporary __mptr is declared with the member's own type, so the
 * compiler checks the pointer that is passed in.  Relies on the GNU
 * statement-expression and typeof extensions.
 */
54 #define container_of(ptr, type, member) ({ \
55 const typeof(((type *) 0)->member) * __mptr = (ptr); \
56 (type *) ((char *) __mptr - offsetof(type, member)); })
/*
 * Built-in defaults.
 *   CHLG_POLL_INTV - initial value of opt.o_intv in main(); the help text
 *                    describes -i as a poll interval in seconds.
 *   REC_MIN_AGE    - initial value of opt.o_min_age in main(); presumably
 *                    also in seconds, since lsom_check_sync() adds it to
 *                    (fr_time >> 30) and compares the sum with "now".
 *   DEF_CACHE_SIZE - initial cache size in main() and the fallback used by
 *                    get_fid_cache_size() when sysinfo() fails.
 */
58 #define CHLG_POLL_INTV 60
59 #define REC_MIN_AGE 600
60 #define DEF_CACHE_SIZE (256 * 1048576) /* 256MB */
/*
 * Option fields accessed through the global "opt".
 *   o_chlg_user       - changelog user taken from an option argument in
 *                       main() (presumably -u); passed to
 *                       llapi_changelog_clear() in lsom_update_one().
 *   o_mdtname         - MDT device name taken from an option argument in
 *                       main() (presumably -m); passed to
 *                       llapi_changelog_clear() in lsom_update_one().
 *   o_cached_fid_hiwm - main() sets this to
 *                       cache_size / sizeof(struct fid_rec);
 *                       lsom_check_sync() compares head.lh_cached_count
 *                       against it.
 *   o_batch_sync_cnt  - main() sets this to half of o_cached_fid_hiwm;
 *                       used as the batch size once the high watermark is
 *                       exceeded.
 */
63 const char *o_chlg_user;
64 const char *o_mdtname;
71 unsigned long o_cached_fid_hiwm; /* high watermark */
72 unsigned long o_batch_sync_cnt;
/*
 * Linkage of one cached FID record.
 *   fr_node - membership in a hash bucket of head.lh_hash
 *             (see fid_hash_add() / fid_hash_del()).
 *   fr_link - membership in head.lh_list, the list ordered by record index.
 */
78 struct hlist_node fr_node;
79 struct list_head fr_link;
/*
 * FID hash table configuration.
 *   fid_hash_shift   - log2 of the bucket count.
 *   FID_HASH_ENTRIES - number of buckets (1 << 6 = 64).
 *   FID_ON_HASH(f)   - true while record f's fr_node is linked into a bucket.
 *   FID_HASH_FN(f)   - bucket index for FID f: the FID is flattened to a
 *                      word-sized integer (fid_flatten32() on 32-bit builds,
 *                      fid_flatten() on 64-bit builds) and then reduced to
 *                      fid_hash_shift bits with hash_long().
 * Any other word size is rejected at compile time.
 */
85 static const int fid_hash_shift = 6;
87 #define FID_HASH_ENTRIES (1 << fid_hash_shift)
88 #define FID_ON_HASH(f) (!hlist_unhashed(&(f)->fr_node))
90 #if __BITS_PER_LONG == 32
91 #define FID_HASH_FN(f) (hash_long(fid_flatten32(f), fid_hash_shift))
92 #elif __BITS_PER_LONG == 64
93 #define FID_HASH_FN(f) (hash_long(fid_flatten(f), fid_hash_shift))
95 #error Wordsize not 32 or 64
/*
 * Fields of the global record cache "head".
 *   lh_hash         - array of FID_HASH_ENTRIES buckets allocated in
 *                     lsom_setup(); used for lookup by FID.
 *   lh_list         - all cached records, kept ordered by record index.
 *   lh_cached_count - number of cached records: incremented in
 *                     process_record() when a record is appended and
 *                     decremented in lsom_start_update().
 */
99 struct hlist_head *lh_hash;
100 struct list_head lh_list; /* ordered list by record index */
101 unsigned long lh_cached_count;
/*
 * Print the command synopsis and the option summary to stdout.
 *
 * prog: program name, presumably substituted for the %s in the synopsis.
 *
 * NOTE(review): the help text contains the typos "memroy" and "ouput".
 * They are runtime strings and are left untouched here.
 * NOTE(review): -c is described as a percentage only, but main() parses it
 * with Parser_size() and treats it as a percentage only when the value is
 * below 100 - confirm and update the help text accordingly.
 */
104 static void usage(char *prog)
106 printf("\nUsage: %s [options] -u <userid> -m <mdtdev> <mntpt>\n"
108 "\t-d, --daemonize\n"
109 "\t-i, --interval, poll interval in second\n"
110 "\t-a, --min-age, min age before a record is processed.\n"
111 "\t-c, --max-cache, percentage of the memroy used for cache.\n"
112 "\t-s, --sync, data sync when update LSOM xattr\n"
113 "\t-v, --verbose, produce more verbose ouput\n",
/*
 * Flatten a FID into a single 64-bit value; FID_HASH_FN() feeds the result
 * to hash_long() on 64-bit builds.
 *
 * IGIF FIDs take their value from lu_igif_ino().  For other FIDs the
 * sequence-derived value "seq" is mixed with fid_oid(fid) by the shift/mask
 * expression below.
 *
 * Returns the flattened value, or fid_oid(fid) if that value is 0 (GNU
 * "?:" shorthand).
 */
118 static inline __u64 fid_flatten(const struct lu_fid *fid)
123 if (fid_is_igif(fid)) {
124 ino = lu_igif_ino(fid);
130 ino = (seq << 24) + ((seq >> 24) & 0xffffff0000ULL) + fid_oid(fid);
132 return ino ?: fid_oid(fid);
136 * Map a FID to a 32-bit value for use as an inode number on 32-bit systems.
/*
 * Flatten a FID into a 32-bit value; FID_HASH_FN() feeds the result to
 * hash_long() on 32-bit builds.
 *
 * IGIF FIDs take their value from lu_igif_ino().  For other FIDs the
 * sequence number, rebased by FID_SEQ_START, is folded together with
 * fid_oid(fid) as explained in the comment inside the function.
 *
 * Returns the flattened value, or fid_oid(fid) if that value is 0.
 *
 * NOTE(review): the in-body comment says "2^12 = 1024", but 2^12 is 4096
 * (1024 is 2^10) - confirm which figure is intended.
 */
138 static inline __u32 fid_flatten32(const struct lu_fid *fid)
143 if (fid_is_igif(fid)) {
144 ino = lu_igif_ino(fid);
148 seq = fid_seq(fid) - FID_SEQ_START;
150 /* Map the high bits of the OID into higher bits of the inode number so
151 * that inodes generated at about the same time have a reduced chance
152 * of collisions. This will give a period of 2^12 = 1024 unique clients
153 * (from SEQ) and up to min(LUSTRE_SEQ_MAX_WIDTH, 2^20) = 128k objects
154 * (from OID), or up to 128M inodes without collisions for new files.
156 ino = ((seq & 0x000fffffULL) << 12) + ((seq >> 8) & 0xfffff000) +
157 (seq >> (64 - (40-8)) & 0xffffff00) +
158 (fid_oid(fid) & 0xff000fff) + ((fid_oid(fid) & 0x00fff000) << 8);
160 return ino ?: fid_oid(fid);
/*
 * Compare two FIDs field by field.
 *
 * Returns true only when f_seq, f_oid and f_ver are all equal.
 */
163 static inline bool fid_eq(const lustre_fid *f1, const lustre_fid *f2)
165 return f1->f_seq == f2->f_seq && f1->f_oid == f2->f_oid &&
166 f1->f_ver == f2->f_ver;
/*
 * Unlink record f from its hash bucket.
 *
 * hlist_del_init() is used rather than a plain delete, presumably so that
 * FID_ON_HASH(f) reports false afterwards and the record can be hashed
 * again.
 */
169 static void fid_hash_del(struct fid_rec *f)
172 hlist_del_init(&f->fr_node);
/*
 * Insert record f at the front of the hash bucket selected by
 * FID_HASH_FN(&f->fr_fid).
 *
 * The record must not already be hashed; this is enforced with assert().
 */
175 static void fid_hash_add(struct fid_rec *f)
177 assert(!FID_ON_HASH(f));
178 hlist_add_head(&f->fr_node, &head.lh_hash[FID_HASH_FN(&f->fr_fid)]);
/*
 * Look up the cached record for a FID.
 *
 * Only the bucket selected by FID_HASH_FN(fid) is scanned; entries are
 * matched with fid_eq().
 *
 * Returns presumably the matching record, or NULL when the FID is not
 * cached - process_record() allocates a new record right after calling
 * this function.
 */
181 static struct fid_rec *fid_hash_find(const lustre_fid *fid)
183 struct hlist_head *hash_list;
184 struct hlist_node *entry, *next;
187 hash_list = &head.lh_hash[FID_HASH_FN(fid)];
188 hlist_for_each_entry_safe(f, entry, next, hash_list, fr_node) {
189 assert(FID_ON_HASH(f));
190 if (fid_eq(fid, &f->fr_fid))
/*
 * Initialise the global record cache.
 *
 * Sets the llapi message level from opt.o_verbose, zeroes "head",
 * allocates FID_HASH_ENTRIES hash buckets and initialises each of them,
 * then initialises the ordered list head.lh_list.
 *
 * An allocation failure is logged together with the requested size.
 * Presumably 0 is returned on success and an error code on failure.
 */
197 static int lsom_setup(void)
201 /* set llapi message level */
202 llapi_msg_set_level(opt.o_verbose);
204 memset(&head, 0, sizeof(head));
205 head.lh_hash = malloc(sizeof(struct hlist_head) * FID_HASH_ENTRIES);
206 if (head.lh_hash == NULL) {
207 llapi_err_noerrno(LLAPI_MSG_ERROR,
208 "failed to alloc memory for hash (%zu).",
209 sizeof(struct hlist_head) * FID_HASH_ENTRIES);
213 for (i = 0; i < FID_HASH_ENTRIES; i++)
214 INIT_HLIST_HEAD(&head.lh_hash[i]);
216 INIT_LIST_HEAD(&head.lh_list);
220 static void lsom_cleanup(void)
/*
 * Refresh the LSOM data of one cached file and clear its changelog record.
 *
 * Steps:
 *  1. Open the file by FID below opt.o_mntpt, read-only and with O_NOATIME.
 *     Per the comment below, a file that has since been deleted is not
 *     treated as an error.
 *  2. If opt.o_data_sync is set, call llapi_get_data_version() with
 *     LL_DV_RD_FLUSH to flush dirty pages from clients; a failure is
 *     logged and otherwise ignored.
 *  3. Stat the file.  Per the comment below, this brings the OST
 *     attributes to the client so that the MDS updates the LSOM data when
 *     the file is closed.
 *  4. Clear the changelog for opt.o_chlg_user on opt.o_mdtname up to
 *     f->fr_index.
 *
 * f: cached record carrying the FID, changelog time and changelog index.
 *
 * NOTE(review): the data-version failure is reported with errno while every
 * other failure here is reported with rc; if llapi_get_data_version()
 * returns a negative errno in rc, errno may be stale - confirm.
 * NOTE(review): the stat failure is reported with rc; if rc is the raw
 * return value of fstat() it is -1 rather than an errno value - confirm.
 * NOTE(review): st.st_size and st.st_blocks are printed with %lu; if st is
 * a struct stat they are off_t and blkcnt_t, which need not be unsigned
 * long on every platform - consider casting.
 */
225 static int lsom_update_one(struct fid_rec *f)
231 fd = llapi_open_by_fid(opt.o_mntpt, &f->fr_fid,
232 O_RDONLY | O_NOATIME);
236 /* The file may be deleted, clean the corresponding
237 * changelog record and ignore this error.
242 llapi_error(LLAPI_MSG_ERROR, rc,
243 "llapi_open_by_fid for " DFID " failed",
248 if (opt.o_data_sync) {
251 /* Flush dirty pages from clients */
252 rc = llapi_get_data_version(fd, &dv, LL_DV_RD_FLUSH);
254 llapi_error(LLAPI_MSG_ERROR, errno,
255 "failed to sync data for " DFID,
257 /* ignore this error, continue to sync lsom data */
262 llapi_error(LLAPI_MSG_ERROR, rc, "failed to stat FID: " DFID,
267 /* After call fstat(), it already gets OST attrs to the client,
268 * when close the file, MDS will update the LSOM data itself
269 * according the size and blocks information from the client.
273 llapi_printf(LLAPI_MSG_DEBUG,
274 "record %llu:%llu, updated LSOM for fid " DFID
275 " size:%lu blocks:%lu\n",
276 (unsigned long long)f->fr_time,
277 (unsigned long long)f->fr_index,
278 PFID(&f->fr_fid), st.st_size, st.st_blocks);
281 rc = llapi_changelog_clear(opt.o_mdtname,
282 opt.o_chlg_user, f->fr_index);
284 llapi_error(LLAPI_MSG_ERROR, rc,
285 "failed to clear changelog record: %s:%llu",
286 opt.o_chlg_user, (unsigned long long)f->fr_index);
/*
 * Sync a batch of cached records, oldest first.
 *
 * Each record is taken from the front of head.lh_list (the list is ordered
 * by record index), passed to lsom_update_one(), unlinked from the list
 * and accounted for by decrementing head.lh_cached_count.
 *
 * count: number of records to sync; it is logged at INFO level first.
 */
290 static int lsom_start_update(int count)
295 llapi_printf(LLAPI_MSG_INFO, "Start to sync %d records.\n", count);
300 f = list_entry(head.lh_list.next, struct fid_rec, fr_link);
301 rc = lsom_update_one(f);
303 list_del_init(&f->fr_link);
306 head.lh_cached_count--;
/*
 * Decide whether cached records must be synced now and, if so, sync them.
 *
 * Nothing is done when head.lh_list is empty.  Otherwise:
 *  - when head.lh_cached_count exceeds opt.o_cached_fid_hiwm, a batch of
 *    opt.o_batch_sync_cnt records is requested;
 *  - otherwise the oldest record is examined and is handled once "now" is
 *    later than (fr_time >> 30) + opt.o_min_age.  The shift presumably
 *    extracts the seconds part of the changelog timestamp.
 * The chosen count is passed to lsom_start_update().
 */
317 static int lsom_check_sync(void)
324 if (list_empty(&head.lh_list))
327 if (head.lh_cached_count > opt.o_cached_fid_hiwm)
328 count = opt.o_batch_sync_cnt;
333 /* When the first record in the list was not being
334 * processed for a long time (more than o_min_age),
335 * pop the record, start to handle it immediately.
338 f = list_entry(head.lh_list.next, struct fid_rec, fr_link);
339 if (now > ((f->fr_time >> 30) + opt.o_min_age))
344 rc = lsom_start_update(count);
346 if (rc == 0 && count == 1)
/*
 * Re-position record f inside head.lh_list, which is kept ordered by
 * record index.
 *
 * The walk starts at the element following f and compares f->fr_index
 * with each later record's fr_index; f is finally moved with
 * list_move_tail() relative to the position where the walk stopped
 * (with the usual list API that places f just before "pos" - confirm).
 *
 * f: record already linked into head.lh_list.
 */
352 static void lsom_sort_record_list(struct fid_rec *f)
354 struct list_head *pos;
355 bool need_move = false;
357 for (pos = f->fr_link.next; pos != &head.lh_list; pos = pos->next) {
358 struct fid_rec *rec = list_entry(pos, struct fid_rec, fr_link);
360 if (f->fr_index > rec->fr_index) {
369 list_move_tail(&f->fr_link, pos);
/*
 * Fold one changelog record into the FID cache.
 *
 * Only CL_CLOSE, CL_TRUNC and CL_SETATTR records are cached.  The target
 * FID (cr_tfid) is looked up with fid_hash_find():
 *  - for a new FID a fid_rec is allocated, filled from the record and
 *    appended to the tail of head.lh_list, and head.lh_cached_count is
 *    incremented;
 *  - otherwise the existing record is presumably refreshed and
 *    re-positioned with lsom_sort_record_list().
 * Every record is then reported at DEBUG level.
 *
 * rec: changelog record as returned by llapi_changelog_recv().
 *
 * An allocation failure is logged.
 *
 * NOTE(review): cr_type is compared against the CL_* constants without
 * conversion but is passed through __le32_to_cpu() when logged - confirm
 * which form is correct and use it consistently.
 */
372 static int process_record(struct changelog_rec *rec)
374 __u64 index = rec->cr_index;
377 if (rec->cr_type == CL_CLOSE || rec->cr_type == CL_TRUNC ||
378 rec->cr_type == CL_SETATTR) {
381 f = fid_hash_find(&rec->cr_tfid);
383 f = malloc(sizeof(struct fid_rec));
386 llapi_error(LLAPI_MSG_ERROR, rc,
387 "failed to alloc memory for fid_rec");
391 f->fr_fid = rec->cr_tfid;
393 f->fr_time = rec->cr_time;
394 INIT_HLIST_NODE(&f->fr_node);
397 * The newly changelog record index is processed in the
398 * ascending order, so it is safe to put the record at
399 * the tail of the ordered list.
401 list_add_tail(&f->fr_link, &head.lh_list);
402 head.lh_cached_count++;
405 lsom_sort_record_list(f);
409 llapi_printf(LLAPI_MSG_DEBUG,
410 "Processed changelog record index:%llu type:%s(0x%x) FID:"DFID"\n",
411 (unsigned long long)index,
412 changelog_type2str(__le32_to_cpu(rec->cr_type)),
413 __le32_to_cpu(rec->cr_type), PFID(&rec->cr_tfid));
/*
 * Derive the FID cache size from the amount of system memory.
 *
 * pct: percentage of total memory to use; main() calls this only for
 *      values below 100.
 *
 * The size is sinfo.totalram * pct / 100.  If sysinfo() fails, the error
 * is logged and DEF_CACHE_SIZE is used instead.
 *
 * NOTE(review): sysinfo(2) reports totalram in units of sinfo.mem_unit
 * bytes; mem_unit is not applied here, so the result is in bytes only when
 * mem_unit is 1 - confirm.
 * NOTE(review): totalram * pct is evaluated in unsigned long and may wrap
 * where long is 32 bits - confirm.
 * NOTE(review): the sysinfo() failure is reported with rc; sysinfo()
 * returns -1 and sets errno, so rc is not an errno value - confirm.
 */
418 static unsigned long get_fid_cache_size(int pct)
420 struct sysinfo sinfo;
421 unsigned long cache_size;
424 rc = sysinfo(&sinfo);
426 llapi_error(LLAPI_MSG_ERROR, rc, "failed to get sysinfo");
427 /* ignore this error, just pick some reasonable static
428 * limit for the cache size (e.g. 256MB, default value).
430 cache_size = DEF_CACHE_SIZE;
432 /* maximum cached fid size is tuned according to total
433 * memory size, e.g. 5% of the memory.
435 cache_size = sinfo.totalram * pct / 100;
/*
 * Entry point: parse the command line, then follow the changelog of one
 * MDT and keep the LSOM data of modified files in sync.
 *
 * Flow, as far as it is visible here:
 *  1. Set the defaults: no data sync, INFO verbosity, CHLG_POLL_INTV,
 *     REC_MIN_AGE and DEF_CACHE_SIZE.
 *  2. Parse the options u, h, m, d, s, i, a, c and v (each also has a long
 *     form).  A -c value below 100 is taken as a percentage of total
 *     memory and converted with get_fid_cache_size().
 *  3. Require exactly one positional argument, the mount point, and
 *     resolve its file system name with llapi_search_fsname().
 *  4. Require a changelog user; daemonize if requested; make stdout
 *     unbuffered.
 *  5. Derive opt.o_cached_fid_hiwm from the cache size and
 *     opt.o_batch_sync_cnt as half of it.
 *  6. Open the changelog with the BLOCK, JOBID and EXTRA_FLAGS flags, then
 *     loop: receive a record, process_record(), free it and call
 *     lsom_check_sync(), until EOF or a stop request.
 *  7. Close the changelog; afterwards call lsom_check_sync() again or
 *     flush every cached record with
 *     lsom_start_update(head.lh_cached_count), presumably depending on
 *     whether the tool runs as a daemon.
 *
 * NOTE(review): -i and -a are parsed with atoi(), which cannot report
 * malformed input; only negative values are rejected.  strtol() with an
 * end-pointer check would catch non-numeric arguments.
 * NOTE(review): the "no mount point specified" message is also printed
 * when more than one positional argument is given, because the test is
 * argc != optind + 1.
 * NOTE(review): the -c error message contains the typo "valud"; it is a
 * runtime string and is left untouched here.
 */
441 int main(int argc, char **argv)
446 struct changelog_rec *rec;
449 unsigned long cache_size = DEF_CACHE_SIZE;
450 char fsname[MAX_OBD_NAME + 1];
451 static struct option options[] = {
452 { "mdt", required_argument, NULL, 'm' },
453 { "user", required_argument, 0, 'u'},
454 { "daemonize", no_argument, NULL, 'd'},
455 { "interval", required_argument, NULL, 'i'},
456 { "min-age", required_argument, NULL, 'a'},
457 { "max-cache", required_argument, NULL, 'c'},
458 { "verbose", no_argument, NULL, 'v'},
459 { "sync", no_argument, NULL, 's'},
460 { "help", no_argument, NULL, 'h' },
464 memset(&opt, 0, sizeof(opt));
465 opt.o_data_sync = false;
466 opt.o_verbose = LLAPI_MSG_INFO;
467 opt.o_intv = CHLG_POLL_INTV;
468 opt.o_min_age = REC_MIN_AGE;
470 while ((c = getopt_long(argc, argv, "u:hm:dsi:a:c:v", options, NULL))
475 llapi_error(LLAPI_MSG_ERROR, rc,
476 "%s: unknown option '-%c'\n",
480 opt.o_chlg_user = optarg;
486 opt.o_mdtname = optarg;
489 opt.o_daemonize = true;
492 opt.o_intv = atoi(optarg);
493 if (opt.o_intv < 0) {
495 llapi_error(LLAPI_MSG_ERROR, rc,
496 "bad value for -i %s", optarg);
501 opt.o_min_age = atoi(optarg);
502 if (opt.o_min_age < 0) {
504 llapi_error(LLAPI_MSG_ERROR, rc,
505 "bad value for -a %s", optarg);
510 rc = Parser_size(&cache_size, optarg);
513 llapi_error(LLAPI_MSG_ERROR, rc,
514 "bad valud for -c '%s'", optarg);
518 /* For value < 100, it is taken as the percentage of
519 * total memory instead.
521 if (cache_size < 100)
522 cache_size = get_fid_cache_size(cache_size);
523 llapi_printf(LLAPI_MSG_INFO, "Cache size: %lu\n",
530 opt.o_data_sync = true;
535 if (argc != optind + 1) {
536 llapi_err_noerrno(LLAPI_MSG_ERROR,
537 "%s: no mount point specified\n", argv[0]);
541 opt.o_mntpt = argv[optind];
542 rc = llapi_search_fsname(opt.o_mntpt, fsname);
544 llapi_error(LLAPI_MSG_ERROR, rc,
545 "cannot find a Lustre file system mounted at '%s'",
553 if (!opt.o_chlg_user)
556 if (opt.o_daemonize) {
560 llapi_error(LLAPI_MSG_ERROR, rc, "cannot daemonize");
564 setbuf(stdout, NULL);
567 opt.o_cached_fid_hiwm = cache_size / sizeof(struct fid_rec);
568 opt.o_batch_sync_cnt = opt.o_cached_fid_hiwm / 2;
577 llapi_printf(LLAPI_MSG_DEBUG, "Start receiving records\n");
578 rc = llapi_changelog_start(&chglog_hdlr,
579 CHANGELOG_FLAG_BLOCK |
580 CHANGELOG_FLAG_JOBID |
581 CHANGELOG_FLAG_EXTRA_FLAGS,
584 llapi_error(LLAPI_MSG_ERROR, rc,
585 "unable to open changelog of MDT [%s]\n",
590 while (!eof && !stop) {
591 rc = llapi_changelog_recv(chglog_hdlr, &rec);
594 rc = process_record(rec);
596 llapi_error(LLAPI_MSG_ERROR, rc,
597 "failed to process record");
601 llapi_changelog_free(&rec);
603 rc = lsom_check_sync();
611 llapi_printf(LLAPI_MSG_DEBUG,
612 "finished reading [%s]\n",
616 case -EINVAL: /* FS unmounted */
617 case -EPROTO: /* error in KUC channel */
620 llapi_error(LLAPI_MSG_ERROR, rc,
621 "failed to get changelog record");
627 /* reach EOF of changelog */
628 rc = llapi_changelog_fini(&chglog_hdlr);
630 llapi_error(LLAPI_MSG_ERROR, rc,
631 "unable to close changelog of MDT [%s]",
637 if (opt.o_daemonize) {
640 rc = lsom_check_sync();
646 lsom_start_update(head.lh_cached_count);