Whamcloud - gitweb
9676ded11223a599a4169b68fc0cce4edd89159d
[fs/lustre-release.git] / lustre / utils / llsom_sync.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; if not, see
18  * http://www.gnu.org/licenses/gpl-2.0.html
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2017, DDN Storage Corporation.
24  */
25 /*
26  * lustre/utils/llsom_sync.c
27  *
28  * Tool for syncing the LSOM xattr.
29  *
30  * Author: Qian Yingjin <qian@ddn.com>
31  */
32
33 #include <stdlib.h>
34 #include <errno.h>
35 #include <getopt.h>
36 #include <unistd.h>
37 #include <fcntl.h>
38 #include <poll.h>
39 #include <assert.h>
40 #include <sys/stat.h>
41 #include <sys/types.h>
42 #include <time.h>
43 #include <linux/unistd.h>
44 #include <linux/kernel.h>
45 #include <sys/sysinfo.h>
46 #include <linux/lustre/lustre_user.h>
47 #include <lustre/lustreapi.h>
48 #include <linux/lustre/lustre_idl.h>
49 #include <linux/lustre/lustre_fid.h>
50 #include <libcfs/util/hash.h>
51 #include <libcfs/util/list.h>
52 #include <libcfs/util/parser.h>
53
54 #ifndef PATH_MAX
55 #define PATH_MAX (4096)
56 #endif
57
/*
 * container_of - recover a pointer to the enclosing structure.
 *
 * @ptr:    pointer to the member inside the structure
 * @type:   type of the enclosing structure
 * @member: name of the member @ptr points at
 *
 * The temporary __mptr makes the compiler check that @ptr really has the
 * member's type.  __typeof__ is used instead of the bare typeof keyword so
 * the macro also builds when a strict ISO mode (-std=c99/-std=c11) is
 * selected.
 */
#define container_of(ptr, type, member) ({				\
	const __typeof__(((type *)0)->member) *__mptr = (ptr);		\
	(type *)((char *)__mptr - offsetof(type, member)); })
61
/* Default value of opt.o_intv: seconds slept between two changelog polls. */
#define CHLG_POLL_INTV		60
/* Default value of opt.o_min_age: seconds a record waits before syncing. */
#define REC_MIN_AGE		600
/* Default memory budget for cached FID records: 256MB. */
#define DEF_CACHE_SIZE		(256 * 1024 * 1024)
65
/* Run-time configuration, filled in from the command line by main(). */
struct options {
	/* changelog user id (-u), used when clearing changelog records */
	const char	*o_chlg_user;
	/* MDT device name (-m) whose changelog is read */
	const char	*o_mdtname;
	/* Lustre mount point (positional argument) */
	const char	*o_mntpt;
	/* -d: keep polling the changelog instead of exiting at EOF */
	bool		 o_daemonize;
	/* -s: flush client data before refreshing the LSOM of a file */
	bool		 o_data_sync;
	/* llapi message level; each -v raises it by one */
	int		 o_verbose;
	/* -i: seconds to sleep between two changelog polls */
	int		 o_intv;
	/* -a: seconds a record must age before it is synced */
	int		 o_min_age;
	/* high watermark: sync a batch once more records than this are cached */
	unsigned long	 o_cached_fid_hiwm;
	/* number of records synced in one batch above the watermark */
	unsigned long	 o_batch_sync_cnt;
};

/* The single, global option set used throughout this tool. */
struct options opt;
80
81 struct fid_rec {
82         struct hlist_node       fr_node;
83         struct list_head        fr_link;
84         lustre_fid              fr_fid;
85         __u64                   fr_time;
86         __u64                   fr_index;
87 };
88
89 static const int fid_hash_shift = 6;
90
91 #define FID_HASH_ENTRIES        (1 << fid_hash_shift)
92 #define FID_ON_HASH(f)          (!hlist_unhashed(&(f)->fr_node))
93
94 #if __BITS_PER_LONG == 32
95 #define FID_HASH_FN(f)  (hash_long(fid_flatten32(f), fid_hash_shift))
96 #elif __BITS_PER_LONG == 64
97 #define FID_HASH_FN(f)  (hash_long(fid_flatten(f), fid_hash_shift))
98 #else
99 #error Wordsize not 32 or 64
100 #endif
101
102 struct lsom_head {
103         struct hlist_head       *lh_hash;
104         struct list_head         lh_list; /* ordered list by record index */
105         unsigned long            lh_cached_count;
106 } head;
107
/**
 * Print the command-line help to stdout and terminate the process.
 *
 * \param[in] prog	program name shown in the usage line (argv[0])
 *
 * This function does not return: it always ends with exit(0).
 */
static void usage(char *prog)
{
	printf("\nUsage: %s [options] -u <userid> -m <mdtdev> <mntpt>\n"
	       "options:\n"
	       "\t-d, --daemonize\n"
	       "\t-i, --interval, poll interval in second\n"
	       "\t-a, --min-age, min age before a record is processed.\n"
	       "\t-c, --max-cache, cache size, or percentage of the memory "
	       "if less than 100.\n"
	       "\t-s, --sync, data sync when update LSOM xattr\n"
	       "\t-v, --verbose, produce more verbose output\n"
	       "\t-h, --help, print this help and exit\n",
	       prog);
	exit(0);
}
121
122 static inline __u64 fid_flatten(const struct lu_fid *fid)
123 {
124         __u64 ino;
125         __u64 seq;
126
127         if (fid_is_igif(fid)) {
128                 ino = lu_igif_ino(fid);
129                 return ino;
130         }
131
132         seq = fid_seq(fid);
133
134         ino = (seq << 24) + ((seq >> 24) & 0xffffff0000ULL) + fid_oid(fid);
135
136         return ino ?: fid_oid(fid);
137 }
138
139 /**
140  * map fid to 32 bit value for ino on 32bit systems.
141  */
142 static inline __u32 fid_flatten32(const struct lu_fid *fid)
143 {
144         __u32 ino;
145         __u64 seq;
146
147         if (fid_is_igif(fid)) {
148                 ino = lu_igif_ino(fid);
149                 return ino;
150         }
151
152         seq = fid_seq(fid) - FID_SEQ_START;
153
154         /* Map the high bits of the OID into higher bits of the inode number so
155          * that inodes generated at about the same time have a reduced chance
156          * of collisions. This will give a period of 2^12 = 1024 unique clients
157          * (from SEQ) and up to min(LUSTRE_SEQ_MAX_WIDTH, 2^20) = 128k objects
158          * (from OID), or up to 128M inodes without collisions for new files.
159          */
160         ino = ((seq & 0x000fffffULL) << 12) + ((seq >> 8) & 0xfffff000) +
161               (seq >> (64 - (40-8)) & 0xffffff00) +
162               (fid_oid(fid) & 0xff000fff) + ((fid_oid(fid) & 0x00fff000) << 8);
163
164         return ino ?: fid_oid(fid);
165 }
166
167 static inline bool fid_eq(const lustre_fid *f1, const lustre_fid *f2)
168 {
169         return f1->f_seq == f2->f_seq && f1->f_oid == f2->f_oid &&
170                f1->f_ver == f2->f_ver;
171 }
172
173 static void fid_hash_del(struct fid_rec *f)
174 {
175         if (FID_ON_HASH(f))
176                 hlist_del_init(&f->fr_node);
177 }
178
179 static void fid_hash_add(struct fid_rec *f)
180 {
181         assert(!FID_ON_HASH(f));
182         hlist_add_head(&f->fr_node, &head.lh_hash[FID_HASH_FN(&f->fr_fid)]);
183 }
184
185 static struct fid_rec *fid_hash_find(const lustre_fid *fid)
186 {
187         struct hlist_head *hash_list;
188         struct hlist_node *entry, *next;
189         struct fid_rec *f;
190
191         hash_list = &head.lh_hash[FID_HASH_FN(fid)];
192         hlist_for_each_entry_safe(f, entry, next, hash_list, fr_node) {
193                 assert(FID_ON_HASH(f));
194                 if (fid_eq(fid, &f->fr_fid))
195                         return f;
196         }
197
198         return NULL;
199 }
200
201 static int lsom_setup(void)
202 {
203         int i;
204
205         /* set llapi message level */
206         llapi_msg_set_level(opt.o_verbose);
207
208         memset(&head, 0, sizeof(head));
209         head.lh_hash = malloc(sizeof(struct hlist_head) * FID_HASH_ENTRIES);
210         if (head.lh_hash == NULL) {
211                 llapi_err_noerrno(LLAPI_MSG_ERROR,
212                                  "failed to alloc memory for hash (%zu).",
213                                  sizeof(struct hlist_head) * FID_HASH_ENTRIES);
214                 return -ENOMEM;
215         }
216
217         for (i = 0; i < FID_HASH_ENTRIES; i++)
218                 INIT_HLIST_HEAD(&head.lh_hash[i]);
219
220         INIT_LIST_HEAD(&head.lh_list);
221         return 0;
222 }
223
224 static void lsom_cleanup(void)
225 {
226         free(head.lh_hash);
227 }
228
229 static int lsom_update_one(struct fid_rec *f)
230 {
231         struct stat st;
232         int fd;
233         int rc = 0;
234
235         fd = llapi_open_by_fid(opt.o_mntpt, &f->fr_fid,
236                                O_RDONLY | O_NOATIME);
237         if (fd < 0) {
238                 rc = -errno;
239
240                 /* The file may be deleted, clean the corresponding
241                  * changelog record and ignore this error.
242                  */
243                 if (rc == -ENOENT)
244                         goto clean_up;
245
246                 llapi_error(LLAPI_MSG_ERROR, rc,
247                             "llapi_open_by_fid for " DFID " failed",
248                             PFID(&f->fr_fid));
249                 return rc;
250         }
251
252         if (opt.o_data_sync) {
253                 __u64 dv;
254
255                 /* Flush dirty pages from clients */
256                 rc = llapi_get_data_version(fd, &dv, LL_DV_RD_FLUSH);
257                 if (rc < 0)
258                         llapi_error(LLAPI_MSG_ERROR, errno,
259                                     "failed to sync data for " DFID,
260                                     PFID(&f->fr_fid));
261                 /* ignore this error, continue to sync lsom data */
262         }
263
264         rc = fstat(fd, &st);
265         if (rc < 0) {
266                 llapi_error(LLAPI_MSG_ERROR, rc, "failed to stat FID: " DFID,
267                             PFID(&f->fr_fid));
268                 return rc;
269         }
270
271         /* After call fstat(), it already gets OST attrs to the client,
272          * when close the file, MDS will update the LSOM data itself
273          * according the size and blocks information from the client.
274          */
275         close(fd);
276
277         llapi_printf(LLAPI_MSG_DEBUG,
278                      "record %llu:%llu, updated LSOM for fid " DFID
279                      " size:%lu blocks:%lu\n", f->fr_time, f->fr_index,
280                      PFID(&f->fr_fid), st.st_size, st.st_blocks);
281
282 clean_up:
283         rc = llapi_changelog_clear(opt.o_mdtname,
284                                    opt.o_chlg_user, f->fr_index);
285         if (rc)
286                 llapi_error(LLAPI_MSG_ERROR, rc,
287                             "failed to clear changelog record: %s:%llu",
288                             opt.o_chlg_user, f->fr_index);
289         return rc;
290 }
291
292 static int lsom_start_update(int count)
293 {
294         int rc = 0;
295         int i = 0;
296
297         llapi_printf(LLAPI_MSG_INFO, "Start to sync %d records.\n", count);
298
299         while (i < count) {
300                 struct fid_rec *f;
301
302                 f = list_entry(head.lh_list.next, struct fid_rec, fr_link);
303                 rc = lsom_update_one(f);
304                 if (rc == 0) {
305                         list_del_init(&f->fr_link);
306                         fid_hash_del(f);
307                         free(f);
308                         head.lh_cached_count--;
309                         i++;
310                 } else {
311                         goto out;
312                 }
313         }
314
315 out:
316         return rc;
317 }
318
319 static int lsom_check_sync(void)
320 {
321         int rc = 0;
322         int count;
323
324 repeated:
325         count = 0;
326         if (list_empty(&head.lh_list))
327                 return 0;
328
329         if (head.lh_cached_count > opt.o_cached_fid_hiwm)
330                 count = opt.o_batch_sync_cnt;
331         else {
332                 struct fid_rec *f;
333                 time_t now;
334
335                 /* When the first record in the list was not being
336                  * processed for a long time (more than o_min_age),
337                  * pop the record, start to handle it immediately.
338                  */
339                 now = time(NULL);
340                 f = list_entry(head.lh_list.next, struct fid_rec, fr_link);
341                 if (now > ((f->fr_time >> 30) + opt.o_min_age))
342                         count = 1;
343         }
344
345         if (count > 0)
346                 rc = lsom_start_update(count);
347
348         if (rc == 0 && count == 1)
349                 goto repeated;
350
351         return rc;
352 }
353
354 static void lsom_sort_record_list(struct fid_rec *f)
355 {
356         struct list_head *pos;
357         bool need_move = false;
358
359         for (pos = f->fr_link.next; pos != &head.lh_list; pos = pos->next) {
360                 struct fid_rec *rec = list_entry(pos, struct fid_rec, fr_link);
361
362                 if (f->fr_index > rec->fr_index) {
363                         need_move = true;
364                         continue;
365                 } else {
366                         break;
367                 }
368         }
369
370         if (need_move)
371                 list_move_tail(&f->fr_link, pos);
372 }
373
374 static int process_record(struct changelog_rec *rec)
375 {
376         __u64 index = rec->cr_index;
377         int rc = 0;
378
379         if (rec->cr_type == CL_CLOSE || rec->cr_type == CL_TRUNC ||
380             rec->cr_type == CL_SETATTR) {
381                 struct fid_rec *f;
382
383                 f = fid_hash_find(&rec->cr_tfid);
384                 if (f == NULL) {
385                         f = malloc(sizeof(struct fid_rec));
386                         if (f == NULL) {
387                                 rc = -ENOMEM;
388                                 llapi_error(LLAPI_MSG_ERROR, rc,
389                                             "failed to alloc memory for fid_rec");
390                                 return rc;
391                         }
392
393                         f->fr_fid = rec->cr_tfid;
394                         f->fr_index = index;
395                         f->fr_time = rec->cr_time;
396                         INIT_HLIST_NODE(&f->fr_node);
397                         fid_hash_add(f);
398                         /*
399                          * The newly changelog record index is processed in the
400                          * ascending order, so it is safe to put the record at
401                          * the tail of the ordered list.
402                          */
403                         list_add_tail(&f->fr_link, &head.lh_list);
404                         head.lh_cached_count++;
405                 } else {
406                         f->fr_index = index;
407                         lsom_sort_record_list(f);
408                 }
409         }
410
411         llapi_printf(LLAPI_MSG_DEBUG, "Processed changelog record index:%llu "
412                      "type:%s(0x%x) FID:"DFID"\n", index,
413                      changelog_type2str(__le32_to_cpu(rec->cr_type)),
414                      __le32_to_cpu(rec->cr_type), PFID(&rec->cr_tfid));
415
416         return rc;
417 }
418
419 static unsigned long get_fid_cache_size(int pct)
420 {
421         struct sysinfo sinfo;
422         unsigned long cache_size;
423         int rc;
424
425         rc = sysinfo(&sinfo);
426         if (rc) {
427                 llapi_error(LLAPI_MSG_ERROR, rc, "failed to get sysinfo");
428                 /* ignore this error, just pick some reasonable static
429                  * limit for the cache size (e.g. 256MB, default value).
430                  */
431                 cache_size = DEF_CACHE_SIZE;
432         } else {
433                 /* maximum cached fid size is tunned according to total
434                  * memory size, e.g. 5% of the memroy.
435                  */
436                 cache_size = sinfo.totalram * pct / 100;
437         }
438
439         return cache_size;
440 }
441
442 int main(int argc, char **argv)
443 {
444         int                      c;
445         int                      rc;
446         void                    *chglog_hdlr;
447         struct changelog_rec    *rec;
448         bool                     stop = 0;
449         int                      ret = 0;
450         unsigned long            cache_size = DEF_CACHE_SIZE;
451         char                     fsname[MAX_OBD_NAME + 1];
452         static struct option options[] = {
453                 { "mdt", required_argument, NULL, 'm' },
454                 { "user", required_argument, 0, 'u'},
455                 { "daemonize", no_argument, NULL, 'd'},
456                 { "interval", required_argument, NULL, 'i'},
457                 { "min-age", required_argument, NULL, 'a'},
458                 { "max-cache", required_argument, NULL, 'c'},
459                 { "verbose", no_argument, NULL, 'v'},
460                 { "sync", no_argument, NULL, 's'},
461                 { "help", no_argument, NULL, 'h' },
462                 { NULL }
463         };
464
465         memset(&opt, 0, sizeof(opt));
466         opt.o_data_sync = false;
467         opt.o_verbose = LLAPI_MSG_INFO;
468         opt.o_intv = CHLG_POLL_INTV;
469         opt.o_min_age = REC_MIN_AGE;
470
471         while ((c = getopt_long(argc, argv, "u:hm:dsi:a:c:v", options, NULL))
472                != EOF) {
473                 switch (c) {
474                 default:
475                         rc = -EINVAL;
476                         llapi_error(LLAPI_MSG_ERROR, rc,
477                                     "%s: unknown option '-%c'\n",
478                                     argv[0], optopt);
479                         return rc;
480                 case 'u':
481                         opt.o_chlg_user = optarg;
482                         break;
483                 case 'h':
484                         usage(argv[0]);
485                         break;
486                 case 'm':
487                         opt.o_mdtname = optarg;
488                         break;
489                 case 'd':
490                         opt.o_daemonize = true;
491                         break;
492                 case 'i':
493                         opt.o_intv = atoi(optarg);
494                         if (opt.o_intv < 0) {
495                                 rc = -EINVAL;
496                                 llapi_error(LLAPI_MSG_ERROR, rc,
497                                             "bad value for -i %s", optarg);
498                                 return rc;
499                         }
500                         break;
501                 case 'a':
502                         opt.o_min_age = atoi(optarg);
503                         if (opt.o_min_age < 0) {
504                                 rc = -EINVAL;
505                                 llapi_error(LLAPI_MSG_ERROR, rc,
506                                             "bad value for -a %s", optarg);
507                                 return rc;
508                         }
509                         break;
510                 case 'c':
511                         rc = Parser_size(&cache_size, optarg);
512                         if (rc < 0) {
513                                 rc = -EINVAL;
514                                 llapi_error(LLAPI_MSG_ERROR, rc,
515                                             "bad valud for -c '%s'", optarg);
516                                 return rc;
517                         }
518
519                         /* For value < 100, it is taken as the percentage of
520                          * total memory instead.
521                          */
522                         if (cache_size < 100)
523                                 cache_size = get_fid_cache_size(cache_size);
524                         llapi_printf(LLAPI_MSG_INFO, "Cache size: %lu\n",
525                                      cache_size);
526                         break;
527                 case 'v':
528                         opt.o_verbose++;
529                         break;
530                 case 's':
531                         opt.o_data_sync = true;
532                         break;
533                 }
534         }
535
536         if (argc != optind + 1) {
537                 llapi_err_noerrno(LLAPI_MSG_ERROR,
538                                   "%s: no mount point specified\n", argv[0]);
539                 usage(argv[0]);
540         }
541
542         opt.o_mntpt = argv[optind];
543         rc = llapi_search_fsname(opt.o_mntpt, fsname);
544         if (rc < 0) {
545                 llapi_error(LLAPI_MSG_ERROR, rc,
546                             "cannot find a Lustre file system mounted at '%s'",
547                             opt.o_mntpt);
548                 return rc;
549         }
550
551         if (!opt.o_mdtname)
552                 usage(argv[0]);
553
554         if (!opt.o_chlg_user)
555                 usage(argv[0]);
556
557         if (opt.o_daemonize) {
558                 rc = daemon(1, 1);
559                 if (rc < 0) {
560                         rc = -errno;
561                         llapi_error(LLAPI_MSG_ERROR, rc, "cannot daemonize");
562                         return rc;
563                 }
564
565                 setbuf(stdout, NULL);
566         }
567
568         opt.o_cached_fid_hiwm = cache_size / sizeof(struct fid_rec);
569         opt.o_batch_sync_cnt = opt.o_cached_fid_hiwm / 2;
570
571         rc = lsom_setup();
572         if (rc < 0)
573                 return rc;
574
575         while (!stop) {
576                 bool eof = false;
577
578                 llapi_printf(LLAPI_MSG_DEBUG, "Start receiving records\n");
579                 rc = llapi_changelog_start(&chglog_hdlr,
580                                            CHANGELOG_FLAG_BLOCK |
581                                            CHANGELOG_FLAG_JOBID |
582                                            CHANGELOG_FLAG_EXTRA_FLAGS,
583                                            opt.o_mdtname, 0);
584                 if (rc) {
585                         llapi_error(LLAPI_MSG_ERROR, rc,
586                                     "unable to open changelog of MDT [%s]\n",
587                                     opt.o_mdtname);
588                         return rc;
589                 }
590
591                 while (!eof && !stop) {
592                         rc = llapi_changelog_recv(chglog_hdlr, &rec);
593                         switch (rc) {
594                         case 0:
595                                 rc = process_record(rec);
596                                 if (rc) {
597                                         llapi_error(LLAPI_MSG_ERROR, rc,
598                                                     "failed to process record");
599                                         ret = rc;
600                                 }
601
602                                 llapi_changelog_free(&rec);
603
604                                 rc = lsom_check_sync();
605                                 if (rc) {
606                                         stop = true;
607                                         ret = rc;
608                                 }
609
610                                 break;
611                         case 1: /* EOF */
612                                 llapi_printf(LLAPI_MSG_DEBUG,
613                                              "finished reading [%s]\n",
614                                              opt.o_mdtname);
615                                 eof = true;
616                                 break;
617                         case -EINVAL: /* FS unmounted */
618                         case -EPROTO:  /* error in KUC channel */
619                         default:
620                                 stop = true;
621                                 llapi_error(LLAPI_MSG_ERROR, rc,
622                                             "failed to get changelog record");
623                                 ret = rc;
624                                 break;
625                         }
626                 }
627
628                 /* reach EOF of changelog */
629                 rc = llapi_changelog_fini(&chglog_hdlr);
630                 if (rc) {
631                         llapi_error(LLAPI_MSG_ERROR, rc,
632                                     "unable to close changelog of MDT [%s]",
633                                     opt.o_mdtname);
634                         ret = rc;
635                         return rc;
636                 }
637
638                 if (opt.o_daemonize) {
639                         sleep(opt.o_intv);
640
641                         rc = lsom_check_sync();
642                         if (rc) {
643                                 stop = true;
644                                 ret = rc;
645                         }
646                 } else {
647                         lsom_start_update(head.lh_cached_count);
648                         stop = true;
649                 }
650         }
651
652         lsom_cleanup();
653         return ret;
654 }