Whamcloud - gitweb
LU-12705 build: fix building fail against Power9 little endian
[fs/lustre-release.git] / lustre / utils / llsom_sync.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; if not, see
18  * http://www.gnu.org/licenses/gpl-2.0.html
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2017, DDN Storage Corporation.
24  */
25 /*
26  * lustre/utils/llsom_sync.c
27  *
28  * Tool for syncing the LSOM xattr.
29  *
30  * Author: Qian Yingjin <qian@ddn.com>
31  */
32
#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <getopt.h>
#include <unistd.h>
#include <fcntl.h>
#include <poll.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/sysinfo.h>
#include <linux/unistd.h>
#include <linux/kernel.h>
#include <linux/lustre/lustre_user.h>
#include <lustre/lustreapi.h>
#include <linux/lustre/lustre_idl.h>
#include <linux/lustre/lustre_fid.h>
#include <libcfs/util/hash.h>
#include <libcfs/util/list.h>
#include <libcfs/util/parser.h>
53
/*
 * Given a pointer to a member embedded in a structure, yield a pointer to
 * the enclosing structure by subtracting the member's offset.
 *
 * The temporary __mptr is declared with the member's own type, so passing a
 * pointer of an unrelated type draws a compiler diagnostic. Uses the GNU
 * typeof and statement-expression extensions.
 */
#define container_of(ptr, type, member) ({                      \
        const typeof(((type *) 0)->member) * __mptr = (ptr);     \
        (type *) ((char *) __mptr - offsetof(type, member)); })
57
/* Default for opt.o_intv: argument of the sleep() between changelog passes. */
#define CHLG_POLL_INTV  60
/* Default for opt.o_min_age: added to a record's time before it is synced. */
#define REC_MIN_AGE     600
/* Default cache budget in bytes; also the fallback when sysinfo() fails. */
#define DEF_CACHE_SIZE  (256 * 1048576) /* 256MB */
61
/* Run-time settings; main() fills them in from the command line. */
struct options {
        /* changelog user (-u), passed to llapi_changelog_clear() */
        const char      *o_chlg_user;
        /* MDT name (-m), passed to the llapi_changelog_*() calls */
        const char      *o_mdtname;
        /* mount point (positional argument), used to open files by FID */
        const char      *o_mntpt;
        /* -d: call daemon() and keep polling instead of exiting at EOF */
        bool             o_daemonize;
        /* -s: request a data-version flush before each stat */
        bool             o_data_sync;
        /* llapi message level; starts at LLAPI_MSG_INFO, raised by each -v */
        int              o_verbose;
        /* -i: argument of sleep() between changelog passes */
        int              o_intv;
        /* -a: minimum age of the oldest record before it is synced */
        int              o_min_age;
        /* cached record count above which a batch sync is forced */
        unsigned long    o_cached_fid_hiwm; /* high watermark */
        /* number of records synced when the watermark is exceeded */
        unsigned long    o_batch_sync_cnt;
};

/* The single global instance of the settings. */
struct options opt;
76
/*
 * One cached file that still needs an LSOM update. Each record is linked
 * both into a hash bucket (lookup by FID) and into the index-ordered list.
 */
struct fid_rec {
        /* linkage in the head.lh_hash bucket chosen by FID_HASH_FN() */
        struct hlist_node       fr_node;
        /* linkage in head.lh_list */
        struct list_head        fr_link;
        /* target FID copied from the changelog record (cr_tfid) */
        lustre_fid              fr_fid;
        /* cr_time of the record that first created this entry */
        __u64                   fr_time;
        /* cr_index of the latest changelog record seen for this FID */
        __u64                   fr_index;
};
84
/* log2 of the number of hash buckets: 1 << 6 = 64 buckets. */
static const int fid_hash_shift = 6;

/* Number of buckets in head.lh_hash. */
#define FID_HASH_ENTRIES        (1 << fid_hash_shift)
/* True when the record is currently linked into a hash bucket. */
#define FID_ON_HASH(f)          (!hlist_unhashed(&(f)->fr_node))

/*
 * Bucket index for a FID: flatten the FID to an integer that fits a long
 * (32- or 64-bit variant depending on the word size), then hash it down to
 * fid_hash_shift bits.
 */
#if __BITS_PER_LONG == 32
#define FID_HASH_FN(f)  (hash_long(fid_flatten32(f), fid_hash_shift))
#elif __BITS_PER_LONG == 64
#define FID_HASH_FN(f)  (hash_long(fid_flatten(f), fid_hash_shift))
#else
#error Wordsize not 32 or 64
#endif
97
/* Global cache of pending records; the single instance is "head". */
struct lsom_head {
        /* array of FID_HASH_ENTRIES buckets, allocated by lsom_setup() */
        struct hlist_head       *lh_hash;
        struct list_head         lh_list; /* ordered list by record index */
        /* number of fid_rec entries currently linked on lh_list */
        unsigned long            lh_cached_count;
} head;
103
/**
 * Print the command-line synopsis to stdout and terminate the process.
 *
 * \param[in] prog      program name shown in the synopsis
 *
 * This function does not return: it always calls exit(0).
 */
static void usage(char *prog)
{
        printf("\nUsage: %s [options] -u <userid> -m <mdtdev> <mntpt>\n"
               "options:\n"
               "\t-d, --daemonize\n"
               "\t-i, --interval, poll interval in second\n"
               "\t-a, --min-age, min age before a record is processed.\n"
               "\t-c, --max-cache, percentage of the memory used for cache.\n"
               "\t-s, --sync, data sync when update LSOM xattr\n"
               "\t-v, --verbose, produce more verbose output\n",
               prog);
        exit(0);
}
117
118 static inline __u64 fid_flatten(const struct lu_fid *fid)
119 {
120         __u64 ino;
121         __u64 seq;
122
123         if (fid_is_igif(fid)) {
124                 ino = lu_igif_ino(fid);
125                 return ino;
126         }
127
128         seq = fid_seq(fid);
129
130         ino = (seq << 24) + ((seq >> 24) & 0xffffff0000ULL) + fid_oid(fid);
131
132         return ino ?: fid_oid(fid);
133 }
134
135 /**
136  * map fid to 32 bit value for ino on 32bit systems.
137  */
138 static inline __u32 fid_flatten32(const struct lu_fid *fid)
139 {
140         __u32 ino;
141         __u64 seq;
142
143         if (fid_is_igif(fid)) {
144                 ino = lu_igif_ino(fid);
145                 return ino;
146         }
147
148         seq = fid_seq(fid) - FID_SEQ_START;
149
150         /* Map the high bits of the OID into higher bits of the inode number so
151          * that inodes generated at about the same time have a reduced chance
152          * of collisions. This will give a period of 2^12 = 1024 unique clients
153          * (from SEQ) and up to min(LUSTRE_SEQ_MAX_WIDTH, 2^20) = 128k objects
154          * (from OID), or up to 128M inodes without collisions for new files.
155          */
156         ino = ((seq & 0x000fffffULL) << 12) + ((seq >> 8) & 0xfffff000) +
157               (seq >> (64 - (40-8)) & 0xffffff00) +
158               (fid_oid(fid) & 0xff000fff) + ((fid_oid(fid) & 0x00fff000) << 8);
159
160         return ino ?: fid_oid(fid);
161 }
162
163 static inline bool fid_eq(const lustre_fid *f1, const lustre_fid *f2)
164 {
165         return f1->f_seq == f2->f_seq && f1->f_oid == f2->f_oid &&
166                f1->f_ver == f2->f_ver;
167 }
168
169 static void fid_hash_del(struct fid_rec *f)
170 {
171         if (FID_ON_HASH(f))
172                 hlist_del_init(&f->fr_node);
173 }
174
175 static void fid_hash_add(struct fid_rec *f)
176 {
177         assert(!FID_ON_HASH(f));
178         hlist_add_head(&f->fr_node, &head.lh_hash[FID_HASH_FN(&f->fr_fid)]);
179 }
180
181 static struct fid_rec *fid_hash_find(const lustre_fid *fid)
182 {
183         struct hlist_head *hash_list;
184         struct hlist_node *entry, *next;
185         struct fid_rec *f;
186
187         hash_list = &head.lh_hash[FID_HASH_FN(fid)];
188         hlist_for_each_entry_safe(f, entry, next, hash_list, fr_node) {
189                 assert(FID_ON_HASH(f));
190                 if (fid_eq(fid, &f->fr_fid))
191                         return f;
192         }
193
194         return NULL;
195 }
196
197 static int lsom_setup(void)
198 {
199         int i;
200
201         /* set llapi message level */
202         llapi_msg_set_level(opt.o_verbose);
203
204         memset(&head, 0, sizeof(head));
205         head.lh_hash = malloc(sizeof(struct hlist_head) * FID_HASH_ENTRIES);
206         if (head.lh_hash == NULL) {
207                 llapi_err_noerrno(LLAPI_MSG_ERROR,
208                                  "failed to alloc memory for hash (%zu).",
209                                  sizeof(struct hlist_head) * FID_HASH_ENTRIES);
210                 return -ENOMEM;
211         }
212
213         for (i = 0; i < FID_HASH_ENTRIES; i++)
214                 INIT_HLIST_HEAD(&head.lh_hash[i]);
215
216         INIT_LIST_HEAD(&head.lh_list);
217         return 0;
218 }
219
220 static void lsom_cleanup(void)
221 {
222         free(head.lh_hash);
223 }
224
225 static int lsom_update_one(struct fid_rec *f)
226 {
227         struct stat st;
228         int fd;
229         int rc = 0;
230
231         fd = llapi_open_by_fid(opt.o_mntpt, &f->fr_fid,
232                                O_RDONLY | O_NOATIME);
233         if (fd < 0) {
234                 rc = -errno;
235
236                 /* The file may be deleted, clean the corresponding
237                  * changelog record and ignore this error.
238                  */
239                 if (rc == -ENOENT)
240                         goto clean_up;
241
242                 llapi_error(LLAPI_MSG_ERROR, rc,
243                             "llapi_open_by_fid for " DFID " failed",
244                             PFID(&f->fr_fid));
245                 return rc;
246         }
247
248         if (opt.o_data_sync) {
249                 __u64 dv;
250
251                 /* Flush dirty pages from clients */
252                 rc = llapi_get_data_version(fd, &dv, LL_DV_RD_FLUSH);
253                 if (rc < 0)
254                         llapi_error(LLAPI_MSG_ERROR, errno,
255                                     "failed to sync data for " DFID,
256                                     PFID(&f->fr_fid));
257                 /* ignore this error, continue to sync lsom data */
258         }
259
260         rc = fstat(fd, &st);
261         if (rc < 0) {
262                 llapi_error(LLAPI_MSG_ERROR, rc, "failed to stat FID: " DFID,
263                             PFID(&f->fr_fid));
264                 return rc;
265         }
266
267         /* After call fstat(), it already gets OST attrs to the client,
268          * when close the file, MDS will update the LSOM data itself
269          * according the size and blocks information from the client.
270          */
271         close(fd);
272
273         llapi_printf(LLAPI_MSG_DEBUG,
274                      "record %llu:%llu, updated LSOM for fid " DFID
275                      " size:%lu blocks:%lu\n",
276                      (unsigned long long)f->fr_time,
277                      (unsigned long long)f->fr_index,
278                      PFID(&f->fr_fid), st.st_size, st.st_blocks);
279
280 clean_up:
281         rc = llapi_changelog_clear(opt.o_mdtname,
282                                    opt.o_chlg_user, f->fr_index);
283         if (rc)
284                 llapi_error(LLAPI_MSG_ERROR, rc,
285                             "failed to clear changelog record: %s:%llu",
286                             opt.o_chlg_user, (unsigned long long)f->fr_index);
287         return rc;
288 }
289
290 static int lsom_start_update(int count)
291 {
292         int rc = 0;
293         int i = 0;
294
295         llapi_printf(LLAPI_MSG_INFO, "Start to sync %d records.\n", count);
296
297         while (i < count) {
298                 struct fid_rec *f;
299
300                 f = list_entry(head.lh_list.next, struct fid_rec, fr_link);
301                 rc = lsom_update_one(f);
302                 if (rc == 0) {
303                         list_del_init(&f->fr_link);
304                         fid_hash_del(f);
305                         free(f);
306                         head.lh_cached_count--;
307                         i++;
308                 } else {
309                         goto out;
310                 }
311         }
312
313 out:
314         return rc;
315 }
316
317 static int lsom_check_sync(void)
318 {
319         int rc = 0;
320         int count;
321
322 repeated:
323         count = 0;
324         if (list_empty(&head.lh_list))
325                 return 0;
326
327         if (head.lh_cached_count > opt.o_cached_fid_hiwm)
328                 count = opt.o_batch_sync_cnt;
329         else {
330                 struct fid_rec *f;
331                 time_t now;
332
333                 /* When the first record in the list was not being
334                  * processed for a long time (more than o_min_age),
335                  * pop the record, start to handle it immediately.
336                  */
337                 now = time(NULL);
338                 f = list_entry(head.lh_list.next, struct fid_rec, fr_link);
339                 if (now > ((f->fr_time >> 30) + opt.o_min_age))
340                         count = 1;
341         }
342
343         if (count > 0)
344                 rc = lsom_start_update(count);
345
346         if (rc == 0 && count == 1)
347                 goto repeated;
348
349         return rc;
350 }
351
352 static void lsom_sort_record_list(struct fid_rec *f)
353 {
354         struct list_head *pos;
355         bool need_move = false;
356
357         for (pos = f->fr_link.next; pos != &head.lh_list; pos = pos->next) {
358                 struct fid_rec *rec = list_entry(pos, struct fid_rec, fr_link);
359
360                 if (f->fr_index > rec->fr_index) {
361                         need_move = true;
362                         continue;
363                 } else {
364                         break;
365                 }
366         }
367
368         if (need_move)
369                 list_move_tail(&f->fr_link, pos);
370 }
371
372 static int process_record(struct changelog_rec *rec)
373 {
374         __u64 index = rec->cr_index;
375         int rc = 0;
376
377         if (rec->cr_type == CL_CLOSE || rec->cr_type == CL_TRUNC ||
378             rec->cr_type == CL_SETATTR) {
379                 struct fid_rec *f;
380
381                 f = fid_hash_find(&rec->cr_tfid);
382                 if (f == NULL) {
383                         f = malloc(sizeof(struct fid_rec));
384                         if (f == NULL) {
385                                 rc = -ENOMEM;
386                                 llapi_error(LLAPI_MSG_ERROR, rc,
387                                             "failed to alloc memory for fid_rec");
388                                 return rc;
389                         }
390
391                         f->fr_fid = rec->cr_tfid;
392                         f->fr_index = index;
393                         f->fr_time = rec->cr_time;
394                         INIT_HLIST_NODE(&f->fr_node);
395                         fid_hash_add(f);
396                         /*
397                          * The newly changelog record index is processed in the
398                          * ascending order, so it is safe to put the record at
399                          * the tail of the ordered list.
400                          */
401                         list_add_tail(&f->fr_link, &head.lh_list);
402                         head.lh_cached_count++;
403                 } else {
404                         f->fr_index = index;
405                         lsom_sort_record_list(f);
406                 }
407         }
408
409         llapi_printf(LLAPI_MSG_DEBUG,
410                      "Processed changelog record index:%llu type:%s(0x%x) FID:"DFID"\n",
411                      (unsigned long long)index,
412                      changelog_type2str(__le32_to_cpu(rec->cr_type)),
413                      __le32_to_cpu(rec->cr_type), PFID(&rec->cr_tfid));
414
415         return rc;
416 }
417
418 static unsigned long get_fid_cache_size(int pct)
419 {
420         struct sysinfo sinfo;
421         unsigned long cache_size;
422         int rc;
423
424         rc = sysinfo(&sinfo);
425         if (rc) {
426                 llapi_error(LLAPI_MSG_ERROR, rc, "failed to get sysinfo");
427                 /* ignore this error, just pick some reasonable static
428                  * limit for the cache size (e.g. 256MB, default value).
429                  */
430                 cache_size = DEF_CACHE_SIZE;
431         } else {
432                 /* maximum cached fid size is tunned according to total
433                  * memory size, e.g. 5% of the memroy.
434                  */
435                 cache_size = sinfo.totalram * pct / 100;
436         }
437
438         return cache_size;
439 }
440
441 int main(int argc, char **argv)
442 {
443         int                      c;
444         int                      rc;
445         void                    *chglog_hdlr;
446         struct changelog_rec    *rec;
447         bool                     stop = 0;
448         int                      ret = 0;
449         unsigned long            cache_size = DEF_CACHE_SIZE;
450         char                     fsname[MAX_OBD_NAME + 1];
451         static struct option options[] = {
452                 { "mdt", required_argument, NULL, 'm' },
453                 { "user", required_argument, 0, 'u'},
454                 { "daemonize", no_argument, NULL, 'd'},
455                 { "interval", required_argument, NULL, 'i'},
456                 { "min-age", required_argument, NULL, 'a'},
457                 { "max-cache", required_argument, NULL, 'c'},
458                 { "verbose", no_argument, NULL, 'v'},
459                 { "sync", no_argument, NULL, 's'},
460                 { "help", no_argument, NULL, 'h' },
461                 { NULL }
462         };
463
464         memset(&opt, 0, sizeof(opt));
465         opt.o_data_sync = false;
466         opt.o_verbose = LLAPI_MSG_INFO;
467         opt.o_intv = CHLG_POLL_INTV;
468         opt.o_min_age = REC_MIN_AGE;
469
470         while ((c = getopt_long(argc, argv, "u:hm:dsi:a:c:v", options, NULL))
471                != EOF) {
472                 switch (c) {
473                 default:
474                         rc = -EINVAL;
475                         llapi_error(LLAPI_MSG_ERROR, rc,
476                                     "%s: unknown option '-%c'\n",
477                                     argv[0], optopt);
478                         return rc;
479                 case 'u':
480                         opt.o_chlg_user = optarg;
481                         break;
482                 case 'h':
483                         usage(argv[0]);
484                         break;
485                 case 'm':
486                         opt.o_mdtname = optarg;
487                         break;
488                 case 'd':
489                         opt.o_daemonize = true;
490                         break;
491                 case 'i':
492                         opt.o_intv = atoi(optarg);
493                         if (opt.o_intv < 0) {
494                                 rc = -EINVAL;
495                                 llapi_error(LLAPI_MSG_ERROR, rc,
496                                             "bad value for -i %s", optarg);
497                                 return rc;
498                         }
499                         break;
500                 case 'a':
501                         opt.o_min_age = atoi(optarg);
502                         if (opt.o_min_age < 0) {
503                                 rc = -EINVAL;
504                                 llapi_error(LLAPI_MSG_ERROR, rc,
505                                             "bad value for -a %s", optarg);
506                                 return rc;
507                         }
508                         break;
509                 case 'c':
510                         rc = Parser_size(&cache_size, optarg);
511                         if (rc < 0) {
512                                 rc = -EINVAL;
513                                 llapi_error(LLAPI_MSG_ERROR, rc,
514                                             "bad valud for -c '%s'", optarg);
515                                 return rc;
516                         }
517
518                         /* For value < 100, it is taken as the percentage of
519                          * total memory instead.
520                          */
521                         if (cache_size < 100)
522                                 cache_size = get_fid_cache_size(cache_size);
523                         llapi_printf(LLAPI_MSG_INFO, "Cache size: %lu\n",
524                                      cache_size);
525                         break;
526                 case 'v':
527                         opt.o_verbose++;
528                         break;
529                 case 's':
530                         opt.o_data_sync = true;
531                         break;
532                 }
533         }
534
535         if (argc != optind + 1) {
536                 llapi_err_noerrno(LLAPI_MSG_ERROR,
537                                   "%s: no mount point specified\n", argv[0]);
538                 usage(argv[0]);
539         }
540
541         opt.o_mntpt = argv[optind];
542         rc = llapi_search_fsname(opt.o_mntpt, fsname);
543         if (rc < 0) {
544                 llapi_error(LLAPI_MSG_ERROR, rc,
545                             "cannot find a Lustre file system mounted at '%s'",
546                             opt.o_mntpt);
547                 return rc;
548         }
549
550         if (!opt.o_mdtname)
551                 usage(argv[0]);
552
553         if (!opt.o_chlg_user)
554                 usage(argv[0]);
555
556         if (opt.o_daemonize) {
557                 rc = daemon(1, 1);
558                 if (rc < 0) {
559                         rc = -errno;
560                         llapi_error(LLAPI_MSG_ERROR, rc, "cannot daemonize");
561                         return rc;
562                 }
563
564                 setbuf(stdout, NULL);
565         }
566
567         opt.o_cached_fid_hiwm = cache_size / sizeof(struct fid_rec);
568         opt.o_batch_sync_cnt = opt.o_cached_fid_hiwm / 2;
569
570         rc = lsom_setup();
571         if (rc < 0)
572                 return rc;
573
574         while (!stop) {
575                 bool eof = false;
576
577                 llapi_printf(LLAPI_MSG_DEBUG, "Start receiving records\n");
578                 rc = llapi_changelog_start(&chglog_hdlr,
579                                            CHANGELOG_FLAG_BLOCK |
580                                            CHANGELOG_FLAG_JOBID |
581                                            CHANGELOG_FLAG_EXTRA_FLAGS,
582                                            opt.o_mdtname, 0);
583                 if (rc) {
584                         llapi_error(LLAPI_MSG_ERROR, rc,
585                                     "unable to open changelog of MDT [%s]\n",
586                                     opt.o_mdtname);
587                         return rc;
588                 }
589
590                 while (!eof && !stop) {
591                         rc = llapi_changelog_recv(chglog_hdlr, &rec);
592                         switch (rc) {
593                         case 0:
594                                 rc = process_record(rec);
595                                 if (rc) {
596                                         llapi_error(LLAPI_MSG_ERROR, rc,
597                                                     "failed to process record");
598                                         ret = rc;
599                                 }
600
601                                 llapi_changelog_free(&rec);
602
603                                 rc = lsom_check_sync();
604                                 if (rc) {
605                                         stop = true;
606                                         ret = rc;
607                                 }
608
609                                 break;
610                         case 1: /* EOF */
611                                 llapi_printf(LLAPI_MSG_DEBUG,
612                                              "finished reading [%s]\n",
613                                              opt.o_mdtname);
614                                 eof = true;
615                                 break;
616                         case -EINVAL: /* FS unmounted */
617                         case -EPROTO:  /* error in KUC channel */
618                         default:
619                                 stop = true;
620                                 llapi_error(LLAPI_MSG_ERROR, rc,
621                                             "failed to get changelog record");
622                                 ret = rc;
623                                 break;
624                         }
625                 }
626
627                 /* reach EOF of changelog */
628                 rc = llapi_changelog_fini(&chglog_hdlr);
629                 if (rc) {
630                         llapi_error(LLAPI_MSG_ERROR, rc,
631                                     "unable to close changelog of MDT [%s]",
632                                     opt.o_mdtname);
633                         ret = rc;
634                         return rc;
635                 }
636
637                 if (opt.o_daemonize) {
638                         sleep(opt.o_intv);
639
640                         rc = lsom_check_sync();
641                         if (rc) {
642                                 stop = true;
643                                 ret = rc;
644                         }
645                 } else {
646                         lsom_start_update(head.lh_cached_count);
647                         stop = true;
648                 }
649         }
650
651         lsom_cleanup();
652         return ret;
653 }