Whamcloud - gitweb
LU-17744 ldiskfs: mballoc stats fixes
[fs/lustre-release.git] / lustre / utils / llsom_sync.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; if not, see
18  * http://www.gnu.org/licenses/gpl-2.0.html
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2017, DDN Storage Corporation.
24  */
25 /*
26  * lustre/utils/llsom_sync.c
27  *
28  * Tool for syncing the LSOM xattr.
29  *
30  * Author: Qian Yingjin <qian@ddn.com>
31  */
32
33 #include <stdlib.h>
34 #include <errno.h>
35 #include <getopt.h>
36 #include <unistd.h>
37 #include <fcntl.h>
38 #include <poll.h>
39 #include <assert.h>
40 #include <sys/stat.h>
41 #include <sys/types.h>
42 #include <time.h>
43 #include <linux/unistd.h>
44 #include <linux/kernel.h>
45 #include <sys/sysinfo.h>
46 #include <linux/lustre/lustre_user.h>
47 #include <lustre/lustreapi.h>
48 #include <linux/lustre/lustre_idl.h>
49 #include <linux/lustre/lustre_fid.h>
50 #include <libcfs/util/hash.h>
51 #include <libcfs/util/list.h>
52
/*
 * Given a pointer to a member embedded in a structure, yield a pointer to
 * the enclosing structure of the given type.
 */
#define container_of(ptr, type, member)                                 \
({                                                                      \
        const __typeof__(((type *)0)->member) *__mptr = (ptr);          \
        (type *)((char *)__mptr - offsetof(type, member));              \
})

/* Default for opt.o_intv (main() sleeps this long between polls) */
#define CHLG_POLL_INTV  60
/* Default for opt.o_min_age (compared against the age of a record) */
#define REC_MIN_AGE     600
/* Default FID cache size in bytes: 256MB */
#define DEF_CACHE_SIZE  (256 * 0x100000)
/* One megabyte, in bytes */
#define ONE_MB          0x100000
61
/* Run-time settings; main() fills these in from the command line. */
struct options {
        const char      *o_chlg_user;       /* -u: changelog user id */
        const char      *o_mdtname;         /* -m: MDT whose changelog is read */
        const char      *o_mntpt;           /* positional: Lustre mount point */
        bool             o_daemonize;       /* -d: detach and keep polling */
        bool             o_data_sync;       /* -s: flush data before the update */
        int              o_verbose;         /* llapi message level, -v raises it */
        int              o_intv;            /* -i: sleep between polls */
        int              o_min_age;         /* -a: min record age before sync */
        unsigned long    o_cached_fid_hiwm; /* high watermark of cached FIDs */
        unsigned long    o_batch_sync_cnt;  /* records synced once over hiwm */
};

/* The single global instance of the settings */
struct options opt;
76
77 struct fid_rec {
78         struct hlist_node       fr_node;
79         struct list_head        fr_link;
80         lustre_fid              fr_fid;
81         __u64                   fr_time;
82         __u64                   fr_index;
83 };
84
/* Bit width handed to llapi_fid_hash(); the table has 2^shift buckets */
static const int fid_hash_shift = 6;

/* Number of buckets in head.lh_hash */
#define FID_HASH_ENTRIES        (1 << fid_hash_shift)
/* Non-zero while the record is linked into a hash bucket */
#define FID_ON_HASH(f)          (!hlist_unhashed(&(f)->fr_node))
89
90 struct lsom_head {
91         struct hlist_head       *lh_hash;
92         struct list_head         lh_list; /* ordered list by record index */
93         unsigned long            lh_cached_count;
94 } head;
95
/*
 * Print the command line synopsis to stdout and terminate the process.
 *
 * \param[in] prog      program name shown in the synopsis
 *
 * This function does not return: it always calls exit(0).
 */
static void usage(char *prog)
{
        printf("\nUsage: %s [options] -u <userid> -m <mdtdev> <mntpt>\n"
               "options:\n"
               "\t-d, --daemonize\n"
               "\t-i, --interval, poll interval in seconds\n"
               "\t-a, --min-age, min age before a record is processed.\n"
               "\t-c, --max-cache, percentage of the memory used for cache.\n"
               "\t-s, --sync, data sync when update LSOM xattr\n"
               "\t-v, --verbose, produce more verbose output\n",
               prog);
        exit(0);
}
109
110 static inline bool fid_eq(const lustre_fid *f1, const lustre_fid *f2)
111 {
112         return f1->f_seq == f2->f_seq && f1->f_oid == f2->f_oid &&
113                f1->f_ver == f2->f_ver;
114 }
115
116 static void fid_hash_del(struct fid_rec *f)
117 {
118         if (FID_ON_HASH(f))
119                 hlist_del_init(&f->fr_node);
120 }
121
122 static void fid_hash_add(struct fid_rec *f)
123 {
124         assert(!FID_ON_HASH(f));
125         hlist_add_head(&f->fr_node,
126                        &head.lh_hash[llapi_fid_hash(&f->fr_fid,
127                                               fid_hash_shift)]);
128 }
129
130 static struct fid_rec *fid_hash_find(const lustre_fid *fid)
131 {
132         struct hlist_head *hash_list;
133         struct hlist_node *entry, *next;
134         struct fid_rec *f;
135
136         hash_list = &head.lh_hash[llapi_fid_hash(fid, fid_hash_shift)];
137         hlist_for_each_entry_safe(f, entry, next, hash_list, fr_node) {
138                 assert(FID_ON_HASH(f));
139                 if (fid_eq(fid, &f->fr_fid))
140                         return f;
141         }
142
143         return NULL;
144 }
145
146 static int lsom_setup(void)
147 {
148         int i;
149
150         /* set llapi message level */
151         llapi_msg_set_level(opt.o_verbose);
152
153         memset(&head, 0, sizeof(head));
154         head.lh_hash = malloc(sizeof(struct hlist_head) * FID_HASH_ENTRIES);
155         if (head.lh_hash == NULL) {
156                 llapi_err_noerrno(LLAPI_MSG_ERROR,
157                                  "failed to alloc memory for hash (%zu).",
158                                  sizeof(struct hlist_head) * FID_HASH_ENTRIES);
159                 return -ENOMEM;
160         }
161
162         for (i = 0; i < FID_HASH_ENTRIES; i++)
163                 INIT_HLIST_HEAD(&head.lh_hash[i]);
164
165         INIT_LIST_HEAD(&head.lh_list);
166         return 0;
167 }
168
169 static void lsom_cleanup(void)
170 {
171         free(head.lh_hash);
172 }
173
174 static int lsom_update_one(struct fid_rec *f)
175 {
176         struct stat st;
177         int fd;
178         int rc = 0;
179
180         fd = llapi_open_by_fid(opt.o_mntpt, &f->fr_fid,
181                                O_RDONLY | O_NOATIME);
182         if (fd < 0) {
183                 rc = -errno;
184
185                 /* The file may be deleted, clean the corresponding
186                  * changelog record and ignore this error.
187                  */
188                 if (rc == -ENOENT)
189                         goto clean_up;
190
191                 llapi_error(LLAPI_MSG_ERROR, rc,
192                             "llapi_open_by_fid for " DFID " failed",
193                             PFID(&f->fr_fid));
194                 return rc;
195         }
196
197         if (opt.o_data_sync) {
198                 __u64 dv;
199
200                 /* Flush dirty pages from clients */
201                 rc = llapi_get_data_version(fd, &dv, LL_DV_RD_FLUSH);
202                 if (rc < 0)
203                         llapi_error(LLAPI_MSG_ERROR, errno,
204                                     "failed to sync data for " DFID,
205                                     PFID(&f->fr_fid));
206                 /* ignore this error, continue to sync lsom data */
207         }
208
209         rc = fstat(fd, &st);
210         if (rc < 0) {
211                 llapi_error(LLAPI_MSG_ERROR, rc, "failed to stat FID: " DFID,
212                             PFID(&f->fr_fid));
213                 return rc;
214         }
215
216         /* After call fstat(), it already gets OST attrs to the client,
217          * when close the file, MDS will update the LSOM data itself
218          * according the size and blocks information from the client.
219          */
220         close(fd);
221
222         llapi_printf(LLAPI_MSG_DEBUG,
223                      "record %llu:%llu, updated LSOM for fid " DFID
224                      " size:%lu blocks:%lu\n",
225                      (unsigned long long)f->fr_time,
226                      (unsigned long long)f->fr_index,
227                      PFID(&f->fr_fid), st.st_size, st.st_blocks);
228
229 clean_up:
230         rc = llapi_changelog_clear(opt.o_mdtname,
231                                    opt.o_chlg_user, f->fr_index);
232         if (rc)
233                 llapi_error(LLAPI_MSG_ERROR, rc,
234                             "failed to clear changelog record: %s:%llu",
235                             opt.o_chlg_user, (unsigned long long)f->fr_index);
236         return rc;
237 }
238
239 static int lsom_start_update(int count)
240 {
241         int rc = 0;
242         int i = 0;
243
244         llapi_printf(LLAPI_MSG_INFO, "Start to sync %d records.\n", count);
245
246         while (i < count) {
247                 struct fid_rec *f;
248
249                 f = list_first_entry(&head.lh_list, struct fid_rec, fr_link);
250                 rc = lsom_update_one(f);
251                 if (rc == 0) {
252                         list_del_init(&f->fr_link);
253                         fid_hash_del(f);
254                         free(f);
255                         head.lh_cached_count--;
256                         i++;
257                 } else {
258                         goto out;
259                 }
260         }
261
262 out:
263         return rc;
264 }
265
266 static int lsom_check_sync(void)
267 {
268         int rc = 0;
269         int count;
270
271 repeated:
272         count = 0;
273         if (list_empty(&head.lh_list))
274                 return 0;
275
276         if (head.lh_cached_count > opt.o_cached_fid_hiwm)
277                 count = opt.o_batch_sync_cnt;
278         else {
279                 struct fid_rec *f;
280                 time_t now;
281
282                 /* When the first record in the list was not being
283                  * processed for a long time (more than o_min_age),
284                  * pop the record, start to handle it immediately.
285                  */
286                 now = time(NULL);
287                 f = list_first_entry(&head.lh_list, struct fid_rec, fr_link);
288                 if (now > ((f->fr_time >> 30) + opt.o_min_age))
289                         count = 1;
290         }
291
292         if (count > 0)
293                 rc = lsom_start_update(count);
294
295         if (rc == 0 && count == 1)
296                 goto repeated;
297
298         return rc;
299 }
300
301 static void lsom_sort_record_list(struct fid_rec *f)
302 {
303         struct list_head *pos;
304         bool need_move = false;
305
306         for (pos = f->fr_link.next; pos != &head.lh_list; pos = pos->next) {
307                 struct fid_rec *rec = list_entry(pos, struct fid_rec, fr_link);
308
309                 if (f->fr_index > rec->fr_index) {
310                         need_move = true;
311                         continue;
312                 } else {
313                         break;
314                 }
315         }
316
317         if (need_move)
318                 list_move_tail(&f->fr_link, pos);
319 }
320
321 static int process_record(struct changelog_rec *rec)
322 {
323         __u64 index = rec->cr_index;
324         int rc = 0;
325
326         if (rec->cr_type == CL_CLOSE || rec->cr_type == CL_TRUNC ||
327             rec->cr_type == CL_SETATTR) {
328                 struct fid_rec *f;
329
330                 f = fid_hash_find(&rec->cr_tfid);
331                 if (f == NULL) {
332                         f = malloc(sizeof(struct fid_rec));
333                         if (f == NULL) {
334                                 rc = -ENOMEM;
335                                 llapi_error(LLAPI_MSG_ERROR, rc,
336                                             "failed to alloc memory for fid_rec");
337                                 return rc;
338                         }
339
340                         f->fr_fid = rec->cr_tfid;
341                         f->fr_index = index;
342                         f->fr_time = rec->cr_time;
343                         INIT_HLIST_NODE(&f->fr_node);
344                         fid_hash_add(f);
345                         /*
346                          * The newly changelog record index is processed in the
347                          * ascending order, so it is safe to put the record at
348                          * the tail of the ordered list.
349                          */
350                         list_add_tail(&f->fr_link, &head.lh_list);
351                         head.lh_cached_count++;
352                 } else {
353                         f->fr_index = index;
354                         lsom_sort_record_list(f);
355                 }
356         }
357
358         llapi_printf(LLAPI_MSG_DEBUG,
359                      "Processed changelog record index:%llu type:%s(0x%x) FID:"DFID"\n",
360                      (unsigned long long)index,
361                      changelog_type2str(__le32_to_cpu(rec->cr_type)),
362                      __le32_to_cpu(rec->cr_type), PFID(&rec->cr_tfid));
363
364         return rc;
365 }
366
367 static unsigned long get_fid_cache_size(int pct)
368 {
369         struct sysinfo sinfo;
370         unsigned long cache_size;
371         int rc;
372
373         rc = sysinfo(&sinfo);
374         if (rc) {
375                 llapi_error(LLAPI_MSG_ERROR, rc, "failed to get sysinfo");
376                 /* ignore this error, just pick some reasonable static
377                  * limit for the cache size (e.g. 256MB, default value).
378                  */
379                 cache_size = DEF_CACHE_SIZE;
380         } else {
381                 /* maximum cached fid size is tunned according to total
382                  * memory size, e.g. 5% of the memroy.
383                  */
384                 cache_size = sinfo.totalram * pct / 100;
385         }
386
387         return cache_size;
388 }
389
390 int main(int argc, char **argv)
391 {
392         int c;
393         int rc;
394         void *chglog_hdlr;
395         struct changelog_rec *rec;
396         bool stop = 0;
397         int ret = 0;
398         unsigned long long cache_size = DEF_CACHE_SIZE;
399         char fsname[MAX_OBD_NAME + 1];
400         unsigned long long unit;
401         static struct option options[] = {
402                 { "mdt", required_argument, NULL, 'm' },
403                 { "user", required_argument, 0, 'u'},
404                 { "daemonize", no_argument, NULL, 'd'},
405                 { "interval", required_argument, NULL, 'i'},
406                 { "min-age", required_argument, NULL, 'a'},
407                 { "max-cache", required_argument, NULL, 'c'},
408                 { "verbose", no_argument, NULL, 'v'},
409                 { "sync", no_argument, NULL, 's'},
410                 { "help", no_argument, NULL, 'h' },
411                 { NULL }
412         };
413
414         memset(&opt, 0, sizeof(opt));
415         opt.o_data_sync = false;
416         opt.o_verbose = LLAPI_MSG_INFO;
417         opt.o_intv = CHLG_POLL_INTV;
418         opt.o_min_age = REC_MIN_AGE;
419
420         while ((c = getopt_long(argc, argv, "u:hm:dsi:a:c:v", options, NULL))
421                != EOF) {
422                 switch (c) {
423                 default:
424                         rc = -EINVAL;
425                         llapi_error(LLAPI_MSG_ERROR, rc,
426                                     "%s: unknown option '%c'",
427                                     argv[0], optopt);
428                         return rc;
429                 case 'u':
430                         opt.o_chlg_user = optarg;
431                         break;
432                 case 'h':
433                         usage(argv[0]);
434                         break;
435                 case 'm':
436                         opt.o_mdtname = optarg;
437                         break;
438                 case 'd':
439                         opt.o_daemonize = true;
440                         break;
441                 case 'i':
442                         opt.o_intv = atoi(optarg);
443                         if (opt.o_intv < 0) {
444                                 rc = -EINVAL;
445                                 llapi_error(LLAPI_MSG_ERROR, rc,
446                                             "bad value for -i %s", optarg);
447                                 return rc;
448                         }
449                         break;
450                 case 'a':
451                         opt.o_min_age = atoi(optarg);
452                         if (opt.o_min_age < 0) {
453                                 rc = -EINVAL;
454                                 llapi_error(LLAPI_MSG_ERROR, rc,
455                                             "bad value for -a %s", optarg);
456                                 return rc;
457                         }
458                         break;
459                 case 'c':
460                         unit = ONE_MB;
461                         rc = llapi_parse_size(optarg, &cache_size, &unit, 0);
462                         if (rc < 0) {
463                                 rc = -EINVAL;
464                                 llapi_error(LLAPI_MSG_ERROR, rc,
465                                             "bad valud for -c '%s'", optarg);
466                                 return rc;
467                         }
468
469                         /* For value < 100, it is taken as the percentage of
470                          * total memory instead.
471                          */
472                         if (cache_size < 100)
473                                 cache_size = get_fid_cache_size(cache_size);
474                         llapi_printf(LLAPI_MSG_INFO, "Cache size: %llu\n",
475                                      cache_size);
476                         break;
477                 case 'v':
478                         opt.o_verbose++;
479                         break;
480                 case 's':
481                         opt.o_data_sync = true;
482                         break;
483                 }
484         }
485
486         if (argc != optind + 1) {
487                 llapi_err_noerrno(LLAPI_MSG_ERROR,
488                                   "%s: no mount point specified\n", argv[0]);
489                 usage(argv[0]);
490         }
491
492         opt.o_mntpt = argv[optind];
493         rc = llapi_search_fsname(opt.o_mntpt, fsname);
494         if (rc < 0) {
495                 llapi_error(LLAPI_MSG_ERROR, rc,
496                             "cannot find a Lustre file system mounted at '%s'",
497                             opt.o_mntpt);
498                 return rc;
499         }
500
501         if (!opt.o_mdtname)
502                 usage(argv[0]);
503
504         if (!opt.o_chlg_user)
505                 usage(argv[0]);
506
507         if (opt.o_daemonize) {
508                 rc = daemon(1, 1);
509                 if (rc < 0) {
510                         rc = -errno;
511                         llapi_error(LLAPI_MSG_ERROR, rc, "cannot daemonize");
512                         return rc;
513                 }
514
515                 setbuf(stdout, NULL);
516         }
517
518         opt.o_cached_fid_hiwm = cache_size / sizeof(struct fid_rec);
519         opt.o_batch_sync_cnt = opt.o_cached_fid_hiwm / 2;
520
521         rc = lsom_setup();
522         if (rc < 0)
523                 return rc;
524
525         while (!stop) {
526                 bool eof = false;
527
528                 llapi_printf(LLAPI_MSG_DEBUG, "Start receiving records\n");
529                 rc = llapi_changelog_start(&chglog_hdlr,
530                                            CHANGELOG_FLAG_BLOCK |
531                                            CHANGELOG_FLAG_JOBID |
532                                            CHANGELOG_FLAG_EXTRA_FLAGS,
533                                            opt.o_mdtname, 0);
534                 if (rc) {
535                         llapi_error(LLAPI_MSG_ERROR, rc,
536                                     "unable to open changelog of MDT '%s'",
537                                     opt.o_mdtname);
538                         return rc;
539                 }
540
541                 while (!eof && !stop) {
542                         rc = llapi_changelog_recv(chglog_hdlr, &rec);
543                         switch (rc) {
544                         case 0:
545                                 rc = process_record(rec);
546                                 if (rc) {
547                                         llapi_error(LLAPI_MSG_ERROR, rc,
548                                                     "failed to process record");
549                                         ret = rc;
550                                 }
551
552                                 llapi_changelog_free(&rec);
553
554                                 rc = lsom_check_sync();
555                                 if (rc) {
556                                         stop = true;
557                                         ret = rc;
558                                 }
559
560                                 break;
561                         case 1: /* EOF */
562                                 llapi_printf(LLAPI_MSG_DEBUG,
563                                              "finished reading [%s]\n",
564                                              opt.o_mdtname);
565                                 eof = true;
566                                 break;
567                         case -EINVAL: /* FS unmounted */
568                         case -EPROTO:  /* error in KUC channel */
569                         default:
570                                 stop = true;
571                                 llapi_error(LLAPI_MSG_ERROR, rc,
572                                             "failed to get changelog record");
573                                 ret = rc;
574                                 break;
575                         }
576                 }
577
578                 /* reach EOF of changelog */
579                 rc = llapi_changelog_fini(&chglog_hdlr);
580                 if (rc) {
581                         llapi_error(LLAPI_MSG_ERROR, rc,
582                                     "unable to close changelog of MDT '%s'",
583                                     opt.o_mdtname);
584                         ret = rc;
585                         return rc;
586                 }
587
588                 if (opt.o_daemonize) {
589                         sleep(opt.o_intv);
590
591                         rc = lsom_check_sync();
592                         if (rc) {
593                                 stop = true;
594                                 ret = rc;
595                         }
596                 } else {
597                         lsom_start_update(head.lh_cached_count);
598                         stop = true;
599                 }
600         }
601
602         lsom_cleanup();
603         return ret;
604 }