2 * 2003, Copyright, Hewlett-Packard Development Company, LP.
4 * Developed under the sponsorship of the U.S. Government
5 * under Subcontract No. B514193
9 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
10 * Use is subject to license terms.
12 * Copyright (c) 2012, 2015, Intel Corporation.
18 #include <sys/types.h>
29 #include <sys/ioctl.h>
31 #include <sys/xattr.h>
36 #include <lustre/lustreapi.h> /* for O_LOV_DELAY_CREATE */
/*
 * Progress-report tuning used by DISPLAY_PROGRESS():
 *   CHECK_COUNT   - the clock is only sampled when nops is a multiple of this
 *   DISPLAY_COUNT - a report is printed whenever nops is a multiple of this
 *   DISPLAY_TIME  - ... or when more than this many seconds have passed since
 *                   the previous report
 */
38 #define CHECK_COUNT 10000
39 #define DISPLAY_COUNT (CHECK_COUNT * 10)
40 #define DISPLAY_TIME 100
/*
 * Long-option table passed to getopt_long() by process_args().  Operation
 * selectors and boolean switches take no argument; numeric and format
 * options require one.  The .val constants are defined outside this excerpt.
 *
 * process_args() walks the table until it reaches an entry whose .name is
 * NULL, so the table has to end with such a sentinel (not visible here).
 */
74 struct option longOpts[] = {
75 { .name = "create", .has_arg = no_argument, .val = CREATE },
76 { .name = "lookup", .has_arg = no_argument, .val = LOOKUP },
77 { .name = "mknod", .has_arg = no_argument, .val = MKNOD },
78 { .name = "open", .has_arg = no_argument, .val = OPEN },
79 { .name = "stat", .has_arg = no_argument, .val = STAT },
80 { .name = "unlink", .has_arg = no_argument, .val = UNLINK },
81 { .name = "begin", .has_arg = required_argument, .val = BEGIN },
82 { .name = "iters", .has_arg = required_argument, .val = ITERS },
83 /* time is in seconds */
84 { .name = "time", .has_arg = required_argument, .val = TIME },
85 { .name = "dirfmt", .has_arg = required_argument, .val = DIRFMT },
86 { .name = "ndirs", .has_arg = required_argument, .val = NDIRS },
87 { .name = "filefmt", .has_arg = required_argument, .val = FILEFMT },
88 { .name = "nfiles", .has_arg = required_argument, .val = NFILES },
89 { .name = "noexcl", .has_arg = no_argument, .val = NOEXCL },
90 { .name = "stripes", .has_arg = required_argument, .val = STRIPES },
91 { .name = "seed", .has_arg = required_argument, .val = SEED },
92 { .name = "seedfile", .has_arg = required_argument, .val = SEEDFILE },
93 { .name = "random_order", .has_arg = no_argument, .val = RANDOM },
94 { .name = "readdir_order", .has_arg = no_argument, .val = READDIR },
95 { .name = "recreate", .has_arg = no_argument, .val = RECREATE },
96 { .name = "setxattr", .has_arg = no_argument, .val = SETXATTR },
97 { .name = "smallwrite", .has_arg = no_argument, .val = SMALLWRITE },
98 { .name = "ignore", .has_arg = no_argument, .val = IGNORE },
99 { .name = "verbose", .has_arg = no_argument, .val = VERBOSE },
100 { .name = "debug", .has_arg = no_argument, .val = DEBUG },
101 { .name = "help", .has_arg = no_argument, .val = HELP },
102 { .name = "mdtcount", .has_arg = required_argument, .val = MDTCOUNT },
103 { .name = "mntcount", .has_arg = required_argument, .val = MNTCOUNT },
104 { .name = "mntfmt", .has_arg = required_argument, .val = MNT },
/* Name of the local host; stays "unknown" unless gethostname() fills it in. */
114 char hostname[512] = "unknown";
/* Flags handed to open(); process_args() adjusts them from the options. */
117 int openflags = O_RDWR|O_CREAT|O_EXCL;
/* Shell command that process_args() builds and runs with system() to create
 * the working directory. */
121 char mkdir_cmd[PATH_MAX+14];
/* Most recent entry returned by readdir(). */
125 struct dirent *dir_entry;
/* printf-style template for file names (process_args() defaults it to
 * "f%d") and the buffer that receives each formatted name. */
127 char filefmt[PATH_MAX];
128 char filename[PATH_MAX];
/* Signal disposition that main() installs for SIGALRM. */
137 struct sigaction act;
/* Name of the extended attribute passed to setxattr()/fsetxattr(). */
146 char xattrname[] = "user.mdsrate";
148 /* max xattr name + value length is block size, use 4000 here to avoid ENOSPC */
/*
 * Debug trace helper: forwards its arguments to printf() only when the
 * global `debug` flag is non-zero.
 *
 * Written as a function-like macro wrapped in do { } while (0) so that it
 * always expands to exactly one statement.  A bare `if (debug) printf`
 * expansion would capture a following `else` when dmesg() is used as the
 * body of an unbraced if/else.
 */
#define dmesg(...) do { if (debug) printf(__VA_ARGS__); } while (0)
/*
 * Progress report for the timed loops.  Reads and updates nops, lastOps,
 * lastTime, curTime, interval, rate and startTime (all locals of main()),
 * plus the globals verbose, myrank and cmd.
 *
 * When verbose is set and nops is a multiple of CHECK_COUNT, the clock is
 * read with MPI_Wtime(); a line with the rate since the previous report and
 * the running totals is printed once more than DISPLAY_TIME seconds have
 * passed or nops is a multiple of DISPLAY_COUNT.
 *
 * NOTE(review): the expansion is a bare { } block rather than
 * do { } while (0), so "DISPLAY_PROGRESS();" leaves an empty statement
 * behind - keep it out of unbraced if/else bodies.
 */
157 #define DISPLAY_PROGRESS() { \
158 if (verbose && (nops % CHECK_COUNT == 0)) { \
159 curTime = MPI_Wtime(); \
160 interval = curTime - lastTime; \
161 if (interval > DISPLAY_TIME || nops % DISPLAY_COUNT == 0) { \
162 rate = (double)(nops - lastOps)/interval; \
163 printf("Rank %d: %.2f %ss/sec %.2f secs " \
164 "(total: %d %ss %.2f secs)\n", \
165 myrank, rate, cmd, interval, \
166 nops, cmd, curTime - startTime); \
168 lastTime = curTime; \
/*
 * Usage text printed by usage().  It is used as a printf format: the single
 * %s receives the program name.
 */
173 char *usage_msg = "usage: %s\n"
174 " { --create [ --noexcl | --setxattr | --smallwrite ] |\n"
175 " --lookup | --mknod [ --setxattr ] | --open |\n"
176 " --stat | --unlink [ --recreate ] [ --ignore ] |\n"
178 " [ --help ] [ --verbose ] [ --debug ]\n"
179 " { [ --begin <num> ] --nfiles <num> }\n"
180 " [ --iters <num> ] [ --time <secs> ]\n"
181 " [ --dirfmt <str> ] [ --ndirs <num> ]\n"
182 " [ --filefmt <str> ] [ --stripes <num> ]\n"
183 " [ --random_order [--seed <num> | --seedfile <file>] ]\n"
184 " [ --readdir_order ] [ --mntfmt <str> ]\n"
185 " [ --mntcount <num> ] [ --mdtcount <num> ]\n"
186 " [ --setxattr ] }\n";
/*
 * Print a printf-style diagnostic followed by the usage text, then exit.
 * The exit status is 1 when stream is stderr and 0 otherwise.
 *
 * stream: receives the "<prog>: " prefix and the usage text
 * fmt:    format of the diagnostic, followed by its arguments
 *
 * NOTE(review): the diagnostic itself is written with vfprintf(stderr, ...)
 * although the prefix and the usage text go to `stream`; unless that is
 * intentional it should be written to `stream` as well.
 */
189 usage(FILE *stream, char *fmt, ...)
195 fprintf(stream, "%s: ", prog);
197 vfprintf(stderr, fmt, ap);
200 fprintf(stream, usage_msg, prog);
204 exit(stream == stderr);
207 /* Print process myrank and message, and exit (i.e. a fatal error) */
/*
 * rank: only the process whose myrank equals this value prints the message
 * fmt:  printf-style format of the message, followed by its arguments
 *
 * The message goes to stderr with a "rank N: " prefix, and
 * MPI_Abort(MPI_COMM_WORLD, 1) is called to bring the job down.
 */
209 fatal(int rank, const char *fmt, ...)
211 if (rank == myrank) {
214 fprintf(stderr, "rank %d: ", rank);
216 vfprintf(stderr, fmt, ap);
220 MPI_Abort(MPI_COMM_WORLD, 1);
/*
 * Handler that main() installs for SIGALRM through sigaction().
 * NOTE(review): the body is outside this excerpt; presumably it sets
 * alarm_caught, which the timed loops in main() test - confirm.
 */
225 sigalrm_handler(int signum)
/*
 * Parse and validate the command line, then prepare the working directory.
 *
 * argc, argv: the argument vector main() received.
 *
 * Steps visible in this excerpt:
 *  - set defaults: prog from argv[0], filefmt "f%d", hostname;
 *  - derive the short-option string from longOpts and loop over
 *    getopt_long(), reporting bad values with fatal() and bad option
 *    combinations with usage();
 *  - check cross-option requirements (mntfmt together with mnt_count;
 *    nfiles / iters / time depending on the mode);
 *  - derive dirnum, dirthreads, offset and end for this rank from myrank,
 *    nthreads, ndirs, begin and nfiles;
 *  - format the directory name into dir, build mkdir_cmd ("lfs mkdir -i ..."
 *    or "mkdir -p ...") and run it with system(); a fatal error is raised
 *    if the command fails or the process cannot chdir to dir.
 *
 * Uses myrank and nthreads, which main() obtains from MPI before calling
 * this function.
 */
231 process_args(int argc, char *argv[])
234 int i, index, offset, tmpend, rc;
241 prog = basename(argv[0]);
242 strcpy(filefmt, "f%d");
243 gethostname(hostname, sizeof(hostname));
245 /* auto create shortOpts rather than maintaining a static string. */
246 for (opt = longOpts, cp = shortOpts; opt->name != NULL; opt++, cp++) {
252 while ((rc = getopt_long(argc,argv, shortOpts, longOpts,&index)) != -1) {
255 openflags &= ~(O_CREAT|O_EXCL);
262 fatal(0, "Invalid - more than one operation "
264 longOpts[index].name);
267 cmd = (char *)longOpts[index].name;
270 if (mode != CREATE && mode != MKNOD) {
271 usage(stderr, "--noexcl only applies to "
272 "--create or --mknod.\n");
274 openflags &= ~O_EXCL;
277 if (mode != UNLINK) {
278 usage(stderr, "--recreate only makes sense"
286 cmd = (char *)longOpts[index].name;
287 } else if (mode == CREATE || mode == MKNOD) {
290 usage(stderr, "--setxattr only makes sense "
291 "with --create, --mknod or alone.\n");
296 usage(stderr, "--smallwrite only applies to "
301 begin = strtol(optarg, &endptr, 0);
302 if ((*endptr != 0) || (begin < 0)) {
/* NOTE(review): the message says --start but the option is named --begin. */
303 fatal(0, "Invalid --start value.\n");
307 iters = strtol(optarg, &endptr, 0);
308 if ((*endptr != 0) || (iters <= 0)) {
309 fatal(0, "Invalid --iters value.\n");
311 if (mode != LOOKUP && mode != OPEN) {
312 usage(stderr, "--iters only makes sense with "
313 "--lookup or --open.\n");
317 seconds = strtol(optarg, &endptr, 0);
318 if ((*endptr != 0) || (seconds <= 0)) {
319 fatal(0, "Invalid --time value.\n");
323 if (strlen(optarg) > (PATH_MAX - 16)) {
324 fatal(0, "--dirfmt too long\n");
329 ndirs = strtol(optarg, &endptr, 0);
330 if ((*endptr != 0) || (ndirs <= 0)) {
331 fatal(0, "Invalid --ndirs value.\n");
333 if ((ndirs > nthreads) &&
334 ((mode == CREATE) || (mode == MKNOD))) {
335 fatal(0, "--ndirs=%d must be less than or "
336 "equal to the number of threads (%d).\n",
341 if (strlen(optarg) > 4080) {
342 fatal(0, "--filefmt too long\n");
/* NOTE(review): the user-supplied string is used as the format itself and
 * the write into filefmt is not bounded by sizeof(filefmt); the length
 * check above is the only guard.  Consider snprintf(). */
345 /* Use %%d where you want the file # in the name. */
346 sprintf(filefmt, optarg, myrank);
349 nfiles = strtol(optarg, &endptr, 0);
350 if ((*endptr != 0) || (nfiles <= 0)) {
351 fatal(0, "Invalid --nfiles value.\n");
355 stripes = strtol(optarg, &endptr, 0);
356 if ((*endptr != 0) || (stripes < 0)) {
357 fatal(0, "Invalid --stripes value.\n");
361 openflags |= O_LOV_DELAY_CREATE;
363 fatal(0, "non-zero --stripes value "
364 "not yet supported.\n");
369 seed = strtoul(optarg, &endptr, 0);
371 fatal(0, "bad --seed option %s\n", optarg);
375 seed_file = fopen(optarg, "r");
377 fatal(myrank, "fopen(%s) error: %s\n",
378 optarg, strerror(errno));
381 for (i = -1; fgets(tmp, 16, seed_file) != NULL;) {
387 rc = sscanf(tmp, "%d", &seed);
388 if ((rc != 1) || (seed < 0)) {
389 fatal(myrank, "Invalid seed value '%s' "
390 "at line %d in %s.\n",
394 fatal(myrank, "File '%s' too short. Does not "
395 "contain a seed for thread %d.\n",
403 if (mode != LOOKUP && mode != OPEN) {
404 fatal(0, "--%s can only be specified with "
405 "--lookup, or --open.\n",
406 (char *)longOpts[index].name);
/* NOTE(review): the messages below mention --mnt, --mnt_count and
 * --mdt_count, but longOpts spells these options --mntfmt, --mntcount and
 * --mdtcount. */
422 if (strlen(optarg) > (PATH_MAX - 16))
423 fatal(0, "--mnt too long\n");
427 mnt_count = strtol(optarg, &endptr, 0);
428 if ((*endptr != 0) || (mnt_count <= 0)) {
429 fatal(0, "Invalid --mnt_count value %s.\n",
434 mdt_count = strtol(optarg, &endptr, 0);
435 if ((*endptr != 0) || (mdt_count <= 0)) {
436 fatal(0, "Invalid --mdt_count value %s.\n",
441 usage(stderr, "unrecognized option: '%c'.\n", optopt);
446 usage(stderr, "too many arguments %d >= %d.\n", optind, argc);
449 if ((mnt_count != -1 && mntfmt == NULL) ||
450 (mnt_count == -1 && mntfmt != NULL)) {
451 usage(stderr, "mnt_count and mntfmt must be specified at the "
455 if (mode == CREATE || mode == MKNOD || mode == UNLINK ||
456 mode == STAT || mode == SETXATTR) {
460 } else if (nfiles == 0) {
461 usage(stderr, "--nfiles or --time must be specified "
464 } else if (mode == LOOKUP || mode == OPEN) {
468 } else if (iters == 0) {
/* NOTE(review): "specifed" is misspelled in this and the following usage
 * messages. */
469 usage(stderr, "--iters or --time must be specifed "
474 usage(stderr, "--nfiles must be specifed with --%s.\n",
479 int fd = open("/dev/urandom", O_RDONLY);
482 if (read(fd, &seed, sizeof(seed)) <
493 dmesg("%s: rank %d seed %d (%s).\n", prog, myrank, seed,
494 (order == RANDOM) ? "random_order" : "readdir_order");
496 usage(stderr, "one --create, --mknod, --open, --stat,"
498 " --unlink or --setxattr must be specifed.");
501 /* support for multiple threads in a dir, set begin/end appropriately.*/
502 dirnum = myrank % ndirs;
503 dirthreads = nthreads / ndirs;
504 if (nthreads > (ndirs * dirthreads + dirnum))
507 offset = myrank / ndirs;
509 tmpend = begin + nfiles - 1;
513 end = begin + (nfiles / dirthreads) * dirthreads + offset;
514 if ((end > tmpend) || (end <= 0))
517 /* make sure mnt_count <= nthreads, otherwise it might div 0 in
518 * the following test */
519 if (mnt_count > nthreads)
520 mnt_count = nthreads;
528 dmesg("%d: iters %d nfiles %d time %d begin %d end %d dirthreads %d."
529 "\n", myrank, iters, nfiles, seconds, begin, end, dirthreads);
531 if (dirfmt == NULL) {
536 if (mntfmt != NULL) {
537 sprintf(dir, mntfmt, (myrank / (nthreads/mnt_count)));
539 dir_len = strlen(dir);
541 sprintf(dir + dir_len, dirfmt, dirnum);
545 if (stat(dir, &sb) == 0) {
546 if (!S_ISDIR(sb.st_mode))
547 fatal(myrank, "'%s' is not dir\n", dir);
548 } else if (errno == ENOENT) {
549 sprintf(mkdir_cmd, "lfs mkdir -i %d %s",
550 myrank % mdt_count, dir);
552 fatal(myrank, "'%s' stat failed\n", dir);
555 sprintf(mkdir_cmd, "mkdir -p %s", dir);
558 dmesg("%d: %s\n", myrank, mkdir_cmd);
559 #ifdef _LIGHTWEIGHT_KERNEL
560 printf("NOTICE: not running system(%s)\n", mkdir_cmd);
562 rc = system(mkdir_cmd);
564 fatal(myrank, "'%s' failed.\n", mkdir_cmd);
569 fatal(myrank, "unable to chdir to '%s'.\n", dir);
/*
 * Produce the name of the next file to operate on.
 *
 * When order == RANDOM the name is formatted into the global filename
 * buffer from filefmt and random() % nfiles.  Otherwise the next entry of
 * the open `directory` stream is read with readdir(); when that returns
 * NULL the stream is rewound and the first entry whose name does not start
 * with '.' is returned.  If the rewound stream yields no such entry,
 * fatal() reports that the directory could not be read.
 *
 * In the readdir case the returned pointer is dir_entry->d_name.
 */
574 static inline char *next_file()
576 if (order == RANDOM) {
577 sprintf(filename, filefmt, random() % nfiles);
583 dir_entry = readdir(directory);
584 if (dir_entry == NULL) {
585 rewinddir(directory);
586 while ((dir_entry = readdir(directory)) != NULL) {
587 if (dir_entry->d_name[0] != '.')
588 return(dir_entry->d_name);
591 fatal(myrank, "unable to read directory %s (%s).\n",
592 dir, strerror(errno));
595 return(dir_entry->d_name);
/*
 * Program entry point.  Flow visible in this excerpt:
 *  1. MPI_Init(), MPI_Comm_size() into nthreads, MPI_Comm_rank() into
 *     myrank, then process_args().
 *  2. Unless the mode is CREATE or MKNOD, ignore is set, or the mode is
 *     UNLINK without recreate: pre-create this rank's files, walking from
 *     end down to begin in steps of dirthreads.
 *  3. For order == READDIR: open dir and advance past a random number of
 *     directory entries.
 *  4. Install sigalrm_handler for SIGALRM, synchronise on a barrier and
 *     record the start time.
 *  5. Run the loop for the selected operation (open/create, lookup, mknod,
 *     open, stat, unlink, setxattr); each loop ends when its file range or
 *     iteration count is exhausted or alarm_caught becomes set.
 *  6. Barrier, then MPI_Reduce() nops, interval and rate with MPI_SUM to
 *     root 0 and print the effective, aggregate and average rates.
 *  7. A loop restarting at beginsave re-opens the files ("recreate"), and
 *     the finish time is printed.
 *
 * NOTE(review): `ctime(×tamp)` (four occurrences) looks like a mis-encoded
 * `ctime(&timestamp)` - restore it from the original source.
 * NOTE(review): some loops format filename with snprintf() bounded by
 * sizeof(filename) while the pre-create, stat, unlink and recreate loops
 * use unbounded sprintf(); consider making them consistent.
 */
599 main(int argc, char *argv[])
601 int i, j, fd, rc, nops, lastOps;
603 double ag_interval = 0;
605 double rate, avg_rate, effective_rate;
606 double startTime, curTime, lastTime, interval;
610 rc = MPI_Init(&argc, &argv);
611 if (rc != MPI_SUCCESS)
612 fatal(myrank, "MPI_Init failed: %d\n", rc);
614 rc = MPI_Comm_size(MPI_COMM_WORLD, &nthreads);
615 if (rc != MPI_SUCCESS)
616 fatal(myrank, "MPI_Comm_size failed: %d\n", rc);
618 rc = MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
619 if (rc != MPI_SUCCESS)
620 fatal(myrank, "MPI_Comm_rank failed: %d\n", rc);
622 process_args(argc, argv);
625 if ((myrank == 0) || debug) {
626 printf("%d: %s starting at %s",
627 myrank, hostname, ctime(×tamp));
630 /* if we're not measuring creation rates then precreate
631 * the files we're operating on. */
632 if ((mode != CREATE) && (mode != MKNOD) && !ignore &&
633 (mode != UNLINK || recreate)) {
634 /* create the files in reverse order. When we encounter
635 * a file that already exists, assume the remainder of
636 * the files exist to save time. The timed performance
637 * test scripts make use of this behavior. */
638 for (i = end, j = 0; i >= begin; i -= dirthreads) {
639 sprintf(filename, filefmt, i);
640 fd = open(filename, openflags, 0644);
645 fatal(myrank, "precreate open(%s) error: %s\n",
646 filename, strerror(rc));
651 dmesg("%d: %s pre-created %d files.\n",myrank,hostname,j);
653 rc = MPI_Barrier(MPI_COMM_WORLD);
654 if (rc != MPI_SUCCESS)
655 fatal(myrank, "prep MPI_Barrier failed: %d\n", rc);
658 if (order == READDIR) {
659 directory = opendir(dir);
660 if (directory == NULL) {
662 fatal(myrank, "opendir(%s) error: %s\n",
667 j = random() % nfiles;
668 dmesg("%d: %s initializing dir offset %u: %s",
669 myrank, hostname, j, ctime(×tamp));
671 for (i = 0; i <= j; i++) {
672 if ((dir_entry = readdir(directory)) == NULL) {
673 fatal(myrank, "could not read entry number %d "
674 "in directory %s.\n", i, dir);
679 dmesg("%d: index %d, filename %s, offset %ld: "
680 "%s initialization complete: %s",
681 myrank, i, dir_entry->d_name, telldir(directory),
682 hostname, ctime(×tamp));
686 act.sa_handler = sigalrm_handler;
687 (void)sigemptyset(&act.sa_mask);
689 sigaction(SIGALRM, &act, NULL);
693 rc = MPI_Barrier(MPI_COMM_WORLD);
694 if (rc != MPI_SUCCESS)
695 fatal(myrank, "prep MPI_Barrier failed: %d\n", rc);
697 startTime = lastTime = MPI_Wtime();
702 for (; begin <= end && !alarm_caught; begin += dirthreads) {
703 snprintf(filename, sizeof(filename), filefmt, begin);
704 fd = open(filename, openflags, 0644);
707 if (rc == EINTR && alarm_caught)
709 fatal(myrank, "open(%s) error: %s\n",
710 filename, strerror(rc));
714 rc = fsetxattr(fd, xattrname, xattrbuf,
715 xattrlen, XATTR_CREATE);
718 if (rc == EINTR && alarm_caught)
721 "setxattr(%s) error: %s\n",
722 filename, strerror(rc));
726 rc = write(fd, xattrbuf, xattrlen);
729 if (rc == EINTR && alarm_caught)
732 "write(%s) error: %s\n",
733 filename, strerror(rc));
742 dmesg("%d: created %d files, last file '%s'.\n",
743 myrank, nops, filename);
746 fd = open(dir, O_RDONLY);
748 fatal(myrank, "open(dir == '%s') error: %s\n",
749 dir, strerror(errno));
752 for (; nops < iters && !alarm_caught;) {
/* NOTE(review): this local shadows the global filename buffer. */
753 char *filename = next_file();
754 rc = llapi_file_lookup(fd, filename);
756 if (((rc = errno) == EINTR) && alarm_caught)
758 fatal(myrank, "llapi_file_lookup(%s) "
759 "error: %s\n", filename, strerror(rc));
767 for (; begin <= end && !alarm_caught; begin += dirthreads) {
768 snprintf(filename, sizeof(filename), filefmt, begin);
769 rc = mknod(filename, S_IFREG | 0644, 0);
772 if (rc == EINTR && alarm_caught)
774 fatal(myrank, "mknod(%s) error: %s\n",
775 filename, strerror(rc));
779 rc = setxattr(filename, xattrname, xattrbuf,
780 xattrlen, XATTR_CREATE);
783 if (rc == EINTR && alarm_caught)
786 "setxattr(%s) error: %s\n",
787 filename, strerror(rc));
796 for (; nops < iters && !alarm_caught;) {
798 if ((fd = open(file, openflags, 0644)) < 0) {
799 if (((rc = errno) == EINTR) && alarm_caught)
801 fatal(myrank, "open(%s) error: %s\n",
812 for (; begin <= end && !alarm_caught; begin += dirthreads) {
813 sprintf(filename, filefmt, begin);
814 rc = stat(filename, &statbuf);
816 if (((rc = errno) == EINTR) && alarm_caught)
818 if (((rc = errno) == ENOENT) && ignore)
820 fatal(myrank, "stat(%s) error: %s\n",
821 filename, strerror(rc));
829 for (; begin <= end && !alarm_caught; begin += dirthreads) {
830 sprintf(filename, filefmt, begin);
831 rc = unlink(filename);
833 if (((rc = errno) == EINTR) && alarm_caught)
835 if ((rc = errno) == ENOENT) {
838 /* no more files to unlink */
841 fatal(myrank, "unlink(%s) error: %s\n",
842 filename, strerror(rc));
850 for (; begin <= end && !alarm_caught; begin += dirthreads) {
851 snprintf(filename, sizeof(filename), filefmt, begin);
852 rc = setxattr(filename, xattrname, xattrbuf, xattrlen,
856 if (rc == EINTR && alarm_caught)
858 if (rc == ENOENT && ignore)
860 fatal(myrank, "setxattr(%s) error: %s\n",
861 filename, strerror(rc));
870 rc = MPI_Barrier(MPI_COMM_WORLD);
871 if (rc != MPI_SUCCESS)
872 fatal(myrank, "prep MPI_Barrier failed: %d\n", rc);
873 curTime = MPI_Wtime();
874 interval = curTime - startTime;
875 rate = (double) (nops) / interval;
877 rc = MPI_Reduce(&nops, &ag_ops, 1, MPI_INT, MPI_SUM, 0,
879 if (rc != MPI_SUCCESS) {
880 fatal(myrank, "Failure in MPI_Reduce of total ops.\n");
883 rc = MPI_Reduce(&interval, &ag_interval, 1, MPI_DOUBLE, MPI_SUM, 0,
885 if (rc != MPI_SUCCESS) {
886 fatal(myrank, "Failure in MPI_Reduce of total interval.\n");
889 rc = MPI_Reduce(&rate, &ag_rate, 1, MPI_DOUBLE, MPI_SUM, 0,
891 if (rc != MPI_SUCCESS) {
892 fatal(myrank, "Failure in MPI_Reduce of aggregated rate.\n");
896 curTime = MPI_Wtime();
897 interval = curTime - startTime;
898 effective_rate = (double) ag_ops / interval;
899 avg_rate = (double) ag_ops / ag_interval;
901 printf("Rate: %.2f eff %.2f aggr %.2f avg client %ss/sec "
902 "(total: %d threads %d %ss %d dirs %d threads/dir %.2f secs)\n",
903 effective_rate, ag_rate, avg_rate, cmd, nthreads, ag_ops,
904 cmd, ndirs, dirthreads, interval);
905 if (mode == UNLINK && !recreate && !ignore && ag_ops != nfiles)
906 printf("Warning: only unlinked %d files instead of %d"
907 "\n", ag_ops, nfiles);
911 for (begin = beginsave; begin <= end; begin += dirthreads) {
912 sprintf(filename, filefmt, begin);
913 if ((fd = open(filename, openflags, 0644)) < 0) {
917 fatal(myrank, "recreate open(%s) error: %s\n",
918 filename, strerror(rc));
926 if ((myrank == 0) || debug) {
927 printf("%d: %s finished at %s",
928 myrank, hostname, ctime(×tamp));