2 * 2003, Copyright, Hewlett-Packard Development Company, LP.
4 * Developed under the sponsorship of the U.S. Government
5 * under Subcontract No. B514193
9 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
10 * Use is subject to license terms.
12 * Copyright (c) 2012, 2015, Intel Corporation.
18 #include <sys/types.h>
29 #include <sys/ioctl.h>
31 #include <sys/xattr.h>
36 #include <lustre/lustreapi.h> /* for O_LOV_DELAY_CREATE */
/*
 * Progress-report tuning, consumed by DISPLAY_PROGRESS().
 *
 * CHECK_COUNT: when verbose is set, the op counter is only examined (and
 * MPI_Wtime() sampled) when it is a multiple of this value.
 */
38 #define CHECK_COUNT 10000
/* A progress line is printed when the op counter is a multiple of this value... */
39 #define DISPLAY_COUNT (CHECK_COUNT * 10)
/*
 * ...or when the MPI_Wtime() interval since lastTime exceeds this value
 * (MPI_Wtime() reports seconds).
 */
40 #define DISPLAY_TIME 100
74 struct option longOpts[] = {
75 { .name = "create", .has_arg = no_argument, .val = CREATE },
76 { .name = "lookup", .has_arg = no_argument, .val = LOOKUP },
77 { .name = "mknod", .has_arg = no_argument, .val = MKNOD },
78 { .name = "open", .has_arg = no_argument, .val = OPEN },
79 { .name = "stat", .has_arg = no_argument, .val = STAT },
80 { .name = "unlink", .has_arg = no_argument, .val = UNLINK },
81 { .name = "begin", .has_arg = required_argument, .val = BEGIN },
82 { .name = "iters", .has_arg = required_argument, .val = ITERS },
83 /* time is in seconds */
84 { .name = "time", .has_arg = required_argument, .val = TIME },
85 { .name = "dirfmt", .has_arg = required_argument, .val = DIRFMT },
86 { .name = "ndirs", .has_arg = required_argument, .val = NDIRS },
87 { .name = "filefmt", .has_arg = required_argument, .val = FILEFMT },
88 { .name = "nfiles", .has_arg = required_argument, .val = NFILES },
89 { .name = "noexcl", .has_arg = no_argument, .val = NOEXCL },
90 { .name = "stripes", .has_arg = required_argument, .val = STRIPES },
91 { .name = "seed", .has_arg = required_argument, .val = SEED },
92 { .name = "seedfile", .has_arg = required_argument, .val = SEEDFILE },
93 { .name = "random_order", .has_arg = no_argument, .val = RANDOM },
94 { .name = "readdir_order", .has_arg = no_argument, .val = READDIR },
95 { .name = "recreate", .has_arg = no_argument, .val = RECREATE },
96 { .name = "setxattr", .has_arg = no_argument, .val = SETXATTR },
97 { .name = "smallwrite", .has_arg = no_argument, .val = SMALLWRITE },
98 { .name = "ignore", .has_arg = no_argument, .val = IGNORE },
99 { .name = "verbose", .has_arg = no_argument, .val = VERBOSE },
100 { .name = "debug", .has_arg = no_argument, .val = DEBUG },
101 { .name = "help", .has_arg = no_argument, .val = HELP },
102 { .name = "mdtcount", .has_arg = required_argument, .val = MDTCOUNT },
103 { .name = "mntcount", .has_arg = required_argument, .val = MNTCOUNT },
104 { .name = "mntfmt", .has_arg = required_argument, .val = MNT },
/* Host name shown in the start/finish messages; process_args() overwrites the
 * "unknown" default via gethostname(). */
114 char hostname[512] = "unknown";
/* Flags passed to open(); option parsing clears or adds bits (e.g. O_EXCL is
 * cleared for --noexcl, O_LOV_DELAY_CREATE is OR-ed in by --stripes handling). */
117 int openflags = O_RDWR | O_CREAT | O_EXCL;
/* Shell command line ("mkdir -p ..." or "lfs mkdir ...") handed to system(). */
121 char mkdir_cmd[PATH_MAX + 48];
/* Most recent entry returned by readdir() on the working directory. */
125 struct dirent *dir_entry;
/* printf-style format used to build file names; defaults to "f%d". */
127 char filefmt[PATH_MAX];
/* Scratch buffer holding the file name generated from filefmt. */
128 char filename[PATH_MAX];
/* Signal disposition installed for SIGALRM (handler: sigalrm_handler). */
137 struct sigaction act;
/* Extended attribute name passed to setxattr()/fsetxattr(). */
146 char xattrname[] = "user.mdsrate";
148 /* max xattr name + value length is block size, use 4000 here to avoid ENOSPC */
/*
 * Debug print: `dmesg(fmt, ...)` expands to `if (debug) printf(fmt, ...)`, so
 * output is produced only when `debug` is non-zero.
 * NOTE(review): the expansion is a bare `if`; an `else` written directly after
 * a dmesg() call would bind to it. Call sites should stay standalone statements.
 */
155 #define dmesg if (debug) printf
157 #define DISPLAY_PROGRESS() { \
158 if (verbose && (nops % CHECK_COUNT == 0)) { \
159 curTime = MPI_Wtime(); \
160 interval = curTime - lastTime; \
161 if (interval > DISPLAY_TIME || nops % DISPLAY_COUNT == 0) { \
162 rate = (double)(nops - lastOps) / interval; \
163 printf("Rank %d: %.2f %ss/sec %.2f secs " \
164 "(total: %d %ss %.2f secs)\n", \
165 myrank, rate, cmd, interval, \
166 nops, cmd, curTime - startTime); \
168 lastTime = curTime; \
173 char *usage_msg = "usage: %s\n"
174 " { --create [ --noexcl | --setxattr | --smallwrite ] |\n"
175 " --lookup | --mknod [ --setxattr ] | --open |\n"
176 " --stat | --unlink [ --recreate ] [ --ignore ] |\n"
178 " [ --help ] [ --verbose ] [ --debug ]\n"
179 " { [ --begin <num> ] --nfiles <num> }\n"
180 " [ --iters <num> ] [ --time <secs> ]\n"
181 " [ --dirfmt <str> ] [ --ndirs <num> ]\n"
182 " [ --filefmt <str> ] [ --stripes <num> ]\n"
183 " [ --random_order [--seed <num> | --seedfile <file>] ]\n"
184 " [ --readdir_order ] [ --mntfmt <str> ]\n"
185 " [ --mntcount <num> ] [ --mdtcount <num> ]\n"
186 " [ --setxattr ] }\n";
189 usage(FILE *stream, char *fmt, ...)
195 fprintf(stream, "%s: ", prog);
197 vfprintf(stderr, fmt, ap);
200 fprintf(stream, usage_msg, prog);
204 exit(stream == stderr);
207 /* Print process myrank and message, and exit (i.e. a fatal error) */
209 fatal(int rank, const char *fmt, ...)
211 if (rank == myrank) {
214 fprintf(stderr, "rank %d: ", rank);
216 vfprintf(stderr, fmt, ap);
220 MPI_Abort(MPI_COMM_WORLD, 1);
225 sigalrm_handler(int signum)
231 process_args(int argc, char *argv[])
234 int i, index, offset, tmpend, rc;
241 prog = basename(argv[0]);
242 strcpy(filefmt, "f%d");
243 gethostname(hostname, sizeof(hostname));
245 /* auto create shortOpts rather than maintaining a static string. */
246 for (opt = longOpts, cp = shortOpts; opt->name != NULL; opt++, cp++) {
252 while ((rc = getopt_long(argc, argv, shortOpts,
253 longOpts, &index)) != -1) {
256 openflags &= ~(O_CREAT | O_EXCL);
264 "Invalid - more than one operation specified: --%s\n",
265 longOpts[index].name);
268 cmd = (char *)longOpts[index].name;
271 if (mode != CREATE && mode != MKNOD) {
273 "--noexcl only applies to --create or --mknod.\n");
275 openflags &= ~O_EXCL;
278 if (mode != UNLINK) {
280 "--recreate only makes sense with --unlink.\n");
287 cmd = (char *)longOpts[index].name;
288 } else if (mode == CREATE || mode == MKNOD) {
292 "--setxattr only makes sense with --create, --mknod or alone.\n");
298 "--smallwrite only applies to --create.\n");
302 begin = strtol(optarg, &endptr, 0);
303 if ((*endptr != 0) || (begin < 0))
304 fatal(0, "Invalid --start value.\n");
307 iters = strtol(optarg, &endptr, 0);
308 if ((*endptr != 0) || (iters <= 0))
309 fatal(0, "Invalid --iters value.\n");
310 if (mode != LOOKUP && mode != OPEN)
312 "--iters only makes sense with --lookup or --open.\n");
315 seconds = strtol(optarg, &endptr, 0);
316 if ((*endptr != 0) || (seconds <= 0))
317 fatal(0, "Invalid --time value.\n");
320 if (strlen(optarg) > (PATH_MAX - 16))
321 fatal(0, "--dirfmt too long\n");
325 ndirs = strtol(optarg, &endptr, 0);
326 if ((*endptr != 0) || (ndirs <= 0))
327 fatal(0, "Invalid --ndirs value.\n");
328 if ((ndirs > nthreads) &&
329 ((mode == CREATE) || (mode == MKNOD))) {
331 "--ndirs=%d must be less than or equal to the number of threads (%d).\n",
336 if (strlen(optarg) > 4080)
337 fatal(0, "--filefmt too long\n");
339 /* Use %%d where you want the file # in the name. */
340 sprintf(filefmt, optarg, myrank);
343 nfiles = strtol(optarg, &endptr, 0);
344 if ((*endptr != 0) || (nfiles <= 0))
345 fatal(0, "Invalid --nfiles value.\n");
348 stripes = strtol(optarg, &endptr, 0);
349 if ((*endptr != 0) || (stripes < 0))
350 fatal(0, "Invalid --stripes value.\n");
353 openflags |= O_LOV_DELAY_CREATE;
356 "non-zero --stripes value not yet supported.\n");
361 seed = strtoul(optarg, &endptr, 0);
363 fatal(0, "bad --seed option %s\n", optarg);
366 seed_file = fopen(optarg, "r");
368 fatal(myrank, "fopen(%s) error: %s\n",
369 optarg, strerror(errno));
372 for (i = -1; fgets(tmp, 16, seed_file) != NULL;) {
378 rc = sscanf(tmp, "%d", &seed);
379 if ((rc != 1) || (seed < 0)) {
381 "Invalid seed value '%s' at line %d in %s.\n",
386 "File '%s' too short. Does not contain a seed for thread %d.\n",
394 if (mode != LOOKUP && mode != OPEN) {
396 "--%s can only be specified with --lookup, or --open.\n",
397 (char *)longOpts[index].name);
413 if (strlen(optarg) > (PATH_MAX - 16))
414 fatal(0, "--mnt too long\n");
418 mnt_count = strtol(optarg, &endptr, 0);
419 if ((*endptr != 0) || (mnt_count <= 0)) {
420 fatal(0, "Invalid --mnt_count value %s.\n",
425 mdt_count = strtol(optarg, &endptr, 0);
426 if ((*endptr != 0) || (mdt_count <= 0)) {
427 fatal(0, "Invalid --mdt_count value %s.\n",
432 usage(stderr, "unrecognized option: '%c'.\n", optopt);
437 usage(stderr, "too many arguments %d >= %d.\n", optind, argc);
439 if ((mnt_count != -1 && !mntfmt) ||
440 (mnt_count == -1 && mntfmt))
442 "mnt_count and mntfmt must be specified at the same time\n");
444 if (mode == CREATE || mode == MKNOD || mode == UNLINK ||
445 mode == STAT || mode == SETXATTR) {
449 } else if (nfiles == 0) {
451 "--nfiles or --time must be specified with %s.\n",
454 } else if (mode == LOOKUP || mode == OPEN) {
458 } else if (iters == 0) {
460 "--iters or --time must be specifed with %s.\n",
465 usage(stderr, "--nfiles must be specifed with --%s.\n",
470 int fd = open("/dev/urandom", O_RDONLY);
473 if (read(fd, &seed, sizeof(seed)) <
484 dmesg("%s: rank %d seed %d (%s).\n", prog, myrank, seed,
485 (order == RANDOM) ? "random_order" : "readdir_order");
488 "one --create, --mknod, --open, --stat, --lookup, --unlink or --setxattr must be specifed.");
491 /* support for multiple threads in a dir, set begin/end appropriately.*/
492 dirnum = myrank % ndirs;
493 dirthreads = nthreads / ndirs;
494 if (nthreads > (ndirs * dirthreads + dirnum))
497 offset = myrank / ndirs;
499 tmpend = begin + nfiles - 1;
503 end = begin + (nfiles / dirthreads) * dirthreads + offset;
504 if ((end > tmpend) || (end <= 0))
508 * make sure mnt_count <= nthreads, otherwise it might div 0 in
511 if (mnt_count > nthreads)
512 mnt_count = nthreads;
520 dmesg("%d: iters %d nfiles %d time %d begin %d end %d dirthreads %d.\n",
521 myrank, iters, nfiles, seconds, begin, end, dirthreads);
529 sprintf(dir, mntfmt, (myrank / (nthreads / mnt_count)));
531 dir_len = strlen(dir);
533 sprintf(dir + dir_len, dirfmt, dirnum);
538 if (stat(dir, &sb) == 0) {
539 if (!S_ISDIR(sb.st_mode))
540 fatal(myrank, "'%s' is not dir\n", dir);
541 } else if (errno == ENOENT) {
542 sprintf(mkdir_cmd, "lfs mkdir -i %d -c %d %s",
544 rand() % mdt_count + 1, dir);
546 fatal(myrank, "'%s' stat failed\n", dir);
549 sprintf(mkdir_cmd, "mkdir -p %s", dir);
552 #ifdef _LIGHTWEIGHT_KERNEL
553 printf("NOTICE: not running system(%s)\n", mkdir_cmd);
557 dmesg("%d: %s\n", myrank, mkdir_cmd);
558 rc = system(mkdir_cmd);
562 if (MPI_Barrier(MPI_COMM_WORLD) != MPI_SUCCESS)
563 fatal(myrank, "mkdir MPI_Barrier failed\n");
565 dmesg("%d: %s\n", myrank, mkdir_cmd);
566 rc = system(mkdir_cmd);
569 fatal(myrank, "'%s' failed.\n", mkdir_cmd);
574 fatal(myrank, "unable to chdir to '%s'.\n", dir);
578 static inline char *next_file()
580 if (order == RANDOM) {
581 sprintf(filename, filefmt, random() % nfiles);
587 dir_entry = readdir(directory);
589 rewinddir(directory);
590 while ((dir_entry = readdir(directory)) != NULL) {
591 if (dir_entry->d_name[0] != '.')
592 return dir_entry->d_name;
595 fatal(myrank, "unable to read directory %s (%s).\n",
596 dir, strerror(errno));
599 return dir_entry->d_name;
603 main(int argc, char *argv[])
605 int i, j, fd, rc, nops, lastOps;
607 double ag_interval = 0;
609 double rate, avg_rate, effective_rate;
610 double startTime, curTime, lastTime, interval;
614 rc = MPI_Init(&argc, &argv);
615 if (rc != MPI_SUCCESS)
616 fatal(myrank, "MPI_Init failed: %d\n", rc);
618 rc = MPI_Comm_size(MPI_COMM_WORLD, &nthreads);
619 if (rc != MPI_SUCCESS)
620 fatal(myrank, "MPI_Comm_size failed: %d\n", rc);
622 rc = MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
623 if (rc != MPI_SUCCESS)
624 fatal(myrank, "MPI_Comm_rank failed: %d\n", rc);
626 process_args(argc, argv);
629 if ((myrank == 0) || debug) {
630 printf("%d: %s starting at %s",
631 myrank, hostname, ctime(&timestamp));
635 * if we're not measuring creation rates then precreate
636 * the files we're operating on.
638 if ((mode != CREATE) && (mode != MKNOD) && !ignore &&
639 (mode != UNLINK || recreate)) {
641 * create the files in reverse order. When we encounter
642 * a file that already exists, assume the remainder of
643 * the files exist to save time. The timed performance
644 * test scripts make use of this behavior.
646 for (i = end, j = 0; i >= begin; i -= dirthreads) {
647 sprintf(filename, filefmt, i);
648 fd = open(filename, openflags, 0644);
653 fatal(myrank, "precreate open(%s) error: %s\n",
654 filename, strerror(rc));
659 dmesg("%d: %s pre-created %d files.\n", myrank, hostname, j);
661 rc = MPI_Barrier(MPI_COMM_WORLD);
662 if (rc != MPI_SUCCESS)
663 fatal(myrank, "prep MPI_Barrier failed: %d\n", rc);
666 if (order == READDIR) {
667 directory = opendir(dir);
670 fatal(myrank, "opendir(%s) error: %s\n",
675 j = random() % nfiles;
676 dmesg("%d: %s initializing dir offset %u: %s",
677 myrank, hostname, j, ctime(&timestamp));
679 for (i = 0; i <= j; i++) {
680 if ((dir_entry = readdir(directory)) == NULL) {
682 "could not read entry number %d in directory %s.\n",
688 dmesg("%d: index %d, filename %s, offset %ld: %s initialization complete: %s",
689 myrank, i, dir_entry->d_name, telldir(directory),
690 hostname, ctime(&timestamp));
694 act.sa_handler = sigalrm_handler;
695 (void)sigemptyset(&act.sa_mask);
697 sigaction(SIGALRM, &act, NULL);
701 rc = MPI_Barrier(MPI_COMM_WORLD);
702 if (rc != MPI_SUCCESS)
703 fatal(myrank, "prep MPI_Barrier failed: %d\n", rc);
705 startTime = MPI_Wtime();
706 lastTime = MPI_Wtime();
712 for (; begin <= end && !alarm_caught; begin += dirthreads) {
713 snprintf(filename, sizeof(filename), filefmt, begin);
714 fd = open(filename, openflags, 0644);
717 if (rc == EINTR && alarm_caught)
719 fatal(myrank, "open(%s) error: %s\n",
720 filename, strerror(rc));
724 rc = fsetxattr(fd, xattrname, xattrbuf,
725 xattrlen, XATTR_CREATE);
728 if (rc == EINTR && alarm_caught)
731 "setxattr(%s) error: %s\n",
732 filename, strerror(rc));
736 rc = write(fd, xattrbuf, xattrlen);
739 if (rc == EINTR && alarm_caught)
742 "write(%s) error: %s\n",
743 filename, strerror(rc));
752 dmesg("%d: created %d files, last file '%s'.\n",
753 myrank, nops, filename);
756 fd = open(dir, O_RDONLY);
758 fatal(myrank, "open(dir == '%s') error: %s\n",
759 dir, strerror(errno));
762 for (; nops < iters && !alarm_caught;) {
763 char *filename = next_file();
765 rc = llapi_file_lookup(fd, filename);
767 if (((rc = errno) == EINTR) && alarm_caught)
770 "llapi_file_lookup(%s) error: %s\n",
771 filename, strerror(rc));
779 for (; begin <= end && !alarm_caught; begin += dirthreads) {
780 snprintf(filename, sizeof(filename), filefmt, begin);
781 rc = mknod(filename, S_IFREG | 0644, 0);
784 if (rc == EINTR && alarm_caught)
786 fatal(myrank, "mknod(%s) error: %s\n",
787 filename, strerror(rc));
791 rc = setxattr(filename, xattrname, xattrbuf,
792 xattrlen, XATTR_CREATE);
795 if (rc == EINTR && alarm_caught)
798 "setxattr(%s) error: %s\n",
799 filename, strerror(rc));
808 for (; nops < iters && !alarm_caught;) {
810 if ((fd = open(file, openflags, 0644)) < 0) {
811 if (((rc = errno) == EINTR) && alarm_caught)
813 fatal(myrank, "open(%s) error: %s\n",
824 for (; begin <= end && !alarm_caught; begin += dirthreads) {
825 sprintf(filename, filefmt, begin);
826 rc = stat(filename, &statbuf);
828 if (((rc = errno) == EINTR) && alarm_caught)
830 if (((rc = errno) == ENOENT) && ignore)
832 fatal(myrank, "stat(%s) error: %s\n",
833 filename, strerror(rc));
841 for (; begin <= end && !alarm_caught; begin += dirthreads) {
842 sprintf(filename, filefmt, begin);
843 rc = unlink(filename);
845 if (((rc = errno) == EINTR) && alarm_caught)
847 if ((rc = errno) == ENOENT) {
850 /* no more files to unlink */
853 fatal(myrank, "unlink(%s) error: %s\n",
854 filename, strerror(rc));
862 for (; begin <= end && !alarm_caught; begin += dirthreads) {
863 snprintf(filename, sizeof(filename), filefmt, begin);
864 rc = setxattr(filename, xattrname, xattrbuf, xattrlen,
868 if (rc == EINTR && alarm_caught)
870 if (rc == ENOENT && ignore)
872 fatal(myrank, "setxattr(%s) error: %s\n",
873 filename, strerror(rc));
882 rc = MPI_Barrier(MPI_COMM_WORLD);
883 if (rc != MPI_SUCCESS)
884 fatal(myrank, "prep MPI_Barrier failed: %d\n", rc);
885 curTime = MPI_Wtime();
886 interval = curTime - startTime;
887 rate = (double)(nops) / interval;
889 rc = MPI_Reduce(&nops, &ag_ops, 1, MPI_INT, MPI_SUM, 0,
891 if (rc != MPI_SUCCESS)
892 fatal(myrank, "Failure in MPI_Reduce of total ops.\n");
894 rc = MPI_Reduce(&interval, &ag_interval, 1, MPI_DOUBLE, MPI_SUM, 0,
896 if (rc != MPI_SUCCESS)
897 fatal(myrank, "Failure in MPI_Reduce of total interval.\n");
899 rc = MPI_Reduce(&rate, &ag_rate, 1, MPI_DOUBLE, MPI_SUM, 0,
901 if (rc != MPI_SUCCESS)
902 fatal(myrank, "Failure in MPI_Reduce of aggregated rate.\n");
905 curTime = MPI_Wtime();
906 interval = curTime - startTime;
907 effective_rate = (double)ag_ops / interval;
908 avg_rate = (double)ag_ops / ag_interval;
910 printf("Rate: %.2f eff %.2f aggr %.2f avg client %ss/sec (total: %d threads %d %ss %d dirs %d threads/dir %.2f secs)\n",
911 effective_rate, ag_rate, avg_rate, cmd, nthreads, ag_ops,
912 cmd, ndirs, dirthreads, interval);
913 if (mode == UNLINK && !recreate && !ignore && ag_ops != nfiles)
914 printf("Warning: only unlinked %d files instead of %d\n",
919 for (begin = beginsave; begin <= end; begin += dirthreads) {
920 sprintf(filename, filefmt, begin);
921 if ((fd = open(filename, openflags, 0644)) < 0) {
925 fatal(myrank, "recreate open(%s) error: %s\n",
926 filename, strerror(rc));
934 if ((myrank == 0) || debug) {
935 printf("%d: %s finished at %s",
936 myrank, hostname, ctime(&timestamp));