2 * 2003, Copyright, Hewlett-Packard Development Company, LP.
4 * Developed under the sponsorship of the U.S. Government
5 * under Subcontract No. B514193
9 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
10 * Use is subject to license terms.
12 * Copyright (c) 2012, 2014, Intel Corporation.
18 #include <sys/types.h>
29 #include <sys/ioctl.h>
31 #include <attr/xattr.h>
36 #include <lustre/lustreapi.h> /* for O_LOV_DELAY_CREATE */
38 #define CHECK_COUNT 10000
39 #define DISPLAY_COUNT (CHECK_COUNT * 10)
40 #define DISPLAY_TIME 100
/*
 * Long-option table handed to getopt_long() in process_args().  Each entry
 * is { name, has_arg, flag, val }: has_arg is 0 for a bare switch and 1 for
 * an option that takes an argument.  flag is NULL in every entry, so
 * getopt_long() returns val (the option code) directly.  process_args() also
 * walks this table until it reaches an entry whose name is NULL.
 */
74 struct option longOpts[] = {
75 {"create", 0, NULL, CREATE },
76 {"lookup", 0, NULL, LOOKUP },
77 {"mknod", 0, NULL, MKNOD },
78 {"open", 0, NULL, OPEN },
79 {"stat", 0, NULL, STAT },
80 {"unlink", 0, NULL, UNLINK },
81 {"begin", 1, NULL, BEGIN },
82 {"iters", 1, NULL, ITERS },
83 {"time", 1, NULL, TIME }, /* seconds */
84 {"dirfmt", 1, NULL, DIRFMT },
85 {"ndirs", 1, NULL, NDIRS },
86 {"filefmt", 1, NULL, FILEFMT },
87 {"nfiles", 1, NULL, NFILES },
88 {"noexcl", 0, NULL, NOEXCL },
89 {"stripes", 1, NULL, STRIPES },
90 {"seed", 1, NULL, SEED },
91 {"seedfile", 1, NULL, SEEDFILE },
92 {"random_order", 0, NULL, RANDOM },
93 {"readdir_order", 0, NULL, READDIR },
94 {"recreate", 0, NULL, RECREATE },
95 {"setxattr", 0, NULL, SETXATTR },
96 {"smallwrite", 0, NULL, SMALLWRITE },
97 {"ignore", 0, NULL, IGNORE },
98 {"verbose", 0, NULL, VERBOSE },
99 {"debug", 0, NULL, DEBUG },
100 {"help", 0, NULL, HELP },
101 {"mdtcount", 1, NULL, MDTCOUNT },
102 {"mntcount", 1, NULL, MNTCOUNT },
103 {"mntfmt", 1, NULL, MNT },
113 char hostname[512] = "unknown";
116 int openflags = O_RDWR|O_CREAT|O_EXCL;
120 char mkdir_cmd[PATH_MAX+14];
124 struct dirent *dir_entry;
126 char filefmt[PATH_MAX];
127 char filename[PATH_MAX];
136 struct sigaction act;
145 char xattrname[] = "user.mdsrate";
147 /* max xattr name + value length is block size, use 4000 here to avoid ENOSPC */
/*
 * Debug printf: expands to a bare "if (debug) printf", so the argument list
 * written after dmesg becomes the printf() call.
 * NOTE(review): the expansion is an unbraced if, so a dmesg(...) used as the
 * body of an if/else would capture the following else.
 */
154 #define dmesg if (debug) printf
/*
 * Periodic progress report.  Does nothing unless verbose is set and nops is
 * a multiple of CHECK_COUNT; only then is MPI_Wtime() sampled.  A line with
 * the rate since the previous report is printed when more than DISPLAY_TIME
 * seconds have passed or nops is a multiple of DISPLAY_COUNT.
 * Uses names from the expansion site: nops, lastOps, curTime, lastTime,
 * startTime, interval, rate, plus myrank and cmd.
 * NOTE(review): call sites are not visible in this excerpt.
 */
156 #define DISPLAY_PROGRESS() { \
157 if (verbose && (nops % CHECK_COUNT == 0)) { \
158 curTime = MPI_Wtime(); \
159 interval = curTime - lastTime; \
160 if (interval > DISPLAY_TIME || nops % DISPLAY_COUNT == 0) { \
161 rate = (double)(nops - lastOps)/interval; \
162 printf("Rank %d: %.2f %ss/sec %.2f secs " \
163 "(total: %d %ss %.2f secs)\n", \
164 myrank, rate, cmd, interval, \
165 nops, cmd, curTime - startTime); \
167 lastTime = curTime; \
/*
 * Usage text printed by usage().  It is used as a printf format: the single
 * %s conversion is filled in with the program name.
 */
172 char *usage_msg = "usage: %s\n"
173 " { --create [ --noexcl | --setxattr | --smallwrite ] |\n"
174 " --lookup | --mknod [ --setxattr ] | --open |\n"
175 " --stat | --unlink [ --recreate ] [ --ignore ] |\n"
177 " [ --help ] [ --verbose ] [ --debug ]\n"
178 " { [ --begin <num> ] --nfiles <num> }\n"
179 " [ --iters <num> ] [ --time <secs> ]\n"
180 " [ --dirfmt <str> ] [ --ndirs <num> ]\n"
181 " [ --filefmt <str> ] [ --stripes <num> ]\n"
182 " [ --random_order [--seed <num> | --seedfile <file>] ]\n"
183 " [ --readdir_order ] [ --mntfmt <str> ]\n"
184 " [ --mntcount <num> ] [ --mdtcount <num> ]\n"
185 " [ --setxattr ] }\n";
/*
 * Print a message followed by the usage text, then exit.
 *
 * stream: destination for all output.  It also selects the exit status:
 *         exit(stream == stderr) yields 1 for stderr and 0 otherwise.
 * fmt:    printf-style format (plus arguments) for the message, which is
 *         prefixed with "<prog>: ".
 */
188 usage(FILE *stream, char *fmt, ...)
194 fprintf(stream, "%s: ", prog);
/* Keep the message on the same stream as its prefix (was hard-wired to stderr). */
196 vfprintf(stream, fmt, ap);
199 fprintf(stream, usage_msg, prog);
203 exit(stream == stderr);
206 /* Print process myrank and message, and exit (i.e. a fatal error) */
/*
 * rank: the rank that should report the error.  The "rank %d: " prefix and
 *       the formatted message are written to stderr when rank == myrank.
 * fmt:  printf-style format followed by its arguments.
 * Terminates the job with MPI_Abort(MPI_COMM_WORLD, 1).
 * NOTE(review): whether the abort sits inside the rank check is not visible
 * in this excerpt.
 */
208 fatal(int rank, const char *fmt, ...)
210 if (rank == myrank) {
213 fprintf(stderr, "rank %d: ", rank);
215 vfprintf(stderr, fmt, ap);
219 MPI_Abort(MPI_COMM_WORLD, 1);
224 sigalrm_handler(int signum)
229 /* HAVE_LLAPI_FILE_LOOKUP is defined by liblustreapi.h if this function is
230 * defined therein. Otherwise we can do the equivalent operation via ioctl
231 * if we have access to a complete lustre build tree to get the various
232 * definitions - then compile with USE_MDC_LOOKUP defined. */
233 #if defined(HAVE_LLAPI_FILE_LOOKUP)
234 #define HAVE_MDC_LOOKUP
235 #elif defined(USE_MDC_LOOKUP)
237 #include <libcfs/libcfs.h>
238 #include <lustre_ioctl.h>
/*
 * Local stand-in for llapi_file_lookup(), built only when USE_MDC_LOOKUP is
 * defined.  Packs `name` into an obd_ioctl_data request and issues
 * IOC_MDC_LOOKUP on the directory descriptor.
 *
 * dirfd: directory file descriptor; rejected when negative.
 * name:  entry name; rejected when NULL.  It is sent with its terminating
 *        NUL (length strlen(name) + 1).
 * Returns the ioctl() result.  A failure of obd_ioctl_pack() is reported
 * through fatal().
 */
240 int llapi_file_lookup(int dirfd, const char *name)
242 struct obd_ioctl_data data = { 0 };
247 if (dirfd < 0 || name == NULL)
250 data.ioc_version = OBD_IOCTL_VERSION;
251 data.ioc_len = sizeof(data);
252 data.ioc_inlbuf1 = name;
253 data.ioc_inllen1 = strlen(name) + 1;
255 rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
257 fatal(myrank, "ioctl_pack failed: rc = %d\n", rc);
/* Use the validated dirfd parameter; the original referenced an `fd` this function never receives. */
261 return ioctl(dirfd, IOC_MDC_LOOKUP, buf);
263 #define HAVE_MDC_LOOKUP
/*
 * Parse the command line into the file-scope settings: operation mode and
 * its name (cmd), openflags, begin/iters/seconds, ndirs/nfiles, stripes,
 * seed, mount and MDT counts, and the directory/file name formats.  It then
 * validates option combinations, derives this rank's begin/end/dirthreads
 * file range, creates the working directory if needed and changes into it.
 *
 * argc, argv: as received by main().
 * Invalid input is reported through fatal() (MPI_Abort) or usage() (exit),
 * so callers only see a return when the arguments were accepted.
 */
267 process_args(int argc, char *argv[])
270 int i, index, offset, tmpend, rc;
277 prog = basename(argv[0]);
278 strcpy(filefmt, "f%d");
279 gethostname(hostname, sizeof(hostname));
281 /* auto create shortOpts rather than maintaining a static string. */
282 for (opt = longOpts, cp = shortOpts; opt->name != NULL; opt++, cp++) {
288 while ((rc = getopt_long(argc,argv, shortOpts, longOpts,&index)) != -1) {
291 openflags &= ~(O_CREAT|O_EXCL);
293 #ifdef HAVE_MDC_LOOKUP
300 fatal(0, "Invalid - more than one operation "
302 longOpts[index].name);
305 cmd = (char *)longOpts[index].name;
308 if (mode != CREATE && mode != MKNOD) {
309 usage(stderr, "--noexcl only applies to "
310 "--create or --mknod.\n");
312 openflags &= ~O_EXCL;
315 if (mode != UNLINK) {
316 usage(stderr, "--recreate only makes sense"
324 cmd = (char *)longOpts[index].name;
325 } else if (mode == CREATE || mode == MKNOD) {
328 usage(stderr, "--setxattr only makes sense "
329 "with --create, --mknod or alone.\n");
334 usage(stderr, "--smallwrite only applies to "
339 begin = strtol(optarg, &endptr, 0);
340 if ((*endptr != 0) || (begin < 0)) {
/* The option is spelled --begin in longOpts; the message used to say --start. */
341 fatal(0, "Invalid --begin value.\n");
345 iters = strtol(optarg, &endptr, 0);
346 if ((*endptr != 0) || (iters <= 0)) {
347 fatal(0, "Invalid --iters value.\n");
349 if (mode != LOOKUP && mode != OPEN) {
350 usage(stderr, "--iters only makes sense with "
351 "--lookup or --open.\n");
355 seconds = strtol(optarg, &endptr, 0);
356 if ((*endptr != 0) || (seconds <= 0)) {
357 fatal(0, "Invalid --time value.\n");
361 if (strlen(optarg) > (PATH_MAX - 16)) {
362 fatal(0, "--dirfmt too long\n");
367 ndirs = strtol(optarg, &endptr, 0);
368 if ((*endptr != 0) || (ndirs <= 0)) {
369 fatal(0, "Invalid --ndirs value.\n");
371 if ((ndirs > nthreads) &&
372 ((mode == CREATE) || (mode == MKNOD))) {
373 fatal(0, "--ndirs=%d must be less than or "
374 "equal to the number of threads (%d).\n",
379 if (strlen(optarg) > 4080) {
380 fatal(0, "--filefmt too long\n");
383 /* Use %%d where you want the file # in the name. */
384 sprintf(filefmt, optarg, myrank);
387 nfiles = strtol(optarg, &endptr, 0);
388 if ((*endptr != 0) || (nfiles <= 0)) {
389 fatal(0, "Invalid --nfiles value.\n");
393 stripes = strtol(optarg, &endptr, 0);
394 if ((*endptr != 0) || (stripes < 0)) {
395 fatal(0, "Invalid --stripes value.\n");
399 openflags |= O_LOV_DELAY_CREATE;
401 fatal(0, "non-zero --stripes value "
402 "not yet supported.\n");
407 seed = strtoul(optarg, &endptr, 0);
409 fatal(0, "bad --seed option %s\n", optarg);
413 seed_file = fopen(optarg, "r");
415 fatal(myrank, "fopen(%s) error: %s\n",
416 optarg, strerror(errno));
419 for (i = -1; fgets(tmp, 16, seed_file) != NULL;) {
425 rc = sscanf(tmp, "%d", &seed);
426 if ((rc != 1) || (seed < 0)) {
427 fatal(myrank, "Invalid seed value '%s' "
428 "at line %d in %s.\n",
432 fatal(myrank, "File '%s' too short. Does not "
433 "contain a seed for thread %d.\n",
441 if (mode != LOOKUP && mode != OPEN) {
442 fatal(0, "--%s can only be specified with "
443 "--lookup, or --open.\n",
444 (char *)longOpts[index].name);
460 if (strlen(optarg) > (PATH_MAX - 16))
/* Name the options as the user types them (--mntfmt, --mntcount, --mdtcount). */
461 fatal(0, "--mntfmt too long\n");
465 mnt_count = strtol(optarg, &endptr, 0);
466 if ((*endptr != 0) || (mnt_count <= 0)) {
467 fatal(0, "Invalid --mntcount value %s.\n",
472 mdt_count = strtol(optarg, &endptr, 0);
473 if ((*endptr != 0) || (mdt_count <= 0)) {
474 fatal(0, "Invalid --mdtcount value %s.\n",
479 usage(stderr, "unrecognized option: '%c'.\n", optopt);
484 usage(stderr, "too many arguments %d >= %d.\n", optind, argc);
487 if ((mnt_count != -1 && mntfmt == NULL) ||
488 (mnt_count == -1 && mntfmt != NULL)) {
489 usage(stderr, "mnt_count and mntfmt must be specified at the "
493 if (mode == CREATE || mode == MKNOD || mode == UNLINK ||
494 mode == STAT || mode == SETXATTR) {
498 } else if (nfiles == 0) {
499 usage(stderr, "--nfiles or --time must be specified "
502 } else if (mode == LOOKUP || mode == OPEN) {
506 } else if (iters == 0) {
507 usage(stderr, "--iters or --time must be specified "
512 usage(stderr, "--nfiles must be specified with --%s.\n",
517 int fd = open("/dev/urandom", O_RDONLY);
520 if (read(fd, &seed, sizeof(seed)) <
531 dmesg("%s: rank %d seed %d (%s).\n", prog, myrank, seed,
532 (order == RANDOM) ? "random_order" : "readdir_order");
534 usage(stderr, "one --create, --mknod, --open, --stat,"
535 #ifdef HAVE_MDC_LOOKUP
/* Terminate the message with a newline so the usage text starts on its own line. */
538 " --unlink or --setxattr must be specified.\n");
541 /* support for multiple threads in a dir, set begin/end appropriately.*/
542 dirnum = myrank % ndirs;
543 dirthreads = nthreads / ndirs;
544 if (nthreads > (ndirs * dirthreads + dirnum))
547 offset = myrank / ndirs;
549 tmpend = begin + nfiles - 1;
553 end = begin + (nfiles / dirthreads) * dirthreads + offset;
554 if ((end > tmpend) || (end <= 0))
557 /* make sure mnt_count <= nthreads, otherwise it might div 0 in
558 * the following test */
559 if (mnt_count > nthreads)
560 mnt_count = nthreads;
568 dmesg("%d: iters %d nfiles %d time %d begin %d end %d dirthreads %d."
569 "\n", myrank, iters, nfiles, seconds, begin, end, dirthreads);
571 if (dirfmt == NULL) {
576 if (mntfmt != NULL) {
577 sprintf(dir, mntfmt, (myrank / (nthreads/mnt_count)));
579 dir_len = strlen(dir);
581 sprintf(dir + dir_len, dirfmt, dirnum);
585 if (stat(dir, &sb) == 0) {
586 if (!S_ISDIR(sb.st_mode))
587 fatal(myrank, "'%s' is not dir\n", dir);
588 } else if (errno == ENOENT) {
/* Bound the write: mkdir_cmd is PATH_MAX+14 bytes and this prefix plus the index can exceed that slack. */
589 snprintf(mkdir_cmd, sizeof(mkdir_cmd), "lfs mkdir -i %d %s",
590 myrank % mdt_count, dir);
592 fatal(myrank, "'%s' stat failed\n", dir);
595 snprintf(mkdir_cmd, sizeof(mkdir_cmd), "mkdir -p %s", dir);
598 dmesg("%d: %s\n", myrank, mkdir_cmd);
599 #ifdef _LIGHTWEIGHT_KERNEL
600 printf("NOTICE: not running system(%s)\n", mkdir_cmd);
602 rc = system(mkdir_cmd);
604 fatal(myrank, "'%s' failed.\n", mkdir_cmd);
609 fatal(myrank, "unable to chdir to '%s'.\n", dir);
/*
 * Return the name of the next file to operate on.
 *
 * With order == RANDOM the name is formatted into the shared filename
 * buffer from filefmt and a random index in [0, nfiles).  Otherwise the next
 * entry of the open directory stream is returned.  At end of directory the
 * stream is rewound and the first entry whose name does not start with '.'
 * is returned; if none is found the error is reported through fatal().
 *
 * The returned pointer refers to filename or to the current dirent, so it is
 * only valid until the next call.
 */
614 static inline char *next_file()
616 if (order == RANDOM) {
/* random() returns long; convert for the %d in filefmt, and bound the write to the buffer. */
617 snprintf(filename, sizeof(filename), filefmt, (int)(random() % nfiles));
623 dir_entry = readdir(directory);
624 if (dir_entry == NULL) {
625 rewinddir(directory);
626 while ((dir_entry = readdir(directory)) != NULL) {
627 if (dir_entry->d_name[0] != '.')
628 return(dir_entry->d_name);
631 fatal(myrank, "unable to read directory %s (%s).\n",
632 dir, strerror(errno));
635 return(dir_entry->d_name);
/*
 * Benchmark driver.  Initialises MPI, parses arguments, pre-creates files
 * when creation itself is not being measured, synchronises all ranks, runs
 * the loop for the selected operation until the range/iteration count is
 * exhausted or the alarm fires, then reduces per-rank op counts, intervals
 * and rates to rank 0 and prints a summary.  When unlinking with recreate
 * set, the files are created again afterwards.
 *
 * argc, argv: command line, passed through MPI_Init() and process_args().
 */
639 main(int argc, char *argv[])
641 int i, j, fd, rc, nops, lastOps;
643 double ag_interval = 0;
645 double rate, avg_rate, effective_rate;
646 double startTime, curTime, lastTime, interval;
650 rc = MPI_Init(&argc, &argv);
651 if (rc != MPI_SUCCESS)
652 fatal(myrank, "MPI_Init failed: %d\n", rc);
654 rc = MPI_Comm_size(MPI_COMM_WORLD, &nthreads);
655 if (rc != MPI_SUCCESS)
656 fatal(myrank, "MPI_Comm_size failed: %d\n", rc);
658 rc = MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
659 if (rc != MPI_SUCCESS)
660 fatal(myrank, "MPI_Comm_rank failed: %d\n", rc);
662 process_args(argc, argv);
665 if ((myrank == 0) || debug) {
666 printf("%d: %s starting at %s",
/* ctime() takes the address of the timestamp; the argument had been garbled to a multiplication sign. */
667 myrank, hostname, ctime(&timestamp));
670 /* if we're not measuring creation rates then precreate
671 * the files we're operating on. */
672 if ((mode != CREATE) && (mode != MKNOD) && !ignore &&
673 (mode != UNLINK || recreate)) {
674 /* create the files in reverse order. When we encounter
675 * a file that already exists, assume the remainder of
676 * the files exist to save time. The timed performance
677 * test scripts make use of this behavior. */
678 for (i = end, j = 0; i >= begin; i -= dirthreads) {
/* Bounded formatting, matching the snprintf() calls in the create/mknod/setxattr loops. */
679 snprintf(filename, sizeof(filename), filefmt, i);
680 fd = open(filename, openflags, 0644);
685 fatal(myrank, "precreate open(%s) error: %s\n",
686 filename, strerror(rc));
691 dmesg("%d: %s pre-created %d files.\n",myrank,hostname,j);
693 rc = MPI_Barrier(MPI_COMM_WORLD);
694 if (rc != MPI_SUCCESS)
695 fatal(myrank, "prep MPI_Barrier failed: %d\n", rc);
698 if (order == READDIR) {
699 directory = opendir(dir);
700 if (directory == NULL) {
702 fatal(myrank, "opendir(%s) error: %s\n",
/* Skip a random number of directory entries before the run starts. */
707 j = random() % nfiles;
708 dmesg("%d: %s initializing dir offset %u: %s",
709 myrank, hostname, j, ctime(&timestamp));
711 for (i = 0; i <= j; i++) {
712 if ((dir_entry = readdir(directory)) == NULL) {
713 fatal(myrank, "could not read entry number %d "
714 "in directory %s.\n", i, dir);
719 dmesg("%d: index %d, filename %s, offset %ld: "
720 "%s initialization complete: %s",
721 myrank, i, dir_entry->d_name, telldir(directory),
722 hostname, ctime(&timestamp));
726 act.sa_handler = sigalrm_handler;
727 (void)sigemptyset(&act.sa_mask);
729 sigaction(SIGALRM, &act, NULL);
733 rc = MPI_Barrier(MPI_COMM_WORLD);
734 if (rc != MPI_SUCCESS)
735 fatal(myrank, "prep MPI_Barrier failed: %d\n", rc);
737 startTime = lastTime = MPI_Wtime();
/* Create loop: open() each file in this rank's range. */
742 for (; begin <= end && !alarm_caught; begin += dirthreads) {
743 snprintf(filename, sizeof(filename), filefmt, begin);
744 fd = open(filename, openflags, 0644);
747 if (rc == EINTR && alarm_caught)
749 fatal(myrank, "open(%s) error: %s\n",
750 filename, strerror(rc));
754 rc = fsetxattr(fd, xattrname, xattrbuf,
755 xattrlen, XATTR_CREATE);
758 if (rc == EINTR && alarm_caught)
761 "setxattr(%s) error: %s\n",
762 filename, strerror(rc));
766 rc = write(fd, xattrbuf, xattrlen);
769 if (rc == EINTR && alarm_caught)
772 "write(%s) error: %s\n",
773 filename, strerror(rc));
782 dmesg("%d: created %d files, last file '%s'.\n",
783 myrank, nops, filename);
785 #ifdef HAVE_MDC_LOOKUP
787 fd = open(dir, O_RDONLY);
789 fatal(myrank, "open(dir == '%s') error: %s\n",
790 dir, strerror(errno));
/* Lookup loop: names come from next_file(); this local filename shadows the file-scope buffer. */
793 for (; nops < iters && !alarm_caught;) {
794 char *filename = next_file();
795 rc = llapi_file_lookup(fd, filename);
797 if (((rc = errno) == EINTR) && alarm_caught)
799 fatal(myrank, "llapi_file_lookup(%s) "
800 "error: %s\n", filename, strerror(rc));
/* Mknod loop. */
809 for (; begin <= end && !alarm_caught; begin += dirthreads) {
810 snprintf(filename, sizeof(filename), filefmt, begin);
811 rc = mknod(filename, S_IFREG | 0644, 0);
814 if (rc == EINTR && alarm_caught)
816 fatal(myrank, "mknod(%s) error: %s\n",
817 filename, strerror(rc));
821 rc = setxattr(filename, xattrname, xattrbuf,
822 xattrlen, XATTR_CREATE);
825 if (rc == EINTR && alarm_caught)
828 "setxattr(%s) error: %s\n",
829 filename, strerror(rc));
/* Open loop. */
838 for (; nops < iters && !alarm_caught;) {
840 if ((fd = open(file, openflags, 0644)) < 0) {
841 if (((rc = errno) == EINTR) && alarm_caught)
843 fatal(myrank, "open(%s) error: %s\n",
/* Stat loop. */
854 for (; begin <= end && !alarm_caught; begin += dirthreads) {
855 snprintf(filename, sizeof(filename), filefmt, begin);
856 rc = stat(filename, &statbuf);
858 if (((rc = errno) == EINTR) && alarm_caught)
860 if (((rc = errno) == ENOENT) && ignore)
862 fatal(myrank, "stat(%s) error: %s\n",
863 filename, strerror(rc));
/* Unlink loop. */
871 for (; begin <= end && !alarm_caught; begin += dirthreads) {
872 snprintf(filename, sizeof(filename), filefmt, begin);
873 rc = unlink(filename);
875 if (((rc = errno) == EINTR) && alarm_caught)
877 if ((rc = errno) == ENOENT) {
880 /* no more files to unlink */
883 fatal(myrank, "unlink(%s) error: %s\n",
884 filename, strerror(rc));
/* Setxattr loop. */
892 for (; begin <= end && !alarm_caught; begin += dirthreads) {
893 snprintf(filename, sizeof(filename), filefmt, begin);
894 rc = setxattr(filename, xattrname, xattrbuf, xattrlen,
898 if (rc == EINTR && alarm_caught)
900 if (rc == ENOENT && ignore)
902 fatal(myrank, "setxattr(%s) error: %s\n",
903 filename, strerror(rc));
912 rc = MPI_Barrier(MPI_COMM_WORLD);
913 if (rc != MPI_SUCCESS)
914 fatal(myrank, "prep MPI_Barrier failed: %d\n", rc);
915 curTime = MPI_Wtime();
916 interval = curTime - startTime;
917 rate = (double) (nops) / interval;
/* Sum op counts, intervals and rates from all ranks onto rank 0. */
919 rc = MPI_Reduce(&nops, &ag_ops, 1, MPI_INT, MPI_SUM, 0,
921 if (rc != MPI_SUCCESS) {
922 fatal(myrank, "Failure in MPI_Reduce of total ops.\n");
925 rc = MPI_Reduce(&interval, &ag_interval, 1, MPI_DOUBLE, MPI_SUM, 0,
927 if (rc != MPI_SUCCESS) {
928 fatal(myrank, "Failure in MPI_Reduce of total interval.\n");
931 rc = MPI_Reduce(&rate, &ag_rate, 1, MPI_DOUBLE, MPI_SUM, 0,
933 if (rc != MPI_SUCCESS) {
934 fatal(myrank, "Failure in MPI_Reduce of aggregated rate.\n");
938 curTime = MPI_Wtime();
939 interval = curTime - startTime;
940 effective_rate = (double) ag_ops / interval;
941 avg_rate = (double) ag_ops / ag_interval;
943 printf("Rate: %.2f eff %.2f aggr %.2f avg client %ss/sec "
944 "(total: %d threads %d %ss %d dirs %d threads/dir %.2f secs)\n",
945 effective_rate, ag_rate, avg_rate, cmd, nthreads, ag_ops,
946 cmd, ndirs, dirthreads, interval);
947 if (mode == UNLINK && !recreate && !ignore && ag_ops != nfiles)
948 printf("Warning: only unlinked %d files instead of %d"
949 "\n", ag_ops, nfiles);
/* Recreate pass: create this rank's files again, starting from the saved begin. */
953 for (begin = beginsave; begin <= end; begin += dirthreads) {
954 snprintf(filename, sizeof(filename), filefmt, begin);
955 if ((fd = open(filename, openflags, 0644)) < 0) {
959 fatal(myrank, "recreate open(%s) error: %s\n",
960 filename, strerror(rc));
968 if ((myrank == 0) || debug) {
969 printf("%d: %s finished at %s",
970 myrank, hostname, ctime(&timestamp));