X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fcontrib%2Fadio_driver_mpich2-1.0.7.patch;h=9bb012630e1a67416d0dceca4223a88a6616ac3a;hb=0f67add39b81f28e86f1df2aaea830b250125a47;hp=5f1daa365d6bba529ebffc62fc7050c262f8054f;hpb=cc181896d406a1fb51488413b69acde09c15deba;p=fs%2Flustre-release.git diff --git a/lustre/contrib/adio_driver_mpich2-1.0.7.patch b/lustre/contrib/adio_driver_mpich2-1.0.7.patch index 5f1daa3..9bb0126 100644 --- a/lustre/contrib/adio_driver_mpich2-1.0.7.patch +++ b/lustre/contrib/adio_driver_mpich2-1.0.7.patch @@ -1,7 +1,40 @@ -diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c ---- ad_lustre_orig/ad_lustre_aggregate.c 1970-01-01 08:00:00.000000000 +0800 -+++ ad_lustre/ad_lustre_aggregate.c 2008-10-15 22:26:35.000000000 +0800 -@@ -0,0 +1,514 @@ +--- configure_orig.in 2009-03-01 13:50:30.000000000 +0800 ++++ configure.in 2009-02-27 13:35:42.000000000 +0800 +@@ -1123,8 +1123,14 @@ + if test -n "$file_system_testfs"; then + AC_DEFINE(ROMIO_TESTFS,1,[Define for ROMIO with TESTFS]) + fi ++# ++# Verify presence of lustre/lustre_user.h ++# + if test -n "$file_system_lustre"; then +- AC_DEFINE(ROMIO_LUSTRE,1,[Define for ROMIO with LUSTRE]) ++ AC_CHECK_HEADERS(lustre/lustre_user.h, ++ AC_DEFINE(ROMIO_LUSTRE,1,[Define for ROMIO with LUSTRE]), ++ AC_MSG_ERROR([LUSTRE support requested but cannot find lustre/lustre_user.h header file]) ++ ) + fi + + if test -n "$file_system_xfs"; then +--- adio/include/adioi_orig.h 2009-03-01 14:00:48.000000000 +0800 ++++ adio/include/adioi.h 2009-04-24 15:26:44.000000000 +0800 +@@ -52,6 +52,12 @@ + struct { + int debugmask; + } pvfs2; ++ struct { ++ int start_iodevice; ++ int co_ratio; ++ int coll_threshold; ++ int ds_in_coll; ++ } lustre; + } fs_hints; + + }; +diff -ruN adio/ad_lustre_orig/ad_lustre_aggregate.c adio/ad_lustre/ad_lustre_aggregate.c +--- adio/ad_lustre_orig/ad_lustre_aggregate.c 1970-01-01 08:00:00.000000000 +0800 ++++ adio/ad_lustre/ad_lustre_aggregate.c 2009-05-05 15:22:40.000000000 +0800 +@@ -0,0 +1,304 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (C) 1997 University of Chicago. @@ -9,17 +42,16 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + * + * Copyright (C) 2007 Oak Ridge National Laboratory + * -+ * Copyright (C) 2008 Sun Microsystems, Lustre group ++ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include "ad_lustre.h" +#include "adio_extern.h" + -+void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int ** striping_info_ptr, -+ int mode, int nprocs, -+ ADIO_Offset *st_offsets, -+ ADIO_Offset *end_offsets, -+ ADIO_Offset *min_st_offset_ptr) ++#undef AGG_DEBUG ++ ++void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int **striping_info_ptr, ++ int mode) +{ + int *striping_info = NULL; + /* get striping information: @@ -27,22 +59,16 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + * striping_info[1]: stripe_count + * striping_info[2]: avail_cb_nodes + */ -+ int stripe_size, stripe_count, CO = 1, CO_max = 1, lflag, i; -+ int user_cb_nodes = 0, avail_cb_nodes; -+ int nprocs_for_coll = fd->hints->cb_nodes; -+ ADIO_Offset min_st_offset, max_end_offset; ++ int stripe_size, stripe_count, CO = 1, CO_max = 1, CO_nodes, lflag; ++ int avail_cb_nodes, divisor, nprocs_for_coll = fd->hints->cb_nodes; + char *value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); + + /* Get hints value */ + /* stripe size */ -+ MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, &lflag); -+ if (lflag) -+ stripe_size = atoi(value); ++ stripe_size = fd->hints->striping_unit; + /* stripe count */ + /* stripe_size and stripe_count have been validated in ADIOI_LUSTRE_Open() */ -+ MPI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, value, &lflag); -+ if (lflag) -+ stripe_count = atoi(value); ++ stripe_count = fd->hints->striping_factor; + + /* Calculate the available number of I/O clients, that is + * avail_cb_nodes=min(cb_nodes, stripe_count*CO), where @@ -61,53 +87,38 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + /* CO_max: the largest number of IO clients for each ost group */ + CO_max = (nprocs_for_coll - 1)/ stripe_count + 1; + /* CO also has been validated in ADIOI_LUSTRE_Open(), >0 */ -+ MPI_Info_get(fd->info, "CO", MPI_MAX_INFO_VAL, value, &lflag); -+ if (lflag) -+ CO = atoi(value); ++ CO = fd->hints->fs_hints.lustre.co_ratio; + CO = ADIOI_MIN(CO_max, CO); + } -+ avail_cb_nodes = ADIOI_MIN(nprocs_for_coll, stripe_count * CO); -+ -+ /* user_cb_nodes*/ -+ MPI_Info_get(fd->info, "user_cb_nodes", MPI_MAX_INFO_VAL, value, &lflag); -+ if (lflag) -+ user_cb_nodes = atoi(value); -+ /* If the user doesn't change the cb_nodes and -+ * the whole file access portion is no larger than stripe size, -+ * we will perform the IO by the same process (rank0 by default). -+ */ -+ /* calculate the whole file access portion */ -+ min_st_offset = st_offsets[0]; -+ max_end_offset = end_offsets[0]; -+ for (i = 0; i < nprocs; i ++) { -+ min_st_offset = ADIOI_MIN(min_st_offset, st_offsets[i]); -+ max_end_offset = ADIOI_MAX(max_end_offset, end_offsets[i]); -+ } -+ if (!user_cb_nodes) { -+ /* Check the whole file access portion -+ * if (whole_range <= stripe_size) -+ * then always collect data to the same process; -+ * set avail_cb_nodes=1; (rank0 by default). -+ * This pattern can make good use of Lustre client cache and -+ * avoid extent lock assigning and revoking. -+ * -+ * The recent experiments show good performance. We still need more -+ * validation. -+ */ -+ if ((max_end_offset > min_st_offset) && -+ (max_end_offset - min_st_offset) <= (ADIO_Offset) stripe_size) -+ avail_cb_nodes = 1; ++ /* Calculate how many IO clients we need */ ++ /* To avoid extent lock conflicts, ++ * avail_cb_nodes should divide (stripe_count*CO) exactly, ++ * so that each OST is accessed by only one or more constant clients. */ ++ CO_nodes = stripe_count * CO; ++ avail_cb_nodes = ADIOI_MIN(nprocs_for_coll, CO_nodes); ++ if (avail_cb_nodes < CO_nodes) { ++ do { ++ /* find the divisor of CO_nodes */ ++ divisor = 1; ++ do { ++ divisor ++; ++ } while (CO_nodes % divisor); ++ CO_nodes = CO_nodes / divisor; ++ /* if stripe_count*CO is a prime number, change nothing */ ++ if ((CO_nodes <= avail_cb_nodes) && (CO_nodes != 1)) { ++ avail_cb_nodes = CO_nodes; ++ break; ++ } ++ } while (CO_nodes != 1); + } + -+ ADIOI_Free(value); -+ + *striping_info_ptr = (int *) ADIOI_Malloc(3 * sizeof(int)); + striping_info = *striping_info_ptr; + striping_info[0] = stripe_size; + striping_info[1] = stripe_count; + striping_info[2] = avail_cb_nodes; + -+ *min_st_offset_ptr = min_st_offset; ++ ADIOI_Free(value); +} + +int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off, @@ -121,6 +132,13 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + /* Produce the stripe-contiguous pattern for Lustre */ + rank_index = (int)((off / stripe_size) % avail_cb_nodes); + ++ /* we index into fd_end with rank_index, and fd_end was allocated to be no ++ * bigger than fd->hins->cb_nodes. If we ever violate that, we're ++ * overrunning arrays. Obviously, we should never ever hit this abort ++ */ ++ if (rank_index >= fd->hints->cb_nodes) ++ MPI_Abort(MPI_COMM_WORLD, 1); ++ + avail_bytes = (off / (ADIO_Offset)stripe_size + 1) * + (ADIO_Offset)stripe_size - off; + if (avail_bytes < *len) { @@ -134,12 +152,16 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + return rank; +} + ++/* ADIOI_LUSTRE_Calc_my_req() - calculate what portions of the access requests ++ * of this process are located in the file domains of various processes ++ * (including this one) ++ */ +void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, + int *len_list, int contig_access_count, + int *striping_info, int nprocs, + int *count_my_req_procs_ptr, + int **count_my_req_per_proc_ptr, -+ ADIOI_Access ** my_req_ptr, ++ ADIOI_Access **my_req_ptr, + int **buf_idx_ptr) +{ + /* Nothing different from ADIOI_Calc_my_req(), except calling @@ -151,13 +173,19 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + + *count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int)); + count_my_req_per_proc = *count_my_req_per_proc_ptr; ++ /* count_my_req_per_proc[i] gives the no. of contig. requests of this ++ * process in process i's file domain. calloc initializes to zero. ++ * I'm allocating memory of size nprocs, so that I can do an ++ * MPI_Alltoall later on. ++ */ + ++ buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int)); + /* buf_idx is relevant only if buftype_is_contig. + * buf_idx[i] gives the index into user_buf where data received + * from proc. i should be placed. This allows receives to be done + * without extra buffer. This can't be done if buftype is not contig. + */ -+ buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int)); ++ + /* initialize buf_idx to -1 */ + for (i = 0; i < nprocs; i++) + buf_idx[i] = -1; @@ -173,12 +201,13 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + continue; + off = offset_list[i]; + avail_len = len_list[i]; -+ /* we set avail_len to be the total size of the access. ++ /* note: we set avail_len to be the total size of the access. + * then ADIOI_LUSTRE_Calc_aggregator() will modify the value to return + * the amount that was available. + */ + proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info); + count_my_req_per_proc[proc]++; ++ + /* figure out how many data is remaining in the access + * we'll take care of this data (if there is any) + * in the while loop below. @@ -194,6 +223,7 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + } + } + ++ /* now allocate space for my_req, offset, and len */ + *my_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs * sizeof(ADIOI_Access)); + my_req = *my_req_ptr; + @@ -213,6 +243,8 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + /* now fill in my_req */ + curr_idx = 0; + for (i = 0; i < contig_access_count; i++) { ++ /* short circuit offset/len processing if len == 0 ++ * (zero-byte read/write */ + if (len_list[i] == 0) + continue; + off = offset_list[i]; @@ -266,12 +298,12 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + } + } + } -+#endif +#if 0 + for (i = 0; i < nprocs; i++) { + FPRINTF(stdout, "buf_idx[%d] = 0x%x\n", i, buf_idx[i]); + } +#endif ++#endif + + *count_my_req_procs_ptr = count_my_req_procs; + *buf_idx_ptr = buf_idx; @@ -289,7 +321,6 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + int i, docollect = 1, lflag, big_req_size = 0; + ADIO_Offset req_size = 0, total_req_size; + int avg_req_size, total_access_count; -+ char *value = NULL; + + /* calculate total_req_size and total_access_count */ + for (i = 0; i < contig_access_count; i++) @@ -300,225 +331,17 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + fd->comm); + /* estimate average req_size */ + avg_req_size = (int)(total_req_size / total_access_count); -+ + /* get hint of big_req_size */ -+ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); -+ MPI_Info_get(fd->info, "big_req_size", MPI_MAX_INFO_VAL, value, &lflag); -+ if (lflag) -+ big_req_size = atoi(value); ++ big_req_size = fd->hints->fs_hints.lustre.coll_threshold; + /* Don't perform collective I/O if there are big requests */ + if ((big_req_size > 0) && (avg_req_size > big_req_size)) + docollect = 0; + -+ ADIOI_Free(value); -+ + return docollect; +} -+ -+void ADIOI_LUSTRE_Calc_others_req(ADIO_File fd, int count_my_req_procs, -+ int *count_my_req_per_proc, -+ ADIOI_Access * my_req, -+ int nprocs, int myrank, -+ ADIO_Offset req_len, -+ ADIO_Offset min_st_offset, -+ int *striping_info, -+ int *count_others_req_procs_ptr, -+ ADIOI_Access ** others_req_ptr) -+{ -+ /* what requests of other processes will be written by this process */ -+ -+ int *count_others_req_per_proc, count_others_req_procs, proc; -+ int i, j, lflag, samesize = 0, contiguous = 0; -+ int avail_cb_nodes = striping_info[2]; -+ MPI_Request *send_requests, *recv_requests; -+ MPI_Status *statuses; -+ ADIOI_Access *others_req; -+ char *value = NULL; -+ ADIO_Offset off, avail_len, rem_len, *all_lens; -+ -+ /* There are two hints, which could reduce some MPI communication overhead, -+ * if the users knows the I/O pattern and set them correctly. */ -+ /* They are -+ * contiguous_data: if the data are contiguous, -+ * we don't need to do MPI_Alltoall(). -+ * same_io_size: And if the data req size is same, -+ * we can calculate the offset directly -+ */ -+ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); -+ /* hint of contiguous data */ -+ MPI_Info_get(fd->info, "contiguous_data", MPI_MAX_INFO_VAL, value, &lflag); -+ if (lflag && !strcmp(value, "yes")) -+ contiguous = 1; -+ /* hint of same io size */ -+ MPI_Info_get(fd->info, "same_io_size", MPI_MAX_INFO_VAL, value, &lflag); -+ if (lflag && !strcmp(value, "yes")) -+ samesize = 1; -+ ADIOI_Free(value); -+ -+ *others_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs * -+ sizeof(ADIOI_Access)); -+ others_req = *others_req_ptr; -+ -+ /* if the data are contiguous, we can calulate the offset and length -+ * of the other requests simply, instead of MPI_Alltoall() */ -+ if (contiguous) { -+ for (i = 0; i < nprocs; i++) { -+ others_req[i].count = 0; -+ } -+ all_lens = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset)); -+ -+ /* same req size ? */ -+ if (samesize == 0) { -+ /* exchange request length */ -+ MPI_Allgather(&req_len, 1, ADIO_OFFSET, all_lens, 1, ADIO_OFFSET, -+ fd->comm); -+ } else { /* same request size */ -+ /* assign request length to all_lens[] */ -+ for (i = 0; i < nprocs; i ++) -+ all_lens[i] = req_len; -+ } -+ if (myrank < avail_cb_nodes) { -+ /* It's a IO client and it will receive data from others */ -+ off = min_st_offset; -+ /* calcaulte other_req[i].count */ -+ for (i = 0; i < nprocs; i++) { -+ avail_len = all_lens[i]; -+ rem_len = avail_len; -+ while (rem_len > 0) { -+ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, -+ striping_info); -+ if (proc == myrank) { -+ others_req[i].count ++; -+ } -+ off += avail_len; -+ rem_len -= avail_len; -+ avail_len = rem_len; -+ } -+ } -+ /* calculate offset and len for each request */ -+ off = min_st_offset; -+ for (i = 0; i < nprocs; i++) { -+ if (others_req[i].count) { -+ others_req[i].offsets = (ADIO_Offset *) -+ ADIOI_Malloc(others_req[i].count * -+ sizeof(ADIO_Offset)); -+ others_req[i].lens = (int *) -+ ADIOI_Malloc(others_req[i].count * -+ sizeof(int)); -+ others_req[i].mem_ptrs = (MPI_Aint *) -+ ADIOI_Malloc(others_req[i].count * -+ sizeof(MPI_Aint)); -+ } -+ j = 0; -+ avail_len = all_lens[i]; -+ rem_len = avail_len; -+ while (rem_len > 0) { -+ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, -+ striping_info); -+ if (proc == myrank) { -+ others_req[i].offsets[j] = off; -+ others_req[i].lens[j] = (int)avail_len; -+ j ++; -+ } -+ off += avail_len; -+ rem_len -= avail_len; -+ avail_len = rem_len; -+ } -+ } -+ } -+ ADIOI_Free(all_lens); -+ } else { -+ /* multiple non-contiguous requests */ -+ /* first find out how much to send/recv and from/to whom */ -+ -+ /* -+ * count_others_req_procs: -+ * number of processes whose requests will be written by -+ * this process (including this process itself) -+ * count_others_req_per_proc[i]: -+ * how many separate contiguous requests of proc[i] will be -+ * written by this process. -+ */ -+ -+ count_others_req_per_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int)); -+ -+ MPI_Alltoall(count_my_req_per_proc, 1, MPI_INT, -+ count_others_req_per_proc, 1, MPI_INT, fd->comm); -+ -+ count_others_req_procs = 0; -+ for (i = 0; i < nprocs; i++) { -+ if (count_others_req_per_proc[i]) { -+ others_req[i].count = count_others_req_per_proc[i]; -+ others_req[i].offsets = (ADIO_Offset *) -+ ADIOI_Malloc(others_req[i].count * -+ sizeof(ADIO_Offset)); -+ others_req[i].lens = (int *) -+ ADIOI_Malloc(others_req[i].count * -+ sizeof(int)); -+ others_req[i].mem_ptrs = (MPI_Aint *) -+ ADIOI_Malloc(others_req[i].count * -+ sizeof(MPI_Aint)); -+ count_others_req_procs++; -+ } else -+ others_req[i].count = 0; -+ } -+ -+ /* now send the calculated offsets and lengths to respective processes */ -+ -+ send_requests = (MPI_Request *) ADIOI_Malloc(2 * (count_my_req_procs + 1) * -+ sizeof(MPI_Request)); -+ recv_requests = (MPI_Request *) ADIOI_Malloc(2 * (count_others_req_procs+1)* -+ sizeof(MPI_Request)); -+ /* +1 to avoid a 0-size malloc */ -+ -+ j = 0; -+ for (i = 0; i < nprocs; i++) { -+ if (others_req[i].count) { -+ MPI_Irecv(others_req[i].offsets, others_req[i].count, -+ ADIO_OFFSET, i, i + myrank, fd->comm, -+ &recv_requests[j]); -+ j++; -+ MPI_Irecv(others_req[i].lens, others_req[i].count, -+ MPI_INT, i, i + myrank + 1, fd->comm, -+ &recv_requests[j]); -+ j++; -+ } -+ } -+ -+ j = 0; -+ for (i = 0; i < nprocs; i++) { -+ if (my_req[i].count) { -+ MPI_Isend(my_req[i].offsets, my_req[i].count, -+ ADIO_OFFSET, i, i + myrank, fd->comm, -+ &send_requests[j]); -+ j++; -+ MPI_Isend(my_req[i].lens, my_req[i].count, -+ MPI_INT, i, i + myrank + 1, fd->comm, -+ &send_requests[j]); -+ j++; -+ } -+ } -+ -+ statuses = (MPI_Status *) -+ ADIOI_Malloc((1 + 2 * ADIOI_MAX(count_my_req_procs, -+ count_others_req_procs)) * -+ sizeof(MPI_Status)); -+ /* +1 to avoid a 0-size malloc */ -+ -+ MPI_Waitall(2 * count_my_req_procs, send_requests, statuses); -+ MPI_Waitall(2 * count_others_req_procs, recv_requests, statuses); -+ -+ ADIOI_Free(send_requests); -+ ADIOI_Free(recv_requests); -+ ADIOI_Free(statuses); -+ ADIOI_Free(count_others_req_per_proc); -+ -+ *count_others_req_procs_ptr = count_others_req_procs; -+ } -+} -diff -ruN ad_lustre_orig/ad_lustre.c ad_lustre/ad_lustre.c ---- ad_lustre_orig/ad_lustre.c 2008-09-17 14:36:57.000000000 +0800 -+++ ad_lustre/ad_lustre.c 2008-09-17 18:20:35.000000000 +0800 +diff -ruN adio/ad_lustre_orig/ad_lustre.c adio/ad_lustre/ad_lustre.c +--- adio/ad_lustre_orig/ad_lustre.c 2008-09-17 14:36:56.000000000 +0800 ++++ adio/ad_lustre/ad_lustre.c 2008-10-17 17:03:42.000000000 +0800 @@ -1,9 +1,11 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* @@ -529,77 +352,28 @@ diff -ruN ad_lustre_orig/ad_lustre.c ad_lustre/ad_lustre.c * * Copyright (C) 2007 Oak Ridge National Laboratory + * -+ * Copyright (C) 2008 Sun Microsystems, Lustre group ++ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. */ #include "ad_lustre.h" -@@ -13,13 +15,13 @@ +@@ -13,12 +15,12 @@ ADIOI_LUSTRE_ReadContig, /* ReadContig */ ADIOI_LUSTRE_WriteContig, /* WriteContig */ ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */ - ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */ + ADIOI_LUSTRE_WriteStridedColl, /* WriteStridedColl */ ADIOI_GEN_SeekIndividual, /* SeekIndividual */ -- ADIOI_GEN_Fcntl, /* Fcntl */ -+ ADIOI_LUSTRE_Fcntl, /* Fcntl */ + ADIOI_GEN_Fcntl, /* Fcntl */ ADIOI_LUSTRE_SetInfo, /* SetInfo */ ADIOI_GEN_ReadStrided, /* ReadStrided */ - ADIOI_GEN_WriteStrided, /* WriteStrided */ -- ADIOI_GEN_Close, /* Close */ + ADIOI_LUSTRE_WriteStrided, /* WriteStrided */ -+ ADIOI_LUSTRE_Close, /* Close */ + ADIOI_GEN_Close, /* Close */ #if defined(ROMIO_HAVE_WORKING_AIO) && !defined(CRAY_XT_LUSTRE) ADIOI_GEN_IreadContig, /* IreadContig */ - ADIOI_GEN_IwriteContig, /* IwriteContig */ -diff -ruN ad_lustre_orig/ad_lustre_close.c ad_lustre/ad_lustre_close.c ---- ad_lustre_orig/ad_lustre_close.c 1970-01-01 08:00:00.000000000 +0800 -+++ ad_lustre/ad_lustre_close.c 2008-09-17 18:20:35.000000000 +0800 -@@ -0,0 +1,42 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. -+ * -+ * Copyright (C) 2007 Oak Ridge National Laboratory -+ * -+ * Copyright (C) 2008 Sun Microsystems, Lustre group -+ */ -+ -+#include "ad_lustre.h" -+ -+#ifdef PROFILE -+#include "mpe.h" -+#endif -+ -+void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code) -+{ -+ int err, derr = 0; -+ static char myname[] = "ADIOI_LUSTRE_CLOSE"; -+ -+#ifdef PROFILE -+ MPE_Log_event(9, 0, "start close"); -+#endif -+ -+ err = close(fd->fd_sys); -+ -+#ifdef PROFILE -+ MPE_Log_event(10, 0, "end close"); -+#endif -+ -+ fd->fd_sys = -1; -+ -+ if (err == -1 || derr == -1) { -+ *error_code = -+ MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, -+ __LINE__, MPI_ERR_IO, "**io", "**io %s", -+ strerror(errno)); -+ } else -+ *error_code = MPI_SUCCESS; -+} -diff -ruN ad_lustre_orig/ad_lustre.h ad_lustre/ad_lustre.h ---- ad_lustre_orig/ad_lustre.h 2008-09-17 14:36:57.000000000 +0800 -+++ ad_lustre/ad_lustre.h 2008-10-15 21:22:52.000000000 +0800 +diff -ruN adio/ad_lustre_orig/ad_lustre.h adio/ad_lustre/ad_lustre.h +--- adio/ad_lustre_orig/ad_lustre.h 2008-09-17 14:36:56.000000000 +0800 ++++ adio/ad_lustre/ad_lustre.h 2009-05-05 15:34:58.000000000 +0800 @@ -1,9 +1,11 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* @@ -610,44 +384,20 @@ diff -ruN ad_lustre_orig/ad_lustre.h ad_lustre/ad_lustre.h * * Copyright (C) 2007 Oak Ridge National Laboratory + * -+ * Copyright (C) 2008 Sun Microsystems, Lustre group ++ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef AD_UNIX_INCLUDE -@@ -24,7 +26,32 @@ +@@ -24,7 +26,7 @@ /*#include */ #include -+#ifdef WITH_LUSTRE - #include "lustre/lustre_user.h" -+#else -+/* copy something from lustre_user.h here */ -+# define LOV_USER_MAGIC 0x0BD10BD0 -+# define LL_IOC_LOV_SETSTRIPE _IOW ('f', 154, long) -+# define LL_IOC_LOV_GETSTRIPE _IOW ('f', 155, long) -+# define lov_user_ost_data lov_user_ost_data_v1 -+struct lov_user_ost_data_v1 { /* per-stripe data structure */ -+ __u64 l_object_id; /* OST object ID */ -+ __u64 l_object_gr; /* OST object group (creating MDS number) */ -+ __u32 l_ost_gen; /* generation of this OST index */ -+ __u32 l_ost_idx; /* OST index in LOV */ -+} __attribute__((packed)); -+#define lov_user_md lov_user_md_v1 -+struct lov_user_md_v1 { /* LOV EA user data (host-endian) */ -+ __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V1 */ -+ __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ -+ __u64 lmm_object_id; /* LOV object ID */ -+ __u64 lmm_object_gr; /* LOV object group */ -+ __u32 lmm_stripe_size; /* size of stripe in bytes */ -+ __u16 lmm_stripe_count; /* num stripes in use for this object */ -+ __u16 lmm_stripe_offset; /* starting stripe offset in lmm_objects */ -+ struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ -+} __attribute__((packed)); -+#endif +-#include "lustre/lustre_user.h" ++#include #include "adio.h" /*#include "adioi.h"*/ -@@ -41,24 +68,56 @@ +@@ -41,24 +43,31 @@ void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code); void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code); @@ -693,35 +443,10 @@ diff -ruN ad_lustre_orig/ad_lustre.h ad_lustre/ad_lustre.h int *error_code); void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code); - -+void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int ** striping_info_ptr, -+ int mode, int nprocs, -+ ADIO_Offset *st_offsets, -+ ADIO_Offset *end_offsets, -+ ADIO_Offset *min_st_offset); -+int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off, -+ ADIO_Offset *len, int *striping_info); -+void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, -+ int *len_list, int contig_access_count, -+ int *striping_info, int nprocs, -+ int *count_my_req_procs_ptr, -+ int **count_my_req_per_proc_ptr, -+ ADIOI_Access ** my_req_ptr, -+ int **buf_idx_ptr); -+int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count, -+ int *len_list, int nprocs); -+void ADIOI_LUSTRE_Calc_others_req(ADIO_File fd, int count_my_req_procs, -+ int *count_my_req_per_proc, -+ ADIOI_Access * my_req, -+ int nprocs, int myrank, -+ ADIO_Offset req_len, -+ ADIO_Offset min_st_offset, -+ int *striping_info, -+ int *count_others_req_procs_ptr, -+ ADIOI_Access ** others_req_ptr); #endif /* End of AD_UNIX_INCLUDE */ -diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c ---- ad_lustre_orig/ad_lustre_hints.c 2008-09-17 14:36:57.000000000 +0800 -+++ ad_lustre/ad_lustre_hints.c 2008-10-15 21:31:00.000000000 +0800 +diff -ruN adio/ad_lustre_orig/ad_lustre_hints.c adio/ad_lustre/ad_lustre_hints.c +--- adio/ad_lustre_orig/ad_lustre_hints.c 2008-09-17 14:36:56.000000000 +0800 ++++ adio/ad_lustre/ad_lustre_hints.c 2009-04-24 15:35:05.000000000 +0800 @@ -1,9 +1,11 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* @@ -732,25 +457,22 @@ diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c * * Copyright (C) 2007 Oak Ridge National Laboratory + * -+ * Copyright (C) 2008 Sun Microsystems, Lustre group ++ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. */ #include "ad_lustre.h" -@@ -11,130 +13,189 @@ - +@@ -12,46 +14,56 @@ void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code) { -- char *value, *value_in_fd; + char *value, *value_in_fd; - int flag, tmp_val[3], str_factor=-1, str_unit=0, start_iodev=-1; -- struct lov_user_md lum = { 0 }; -- int err, myrank, fd_sys, perm, amode, old_mask; -+ char *value = NULL; -+ int flag, tmp_val, int_val, str_factor, str_unit, start_iodev; ++ int flag, stripe_val[3], str_factor = -1, str_unit=0, start_iodev=-1; + struct lov_user_md lum = { 0 }; + int err, myrank, fd_sys, perm, amode, old_mask; ++ int int_val, tmp_val; + static char myname[] = "ADIOI_LUSTRE_SETINFO"; -+ *error_code = MPI_SUCCESS; value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); -+ if ( (fd->info) == MPI_INFO_NULL) { - /* This must be part of the open call. can set striping parameters - if necessary. */ @@ -763,29 +485,45 @@ diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c fd->direct_read = fd->direct_write = 0; - - /* has user specified striping or server buffering parameters ++ /* initialize lustre hints */ ++ MPI_Info_set(fd->info, "romio_lustre_co_ratio", "1"); ++ fd->hints->fs_hints.lustre.co_ratio = 1; ++ MPI_Info_set(fd->info, "romio_lustre_coll_threshold", "0"); ++ fd->hints->fs_hints.lustre.coll_threshold = 0; ++ MPI_Info_set(fd->info, "romio_lustre_ds_in_coll", "enable"); ++ fd->hints->fs_hints.lustre.ds_in_coll = ADIOI_HINT_ENABLE; + + /* has user specified striping or server buffering parameters and do they have the same value on all processes? */ if (users_info != MPI_INFO_NULL) { - MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, -- value, &flag); ++ /* striping information */ ++ MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, + value, &flag); - if (flag) -- str_unit=atoi(value); -- ++ if (flag) + str_unit=atoi(value); + - MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, -- value, &flag); ++ MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, + value, &flag); - if (flag) -- str_factor=atoi(value); -- ++ if (flag) + str_factor=atoi(value); + - MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL, -+ /* direct read and write */ -+ MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL, - value, &flag); +- value, &flag); - if (flag) -- start_iodev=atoi(value); -- ++ MPI_Info_get(users_info, "romio_lustre_start_iodevice", ++ MPI_MAX_INFO_VAL, value, &flag); ++ if (flag) + start_iodev=atoi(value); + - MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL, - value, &flag); ++ /* direct read and write */ ++ MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL, ++ value, &flag); if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) { MPI_Info_set(fd->info, "direct_read", "true"); fd->direct_read = 1; @@ -796,183 +534,65 @@ diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c value, &flag); if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) { MPI_Info_set(fd->info, "direct_write", "true"); - fd->direct_write = 1; +@@ -59,22 +71,23 @@ } -+ /* stripe size */ -+ MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, -+ value, &flag); -+ if (flag && (str_unit = atoi(value))) { -+ tmp_val = str_unit; -+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); -+ if (tmp_val != str_unit) { -+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, -+ "striping_unit", -+ error_code); -+ ADIOI_Free(value); -+ return; -+ } -+ MPI_Info_set(fd->info, "striping_unit", value); -+ } -+ /* stripe count */ -+ MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, -+ value, &flag); -+ if (flag && (str_factor = atoi(value))) { -+ tmp_val = str_factor; -+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); -+ if (tmp_val != str_factor) { -+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, -+ "striping_factor", -+ error_code); -+ ADIOI_Free(value); -+ return; -+ } -+ MPI_Info_set(fd->info, "striping_factor", value); -+ } -+ /* stripe offset */ -+ MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL, -+ value, &flag); -+ if (flag && ((start_iodev = atoi(value)) >= 0)) { -+ tmp_val = start_iodev; -+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); -+ if (tmp_val != start_iodev) { -+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, -+ "start_iodevice", -+ error_code); -+ ADIOI_Free(value); -+ return; -+ } -+ MPI_Info_set(fd->info, "start_iodevice", value); -+ } } -- -- MPI_Comm_rank(fd->comm, &myrank); -- if (myrank == 0) { + ++ /* set striping information with ioctl */ + MPI_Comm_rank(fd->comm, &myrank); + if (myrank == 0) { - tmp_val[0] = str_factor; - tmp_val[1] = str_unit; - tmp_val[2] = start_iodev; -+ } -+ if (users_info != MPI_INFO_NULL) { -+ /* CO: IO Clients/OST, -+ * to keep the load balancing between clients and OSTs */ -+ MPI_Info_get(users_info, "CO", MPI_MAX_INFO_VAL, value, -+ &flag); -+ if (flag && (int_val = atoi(value)) > 0) { -+ tmp_val = int_val; -+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); -+ if (tmp_val != int_val) { -+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, -+ "CO", -+ error_code); -+ ADIOI_Free(value); -+ return; -+ } -+ MPI_Info_set(fd->info, "CO", value); ++ stripe_val[0] = str_factor; ++ stripe_val[1] = str_unit; ++ stripe_val[2] = start_iodev; } - MPI_Bcast(tmp_val, 3, MPI_INT, 0, fd->comm); -- ++ MPI_Bcast(stripe_val, 3, MPI_INT, 0, fd->comm); + - if (tmp_val[0] != str_factor - || tmp_val[1] != str_unit - || tmp_val[2] != start_iodev) { -- FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: All keys" -- "-striping_factor:striping_unit:start_iodevice " -- "need to be identical across all processes\n"); -- MPI_Abort(MPI_COMM_WORLD, 1); ++ if (stripe_val[0] != str_factor ++ || stripe_val[1] != str_unit ++ || stripe_val[2] != start_iodev) { + FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: All keys" + "-striping_factor:striping_unit:start_iodevice " + "need to be identical across all processes\n"); + MPI_Abort(MPI_COMM_WORLD, 1); - } else if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0)) { -- /* if user has specified striping info, process 0 tries to set it */ -- if (!myrank) { -- if (fd->perm == ADIO_PERM_NULL) { -- old_mask = umask(022); -- umask(old_mask); -- perm = old_mask ^ 0666; -- } -- else perm = fd->perm; -- -- amode = 0; -- if (fd->access_mode & ADIO_CREATE) -- amode = amode | O_CREAT; -- if (fd->access_mode & ADIO_RDONLY) -- amode = amode | O_RDONLY; -- if (fd->access_mode & ADIO_WRONLY) -- amode = amode | O_WRONLY; -- if (fd->access_mode & ADIO_RDWR) -- amode = amode | O_RDWR; -- if (fd->access_mode & ADIO_EXCL) -- amode = amode | O_EXCL; -- -- /* we need to create file so ensure this is set */ -- amode = amode | O_LOV_DELAY_CREATE | O_CREAT; -- -- fd_sys = open(fd->filename, amode, perm); ++ } else if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0)) { + /* if user has specified striping info, process 0 tries to set it */ + if (!myrank) { + if (fd->perm == ADIO_PERM_NULL) { +@@ -100,9 +113,9 @@ + amode = amode | O_LOV_DELAY_CREATE | O_CREAT; + + fd_sys = open(fd->filename, amode, perm); - if (fd_sys == -1) { - if (errno != EEXIST) - fprintf(stderr, -- "Failure to open file %s %d %d\n",strerror(errno), amode, perm); -- } else { -- lum.lmm_magic = LOV_USER_MAGIC; -- lum.lmm_pattern = 0; -- lum.lmm_stripe_size = str_unit; -- lum.lmm_stripe_count = str_factor; -- lum.lmm_stripe_offset = start_iodev; -- -- err = ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum); ++ if (fd_sys == -1) { ++ if (errno != EEXIST) ++ fprintf(stderr, + "Failure to open file %s %d %d\n",strerror(errno), amode, perm); + } else { + lum.lmm_magic = LOV_USER_MAGIC; +@@ -112,25 +125,73 @@ + lum.lmm_stripe_offset = start_iodev; + + err = ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum); - if (err == -1 && errno != EEXIST) { -- fprintf(stderr, "Failure to set stripe info %s \n", strerror(errno)); -- } -- close(fd_sys); -- } -- } /* End of striping parameters validation */ -+ /* big_req_size: -+ * if the req size is bigger than this, -+ * collective IO may not be performed. -+ */ -+ MPI_Info_get(users_info, "big_req_size", MPI_MAX_INFO_VAL, value, -+ &flag); -+ if (flag && (int_val = atoi(value)) > 0) { -+ tmp_val = int_val; -+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); -+ if (tmp_val != int_val) { -+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, -+ "big_req_size", -+ error_code); -+ ADIOI_Free(value); -+ return; -+ } -+ MPI_Info_set(fd->info, "big_req_size", value); -+ } -+ /* ds_in_coll: disable data sieving in collective IO */ -+ MPI_Info_get(users_info, "ds_in_coll", MPI_MAX_INFO_VAL, -+ value, &flag); -+ if (flag && (!strcmp(value, "enable") || -+ !strcmp(value, "ENABLE"))) { -+ tmp_val = int_val = 1; -+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); -+ if (tmp_val != int_val) { -+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, -+ "ds_in_coll", -+ error_code); -+ ADIOI_Free(value); -+ return; -+ } -+ MPI_Info_set(fd->info, "ds_in_coll", "enable"); -+ } -+ /* contiguous_data: whether the data are contiguous */ -+ MPI_Info_get(users_info, "contiguous_data", MPI_MAX_INFO_VAL, -+ value, &flag); -+ if (flag && (!strcmp(value, "yes") || -+ !strcmp(value, "YES"))) { -+ tmp_val = int_val = 1; -+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); -+ if (tmp_val != int_val) { -+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, -+ "contiguous_data", -+ error_code); -+ ADIOI_Free(value); -+ return; -+ } -+ MPI_Info_set(fd->info, "contiguous_data", "yes"); ++ if (err == -1 && errno != EEXIST) { + fprintf(stderr, "Failure to set stripe info %s \n", strerror(errno)); + } + close(fd_sys); + } + } /* End of striping parameters validation */ } - -- MPI_Barrier(fd->comm); + MPI_Barrier(fd->comm); - /* set the values for collective I/O and data sieving parameters */ - ADIOI_GEN_SetInfo(fd, users_info, error_code); - } else { @@ -981,55 +601,73 @@ diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c - - /* set the values for collective I/O and data sieving parameters */ - ADIOI_GEN_SetInfo(fd, users_info, error_code); -+ /* same_io_size: whether the req size is same */ -+ MPI_Info_get(users_info, "same_io_size", MPI_MAX_INFO_VAL, -+ value, &flag); -+ if (flag && (!strcmp(value, "yes") || -+ !strcmp(value, "YES"))) { -+ tmp_val = int_val = 1; + } +- ++ /* get other hint */ ++ if (users_info != MPI_INFO_NULL) { ++ /* CO: IO Clients/OST, ++ * to keep the load balancing between clients and OSTs */ ++ MPI_Info_get(users_info, "romio_lustre_co_ratio", MPI_MAX_INFO_VAL, value, ++ &flag); ++ if (flag && (int_val = atoi(value)) > 0) { ++ tmp_val = int_val; + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); + if (tmp_val != int_val) { -+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, -+ "same_io_size", -+ error_code); ++ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, ++ "romio_lustre_co_ratio", ++ error_code); + ADIOI_Free(value); -+ return; ++ return; + } -+ MPI_Info_set(fd->info, "same_io_size", "yes"); ++ MPI_Info_set(fd->info, "romio_lustre_co_ratio", value); ++ fd->hints->fs_hints.lustre.co_ratio = atoi(value); + } -+ /* Remember the current cb_nodes that the user set. -+ * It would be used to improve collective I/O. ++ /* coll_threshold: ++ * if the req size is bigger than this, collective IO may not be performed. + */ -+ MPI_Info_get(users_info, "cb_nodes", MPI_MAX_INFO_VAL, value, &flag); ++ MPI_Info_get(users_info, "romio_lustre_coll_threshold", MPI_MAX_INFO_VAL, value, ++ &flag); + if (flag && (int_val = atoi(value)) > 0) { + tmp_val = int_val; + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); + if (tmp_val != int_val) { + MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, -+ "big_req_size", ++ "romio_lustre_coll_threshold", + error_code); + ADIOI_Free(value); + return; + } -+ MPI_Info_set(fd->info, "user_cb_nodes", value); ++ MPI_Info_set(fd->info, "romio_lustre_coll_threshold", value); ++ fd->hints->fs_hints.lustre.coll_threshold = atoi(value); + } - } -- -- if (ADIOI_Direct_read) fd->direct_read = 1; -- if (ADIOI_Direct_write) fd->direct_write = 1; -- - ADIOI_Free(value); ++ /* ds_in_coll: disable data sieving in collective IO */ ++ MPI_Info_get(users_info, "romio_lustre_ds_in_coll", MPI_MAX_INFO_VAL, ++ value, &flag); ++ if (flag && (!strcmp(value, "disable") || ++ !strcmp(value, "DISABLE"))) { ++ tmp_val = int_val = 2; ++ MPI_Bcast(&tmp_val, 2, MPI_INT, 0, fd->comm); ++ if (tmp_val != int_val) { ++ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, ++ "romio_lustre_ds_in_coll", ++ error_code); ++ ADIOI_Free(value); ++ return; ++ } ++ MPI_Info_set(fd->info, "romio_lustre_ds_in_coll", "disable"); ++ fd->hints->fs_hints.lustre.ds_in_coll = ADIOI_HINT_DISABLE; ++ } ++ } + /* set the values for collective I/O and data sieving parameters */ + ADIOI_GEN_SetInfo(fd, users_info, error_code); ++ + if (ADIOI_Direct_read) fd->direct_read = 1; + if (ADIOI_Direct_write) fd->direct_write = 1; -- *error_code = MPI_SUCCESS; -+ if (ADIOI_Direct_read) fd->direct_read = 1; -+ if (ADIOI_Direct_write) fd->direct_write = 1; - } -diff -ruN ad_lustre_orig/ad_lustre_open.c ad_lustre/ad_lustre_open.c ---- ad_lustre_orig/ad_lustre_open.c 2008-09-17 14:36:57.000000000 +0800 -+++ ad_lustre/ad_lustre_open.c 2008-09-17 18:55:50.000000000 +0800 -@@ -1,18 +1,21 @@ +diff -ruN adio/ad_lustre_orig/ad_lustre_open.c adio/ad_lustre/ad_lustre_open.c +--- adio/ad_lustre_orig/ad_lustre_open.c 2008-09-17 14:36:56.000000000 +0800 ++++ adio/ad_lustre/ad_lustre_open.c 2009-03-01 11:32:32.000000000 +0800 +@@ -1,9 +1,11 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* - * Copyright (C) 1997 University of Chicago. @@ -1039,220 +677,32 @@ diff -ruN ad_lustre_orig/ad_lustre_open.c ad_lustre/ad_lustre_open.c * * Copyright (C) 2007 Oak Ridge National Laboratory + * -+ * Copyright (C) 2008 Sun Microsystems, Lustre group ++ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. */ #include "ad_lustre.h" +@@ -51,14 +53,17 @@ + err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum); - void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code) - { -- int perm, old_mask, amode, amode_direct; -+ int perm, old_mask, amode = 0, amode_direct = 0, flag = 0, err, myrank; -+ int stripe_size = 0, stripe_count = 0, stripe_offset = -1; - struct lov_user_md lum = { 0 }; -- char *value; -+ char *value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); - - #if defined(MPICH2) || !defined(PRINT_ERR_MSG) - static char myname[] = "ADIOI_LUSTRE_OPEN"; -@@ -22,12 +25,57 @@ - old_mask = umask(022); - umask(old_mask); - perm = old_mask ^ 0666; -- } -- else perm = fd->perm; -+ } else -+ perm = fd->perm; + if (!err) { ++ fd->hints->striping_unit = lum.lmm_stripe_size; + sprintf(value, "%d", lum.lmm_stripe_size); + MPI_Info_set(fd->info, "striping_unit", value); -- amode = 0; -- if (fd->access_mode & ADIO_CREATE) -+ if (fd->access_mode & ADIO_CREATE) { - amode = amode | O_CREAT; -+ /* Check striping info -+ * if already set by SetInfo(), set them to lum; otherwise, set by lum -+ */ -+ MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, -+ &flag); -+ if (flag) -+ stripe_size = atoi(value); -+ -+ MPI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, value, -+ &flag); -+ if (flag) -+ stripe_count = atoi(value); -+ -+ MPI_Info_get(fd->info, "start_iodevice", MPI_MAX_INFO_VAL, value, -+ &flag); -+ if (flag) -+ stripe_offset = atoi(value); -+ -+ /* if user has specified striping info, -+ * process 0 will try to check and set it. -+ */ -+ if ((stripe_size > 0) || (stripe_count > 0) || (stripe_offset >= 0)) { -+ MPI_Comm_rank(fd->comm, &myrank); -+ if (myrank == 0) { -+ int fd_sys = open(fd->filename, amode, perm); -+ if (fd_sys == -1) { -+ if (errno != EEXIST) -+ FPRINTF(stderr, "Failure to open file %s %d %d\n", -+ strerror(errno), amode, perm); -+ } else { -+ lum.lmm_magic = LOV_USER_MAGIC; -+ lum.lmm_pattern = 1; -+ lum.lmm_stripe_size = stripe_size; -+ lum.lmm_stripe_count = stripe_count; -+ lum.lmm_stripe_offset = stripe_offset; -+ -+ if (ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum)) -+ FPRINTF(stderr, -+ "Failure to set striping info to Lustre!\n"); -+ close(fd_sys); -+ } -+ } -+ MPI_Barrier(fd->comm); -+ } -+ } -+ - if (fd->access_mode & ADIO_RDONLY) - amode = amode | O_RDONLY; - if (fd->access_mode & ADIO_WRONLY) -@@ -42,32 +90,36 @@ - fd->fd_sys = open(fd->filename, amode|O_CREAT, perm); ++ fd->hints->striping_factor = lum.lmm_stripe_count; + sprintf(value, "%d", lum.lmm_stripe_count); + MPI_Info_set(fd->info, "striping_factor", value); - if (fd->fd_sys != -1) { -- int err; -- -- value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); -- - /* get file striping information and set it in info */ -- lum.lmm_magic = LOV_USER_MAGIC; -- err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum); -- -- if (!err) { -- sprintf(value, "%d", lum.lmm_stripe_size); -- MPI_Info_set(fd->info, "striping_unit", value); -- -- sprintf(value, "%d", lum.lmm_stripe_count); -- MPI_Info_set(fd->info, "striping_factor", value); -- -- sprintf(value, "%d", lum.lmm_stripe_offset); ++ fd->hints->fs_hints.lustre.start_iodevice = lum.lmm_stripe_offset; + sprintf(value, "%d", lum.lmm_stripe_offset); - MPI_Info_set(fd->info, "start_iodevice", value); -- } -- ADIOI_Free(value); -+ lum.lmm_magic = LOV_USER_MAGIC; -+ err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum); - -+ if (!err) { -+ if (lum.lmm_stripe_size && lum.lmm_stripe_count && -+ (lum.lmm_stripe_offset >= 0)) { -+ sprintf(value, "%d", lum.lmm_stripe_size); -+ MPI_Info_set(fd->info, "striping_unit", value); -+ -+ sprintf(value, "%d", lum.lmm_stripe_count); -+ MPI_Info_set(fd->info, "striping_factor", value); -+ -+ sprintf(value, "%d", lum.lmm_stripe_offset); -+ MPI_Info_set(fd->info, "start_iodevice", value); -+ } else { -+ FPRINTF(stderr, "Striping info is invalid!\n"); -+ ADIOI_Free(value); -+ MPI_Abort(MPI_COMM_WORLD, 1); -+ } -+ } else { -+ FPRINTF(stderr, "Failed to get striping info from Lustre!\n"); -+ ADIOI_Free(value); -+ MPI_Abort(MPI_COMM_WORLD, 1); -+ } - if (fd->access_mode & ADIO_APPEND) - fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END); -- } -- -+ } - if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND)) -- fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END); -+ fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END); ++ MPI_Info_set(fd->info, "romio_lustre_start_iodevice", value); + } + ADIOI_Free(value); - fd->fd_direct = -1; - if (fd->direct_write || fd->direct_read) { -@@ -81,20 +133,22 @@ - } - - /* --BEGIN ERROR HANDLING-- */ -- if (fd->fd_sys == -1 || ((fd->fd_direct == -1) && -- (fd->direct_write || fd->direct_read))) { -+ if (fd->fd_sys == -1 || ((fd->fd_direct == -1) && -+ (fd->direct_write || fd->direct_read))) { - if (errno == ENAMETOOLONG) - *error_code = MPIO_Err_create_code(MPI_SUCCESS, -- MPIR_ERR_RECOVERABLE, myname, -- __LINE__, MPI_ERR_BAD_FILE, -+ MPIR_ERR_RECOVERABLE, -+ myname, __LINE__, -+ MPI_ERR_BAD_FILE, - "**filenamelong", - "**filenamelong %s %d", - fd->filename, - strlen(fd->filename)); - else if (errno == ENOENT) - *error_code = MPIO_Err_create_code(MPI_SUCCESS, -- MPIR_ERR_RECOVERABLE, myname, -- __LINE__, MPI_ERR_NO_SUCH_FILE, -+ MPIR_ERR_RECOVERABLE, -+ myname, __LINE__, -+ MPI_ERR_NO_SUCH_FILE, - "**filenoexist", - "**filenoexist %s", - fd->filename); -@@ -108,27 +162,30 @@ - fd->filename); - else if (errno == EACCES) { - *error_code = MPIO_Err_create_code(MPI_SUCCESS, -- MPIR_ERR_RECOVERABLE, myname, -- __LINE__, MPI_ERR_ACCESS, -+ MPIR_ERR_RECOVERABLE, -+ myname, __LINE__, -+ MPI_ERR_ACCESS, - "**fileaccess", -- "**fileaccess %s", -- fd->filename ); -- } -- else if (errno == EROFS) { -+ "**fileaccess %s", -+ fd->filename); -+ } else if (errno == EROFS) { - /* Read only file or file system and write access requested */ - *error_code = MPIO_Err_create_code(MPI_SUCCESS, -- MPIR_ERR_RECOVERABLE, myname, -- __LINE__, MPI_ERR_READ_ONLY, -- "**ioneedrd", 0 ); -- } -- else { -+ MPIR_ERR_RECOVERABLE, -+ myname, __LINE__, -+ MPI_ERR_READ_ONLY, -+ "**ioneedrd", 0); -+ } else { - *error_code = MPIO_Err_create_code(MPI_SUCCESS, -- MPIR_ERR_RECOVERABLE, myname, -- __LINE__, MPI_ERR_IO, "**io", -+ MPIR_ERR_RECOVERABLE, -+ myname, __LINE__, -+ MPI_ERR_IO, "**io", - "**io %s", strerror(errno)); - } -- } -+ } else { - /* --END ERROR HANDLING-- */ -- else *error_code = MPI_SUCCESS; -+ *error_code = MPI_SUCCESS; -+ } - -+ ADIOI_Free(value); - } -diff -ruN ad_lustre_orig/ad_lustre_rwcontig.c ad_lustre/ad_lustre_rwcontig.c ---- ad_lustre_orig/ad_lustre_rwcontig.c 2008-09-17 14:36:57.000000000 +0800 -+++ ad_lustre/ad_lustre_rwcontig.c 2008-10-15 22:44:35.000000000 +0800 +diff -ruN adio/ad_lustre_orig/ad_lustre_rwcontig.c adio/ad_lustre/ad_lustre_rwcontig.c +--- adio/ad_lustre_orig/ad_lustre_rwcontig.c 2008-09-17 14:36:56.000000000 +0800 ++++ adio/ad_lustre/ad_lustre_rwcontig.c 2009-05-05 15:34:29.000000000 +0800 @@ -1,9 +1,11 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* @@ -1263,14 +713,40 @@ diff -ruN ad_lustre_orig/ad_lustre_rwcontig.c ad_lustre/ad_lustre_rwcontig.c * * Copyright (C) 2007 Oak Ridge National Laboratory + * -+ * Copyright (C) 2008 Sun Microsystems, Lustre group ++ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. */ #define _XOPEN_SOURCE 600 -diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c ---- ad_lustre_orig/ad_lustre_wrcoll.c 1970-01-01 08:00:00.000000000 +0800 -+++ ad_lustre/ad_lustre_wrcoll.c 2008-10-15 22:02:53.000000000 +0800 -@@ -0,0 +1,883 @@ +@@ -136,10 +138,23 @@ + if (err == -1) goto ioerr; + } + +- if (io_mode) ++ if (io_mode) { ++#ifdef ADIOI_MPE_LOGGING ++ MPE_Log_event(ADIOI_MPE_write_a, 0, NULL); ++#endif + err = write(fd->fd_sys, buf, len); +- else ++#ifdef ADIOI_MPE_LOGGING ++ MPE_Log_event(ADIOI_MPE_write_b, 0, NULL); ++#endif ++ } else { ++#ifdef ADIOI_MPE_LOGGING ++ MPE_Log_event(ADIOI_MPE_read_a, 0, NULL); ++#endif + err = read(fd->fd_sys, buf, len); ++#ifdef ADIOI_MPE_LOGGING ++ MPE_Log_event(ADIOI_MPE_read_b, 0, NULL); ++#endif ++ } + } else { + err = ADIOI_LUSTRE_Directio(fd, buf, len, offset, io_mode); + } +diff -ruN adio/ad_lustre_orig/ad_lustre_wrcoll.c adio/ad_lustre/ad_lustre_wrcoll.c +--- adio/ad_lustre_orig/ad_lustre_wrcoll.c 1970-01-01 08:00:00.000000000 +0800 ++++ adio/ad_lustre/ad_lustre_wrcoll.c 2009-04-24 14:48:34.000000000 +0800 +@@ -0,0 +1,934 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (C) 1997 University of Chicago. @@ -1278,7 +754,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + * + * Copyright (C) 2007 Oak Ridge National Laboratory + * -+ * Copyright (C) 2008 Sun Microsystems, Lustre group ++ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include "ad_lustre.h" @@ -1293,25 +769,25 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + ADIO_Offset *offset_list, + int *len_list, + int contig_access_count, -+ int * striping_info, ++ int *striping_info, + int *buf_idx, int *error_code); +static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf, -+ ADIOI_Flatlist_node * flat_buf, ++ ADIOI_Flatlist_node *flat_buf, + char **send_buf, -+ ADIO_Offset * offset_list, ++ ADIO_Offset *offset_list, + int *len_list, int *send_size, -+ MPI_Request * requests, ++ MPI_Request *requests, + int *sent_to_proc, int nprocs, + int myrank, int contig_access_count, -+ int * striping_info, ++ int *striping_info, + int *send_buf_idx, + int *curr_to_proc, + int *done_to_proc, int iter, + MPI_Aint buftype_extent); +static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf, + char *write_buf, -+ ADIOI_Flatlist_node * flat_buf, -+ ADIO_Offset * offset_list, ++ ADIOI_Flatlist_node *flat_buf, ++ ADIO_Offset *offset_list, + int *len_list, int *send_size, + int *recv_size, ADIO_Offset off, + int size, int *count, @@ -1319,22 +795,29 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + int *sent_to_proc, int nprocs, + int myrank, int buftype_is_contig, + int contig_access_count, -+ int * striping_info, -+ ADIOI_Access * others_req, ++ int *striping_info, ++ ADIOI_Access *others_req, + int *send_buf_idx, + int *curr_to_proc, + int *done_to_proc, int *hole, + int iter, MPI_Aint buftype_extent, + int *buf_idx, int *error_code); -+void ADIOI_Heap_merge(ADIOI_Access * others_req, int *count, -+ ADIO_Offset * srt_off, int *srt_len, int *start_pos, ++void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count, ++ ADIO_Offset *srt_off, int *srt_len, int *start_pos, + int nprocs, int nprocs_recv, int total_elements); + +void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count, + MPI_Datatype datatype, + int file_ptr_type, ADIO_Offset offset, -+ ADIO_Status * status, int *error_code) ++ ADIO_Status *status, int *error_code) +{ ++ /* Uses a generalized version of the extended two-phase method described ++ * in "An Extended Two-Phase Method for Accessing Sections of ++ * Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary, ++ * Scientific Programming, (5)4:301--317, Winter 1996. ++ * http://www.mcs.anl.gov/home/thakur/ext2ph.ps ++ */ ++ + ADIOI_Access *my_req; + /* array of nprocs access structures, one for each other process has + this process's request */ @@ -1346,7 +829,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + int i, filetype_is_contig, nprocs, myrank, do_collect = 0; + int contig_access_count = 0, buftype_is_contig, interleave_count = 0; + int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs; -+ ADIO_Offset orig_fp, start_offset, end_offset, off, min_st_offset; ++ ADIO_Offset orig_fp, start_offset, end_offset, off; + ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *end_offsets = NULL; + int *buf_idx = NULL, *len_list = NULL, *striping_info = NULL; + int old_error, tmp_error; @@ -1360,13 +843,19 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + if (fd->hints->cb_write != ADIOI_HINT_DISABLE) { + /* For this process's request, calculate the list of offsets and + lengths in the file and determine the start and end offsets. */ ++ ++ /* Note: end_offset points to the last byte-offset that will be accessed. ++ * e.g., if start_offset=0 and 100 bytes to be read, end_offset=99 ++ */ ++ + ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset, -+ &offset_list, &len_list, &start_offset, -+ &end_offset, &contig_access_count); ++ &offset_list, &len_list, &start_offset, ++ &end_offset, &contig_access_count); + + /* each process communicates its start and end offsets to other -+ processes. The result is an array each of start and end offsets stored -+ in order of process rank. */ ++ * processes. The result is an array each of start and end offsets ++ * stored in order of process rank. ++ */ + st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset)); + end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset)); + MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1, @@ -1427,22 +916,26 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + } + + /* Get Lustre hints information */ -+ ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1, nprocs, -+ st_offsets, end_offsets, -+ &min_st_offset); ++ ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1); ++ + /* calculate what portions of the access requests of this process are + * located in which process + */ + ADIOI_LUSTRE_Calc_my_req(fd, offset_list, len_list, contig_access_count, + striping_info, nprocs, &count_my_req_procs, + &count_my_req_per_proc, &my_req, &buf_idx); -+ /* calculate what process's requests will be written by this process */ -+ ADIOI_LUSTRE_Calc_others_req(fd, count_my_req_procs, -+ count_my_req_per_proc, -+ my_req, nprocs, myrank, -+ end_offset - start_offset + 1, -+ min_st_offset, striping_info, -+ &count_others_req_procs, &others_req); ++ ++ /* based on everyone's my_req, calculate what requests of other processes ++ * will be accessed by this process. ++ * count_others_req_procs = number of processes whose requests (including ++ * this process itself) will be accessed by this process ++ * count_others_req_per_proc[i] indicates how many separate contiguous ++ * requests of proc. i will be accessed by this process. ++ */ ++ ++ ADIOI_Calc_others_req(fd, count_my_req_procs, count_my_req_per_proc, ++ my_req, nprocs, myrank, &count_others_req_procs, ++ &others_req); + ADIOI_Free(count_my_req_per_proc); + + /* exchange data and write in sizes of no more than stripe_size. */ @@ -1451,6 +944,17 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + offset_list, len_list, contig_access_count, + striping_info, buf_idx, error_code); + ++ /* If this collective write is followed by an independent write, ++ * it's possible to have those subsequent writes on other processes ++ * race ahead and sneak in before the read-modify-write completes. ++ * We carry out a collective communication at the end here so no one ++ * can start independent i/o before collective I/O completes. ++ * ++ * need to do some gymnastics with the error codes so that if something ++ * went wrong, all processes report error, but if a process has a more ++ * specific error code, we can still have that process report the ++ * additional information */ ++ + old_error = *error_code; + if (*error_code != MPI_SUCCESS) + *error_code = MPI_ERR_IO; @@ -1520,6 +1024,9 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + fd->fp_sys_posn = -1; /* set it to null. */ +} + ++/* If successful, error_code is set to MPI_SUCCESS. Otherwise an error ++ * code is created and returned in error_code. ++ */ +static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf, + MPI_Datatype datatype, int nprocs, + int myrank, ADIOI_Access *others_req, @@ -1529,6 +1036,16 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + int *striping_info, int *buf_idx, + int *error_code) +{ ++ /* Send data to appropriate processes and write in sizes of no more ++ * than lustre stripe_size. ++ * The idea is to reduce the amount of extra memory required for ++ * collective I/O. If all data were written all at once, which is much ++ * easier, it would require temp space more than the size of user_buf, ++ * which is often unacceptable. For example, to write a distributed ++ * array to a file, where each local array is 8Mbytes, requiring ++ * at least another 8Mbytes of temp space is unacceptable. ++ */ ++ + int hole, i, j, m, flag, ntimes = 1 , max_ntimes, buftype_is_contig; + ADIO_Offset st_loc = -1, end_loc = -1, min_st_loc, max_end_loc; + ADIO_Offset off, req_off, send_off, iter_st_off, *off_list; @@ -1538,14 +1055,15 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + int *send_curr_offlen_ptr, *send_size; + int *partial_recv, *sent_to_proc, *recv_start_pos; + int *send_buf_idx, *curr_to_proc, *done_to_proc; -+ char *write_buf = NULL, *value; ++ char *write_buf = NULL; + MPI_Status status; + ADIOI_Flatlist_node *flat_buf = NULL; + MPI_Aint buftype_extent; + int stripe_size = striping_info[0], avail_cb_nodes = striping_info[2]; -+ int lflag, data_sieving = 0; ++ int data_sieving = 0; + + *error_code = MPI_SUCCESS; /* changed below if error */ ++ /* only I/O errors are currently reported */ + + /* calculate the number of writes of stripe size to be done. + * That gives the no. of communication phases as well. @@ -1636,6 +1154,16 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + flat_buf = flat_buf->next; + } + MPI_Type_extent(datatype, &buftype_extent); ++ /* I need to check if there are any outstanding nonblocking writes to ++ * the file, which could potentially interfere with the writes taking ++ * place in this collective write call. Since this is not likely to be ++ * common, let me do the simplest thing possible here: Each process ++ * completes all pending nonblocking operations before completing. ++ */ ++ /*ADIOI_Complete_async(error_code); ++ if (*error_code != MPI_SUCCESS) return; ++ MPI_Barrier(fd->comm); ++ */ + + iter_st_off = min_st_loc; + @@ -1645,15 +1173,11 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + * then rank0 will collect data [0, 30] and [60, 90] then write. There + * is a hole in [30, 60], which will cause a read-modify-write in [0, 90]. + * -+ * To reduce its impact on the performance, we disable data sieving -+ * by default, unless the hint "ds_in_coll" is enabled. ++ * To reduce its impact on the performance, we can disable data sieving ++ * by hint "ds_in_coll". + */ + /* check the hint for data sieving */ -+ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); -+ MPI_Info_get(fd->info, "ds_in_coll", MPI_MAX_INFO_VAL, value, &lflag); -+ if (lflag && !strcmp(value, "enable")) -+ data_sieving = 1; -+ ADIOI_Free(value); ++ data_sieving = fd->hints->fs_hints.lustre.ds_in_coll; + + for (m = 0; m < max_ntimes; m++) { + /* go through all others_req and my_req to check which will be received @@ -1738,7 +1262,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + } + if (flag) { + /* check whether to do data sieving */ -+ if(data_sieving) { ++ if(data_sieving == ADIOI_HINT_ENABLE) { + ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE, + ADIO_EXPLICIT_OFFSET, off, &status, + error_code); @@ -1790,10 +1314,13 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + ADIOI_Free(off_list); +} + ++/* Sets error_code to MPI_SUCCESS if successful, or creates an error code ++ * in the case of error. ++ */ +static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf, + char *write_buf, -+ ADIOI_Flatlist_node * flat_buf, -+ ADIO_Offset * offset_list, ++ ADIOI_Flatlist_node *flat_buf, ++ ADIO_Offset *offset_list, + int *len_list, int *send_size, + int *recv_size, ADIO_Offset off, + int size, int *count, @@ -1801,8 +1328,8 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + int *sent_to_proc, int nprocs, + int myrank, int buftype_is_contig, + int contig_access_count, -+ int * striping_info, -+ ADIOI_Access * others_req, ++ int *striping_info, ++ ADIOI_Access *others_req, + int *send_buf_idx, + int *curr_to_proc, int *done_to_proc, + int *hole, int iter, @@ -1879,7 +1406,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + *hole = 1; + } + /* check the hint for data sieving */ -+ if (data_sieving && nprocs_recv && *hole) { ++ if (data_sieving == ADIOI_HINT_ENABLE && nprocs_recv && *hole) { + ADIO_ReadContig(fd, write_buf, size, MPI_BYTE, + ADIO_EXPLICIT_OFFSET, off, &status, &err); + // --BEGIN ERROR HANDLING-- @@ -2057,15 +1584,15 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c +} + +static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf, -+ ADIOI_Flatlist_node * flat_buf, ++ ADIOI_Flatlist_node *flat_buf, + char **send_buf, -+ ADIO_Offset * offset_list, ++ ADIO_Offset *offset_list, + int *len_list, int *send_size, -+ MPI_Request * requests, ++ MPI_Request *requests, + int *sent_to_proc, int nprocs, + int myrank, + int contig_access_count, -+ int * striping_info, ++ int *striping_info, + int *send_buf_idx, + int *curr_to_proc, + int *done_to_proc, int iter, @@ -2154,10 +1681,10 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + if (send_size[i]) + sent_to_proc[i] = curr_to_proc[i]; +} -diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c ---- ad_lustre_orig/ad_lustre_wrstr.c 1970-01-01 08:00:00.000000000 +0800 -+++ ad_lustre/ad_lustre_wrstr.c 2008-10-13 15:34:53.000000000 +0800 -@@ -0,0 +1,472 @@ +diff -ruN adio/ad_lustre_orig/ad_lustre_wrstr.c adio/ad_lustre/ad_lustre_wrstr.c +--- adio/ad_lustre_orig/ad_lustre_wrstr.c 1970-01-01 08:00:00.000000000 +0800 ++++ adio/ad_lustre/ad_lustre_wrstr.c 2009-02-27 10:35:18.000000000 +0800 +@@ -0,0 +1,467 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (C) 1997 University of Chicago. @@ -2165,7 +1692,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c + * + * Copyright (C) 2007 Oak Ridge National Laboratory + * -+ * Copyright (C) 2008 Sun Microsystems, Lustre group ++ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include "ad_lustre.h" @@ -2302,8 +1829,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c + int flag, st_fwr_size, st_n_filetypes, writebuf_len, write_sz; + ADIO_Status status1; + int new_bwr_size, new_fwr_size; -+ char * value; -+ int stripe_size, lflag = 0; ++ int stripe_size; + static char myname[] = "ADIOI_LUSTRE_WriteStrided"; + int myrank; + MPI_Comm_rank(fd->comm, &myrank); @@ -2340,11 +1866,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c + bufsize = buftype_size * count; + + /* get striping info */ -+ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); -+ MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, &lflag); -+ if (lflag) -+ stripe_size = atoi(value); -+ ADIOI_Free(value); ++ stripe_size = fd->hints->striping_unit; + + /* Different buftype to different filetype */ + if (!buftype_is_contig && filetype_is_contig) { @@ -2630,9 +2152,9 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c + if (!buftype_is_contig) + ADIOI_Delete_flattened(datatype); +} -diff -ruN ad_lustre_orig/Makefile.in ad_lustre/Makefile.in ---- ad_lustre_orig/Makefile.in 2008-09-17 14:36:57.000000000 +0800 -+++ ad_lustre/Makefile.in 2008-09-17 18:20:35.000000000 +0800 +diff -ruN adio/ad_lustre_orig/Makefile.in adio/ad_lustre/Makefile.in +--- adio/ad_lustre_orig/Makefile.in 2008-09-17 14:36:56.000000000 +0800 ++++ adio/ad_lustre/Makefile.in 2008-10-17 17:03:06.000000000 +0800 @@ -16,7 +16,9 @@ @VPATH@ @@ -2644,10 +2166,10 @@ diff -ruN ad_lustre_orig/Makefile.in ad_lustre/Makefile.in default: $(LIBNAME) @if [ "@ENABLE_SHLIB@" != "none" ] ; then \ -diff -ruN ad_lustre_orig/README ad_lustre/README ---- ad_lustre_orig/README 2008-09-17 14:36:57.000000000 +0800 -+++ ad_lustre/README 2008-10-15 22:43:07.000000000 +0800 -@@ -5,6 +5,25 @@ +diff -ruN adio/ad_lustre_orig/README adio/ad_lustre/README +--- adio/ad_lustre_orig/README 2008-09-17 14:36:56.000000000 +0800 ++++ adio/ad_lustre/README 2009-04-24 09:46:20.000000000 +0800 +@@ -5,6 +5,21 @@ o To post the code for ParColl (Partitioned collective IO) ----------------------------------------------------- @@ -2656,25 +2178,21 @@ diff -ruN ad_lustre_orig/README ad_lustre/README +Improved data redistribution + o Improve I/O pattern identification. Besides checking interleaving, + if request I/O size is small, collective I/O will be performed. -+ The hint big_req_size can be used to define the req size value. ++ The hint bigsize can be used to define the req size value. + o Provide hint CO for load balancing to control the number of + IO clients for each OST + o Produce stripe-contiguous I/O pattern that Lustre prefers -+ o Reduce the collective overhead by hints contiguous_data and -+ same_io_size to remove unnecessary MPI_Alltoall() + o Control read-modify-write in data sieving in collective IO + by hint ds_in_coll. -+ o Optimize the IO pattern. -+ - If the whole access size <= stripe size, we suggest all the -+ IO data will be performed by the same client, to avoid the -+ extent lock revoking and reassignment. ++ o Reduce extent lock conflicts by make each OST accessed by one or ++ more constant clients. + +----------------------------------------------------- V04: ----------------------------------------------------- o Direct IO and Lockless IO support ---- common/ad_write_coll_orig.c 2008-10-15 11:24:31.000000000 +0800 -+++ common/ad_write_coll.c 2008-10-15 11:25:39.000000000 +0800 +--- adio/common/ad_write_coll_orig.c 2009-02-27 22:06:46.000000000 +0800 ++++ adio/common/ad_write_coll.c 2008-10-15 11:25:38.000000000 +0800 @@ -42,7 +42,7 @@ int *send_buf_idx, int *curr_to_proc, int *done_to_proc, int iter,