X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fcontrib%2Fadio_driver_mpich2-1.0.7.patch;h=9bb012630e1a67416d0dceca4223a88a6616ac3a;hb=cec72a356891eaa729314a7bc89c4b2aaef0a31b;hp=eb919cbe8108ef6b7b1b1889b0e6a3058baabdc3;hpb=25d4f73bfbbd6191f35f5ad9c7ed776ee9adf731;p=fs%2Flustre-release.git diff --git a/lustre/contrib/adio_driver_mpich2-1.0.7.patch b/lustre/contrib/adio_driver_mpich2-1.0.7.patch index eb919cb..9bb0126 100644 --- a/lustre/contrib/adio_driver_mpich2-1.0.7.patch +++ b/lustre/contrib/adio_driver_mpich2-1.0.7.patch @@ -1,7 +1,40 @@ -diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c ---- ad_lustre_orig/ad_lustre_aggregate.c 1970-01-01 08:00:00.000000000 +0800 -+++ ad_lustre/ad_lustre_aggregate.c 2008-09-17 18:20:35.000000000 +0800 -@@ -0,0 +1,676 @@ +--- configure_orig.in 2009-03-01 13:50:30.000000000 +0800 ++++ configure.in 2009-02-27 13:35:42.000000000 +0800 +@@ -1123,8 +1123,14 @@ + if test -n "$file_system_testfs"; then + AC_DEFINE(ROMIO_TESTFS,1,[Define for ROMIO with TESTFS]) + fi ++# ++# Verify presence of lustre/lustre_user.h ++# + if test -n "$file_system_lustre"; then +- AC_DEFINE(ROMIO_LUSTRE,1,[Define for ROMIO with LUSTRE]) ++ AC_CHECK_HEADERS(lustre/lustre_user.h, ++ AC_DEFINE(ROMIO_LUSTRE,1,[Define for ROMIO with LUSTRE]), ++ AC_MSG_ERROR([LUSTRE support requested but cannot find lustre/lustre_user.h header file]) ++ ) + fi + + if test -n "$file_system_xfs"; then +--- adio/include/adioi_orig.h 2009-03-01 14:00:48.000000000 +0800 ++++ adio/include/adioi.h 2009-04-24 15:26:44.000000000 +0800 +@@ -52,6 +52,12 @@ + struct { + int debugmask; + } pvfs2; ++ struct { ++ int start_iodevice; ++ int co_ratio; ++ int coll_threshold; ++ int ds_in_coll; ++ } lustre; + } fs_hints; + + }; +diff -ruN adio/ad_lustre_orig/ad_lustre_aggregate.c adio/ad_lustre/ad_lustre_aggregate.c +--- adio/ad_lustre_orig/ad_lustre_aggregate.c 1970-01-01 08:00:00.000000000 +0800 ++++ adio/ad_lustre/ad_lustre_aggregate.c 2009-05-05 15:22:40.000000000 +0800 +@@ -0,0 +1,304 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (C) 1997 University of Chicago. @@ -9,85 +42,102 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + * + * Copyright (C) 2007 Oak Ridge National Laboratory + * -+ * Copyright (C) 2008 Sun Microsystems, Lustre group ++ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include "ad_lustre.h" +#include "adio_extern.h" + -+void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int ** striping_info_ptr, -+ int mode, int nprocs) ++#undef AGG_DEBUG ++ ++void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int **striping_info_ptr, ++ int mode) +{ + int *striping_info = NULL; + /* get striping information: -+ * striping_info[0] = stripe_size; -+ * striping_info[1] = stripe_count; -+ * striping_info[2] = CO; ++ * striping_info[0]: stripe_size ++ * striping_info[1]: stripe_count ++ * striping_info[2]: avail_cb_nodes + */ -+ /* for easy understanding, we name some variables */ -+ int stripe_size, stripe_count, CO = 1, CO_max = 1, lflag; ++ int stripe_size, stripe_count, CO = 1, CO_max = 1, CO_nodes, lflag; ++ int avail_cb_nodes, divisor, nprocs_for_coll = fd->hints->cb_nodes; + char *value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); ++ ++ /* Get hints value */ + /* stripe size */ -+ MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, &lflag); -+ if (lflag) -+ stripe_size = atoi(value); ++ stripe_size = fd->hints->striping_unit; + /* stripe count */ -+ MPI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, value, &lflag); -+ if (lflag) -+ stripe_count = atoi(value); + /* stripe_size and stripe_count have been validated in ADIOI_LUSTRE_Open() */ ++ stripe_count = fd->hints->striping_factor; + -+ /* calculate CO */ ++ /* Calculate the available number of I/O clients, that is ++ * avail_cb_nodes=min(cb_nodes, stripe_count*CO), where ++ * CO=1 by default ++ */ + if (!mode) { -+ /* for collective read, ++ /* for collective read, + * if "CO" clients access the same OST simultaneously, -+ * the OST disk seek time would be large. So, to avoid this, ++ * the OST disk seek time would be much. So, to avoid this, + * it might be better if 1 client only accesses 1 OST. + * So, we set CO = 1 to meet the above requirement. + */ + CO = 1; + /*XXX: maybe there are other better way for collective read */ + } else { -+ /* CO_max: the largest number of IO clients for each ost group */ -+ CO_max = (nprocs - 1)/ stripe_count + 1; ++ /* CO_max: the largest number of IO clients for each ost group */ ++ CO_max = (nprocs_for_coll - 1)/ stripe_count + 1; + /* CO also has been validated in ADIOI_LUSTRE_Open(), >0 */ -+ MPI_Info_get(fd->info, "CO", MPI_MAX_INFO_VAL, value, &lflag); -+ if (lflag) -+ CO = atoi(value); ++ CO = fd->hints->fs_hints.lustre.co_ratio; + CO = ADIOI_MIN(CO_max, CO); + } -+ ADIOI_Free(value); -+ /* although there are known "N" hints so far, we still malloc space here -+ * instead of declaring an array[3] outside, -+ * because on one hand in the future we probably need more hints, and -+ * on the other hand this function can be called by -+ * both collective read and write conveniently. -+ */ ++ /* Calculate how many IO clients we need */ ++ /* To avoid extent lock conflicts, ++ * avail_cb_nodes should divide (stripe_count*CO) exactly, ++ * so that each OST is accessed by only one or more constant clients. */ ++ CO_nodes = stripe_count * CO; ++ avail_cb_nodes = ADIOI_MIN(nprocs_for_coll, CO_nodes); ++ if (avail_cb_nodes < CO_nodes) { ++ do { ++ /* find the divisor of CO_nodes */ ++ divisor = 1; ++ do { ++ divisor ++; ++ } while (CO_nodes % divisor); ++ CO_nodes = CO_nodes / divisor; ++ /* if stripe_count*CO is a prime number, change nothing */ ++ if ((CO_nodes <= avail_cb_nodes) && (CO_nodes != 1)) { ++ avail_cb_nodes = CO_nodes; ++ break; ++ } ++ } while (CO_nodes != 1); ++ } ++ + *striping_info_ptr = (int *) ADIOI_Malloc(3 * sizeof(int)); + striping_info = *striping_info_ptr; + striping_info[0] = stripe_size; + striping_info[1] = stripe_count; -+ striping_info[2] = CO; ++ striping_info[2] = avail_cb_nodes; ++ ++ ADIOI_Free(value); +} + +int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off, -+ ADIO_Offset *len, int nprocs, -+ int *striping_info) ++ ADIO_Offset *len, int *striping_info) +{ -+ /* please refer the comments in above function for the detailed algorithm */ -+ int rank_index; ++ int rank_index, rank; + ADIO_Offset avail_bytes; -+ + int stripe_size = striping_info[0]; -+ int stripe_count = striping_info[1]; -+ int CO = striping_info[2]; -+ int avail_nprocs = ADIOI_MIN(stripe_count * CO, nprocs); -+ -+ /* calculate the rank by offset directly */ -+ rank_index = (int)((off / stripe_size) % avail_nprocs); -+ /* XXX: the above method is so simple that the processes in top ranks are always -+ * chosen to be I/O clients. we hope they are different each time. ++ int avail_cb_nodes = striping_info[2]; ++ ++ /* Produce the stripe-contiguous pattern for Lustre */ ++ rank_index = (int)((off / stripe_size) % avail_cb_nodes); ++ ++ /* we index into fd_end with rank_index, and fd_end was allocated to be no ++ * bigger than fd->hins->cb_nodes. If we ever violate that, we're ++ * overrunning arrays. Obviously, we should never ever hit this abort + */ ++ if (rank_index >= fd->hints->cb_nodes) ++ MPI_Abort(MPI_COMM_WORLD, 1); + + avail_bytes = (off / (ADIO_Offset)stripe_size + 1) * + (ADIO_Offset)stripe_size - off; @@ -95,18 +145,27 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + /* this proc only has part of the requested contig. region */ + *len = avail_bytes; + } ++ /* map our index to a rank */ ++ /* NOTE: FOR NOW WE DON'T HAVE A MAPPING...JUST DO 0..NPROCS_FOR_COLL */ ++ rank = fd->hints->ranklist[rank_index]; + -+ return rank_index; ++ return rank; +} + ++/* ADIOI_LUSTRE_Calc_my_req() - calculate what portions of the access requests ++ * of this process are located in the file domains of various processes ++ * (including this one) ++ */ +void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, + int *len_list, int contig_access_count, + int *striping_info, int nprocs, + int *count_my_req_procs_ptr, + int **count_my_req_per_proc_ptr, -+ ADIOI_Access ** my_req_ptr, ++ ADIOI_Access **my_req_ptr, + int **buf_idx_ptr) +{ ++ /* Nothing different from ADIOI_Calc_my_req(), except calling ++ * ADIOI_Lustre_Calc_aggregator() instead of the old one */ + int *count_my_req_per_proc, count_my_req_procs, *buf_idx; + int i, l, proc; + ADIO_Offset avail_len, rem_len, curr_idx, off; @@ -114,13 +173,19 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + + *count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int)); + count_my_req_per_proc = *count_my_req_per_proc_ptr; ++ /* count_my_req_per_proc[i] gives the no. of contig. requests of this ++ * process in process i's file domain. calloc initializes to zero. ++ * I'm allocating memory of size nprocs, so that I can do an ++ * MPI_Alltoall later on. ++ */ + ++ buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int)); + /* buf_idx is relevant only if buftype_is_contig. + * buf_idx[i] gives the index into user_buf where data received + * from proc. i should be placed. This allows receives to be done + * without extra buffer. This can't be done if buftype is not contig. + */ -+ buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int)); ++ + /* initialize buf_idx to -1 */ + for (i = 0; i < nprocs; i++) + buf_idx[i] = -1; @@ -136,13 +201,13 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + continue; + off = offset_list[i]; + avail_len = len_list[i]; -+ /* we set avail_len to be the total size of the access. ++ /* note: we set avail_len to be the total size of the access. + * then ADIOI_LUSTRE_Calc_aggregator() will modify the value to return + * the amount that was available. + */ -+ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, nprocs, -+ striping_info); ++ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info); + count_my_req_per_proc[proc]++; ++ + /* figure out how many data is remaining in the access + * we'll take care of this data (if there is any) + * in the while loop below. @@ -152,13 +217,13 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + while (rem_len != 0) { + off += avail_len; /* point to first remaining byte */ + avail_len = rem_len; /* save remaining size, pass to calc */ -+ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, nprocs, -+ striping_info); ++ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info); + count_my_req_per_proc[proc]++; + rem_len -= avail_len; /* reduce remaining length by amount from fd */ + } + } + ++ /* now allocate space for my_req, offset, and len */ + *my_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs * sizeof(ADIOI_Access)); + my_req = *my_req_ptr; + @@ -178,12 +243,13 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + /* now fill in my_req */ + curr_idx = 0; + for (i = 0; i < contig_access_count; i++) { ++ /* short circuit offset/len processing if len == 0 ++ * (zero-byte read/write */ + if (len_list[i] == 0) + continue; + off = offset_list[i]; + avail_len = len_list[i]; -+ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, nprocs, -+ striping_info); ++ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info); + + /* for each separate contiguous access from this process */ + if (buf_idx[proc] == -1) @@ -206,7 +272,7 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + while (rem_len != 0) { + off += avail_len; + avail_len = rem_len; -+ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, nprocs, ++ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, + striping_info); + if (buf_idx[proc] == -1) + buf_idx[proc] = (int) curr_idx; @@ -232,12 +298,12 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + } + } + } -+#endif +#if 0 + for (i = 0; i < nprocs; i++) { + FPRINTF(stdout, "buf_idx[%d] = 0x%x\n", i, buf_idx[i]); + } +#endif ++#endif + + *count_my_req_procs_ptr = count_my_req_procs; + *buf_idx_ptr = buf_idx; @@ -246,19 +312,15 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c +int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count, + int *len_list, int nprocs) +{ -+ /* Algorithm: -+ * So far, only one case is suitable for collective I/O -+ * (1) request size <= big_req_size -+ * -+ * if (avg_req_size > big_req_size) { -+ * docollect = 0; -+ * } ++ /* If the processes are non-interleaved, we will check the req_size. ++ * if (avg_req_size > big_req_size) { ++ * docollect = 0; ++ * } + */ + + int i, docollect = 1, lflag, big_req_size = 0; + ADIO_Offset req_size = 0, total_req_size; + int avg_req_size, total_access_count; -+ char *value = NULL; + + /* calculate total_req_size and total_access_count */ + for (i = 0; i < contig_access_count; i++) @@ -269,418 +331,17 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + fd->comm); + /* estimate average req_size */ + avg_req_size = (int)(total_req_size / total_access_count); -+ -+ /* get hint of hole_ratio */ -+ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); -+ MPI_Info_get(fd->info, "big_req_size", MPI_MAX_INFO_VAL, value, &lflag); -+ if (lflag) -+ big_req_size = atoi(value); -+ ++ /* get hint of big_req_size */ ++ big_req_size = fd->hints->fs_hints.lustre.coll_threshold; ++ /* Don't perform collective I/O if there are big requests */ + if ((big_req_size > 0) && (avg_req_size > big_req_size)) + docollect = 0; + -+ ADIOI_Free(value); -+ + return docollect; +} -+ -+void ADIOI_LUSTRE_Calc_my_off_len(ADIO_File fd, int bufcount, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, -+ ADIO_Offset **offset_list_ptr, -+ int **len_list_ptr, -+ ADIO_Offset *start_offset_ptr, -+ ADIO_Offset *end_offset_ptr, -+ int *contig_access_count_ptr) -+{ -+ int filetype_size, buftype_size, etype_size; -+ int i, j, k, frd_size = 0, old_frd_size = 0, st_index = 0; -+ int n_filetypes, etype_in_filetype; -+ ADIO_Offset abs_off_in_filetype = 0; -+ int bufsize, sum, n_etypes_in_filetype, size_in_filetype; -+ int contig_access_count, *len_list, flag, filetype_is_contig; -+ MPI_Aint filetype_extent, filetype_lb; -+ ADIOI_Flatlist_node *flat_file; -+ ADIO_Offset *offset_list, off, end_offset = 0, disp; -+ -+ /* For this process's request, calculate the list of offsets and -+ lengths in the file and determine the start and end offsets. */ -+ -+ ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); -+ -+ MPI_Type_size(fd->filetype, &filetype_size); -+ MPI_Type_extent(fd->filetype, &filetype_extent); -+ MPI_Type_lb(fd->filetype, &filetype_lb); -+ MPI_Type_size(datatype, &buftype_size); -+ etype_size = fd->etype_size; -+ -+ if (!filetype_size) { -+ *contig_access_count_ptr = 0; -+ *offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset)); -+ *len_list_ptr = (int *) ADIOI_Malloc(2 * sizeof(int)); -+ /* 2 is for consistency. everywhere I malloc one more than needed */ -+ -+ offset_list = *offset_list_ptr; -+ len_list = *len_list_ptr; -+ offset_list[0] = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : -+ fd->disp + etype_size * offset; -+ len_list[0] = 0; -+ *start_offset_ptr = offset_list[0]; -+ *end_offset_ptr = offset_list[0] + len_list[0] - 1; -+ return; -+ } -+ -+ if (filetype_is_contig) { -+ *contig_access_count_ptr = 1; -+ *offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset)); -+ *len_list_ptr = (int *) ADIOI_Malloc(2 * sizeof(int)); -+ /* 2 is for consistency. everywhere I malloc one more than needed */ -+ -+ offset_list = *offset_list_ptr; -+ len_list = *len_list_ptr; -+ offset_list[0] = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : -+ fd->disp + etype_size * offset; -+ len_list[0] = bufcount * buftype_size; -+ *start_offset_ptr = offset_list[0]; -+ *end_offset_ptr = offset_list[0] + len_list[0] - 1; -+ -+ /* update file pointer */ -+ if (file_ptr_type == ADIO_INDIVIDUAL) -+ fd->fp_ind = *end_offset_ptr + 1; -+ } else { -+ /* First calculate what size of offset_list and len_list to allocate */ -+ /* filetype already flattened in ADIO_Open or ADIO_Fcntl */ -+ flat_file = ADIOI_Flatlist; -+ while (flat_file->type != fd->filetype) -+ flat_file = flat_file->next; -+ disp = fd->disp; -+ -+ if (file_ptr_type == ADIO_INDIVIDUAL) { -+ offset = fd->fp_ind; /* in bytes */ -+ n_filetypes = -1; -+ flag = 0; -+ while (!flag) { -+ n_filetypes++; -+ for (i = 0; i < flat_file->count; i++) { -+ if (disp + flat_file->indices[i] + -+ (ADIO_Offset) n_filetypes * filetype_extent + -+ flat_file->blocklens[i] >= offset) { -+ st_index = i; -+ frd_size = (int) (disp + flat_file->indices[i] + -+ (ADIO_Offset) n_filetypes * -+ filetype_extent + -+ flat_file->blocklens[i] - -+ offset); -+ flag = 1; -+ break; -+ } -+ } -+ } -+ } else { -+ n_etypes_in_filetype = filetype_size / etype_size; -+ n_filetypes = (int) (offset / n_etypes_in_filetype); -+ etype_in_filetype = (int) (offset % n_etypes_in_filetype); -+ size_in_filetype = etype_in_filetype * etype_size; -+ -+ sum = 0; -+ for (i = 0; i < flat_file->count; i++) { -+ sum += flat_file->blocklens[i]; -+ if (sum > size_in_filetype) { -+ st_index = i; -+ frd_size = sum - size_in_filetype; -+ abs_off_in_filetype = flat_file->indices[i] + -+ size_in_filetype - -+ (sum - flat_file->blocklens[i]); -+ break; -+ } -+ } -+ -+ /* abs. offset in bytes in the file */ -+ offset = disp + (ADIO_Offset) n_filetypes *filetype_extent + -+ abs_off_in_filetype; -+ } -+ -+ /* calculate how much space to allocate for offset_list, len_list */ -+ -+ old_frd_size = frd_size; -+ contig_access_count = i = 0; -+ j = st_index; -+ bufsize = buftype_size * bufcount; -+ frd_size = ADIOI_MIN(frd_size, bufsize); -+ while (i < bufsize) { -+ if (frd_size) -+ contig_access_count++; -+ i += frd_size; -+ j = (j + 1) % flat_file->count; -+ frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize - i); -+ } -+ -+ /* allocate space for offset_list and len_list */ -+ -+ *offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc((contig_access_count+1) * -+ sizeof(ADIO_Offset)); -+ *len_list_ptr = (int *) ADIOI_Malloc((contig_access_count + 1) * -+ sizeof(int)); -+ /* +1 to avoid a 0-size malloc */ -+ -+ offset_list = *offset_list_ptr; -+ len_list = *len_list_ptr; -+ -+ /* find start offset, end offset, and fill in offset_list and len_list */ -+ -+ *start_offset_ptr = offset; /* calculated above */ -+ -+ i = k = 0; -+ j = st_index; -+ off = offset; -+ frd_size = ADIOI_MIN(old_frd_size, bufsize); -+ while (i < bufsize) { -+ if (frd_size) { -+ offset_list[k] = off; -+ len_list[k] = frd_size; -+ k++; -+ } -+ i += frd_size; -+ end_offset = off + frd_size - 1; -+ -+ /* Note: end_offset points to the last byte-offset that will be accessed. -+ e.g., if start_offset=0 and 100 bytes to be read, end_offset=99 */ -+ -+ if (off + frd_size < disp + flat_file->indices[j] + -+ flat_file->blocklens[j] + -+ (ADIO_Offset) n_filetypes * filetype_extent) { -+ off += frd_size; -+ /* did not reach end of contiguous block in filetype. -+ * no more I/O needed. off is incremented by frd_size. -+ */ -+ } else { -+ if (j < (flat_file->count - 1)) -+ j++; -+ else { -+ /* hit end of flattened filetype; -+ * start at beginning again -+ */ -+ j = 0; -+ n_filetypes++; -+ } -+ off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes * -+ filetype_extent; -+ frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize - i); -+ } -+ } -+ -+ /* update file pointer */ -+ if (file_ptr_type == ADIO_INDIVIDUAL) -+ fd->fp_ind = off; -+ -+ *contig_access_count_ptr = contig_access_count; -+ *end_offset_ptr = end_offset; -+ } -+} -+ -+void ADIOI_LUSTRE_Calc_others_req(ADIO_File fd, int count_my_req_procs, -+ int *count_my_req_per_proc, -+ ADIOI_Access * my_req, -+ int nprocs, int myrank, -+ ADIO_Offset start_offset, -+ ADIO_Offset end_offset, -+ int *striping_info, -+ int *count_others_req_procs_ptr, -+ ADIOI_Access ** others_req_ptr) -+{ -+ /* what requests of other processes will be written by this process */ -+ -+ int *count_others_req_per_proc, count_others_req_procs; -+ int i, j, lflag, samesize = 0, contiguous = 0; -+ MPI_Request *send_requests, *recv_requests; -+ MPI_Status *statuses; -+ ADIOI_Access *others_req; -+ char *value = NULL; -+ int proc, avail_nprocs, stripe_count, CO; -+ ADIO_Offset min_st_offset, off, req_len, avail_len, rem_len, *all_lens; -+ -+ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); -+ /* same io size */ -+ MPI_Info_get(fd->info, "same_io_size", MPI_MAX_INFO_VAL, value, &lflag); -+ if (lflag && !strcmp(value, "yes")) -+ samesize = 1; -+ /* contiguous data */ -+ MPI_Info_get(fd->info, "contiguous_data", MPI_MAX_INFO_VAL, value, &lflag); -+ if (lflag && !strcmp(value, "yes")) -+ contiguous = 1; -+ -+ *others_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs * -+ sizeof(ADIOI_Access)); -+ others_req = *others_req_ptr; -+ -+ /* if the data are contiguous, we don't need to do MPI_Alltoall */ -+ if (contiguous) { -+ stripe_count = striping_info[1]; -+ CO = striping_info[2]; -+ -+ for (i = 0; i < nprocs; i++) { -+ others_req[i].count = 0; -+ } -+ req_len = end_offset - start_offset + 1; -+ all_lens = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset)); -+ -+ if (samesize == 0) {/* different request size */ -+ /* calculate the min_st_offset */ -+ MPI_Allreduce(&start_offset, &min_st_offset, 1, MPI_LONG_LONG, -+ MPI_MIN, fd->comm); -+ /* exchange request length */ -+ MPI_Allgather(&req_len, 1, ADIO_OFFSET, all_lens, 1, ADIO_OFFSET, -+ fd->comm); -+ } else { /* same request size */ -+ /* calculate the min_st_offset */ -+ min_st_offset = start_offset - myrank * req_len; -+ /* assign request length to all_lens[] */ -+ for (i = 0; i < nprocs; i ++) -+ all_lens[i] = req_len; -+ } -+ avail_nprocs = ADIOI_MIN(nprocs, stripe_count * CO); -+ if (myrank < avail_nprocs) { -+ off = min_st_offset; -+ /* calcaulte other_req[i].count */ -+ for (i = 0; i < nprocs; i++) { -+ avail_len = all_lens[i]; -+ rem_len = avail_len; -+ while (rem_len > 0) { -+ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, -+ nprocs, striping_info); -+ if (proc == myrank) { -+ others_req[i].count ++; -+ } -+ off += avail_len; -+ rem_len -= avail_len; -+ avail_len = rem_len; -+ } -+ } -+ /* calculate offset and len for each request */ -+ off = min_st_offset; -+ for (i = 0; i < nprocs; i++) { -+ if (others_req[i].count) { -+ others_req[i].offsets = (ADIO_Offset *) -+ ADIOI_Malloc(others_req[i].count * -+ sizeof(ADIO_Offset)); -+ others_req[i].lens = (int *) -+ ADIOI_Malloc(others_req[i].count * -+ sizeof(int)); -+ others_req[i].mem_ptrs = (MPI_Aint *) -+ ADIOI_Malloc(others_req[i].count * -+ sizeof(MPI_Aint)); -+ } -+ j = 0; -+ avail_len = all_lens[i]; -+ rem_len = avail_len; -+ while (rem_len > 0) { -+ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, -+ nprocs, striping_info); -+ if (proc == myrank) { -+ others_req[i].offsets[j] = off; -+ others_req[i].lens[j] = (int)avail_len; -+ j ++; -+ } -+ off += avail_len; -+ rem_len -= avail_len; -+ avail_len = rem_len; -+ } -+ } -+ } -+ ADIOI_Free(value); -+ ADIOI_Free(all_lens); -+ } else { -+ /* multiple non-contiguous requests */ -+ /* first find out how much to send/recv and from/to whom */ -+ -+ /* -+ * count_others_req_procs: -+ * number of processes whose requests will be written by -+ * this process (including this process itself) -+ * count_others_req_per_proc[i]: -+ * how many separate contiguous requests of proc[i] will be -+ * written by this process. -+ */ -+ -+ count_others_req_per_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int)); -+ -+ MPI_Alltoall(count_my_req_per_proc, 1, MPI_INT, -+ count_others_req_per_proc, 1, MPI_INT, fd->comm); -+ -+ count_others_req_procs = 0; -+ for (i = 0; i < nprocs; i++) { -+ if (count_others_req_per_proc[i]) { -+ others_req[i].count = count_others_req_per_proc[i]; -+ others_req[i].offsets = (ADIO_Offset *) -+ ADIOI_Malloc(others_req[i].count * -+ sizeof(ADIO_Offset)); -+ others_req[i].lens = (int *) -+ ADIOI_Malloc(others_req[i].count * -+ sizeof(int)); -+ others_req[i].mem_ptrs = (MPI_Aint *) -+ ADIOI_Malloc(others_req[i].count * -+ sizeof(MPI_Aint)); -+ count_others_req_procs++; -+ } else -+ others_req[i].count = 0; -+ } -+ -+ /* now send the calculated offsets and lengths to respective processes */ -+ -+ send_requests = (MPI_Request *) ADIOI_Malloc(2 * (count_my_req_procs + 1) * -+ sizeof(MPI_Request)); -+ recv_requests = (MPI_Request *) ADIOI_Malloc(2 * (count_others_req_procs+1)* -+ sizeof(MPI_Request)); -+ /* +1 to avoid a 0-size malloc */ -+ -+ j = 0; -+ for (i = 0; i < nprocs; i++) { -+ if (others_req[i].count) { -+ MPI_Irecv(others_req[i].offsets, others_req[i].count, -+ ADIO_OFFSET, i, i + myrank, fd->comm, -+ &recv_requests[j]); -+ j++; -+ MPI_Irecv(others_req[i].lens, others_req[i].count, -+ MPI_INT, i, i + myrank + 1, fd->comm, -+ &recv_requests[j]); -+ j++; -+ } -+ } -+ -+ j = 0; -+ for (i = 0; i < nprocs; i++) { -+ if (my_req[i].count) { -+ MPI_Isend(my_req[i].offsets, my_req[i].count, -+ ADIO_OFFSET, i, i + myrank, fd->comm, -+ &send_requests[j]); -+ j++; -+ MPI_Isend(my_req[i].lens, my_req[i].count, -+ MPI_INT, i, i + myrank + 1, fd->comm, -+ &send_requests[j]); -+ j++; -+ } -+ } -+ -+ statuses = (MPI_Status *) -+ ADIOI_Malloc((1 + 2 * ADIOI_MAX(count_my_req_procs, -+ count_others_req_procs)) * -+ sizeof(MPI_Status)); -+ /* +1 to avoid a 0-size malloc */ -+ -+ MPI_Waitall(2 * count_my_req_procs, send_requests, statuses); -+ MPI_Waitall(2 * count_others_req_procs, recv_requests, statuses); -+ -+ ADIOI_Free(send_requests); -+ ADIOI_Free(recv_requests); -+ ADIOI_Free(statuses); -+ ADIOI_Free(count_others_req_per_proc); -+ -+ *count_others_req_procs_ptr = count_others_req_procs; -+ } -+} -diff -ruN ad_lustre_orig/ad_lustre.c ad_lustre/ad_lustre.c ---- ad_lustre_orig/ad_lustre.c 2008-09-17 14:36:57.000000000 +0800 -+++ ad_lustre/ad_lustre.c 2008-09-17 18:20:35.000000000 +0800 +diff -ruN adio/ad_lustre_orig/ad_lustre.c adio/ad_lustre/ad_lustre.c +--- adio/ad_lustre_orig/ad_lustre.c 2008-09-17 14:36:56.000000000 +0800 ++++ adio/ad_lustre/ad_lustre.c 2008-10-17 17:03:42.000000000 +0800 @@ -1,9 +1,11 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* @@ -691,77 +352,28 @@ diff -ruN ad_lustre_orig/ad_lustre.c ad_lustre/ad_lustre.c * * Copyright (C) 2007 Oak Ridge National Laboratory + * -+ * Copyright (C) 2008 Sun Microsystems, Lustre group ++ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. */ #include "ad_lustre.h" -@@ -13,13 +15,13 @@ +@@ -13,12 +15,12 @@ ADIOI_LUSTRE_ReadContig, /* ReadContig */ ADIOI_LUSTRE_WriteContig, /* WriteContig */ ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */ - ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */ + ADIOI_LUSTRE_WriteStridedColl, /* WriteStridedColl */ ADIOI_GEN_SeekIndividual, /* SeekIndividual */ -- ADIOI_GEN_Fcntl, /* Fcntl */ -+ ADIOI_LUSTRE_Fcntl, /* Fcntl */ + ADIOI_GEN_Fcntl, /* Fcntl */ ADIOI_LUSTRE_SetInfo, /* SetInfo */ ADIOI_GEN_ReadStrided, /* ReadStrided */ - ADIOI_GEN_WriteStrided, /* WriteStrided */ -- ADIOI_GEN_Close, /* Close */ + ADIOI_LUSTRE_WriteStrided, /* WriteStrided */ -+ ADIOI_LUSTRE_Close, /* Close */ + ADIOI_GEN_Close, /* Close */ #if defined(ROMIO_HAVE_WORKING_AIO) && !defined(CRAY_XT_LUSTRE) ADIOI_GEN_IreadContig, /* IreadContig */ - ADIOI_GEN_IwriteContig, /* IwriteContig */ -diff -ruN ad_lustre_orig/ad_lustre_close.c ad_lustre/ad_lustre_close.c ---- ad_lustre_orig/ad_lustre_close.c 1970-01-01 08:00:00.000000000 +0800 -+++ ad_lustre/ad_lustre_close.c 2008-09-17 18:20:35.000000000 +0800 -@@ -0,0 +1,42 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. -+ * -+ * Copyright (C) 2007 Oak Ridge National Laboratory -+ * -+ * Copyright (C) 2008 Sun Microsystems, Lustre group -+ */ -+ -+#include "ad_lustre.h" -+ -+#ifdef PROFILE -+#include "mpe.h" -+#endif -+ -+void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code) -+{ -+ int err, derr = 0; -+ static char myname[] = "ADIOI_LUSTRE_CLOSE"; -+ -+#ifdef PROFILE -+ MPE_Log_event(9, 0, "start close"); -+#endif -+ -+ err = close(fd->fd_sys); -+ -+#ifdef PROFILE -+ MPE_Log_event(10, 0, "end close"); -+#endif -+ -+ fd->fd_sys = -1; -+ -+ if (err == -1 || derr == -1) { -+ *error_code = -+ MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, -+ __LINE__, MPI_ERR_IO, "**io", "**io %s", -+ strerror(errno)); -+ } else -+ *error_code = MPI_SUCCESS; -+} -diff -ruN ad_lustre_orig/ad_lustre.h ad_lustre/ad_lustre.h ---- ad_lustre_orig/ad_lustre.h 2008-09-17 14:36:57.000000000 +0800 -+++ ad_lustre/ad_lustre.h 2008-10-06 16:07:21.000000000 +0800 +diff -ruN adio/ad_lustre_orig/ad_lustre.h adio/ad_lustre/ad_lustre.h +--- adio/ad_lustre_orig/ad_lustre.h 2008-09-17 14:36:56.000000000 +0800 ++++ adio/ad_lustre/ad_lustre.h 2009-05-05 15:34:58.000000000 +0800 @@ -1,9 +1,11 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* @@ -772,44 +384,20 @@ diff -ruN ad_lustre_orig/ad_lustre.h ad_lustre/ad_lustre.h * * Copyright (C) 2007 Oak Ridge National Laboratory + * -+ * Copyright (C) 2008 Sun Microsystems, Lustre group ++ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef AD_UNIX_INCLUDE -@@ -24,7 +26,32 @@ +@@ -24,7 +26,7 @@ /*#include */ #include -+#ifdef WITH_LUSTRE - #include "lustre/lustre_user.h" -+#else -+/* copy something from lustre_user.h here */ -+# define LOV_USER_MAGIC 0x0BD10BD0 -+# define LL_IOC_LOV_SETSTRIPE _IOW ('f', 154, long) -+# define LL_IOC_LOV_GETSTRIPE _IOW ('f', 155, long) -+# define lov_user_ost_data lov_user_ost_data_v1 -+struct lov_user_ost_data_v1 { /* per-stripe data structure */ -+ __u64 l_object_id; /* OST object ID */ -+ __u64 l_object_gr; /* OST object group (creating MDS number) */ -+ __u32 l_ost_gen; /* generation of this OST index */ -+ __u32 l_ost_idx; /* OST index in LOV */ -+} __attribute__((packed)); -+#define lov_user_md lov_user_md_v1 -+struct lov_user_md_v1 { /* LOV EA user data (host-endian) */ -+ __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V1 */ -+ __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ -+ __u64 lmm_object_id; /* LOV object ID */ -+ __u64 lmm_object_gr; /* LOV object group */ -+ __u32 lmm_stripe_size; /* size of stripe in bytes */ -+ __u16 lmm_stripe_count; /* num stripes in use for this object */ -+ __u16 lmm_stripe_offset; /* starting stripe offset in lmm_objects */ -+ struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ -+} __attribute__((packed)); -+#endif +-#include "lustre/lustre_user.h" ++#include #include "adio.h" /*#include "adioi.h"*/ -@@ -41,24 +68,62 @@ +@@ -41,24 +43,31 @@ void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code); void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code); @@ -855,41 +443,10 @@ diff -ruN ad_lustre_orig/ad_lustre.h ad_lustre/ad_lustre.h int *error_code); void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code); - -+void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int ** striping_info_ptr, -+ int mode, int nprocs); -+int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off, -+ ADIO_Offset *len, int nprocs, -+ int *striping_info); -+void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, -+ int *len_list, int contig_access_count, -+ int *striping_info, int nprocs, -+ int *count_my_req_procs_ptr, -+ int **count_my_req_per_proc_ptr, -+ ADIOI_Access ** my_req_ptr, -+ int **buf_idx_ptr); -+int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count, -+ int *len_list, int nprocs); -+void ADIOI_LUSTRE_Calc_my_off_len(ADIO_File fd, int bufcount, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, -+ ADIO_Offset **offset_list_ptr, -+ int **len_list_ptr, -+ ADIO_Offset *start_offset_ptr, -+ ADIO_Offset *end_offset_ptr, -+ int *contig_access_count_ptr); -+void ADIOI_LUSTRE_Calc_others_req(ADIO_File fd, int count_my_req_procs, -+ int *count_my_req_per_proc, -+ ADIOI_Access * my_req, -+ int nprocs, int myrank, -+ ADIO_Offset start_offset, -+ ADIO_Offset end_offset, -+ int *striping_info, -+ int *count_others_req_procs_ptr, -+ ADIOI_Access ** others_req_ptr); #endif /* End of AD_UNIX_INCLUDE */ -diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c ---- ad_lustre_orig/ad_lustre_hints.c 2008-09-17 14:36:57.000000000 +0800 -+++ ad_lustre/ad_lustre_hints.c 2008-09-17 18:20:35.000000000 +0800 +diff -ruN adio/ad_lustre_orig/ad_lustre_hints.c adio/ad_lustre/ad_lustre_hints.c +--- adio/ad_lustre_orig/ad_lustre_hints.c 2008-09-17 14:36:56.000000000 +0800 ++++ adio/ad_lustre/ad_lustre_hints.c 2009-04-24 15:35:05.000000000 +0800 @@ -1,9 +1,11 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* @@ -900,23 +457,22 @@ diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c * * Copyright (C) 2007 Oak Ridge National Laboratory + * -+ * Copyright (C) 2008 Sun Microsystems, Lustre group ++ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. */ #include "ad_lustre.h" -@@ -11,130 +13,162 @@ - +@@ -12,46 +14,56 @@ void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code) { -- char *value, *value_in_fd; + char *value, *value_in_fd; - int flag, tmp_val[3], str_factor=-1, str_unit=0, start_iodev=-1; -- struct lov_user_md lum = { 0 }; -- int err, myrank, fd_sys, perm, amode, old_mask; -+ char *value = NULL; -+ int flag, tmp_val, int_val, str_factor, str_unit, start_iodev; ++ int flag, stripe_val[3], str_factor = -1, str_unit=0, start_iodev=-1; + struct lov_user_md lum = { 0 }; + int err, myrank, fd_sys, perm, amode, old_mask; ++ int int_val, tmp_val; + static char myname[] = "ADIOI_LUSTRE_SETINFO"; -- value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); + value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); if ( (fd->info) == MPI_INFO_NULL) { - /* This must be part of the open call. can set striping parameters - if necessary. */ @@ -929,222 +485,114 @@ diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c fd->direct_read = fd->direct_write = 0; - - /* has user specified striping or server buffering parameters ++ /* initialize lustre hints */ ++ MPI_Info_set(fd->info, "romio_lustre_co_ratio", "1"); ++ fd->hints->fs_hints.lustre.co_ratio = 1; ++ MPI_Info_set(fd->info, "romio_lustre_coll_threshold", "0"); ++ fd->hints->fs_hints.lustre.coll_threshold = 0; ++ MPI_Info_set(fd->info, "romio_lustre_ds_in_coll", "enable"); ++ fd->hints->fs_hints.lustre.ds_in_coll = ADIOI_HINT_ENABLE; + + /* has user specified striping or server buffering parameters and do they have the same value on all processes? */ if (users_info != MPI_INFO_NULL) { - MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, -- value, &flag); ++ /* striping information */ ++ MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, + value, &flag); - if (flag) -- str_unit=atoi(value); -- ++ if (flag) + str_unit=atoi(value); + - MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, -- value, &flag); ++ MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, + value, &flag); - if (flag) -- str_factor=atoi(value); -+ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); ++ if (flag) + str_factor=atoi(value); - MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL, -+ /* direct read and write */ -+ MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL, - value, &flag); +- value, &flag); - if (flag) -- start_iodev=atoi(value); -- ++ MPI_Info_get(users_info, "romio_lustre_start_iodevice", ++ MPI_MAX_INFO_VAL, value, &flag); ++ if (flag) + start_iodev=atoi(value); + - MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL, - value, &flag); ++ /* direct read and write */ ++ MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL, ++ value, &flag); if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) { MPI_Info_set(fd->info, "direct_read", "true"); fd->direct_read = 1; } - +- - MPI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL, + MPI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL, value, &flag); if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) { MPI_Info_set(fd->info, "direct_write", "true"); - fd->direct_write = 1; +@@ -59,22 +71,23 @@ } -- } + } -- MPI_Comm_rank(fd->comm, &myrank); -- if (myrank == 0) { ++ /* set striping information with ioctl */ + MPI_Comm_rank(fd->comm, &myrank); + if (myrank == 0) { - tmp_val[0] = str_factor; - tmp_val[1] = str_unit; - tmp_val[2] = start_iodev; -- } ++ stripe_val[0] = str_factor; ++ stripe_val[1] = str_unit; ++ stripe_val[2] = start_iodev; + } - MPI_Bcast(tmp_val, 3, MPI_INT, 0, fd->comm); -- ++ MPI_Bcast(stripe_val, 3, MPI_INT, 0, fd->comm); + - if (tmp_val[0] != str_factor - || tmp_val[1] != str_unit - || tmp_val[2] != start_iodev) { -- FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: All keys" -- "-striping_factor:striping_unit:start_iodevice " -- "need to be identical across all processes\n"); -- MPI_Abort(MPI_COMM_WORLD, 1); ++ if (stripe_val[0] != str_factor ++ || stripe_val[1] != str_unit ++ || stripe_val[2] != start_iodev) { + FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: All keys" + "-striping_factor:striping_unit:start_iodevice " + "need to be identical across all processes\n"); + MPI_Abort(MPI_COMM_WORLD, 1); - } else if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0)) { -- /* if user has specified striping info, process 0 tries to set it */ -- if (!myrank) { -- if (fd->perm == ADIO_PERM_NULL) { -- old_mask = umask(022); -- umask(old_mask); -- perm = old_mask ^ 0666; -- } -- else perm = fd->perm; -- -- amode = 0; -- if (fd->access_mode & ADIO_CREATE) -- amode = amode | O_CREAT; -- if (fd->access_mode & ADIO_RDONLY) -- amode = amode | O_RDONLY; -- if (fd->access_mode & ADIO_WRONLY) -- amode = amode | O_WRONLY; -- if (fd->access_mode & ADIO_RDWR) -- amode = amode | O_RDWR; -- if (fd->access_mode & ADIO_EXCL) -- amode = amode | O_EXCL; -- -- /* we need to create file so ensure this is set */ -- amode = amode | O_LOV_DELAY_CREATE | O_CREAT; -- -- fd_sys = open(fd->filename, amode, perm); ++ } else if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0)) { + /* if user has specified striping info, process 0 tries to set it */ + if (!myrank) { + if (fd->perm == ADIO_PERM_NULL) { +@@ -100,9 +113,9 @@ + amode = amode | O_LOV_DELAY_CREATE | O_CREAT; + + fd_sys = open(fd->filename, amode, perm); - if (fd_sys == -1) { - if (errno != EEXIST) - fprintf(stderr, -- "Failure to open file %s %d %d\n",strerror(errno), amode, perm); -- } else { -- lum.lmm_magic = LOV_USER_MAGIC; -- lum.lmm_pattern = 0; -- lum.lmm_stripe_size = str_unit; -- lum.lmm_stripe_count = str_factor; -- lum.lmm_stripe_offset = start_iodev; -- -- err = ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum); ++ if (fd_sys == -1) { ++ if (errno != EEXIST) ++ fprintf(stderr, + "Failure to open file %s %d %d\n",strerror(errno), amode, perm); + } else { + lum.lmm_magic = LOV_USER_MAGIC; +@@ -112,25 +125,73 @@ + lum.lmm_stripe_offset = start_iodev; + + err = ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum); - if (err == -1 && errno != EEXIST) { -- fprintf(stderr, "Failure to set stripe info %s \n", strerror(errno)); -- } -- close(fd_sys); -- } -- } /* End of striping parameters validation */ -+ /* stripe size */ -+ MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, -+ value, &flag); -+ if (flag && (str_unit = atoi(value))) { -+ tmp_val = str_unit; -+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); -+ if (tmp_val != str_unit) { -+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, -+ "striping_unit", -+ error_code); -+ return; -+ } -+ MPI_Info_set(fd->info, "striping_unit", value); -+ } -+ /* stripe count */ -+ MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, -+ value, &flag); -+ if (flag && (str_factor = atoi(value))) { -+ tmp_val = str_factor; -+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); -+ if (tmp_val != str_factor) { -+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, -+ "striping_factor", -+ error_code); -+ return; -+ } -+ MPI_Info_set(fd->info, "striping_factor", value); -+ } -+ /* stripe offset */ -+ MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL, -+ value, &flag); -+ if (flag && ((start_iodev = atoi(value)) >= 0)) { -+ tmp_val = start_iodev; -+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); -+ if (tmp_val != start_iodev) { -+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, -+ "start_iodevice", -+ error_code); -+ return; -+ } -+ MPI_Info_set(fd->info, "start_iodevice", value); -+ } -+ /* CO */ -+ MPI_Info_get(users_info, "CO", MPI_MAX_INFO_VAL, value, -+ &flag); -+ if (flag && (int_val = atoi(value)) > 0) { -+ tmp_val = int_val; -+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); -+ if (tmp_val != int_val) { -+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, -+ "CO", -+ error_code); -+ return; -+ } -+ MPI_Info_set(fd->info, "CO", value); -+ } -+ /* big_req_size */ -+ MPI_Info_get(users_info, "big_req_size", MPI_MAX_INFO_VAL, value, -+ &flag); -+ if (flag && (int_val = atoi(value)) > 0) { -+ tmp_val = int_val; -+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); -+ if (tmp_val != int_val) { -+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, -+ "big_req_size", -+ error_code); -+ return; -+ } -+ MPI_Info_set(fd->info, "big_req_size", value); -+ } -+ /* hint for disabling data sieving when do collective IO */ -+ MPI_Info_get(users_info, "ds_in_coll", MPI_MAX_INFO_VAL, -+ value, &flag); -+ if (flag && (!strcmp(value, "enable") || -+ !strcmp(value, "ENABLE"))) { -+ tmp_val = int_val = 1; -+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); -+ if (tmp_val != int_val) { -+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, -+ "ds_in_coll", -+ error_code); -+ return; -+ } -+ MPI_Info_set(fd->info, "ds_in_coll", "enable"); -+ } -+ /* same io size */ -+ MPI_Info_get(users_info, "same_io_size", MPI_MAX_INFO_VAL, -+ value, &flag); -+ if (flag && (!strcmp(value, "yes") || -+ !strcmp(value, "YES"))) { -+ tmp_val = int_val = 1; -+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); -+ if (tmp_val != int_val) { -+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, -+ "same_io_size", -+ error_code); -+ return; -+ } -+ MPI_Info_set(fd->info, "same_io_size", "yes"); -+ } -+ /* contiguous data */ -+ MPI_Info_get(users_info, "contiguous_data", MPI_MAX_INFO_VAL, -+ value, &flag); -+ if (flag && (!strcmp(value, "yes") || -+ !strcmp(value, "YES"))) { -+ tmp_val = int_val = 1; -+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); -+ if (tmp_val != int_val) { -+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, -+ "contiguous_data", -+ error_code); -+ return; -+ } -+ MPI_Info_set(fd->info, "contiguous_data", "yes"); -+ } -+ ADIOI_Free(value); ++ if (err == -1 && errno != EEXIST) { + fprintf(stderr, "Failure to set stripe info %s \n", strerror(errno)); + } + close(fd_sys); + } + } /* End of striping parameters validation */ } - -- MPI_Barrier(fd->comm); + MPI_Barrier(fd->comm); - /* set the values for collective I/O and data sieving parameters */ - ADIOI_GEN_SetInfo(fd, users_info, error_code); - } else { @@ -1155,20 +603,71 @@ diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c - ADIOI_GEN_SetInfo(fd, users_info, error_code); } - ++ /* get other hint */ ++ if (users_info != MPI_INFO_NULL) { ++ /* CO: IO Clients/OST, ++ * to keep the load balancing between clients and OSTs */ ++ MPI_Info_get(users_info, "romio_lustre_co_ratio", MPI_MAX_INFO_VAL, value, ++ &flag); ++ if (flag && (int_val = atoi(value)) > 0) { ++ tmp_val = int_val; ++ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); ++ if (tmp_val != int_val) { ++ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, ++ "romio_lustre_co_ratio", ++ error_code); ++ ADIOI_Free(value); ++ return; ++ } ++ MPI_Info_set(fd->info, "romio_lustre_co_ratio", value); ++ fd->hints->fs_hints.lustre.co_ratio = atoi(value); ++ } ++ /* coll_threshold: ++ * if the req size is bigger than this, collective IO may not be performed. ++ */ ++ MPI_Info_get(users_info, "romio_lustre_coll_threshold", MPI_MAX_INFO_VAL, value, ++ &flag); ++ if (flag && (int_val = atoi(value)) > 0) { ++ tmp_val = int_val; ++ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); ++ if (tmp_val != int_val) { ++ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, ++ "romio_lustre_coll_threshold", ++ error_code); ++ ADIOI_Free(value); ++ return; ++ } ++ MPI_Info_set(fd->info, "romio_lustre_coll_threshold", value); ++ fd->hints->fs_hints.lustre.coll_threshold = atoi(value); ++ } ++ /* ds_in_coll: disable data sieving in collective IO */ ++ MPI_Info_get(users_info, "romio_lustre_ds_in_coll", MPI_MAX_INFO_VAL, ++ value, &flag); ++ if (flag && (!strcmp(value, "disable") || ++ !strcmp(value, "DISABLE"))) { ++ tmp_val = int_val = 2; ++ MPI_Bcast(&tmp_val, 2, MPI_INT, 0, fd->comm); ++ if (tmp_val != int_val) { ++ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, ++ "romio_lustre_ds_in_coll", ++ error_code); ++ ADIOI_Free(value); ++ return; ++ } ++ MPI_Info_set(fd->info, "romio_lustre_ds_in_coll", "disable"); ++ fd->hints->fs_hints.lustre.ds_in_coll = ADIOI_HINT_DISABLE; ++ } ++ } + /* set the values for collective I/O and data sieving parameters */ + ADIOI_GEN_SetInfo(fd, users_info, error_code); + if (ADIOI_Direct_read) fd->direct_read = 1; if (ADIOI_Direct_write) fd->direct_write = 1; -- ADIOI_Free(value); -- - *error_code = MPI_SUCCESS; - } -diff -ruN ad_lustre_orig/ad_lustre_open.c ad_lustre/ad_lustre_open.c ---- ad_lustre_orig/ad_lustre_open.c 2008-09-17 14:36:57.000000000 +0800 -+++ ad_lustre/ad_lustre_open.c 2008-09-17 18:55:50.000000000 +0800 -@@ -1,18 +1,21 @@ +diff -ruN adio/ad_lustre_orig/ad_lustre_open.c adio/ad_lustre/ad_lustre_open.c +--- adio/ad_lustre_orig/ad_lustre_open.c 2008-09-17 14:36:56.000000000 +0800 ++++ adio/ad_lustre/ad_lustre_open.c 2009-03-01 11:32:32.000000000 +0800 +@@ -1,9 +1,11 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* - * Copyright (C) 1997 University of Chicago. @@ -1178,220 +677,32 @@ diff -ruN ad_lustre_orig/ad_lustre_open.c ad_lustre/ad_lustre_open.c * * Copyright (C) 2007 Oak Ridge National Laboratory + * -+ * Copyright (C) 2008 Sun Microsystems, Lustre group ++ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. */ #include "ad_lustre.h" +@@ -51,14 +53,17 @@ + err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum); - void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code) - { -- int perm, old_mask, amode, amode_direct; -+ int perm, old_mask, amode = 0, amode_direct = 0, flag = 0, err, myrank; -+ int stripe_size = 0, stripe_count = 0, stripe_offset = -1; - struct lov_user_md lum = { 0 }; -- char *value; -+ char *value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); - - #if defined(MPICH2) || !defined(PRINT_ERR_MSG) - static char myname[] = "ADIOI_LUSTRE_OPEN"; -@@ -22,12 +25,57 @@ - old_mask = umask(022); - umask(old_mask); - perm = old_mask ^ 0666; -- } -- else perm = fd->perm; -+ } else -+ perm = fd->perm; + if (!err) { ++ fd->hints->striping_unit = lum.lmm_stripe_size; + sprintf(value, "%d", lum.lmm_stripe_size); + MPI_Info_set(fd->info, "striping_unit", value); -- amode = 0; -- if (fd->access_mode & ADIO_CREATE) -+ if (fd->access_mode & ADIO_CREATE) { - amode = amode | O_CREAT; -+ /* Check striping info -+ * if already set by SetInfo(), set them to lum; otherwise, set by lum -+ */ -+ MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, -+ &flag); -+ if (flag) -+ stripe_size = atoi(value); -+ -+ MPI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, value, -+ &flag); -+ if (flag) -+ stripe_count = atoi(value); -+ -+ MPI_Info_get(fd->info, "start_iodevice", MPI_MAX_INFO_VAL, value, -+ &flag); -+ if (flag) -+ stripe_offset = atoi(value); -+ -+ /* if user has specified striping info, -+ * process 0 will try to check and set it. -+ */ -+ if ((stripe_size > 0) || (stripe_count > 0) || (stripe_offset >= 0)) { -+ MPI_Comm_rank(fd->comm, &myrank); -+ if (myrank == 0) { -+ int fd_sys = open(fd->filename, amode, perm); -+ if (fd_sys == -1) { -+ if (errno != EEXIST) -+ FPRINTF(stderr, "Failure to open file %s %d %d\n", -+ strerror(errno), amode, perm); -+ } else { -+ lum.lmm_magic = LOV_USER_MAGIC; -+ lum.lmm_pattern = 1; -+ lum.lmm_stripe_size = stripe_size; -+ lum.lmm_stripe_count = stripe_count; -+ lum.lmm_stripe_offset = stripe_offset; -+ -+ if (ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum)) -+ FPRINTF(stderr, -+ "Failure to set striping info to Lustre!\n"); -+ close(fd_sys); -+ } -+ } -+ MPI_Barrier(fd->comm); -+ } -+ } -+ - if (fd->access_mode & ADIO_RDONLY) - amode = amode | O_RDONLY; - if (fd->access_mode & ADIO_WRONLY) -@@ -42,32 +90,36 @@ - fd->fd_sys = open(fd->filename, amode|O_CREAT, perm); ++ fd->hints->striping_factor = lum.lmm_stripe_count; + sprintf(value, "%d", lum.lmm_stripe_count); + MPI_Info_set(fd->info, "striping_factor", value); - if (fd->fd_sys != -1) { -- int err; -- -- value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); -- - /* get file striping information and set it in info */ -- lum.lmm_magic = LOV_USER_MAGIC; -- err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum); -- -- if (!err) { -- sprintf(value, "%d", lum.lmm_stripe_size); -- MPI_Info_set(fd->info, "striping_unit", value); -- -- sprintf(value, "%d", lum.lmm_stripe_count); -- MPI_Info_set(fd->info, "striping_factor", value); -- -- sprintf(value, "%d", lum.lmm_stripe_offset); ++ fd->hints->fs_hints.lustre.start_iodevice = lum.lmm_stripe_offset; + sprintf(value, "%d", lum.lmm_stripe_offset); - MPI_Info_set(fd->info, "start_iodevice", value); -- } -- ADIOI_Free(value); -+ lum.lmm_magic = LOV_USER_MAGIC; -+ err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum); - -+ if (!err) { -+ if (lum.lmm_stripe_size && lum.lmm_stripe_count && -+ (lum.lmm_stripe_offset >= 0)) { -+ sprintf(value, "%d", lum.lmm_stripe_size); -+ MPI_Info_set(fd->info, "striping_unit", value); -+ -+ sprintf(value, "%d", lum.lmm_stripe_count); -+ MPI_Info_set(fd->info, "striping_factor", value); -+ -+ sprintf(value, "%d", lum.lmm_stripe_offset); -+ MPI_Info_set(fd->info, "start_iodevice", value); -+ } else { -+ FPRINTF(stderr, "Striping info is invalid!\n"); -+ ADIOI_Free(value); -+ MPI_Abort(MPI_COMM_WORLD, 1); -+ } -+ } else { -+ FPRINTF(stderr, "Failed to get striping info from Lustre!\n"); -+ ADIOI_Free(value); -+ MPI_Abort(MPI_COMM_WORLD, 1); -+ } - if (fd->access_mode & ADIO_APPEND) - fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END); -- } -- -+ } - if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND)) -- fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END); -+ fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END); - - fd->fd_direct = -1; - if (fd->direct_write || fd->direct_read) { -@@ -81,20 +133,22 @@ - } ++ MPI_Info_set(fd->info, "romio_lustre_start_iodevice", value); + } + ADIOI_Free(value); - /* --BEGIN ERROR HANDLING-- */ -- if (fd->fd_sys == -1 || ((fd->fd_direct == -1) && -- (fd->direct_write || fd->direct_read))) { -+ if (fd->fd_sys == -1 || ((fd->fd_direct == -1) && -+ (fd->direct_write || fd->direct_read))) { - if (errno == ENAMETOOLONG) - *error_code = MPIO_Err_create_code(MPI_SUCCESS, -- MPIR_ERR_RECOVERABLE, myname, -- __LINE__, MPI_ERR_BAD_FILE, -+ MPIR_ERR_RECOVERABLE, -+ myname, __LINE__, -+ MPI_ERR_BAD_FILE, - "**filenamelong", - "**filenamelong %s %d", - fd->filename, - strlen(fd->filename)); - else if (errno == ENOENT) - *error_code = MPIO_Err_create_code(MPI_SUCCESS, -- MPIR_ERR_RECOVERABLE, myname, -- __LINE__, MPI_ERR_NO_SUCH_FILE, -+ MPIR_ERR_RECOVERABLE, -+ myname, __LINE__, -+ MPI_ERR_NO_SUCH_FILE, - "**filenoexist", - "**filenoexist %s", - fd->filename); -@@ -108,27 +162,30 @@ - fd->filename); - else if (errno == EACCES) { - *error_code = MPIO_Err_create_code(MPI_SUCCESS, -- MPIR_ERR_RECOVERABLE, myname, -- __LINE__, MPI_ERR_ACCESS, -+ MPIR_ERR_RECOVERABLE, -+ myname, __LINE__, -+ MPI_ERR_ACCESS, - "**fileaccess", -- "**fileaccess %s", -- fd->filename ); -- } -- else if (errno == EROFS) { -+ "**fileaccess %s", -+ fd->filename); -+ } else if (errno == EROFS) { - /* Read only file or file system and write access requested */ - *error_code = MPIO_Err_create_code(MPI_SUCCESS, -- MPIR_ERR_RECOVERABLE, myname, -- __LINE__, MPI_ERR_READ_ONLY, -- "**ioneedrd", 0 ); -- } -- else { -+ MPIR_ERR_RECOVERABLE, -+ myname, __LINE__, -+ MPI_ERR_READ_ONLY, -+ "**ioneedrd", 0); -+ } else { - *error_code = MPIO_Err_create_code(MPI_SUCCESS, -- MPIR_ERR_RECOVERABLE, myname, -- __LINE__, MPI_ERR_IO, "**io", -+ MPIR_ERR_RECOVERABLE, -+ myname, __LINE__, -+ MPI_ERR_IO, "**io", - "**io %s", strerror(errno)); - } -- } -+ } else { - /* --END ERROR HANDLING-- */ -- else *error_code = MPI_SUCCESS; -+ *error_code = MPI_SUCCESS; -+ } - -+ ADIOI_Free(value); - } -diff -ruN ad_lustre_orig/ad_lustre_rwcontig.c ad_lustre/ad_lustre_rwcontig.c ---- ad_lustre_orig/ad_lustre_rwcontig.c 2008-09-17 14:36:57.000000000 +0800 -+++ ad_lustre/ad_lustre_rwcontig.c 2008-09-17 18:52:01.000000000 +0800 +diff -ruN adio/ad_lustre_orig/ad_lustre_rwcontig.c adio/ad_lustre/ad_lustre_rwcontig.c +--- adio/ad_lustre_orig/ad_lustre_rwcontig.c 2008-09-17 14:36:56.000000000 +0800 ++++ adio/ad_lustre/ad_lustre_rwcontig.c 2009-05-05 15:34:29.000000000 +0800 @@ -1,9 +1,11 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* @@ -1402,23 +713,40 @@ diff -ruN ad_lustre_orig/ad_lustre_rwcontig.c ad_lustre/ad_lustre_rwcontig.c * * Copyright (C) 2007 Oak Ridge National Laboratory + * -+ * Copyright (C) 2008 Sun Microsystems, Lustre group ++ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. */ #define _XOPEN_SOURCE 600 -@@ -138,7 +140,7 @@ +@@ -136,10 +138,23 @@ + if (err == -1) goto ioerr; + } - if (io_mode) +- if (io_mode) ++ if (io_mode) { ++#ifdef ADIOI_MPE_LOGGING ++ MPE_Log_event(ADIOI_MPE_write_a, 0, NULL); ++#endif err = write(fd->fd_sys, buf, len); - else -+ else ++#ifdef ADIOI_MPE_LOGGING ++ MPE_Log_event(ADIOI_MPE_write_b, 0, NULL); ++#endif ++ } else { ++#ifdef ADIOI_MPE_LOGGING ++ MPE_Log_event(ADIOI_MPE_read_a, 0, NULL); ++#endif err = read(fd->fd_sys, buf, len); ++#ifdef ADIOI_MPE_LOGGING ++ MPE_Log_event(ADIOI_MPE_read_b, 0, NULL); ++#endif ++ } } else { err = ADIOI_LUSTRE_Directio(fd, buf, len, offset, io_mode); -diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c ---- ad_lustre_orig/ad_lustre_wrcoll.c 1970-01-01 08:00:00.000000000 +0800 -+++ ad_lustre/ad_lustre_wrcoll.c 2008-09-17 18:20:35.000000000 +0800 -@@ -0,0 +1,973 @@ + } +diff -ruN adio/ad_lustre_orig/ad_lustre_wrcoll.c adio/ad_lustre/ad_lustre_wrcoll.c +--- adio/ad_lustre_orig/ad_lustre_wrcoll.c 1970-01-01 08:00:00.000000000 +0800 ++++ adio/ad_lustre/ad_lustre_wrcoll.c 2009-04-24 14:48:34.000000000 +0800 +@@ -0,0 +1,934 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (C) 1997 University of Chicago. @@ -1426,7 +754,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + * + * Copyright (C) 2007 Oak Ridge National Laboratory + * -+ * Copyright (C) 2008 Sun Microsystems, Lustre group ++ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include "ad_lustre.h" @@ -1441,25 +769,25 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + ADIO_Offset *offset_list, + int *len_list, + int contig_access_count, -+ int * striping_info, ++ int *striping_info, + int *buf_idx, int *error_code); +static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf, -+ ADIOI_Flatlist_node * flat_buf, ++ ADIOI_Flatlist_node *flat_buf, + char **send_buf, -+ ADIO_Offset * offset_list, ++ ADIO_Offset *offset_list, + int *len_list, int *send_size, -+ MPI_Request * requests, ++ MPI_Request *requests, + int *sent_to_proc, int nprocs, + int myrank, int contig_access_count, -+ int * striping_info, ++ int *striping_info, + int *send_buf_idx, + int *curr_to_proc, + int *done_to_proc, int iter, + MPI_Aint buftype_extent); +static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf, + char *write_buf, -+ ADIOI_Flatlist_node * flat_buf, -+ ADIO_Offset * offset_list, ++ ADIOI_Flatlist_node *flat_buf, ++ ADIO_Offset *offset_list, + int *len_list, int *send_size, + int *recv_size, ADIO_Offset off, + int size, int *count, @@ -1467,23 +795,29 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + int *sent_to_proc, int nprocs, + int myrank, int buftype_is_contig, + int contig_access_count, -+ int * striping_info, -+ ADIOI_Access * others_req, ++ int *striping_info, ++ ADIOI_Access *others_req, + int *send_buf_idx, + int *curr_to_proc, + int *done_to_proc, int *hole, + int iter, MPI_Aint buftype_extent, + int *buf_idx, int *error_code); -+static void ADIOI_Heap_merge(ADIOI_Access * others_req, int *count, -+ ADIO_Offset * srt_off, int *srt_len, -+ int *start_pos, int nprocs, int nprocs_recv, -+ int total_elements); ++void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count, ++ ADIO_Offset *srt_off, int *srt_len, int *start_pos, ++ int nprocs, int nprocs_recv, int total_elements); + +void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count, + MPI_Datatype datatype, + int file_ptr_type, ADIO_Offset offset, -+ ADIO_Status * status, int *error_code) ++ ADIO_Status *status, int *error_code) +{ ++ /* Uses a generalized version of the extended two-phase method described ++ * in "An Extended Two-Phase Method for Accessing Sections of ++ * Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary, ++ * Scientific Programming, (5)4:301--317, Winter 1996. ++ * http://www.mcs.anl.gov/home/thakur/ext2ph.ps ++ */ ++ + ADIOI_Access *my_req; + /* array of nprocs access structures, one for each other process has + this process's request */ @@ -1492,31 +826,60 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + /* array of nprocs access structures, one for each other process + whose request is written by this process. */ + -+ int i, filetype_is_contig, nprocs, nprocs_for_coll, myrank, do_collect = 0; -+ int contig_access_count = 0, buftype_is_contig; ++ int i, filetype_is_contig, nprocs, myrank, do_collect = 0; ++ int contig_access_count = 0, buftype_is_contig, interleave_count = 0; + int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs; -+ ADIO_Offset orig_fp, start_offset, end_offset, off, *offset_list = NULL; ++ ADIO_Offset orig_fp, start_offset, end_offset, off; ++ ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *end_offsets = NULL; + int *buf_idx = NULL, *len_list = NULL, *striping_info = NULL; + int old_error, tmp_error; + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &myrank); + -+ nprocs_for_coll = fd->hints->cb_nodes; + orig_fp = fd->fp_ind; + + /* IO patten identification if cb_write isn't disabled */ + if (fd->hints->cb_write != ADIOI_HINT_DISABLE) { + /* For this process's request, calculate the list of offsets and + lengths in the file and determine the start and end offsets. */ -+ ADIOI_LUSTRE_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset, -+ &offset_list, &len_list, &start_offset, -+ &end_offset, &contig_access_count); -+ /* Get striping information */ -+ ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1, nprocs); -+ /* check if the access pattern can benefit from collective write */ -+ do_collect = ADIOI_LUSTRE_Docollect(fd, contig_access_count, -+ len_list, nprocs); ++ ++ /* Note: end_offset points to the last byte-offset that will be accessed. ++ * e.g., if start_offset=0 and 100 bytes to be read, end_offset=99 ++ */ ++ ++ ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset, ++ &offset_list, &len_list, &start_offset, ++ &end_offset, &contig_access_count); ++ ++ /* each process communicates its start and end offsets to other ++ * processes. The result is an array each of start and end offsets ++ * stored in order of process rank. ++ */ ++ st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset)); ++ end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset)); ++ MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1, ++ ADIO_OFFSET, fd->comm); ++ MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1, ++ ADIO_OFFSET, fd->comm); ++ /* are the accesses of different processes interleaved? */ ++ for (i = 1; i < nprocs; i++) ++ if ((st_offsets[i] < end_offsets[i-1]) && ++ (st_offsets[i] <= end_offsets[i])) ++ interleave_count++; ++ /* This is a rudimentary check for interleaving, but should suffice ++ for the moment. */ ++ ++ /* Two typical access patterns can benefit from collective write. ++ * 1) the processes are interleaved, and ++ * 2) the req size is small. ++ */ ++ if (interleave_count > 0) { ++ do_collect = 1; ++ } else { ++ do_collect = ADIOI_LUSTRE_Docollect(fd, contig_access_count, ++ len_list, nprocs); ++ } + } + ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); + @@ -1530,6 +893,8 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + if (fd->hints->cb_write != ADIOI_HINT_DISABLE) { + ADIOI_Free(offset_list); + ADIOI_Free(len_list); ++ ADIOI_Free(st_offsets); ++ ADIOI_Free(end_offsets); + } + + fd->fp_ind = orig_fp; @@ -1550,18 +915,27 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + return; + } + ++ /* Get Lustre hints information */ ++ ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1); ++ + /* calculate what portions of the access requests of this process are + * located in which process + */ + ADIOI_LUSTRE_Calc_my_req(fd, offset_list, len_list, contig_access_count, + striping_info, nprocs, &count_my_req_procs, + &count_my_req_per_proc, &my_req, &buf_idx); -+ /* calculate what process's requests will be written by this process */ -+ ADIOI_LUSTRE_Calc_others_req(fd, count_my_req_procs, -+ count_my_req_per_proc, -+ my_req, nprocs, myrank, -+ start_offset, end_offset, striping_info, -+ &count_others_req_procs, &others_req); ++ ++ /* based on everyone's my_req, calculate what requests of other processes ++ * will be accessed by this process. ++ * count_others_req_procs = number of processes whose requests (including ++ * this process itself) will be accessed by this process ++ * count_others_req_per_proc[i] indicates how many separate contiguous ++ * requests of proc. i will be accessed by this process. ++ */ ++ ++ ADIOI_Calc_others_req(fd, count_my_req_procs, count_my_req_per_proc, ++ my_req, nprocs, myrank, &count_others_req_procs, ++ &others_req); + ADIOI_Free(count_my_req_per_proc); + + /* exchange data and write in sizes of no more than stripe_size. */ @@ -1570,6 +944,17 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + offset_list, len_list, contig_access_count, + striping_info, buf_idx, error_code); + ++ /* If this collective write is followed by an independent write, ++ * it's possible to have those subsequent writes on other processes ++ * race ahead and sneak in before the read-modify-write completes. ++ * We carry out a collective communication at the end here so no one ++ * can start independent i/o before collective I/O completes. ++ * ++ * need to do some gymnastics with the error codes so that if something ++ * went wrong, all processes report error, but if a process has a more ++ * specific error code, we can still have that process report the ++ * additional information */ ++ + old_error = *error_code; + if (*error_code != MPI_SUCCESS) + *error_code = MPI_ERR_IO; @@ -1619,6 +1004,8 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + ADIOI_Free(buf_idx); + ADIOI_Free(offset_list); + ADIOI_Free(len_list); ++ ADIOI_Free(st_offsets); ++ ADIOI_Free(end_offsets); + ADIOI_Free(striping_info); + +#ifdef HAVE_STATUS_SET_BYTES @@ -1637,6 +1024,9 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + fd->fp_sys_posn = -1; /* set it to null. */ +} + ++/* If successful, error_code is set to MPI_SUCCESS. Otherwise an error ++ * code is created and returned in error_code. ++ */ +static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf, + MPI_Datatype datatype, int nprocs, + int myrank, ADIOI_Access *others_req, @@ -1646,6 +1036,16 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + int *striping_info, int *buf_idx, + int *error_code) +{ ++ /* Send data to appropriate processes and write in sizes of no more ++ * than lustre stripe_size. ++ * The idea is to reduce the amount of extra memory required for ++ * collective I/O. If all data were written all at once, which is much ++ * easier, it would require temp space more than the size of user_buf, ++ * which is often unacceptable. For example, to write a distributed ++ * array to a file, where each local array is 8Mbytes, requiring ++ * at least another 8Mbytes of temp space is unacceptable. ++ */ ++ + int hole, i, j, m, flag, ntimes = 1 , max_ntimes, buftype_is_contig; + ADIO_Offset st_loc = -1, end_loc = -1, min_st_loc, max_end_loc; + ADIO_Offset off, req_off, send_off, iter_st_off, *off_list; @@ -1655,20 +1055,21 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + int *send_curr_offlen_ptr, *send_size; + int *partial_recv, *sent_to_proc, *recv_start_pos; + int *send_buf_idx, *curr_to_proc, *done_to_proc; -+ char *write_buf = NULL, *value; ++ char *write_buf = NULL; + MPI_Status status; + ADIOI_Flatlist_node *flat_buf = NULL; + MPI_Aint buftype_extent; -+ int stripe_size = striping_info[0], lflag, data_sieving = 0; -+ int stripe_count = striping_info[1], CO = striping_info[2]; -+ /* IO step size in each communication */ -+ static char myname[] = "ADIOI_EXCH_AND_WRITE"; ++ int stripe_size = striping_info[0], avail_cb_nodes = striping_info[2]; ++ int data_sieving = 0; + + *error_code = MPI_SUCCESS; /* changed below if error */ ++ /* only I/O errors are currently reported */ + -+ /* calculate the number of writes of stripe size -+ * to be done by each process and the max among all processes. ++ /* calculate the number of writes of stripe size to be done. + * That gives the no. of communication phases as well. ++ * Note: ++ * Because we redistribute data in stripe-contiguous pattern for Lustre, ++ * each process has the same no. of communication phases. + */ + + for (i = 0; i < nprocs; i++) { @@ -1678,7 +1079,6 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + break; + } + } -+ + for (i = 0; i < nprocs; i++) { + for (j = 0; j < others_req[i].count; j++) { + st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]); @@ -1696,10 +1096,12 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + MPI_Allreduce(&st_loc, &min_st_loc, 1, MPI_LONG_LONG_INT, MPI_MIN, fd->comm); + /* align downward */ + min_st_loc -= min_st_loc % (ADIO_Offset)stripe_size; -+ /* when nprocs < stripe_count, there will be trouble, because some client -+ * would access more than one OST in one whole communication. ++ ++ /* Each time, only avail_cb_nodes number of IO clients perform IO, ++ * so, step_size=avail_cb_nodes*stripe_size IO will be performed at most, ++ * and ntimes=whole_file_portion/step_size + */ -+ step_size = (ADIO_Offset)ADIOI_MIN(nprocs, stripe_count * CO) * stripe_size; ++ step_size = (ADIO_Offset) avail_cb_nodes * stripe_size; + max_ntimes = (int)((max_end_loc - min_st_loc) / step_size + 1); + if (ntimes) + write_buf = (char *) ADIOI_Malloc(stripe_size); @@ -1711,7 +1113,6 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + for (i = 0; i < nprocs; i++) { + for (j = 0; j < others_req[i].count; j ++) { + req_off = others_req[i].offsets[j]; -+ //m = (req_off - min_st_loc) / (stripe_size * stripe_count * CO); + m = (int)((req_off - min_st_loc) / step_size); + off_list[m] = ADIOI_MIN(off_list[m], req_off); + } @@ -1753,24 +1154,30 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + flat_buf = flat_buf->next; + } + MPI_Type_extent(datatype, &buftype_extent); ++ /* I need to check if there are any outstanding nonblocking writes to ++ * the file, which could potentially interfere with the writes taking ++ * place in this collective write call. Since this is not likely to be ++ * common, let me do the simplest thing possible here: Each process ++ * completes all pending nonblocking operations before completing. ++ */ ++ /*ADIOI_Complete_async(error_code); ++ if (*error_code != MPI_SUCCESS) return; ++ MPI_Barrier(fd->comm); ++ */ + + iter_st_off = min_st_loc; + + /* Although we have recognized the data according to OST index, + * a read-modify-write will be done if there is a hole between the data. -+ * For example: if blocksize=60, transfersize=30 and stripe_size=100, -+ * then process0 will collect data [0, 30] and [60, 90] then write. There -+ * is a hole [30, 60], which will cause a read-modify-write in [0, 90]. -+ * It will degrade collective performance. -+ * So we disable data sieving by default unless the hint "ds_in_coll" -+ * is set to "enable". ++ * For example: if blocksize=60, xfersize=30 and stripe_size=100, ++ * then rank0 will collect data [0, 30] and [60, 90] then write. There ++ * is a hole in [30, 60], which will cause a read-modify-write in [0, 90]. ++ * ++ * To reduce its impact on the performance, we can disable data sieving ++ * by hint "ds_in_coll". + */ + /* check the hint for data sieving */ -+ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); -+ MPI_Info_get(fd->info, "ds_in_coll", MPI_MAX_INFO_VAL, value, &lflag); -+ if (lflag && !strcmp(value, "enable")) -+ data_sieving = 1; -+ ADIOI_Free(value); ++ data_sieving = fd->hints->fs_hints.lustre.ds_in_coll; + + for (m = 0; m < max_ntimes; m++) { + /* go through all others_req and my_req to check which will be received @@ -1834,7 +1241,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + recv_curr_offlen_ptr[i] = j; + } + } -+ /* use hole to pass data_sieving flag into W_Exchange_data */ ++ /* use variable "hole" to pass data_sieving flag into W_Exchange_data */ + hole = data_sieving; + ADIOI_LUSTRE_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list, + len_list, send_size, recv_size, off, real_size, @@ -1845,7 +1252,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + curr_to_proc, done_to_proc, &hole, m, + buftype_extent, buf_idx, error_code); + if (*error_code != MPI_SUCCESS) -+ return; ++ goto over; + + flag = 0; + for (i = 0; i < nprocs; i++) @@ -1855,13 +1262,13 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + } + if (flag) { + /* check whether to do data sieving */ -+ if(data_sieving) { ++ if(data_sieving == ADIOI_HINT_ENABLE) { + ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE, + ADIO_EXPLICIT_OFFSET, off, &status, + error_code); + } else { -+ /* if there is no hole, write in one time; -+ * otherwise, write data separately */ ++ /* if there is no hole, write data in one time; ++ * otherwise, write data in several times */ + if (!hole) { + ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE, + ADIO_EXPLICIT_OFFSET, off, &status, @@ -1878,6 +1285,8 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + MPI_BYTE, ADIO_EXPLICIT_OFFSET, + others_req[i].offsets[j], &status, + error_code); ++ if (*error_code != MPI_SUCCESS) ++ goto over; + } + } + } @@ -1885,14 +1294,11 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + } + } + if (*error_code != MPI_SUCCESS) -+ return; ++ goto over; + } -+ + iter_st_off += max_size; + } -+ if (*error_code != MPI_SUCCESS) -+ return; -+ ++over: + if (ntimes) + ADIOI_Free(write_buf); + ADIOI_Free(recv_curr_offlen_ptr); @@ -1908,10 +1314,13 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + ADIOI_Free(off_list); +} + ++/* Sets error_code to MPI_SUCCESS if successful, or creates an error code ++ * in the case of error. ++ */ +static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf, + char *write_buf, -+ ADIOI_Flatlist_node * flat_buf, -+ ADIO_Offset * offset_list, ++ ADIOI_Flatlist_node *flat_buf, ++ ADIO_Offset *offset_list, + int *len_list, int *send_size, + int *recv_size, ADIO_Offset off, + int size, int *count, @@ -1919,15 +1328,15 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + int *sent_to_proc, int nprocs, + int myrank, int buftype_is_contig, + int contig_access_count, -+ int * striping_info, -+ ADIOI_Access * others_req, ++ int *striping_info, ++ ADIOI_Access *others_req, + int *send_buf_idx, + int *curr_to_proc, int *done_to_proc, + int *hole, int iter, + MPI_Aint buftype_extent, + int *buf_idx, int *error_code) +{ -+ int i, j, *tmp_len, nprocs_recv, nprocs_send, err; ++ int i, j, nprocs_recv, nprocs_send, err; + char **send_buf = NULL; + MPI_Request *requests, *send_req; + MPI_Datatype *recv_types; @@ -1947,7 +1356,6 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + sizeof(MPI_Datatype)); + /* +1 to avoid a 0-size malloc */ + -+ tmp_len = (int *) ADIOI_Malloc(nprocs * sizeof(int)); + j = 0; + for (i = 0; i < nprocs; i++) { + if (recv_size[i]) { @@ -1976,8 +1384,6 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos, + nprocs, nprocs_recv, sum); + -+ ADIOI_Free(tmp_len); -+ + /* check if there are any holes */ + *hole = 0; + for (i = 0; i < sum - 1; i++) { @@ -2000,7 +1406,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + *hole = 1; + } + /* check the hint for data sieving */ -+ if (data_sieving && nprocs_recv && *hole) { ++ if (data_sieving == ADIOI_HINT_ENABLE && nprocs_recv && *hole) { + ADIO_ReadContig(fd, write_buf, size, MPI_BYTE, + ADIO_EXPLICIT_OFFSET, off, &status, &err); + // --BEGIN ERROR HANDLING-- @@ -2010,6 +1416,9 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + myname, __LINE__, + MPI_ERR_IO, + "**ioRMWrdwr", 0); ++ ADIOI_Free(recv_types); ++ ADIOI_Free(srt_off); ++ ADIOI_Free(srt_len); + return; + } + // --END ERROR HANDLING-- @@ -2175,15 +1584,15 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c +} + +static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf, -+ ADIOI_Flatlist_node * flat_buf, ++ ADIOI_Flatlist_node *flat_buf, + char **send_buf, -+ ADIO_Offset * offset_list, ++ ADIO_Offset *offset_list, + int *len_list, int *send_size, -+ MPI_Request * requests, ++ MPI_Request *requests, + int *sent_to_proc, int nprocs, + int myrank, + int contig_access_count, -+ int * striping_info, ++ int *striping_info, + int *send_buf_idx, + int *curr_to_proc, + int *done_to_proc, int iter, @@ -2227,7 +1636,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + * longer than the single region that processor "p" is responsible + * for. + */ -+ p = ADIOI_LUSTRE_Calc_aggregator(fd, off, &len, nprocs, striping_info); ++ p = ADIOI_LUSTRE_Calc_aggregator(fd, off, &len, striping_info); + + if (send_buf_idx[p] < send_size[p]) { + if (curr_to_proc[p] + len > done_to_proc[p]) { @@ -2272,130 +1681,10 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + if (send_size[i]) + sent_to_proc[i] = curr_to_proc[i]; +} -+ -+static void ADIOI_Heap_merge(ADIOI_Access * others_req, int *count, -+ ADIO_Offset * srt_off, int *srt_len, -+ int *start_pos, int nprocs, int nprocs_recv, -+ int total_elements) -+{ -+ typedef struct { -+ ADIO_Offset *off_list; -+ int *len_list; -+ int nelem; -+ } heap_struct; -+ -+ heap_struct *a, tmp; -+ int i, j, heapsize, l, r, k, smallest; -+ -+ a = (heap_struct *) ADIOI_Malloc((nprocs_recv + 1) * -+ sizeof(heap_struct)); -+ -+ j = 0; -+ for (i = 0; i < nprocs; i++) -+ if (count[i]) { -+ a[j].off_list = &(others_req[i].offsets[start_pos[i]]); -+ a[j].len_list = &(others_req[i].lens[start_pos[i]]); -+ a[j].nelem = count[i]; -+ j++; -+ } -+ -+ /* build a heap out of the first element from each list, with -+ the smallest element of the heap at the root */ -+ -+ heapsize = nprocs_recv; -+ for (i = heapsize / 2 - 1; i >= 0; i--) { -+ /* Heapify(a, i, heapsize); Algorithm from Cormen et al. pg. 143 -+ modified for a heap with smallest element at root. I have -+ removed the recursion so that there are no function calls. -+ Function calls are too expensive. */ -+ k = i; -+ for (;;) { -+ l = 2 * (k + 1) - 1; -+ r = 2 * (k + 1); -+ -+ if ((l < heapsize) && (*(a[l].off_list) < *(a[k].off_list))) -+ smallest = l; -+ else -+ smallest = k; -+ -+ if ((r < heapsize) && -+ (*(a[r].off_list) < *(a[smallest].off_list))) -+ smallest = r; -+ -+ if (smallest != k) { -+ tmp.off_list = a[k].off_list; -+ tmp.len_list = a[k].len_list; -+ tmp.nelem = a[k].nelem; -+ -+ a[k].off_list = a[smallest].off_list; -+ a[k].len_list = a[smallest].len_list; -+ a[k].nelem = a[smallest].nelem; -+ -+ a[smallest].off_list = tmp.off_list; -+ a[smallest].len_list = tmp.len_list; -+ a[smallest].nelem = tmp.nelem; -+ -+ k = smallest; -+ } else -+ break; -+ } -+ } -+ -+ for (i = 0; i < total_elements; i++) { -+ /* extract smallest element from heap, i.e. the root */ -+ srt_off[i] = *(a[0].off_list); -+ srt_len[i] = *(a[0].len_list); -+ (a[0].nelem)--; -+ -+ if (!a[0].nelem) { -+ a[0].off_list = a[heapsize - 1].off_list; -+ a[0].len_list = a[heapsize - 1].len_list; -+ a[0].nelem = a[heapsize - 1].nelem; -+ heapsize--; -+ } else { -+ (a[0].off_list)++; -+ (a[0].len_list)++; -+ } -+ -+ /* Heapify(a, 0, heapsize); */ -+ k = 0; -+ for (;;) { -+ l = 2 * (k + 1) - 1; -+ r = 2 * (k + 1); -+ -+ if ((l < heapsize) && (*(a[l].off_list) < *(a[k].off_list))) -+ smallest = l; -+ else -+ smallest = k; -+ -+ if ((r < heapsize) && -+ (*(a[r].off_list) < *(a[smallest].off_list))) -+ smallest = r; -+ -+ if (smallest != k) { -+ tmp.off_list = a[k].off_list; -+ tmp.len_list = a[k].len_list; -+ tmp.nelem = a[k].nelem; -+ -+ a[k].off_list = a[smallest].off_list; -+ a[k].len_list = a[smallest].len_list; -+ a[k].nelem = a[smallest].nelem; -+ -+ a[smallest].off_list = tmp.off_list; -+ a[smallest].len_list = tmp.len_list; -+ a[smallest].nelem = tmp.nelem; -+ -+ k = smallest; -+ } else -+ break; -+ } -+ } -+ ADIOI_Free(a); -+} -diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c ---- ad_lustre_orig/ad_lustre_wrstr.c 1970-01-01 08:00:00.000000000 +0800 -+++ ad_lustre/ad_lustre_wrstr.c 2008-09-17 18:20:35.000000000 +0800 -@@ -0,0 +1,463 @@ +diff -ruN adio/ad_lustre_orig/ad_lustre_wrstr.c adio/ad_lustre/ad_lustre_wrstr.c +--- adio/ad_lustre_orig/ad_lustre_wrstr.c 1970-01-01 08:00:00.000000000 +0800 ++++ adio/ad_lustre/ad_lustre_wrstr.c 2009-02-27 10:35:18.000000000 +0800 +@@ -0,0 +1,467 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (C) 1997 University of Chicago. @@ -2403,7 +1692,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c + * + * Copyright (C) 2007 Oak Ridge National Laboratory + * -+ * Copyright (C) 2008 Sun Microsystems, Lustre group ++ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include "ad_lustre.h" @@ -2422,6 +1711,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c + MPIR_ERR_RECOVERABLE, myname, \ + __LINE__, MPI_ERR_IO, \ + "**iowswc", 0); \ ++ ADIOI_Free(writebuf); \ + return; \ + } \ + } \ @@ -2439,6 +1729,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c + MPIR_ERR_RECOVERABLE, myname, \ + __LINE__, MPI_ERR_IO, \ + "**iowsrc", 0); \ ++ ADIOI_Free(writebuf); \ + return; \ + } \ + } \ @@ -2454,6 +1745,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c + MPIR_ERR_RECOVERABLE, myname, \ + __LINE__, MPI_ERR_IO, \ + "**iowswc", 0); \ ++ ADIOI_Free(writebuf); \ + return; \ + } \ + req_len -= write_sz; \ @@ -2472,6 +1764,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c + MPIR_ERR_RECOVERABLE, myname, \ + __LINE__, MPI_ERR_IO, \ + "**iowsrc", 0); \ ++ ADIOI_Free(writebuf); \ + return; \ + } \ + write_sz = ADIOI_MIN(req_len, writebuf_len); \ @@ -2501,6 +1794,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c + MPIR_ERR_RECOVERABLE, myname, \ + __LINE__, MPI_ERR_IO, \ + "**iowswc", 0); \ ++ ADIOI_Free(writebuf); \ + return; \ + } \ + req_len -= write_sz; \ @@ -2535,8 +1829,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c + int flag, st_fwr_size, st_n_filetypes, writebuf_len, write_sz; + ADIO_Status status1; + int new_bwr_size, new_fwr_size; -+ char * value; -+ int stripe_size, lflag = 0; ++ int stripe_size; + static char myname[] = "ADIOI_LUSTRE_WriteStrided"; + int myrank; + MPI_Comm_rank(fd->comm, &myrank); @@ -2573,11 +1866,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c + bufsize = buftype_size * count; + + /* get striping info */ -+ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); -+ MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, &lflag); -+ if (lflag) -+ stripe_size = atoi(value); -+ ADIOI_Free(value); ++ stripe_size = fd->hints->striping_unit; + + /* Different buftype to different filetype */ + if (!buftype_is_contig && filetype_is_contig) { @@ -2620,8 +1909,10 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c + + if (fd->atomicity) + ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize); -+ if (*error_code != MPI_SUCCESS) ++ if (*error_code != MPI_SUCCESS) { ++ ADIOI_Free(writebuf); + return; ++ } + ADIOI_Free(writebuf); + if (file_ptr_type == ADIO_INDIVIDUAL) + fd->fp_ind = off; @@ -2838,8 +2129,10 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c + writebuf_off, &status1, error_code); + if (!(fd->atomicity)) + ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); -+ if (*error_code != MPI_SUCCESS) ++ if (*error_code != MPI_SUCCESS) { ++ ADIOI_Free(writebuf); + return; ++ } + } + if (fd->atomicity) + ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize); @@ -2859,63 +2152,9 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c + if (!buftype_is_contig) + ADIOI_Delete_flattened(datatype); +} -diff -ruN ad_lustre_orig/Makefile ad_lustre/Makefile ---- ad_lustre_orig/Makefile 1970-01-01 08:00:00.000000000 +0800 -+++ ad_lustre/Makefile 2008-09-17 18:20:35.000000000 +0800 -@@ -0,0 +1,50 @@ -+CC = gcc -+AR = ar cr -+RANLIB = ranlib -+LIBNAME = /work/download/mpich2-1.0.7-dev/lib/libmpich.a -+srcdir = /work/download/mpich2-1.0.7-dev/src/mpi/romio/adio/ad_lustre -+CC_SHL = true -+SHLIBNAME = /work/download/mpich2-1.0.7-dev/lib/libmpich -+ -+INCLUDE_DIR = -I. -I${srcdir}/../include -I../include -I../../include -I${srcdir}/../../../../include -I../../../../include -+CFLAGS = -I/work/download/mpich2-1.0.7-dev/src/mpid/ch3/include -I/work/download/mpich2-1.0.7-dev/src/mpid/ch3/include -I/work/download/mpich2-1.0.7-dev/src/mpid/common/datatype -I/work/download/mpich2-1.0.7-dev/src/mpid/common/datatype -I/work/download/mpich2-1.0.7-dev/src/mpid/ch3/channels/sock/include -I/work/download/mpich2-1.0.7-dev/src/mpid/ch3/channels/sock/include -I/work/download/mpich2-1.0.7-dev/src/mpid/common/sock -I/work/download/mpich2-1.0.7-dev/src/mpid/common/sock -I/work/download/mpich2-1.0.7-dev/src/mpid/common/sock/poll -I/work/download/mpich2-1.0.7-dev/src/mpid/common/sock/poll -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -O2 -DFORTRANDOUBLEUNDERSCORE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -DHAVE_ROMIOCONF_H -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 $(INCLUDE_DIR) -+ -+top_builddir = /work/download/mpich2-1.0.7-dev -+LIBTOOL = -+C_COMPILE_SHL = $(CC_SHL) -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -O2 -DFORTRANDOUBLEUNDERSCORE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -DHAVE_ROMIOCONF_H -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 $(INCLUDE_DIR) -+ -+VPATH = .:${srcdir} -+ -+AD_LUSTRE_OBJECTS = ad_lustre.o ad_lustre_open.o \ -+ ad_lustre_rwcontig.o ad_lustre_wrcoll.o ad_lustre_wrstr.o \ -+ ad_lustre_fcntl.o ad_lustre_hints.o ad_lustre_close.o \ -+ ad_lustre_aggregate.o -+ -+ -+default: $(LIBNAME) -+ @if [ "none" != "none" ] ; then \ -+ $(MAKE) $(SHLIBNAME).la ;\ -+ fi -+ -+.SUFFIXES: $(SUFFIXES) .p .lo -+ -+.c.o: -+ $(CC) $(CFLAGS) -c $< -+.c.lo: -+ $(C_COMPILE_SHL) -c $< -o _s$*.o -+ @mv -f _s$*.o $*.lo -+ -+$(LIBNAME): $(AD_LUSTRE_OBJECTS) -+ $(AR) $(LIBNAME) $(AD_LUSTRE_OBJECTS) -+ $(RANLIB) $(LIBNAME) -+ -+AD_LUSTRE_LOOBJECTS=$(AD_LUSTRE_OBJECTS:.o=.lo) -+$(SHLIBNAME).la: $(AD_LUSTRE_LOOBJECTS) -+ $(AR) $(SHLIBNAME).la $(AD_LUSTRE_LOOBJECTS) -+ -+coverage: -+ -@for file in ${AD_LUSTRE_OBJECTS:.o=.c} ; do \ -+ gcov -b -f $$file ; done -+ -+clean: -+ @rm -f *.o *.lo -diff -ruN ad_lustre_orig/Makefile.in ad_lustre/Makefile.in ---- ad_lustre_orig/Makefile.in 2008-09-17 14:36:57.000000000 +0800 -+++ ad_lustre/Makefile.in 2008-09-17 18:20:35.000000000 +0800 +diff -ruN adio/ad_lustre_orig/Makefile.in adio/ad_lustre/Makefile.in +--- adio/ad_lustre_orig/Makefile.in 2008-09-17 14:36:56.000000000 +0800 ++++ adio/ad_lustre/Makefile.in 2008-10-17 17:03:06.000000000 +0800 @@ -16,7 +16,9 @@ @VPATH@ @@ -2927,29 +2166,48 @@ diff -ruN ad_lustre_orig/Makefile.in ad_lustre/Makefile.in default: $(LIBNAME) @if [ "@ENABLE_SHLIB@" != "none" ] ; then \ -diff -ruN ad_lustre_orig/README ad_lustre/README ---- ad_lustre_orig/README 2008-09-17 14:36:57.000000000 +0800 -+++ ad_lustre/README 2008-09-17 18:20:35.000000000 +0800 -@@ -5,6 +5,22 @@ +diff -ruN adio/ad_lustre_orig/README adio/ad_lustre/README +--- adio/ad_lustre_orig/README 2008-09-17 14:36:56.000000000 +0800 ++++ adio/ad_lustre/README 2009-04-24 09:46:20.000000000 +0800 +@@ -5,6 +5,21 @@ o To post the code for ParColl (Partitioned collective IO) ----------------------------------------------------- +V05: +----------------------------------------------------- -+ o Improved data redistribution -+ - add I/O pattern identification. If request I/O size is big, -+ collective I/O won't be done. The hint big_req_size can be -+ used to define this. -+ - provide hint CO for load balancing to control the number -+ of IO clients for each OST -+ - divide the IO clients into the different OST groups to -+ produce stripe-contiguous I/O pattern -+ - reduce the collective overhead by hints contiguous_data and -+ same_io_size to remove unnecessary MPI_Alltoall() ++Improved data redistribution ++ o Improve I/O pattern identification. Besides checking interleaving, ++ if request I/O size is small, collective I/O will be performed. ++ The hint bigsize can be used to define the req size value. ++ o Provide hint CO for load balancing to control the number of ++ IO clients for each OST ++ o Produce stripe-contiguous I/O pattern that Lustre prefers + o Control read-modify-write in data sieving in collective IO + by hint ds_in_coll. ++ o Reduce extent lock conflicts by make each OST accessed by one or ++ more constant clients. + +----------------------------------------------------- V04: ----------------------------------------------------- o Direct IO and Lockless IO support +--- adio/common/ad_write_coll_orig.c 2009-02-27 22:06:46.000000000 +0800 ++++ adio/common/ad_write_coll.c 2008-10-15 11:25:38.000000000 +0800 +@@ -42,7 +42,7 @@ + int *send_buf_idx, int *curr_to_proc, + int *done_to_proc, int iter, + MPI_Aint buftype_extent); +-static void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count, ++void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count, + ADIO_Offset *srt_off, int *srt_len, int *start_pos, + int nprocs, int nprocs_recv, int total_elements); + +@@ -921,7 +921,7 @@ + + + +-static void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count, ++void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count, + ADIO_Offset *srt_off, int *srt_len, int *start_pos, + int nprocs, int nprocs_recv, int total_elements) + {