-diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
---- ad_lustre_orig/ad_lustre_aggregate.c 1970-01-01 08:00:00.000000000 +0800
-+++ ad_lustre/ad_lustre_aggregate.c 2008-09-17 18:20:35.000000000 +0800
-@@ -0,0 +1,676 @@
+--- configure_orig.in 2009-03-01 13:50:30.000000000 +0800
++++ configure.in 2009-02-27 13:35:42.000000000 +0800
+@@ -1123,8 +1123,14 @@
+ if test -n "$file_system_testfs"; then
+ AC_DEFINE(ROMIO_TESTFS,1,[Define for ROMIO with TESTFS])
+ fi
++#
++# Verify presence of lustre/lustre_user.h
++#
+ if test -n "$file_system_lustre"; then
+- AC_DEFINE(ROMIO_LUSTRE,1,[Define for ROMIO with LUSTRE])
++ AC_CHECK_HEADERS(lustre/lustre_user.h,
++ AC_DEFINE(ROMIO_LUSTRE,1,[Define for ROMIO with LUSTRE]),
++ AC_MSG_ERROR([LUSTRE support requested but cannot find lustre/lustre_user.h header file])
++ )
+ fi
+
+ if test -n "$file_system_xfs"; then
+--- adio/include/adioi_orig.h 2009-03-01 14:00:48.000000000 +0800
++++ adio/include/adioi.h 2009-04-24 15:26:44.000000000 +0800
+@@ -52,6 +52,12 @@
+ struct {
+ int debugmask;
+ } pvfs2;
++ struct {
++ int start_iodevice;
++ int co_ratio;
++ int coll_threshold;
++ int ds_in_coll;
++ } lustre;
+ } fs_hints;
+
+ };
+diff -ruN adio/ad_lustre_orig/ad_lustre_aggregate.c adio/ad_lustre/ad_lustre_aggregate.c
+--- adio/ad_lustre_orig/ad_lustre_aggregate.c 1970-01-01 08:00:00.000000000 +0800
++++ adio/ad_lustre/ad_lustre_aggregate.c 2009-05-05 15:22:40.000000000 +0800
+@@ -0,0 +1,304 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ * Copyright (C) 1997 University of Chicago.
+ *
+ * Copyright (C) 2007 Oak Ridge National Laboratory
+ *
-+ * Copyright (C) 2008 Sun Microsystems, Lustre group
++ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include "ad_lustre.h"
+#include "adio_extern.h"
+
-+void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int ** striping_info_ptr,
-+ int mode, int nprocs)
++#undef AGG_DEBUG
++
++void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int **striping_info_ptr,
++ int mode)
+{
+ int *striping_info = NULL;
+ /* get striping information:
-+ * striping_info[0] = stripe_size;
-+ * striping_info[1] = stripe_count;
-+ * striping_info[2] = CO;
++ * striping_info[0]: stripe_size
++ * striping_info[1]: stripe_count
++ * striping_info[2]: avail_cb_nodes
+ */
-+ /* for easy understanding, we name some variables */
-+ int stripe_size, stripe_count, CO = 1, CO_max = 1, lflag;
++ int stripe_size, stripe_count, CO = 1, CO_max = 1, CO_nodes, lflag;
++ int avail_cb_nodes, divisor, nprocs_for_coll = fd->hints->cb_nodes;
+ char *value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
++
++ /* Get hints value */
+ /* stripe size */
-+ MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, &lflag);
-+ if (lflag)
-+ stripe_size = atoi(value);
++ stripe_size = fd->hints->striping_unit;
+ /* stripe count */
-+ MPI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, value, &lflag);
-+ if (lflag)
-+ stripe_count = atoi(value);
+ /* stripe_size and stripe_count have been validated in ADIOI_LUSTRE_Open() */
++ stripe_count = fd->hints->striping_factor;
+
-+ /* calculate CO */
++ /* Calculate the available number of I/O clients, that is
++ * avail_cb_nodes=min(cb_nodes, stripe_count*CO), where
++ * CO=1 by default
++ */
+ if (!mode) {
-+ /* for collective read,
++ /* for collective read,
+ * if "CO" clients access the same OST simultaneously,
-+ * the OST disk seek time would be large. So, to avoid this,
++ * the OST disk seek time would be much. So, to avoid this,
+ * it might be better if 1 client only accesses 1 OST.
+ * So, we set CO = 1 to meet the above requirement.
+ */
+ CO = 1;
+ /*XXX: maybe there are other better way for collective read */
+ } else {
-+ /* CO_max: the largest number of IO clients for each ost group */
-+ CO_max = (nprocs - 1)/ stripe_count + 1;
++ /* CO_max: the largest number of IO clients for each ost group */
++ CO_max = (nprocs_for_coll - 1)/ stripe_count + 1;
+ /* CO also has been validated in ADIOI_LUSTRE_Open(), >0 */
-+ MPI_Info_get(fd->info, "CO", MPI_MAX_INFO_VAL, value, &lflag);
-+ if (lflag)
-+ CO = atoi(value);
++ CO = fd->hints->fs_hints.lustre.co_ratio;
+ CO = ADIOI_MIN(CO_max, CO);
+ }
-+ ADIOI_Free(value);
-+ /* although there are known "N" hints so far, we still malloc space here
-+ * instead of declaring an array[3] outside,
-+ * because on one hand in the future we probably need more hints, and
-+ * on the other hand this function can be called by
-+ * both collective read and write conveniently.
-+ */
++ /* Calculate how many IO clients we need */
++ /* To avoid extent lock conflicts,
++ * avail_cb_nodes should divide (stripe_count*CO) exactly,
++ * so that each OST is accessed by only one or more constant clients. */
++ CO_nodes = stripe_count * CO;
++ avail_cb_nodes = ADIOI_MIN(nprocs_for_coll, CO_nodes);
++ if (avail_cb_nodes < CO_nodes) {
++ do {
++ /* find the divisor of CO_nodes */
++ divisor = 1;
++ do {
++ divisor ++;
++ } while (CO_nodes % divisor);
++ CO_nodes = CO_nodes / divisor;
++ /* if stripe_count*CO is a prime number, change nothing */
++ if ((CO_nodes <= avail_cb_nodes) && (CO_nodes != 1)) {
++ avail_cb_nodes = CO_nodes;
++ break;
++ }
++ } while (CO_nodes != 1);
++ }
++
+ *striping_info_ptr = (int *) ADIOI_Malloc(3 * sizeof(int));
+ striping_info = *striping_info_ptr;
+ striping_info[0] = stripe_size;
+ striping_info[1] = stripe_count;
-+ striping_info[2] = CO;
++ striping_info[2] = avail_cb_nodes;
++
++ ADIOI_Free(value);
+}
+
+int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off,
-+ ADIO_Offset *len, int nprocs,
-+ int *striping_info)
++ ADIO_Offset *len, int *striping_info)
+{
-+ /* please refer the comments in above function for the detailed algorithm */
-+ int rank_index;
++ int rank_index, rank;
+ ADIO_Offset avail_bytes;
-+
+ int stripe_size = striping_info[0];
-+ int stripe_count = striping_info[1];
-+ int CO = striping_info[2];
-+ int avail_nprocs = ADIOI_MIN(stripe_count * CO, nprocs);
-+
-+ /* calculate the rank by offset directly */
-+ rank_index = (int)((off / stripe_size) % avail_nprocs);
-+ /* XXX: the above method is so simple that the processes in top ranks are always
-+ * chosen to be I/O clients. we hope they are different each time.
++ int avail_cb_nodes = striping_info[2];
++
++ /* Produce the stripe-contiguous pattern for Lustre */
++ rank_index = (int)((off / stripe_size) % avail_cb_nodes);
++
++ /* we index into fd_end with rank_index, and fd_end was allocated to be no
++ * bigger than fd->hins->cb_nodes. If we ever violate that, we're
++ * overrunning arrays. Obviously, we should never ever hit this abort
+ */
++ if (rank_index >= fd->hints->cb_nodes)
++ MPI_Abort(MPI_COMM_WORLD, 1);
+
+ avail_bytes = (off / (ADIO_Offset)stripe_size + 1) *
+ (ADIO_Offset)stripe_size - off;
+ /* this proc only has part of the requested contig. region */
+ *len = avail_bytes;
+ }
++ /* map our index to a rank */
++ /* NOTE: FOR NOW WE DON'T HAVE A MAPPING...JUST DO 0..NPROCS_FOR_COLL */
++ rank = fd->hints->ranklist[rank_index];
+
-+ return rank_index;
++ return rank;
+}
+
++/* ADIOI_LUSTRE_Calc_my_req() - calculate what portions of the access requests
++ * of this process are located in the file domains of various processes
++ * (including this one)
++ */
+void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list,
+ int *len_list, int contig_access_count,
+ int *striping_info, int nprocs,
+ int *count_my_req_procs_ptr,
+ int **count_my_req_per_proc_ptr,
-+ ADIOI_Access ** my_req_ptr,
++ ADIOI_Access **my_req_ptr,
+ int **buf_idx_ptr)
+{
++ /* Nothing different from ADIOI_Calc_my_req(), except calling
++ * ADIOI_Lustre_Calc_aggregator() instead of the old one */
+ int *count_my_req_per_proc, count_my_req_procs, *buf_idx;
+ int i, l, proc;
+ ADIO_Offset avail_len, rem_len, curr_idx, off;
+
+ *count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
+ count_my_req_per_proc = *count_my_req_per_proc_ptr;
++ /* count_my_req_per_proc[i] gives the no. of contig. requests of this
++ * process in process i's file domain. calloc initializes to zero.
++ * I'm allocating memory of size nprocs, so that I can do an
++ * MPI_Alltoall later on.
++ */
+
++ buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));
+ /* buf_idx is relevant only if buftype_is_contig.
+ * buf_idx[i] gives the index into user_buf where data received
+ * from proc. i should be placed. This allows receives to be done
+ * without extra buffer. This can't be done if buftype is not contig.
+ */
-+ buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));
++
+ /* initialize buf_idx to -1 */
+ for (i = 0; i < nprocs; i++)
+ buf_idx[i] = -1;
+ continue;
+ off = offset_list[i];
+ avail_len = len_list[i];
-+ /* we set avail_len to be the total size of the access.
++ /* note: we set avail_len to be the total size of the access.
+ * then ADIOI_LUSTRE_Calc_aggregator() will modify the value to return
+ * the amount that was available.
+ */
-+ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, nprocs,
-+ striping_info);
++ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info);
+ count_my_req_per_proc[proc]++;
++
+ /* figure out how many data is remaining in the access
+ * we'll take care of this data (if there is any)
+ * in the while loop below.
+ while (rem_len != 0) {
+ off += avail_len; /* point to first remaining byte */
+ avail_len = rem_len; /* save remaining size, pass to calc */
-+ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, nprocs,
-+ striping_info);
++ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info);
+ count_my_req_per_proc[proc]++;
+ rem_len -= avail_len; /* reduce remaining length by amount from fd */
+ }
+ }
+
++ /* now allocate space for my_req, offset, and len */
+ *my_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs * sizeof(ADIOI_Access));
+ my_req = *my_req_ptr;
+
+ /* now fill in my_req */
+ curr_idx = 0;
+ for (i = 0; i < contig_access_count; i++) {
++ /* short circuit offset/len processing if len == 0
++ * (zero-byte read/write */
+ if (len_list[i] == 0)
+ continue;
+ off = offset_list[i];
+ avail_len = len_list[i];
-+ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, nprocs,
-+ striping_info);
++ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info);
+
+ /* for each separate contiguous access from this process */
+ if (buf_idx[proc] == -1)
+ while (rem_len != 0) {
+ off += avail_len;
+ avail_len = rem_len;
-+ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, nprocs,
++ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len,
+ striping_info);
+ if (buf_idx[proc] == -1)
+ buf_idx[proc] = (int) curr_idx;
+ }
+ }
+ }
-+#endif
+#if 0
+ for (i = 0; i < nprocs; i++) {
+ FPRINTF(stdout, "buf_idx[%d] = 0x%x\n", i, buf_idx[i]);
+ }
+#endif
++#endif
+
+ *count_my_req_procs_ptr = count_my_req_procs;
+ *buf_idx_ptr = buf_idx;
+int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count,
+ int *len_list, int nprocs)
+{
-+ /* Algorithm:
-+ * So far, only one case is suitable for collective I/O
-+ * (1) request size <= big_req_size
-+ *
-+ * if (avg_req_size > big_req_size) {
-+ * docollect = 0;
-+ * }
++ /* If the processes are non-interleaved, we will check the req_size.
++ * if (avg_req_size > big_req_size) {
++ * docollect = 0;
++ * }
+ */
+
+ int i, docollect = 1, lflag, big_req_size = 0;
+ ADIO_Offset req_size = 0, total_req_size;
+ int avg_req_size, total_access_count;
-+ char *value = NULL;
+
+ /* calculate total_req_size and total_access_count */
+ for (i = 0; i < contig_access_count; i++)
+ fd->comm);
+ /* estimate average req_size */
+ avg_req_size = (int)(total_req_size / total_access_count);
-+
-+ /* get hint of hole_ratio */
-+ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
-+ MPI_Info_get(fd->info, "big_req_size", MPI_MAX_INFO_VAL, value, &lflag);
-+ if (lflag)
-+ big_req_size = atoi(value);
-+
++ /* get hint of big_req_size */
++ big_req_size = fd->hints->fs_hints.lustre.coll_threshold;
++ /* Don't perform collective I/O if there are big requests */
+ if ((big_req_size > 0) && (avg_req_size > big_req_size))
+ docollect = 0;
+
-+ ADIOI_Free(value);
-+
+ return docollect;
+}
-+
-+void ADIOI_LUSTRE_Calc_my_off_len(ADIO_File fd, int bufcount,
-+ MPI_Datatype datatype, int file_ptr_type,
-+ ADIO_Offset offset,
-+ ADIO_Offset **offset_list_ptr,
-+ int **len_list_ptr,
-+ ADIO_Offset *start_offset_ptr,
-+ ADIO_Offset *end_offset_ptr,
-+ int *contig_access_count_ptr)
-+{
-+ int filetype_size, buftype_size, etype_size;
-+ int i, j, k, frd_size = 0, old_frd_size = 0, st_index = 0;
-+ int n_filetypes, etype_in_filetype;
-+ ADIO_Offset abs_off_in_filetype = 0;
-+ int bufsize, sum, n_etypes_in_filetype, size_in_filetype;
-+ int contig_access_count, *len_list, flag, filetype_is_contig;
-+ MPI_Aint filetype_extent, filetype_lb;
-+ ADIOI_Flatlist_node *flat_file;
-+ ADIO_Offset *offset_list, off, end_offset = 0, disp;
-+
-+ /* For this process's request, calculate the list of offsets and
-+ lengths in the file and determine the start and end offsets. */
-+
-+ ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
-+
-+ MPI_Type_size(fd->filetype, &filetype_size);
-+ MPI_Type_extent(fd->filetype, &filetype_extent);
-+ MPI_Type_lb(fd->filetype, &filetype_lb);
-+ MPI_Type_size(datatype, &buftype_size);
-+ etype_size = fd->etype_size;
-+
-+ if (!filetype_size) {
-+ *contig_access_count_ptr = 0;
-+ *offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset));
-+ *len_list_ptr = (int *) ADIOI_Malloc(2 * sizeof(int));
-+ /* 2 is for consistency. everywhere I malloc one more than needed */
-+
-+ offset_list = *offset_list_ptr;
-+ len_list = *len_list_ptr;
-+ offset_list[0] = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
-+ fd->disp + etype_size * offset;
-+ len_list[0] = 0;
-+ *start_offset_ptr = offset_list[0];
-+ *end_offset_ptr = offset_list[0] + len_list[0] - 1;
-+ return;
-+ }
-+
-+ if (filetype_is_contig) {
-+ *contig_access_count_ptr = 1;
-+ *offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset));
-+ *len_list_ptr = (int *) ADIOI_Malloc(2 * sizeof(int));
-+ /* 2 is for consistency. everywhere I malloc one more than needed */
-+
-+ offset_list = *offset_list_ptr;
-+ len_list = *len_list_ptr;
-+ offset_list[0] = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
-+ fd->disp + etype_size * offset;
-+ len_list[0] = bufcount * buftype_size;
-+ *start_offset_ptr = offset_list[0];
-+ *end_offset_ptr = offset_list[0] + len_list[0] - 1;
-+
-+ /* update file pointer */
-+ if (file_ptr_type == ADIO_INDIVIDUAL)
-+ fd->fp_ind = *end_offset_ptr + 1;
-+ } else {
-+ /* First calculate what size of offset_list and len_list to allocate */
-+ /* filetype already flattened in ADIO_Open or ADIO_Fcntl */
-+ flat_file = ADIOI_Flatlist;
-+ while (flat_file->type != fd->filetype)
-+ flat_file = flat_file->next;
-+ disp = fd->disp;
-+
-+ if (file_ptr_type == ADIO_INDIVIDUAL) {
-+ offset = fd->fp_ind; /* in bytes */
-+ n_filetypes = -1;
-+ flag = 0;
-+ while (!flag) {
-+ n_filetypes++;
-+ for (i = 0; i < flat_file->count; i++) {
-+ if (disp + flat_file->indices[i] +
-+ (ADIO_Offset) n_filetypes * filetype_extent +
-+ flat_file->blocklens[i] >= offset) {
-+ st_index = i;
-+ frd_size = (int) (disp + flat_file->indices[i] +
-+ (ADIO_Offset) n_filetypes *
-+ filetype_extent +
-+ flat_file->blocklens[i] -
-+ offset);
-+ flag = 1;
-+ break;
-+ }
-+ }
-+ }
-+ } else {
-+ n_etypes_in_filetype = filetype_size / etype_size;
-+ n_filetypes = (int) (offset / n_etypes_in_filetype);
-+ etype_in_filetype = (int) (offset % n_etypes_in_filetype);
-+ size_in_filetype = etype_in_filetype * etype_size;
-+
-+ sum = 0;
-+ for (i = 0; i < flat_file->count; i++) {
-+ sum += flat_file->blocklens[i];
-+ if (sum > size_in_filetype) {
-+ st_index = i;
-+ frd_size = sum - size_in_filetype;
-+ abs_off_in_filetype = flat_file->indices[i] +
-+ size_in_filetype -
-+ (sum - flat_file->blocklens[i]);
-+ break;
-+ }
-+ }
-+
-+ /* abs. offset in bytes in the file */
-+ offset = disp + (ADIO_Offset) n_filetypes *filetype_extent +
-+ abs_off_in_filetype;
-+ }
-+
-+ /* calculate how much space to allocate for offset_list, len_list */
-+
-+ old_frd_size = frd_size;
-+ contig_access_count = i = 0;
-+ j = st_index;
-+ bufsize = buftype_size * bufcount;
-+ frd_size = ADIOI_MIN(frd_size, bufsize);
-+ while (i < bufsize) {
-+ if (frd_size)
-+ contig_access_count++;
-+ i += frd_size;
-+ j = (j + 1) % flat_file->count;
-+ frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize - i);
-+ }
-+
-+ /* allocate space for offset_list and len_list */
-+
-+ *offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc((contig_access_count+1) *
-+ sizeof(ADIO_Offset));
-+ *len_list_ptr = (int *) ADIOI_Malloc((contig_access_count + 1) *
-+ sizeof(int));
-+ /* +1 to avoid a 0-size malloc */
-+
-+ offset_list = *offset_list_ptr;
-+ len_list = *len_list_ptr;
-+
-+ /* find start offset, end offset, and fill in offset_list and len_list */
-+
-+ *start_offset_ptr = offset; /* calculated above */
-+
-+ i = k = 0;
-+ j = st_index;
-+ off = offset;
-+ frd_size = ADIOI_MIN(old_frd_size, bufsize);
-+ while (i < bufsize) {
-+ if (frd_size) {
-+ offset_list[k] = off;
-+ len_list[k] = frd_size;
-+ k++;
-+ }
-+ i += frd_size;
-+ end_offset = off + frd_size - 1;
-+
-+ /* Note: end_offset points to the last byte-offset that will be accessed.
-+ e.g., if start_offset=0 and 100 bytes to be read, end_offset=99 */
-+
-+ if (off + frd_size < disp + flat_file->indices[j] +
-+ flat_file->blocklens[j] +
-+ (ADIO_Offset) n_filetypes * filetype_extent) {
-+ off += frd_size;
-+ /* did not reach end of contiguous block in filetype.
-+ * no more I/O needed. off is incremented by frd_size.
-+ */
-+ } else {
-+ if (j < (flat_file->count - 1))
-+ j++;
-+ else {
-+ /* hit end of flattened filetype;
-+ * start at beginning again
-+ */
-+ j = 0;
-+ n_filetypes++;
-+ }
-+ off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes *
-+ filetype_extent;
-+ frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize - i);
-+ }
-+ }
-+
-+ /* update file pointer */
-+ if (file_ptr_type == ADIO_INDIVIDUAL)
-+ fd->fp_ind = off;
-+
-+ *contig_access_count_ptr = contig_access_count;
-+ *end_offset_ptr = end_offset;
-+ }
-+}
-+
-+void ADIOI_LUSTRE_Calc_others_req(ADIO_File fd, int count_my_req_procs,
-+ int *count_my_req_per_proc,
-+ ADIOI_Access * my_req,
-+ int nprocs, int myrank,
-+ ADIO_Offset start_offset,
-+ ADIO_Offset end_offset,
-+ int *striping_info,
-+ int *count_others_req_procs_ptr,
-+ ADIOI_Access ** others_req_ptr)
-+{
-+ /* what requests of other processes will be written by this process */
-+
-+ int *count_others_req_per_proc, count_others_req_procs;
-+ int i, j, lflag, samesize = 0, contiguous = 0;
-+ MPI_Request *send_requests, *recv_requests;
-+ MPI_Status *statuses;
-+ ADIOI_Access *others_req;
-+ char *value = NULL;
-+ int proc, avail_nprocs, stripe_count, CO;
-+ ADIO_Offset min_st_offset, off, req_len, avail_len, rem_len, *all_lens;
-+
-+ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
-+ /* same io size */
-+ MPI_Info_get(fd->info, "same_io_size", MPI_MAX_INFO_VAL, value, &lflag);
-+ if (lflag && !strcmp(value, "yes"))
-+ samesize = 1;
-+ /* contiguous data */
-+ MPI_Info_get(fd->info, "contiguous_data", MPI_MAX_INFO_VAL, value, &lflag);
-+ if (lflag && !strcmp(value, "yes"))
-+ contiguous = 1;
-+
-+ *others_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs *
-+ sizeof(ADIOI_Access));
-+ others_req = *others_req_ptr;
-+
-+ /* if the data are contiguous, we don't need to do MPI_Alltoall */
-+ if (contiguous) {
-+ stripe_count = striping_info[1];
-+ CO = striping_info[2];
-+
-+ for (i = 0; i < nprocs; i++) {
-+ others_req[i].count = 0;
-+ }
-+ req_len = end_offset - start_offset + 1;
-+ all_lens = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
-+
-+ if (samesize == 0) {/* different request size */
-+ /* calculate the min_st_offset */
-+ MPI_Allreduce(&start_offset, &min_st_offset, 1, MPI_LONG_LONG,
-+ MPI_MIN, fd->comm);
-+ /* exchange request length */
-+ MPI_Allgather(&req_len, 1, ADIO_OFFSET, all_lens, 1, ADIO_OFFSET,
-+ fd->comm);
-+ } else { /* same request size */
-+ /* calculate the min_st_offset */
-+ min_st_offset = start_offset - myrank * req_len;
-+ /* assign request length to all_lens[] */
-+ for (i = 0; i < nprocs; i ++)
-+ all_lens[i] = req_len;
-+ }
-+ avail_nprocs = ADIOI_MIN(nprocs, stripe_count * CO);
-+ if (myrank < avail_nprocs) {
-+ off = min_st_offset;
-+ /* calcaulte other_req[i].count */
-+ for (i = 0; i < nprocs; i++) {
-+ avail_len = all_lens[i];
-+ rem_len = avail_len;
-+ while (rem_len > 0) {
-+ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len,
-+ nprocs, striping_info);
-+ if (proc == myrank) {
-+ others_req[i].count ++;
-+ }
-+ off += avail_len;
-+ rem_len -= avail_len;
-+ avail_len = rem_len;
-+ }
-+ }
-+ /* calculate offset and len for each request */
-+ off = min_st_offset;
-+ for (i = 0; i < nprocs; i++) {
-+ if (others_req[i].count) {
-+ others_req[i].offsets = (ADIO_Offset *)
-+ ADIOI_Malloc(others_req[i].count *
-+ sizeof(ADIO_Offset));
-+ others_req[i].lens = (int *)
-+ ADIOI_Malloc(others_req[i].count *
-+ sizeof(int));
-+ others_req[i].mem_ptrs = (MPI_Aint *)
-+ ADIOI_Malloc(others_req[i].count *
-+ sizeof(MPI_Aint));
-+ }
-+ j = 0;
-+ avail_len = all_lens[i];
-+ rem_len = avail_len;
-+ while (rem_len > 0) {
-+ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len,
-+ nprocs, striping_info);
-+ if (proc == myrank) {
-+ others_req[i].offsets[j] = off;
-+ others_req[i].lens[j] = (int)avail_len;
-+ j ++;
-+ }
-+ off += avail_len;
-+ rem_len -= avail_len;
-+ avail_len = rem_len;
-+ }
-+ }
-+ }
-+ ADIOI_Free(value);
-+ ADIOI_Free(all_lens);
-+ } else {
-+ /* multiple non-contiguous requests */
-+ /* first find out how much to send/recv and from/to whom */
-+
-+ /*
-+ * count_others_req_procs:
-+ * number of processes whose requests will be written by
-+ * this process (including this process itself)
-+ * count_others_req_per_proc[i]:
-+ * how many separate contiguous requests of proc[i] will be
-+ * written by this process.
-+ */
-+
-+ count_others_req_per_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
-+
-+ MPI_Alltoall(count_my_req_per_proc, 1, MPI_INT,
-+ count_others_req_per_proc, 1, MPI_INT, fd->comm);
-+
-+ count_others_req_procs = 0;
-+ for (i = 0; i < nprocs; i++) {
-+ if (count_others_req_per_proc[i]) {
-+ others_req[i].count = count_others_req_per_proc[i];
-+ others_req[i].offsets = (ADIO_Offset *)
-+ ADIOI_Malloc(others_req[i].count *
-+ sizeof(ADIO_Offset));
-+ others_req[i].lens = (int *)
-+ ADIOI_Malloc(others_req[i].count *
-+ sizeof(int));
-+ others_req[i].mem_ptrs = (MPI_Aint *)
-+ ADIOI_Malloc(others_req[i].count *
-+ sizeof(MPI_Aint));
-+ count_others_req_procs++;
-+ } else
-+ others_req[i].count = 0;
-+ }
-+
-+ /* now send the calculated offsets and lengths to respective processes */
-+
-+ send_requests = (MPI_Request *) ADIOI_Malloc(2 * (count_my_req_procs + 1) *
-+ sizeof(MPI_Request));
-+ recv_requests = (MPI_Request *) ADIOI_Malloc(2 * (count_others_req_procs+1)*
-+ sizeof(MPI_Request));
-+ /* +1 to avoid a 0-size malloc */
-+
-+ j = 0;
-+ for (i = 0; i < nprocs; i++) {
-+ if (others_req[i].count) {
-+ MPI_Irecv(others_req[i].offsets, others_req[i].count,
-+ ADIO_OFFSET, i, i + myrank, fd->comm,
-+ &recv_requests[j]);
-+ j++;
-+ MPI_Irecv(others_req[i].lens, others_req[i].count,
-+ MPI_INT, i, i + myrank + 1, fd->comm,
-+ &recv_requests[j]);
-+ j++;
-+ }
-+ }
-+
-+ j = 0;
-+ for (i = 0; i < nprocs; i++) {
-+ if (my_req[i].count) {
-+ MPI_Isend(my_req[i].offsets, my_req[i].count,
-+ ADIO_OFFSET, i, i + myrank, fd->comm,
-+ &send_requests[j]);
-+ j++;
-+ MPI_Isend(my_req[i].lens, my_req[i].count,
-+ MPI_INT, i, i + myrank + 1, fd->comm,
-+ &send_requests[j]);
-+ j++;
-+ }
-+ }
-+
-+ statuses = (MPI_Status *)
-+ ADIOI_Malloc((1 + 2 * ADIOI_MAX(count_my_req_procs,
-+ count_others_req_procs)) *
-+ sizeof(MPI_Status));
-+ /* +1 to avoid a 0-size malloc */
-+
-+ MPI_Waitall(2 * count_my_req_procs, send_requests, statuses);
-+ MPI_Waitall(2 * count_others_req_procs, recv_requests, statuses);
-+
-+ ADIOI_Free(send_requests);
-+ ADIOI_Free(recv_requests);
-+ ADIOI_Free(statuses);
-+ ADIOI_Free(count_others_req_per_proc);
-+
-+ *count_others_req_procs_ptr = count_others_req_procs;
-+ }
-+}
-diff -ruN ad_lustre_orig/ad_lustre.c ad_lustre/ad_lustre.c
---- ad_lustre_orig/ad_lustre.c 2008-09-17 14:36:57.000000000 +0800
-+++ ad_lustre/ad_lustre.c 2008-09-17 18:20:35.000000000 +0800
+diff -ruN adio/ad_lustre_orig/ad_lustre.c adio/ad_lustre/ad_lustre.c
+--- adio/ad_lustre_orig/ad_lustre.c 2008-09-17 14:36:56.000000000 +0800
++++ adio/ad_lustre/ad_lustre.c 2008-10-17 17:03:42.000000000 +0800
@@ -1,9 +1,11 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/*
*
* Copyright (C) 2007 Oak Ridge National Laboratory
+ *
-+ * Copyright (C) 2008 Sun Microsystems, Lustre group
++ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include "ad_lustre.h"
-@@ -13,13 +15,13 @@
+@@ -13,12 +15,12 @@
ADIOI_LUSTRE_ReadContig, /* ReadContig */
ADIOI_LUSTRE_WriteContig, /* WriteContig */
ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
- ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */
+ ADIOI_LUSTRE_WriteStridedColl, /* WriteStridedColl */
ADIOI_GEN_SeekIndividual, /* SeekIndividual */
-- ADIOI_GEN_Fcntl, /* Fcntl */
-+ ADIOI_LUSTRE_Fcntl, /* Fcntl */
+ ADIOI_GEN_Fcntl, /* Fcntl */
ADIOI_LUSTRE_SetInfo, /* SetInfo */
ADIOI_GEN_ReadStrided, /* ReadStrided */
- ADIOI_GEN_WriteStrided, /* WriteStrided */
-- ADIOI_GEN_Close, /* Close */
+ ADIOI_LUSTRE_WriteStrided, /* WriteStrided */
-+ ADIOI_LUSTRE_Close, /* Close */
+ ADIOI_GEN_Close, /* Close */
#if defined(ROMIO_HAVE_WORKING_AIO) && !defined(CRAY_XT_LUSTRE)
ADIOI_GEN_IreadContig, /* IreadContig */
- ADIOI_GEN_IwriteContig, /* IwriteContig */
-diff -ruN ad_lustre_orig/ad_lustre_close.c ad_lustre/ad_lustre_close.c
---- ad_lustre_orig/ad_lustre_close.c 1970-01-01 08:00:00.000000000 +0800
-+++ ad_lustre/ad_lustre_close.c 2008-09-17 18:20:35.000000000 +0800
-@@ -0,0 +1,42 @@
-+/* -*- Mode: C; c-basic-offset:4 ; -*- */
-+/*
-+ *
-+ * Copyright (C) 1997 University of Chicago.
-+ * See COPYRIGHT notice in top-level directory.
-+ *
-+ * Copyright (C) 2007 Oak Ridge National Laboratory
-+ *
-+ * Copyright (C) 2008 Sun Microsystems, Lustre group
-+ */
-+
-+#include "ad_lustre.h"
-+
-+#ifdef PROFILE
-+#include "mpe.h"
-+#endif
-+
-+void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code)
-+{
-+ int err, derr = 0;
-+ static char myname[] = "ADIOI_LUSTRE_CLOSE";
-+
-+#ifdef PROFILE
-+ MPE_Log_event(9, 0, "start close");
-+#endif
-+
-+ err = close(fd->fd_sys);
-+
-+#ifdef PROFILE
-+ MPE_Log_event(10, 0, "end close");
-+#endif
-+
-+ fd->fd_sys = -1;
-+
-+ if (err == -1 || derr == -1) {
-+ *error_code =
-+ MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname,
-+ __LINE__, MPI_ERR_IO, "**io", "**io %s",
-+ strerror(errno));
-+ } else
-+ *error_code = MPI_SUCCESS;
-+}
-diff -ruN ad_lustre_orig/ad_lustre.h ad_lustre/ad_lustre.h
---- ad_lustre_orig/ad_lustre.h 2008-09-17 14:36:57.000000000 +0800
-+++ ad_lustre/ad_lustre.h 2008-10-06 16:07:21.000000000 +0800
+diff -ruN adio/ad_lustre_orig/ad_lustre.h adio/ad_lustre/ad_lustre.h
+--- adio/ad_lustre_orig/ad_lustre.h 2008-09-17 14:36:56.000000000 +0800
++++ adio/ad_lustre/ad_lustre.h 2009-05-05 15:34:58.000000000 +0800
@@ -1,9 +1,11 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/*
*
* Copyright (C) 2007 Oak Ridge National Laboratory
+ *
-+ * Copyright (C) 2008 Sun Microsystems, Lustre group
++ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef AD_UNIX_INCLUDE
-@@ -24,7 +26,32 @@
+@@ -24,7 +26,7 @@
/*#include <fcntl.h>*/
#include <sys/ioctl.h>
-+#ifdef WITH_LUSTRE
- #include "lustre/lustre_user.h"
-+#else
-+/* copy something from lustre_user.h here */
-+# define LOV_USER_MAGIC 0x0BD10BD0
-+# define LL_IOC_LOV_SETSTRIPE _IOW ('f', 154, long)
-+# define LL_IOC_LOV_GETSTRIPE _IOW ('f', 155, long)
-+# define lov_user_ost_data lov_user_ost_data_v1
-+struct lov_user_ost_data_v1 { /* per-stripe data structure */
-+ __u64 l_object_id; /* OST object ID */
-+ __u64 l_object_gr; /* OST object group (creating MDS number) */
-+ __u32 l_ost_gen; /* generation of this OST index */
-+ __u32 l_ost_idx; /* OST index in LOV */
-+} __attribute__((packed));
-+#define lov_user_md lov_user_md_v1
-+struct lov_user_md_v1 { /* LOV EA user data (host-endian) */
-+ __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V1 */
-+ __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
-+ __u64 lmm_object_id; /* LOV object ID */
-+ __u64 lmm_object_gr; /* LOV object group */
-+ __u32 lmm_stripe_size; /* size of stripe in bytes */
-+ __u16 lmm_stripe_count; /* num stripes in use for this object */
-+ __u16 lmm_stripe_offset; /* starting stripe offset in lmm_objects */
-+ struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */
-+} __attribute__((packed));
-+#endif
+-#include "lustre/lustre_user.h"
++#include <lustre/lustre_user.h>
#include "adio.h"
/*#include "adioi.h"*/
-@@ -41,24 +68,62 @@
+@@ -41,24 +43,31 @@
void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code);
void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code);
int *error_code);
void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
-
-+void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int ** striping_info_ptr,
-+ int mode, int nprocs);
-+int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off,
-+ ADIO_Offset *len, int nprocs,
-+ int *striping_info);
-+void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list,
-+ int *len_list, int contig_access_count,
-+ int *striping_info, int nprocs,
-+ int *count_my_req_procs_ptr,
-+ int **count_my_req_per_proc_ptr,
-+ ADIOI_Access ** my_req_ptr,
-+ int **buf_idx_ptr);
-+int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count,
-+ int *len_list, int nprocs);
-+void ADIOI_LUSTRE_Calc_my_off_len(ADIO_File fd, int bufcount,
-+ MPI_Datatype datatype, int file_ptr_type,
-+ ADIO_Offset offset,
-+ ADIO_Offset **offset_list_ptr,
-+ int **len_list_ptr,
-+ ADIO_Offset *start_offset_ptr,
-+ ADIO_Offset *end_offset_ptr,
-+ int *contig_access_count_ptr);
-+void ADIOI_LUSTRE_Calc_others_req(ADIO_File fd, int count_my_req_procs,
-+ int *count_my_req_per_proc,
-+ ADIOI_Access * my_req,
-+ int nprocs, int myrank,
-+ ADIO_Offset start_offset,
-+ ADIO_Offset end_offset,
-+ int *striping_info,
-+ int *count_others_req_procs_ptr,
-+ ADIOI_Access ** others_req_ptr);
#endif /* End of AD_UNIX_INCLUDE */
-diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c
---- ad_lustre_orig/ad_lustre_hints.c 2008-09-17 14:36:57.000000000 +0800
-+++ ad_lustre/ad_lustre_hints.c 2008-09-17 18:20:35.000000000 +0800
+diff -ruN adio/ad_lustre_orig/ad_lustre_hints.c adio/ad_lustre/ad_lustre_hints.c
+--- adio/ad_lustre_orig/ad_lustre_hints.c 2008-09-17 14:36:56.000000000 +0800
++++ adio/ad_lustre/ad_lustre_hints.c 2009-04-24 15:35:05.000000000 +0800
@@ -1,9 +1,11 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/*
*
* Copyright (C) 2007 Oak Ridge National Laboratory
+ *
-+ * Copyright (C) 2008 Sun Microsystems, Lustre group
++ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include "ad_lustre.h"
-@@ -11,130 +13,162 @@
-
+@@ -12,46 +14,56 @@
void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
-- char *value, *value_in_fd;
+ char *value, *value_in_fd;
- int flag, tmp_val[3], str_factor=-1, str_unit=0, start_iodev=-1;
-- struct lov_user_md lum = { 0 };
-- int err, myrank, fd_sys, perm, amode, old_mask;
-+ char *value = NULL;
-+ int flag, tmp_val, int_val, str_factor, str_unit, start_iodev;
++ int flag, stripe_val[3], str_factor = -1, str_unit=0, start_iodev=-1;
+ struct lov_user_md lum = { 0 };
+ int err, myrank, fd_sys, perm, amode, old_mask;
++ int int_val, tmp_val;
+ static char myname[] = "ADIOI_LUSTRE_SETINFO";
-- value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
+ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
if ( (fd->info) == MPI_INFO_NULL) {
- /* This must be part of the open call. can set striping parameters
- if necessary. */
fd->direct_read = fd->direct_write = 0;
-
- /* has user specified striping or server buffering parameters
++ /* initialize lustre hints */
++ MPI_Info_set(fd->info, "romio_lustre_co_ratio", "1");
++ fd->hints->fs_hints.lustre.co_ratio = 1;
++ MPI_Info_set(fd->info, "romio_lustre_coll_threshold", "0");
++ fd->hints->fs_hints.lustre.coll_threshold = 0;
++ MPI_Info_set(fd->info, "romio_lustre_ds_in_coll", "enable");
++ fd->hints->fs_hints.lustre.ds_in_coll = ADIOI_HINT_ENABLE;
+
+ /* has user specified striping or server buffering parameters
and do they have the same value on all processes? */
if (users_info != MPI_INFO_NULL) {
- MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
-- value, &flag);
++ /* striping information */
++ MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
+ value, &flag);
- if (flag)
-- str_unit=atoi(value);
--
++ if (flag)
+ str_unit=atoi(value);
+
- MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
-- value, &flag);
++ MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
+ value, &flag);
- if (flag)
-- str_factor=atoi(value);
-+ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
++ if (flag)
+ str_factor=atoi(value);
- MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL,
-+ /* direct read and write */
-+ MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL,
- value, &flag);
+- value, &flag);
- if (flag)
-- start_iodev=atoi(value);
--
++ MPI_Info_get(users_info, "romio_lustre_start_iodevice",
++ MPI_MAX_INFO_VAL, value, &flag);
++ if (flag)
+ start_iodev=atoi(value);
+
- MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL,
- value, &flag);
++ /* direct read and write */
++ MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL,
++ value, &flag);
if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) {
MPI_Info_set(fd->info, "direct_read", "true");
fd->direct_read = 1;
}
-
+-
- MPI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL,
+ MPI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL,
value, &flag);
if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) {
MPI_Info_set(fd->info, "direct_write", "true");
- fd->direct_write = 1;
+@@ -59,22 +71,23 @@
}
-- }
+ }
-- MPI_Comm_rank(fd->comm, &myrank);
-- if (myrank == 0) {
++ /* set striping information with ioctl */
+ MPI_Comm_rank(fd->comm, &myrank);
+ if (myrank == 0) {
- tmp_val[0] = str_factor;
- tmp_val[1] = str_unit;
- tmp_val[2] = start_iodev;
-- }
++ stripe_val[0] = str_factor;
++ stripe_val[1] = str_unit;
++ stripe_val[2] = start_iodev;
+ }
- MPI_Bcast(tmp_val, 3, MPI_INT, 0, fd->comm);
--
++ MPI_Bcast(stripe_val, 3, MPI_INT, 0, fd->comm);
+
- if (tmp_val[0] != str_factor
- || tmp_val[1] != str_unit
- || tmp_val[2] != start_iodev) {
-- FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: All keys"
-- "-striping_factor:striping_unit:start_iodevice "
-- "need to be identical across all processes\n");
-- MPI_Abort(MPI_COMM_WORLD, 1);
++ if (stripe_val[0] != str_factor
++ || stripe_val[1] != str_unit
++ || stripe_val[2] != start_iodev) {
+ FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: All keys"
+ "-striping_factor:striping_unit:start_iodevice "
+ "need to be identical across all processes\n");
+ MPI_Abort(MPI_COMM_WORLD, 1);
- } else if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0)) {
-- /* if user has specified striping info, process 0 tries to set it */
-- if (!myrank) {
-- if (fd->perm == ADIO_PERM_NULL) {
-- old_mask = umask(022);
-- umask(old_mask);
-- perm = old_mask ^ 0666;
-- }
-- else perm = fd->perm;
--
-- amode = 0;
-- if (fd->access_mode & ADIO_CREATE)
-- amode = amode | O_CREAT;
-- if (fd->access_mode & ADIO_RDONLY)
-- amode = amode | O_RDONLY;
-- if (fd->access_mode & ADIO_WRONLY)
-- amode = amode | O_WRONLY;
-- if (fd->access_mode & ADIO_RDWR)
-- amode = amode | O_RDWR;
-- if (fd->access_mode & ADIO_EXCL)
-- amode = amode | O_EXCL;
--
-- /* we need to create file so ensure this is set */
-- amode = amode | O_LOV_DELAY_CREATE | O_CREAT;
--
-- fd_sys = open(fd->filename, amode, perm);
++ } else if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0)) {
+ /* if user has specified striping info, process 0 tries to set it */
+ if (!myrank) {
+ if (fd->perm == ADIO_PERM_NULL) {
+@@ -100,9 +113,9 @@
+ amode = amode | O_LOV_DELAY_CREATE | O_CREAT;
+
+ fd_sys = open(fd->filename, amode, perm);
- if (fd_sys == -1) {
- if (errno != EEXIST)
- fprintf(stderr,
-- "Failure to open file %s %d %d\n",strerror(errno), amode, perm);
-- } else {
-- lum.lmm_magic = LOV_USER_MAGIC;
-- lum.lmm_pattern = 0;
-- lum.lmm_stripe_size = str_unit;
-- lum.lmm_stripe_count = str_factor;
-- lum.lmm_stripe_offset = start_iodev;
--
-- err = ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum);
++ if (fd_sys == -1) {
++ if (errno != EEXIST)
++ fprintf(stderr,
+ "Failure to open file %s %d %d\n",strerror(errno), amode, perm);
+ } else {
+ lum.lmm_magic = LOV_USER_MAGIC;
+@@ -112,25 +125,73 @@
+ lum.lmm_stripe_offset = start_iodev;
+
+ err = ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum);
- if (err == -1 && errno != EEXIST) {
-- fprintf(stderr, "Failure to set stripe info %s \n", strerror(errno));
-- }
-- close(fd_sys);
-- }
-- } /* End of striping parameters validation */
-+ /* stripe size */
-+ MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
-+ value, &flag);
-+ if (flag && (str_unit = atoi(value))) {
-+ tmp_val = str_unit;
-+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
-+ if (tmp_val != str_unit) {
-+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
-+ "striping_unit",
-+ error_code);
-+ return;
-+ }
-+ MPI_Info_set(fd->info, "striping_unit", value);
-+ }
-+ /* stripe count */
-+ MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
-+ value, &flag);
-+ if (flag && (str_factor = atoi(value))) {
-+ tmp_val = str_factor;
-+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
-+ if (tmp_val != str_factor) {
-+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
-+ "striping_factor",
-+ error_code);
-+ return;
-+ }
-+ MPI_Info_set(fd->info, "striping_factor", value);
-+ }
-+ /* stripe offset */
-+ MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL,
-+ value, &flag);
-+ if (flag && ((start_iodev = atoi(value)) >= 0)) {
-+ tmp_val = start_iodev;
-+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
-+ if (tmp_val != start_iodev) {
-+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
-+ "start_iodevice",
-+ error_code);
-+ return;
-+ }
-+ MPI_Info_set(fd->info, "start_iodevice", value);
-+ }
-+ /* CO */
-+ MPI_Info_get(users_info, "CO", MPI_MAX_INFO_VAL, value,
-+ &flag);
-+ if (flag && (int_val = atoi(value)) > 0) {
-+ tmp_val = int_val;
-+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
-+ if (tmp_val != int_val) {
-+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
-+ "CO",
-+ error_code);
-+ return;
-+ }
-+ MPI_Info_set(fd->info, "CO", value);
-+ }
-+ /* big_req_size */
-+ MPI_Info_get(users_info, "big_req_size", MPI_MAX_INFO_VAL, value,
-+ &flag);
-+ if (flag && (int_val = atoi(value)) > 0) {
-+ tmp_val = int_val;
-+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
-+ if (tmp_val != int_val) {
-+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
-+ "big_req_size",
-+ error_code);
-+ return;
-+ }
-+ MPI_Info_set(fd->info, "big_req_size", value);
-+ }
-+ /* hint for disabling data sieving when do collective IO */
-+ MPI_Info_get(users_info, "ds_in_coll", MPI_MAX_INFO_VAL,
-+ value, &flag);
-+ if (flag && (!strcmp(value, "enable") ||
-+ !strcmp(value, "ENABLE"))) {
-+ tmp_val = int_val = 1;
-+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
-+ if (tmp_val != int_val) {
-+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
-+ "ds_in_coll",
-+ error_code);
-+ return;
-+ }
-+ MPI_Info_set(fd->info, "ds_in_coll", "enable");
-+ }
-+ /* same io size */
-+ MPI_Info_get(users_info, "same_io_size", MPI_MAX_INFO_VAL,
-+ value, &flag);
-+ if (flag && (!strcmp(value, "yes") ||
-+ !strcmp(value, "YES"))) {
-+ tmp_val = int_val = 1;
-+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
-+ if (tmp_val != int_val) {
-+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
-+ "same_io_size",
-+ error_code);
-+ return;
-+ }
-+ MPI_Info_set(fd->info, "same_io_size", "yes");
-+ }
-+ /* contiguous data */
-+ MPI_Info_get(users_info, "contiguous_data", MPI_MAX_INFO_VAL,
-+ value, &flag);
-+ if (flag && (!strcmp(value, "yes") ||
-+ !strcmp(value, "YES"))) {
-+ tmp_val = int_val = 1;
-+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
-+ if (tmp_val != int_val) {
-+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
-+ "contiguous_data",
-+ error_code);
-+ return;
-+ }
-+ MPI_Info_set(fd->info, "contiguous_data", "yes");
-+ }
-+ ADIOI_Free(value);
++ if (err == -1 && errno != EEXIST) {
+ fprintf(stderr, "Failure to set stripe info %s \n", strerror(errno));
+ }
+ close(fd_sys);
+ }
+ } /* End of striping parameters validation */
}
-
-- MPI_Barrier(fd->comm);
+ MPI_Barrier(fd->comm);
- /* set the values for collective I/O and data sieving parameters */
- ADIOI_GEN_SetInfo(fd, users_info, error_code);
- } else {
- ADIOI_GEN_SetInfo(fd, users_info, error_code);
}
-
++ /* get other hint */
++ if (users_info != MPI_INFO_NULL) {
++ /* CO: IO Clients/OST,
++ * to keep the load balancing between clients and OSTs */
++ MPI_Info_get(users_info, "romio_lustre_co_ratio", MPI_MAX_INFO_VAL, value,
++ &flag);
++ if (flag && (int_val = atoi(value)) > 0) {
++ tmp_val = int_val;
++ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
++ if (tmp_val != int_val) {
++ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
++ "romio_lustre_co_ratio",
++ error_code);
++ ADIOI_Free(value);
++ return;
++ }
++ MPI_Info_set(fd->info, "romio_lustre_co_ratio", value);
++ fd->hints->fs_hints.lustre.co_ratio = atoi(value);
++ }
++ /* coll_threshold:
++ * if the req size is bigger than this, collective IO may not be performed.
++ */
++ MPI_Info_get(users_info, "romio_lustre_coll_threshold", MPI_MAX_INFO_VAL, value,
++ &flag);
++ if (flag && (int_val = atoi(value)) > 0) {
++ tmp_val = int_val;
++ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
++ if (tmp_val != int_val) {
++ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
++ "romio_lustre_coll_threshold",
++ error_code);
++ ADIOI_Free(value);
++ return;
++ }
++ MPI_Info_set(fd->info, "romio_lustre_coll_threshold", value);
++ fd->hints->fs_hints.lustre.coll_threshold = atoi(value);
++ }
++ /* ds_in_coll: disable data sieving in collective IO */
++ MPI_Info_get(users_info, "romio_lustre_ds_in_coll", MPI_MAX_INFO_VAL,
++ value, &flag);
++ if (flag && (!strcmp(value, "disable") ||
++ !strcmp(value, "DISABLE"))) {
++ tmp_val = int_val = 2;
++ MPI_Bcast(&tmp_val, 2, MPI_INT, 0, fd->comm);
++ if (tmp_val != int_val) {
++ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
++ "romio_lustre_ds_in_coll",
++ error_code);
++ ADIOI_Free(value);
++ return;
++ }
++ MPI_Info_set(fd->info, "romio_lustre_ds_in_coll", "disable");
++ fd->hints->fs_hints.lustre.ds_in_coll = ADIOI_HINT_DISABLE;
++ }
++ }
+ /* set the values for collective I/O and data sieving parameters */
+ ADIOI_GEN_SetInfo(fd, users_info, error_code);
+
if (ADIOI_Direct_read) fd->direct_read = 1;
if (ADIOI_Direct_write) fd->direct_write = 1;
-- ADIOI_Free(value);
--
- *error_code = MPI_SUCCESS;
- }
-diff -ruN ad_lustre_orig/ad_lustre_open.c ad_lustre/ad_lustre_open.c
---- ad_lustre_orig/ad_lustre_open.c 2008-09-17 14:36:57.000000000 +0800
-+++ ad_lustre/ad_lustre_open.c 2008-09-17 18:55:50.000000000 +0800
-@@ -1,18 +1,21 @@
+diff -ruN adio/ad_lustre_orig/ad_lustre_open.c adio/ad_lustre/ad_lustre_open.c
+--- adio/ad_lustre_orig/ad_lustre_open.c 2008-09-17 14:36:56.000000000 +0800
++++ adio/ad_lustre/ad_lustre_open.c 2009-03-01 11:32:32.000000000 +0800
+@@ -1,9 +1,11 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/*
- * Copyright (C) 1997 University of Chicago.
*
* Copyright (C) 2007 Oak Ridge National Laboratory
+ *
-+ * Copyright (C) 2008 Sun Microsystems, Lustre group
++ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include "ad_lustre.h"
+@@ -51,14 +53,17 @@
+ err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum);
- void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code)
- {
-- int perm, old_mask, amode, amode_direct;
-+ int perm, old_mask, amode = 0, amode_direct = 0, flag = 0, err, myrank;
-+ int stripe_size = 0, stripe_count = 0, stripe_offset = -1;
- struct lov_user_md lum = { 0 };
-- char *value;
-+ char *value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
-
- #if defined(MPICH2) || !defined(PRINT_ERR_MSG)
- static char myname[] = "ADIOI_LUSTRE_OPEN";
-@@ -22,12 +25,57 @@
- old_mask = umask(022);
- umask(old_mask);
- perm = old_mask ^ 0666;
-- }
-- else perm = fd->perm;
-+ } else
-+ perm = fd->perm;
+ if (!err) {
++ fd->hints->striping_unit = lum.lmm_stripe_size;
+ sprintf(value, "%d", lum.lmm_stripe_size);
+ MPI_Info_set(fd->info, "striping_unit", value);
-- amode = 0;
-- if (fd->access_mode & ADIO_CREATE)
-+ if (fd->access_mode & ADIO_CREATE) {
- amode = amode | O_CREAT;
-+ /* Check striping info
-+ * if already set by SetInfo(), set them to lum; otherwise, set by lum
-+ */
-+ MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value,
-+ &flag);
-+ if (flag)
-+ stripe_size = atoi(value);
-+
-+ MPI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, value,
-+ &flag);
-+ if (flag)
-+ stripe_count = atoi(value);
-+
-+ MPI_Info_get(fd->info, "start_iodevice", MPI_MAX_INFO_VAL, value,
-+ &flag);
-+ if (flag)
-+ stripe_offset = atoi(value);
-+
-+ /* if user has specified striping info,
-+ * process 0 will try to check and set it.
-+ */
-+ if ((stripe_size > 0) || (stripe_count > 0) || (stripe_offset >= 0)) {
-+ MPI_Comm_rank(fd->comm, &myrank);
-+ if (myrank == 0) {
-+ int fd_sys = open(fd->filename, amode, perm);
-+ if (fd_sys == -1) {
-+ if (errno != EEXIST)
-+ FPRINTF(stderr, "Failure to open file %s %d %d\n",
-+ strerror(errno), amode, perm);
-+ } else {
-+ lum.lmm_magic = LOV_USER_MAGIC;
-+ lum.lmm_pattern = 1;
-+ lum.lmm_stripe_size = stripe_size;
-+ lum.lmm_stripe_count = stripe_count;
-+ lum.lmm_stripe_offset = stripe_offset;
-+
-+ if (ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum))
-+ FPRINTF(stderr,
-+ "Failure to set striping info to Lustre!\n");
-+ close(fd_sys);
-+ }
-+ }
-+ MPI_Barrier(fd->comm);
-+ }
-+ }
-+
- if (fd->access_mode & ADIO_RDONLY)
- amode = amode | O_RDONLY;
- if (fd->access_mode & ADIO_WRONLY)
-@@ -42,32 +90,36 @@
- fd->fd_sys = open(fd->filename, amode|O_CREAT, perm);
++ fd->hints->striping_factor = lum.lmm_stripe_count;
+ sprintf(value, "%d", lum.lmm_stripe_count);
+ MPI_Info_set(fd->info, "striping_factor", value);
- if (fd->fd_sys != -1) {
-- int err;
--
-- value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
--
- /* get file striping information and set it in info */
-- lum.lmm_magic = LOV_USER_MAGIC;
-- err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum);
--
-- if (!err) {
-- sprintf(value, "%d", lum.lmm_stripe_size);
-- MPI_Info_set(fd->info, "striping_unit", value);
--
-- sprintf(value, "%d", lum.lmm_stripe_count);
-- MPI_Info_set(fd->info, "striping_factor", value);
--
-- sprintf(value, "%d", lum.lmm_stripe_offset);
++ fd->hints->fs_hints.lustre.start_iodevice = lum.lmm_stripe_offset;
+ sprintf(value, "%d", lum.lmm_stripe_offset);
- MPI_Info_set(fd->info, "start_iodevice", value);
-- }
-- ADIOI_Free(value);
-+ lum.lmm_magic = LOV_USER_MAGIC;
-+ err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum);
-
-+ if (!err) {
-+ if (lum.lmm_stripe_size && lum.lmm_stripe_count &&
-+ (lum.lmm_stripe_offset >= 0)) {
-+ sprintf(value, "%d", lum.lmm_stripe_size);
-+ MPI_Info_set(fd->info, "striping_unit", value);
-+
-+ sprintf(value, "%d", lum.lmm_stripe_count);
-+ MPI_Info_set(fd->info, "striping_factor", value);
-+
-+ sprintf(value, "%d", lum.lmm_stripe_offset);
-+ MPI_Info_set(fd->info, "start_iodevice", value);
-+ } else {
-+ FPRINTF(stderr, "Striping info is invalid!\n");
-+ ADIOI_Free(value);
-+ MPI_Abort(MPI_COMM_WORLD, 1);
-+ }
-+ } else {
-+ FPRINTF(stderr, "Failed to get striping info from Lustre!\n");
-+ ADIOI_Free(value);
-+ MPI_Abort(MPI_COMM_WORLD, 1);
-+ }
- if (fd->access_mode & ADIO_APPEND)
- fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
-- }
--
-+ }
- if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
-- fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
-+ fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
-
- fd->fd_direct = -1;
- if (fd->direct_write || fd->direct_read) {
-@@ -81,20 +133,22 @@
- }
++ MPI_Info_set(fd->info, "romio_lustre_start_iodevice", value);
+ }
+ ADIOI_Free(value);
- /* --BEGIN ERROR HANDLING-- */
-- if (fd->fd_sys == -1 || ((fd->fd_direct == -1) &&
-- (fd->direct_write || fd->direct_read))) {
-+ if (fd->fd_sys == -1 || ((fd->fd_direct == -1) &&
-+ (fd->direct_write || fd->direct_read))) {
- if (errno == ENAMETOOLONG)
- *error_code = MPIO_Err_create_code(MPI_SUCCESS,
-- MPIR_ERR_RECOVERABLE, myname,
-- __LINE__, MPI_ERR_BAD_FILE,
-+ MPIR_ERR_RECOVERABLE,
-+ myname, __LINE__,
-+ MPI_ERR_BAD_FILE,
- "**filenamelong",
- "**filenamelong %s %d",
- fd->filename,
- strlen(fd->filename));
- else if (errno == ENOENT)
- *error_code = MPIO_Err_create_code(MPI_SUCCESS,
-- MPIR_ERR_RECOVERABLE, myname,
-- __LINE__, MPI_ERR_NO_SUCH_FILE,
-+ MPIR_ERR_RECOVERABLE,
-+ myname, __LINE__,
-+ MPI_ERR_NO_SUCH_FILE,
- "**filenoexist",
- "**filenoexist %s",
- fd->filename);
-@@ -108,27 +162,30 @@
- fd->filename);
- else if (errno == EACCES) {
- *error_code = MPIO_Err_create_code(MPI_SUCCESS,
-- MPIR_ERR_RECOVERABLE, myname,
-- __LINE__, MPI_ERR_ACCESS,
-+ MPIR_ERR_RECOVERABLE,
-+ myname, __LINE__,
-+ MPI_ERR_ACCESS,
- "**fileaccess",
-- "**fileaccess %s",
-- fd->filename );
-- }
-- else if (errno == EROFS) {
-+ "**fileaccess %s",
-+ fd->filename);
-+ } else if (errno == EROFS) {
- /* Read only file or file system and write access requested */
- *error_code = MPIO_Err_create_code(MPI_SUCCESS,
-- MPIR_ERR_RECOVERABLE, myname,
-- __LINE__, MPI_ERR_READ_ONLY,
-- "**ioneedrd", 0 );
-- }
-- else {
-+ MPIR_ERR_RECOVERABLE,
-+ myname, __LINE__,
-+ MPI_ERR_READ_ONLY,
-+ "**ioneedrd", 0);
-+ } else {
- *error_code = MPIO_Err_create_code(MPI_SUCCESS,
-- MPIR_ERR_RECOVERABLE, myname,
-- __LINE__, MPI_ERR_IO, "**io",
-+ MPIR_ERR_RECOVERABLE,
-+ myname, __LINE__,
-+ MPI_ERR_IO, "**io",
- "**io %s", strerror(errno));
- }
-- }
-+ } else {
- /* --END ERROR HANDLING-- */
-- else *error_code = MPI_SUCCESS;
-+ *error_code = MPI_SUCCESS;
-+ }
-
-+ ADIOI_Free(value);
- }
-diff -ruN ad_lustre_orig/ad_lustre_rwcontig.c ad_lustre/ad_lustre_rwcontig.c
---- ad_lustre_orig/ad_lustre_rwcontig.c 2008-09-17 14:36:57.000000000 +0800
-+++ ad_lustre/ad_lustre_rwcontig.c 2008-09-17 18:52:01.000000000 +0800
+diff -ruN adio/ad_lustre_orig/ad_lustre_rwcontig.c adio/ad_lustre/ad_lustre_rwcontig.c
+--- adio/ad_lustre_orig/ad_lustre_rwcontig.c 2008-09-17 14:36:56.000000000 +0800
++++ adio/ad_lustre/ad_lustre_rwcontig.c 2009-05-05 15:34:29.000000000 +0800
@@ -1,9 +1,11 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/*
*
* Copyright (C) 2007 Oak Ridge National Laboratory
+ *
-+ * Copyright (C) 2008 Sun Microsystems, Lustre group
++ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#define _XOPEN_SOURCE 600
-@@ -138,7 +140,7 @@
+@@ -136,10 +138,23 @@
+ if (err == -1) goto ioerr;
+ }
- if (io_mode)
+- if (io_mode)
++ if (io_mode) {
++#ifdef ADIOI_MPE_LOGGING
++ MPE_Log_event(ADIOI_MPE_write_a, 0, NULL);
++#endif
err = write(fd->fd_sys, buf, len);
- else
-+ else
++#ifdef ADIOI_MPE_LOGGING
++ MPE_Log_event(ADIOI_MPE_write_b, 0, NULL);
++#endif
++ } else {
++#ifdef ADIOI_MPE_LOGGING
++ MPE_Log_event(ADIOI_MPE_read_a, 0, NULL);
++#endif
err = read(fd->fd_sys, buf, len);
++#ifdef ADIOI_MPE_LOGGING
++ MPE_Log_event(ADIOI_MPE_read_b, 0, NULL);
++#endif
++ }
} else {
err = ADIOI_LUSTRE_Directio(fd, buf, len, offset, io_mode);
-diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
---- ad_lustre_orig/ad_lustre_wrcoll.c 1970-01-01 08:00:00.000000000 +0800
-+++ ad_lustre/ad_lustre_wrcoll.c 2008-09-17 18:20:35.000000000 +0800
-@@ -0,0 +1,973 @@
+ }
+diff -ruN adio/ad_lustre_orig/ad_lustre_wrcoll.c adio/ad_lustre/ad_lustre_wrcoll.c
+--- adio/ad_lustre_orig/ad_lustre_wrcoll.c 1970-01-01 08:00:00.000000000 +0800
++++ adio/ad_lustre/ad_lustre_wrcoll.c 2009-04-24 14:48:34.000000000 +0800
+@@ -0,0 +1,934 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ * Copyright (C) 1997 University of Chicago.
+ *
+ * Copyright (C) 2007 Oak Ridge National Laboratory
+ *
-+ * Copyright (C) 2008 Sun Microsystems, Lustre group
++ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include "ad_lustre.h"
+ ADIO_Offset *offset_list,
+ int *len_list,
+ int contig_access_count,
-+ int * striping_info,
++ int *striping_info,
+ int *buf_idx, int *error_code);
+static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf,
-+ ADIOI_Flatlist_node * flat_buf,
++ ADIOI_Flatlist_node *flat_buf,
+ char **send_buf,
-+ ADIO_Offset * offset_list,
++ ADIO_Offset *offset_list,
+ int *len_list, int *send_size,
-+ MPI_Request * requests,
++ MPI_Request *requests,
+ int *sent_to_proc, int nprocs,
+ int myrank, int contig_access_count,
-+ int * striping_info,
++ int *striping_info,
+ int *send_buf_idx,
+ int *curr_to_proc,
+ int *done_to_proc, int iter,
+ MPI_Aint buftype_extent);
+static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
+ char *write_buf,
-+ ADIOI_Flatlist_node * flat_buf,
-+ ADIO_Offset * offset_list,
++ ADIOI_Flatlist_node *flat_buf,
++ ADIO_Offset *offset_list,
+ int *len_list, int *send_size,
+ int *recv_size, ADIO_Offset off,
+ int size, int *count,
+ int *sent_to_proc, int nprocs,
+ int myrank, int buftype_is_contig,
+ int contig_access_count,
-+ int * striping_info,
-+ ADIOI_Access * others_req,
++ int *striping_info,
++ ADIOI_Access *others_req,
+ int *send_buf_idx,
+ int *curr_to_proc,
+ int *done_to_proc, int *hole,
+ int iter, MPI_Aint buftype_extent,
+ int *buf_idx, int *error_code);
-+static void ADIOI_Heap_merge(ADIOI_Access * others_req, int *count,
-+ ADIO_Offset * srt_off, int *srt_len,
-+ int *start_pos, int nprocs, int nprocs_recv,
-+ int total_elements);
++void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count,
++ ADIO_Offset *srt_off, int *srt_len, int *start_pos,
++ int nprocs, int nprocs_recv, int total_elements);
+
+void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count,
+ MPI_Datatype datatype,
+ int file_ptr_type, ADIO_Offset offset,
-+ ADIO_Status * status, int *error_code)
++ ADIO_Status *status, int *error_code)
+{
++ /* Uses a generalized version of the extended two-phase method described
++ * in "An Extended Two-Phase Method for Accessing Sections of
++ * Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary,
++ * Scientific Programming, (5)4:301--317, Winter 1996.
++ * http://www.mcs.anl.gov/home/thakur/ext2ph.ps
++ */
++
+ ADIOI_Access *my_req;
+ /* array of nprocs access structures, one for each other process has
+ this process's request */
+ /* array of nprocs access structures, one for each other process
+ whose request is written by this process. */
+
-+ int i, filetype_is_contig, nprocs, nprocs_for_coll, myrank, do_collect = 0;
-+ int contig_access_count = 0, buftype_is_contig;
++ int i, filetype_is_contig, nprocs, myrank, do_collect = 0;
++ int contig_access_count = 0, buftype_is_contig, interleave_count = 0;
+ int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs;
-+ ADIO_Offset orig_fp, start_offset, end_offset, off, *offset_list = NULL;
++ ADIO_Offset orig_fp, start_offset, end_offset, off;
++ ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *end_offsets = NULL;
+ int *buf_idx = NULL, *len_list = NULL, *striping_info = NULL;
+ int old_error, tmp_error;
+
+ MPI_Comm_size(fd->comm, &nprocs);
+ MPI_Comm_rank(fd->comm, &myrank);
+
-+ nprocs_for_coll = fd->hints->cb_nodes;
+ orig_fp = fd->fp_ind;
+
+ /* IO patten identification if cb_write isn't disabled */
+ if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
+ /* For this process's request, calculate the list of offsets and
+ lengths in the file and determine the start and end offsets. */
-+ ADIOI_LUSTRE_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
-+ &offset_list, &len_list, &start_offset,
-+ &end_offset, &contig_access_count);
-+ /* Get striping information */
-+ ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1, nprocs);
-+ /* check if the access pattern can benefit from collective write */
-+ do_collect = ADIOI_LUSTRE_Docollect(fd, contig_access_count,
-+ len_list, nprocs);
++
++ /* Note: end_offset points to the last byte-offset that will be accessed.
++ * e.g., if start_offset=0 and 100 bytes to be read, end_offset=99
++ */
++
++ ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
++ &offset_list, &len_list, &start_offset,
++ &end_offset, &contig_access_count);
++
++ /* each process communicates its start and end offsets to other
++ * processes. The result is an array each of start and end offsets
++ * stored in order of process rank.
++ */
++ st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
++ end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
++ MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1,
++ ADIO_OFFSET, fd->comm);
++ MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1,
++ ADIO_OFFSET, fd->comm);
++ /* are the accesses of different processes interleaved? */
++ for (i = 1; i < nprocs; i++)
++ if ((st_offsets[i] < end_offsets[i-1]) &&
++ (st_offsets[i] <= end_offsets[i]))
++ interleave_count++;
++ /* This is a rudimentary check for interleaving, but should suffice
++ for the moment. */
++
++ /* Two typical access patterns can benefit from collective write.
++ * 1) the processes are interleaved, and
++ * 2) the req size is small.
++ */
++ if (interleave_count > 0) {
++ do_collect = 1;
++ } else {
++ do_collect = ADIOI_LUSTRE_Docollect(fd, contig_access_count,
++ len_list, nprocs);
++ }
+ }
+ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
+
+ if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
+ ADIOI_Free(offset_list);
+ ADIOI_Free(len_list);
++ ADIOI_Free(st_offsets);
++ ADIOI_Free(end_offsets);
+ }
+
+ fd->fp_ind = orig_fp;
+ return;
+ }
+
++ /* Get Lustre hints information */
++ ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1);
++
+ /* calculate what portions of the access requests of this process are
+ * located in which process
+ */
+ ADIOI_LUSTRE_Calc_my_req(fd, offset_list, len_list, contig_access_count,
+ striping_info, nprocs, &count_my_req_procs,
+ &count_my_req_per_proc, &my_req, &buf_idx);
-+ /* calculate what process's requests will be written by this process */
-+ ADIOI_LUSTRE_Calc_others_req(fd, count_my_req_procs,
-+ count_my_req_per_proc,
-+ my_req, nprocs, myrank,
-+ start_offset, end_offset, striping_info,
-+ &count_others_req_procs, &others_req);
++
++ /* based on everyone's my_req, calculate what requests of other processes
++ * will be accessed by this process.
++ * count_others_req_procs = number of processes whose requests (including
++ * this process itself) will be accessed by this process
++ * count_others_req_per_proc[i] indicates how many separate contiguous
++ * requests of proc. i will be accessed by this process.
++ */
++
++ ADIOI_Calc_others_req(fd, count_my_req_procs, count_my_req_per_proc,
++ my_req, nprocs, myrank, &count_others_req_procs,
++ &others_req);
+ ADIOI_Free(count_my_req_per_proc);
+
+ /* exchange data and write in sizes of no more than stripe_size. */
+ offset_list, len_list, contig_access_count,
+ striping_info, buf_idx, error_code);
+
++ /* If this collective write is followed by an independent write,
++ * it's possible to have those subsequent writes on other processes
++ * race ahead and sneak in before the read-modify-write completes.
++ * We carry out a collective communication at the end here so no one
++ * can start independent i/o before collective I/O completes.
++ *
++ * need to do some gymnastics with the error codes so that if something
++ * went wrong, all processes report error, but if a process has a more
++ * specific error code, we can still have that process report the
++ * additional information */
++
+ old_error = *error_code;
+ if (*error_code != MPI_SUCCESS)
+ *error_code = MPI_ERR_IO;
+ ADIOI_Free(buf_idx);
+ ADIOI_Free(offset_list);
+ ADIOI_Free(len_list);
++ ADIOI_Free(st_offsets);
++ ADIOI_Free(end_offsets);
+ ADIOI_Free(striping_info);
+
+#ifdef HAVE_STATUS_SET_BYTES
+ fd->fp_sys_posn = -1; /* set it to null. */
+}
+
++/* If successful, error_code is set to MPI_SUCCESS. Otherwise an error
++ * code is created and returned in error_code.
++ */
+static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf,
+ MPI_Datatype datatype, int nprocs,
+ int myrank, ADIOI_Access *others_req,
+ int *striping_info, int *buf_idx,
+ int *error_code)
+{
++ /* Send data to appropriate processes and write in sizes of no more
++ * than lustre stripe_size.
++ * The idea is to reduce the amount of extra memory required for
++ * collective I/O. If all data were written all at once, which is much
++ * easier, it would require temp space more than the size of user_buf,
++ * which is often unacceptable. For example, to write a distributed
++ * array to a file, where each local array is 8Mbytes, requiring
++ * at least another 8Mbytes of temp space is unacceptable.
++ */
++
+ int hole, i, j, m, flag, ntimes = 1 , max_ntimes, buftype_is_contig;
+ ADIO_Offset st_loc = -1, end_loc = -1, min_st_loc, max_end_loc;
+ ADIO_Offset off, req_off, send_off, iter_st_off, *off_list;
+ int *send_curr_offlen_ptr, *send_size;
+ int *partial_recv, *sent_to_proc, *recv_start_pos;
+ int *send_buf_idx, *curr_to_proc, *done_to_proc;
-+ char *write_buf = NULL, *value;
++ char *write_buf = NULL;
+ MPI_Status status;
+ ADIOI_Flatlist_node *flat_buf = NULL;
+ MPI_Aint buftype_extent;
-+ int stripe_size = striping_info[0], lflag, data_sieving = 0;
-+ int stripe_count = striping_info[1], CO = striping_info[2];
-+ /* IO step size in each communication */
-+ static char myname[] = "ADIOI_EXCH_AND_WRITE";
++ int stripe_size = striping_info[0], avail_cb_nodes = striping_info[2];
++ int data_sieving = 0;
+
+ *error_code = MPI_SUCCESS; /* changed below if error */
++ /* only I/O errors are currently reported */
+
-+ /* calculate the number of writes of stripe size
-+ * to be done by each process and the max among all processes.
++ /* calculate the number of writes of stripe size to be done.
+ * That gives the no. of communication phases as well.
++ * Note:
++ * Because we redistribute data in stripe-contiguous pattern for Lustre,
++ * each process has the same no. of communication phases.
+ */
+
+ for (i = 0; i < nprocs; i++) {
+ break;
+ }
+ }
-+
+ for (i = 0; i < nprocs; i++) {
+ for (j = 0; j < others_req[i].count; j++) {
+ st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]);
+ MPI_Allreduce(&st_loc, &min_st_loc, 1, MPI_LONG_LONG_INT, MPI_MIN, fd->comm);
+ /* align downward */
+ min_st_loc -= min_st_loc % (ADIO_Offset)stripe_size;
-+ /* when nprocs < stripe_count, there will be trouble, because some client
-+ * would access more than one OST in one whole communication.
++
++ /* Each time, only avail_cb_nodes number of IO clients perform IO,
++ * so, step_size=avail_cb_nodes*stripe_size IO will be performed at most,
++ * and ntimes=whole_file_portion/step_size
+ */
-+ step_size = (ADIO_Offset)ADIOI_MIN(nprocs, stripe_count * CO) * stripe_size;
++ step_size = (ADIO_Offset) avail_cb_nodes * stripe_size;
+ max_ntimes = (int)((max_end_loc - min_st_loc) / step_size + 1);
+ if (ntimes)
+ write_buf = (char *) ADIOI_Malloc(stripe_size);
+ for (i = 0; i < nprocs; i++) {
+ for (j = 0; j < others_req[i].count; j ++) {
+ req_off = others_req[i].offsets[j];
-+ //m = (req_off - min_st_loc) / (stripe_size * stripe_count * CO);
+ m = (int)((req_off - min_st_loc) / step_size);
+ off_list[m] = ADIOI_MIN(off_list[m], req_off);
+ }
+ flat_buf = flat_buf->next;
+ }
+ MPI_Type_extent(datatype, &buftype_extent);
++ /* I need to check if there are any outstanding nonblocking writes to
++ * the file, which could potentially interfere with the writes taking
++ * place in this collective write call. Since this is not likely to be
++ * common, let me do the simplest thing possible here: Each process
++ * completes all pending nonblocking operations before completing.
++ */
++ /*ADIOI_Complete_async(error_code);
++ if (*error_code != MPI_SUCCESS) return;
++ MPI_Barrier(fd->comm);
++ */
+
+ iter_st_off = min_st_loc;
+
+ /* Although we have recognized the data according to OST index,
+ * a read-modify-write will be done if there is a hole between the data.
-+ * For example: if blocksize=60, transfersize=30 and stripe_size=100,
-+ * then process0 will collect data [0, 30] and [60, 90] then write. There
-+ * is a hole [30, 60], which will cause a read-modify-write in [0, 90].
-+ * It will degrade collective performance.
-+ * So we disable data sieving by default unless the hint "ds_in_coll"
-+ * is set to "enable".
++ * For example: if blocksize=60, xfersize=30 and stripe_size=100,
++ * then rank0 will collect data [0, 30] and [60, 90] then write. There
++ * is a hole in [30, 60], which will cause a read-modify-write in [0, 90].
++ *
++ * To reduce its impact on the performance, we can disable data sieving
++ * by hint "ds_in_coll".
+ */
+ /* check the hint for data sieving */
-+ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
-+ MPI_Info_get(fd->info, "ds_in_coll", MPI_MAX_INFO_VAL, value, &lflag);
-+ if (lflag && !strcmp(value, "enable"))
-+ data_sieving = 1;
-+ ADIOI_Free(value);
++ data_sieving = fd->hints->fs_hints.lustre.ds_in_coll;
+
+ for (m = 0; m < max_ntimes; m++) {
+ /* go through all others_req and my_req to check which will be received
+ recv_curr_offlen_ptr[i] = j;
+ }
+ }
-+ /* use hole to pass data_sieving flag into W_Exchange_data */
++ /* use variable "hole" to pass data_sieving flag into W_Exchange_data */
+ hole = data_sieving;
+ ADIOI_LUSTRE_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
+ len_list, send_size, recv_size, off, real_size,
+ curr_to_proc, done_to_proc, &hole, m,
+ buftype_extent, buf_idx, error_code);
+ if (*error_code != MPI_SUCCESS)
-+ return;
++ goto over;
+
+ flag = 0;
+ for (i = 0; i < nprocs; i++)
+ }
+ if (flag) {
+ /* check whether to do data sieving */
-+ if(data_sieving) {
++ if(data_sieving == ADIOI_HINT_ENABLE) {
+ ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
+ ADIO_EXPLICIT_OFFSET, off, &status,
+ error_code);
+ } else {
-+ /* if there is no hole, write in one time;
-+ * otherwise, write data separately */
++ /* if there is no hole, write data in one time;
++ * otherwise, write data in several times */
+ if (!hole) {
+ ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
+ ADIO_EXPLICIT_OFFSET, off, &status,
+ MPI_BYTE, ADIO_EXPLICIT_OFFSET,
+ others_req[i].offsets[j], &status,
+ error_code);
++ if (*error_code != MPI_SUCCESS)
++ goto over;
+ }
+ }
+ }
+ }
+ }
+ if (*error_code != MPI_SUCCESS)
-+ return;
++ goto over;
+ }
-+
+ iter_st_off += max_size;
+ }
-+ if (*error_code != MPI_SUCCESS)
-+ return;
-+
++over:
+ if (ntimes)
+ ADIOI_Free(write_buf);
+ ADIOI_Free(recv_curr_offlen_ptr);
+ ADIOI_Free(off_list);
+}
+
++/* Sets error_code to MPI_SUCCESS if successful, or creates an error code
++ * in the case of error.
++ */
+static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
+ char *write_buf,
-+ ADIOI_Flatlist_node * flat_buf,
-+ ADIO_Offset * offset_list,
++ ADIOI_Flatlist_node *flat_buf,
++ ADIO_Offset *offset_list,
+ int *len_list, int *send_size,
+ int *recv_size, ADIO_Offset off,
+ int size, int *count,
+ int *sent_to_proc, int nprocs,
+ int myrank, int buftype_is_contig,
+ int contig_access_count,
-+ int * striping_info,
-+ ADIOI_Access * others_req,
++ int *striping_info,
++ ADIOI_Access *others_req,
+ int *send_buf_idx,
+ int *curr_to_proc, int *done_to_proc,
+ int *hole, int iter,
+ MPI_Aint buftype_extent,
+ int *buf_idx, int *error_code)
+{
-+ int i, j, *tmp_len, nprocs_recv, nprocs_send, err;
++ int i, j, nprocs_recv, nprocs_send, err;
+ char **send_buf = NULL;
+ MPI_Request *requests, *send_req;
+ MPI_Datatype *recv_types;
+ sizeof(MPI_Datatype));
+ /* +1 to avoid a 0-size malloc */
+
-+ tmp_len = (int *) ADIOI_Malloc(nprocs * sizeof(int));
+ j = 0;
+ for (i = 0; i < nprocs; i++) {
+ if (recv_size[i]) {
+ ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos,
+ nprocs, nprocs_recv, sum);
+
-+ ADIOI_Free(tmp_len);
-+
+ /* check if there are any holes */
+ *hole = 0;
+ for (i = 0; i < sum - 1; i++) {
+ *hole = 1;
+ }
+ /* check the hint for data sieving */
-+ if (data_sieving && nprocs_recv && *hole) {
++ if (data_sieving == ADIOI_HINT_ENABLE && nprocs_recv && *hole) {
+ ADIO_ReadContig(fd, write_buf, size, MPI_BYTE,
+ ADIO_EXPLICIT_OFFSET, off, &status, &err);
+ // --BEGIN ERROR HANDLING--
+ myname, __LINE__,
+ MPI_ERR_IO,
+ "**ioRMWrdwr", 0);
++ ADIOI_Free(recv_types);
++ ADIOI_Free(srt_off);
++ ADIOI_Free(srt_len);
+ return;
+ }
+ // --END ERROR HANDLING--
+}
+
+static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf,
-+ ADIOI_Flatlist_node * flat_buf,
++ ADIOI_Flatlist_node *flat_buf,
+ char **send_buf,
-+ ADIO_Offset * offset_list,
++ ADIO_Offset *offset_list,
+ int *len_list, int *send_size,
-+ MPI_Request * requests,
++ MPI_Request *requests,
+ int *sent_to_proc, int nprocs,
+ int myrank,
+ int contig_access_count,
-+ int * striping_info,
++ int *striping_info,
+ int *send_buf_idx,
+ int *curr_to_proc,
+ int *done_to_proc, int iter,
+ * longer than the single region that processor "p" is responsible
+ * for.
+ */
-+ p = ADIOI_LUSTRE_Calc_aggregator(fd, off, &len, nprocs, striping_info);
++ p = ADIOI_LUSTRE_Calc_aggregator(fd, off, &len, striping_info);
+
+ if (send_buf_idx[p] < send_size[p]) {
+ if (curr_to_proc[p] + len > done_to_proc[p]) {
+ if (send_size[i])
+ sent_to_proc[i] = curr_to_proc[i];
+}
-+
-+static void ADIOI_Heap_merge(ADIOI_Access * others_req, int *count,
-+ ADIO_Offset * srt_off, int *srt_len,
-+ int *start_pos, int nprocs, int nprocs_recv,
-+ int total_elements)
-+{
-+ typedef struct {
-+ ADIO_Offset *off_list;
-+ int *len_list;
-+ int nelem;
-+ } heap_struct;
-+
-+ heap_struct *a, tmp;
-+ int i, j, heapsize, l, r, k, smallest;
-+
-+ a = (heap_struct *) ADIOI_Malloc((nprocs_recv + 1) *
-+ sizeof(heap_struct));
-+
-+ j = 0;
-+ for (i = 0; i < nprocs; i++)
-+ if (count[i]) {
-+ a[j].off_list = &(others_req[i].offsets[start_pos[i]]);
-+ a[j].len_list = &(others_req[i].lens[start_pos[i]]);
-+ a[j].nelem = count[i];
-+ j++;
-+ }
-+
-+ /* build a heap out of the first element from each list, with
-+ the smallest element of the heap at the root */
-+
-+ heapsize = nprocs_recv;
-+ for (i = heapsize / 2 - 1; i >= 0; i--) {
-+ /* Heapify(a, i, heapsize); Algorithm from Cormen et al. pg. 143
-+ modified for a heap with smallest element at root. I have
-+ removed the recursion so that there are no function calls.
-+ Function calls are too expensive. */
-+ k = i;
-+ for (;;) {
-+ l = 2 * (k + 1) - 1;
-+ r = 2 * (k + 1);
-+
-+ if ((l < heapsize) && (*(a[l].off_list) < *(a[k].off_list)))
-+ smallest = l;
-+ else
-+ smallest = k;
-+
-+ if ((r < heapsize) &&
-+ (*(a[r].off_list) < *(a[smallest].off_list)))
-+ smallest = r;
-+
-+ if (smallest != k) {
-+ tmp.off_list = a[k].off_list;
-+ tmp.len_list = a[k].len_list;
-+ tmp.nelem = a[k].nelem;
-+
-+ a[k].off_list = a[smallest].off_list;
-+ a[k].len_list = a[smallest].len_list;
-+ a[k].nelem = a[smallest].nelem;
-+
-+ a[smallest].off_list = tmp.off_list;
-+ a[smallest].len_list = tmp.len_list;
-+ a[smallest].nelem = tmp.nelem;
-+
-+ k = smallest;
-+ } else
-+ break;
-+ }
-+ }
-+
-+ for (i = 0; i < total_elements; i++) {
-+ /* extract smallest element from heap, i.e. the root */
-+ srt_off[i] = *(a[0].off_list);
-+ srt_len[i] = *(a[0].len_list);
-+ (a[0].nelem)--;
-+
-+ if (!a[0].nelem) {
-+ a[0].off_list = a[heapsize - 1].off_list;
-+ a[0].len_list = a[heapsize - 1].len_list;
-+ a[0].nelem = a[heapsize - 1].nelem;
-+ heapsize--;
-+ } else {
-+ (a[0].off_list)++;
-+ (a[0].len_list)++;
-+ }
-+
-+ /* Heapify(a, 0, heapsize); */
-+ k = 0;
-+ for (;;) {
-+ l = 2 * (k + 1) - 1;
-+ r = 2 * (k + 1);
-+
-+ if ((l < heapsize) && (*(a[l].off_list) < *(a[k].off_list)))
-+ smallest = l;
-+ else
-+ smallest = k;
-+
-+ if ((r < heapsize) &&
-+ (*(a[r].off_list) < *(a[smallest].off_list)))
-+ smallest = r;
-+
-+ if (smallest != k) {
-+ tmp.off_list = a[k].off_list;
-+ tmp.len_list = a[k].len_list;
-+ tmp.nelem = a[k].nelem;
-+
-+ a[k].off_list = a[smallest].off_list;
-+ a[k].len_list = a[smallest].len_list;
-+ a[k].nelem = a[smallest].nelem;
-+
-+ a[smallest].off_list = tmp.off_list;
-+ a[smallest].len_list = tmp.len_list;
-+ a[smallest].nelem = tmp.nelem;
-+
-+ k = smallest;
-+ } else
-+ break;
-+ }
-+ }
-+ ADIOI_Free(a);
-+}
-diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c
---- ad_lustre_orig/ad_lustre_wrstr.c 1970-01-01 08:00:00.000000000 +0800
-+++ ad_lustre/ad_lustre_wrstr.c 2008-09-17 18:20:35.000000000 +0800
-@@ -0,0 +1,463 @@
+diff -ruN adio/ad_lustre_orig/ad_lustre_wrstr.c adio/ad_lustre/ad_lustre_wrstr.c
+--- adio/ad_lustre_orig/ad_lustre_wrstr.c 1970-01-01 08:00:00.000000000 +0800
++++ adio/ad_lustre/ad_lustre_wrstr.c 2009-02-27 10:35:18.000000000 +0800
+@@ -0,0 +1,467 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ * Copyright (C) 1997 University of Chicago.
+ *
+ * Copyright (C) 2007 Oak Ridge National Laboratory
+ *
-+ * Copyright (C) 2008 Sun Microsystems, Lustre group
++ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include "ad_lustre.h"
+ MPIR_ERR_RECOVERABLE, myname, \
+ __LINE__, MPI_ERR_IO, \
+ "**iowswc", 0); \
++ ADIOI_Free(writebuf); \
+ return; \
+ } \
+ } \
+ MPIR_ERR_RECOVERABLE, myname, \
+ __LINE__, MPI_ERR_IO, \
+ "**iowsrc", 0); \
++ ADIOI_Free(writebuf); \
+ return; \
+ } \
+ } \
+ MPIR_ERR_RECOVERABLE, myname, \
+ __LINE__, MPI_ERR_IO, \
+ "**iowswc", 0); \
++ ADIOI_Free(writebuf); \
+ return; \
+ } \
+ req_len -= write_sz; \
+ MPIR_ERR_RECOVERABLE, myname, \
+ __LINE__, MPI_ERR_IO, \
+ "**iowsrc", 0); \
++ ADIOI_Free(writebuf); \
+ return; \
+ } \
+ write_sz = ADIOI_MIN(req_len, writebuf_len); \
+ MPIR_ERR_RECOVERABLE, myname, \
+ __LINE__, MPI_ERR_IO, \
+ "**iowswc", 0); \
++ ADIOI_Free(writebuf); \
+ return; \
+ } \
+ req_len -= write_sz; \
+ int flag, st_fwr_size, st_n_filetypes, writebuf_len, write_sz;
+ ADIO_Status status1;
+ int new_bwr_size, new_fwr_size;
-+ char * value;
-+ int stripe_size, lflag = 0;
++ int stripe_size;
+ static char myname[] = "ADIOI_LUSTRE_WriteStrided";
+ int myrank;
+ MPI_Comm_rank(fd->comm, &myrank);
+ bufsize = buftype_size * count;
+
+ /* get striping info */
-+ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
-+ MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, &lflag);
-+ if (lflag)
-+ stripe_size = atoi(value);
-+ ADIOI_Free(value);
++ stripe_size = fd->hints->striping_unit;
+
+ /* Different buftype to different filetype */
+ if (!buftype_is_contig && filetype_is_contig) {
+
+ if (fd->atomicity)
+ ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize);
-+ if (*error_code != MPI_SUCCESS)
++ if (*error_code != MPI_SUCCESS) {
++ ADIOI_Free(writebuf);
+ return;
++ }
+ ADIOI_Free(writebuf);
+ if (file_ptr_type == ADIO_INDIVIDUAL)
+ fd->fp_ind = off;
+ writebuf_off, &status1, error_code);
+ if (!(fd->atomicity))
+ ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
-+ if (*error_code != MPI_SUCCESS)
++ if (*error_code != MPI_SUCCESS) {
++ ADIOI_Free(writebuf);
+ return;
++ }
+ }
+ if (fd->atomicity)
+ ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize);
+ if (!buftype_is_contig)
+ ADIOI_Delete_flattened(datatype);
+}
-diff -ruN ad_lustre_orig/Makefile ad_lustre/Makefile
---- ad_lustre_orig/Makefile 1970-01-01 08:00:00.000000000 +0800
-+++ ad_lustre/Makefile 2008-09-17 18:20:35.000000000 +0800
-@@ -0,0 +1,50 @@
-+CC = gcc
-+AR = ar cr
-+RANLIB = ranlib
-+LIBNAME = /work/download/mpich2-1.0.7-dev/lib/libmpich.a
-+srcdir = /work/download/mpich2-1.0.7-dev/src/mpi/romio/adio/ad_lustre
-+CC_SHL = true
-+SHLIBNAME = /work/download/mpich2-1.0.7-dev/lib/libmpich
-+
-+INCLUDE_DIR = -I. -I${srcdir}/../include -I../include -I../../include -I${srcdir}/../../../../include -I../../../../include
-+CFLAGS = -I/work/download/mpich2-1.0.7-dev/src/mpid/ch3/include -I/work/download/mpich2-1.0.7-dev/src/mpid/ch3/include -I/work/download/mpich2-1.0.7-dev/src/mpid/common/datatype -I/work/download/mpich2-1.0.7-dev/src/mpid/common/datatype -I/work/download/mpich2-1.0.7-dev/src/mpid/ch3/channels/sock/include -I/work/download/mpich2-1.0.7-dev/src/mpid/ch3/channels/sock/include -I/work/download/mpich2-1.0.7-dev/src/mpid/common/sock -I/work/download/mpich2-1.0.7-dev/src/mpid/common/sock -I/work/download/mpich2-1.0.7-dev/src/mpid/common/sock/poll -I/work/download/mpich2-1.0.7-dev/src/mpid/common/sock/poll -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -O2 -DFORTRANDOUBLEUNDERSCORE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -DHAVE_ROMIOCONF_H -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 $(INCLUDE_DIR)
-+
-+top_builddir = /work/download/mpich2-1.0.7-dev
-+LIBTOOL =
-+C_COMPILE_SHL = $(CC_SHL) -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -O2 -DFORTRANDOUBLEUNDERSCORE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -DHAVE_ROMIOCONF_H -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 $(INCLUDE_DIR)
-+
-+VPATH = .:${srcdir}
-+
-+AD_LUSTRE_OBJECTS = ad_lustre.o ad_lustre_open.o \
-+ ad_lustre_rwcontig.o ad_lustre_wrcoll.o ad_lustre_wrstr.o \
-+ ad_lustre_fcntl.o ad_lustre_hints.o ad_lustre_close.o \
-+ ad_lustre_aggregate.o
-+
-+
-+default: $(LIBNAME)
-+ @if [ "none" != "none" ] ; then \
-+ $(MAKE) $(SHLIBNAME).la ;\
-+ fi
-+
-+.SUFFIXES: $(SUFFIXES) .p .lo
-+
-+.c.o:
-+ $(CC) $(CFLAGS) -c $<
-+.c.lo:
-+ $(C_COMPILE_SHL) -c $< -o _s$*.o
-+ @mv -f _s$*.o $*.lo
-+
-+$(LIBNAME): $(AD_LUSTRE_OBJECTS)
-+ $(AR) $(LIBNAME) $(AD_LUSTRE_OBJECTS)
-+ $(RANLIB) $(LIBNAME)
-+
-+AD_LUSTRE_LOOBJECTS=$(AD_LUSTRE_OBJECTS:.o=.lo)
-+$(SHLIBNAME).la: $(AD_LUSTRE_LOOBJECTS)
-+ $(AR) $(SHLIBNAME).la $(AD_LUSTRE_LOOBJECTS)
-+
-+coverage:
-+ -@for file in ${AD_LUSTRE_OBJECTS:.o=.c} ; do \
-+ gcov -b -f $$file ; done
-+
-+clean:
-+ @rm -f *.o *.lo
-diff -ruN ad_lustre_orig/Makefile.in ad_lustre/Makefile.in
---- ad_lustre_orig/Makefile.in 2008-09-17 14:36:57.000000000 +0800
-+++ ad_lustre/Makefile.in 2008-09-17 18:20:35.000000000 +0800
+diff -ruN adio/ad_lustre_orig/Makefile.in adio/ad_lustre/Makefile.in
+--- adio/ad_lustre_orig/Makefile.in 2008-09-17 14:36:56.000000000 +0800
++++ adio/ad_lustre/Makefile.in 2008-10-17 17:03:06.000000000 +0800
@@ -16,7 +16,9 @@
@VPATH@
default: $(LIBNAME)
@if [ "@ENABLE_SHLIB@" != "none" ] ; then \
-diff -ruN ad_lustre_orig/README ad_lustre/README
---- ad_lustre_orig/README 2008-09-17 14:36:57.000000000 +0800
-+++ ad_lustre/README 2008-09-17 18:20:35.000000000 +0800
-@@ -5,6 +5,22 @@
+diff -ruN adio/ad_lustre_orig/README adio/ad_lustre/README
+--- adio/ad_lustre_orig/README 2008-09-17 14:36:56.000000000 +0800
++++ adio/ad_lustre/README 2009-04-24 09:46:20.000000000 +0800
+@@ -5,6 +5,21 @@
o To post the code for ParColl (Partitioned collective IO)
-----------------------------------------------------
+V05:
+-----------------------------------------------------
-+ o Improved data redistribution
-+ - add I/O pattern identification. If request I/O size is big,
-+ collective I/O won't be done. The hint big_req_size can be
-+ used to define this.
-+ - provide hint CO for load balancing to control the number
-+ of IO clients for each OST
-+ - divide the IO clients into the different OST groups to
-+ produce stripe-contiguous I/O pattern
-+ - reduce the collective overhead by hints contiguous_data and
-+ same_io_size to remove unnecessary MPI_Alltoall()
++Improved data redistribution
++ o Improve I/O pattern identification. Besides checking interleaving,
++ if request I/O size is small, collective I/O will be performed.
++ The hint bigsize can be used to define the req size value.
++ o Provide hint CO for load balancing to control the number of
++ IO clients for each OST
++ o Produce stripe-contiguous I/O pattern that Lustre prefers
+ o Control read-modify-write in data sieving in collective IO
+ by hint ds_in_coll.
++ o Reduce extent lock conflicts by make each OST accessed by one or
++ more constant clients.
+
+-----------------------------------------------------
V04:
-----------------------------------------------------
o Direct IO and Lockless IO support
+--- adio/common/ad_write_coll_orig.c 2009-02-27 22:06:46.000000000 +0800
++++ adio/common/ad_write_coll.c 2008-10-15 11:25:38.000000000 +0800
+@@ -42,7 +42,7 @@
+ int *send_buf_idx, int *curr_to_proc,
+ int *done_to_proc, int iter,
+ MPI_Aint buftype_extent);
+-static void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count,
++void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count,
+ ADIO_Offset *srt_off, int *srt_len, int *start_pos,
+ int nprocs, int nprocs_recv, int total_elements);
+
+@@ -921,7 +921,7 @@
+
+
+
+-static void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count,
++void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count,
+ ADIO_Offset *srt_off, int *srt_len, int *start_pos,
+ int nprocs, int nprocs_recv, int total_elements)
+ {