Whamcloud - gitweb
LU-8468 kernel: kernel update RHEL7.2 [3.10.0-327.28.2.el7]
[fs/lustre-release.git] / lustre / contrib / adio_driver_mpich2-1.0.7.patch
index 5f1daa3..9bb0126 100644 (file)
@@ -1,7 +1,40 @@
-diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
---- ad_lustre_orig/ad_lustre_aggregate.c       1970-01-01 08:00:00.000000000 +0800
-+++ ad_lustre/ad_lustre_aggregate.c    2008-10-15 22:26:35.000000000 +0800
-@@ -0,0 +1,514 @@
+--- configure_orig.in  2009-03-01 13:50:30.000000000 +0800
++++ configure.in       2009-02-27 13:35:42.000000000 +0800
+@@ -1123,8 +1123,14 @@
+ if test -n "$file_system_testfs"; then
+     AC_DEFINE(ROMIO_TESTFS,1,[Define for ROMIO with TESTFS])
+ fi
++#
++# Verify presence of lustre/lustre_user.h
++#
+ if test -n "$file_system_lustre"; then
+-    AC_DEFINE(ROMIO_LUSTRE,1,[Define for ROMIO with LUSTRE])
++    AC_CHECK_HEADERS(lustre/lustre_user.h,
++        AC_DEFINE(ROMIO_LUSTRE,1,[Define for ROMIO with LUSTRE]),
++        AC_MSG_ERROR([LUSTRE support requested but cannot find lustre/lustre_user.h header file])                                        
++    )
+ fi
+ if test -n "$file_system_xfs"; then
+--- adio/include/adioi_orig.h  2009-03-01 14:00:48.000000000 +0800
++++ adio/include/adioi.h       2009-04-24 15:26:44.000000000 +0800
+@@ -52,6 +52,12 @@
+           struct {
+                   int debugmask;
+           } pvfs2;
++            struct {
++                    int start_iodevice;
++                    int co_ratio;
++                    int coll_threshold;
++                    int ds_in_coll;
++            } lustre;
+     } fs_hints;
+ };
+diff -ruN adio/ad_lustre_orig/ad_lustre_aggregate.c adio/ad_lustre/ad_lustre_aggregate.c
+--- adio/ad_lustre_orig/ad_lustre_aggregate.c  1970-01-01 08:00:00.000000000 +0800
++++ adio/ad_lustre/ad_lustre_aggregate.c       2009-05-05 15:22:40.000000000 +0800
+@@ -0,0 +1,304 @@
 +/* -*- Mode: C; c-basic-offset:4 ; -*- */
 +/*
 + *   Copyright (C) 1997 University of Chicago.
@@ -9,17 +42,16 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
 + *
 + *   Copyright (C) 2007 Oak Ridge National Laboratory
 + *
-+ *   Copyright (C) 2008 Sun Microsystems, Lustre group
++ *   Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 + */
 +
 +#include "ad_lustre.h"
 +#include "adio_extern.h"
 +
-+void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int ** striping_info_ptr,
-+                                  int mode, int nprocs,
-+                                    ADIO_Offset *st_offsets,
-+                                    ADIO_Offset *end_offsets,
-+                                    ADIO_Offset *min_st_offset_ptr)
++#undef AGG_DEBUG
++
++void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int **striping_info_ptr,
++                                  int mode)
 +{
 +    int *striping_info = NULL;
 +    /* get striping information:
@@ -27,22 +59,16 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
 +     *  striping_info[1]: stripe_count
 +     *  striping_info[2]: avail_cb_nodes
 +     */
-+    int stripe_size, stripe_count, CO = 1, CO_max = 1, lflag, i;
-+    int user_cb_nodes = 0, avail_cb_nodes;
-+    int nprocs_for_coll = fd->hints->cb_nodes;
-+    ADIO_Offset min_st_offset, max_end_offset;
++    int stripe_size, stripe_count, CO = 1, CO_max = 1, CO_nodes, lflag;
++    int avail_cb_nodes, divisor, nprocs_for_coll = fd->hints->cb_nodes;
 +    char *value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
 +
 +    /* Get hints value */
 +    /* stripe size */
-+    MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, &lflag);
-+    if (lflag)
-+      stripe_size = atoi(value);
++    stripe_size = fd->hints->striping_unit;
 +    /* stripe count */
 +    /* stripe_size and stripe_count have been validated in ADIOI_LUSTRE_Open() */
-+    MPI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, value, &lflag);
-+    if (lflag)
-+      stripe_count = atoi(value);
++    stripe_count = fd->hints->striping_factor;
 +
 +    /* Calculate the available number of I/O clients, that is
 +     *  avail_cb_nodes=min(cb_nodes, stripe_count*CO), where
@@ -61,53 +87,38 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
 +        /* CO_max: the largest number of IO clients for each ost group */
 +        CO_max = (nprocs_for_coll - 1)/ stripe_count + 1;
 +        /* CO also has been validated in ADIOI_LUSTRE_Open(), >0 */
-+      MPI_Info_get(fd->info, "CO", MPI_MAX_INFO_VAL, value, &lflag);
-+      if (lflag)
-+          CO = atoi(value);
++      CO = fd->hints->fs_hints.lustre.co_ratio;
 +      CO = ADIOI_MIN(CO_max, CO);
 +    }
-+    avail_cb_nodes = ADIOI_MIN(nprocs_for_coll, stripe_count * CO);
-+
-+    /* user_cb_nodes*/
-+    MPI_Info_get(fd->info, "user_cb_nodes", MPI_MAX_INFO_VAL, value, &lflag);
-+    if (lflag)
-+      user_cb_nodes = atoi(value);
-+    /* If the user doesn't change the cb_nodes and
-+     * the whole file access portion is no larger than stripe size,
-+     * we will perform the IO by the same process (rank0 by default).
-+     */
-+    /* calculate the whole file access portion */
-+    min_st_offset = st_offsets[0];
-+    max_end_offset = end_offsets[0];
-+    for (i = 0; i < nprocs; i ++) {
-+        min_st_offset = ADIOI_MIN(min_st_offset, st_offsets[i]);
-+        max_end_offset = ADIOI_MAX(max_end_offset, end_offsets[i]);
-+    }
-+    if (!user_cb_nodes) {
-+        /* Check the whole file access portion
-+         *  if (whole_range <= stripe_size)
-+         *      then always collect data to the same process;
-+         *      set avail_cb_nodes=1; (rank0 by default).
-+         * This pattern can make good use of Lustre client cache and
-+         * avoid extent lock assigning and revoking.
-+         *
-+         * The recent experiments show good performance. We still need more
-+         * validation.
-+         */
-+        if ((max_end_offset > min_st_offset) &&
-+            (max_end_offset - min_st_offset) <= (ADIO_Offset) stripe_size)
-+            avail_cb_nodes = 1;
++    /* Calculate how many IO clients we need */
++    /* To avoid extent lock conflicts,
++     * avail_cb_nodes should divide (stripe_count*CO) exactly,
++     * so that each OST is accessed by only one or more constant clients. */
++    CO_nodes = stripe_count * CO;
++    avail_cb_nodes = ADIOI_MIN(nprocs_for_coll, CO_nodes);
++    if (avail_cb_nodes < CO_nodes) {
++        do {
++            /* find the divisor of CO_nodes */
++            divisor = 1;
++            do {
++                divisor ++;
++            } while (CO_nodes % divisor);
++            CO_nodes = CO_nodes / divisor;
++            /* if stripe_count*CO is a prime number, change nothing */
++            if ((CO_nodes <= avail_cb_nodes) && (CO_nodes != 1)) {
++                avail_cb_nodes = CO_nodes;
++                break;
++            }
++        } while (CO_nodes != 1);
 +    }
 +
-+    ADIOI_Free(value);
-+
 +    *striping_info_ptr = (int *) ADIOI_Malloc(3 * sizeof(int));
 +    striping_info = *striping_info_ptr;
 +    striping_info[0] = stripe_size;
 +    striping_info[1] = stripe_count;
 +    striping_info[2] = avail_cb_nodes;
 +
-+    *min_st_offset_ptr = min_st_offset;
++    ADIOI_Free(value);
 +}
 +
 +int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off,
@@ -121,6 +132,13 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
 +    /* Produce the stripe-contiguous pattern for Lustre */
 +    rank_index = (int)((off / stripe_size) % avail_cb_nodes);
 +
++    /* we index into fd_end with rank_index, and fd_end was allocated to be no
++     * bigger than fd->hins->cb_nodes.   If we ever violate that, we're
++     * overrunning arrays.  Obviously, we should never ever hit this abort
++     */
++    if (rank_index >= fd->hints->cb_nodes)
++          MPI_Abort(MPI_COMM_WORLD, 1);
++
 +    avail_bytes = (off / (ADIO_Offset)stripe_size + 1) *
 +                  (ADIO_Offset)stripe_size - off;
 +    if (avail_bytes < *len) {
@@ -134,12 +152,16 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
 +    return rank;
 +}
 +
++/* ADIOI_LUSTRE_Calc_my_req() - calculate what portions of the access requests
++ * of this process are located in the file domains of various processes
++ * (including this one)
++ */
 +void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list,
 +                            int *len_list, int contig_access_count,
 +                            int *striping_info, int nprocs,
 +                              int *count_my_req_procs_ptr,
 +                            int **count_my_req_per_proc_ptr,
-+                            ADIOI_Access ** my_req_ptr,
++                            ADIOI_Access **my_req_ptr,
 +                            int **buf_idx_ptr)
 +{
 +    /* Nothing different from ADIOI_Calc_my_req(), except calling
@@ -151,13 +173,19 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
 +
 +    *count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
 +    count_my_req_per_proc = *count_my_req_per_proc_ptr;
++    /* count_my_req_per_proc[i] gives the no. of contig. requests of this
++     * process in process i's file domain. calloc initializes to zero.
++     * I'm allocating memory of size nprocs, so that I can do an
++     * MPI_Alltoall later on.
++     */
 +
++    buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));
 +    /* buf_idx is relevant only if buftype_is_contig.
 +     * buf_idx[i] gives the index into user_buf where data received
 +     * from proc. i should be placed. This allows receives to be done
 +     * without extra buffer. This can't be done if buftype is not contig.
 +     */
-+    buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));
++
 +    /* initialize buf_idx to -1 */
 +    for (i = 0; i < nprocs; i++)
 +      buf_idx[i] = -1;
@@ -173,12 +201,13 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
 +          continue;
 +      off = offset_list[i];
 +      avail_len = len_list[i];
-+      /* we set avail_len to be the total size of the access.
++      /* note: we set avail_len to be the total size of the access.
 +       * then ADIOI_LUSTRE_Calc_aggregator() will modify the value to return
 +       * the amount that was available.
 +       */
 +      proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info);
 +      count_my_req_per_proc[proc]++;
++
 +      /* figure out how many data is remaining in the access
 +       * we'll take care of this data (if there is any)
 +       * in the while loop below.
@@ -194,6 +223,7 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
 +      }
 +    }
 +
++    /* now allocate space for my_req, offset, and len */
 +    *my_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs * sizeof(ADIOI_Access));
 +    my_req = *my_req_ptr;
 +
@@ -213,6 +243,8 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
 +    /* now fill in my_req */
 +    curr_idx = 0;
 +    for (i = 0; i < contig_access_count; i++) {
++      /* short circuit offset/len processing if len == 0
++       *      (zero-byte  read/write */
 +      if (len_list[i] == 0)
 +          continue;
 +      off = offset_list[i];
@@ -266,12 +298,12 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
 +          }
 +      }
 +    }
-+#endif
 +#if 0
 +    for (i = 0; i < nprocs; i++) {
 +      FPRINTF(stdout, "buf_idx[%d] = 0x%x\n", i, buf_idx[i]);
 +    }
 +#endif
++#endif
 +
 +    *count_my_req_procs_ptr = count_my_req_procs;
 +    *buf_idx_ptr = buf_idx;
@@ -289,7 +321,6 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
 +    int i, docollect = 1, lflag, big_req_size = 0;
 +    ADIO_Offset req_size = 0, total_req_size;
 +    int avg_req_size, total_access_count;
-+    char *value = NULL;
 +
 +    /* calculate total_req_size and total_access_count */
 +    for (i = 0; i < contig_access_count; i++)
@@ -300,225 +331,17 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
 +               fd->comm);
 +    /* estimate average req_size */
 +    avg_req_size = (int)(total_req_size / total_access_count);
-+
 +    /* get hint of big_req_size */
-+    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
-+    MPI_Info_get(fd->info, "big_req_size", MPI_MAX_INFO_VAL, value, &lflag);
-+    if (lflag)
-+        big_req_size = atoi(value);
++    big_req_size = fd->hints->fs_hints.lustre.coll_threshold;
 +    /* Don't perform collective I/O if there are big requests */
 +    if ((big_req_size > 0) && (avg_req_size > big_req_size))
 +        docollect = 0;
 +
-+    ADIOI_Free(value);
-+
 +    return docollect;
 +}
-+
-+void ADIOI_LUSTRE_Calc_others_req(ADIO_File fd, int count_my_req_procs,
-+                                int *count_my_req_per_proc,
-+                                ADIOI_Access * my_req,
-+                                int nprocs, int myrank,
-+                                  ADIO_Offset req_len,
-+                                  ADIO_Offset min_st_offset,
-+                                  int *striping_info,
-+                                int *count_others_req_procs_ptr,
-+                                ADIOI_Access ** others_req_ptr)
-+{
-+    /* what requests of other processes will be written by this process */
-+
-+    int *count_others_req_per_proc, count_others_req_procs, proc;
-+    int i, j, lflag, samesize = 0, contiguous = 0;
-+    int avail_cb_nodes = striping_info[2];
-+    MPI_Request *send_requests, *recv_requests;
-+    MPI_Status *statuses;
-+    ADIOI_Access *others_req;
-+    char *value = NULL;
-+    ADIO_Offset off, avail_len, rem_len, *all_lens;
-+
-+    /* There are two hints, which could reduce some MPI communication overhead,
-+     * if the users knows the I/O pattern and set them correctly. */
-+    /* They are
-+     * contiguous_data: if the data are contiguous,
-+     *                  we don't need to do MPI_Alltoall().
-+     * same_io_size: And if the data req size is same,
-+     *               we can calculate the offset directly
-+     */
-+    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
-+    /* hint of contiguous data */
-+    MPI_Info_get(fd->info, "contiguous_data", MPI_MAX_INFO_VAL, value, &lflag);
-+    if (lflag && !strcmp(value, "yes"))
-+        contiguous = 1;
-+    /* hint of same io size */
-+    MPI_Info_get(fd->info, "same_io_size", MPI_MAX_INFO_VAL, value, &lflag);
-+    if (lflag && !strcmp(value, "yes"))
-+        samesize = 1;
-+    ADIOI_Free(value);
-+
-+    *others_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs *
-+                                                    sizeof(ADIOI_Access));
-+    others_req = *others_req_ptr;
-+
-+    /* if the data are contiguous, we can calulate the offset and length
-+     * of the other requests simply, instead of MPI_Alltoall() */
-+    if (contiguous) {
-+        for (i = 0; i < nprocs; i++) {
-+            others_req[i].count = 0;
-+        }
-+        all_lens = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
-+
-+        /* same req size ? */
-+        if (samesize == 0) {
-+            /* exchange request length */
-+            MPI_Allgather(&req_len, 1, ADIO_OFFSET, all_lens, 1, ADIO_OFFSET,
-+                          fd->comm);
-+        } else { /* same request size */
-+            /* assign request length to all_lens[] */
-+            for (i = 0; i < nprocs; i ++)
-+               all_lens[i] = req_len;
-+        }
-+        if (myrank < avail_cb_nodes) {
-+            /* It's a IO client and it will receive data from others */
-+            off = min_st_offset;
-+            /* calcaulte other_req[i].count */
-+            for (i = 0; i < nprocs; i++) {
-+                avail_len = all_lens[i];
-+                rem_len = avail_len;
-+                while (rem_len > 0) {
-+                  proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len,
-+                                                        striping_info);
-+                    if (proc == myrank) {
-+                        others_req[i].count ++;
-+                    }
-+                    off += avail_len;
-+                    rem_len -= avail_len;
-+                    avail_len = rem_len;
-+                }
-+            }
-+            /* calculate offset and len for each request */
-+            off = min_st_offset;
-+            for (i = 0; i < nprocs; i++) {
-+                if (others_req[i].count) {
-+                  others_req[i].offsets = (ADIO_Offset *)
-+                                            ADIOI_Malloc(others_req[i].count *
-+                                                       sizeof(ADIO_Offset));
-+                  others_req[i].lens = (int *)
-+                                         ADIOI_Malloc(others_req[i].count *
-+                                                      sizeof(int));
-+                    others_req[i].mem_ptrs = (MPI_Aint *)
-+                                             ADIOI_Malloc(others_req[i].count *
-+                                                        sizeof(MPI_Aint));
-+                }
-+                j = 0;
-+                avail_len = all_lens[i];
-+                rem_len = avail_len;
-+                while (rem_len > 0) {
-+                  proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len,
-+                                                        striping_info);
-+                    if (proc == myrank) {
-+                        others_req[i].offsets[j] = off;
-+                        others_req[i].lens[j] = (int)avail_len;
-+                        j ++;
-+                    }
-+                    off += avail_len;
-+                    rem_len -= avail_len;
-+                    avail_len = rem_len;
-+                }
-+            }
-+        }
-+        ADIOI_Free(all_lens);
-+    } else {
-+        /* multiple non-contiguous requests */
-+        /* first find out how much to send/recv and from/to whom */
-+
-+        /*
-+         * count_others_req_procs:
-+         *    number of processes whose requests will be written by
-+         *    this process (including this process itself)
-+         * count_others_req_per_proc[i]:
-+         *    how many separate contiguous requests of proc[i] will be
-+         *    written by this process.
-+         */
-+
-+        count_others_req_per_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
-+
-+        MPI_Alltoall(count_my_req_per_proc, 1, MPI_INT,
-+                   count_others_req_per_proc, 1, MPI_INT, fd->comm);
-+
-+        count_others_req_procs = 0;
-+        for (i = 0; i < nprocs; i++) {
-+          if (count_others_req_per_proc[i]) {
-+              others_req[i].count = count_others_req_per_proc[i];
-+              others_req[i].offsets = (ADIO_Offset *)
-+                                        ADIOI_Malloc(others_req[i].count *
-+                                               sizeof(ADIO_Offset));
-+              others_req[i].lens = (int *)
-+                                   ADIOI_Malloc(others_req[i].count *
-+                                                  sizeof(int));
-+              others_req[i].mem_ptrs = (MPI_Aint *)
-+                                       ADIOI_Malloc(others_req[i].count *
-+                                                    sizeof(MPI_Aint));
-+              count_others_req_procs++;
-+          } else
-+              others_req[i].count = 0;
-+        }
-+
-+        /* now send the calculated offsets and lengths to respective processes */
-+
-+        send_requests = (MPI_Request *) ADIOI_Malloc(2 * (count_my_req_procs + 1) *
-+                                                     sizeof(MPI_Request));
-+        recv_requests = (MPI_Request *) ADIOI_Malloc(2 * (count_others_req_procs+1)*
-+                                                   sizeof(MPI_Request));
-+        /* +1 to avoid a 0-size malloc */
-+
-+        j = 0;
-+        for (i = 0; i < nprocs; i++) {
-+          if (others_req[i].count) {
-+              MPI_Irecv(others_req[i].offsets, others_req[i].count,
-+                        ADIO_OFFSET, i, i + myrank, fd->comm,
-+                        &recv_requests[j]);
-+              j++;
-+              MPI_Irecv(others_req[i].lens, others_req[i].count,
-+                        MPI_INT, i, i + myrank + 1, fd->comm,
-+                        &recv_requests[j]);
-+              j++;
-+          }
-+        }
-+
-+        j = 0;
-+        for (i = 0; i < nprocs; i++) {
-+          if (my_req[i].count) {
-+              MPI_Isend(my_req[i].offsets, my_req[i].count,
-+                        ADIO_OFFSET, i, i + myrank, fd->comm,
-+                        &send_requests[j]);
-+              j++;
-+              MPI_Isend(my_req[i].lens, my_req[i].count,
-+                        MPI_INT, i, i + myrank + 1, fd->comm,
-+                        &send_requests[j]);
-+              j++;
-+          }
-+        }
-+
-+        statuses = (MPI_Status *)
-+                   ADIOI_Malloc((1 + 2 * ADIOI_MAX(count_my_req_procs,
-+                                                 count_others_req_procs)) *
-+                                         sizeof(MPI_Status));
-+        /* +1 to avoid a 0-size malloc */
-+
-+        MPI_Waitall(2 * count_my_req_procs, send_requests, statuses);
-+        MPI_Waitall(2 * count_others_req_procs, recv_requests, statuses);
-+
-+        ADIOI_Free(send_requests);
-+        ADIOI_Free(recv_requests);
-+        ADIOI_Free(statuses);
-+        ADIOI_Free(count_others_req_per_proc);
-+
-+        *count_others_req_procs_ptr = count_others_req_procs;
-+    }
-+}
-diff -ruN ad_lustre_orig/ad_lustre.c ad_lustre/ad_lustre.c
---- ad_lustre_orig/ad_lustre.c 2008-09-17 14:36:57.000000000 +0800
-+++ ad_lustre/ad_lustre.c      2008-09-17 18:20:35.000000000 +0800
+diff -ruN adio/ad_lustre_orig/ad_lustre.c adio/ad_lustre/ad_lustre.c
+--- adio/ad_lustre_orig/ad_lustre.c    2008-09-17 14:36:56.000000000 +0800
++++ adio/ad_lustre/ad_lustre.c 2008-10-17 17:03:42.000000000 +0800
 @@ -1,9 +1,11 @@
  /* -*- Mode: C; c-basic-offset:4 ; -*- */
 -/* 
@@ -529,77 +352,28 @@ diff -ruN ad_lustre_orig/ad_lustre.c ad_lustre/ad_lustre.c
   *
   *   Copyright (C) 2007 Oak Ridge National Laboratory
 + *
-+ *   Copyright (C) 2008 Sun Microsystems, Lustre group
++ *   Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
   */
  
  #include "ad_lustre.h"
-@@ -13,13 +15,13 @@
+@@ -13,12 +15,12 @@
      ADIOI_LUSTRE_ReadContig, /* ReadContig */
      ADIOI_LUSTRE_WriteContig, /* WriteContig */
      ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
 -    ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */
 +    ADIOI_LUSTRE_WriteStridedColl, /* WriteStridedColl */
      ADIOI_GEN_SeekIndividual, /* SeekIndividual */
--    ADIOI_GEN_Fcntl, /* Fcntl */
-+    ADIOI_LUSTRE_Fcntl, /* Fcntl */
+     ADIOI_GEN_Fcntl, /* Fcntl */
      ADIOI_LUSTRE_SetInfo, /* SetInfo */
      ADIOI_GEN_ReadStrided, /* ReadStrided */
 -    ADIOI_GEN_WriteStrided, /* WriteStrided */
--    ADIOI_GEN_Close, /* Close */
 +    ADIOI_LUSTRE_WriteStrided, /* WriteStrided */
-+    ADIOI_LUSTRE_Close, /* Close */
+     ADIOI_GEN_Close, /* Close */
  #if defined(ROMIO_HAVE_WORKING_AIO) && !defined(CRAY_XT_LUSTRE)
      ADIOI_GEN_IreadContig, /* IreadContig */
-     ADIOI_GEN_IwriteContig, /* IwriteContig */
-diff -ruN ad_lustre_orig/ad_lustre_close.c ad_lustre/ad_lustre_close.c
---- ad_lustre_orig/ad_lustre_close.c   1970-01-01 08:00:00.000000000 +0800
-+++ ad_lustre/ad_lustre_close.c        2008-09-17 18:20:35.000000000 +0800
-@@ -0,0 +1,42 @@
-+/* -*- Mode: C; c-basic-offset:4 ; -*- */
-+/*
-+ *
-+ *   Copyright (C) 1997 University of Chicago.
-+ *   See COPYRIGHT notice in top-level directory.
-+ *
-+ *   Copyright (C) 2007 Oak Ridge National Laboratory
-+ *
-+ *   Copyright (C) 2008 Sun Microsystems, Lustre group
-+ */
-+
-+#include "ad_lustre.h"
-+
-+#ifdef PROFILE
-+#include "mpe.h"
-+#endif
-+
-+void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code)
-+{
-+    int err, derr = 0;
-+    static char myname[] = "ADIOI_LUSTRE_CLOSE";
-+
-+#ifdef PROFILE
-+    MPE_Log_event(9, 0, "start close");
-+#endif
-+
-+    err = close(fd->fd_sys);
-+
-+#ifdef PROFILE
-+    MPE_Log_event(10, 0, "end close");
-+#endif
-+
-+    fd->fd_sys = -1;
-+
-+    if (err == -1 || derr == -1) {
-+      *error_code =
-+          MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname,
-+                               __LINE__, MPI_ERR_IO, "**io", "**io %s",
-+                               strerror(errno));
-+    } else
-+      *error_code = MPI_SUCCESS;
-+}
-diff -ruN ad_lustre_orig/ad_lustre.h ad_lustre/ad_lustre.h
---- ad_lustre_orig/ad_lustre.h 2008-09-17 14:36:57.000000000 +0800
-+++ ad_lustre/ad_lustre.h      2008-10-15 21:22:52.000000000 +0800
+diff -ruN adio/ad_lustre_orig/ad_lustre.h adio/ad_lustre/ad_lustre.h
+--- adio/ad_lustre_orig/ad_lustre.h    2008-09-17 14:36:56.000000000 +0800
++++ adio/ad_lustre/ad_lustre.h 2009-05-05 15:34:58.000000000 +0800
 @@ -1,9 +1,11 @@
  /* -*- Mode: C; c-basic-offset:4 ; -*- */
 -/* 
@@ -610,44 +384,20 @@ diff -ruN ad_lustre_orig/ad_lustre.h ad_lustre/ad_lustre.h
   *
   *   Copyright (C) 2007 Oak Ridge National Laboratory
 + *
-+ *   Copyright (C) 2008 Sun Microsystems, Lustre group
++ *   Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
   */
  
  #ifndef AD_UNIX_INCLUDE
-@@ -24,7 +26,32 @@
+@@ -24,7 +26,7 @@
  
  /*#include <fcntl.h>*/
  #include <sys/ioctl.h>
-+#ifdef WITH_LUSTRE
- #include "lustre/lustre_user.h"
-+#else
-+/* copy something from lustre_user.h here */
-+#  define LOV_USER_MAGIC 0x0BD10BD0
-+#  define LL_IOC_LOV_SETSTRIPE  _IOW ('f', 154, long)
-+#  define LL_IOC_LOV_GETSTRIPE  _IOW ('f', 155, long)
-+#  define lov_user_ost_data lov_user_ost_data_v1
-+struct lov_user_ost_data_v1 {     /* per-stripe data structure */
-+        __u64 l_object_id;        /* OST object ID */
-+        __u64 l_object_gr;        /* OST object group (creating MDS number) */
-+        __u32 l_ost_gen;          /* generation of this OST index */
-+        __u32 l_ost_idx;          /* OST index in LOV */
-+} __attribute__((packed));
-+#define lov_user_md lov_user_md_v1
-+struct lov_user_md_v1 {           /* LOV EA user data (host-endian) */
-+        __u32 lmm_magic;          /* magic number = LOV_USER_MAGIC_V1 */
-+        __u32 lmm_pattern;        /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
-+        __u64 lmm_object_id;      /* LOV object ID */
-+        __u64 lmm_object_gr;      /* LOV object group */
-+        __u32 lmm_stripe_size;    /* size of stripe in bytes */
-+        __u16 lmm_stripe_count;   /* num stripes in use for this object */
-+        __u16 lmm_stripe_offset;  /* starting stripe offset in lmm_objects */
-+        struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */
-+} __attribute__((packed));
-+#endif
+-#include "lustre/lustre_user.h"
++#include <lustre/lustre_user.h>
  #include "adio.h"
  /*#include "adioi.h"*/
  
-@@ -41,24 +68,56 @@
+@@ -41,24 +43,31 @@
  
  void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code);
  void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code);
@@ -693,35 +443,10 @@ diff -ruN ad_lustre_orig/ad_lustre.h ad_lustre/ad_lustre.h
                       int *error_code);
  void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
 -
-+void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int ** striping_info_ptr,
-+                                  int mode, int nprocs,
-+                                    ADIO_Offset *st_offsets,
-+                                    ADIO_Offset *end_offsets,
-+                                    ADIO_Offset *min_st_offset);
-+int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off,
-+                               ADIO_Offset *len, int *striping_info);
-+void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list,
-+                            int *len_list, int contig_access_count,
-+                            int *striping_info, int nprocs,
-+                              int *count_my_req_procs_ptr,
-+                            int **count_my_req_per_proc_ptr,
-+                            ADIOI_Access ** my_req_ptr,
-+                            int **buf_idx_ptr);
-+int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count,
-+                         int *len_list, int nprocs);
-+void ADIOI_LUSTRE_Calc_others_req(ADIO_File fd, int count_my_req_procs,
-+                                int *count_my_req_per_proc,
-+                                ADIOI_Access * my_req,
-+                                int nprocs, int myrank,
-+                                  ADIO_Offset req_len,
-+                                  ADIO_Offset min_st_offset,
-+                                  int *striping_info,
-+                                int *count_others_req_procs_ptr,
-+                                ADIOI_Access ** others_req_ptr);
  #endif /* End of AD_UNIX_INCLUDE */
-diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c
---- ad_lustre_orig/ad_lustre_hints.c   2008-09-17 14:36:57.000000000 +0800
-+++ ad_lustre/ad_lustre_hints.c        2008-10-15 21:31:00.000000000 +0800
+diff -ruN adio/ad_lustre_orig/ad_lustre_hints.c adio/ad_lustre/ad_lustre_hints.c
+--- adio/ad_lustre_orig/ad_lustre_hints.c      2008-09-17 14:36:56.000000000 +0800
++++ adio/ad_lustre/ad_lustre_hints.c   2009-04-24 15:35:05.000000000 +0800
 @@ -1,9 +1,11 @@
  /* -*- Mode: C; c-basic-offset:4 ; -*- */
 -/* 
@@ -732,25 +457,22 @@ diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c
   *
   *   Copyright (C) 2007 Oak Ridge National Laboratory
 + *
-+ *   Copyright (C) 2008 Sun Microsystems, Lustre group
++ *   Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
   */
  
  #include "ad_lustre.h"
-@@ -11,130 +13,189 @@
+@@ -12,46 +14,56 @@
  void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
  {
--    char *value, *value_in_fd;
+     char *value, *value_in_fd;
 -    int flag, tmp_val[3], str_factor=-1, str_unit=0, start_iodev=-1;
--    struct lov_user_md lum = { 0 };
--    int err, myrank, fd_sys, perm, amode, old_mask;
-+    char *value = NULL;
-+    int flag, tmp_val, int_val, str_factor, str_unit, start_iodev;
++    int flag, stripe_val[3], str_factor = -1, str_unit=0, start_iodev=-1;
+     struct lov_user_md lum = { 0 };
+     int err, myrank, fd_sys, perm, amode, old_mask;
++    int int_val, tmp_val;
 +    static char myname[] = "ADIOI_LUSTRE_SETINFO";
  
-+    *error_code = MPI_SUCCESS;
      value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
-+
      if ( (fd->info) == MPI_INFO_NULL) {
 -      /* This must be part of the open call. can set striping parameters 
 -           if necessary. */ 
@@ -763,29 +485,45 @@ diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c
        fd->direct_read = fd->direct_write = 0;
 -      
 -      /* has user specified striping or server buffering parameters 
++        /* initialize lustre hints */
++      MPI_Info_set(fd->info, "romio_lustre_co_ratio", "1");
++        fd->hints->fs_hints.lustre.co_ratio = 1;
++      MPI_Info_set(fd->info, "romio_lustre_coll_threshold", "0");
++        fd->hints->fs_hints.lustre.coll_threshold = 0;
++      MPI_Info_set(fd->info, "romio_lustre_ds_in_coll", "enable");
++        fd->hints->fs_hints.lustre.ds_in_coll = ADIOI_HINT_ENABLE;
 +
 +      /* has user specified striping or server buffering parameters
             and do they have the same value on all processes? */
        if (users_info != MPI_INFO_NULL) {
 -          MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, 
--                       value, &flag);
++            /* striping information */
++          MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
+                        value, &flag);
 -          if (flag) 
--              str_unit=atoi(value);
--
++          if (flag)
+               str_unit=atoi(value);
 -          MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, 
--                       value, &flag);
++          MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
+                        value, &flag);
 -          if (flag) 
--              str_factor=atoi(value);
--
++          if (flag)
+               str_factor=atoi(value);
 -          MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL, 
-+            /* direct read and write */
-+          MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL,
-                        value, &flag);
+-                       value, &flag);
 -          if (flag) 
--              start_iodev=atoi(value);
--
++          MPI_Info_get(users_info, "romio_lustre_start_iodevice",
++                         MPI_MAX_INFO_VAL, value, &flag);
++          if (flag)
+               start_iodev=atoi(value);
 -          MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL, 
 -                           value, &flag);
++            /* direct read and write */
++          MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL,
++                       value, &flag);
            if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) {
                MPI_Info_set(fd->info, "direct_read", "true");
                fd->direct_read = 1;
@@ -796,183 +534,65 @@ diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c
                             value, &flag);
            if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) {
                MPI_Info_set(fd->info, "direct_write", "true");
-               fd->direct_write = 1;
+@@ -59,22 +71,23 @@
            }
-+            /*  stripe size */
-+          MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
-+                       value, &flag);
-+          if (flag && (str_unit = atoi(value))) {
-+              tmp_val = str_unit;
-+              MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
-+              if (tmp_val != str_unit) {
-+                  MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
-+                                                     "striping_unit",
-+                                                     error_code);
-+                    ADIOI_Free(value);
-+                  return;
-+              }
-+              MPI_Info_set(fd->info, "striping_unit", value);
-+          }
-+            /* stripe count */
-+          MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
-+                       value, &flag);
-+          if (flag && (str_factor = atoi(value))) {
-+              tmp_val = str_factor;
-+              MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
-+              if (tmp_val != str_factor) {
-+                  MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
-+                                                     "striping_factor",
-+                                                     error_code);
-+                    ADIOI_Free(value);
-+                  return;
-+              }
-+              MPI_Info_set(fd->info, "striping_factor", value);
-+          }
-+            /* stripe offset */
-+            MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL,
-+                       value, &flag);
-+          if (flag && ((start_iodev = atoi(value)) >= 0)) {
-+              tmp_val = start_iodev;
-+              MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
-+              if (tmp_val != start_iodev) {
-+                  MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
-+                                                     "start_iodevice",
-+                                                     error_code);
-+                    ADIOI_Free(value);
-+                  return;
-+              }
-+              MPI_Info_set(fd->info, "start_iodevice", value);
-+          }
        }
--
--      MPI_Comm_rank(fd->comm, &myrank);
--      if (myrank == 0) {
++        /* set striping information with ioctl */
+       MPI_Comm_rank(fd->comm, &myrank);
+       if (myrank == 0) {
 -          tmp_val[0] = str_factor;
 -          tmp_val[1] = str_unit;
 -          tmp_val[2] = start_iodev;
-+    }
-+    if (users_info != MPI_INFO_NULL) {
-+        /* CO: IO Clients/OST,
-+         * to keep the load balancing between clients and OSTs */
-+        MPI_Info_get(users_info, "CO", MPI_MAX_INFO_VAL, value,
-+                     &flag);
-+      if (flag && (int_val = atoi(value)) > 0) {
-+            tmp_val = int_val;
-+          MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
-+          if (tmp_val != int_val) {
-+                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
-+                                                   "CO",
-+                                                   error_code);
-+                ADIOI_Free(value);
-+              return;
-+          }
-+          MPI_Info_set(fd->info, "CO", value);
++          stripe_val[0] = str_factor;
++          stripe_val[1] = str_unit;
++          stripe_val[2] = start_iodev;
        }
 -      MPI_Bcast(tmp_val, 3, MPI_INT, 0, fd->comm);
--
++      MPI_Bcast(stripe_val, 3, MPI_INT, 0, fd->comm);
 -      if (tmp_val[0] != str_factor 
 -              || tmp_val[1] != str_unit 
 -              || tmp_val[2] != start_iodev) {
--          FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: All keys"
--                  "-striping_factor:striping_unit:start_iodevice "
--                  "need to be identical across all processes\n");
--          MPI_Abort(MPI_COMM_WORLD, 1);
++      if (stripe_val[0] != str_factor
++              || stripe_val[1] != str_unit
++              || stripe_val[2] != start_iodev) {
+           FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: All keys"
+                   "-striping_factor:striping_unit:start_iodevice "
+                   "need to be identical across all processes\n");
+           MPI_Abort(MPI_COMM_WORLD, 1);
 -              } else if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0)) {
--           /* if user has specified striping info, process 0 tries to set it */
--          if (!myrank) {
--              if (fd->perm == ADIO_PERM_NULL) {
--                  old_mask = umask(022);
--                  umask(old_mask);
--                  perm = old_mask ^ 0666;
--              }
--              else perm = fd->perm;
--
--              amode = 0;
--              if (fd->access_mode & ADIO_CREATE)
--                  amode = amode | O_CREAT;
--              if (fd->access_mode & ADIO_RDONLY)
--                  amode = amode | O_RDONLY;
--              if (fd->access_mode & ADIO_WRONLY)
--                  amode = amode | O_WRONLY;
--              if (fd->access_mode & ADIO_RDWR)
--                  amode = amode | O_RDWR;
--              if (fd->access_mode & ADIO_EXCL)
--                  amode = amode | O_EXCL;
--
--              /* we need to create file so ensure this is set */
--              amode = amode | O_LOV_DELAY_CREATE | O_CREAT;
--
--              fd_sys = open(fd->filename, amode, perm);
++      } else if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0)) {
+            /* if user has specified striping info, process 0 tries to set it */
+           if (!myrank) {
+               if (fd->perm == ADIO_PERM_NULL) {
+@@ -100,9 +113,9 @@
+               amode = amode | O_LOV_DELAY_CREATE | O_CREAT;
+               fd_sys = open(fd->filename, amode, perm);
 -              if (fd_sys == -1) { 
 -                  if (errno != EEXIST) 
 -                      fprintf(stderr, 
--                              "Failure to open file %s %d %d\n",strerror(errno), amode, perm);
--              } else {
--                  lum.lmm_magic = LOV_USER_MAGIC;
--                  lum.lmm_pattern = 0;
--                  lum.lmm_stripe_size = str_unit;
--                  lum.lmm_stripe_count = str_factor;
--                  lum.lmm_stripe_offset = start_iodev;
--
--                  err = ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum);
++              if (fd_sys == -1) {
++                  if (errno != EEXIST)
++                      fprintf(stderr,
+                               "Failure to open file %s %d %d\n",strerror(errno), amode, perm);
+               } else {
+                   lum.lmm_magic = LOV_USER_MAGIC;
+@@ -112,25 +125,73 @@
+                   lum.lmm_stripe_offset = start_iodev;
+                   err = ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum);
 -                  if (err == -1 && errno != EEXIST) { 
--                      fprintf(stderr, "Failure to set stripe info %s \n", strerror(errno));
--                  }
--                  close(fd_sys);
--             }
--          } /* End of striping parameters validation */
-+        /* big_req_size:
-+         * if the req size is bigger than this,
-+         * collective IO may not be performed.
-+         */
-+      MPI_Info_get(users_info, "big_req_size", MPI_MAX_INFO_VAL, value,
-+                     &flag);
-+      if (flag && (int_val = atoi(value)) > 0) {
-+            tmp_val = int_val;
-+          MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
-+          if (tmp_val != int_val) {
-+              MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
-+                                                 "big_req_size",
-+                                                 error_code);
-+                ADIOI_Free(value);
-+              return;
-+          }
-+          MPI_Info_set(fd->info, "big_req_size", value);
-+        }
-+        /* ds_in_coll: disable data sieving in collective IO */
-+      MPI_Info_get(users_info, "ds_in_coll", MPI_MAX_INFO_VAL,
-+                   value, &flag);
-+      if (flag && (!strcmp(value, "enable") ||
-+                     !strcmp(value, "ENABLE"))) {
-+            tmp_val = int_val = 1;
-+          MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
-+          if (tmp_val != int_val) {
-+              MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
-+                                                 "ds_in_coll",
-+                                                 error_code);
-+                ADIOI_Free(value);
-+                return;
-+          }
-+          MPI_Info_set(fd->info, "ds_in_coll", "enable");
-+      }
-+        /* contiguous_data: whether the data are contiguous */
-+      MPI_Info_get(users_info, "contiguous_data", MPI_MAX_INFO_VAL,
-+                   value, &flag);
-+        if (flag && (!strcmp(value, "yes") ||
-+                     !strcmp(value, "YES"))) {
-+            tmp_val = int_val = 1;
-+          MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
-+          if (tmp_val != int_val) {
-+              MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
-+                                                 "contiguous_data",
-+                                                 error_code);
-+                ADIOI_Free(value);
-+                return;
-+          }
-+          MPI_Info_set(fd->info, "contiguous_data", "yes");
++                  if (err == -1 && errno != EEXIST) {
+                       fprintf(stderr, "Failure to set stripe info %s \n", strerror(errno));
+                   }
+                   close(fd_sys);
+              }
+           } /* End of striping parameters validation */
        }
 -      
--      MPI_Barrier(fd->comm);
+       MPI_Barrier(fd->comm);
 -      /* set the values for collective I/O and data sieving parameters */
 -      ADIOI_GEN_SetInfo(fd, users_info, error_code);
 -    } else {
@@ -981,55 +601,73 @@ diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c
 -      
 -      /* set the values for collective I/O and data sieving parameters */
 -      ADIOI_GEN_SetInfo(fd, users_info, error_code);
-+        /* same_io_size: whether the req size is same */
-+      MPI_Info_get(users_info, "same_io_size", MPI_MAX_INFO_VAL,
-+                   value, &flag);
-+        if (flag && (!strcmp(value, "yes") ||
-+                     !strcmp(value, "YES"))) {
-+            tmp_val = int_val = 1;
+     }
+- 
++    /* get other hint */
++    if (users_info != MPI_INFO_NULL) {
++        /* CO: IO Clients/OST,
++         * to keep the load balancing between clients and OSTs */
++        MPI_Info_get(users_info, "romio_lustre_co_ratio", MPI_MAX_INFO_VAL, value,
++                     &flag);
++      if (flag && (int_val = atoi(value)) > 0) {
++            tmp_val = int_val;
 +          MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
 +          if (tmp_val != int_val) {
-+              MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
-+                                                 "same_io_size",
-+                                                 error_code);
++                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
++                                                   "romio_lustre_co_ratio",
++                                                   error_code);
 +                ADIOI_Free(value);
-+                return;
++              return;
 +          }
-+          MPI_Info_set(fd->info, "same_io_size", "yes");
++          MPI_Info_set(fd->info, "romio_lustre_co_ratio", value);
++            fd->hints->fs_hints.lustre.co_ratio = atoi(value);
 +      }
-+        /* Remember the current cb_nodes that the user set.
-+         * It would be used to improve collective I/O.
++        /* coll_threshold:
++         * if the req size is bigger than this, collective IO may not be performed.
 +         */
-+      MPI_Info_get(users_info, "cb_nodes", MPI_MAX_INFO_VAL, value, &flag);
++      MPI_Info_get(users_info, "romio_lustre_coll_threshold", MPI_MAX_INFO_VAL, value,
++                     &flag);
 +      if (flag && (int_val = atoi(value)) > 0) {
 +            tmp_val = int_val;
 +          MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
 +          if (tmp_val != int_val) {
 +              MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
-+                                                 "big_req_size",
++                                                 "romio_lustre_coll_threshold",
 +                                                 error_code);
 +                ADIOI_Free(value);
 +              return;
 +          }
-+          MPI_Info_set(fd->info, "user_cb_nodes", value);
++          MPI_Info_set(fd->info, "romio_lustre_coll_threshold", value);
++            fd->hints->fs_hints.lustre.coll_threshold = atoi(value);
 +        }
-     }
-- 
--    if (ADIOI_Direct_read) fd->direct_read = 1;
--    if (ADIOI_Direct_write) fd->direct_write = 1;
--
-     ADIOI_Free(value);
++        /* ds_in_coll: disable data sieving in collective IO */
++      MPI_Info_get(users_info, "romio_lustre_ds_in_coll", MPI_MAX_INFO_VAL,
++                   value, &flag);
++      if (flag && (!strcmp(value, "disable") ||
++                     !strcmp(value, "DISABLE"))) {
++            tmp_val = int_val = 2;
++          MPI_Bcast(&tmp_val, 2, MPI_INT, 0, fd->comm);
++          if (tmp_val != int_val) {
++              MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
++                                                 "romio_lustre_ds_in_coll",
++                                                 error_code);
++                ADIOI_Free(value);
++                return;
++          }
++          MPI_Info_set(fd->info, "romio_lustre_ds_in_coll", "disable");
++            fd->hints->fs_hints.lustre.ds_in_coll = ADIOI_HINT_DISABLE;
++      }
++    }
 +    /* set the values for collective I/O and data sieving parameters */
 +    ADIOI_GEN_SetInfo(fd, users_info, error_code);
++
+     if (ADIOI_Direct_read) fd->direct_read = 1;
+     if (ADIOI_Direct_write) fd->direct_write = 1;
  
--    *error_code = MPI_SUCCESS;
-+    if (ADIOI_Direct_read) fd->direct_read = 1;
-+    if (ADIOI_Direct_write) fd->direct_write = 1;
- }
-diff -ruN ad_lustre_orig/ad_lustre_open.c ad_lustre/ad_lustre_open.c
---- ad_lustre_orig/ad_lustre_open.c    2008-09-17 14:36:57.000000000 +0800
-+++ ad_lustre/ad_lustre_open.c 2008-09-17 18:55:50.000000000 +0800
-@@ -1,18 +1,21 @@
+diff -ruN adio/ad_lustre_orig/ad_lustre_open.c adio/ad_lustre/ad_lustre_open.c
+--- adio/ad_lustre_orig/ad_lustre_open.c       2008-09-17 14:36:56.000000000 +0800
++++ adio/ad_lustre/ad_lustre_open.c    2009-03-01 11:32:32.000000000 +0800
+@@ -1,9 +1,11 @@
  /* -*- Mode: C; c-basic-offset:4 ; -*- */
 -/* 
 - *   Copyright (C) 1997 University of Chicago. 
@@ -1039,220 +677,32 @@ diff -ruN ad_lustre_orig/ad_lustre_open.c ad_lustre/ad_lustre_open.c
   *
   *   Copyright (C) 2007 Oak Ridge National Laboratory
 + *
-+ *   Copyright (C) 2008 Sun Microsystems, Lustre group
++ *   Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
   */
  
  #include "ad_lustre.h"
+@@ -51,14 +53,17 @@
+         err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum);
  
- void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code)
- {
--    int perm, old_mask, amode, amode_direct;
-+    int perm, old_mask, amode = 0, amode_direct = 0, flag = 0, err, myrank;
-+    int stripe_size = 0, stripe_count = 0, stripe_offset = -1;
-     struct lov_user_md lum = { 0 };
--    char *value;
-+    char *value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
- #if defined(MPICH2) || !defined(PRINT_ERR_MSG)
-     static char myname[] = "ADIOI_LUSTRE_OPEN";
-@@ -22,12 +25,57 @@
-       old_mask = umask(022);
-       umask(old_mask);
-       perm = old_mask ^ 0666;
--    }
--    else perm = fd->perm;
-+    } else
-+      perm = fd->perm;
+         if (!err) {
++            fd->hints->striping_unit = lum.lmm_stripe_size;
+             sprintf(value, "%d", lum.lmm_stripe_size);
+             MPI_Info_set(fd->info, "striping_unit", value);
  
--    amode = 0;
--    if (fd->access_mode & ADIO_CREATE)
-+    if (fd->access_mode & ADIO_CREATE) {
-       amode = amode | O_CREAT;
-+        /* Check striping info
-+         * if already set by SetInfo(), set them to lum; otherwise, set by lum
-+         */
-+        MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value,
-+                   &flag);
-+        if (flag)
-+          stripe_size = atoi(value);
-+
-+        MPI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, value,
-+                   &flag);
-+        if (flag)
-+          stripe_count = atoi(value);
-+
-+        MPI_Info_get(fd->info, "start_iodevice", MPI_MAX_INFO_VAL, value,
-+                   &flag);
-+        if (flag)
-+          stripe_offset = atoi(value);
-+
-+        /* if user has specified striping info,
-+         * process 0 will try to check and set it.
-+         */
-+        if ((stripe_size > 0) || (stripe_count > 0) || (stripe_offset >= 0)) {
-+          MPI_Comm_rank(fd->comm, &myrank);
-+          if (myrank == 0) {
-+              int fd_sys = open(fd->filename, amode, perm);
-+              if (fd_sys == -1) {
-+                  if (errno != EEXIST)
-+                      FPRINTF(stderr, "Failure to open file %s %d %d\n",
-+                              strerror(errno), amode, perm);
-+              } else {
-+                  lum.lmm_magic = LOV_USER_MAGIC;
-+                  lum.lmm_pattern = 1;
-+                  lum.lmm_stripe_size = stripe_size;
-+                  lum.lmm_stripe_count = stripe_count;
-+                  lum.lmm_stripe_offset = stripe_offset;
-+
-+                  if (ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum))
-+                      FPRINTF(stderr,
-+                              "Failure to set striping info to Lustre!\n");
-+                  close(fd_sys);
-+              }
-+          }
-+          MPI_Barrier(fd->comm);
-+        }
-+    }
-+
-     if (fd->access_mode & ADIO_RDONLY)
-       amode = amode | O_RDONLY;
-     if (fd->access_mode & ADIO_WRONLY)
-@@ -42,32 +90,36 @@
-     fd->fd_sys = open(fd->filename, amode|O_CREAT, perm);
++            fd->hints->striping_factor = lum.lmm_stripe_count;
+             sprintf(value, "%d", lum.lmm_stripe_count);
+             MPI_Info_set(fd->info, "striping_factor", value);
  
-     if (fd->fd_sys != -1) {
--        int err;
--
--        value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
--
-         /* get file striping information and set it in info */
--        lum.lmm_magic = LOV_USER_MAGIC;
--        err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum);
--
--        if (!err) {
--            sprintf(value, "%d", lum.lmm_stripe_size);
--            MPI_Info_set(fd->info, "striping_unit", value);
--
--            sprintf(value, "%d", lum.lmm_stripe_count);
--            MPI_Info_set(fd->info, "striping_factor", value);
--
--            sprintf(value, "%d", lum.lmm_stripe_offset);
++            fd->hints->fs_hints.lustre.start_iodevice = lum.lmm_stripe_offset;
+             sprintf(value, "%d", lum.lmm_stripe_offset);
 -            MPI_Info_set(fd->info, "start_iodevice", value);
--        }
--        ADIOI_Free(value);
-+      lum.lmm_magic = LOV_USER_MAGIC;
-+      err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum);
-+      if (!err) {
-+          if (lum.lmm_stripe_size && lum.lmm_stripe_count &&
-+                (lum.lmm_stripe_offset >= 0)) {
-+              sprintf(value, "%d", lum.lmm_stripe_size);
-+              MPI_Info_set(fd->info, "striping_unit", value);
-+
-+              sprintf(value, "%d", lum.lmm_stripe_count);
-+              MPI_Info_set(fd->info, "striping_factor", value);
-+
-+              sprintf(value, "%d", lum.lmm_stripe_offset);
-+              MPI_Info_set(fd->info, "start_iodevice", value);
-+          } else {
-+              FPRINTF(stderr, "Striping info is invalid!\n");
-+              ADIOI_Free(value);
-+              MPI_Abort(MPI_COMM_WORLD, 1);
-+          }
-+      } else {
-+          FPRINTF(stderr, "Failed to get striping info from Lustre!\n");
-+            ADIOI_Free(value);
-+          MPI_Abort(MPI_COMM_WORLD, 1);
-+      }
-         if (fd->access_mode & ADIO_APPEND)
-             fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
--    } 
--
-+    }
-     if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
--      fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
-+        fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
++            MPI_Info_set(fd->info, "romio_lustre_start_iodevice", value);
+         }
+         ADIOI_Free(value);
  
-     fd->fd_direct = -1;
-     if (fd->direct_write || fd->direct_read) {
-@@ -81,20 +133,22 @@
-     }
-     /* --BEGIN ERROR HANDLING-- */
--    if (fd->fd_sys == -1 || ((fd->fd_direct == -1) && 
--              (fd->direct_write || fd->direct_read))) {
-+    if (fd->fd_sys == -1 || ((fd->fd_direct == -1) &&
-+      (fd->direct_write || fd->direct_read))) {
-       if (errno == ENAMETOOLONG)
-           *error_code = MPIO_Err_create_code(MPI_SUCCESS,
--                                             MPIR_ERR_RECOVERABLE, myname,
--                                             __LINE__, MPI_ERR_BAD_FILE,
-+                                             MPIR_ERR_RECOVERABLE,
-+                                             myname, __LINE__,
-+                                             MPI_ERR_BAD_FILE,
-                                              "**filenamelong",
-                                              "**filenamelong %s %d",
-                                              fd->filename,
-                                              strlen(fd->filename));
-       else if (errno == ENOENT)
-           *error_code = MPIO_Err_create_code(MPI_SUCCESS,
--                                             MPIR_ERR_RECOVERABLE, myname,
--                                             __LINE__, MPI_ERR_NO_SUCH_FILE,
-+                                             MPIR_ERR_RECOVERABLE,
-+                                             myname, __LINE__,
-+                                             MPI_ERR_NO_SUCH_FILE,
-                                              "**filenoexist",
-                                              "**filenoexist %s",
-                                              fd->filename);
-@@ -108,27 +162,30 @@
-                                              fd->filename);
-       else if (errno == EACCES) {
-           *error_code = MPIO_Err_create_code(MPI_SUCCESS,
--                                             MPIR_ERR_RECOVERABLE, myname,
--                                             __LINE__, MPI_ERR_ACCESS,
-+                                             MPIR_ERR_RECOVERABLE,
-+                                             myname, __LINE__,
-+                                             MPI_ERR_ACCESS,
-                                              "**fileaccess",
--                                             "**fileaccess %s", 
--                                             fd->filename );
--      }
--      else if (errno == EROFS) {
-+                                             "**fileaccess %s",
-+                                             fd->filename);
-+      } else if (errno == EROFS) {
-           /* Read only file or file system and write access requested */
-           *error_code = MPIO_Err_create_code(MPI_SUCCESS,
--                                             MPIR_ERR_RECOVERABLE, myname,
--                                             __LINE__, MPI_ERR_READ_ONLY,
--                                             "**ioneedrd", 0 );
--      }
--      else {
-+                                             MPIR_ERR_RECOVERABLE,
-+                                             myname, __LINE__,
-+                                             MPI_ERR_READ_ONLY,
-+                                             "**ioneedrd", 0);
-+      } else {
-           *error_code = MPIO_Err_create_code(MPI_SUCCESS,
--                                             MPIR_ERR_RECOVERABLE, myname,
--                                             __LINE__, MPI_ERR_IO, "**io",
-+                                             MPIR_ERR_RECOVERABLE,
-+                                             myname, __LINE__,
-+                                             MPI_ERR_IO, "**io",
-                                              "**io %s", strerror(errno));
-       }
--    }
-+    } else {
-     /* --END ERROR HANDLING-- */
--    else *error_code = MPI_SUCCESS;
-+        *error_code = MPI_SUCCESS;
-+    }
-+    ADIOI_Free(value);
- }
-diff -ruN ad_lustre_orig/ad_lustre_rwcontig.c ad_lustre/ad_lustre_rwcontig.c
---- ad_lustre_orig/ad_lustre_rwcontig.c        2008-09-17 14:36:57.000000000 +0800
-+++ ad_lustre/ad_lustre_rwcontig.c     2008-10-15 22:44:35.000000000 +0800
+diff -ruN adio/ad_lustre_orig/ad_lustre_rwcontig.c adio/ad_lustre/ad_lustre_rwcontig.c
+--- adio/ad_lustre_orig/ad_lustre_rwcontig.c   2008-09-17 14:36:56.000000000 +0800
++++ adio/ad_lustre/ad_lustre_rwcontig.c        2009-05-05 15:34:29.000000000 +0800
 @@ -1,9 +1,11 @@
  /* -*- Mode: C; c-basic-offset:4 ; -*- */
 -/* 
@@ -1263,14 +713,40 @@ diff -ruN ad_lustre_orig/ad_lustre_rwcontig.c ad_lustre/ad_lustre_rwcontig.c
   *
   *   Copyright (C) 2007 Oak Ridge National Laboratory
 + *
-+ *   Copyright (C) 2008 Sun Microsystems, Lustre group
++ *   Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
   */
  
  #define _XOPEN_SOURCE 600
-diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
---- ad_lustre_orig/ad_lustre_wrcoll.c  1970-01-01 08:00:00.000000000 +0800
-+++ ad_lustre/ad_lustre_wrcoll.c       2008-10-15 22:02:53.000000000 +0800
-@@ -0,0 +1,883 @@
+@@ -136,10 +138,23 @@
+           if (err == -1) goto ioerr;
+       }
+       
+-      if (io_mode)
++      if (io_mode) {
++#ifdef ADIOI_MPE_LOGGING
++        MPE_Log_event(ADIOI_MPE_write_a, 0, NULL);
++#endif
+           err = write(fd->fd_sys, buf, len);
+-      else 
++#ifdef ADIOI_MPE_LOGGING
++        MPE_Log_event(ADIOI_MPE_write_b, 0, NULL);
++#endif
++        } else {
++#ifdef ADIOI_MPE_LOGGING
++        MPE_Log_event(ADIOI_MPE_read_a, 0, NULL);
++#endif
+           err = read(fd->fd_sys, buf, len);
++#ifdef ADIOI_MPE_LOGGING
++        MPE_Log_event(ADIOI_MPE_read_b, 0, NULL);
++#endif
++        }
+     } else {
+       err = ADIOI_LUSTRE_Directio(fd, buf, len, offset, io_mode);
+     }
+diff -ruN adio/ad_lustre_orig/ad_lustre_wrcoll.c adio/ad_lustre/ad_lustre_wrcoll.c
+--- adio/ad_lustre_orig/ad_lustre_wrcoll.c     1970-01-01 08:00:00.000000000 +0800
++++ adio/ad_lustre/ad_lustre_wrcoll.c  2009-04-24 14:48:34.000000000 +0800
+@@ -0,0 +1,934 @@
 +/* -*- Mode: C; c-basic-offset:4 ; -*- */
 +/*
 + *   Copyright (C) 1997 University of Chicago.
@@ -1278,7 +754,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 + *
 + *   Copyright (C) 2007 Oak Ridge National Laboratory
 + *
-+ *   Copyright (C) 2008 Sun Microsystems, Lustre group
++ *   Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 + */
 +
 +#include "ad_lustre.h"
@@ -1293,25 +769,25 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +                                      ADIO_Offset *offset_list,
 +                                      int *len_list,
 +                                      int contig_access_count,
-+                                      int * striping_info,
++                                      int *striping_info,
 +                                      int *buf_idx, int *error_code);
 +static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf,
-+                                        ADIOI_Flatlist_node * flat_buf,
++                                        ADIOI_Flatlist_node *flat_buf,
 +                                        char **send_buf,
-+                                        ADIO_Offset * offset_list,
++                                        ADIO_Offset *offset_list,
 +                                        int *len_list, int *send_size,
-+                                        MPI_Request * requests,
++                                        MPI_Request *requests,
 +                                        int *sent_to_proc, int nprocs,
 +                                        int myrank, int contig_access_count,
-+                                        int * striping_info,
++                                        int *striping_info,
 +                                        int *send_buf_idx,
 +                                          int *curr_to_proc,
 +                                        int *done_to_proc, int iter,
 +                                        MPI_Aint buftype_extent);
 +static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
 +                                       char *write_buf,
-+                                       ADIOI_Flatlist_node * flat_buf,
-+                                       ADIO_Offset * offset_list,
++                                       ADIOI_Flatlist_node *flat_buf,
++                                       ADIO_Offset *offset_list,
 +                                       int *len_list, int *send_size,
 +                                       int *recv_size, ADIO_Offset off,
 +                                       int size, int *count,
@@ -1319,22 +795,29 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +                                       int *sent_to_proc, int nprocs,
 +                                       int myrank, int buftype_is_contig,
 +                                       int contig_access_count,
-+                                       int * striping_info,
-+                                       ADIOI_Access * others_req,
++                                       int *striping_info,
++                                       ADIOI_Access *others_req,
 +                                       int *send_buf_idx,
 +                                       int *curr_to_proc,
 +                                       int *done_to_proc, int *hole,
 +                                       int iter, MPI_Aint buftype_extent,
 +                                       int *buf_idx, int *error_code);
-+void ADIOI_Heap_merge(ADIOI_Access * others_req, int *count,
-+                      ADIO_Offset * srt_off, int *srt_len, int *start_pos,
++void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count,
++                      ADIO_Offset *srt_off, int *srt_len, int *start_pos,
 +                      int nprocs, int nprocs_recv, int total_elements);
 +
 +void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count,
 +                                 MPI_Datatype datatype,
 +                                 int file_ptr_type, ADIO_Offset offset,
-+                                 ADIO_Status * status, int *error_code)
++                                 ADIO_Status *status, int *error_code)
 +{
++    /* Uses a generalized version of the extended two-phase method described
++     * in "An Extended Two-Phase Method for Accessing Sections of
++     * Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary,
++     * Scientific Programming, (5)4:301--317, Winter 1996.
++     * http://www.mcs.anl.gov/home/thakur/ext2ph.ps
++     */
++
 +    ADIOI_Access *my_req;
 +    /* array of nprocs access structures, one for each other process has
 +       this process's request */
@@ -1346,7 +829,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +    int i, filetype_is_contig, nprocs, myrank, do_collect = 0;
 +    int contig_access_count = 0, buftype_is_contig, interleave_count = 0;
 +    int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs;
-+    ADIO_Offset orig_fp, start_offset, end_offset, off, min_st_offset;
++    ADIO_Offset orig_fp, start_offset, end_offset, off;
 +    ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *end_offsets = NULL;
 +    int *buf_idx = NULL, *len_list = NULL, *striping_info = NULL;
 +    int old_error, tmp_error;
@@ -1360,13 +843,19 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +    if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
 +      /* For this process's request, calculate the list of offsets and
 +         lengths in the file and determine the start and end offsets. */
++
++      /* Note: end_offset points to the last byte-offset that will be accessed.
++         * e.g., if start_offset=0 and 100 bytes to be read, end_offset=99
++         */
++
 +      ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
-+                            &offset_list, &len_list, &start_offset,
-+                            &end_offset, &contig_access_count);
++                              &offset_list, &len_list, &start_offset,
++                              &end_offset, &contig_access_count);
 +
 +      /* each process communicates its start and end offsets to other
-+         processes. The result is an array each of start and end offsets stored
-+         in order of process rank. */
++         * processes. The result is an array each of start and end offsets
++         * stored in order of process rank.
++         */
 +      st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
 +      end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
 +      MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1,
@@ -1427,22 +916,26 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +    }
 +
 +    /* Get Lustre hints information */
-+    ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1, nprocs,
-+                                   st_offsets, end_offsets,
-+                                   &min_st_offset);
++    ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1);
++
 +    /* calculate what portions of the access requests of this process are
 +     * located in which process
 +     */
 +    ADIOI_LUSTRE_Calc_my_req(fd, offset_list, len_list, contig_access_count,
 +                             striping_info, nprocs, &count_my_req_procs,
 +                             &count_my_req_per_proc, &my_req, &buf_idx);
-+    /* calculate what process's requests will be written by this process */
-+    ADIOI_LUSTRE_Calc_others_req(fd, count_my_req_procs,
-+                                 count_my_req_per_proc,
-+                               my_req, nprocs, myrank,
-+                                 end_offset - start_offset + 1,
-+                                 min_st_offset, striping_info,
-+                                 &count_others_req_procs, &others_req);
++
++    /* based on everyone's my_req, calculate what requests of other processes
++     * will be accessed by this process.
++     * count_others_req_procs = number of processes whose requests (including
++     * this process itself) will be accessed by this process
++     * count_others_req_per_proc[i] indicates how many separate contiguous
++     * requests of proc. i will be accessed by this process.
++     */
++
++    ADIOI_Calc_others_req(fd, count_my_req_procs, count_my_req_per_proc,
++                          my_req, nprocs, myrank, &count_others_req_procs,
++                          &others_req);
 +    ADIOI_Free(count_my_req_per_proc);
 +
 +    /* exchange data and write in sizes of no more than stripe_size. */
@@ -1451,6 +944,17 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +                                offset_list, len_list, contig_access_count,
 +                              striping_info, buf_idx, error_code);
 +
++    /* If this collective write is followed by an independent write,
++     * it's possible to have those subsequent writes on other processes
++     * race ahead and sneak in before the read-modify-write completes.
++     * We carry out a collective communication at the end here so no one
++     * can start independent i/o before collective I/O completes.
++     *
++     * need to do some gymnastics with the error codes so that if something
++     * went wrong, all processes report error, but if a process has a more
++     * specific error code, we can still have that process report the
++     * additional information */
++
 +    old_error = *error_code;
 +    if (*error_code != MPI_SUCCESS)
 +      *error_code = MPI_ERR_IO;
@@ -1520,6 +1024,9 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +    fd->fp_sys_posn = -1;     /* set it to null. */
 +}
 +
++/* If successful, error_code is set to MPI_SUCCESS.  Otherwise an error
++ * code is created and returned in error_code.
++ */
 +static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf,
 +                                      MPI_Datatype datatype, int nprocs,
 +                                      int myrank, ADIOI_Access *others_req,
@@ -1529,6 +1036,16 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +                                      int *striping_info, int *buf_idx,
 +                                        int *error_code)
 +{
++    /* Send data to appropriate processes and write in sizes of no more
++     * than lustre stripe_size.
++     * The idea is to reduce the amount of extra memory required for
++     * collective I/O. If all data were written all at once, which is much
++     * easier, it would require temp space more than the size of user_buf,
++     * which is often unacceptable. For example, to write a distributed
++     * array to a file, where each local array is 8Mbytes, requiring
++     * at least another 8Mbytes of temp space is unacceptable.
++     */
++
 +    int hole, i, j, m, flag, ntimes = 1 , max_ntimes, buftype_is_contig;
 +    ADIO_Offset st_loc = -1, end_loc = -1, min_st_loc, max_end_loc;
 +    ADIO_Offset off, req_off, send_off, iter_st_off, *off_list;
@@ -1538,14 +1055,15 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +    int *send_curr_offlen_ptr, *send_size;
 +    int *partial_recv, *sent_to_proc, *recv_start_pos;
 +    int *send_buf_idx, *curr_to_proc, *done_to_proc;
-+    char *write_buf = NULL, *value;
++    char *write_buf = NULL;
 +    MPI_Status status;
 +    ADIOI_Flatlist_node *flat_buf = NULL;
 +    MPI_Aint buftype_extent;
 +    int stripe_size = striping_info[0], avail_cb_nodes = striping_info[2];
-+    int lflag, data_sieving = 0;
++    int data_sieving = 0;
 +
 +    *error_code = MPI_SUCCESS;        /* changed below if error */
++    /* only I/O errors are currently reported */
 +
 +    /* calculate the number of writes of stripe size to be done.
 +     * That gives the no. of communication phases as well.
@@ -1636,6 +1154,16 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +          flat_buf = flat_buf->next;
 +    }
 +    MPI_Type_extent(datatype, &buftype_extent);
++    /* I need to check if there are any outstanding nonblocking writes to
++     * the file, which could potentially interfere with the writes taking
++     * place in this collective write call. Since this is not likely to be
++     * common, let me do the simplest thing possible here: Each process
++     * completes all pending nonblocking operations before completing.
++     */
++    /*ADIOI_Complete_async(error_code);
++    if (*error_code != MPI_SUCCESS) return;
++    MPI_Barrier(fd->comm);
++    */
 +
 +    iter_st_off = min_st_loc;
 +
@@ -1645,15 +1173,11 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +     * then rank0 will collect data [0, 30] and [60, 90] then write. There
 +     * is a hole in [30, 60], which will cause a read-modify-write in [0, 90].
 +     *
-+     * To reduce its impact on the performance, we disable data sieving
-+     * by default, unless the hint "ds_in_coll" is enabled.
++     * To reduce its impact on the performance, we can disable data sieving
++     * by hint "ds_in_coll".
 +     */
 +    /* check the hint for data sieving */
-+    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
-+    MPI_Info_get(fd->info, "ds_in_coll", MPI_MAX_INFO_VAL, value, &lflag);
-+    if (lflag && !strcmp(value, "enable"))
-+        data_sieving = 1;
-+    ADIOI_Free(value);
++    data_sieving = fd->hints->fs_hints.lustre.ds_in_coll;
 +
 +    for (m = 0; m < max_ntimes; m++) {
 +      /* go through all others_req and my_req to check which will be received
@@ -1738,7 +1262,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +          }
 +      if (flag) {
 +            /* check whether to do data sieving */
-+            if(data_sieving) {
++            if(data_sieving == ADIOI_HINT_ENABLE) {
 +              ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
 +                               ADIO_EXPLICIT_OFFSET, off, &status,
 +                               error_code);
@@ -1790,10 +1314,13 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +    ADIOI_Free(off_list);
 +}
 +
++/* Sets error_code to MPI_SUCCESS if successful, or creates an error code
++ * in the case of error.
++ */
 +static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
 +                                       char *write_buf,
-+                                       ADIOI_Flatlist_node * flat_buf,
-+                                       ADIO_Offset * offset_list,
++                                       ADIOI_Flatlist_node *flat_buf,
++                                       ADIO_Offset *offset_list,
 +                                       int *len_list, int *send_size,
 +                                       int *recv_size, ADIO_Offset off,
 +                                       int size, int *count,
@@ -1801,8 +1328,8 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +                                       int *sent_to_proc, int nprocs,
 +                                       int myrank, int buftype_is_contig,
 +                                       int contig_access_count,
-+                                       int * striping_info,
-+                                       ADIOI_Access * others_req,
++                                       int *striping_info,
++                                       ADIOI_Access *others_req,
 +                                       int *send_buf_idx,
 +                                       int *curr_to_proc, int *done_to_proc,
 +                                         int *hole, int iter,
@@ -1879,7 +1406,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +          *hole = 1;
 +    }
 +    /* check the hint for data sieving */
-+    if (data_sieving && nprocs_recv && *hole) {
++    if (data_sieving == ADIOI_HINT_ENABLE && nprocs_recv && *hole) {
 +        ADIO_ReadContig(fd, write_buf, size, MPI_BYTE,
 +                        ADIO_EXPLICIT_OFFSET, off, &status, &err);
 +        // --BEGIN ERROR HANDLING--
@@ -2057,15 +1584,15 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +}
 +
 +static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf,
-+                                        ADIOI_Flatlist_node * flat_buf,
++                                        ADIOI_Flatlist_node *flat_buf,
 +                                        char **send_buf,
-+                                        ADIO_Offset * offset_list,
++                                        ADIO_Offset *offset_list,
 +                                        int *len_list, int *send_size,
-+                                        MPI_Request * requests,
++                                        MPI_Request *requests,
 +                                        int *sent_to_proc, int nprocs,
 +                                        int myrank,
 +                                        int contig_access_count,
-+                                        int * striping_info,
++                                        int *striping_info,
 +                                        int *send_buf_idx,
 +                                        int *curr_to_proc,
 +                                        int *done_to_proc, int iter,
@@ -2154,10 +1681,10 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +      if (send_size[i])
 +          sent_to_proc[i] = curr_to_proc[i];
 +}
-diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c
---- ad_lustre_orig/ad_lustre_wrstr.c   1970-01-01 08:00:00.000000000 +0800
-+++ ad_lustre/ad_lustre_wrstr.c        2008-10-13 15:34:53.000000000 +0800
-@@ -0,0 +1,472 @@
+diff -ruN adio/ad_lustre_orig/ad_lustre_wrstr.c adio/ad_lustre/ad_lustre_wrstr.c
+--- adio/ad_lustre_orig/ad_lustre_wrstr.c      1970-01-01 08:00:00.000000000 +0800
++++ adio/ad_lustre/ad_lustre_wrstr.c   2009-02-27 10:35:18.000000000 +0800
+@@ -0,0 +1,467 @@
 +/* -*- Mode: C; c-basic-offset:4 ; -*- */
 +/*
 + *   Copyright (C) 1997 University of Chicago.
@@ -2165,7 +1692,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c
 + *
 + *   Copyright (C) 2007 Oak Ridge National Laboratory
 + *
-+ *   Copyright (C) 2008 Sun Microsystems, Lustre group
++ *   Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 + */
 +
 +#include "ad_lustre.h"
@@ -2302,8 +1829,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c
 +    int flag, st_fwr_size, st_n_filetypes, writebuf_len, write_sz;
 +    ADIO_Status status1;
 +    int new_bwr_size, new_fwr_size;
-+    char * value;
-+    int stripe_size, lflag = 0;
++    int stripe_size;
 +    static char myname[] = "ADIOI_LUSTRE_WriteStrided";
 +    int myrank;
 +    MPI_Comm_rank(fd->comm, &myrank);
@@ -2340,11 +1866,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c
 +    bufsize = buftype_size * count;
 +
 +    /* get striping info */
-+    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
-+    MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, &lflag);
-+    if (lflag)
-+      stripe_size = atoi(value);
-+    ADIOI_Free(value);
++    stripe_size = fd->hints->striping_unit;
 +
 +    /* Different buftype to different filetype */
 +    if (!buftype_is_contig && filetype_is_contig) {
@@ -2630,9 +2152,9 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c
 +    if (!buftype_is_contig)
 +        ADIOI_Delete_flattened(datatype);
 +}
-diff -ruN ad_lustre_orig/Makefile.in ad_lustre/Makefile.in
---- ad_lustre_orig/Makefile.in 2008-09-17 14:36:57.000000000 +0800
-+++ ad_lustre/Makefile.in      2008-09-17 18:20:35.000000000 +0800
+diff -ruN adio/ad_lustre_orig/Makefile.in adio/ad_lustre/Makefile.in
+--- adio/ad_lustre_orig/Makefile.in    2008-09-17 14:36:56.000000000 +0800
++++ adio/ad_lustre/Makefile.in 2008-10-17 17:03:06.000000000 +0800
 @@ -16,7 +16,9 @@
  @VPATH@
  
@@ -2644,10 +2166,10 @@ diff -ruN ad_lustre_orig/Makefile.in ad_lustre/Makefile.in
  
  default: $(LIBNAME)
        @if [ "@ENABLE_SHLIB@" != "none" ] ; then \
-diff -ruN ad_lustre_orig/README ad_lustre/README
---- ad_lustre_orig/README      2008-09-17 14:36:57.000000000 +0800
-+++ ad_lustre/README   2008-10-15 22:43:07.000000000 +0800
-@@ -5,6 +5,25 @@
+diff -ruN adio/ad_lustre_orig/README adio/ad_lustre/README
+--- adio/ad_lustre_orig/README 2008-09-17 14:36:56.000000000 +0800
++++ adio/ad_lustre/README      2009-04-24 09:46:20.000000000 +0800
+@@ -5,6 +5,21 @@
    o To post the code for ParColl (Partitioned collective IO)
   
  -----------------------------------------------------
@@ -2656,25 +2178,21 @@ diff -ruN ad_lustre_orig/README ad_lustre/README
 +Improved data redistribution
 +  o Improve I/O pattern identification. Besides checking interleaving,
 +    if request I/O size is small, collective I/O will be performed.
-+    The hint big_req_size can be used to define the req size value.
++    The hint bigsize can be used to define the req size value.
 +  o Provide hint CO for load balancing to control the number of
 +    IO clients for each OST
 +  o Produce stripe-contiguous I/O pattern that Lustre prefers
-+  o Reduce the collective overhead by hints contiguous_data and
-+    same_io_size to remove unnecessary MPI_Alltoall()
 +  o Control read-modify-write in data sieving in collective IO
 +    by hint ds_in_coll.
-+  o Optimize the IO pattern.
-+    - If the whole access size <= stripe size, we suggest all the
-+      IO data will be performed by the same client, to avoid the
-+      extent lock revoking and reassignment.
++  o Reduce extent lock conflicts by make each OST accessed by one or
++    more constant clients.
 +
 +-----------------------------------------------------
  V04: 
  -----------------------------------------------------
    o Direct IO and Lockless IO support
---- common/ad_write_coll_orig.c        2008-10-15 11:24:31.000000000 +0800
-+++ common/ad_write_coll.c     2008-10-15 11:25:39.000000000 +0800
+--- adio/common/ad_write_coll_orig.c   2009-02-27 22:06:46.000000000 +0800
++++ adio/common/ad_write_coll.c        2008-10-15 11:25:38.000000000 +0800
 @@ -42,7 +42,7 @@
                             int *send_buf_idx, int *curr_to_proc, 
                             int *done_to_proc, int iter,