Whamcloud - gitweb
Branch HEAD
[fs/lustre-release.git] / lustre / contrib / adio_driver_mpich2-1.0.7.patch
1 diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
2 --- ad_lustre_orig/ad_lustre_aggregate.c        1970-01-01 08:00:00.000000000 +0800
3 +++ ad_lustre/ad_lustre_aggregate.c     2008-10-15 22:26:35.000000000 +0800
4 @@ -0,0 +1,514 @@
5 +/* -*- Mode: C; c-basic-offset:4 ; -*- */
6 +/*
7 + *   Copyright (C) 1997 University of Chicago.
8 + *   See COPYRIGHT notice in top-level directory.
9 + *
10 + *   Copyright (C) 2007 Oak Ridge National Laboratory
11 + *
12 + *   Copyright (C) 2008 Sun Microsystems, Lustre group
13 + */
14 +
15 +#include "ad_lustre.h"
16 +#include "adio_extern.h"
17 +
18 +void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int ** striping_info_ptr,
19 +                                   int mode, int nprocs,
20 +                                    ADIO_Offset *st_offsets,
21 +                                    ADIO_Offset *end_offsets,
22 +                                    ADIO_Offset *min_st_offset_ptr)
23 +{
24 +    int *striping_info = NULL;
25 +    /* Build the 3-int array returned through *striping_info_ptr:
26 +     *  striping_info[0]: stripe_size    ("striping_unit" hint)
27 +     *  striping_info[1]: stripe_count   ("striping_factor" hint)
28 +     *  striping_info[2]: avail_cb_nodes (computed below)
29 +     * The smallest start offset is also stored in *min_st_offset_ptr. */
30 +    int stripe_size, stripe_count, CO = 1, CO_max = 1, lflag, i;
31 +    int user_cb_nodes = 0, avail_cb_nodes;
32 +    int nprocs_for_coll = fd->hints->cb_nodes;
33 +    ADIO_Offset min_st_offset, max_end_offset;
34 +    char *value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
35 +
36 +    /* Read the hint values from fd->info */
37 +    /* stripe size */
38 +    MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, &lflag);
39 +    if (lflag)
40 +       stripe_size = atoi(value);
41 +    /* stripe count (NOTE(review): like stripe_size, left uninitialized if its hint is absent) */
42 +    /* stripe_size and stripe_count have been validated in ADIOI_LUSTRE_Open() */
43 +    MPI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, value, &lflag);
44 +    if (lflag)
45 +       stripe_count = atoi(value);
46 +
47 +    /* Calculate the available number of I/O clients, that is
48 +     *  avail_cb_nodes=min(cb_nodes, stripe_count*CO), where
49 +     *  CO=1 by default
50 +     */
51 +    if (!mode) {
52 +        /* for collective read (mode == 0),
53 +        * if "CO" clients access the same OST simultaneously,
54 +        * the OST disk seek time would be high. So, to avoid this,
55 +        * it might be better if 1 client only accesses 1 OST.
56 +        * So, we set CO = 1 to meet the above requirement.
57 +        */
58 +       CO = 1;
59 +       /* XXX: there may be a better approach for collective read */
60 +    } else {
61 +        /* CO_max: the largest number of IO clients for each ost group */
62 +        CO_max = (nprocs_for_coll - 1)/ stripe_count + 1;
63 +        /* CO also has been validated in ADIOI_LUSTRE_Open(), >0 */
64 +       MPI_Info_get(fd->info, "CO", MPI_MAX_INFO_VAL, value, &lflag);
65 +       if (lflag)
66 +           CO = atoi(value);
67 +       CO = ADIOI_MIN(CO_max, CO);
68 +    }
69 +    avail_cb_nodes = ADIOI_MIN(nprocs_for_coll, stripe_count * CO);
70 +
71 +    /* user_cb_nodes hint; stays 0 when the hint is not set */
72 +    MPI_Info_get(fd->info, "user_cb_nodes", MPI_MAX_INFO_VAL, value, &lflag);
73 +    if (lflag)
74 +       user_cb_nodes = atoi(value);
75 +    /* If the user doesn't change the cb_nodes and
76 +     * the whole file access portion is no larger than stripe size,
77 +     * we will perform the IO by the same process (rank0 by default).
78 +     */
79 +    /* calculate the whole file access portion over all nprocs processes */
80 +    min_st_offset = st_offsets[0];
81 +    max_end_offset = end_offsets[0];
82 +    for (i = 0; i < nprocs; i ++) {
83 +        min_st_offset = ADIOI_MIN(min_st_offset, st_offsets[i]);
84 +        max_end_offset = ADIOI_MAX(max_end_offset, end_offsets[i]);
85 +    }
86 +    if (!user_cb_nodes) {
87 +        /* Check the whole file access portion
88 +         *  if (whole_range <= stripe_size)
89 +         *      then always collect data to the same process;
90 +         *      set avail_cb_nodes=1; (rank0 by default).
91 +         * This pattern can make good use of Lustre client cache and
92 +         * avoid extent lock assigning and revoking.
93 +         *
94 +         * The recent experiments show good performance. We still need more
95 +         * validation.
96 +         */
97 +        if ((max_end_offset > min_st_offset) &&
98 +            (max_end_offset - min_st_offset) <= (ADIO_Offset) stripe_size)
99 +            avail_cb_nodes = 1;
100 +    }
101 +
102 +    ADIOI_Free(value);
103 +
104 +    *striping_info_ptr = (int *) ADIOI_Malloc(3 * sizeof(int));
105 +    striping_info = *striping_info_ptr;
106 +    striping_info[0] = stripe_size;
107 +    striping_info[1] = stripe_count;
108 +    striping_info[2] = avail_cb_nodes;
109 +
110 +    *min_st_offset_ptr = min_st_offset;
111 +}
112 +
113 +int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off,
114 +                                 ADIO_Offset *len, int *striping_info)
115 +{   /* Returns the rank chosen for file offset off; may shrink *len to the bytes left in off's stripe */
116 +    int rank_index, rank;
117 +    ADIO_Offset avail_bytes;    /* bytes from off up to the end of the stripe containing off */
118 +    int stripe_size = striping_info[0];
119 +    int avail_cb_nodes = striping_info[2];
120 +
121 +    /* Produce the stripe-contiguous pattern for Lustre: stripe number modulo avail_cb_nodes */
122 +    rank_index = (int)((off / stripe_size) % avail_cb_nodes);
123 +
124 +    avail_bytes = (off / (ADIO_Offset)stripe_size + 1) *
125 +                  (ADIO_Offset)stripe_size - off;
126 +    if (avail_bytes < *len) {
127 +       /* this proc only has part of the requested contig. region */
128 +       *len = avail_bytes;
129 +    }
130 +    /* map our index to a rank */
131 +    /* NOTE: FOR NOW WE DON'T HAVE A MAPPING...JUST DO 0..NPROCS_FOR_COLL (NOTE(review): looks stale, ranklist is used below) */
132 +    rank = fd->hints->ranklist[rank_index];
133 +
134 +    return rank;
135 +}
136 +
137 +void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list,
138 +                             int *len_list, int contig_access_count,
139 +                             int *striping_info, int nprocs,
140 +                              int *count_my_req_procs_ptr,
141 +                             int **count_my_req_per_proc_ptr,
142 +                             ADIOI_Access ** my_req_ptr,
143 +                             int **buf_idx_ptr)
144 +{
145 +    /* Split this process's accesses (offset_list/len_list) into per-process lists in my_req[].
146 +     * Nothing different from ADIOI_Calc_my_req(), except calling ADIOI_LUSTRE_Calc_aggregator() instead of the old one */
147 +    int *count_my_req_per_proc, count_my_req_procs, *buf_idx;
148 +    int i, l, proc;
149 +    ADIO_Offset avail_len, rem_len, curr_idx, off;
150 +    ADIOI_Access *my_req;
151 +
152 +    *count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
153 +    count_my_req_per_proc = *count_my_req_per_proc_ptr;
154 +
155 +    /* buf_idx is relevant only if buftype_is_contig.
156 +     * buf_idx[i] gives the index into user_buf where data received
157 +     * from proc. i should be placed. This allows receives to be done
158 +     * without extra buffer. This can't be done if buftype is not contig.
159 +     */
160 +    buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));
161 +    /* initialize buf_idx to -1 */
162 +    for (i = 0; i < nprocs; i++)
163 +       buf_idx[i] = -1;
164 +
165 +    /* one pass just to calculate how much space to allocate for my_req;
166 +     * contig_access_count was calculated way back in ADIOI_Calc_my_off_len()
167 +     */
168 +    for (i = 0; i < contig_access_count; i++) {
169 +       /* short circuit offset/len processing if len == 0
170 +        * (zero-byte read/write)
171 +        */
172 +       if (len_list[i] == 0)
173 +           continue;
174 +       off = offset_list[i];
175 +       avail_len = len_list[i];
176 +       /* we set avail_len to be the total size of the access.
177 +        * then ADIOI_LUSTRE_Calc_aggregator() will modify the value to return
178 +        * the amount that was available.
179 +        */
180 +       proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info);
181 +       count_my_req_per_proc[proc]++;
182 +       /* figure out how much data remains in the access;
183 +        * we'll take care of this data (if there is any)
184 +        * in the while loop below.
185 +        */
186 +       rem_len = len_list[i] - avail_len;
187 +
188 +       while (rem_len != 0) {
189 +           off += avail_len;   /* point to first remaining byte */
190 +           avail_len = rem_len;        /* save remaining size, pass to calc */
191 +           proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info);
192 +           count_my_req_per_proc[proc]++;
193 +           rem_len -= avail_len;       /* reduce remaining length by amount from fd */
194 +       }
195 +    }
196 +
197 +    *my_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs * sizeof(ADIOI_Access));
198 +    my_req = *my_req_ptr;
199 +
200 +    count_my_req_procs = 0;
201 +    for (i = 0; i < nprocs; i++) {
202 +       if (count_my_req_per_proc[i]) {
203 +           my_req[i].offsets = (ADIO_Offset *)
204 +                               ADIOI_Malloc(count_my_req_per_proc[i] *
205 +                                             sizeof(ADIO_Offset));
206 +           my_req[i].lens = (int *) ADIOI_Malloc(count_my_req_per_proc[i] *
207 +                                                 sizeof(int));
208 +           count_my_req_procs++;
209 +       }
210 +       my_req[i].count = 0;    /* will be incremented where needed later */
211 +    }
212 +
213 +    /* now fill in my_req, repeating the same walk as the counting pass above */
214 +    curr_idx = 0;
215 +    for (i = 0; i < contig_access_count; i++) {
216 +       if (len_list[i] == 0)
217 +           continue;
218 +       off = offset_list[i];
219 +       avail_len = len_list[i];
220 +       proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info);
221 +
222 +       /* for each separate contiguous access from this process */
223 +       if (buf_idx[proc] == -1)
224 +           buf_idx[proc] = (int) curr_idx;
225 +
226 +       l = my_req[proc].count;
227 +       curr_idx += (int) avail_len;    /* NOTE(review): curr_idx is an ADIO_Offset here; the (int) casts and int buf_idx[] still truncate */
228 +
229 +       rem_len = len_list[i] - avail_len;
230 +
231 +       /* store the proc, offset, and len information in an array
232 +        * of structures, my_req. Each structure contains the
233 +        * offsets and lengths located in that process's FD,
234 +        * and the associated count.
235 +        */
236 +       my_req[proc].offsets[l] = off;
237 +       my_req[proc].lens[l] = (int) avail_len;
238 +       my_req[proc].count++;
239 +
240 +       while (rem_len != 0) {
241 +           off += avail_len;
242 +           avail_len = rem_len;
243 +           proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len,
244 +                                                striping_info);
245 +           if (buf_idx[proc] == -1)
246 +               buf_idx[proc] = (int) curr_idx;
247 +
248 +           l = my_req[proc].count;
249 +           curr_idx += avail_len;
250 +           rem_len -= avail_len;
251 +
252 +           my_req[proc].offsets[l] = off;
253 +           my_req[proc].lens[l] = (int) avail_len;
254 +           my_req[proc].count++;
255 +       }
256 +    }
257 +
258 +#ifdef AGG_DEBUG
259 +    for (i = 0; i < nprocs; i++) {
260 +       if (count_my_req_per_proc[i] > 0) {
261 +           FPRINTF(stdout, "data needed from %d (count = %d):\n",
262 +                           i, my_req[i].count);
263 +           for (l = 0; l < my_req[i].count; l++) {
264 +               FPRINTF(stdout, "   off[%d] = %lld, len[%d] = %d\n",
265 +                               l, my_req[i].offsets[l], l, my_req[i].lens[l]);
266 +           }
267 +       }
268 +    }
269 +#endif
270 +#if 0
271 +    for (i = 0; i < nprocs; i++) {
272 +       FPRINTF(stdout, "buf_idx[%d] = 0x%x\n", i, buf_idx[i]);
273 +    }
274 +#endif
275 +
276 +    *count_my_req_procs_ptr = count_my_req_procs;
277 +    *buf_idx_ptr = buf_idx;
278 +}
279 +
280 +int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count,
281 +                          int *len_list, int nprocs)
282 +{
283 +    /* Decide whether collective I/O should be used.  Returns 0 when the
284 +     * "big_req_size" hint is > 0 and the average request size over all
285 +     * processes of fd->comm exceeds it, 1 otherwise.  Every process of
286 +     * fd->comm must call this (it performs two MPI_Allreduce operations).
287 +     * nprocs is currently unused. */
288 +
289 +    int i, docollect = 1, lflag, big_req_size = 0;
290 +    ADIO_Offset req_size = 0, total_req_size, avg_req_size;
291 +    int total_access_count;
292 +    char *value = NULL;
293 +
294 +    /* calculate total_req_size and total_access_count */
295 +    for (i = 0; i < contig_access_count; i++)
296 +        req_size += len_list[i];
297 +    MPI_Allreduce(&req_size, &total_req_size, 1, ADIO_OFFSET, MPI_SUM,
298 +               fd->comm);
299 +    MPI_Allreduce(&contig_access_count, &total_access_count, 1, MPI_INT, MPI_SUM,
300 +               fd->comm);
301 +    /* estimate average req_size; 0 when no process has any access (avoids dividing by zero) */
302 +    avg_req_size = total_access_count ? total_req_size / total_access_count : 0;
303 +
304 +    /* get hint of big_req_size */
305 +    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
306 +    MPI_Info_get(fd->info, "big_req_size", MPI_MAX_INFO_VAL, value, &lflag);
307 +    if (lflag)
308 +        big_req_size = atoi(value);
309 +    /* Don't perform collective I/O if there are big requests (compared as ADIO_Offset, no narrowing) */
310 +    if ((big_req_size > 0) && (avg_req_size > big_req_size))
311 +        docollect = 0;
312 +
313 +    ADIOI_Free(value);
314 +
315 +    return docollect;
316 +}
317 +
318 +void ADIOI_LUSTRE_Calc_others_req(ADIO_File fd, int count_my_req_procs,
319 +                                 int *count_my_req_per_proc,
320 +                                 ADIOI_Access * my_req,
321 +                                 int nprocs, int myrank,
322 +                                  ADIO_Offset req_len,
323 +                                  ADIO_Offset min_st_offset,
324 +                                  int *striping_info,
325 +                                 int *count_others_req_procs_ptr,
326 +                                 ADIOI_Access ** others_req_ptr)
327 +{
328 +    /* Fill others_req[] (allocated here, one entry per process) with the requests of other processes that will be written by this process */
329 +
330 +    int *count_others_req_per_proc, count_others_req_procs, proc;
331 +    int i, j, lflag, samesize = 0, contiguous = 0;
332 +    int avail_cb_nodes = striping_info[2];
333 +    MPI_Request *send_requests, *recv_requests;
334 +    MPI_Status *statuses;
335 +    ADIOI_Access *others_req;
336 +    char *value = NULL;
337 +    ADIO_Offset off, avail_len, rem_len, *all_lens;
338 +
339 +    /* There are two hints, which could reduce some MPI communication overhead,
340 +     * if the user knows the I/O pattern and sets them correctly. */
341 +    /* They are
342 +     * contiguous_data: if the data are contiguous,
343 +     *                  we don't need to do MPI_Alltoall().
344 +     * same_io_size: And if the data req size is same,
345 +     *               we can calculate the offset directly
346 +     */
347 +    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
348 +    /* hint of contiguous data: enabled only by the exact value "yes" */
349 +    MPI_Info_get(fd->info, "contiguous_data", MPI_MAX_INFO_VAL, value, &lflag);
350 +    if (lflag && !strcmp(value, "yes"))
351 +        contiguous = 1;
352 +    /* hint of same io size: enabled only by the exact value "yes" */
353 +    MPI_Info_get(fd->info, "same_io_size", MPI_MAX_INFO_VAL, value, &lflag);
354 +    if (lflag && !strcmp(value, "yes"))
355 +        samesize = 1;
356 +    ADIOI_Free(value);
357 +
358 +    *others_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs *
359 +                                                    sizeof(ADIOI_Access));
360 +    others_req = *others_req_ptr;
361 +
362 +    /* if the data are contiguous, we can calculate the offset and length
363 +     * of the other requests simply, instead of MPI_Alltoall() */
364 +    if (contiguous) {
365 +        for (i = 0; i < nprocs; i++) {
366 +            others_req[i].count = 0;
367 +        }
368 +        all_lens = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
369 +
370 +        /* same req size ? */
371 +        if (samesize == 0) {
372 +            /* exchange request length */
373 +            MPI_Allgather(&req_len, 1, ADIO_OFFSET, all_lens, 1, ADIO_OFFSET,
374 +                          fd->comm);
375 +        } else { /* same request size */
376 +            /* assign request length to all_lens[] */
377 +            for (i = 0; i < nprocs; i ++)
378 +               all_lens[i] = req_len;
379 +        }
380 +        if (myrank < avail_cb_nodes) {
381 +            /* It's an IO client and it will receive data from others */
382 +            off = min_st_offset;    /* offsets are rebuilt by laying the ranks' lengths end to end from min_st_offset */
383 +            /* calculate others_req[i].count */
384 +            for (i = 0; i < nprocs; i++) {
385 +                avail_len = all_lens[i];
386 +                rem_len = avail_len;
387 +                while (rem_len > 0) {
388 +                   proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len,
389 +                                                        striping_info);
390 +                    if (proc == myrank) {
391 +                        others_req[i].count ++;
392 +                    }
393 +                    off += avail_len;
394 +                    rem_len -= avail_len;
395 +                    avail_len = rem_len;
396 +                }
397 +            }
398 +            /* calculate offset and len for each request (second pass, same walk as the counting pass) */
399 +            off = min_st_offset;
400 +            for (i = 0; i < nprocs; i++) {
401 +                if (others_req[i].count) {
402 +                   others_req[i].offsets = (ADIO_Offset *)
403 +                                            ADIOI_Malloc(others_req[i].count *
404 +                                                        sizeof(ADIO_Offset));
405 +                   others_req[i].lens = (int *)
406 +                                         ADIOI_Malloc(others_req[i].count *
407 +                                                      sizeof(int));
408 +                    others_req[i].mem_ptrs = (MPI_Aint *)
409 +                                             ADIOI_Malloc(others_req[i].count *
410 +                                                         sizeof(MPI_Aint));
411 +                }
412 +                j = 0;
413 +                avail_len = all_lens[i];
414 +                rem_len = avail_len;
415 +                while (rem_len > 0) {
416 +                   proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len,
417 +                                                        striping_info);
418 +                    if (proc == myrank) {
419 +                        others_req[i].offsets[j] = off;
420 +                        others_req[i].lens[j] = (int)avail_len;
421 +                        j ++;
422 +                    }
423 +                    off += avail_len;
424 +                    rem_len -= avail_len;
425 +                    avail_len = rem_len;
426 +                }
427 +            }
428 +        }
429 +        ADIOI_Free(all_lens);   /* NOTE(review): this branch never stores *count_others_req_procs_ptr */
430 +    } else {
431 +        /* multiple non-contiguous requests */
432 +        /* first find out how much to send/recv and from/to whom */
433 +
434 +        /*
435 +         * count_others_req_procs:
436 +         *    number of processes whose requests will be written by
437 +         *    this process (including this process itself)
438 +         * count_others_req_per_proc[i]:
439 +         *    how many separate contiguous requests of proc[i] will be
440 +         *    written by this process.
441 +         */
442 +
443 +        count_others_req_per_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
444 +
445 +        MPI_Alltoall(count_my_req_per_proc, 1, MPI_INT,
446 +                    count_others_req_per_proc, 1, MPI_INT, fd->comm);
447 +
448 +        count_others_req_procs = 0;
449 +        for (i = 0; i < nprocs; i++) {
450 +           if (count_others_req_per_proc[i]) {
451 +               others_req[i].count = count_others_req_per_proc[i];
452 +               others_req[i].offsets = (ADIO_Offset *)
453 +                                        ADIOI_Malloc(others_req[i].count *
454 +                                                sizeof(ADIO_Offset));
455 +               others_req[i].lens = (int *)
456 +                                    ADIOI_Malloc(others_req[i].count *
457 +                                                  sizeof(int));
458 +               others_req[i].mem_ptrs = (MPI_Aint *)
459 +                                        ADIOI_Malloc(others_req[i].count *
460 +                                                     sizeof(MPI_Aint));
461 +               count_others_req_procs++;
462 +           } else
463 +               others_req[i].count = 0;
464 +        }
465 +
466 +        /* now send the calculated offsets and lengths to respective processes */
467 +
468 +        send_requests = (MPI_Request *) ADIOI_Malloc(2 * (count_my_req_procs + 1) *
469 +                                                     sizeof(MPI_Request));
470 +        recv_requests = (MPI_Request *) ADIOI_Malloc(2 * (count_others_req_procs+1)*
471 +                                                    sizeof(MPI_Request));
472 +        /* +1 to avoid a 0-size malloc */
473 +
474 +        j = 0;
475 +        for (i = 0; i < nprocs; i++) {
476 +           if (others_req[i].count) {
477 +               MPI_Irecv(others_req[i].offsets, others_req[i].count,
478 +                         ADIO_OFFSET, i, i + myrank, fd->comm,
479 +                         &recv_requests[j]);
480 +               j++;
481 +               MPI_Irecv(others_req[i].lens, others_req[i].count,
482 +                         MPI_INT, i, i + myrank + 1, fd->comm,
483 +                         &recv_requests[j]);
484 +               j++;
485 +           }
486 +        }
487 +
488 +        j = 0;
489 +        for (i = 0; i < nprocs; i++) {
490 +           if (my_req[i].count) {
491 +               MPI_Isend(my_req[i].offsets, my_req[i].count,
492 +                         ADIO_OFFSET, i, i + myrank, fd->comm,
493 +                         &send_requests[j]);
494 +               j++;
495 +               MPI_Isend(my_req[i].lens, my_req[i].count,
496 +                         MPI_INT, i, i + myrank + 1, fd->comm,
497 +                         &send_requests[j]);
498 +               j++;
499 +           }
500 +        }
501 +
502 +        statuses = (MPI_Status *)
503 +                   ADIOI_Malloc((1 + 2 * ADIOI_MAX(count_my_req_procs,
504 +                                                  count_others_req_procs)) *
505 +                                         sizeof(MPI_Status));
506 +        /* +1 to avoid a 0-size malloc */
507 +
508 +        MPI_Waitall(2 * count_my_req_procs, send_requests, statuses);
509 +        MPI_Waitall(2 * count_others_req_procs, recv_requests, statuses);
510 +
511 +        ADIOI_Free(send_requests);
512 +        ADIOI_Free(recv_requests);
513 +        ADIOI_Free(statuses);
514 +        ADIOI_Free(count_others_req_per_proc);
515 +
516 +        *count_others_req_procs_ptr = count_others_req_procs;
517 +    }
518 +}
519 diff -ruN ad_lustre_orig/ad_lustre.c ad_lustre/ad_lustre.c
520 --- ad_lustre_orig/ad_lustre.c  2008-09-17 14:36:57.000000000 +0800
521 +++ ad_lustre/ad_lustre.c       2008-09-17 18:20:35.000000000 +0800
522 @@ -1,9 +1,11 @@
523  /* -*- Mode: C; c-basic-offset:4 ; -*- */
524 -/* 
525 - *   Copyright (C) 2001 University of Chicago. 
526 +/*
527 + *   Copyright (C) 2001 University of Chicago.
528   *   See COPYRIGHT notice in top-level directory.
529   *
530   *   Copyright (C) 2007 Oak Ridge National Laboratory
531 + *
532 + *   Copyright (C) 2008 Sun Microsystems, Lustre group
533   */
534  
535  #include "ad_lustre.h"
536 @@ -13,13 +15,13 @@
537      ADIOI_LUSTRE_ReadContig, /* ReadContig */
538      ADIOI_LUSTRE_WriteContig, /* WriteContig */
539      ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
540 -    ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */
541 +    ADIOI_LUSTRE_WriteStridedColl, /* WriteStridedColl */
542      ADIOI_GEN_SeekIndividual, /* SeekIndividual */
543 -    ADIOI_GEN_Fcntl, /* Fcntl */
544 +    ADIOI_LUSTRE_Fcntl, /* Fcntl */
545      ADIOI_LUSTRE_SetInfo, /* SetInfo */
546      ADIOI_GEN_ReadStrided, /* ReadStrided */
547 -    ADIOI_GEN_WriteStrided, /* WriteStrided */
548 -    ADIOI_GEN_Close, /* Close */
549 +    ADIOI_LUSTRE_WriteStrided, /* WriteStrided */
550 +    ADIOI_LUSTRE_Close, /* Close */
551  #if defined(ROMIO_HAVE_WORKING_AIO) && !defined(CRAY_XT_LUSTRE)
552      ADIOI_GEN_IreadContig, /* IreadContig */
553      ADIOI_GEN_IwriteContig, /* IwriteContig */
554 diff -ruN ad_lustre_orig/ad_lustre_close.c ad_lustre/ad_lustre_close.c
555 --- ad_lustre_orig/ad_lustre_close.c    1970-01-01 08:00:00.000000000 +0800
556 +++ ad_lustre/ad_lustre_close.c 2008-09-17 18:20:35.000000000 +0800
557 @@ -0,0 +1,42 @@
558 +/* -*- Mode: C; c-basic-offset:4 ; -*- */
559 +/*
560 + *
561 + *   Copyright (C) 1997 University of Chicago.
562 + *   See COPYRIGHT notice in top-level directory.
563 + *
564 + *   Copyright (C) 2007 Oak Ridge National Laboratory
565 + *
566 + *   Copyright (C) 2008 Sun Microsystems, Lustre group
567 + */
568 +
569 +#include "ad_lustre.h"
570 +
571 +#ifdef PROFILE
572 +#include "mpe.h"
573 +#endif
574 +
575 +void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code)
576 +{
577 +    int err;    /* return value of close(); -1 signals failure */
578 +    static char myname[] = "ADIOI_LUSTRE_CLOSE";
579 +    /* Close fd->fd_sys, mark it invalid (-1) and report the result in *error_code */
580 +#ifdef PROFILE
581 +    MPE_Log_event(9, 0, "start close");
582 +#endif
583 +
584 +    err = close(fd->fd_sys);
585 +
586 +#ifdef PROFILE
587 +    MPE_Log_event(10, 0, "end close");
588 +#endif
589 +
590 +    fd->fd_sys = -1;
591 +    /* NOTE(review): with PROFILE set, errno is read only after MPE_Log_event() has run */
592 +    if (err == -1) {
593 +       *error_code =
594 +           MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname,
595 +                                __LINE__, MPI_ERR_IO, "**io", "**io %s",
596 +                                strerror(errno));
597 +    } else
598 +       *error_code = MPI_SUCCESS;
599 +}
600 diff -ruN ad_lustre_orig/ad_lustre.h ad_lustre/ad_lustre.h
601 --- ad_lustre_orig/ad_lustre.h  2008-09-17 14:36:57.000000000 +0800
602 +++ ad_lustre/ad_lustre.h       2008-10-15 21:22:52.000000000 +0800
603 @@ -1,9 +1,11 @@
604  /* -*- Mode: C; c-basic-offset:4 ; -*- */
605 -/* 
606 - *   Copyright (C) 1997 University of Chicago. 
607 +/*
608 + *   Copyright (C) 1997 University of Chicago.
609   *   See COPYRIGHT notice in top-level directory.
610   *
611   *   Copyright (C) 2007 Oak Ridge National Laboratory
612 + *
613 + *   Copyright (C) 2008 Sun Microsystems, Lustre group
614   */
615  
616  #ifndef AD_UNIX_INCLUDE
617 @@ -24,7 +26,32 @@
618  
619  /*#include <fcntl.h>*/
620  #include <sys/ioctl.h>
621 +#ifdef WITH_LUSTRE
622  #include "lustre/lustre_user.h"
623 +#else
624 +/* copy something from lustre_user.h here */
625 +#  define LOV_USER_MAGIC 0x0BD10BD0
626 +#  define LL_IOC_LOV_SETSTRIPE  _IOW ('f', 154, long)
627 +#  define LL_IOC_LOV_GETSTRIPE  _IOW ('f', 155, long)
628 +#  define lov_user_ost_data lov_user_ost_data_v1
629 +struct lov_user_ost_data_v1 {     /* per-stripe data structure (local copy for builds without WITH_LUSTRE) */
630 +        __u64 l_object_id;        /* OST object ID */
631 +        __u64 l_object_gr;        /* OST object group (creating MDS number) */
632 +        __u32 l_ost_gen;          /* generation of this OST index */
633 +        __u32 l_ost_idx;          /* OST index in LOV */
634 +} __attribute__((packed));
635 +#define lov_user_md lov_user_md_v1
636 +struct lov_user_md_v1 {           /* LOV EA user data (host-endian) */
637 +        __u32 lmm_magic;          /* magic number (this fallback only defines LOV_USER_MAGIC) */
638 +        __u32 lmm_pattern;        /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
639 +        __u64 lmm_object_id;      /* LOV object ID */
640 +        __u64 lmm_object_gr;      /* LOV object group */
641 +        __u32 lmm_stripe_size;    /* size of stripe in bytes */
642 +        __u16 lmm_stripe_count;   /* num stripes in use for this object */
643 +        __u16 lmm_stripe_offset;  /* starting stripe offset in lmm_objects */
644 +        struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data (zero-length trailing array) */
645 +} __attribute__((packed));
646 +#endif
647  #include "adio.h"
648  /*#include "adioi.h"*/
649  
650 @@ -41,24 +68,56 @@
651  
652  void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code);
653  void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code);
654 -void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count, 
655 -                      MPI_Datatype datatype, int file_ptr_type,
656 -                     ADIO_Offset offset, ADIO_Status *status, int
657 -                    *error_code);
658 -void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count, 
659 -                      MPI_Datatype datatype, int file_ptr_type,
660 -                      ADIO_Offset offset, ADIO_Status *status, int
661 -                     *error_code);   
662 +void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count,
663 +                             MPI_Datatype datatype, int file_ptr_type,
664 +                             ADIO_Offset offset, ADIO_Status *status,
665 +                             int *error_code);
666 +void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count,
667 +                              MPI_Datatype datatype, int file_ptr_type,
668 +                              ADIO_Offset offset, ADIO_Status *status,
669 +                              int *error_code);
670 +void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count,
671 +                              MPI_Datatype datatype, int file_ptr_type,
672 +                              ADIO_Offset offset, ADIO_Status *status,
673 +                              int *error_code);
674  void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count,
675 -                      MPI_Datatype datatype, int file_ptr_type,
676 -                      ADIO_Offset offset, ADIO_Status *status, int
677 -                      *error_code);
678 +                                  MPI_Datatype datatype, int file_ptr_type,
679 +                                  ADIO_Offset offset, ADIO_Status *status,
680 +                                   int *error_code);
681  void ADIOI_LUSTRE_ReadStridedColl(ADIO_File fd, void *buf, int count,
682 -                      MPI_Datatype datatype, int file_ptr_type,
683 -                      ADIO_Offset offset, ADIO_Status *status, int
684 -                      *error_code);
685 +                                 MPI_Datatype datatype, int file_ptr_type,
686 +                                 ADIO_Offset offset, ADIO_Status *status,
687 +                                  int *error_code);
688 +void ADIOI_LUSTRE_ReadStrided(ADIO_File fd, void *buf, int count,
689 +                             MPI_Datatype datatype, int file_ptr_type,
690 +                             ADIO_Offset offset, ADIO_Status *status,
691 +                              int *error_code);
692  void ADIOI_LUSTRE_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct,
693                        int *error_code);
694  void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
695 -
696 +void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int ** striping_info_ptr,
697 +                                   int mode, int nprocs,
698 +                                    ADIO_Offset *st_offsets,
699 +                                    ADIO_Offset *end_offsets,
700 +                                    ADIO_Offset *min_st_offset);
701 +int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off,
702 +                                ADIO_Offset *len, int *striping_info);
703 +void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list,
704 +                             int *len_list, int contig_access_count,
705 +                             int *striping_info, int nprocs,
706 +                              int *count_my_req_procs_ptr,
707 +                             int **count_my_req_per_proc_ptr,
708 +                             ADIOI_Access ** my_req_ptr,
709 +                             int **buf_idx_ptr);
710 +int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count,
711 +                          int *len_list, int nprocs);
712 +void ADIOI_LUSTRE_Calc_others_req(ADIO_File fd, int count_my_req_procs,
713 +                                 int *count_my_req_per_proc,
714 +                                 ADIOI_Access * my_req,
715 +                                 int nprocs, int myrank,
716 +                                  ADIO_Offset req_len,
717 +                                  ADIO_Offset min_st_offset,
718 +                                  int *striping_info,
719 +                                 int *count_others_req_procs_ptr,
720 +                                 ADIOI_Access ** others_req_ptr);
721  #endif /* End of AD_UNIX_INCLUDE */
722 diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c
723 --- ad_lustre_orig/ad_lustre_hints.c    2008-09-17 14:36:57.000000000 +0800
724 +++ ad_lustre/ad_lustre_hints.c 2008-10-15 21:31:00.000000000 +0800
725 @@ -1,9 +1,11 @@
726  /* -*- Mode: C; c-basic-offset:4 ; -*- */
727 -/* 
728 - *   Copyright (C) 1997 University of Chicago. 
729 +/*
730 + *   Copyright (C) 1997 University of Chicago.
731   *   See COPYRIGHT notice in top-level directory.
732   *
733   *   Copyright (C) 2007 Oak Ridge National Laboratory
734 + *
735 + *   Copyright (C) 2008 Sun Microsystems, Lustre group
736   */
737  
738  #include "ad_lustre.h"
739 @@ -11,130 +13,189 @@
740  
741  void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
742  {
743 -    char *value, *value_in_fd;
744 -    int flag, tmp_val[3], str_factor=-1, str_unit=0, start_iodev=-1;
745 -    struct lov_user_md lum = { 0 };
746 -    int err, myrank, fd_sys, perm, amode, old_mask;
747 +    char *value = NULL;
748 +    int flag, tmp_val, int_val, str_factor, str_unit, start_iodev;
749 +    static char myname[] = "ADIOI_LUSTRE_SETINFO";
750  
751 +    *error_code = MPI_SUCCESS;
752      value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
753 +
754      if ( (fd->info) == MPI_INFO_NULL) {
755 -       /* This must be part of the open call. can set striping parameters 
756 -           if necessary. */ 
757 +       /* This must be part of the open call. can set striping parameters
758 +           if necessary. */
759         MPI_Info_create(&(fd->info));
760  
761         MPI_Info_set(fd->info, "direct_read", "false");
762         MPI_Info_set(fd->info, "direct_write", "false");
763         fd->direct_read = fd->direct_write = 0;
764 -       
765 -       /* has user specified striping or server buffering parameters 
766 +
767 +       /* has user specified striping or server buffering parameters
768             and do they have the same value on all processes? */
769         if (users_info != MPI_INFO_NULL) {
770 -           MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, 
771 -                        value, &flag);
772 -           if (flag) 
773 -               str_unit=atoi(value);
774 -
775 -           MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, 
776 -                        value, &flag);
777 -           if (flag) 
778 -               str_factor=atoi(value);
779 -
780 -           MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL, 
781 +            /* direct read and write */
782 +           MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL,
783                          value, &flag);
784 -           if (flag) 
785 -               start_iodev=atoi(value);
786 -
787 -           MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL, 
788 -                            value, &flag);
789             if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) {
790                 MPI_Info_set(fd->info, "direct_read", "true");
791                 fd->direct_read = 1;
792             }
793 -
794 -           MPI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL, 
795 +           MPI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL,
796                              value, &flag);
797             if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) {
798                 MPI_Info_set(fd->info, "direct_write", "true");
799                 fd->direct_write = 1;
800             }
801 +            /*  stripe size */
802 +           MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
803 +                        value, &flag);
804 +           if (flag && (str_unit = atoi(value))) {
805 +               tmp_val = str_unit;
806 +               MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
807 +               if (tmp_val != str_unit) {
808 +                   MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
809 +                                                      "striping_unit",
810 +                                                      error_code);
811 +                    ADIOI_Free(value);
812 +                   return;
813 +               }
814 +               MPI_Info_set(fd->info, "striping_unit", value);
815 +           }
816 +            /* stripe count */
817 +           MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
818 +                        value, &flag);
819 +           if (flag && (str_factor = atoi(value))) {
820 +               tmp_val = str_factor;
821 +               MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
822 +               if (tmp_val != str_factor) {
823 +                   MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
824 +                                                      "striping_factor",
825 +                                                      error_code);
826 +                    ADIOI_Free(value);
827 +                   return;
828 +               }
829 +               MPI_Info_set(fd->info, "striping_factor", value);
830 +           }
831 +            /* stripe offset */
832 +            MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL,
833 +                        value, &flag);
834 +           if (flag && ((start_iodev = atoi(value)) >= 0)) {
835 +               tmp_val = start_iodev;
836 +               MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
837 +               if (tmp_val != start_iodev) {
838 +                   MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
839 +                                                      "start_iodevice",
840 +                                                      error_code);
841 +                    ADIOI_Free(value);
842 +                   return;
843 +               }
844 +               MPI_Info_set(fd->info, "start_iodevice", value);
845 +           }
846         }
847 -
848 -       MPI_Comm_rank(fd->comm, &myrank);
849 -       if (myrank == 0) {
850 -           tmp_val[0] = str_factor;
851 -           tmp_val[1] = str_unit;
852 -           tmp_val[2] = start_iodev;
853 +    }
854 +    if (users_info != MPI_INFO_NULL) {
855 +        /* CO: IO Clients/OST,
856 +         * to keep the load balancing between clients and OSTs */
857 +        MPI_Info_get(users_info, "CO", MPI_MAX_INFO_VAL, value,
858 +                     &flag);
859 +       if (flag && (int_val = atoi(value)) > 0) {
860 +            tmp_val = int_val;
861 +           MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
862 +           if (tmp_val != int_val) {
863 +                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
864 +                                                   "CO",
865 +                                                   error_code);
866 +                ADIOI_Free(value);
867 +               return;
868 +           }
869 +           MPI_Info_set(fd->info, "CO", value);
870         }
871 -       MPI_Bcast(tmp_val, 3, MPI_INT, 0, fd->comm);
872 -
873 -       if (tmp_val[0] != str_factor 
874 -               || tmp_val[1] != str_unit 
875 -               || tmp_val[2] != start_iodev) {
876 -           FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: All keys"
877 -                   "-striping_factor:striping_unit:start_iodevice "
878 -                   "need to be identical across all processes\n");
879 -           MPI_Abort(MPI_COMM_WORLD, 1);
880 -               } else if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0)) {
881 -            /* if user has specified striping info, process 0 tries to set it */
882 -           if (!myrank) {
883 -               if (fd->perm == ADIO_PERM_NULL) {
884 -                   old_mask = umask(022);
885 -                   umask(old_mask);
886 -                   perm = old_mask ^ 0666;
887 -               }
888 -               else perm = fd->perm;
889 -
890 -               amode = 0;
891 -               if (fd->access_mode & ADIO_CREATE)
892 -                   amode = amode | O_CREAT;
893 -               if (fd->access_mode & ADIO_RDONLY)
894 -                   amode = amode | O_RDONLY;
895 -               if (fd->access_mode & ADIO_WRONLY)
896 -                   amode = amode | O_WRONLY;
897 -               if (fd->access_mode & ADIO_RDWR)
898 -                   amode = amode | O_RDWR;
899 -               if (fd->access_mode & ADIO_EXCL)
900 -                   amode = amode | O_EXCL;
901 -
902 -               /* we need to create file so ensure this is set */
903 -               amode = amode | O_LOV_DELAY_CREATE | O_CREAT;
904 -
905 -               fd_sys = open(fd->filename, amode, perm);
906 -               if (fd_sys == -1) { 
907 -                   if (errno != EEXIST) 
908 -                       fprintf(stderr, 
909 -                               "Failure to open file %s %d %d\n",strerror(errno), amode, perm);
910 -               } else {
911 -                   lum.lmm_magic = LOV_USER_MAGIC;
912 -                   lum.lmm_pattern = 0;
913 -                   lum.lmm_stripe_size = str_unit;
914 -                   lum.lmm_stripe_count = str_factor;
915 -                   lum.lmm_stripe_offset = start_iodev;
916 -
917 -                   err = ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum);
918 -                   if (err == -1 && errno != EEXIST) { 
919 -                       fprintf(stderr, "Failure to set stripe info %s \n", strerror(errno));
920 -                   }
921 -                   close(fd_sys);
922 -              }
923 -           } /* End of striping parameters validation */
924 +        /* big_req_size:
925 +         * if the req size is bigger than this,
926 +         * collective IO may not be performed.
927 +         */
928 +       MPI_Info_get(users_info, "big_req_size", MPI_MAX_INFO_VAL, value,
929 +                     &flag);
930 +       if (flag && (int_val = atoi(value)) > 0) {
931 +            tmp_val = int_val;
932 +           MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
933 +           if (tmp_val != int_val) {
934 +               MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
935 +                                                  "big_req_size",
936 +                                                  error_code);
937 +                ADIOI_Free(value);
938 +               return;
939 +           }
940 +           MPI_Info_set(fd->info, "big_req_size", value);
941 +        }
942 +        /* ds_in_coll: disable data sieving in collective IO */
943 +       MPI_Info_get(users_info, "ds_in_coll", MPI_MAX_INFO_VAL,
944 +                    value, &flag);
945 +       if (flag && (!strcmp(value, "enable") ||
946 +                     !strcmp(value, "ENABLE"))) {
947 +            tmp_val = int_val = 1;
948 +           MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
949 +           if (tmp_val != int_val) {
950 +               MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
951 +                                                  "ds_in_coll",
952 +                                                  error_code);
953 +                ADIOI_Free(value);
954 +                return;
955 +           }
956 +           MPI_Info_set(fd->info, "ds_in_coll", "enable");
957 +       }
958 +        /* contiguous_data: whether the data are contiguous */
959 +       MPI_Info_get(users_info, "contiguous_data", MPI_MAX_INFO_VAL,
960 +                    value, &flag);
961 +        if (flag && (!strcmp(value, "yes") ||
962 +                     !strcmp(value, "YES"))) {
963 +            tmp_val = int_val = 1;
964 +           MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
965 +           if (tmp_val != int_val) {
966 +               MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
967 +                                                  "contiguous_data",
968 +                                                  error_code);
969 +                ADIOI_Free(value);
970 +                return;
971 +           }
972 +           MPI_Info_set(fd->info, "contiguous_data", "yes");
973         }
974 -       
975 -       MPI_Barrier(fd->comm);
976 -       /* set the values for collective I/O and data sieving parameters */
977 -       ADIOI_GEN_SetInfo(fd, users_info, error_code);
978 -    } else {
979 -       /* The file has been opened previously and fd->fd_sys is a valid
980 -           file descriptor. cannot set striping parameters now. */
981 -       
982 -       /* set the values for collective I/O and data sieving parameters */
983 -       ADIOI_GEN_SetInfo(fd, users_info, error_code);
984 +        /* same_io_size: whether the req size is same */
985 +       MPI_Info_get(users_info, "same_io_size", MPI_MAX_INFO_VAL,
986 +                    value, &flag);
987 +        if (flag && (!strcmp(value, "yes") ||
988 +                     !strcmp(value, "YES"))) {
989 +            tmp_val = int_val = 1;
990 +           MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
991 +           if (tmp_val != int_val) {
992 +               MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
993 +                                                  "same_io_size",
994 +                                                  error_code);
995 +                ADIOI_Free(value);
996 +                return;
997 +           }
998 +           MPI_Info_set(fd->info, "same_io_size", "yes");
999 +       }
1000 +        /* Remember the cb_nodes value that the user set; it is stored
1001 +         * under "user_cb_nodes" and later used to improve collective I/O.
1002 +         */
1003 +       MPI_Info_get(users_info, "cb_nodes", MPI_MAX_INFO_VAL, value, &flag);
1004 +       if (flag && (int_val = atoi(value)) > 0) {
1005 +            tmp_val = int_val;
1006 +           MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
1007 +           if (tmp_val != int_val) {
1008 +               MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
1009 +                                                  "cb_nodes",
1010 +                                                  error_code);
1011 +                ADIOI_Free(value);
1012 +               return;
1013 +           }
1014 +           MPI_Info_set(fd->info, "user_cb_nodes", value);
1015 +        }
1016      }
1017
1018 -    if (ADIOI_Direct_read) fd->direct_read = 1;
1019 -    if (ADIOI_Direct_write) fd->direct_write = 1;
1020 -
1021      ADIOI_Free(value);
1022 +    /* set the values for collective I/O and data sieving parameters */
1023 +    ADIOI_GEN_SetInfo(fd, users_info, error_code);
1024  
1025 -    *error_code = MPI_SUCCESS;
1026 +    if (ADIOI_Direct_read) fd->direct_read = 1;
1027 +    if (ADIOI_Direct_write) fd->direct_write = 1;
1028  }
1029 diff -ruN ad_lustre_orig/ad_lustre_open.c ad_lustre/ad_lustre_open.c
1030 --- ad_lustre_orig/ad_lustre_open.c     2008-09-17 14:36:57.000000000 +0800
1031 +++ ad_lustre/ad_lustre_open.c  2008-09-17 18:55:50.000000000 +0800
1032 @@ -1,18 +1,21 @@
1033  /* -*- Mode: C; c-basic-offset:4 ; -*- */
1034 -/* 
1035 - *   Copyright (C) 1997 University of Chicago. 
1036 +/*
1037 + *   Copyright (C) 1997 University of Chicago.
1038   *   See COPYRIGHT notice in top-level directory.
1039   *
1040   *   Copyright (C) 2007 Oak Ridge National Laboratory
1041 + *
1042 + *   Copyright (C) 2008 Sun Microsystems, Lustre group
1043   */
1044  
1045  #include "ad_lustre.h"
1046  
1047  void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code)
1048  {
1049 -    int perm, old_mask, amode, amode_direct;
1050 +    int perm, old_mask, amode = 0, amode_direct = 0, flag = 0, err, myrank;
1051 +    int stripe_size = 0, stripe_count = 0, stripe_offset = -1;
1052      struct lov_user_md lum = { 0 };
1053 -    char *value;
1054 +    char *value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
1055  
1056  #if defined(MPICH2) || !defined(PRINT_ERR_MSG)
1057      static char myname[] = "ADIOI_LUSTRE_OPEN";
1058 @@ -22,12 +25,57 @@
1059         old_mask = umask(022);
1060         umask(old_mask);
1061         perm = old_mask ^ 0666;
1062 -    }
1063 -    else perm = fd->perm;
1064 +    } else
1065 +       perm = fd->perm;
1066  
1067 -    amode = 0;
1068 -    if (fd->access_mode & ADIO_CREATE)
1069 +    if (fd->access_mode & ADIO_CREATE) {
1070         amode = amode | O_CREAT;
1071 +        /* Check striping info
1072 +         * if already set by SetInfo(), copy it into lum; otherwise it is read from lum
1073 +         */
1074 +        MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value,
1075 +                    &flag);
1076 +        if (flag)
1077 +           stripe_size = atoi(value);
1078 +
1079 +        MPI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, value,
1080 +                    &flag);
1081 +        if (flag)
1082 +           stripe_count = atoi(value);
1083 +
1084 +        MPI_Info_get(fd->info, "start_iodevice", MPI_MAX_INFO_VAL, value,
1085 +                    &flag);
1086 +        if (flag)
1087 +           stripe_offset = atoi(value);
1088 +
1089 +        /* if user has specified striping info,
1090 +         * process 0 will try to check and set it.
1091 +         */
1092 +        if ((stripe_size > 0) || (stripe_count > 0) || (stripe_offset >= 0)) {
1093 +           MPI_Comm_rank(fd->comm, &myrank);
1094 +           if (myrank == 0) {
1095 +               int fd_sys = open(fd->filename, amode, perm);
1096 +               if (fd_sys == -1) {
1097 +                   if (errno != EEXIST)
1098 +                       FPRINTF(stderr, "Failure to open file %s %d %d\n",
1099 +                               strerror(errno), amode, perm);
1100 +               } else {
1101 +                   lum.lmm_magic = LOV_USER_MAGIC;
1102 +                   lum.lmm_pattern = 1;
1103 +                   lum.lmm_stripe_size = stripe_size;
1104 +                   lum.lmm_stripe_count = stripe_count;
1105 +                   lum.lmm_stripe_offset = stripe_offset;
1106 +
1107 +                   if (ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum))
1108 +                       FPRINTF(stderr,
1109 +                               "Failure to set striping info to Lustre!\n");
1110 +                   close(fd_sys);
1111 +               }
1112 +           }
1113 +           MPI_Barrier(fd->comm);
1114 +        }
1115 +    }
1116 +
1117      if (fd->access_mode & ADIO_RDONLY)
1118         amode = amode | O_RDONLY;
1119      if (fd->access_mode & ADIO_WRONLY)
1120 @@ -42,32 +90,36 @@
1121      fd->fd_sys = open(fd->filename, amode|O_CREAT, perm);
1122  
1123      if (fd->fd_sys != -1) {
1124 -        int err;
1125 -
1126 -        value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
1127 -
1128          /* get file striping information and set it in info */
1129 -        lum.lmm_magic = LOV_USER_MAGIC;
1130 -        err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum);
1131 -
1132 -        if (!err) {
1133 -            sprintf(value, "%d", lum.lmm_stripe_size);
1134 -            MPI_Info_set(fd->info, "striping_unit", value);
1135 -
1136 -            sprintf(value, "%d", lum.lmm_stripe_count);
1137 -            MPI_Info_set(fd->info, "striping_factor", value);
1138 -
1139 -            sprintf(value, "%d", lum.lmm_stripe_offset);
1140 -            MPI_Info_set(fd->info, "start_iodevice", value);
1141 -        }
1142 -        ADIOI_Free(value);
1143 +       lum.lmm_magic = LOV_USER_MAGIC;
1144 +       err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum);
1145  
1146 +       if (!err) {
1147 +           if (lum.lmm_stripe_size && lum.lmm_stripe_count &&
1148 +                (lum.lmm_stripe_offset >= 0)) {
1149 +               sprintf(value, "%d", lum.lmm_stripe_size);
1150 +               MPI_Info_set(fd->info, "striping_unit", value);
1151 +
1152 +               sprintf(value, "%d", lum.lmm_stripe_count);
1153 +               MPI_Info_set(fd->info, "striping_factor", value);
1154 +
1155 +               sprintf(value, "%d", lum.lmm_stripe_offset);
1156 +               MPI_Info_set(fd->info, "start_iodevice", value);
1157 +           } else {
1158 +               FPRINTF(stderr, "Striping info is invalid!\n");
1159 +               ADIOI_Free(value);
1160 +               MPI_Abort(MPI_COMM_WORLD, 1);
1161 +           }
1162 +       } else {
1163 +           FPRINTF(stderr, "Failed to get striping info from Lustre!\n");
1164 +            ADIOI_Free(value);
1165 +           MPI_Abort(MPI_COMM_WORLD, 1);
1166 +       }
1167          if (fd->access_mode & ADIO_APPEND)
1168              fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
1169 -    } 
1170 -
1171 +    }
1172      if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
1173 -       fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
1174 +        fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
1175  
1176      fd->fd_direct = -1;
1177      if (fd->direct_write || fd->direct_read) {
1178 @@ -81,20 +133,22 @@
1179      }
1180  
1181      /* --BEGIN ERROR HANDLING-- */
1182 -    if (fd->fd_sys == -1 || ((fd->fd_direct == -1) && 
1183 -               (fd->direct_write || fd->direct_read))) {
1184 +    if (fd->fd_sys == -1 || ((fd->fd_direct == -1) &&
1185 +       (fd->direct_write || fd->direct_read))) {
1186         if (errno == ENAMETOOLONG)
1187             *error_code = MPIO_Err_create_code(MPI_SUCCESS,
1188 -                                              MPIR_ERR_RECOVERABLE, myname,
1189 -                                              __LINE__, MPI_ERR_BAD_FILE,
1190 +                                              MPIR_ERR_RECOVERABLE,
1191 +                                              myname, __LINE__,
1192 +                                              MPI_ERR_BAD_FILE,
1193                                                "**filenamelong",
1194                                                "**filenamelong %s %d",
1195                                                fd->filename,
1196                                                strlen(fd->filename));
1197         else if (errno == ENOENT)
1198             *error_code = MPIO_Err_create_code(MPI_SUCCESS,
1199 -                                              MPIR_ERR_RECOVERABLE, myname,
1200 -                                              __LINE__, MPI_ERR_NO_SUCH_FILE,
1201 +                                              MPIR_ERR_RECOVERABLE,
1202 +                                              myname, __LINE__,
1203 +                                              MPI_ERR_NO_SUCH_FILE,
1204                                                "**filenoexist",
1205                                                "**filenoexist %s",
1206                                                fd->filename);
1207 @@ -108,27 +162,30 @@
1208                                                fd->filename);
1209         else if (errno == EACCES) {
1210             *error_code = MPIO_Err_create_code(MPI_SUCCESS,
1211 -                                              MPIR_ERR_RECOVERABLE, myname,
1212 -                                              __LINE__, MPI_ERR_ACCESS,
1213 +                                              MPIR_ERR_RECOVERABLE,
1214 +                                              myname, __LINE__,
1215 +                                              MPI_ERR_ACCESS,
1216                                                "**fileaccess",
1217 -                                              "**fileaccess %s", 
1218 -                                              fd->filename );
1219 -       }
1220 -       else if (errno == EROFS) {
1221 +                                              "**fileaccess %s",
1222 +                                              fd->filename);
1223 +       } else if (errno == EROFS) {
1224             /* Read only file or file system and write access requested */
1225             *error_code = MPIO_Err_create_code(MPI_SUCCESS,
1226 -                                              MPIR_ERR_RECOVERABLE, myname,
1227 -                                              __LINE__, MPI_ERR_READ_ONLY,
1228 -                                              "**ioneedrd", 0 );
1229 -       }
1230 -       else {
1231 +                                              MPIR_ERR_RECOVERABLE,
1232 +                                              myname, __LINE__,
1233 +                                              MPI_ERR_READ_ONLY,
1234 +                                              "**ioneedrd", 0);
1235 +       } else {
1236             *error_code = MPIO_Err_create_code(MPI_SUCCESS,
1237 -                                              MPIR_ERR_RECOVERABLE, myname,
1238 -                                              __LINE__, MPI_ERR_IO, "**io",
1239 +                                              MPIR_ERR_RECOVERABLE,
1240 +                                              myname, __LINE__,
1241 +                                              MPI_ERR_IO, "**io",
1242                                                "**io %s", strerror(errno));
1243         }
1244 -    }
1245 +    } else {
1246      /* --END ERROR HANDLING-- */
1247 -    else *error_code = MPI_SUCCESS;
1248 +        *error_code = MPI_SUCCESS;
1249 +    }
1250  
1251 +    ADIOI_Free(value);
1252  }
1253 diff -ruN ad_lustre_orig/ad_lustre_rwcontig.c ad_lustre/ad_lustre_rwcontig.c
1254 --- ad_lustre_orig/ad_lustre_rwcontig.c 2008-09-17 14:36:57.000000000 +0800
1255 +++ ad_lustre/ad_lustre_rwcontig.c      2008-10-15 22:44:35.000000000 +0800
1256 @@ -1,9 +1,11 @@
1257  /* -*- Mode: C; c-basic-offset:4 ; -*- */
1258 -/* 
1259 - *   Copyright (C) 1997 University of Chicago. 
1260 +/*
1261 + *   Copyright (C) 1997 University of Chicago.
1262   *   See COPYRIGHT notice in top-level directory.
1263   *
1264   *   Copyright (C) 2007 Oak Ridge National Laboratory
1265 + *
1266 + *   Copyright (C) 2008 Sun Microsystems, Lustre group
1267   */
1268  
1269  #define _XOPEN_SOURCE 600
1270 diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
1271 --- ad_lustre_orig/ad_lustre_wrcoll.c   1970-01-01 08:00:00.000000000 +0800
1272 +++ ad_lustre/ad_lustre_wrcoll.c        2008-10-15 22:02:53.000000000 +0800
1273 @@ -0,0 +1,883 @@
1274 +/* -*- Mode: C; c-basic-offset:4 ; -*- */
1275 +/*
1276 + *   Copyright (C) 1997 University of Chicago.
1277 + *   See COPYRIGHT notice in top-level directory.
1278 + *
1279 + *   Copyright (C) 2007 Oak Ridge National Laboratory
1280 + *
1281 + *   Copyright (C) 2008 Sun Microsystems, Lustre group
1282 + */
1283 +
1284 +#include "ad_lustre.h"
1285 +#include "adio_extern.h"
1286 +
1287 +/* prototypes of functions used for collective writes only. */
1288 +static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf,
1289 +                                       MPI_Datatype datatype, int nprocs,
1290 +                                       int myrank,
1291 +                                       ADIOI_Access *others_req,
1292 +                                       ADIOI_Access *my_req,
1293 +                                       ADIO_Offset *offset_list,
1294 +                                       int *len_list,
1295 +                                       int contig_access_count,
1296 +                                       int * striping_info,
1297 +                                       int *buf_idx, int *error_code);
1298 +static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf,
1299 +                                         ADIOI_Flatlist_node * flat_buf,
1300 +                                         char **send_buf,
1301 +                                         ADIO_Offset * offset_list,
1302 +                                         int *len_list, int *send_size,
1303 +                                         MPI_Request * requests,
1304 +                                         int *sent_to_proc, int nprocs,
1305 +                                         int myrank, int contig_access_count,
1306 +                                         int * striping_info,
1307 +                                         int *send_buf_idx,
1308 +                                          int *curr_to_proc,
1309 +                                         int *done_to_proc, int iter,
1310 +                                         MPI_Aint buftype_extent);
1311 +static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
1312 +                                        char *write_buf,
1313 +                                        ADIOI_Flatlist_node * flat_buf,
1314 +                                        ADIO_Offset * offset_list,
1315 +                                        int *len_list, int *send_size,
1316 +                                        int *recv_size, ADIO_Offset off,
1317 +                                        int size, int *count,
1318 +                                        int *start_pos, int *partial_recv,
1319 +                                        int *sent_to_proc, int nprocs,
1320 +                                        int myrank, int buftype_is_contig,
1321 +                                        int contig_access_count,
1322 +                                        int * striping_info,
1323 +                                        ADIOI_Access * others_req,
1324 +                                        int *send_buf_idx,
1325 +                                        int *curr_to_proc,
1326 +                                        int *done_to_proc, int *hole,
1327 +                                        int iter, MPI_Aint buftype_extent,
1328 +                                        int *buf_idx, int *error_code);
1329 +void ADIOI_Heap_merge(ADIOI_Access * others_req, int *count,
1330 +                      ADIO_Offset * srt_off, int *srt_len, int *start_pos,
1331 +                      int nprocs, int nprocs_recv, int total_elements);
1332 +
1333 +void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count,
1334 +                                  MPI_Datatype datatype,
1335 +                                  int file_ptr_type, ADIO_Offset offset,
1336 +                                  ADIO_Status * status, int *error_code)
1337 +{
1338 +    ADIOI_Access *my_req;
1339 +    /* array of nprocs access structures, one for each other process that has
1340 +       this process's request */
1341 +
1342 +    ADIOI_Access *others_req;
1343 +    /* array of nprocs access structures, one for each other process
1344 +       whose request is written by this process. */
1345 +
1346 +    int i, filetype_is_contig, nprocs, myrank, do_collect = 0;
1347 +    int contig_access_count = 0, buftype_is_contig, interleave_count = 0;
1348 +    int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs;
1349 +    ADIO_Offset orig_fp, start_offset, end_offset, off, min_st_offset;
1350 +    ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *end_offsets = NULL;
1351 +    int *buf_idx = NULL, *len_list = NULL, *striping_info = NULL;
1352 +    int old_error, tmp_error;
1353 +
1354 +    MPI_Comm_size(fd->comm, &nprocs);
1355 +    MPI_Comm_rank(fd->comm, &myrank);
1356 +
1357 +    orig_fp = fd->fp_ind;
1358 +
1359 +    /* I/O pattern identification if cb_write isn't disabled */
1360 +    if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
1361 +       /* For this process's request, calculate the list of offsets and
1362 +          lengths in the file and determine the start and end offsets. */
1363 +       ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
1364 +                             &offset_list, &len_list, &start_offset,
1365 +                             &end_offset, &contig_access_count);
1366 +
1367 +       /* each process communicates its start and end offsets to other
1368 +          processes. The result is an array each of start and end offsets stored
1369 +          in order of process rank. */
1370 +       st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
1371 +       end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
1372 +       MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1,
1373 +                     ADIO_OFFSET, fd->comm);
1374 +       MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1,
1375 +                     ADIO_OFFSET, fd->comm);
1376 +       /* are the accesses of different processes interleaved? */
1377 +       for (i = 1; i < nprocs; i++)
1378 +           if ((st_offsets[i] < end_offsets[i-1]) &&
1379 +                (st_offsets[i] <= end_offsets[i]))
1380 +                interleave_count++;
1381 +       /* This is a rudimentary check for interleaving, but should suffice
1382 +          for the moment. */
1383 +
1384 +       /* Two typical access patterns can benefit from collective write.
1385 +         *   1) the processes are interleaved, and
1386 +         *   2) the req size is small.
1387 +         */
1388 +        if (interleave_count > 0) {
1389 +           do_collect = 1;
1390 +        } else {
1391 +            do_collect = ADIOI_LUSTRE_Docollect(fd, contig_access_count,
1392 +                                               len_list, nprocs);
1393 +        }
1394 +    }
1395 +    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
1396 +
1397 +    /* Decide if collective I/O should be done */
1398 +    if ((!do_collect && fd->hints->cb_write == ADIOI_HINT_AUTO) ||
1399 +        fd->hints->cb_write == ADIOI_HINT_DISABLE) {
1400 +
1401 +       int filerange_is_contig = 0;
1402 +
1403 +       /* use independent accesses */
1404 +       if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
1405 +           ADIOI_Free(offset_list);
1406 +           ADIOI_Free(len_list);
1407 +            ADIOI_Free(st_offsets);
1408 +            ADIOI_Free(end_offsets);
1409 +       }
1410 +
1411 +       fd->fp_ind = orig_fp;
1412 +       ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
1413 +       if (buftype_is_contig && filetype_is_contig) {
1414 +           if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
1415 +               off = fd->disp + (fd->etype_size) * offset;
1416 +               ADIO_WriteContig(fd, buf, count, datatype,
1417 +                                ADIO_EXPLICIT_OFFSET,
1418 +                                off, status, error_code);
1419 +           } else
1420 +               ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
1421 +                                0, status, error_code);
1422 +       } else {
1423 +           ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type,
1424 +                             offset, status, error_code);
1425 +       }
1426 +       return;
1427 +    }
1428 +
1429 +    /* Get Lustre hints information */
1430 +    ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1, nprocs,
1431 +                                   st_offsets, end_offsets,
1432 +                                   &min_st_offset);
1433 +    /* calculate what portions of the access requests of this process are
1434 +     * located in which process
1435 +     */
1436 +    ADIOI_LUSTRE_Calc_my_req(fd, offset_list, len_list, contig_access_count,
1437 +                             striping_info, nprocs, &count_my_req_procs,
1438 +                             &count_my_req_per_proc, &my_req, &buf_idx);
1439 +    /* calculate what process's requests will be written by this process */
1440 +    ADIOI_LUSTRE_Calc_others_req(fd, count_my_req_procs,
1441 +                                 count_my_req_per_proc,
1442 +                                my_req, nprocs, myrank,
1443 +                                 end_offset - start_offset + 1,
1444 +                                 min_st_offset, striping_info,
1445 +                                 &count_others_req_procs, &others_req);
1446 +    ADIOI_Free(count_my_req_per_proc);
1447 +
1448 +    /* exchange data and write in sizes of no more than stripe_size. */
1449 +    ADIOI_LUSTRE_Exch_and_write(fd, buf, datatype, nprocs, myrank,
1450 +                                others_req, my_req,
1451 +                                offset_list, len_list, contig_access_count,
1452 +                               striping_info, buf_idx, error_code);
1453 +
1454 +    old_error = *error_code;
1455 +    if (*error_code != MPI_SUCCESS)
1456 +       *error_code = MPI_ERR_IO;
1457 +
1458 +    /* optimization: if only one process performing i/o, we can perform
1459 +     * a less-expensive Bcast  */
1460 +#ifdef ADIOI_MPE_LOGGING
1461 +    MPE_Log_event(ADIOI_MPE_postwrite_a, 0, NULL);
1462 +#endif
1463 +    if (fd->hints->cb_nodes == 1)
1464 +       MPI_Bcast(error_code, 1, MPI_INT,
1465 +                 fd->hints->ranklist[0], fd->comm);
1466 +    else {
1467 +       tmp_error = *error_code;
1468 +       MPI_Allreduce(&tmp_error, error_code, 1, MPI_INT,
1469 +                     MPI_MAX, fd->comm);
1470 +    }
1471 +#ifdef ADIOI_MPE_LOGGING
1472 +    MPE_Log_event(ADIOI_MPE_postwrite_b, 0, NULL);
1473 +#endif
1474 +
1475 +    if ((old_error != MPI_SUCCESS) && (old_error != MPI_ERR_IO))
1476 +       *error_code = old_error;
1477 +
1478 +
1479 +    if (!buftype_is_contig)
1480 +       ADIOI_Delete_flattened(datatype);
1481 +
1482 +    /* free all memory allocated for collective I/O */
1483 +    /* free others_req */
1484 +    for (i = 0; i < nprocs; i++) {
1485 +       if (others_req[i].count) {
1486 +           ADIOI_Free(others_req[i].offsets);
1487 +           ADIOI_Free(others_req[i].lens);
1488 +           ADIOI_Free(others_req[i].mem_ptrs);
1489 +       }
1490 +    }
1491 +    ADIOI_Free(others_req);
1492 +    /* free my_req here */
1493 +    for (i = 0; i < nprocs; i++) {
1494 +       if (my_req[i].count) {
1495 +           ADIOI_Free(my_req[i].offsets);
1496 +           ADIOI_Free(my_req[i].lens);
1497 +       }
1498 +    }
1499 +    ADIOI_Free(my_req);
1500 +    ADIOI_Free(buf_idx);
1501 +    ADIOI_Free(offset_list);
1502 +    ADIOI_Free(len_list);
1503 +    ADIOI_Free(st_offsets);
1504 +    ADIOI_Free(end_offsets);
1505 +    ADIOI_Free(striping_info);
1506 +
1507 +#ifdef HAVE_STATUS_SET_BYTES
1508 +    if (status) {
1509 +       int bufsize, size;
1510 +       /* Don't set status if it isn't needed */
1511 +       MPI_Type_size(datatype, &size);
1512 +       bufsize = size * count;
1513 +       MPIR_Status_set_bytes(status, datatype, bufsize);
1514 +    }
1515 +    /* This is a temporary way of filling in status. The right way is to
1516 +     * keep track of how much data was actually written during collective I/O.
1517 +     */
1518 +#endif
1519 +
1520 +    fd->fp_sys_posn = -1;      /* set it to null. */
1521 +}
1522 +/* Run max_ntimes exchange rounds: each round gathers the pieces of others_req that fall in the current step_size window into write_buf and writes them; *error_code is MPI_SUCCESS unless a round fails. */
1523 +static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf,
1524 +                                       MPI_Datatype datatype, int nprocs,
1525 +                                       int myrank, ADIOI_Access *others_req,
1526 +                                        ADIOI_Access *my_req,
1527 +                                       ADIO_Offset *offset_list,
1528 +                                        int *len_list, int contig_access_count,
1529 +                                       int *striping_info, int *buf_idx,
1530 +                                        int *error_code)
1531 +{
1532 +    int hole, i, j, m, flag, ntimes = 1 , max_ntimes, buftype_is_contig;
1533 +    ADIO_Offset st_loc = -1, end_loc = -1, min_st_loc, max_end_loc;
1534 +    ADIO_Offset off, req_off, send_off, iter_st_off, *off_list;
1535 +    ADIO_Offset max_size, step_size = 0;
1536 +    int real_size, req_len, send_len;
1537 +    int *recv_curr_offlen_ptr, *recv_count, *recv_size;
1538 +    int *send_curr_offlen_ptr, *send_size;
1539 +    int *partial_recv = NULL, *sent_to_proc, *recv_start_pos; /* partial_recv is only passed through, never allocated here */
1540 +    int *send_buf_idx, *curr_to_proc, *done_to_proc;
1541 +    char *write_buf = NULL, *value;
1542 +    MPI_Status status;
1543 +    ADIOI_Flatlist_node *flat_buf = NULL;
1544 +    MPI_Aint buftype_extent;
1545 +    int stripe_size = striping_info[0], avail_cb_nodes = striping_info[2];
1546 +    int lflag, data_sieving = 0;
1547 +
1548 +    *error_code = MPI_SUCCESS; /* changed below if error */
1549 +
1550 +    /* calculate the number of writes of stripe size to be done.
1551 +     * That gives the no. of communication phases as well.
1552 +     * Note:
1553 +     *   Because we redistribute data in stripe-contiguous pattern for Lustre,
1554 +     *   each process has the same no. of communication phases.
1555 +     */
1556 +    /* Offsets are reduced with ADIO_OFFSET so the MPI type always matches ADIO_Offset. */
1557 +    for (i = 0; i < nprocs; i++) {
1558 +       if (others_req[i].count) {
1559 +           st_loc = others_req[i].offsets[0];
1560 +           end_loc = others_req[i].offsets[0];
1561 +           break;
1562 +       }
1563 +    }
1564 +    for (i = 0; i < nprocs; i++) {
1565 +       for (j = 0; j < others_req[i].count; j++) {
1566 +           st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]);
1567 +           end_loc = ADIOI_MAX(end_loc, (others_req[i].offsets[j] +
1568 +                                          others_req[i].lens[j] - 1));
1569 +       }
1570 +    }
1571 +    /* this process does no writing. */
1572 +    if ((st_loc == -1) && (end_loc == -1))
1573 +       ntimes = 0;
1574 +    MPI_Allreduce(&end_loc, &max_end_loc, 1, ADIO_OFFSET, MPI_MAX, fd->comm);
1575 +    /* keep a process with no requests from forcing min_st_loc to -1 */
1576 +    if (st_loc == -1)
1577 +        st_loc = max_end_loc;
1578 +    MPI_Allreduce(&st_loc, &min_st_loc, 1, ADIO_OFFSET, MPI_MIN, fd->comm);
1579 +    /* align downward */
1580 +    min_st_loc -= min_st_loc % (ADIO_Offset)stripe_size;
1581 +
1582 +    /* Each time, only avail_cb_nodes number of IO clients perform IO,
1583 +     * so, step_size=avail_cb_nodes*stripe_size IO will be performed at most,
1584 +     * and ntimes=whole_file_portion/step_size
1585 +     */
1586 +    step_size = (ADIO_Offset) avail_cb_nodes * stripe_size;
1587 +    max_ntimes = (int)((max_end_loc - min_st_loc) / step_size + 1);
1588 +    if (ntimes)
1589 +       write_buf = (char *) ADIOI_Malloc(stripe_size);
1590 +
1591 +    /* calculate the start offset for each iteration */
1592 +    off_list = (ADIO_Offset *) ADIOI_Malloc(max_ntimes * sizeof(ADIO_Offset));
1593 +    for (m = 0; m < max_ntimes; m ++)
1594 +        off_list[m] = max_end_loc;
1595 +    for (i = 0; i < nprocs; i++) {
1596 +        for (j = 0; j < others_req[i].count; j ++) {
1597 +            req_off = others_req[i].offsets[j];
1598 +            m = (int)((req_off - min_st_loc) / step_size);
1599 +            off_list[m] = ADIOI_MIN(off_list[m], req_off);
1600 +        }
1601 +    }
1602 +
1603 +    recv_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
1604 +    send_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
1605 +    /* their use is explained below. calloc initializes to 0. */
1606 +
1607 +    recv_count = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1608 +    /* to store count of how many off-len pairs per proc are satisfied
1609 +       in an iteration. */
1610 +
1611 +    send_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1612 +    /* total size of data to be sent to each proc. in an iteration.
1613 +       Of size nprocs so that I can use MPI_Alltoall later. */
1614 +
1615 +    recv_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1616 +    /* total size of data to be recd. from each proc. in an iteration. */
1617 +
1618 +    sent_to_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int));
1619 +    /* amount of data sent to each proc so far. Used in
1620 +       ADIOI_Fill_send_buffer. initialized to 0 here. */
1621 +
1622 +    send_buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1623 +    curr_to_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1624 +    done_to_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1625 +    /* Above three are used in ADIOI_Fill_send_buffer */
1626 +
1627 +    recv_start_pos = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1628 +    /* used to store the starting value of recv_curr_offlen_ptr[i] in
1629 +       this iteration */
1630 +
1631 +    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
1632 +    if (!buftype_is_contig) {
1633 +       ADIOI_Flatten_datatype(datatype);
1634 +       flat_buf = ADIOI_Flatlist;
1635 +       while (flat_buf->type != datatype)
1636 +           flat_buf = flat_buf->next;
1637 +    }
1638 +    MPI_Type_extent(datatype, &buftype_extent);
1639 +
1640 +    iter_st_off = min_st_loc;
1641 +
1642 +    /* Although we have recognized the data according to OST index,
1643 +     * a read-modify-write will be done if there is a hole between the data.
1644 +     * For example: if blocksize=60, xfersize=30 and stripe_size=100,
1645 +     * then rank0 will collect data [0, 30] and [60, 90] then write. There
1646 +     * is a hole in [30, 60], which will cause a read-modify-write in [0, 90].
1647 +     *
1648 +     * To reduce its impact on the performance, we disable data sieving
1649 +     * by default, unless the hint "ds_in_coll" is enabled.
1650 +     */
1651 +    /* check the hint for data sieving */
1652 +    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
1653 +    MPI_Info_get(fd->info, "ds_in_coll", MPI_MAX_INFO_VAL, value, &lflag);
1654 +    if (lflag && !strcmp(value, "enable"))
1655 +        data_sieving = 1;
1656 +    ADIOI_Free(value);
1657 +
1658 +    for (m = 0; m < max_ntimes; m++) {
1659 +       /* go through all others_req and my_req to check which will be received
1660 +         * and sent in this iteration.
1661 +         */
1662 +
1663 +       /* Note that MPI guarantees that displacements in filetypes are in
1664 +          monotonically nondecreasing order and that, for writes, the
1665 +          filetypes cannot specify overlapping regions in the file. This
1666 +          simplifies implementation a bit compared to reads. */
1667 +
1668 +       /*
1669 +           off         = start offset in the file for the data to be written in
1670 +                         this iteration
1671 +           iter_st_off = start offset of this iteration
1672 +           real_size   = size of data written (bytes) corresponding to off
1673 +           max_size    = possible maximum size of data written in this iteration
1674 +           req_off     = offset in the file for a particular contiguous request minus
1675 +                         what was satisfied in previous iteration
1676 +           send_off    = offset the request needed by other processes in this iteration
1677 +           req_len     = size corresponding to req_off
1678 +           send_len    = size corresponding to send_off
1679 +         */
1680 +
1681 +       /* first calculate what should be communicated */
1682 +       for (i = 0; i < nprocs; i++)
1683 +           recv_count[i] = recv_size[i] = send_size[i] = 0;
1684 +
1685 +        off = off_list[m];
1686 +        max_size = ADIOI_MIN(step_size, max_end_loc - iter_st_off + 1);
1687 +        real_size = (int) ADIOI_MIN((off / stripe_size + 1) * stripe_size - off,
1688 +                                    end_loc - off + 1);
1689 +
1690 +       for (i = 0; i < nprocs; i++) {
1691 +            if (my_req[i].count) {
1692 +                for (j = send_curr_offlen_ptr[i]; j < my_req[i].count; j++) {
1693 +                    send_off = my_req[i].offsets[j];
1694 +                    send_len = my_req[i].lens[j];
1695 +                    if (send_off < iter_st_off + max_size) {
1696 +                        send_size[i] += send_len;
1697 +                    } else {
1698 +                        break;
1699 +                    }
1700 +                }
1701 +                send_curr_offlen_ptr[i] = j;
1702 +            }
1703 +           if (others_req[i].count) {
1704 +               recv_start_pos[i] = recv_curr_offlen_ptr[i];
1705 +               for (j = recv_curr_offlen_ptr[i]; j < others_req[i].count; j++) {
1706 +                    req_off = others_req[i].offsets[j];
1707 +                    req_len = others_req[i].lens[j];
1708 +                   if (req_off < iter_st_off + max_size) {
1709 +                       recv_count[i]++;
1710 +                       MPI_Address(write_buf + req_off - off,
1711 +                                   &(others_req[i].mem_ptrs[j]));
1712 +                        recv_size[i] += req_len;
1713 +                   } else {
1714 +                       break;
1715 +                    }
1716 +               }
1717 +               recv_curr_offlen_ptr[i] = j;
1718 +           }
1719 +       }
1720 +        /* use variable "hole" to pass data_sieving flag into W_Exchange_data */
1721 +        hole = data_sieving;
1722 +       ADIOI_LUSTRE_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
1723 +                                     len_list, send_size, recv_size, off, real_size,
1724 +                                     recv_count, recv_start_pos, partial_recv,
1725 +                                     sent_to_proc, nprocs, myrank,
1726 +                                     buftype_is_contig, contig_access_count,
1727 +                                     striping_info, others_req, send_buf_idx,
1728 +                                     curr_to_proc, done_to_proc, &hole, m,
1729 +                                     buftype_extent, buf_idx, error_code);
1730 +       if (*error_code != MPI_SUCCESS)
1731 +            goto over;
1732 +
1733 +       flag = 0;
1734 +       for (i = 0; i < nprocs; i++)
1735 +           if (recv_count[i]) {
1736 +               flag = 1;
1737 +               break;
1738 +           }
1739 +       if (flag) {
1740 +            /* check whether to do data sieving */
1741 +            if(data_sieving) {
1742 +               ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
1743 +                                ADIO_EXPLICIT_OFFSET, off, &status,
1744 +                                error_code);
1745 +            } else {
1746 +                /* if there is no hole, write data in one time;
1747 +                 * otherwise, write data in several times */
1748 +                if (!hole) {
1749 +                    ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
1750 +                                     ADIO_EXPLICIT_OFFSET, off, &status,
1751 +                                     error_code);
1752 +                } else {
1753 +                    for (i = 0; i < nprocs; i++) {
1754 +                        if (others_req[i].count) {
1755 +                            for (j = 0; j < others_req[i].count; j++) {
1756 +                                if (others_req[i].offsets[j] < off + real_size &&
1757 +                                    others_req[i].offsets[j] >= off) {
1758 +                                    ADIO_WriteContig(fd,
1759 +                                                     write_buf + others_req[i].offsets[j] - off,
1760 +                                                     others_req[i].lens[j],
1761 +                                                     MPI_BYTE, ADIO_EXPLICIT_OFFSET,
1762 +                                                     others_req[i].offsets[j], &status,
1763 +                                                     error_code);
1764 +                                   if (*error_code != MPI_SUCCESS)
1765 +                                       goto over;
1766 +                                }
1767 +                            }
1768 +                        }
1769 +                    }
1770 +                }
1771 +            }
1772 +           if (*error_code != MPI_SUCCESS)
1773 +               goto over;
1774 +       }
1775 +        iter_st_off += max_size;
1776 +    }
1777 +over:
1778 +    if (ntimes)
1779 +       ADIOI_Free(write_buf);
1780 +    ADIOI_Free(recv_curr_offlen_ptr);
1781 +    ADIOI_Free(send_curr_offlen_ptr);
1782 +    ADIOI_Free(recv_count);
1783 +    ADIOI_Free(send_size);
1784 +    ADIOI_Free(recv_size);
1785 +    ADIOI_Free(sent_to_proc);
1786 +    ADIOI_Free(recv_start_pos);
1787 +    ADIOI_Free(send_buf_idx);
1788 +    ADIOI_Free(curr_to_proc);
1789 +    ADIOI_Free(done_to_proc);
1790 +    ADIOI_Free(off_list);
1791 +}
1792 +/* One exchange round: receive this round's pieces into write_buf and send this process's data. *hole carries the data-sieving flag on entry and is set to 1 on return if the received pieces leave a gap. */
1793 +static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
1794 +                                        char *write_buf,
1795 +                                        ADIOI_Flatlist_node * flat_buf,
1796 +                                        ADIO_Offset * offset_list,
1797 +                                        int *len_list, int *send_size,
1798 +                                        int *recv_size, ADIO_Offset off,
1799 +                                        int size, int *count,
1800 +                                        int *start_pos, int *partial_recv,
1801 +                                        int *sent_to_proc, int nprocs,
1802 +                                        int myrank, int buftype_is_contig,
1803 +                                        int contig_access_count,
1804 +                                        int * striping_info,
1805 +                                        ADIOI_Access * others_req,
1806 +                                        int *send_buf_idx,
1807 +                                        int *curr_to_proc, int *done_to_proc,
1808 +                                         int *hole, int iter,
1809 +                                         MPI_Aint buftype_extent,
1810 +                                        int *buf_idx, int *error_code)
1811 +{
1812 +    int i, j, nprocs_recv, nprocs_send, err;
1813 +    char **send_buf = NULL;
1814 +    MPI_Request *requests, *send_req;
1815 +    MPI_Datatype *recv_types;
1816 +    MPI_Status *statuses, status;
1817 +    int *srt_len, sum, sum_recv;
1818 +    ADIO_Offset *srt_off;
1819 +    int data_sieving = *hole;  /* read before *hole is reused as an output below */
1820 +    static char myname[] = "ADIOI_W_EXCHANGE_DATA";
1821 +
1822 +    /* create derived datatypes for recv */
1823 +    nprocs_recv = 0;
1824 +    for (i = 0; i < nprocs; i++)
1825 +       if (recv_size[i])
1826 +           nprocs_recv++;
1827 +
1828 +    recv_types = (MPI_Datatype *) ADIOI_Malloc((nprocs_recv + 1) *
1829 +                                              sizeof(MPI_Datatype));
1830 +    /* +1 to avoid a 0-size malloc */
1831 +
1832 +    j = 0;
1833 +    for (i = 0; i < nprocs; i++) {
1834 +       if (recv_size[i]) {
1835 +           MPI_Type_hindexed(count[i],
1836 +                             &(others_req[i].lens[start_pos[i]]),
1837 +                             &(others_req[i].mem_ptrs[start_pos[i]]),
1838 +                             MPI_BYTE, recv_types + j);
1839 +           /* absolute displacements; use MPI_BOTTOM in recv */
1840 +           MPI_Type_commit(recv_types + j);
1841 +           j++;
1842 +       }
1843 +    }
1844 +
1845 +    /* To avoid a read-modify-write,
1846 +     * check if there are holes in the data to be written.
1847 +     * For this, merge the (sorted) offset lists others_req using a heap-merge.
1848 +     */
1849 +
1850 +    sum = 0;
1851 +    for (i = 0; i < nprocs; i++)
1852 +       sum += count[i];
1853 +    srt_off = (ADIO_Offset *) ADIOI_Malloc((sum + 1) * sizeof(ADIO_Offset));
1854 +    srt_len = (int *) ADIOI_Malloc((sum + 1) * sizeof(int));
1855 +    /* +1 to avoid a 0-size malloc */
1856 +
1857 +    ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos,
1858 +                    nprocs, nprocs_recv, sum);
1859 +
1860 +    /* check if there are any holes */
1861 +    *hole = 0;
1862 +    for (i = 0; i < sum - 1; i++) {
1863 +        if (srt_off[i] + srt_len[i] < srt_off[i + 1]) {
1864 +            *hole = 1;
1865 +           break;
1866 +       }
1867 +    }
1868 +    /* In some cases (see John Bent ROMIO REQ # 835), an odd interaction
1869 +     * between aggregation, nominally contiguous regions, and cb_buffer_size
1870 +     * should be handled with a read-modify-write (otherwise we will write out
1871 +     * more data than we receive from everyone else (inclusive), so override
1872 +     * hole detection
1873 +     */
1874 +    if (*hole == 0) {
1875 +        sum_recv = 0;
1876 +        for (i = 0; i < nprocs; i++)
1877 +            sum_recv += recv_size[i];
1878 +       if (size > sum_recv)
1879 +           *hole = 1;
1880 +    }
1881 +    /* check the hint for data sieving */
1882 +    if (data_sieving && nprocs_recv && *hole) {
1883 +        ADIO_ReadContig(fd, write_buf, size, MPI_BYTE,
1884 +                        ADIO_EXPLICIT_OFFSET, off, &status, &err);
1885 +        /* --BEGIN ERROR HANDLING-- */
1886 +        if (err != MPI_SUCCESS) {
1887 +            *error_code =
1888 +                MPIO_Err_create_code(err, MPIR_ERR_RECOVERABLE,
1889 +                                     myname, __LINE__, MPI_ERR_IO, "**ioRMWrdwr", 0);
1890 +            for (i = 0; i < nprocs_recv; i++)  /* committed above, never used */
1891 +                MPI_Type_free(recv_types + i);
1892 +            ADIOI_Free(recv_types);
1893 +            ADIOI_Free(srt_off);
1894 +            ADIOI_Free(srt_len);
1895 +            return;
1896 +        }
1897 +        /* --END ERROR HANDLING-- */
1898 +    }
1899 +    ADIOI_Free(srt_off);
1900 +    ADIOI_Free(srt_len);
1901 +
1902 +    nprocs_send = 0;
1903 +    for (i = 0; i < nprocs; i++)
1904 +       if (send_size[i])
1905 +           nprocs_send++;
1906 +
1907 +    if (fd->atomicity) {
1908 +       /* bug fix from Wei-keng Liao and Kenin Coloma */
1909 +       requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + 1) *
1910 +                                                sizeof(MPI_Request));
1911 +       send_req = requests;
1912 +    } else {
1913 +       requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1)*
1914 +                                                sizeof(MPI_Request));
1915 +       /* +1 to avoid a 0-size malloc */
1916 +
1917 +       /* post receives */
1918 +       j = 0;
1919 +       for (i = 0; i < nprocs; i++) {
1920 +           if (recv_size[i]) {
1921 +               MPI_Irecv(MPI_BOTTOM, 1, recv_types[j], i,
1922 +                         myrank + i + 100 * iter, fd->comm, requests + j);
1923 +               j++;
1924 +           }
1925 +       }
1926 +       send_req = requests + nprocs_recv;
1927 +    }
1928 +
1929 +    /* post sends.
1930 +     * if buftype_is_contig, data can be directly sent from
1931 +     * user buf at location given by buf_idx. else use send_buf.
1932 +     */
1933 +    if (buftype_is_contig) {
1934 +       j = 0;
1935 +       for (i = 0; i < nprocs; i++)
1936 +           if (send_size[i]) {
1937 +               MPI_Isend(((char *) buf) + buf_idx[i], send_size[i],
1938 +                         MPI_BYTE, i, myrank + i + 100 * iter, fd->comm,
1939 +                         send_req + j);
1940 +               j++;
1941 +               buf_idx[i] += send_size[i];
1942 +           }
1943 +    } else if (nprocs_send) {
1944 +       /* buftype is not contig */
1945 +       send_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char *));
1946 +       for (i = 0; i < nprocs; i++)
1947 +           if (send_size[i])
1948 +               send_buf[i] = (char *) ADIOI_Malloc(send_size[i]);
1949 +
1950 +       ADIOI_LUSTRE_Fill_send_buffer(fd, buf, flat_buf, send_buf, offset_list,
1951 +                                      len_list, send_size, send_req,
1952 +                                      sent_to_proc, nprocs, myrank,
1953 +                                      contig_access_count, striping_info,
1954 +                                      send_buf_idx, curr_to_proc, done_to_proc,
1955 +                                      iter, buftype_extent);
1956 +       /* the send is done in ADIOI_Fill_send_buffer */
1957 +    }
1958 +
1959 +       /* bug fix from Wei-keng Liao and Kenin Coloma */
1960 +    if (fd->atomicity) {
1961 +       j = 0;
1962 +       for (i = 0; i < nprocs; i++) {
1963 +           MPI_Status wkl_status;
1964 +           if (recv_size[i]) {
1965 +               MPI_Recv(MPI_BOTTOM, 1, recv_types[j], i,
1966 +                        myrank + i + 100 * iter, fd->comm, &wkl_status);
1967 +               j++;
1968 +           }
1969 +       }
1970 +    }
1971 +
1972 +    for (i = 0; i < nprocs_recv; i++)
1973 +       MPI_Type_free(recv_types + i);
1974 +    ADIOI_Free(recv_types);
1975 +
1976 +       /* bug fix from Wei-keng Liao and Kenin Coloma */
1977 +       /* +1 to avoid a 0-size malloc */
1978 +    if (fd->atomicity) {
1979 +       statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + 1) *
1980 +                                              sizeof(MPI_Status));
1981 +    } else {
1982 +       statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1) *
1983 +                                              sizeof(MPI_Status));
1984 +    }
1985 +
1986 +#ifdef NEEDS_MPI_TEST
1987 +    i = 0;
1988 +    if (fd->atomicity) {
1989 +       /* bug fix from Wei-keng Liao and Kenin Coloma */
1990 +       while (!i)
1991 +           MPI_Testall(nprocs_send, send_req, &i, statuses);
1992 +    } else {
1993 +       while (!i)
1994 +           MPI_Testall(nprocs_send + nprocs_recv, requests, &i, statuses);
1995 +    }
1996 +#else
1997 +       /* bug fix from Wei-keng Liao and Kenin Coloma */
1998 +    if (fd->atomicity)
1999 +       MPI_Waitall(nprocs_send, send_req, statuses);
2000 +    else
2001 +       MPI_Waitall(nprocs_send + nprocs_recv, requests, statuses);
2002 +#endif
2003 +    ADIOI_Free(statuses);
2004 +    ADIOI_Free(requests);
2005 +    if (!buftype_is_contig && nprocs_send) {
2006 +       for (i = 0; i < nprocs; i++)
2007 +           if (send_size[i])
2008 +               ADIOI_Free(send_buf[i]);
2009 +       ADIOI_Free(send_buf);
2010 +    }
2011 +}
2012 +/* Advance user_buf_idx by buf_incr bytes through the flattened buffer type without copying; uses the expanding function's locals (buf_incr, size_in_buf, flat_buf_sz, flat_buf_idx, n_buftypes, flat_buf, buftype_extent). */
2013 +#define ADIOI_BUF_INCR \
2014 +{ \
2015 +    while (buf_incr) { \
2016 +        size_in_buf = ADIOI_MIN(buf_incr, flat_buf_sz); \
2017 +        user_buf_idx += size_in_buf; \
2018 +        flat_buf_sz -= size_in_buf; \
2019 +        if (!flat_buf_sz) { /* current block used up: step to the next one */ \
2020 +            if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \
2021 +            else { /* last block: wrap around and count one more buftype */ \
2022 +                flat_buf_idx = 0; \
2023 +                n_buftypes++; \
2024 +            } \
2025 +            user_buf_idx = flat_buf->indices[flat_buf_idx] + \
2026 +                              n_buftypes*buftype_extent; \
2027 +            flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
2028 +        } \
2029 +        buf_incr -= size_in_buf; \
2030 +    } \
2031 +}
2032 +
2033 +/* Copy `size` bytes from the user buffer (walking the flattened buftype) into send_buf[p] at send_buf_idx[p], then skip whatever is left of buf_incr via ADIOI_BUF_INCR; relies on the expanding function's locals. */
2034 +#define ADIOI_BUF_COPY \
2035 +{ \
2036 +    while (size) { \
2037 +        size_in_buf = ADIOI_MIN(size, flat_buf_sz); \
2038 +        memcpy(&(send_buf[p][send_buf_idx[p]]), \
2039 +               ((char *) buf) + user_buf_idx, size_in_buf); \
2040 +        send_buf_idx[p] += size_in_buf; \
2041 +        user_buf_idx += size_in_buf; \
2042 +        flat_buf_sz -= size_in_buf; \
2043 +        if (!flat_buf_sz) { /* current block used up: step to the next one */ \
2044 +            if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \
2045 +            else { /* last block: wrap around and count one more buftype */ \
2046 +                flat_buf_idx = 0; \
2047 +                n_buftypes++; \
2048 +            } \
2049 +            user_buf_idx = flat_buf->indices[flat_buf_idx] + \
2050 +                              n_buftypes*buftype_extent; \
2051 +            flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
2052 +        } \
2053 +        size -= size_in_buf; \
2054 +        buf_incr -= size_in_buf; \
2055 +    } \
2056 +    ADIOI_BUF_INCR \
2057 +}
2058 +/* Pack noncontiguous user data into the per-process send buffers and post the sends. */
2059 +static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf,
2060 +                                         ADIOI_Flatlist_node * flat_buf,
2061 +                                         char **send_buf,
2062 +                                         ADIO_Offset * offset_list,
2063 +                                         int *len_list, int *send_size,
2064 +                                         MPI_Request * requests,
2065 +                                         int *sent_to_proc, int nprocs,
2066 +                                         int myrank,
2067 +                                         int contig_access_count,
2068 +                                         int * striping_info,
2069 +                                         int *send_buf_idx,
2070 +                                         int *curr_to_proc,
2071 +                                         int *done_to_proc, int iter,
2072 +                                         MPI_Aint buftype_extent)
2073 +{
2074 +    /* this function is only called if buftype is not contig */
2075 +    int i, p, flat_buf_idx, size;
2076 +    int flat_buf_sz, buf_incr, size_in_buf, jj, n_buftypes;
2077 +    ADIO_Offset off, len, rem_len, user_buf_idx;
2078 +
2079 +    /* curr_to_proc[p] = amount of data sent to proc. p that has already
2080 +     * been accounted for so far
2081 +     * done_to_proc[p] = amount of data already sent to proc. p in
2082 +     * previous iterations
2083 +     * user_buf_idx = current location in user buffer
2084 +     * send_buf_idx[p] = current location in send_buf of proc. p
2085 +     */
2086 +
2087 +    for (i = 0; i < nprocs; i++) {
2088 +       send_buf_idx[i] = curr_to_proc[i] = 0;
2089 +       done_to_proc[i] = sent_to_proc[i];
2090 +    }
2091 +    jj = 0; /* index of the next free slot in requests[] */
2092 +
2093 +    user_buf_idx = flat_buf->indices[0];
2094 +    flat_buf_idx = 0;
2095 +    n_buftypes = 0;
2096 +    flat_buf_sz = flat_buf->blocklens[0];
2097 +
2098 +    /* flat_buf_idx = current index into flattened buftype
2099 +     * flat_buf_sz = size of current contiguous component in flattened buf
2100 +     */
2101 +    for (i = 0; i < contig_access_count; i++) {
2102 +       off = offset_list[i];
2103 +       rem_len = (ADIO_Offset) len_list[i];
2104 +
2105 +       /* this request may span more than one process */
2106 +       while (rem_len != 0) {
2107 +           len = rem_len;
2108 +           /* NOTE: len value is modified by ADIOI_Calc_aggregator() to be no
2109 +            * longer than the single region that processor "p" is responsible
2110 +            * for.
2111 +            */
2112 +           p = ADIOI_LUSTRE_Calc_aggregator(fd, off, &len, striping_info);
2113 +
2114 +           if (send_buf_idx[p] < send_size[p]) { /* send buffer for p is not full yet */
2115 +               if (curr_to_proc[p] + len > done_to_proc[p]) { /* piece holds unsent data */
2116 +                   if (done_to_proc[p] > curr_to_proc[p]) {
2117 +                       size = (int) ADIOI_MIN(curr_to_proc[p] + len -
2118 +                                              done_to_proc[p],
2119 +                                              send_size[p] -
2120 +                                              send_buf_idx[p]);
2121 +                       buf_incr = done_to_proc[p] - curr_to_proc[p]; /* part sent earlier */
2122 +                       ADIOI_BUF_INCR
2123 +                           buf_incr = (int) (curr_to_proc[p] + len -
2124 +                                             done_to_proc[p]);
2125 +                       curr_to_proc[p] = done_to_proc[p] + size;
2126 +                       ADIOI_BUF_COPY
2127 +                    } else {
2128 +                       size = (int) ADIOI_MIN(len, send_size[p] -
2129 +                                              send_buf_idx[p]);
2130 +                       buf_incr = (int) len;
2131 +                       curr_to_proc[p] += size;
2132 +                       ADIOI_BUF_COPY
2133 +                    }
2134 +                   if (send_buf_idx[p] == send_size[p]) { /* buffer for p is full: send it */
2135 +                       MPI_Isend(send_buf[p], send_size[p], MPI_BYTE, p,
2136 +                                 myrank + p + 100 * iter, fd->comm,
2137 +                                 requests + jj);
2138 +                       jj++;
2139 +                   }
2140 +               } else { /* whole piece was sent in a previous iteration: only skip it */
2141 +                   curr_to_proc[p] += (int) len;
2142 +                   buf_incr = (int) len;
2143 +                   ADIOI_BUF_INCR
2144 +                }
2145 +           } else { /* send buffer for p already complete: only skip the user data */
2146 +               buf_incr = (int) len;
2147 +               ADIOI_BUF_INCR
2148 +            }
2149 +           off += len;
2150 +           rem_len -= len;
2151 +       }
2152 +    }
2153 +    for (i = 0; i < nprocs; i++) /* record progress for the next iteration */
2154 +       if (send_size[i])
2155 +           sent_to_proc[i] = curr_to_proc[i];
2156 +}
2157 diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c
2158 --- ad_lustre_orig/ad_lustre_wrstr.c    1970-01-01 08:00:00.000000000 +0800
2159 +++ ad_lustre/ad_lustre_wrstr.c 2008-10-13 15:34:53.000000000 +0800
2160 @@ -0,0 +1,472 @@
2161 +/* -*- Mode: C; c-basic-offset:4 ; -*- */
2162 +/*
2163 + *   Copyright (C) 1997 University of Chicago.
2164 + *   See COPYRIGHT notice in top-level directory.
2165 + *
2166 + *   Copyright (C) 2007 Oak Ridge National Laboratory
2167 + *
2168 + *   Copyright (C) 2008 Sun Microsystems, Lustre group
2169 + */
2170 +
2171 +#include "ad_lustre.h"
2172 +#include "adio_extern.h"
2173 +/* Read-modify-write buffered write: stages req_len bytes in a stripe-bounded window. */
2174 +#define ADIOI_BUFFERED_WRITE /* works on the locals of the function it expands in */ \
2175 +{ \
2176 +    if (req_off >= writebuf_off + writebuf_len) { /* request starts past the window */ \
2177 +        if (writebuf_len) { /* flush the previous window before moving on */ \
2178 +           ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
2179 +                  ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \
2180 +           if (!(fd->atomicity)) \
2181 +                ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
2182 +           if (*error_code != MPI_SUCCESS) { \
2183 +               *error_code = MPIO_Err_create_code(*error_code, \
2184 +                                                  MPIR_ERR_RECOVERABLE, myname, \
2185 +                                                  __LINE__, MPI_ERR_IO, \
2186 +                                                  "**iowswc", 0); \
2187 +               ADIOI_Free(writebuf); \
2188 +               return; \
2189 +           } \
2190 +        } \
2191 +       writebuf_off = req_off; \
2192 +        /* stripe_size alignment: the window ends at end_offset or at the next stripe boundary */ \
2193 +        writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \
2194 +                                       (writebuf_off / stripe_size + 1) * \
2195 +                                       stripe_size - writebuf_off);\
2196 +       if (!(fd->atomicity)) \
2197 +            ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
2198 +       ADIO_ReadContig(fd, writebuf, writebuf_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,\
2199 +                        writebuf_off, &status1, error_code); \
2200 +       if (*error_code != MPI_SUCCESS) { /* NOTE(review): returns without releasing the lock taken above */ \
2201 +           *error_code = MPIO_Err_create_code(*error_code, \
2202 +                                              MPIR_ERR_RECOVERABLE, myname, \
2203 +                                              __LINE__, MPI_ERR_IO, \
2204 +                                              "**iowsrc", 0); \
2205 +            ADIOI_Free(writebuf); \
2206 +           return; \
2207 +       } \
2208 +    } \
2209 +    write_sz = (int) ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off); \
2210 +    memcpy(writebuf + req_off - writebuf_off, (char *)buf + userbuf_off, write_sz);\
2211 +    while (write_sz != req_len) { /* request spills over the end of the window */ \
2212 +        ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
2213 +                         ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \
2214 +        if (!(fd->atomicity)) \
2215 +            ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
2216 +        if (*error_code != MPI_SUCCESS) { \
2217 +            *error_code = MPIO_Err_create_code(*error_code, \
2218 +                                               MPIR_ERR_RECOVERABLE, myname, \
2219 +                                               __LINE__, MPI_ERR_IO, \
2220 +                                               "**iowswc", 0); \
2221 +            ADIOI_Free(writebuf); \
2222 +            return; \
2223 +        } \
2224 +        req_len -= write_sz; \
2225 +        userbuf_off += write_sz; \
2226 +        writebuf_off += writebuf_len; \
2227 +        /* stripe_size alignment: same window rule as above */ \
2228 +        writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \
2229 +                                       (writebuf_off / stripe_size + 1) * \
2230 +                                       stripe_size - writebuf_off);\
2231 +       if (!(fd->atomicity)) \
2232 +            ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
2233 +        ADIO_ReadContig(fd, writebuf, writebuf_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,\
2234 +                        writebuf_off, &status1, error_code); \
2235 +       if (*error_code != MPI_SUCCESS) { /* NOTE(review): returns without releasing the lock taken above */ \
2236 +           *error_code = MPIO_Err_create_code(*error_code, \
2237 +                                              MPIR_ERR_RECOVERABLE, myname, \
2238 +                                              __LINE__, MPI_ERR_IO, \
2239 +                                              "**iowsrc", 0); \
2240 +            ADIOI_Free(writebuf); \
2241 +           return; \
2242 +       } \
2243 +        write_sz = ADIOI_MIN(req_len, writebuf_len); \
2244 +        memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
2245 +    } \
2246 +}
2247 +
2248 +
2249 +/* Buffered write for a contiguous file region: no read-modify-write and no
2250 +   locking.  Data is staged in writebuf, a window of writebuf_len bytes at
2251 +   file offset writebuf_off that never crosses a stripe boundary.  A window
2252 +   is flushed only when a request starts beyond it or spills over its end,
2253 +   so the caller must flush the last window after the final use. */
2254 +#define ADIOI_BUFFERED_WRITE_WITHOUT_READ \
2255 +{ \
2256 +    if (req_off >= writebuf_off + writebuf_len) { \
2257 +        /* request starts past the window: flush it, then re-anchor */ \
2258 +        if (writebuf_len) { \
2259 +            ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
2260 +                    ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \
2261 +            if (*error_code != MPI_SUCCESS) { \
2262 +                *error_code = MPIO_Err_create_code(*error_code, \
2263 +                        MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, \
2264 +                        "**iowswc", 0); \
2265 +                ADIOI_Free(writebuf); \
2266 +                return; \
2267 +            } \
2268 +        } \
2269 +        writebuf_off = req_off; \
2270 +        /* stripe_size alignment */ \
2271 +        writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \
2272 +                                       (writebuf_off / stripe_size + 1) * \
2273 +                                       stripe_size - writebuf_off);\
2274 +    } \
2275 +    write_sz = (int) ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off); \
2276 +    memcpy(writebuf + req_off - writebuf_off, (char *)buf + userbuf_off, write_sz);\
2277 +    /* loop only while the request spills over the end of the window */ \
2278 +    while (write_sz != req_len) { \
2279 +        ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
2280 +                         ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \
2281 +        if (*error_code != MPI_SUCCESS) { \
2282 +            *error_code = MPIO_Err_create_code(*error_code, \
2283 +                    MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, \
2284 +                    "**iowswc", 0); \
2285 +            ADIOI_Free(writebuf); \
2286 +            return; \
2287 +        } \
2288 +        req_len -= write_sz; \
2289 +        userbuf_off += write_sz; \
2290 +        writebuf_off += writebuf_len; \
2291 +        /* stripe_size alignment */ \
2292 +        writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \
2293 +                                       (writebuf_off / stripe_size + 1) * \
2294 +                                       stripe_size - writebuf_off);\
2295 +        write_sz = ADIOI_MIN(req_len, writebuf_len); \
2296 +        memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
2297 +    } \
2298 +}
2299 +
2300 +/*
2301 + * Independent strided write for Lustre: writes count items of datatype from
2302 + * buf at the position given by file_ptr_type/offset, staging the data in a
2303 + * buffer of at most one stripe so that no write crosses a stripe boundary.
2304 + * Sets *error_code; with ADIO_INDIVIDUAL, fd->fp_ind is advanced.
2305 + */
2306 +void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count,
2307 +                               MPI_Datatype datatype, int file_ptr_type,
2308 +                               ADIO_Offset offset, ADIO_Status * status,
2309 +                               int *error_code)
2310 +{
2311 +    /* offset is in units of etype relative to the filetype. */
2312 +    ADIOI_Flatlist_node *flat_buf, *flat_file;
2313 +    int i, j, k, bwr_size, fwr_size = 0, st_index = 0;
2314 +    int bufsize, num, size, sum, n_etypes_in_filetype, size_in_filetype;
2315 +    int n_filetypes, etype_in_filetype, new_bwr_size, new_fwr_size;
2316 +    ADIO_Offset abs_off_in_filetype = 0, userbuf_off;
2317 +    int filetype_size, etype_size, buftype_size, req_len;
2318 +    MPI_Aint filetype_extent, buftype_extent;
2319 +    int buf_count, buftype_is_contig, filetype_is_contig;
2320 +    ADIO_Offset off, req_off, disp, end_offset = 0, writebuf_off, start_off;
2321 +    char *writebuf, *value;
2322 +    int flag, st_fwr_size, st_n_filetypes, writebuf_len, write_sz;
2323 +    ADIO_Status status1;
2324 +    int stripe_size = 0, lflag = 0;
2325 +    static char myname[] = "ADIOI_LUSTRE_WriteStrided";
2326 +
2327 +    if (fd->hints->ds_write == ADIOI_HINT_DISABLE) {
2328 +        /* data sieving on writes disabled by the user: use naive approach */
2329 +        ADIOI_GEN_WriteStrided_naive(fd, buf, count, datatype, file_ptr_type,
2330 +                                     offset, status, error_code);
2331 +        return;
2332 +    }
2333 +
2334 +    *error_code = MPI_SUCCESS; /* changed below if error */
2335 +
2336 +    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
2337 +    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
2338 +    MPI_Type_size(fd->filetype, &filetype_size);
2339 +    if (!filetype_size)
2340 +        return;                 /* nothing to write; *error_code is MPI_SUCCESS */
2341 +
2342 +    MPI_Type_extent(fd->filetype, &filetype_extent);
2343 +    MPI_Type_size(datatype, &buftype_size);
2344 +    MPI_Type_extent(datatype, &buftype_extent);
2345 +    etype_size = fd->etype_size;
2346 +    bufsize = buftype_size * count;
2347 +
2348 +    /* get striping info */
2349 +    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
2350 +    MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, &lflag);
2351 +    if (lflag)
2352 +        stripe_size = atoi(value);
2353 +    ADIOI_Free(value);
2354 +    if (stripe_size <= 0) {
2355 +        /* no usable stripe size to align to: fall back to the naive path */
2356 +        ADIOI_GEN_WriteStrided_naive(fd, buf, count, datatype, file_ptr_type,
2357 +                                     offset, status, error_code);
2358 +        return;
2359 +    }
2360 +
2361 +    /* Different buftype to different filetype */
2362 +    if (!buftype_is_contig && filetype_is_contig) {
2363 +        /* noncontiguous in memory, contiguous in file. */
2364 +        ADIOI_Flatten_datatype(datatype);
2365 +        flat_buf = ADIOI_Flatlist;
2366 +        while (flat_buf->type != datatype)
2367 +            flat_buf = flat_buf->next;
2368 +
2369 +        off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
2370 +            fd->disp + etype_size * offset;
2371 +        start_off = off;
2372 +        end_offset = start_off + bufsize - 1;
2373 +        writebuf_off = start_off;
2374 +        /* write stripe size buffer each time */
2375 +        writebuf = (char *) ADIOI_Malloc(ADIOI_MIN(bufsize, stripe_size));
2376 +        writebuf_len = (int) ADIOI_MIN(bufsize, (writebuf_off / stripe_size + 1) *
2377 +                                       stripe_size - writebuf_off);
2378 +
2379 +        /* if atomicity is true, lock the region to be accessed */
2380 +        if (fd->atomicity)
2381 +            ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, bufsize);
2382 +
2383 +        for (j = 0; j < count; j++) {
2384 +            for (i = 0; i < flat_buf->count; i++) {
2385 +                userbuf_off = j * buftype_extent + flat_buf->indices[i];
2386 +                req_off = off;
2387 +                req_len = flat_buf->blocklens[i];
2388 +                ADIOI_BUFFERED_WRITE_WITHOUT_READ
2389 +                off += flat_buf->blocklens[i];
2390 +            }
2391 +        }
2392 +
2393 +        /* write the buffer out finally */
2394 +        ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE,
2395 +                         ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code);
2396 +        if (fd->atomicity)
2397 +            ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize);
2398 +        if (*error_code != MPI_SUCCESS) {
2399 +            ADIOI_Free(writebuf);
2400 +            return;
2401 +        }
2402 +        ADIOI_Free(writebuf);
2403 +        if (file_ptr_type == ADIO_INDIVIDUAL)
2404 +            fd->fp_ind = off;
2405 +    } else {
2406 +        /* noncontiguous in file */
2407 +        /* filetype already flattened in ADIO_Open */
2408 +        flat_file = ADIOI_Flatlist;
2409 +        while (flat_file->type != fd->filetype)
2410 +            flat_file = flat_file->next;
2411 +        disp = fd->disp;
2412 +
2413 +        if (file_ptr_type == ADIO_INDIVIDUAL) {
2414 +            offset = fd->fp_ind;        /* in bytes */
2415 +            n_filetypes = -1;
2416 +            flag = 0;
2417 +            while (!flag) {
2418 +                n_filetypes++;
2419 +                for (i = 0; i < flat_file->count; i++) {
2420 +                    if (disp + flat_file->indices[i] +
2421 +                        (ADIO_Offset) n_filetypes * filetype_extent +
2422 +                        flat_file->blocklens[i] >= offset) {
2423 +                        st_index = i;
2424 +                        fwr_size = (int) (disp + flat_file->indices[i] +
2425 +                                          (ADIO_Offset) n_filetypes * filetype_extent +
2426 +                                          flat_file->blocklens[i] - offset);
2427 +                        flag = 1;
2428 +                        break;
2429 +                    }
2430 +                }
2431 +            }
2432 +        } else {
2433 +            n_etypes_in_filetype = filetype_size / etype_size;
2434 +            n_filetypes = (int) (offset / n_etypes_in_filetype);
2435 +            etype_in_filetype = (int) (offset % n_etypes_in_filetype);
2436 +            size_in_filetype = etype_in_filetype * etype_size;
2437 +
2438 +            sum = 0;
2439 +            for (i = 0; i < flat_file->count; i++) {
2440 +                sum += flat_file->blocklens[i];
2441 +                if (sum > size_in_filetype) {
2442 +                    st_index = i;
2443 +                    fwr_size = sum - size_in_filetype;
2444 +                    abs_off_in_filetype = flat_file->indices[i] +
2445 +                        size_in_filetype - (sum - flat_file->blocklens[i]);
2446 +                    break;
2447 +                }
2448 +            }
2449 +
2450 +            /* abs. offset in bytes in the file */
2451 +            offset = disp + (ADIO_Offset) n_filetypes *filetype_extent +
2452 +                     abs_off_in_filetype;
2453 +        }
2454 +
2455 +        start_off = offset;
2456 +
2457 +        /* If the file bytes are actually contiguous and so is the user
2458 +         * buffer, no data sieving is needed at all */
2459 +        if (buftype_is_contig && bufsize <= fwr_size) {
2460 +            req_off = start_off;
2461 +            req_len = bufsize;
2462 +            end_offset = start_off + bufsize - 1;
2463 +            writebuf = (char *) ADIOI_Malloc(ADIOI_MIN(bufsize, stripe_size));
2464 +            memset(writebuf, -1, ADIOI_MIN(bufsize, stripe_size));
2465 +            writebuf_off = 0;
2466 +            writebuf_len = 0;
2467 +            userbuf_off = 0;
2468 +            ADIOI_BUFFERED_WRITE_WITHOUT_READ
2469 +            /* the macro leaves the last window buffered: flush it */
2470 +            if (writebuf_len) {
2471 +                ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE,
2472 +                                 ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code);
2473 +                if (*error_code != MPI_SUCCESS) {
2474 +                    ADIOI_Free(writebuf);
2475 +                    return;
2476 +                }
2477 +            }
2478 +            /* first byte after this write; stored into fd->fp_ind below */
2479 +            off = start_off + bufsize;
2480 +        } else {
2481 +            /* Calculate end_offset, the last byte-offset that will be accessed.
2482 +               e.g., if start_offset=0 and 100 bytes are written, end_offset=99 */
2483 +            st_fwr_size = fwr_size;
2484 +            st_n_filetypes = n_filetypes;
2485 +            i = 0;
2486 +            j = st_index;
2487 +            off = offset;
2488 +            fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
2489 +            while (i < bufsize) {
2490 +                i += fwr_size;
2491 +                end_offset = off + fwr_size - 1;
2492 +                if (j < (flat_file->count - 1))
2493 +                    j++;
2494 +                else {
2495 +                    j = 0;
2496 +                    n_filetypes++;
2497 +                }
2498 +                off = disp + flat_file->indices[j] +
2499 +                      (ADIO_Offset) n_filetypes * filetype_extent;
2500 +                fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize - i);
2501 +            }
2502 +
2503 +            writebuf_off = 0;
2504 +            writebuf_len = 0;
2505 +            writebuf = (char *) ADIOI_Malloc(stripe_size);
2506 +            memset(writebuf, -1, stripe_size);
2507 +            /* if atomicity is true, lock the region to be accessed */
2508 +            if (fd->atomicity)
2509 +                ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, bufsize);
2510 +
2511 +            if (buftype_is_contig && !filetype_is_contig) {
2512 +                /* contiguous in memory, noncontiguous in file (common case) */
2513 +                i = 0;
2514 +                j = st_index;
2515 +                off = offset;
2516 +                n_filetypes = st_n_filetypes;
2517 +                fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
2518 +                while (i < bufsize) {
2519 +                    if (fwr_size) {
2520 +                        /* TYPE_UB/TYPE_LB can give fwr_size = 0: skip the write then */
2521 +                        req_off = off;
2522 +                        req_len = fwr_size;
2523 +                        userbuf_off = i;
2524 +                        ADIOI_BUFFERED_WRITE
2525 +                    }
2526 +                    i += fwr_size;
2527 +                    if (off + fwr_size < disp + flat_file->indices[j] +
2528 +                                         flat_file->blocklens[j] +
2529 +                            (ADIO_Offset) n_filetypes * filetype_extent)
2530 +                        /* still inside the current block of the filetype */
2531 +                        off += fwr_size;
2532 +                    else {
2533 +                        if (j < (flat_file->count - 1))
2534 +                            j++;
2535 +                        else {
2536 +                            j = 0;
2537 +                            n_filetypes++;
2538 +                        }
2539 +                        off = disp + flat_file->indices[j] +
2540 +                              (ADIO_Offset) n_filetypes * filetype_extent;
2541 +                        fwr_size = ADIOI_MIN(flat_file->blocklens[j],
2542 +                                             bufsize - i);
2543 +                    }
2544 +                }
2545 +            } else {
2546 +                /* noncontiguous in memory as well as in file */
2547 +                ADIOI_Flatten_datatype(datatype);
2548 +                flat_buf = ADIOI_Flatlist;
2549 +                while (flat_buf->type != datatype)
2550 +                    flat_buf = flat_buf->next;
2551 +
2552 +                k = num = buf_count = 0;
2553 +                i = (int) (flat_buf->indices[0]);
2554 +                j = st_index;
2555 +                off = offset;
2556 +                n_filetypes = st_n_filetypes;
2557 +                fwr_size = st_fwr_size;
2558 +                bwr_size = flat_buf->blocklens[0];
2559 +
2560 +                while (num < bufsize) {
2561 +                    size = ADIOI_MIN(fwr_size, bwr_size);
2562 +                    if (size) {
2563 +                        req_off = off;
2564 +                        req_len = size;
2565 +                        userbuf_off = i;
2566 +                        ADIOI_BUFFERED_WRITE
2567 +                    }
2568 +                    new_fwr_size = fwr_size;
2569 +                    new_bwr_size = bwr_size;
2570 +                    if (size == fwr_size) {
2571 +                        /* reached end of contiguous block in file */
2572 +                        if (j < (flat_file->count - 1)) {
2573 +                            j++;
2574 +                        } else {
2575 +                            j = 0;
2576 +                            n_filetypes++;
2577 +                        }
2578 +                        off = disp + flat_file->indices[j] +
2579 +                              (ADIO_Offset) n_filetypes * filetype_extent;
2580 +                        new_fwr_size = flat_file->blocklens[j];
2581 +                        if (size != bwr_size) {
2582 +                            i += size;
2583 +                            new_bwr_size -= size;
2584 +                        }
2585 +                    }
2586 +                    if (size == bwr_size) {
2587 +                        /* reached end of contiguous block in memory */
2588 +                        k = (k + 1) % flat_buf->count;
2589 +                        buf_count++;
2590 +                        i = (int) (buftype_extent * (buf_count / flat_buf->count) +
2591 +                                   flat_buf->indices[k]);
2592 +                        new_bwr_size = flat_buf->blocklens[k];
2593 +                        if (size != fwr_size) {
2594 +                            off += size;
2595 +                            new_fwr_size -= size;
2596 +                        }
2597 +                    }
2598 +                    num += size;
2599 +                    fwr_size = new_fwr_size;
2600 +                    bwr_size = new_bwr_size;
2601 +                }
2602 +            }
2603 +
2604 +            /* write the buffer out finally */
2605 +            if (writebuf_len) {
2606 +                ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE,
2607 +                                 ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code);
2608 +                if (!(fd->atomicity))
2609 +                    ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
2610 +                if (*error_code != MPI_SUCCESS) {
2611 +                    ADIOI_Free(writebuf);
2612 +                    return;
2613 +                }
2614 +            }
2615 +            if (fd->atomicity)
2616 +                ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize);
2617 +        }
2618 +        ADIOI_Free(writebuf);
2619 +        if (file_ptr_type == ADIO_INDIVIDUAL)
2620 +            fd->fp_ind = off;
2621 +    }
2622 +    fd->fp_sys_posn = -1;      /* set it to null. */
2623 +
2624 +#ifdef HAVE_STATUS_SET_BYTES
2625 +    MPIR_Status_set_bytes(status, datatype, bufsize);
2626 +    /* This is a temporary way of filling in status. The right way is to
2627 +    keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
2628 +#endif
2629 +
2630 +    /* drop the flattened representation created for a noncontiguous buftype */
2631 +    if (!buftype_is_contig)
2632 +        ADIOI_Delete_flattened(datatype);
2633 diff -ruN ad_lustre_orig/Makefile.in ad_lustre/Makefile.in
2634 --- ad_lustre_orig/Makefile.in  2008-09-17 14:36:57.000000000 +0800
2635 +++ ad_lustre/Makefile.in       2008-09-17 18:20:35.000000000 +0800
2636 @@ -16,7 +16,9 @@
2637  @VPATH@
2638  
2639  AD_LUSTRE_OBJECTS = ad_lustre.o ad_lustre_open.o \
2640 -      ad_lustre_rwcontig.o ad_lustre_hints.o 
2641 +      ad_lustre_rwcontig.o ad_lustre_wrcoll.o ad_lustre_wrstr.o  \
2642 +      ad_lustre_hints.o ad_lustre_aggregate.o
2643 +
2644  
2645  default: $(LIBNAME)
2646         @if [ "@ENABLE_SHLIB@" != "none" ] ; then \
2647 diff -ruN ad_lustre_orig/README ad_lustre/README
2648 --- ad_lustre_orig/README       2008-09-17 14:36:57.000000000 +0800
2649 +++ ad_lustre/README    2008-10-15 22:43:07.000000000 +0800
2650 @@ -5,6 +5,25 @@
2651    o To post the code for ParColl (Partitioned collective IO)
2652   
2653  -----------------------------------------------------
2654 +V05: 
2655 +-----------------------------------------------------
2656 +Improved data redistribution
2657 +  o Improve I/O pattern identification. In addition to checking for
2658 +    interleaving, collective I/O is performed when the request size is
2659 +    small. The hint big_req_size sets the request size used for this check.
2660 +  o Provide hint CO for load balancing to control the number of
2661 +    IO clients for each OST
2662 +  o Produce stripe-contiguous I/O pattern that Lustre prefers
2663 +  o Reduce the collective overhead by hints contiguous_data and
2664 +    same_io_size to remove unnecessary MPI_Alltoall()
2665 +  o Control read-modify-write in data sieving in collective IO
2666 +    by hint ds_in_coll.
2667 +  o Optimize the IO pattern.
2668 +    - If the whole access size <= stripe size, we suggest that all
2669 +      of the IO be performed by the same client, to avoid extent
2670 +      lock revocation and reassignment.
2671 +
2672 +-----------------------------------------------------
2673  V04: 
2674  -----------------------------------------------------
2675    o Direct IO and Lockless IO support
2676 --- common/ad_write_coll_orig.c 2008-10-15 11:24:31.000000000 +0800
2677 +++ common/ad_write_coll.c      2008-10-15 11:25:39.000000000 +0800
2678 @@ -42,7 +42,7 @@
2679                             int *send_buf_idx, int *curr_to_proc, 
2680                             int *done_to_proc, int iter, 
2681                             MPI_Aint buftype_extent);
2682 -static void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count, 
2683 +void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count, 
2684                        ADIO_Offset *srt_off, int *srt_len, int *start_pos,
2685                        int nprocs, int nprocs_recv, int total_elements);
2686  
2687 @@ -921,7 +921,7 @@
2688  
2689  
2690  
2691 -static void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count, 
2692 +void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count, 
2693                       ADIO_Offset *srt_off, int *srt_len, int *start_pos,
2694                       int nprocs, int nprocs_recv, int total_elements)
2695  {