Whamcloud - gitweb
Add new LINVRNT() macro, optional on new --enable-invariants configure
[fs/lustre-release.git] / lustre / contrib / adio_driver_mpich2-1.0.7.patch
1 diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
2 --- ad_lustre_orig/ad_lustre_aggregate.c        1970-01-01 08:00:00.000000000 +0800
3 +++ ad_lustre/ad_lustre_aggregate.c     2008-10-17 17:30:00.000000000 +0800
4 @@ -0,0 +1,502 @@
5 +/* -*- Mode: C; c-basic-offset:4 ; -*- */
6 +/*
7 + *   Copyright (C) 1997 University of Chicago.
8 + *   See COPYRIGHT notice in top-level directory.
9 + *
10 + *   Copyright (C) 2007 Oak Ridge National Laboratory
11 + *
12 + *   Copyright (C) 2008 Sun Microsystems, Lustre group
13 + */
14 +
15 +#include "ad_lustre.h"
16 +#include "adio_extern.h"
17 +
18 +void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int ** striping_info_ptr,
19 +                                   int mode)
20 +{
21 +    int *striping_info = NULL;
22 +    /* get striping information:
23 +     *  striping_info[0]: stripe_size
24 +     *  striping_info[1]: stripe_count
25 +     *  striping_info[2]: avail_cb_nodes
26 +     */
27 +    int stripe_size, stripe_count, CO = 1, CO_max = 1, CO_nodes, lflag;
28 +    int avail_cb_nodes, divisor, nprocs_for_coll = fd->hints->cb_nodes;
29 +    char *value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
30 +
31 +    /* Get hints value */
32 +    /* stripe size */
33 +    MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, &lflag);
34 +    if (lflag)
35 +       stripe_size = atoi(value);
36 +    /* stripe count */
37 +    /* stripe_size and stripe_count have been validated in ADIOI_LUSTRE_Open() */
38 +    MPI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, value, &lflag);
39 +    if (lflag)
40 +       stripe_count = atoi(value);
41 +
42 +    /* Calculate the available number of I/O clients, that is
43 +     *  avail_cb_nodes=min(cb_nodes, stripe_count*CO), where
44 +     *  CO=1 by default
45 +     */
46 +    if (!mode) {
47 +        /* for collective read,
48 +        * if "CO" clients access the same OST simultaneously,
49 +        * the OST disk seek time would be high. So, to avoid this,
50 +        * it might be better if 1 client only accesses 1 OST.
51 +        * So, we set CO = 1 to meet the above requirement.
52 +        */
53 +       CO = 1;
54 +       /* XXX: there may be a better way to handle collective read */
55 +    } else {
56 +        /* CO_max: the largest number of IO clients for each ost group */
57 +        CO_max = (nprocs_for_coll - 1)/ stripe_count + 1;
58 +        /* CO also has been validated in ADIOI_LUSTRE_Open(), >0 */
59 +       MPI_Info_get(fd->info, "CO", MPI_MAX_INFO_VAL, value, &lflag);
60 +       if (lflag)
61 +           CO = atoi(value);
62 +       CO = ADIOI_MIN(CO_max, CO);
63 +    }
64 +    /* Calculate how many IO clients we need */
65 +    /* To avoid extent lock conflicts,
66 +     * avail_cb_nodes should divide (stripe_count*CO) exactly,
67 +     * so that each OST is accessed by a constant number of clients. */
68 +    avail_cb_nodes = ADIOI_MIN(nprocs_for_coll, stripe_count * CO);
69 +    if (avail_cb_nodes == nprocs_for_coll) {
70 +        CO_nodes = stripe_count * CO;
71 +        do {
72 +            /* find the divisor of CO_nodes */
73 +            divisor = 1;
74 +            do {
75 +                divisor ++;
76 +            } while (CO_nodes % divisor);
77 +            CO_nodes = CO_nodes / divisor;
78 +            /* if stripe_count*CO is a prime number, change nothing */
79 +            if ((CO_nodes <= avail_cb_nodes) && (CO_nodes != 1)) {
80 +                avail_cb_nodes = CO_nodes;
81 +                break;
82 +            }
83 +        } while (CO_nodes != 1);
84 +    }
85 +
86 +    *striping_info_ptr = (int *) ADIOI_Malloc(3 * sizeof(int));
87 +    striping_info = *striping_info_ptr;
88 +    striping_info[0] = stripe_size;
89 +    striping_info[1] = stripe_count;
90 +    striping_info[2] = avail_cb_nodes;
91 +
92 +    ADIOI_Free(value);
93 +}
94 +
95 +int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off,
96 +                                 ADIO_Offset *len, int *striping_info)
97 +{
98 +    int rank_index, rank;
99 +    ADIO_Offset avail_bytes;
100 +    int stripe_size = striping_info[0];
101 +    int avail_cb_nodes = striping_info[2];
102 +
103 +    /* Produce the stripe-contiguous pattern for Lustre */
104 +    rank_index = (int)((off / stripe_size) % avail_cb_nodes);
105 +
106 +    avail_bytes = (off / (ADIO_Offset)stripe_size + 1) *
107 +                  (ADIO_Offset)stripe_size - off;
108 +    if (avail_bytes < *len) {
109 +       /* this proc only has part of the requested contig. region */
110 +       *len = avail_bytes;
111 +    }
112 +    /* map our index to a rank */
113 +    /* NOTE: FOR NOW WE DON'T HAVE A MAPPING...JUST DO 0..NPROCS_FOR_COLL */
114 +    rank = fd->hints->ranklist[rank_index];
115 +
116 +    return rank;
117 +}
118 +
119 +void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list,
120 +                             int *len_list, int contig_access_count,
121 +                             int *striping_info, int nprocs,
122 +                              int *count_my_req_procs_ptr,
123 +                             int **count_my_req_per_proc_ptr,
124 +                             ADIOI_Access ** my_req_ptr,
125 +                             int **buf_idx_ptr)
126 +{
127 +    /* Nothing different from ADIOI_Calc_my_req(), except calling
128 +     * ADIOI_LUSTRE_Calc_aggregator() instead of the old one */
129 +    int *count_my_req_per_proc, count_my_req_procs, *buf_idx;
130 +    int i, l, proc;
131 +    ADIO_Offset avail_len, rem_len, curr_idx, off;
132 +    ADIOI_Access *my_req;
133 +
134 +    *count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
135 +    count_my_req_per_proc = *count_my_req_per_proc_ptr;
136 +
137 +    /* buf_idx is relevant only if buftype_is_contig.
138 +     * buf_idx[i] gives the index into user_buf where data received
139 +     * from proc. i should be placed. This allows receives to be done
140 +     * without extra buffer. This can't be done if buftype is not contig.
141 +     */
142 +    buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));
143 +    /* initialize buf_idx to -1 */
144 +    for (i = 0; i < nprocs; i++)
145 +       buf_idx[i] = -1;
146 +
147 +    /* one pass just to calculate how much space to allocate for my_req;
148 +     * contig_access_count was calculated way back in ADIOI_Calc_my_off_len()
149 +     */
150 +    for (i = 0; i < contig_access_count; i++) {
151 +       /* short circuit offset/len processing if len == 0
152 +        * (zero-byte read/write)
153 +        */
154 +       if (len_list[i] == 0)
155 +           continue;
156 +       off = offset_list[i];
157 +       avail_len = len_list[i];
158 +       /* we set avail_len to be the total size of the access.
159 +        * then ADIOI_LUSTRE_Calc_aggregator() will modify the value to return
160 +        * the amount that was available.
161 +        */
162 +       proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info);
163 +       count_my_req_per_proc[proc]++;
164 +       /* figure out how much data is remaining in the access
165 +        * we'll take care of this data (if there is any)
166 +        * in the while loop below.
167 +        */
168 +       rem_len = len_list[i] - avail_len;
169 +
170 +       while (rem_len != 0) {
171 +           off += avail_len;   /* point to first remaining byte */
172 +           avail_len = rem_len;        /* save remaining size, pass to calc */
173 +           proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info);
174 +           count_my_req_per_proc[proc]++;
175 +           rem_len -= avail_len;       /* reduce remaining length by amount from fd */
176 +       }
177 +    }
178 +
179 +    *my_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs * sizeof(ADIOI_Access));
180 +    my_req = *my_req_ptr;
181 +
182 +    count_my_req_procs = 0;
183 +    for (i = 0; i < nprocs; i++) {
184 +       if (count_my_req_per_proc[i]) {
185 +           my_req[i].offsets = (ADIO_Offset *)
186 +                               ADIOI_Malloc(count_my_req_per_proc[i] *
187 +                                             sizeof(ADIO_Offset));
188 +           my_req[i].lens = (int *) ADIOI_Malloc(count_my_req_per_proc[i] *
189 +                                                 sizeof(int));
190 +           count_my_req_procs++;
191 +       }
192 +       my_req[i].count = 0;    /* will be incremented where needed later */
193 +    }
194 +
195 +    /* now fill in my_req */
196 +    curr_idx = 0;
197 +    for (i = 0; i < contig_access_count; i++) {
198 +       if (len_list[i] == 0)
199 +           continue;
200 +       off = offset_list[i];
201 +       avail_len = len_list[i];
202 +       proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info);
203 +
204 +       /* for each separate contiguous access from this process */
205 +       if (buf_idx[proc] == -1)
206 +           buf_idx[proc] = (int) curr_idx;
207 +
208 +       l = my_req[proc].count;
209 +       curr_idx += (int) avail_len;    /* NOTE: curr_idx is an ADIO_Offset, but this cast and buf_idx[] are int.  Fix? */
210 +
211 +       rem_len = len_list[i] - avail_len;
212 +
213 +       /* store the proc, offset, and len information in an array
214 +        * of structures, my_req. Each structure contains the
215 +        * offsets and lengths located in that process's FD,
216 +        * and the associated count.
217 +        */
218 +       my_req[proc].offsets[l] = off;
219 +       my_req[proc].lens[l] = (int) avail_len;
220 +       my_req[proc].count++;
221 +
222 +       while (rem_len != 0) {
223 +           off += avail_len;
224 +           avail_len = rem_len;
225 +           proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len,
226 +                                                striping_info);
227 +           if (buf_idx[proc] == -1)
228 +               buf_idx[proc] = (int) curr_idx;
229 +
230 +           l = my_req[proc].count;
231 +           curr_idx += avail_len;
232 +           rem_len -= avail_len;
233 +
234 +           my_req[proc].offsets[l] = off;
235 +           my_req[proc].lens[l] = (int) avail_len;
236 +           my_req[proc].count++;
237 +       }
238 +    }
239 +
240 +#ifdef AGG_DEBUG
241 +    for (i = 0; i < nprocs; i++) {
242 +       if (count_my_req_per_proc[i] > 0) {
243 +           FPRINTF(stdout, "data needed from %d (count = %d):\n",
244 +                           i, my_req[i].count);
245 +           for (l = 0; l < my_req[i].count; l++) {
246 +               FPRINTF(stdout, "   off[%d] = %lld, len[%d] = %d\n",
247 +                               l, my_req[i].offsets[l], l, my_req[i].lens[l]);
248 +           }
249 +       }
250 +    }
251 +#endif
252 +#if 0
253 +    for (i = 0; i < nprocs; i++) {
254 +       FPRINTF(stdout, "buf_idx[%d] = 0x%x\n", i, buf_idx[i]);
255 +    }
256 +#endif
257 +
258 +    *count_my_req_procs_ptr = count_my_req_procs;
259 +    *buf_idx_ptr = buf_idx;
260 +}
261 +
262 +int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count,
263 +                          int *len_list, int nprocs)
264 +{
265 +    /* If the processes are non-interleaved, we will check the req_size.
266 +     *   if (avg_req_size > big_req_size) {
267 +     *       docollect = 0;
268 +     *   }
269 +     */
270 +
271 +    int i, docollect = 1, lflag, big_req_size = 0;
272 +    ADIO_Offset req_size = 0, total_req_size;
273 +    int avg_req_size, total_access_count;
274 +    char *value = NULL;
275 +
276 +    /* calculate total_req_size and total_access_count */
277 +    for (i = 0; i < contig_access_count; i++)
278 +        req_size += len_list[i];
279 +    MPI_Allreduce(&req_size, &total_req_size, 1, MPI_LONG_LONG_INT, MPI_SUM,
280 +               fd->comm);
281 +    MPI_Allreduce(&contig_access_count, &total_access_count, 1, MPI_INT, MPI_SUM,
282 +               fd->comm);
283 +    /* estimate average req_size */
284 +    avg_req_size = (int)(total_req_size / total_access_count);
285 +
286 +    /* get hint of big_req_size */
287 +    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
288 +    MPI_Info_get(fd->info, "big_req_size", MPI_MAX_INFO_VAL, value, &lflag);
289 +    if (lflag)
290 +        big_req_size = atoi(value);
291 +    /* Don't perform collective I/O if there are big requests */
292 +    if ((big_req_size > 0) && (avg_req_size > big_req_size))
293 +        docollect = 0;
294 +
295 +    ADIOI_Free(value);
296 +
297 +    return docollect;
298 +}
299 +
300 +void ADIOI_LUSTRE_Calc_others_req(ADIO_File fd, int count_my_req_procs,
301 +                                 int *count_my_req_per_proc,
302 +                                 ADIOI_Access * my_req,
303 +                                 int nprocs, int myrank,
304 +                                  ADIO_Offset start_offset,
305 +                                  ADIO_Offset end_offset,
306 +                                  int *striping_info,
307 +                                 int *count_others_req_procs_ptr,
308 +                                 ADIOI_Access ** others_req_ptr)
309 +{
310 +    /* what requests of other processes will be written by this process */
311 +
312 +    int *count_others_req_per_proc, count_others_req_procs, proc;
313 +    int i, j, lflag, samesize = 0, contiguous = 0;
314 +    int avail_cb_nodes = striping_info[2];
315 +    MPI_Request *send_requests, *recv_requests;
316 +    MPI_Status *statuses;
317 +    ADIOI_Access *others_req;
318 +    char *value = NULL;
319 +    ADIO_Offset min_st_offset, off, req_len, avail_len, rem_len, *all_lens;
320 +
321 +    /* There are two hints, which could reduce some MPI communication overhead,
322 +     * if the user knows the I/O pattern and sets them correctly. */
323 +    /* They are
324 +     * contiguous_data: if the data are contiguous,
325 +     *                  we don't need to do MPI_Alltoall().
326 +     * same_io_size: if, in addition, every request has the same size,
327 +     *               we can calculate the offset directly
328 +     */
329 +    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
330 +    /* hint of contiguous data */
331 +    MPI_Info_get(fd->info, "contiguous_data", MPI_MAX_INFO_VAL, value, &lflag);
332 +    if (lflag && !strcmp(value, "yes"))
333 +        contiguous = 1;
334 +    /* hint of same io size */
335 +    MPI_Info_get(fd->info, "same_io_size", MPI_MAX_INFO_VAL, value, &lflag);
336 +    if (lflag && !strcmp(value, "yes"))
337 +        samesize = 1;
338 +    ADIOI_Free(value);
339 +
340 +    *others_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs *
341 +                                                    sizeof(ADIOI_Access));
342 +    others_req = *others_req_ptr;
343 +
344 +    /* if the data are contiguous, we can calculate the offset and length
345 +     * of the other requests simply, instead of MPI_Alltoall() */
346 +    if (contiguous) {
347 +        for (i = 0; i < nprocs; i++) {
348 +            others_req[i].count = 0;
349 +        }
350 +        req_len = end_offset - start_offset + 1;
351 +        all_lens = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
352 +
353 +        /* same req size ? */
354 +        if (samesize == 0) {
355 +            /* calculate the min_st_offset */
356 +            MPI_Allreduce(&start_offset, &min_st_offset, 1, MPI_LONG_LONG,
357 +                          MPI_MIN, fd->comm);
358 +            /* exchange request length */
359 +            MPI_Allgather(&req_len, 1, ADIO_OFFSET, all_lens, 1, ADIO_OFFSET,
360 +                          fd->comm);
361 +        } else { /* same request size */
362 +            /* calculate the 1st request's offset */
363 +            min_st_offset = start_offset - myrank * req_len;
364 +            /* assign request length to all_lens[] */
365 +            for (i = 0; i < nprocs; i ++)
366 +               all_lens[i] = req_len;
367 +        }
368 +        if (myrank < avail_cb_nodes) {
369 +            /* This is an IO client and it will receive data from others */
370 +            off = min_st_offset;
371 +            /* calculate others_req[i].count */
372 +            for (i = 0; i < nprocs; i++) {
373 +                avail_len = all_lens[i];
374 +                rem_len = avail_len;
375 +                while (rem_len > 0) {
376 +                   proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len,
377 +                                                        striping_info);
378 +                    if (proc == myrank) {
379 +                        others_req[i].count ++;
380 +                    }
381 +                    off += avail_len;
382 +                    rem_len -= avail_len;
383 +                    avail_len = rem_len;
384 +                }
385 +            }
386 +            /* calculate offset and len for each request */
387 +            off = min_st_offset;
388 +            for (i = 0; i < nprocs; i++) {
389 +                if (others_req[i].count) {
390 +                   others_req[i].offsets = (ADIO_Offset *)
391 +                                            ADIOI_Malloc(others_req[i].count *
392 +                                                        sizeof(ADIO_Offset));
393 +                   others_req[i].lens = (int *)
394 +                                         ADIOI_Malloc(others_req[i].count *
395 +                                                      sizeof(int));
396 +                    others_req[i].mem_ptrs = (MPI_Aint *)
397 +                                             ADIOI_Malloc(others_req[i].count *
398 +                                                         sizeof(MPI_Aint));
399 +                }
400 +                j = 0;
401 +                avail_len = all_lens[i];
402 +                rem_len = avail_len;
403 +                while (rem_len > 0) {
404 +                   proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len,
405 +                                                        striping_info);
406 +                    if (proc == myrank) {
407 +                        others_req[i].offsets[j] = off;
408 +                        others_req[i].lens[j] = (int)avail_len;
409 +                        j ++;
410 +                    }
411 +                    off += avail_len;
412 +                    rem_len -= avail_len;
413 +                    avail_len = rem_len;
414 +                }
415 +            }
416 +        }
417 +        ADIOI_Free(all_lens);
418 +    } else {
419 +        /* multiple non-contiguous requests */
420 +        /* first find out how much to send/recv and from/to whom */
421 +
422 +        /*
423 +         * count_others_req_procs:
424 +         *    number of processes whose requests will be written by
425 +         *    this process (including this process itself)
426 +         * count_others_req_per_proc[i]:
427 +         *    how many separate contiguous requests of proc[i] will be
428 +         *    written by this process.
429 +         */
430 +
431 +        count_others_req_per_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
432 +
433 +        MPI_Alltoall(count_my_req_per_proc, 1, MPI_INT,
434 +                    count_others_req_per_proc, 1, MPI_INT, fd->comm);
435 +
436 +        count_others_req_procs = 0;
437 +        for (i = 0; i < nprocs; i++) {
438 +           if (count_others_req_per_proc[i]) {
439 +               others_req[i].count = count_others_req_per_proc[i];
440 +               others_req[i].offsets = (ADIO_Offset *)
441 +                                        ADIOI_Malloc(others_req[i].count *
442 +                                                sizeof(ADIO_Offset));
443 +               others_req[i].lens = (int *)
444 +                                    ADIOI_Malloc(others_req[i].count *
445 +                                                  sizeof(int));
446 +               others_req[i].mem_ptrs = (MPI_Aint *)
447 +                                        ADIOI_Malloc(others_req[i].count *
448 +                                                     sizeof(MPI_Aint));
449 +               count_others_req_procs++;
450 +           } else
451 +               others_req[i].count = 0;
452 +        }
453 +
454 +        /* now send the calculated offsets and lengths to respective processes */
455 +
456 +        send_requests = (MPI_Request *) ADIOI_Malloc(2 * (count_my_req_procs + 1) *
457 +                                                     sizeof(MPI_Request));
458 +        recv_requests = (MPI_Request *) ADIOI_Malloc(2 * (count_others_req_procs+1)*
459 +                                                    sizeof(MPI_Request));
460 +        /* +1 to avoid a 0-size malloc */
461 +
462 +        j = 0;
463 +        for (i = 0; i < nprocs; i++) {
464 +           if (others_req[i].count) {
465 +               MPI_Irecv(others_req[i].offsets, others_req[i].count,
466 +                         ADIO_OFFSET, i, i + myrank, fd->comm,
467 +                         &recv_requests[j]);
468 +               j++;
469 +               MPI_Irecv(others_req[i].lens, others_req[i].count,
470 +                         MPI_INT, i, i + myrank + 1, fd->comm,
471 +                         &recv_requests[j]);
472 +               j++;
473 +           }
474 +        }
475 +
476 +        j = 0;
477 +        for (i = 0; i < nprocs; i++) {
478 +           if (my_req[i].count) {
479 +               MPI_Isend(my_req[i].offsets, my_req[i].count,
480 +                         ADIO_OFFSET, i, i + myrank, fd->comm,
481 +                         &send_requests[j]);
482 +               j++;
483 +               MPI_Isend(my_req[i].lens, my_req[i].count,
484 +                         MPI_INT, i, i + myrank + 1, fd->comm,
485 +                         &send_requests[j]);
486 +               j++;
487 +           }
488 +        }
489 +
490 +        statuses = (MPI_Status *)
491 +                   ADIOI_Malloc((1 + 2 * ADIOI_MAX(count_my_req_procs,
492 +                                                  count_others_req_procs)) *
493 +                                         sizeof(MPI_Status));
494 +        /* +1 to avoid a 0-size malloc */
495 +
496 +        MPI_Waitall(2 * count_my_req_procs, send_requests, statuses);
497 +        MPI_Waitall(2 * count_others_req_procs, recv_requests, statuses);
498 +
499 +        ADIOI_Free(send_requests);
500 +        ADIOI_Free(recv_requests);
501 +        ADIOI_Free(statuses);
502 +        ADIOI_Free(count_others_req_per_proc);
503 +
504 +        *count_others_req_procs_ptr = count_others_req_procs;
505 +    }
506 +}
507 diff -ruN ad_lustre_orig/ad_lustre.c ad_lustre/ad_lustre.c
508 --- ad_lustre_orig/ad_lustre.c  2008-09-17 14:36:57.000000000 +0800
509 +++ ad_lustre/ad_lustre.c       2008-10-17 17:03:42.000000000 +0800
510 @@ -1,9 +1,11 @@
511  /* -*- Mode: C; c-basic-offset:4 ; -*- */
512 -/* 
513 - *   Copyright (C) 2001 University of Chicago. 
514 +/*
515 + *   Copyright (C) 2001 University of Chicago.
516   *   See COPYRIGHT notice in top-level directory.
517   *
518   *   Copyright (C) 2007 Oak Ridge National Laboratory
519 + *
520 + *   Copyright (C) 2008 Sun Microsystems, Lustre group
521   */
522  
523  #include "ad_lustre.h"
524 @@ -13,12 +15,12 @@
525      ADIOI_LUSTRE_ReadContig, /* ReadContig */
526      ADIOI_LUSTRE_WriteContig, /* WriteContig */
527      ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
528 -    ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */
529 +    ADIOI_LUSTRE_WriteStridedColl, /* WriteStridedColl */
530      ADIOI_GEN_SeekIndividual, /* SeekIndividual */
531      ADIOI_GEN_Fcntl, /* Fcntl */
532      ADIOI_LUSTRE_SetInfo, /* SetInfo */
533      ADIOI_GEN_ReadStrided, /* ReadStrided */
534 -    ADIOI_GEN_WriteStrided, /* WriteStrided */
535 +    ADIOI_LUSTRE_WriteStrided, /* WriteStrided */
536      ADIOI_GEN_Close, /* Close */
537  #if defined(ROMIO_HAVE_WORKING_AIO) && !defined(CRAY_XT_LUSTRE)
538      ADIOI_GEN_IreadContig, /* IreadContig */
539 diff -ruN ad_lustre_orig/ad_lustre.h ad_lustre/ad_lustre.h
540 --- ad_lustre_orig/ad_lustre.h  2008-09-17 14:36:57.000000000 +0800
541 +++ ad_lustre/ad_lustre.h       2008-10-17 17:11:11.000000000 +0800
542 @@ -1,9 +1,11 @@
543  /* -*- Mode: C; c-basic-offset:4 ; -*- */
544 -/* 
545 - *   Copyright (C) 1997 University of Chicago. 
546 +/*
547 + *   Copyright (C) 1997 University of Chicago.
548   *   See COPYRIGHT notice in top-level directory.
549   *
550   *   Copyright (C) 2007 Oak Ridge National Laboratory
551 + *
552 + *   Copyright (C) 2008 Sun Microsystems, Lustre group
553   */
554  
555  #ifndef AD_UNIX_INCLUDE
556 @@ -24,7 +26,32 @@
557  
558  /*#include <fcntl.h>*/
559  #include <sys/ioctl.h>
560 +#ifdef WITH_LUSTRE
561  #include "lustre/lustre_user.h"
562 +#else
563 +/* minimal definitions copied from lustre_user.h */
564 +#  define LOV_USER_MAGIC 0x0BD10BD0
565 +#  define LL_IOC_LOV_SETSTRIPE  _IOW ('f', 154, long)
566 +#  define LL_IOC_LOV_GETSTRIPE  _IOW ('f', 155, long)
567 +#  define lov_user_ost_data lov_user_ost_data_v1
568 +struct lov_user_ost_data_v1 {     /* per-stripe data structure */
569 +        __u64 l_object_id;        /* OST object ID */
570 +        __u64 l_object_gr;        /* OST object group (creating MDS number) */
571 +        __u32 l_ost_gen;          /* generation of this OST index */
572 +        __u32 l_ost_idx;          /* OST index in LOV */
573 +} __attribute__((packed));
574 +#define lov_user_md lov_user_md_v1
575 +struct lov_user_md_v1 {           /* LOV EA user data (host-endian) */
576 +        __u32 lmm_magic;          /* magic number = LOV_USER_MAGIC_V1 */
577 +        __u32 lmm_pattern;        /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
578 +        __u64 lmm_object_id;      /* LOV object ID */
579 +        __u64 lmm_object_gr;      /* LOV object group */
580 +        __u32 lmm_stripe_size;    /* size of stripe in bytes */
581 +        __u16 lmm_stripe_count;   /* num stripes in use for this object */
582 +        __u16 lmm_stripe_offset;  /* starting stripe offset in lmm_objects */
583 +        struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */
584 +} __attribute__((packed));
585 +#endif
586  #include "adio.h"
587  /*#include "adioi.h"*/
588  
589 @@ -41,24 +68,31 @@
590  
591  void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code);
592  void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code);
593 -void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count, 
594 -                      MPI_Datatype datatype, int file_ptr_type,
595 -                     ADIO_Offset offset, ADIO_Status *status, int
596 -                    *error_code);
597 -void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count, 
598 -                      MPI_Datatype datatype, int file_ptr_type,
599 -                      ADIO_Offset offset, ADIO_Status *status, int
600 -                     *error_code);   
601 +void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count,
602 +                             MPI_Datatype datatype, int file_ptr_type,
603 +                             ADIO_Offset offset, ADIO_Status *status,
604 +                             int *error_code);
605 +void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count,
606 +                              MPI_Datatype datatype, int file_ptr_type,
607 +                              ADIO_Offset offset, ADIO_Status *status,
608 +                              int *error_code);
609 +void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count,
610 +                              MPI_Datatype datatype, int file_ptr_type,
611 +                              ADIO_Offset offset, ADIO_Status *status,
612 +                              int *error_code);
613  void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count,
614 -                      MPI_Datatype datatype, int file_ptr_type,
615 -                      ADIO_Offset offset, ADIO_Status *status, int
616 -                      *error_code);
617 +                                  MPI_Datatype datatype, int file_ptr_type,
618 +                                  ADIO_Offset offset, ADIO_Status *status,
619 +                                   int *error_code);
620  void ADIOI_LUSTRE_ReadStridedColl(ADIO_File fd, void *buf, int count,
621 -                      MPI_Datatype datatype, int file_ptr_type,
622 -                      ADIO_Offset offset, ADIO_Status *status, int
623 -                      *error_code);
624 +                                 MPI_Datatype datatype, int file_ptr_type,
625 +                                 ADIO_Offset offset, ADIO_Status *status,
626 +                                  int *error_code);
627 +void ADIOI_LUSTRE_ReadStrided(ADIO_File fd, void *buf, int count,
628 +                             MPI_Datatype datatype, int file_ptr_type,
629 +                             ADIO_Offset offset, ADIO_Status *status,
630 +                              int *error_code);
631  void ADIOI_LUSTRE_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct,
632                        int *error_code);
633  void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
634 -
635  #endif /* End of AD_UNIX_INCLUDE */
636 diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c
637 --- ad_lustre_orig/ad_lustre_hints.c    2008-09-17 14:36:57.000000000 +0800
638 +++ ad_lustre/ad_lustre_hints.c 2008-10-15 21:31:00.000000000 +0800
639 @@ -1,9 +1,11 @@
640  /* -*- Mode: C; c-basic-offset:4 ; -*- */
641 -/* 
642 - *   Copyright (C) 1997 University of Chicago. 
643 +/*
644 + *   Copyright (C) 1997 University of Chicago.
645   *   See COPYRIGHT notice in top-level directory.
646   *
647   *   Copyright (C) 2007 Oak Ridge National Laboratory
648 + *
649 + *   Copyright (C) 2008 Sun Microsystems, Lustre group
650   */
651  
652  #include "ad_lustre.h"
653 @@ -11,130 +13,189 @@
654  
655  void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
656  {
657 -    char *value, *value_in_fd;
658 -    int flag, tmp_val[3], str_factor=-1, str_unit=0, start_iodev=-1;
659 -    struct lov_user_md lum = { 0 };
660 -    int err, myrank, fd_sys, perm, amode, old_mask;
661 +    char *value = NULL;
662 +    int flag, tmp_val, int_val, str_factor, str_unit, start_iodev;
663 +    static char myname[] = "ADIOI_LUSTRE_SETINFO";
664  
665 +    *error_code = MPI_SUCCESS;
666      value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
667 +
668      if ( (fd->info) == MPI_INFO_NULL) {
669 -       /* This must be part of the open call. can set striping parameters 
670 -           if necessary. */ 
671 +       /* This must be part of the open call. can set striping parameters
672 +           if necessary. */
673         MPI_Info_create(&(fd->info));
674  
675         MPI_Info_set(fd->info, "direct_read", "false");
676         MPI_Info_set(fd->info, "direct_write", "false");
677         fd->direct_read = fd->direct_write = 0;
678 -       
679 -       /* has user specified striping or server buffering parameters 
680 +
681 +       /* has user specified striping or server buffering parameters
682             and do they have the same value on all processes? */
683         if (users_info != MPI_INFO_NULL) {
684 -           MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, 
685 -                        value, &flag);
686 -           if (flag) 
687 -               str_unit=atoi(value);
688 -
689 -           MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, 
690 -                        value, &flag);
691 -           if (flag) 
692 -               str_factor=atoi(value);
693 -
694 -           MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL, 
695 +            /* direct read and write */
696 +           MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL,
697                          value, &flag);
698 -           if (flag) 
699 -               start_iodev=atoi(value);
700 -
701 -           MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL, 
702 -                            value, &flag);
703             if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) {
704                 MPI_Info_set(fd->info, "direct_read", "true");
705                 fd->direct_read = 1;
706             }
707 -
708 -           MPI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL, 
709 +           MPI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL,
710                              value, &flag);
711             if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) {
712                 MPI_Info_set(fd->info, "direct_write", "true");
713                 fd->direct_write = 1;
714             }
715 +            /*  stripe size */
716 +           MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
717 +                        value, &flag);
718 +           if (flag && (str_unit = atoi(value))) {
719 +               tmp_val = str_unit;
720 +               MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
721 +               if (tmp_val != str_unit) {
722 +                   MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
723 +                                                      "striping_unit",
724 +                                                      error_code);
725 +                    ADIOI_Free(value);
726 +                   return;
727 +               }
728 +               MPI_Info_set(fd->info, "striping_unit", value);
729 +           }
730 +            /* stripe count */
731 +           MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
732 +                        value, &flag);
733 +           if (flag && (str_factor = atoi(value))) {
734 +               tmp_val = str_factor;
735 +               MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
736 +               if (tmp_val != str_factor) {
737 +                   MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
738 +                                                      "striping_factor",
739 +                                                      error_code);
740 +                    ADIOI_Free(value);
741 +                   return;
742 +               }
743 +               MPI_Info_set(fd->info, "striping_factor", value);
744 +           }
745 +            /* stripe offset */
746 +            MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL,
747 +                        value, &flag);
748 +           if (flag && ((start_iodev = atoi(value)) >= 0)) {
749 +               tmp_val = start_iodev;
750 +               MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
751 +               if (tmp_val != start_iodev) {
752 +                   MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
753 +                                                      "start_iodevice",
754 +                                                      error_code);
755 +                    ADIOI_Free(value);
756 +                   return;
757 +               }
758 +               MPI_Info_set(fd->info, "start_iodevice", value);
759 +           }
760         }
761 -
762 -       MPI_Comm_rank(fd->comm, &myrank);
763 -       if (myrank == 0) {
764 -           tmp_val[0] = str_factor;
765 -           tmp_val[1] = str_unit;
766 -           tmp_val[2] = start_iodev;
767 +    }
768 +    if (users_info != MPI_INFO_NULL) {
769 +        /* CO: IO Clients/OST,
770 +         * to keep the load balancing between clients and OSTs */
771 +        MPI_Info_get(users_info, "CO", MPI_MAX_INFO_VAL, value,
772 +                     &flag);
773 +       if (flag && (int_val = atoi(value)) > 0) {
774 +            tmp_val = int_val;
775 +           MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
776 +           if (tmp_val != int_val) {
777 +                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
778 +                                                   "CO",
779 +                                                   error_code);
780 +                ADIOI_Free(value);
781 +               return;
782 +           }
783 +           MPI_Info_set(fd->info, "CO", value);
784         }
785 -       MPI_Bcast(tmp_val, 3, MPI_INT, 0, fd->comm);
786 -
787 -       if (tmp_val[0] != str_factor 
788 -               || tmp_val[1] != str_unit 
789 -               || tmp_val[2] != start_iodev) {
790 -           FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: All keys"
791 -                   "-striping_factor:striping_unit:start_iodevice "
792 -                   "need to be identical across all processes\n");
793 -           MPI_Abort(MPI_COMM_WORLD, 1);
794 -               } else if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0)) {
795 -            /* if user has specified striping info, process 0 tries to set it */
796 -           if (!myrank) {
797 -               if (fd->perm == ADIO_PERM_NULL) {
798 -                   old_mask = umask(022);
799 -                   umask(old_mask);
800 -                   perm = old_mask ^ 0666;
801 -               }
802 -               else perm = fd->perm;
803 -
804 -               amode = 0;
805 -               if (fd->access_mode & ADIO_CREATE)
806 -                   amode = amode | O_CREAT;
807 -               if (fd->access_mode & ADIO_RDONLY)
808 -                   amode = amode | O_RDONLY;
809 -               if (fd->access_mode & ADIO_WRONLY)
810 -                   amode = amode | O_WRONLY;
811 -               if (fd->access_mode & ADIO_RDWR)
812 -                   amode = amode | O_RDWR;
813 -               if (fd->access_mode & ADIO_EXCL)
814 -                   amode = amode | O_EXCL;
815 -
816 -               /* we need to create file so ensure this is set */
817 -               amode = amode | O_LOV_DELAY_CREATE | O_CREAT;
818 -
819 -               fd_sys = open(fd->filename, amode, perm);
820 -               if (fd_sys == -1) { 
821 -                   if (errno != EEXIST) 
822 -                       fprintf(stderr, 
823 -                               "Failure to open file %s %d %d\n",strerror(errno), amode, perm);
824 -               } else {
825 -                   lum.lmm_magic = LOV_USER_MAGIC;
826 -                   lum.lmm_pattern = 0;
827 -                   lum.lmm_stripe_size = str_unit;
828 -                   lum.lmm_stripe_count = str_factor;
829 -                   lum.lmm_stripe_offset = start_iodev;
830 -
831 -                   err = ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum);
832 -                   if (err == -1 && errno != EEXIST) { 
833 -                       fprintf(stderr, "Failure to set stripe info %s \n", strerror(errno));
834 -                   }
835 -                   close(fd_sys);
836 -              }
837 -           } /* End of striping parameters validation */
838 +        /* big_req_size:
839 +         * if the req size is bigger than this,
840 +         * collective IO may not be performed.
841 +         */
842 +       MPI_Info_get(users_info, "big_req_size", MPI_MAX_INFO_VAL, value,
843 +                     &flag);
844 +       if (flag && (int_val = atoi(value)) > 0) {
845 +            tmp_val = int_val;
846 +           MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
847 +           if (tmp_val != int_val) {
848 +               MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
849 +                                                  "big_req_size",
850 +                                                  error_code);
851 +                ADIOI_Free(value);
852 +               return;
853 +           }
854 +           MPI_Info_set(fd->info, "big_req_size", value);
855 +        }
856 +        /* ds_in_coll: disable data sieving in collective IO */
857 +       MPI_Info_get(users_info, "ds_in_coll", MPI_MAX_INFO_VAL,
858 +                    value, &flag);
859 +       if (flag && (!strcmp(value, "enable") ||
860 +                     !strcmp(value, "ENABLE"))) {
861 +            tmp_val = int_val = 1;
862 +           MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
863 +           if (tmp_val != int_val) {
864 +               MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
865 +                                                  "ds_in_coll",
866 +                                                  error_code);
867 +                ADIOI_Free(value);
868 +                return;
869 +           }
870 +           MPI_Info_set(fd->info, "ds_in_coll", "enable");
871 +       }
872 +        /* contiguous_data: whether the data are contiguous */
873 +       MPI_Info_get(users_info, "contiguous_data", MPI_MAX_INFO_VAL,
874 +                    value, &flag);
875 +        if (flag && (!strcmp(value, "yes") ||
876 +                     !strcmp(value, "YES"))) {
877 +            tmp_val = int_val = 1;
878 +           MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
879 +           if (tmp_val != int_val) {
880 +               MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
881 +                                                  "contiguous_data",
882 +                                                  error_code);
883 +                ADIOI_Free(value);
884 +                return;
885 +           }
886 +           MPI_Info_set(fd->info, "contiguous_data", "yes");
887         }
888 -       
889 -       MPI_Barrier(fd->comm);
890 -       /* set the values for collective I/O and data sieving parameters */
891 -       ADIOI_GEN_SetInfo(fd, users_info, error_code);
892 -    } else {
893 -       /* The file has been opened previously and fd->fd_sys is a valid
894 -           file descriptor. cannot set striping parameters now. */
895 -       
896 -       /* set the values for collective I/O and data sieving parameters */
897 -       ADIOI_GEN_SetInfo(fd, users_info, error_code);
898 +        /* same_io_size: whether the req size is same */
899 +       MPI_Info_get(users_info, "same_io_size", MPI_MAX_INFO_VAL,
900 +                    value, &flag);
901 +        if (flag && (!strcmp(value, "yes") ||
902 +                     !strcmp(value, "YES"))) {
903 +            tmp_val = int_val = 1;
904 +           MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
905 +           if (tmp_val != int_val) {
906 +               MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
907 +                                                  "same_io_size",
908 +                                                  error_code);
909 +                ADIOI_Free(value);
910 +                return;
911 +           }
912 +           MPI_Info_set(fd->info, "same_io_size", "yes");
913 +       }
914 +        /* Remember the cb_nodes value the user set (kept as "user_cb_nodes");
915 +         * it is used later to improve collective I/O.
916 +         */
917 +       MPI_Info_get(users_info, "cb_nodes", MPI_MAX_INFO_VAL, value, &flag);
918 +       if (flag && (int_val = atoi(value)) > 0) {
919 +            tmp_val = int_val;
920 +           MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
921 +           if (tmp_val != int_val) {
922 +               MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
923 +                                                  "cb_nodes",
924 +                                                  error_code);
925 +                ADIOI_Free(value);
926 +               return;
927 +           }
928 +           MPI_Info_set(fd->info, "user_cb_nodes", value);
929 +        }
930      }
931
932 -    if (ADIOI_Direct_read) fd->direct_read = 1;
933 -    if (ADIOI_Direct_write) fd->direct_write = 1;
934 -
935      ADIOI_Free(value);
936 +    /* set the values for collective I/O and data sieving parameters */
937 +    ADIOI_GEN_SetInfo(fd, users_info, error_code);
938  
939 -    *error_code = MPI_SUCCESS;
940 +    if (ADIOI_Direct_read) fd->direct_read = 1;
941 +    if (ADIOI_Direct_write) fd->direct_write = 1;
942  }
943 diff -ruN ad_lustre_orig/ad_lustre_open.c ad_lustre/ad_lustre_open.c
944 --- ad_lustre_orig/ad_lustre_open.c     2008-09-17 14:36:57.000000000 +0800
945 +++ ad_lustre/ad_lustre_open.c  2008-09-17 18:55:50.000000000 +0800
946 @@ -1,18 +1,21 @@
947  /* -*- Mode: C; c-basic-offset:4 ; -*- */
948 -/* 
949 - *   Copyright (C) 1997 University of Chicago. 
950 +/*
951 + *   Copyright (C) 1997 University of Chicago.
952   *   See COPYRIGHT notice in top-level directory.
953   *
954   *   Copyright (C) 2007 Oak Ridge National Laboratory
955 + *
956 + *   Copyright (C) 2008 Sun Microsystems, Lustre group
957   */
958  
959  #include "ad_lustre.h"
960  
961  void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code)
962  {
963 -    int perm, old_mask, amode, amode_direct;
964 +    int perm, old_mask, amode = 0, amode_direct = 0, flag = 0, err, myrank;
965 +    int stripe_size = 0, stripe_count = 0, stripe_offset = -1;
966      struct lov_user_md lum = { 0 };
967 -    char *value;
968 +    char *value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
969  
970  #if defined(MPICH2) || !defined(PRINT_ERR_MSG)
971      static char myname[] = "ADIOI_LUSTRE_OPEN";
972 @@ -22,12 +25,57 @@
973         old_mask = umask(022);
974         umask(old_mask);
975         perm = old_mask ^ 0666;
976 -    }
977 -    else perm = fd->perm;
978 +    } else
979 +       perm = fd->perm;
980  
981 -    amode = 0;
982 -    if (fd->access_mode & ADIO_CREATE)
983 +    if (fd->access_mode & ADIO_CREATE) {
984         amode = amode | O_CREAT;
985 +        /* Check striping info:
986 +         * values already set by SetInfo() are passed to Lustre through lum;
987 +         * otherwise the info is filled in from lum after the open */
988 +        MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value,
989 +                    &flag);
990 +        if (flag)
991 +           stripe_size = atoi(value);
992 +
993 +        MPI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, value,
994 +                    &flag);
995 +        if (flag)
996 +           stripe_count = atoi(value);
997 +
998 +        MPI_Info_get(fd->info, "start_iodevice", MPI_MAX_INFO_VAL, value,
999 +                    &flag);
1000 +        if (flag)
1001 +           stripe_offset = atoi(value);
1002 +
1003 +        /* if user has specified striping info,
1004 +         * process 0 will try to check and set it.
1005 +         */
1006 +        if ((stripe_size > 0) || (stripe_count > 0) || (stripe_offset >= 0)) {
1007 +           MPI_Comm_rank(fd->comm, &myrank);
1008 +           if (myrank == 0) {
1009 +               int fd_sys = open(fd->filename, amode, perm);
1010 +               if (fd_sys == -1) {
1011 +                   if (errno != EEXIST)
1012 +                       FPRINTF(stderr, "Failure to open file %s %d %d\n",
1013 +                               strerror(errno), amode, perm);
1014 +               } else {
1015 +                   lum.lmm_magic = LOV_USER_MAGIC;
1016 +                   lum.lmm_pattern = 1;
1017 +                   lum.lmm_stripe_size = stripe_size;
1018 +                   lum.lmm_stripe_count = stripe_count;
1019 +                   lum.lmm_stripe_offset = stripe_offset;
1020 +
1021 +                   if (ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum))
1022 +                       FPRINTF(stderr,
1023 +                               "Failure to set striping info to Lustre!\n");
1024 +                   close(fd_sys);
1025 +               }
1026 +           }
1027 +           MPI_Barrier(fd->comm);
1028 +        }
1029 +    }
1030 +
1031      if (fd->access_mode & ADIO_RDONLY)
1032         amode = amode | O_RDONLY;
1033      if (fd->access_mode & ADIO_WRONLY)
1034 @@ -42,32 +90,36 @@
1035      fd->fd_sys = open(fd->filename, amode|O_CREAT, perm);
1036  
1037      if (fd->fd_sys != -1) {
1038 -        int err;
1039 -
1040 -        value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
1041 -
1042          /* get file striping information and set it in info */
1043 -        lum.lmm_magic = LOV_USER_MAGIC;
1044 -        err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum);
1045 -
1046 -        if (!err) {
1047 -            sprintf(value, "%d", lum.lmm_stripe_size);
1048 -            MPI_Info_set(fd->info, "striping_unit", value);
1049 -
1050 -            sprintf(value, "%d", lum.lmm_stripe_count);
1051 -            MPI_Info_set(fd->info, "striping_factor", value);
1052 -
1053 -            sprintf(value, "%d", lum.lmm_stripe_offset);
1054 -            MPI_Info_set(fd->info, "start_iodevice", value);
1055 -        }
1056 -        ADIOI_Free(value);
1057 +       lum.lmm_magic = LOV_USER_MAGIC;
1058 +       err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum);
1059  
1060 +       if (!err) {
1061 +           if (lum.lmm_stripe_size && lum.lmm_stripe_count &&
1062 +                (lum.lmm_stripe_offset >= 0)) {
1063 +               sprintf(value, "%d", lum.lmm_stripe_size);
1064 +               MPI_Info_set(fd->info, "striping_unit", value);
1065 +
1066 +               sprintf(value, "%d", lum.lmm_stripe_count);
1067 +               MPI_Info_set(fd->info, "striping_factor", value);
1068 +
1069 +               sprintf(value, "%d", lum.lmm_stripe_offset);
1070 +               MPI_Info_set(fd->info, "start_iodevice", value);
1071 +           } else {
1072 +               FPRINTF(stderr, "Striping info is invalid!\n");
1073 +               ADIOI_Free(value);
1074 +               MPI_Abort(MPI_COMM_WORLD, 1);
1075 +           }
1076 +       } else {
1077 +           FPRINTF(stderr, "Failed to get striping info from Lustre!\n");
1078 +            ADIOI_Free(value);
1079 +           MPI_Abort(MPI_COMM_WORLD, 1);
1080 +       }
1081          if (fd->access_mode & ADIO_APPEND)
1082              fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
1083 -    } 
1084 -
1085 +    }
1086      if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
1087 -       fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
1088 +        fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
1089  
1090      fd->fd_direct = -1;
1091      if (fd->direct_write || fd->direct_read) {
1092 @@ -81,20 +133,22 @@
1093      }
1094  
1095      /* --BEGIN ERROR HANDLING-- */
1096 -    if (fd->fd_sys == -1 || ((fd->fd_direct == -1) && 
1097 -               (fd->direct_write || fd->direct_read))) {
1098 +    if (fd->fd_sys == -1 || ((fd->fd_direct == -1) &&
1099 +       (fd->direct_write || fd->direct_read))) {
1100         if (errno == ENAMETOOLONG)
1101             *error_code = MPIO_Err_create_code(MPI_SUCCESS,
1102 -                                              MPIR_ERR_RECOVERABLE, myname,
1103 -                                              __LINE__, MPI_ERR_BAD_FILE,
1104 +                                              MPIR_ERR_RECOVERABLE,
1105 +                                              myname, __LINE__,
1106 +                                              MPI_ERR_BAD_FILE,
1107                                                "**filenamelong",
1108                                                "**filenamelong %s %d",
1109                                                fd->filename,
1110                                                strlen(fd->filename));
1111         else if (errno == ENOENT)
1112             *error_code = MPIO_Err_create_code(MPI_SUCCESS,
1113 -                                              MPIR_ERR_RECOVERABLE, myname,
1114 -                                              __LINE__, MPI_ERR_NO_SUCH_FILE,
1115 +                                              MPIR_ERR_RECOVERABLE,
1116 +                                              myname, __LINE__,
1117 +                                              MPI_ERR_NO_SUCH_FILE,
1118                                                "**filenoexist",
1119                                                "**filenoexist %s",
1120                                                fd->filename);
1121 @@ -108,27 +162,30 @@
1122                                                fd->filename);
1123         else if (errno == EACCES) {
1124             *error_code = MPIO_Err_create_code(MPI_SUCCESS,
1125 -                                              MPIR_ERR_RECOVERABLE, myname,
1126 -                                              __LINE__, MPI_ERR_ACCESS,
1127 +                                              MPIR_ERR_RECOVERABLE,
1128 +                                              myname, __LINE__,
1129 +                                              MPI_ERR_ACCESS,
1130                                                "**fileaccess",
1131 -                                              "**fileaccess %s", 
1132 -                                              fd->filename );
1133 -       }
1134 -       else if (errno == EROFS) {
1135 +                                              "**fileaccess %s",
1136 +                                              fd->filename);
1137 +       } else if (errno == EROFS) {
1138             /* Read only file or file system and write access requested */
1139             *error_code = MPIO_Err_create_code(MPI_SUCCESS,
1140 -                                              MPIR_ERR_RECOVERABLE, myname,
1141 -                                              __LINE__, MPI_ERR_READ_ONLY,
1142 -                                              "**ioneedrd", 0 );
1143 -       }
1144 -       else {
1145 +                                              MPIR_ERR_RECOVERABLE,
1146 +                                              myname, __LINE__,
1147 +                                              MPI_ERR_READ_ONLY,
1148 +                                              "**ioneedrd", 0);
1149 +       } else {
1150             *error_code = MPIO_Err_create_code(MPI_SUCCESS,
1151 -                                              MPIR_ERR_RECOVERABLE, myname,
1152 -                                              __LINE__, MPI_ERR_IO, "**io",
1153 +                                              MPIR_ERR_RECOVERABLE,
1154 +                                              myname, __LINE__,
1155 +                                              MPI_ERR_IO, "**io",
1156                                                "**io %s", strerror(errno));
1157         }
1158 -    }
1159 +    } else {
1160      /* --END ERROR HANDLING-- */
1161 -    else *error_code = MPI_SUCCESS;
1162 +        *error_code = MPI_SUCCESS;
1163 +    }
1164  
1165 +    ADIOI_Free(value);
1166  }
1167 diff -ruN ad_lustre_orig/ad_lustre_rwcontig.c ad_lustre/ad_lustre_rwcontig.c
1168 --- ad_lustre_orig/ad_lustre_rwcontig.c 2008-09-17 14:36:57.000000000 +0800
1169 +++ ad_lustre/ad_lustre_rwcontig.c      2008-10-15 22:44:35.000000000 +0800
1170 @@ -1,9 +1,11 @@
1171  /* -*- Mode: C; c-basic-offset:4 ; -*- */
1172 -/* 
1173 - *   Copyright (C) 1997 University of Chicago. 
1174 +/*
1175 + *   Copyright (C) 1997 University of Chicago.
1176   *   See COPYRIGHT notice in top-level directory.
1177   *
1178   *   Copyright (C) 2007 Oak Ridge National Laboratory
1179 + *
1180 + *   Copyright (C) 2008 Sun Microsystems, Lustre group
1181   */
1182  
1183  #define _XOPEN_SOURCE 600
1184 diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
1185 --- ad_lustre_orig/ad_lustre_wrcoll.c   1970-01-01 08:00:00.000000000 +0800
1186 +++ ad_lustre/ad_lustre_wrcoll.c        2008-10-17 16:34:36.000000000 +0800
1187 @@ -0,0 +1,880 @@
1188 +/* -*- Mode: C; c-basic-offset:4 ; -*- */
1189 +/*
1190 + *   Copyright (C) 1997 University of Chicago.
1191 + *   See COPYRIGHT notice in top-level directory.
1192 + *
1193 + *   Copyright (C) 2007 Oak Ridge National Laboratory
1194 + *
1195 + *   Copyright (C) 2008 Sun Microsystems, Lustre group
1196 + */
1197 +
1198 +#include "ad_lustre.h"
1199 +#include "adio_extern.h"
1200 +
1201 +/* prototypes of functions used for collective writes only. */
1202 +static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf,
1203 +                                       MPI_Datatype datatype, int nprocs,
1204 +                                       int myrank,
1205 +                                       ADIOI_Access *others_req,
1206 +                                       ADIOI_Access *my_req,
1207 +                                       ADIO_Offset *offset_list,
1208 +                                       int *len_list,
1209 +                                       int contig_access_count,
1210 +                                       int * striping_info,
1211 +                                       int *buf_idx, int *error_code);
1212 +static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf,
1213 +                                         ADIOI_Flatlist_node * flat_buf,
1214 +                                         char **send_buf,
1215 +                                         ADIO_Offset * offset_list,
1216 +                                         int *len_list, int *send_size,
1217 +                                         MPI_Request * requests,
1218 +                                         int *sent_to_proc, int nprocs,
1219 +                                         int myrank, int contig_access_count,
1220 +                                         int * striping_info,
1221 +                                         int *send_buf_idx,
1222 +                                          int *curr_to_proc,
1223 +                                         int *done_to_proc, int iter,
1224 +                                         MPI_Aint buftype_extent);
1225 +static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
1226 +                                        char *write_buf,
1227 +                                        ADIOI_Flatlist_node * flat_buf,
1228 +                                        ADIO_Offset * offset_list,
1229 +                                        int *len_list, int *send_size,
1230 +                                        int *recv_size, ADIO_Offset off,
1231 +                                        int size, int *count,
1232 +                                        int *start_pos, int *partial_recv,
1233 +                                        int *sent_to_proc, int nprocs,
1234 +                                        int myrank, int buftype_is_contig,
1235 +                                        int contig_access_count,
1236 +                                        int * striping_info,
1237 +                                        ADIOI_Access * others_req,
1238 +                                        int *send_buf_idx,
1239 +                                        int *curr_to_proc,
1240 +                                        int *done_to_proc, int *hole,
1241 +                                        int iter, MPI_Aint buftype_extent,
1242 +                                        int *buf_idx, int *error_code);
1243 +void ADIOI_Heap_merge(ADIOI_Access * others_req, int *count,
1244 +                      ADIO_Offset * srt_off, int *srt_len, int *start_pos,
1245 +                      int nprocs, int nprocs_recv, int total_elements);
1246 +
1247 +void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count,
1248 +                                  MPI_Datatype datatype,
1249 +                                  int file_ptr_type, ADIO_Offset offset,
1250 +                                  ADIO_Status * status, int *error_code)
1251 +{
1252 +    ADIOI_Access *my_req;
1253 +    /* array of nprocs access structures, one for each other process that
1254 +       holds part of this process's request */
1255 +
1256 +    ADIOI_Access *others_req;
1257 +    /* array of nprocs access structures, one for each other process
1258 +       whose request is written by this process. */
1259 +
1260 +    int i, filetype_is_contig, nprocs, myrank, do_collect = 0;
1261 +    int contig_access_count = 0, buftype_is_contig, interleave_count = 0;
1262 +    int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs;
1263 +    ADIO_Offset orig_fp, start_offset, end_offset, off;
1264 +    ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *end_offsets = NULL;
1265 +    int *buf_idx = NULL, *len_list = NULL, *striping_info = NULL;
1266 +    int old_error, tmp_error;
1267 +
1268 +    MPI_Comm_size(fd->comm, &nprocs);
1269 +    MPI_Comm_rank(fd->comm, &myrank);
1270 +
1271 +    orig_fp = fd->fp_ind;
1272 +
1273 +    /* IO pattern identification if cb_write isn't disabled */
1274 +    if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
1275 +       /* For this process's request, calculate the list of offsets and
1276 +          lengths in the file and determine the start and end offsets. */
1277 +       ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
1278 +                             &offset_list, &len_list, &start_offset,
1279 +                             &end_offset, &contig_access_count);
1280 +
1281 +       /* each process communicates its start and end offsets to other
1282 +          processes. The result is an array each of start and end offsets stored
1283 +          in order of process rank. */
1284 +       st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
1285 +       end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
1286 +       MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1,
1287 +                     ADIO_OFFSET, fd->comm);
1288 +       MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1,
1289 +                     ADIO_OFFSET, fd->comm);
1290 +       /* are the accesses of different processes interleaved? */
1291 +       for (i = 1; i < nprocs; i++)
1292 +           if ((st_offsets[i] < end_offsets[i-1]) &&
1293 +                (st_offsets[i] <= end_offsets[i]))
1294 +                interleave_count++;
1295 +       /* This is a rudimentary check for interleaving, but should suffice
1296 +          for the moment. */
1297 +
1298 +       /* Two typical access patterns can benefit from collective write.
1299 +         *   1) the processes are interleaved, and
1300 +         *   2) the req size is small.
1301 +         */
1302 +        if (interleave_count > 0) {
1303 +           do_collect = 1;
1304 +        } else {
1305 +            do_collect = ADIOI_LUSTRE_Docollect(fd, contig_access_count,
1306 +                                               len_list, nprocs);
1307 +        }
1308 +    }
1309 +    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
1310 +
1311 +    /* Decide if collective I/O should be done */
1312 +    if ((!do_collect && fd->hints->cb_write == ADIOI_HINT_AUTO) ||
1313 +        fd->hints->cb_write == ADIOI_HINT_DISABLE) {
1314 +
1315 +       int filerange_is_contig = 0;
1316 +
1317 +       /* use independent accesses */
1318 +       if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
1319 +           ADIOI_Free(offset_list);
1320 +           ADIOI_Free(len_list);
1321 +            ADIOI_Free(st_offsets);
1322 +            ADIOI_Free(end_offsets);
1323 +       }
1324 +
1325 +       fd->fp_ind = orig_fp;
1326 +       ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
1327 +       if (buftype_is_contig && filetype_is_contig) {
1328 +           if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
1329 +               off = fd->disp + (fd->etype_size) * offset;
1330 +               ADIO_WriteContig(fd, buf, count, datatype,
1331 +                                ADIO_EXPLICIT_OFFSET,
1332 +                                off, status, error_code);
1333 +           } else
1334 +               ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
1335 +                                0, status, error_code);
1336 +       } else {
1337 +           ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type,
1338 +                             offset, status, error_code);
1339 +       }
1340 +       return;
1341 +    }
1342 +
1343 +    /* Get Lustre hints information */
1344 +    ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1);
1345 +    /* calculate what portions of the access requests of this process are
1346 +     * located in which process
1347 +     */
1348 +    ADIOI_LUSTRE_Calc_my_req(fd, offset_list, len_list, contig_access_count,
1349 +                             striping_info, nprocs, &count_my_req_procs,
1350 +                             &count_my_req_per_proc, &my_req, &buf_idx);
1351 +    /* calculate what process's requests will be written by this process */
1352 +    ADIOI_LUSTRE_Calc_others_req(fd, count_my_req_procs,
1353 +                                 count_my_req_per_proc,
1354 +                                my_req, nprocs, myrank,
1355 +                                 start_offset, end_offset, striping_info,
1356 +                                 &count_others_req_procs, &others_req);
1357 +    ADIOI_Free(count_my_req_per_proc);
1358 +
1359 +    /* exchange data and write in sizes of no more than stripe_size. */
1360 +    ADIOI_LUSTRE_Exch_and_write(fd, buf, datatype, nprocs, myrank,
1361 +                                others_req, my_req,
1362 +                                offset_list, len_list, contig_access_count,
1363 +                               striping_info, buf_idx, error_code);
1364 +
1365 +    old_error = *error_code;
1366 +    if (*error_code != MPI_SUCCESS)
1367 +       *error_code = MPI_ERR_IO;
1368 +
1369 +    /* optimization: if only one process performing i/o, we can perform
1370 +     * a less-expensive Bcast  */
1371 +#ifdef ADIOI_MPE_LOGGING
1372 +    MPE_Log_event(ADIOI_MPE_postwrite_a, 0, NULL);
1373 +#endif
1374 +    if (fd->hints->cb_nodes == 1)
1375 +       MPI_Bcast(error_code, 1, MPI_INT,
1376 +                 fd->hints->ranklist[0], fd->comm);
1377 +    else {
1378 +       tmp_error = *error_code;
1379 +       MPI_Allreduce(&tmp_error, error_code, 1, MPI_INT,
1380 +                     MPI_MAX, fd->comm);
1381 +    }
1382 +#ifdef ADIOI_MPE_LOGGING
1383 +    MPE_Log_event(ADIOI_MPE_postwrite_b, 0, NULL);
1384 +#endif
1385 +
1386 +    if ((old_error != MPI_SUCCESS) && (old_error != MPI_ERR_IO))
1387 +       *error_code = old_error;
1388 +
1389 +
1390 +    if (!buftype_is_contig)
1391 +       ADIOI_Delete_flattened(datatype);
1392 +
1393 +    /* free all memory allocated for collective I/O */
1394 +    /* free others_req */
1395 +    for (i = 0; i < nprocs; i++) {
1396 +       if (others_req[i].count) {
1397 +           ADIOI_Free(others_req[i].offsets);
1398 +           ADIOI_Free(others_req[i].lens);
1399 +           ADIOI_Free(others_req[i].mem_ptrs);
1400 +       }
1401 +    }
1402 +    ADIOI_Free(others_req);
1403 +    /* free my_req here */
1404 +    for (i = 0; i < nprocs; i++) {
1405 +       if (my_req[i].count) {
1406 +           ADIOI_Free(my_req[i].offsets);
1407 +           ADIOI_Free(my_req[i].lens);
1408 +       }
1409 +    }
1410 +    ADIOI_Free(my_req);
1411 +    ADIOI_Free(buf_idx);
1412 +    ADIOI_Free(offset_list);
1413 +    ADIOI_Free(len_list);
1414 +    ADIOI_Free(st_offsets);
1415 +    ADIOI_Free(end_offsets);
1416 +    ADIOI_Free(striping_info);
1417 +
1418 +#ifdef HAVE_STATUS_SET_BYTES
1419 +    if (status) {
1420 +       int bufsize, size;
1421 +       /* Don't set status if it isn't needed */
1422 +       MPI_Type_size(datatype, &size);
1423 +       bufsize = size * count;
1424 +       MPIR_Status_set_bytes(status, datatype, bufsize);
1425 +    }
1426 +    /* This is a temporary way of filling in status. The right way is to
1427 +     * keep track of how much data was actually written during collective I/O.
1428 +     */
1429 +#endif
1430 +
1431 +    fd->fp_sys_posn = -1;      /* set it to null. */
1432 +}
1433 +/* Exchange data with the other processes and write what this process receives, at most stripe_size bytes per iteration. */
1434 +static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf,
1435 +                                       MPI_Datatype datatype, int nprocs,
1436 +                                       int myrank, ADIOI_Access *others_req,
1437 +                                        ADIOI_Access *my_req,
1438 +                                       ADIO_Offset *offset_list,
1439 +                                        int *len_list, int contig_access_count,
1440 +                                       int *striping_info, int *buf_idx,
1441 +                                        int *error_code)
1442 +{
1443 +    int hole, i, j, m, flag, ntimes = 1 , max_ntimes, buftype_is_contig;
1444 +    ADIO_Offset st_loc = -1, end_loc = -1, min_st_loc, max_end_loc;
1445 +    ADIO_Offset off, req_off, send_off, iter_st_off, *off_list;
1446 +    ADIO_Offset max_size, step_size = 0;
1447 +    int real_size, req_len, send_len;
1448 +    int *recv_curr_offlen_ptr, *recv_count, *recv_size;
1449 +    int *send_curr_offlen_ptr, *send_size;
1450 +    int *partial_recv = NULL, *sent_to_proc, *recv_start_pos; /* partial_recv: never allocated */
1451 +    int *send_buf_idx, *curr_to_proc, *done_to_proc;
1452 +    char *write_buf = NULL, *value;
1453 +    MPI_Status status;
1454 +    ADIOI_Flatlist_node *flat_buf = NULL;
1455 +    MPI_Aint buftype_extent;
1456 +    int stripe_size = striping_info[0], avail_cb_nodes = striping_info[2];
1457 +    int lflag, data_sieving = 0;
1458 +
1459 +    *error_code = MPI_SUCCESS; /* changed below if error */
1460 +
1461 +    /* calculate the number of writes of stripe size to be done.
1462 +     * That gives the no. of communication phases as well.
1463 +     * Note:
1464 +     *   Because we redistribute data in stripe-contiguous pattern for Lustre,
1465 +     *   each process has the same no. of communication phases.
1466 +     */
1467 +
1468 +    for (i = 0; i < nprocs; i++) {
1469 +       if (others_req[i].count) {
1470 +           st_loc = others_req[i].offsets[0];
1471 +           end_loc = others_req[i].offsets[0];
1472 +           break;
1473 +       }
1474 +    }
1475 +    for (i = 0; i < nprocs; i++) {
1476 +       for (j = 0; j < others_req[i].count; j++) {
1477 +           st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]);
1478 +           end_loc = ADIOI_MAX(end_loc, (others_req[i].offsets[j] +
1479 +                                          others_req[i].lens[j] - 1));
1480 +       }
1481 +    }
1482 +    /* this process does no writing. */
1483 +    if ((st_loc == -1) && (end_loc == -1))
1484 +       ntimes = 0;
1485 +    MPI_Allreduce(&end_loc, &max_end_loc, 1, MPI_LONG_LONG_INT, MPI_MAX, fd->comm);
1486 +    /* avoid min_st_loc being -1 */
1487 +    if (st_loc == -1)
1488 +        st_loc = max_end_loc;
1489 +    MPI_Allreduce(&st_loc, &min_st_loc, 1, MPI_LONG_LONG_INT, MPI_MIN, fd->comm);
1490 +    /* align downward */
1491 +    min_st_loc -= min_st_loc % (ADIO_Offset)stripe_size;
1492 +
1493 +    /* Each time, only avail_cb_nodes number of IO clients perform IO,
1494 +     * so, step_size=avail_cb_nodes*stripe_size IO will be performed at most,
1495 +     * and max_ntimes=whole_file_portion/step_size+1
1496 +     */
1497 +    step_size = (ADIO_Offset) avail_cb_nodes * stripe_size;
1498 +    max_ntimes = (int)((max_end_loc - min_st_loc) / step_size + 1);
1499 +    if (ntimes)
1500 +       write_buf = (char *) ADIOI_Malloc(stripe_size);
1501 +
1502 +    /* calculate the start offset for each iteration */
1503 +    off_list = (ADIO_Offset *) ADIOI_Malloc(max_ntimes * sizeof(ADIO_Offset));
1504 +    for (m = 0; m < max_ntimes; m ++)
1505 +        off_list[m] = max_end_loc;
1506 +    for (i = 0; i < nprocs; i++) {
1507 +        for (j = 0; j < others_req[i].count; j ++) {
1508 +            req_off = others_req[i].offsets[j];
1509 +            m = (int)((req_off - min_st_loc) / step_size);
1510 +            off_list[m] = ADIOI_MIN(off_list[m], req_off);
1511 +        }
1512 +    }
1513 +
1514 +    recv_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
1515 +    send_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
1516 +    /* their use is explained below. calloc initializes to 0. */
1517 +
1518 +    recv_count = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1519 +    /* to store count of how many off-len pairs per proc are satisfied
1520 +       in an iteration. */
1521 +
1522 +    send_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1523 +    /* total size of data to be sent to each proc. in an iteration.
1524 +       Of size nprocs so that I can use MPI_Alltoall later. */
1525 +
1526 +    recv_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1527 +    /* total size of data to be recd. from each proc. in an iteration. */
1528 +
1529 +    sent_to_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int));
1530 +    /* amount of data sent to each proc so far. Used in
1531 +       ADIOI_LUSTRE_Fill_send_buffer. initialized to 0 here. */
1532 +
1533 +    send_buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1534 +    curr_to_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1535 +    done_to_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1536 +    /* Above three are used in ADIOI_LUSTRE_Fill_send_buffer */
1537 +
1538 +    recv_start_pos = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1539 +    /* used to store the starting value of recv_curr_offlen_ptr[i] in
1540 +       this iteration */
1541 +
1542 +    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
1543 +    if (!buftype_is_contig) {
1544 +       ADIOI_Flatten_datatype(datatype);
1545 +       flat_buf = ADIOI_Flatlist;
1546 +       while (flat_buf->type != datatype)
1547 +           flat_buf = flat_buf->next;
1548 +    }
1549 +    MPI_Type_extent(datatype, &buftype_extent);
1550 +
1551 +    iter_st_off = min_st_loc;
1552 +
1553 +    /* Although we have reorganized the data according to OST index,
1554 +     * a read-modify-write will be done if there is a hole between the data.
1555 +     * For example: if blocksize=60, xfersize=30 and stripe_size=100,
1556 +     * then rank0 will collect data [0, 30] and [60, 90] then write. There
1557 +     * is a hole in [30, 60], which will cause a read-modify-write in [0, 90].
1558 +     *
1559 +     * To reduce its impact on performance, we disable data sieving
1560 +     * by default, unless the hint "ds_in_coll" is enabled.
1561 +     */
1562 +    /* check the hint for data sieving */
1563 +    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
1564 +    MPI_Info_get(fd->info, "ds_in_coll", MPI_MAX_INFO_VAL, value, &lflag);
1565 +    if (lflag && !strcmp(value, "enable"))
1566 +        data_sieving = 1;
1567 +    ADIOI_Free(value);
1568 +
1569 +    for (m = 0; m < max_ntimes; m++) {
1570 +       /* go through all others_req and my_req to check which will be received
1571 +         * and sent in this iteration.
1572 +         */
1573 +
1574 +       /* Note that MPI guarantees that displacements in filetypes are in
1575 +          monotonically nondecreasing order and that, for writes, the
1576 +          filetypes cannot specify overlapping regions in the file. This
1577 +          simplifies implementation a bit compared to reads. */
1578 +
1579 +       /*
1580 +           off         = start offset in the file for the data to be written in
1581 +                         this iteration
1582 +           iter_st_off = start offset of this iteration
1583 +           real_size   = size of data written (bytes) corresponding to off
1584 +           max_size    = possible maximum size of data written in this iteration
1585 +           req_off     = offset in the file for a particular contiguous request minus
1586 +                         what was satisfied in previous iteration
1587 +           send_off    = offset of the request needed by other processes in this iteration
1588 +           req_len     = size corresponding to req_off
1589 +           send_len    = size corresponding to send_off
1590 +         */
1591 +
1592 +       /* first calculate what should be communicated */
1593 +       for (i = 0; i < nprocs; i++)
1594 +           recv_count[i] = recv_size[i] = send_size[i] = 0;
1595 +
1596 +        off = off_list[m];
1597 +        max_size = ADIOI_MIN(step_size, max_end_loc - iter_st_off + 1);
1598 +        real_size = (int) ADIOI_MIN((off / stripe_size + 1) * stripe_size - off,
1599 +                                    end_loc - off + 1);
1600 +
1601 +       for (i = 0; i < nprocs; i++) {
1602 +            if (my_req[i].count) {
1603 +                for (j = send_curr_offlen_ptr[i]; j < my_req[i].count; j++) {
1604 +                    send_off = my_req[i].offsets[j];
1605 +                    send_len = my_req[i].lens[j];
1606 +                    if (send_off < iter_st_off + max_size) {
1607 +                        send_size[i] += send_len;
1608 +                    } else {
1609 +                        break;
1610 +                    }
1611 +                }
1612 +                send_curr_offlen_ptr[i] = j;
1613 +            }
1614 +           if (others_req[i].count) {
1615 +               recv_start_pos[i] = recv_curr_offlen_ptr[i];
1616 +               for (j = recv_curr_offlen_ptr[i]; j < others_req[i].count; j++) {
1617 +                    req_off = others_req[i].offsets[j];
1618 +                    req_len = others_req[i].lens[j];
1619 +                   if (req_off < iter_st_off + max_size) {
1620 +                       recv_count[i]++;
1621 +                       MPI_Address(write_buf + req_off - off,
1622 +                                   &(others_req[i].mem_ptrs[j]));
1623 +                        recv_size[i] += req_len;
1624 +                   } else {
1625 +                       break;
1626 +                    }
1627 +               }
1628 +               recv_curr_offlen_ptr[i] = j;
1629 +           }
1630 +       }
1631 +        /* use variable "hole" to pass data_sieving flag into W_Exchange_data */
1632 +        hole = data_sieving;
1633 +       ADIOI_LUSTRE_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
1634 +                                     len_list, send_size, recv_size, off, real_size,
1635 +                                     recv_count, recv_start_pos, partial_recv,
1636 +                                     sent_to_proc, nprocs, myrank,
1637 +                                     buftype_is_contig, contig_access_count,
1638 +                                     striping_info, others_req, send_buf_idx,
1639 +                                     curr_to_proc, done_to_proc, &hole, m,
1640 +                                     buftype_extent, buf_idx, error_code);
1641 +       if (*error_code != MPI_SUCCESS)
1642 +            goto over;
1643 +
1644 +       flag = 0;
1645 +       for (i = 0; i < nprocs; i++)
1646 +           if (recv_count[i]) {
1647 +               flag = 1;
1648 +               break;
1649 +           }
1650 +       if (flag) {
1651 +            /* check whether to do data sieving */
1652 +            if(data_sieving) {
1653 +               ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
1654 +                                ADIO_EXPLICIT_OFFSET, off, &status,
1655 +                                error_code);
1656 +            } else {
1657 +                /* if there is no hole, write the data with a single call;
1658 +                 * otherwise, write each received request separately */
1659 +                if (!hole) {
1660 +                    ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
1661 +                                     ADIO_EXPLICIT_OFFSET, off, &status,
1662 +                                     error_code);
1663 +                } else {
1664 +                    for (i = 0; i < nprocs; i++) {
1665 +                        if (others_req[i].count) {
1666 +                            for (j = 0; j < others_req[i].count; j++) {
1667 +                                if (others_req[i].offsets[j] < off + real_size &&
1668 +                                    others_req[i].offsets[j] >= off) {
1669 +                                    ADIO_WriteContig(fd,
1670 +                                                     write_buf + others_req[i].offsets[j] - off,
1671 +                                                     others_req[i].lens[j],
1672 +                                                     MPI_BYTE, ADIO_EXPLICIT_OFFSET,
1673 +                                                     others_req[i].offsets[j], &status,
1674 +                                                     error_code);
1675 +                                   if (*error_code != MPI_SUCCESS)
1676 +                                       goto over;
1677 +                                }
1678 +                            }
1679 +                        }
1680 +                    }
1681 +                }
1682 +            }
1683 +           if (*error_code != MPI_SUCCESS)
1684 +               goto over;
1685 +       }
1686 +        iter_st_off += max_size;
1687 +    }
1688 +over:   /* shared cleanup for the normal and the error paths */
1689 +    if (ntimes)
1690 +       ADIOI_Free(write_buf);
1691 +    ADIOI_Free(recv_curr_offlen_ptr);
1692 +    ADIOI_Free(send_curr_offlen_ptr);
1693 +    ADIOI_Free(recv_count);
1694 +    ADIOI_Free(send_size);
1695 +    ADIOI_Free(recv_size);
1696 +    ADIOI_Free(sent_to_proc);
1697 +    ADIOI_Free(recv_start_pos);
1698 +    ADIOI_Free(send_buf_idx);
1699 +    ADIOI_Free(curr_to_proc);
1700 +    ADIOI_Free(done_to_proc);
1701 +    ADIOI_Free(off_list);
1702 +}
1703 +
1704 +static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
1705 +                                        char *write_buf,
1706 +                                        ADIOI_Flatlist_node * flat_buf,
1707 +                                        ADIO_Offset * offset_list,
1708 +                                        int *len_list, int *send_size,
1709 +                                        int *recv_size, ADIO_Offset off,
1710 +                                        int size, int *count,
1711 +                                        int *start_pos, int *partial_recv,
1712 +                                        int *sent_to_proc, int nprocs,
1713 +                                        int myrank, int buftype_is_contig,
1714 +                                        int contig_access_count,
1715 +                                        int * striping_info,
1716 +                                        ADIOI_Access * others_req,
1717 +                                        int *send_buf_idx,
1718 +                                        int *curr_to_proc, int *done_to_proc,
1719 +                                         int *hole, int iter,
1720 +                                         MPI_Aint buftype_extent,
1721 +                                        int *buf_idx, int *error_code)
1722 +{
1723 +    int i, j, nprocs_recv, nprocs_send, err;
1724 +    char **send_buf = NULL;
1725 +    MPI_Request *requests, *send_req;
1726 +    MPI_Datatype *recv_types;
1727 +    MPI_Status *statuses, status;
1728 +    int *srt_len, sum, sum_recv;
1729 +    ADIO_Offset *srt_off;
1730 +    int data_sieving = *hole;
1731 +    static char myname[] = "ADIOI_W_EXCHANGE_DATA";
1732 +
1733 +    /* create derived datatypes for recv */
1734 +    nprocs_recv = 0;
1735 +    for (i = 0; i < nprocs; i++)
1736 +       if (recv_size[i])
1737 +           nprocs_recv++;
1738 +
1739 +    recv_types = (MPI_Datatype *) ADIOI_Malloc((nprocs_recv + 1) *
1740 +                                              sizeof(MPI_Datatype));
1741 +    /* +1 to avoid a 0-size malloc */
1742 +
1743 +    j = 0;
1744 +    for (i = 0; i < nprocs; i++) {
1745 +       if (recv_size[i]) {
1746 +           MPI_Type_hindexed(count[i],
1747 +                             &(others_req[i].lens[start_pos[i]]),
1748 +                             &(others_req[i].mem_ptrs[start_pos[i]]),
1749 +                             MPI_BYTE, recv_types + j);
1750 +           /* absolute displacements; use MPI_BOTTOM in recv */
1751 +           MPI_Type_commit(recv_types + j);
1752 +           j++;
1753 +       }
1754 +    }
1755 +
1756 +    /* To avoid a read-modify-write,
1757 +     * check if there are holes in the data to be written.
1758 +     * For this, merge the (sorted) offset lists others_req using a heap-merge.
1759 +     */
1760 +
1761 +    sum = 0;
1762 +    for (i = 0; i < nprocs; i++)
1763 +       sum += count[i];
1764 +    srt_off = (ADIO_Offset *) ADIOI_Malloc((sum + 1) * sizeof(ADIO_Offset));
1765 +    srt_len = (int *) ADIOI_Malloc((sum + 1) * sizeof(int));
1766 +    /* +1 to avoid a 0-size malloc */
1767 +
1768 +    ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos,
1769 +                    nprocs, nprocs_recv, sum);
1770 +
1771 +    /* check if there are any holes */
1772 +    *hole = 0;
1773 +    for (i = 0; i < sum - 1; i++) {
1774 +        if (srt_off[i] + srt_len[i] < srt_off[i + 1]) {
1775 +            *hole = 1;
1776 +           break;
1777 +       }
1778 +    }
1779 +    /* In some cases (see John Bent ROMIO REQ # 835), an odd interaction
1780 +     * between aggregation, nominally contiguous regions, and cb_buffer_size
1781 +     * should be handled with a read-modify-write (otherwise we will write out
1782 +     * more data than we receive from everyone else (inclusive), so override
1783 +     * hole detection
1784 +     */
1785 +    if (*hole == 0) {
1786 +        sum_recv = 0;
1787 +        for (i = 0; i < nprocs; i++)
1788 +            sum_recv += recv_size[i];
1789 +       if (size > sum_recv)
1790 +           *hole = 1;
1791 +    }
1792 +    /* check the hint for data sieving */
1793 +    if (data_sieving && nprocs_recv && *hole) {
1794 +        ADIO_ReadContig(fd, write_buf, size, MPI_BYTE,
1795 +                        ADIO_EXPLICIT_OFFSET, off, &status, &err);
1796 +        // --BEGIN ERROR HANDLING--
1797 +        if (err != MPI_SUCCESS) {
1798 +            *error_code = MPIO_Err_create_code(err,
1799 +                                               MPIR_ERR_RECOVERABLE,
1800 +                                               myname, __LINE__,
1801 +                                               MPI_ERR_IO,
1802 +                                               "**ioRMWrdwr", 0);
1803 +            ADIOI_Free(recv_types);
1804 +            ADIOI_Free(srt_off);
1805 +            ADIOI_Free(srt_len);
1806 +            return;
1807 +        }
1808 +        // --END ERROR HANDLING--
1809 +    }
1810 +    ADIOI_Free(srt_off);
1811 +    ADIOI_Free(srt_len);
1812 +
1813 +    nprocs_send = 0;
1814 +    for (i = 0; i < nprocs; i++)
1815 +       if (send_size[i])
1816 +           nprocs_send++;
1817 +
1818 +    if (fd->atomicity) {
1819 +       /* bug fix from Wei-keng Liao and Kenin Coloma */
1820 +       requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + 1) *
1821 +                                                sizeof(MPI_Request));
1822 +       send_req = requests;
1823 +    } else {
1824 +       requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1)*
1825 +                                                sizeof(MPI_Request));
1826 +       /* +1 to avoid a 0-size malloc */
1827 +
1828 +       /* post receives */
1829 +       j = 0;
1830 +       for (i = 0; i < nprocs; i++) {
1831 +           if (recv_size[i]) {
1832 +               MPI_Irecv(MPI_BOTTOM, 1, recv_types[j], i,
1833 +                         myrank + i + 100 * iter, fd->comm, requests + j);
1834 +               j++;
1835 +           }
1836 +       }
1837 +       send_req = requests + nprocs_recv;
1838 +    }
1839 +
1840 +    /* post sends.
1841 +     * if buftype_is_contig, data can be directly sent from
1842 +     * user buf at location given by buf_idx. else use send_buf.
1843 +     */
1844 +    if (buftype_is_contig) {
1845 +       j = 0;
1846 +       for (i = 0; i < nprocs; i++)
1847 +           if (send_size[i]) {
1848 +               MPI_Isend(((char *) buf) + buf_idx[i], send_size[i],
1849 +                         MPI_BYTE, i, myrank + i + 100 * iter, fd->comm,
1850 +                         send_req + j);
1851 +               j++;
1852 +               buf_idx[i] += send_size[i];
1853 +           }
1854 +    } else if (nprocs_send) {
1855 +       /* buftype is not contig */
1856 +       send_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char *));
1857 +       for (i = 0; i < nprocs; i++)
1858 +           if (send_size[i])
1859 +               send_buf[i] = (char *) ADIOI_Malloc(send_size[i]);
1860 +
1861 +       ADIOI_LUSTRE_Fill_send_buffer(fd, buf, flat_buf, send_buf, offset_list,
1862 +                                      len_list, send_size, send_req,
1863 +                                      sent_to_proc, nprocs, myrank,
1864 +                                      contig_access_count, striping_info,
1865 +                                      send_buf_idx, curr_to_proc, done_to_proc,
1866 +                                      iter, buftype_extent);
1867 +       /* the send is done in ADIOI_Fill_send_buffer */
1868 +    }
1869 +
1870 +       /* bug fix from Wei-keng Liao and Kenin Coloma */
1871 +    if (fd->atomicity) {
1872 +       j = 0;
1873 +       for (i = 0; i < nprocs; i++) {
1874 +           MPI_Status wkl_status;
1875 +           if (recv_size[i]) {
1876 +               MPI_Recv(MPI_BOTTOM, 1, recv_types[j], i,
1877 +                        myrank + i + 100 * iter, fd->comm, &wkl_status);
1878 +               j++;
1879 +           }
1880 +       }
1881 +    }
1882 +
1883 +    for (i = 0; i < nprocs_recv; i++)
1884 +       MPI_Type_free(recv_types + i);
1885 +    ADIOI_Free(recv_types);
1886 +
1887 +       /* bug fix from Wei-keng Liao and Kenin Coloma */
1888 +       /* +1 to avoid a 0-size malloc */
1889 +    if (fd->atomicity) {
1890 +       statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + 1) *
1891 +                                              sizeof(MPI_Status));
1892 +    } else {
1893 +       statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1) *
1894 +                                              sizeof(MPI_Status));
1895 +    }
1896 +
1897 +#ifdef NEEDS_MPI_TEST
1898 +    i = 0;
1899 +    if (fd->atomicity) {
1900 +       /* bug fix from Wei-keng Liao and Kenin Coloma */
1901 +       while (!i)
1902 +           MPI_Testall(nprocs_send, send_req, &i, statuses);
1903 +    } else {
1904 +       while (!i)
1905 +           MPI_Testall(nprocs_send + nprocs_recv, requests, &i, statuses);
1906 +    }
1907 +#else
1908 +       /* bug fix from Wei-keng Liao and Kenin Coloma */
1909 +    if (fd->atomicity)
1910 +       MPI_Waitall(nprocs_send, send_req, statuses);
1911 +    else
1912 +       MPI_Waitall(nprocs_send + nprocs_recv, requests, statuses);
1913 +#endif
1914 +    ADIOI_Free(statuses);
1915 +    ADIOI_Free(requests);
1916 +    if (!buftype_is_contig && nprocs_send) {
1917 +       for (i = 0; i < nprocs; i++)
1918 +           if (send_size[i])
1919 +               ADIOI_Free(send_buf[i]);
1920 +       ADIOI_Free(send_buf);
1921 +    }
1922 +}
1923 +/* Advance the position in the flattened user buffer by buf_incr bytes without copying; uses the caller's locals. */
1924 +#define ADIOI_BUF_INCR \
1925 +{ \
1926 +    while (buf_incr) { \
1927 +        size_in_buf = ADIOI_MIN(buf_incr, flat_buf_sz); \
1928 +        user_buf_idx += size_in_buf; \
1929 +        flat_buf_sz -= size_in_buf; \
1930 +        if (!flat_buf_sz) { \
1931 +            if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \
1932 +            else { \
1933 +                flat_buf_idx = 0; \
1934 +                n_buftypes++; \
1935 +            } \
1936 +            user_buf_idx = flat_buf->indices[flat_buf_idx] + \
1937 +                              n_buftypes*buftype_extent; \
1938 +            flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
1939 +        } \
1940 +        buf_incr -= size_in_buf; \
1941 +    } \
1942 +}
1943 +
1944 +/* Copy `size` bytes from the user buffer into send_buf[p], then skip whatever is left of buf_incr via ADIOI_BUF_INCR. */
1945 +#define ADIOI_BUF_COPY \
1946 +{ \
1947 +    while (size) { \
1948 +        size_in_buf = ADIOI_MIN(size, flat_buf_sz); \
1949 +        memcpy(&(send_buf[p][send_buf_idx[p]]), \
1950 +               ((char *) buf) + user_buf_idx, size_in_buf); \
1951 +        send_buf_idx[p] += size_in_buf; \
1952 +        user_buf_idx += size_in_buf; \
1953 +        flat_buf_sz -= size_in_buf; \
1954 +        if (!flat_buf_sz) { \
1955 +            if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \
1956 +            else { \
1957 +                flat_buf_idx = 0; \
1958 +                n_buftypes++; \
1959 +            } \
1960 +            user_buf_idx = flat_buf->indices[flat_buf_idx] + \
1961 +                              n_buftypes*buftype_extent; \
1962 +            flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
1963 +        } \
1964 +        size -= size_in_buf; \
1965 +        buf_incr -= size_in_buf; \
1966 +    } \
1967 +    ADIOI_BUF_INCR \
1968 +}
1969 +/* Pack this process's noncontiguous user data into send_buf[p] and post an MPI_Isend once a buffer is full. */
1970 +static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf,
1971 +                                         ADIOI_Flatlist_node * flat_buf,
1972 +                                         char **send_buf,
1973 +                                         ADIO_Offset * offset_list,
1974 +                                         int *len_list, int *send_size,
1975 +                                         MPI_Request * requests,
1976 +                                         int *sent_to_proc, int nprocs,
1977 +                                         int myrank,
1978 +                                         int contig_access_count,
1979 +                                         int * striping_info,
1980 +                                         int *send_buf_idx,
1981 +                                         int *curr_to_proc,
1982 +                                         int *done_to_proc, int iter,
1983 +                                         MPI_Aint buftype_extent)
1984 +{
1985 +    /* this function is only called if buftype is not contig */
1986 +    int i, p, flat_buf_idx, size;
1987 +    int flat_buf_sz, buf_incr, size_in_buf, jj, n_buftypes;
1988 +    ADIO_Offset off, len, rem_len, user_buf_idx;
1989 +
1990 +    /* curr_to_proc[p] = amount of data sent to proc. p that has already
1991 +     * been accounted for so far
1992 +     * done_to_proc[p] = amount of data already sent to proc. p in
1993 +     * previous iterations
1994 +     * user_buf_idx = current location in user buffer
1995 +     * send_buf_idx[p] = current location in send_buf of proc. p
1996 +     */
1997 +
1998 +    for (i = 0; i < nprocs; i++) {
1999 +       send_buf_idx[i] = curr_to_proc[i] = 0;
2000 +       done_to_proc[i] = sent_to_proc[i];
2001 +    }
2002 +    jj = 0;     /* index of the next free slot in requests[] */
2003 +
2004 +    user_buf_idx = flat_buf->indices[0];
2005 +    flat_buf_idx = 0;
2006 +    n_buftypes = 0;
2007 +    flat_buf_sz = flat_buf->blocklens[0];
2008 +
2009 +    /* flat_buf_idx = current index into flattened buftype
2010 +     * flat_buf_sz = size of current contiguous component in flattened buf
2011 +     */
2012 +    for (i = 0; i < contig_access_count; i++) {
2013 +       off = offset_list[i];
2014 +       rem_len = (ADIO_Offset) len_list[i];
2015 +
2016 +       /* this request may span more than one process */
2017 +       while (rem_len != 0) {
2018 +           len = rem_len;
2019 +           /* NOTE: len value is modified by ADIOI_LUSTRE_Calc_aggregator() to
2020 +            * be no longer than the single region that processor "p" is
2021 +            * responsible for.
2022 +            */
2023 +           p = ADIOI_LUSTRE_Calc_aggregator(fd, off, &len, striping_info);
2024 +
2025 +           if (send_buf_idx[p] < send_size[p]) {
2026 +               if (curr_to_proc[p] + len > done_to_proc[p]) {
2027 +                   if (done_to_proc[p] > curr_to_proc[p]) {
2028 +                       size = (int) ADIOI_MIN(curr_to_proc[p] + len -
2029 +                                              done_to_proc[p],
2030 +                                              send_size[p] -
2031 +                                              send_buf_idx[p]);
2032 +                       buf_incr = done_to_proc[p] - curr_to_proc[p];
2033 +                       ADIOI_BUF_INCR
2034 +                           buf_incr = (int) (curr_to_proc[p] + len -
2035 +                                             done_to_proc[p]);
2036 +                       curr_to_proc[p] = done_to_proc[p] + size;
2037 +                       ADIOI_BUF_COPY
2038 +                    } else {
2039 +                       size = (int) ADIOI_MIN(len, send_size[p] -
2040 +                                              send_buf_idx[p]);
2041 +                       buf_incr = (int) len;
2042 +                       curr_to_proc[p] += size;
2043 +                       ADIOI_BUF_COPY
2044 +                    }
2045 +                   if (send_buf_idx[p] == send_size[p]) {
2046 +                       MPI_Isend(send_buf[p], send_size[p], MPI_BYTE, p,
2047 +                                 myrank + p + 100 * iter, fd->comm,
2048 +                                 requests + jj);
2049 +                       jj++;
2050 +                   }
2051 +               } else {
2052 +                   curr_to_proc[p] += (int) len;
2053 +                   buf_incr = (int) len;
2054 +                   ADIOI_BUF_INCR
2055 +                }
2056 +           } else {
2057 +               buf_incr = (int) len;
2058 +               ADIOI_BUF_INCR
2059 +            }
2060 +           off += len;
2061 +           rem_len -= len;
2062 +       }
2063 +    }
2064 +    for (i = 0; i < nprocs; i++)
2065 +       if (send_size[i])
2066 +           sent_to_proc[i] = curr_to_proc[i];
2067 +}
2068 diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c
2069 --- ad_lustre_orig/ad_lustre_wrstr.c    1970-01-01 08:00:00.000000000 +0800
2070 +++ ad_lustre/ad_lustre_wrstr.c 2008-10-13 15:34:53.000000000 +0800
2071 @@ -0,0 +1,472 @@
2072 +/* -*- Mode: C; c-basic-offset:4 ; -*- */
2073 +/*
2074 + *   Copyright (C) 1997 University of Chicago.
2075 + *   See COPYRIGHT notice in top-level directory.
2076 + *
2077 + *   Copyright (C) 2007 Oak Ridge National Laboratory
2078 + *
2079 + *   Copyright (C) 2008 Sun Microsystems, Lustre group
2080 + */
2081 +
2082 +#include "ad_lustre.h"
2083 +#include "adio_extern.h"
2084 +/* Read-modify-write through writebuf, one stripe-bounded window at a time; on an I/O error it frees writebuf and returns from the enclosing function. */
2085 +#define ADIOI_BUFFERED_WRITE \
2086 +{ \
2087 +    if (req_off >= writebuf_off + writebuf_len) { \
2088 +        if (writebuf_len) { \
2089 +           ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
2090 +                  ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \
2091 +           if (!(fd->atomicity)) \
2092 +                ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
2093 +           if (*error_code != MPI_SUCCESS) { \
2094 +               *error_code = MPIO_Err_create_code(*error_code, \
2095 +                                                  MPIR_ERR_RECOVERABLE, myname, \
2096 +                                                  __LINE__, MPI_ERR_IO, \
2097 +                                                  "**iowswc", 0); \
2098 +               ADIOI_Free(writebuf); \
2099 +               return; \
2100 +           } \
2101 +        } \
2102 +       writebuf_off = req_off; \
2103 +        /* stripe_size alignment: the new window ends at the next stripe boundary or at end_offset, whichever comes first */ \
2104 +        writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \
2105 +                                       (writebuf_off / stripe_size + 1) * \
2106 +                                       stripe_size - writebuf_off);\
2107 +       if (!(fd->atomicity)) \
2108 +            ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
2109 +       ADIO_ReadContig(fd, writebuf, writebuf_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,\
2110 +                        writebuf_off, &status1, error_code); \
2111 +       if (*error_code != MPI_SUCCESS) { \
2112 +           *error_code = MPIO_Err_create_code(*error_code, \
2113 +                                              MPIR_ERR_RECOVERABLE, myname, \
2114 +                                              __LINE__, MPI_ERR_IO, \
2115 +                                              "**iowsrc", 0); \
2116 +            ADIOI_Free(writebuf); \
2117 +           return; \
2118 +       } \
2119 +    } \
2120 +    write_sz = (int) ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off); \
2121 +    memcpy(writebuf + req_off - writebuf_off, (char *)buf + userbuf_off, write_sz);\
2122 +    while (write_sz != req_len) {\
2123 +        ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
2124 +                         ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \
2125 +        if (!(fd->atomicity)) \
2126 +            ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
2127 +        if (*error_code != MPI_SUCCESS) { \
2128 +            *error_code = MPIO_Err_create_code(*error_code, \
2129 +                                               MPIR_ERR_RECOVERABLE, myname, \
2130 +                                               __LINE__, MPI_ERR_IO, \
2131 +                                               "**iowswc", 0); \
2132 +            ADIOI_Free(writebuf); \
2133 +            return; \
2134 +        } \
2135 +        req_len -= write_sz; \
2136 +        userbuf_off += write_sz; \
2137 +        writebuf_off += writebuf_len; \
2138 +        /* stripe_size alignment: the next window starts where the previous one ended and is clipped the same way */ \
2139 +        writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \
2140 +                                       (writebuf_off / stripe_size + 1) * \
2141 +                                       stripe_size - writebuf_off);\
2142 +       if (!(fd->atomicity)) \
2143 +            ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
2144 +        ADIO_ReadContig(fd, writebuf, writebuf_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,\
2145 +                        writebuf_off, &status1, error_code); \
2146 +       if (*error_code != MPI_SUCCESS) { \
2147 +           *error_code = MPIO_Err_create_code(*error_code, \
2148 +                                              MPIR_ERR_RECOVERABLE, myname, \
2149 +                                              __LINE__, MPI_ERR_IO, \
2150 +                                              "**iowsrc", 0); \
2151 +            ADIOI_Free(writebuf); \
2152 +           return; \
2153 +       } \
2154 +        write_sz = ADIOI_MIN(req_len, writebuf_len); \
2155 +        memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
2156 +    } \
2157 +}
2158 +
2159 +
2160 +/* This macro is used when the filetype is contiguous and the buftype is not (and for a request that fits in one contiguous file block).
2161 +   It does not do a read-modify-write and does not lock; on a write error it frees writebuf and returns from the enclosing function. */
2162 +#define ADIOI_BUFFERED_WRITE_WITHOUT_READ \
2163 +{ \
2164 +    if (req_off >= writebuf_off + writebuf_len) { \
2165 +       writebuf_off = req_off; \
2166 +        /* stripe_size alignment: the new window ends at the next stripe boundary or at end_offset, whichever comes first */ \
2167 +        writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \
2168 +                                       (writebuf_off / stripe_size + 1) * \
2169 +                                       stripe_size - writebuf_off);\
2170 +    } \
2171 +    write_sz = (int) ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off); \
2172 +    memcpy(writebuf + req_off - writebuf_off, (char *)buf + userbuf_off, write_sz);\
2173 +    while (req_len) { \
2174 +        ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
2175 +                         ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \
2176 +        if (*error_code != MPI_SUCCESS) { \
2177 +            *error_code = MPIO_Err_create_code(*error_code, \
2178 +                                               MPIR_ERR_RECOVERABLE, myname, \
2179 +                                               __LINE__, MPI_ERR_IO, \
2180 +                                               "**iowswc", 0); \
2181 +            ADIOI_Free(writebuf); \
2182 +            return; \
2183 +        } \
2184 +        req_len -= write_sz; \
2185 +        userbuf_off += write_sz; \
2186 +        writebuf_off += writebuf_len; \
2187 +        /* stripe_size alignment: the next window starts where the previous one ended and is clipped the same way */ \
2188 +        writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \
2189 +                                       (writebuf_off / stripe_size + 1) * \
2190 +                                       stripe_size - writebuf_off);\
2191 +        write_sz = ADIOI_MIN(req_len, writebuf_len); \
2192 +        memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
2193 +    } \
2194 +}
2195 +/* Strided write staged through a stripe-bounded buffer (writebuf); uses ADIOI_GEN_WriteStrided_naive instead when the ds_write hint is disabled. Result is reported through *error_code. */
2196 +void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count,
2197 +                              MPI_Datatype datatype, int file_ptr_type,
2198 +                              ADIO_Offset offset, ADIO_Status * status,
2199 +                              int *error_code)
2200 +{
2201 +    /* offset is in units of etype relative to the filetype. */
2202 +    ADIOI_Flatlist_node *flat_buf, *flat_file;
2203 +    int i, j, k, bwr_size, fwr_size = 0, st_index = 0;
2204 +    int bufsize, num, size, sum, n_etypes_in_filetype, size_in_filetype;
2205 +    int n_filetypes, etype_in_filetype;
2206 +    ADIO_Offset abs_off_in_filetype = 0;
2207 +    int filetype_size, etype_size, buftype_size, req_len;
2208 +    MPI_Aint filetype_extent, buftype_extent;
2209 +    int buf_count, buftype_is_contig, filetype_is_contig;
2210 +    ADIO_Offset userbuf_off;
2211 +    ADIO_Offset off, req_off, disp, end_offset = 0, writebuf_off, start_off;
2212 +    char *writebuf;
2213 +    int flag, st_fwr_size, st_n_filetypes, writebuf_len, write_sz;
2214 +    ADIO_Status status1;
2215 +    int new_bwr_size, new_fwr_size;
2216 +    char * value;
2217 +    int stripe_size, lflag = 0;
2218 +    static char myname[] = "ADIOI_LUSTRE_WriteStrided";
2219 +    int myrank;
2220 +    MPI_Comm_rank(fd->comm, &myrank);
2221 +
2222 +    if (fd->hints->ds_write == ADIOI_HINT_DISABLE) {
2223 +       /* if user has disabled data sieving on writes, use naive
2224 +        * approach instead.
2225 +        */
2226 +       ADIOI_GEN_WriteStrided_naive(fd,
2227 +                                    buf,
2228 +                                    count,
2229 +                                    datatype,
2230 +                                    file_ptr_type,
2231 +                                    offset, status, error_code);
2232 +       return;
2233 +    }
2234 +
2235 +    *error_code = MPI_SUCCESS; /* changed below if error */
2236 +
2237 +    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
2238 +    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
2239 +
2240 +    MPI_Type_size(fd->filetype, &filetype_size);
2241 +    if (!filetype_size) {
2242 +       *error_code = MPI_SUCCESS;
2243 +       return;
2244 +    }
2245 +
2246 +    MPI_Type_extent(fd->filetype, &filetype_extent);
2247 +    MPI_Type_size(datatype, &buftype_size);
2248 +    MPI_Type_extent(datatype, &buftype_extent);
2249 +    etype_size = fd->etype_size;
2250 +
2251 +    bufsize = buftype_size * count;
2252 +
2253 +    /* get striping info; NOTE(review): stripe_size is only assigned when the "striping_unit" hint is present (lflag != 0) -- confirm it is always set */
2254 +    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
2255 +    MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, &lflag);
2256 +    if (lflag)
2257 +       stripe_size = atoi(value);
2258 +    ADIOI_Free(value);
2259 +
2260 +    /* Different buftype to different filetype */
2261 +    if (!buftype_is_contig && filetype_is_contig) {
2262 +        /* noncontiguous in memory, contiguous in file. */
2263 +       ADIOI_Flatten_datatype(datatype);
2264 +       flat_buf = ADIOI_Flatlist;
2265 +       while (flat_buf->type != datatype)
2266 +           flat_buf = flat_buf->next;
2267 +
2268 +       off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
2269 +           fd->disp + etype_size * offset;
2270 +
2271 +       start_off = off;
2272 +       end_offset = start_off + bufsize - 1;
2273 +       writebuf_off = start_off;
2274 +        /* write at most one stripe-sized buffer each time */
2275 +       writebuf = (char *) ADIOI_Malloc(ADIOI_MIN(bufsize, stripe_size));
2276 +        writebuf_len = (int) ADIOI_MIN(bufsize,
2277 +                                       (writebuf_off / stripe_size + 1) *
2278 +                                       stripe_size - writebuf_off);
2279 +
2280 +        /* if atomicity is true, lock the region to be accessed */
2281 +       if (fd->atomicity)
2282 +           ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, bufsize);
2283 +
2284 +       for (j = 0; j < count; j++) {
2285 +           for (i = 0; i < flat_buf->count; i++) {
2286 +               userbuf_off = j * buftype_extent + flat_buf->indices[i];
2287 +               req_off = off;
2288 +               req_len = flat_buf->blocklens[i];
2289 +               ADIOI_BUFFERED_WRITE_WITHOUT_READ
2290 +               off += flat_buf->blocklens[i];
2291 +           }
2292 +        }
2293 +
2294 +       /* write the buffer out finally */
2295 +       ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE,
2296 +                        ADIO_EXPLICIT_OFFSET, writebuf_off, &status1,
2297 +                        error_code);
2298 +
2299 +       if (fd->atomicity)
2300 +           ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize);
2301 +       if (*error_code != MPI_SUCCESS) {
2302 +            ADIOI_Free(writebuf);
2303 +           return;
2304 +        }
2305 +       ADIOI_Free(writebuf);
2306 +       if (file_ptr_type == ADIO_INDIVIDUAL)
2307 +           fd->fp_ind = off;
2308 +    } else {
2309 +        /* noncontiguous in file */
2310 +        /* filetype already flattened in ADIO_Open */
2311 +       flat_file = ADIOI_Flatlist;
2312 +       while (flat_file->type != fd->filetype)
2313 +           flat_file = flat_file->next;
2314 +       disp = fd->disp;
2315 +
2316 +       if (file_ptr_type == ADIO_INDIVIDUAL) {
2317 +           offset = fd->fp_ind;        /* in bytes */
2318 +           n_filetypes = -1;
2319 +           flag = 0;
2320 +           while (!flag) {
2321 +               n_filetypes++;
2322 +               for (i = 0; i < flat_file->count; i++) {
2323 +                   if (disp + flat_file->indices[i] +
2324 +                       (ADIO_Offset) n_filetypes * filetype_extent +
2325 +                       flat_file->blocklens[i] >= offset) {
2326 +                       st_index = i;
2327 +                       fwr_size = (int) (disp + flat_file->indices[i] +
2328 +                                         (ADIO_Offset) n_filetypes *
2329 +                                         filetype_extent +
2330 +                                         flat_file->blocklens[i] -
2331 +                                         offset);
2332 +                       flag = 1;
2333 +                       break;
2334 +                   }
2335 +               }
2336 +           }
2337 +       } else {
2338 +           n_etypes_in_filetype = filetype_size / etype_size;
2339 +           n_filetypes = (int) (offset / n_etypes_in_filetype);
2340 +           etype_in_filetype = (int) (offset % n_etypes_in_filetype);
2341 +           size_in_filetype = etype_in_filetype * etype_size;
2342 +
2343 +           sum = 0;
2344 +           for (i = 0; i < flat_file->count; i++) {
2345 +               sum += flat_file->blocklens[i];
2346 +               if (sum > size_in_filetype) {
2347 +                   st_index = i;
2348 +                   fwr_size = sum - size_in_filetype;
2349 +                   abs_off_in_filetype = flat_file->indices[i] +
2350 +                       size_in_filetype - (sum - flat_file->blocklens[i]);
2351 +                   break;
2352 +               }
2353 +           }
2354 +
2355 +           /* abs. offset in bytes in the file */
2356 +           offset = disp + (ADIO_Offset) n_filetypes *filetype_extent +
2357 +                    abs_off_in_filetype;
2358 +       }
2359 +
2360 +       start_off = offset;
2361 +
2362 +       /* If the file bytes are actually contiguous, we do not need data sieving at all */
2363 +       if (bufsize <= fwr_size) {
2364 +            req_off = start_off;
2365 +            req_len = bufsize;
2366 +            end_offset = start_off + bufsize - 1;
2367 +           writebuf = (char *) ADIOI_Malloc(ADIOI_MIN(bufsize, stripe_size));
2368 +           memset(writebuf, -1, ADIOI_MIN(bufsize, stripe_size));
2369 +            writebuf_off = 0;
2370 +            writebuf_len = 0;
2371 +            userbuf_off = 0;
2372 +            ADIOI_BUFFERED_WRITE_WITHOUT_READ
2373 +       } else {
2374 +           /* Calculate end_offset, the last byte-offset that will be accessed.
2375 +              e.g., if start_offset=0 and 100 bytes are to be written, end_offset=99 */
2376 +           st_fwr_size = fwr_size;
2377 +           st_n_filetypes = n_filetypes;
2378 +           i = 0;
2379 +           j = st_index;
2380 +           off = offset;
2381 +           fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
2382 +           while (i < bufsize) {
2383 +               i += fwr_size;
2384 +               end_offset = off + fwr_size - 1;
2385 +
2386 +               if (j < (flat_file->count - 1))
2387 +                   j++;
2388 +               else {
2389 +                   j = 0;
2390 +                   n_filetypes++;
2391 +               }
2392 +
2393 +               off = disp + flat_file->indices[j] +
2394 +                     (ADIO_Offset) n_filetypes * filetype_extent;
2395 +               fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize - i);
2396 +           }
2397 +
2398 +           writebuf_off = 0;
2399 +           writebuf_len = 0;
2400 +           writebuf = (char *) ADIOI_Malloc(stripe_size);
2401 +           memset(writebuf, -1, stripe_size);
2402 +           /* if atomicity is true, lock the region to be accessed */
2403 +           if (fd->atomicity)
2404 +               ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, bufsize);
2405 +
2406 +           if (buftype_is_contig && !filetype_is_contig) {
2407 +               /* contiguous in memory, noncontiguous in file. should be the most
2408 +                  common case. */
2409 +               i = 0;
2410 +               j = st_index;
2411 +               off = offset;
2412 +               n_filetypes = st_n_filetypes;
2413 +               fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
2414 +               while (i < bufsize) {
2415 +                   if (fwr_size) {
2416 +                       /* TYPE_UB and TYPE_LB can result in
2417 +                          fwr_size = 0. save system call in such cases */
2418 +                       /*
2419 +                        lseek(fd->fd_sys, off, SEEK_SET);
2420 +                       err = write(fd->fd_sys, ((char *) buf) + i, fwr_size);
2421 +                        */
2422 +                       req_off = off;
2423 +                       req_len = fwr_size;
2424 +                       userbuf_off = i;
2425 +                       ADIOI_BUFFERED_WRITE
2426 +                    }
2427 +                   i += fwr_size;
2428 +
2429 +                   if (off + fwr_size < disp + flat_file->indices[j] +
2430 +                                        flat_file->blocklens[j] +
2431 +                           (ADIO_Offset) n_filetypes * filetype_extent)
2432 +                       off += fwr_size;
2433 +                   /* did not reach end of contiguous block in filetype.
2434 +                   no more I/O needed. off is incremented by fwr_size. */
2435 +                   else {
2436 +                       if (j < (flat_file->count - 1))
2437 +                           j++;
2438 +                       else {
2439 +                           j = 0;
2440 +                           n_filetypes++;
2441 +                       }
2442 +                       off = disp + flat_file->indices[j] +
2443 +                             (ADIO_Offset) n_filetypes * filetype_extent;
2444 +                       fwr_size = ADIOI_MIN(flat_file->blocklens[j],
2445 +                                             bufsize - i);
2446 +                   }
2447 +               }
2448 +           } else {
2449 +                   /* noncontiguous in memory as well as in file */
2450 +               ADIOI_Flatten_datatype(datatype);
2451 +               flat_buf = ADIOI_Flatlist;
2452 +               while (flat_buf->type != datatype)
2453 +                   flat_buf = flat_buf->next;
2454 +
2455 +               k = num = buf_count = 0;
2456 +               i = (int) (flat_buf->indices[0]);
2457 +               j = st_index;
2458 +               off = offset;
2459 +               n_filetypes = st_n_filetypes;
2460 +               fwr_size = st_fwr_size;
2461 +               bwr_size = flat_buf->blocklens[0];
2462 +
2463 +               while (num < bufsize) {
2464 +                   size = ADIOI_MIN(fwr_size, bwr_size);
2465 +                   if (size) {
2466 +                       /*
2467 +                        lseek(fd->fd_sys, off, SEEK_SET);
2468 +                        err = write(fd->fd_sys, ((char *) buf) + i, size);
2469 +                        */
2470 +                       req_off = off;
2471 +                       req_len = size;
2472 +                       userbuf_off = i;
2473 +                       ADIOI_BUFFERED_WRITE
2474 +                    }
2475 +
2476 +                   new_fwr_size = fwr_size;
2477 +                   new_bwr_size = bwr_size;
2478 +
2479 +                   if (size == fwr_size) {
2480 +                       /* reached end of contiguous block in file */
2481 +                       if (j < (flat_file->count - 1)) {
2482 +                           j++;
2483 +                        } else {
2484 +                           j = 0;
2485 +                           n_filetypes++;
2486 +                       }
2487 +                       off = disp + flat_file->indices[j] +
2488 +                             (ADIO_Offset) n_filetypes * filetype_extent;
2489 +                        new_fwr_size = flat_file->blocklens[j];
2490 +                       if (size != bwr_size) {
2491 +                           i += size;
2492 +                           new_bwr_size -= size;
2493 +                       }
2494 +                   }
2495 +                   if (size == bwr_size) {
2496 +                       /* reached end of contiguous block in memory */
2497 +                       k = (k + 1) % flat_buf->count;
2498 +                       buf_count++;
2499 +                       i = (int) (buftype_extent *
2500 +                                  (buf_count / flat_buf->count) +
2501 +                                 flat_buf->indices[k]);
2502 +                       new_bwr_size = flat_buf->blocklens[k];
2503 +                       if (size != fwr_size) {
2504 +                           off += size;
2505 +                           new_fwr_size -= size;
2506 +                       }
2507 +                   }
2508 +                   num += size;
2509 +                   fwr_size = new_fwr_size;
2510 +                   bwr_size = new_bwr_size;
2511 +               }
2512 +            }
2513 +
2514 +           /* write the buffer out finally */
2515 +           if (writebuf_len) {
2516 +               ADIO_WriteContig(fd, writebuf, writebuf_len,
2517 +                                MPI_BYTE, ADIO_EXPLICIT_OFFSET,
2518 +                                writebuf_off, &status1, error_code);
2519 +               if (!(fd->atomicity))
2520 +                   ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
2521 +               if (*error_code != MPI_SUCCESS) {
2522 +                    ADIOI_Free(writebuf);
2523 +                   return;
2524 +                }
2525 +           }
2526 +           if (fd->atomicity)
2527 +               ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize);
2528 +       }
2529 +        ADIOI_Free(writebuf);
2530 +       if (file_ptr_type == ADIO_INDIVIDUAL)
2531 +           fd->fp_ind = off; /* NOTE(review): off does not appear to be assigned on the bufsize <= fwr_size path above -- confirm */
2532 +    }
2533 +    fd->fp_sys_posn = -1;      /* set it to null. */
2534 +
2535 +#ifdef HAVE_STATUS_SET_BYTES
2536 +    MPIR_Status_set_bytes(status, datatype, bufsize);
2537 +    /* This is a temporary way of filling in status. The right way is to
2538 +    keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
2539 +#endif
2540 +
2541 +    if (!buftype_is_contig)
2542 +        ADIOI_Delete_flattened(datatype);
2543 +}
2544 diff -ruN ad_lustre_orig/Makefile.in ad_lustre/Makefile.in
2545 --- ad_lustre_orig/Makefile.in  2008-09-17 14:36:57.000000000 +0800
2546 +++ ad_lustre/Makefile.in       2008-10-17 17:03:06.000000000 +0800
2547 @@ -16,7 +16,9 @@
2548  @VPATH@
2549  
2550  AD_LUSTRE_OBJECTS = ad_lustre.o ad_lustre_open.o \
2551 -      ad_lustre_rwcontig.o ad_lustre_hints.o 
2552 +      ad_lustre_rwcontig.o ad_lustre_wrcoll.o ad_lustre_wrstr.o  \
2553 +      ad_lustre_hints.o ad_lustre_aggregate.o
2554 +
2555  
2556  default: $(LIBNAME)
2557         @if [ "@ENABLE_SHLIB@" != "none" ] ; then \
2558 diff -ruN ad_lustre_orig/README ad_lustre/README
2559 --- ad_lustre_orig/README       2008-09-17 14:36:57.000000000 +0800
2560 +++ ad_lustre/README    2008-10-17 16:50:15.000000000 +0800
2561 @@ -5,6 +5,23 @@
2562    o To post the code for ParColl (Partitioned collective IO)
2563   
2564  -----------------------------------------------------
2565 +V05: 
2566 +-----------------------------------------------------
2567 +Improved data redistribution
2568 +  o Improve I/O pattern identification. Besides checking interleaving,
2569 +    if request I/O size is small, collective I/O will be performed.
2570 +    The hint big_req_size can be used to define the req size value.
2571 +  o Provide hint CO for load balancing to control the number of
2572 +    IO clients for each OST
2573 +  o Produce stripe-contiguous I/O pattern that Lustre prefers
2574 +  o Reduce the collective overhead by hints contiguous_data and
2575 +    same_io_size to remove unnecessary MPI_Alltoall()
2576 +  o Control read-modify-write in data sieving in collective IO
2577 +    by hint ds_in_coll.
2578 +  o Reduce extent lock conflicts by making each OST accessed by one or
2579 +    more constant clients.
2580 +
2581 +-----------------------------------------------------
2582  V04: 
2583  -----------------------------------------------------
2584    o Direct IO and Lockless IO support
2585 --- common/ad_write_coll_orig.c 2008-10-15 11:24:31.000000000 +0800
2586 +++ common/ad_write_coll.c      2008-10-15 11:25:39.000000000 +0800
2587 @@ -42,7 +42,7 @@
2588                             int *send_buf_idx, int *curr_to_proc, 
2589                             int *done_to_proc, int iter, 
2590                             MPI_Aint buftype_extent);
2591 -static void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count, 
2592 +void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count, 
2593                        ADIO_Offset *srt_off, int *srt_len, int *start_pos,
2594                        int nprocs, int nprocs_recv, int total_elements);
2595  
2596 @@ -921,7 +921,7 @@
2597  
2598  
2599  
2600 -static void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count, 
2601 +void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count, 
2602                       ADIO_Offset *srt_off, int *srt_len, int *start_pos,
2603                       int nprocs, int nprocs_recv, int total_elements)
2604  {