Whamcloud - gitweb
Update the adio driver patch for mpich2-1.0.7
[fs/lustre-release.git] / lustre / contrib / adio_driver_mpich2-1.0.7.patch
1 diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
2 --- ad_lustre_orig/ad_lustre_aggregate.c        1970-01-01 08:00:00.000000000 +0800
3 +++ ad_lustre/ad_lustre_aggregate.c     2008-09-17 18:20:35.000000000 +0800
4 @@ -0,0 +1,676 @@
5 +/* -*- Mode: C; c-basic-offset:4 ; -*- */
6 +/*
7 + *   Copyright (C) 1997 University of Chicago.
8 + *   See COPYRIGHT notice in top-level directory.
9 + *
10 + *   Copyright (C) 2007 Oak Ridge National Laboratory
11 + *
12 + *   Copyright (C) 2008 Sun Microsystems, Lustre group
13 + */
14 +
15 +#include "ad_lustre.h"
16 +#include "adio_extern.h"
17 +
18 +void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int ** striping_info_ptr,
19 +                                   int mode, int nprocs)
20 +{
21 +    int *striping_info = NULL;
22 +    /* get striping information:
23 +     *  striping_info[0] = stripe_size;
24 +     *  striping_info[1] = stripe_count;
25 +     *  striping_info[2] = CO;
26 +     */
27 +    /* for easy understanding, we name some variables */
28 +    int stripe_size, stripe_count, CO = 1, CO_max = 1, lflag;
29 +    char *value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
30 +    /* stripe size */
31 +    MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, &lflag);
32 +    if (lflag)
33 +       stripe_size = atoi(value);
34 +    /* stripe count */
35 +    MPI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, value, &lflag);
36 +    if (lflag)
37 +       stripe_count = atoi(value);
38 +    /* stripe_size and stripe_count have been validated in ADIOI_LUSTRE_Open() */
39 +
40 +    /* calculate CO */
41 +    if (!mode) {
42 +       /* for collective read,
43 +        * if "CO" clients access the same OST simultaneously,
44 +        * the OST disk seek time would be large. So, to avoid this,
45 +        * it might be better if 1 client only accesses 1 OST.
46 +        * So, we set CO = 1 to meet the above requirement.
47 +        */
48 +       CO = 1;
49 +       /* XXX: maybe there is a better way to handle collective read */
50 +    } else {
51 +       /* CO_max: the largest number of IO clients for each ost group */
52 +        CO_max = (nprocs - 1)/ stripe_count + 1;
53 +        /* CO also has been validated in ADIOI_LUSTRE_Open(), >0 */
54 +       MPI_Info_get(fd->info, "CO", MPI_MAX_INFO_VAL, value, &lflag);
55 +       if (lflag)
56 +           CO = atoi(value);
57 +       CO = ADIOI_MIN(CO_max, CO);
58 +    }
59 +    ADIOI_Free(value);
60 +    /* although only 3 hints are stored so far, we still malloc space here
61 +     * instead of having the caller declare an array[3],
62 +     * because on the one hand we will probably need more hints in the future,
63 +     * and on the other hand this lets both collective read and write
64 +     * call this function conveniently.
65 +     */
66 +    *striping_info_ptr = (int *) ADIOI_Malloc(3 * sizeof(int));
67 +    striping_info = *striping_info_ptr;
68 +    striping_info[0] = stripe_size;
69 +    striping_info[1] = stripe_count;
70 +    striping_info[2] = CO;
71 +}
72 +
73 +int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off,
74 +                                 ADIO_Offset *len, int nprocs,
75 +                                 int *striping_info)
76 +{
77 +    /* please refer to the comments in the function above for the detailed algorithm */
78 +    int rank_index;
79 +    ADIO_Offset avail_bytes;
80 +
81 +    int stripe_size = striping_info[0];
82 +    int stripe_count = striping_info[1];
83 +    int CO = striping_info[2];
84 +    int avail_nprocs = ADIOI_MIN(stripe_count * CO, nprocs);
85 +
86 +    /* calculate the rank by offset directly */
87 +    rank_index = (int)((off / stripe_size) % avail_nprocs);
88 +    /* XXX: the above method is so simple that the lowest-ranked processes are always
89 +     * chosen to be I/O clients. Ideally the chosen processes would differ each time.
90 +     */
91 +
92 +    avail_bytes = (off / (ADIO_Offset)stripe_size + 1) *
93 +                  (ADIO_Offset)stripe_size - off;
94 +    if (avail_bytes < *len) {
95 +       /* this proc only has part of the requested contig. region */
96 +       *len = avail_bytes;
97 +    }
98 +
99 +    return rank_index;
100 +}
101 +
102 +void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list,
103 +                             int *len_list, int contig_access_count,
104 +                             int *striping_info, int nprocs,
105 +                              int *count_my_req_procs_ptr,
106 +                             int **count_my_req_per_proc_ptr,
107 +                             ADIOI_Access ** my_req_ptr,
108 +                             int **buf_idx_ptr)
109 +{
110 +    int *count_my_req_per_proc, count_my_req_procs, *buf_idx;
111 +    int i, l, proc;
112 +    ADIO_Offset avail_len, rem_len, curr_idx, off;
113 +    ADIOI_Access *my_req;
114 +
115 +    *count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
116 +    count_my_req_per_proc = *count_my_req_per_proc_ptr;
117 +
118 +    /* buf_idx is relevant only if buftype_is_contig.
119 +     * buf_idx[i] gives the index into user_buf where data received
120 +     * from proc. i should be placed. This allows receives to be done
121 +     * without extra buffer. This can't be done if buftype is not contig.
122 +     */
123 +    buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));
124 +    /* initialize buf_idx to -1 */
125 +    for (i = 0; i < nprocs; i++)
126 +       buf_idx[i] = -1;
127 +
128 +    /* one pass just to calculate how much space to allocate for my_req;
129 +     * contig_access_count was calculated way back in ADIOI_Calc_my_off_len()
130 +     */
131 +    for (i = 0; i < contig_access_count; i++) {
132 +       /* short circuit offset/len processing if len == 0
133 +        * (zero-byte read/write)
134 +        */
135 +       if (len_list[i] == 0)
136 +           continue;
137 +       off = offset_list[i];
138 +       avail_len = len_list[i];
139 +       /* we set avail_len to be the total size of the access.
140 +        * then ADIOI_LUSTRE_Calc_aggregator() will modify the value to return
141 +        * the amount that was available.
142 +        */
143 +       proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, nprocs,
144 +                                            striping_info);
145 +       count_my_req_per_proc[proc]++;
146 +       /* figure out how much data remains in the access;
147 +        * we'll take care of this data (if there is any)
148 +        * in the while loop below.
149 +        */
150 +       rem_len = len_list[i] - avail_len;
151 +
152 +       while (rem_len != 0) {
153 +           off += avail_len;   /* point to first remaining byte */
154 +           avail_len = rem_len;        /* save remaining size, pass to calc */
155 +           proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, nprocs,
156 +                                               striping_info);
157 +           count_my_req_per_proc[proc]++;
158 +           rem_len -= avail_len;       /* reduce remaining length by amount from fd */
159 +       }
160 +    }
161 +
162 +    *my_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs * sizeof(ADIOI_Access));
163 +    my_req = *my_req_ptr;
164 +
165 +    count_my_req_procs = 0;
166 +    for (i = 0; i < nprocs; i++) {
167 +       if (count_my_req_per_proc[i]) {
168 +           my_req[i].offsets = (ADIO_Offset *)
169 +                               ADIOI_Malloc(count_my_req_per_proc[i] *
170 +                                             sizeof(ADIO_Offset));
171 +           my_req[i].lens = (int *) ADIOI_Malloc(count_my_req_per_proc[i] *
172 +                                                 sizeof(int));
173 +           count_my_req_procs++;
174 +       }
175 +       my_req[i].count = 0;    /* will be incremented where needed later */
176 +    }
177 +
178 +    /* now fill in my_req */
179 +    curr_idx = 0;
180 +    for (i = 0; i < contig_access_count; i++) {
181 +       if (len_list[i] == 0)
182 +           continue;
183 +       off = offset_list[i];
184 +       avail_len = len_list[i];
185 +       proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, nprocs,
186 +                                            striping_info);
187 +
188 +       /* for each separate contiguous access from this process */
189 +       if (buf_idx[proc] == -1)
190 +           buf_idx[proc] = (int) curr_idx;
191 +
192 +       l = my_req[proc].count;
193 +       curr_idx += (int) avail_len;    /* NOTE: Why is curr_idx an int?  Fix? */
194 +
195 +       rem_len = len_list[i] - avail_len;
196 +
197 +       /* store the proc, offset, and len information in an array
198 +        * of structures, my_req. Each structure contains the
199 +        * offsets and lengths located in that process's FD,
200 +        * and the associated count.
201 +        */
202 +       my_req[proc].offsets[l] = off;
203 +       my_req[proc].lens[l] = (int) avail_len;
204 +       my_req[proc].count++;
205 +
206 +       while (rem_len != 0) {
207 +           off += avail_len;
208 +           avail_len = rem_len;
209 +           proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, nprocs,
210 +                                                striping_info);
211 +           if (buf_idx[proc] == -1)
212 +               buf_idx[proc] = (int) curr_idx;
213 +
214 +           l = my_req[proc].count;
215 +           curr_idx += avail_len;
216 +           rem_len -= avail_len;
217 +
218 +           my_req[proc].offsets[l] = off;
219 +           my_req[proc].lens[l] = (int) avail_len;
220 +           my_req[proc].count++;
221 +       }
222 +    }
223 +
224 +#ifdef AGG_DEBUG
225 +    for (i = 0; i < nprocs; i++) {
226 +       if (count_my_req_per_proc[i] > 0) {
227 +           FPRINTF(stdout, "data needed from %d (count = %d):\n",
228 +                           i, my_req[i].count);
229 +           for (l = 0; l < my_req[i].count; l++) {
230 +               FPRINTF(stdout, "   off[%d] = %lld, len[%d] = %d\n",
231 +                               l, my_req[i].offsets[l], l, my_req[i].lens[l]);
232 +           }
233 +       }
234 +    }
235 +#endif
236 +#if 0
237 +    for (i = 0; i < nprocs; i++) {
238 +       FPRINTF(stdout, "buf_idx[%d] = 0x%x\n", i, buf_idx[i]);
239 +    }
240 +#endif
241 +
242 +    *count_my_req_procs_ptr = count_my_req_procs;
243 +    *buf_idx_ptr = buf_idx;
244 +}
245 +
246 +int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count,
247 +                          int *len_list, int nprocs)
248 +{
249 +    /* Algorithm:
250 +     * So far, only one case is suitable for collective I/O
251 +     *  (1) request size <= big_req_size
252 +     *
253 +     * if (avg_req_size > big_req_size) {
254 +     *    docollect = 0;
255 +     * }
256 +     */
257 +
258 +    int i, docollect = 1, lflag, big_req_size = 0;
259 +    ADIO_Offset req_size = 0, total_req_size;
260 +    int avg_req_size, total_access_count;
261 +    char *value = NULL;
262 +
263 +    /* calculate total_req_size and total_access_count */
264 +    for (i = 0; i < contig_access_count; i++)
265 +        req_size += len_list[i];
266 +    MPI_Allreduce(&req_size, &total_req_size, 1, MPI_LONG_LONG_INT, MPI_SUM,
267 +               fd->comm);
268 +    MPI_Allreduce(&contig_access_count, &total_access_count, 1, MPI_INT, MPI_SUM,
269 +               fd->comm);
270 +    /* estimate average req_size */
271 +    avg_req_size = (int)(total_req_size / total_access_count);
272 +
273 +    /* get hint of big_req_size */
274 +    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
275 +    MPI_Info_get(fd->info, "big_req_size", MPI_MAX_INFO_VAL, value, &lflag);
276 +    if (lflag)
277 +        big_req_size = atoi(value);
278 +
279 +    if ((big_req_size > 0) && (avg_req_size > big_req_size))
280 +        docollect = 0;
281 +
282 +    ADIOI_Free(value);
283 +
284 +    return docollect;
285 +}
286 +
287 +void ADIOI_LUSTRE_Calc_my_off_len(ADIO_File fd, int bufcount,
288 +                                  MPI_Datatype datatype, int file_ptr_type,
289 +                                  ADIO_Offset offset,
290 +                                  ADIO_Offset **offset_list_ptr,
291 +                                 int **len_list_ptr,
292 +                                 ADIO_Offset *start_offset_ptr,
293 +                                 ADIO_Offset *end_offset_ptr,
294 +                                  int *contig_access_count_ptr)
295 +{
296 +    int filetype_size, buftype_size, etype_size;
297 +    int i, j, k, frd_size = 0, old_frd_size = 0, st_index = 0;
298 +    int n_filetypes, etype_in_filetype;
299 +    ADIO_Offset abs_off_in_filetype = 0;
300 +    int bufsize, sum, n_etypes_in_filetype, size_in_filetype;
301 +    int contig_access_count, *len_list, flag, filetype_is_contig;
302 +    MPI_Aint filetype_extent, filetype_lb;
303 +    ADIOI_Flatlist_node *flat_file;
304 +    ADIO_Offset *offset_list, off, end_offset = 0, disp;
305 +
306 +    /* For this process's request, calculate the list of offsets and
307 +    lengths in the file and determine the start and end offsets. */
308 +
309 +    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
310 +
311 +    MPI_Type_size(fd->filetype, &filetype_size);
312 +    MPI_Type_extent(fd->filetype, &filetype_extent);
313 +    MPI_Type_lb(fd->filetype, &filetype_lb);
314 +    MPI_Type_size(datatype, &buftype_size);
315 +    etype_size = fd->etype_size;
316 +
317 +    if (!filetype_size) {
318 +       *contig_access_count_ptr = 0;
319 +       *offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset));
320 +       *len_list_ptr = (int *) ADIOI_Malloc(2 * sizeof(int));
321 +       /* 2 is for consistency. everywhere I malloc one more than needed */
322 +
323 +       offset_list = *offset_list_ptr;
324 +       len_list = *len_list_ptr;
325 +       offset_list[0] = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
326 +                        fd->disp + etype_size * offset;
327 +       len_list[0] = 0;
328 +       *start_offset_ptr = offset_list[0];
329 +       *end_offset_ptr = offset_list[0] + len_list[0] - 1;
330 +       return;
331 +    }
332 +
333 +    if (filetype_is_contig) {
334 +       *contig_access_count_ptr = 1;
335 +       *offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset));
336 +       *len_list_ptr = (int *) ADIOI_Malloc(2 * sizeof(int));
337 +       /* 2 is for consistency. everywhere I malloc one more than needed */
338 +
339 +       offset_list = *offset_list_ptr;
340 +       len_list = *len_list_ptr;
341 +       offset_list[0] = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
342 +                        fd->disp + etype_size * offset;
343 +       len_list[0] = bufcount * buftype_size;
344 +       *start_offset_ptr = offset_list[0];
345 +       *end_offset_ptr = offset_list[0] + len_list[0] - 1;
346 +
347 +       /* update file pointer */
348 +       if (file_ptr_type == ADIO_INDIVIDUAL)
349 +           fd->fp_ind = *end_offset_ptr + 1;
350 +    } else {
351 +       /* First calculate what size of offset_list and len_list to allocate */
352 +       /* filetype already flattened in ADIO_Open or ADIO_Fcntl */
353 +       flat_file = ADIOI_Flatlist;
354 +       while (flat_file->type != fd->filetype)
355 +           flat_file = flat_file->next;
356 +       disp = fd->disp;
357 +
358 +       if (file_ptr_type == ADIO_INDIVIDUAL) {
359 +           offset = fd->fp_ind;        /* in bytes */
360 +           n_filetypes = -1;
361 +           flag = 0;
362 +           while (!flag) {
363 +               n_filetypes++;
364 +               for (i = 0; i < flat_file->count; i++) {
365 +                   if (disp + flat_file->indices[i] +
366 +                       (ADIO_Offset) n_filetypes * filetype_extent +
367 +                       flat_file->blocklens[i] >= offset) {
368 +                       st_index = i;
369 +                       frd_size = (int) (disp + flat_file->indices[i] +
370 +                                         (ADIO_Offset) n_filetypes *
371 +                                         filetype_extent +
372 +                                         flat_file->blocklens[i] -
373 +                                         offset);
374 +                       flag = 1;
375 +                       break;
376 +                   }
377 +               }
378 +           }
379 +       } else {
380 +           n_etypes_in_filetype = filetype_size / etype_size;
381 +           n_filetypes = (int) (offset / n_etypes_in_filetype);
382 +           etype_in_filetype = (int) (offset % n_etypes_in_filetype);
383 +           size_in_filetype = etype_in_filetype * etype_size;
384 +
385 +           sum = 0;
386 +           for (i = 0; i < flat_file->count; i++) {
387 +               sum += flat_file->blocklens[i];
388 +               if (sum > size_in_filetype) {
389 +                   st_index = i;
390 +                   frd_size = sum - size_in_filetype;
391 +                   abs_off_in_filetype = flat_file->indices[i] +
392 +                                          size_in_filetype -
393 +                                          (sum - flat_file->blocklens[i]);
394 +                   break;
395 +               }
396 +           }
397 +
398 +           /* abs. offset in bytes in the file */
399 +           offset = disp + (ADIO_Offset) n_filetypes *filetype_extent +
400 +                    abs_off_in_filetype;
401 +       }
402 +
403 +       /* calculate how much space to allocate for offset_list, len_list */
404 +
405 +       old_frd_size = frd_size;
406 +       contig_access_count = i = 0;
407 +       j = st_index;
408 +       bufsize = buftype_size * bufcount;
409 +       frd_size = ADIOI_MIN(frd_size, bufsize);
410 +       while (i < bufsize) {
411 +           if (frd_size)
412 +               contig_access_count++;
413 +           i += frd_size;
414 +           j = (j + 1) % flat_file->count;
415 +           frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize - i);
416 +       }
417 +
418 +       /* allocate space for offset_list and len_list */
419 +
420 +       *offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc((contig_access_count+1) *
421 +                                                        sizeof(ADIO_Offset));
422 +       *len_list_ptr = (int *) ADIOI_Malloc((contig_access_count + 1) *
423 +                                             sizeof(int));
424 +       /* +1 to avoid a 0-size malloc */
425 +
426 +       offset_list = *offset_list_ptr;
427 +       len_list = *len_list_ptr;
428 +
429 +       /* find start offset, end offset, and fill in offset_list and len_list */
430 +
431 +       *start_offset_ptr = offset;     /* calculated above */
432 +
433 +       i = k = 0;
434 +       j = st_index;
435 +       off = offset;
436 +       frd_size = ADIOI_MIN(old_frd_size, bufsize);
437 +       while (i < bufsize) {
438 +           if (frd_size) {
439 +               offset_list[k] = off;
440 +               len_list[k] = frd_size;
441 +               k++;
442 +           }
443 +           i += frd_size;
444 +           end_offset = off + frd_size - 1;
445 +
446 +           /* Note: end_offset points to the last byte-offset that will be accessed.
447 +              e.g., if start_offset=0 and 100 bytes to be read, end_offset=99 */
448 +
449 +           if (off + frd_size < disp + flat_file->indices[j] +
450 +                                flat_file->blocklens[j] +
451 +                                (ADIO_Offset) n_filetypes * filetype_extent) {
452 +               off += frd_size;
453 +               /* did not reach end of contiguous block in filetype.
454 +                * no more I/O needed. off is incremented by frd_size.
455 +                */
456 +           } else {
457 +               if (j < (flat_file->count - 1))
458 +                   j++;
459 +               else {
460 +                   /* hit end of flattened filetype;
461 +                    * start at beginning again
462 +                    */
463 +                   j = 0;
464 +                   n_filetypes++;
465 +               }
466 +               off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes *
467 +                                                     filetype_extent;
468 +               frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize - i);
469 +           }
470 +       }
471 +
472 +       /* update file pointer */
473 +       if (file_ptr_type == ADIO_INDIVIDUAL)
474 +           fd->fp_ind = off;
475 +
476 +       *contig_access_count_ptr = contig_access_count;
477 +       *end_offset_ptr = end_offset;
478 +    }
479 +}
480 +
481 +void ADIOI_LUSTRE_Calc_others_req(ADIO_File fd, int count_my_req_procs,
482 +                                 int *count_my_req_per_proc,
483 +                                 ADIOI_Access * my_req,
484 +                                 int nprocs, int myrank,
485 +                                  ADIO_Offset start_offset,
486 +                                  ADIO_Offset end_offset,
487 +                                  int *striping_info,
488 +                                 int *count_others_req_procs_ptr,
489 +                                 ADIOI_Access ** others_req_ptr)
490 +{
491 +    /* what requests of other processes will be written by this process */
492 +
493 +    int *count_others_req_per_proc, count_others_req_procs;
494 +    int i, j, lflag, samesize = 0, contiguous = 0;
495 +    MPI_Request *send_requests, *recv_requests;
496 +    MPI_Status *statuses;
497 +    ADIOI_Access *others_req;
498 +    char *value = NULL;
499 +    int proc, avail_nprocs, stripe_count, CO;
500 +    ADIO_Offset min_st_offset, off, req_len, avail_len, rem_len, *all_lens;
501 +
502 +    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
503 +    /* same io size */
504 +    MPI_Info_get(fd->info, "same_io_size", MPI_MAX_INFO_VAL, value, &lflag);
505 +    if (lflag && !strcmp(value, "yes"))
506 +        samesize = 1;
507 +    /* contiguous data */
508 +    MPI_Info_get(fd->info, "contiguous_data", MPI_MAX_INFO_VAL, value, &lflag);
509 +    if (lflag && !strcmp(value, "yes"))
510 +        contiguous = 1;
511 +
512 +    *others_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs *
513 +                                                    sizeof(ADIOI_Access));
514 +    others_req = *others_req_ptr;
515 +
516 +    /* if the data are contiguous, we don't need to do MPI_Alltoall */
517 +    if (contiguous) {
518 +        stripe_count = striping_info[1];
519 +        CO = striping_info[2];
520 +
521 +        for (i = 0; i < nprocs; i++) {
522 +            others_req[i].count = 0;
523 +        }
524 +        req_len = end_offset - start_offset + 1;
525 +        all_lens = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
526 +
527 +        if (samesize == 0) {/* different request size */
528 +            /* calculate the min_st_offset */
529 +            MPI_Allreduce(&start_offset, &min_st_offset, 1, MPI_LONG_LONG,
530 +                          MPI_MIN, fd->comm);
531 +            /* exchange request length */
532 +            MPI_Allgather(&req_len, 1, ADIO_OFFSET, all_lens, 1, ADIO_OFFSET,
533 +                          fd->comm);
534 +        } else { /* same request size */
535 +            /* calculate the min_st_offset */
536 +            min_st_offset = start_offset - myrank * req_len;
537 +            /* assign request length to all_lens[] */
538 +            for (i = 0; i < nprocs; i ++)
539 +               all_lens[i] = req_len;
540 +        }
541 +        avail_nprocs = ADIOI_MIN(nprocs, stripe_count * CO);
542 +        if (myrank < avail_nprocs) {
543 +            off = min_st_offset;
544 +            /* calculate others_req[i].count */
545 +            for (i = 0; i < nprocs; i++) {
546 +                avail_len = all_lens[i];
547 +                rem_len = avail_len;
548 +                while (rem_len > 0) {
549 +                   proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len,
550 +                                                        nprocs, striping_info);
551 +                    if (proc == myrank) {
552 +                        others_req[i].count ++;
553 +                    }
554 +                    off += avail_len;
555 +                    rem_len -= avail_len;
556 +                    avail_len = rem_len;
557 +                }
558 +            }
559 +            /* calculate offset and len for each request */
560 +            off = min_st_offset;
561 +            for (i = 0; i < nprocs; i++) {
562 +                if (others_req[i].count) {
563 +                   others_req[i].offsets = (ADIO_Offset *)
564 +                                            ADIOI_Malloc(others_req[i].count *
565 +                                                        sizeof(ADIO_Offset));
566 +                   others_req[i].lens = (int *)
567 +                                         ADIOI_Malloc(others_req[i].count *
568 +                                                      sizeof(int));
569 +                    others_req[i].mem_ptrs = (MPI_Aint *)
570 +                                             ADIOI_Malloc(others_req[i].count *
571 +                                                         sizeof(MPI_Aint));
572 +                }
573 +                j = 0;
574 +                avail_len = all_lens[i];
575 +                rem_len = avail_len;
576 +                while (rem_len > 0) {
577 +                   proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len,
578 +                                                        nprocs, striping_info);
579 +                    if (proc == myrank) {
580 +                        others_req[i].offsets[j] = off;
581 +                        others_req[i].lens[j] = (int)avail_len;
582 +                        j ++;
583 +                    }
584 +                    off += avail_len;
585 +                    rem_len -= avail_len;
586 +                    avail_len = rem_len;
587 +                }
588 +            }
589 +        }
590 +        ADIOI_Free(value);
591 +        ADIOI_Free(all_lens);
592 +    } else {
593 +        /* multiple non-contiguous requests */
594 +        /* first find out how much to send/recv and from/to whom */
595 +
596 +        /*
597 +         * count_others_req_procs:
598 +         *    number of processes whose requests will be written by
599 +         *    this process (including this process itself)
600 +         * count_others_req_per_proc[i]:
601 +         *    how many separate contiguous requests of proc[i] will be
602 +         *    written by this process.
603 +         */
604 +
605 +        count_others_req_per_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
606 +
607 +        MPI_Alltoall(count_my_req_per_proc, 1, MPI_INT,
608 +                    count_others_req_per_proc, 1, MPI_INT, fd->comm);
609 +
610 +        count_others_req_procs = 0;
611 +        for (i = 0; i < nprocs; i++) {
612 +           if (count_others_req_per_proc[i]) {
613 +               others_req[i].count = count_others_req_per_proc[i];
614 +               others_req[i].offsets = (ADIO_Offset *)
615 +                                        ADIOI_Malloc(others_req[i].count *
616 +                                                sizeof(ADIO_Offset));
617 +               others_req[i].lens = (int *)
618 +                                    ADIOI_Malloc(others_req[i].count *
619 +                                                  sizeof(int));
620 +               others_req[i].mem_ptrs = (MPI_Aint *)
621 +                                        ADIOI_Malloc(others_req[i].count *
622 +                                                     sizeof(MPI_Aint));
623 +               count_others_req_procs++;
624 +           } else
625 +               others_req[i].count = 0;
626 +        }
627 +
628 +        /* now send the calculated offsets and lengths to respective processes */
629 +
630 +        send_requests = (MPI_Request *) ADIOI_Malloc(2 * (count_my_req_procs + 1) *
631 +                                                     sizeof(MPI_Request));
632 +        recv_requests = (MPI_Request *) ADIOI_Malloc(2 * (count_others_req_procs+1)*
633 +                                                    sizeof(MPI_Request));
634 +        /* +1 to avoid a 0-size malloc */
635 +
636 +        j = 0;
637 +        for (i = 0; i < nprocs; i++) {
638 +           if (others_req[i].count) {
639 +               MPI_Irecv(others_req[i].offsets, others_req[i].count,
640 +                         ADIO_OFFSET, i, i + myrank, fd->comm,
641 +                         &recv_requests[j]);
642 +               j++;
643 +               MPI_Irecv(others_req[i].lens, others_req[i].count,
644 +                         MPI_INT, i, i + myrank + 1, fd->comm,
645 +                         &recv_requests[j]);
646 +               j++;
647 +           }
648 +        }
649 +
650 +        j = 0;
651 +        for (i = 0; i < nprocs; i++) {
652 +           if (my_req[i].count) {
653 +               MPI_Isend(my_req[i].offsets, my_req[i].count,
654 +                         ADIO_OFFSET, i, i + myrank, fd->comm,
655 +                         &send_requests[j]);
656 +               j++;
657 +               MPI_Isend(my_req[i].lens, my_req[i].count,
658 +                         MPI_INT, i, i + myrank + 1, fd->comm,
659 +                         &send_requests[j]);
660 +               j++;
661 +           }
662 +        }
663 +
664 +        statuses = (MPI_Status *)
665 +                   ADIOI_Malloc((1 + 2 * ADIOI_MAX(count_my_req_procs,
666 +                                                  count_others_req_procs)) *
667 +                                         sizeof(MPI_Status));
668 +        /* +1 to avoid a 0-size malloc */
669 +
670 +        MPI_Waitall(2 * count_my_req_procs, send_requests, statuses);
671 +        MPI_Waitall(2 * count_others_req_procs, recv_requests, statuses);
672 +
673 +        ADIOI_Free(send_requests);
674 +        ADIOI_Free(recv_requests);
675 +        ADIOI_Free(statuses);
676 +        ADIOI_Free(count_others_req_per_proc);
677 +
678 +        *count_others_req_procs_ptr = count_others_req_procs;
679 +    }
680 +}
681 diff -ruN ad_lustre_orig/ad_lustre.c ad_lustre/ad_lustre.c
682 --- ad_lustre_orig/ad_lustre.c  2008-09-17 14:36:57.000000000 +0800
683 +++ ad_lustre/ad_lustre.c       2008-09-17 18:20:35.000000000 +0800
684 @@ -1,9 +1,11 @@
685  /* -*- Mode: C; c-basic-offset:4 ; -*- */
686 -/* 
687 - *   Copyright (C) 2001 University of Chicago. 
688 +/*
689 + *   Copyright (C) 2001 University of Chicago.
690   *   See COPYRIGHT notice in top-level directory.
691   *
692   *   Copyright (C) 2007 Oak Ridge National Laboratory
693 + *
694 + *   Copyright (C) 2008 Sun Microsystems, Lustre group
695   */
696  
697  #include "ad_lustre.h"
698 @@ -13,13 +15,13 @@
699      ADIOI_LUSTRE_ReadContig, /* ReadContig */
700      ADIOI_LUSTRE_WriteContig, /* WriteContig */
701      ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
702 -    ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */
703 +    ADIOI_LUSTRE_WriteStridedColl, /* WriteStridedColl */
704      ADIOI_GEN_SeekIndividual, /* SeekIndividual */
705 -    ADIOI_GEN_Fcntl, /* Fcntl */
706 +    ADIOI_LUSTRE_Fcntl, /* Fcntl */
707      ADIOI_LUSTRE_SetInfo, /* SetInfo */
708      ADIOI_GEN_ReadStrided, /* ReadStrided */
709 -    ADIOI_GEN_WriteStrided, /* WriteStrided */
710 -    ADIOI_GEN_Close, /* Close */
711 +    ADIOI_LUSTRE_WriteStrided, /* WriteStrided */
712 +    ADIOI_LUSTRE_Close, /* Close */
713  #if defined(ROMIO_HAVE_WORKING_AIO) && !defined(CRAY_XT_LUSTRE)
714      ADIOI_GEN_IreadContig, /* IreadContig */
715      ADIOI_GEN_IwriteContig, /* IwriteContig */
716 diff -ruN ad_lustre_orig/ad_lustre_close.c ad_lustre/ad_lustre_close.c
717 --- ad_lustre_orig/ad_lustre_close.c    1970-01-01 08:00:00.000000000 +0800
718 +++ ad_lustre/ad_lustre_close.c 2008-09-17 18:20:35.000000000 +0800
719 @@ -0,0 +1,42 @@
720 +/* -*- Mode: C; c-basic-offset:4 ; -*- */
721 +/*
722 + *
723 + *   Copyright (C) 1997 University of Chicago.
724 + *   See COPYRIGHT notice in top-level directory.
725 + *
726 + *   Copyright (C) 2007 Oak Ridge National Laboratory
727 + *
728 + *   Copyright (C) 2008 Sun Microsystems, Lustre group
729 + */
730 +
731 +#include "ad_lustre.h"
732 +
733 +#ifdef PROFILE
734 +#include "mpe.h"
735 +#endif
736 +
737 +/* Close fd_sys and, when a direct-I/O descriptor is open, fd_direct too.
738 + * Both descriptors are reset to -1; *error_code becomes MPI_SUCCESS, or an
739 + * MPI_ERR_IO code built from errno when either close() fails. */
740 +void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code)
741 +{
742 +    int err, derr = 0;
743 +    static char myname[] = "ADIOI_LUSTRE_CLOSE";
744 +
745 +#ifdef PROFILE
746 +    MPE_Log_event(9, 0, "start close");
747 +#endif
748 +    err = close(fd->fd_sys);
749 +    if (fd->fd_direct >= 0)    /* -1 when no direct-I/O fd was opened */
750 +       derr = close(fd->fd_direct);
751 +#ifdef PROFILE
752 +    MPE_Log_event(10, 0, "end close");
753 +#endif
754 +    fd->fd_sys = -1;
755 +    fd->fd_direct = -1;
756 +    *error_code = MPI_SUCCESS;
757 +    if (err == -1 || derr == -1)
758 +       *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
759 +                                          myname, __LINE__, MPI_ERR_IO,
760 +                                          "**io", "**io %s", strerror(errno));
761 +}
762 diff -ruN ad_lustre_orig/ad_lustre.h ad_lustre/ad_lustre.h
763 --- ad_lustre_orig/ad_lustre.h  2008-09-17 14:36:57.000000000 +0800
764 +++ ad_lustre/ad_lustre.h       2008-10-06 16:07:21.000000000 +0800
765 @@ -1,9 +1,11 @@
766  /* -*- Mode: C; c-basic-offset:4 ; -*- */
767 -/* 
768 - *   Copyright (C) 1997 University of Chicago. 
769 +/*
770 + *   Copyright (C) 1997 University of Chicago.
771   *   See COPYRIGHT notice in top-level directory.
772   *
773   *   Copyright (C) 2007 Oak Ridge National Laboratory
774 + *
775 + *   Copyright (C) 2008 Sun Microsystems, Lustre group
776   */
777  
778  #ifndef AD_UNIX_INCLUDE
779 @@ -24,7 +26,32 @@
780  
781  /*#include <fcntl.h>*/
782  #include <sys/ioctl.h>
783 +#ifdef WITH_LUSTRE
784  #include "lustre/lustre_user.h"
785 +#else
786 +/* fallback copies of the lustre_user.h definitions used by this driver */
787 +#  define LOV_USER_MAGIC 0x0BD10BD0
788 +#  define LL_IOC_LOV_SETSTRIPE  _IOW ('f', 154, long)
789 +#  define LL_IOC_LOV_GETSTRIPE  _IOW ('f', 155, long)
790 +#  define lov_user_ost_data lov_user_ost_data_v1
791 +struct lov_user_ost_data_v1 {     /* per-stripe data structure */
792 +        __u64 l_object_id;        /* OST object ID */
793 +        __u64 l_object_gr;        /* OST object group (creating MDS number) */
794 +        __u32 l_ost_gen;          /* generation of this OST index */
795 +        __u32 l_ost_idx;          /* OST index in LOV */
796 +} __attribute__((packed));
797 +#define lov_user_md lov_user_md_v1
798 +struct lov_user_md_v1 {           /* LOV EA user data (host-endian) */
799 +        __u32 lmm_magic;          /* magic number = LOV_USER_MAGIC_V1 */
800 +        __u32 lmm_pattern;        /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
801 +        __u64 lmm_object_id;      /* LOV object ID */
802 +        __u64 lmm_object_gr;      /* LOV object group */
803 +        __u32 lmm_stripe_size;    /* size of stripe in bytes */
804 +        __u16 lmm_stripe_count;   /* num stripes in use for this object */
805 +        __u16 lmm_stripe_offset;  /* starting stripe offset in lmm_objects */
806 +        struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */
807 +} __attribute__((packed));
808 +#endif
809  #include "adio.h"
810  /*#include "adioi.h"*/
811  
812 @@ -41,24 +68,62 @@
813  
814  void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code);
815  void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code);
816 -void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count, 
817 -                      MPI_Datatype datatype, int file_ptr_type,
818 -                     ADIO_Offset offset, ADIO_Status *status, int
819 -                    *error_code);
820 -void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count, 
821 -                      MPI_Datatype datatype, int file_ptr_type,
822 -                      ADIO_Offset offset, ADIO_Status *status, int
823 -                     *error_code);   
824 +void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count,
825 +                             MPI_Datatype datatype, int file_ptr_type,
826 +                             ADIO_Offset offset, ADIO_Status *status,
827 +                             int *error_code);
828 +void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count,
829 +                              MPI_Datatype datatype, int file_ptr_type,
830 +                              ADIO_Offset offset, ADIO_Status *status,
831 +                              int *error_code);
832 +void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count,
833 +                              MPI_Datatype datatype, int file_ptr_type,
834 +                              ADIO_Offset offset, ADIO_Status *status,
835 +                              int *error_code);
836  void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count,
837 -                      MPI_Datatype datatype, int file_ptr_type,
838 -                      ADIO_Offset offset, ADIO_Status *status, int
839 -                      *error_code);
840 +                                  MPI_Datatype datatype, int file_ptr_type,
841 +                                  ADIO_Offset offset, ADIO_Status *status,
842 +                                   int *error_code);
843  void ADIOI_LUSTRE_ReadStridedColl(ADIO_File fd, void *buf, int count,
844 -                      MPI_Datatype datatype, int file_ptr_type,
845 -                      ADIO_Offset offset, ADIO_Status *status, int
846 -                      *error_code);
847 +                                 MPI_Datatype datatype, int file_ptr_type,
848 +                                 ADIO_Offset offset, ADIO_Status *status,
849 +                                  int *error_code);
850 +void ADIOI_LUSTRE_ReadStrided(ADIO_File fd, void *buf, int count,
851 +                             MPI_Datatype datatype, int file_ptr_type,
852 +                             ADIO_Offset offset, ADIO_Status *status,
853 +                              int *error_code);
854  void ADIOI_LUSTRE_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct,
855                        int *error_code);
856  void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
857 -
858 +void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int ** striping_info_ptr,
859 +                                   int mode, int nprocs);
860 +int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off,
861 +                                ADIO_Offset *len, int nprocs,
862 +                                 int *striping_info);
863 +void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list,
864 +                             int *len_list, int contig_access_count,
865 +                             int *striping_info, int nprocs,
866 +                              int *count_my_req_procs_ptr,
867 +                             int **count_my_req_per_proc_ptr,
868 +                             ADIOI_Access ** my_req_ptr,
869 +                             int **buf_idx_ptr);
870 +int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count,
871 +                          int *len_list, int nprocs);
872 +void ADIOI_LUSTRE_Calc_my_off_len(ADIO_File fd, int bufcount,
873 +                                  MPI_Datatype datatype, int file_ptr_type,
874 +                                  ADIO_Offset offset,
875 +                                  ADIO_Offset **offset_list_ptr,
876 +                                 int **len_list_ptr,
877 +                                 ADIO_Offset *start_offset_ptr,
878 +                                 ADIO_Offset *end_offset_ptr,
879 +                                  int *contig_access_count_ptr);
880 +void ADIOI_LUSTRE_Calc_others_req(ADIO_File fd, int count_my_req_procs,
881 +                                 int *count_my_req_per_proc,
882 +                                 ADIOI_Access * my_req,
883 +                                 int nprocs, int myrank,
884 +                                  ADIO_Offset start_offset,
885 +                                  ADIO_Offset end_offset,
886 +                                  int *striping_info,
887 +                                 int *count_others_req_procs_ptr,
888 +                                 ADIOI_Access ** others_req_ptr);
889  #endif /* End of AD_UNIX_INCLUDE */
890 diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c
891 --- ad_lustre_orig/ad_lustre_hints.c    2008-09-17 14:36:57.000000000 +0800
892 +++ ad_lustre/ad_lustre_hints.c 2008-09-17 18:20:35.000000000 +0800
893 @@ -1,9 +1,11 @@
894  /* -*- Mode: C; c-basic-offset:4 ; -*- */
895 -/* 
896 - *   Copyright (C) 1997 University of Chicago. 
897 +/*
898 + *   Copyright (C) 1997 University of Chicago.
899   *   See COPYRIGHT notice in top-level directory.
900   *
901   *   Copyright (C) 2007 Oak Ridge National Laboratory
902 + *
903 + *   Copyright (C) 2008 Sun Microsystems, Lustre group
904   */
905  
906  #include "ad_lustre.h"
907 @@ -11,130 +13,162 @@
908  
909  void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
910  {
911 -    char *value, *value_in_fd;
912 -    int flag, tmp_val[3], str_factor=-1, str_unit=0, start_iodev=-1;
913 -    struct lov_user_md lum = { 0 };
914 -    int err, myrank, fd_sys, perm, amode, old_mask;
915 +    char *value = NULL;
916 +    int flag, tmp_val, int_val, str_factor, str_unit, start_iodev;
917 +    static char myname[] = "ADIOI_LUSTRE_SETINFO";
918  
919 -    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
920      if ( (fd->info) == MPI_INFO_NULL) {
921 -       /* This must be part of the open call. can set striping parameters 
922 -           if necessary. */ 
923 +       /* This must be part of the open call. can set striping parameters
924 +           if necessary. */
925         MPI_Info_create(&(fd->info));
926  
927         MPI_Info_set(fd->info, "direct_read", "false");
928         MPI_Info_set(fd->info, "direct_write", "false");
929         fd->direct_read = fd->direct_write = 0;
930 -       
931 -       /* has user specified striping or server buffering parameters 
932 +
933 +       /* has user specified striping or server buffering parameters
934             and do they have the same value on all processes? */
935         if (users_info != MPI_INFO_NULL) {
936 -           MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, 
937 -                        value, &flag);
938 -           if (flag) 
939 -               str_unit=atoi(value);
940 -
941 -           MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, 
942 -                        value, &flag);
943 -           if (flag) 
944 -               str_factor=atoi(value);
945 +           value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
946  
947 -           MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL, 
948 +            /* direct read and write */
949 +           MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL,
950                          value, &flag);
951 -           if (flag) 
952 -               start_iodev=atoi(value);
953 -
954 -           MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL, 
955 -                            value, &flag);
956             if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) {
957                 MPI_Info_set(fd->info, "direct_read", "true");
958                 fd->direct_read = 1;
959             }
960  
961 -           MPI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL, 
962 +           MPI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL,
963                              value, &flag);
964             if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) {
965                 MPI_Info_set(fd->info, "direct_write", "true");
966                 fd->direct_write = 1;
967             }
968 -       }
969  
970 -       MPI_Comm_rank(fd->comm, &myrank);
971 -       if (myrank == 0) {
972 -           tmp_val[0] = str_factor;
973 -           tmp_val[1] = str_unit;
974 -           tmp_val[2] = start_iodev;
975 -       }
976 -       MPI_Bcast(tmp_val, 3, MPI_INT, 0, fd->comm);
977 -
978 -       if (tmp_val[0] != str_factor 
979 -               || tmp_val[1] != str_unit 
980 -               || tmp_val[2] != start_iodev) {
981 -           FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: All keys"
982 -                   "-striping_factor:striping_unit:start_iodevice "
983 -                   "need to be identical across all processes\n");
984 -           MPI_Abort(MPI_COMM_WORLD, 1);
985 -               } else if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0)) {
986 -            /* if user has specified striping info, process 0 tries to set it */
987 -           if (!myrank) {
988 -               if (fd->perm == ADIO_PERM_NULL) {
989 -                   old_mask = umask(022);
990 -                   umask(old_mask);
991 -                   perm = old_mask ^ 0666;
992 -               }
993 -               else perm = fd->perm;
994 -
995 -               amode = 0;
996 -               if (fd->access_mode & ADIO_CREATE)
997 -                   amode = amode | O_CREAT;
998 -               if (fd->access_mode & ADIO_RDONLY)
999 -                   amode = amode | O_RDONLY;
1000 -               if (fd->access_mode & ADIO_WRONLY)
1001 -                   amode = amode | O_WRONLY;
1002 -               if (fd->access_mode & ADIO_RDWR)
1003 -                   amode = amode | O_RDWR;
1004 -               if (fd->access_mode & ADIO_EXCL)
1005 -                   amode = amode | O_EXCL;
1006 -
1007 -               /* we need to create file so ensure this is set */
1008 -               amode = amode | O_LOV_DELAY_CREATE | O_CREAT;
1009 -
1010 -               fd_sys = open(fd->filename, amode, perm);
1011 -               if (fd_sys == -1) { 
1012 -                   if (errno != EEXIST) 
1013 -                       fprintf(stderr, 
1014 -                               "Failure to open file %s %d %d\n",strerror(errno), amode, perm);
1015 -               } else {
1016 -                   lum.lmm_magic = LOV_USER_MAGIC;
1017 -                   lum.lmm_pattern = 0;
1018 -                   lum.lmm_stripe_size = str_unit;
1019 -                   lum.lmm_stripe_count = str_factor;
1020 -                   lum.lmm_stripe_offset = start_iodev;
1021 -
1022 -                   err = ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum);
1023 -                   if (err == -1 && errno != EEXIST) { 
1024 -                       fprintf(stderr, "Failure to set stripe info %s \n", strerror(errno));
1025 -                   }
1026 -                   close(fd_sys);
1027 -              }
1028 -           } /* End of striping parameters validation */
1029 +            /*  stripe size */
1030 +           MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
1031 +                        value, &flag);
1032 +           if (flag && (str_unit = atoi(value))) {
1033 +               tmp_val = str_unit;
1034 +               MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
1035 +               if (tmp_val != str_unit) {
1036 +                   MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
1037 +                                                      "striping_unit",
1038 +                                                      error_code);
1039 +                   return;
1040 +               }
1041 +               MPI_Info_set(fd->info, "striping_unit", value);
1042 +           }
1043 +            /* stripe count */
1044 +           MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
1045 +                        value, &flag);
1046 +           if (flag && (str_factor = atoi(value))) {
1047 +               tmp_val = str_factor;
1048 +               MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
1049 +               if (tmp_val != str_factor) {
1050 +                   MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
1051 +                                                      "striping_factor",
1052 +                                                      error_code);
1053 +                   return;
1054 +               }
1055 +               MPI_Info_set(fd->info, "striping_factor", value);
1056 +           }
1057 +            /* stripe offset */
1058 +            MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL,
1059 +                        value, &flag);
1060 +           if (flag && ((start_iodev = atoi(value)) >= 0)) {
1061 +               tmp_val = start_iodev;
1062 +               MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
1063 +               if (tmp_val != start_iodev) {
1064 +                   MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
1065 +                                                      "start_iodevice",
1066 +                                                      error_code);
1067 +                   return;
1068 +               }
1069 +               MPI_Info_set(fd->info, "start_iodevice", value);
1070 +           }
1071 +            /* CO */
1072 +           MPI_Info_get(users_info, "CO", MPI_MAX_INFO_VAL, value,
1073 +                         &flag);
1074 +           if (flag && (int_val = atoi(value)) > 0) {
1075 +                tmp_val = int_val;
1076 +               MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
1077 +               if (tmp_val != int_val) {
1078 +                   MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
1079 +                                                      "CO",
1080 +                                                      error_code);
1081 +                   return;
1082 +               }
1083 +               MPI_Info_set(fd->info, "CO", value);
1084 +           }
1085 +            /* big_req_size */
1086 +           MPI_Info_get(users_info, "big_req_size", MPI_MAX_INFO_VAL, value,
1087 +                         &flag);
1088 +           if (flag && (int_val = atoi(value)) > 0) {
1089 +                tmp_val = int_val;
1090 +               MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
1091 +               if (tmp_val != int_val) {
1092 +                   MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
1093 +                                                      "big_req_size",
1094 +                                                      error_code);
1095 +                   return;
1096 +               }
1097 +               MPI_Info_set(fd->info, "big_req_size", value);
1098 +           }
1099 +            /* hint controlling data sieving during collective I/O */
1100 +           MPI_Info_get(users_info, "ds_in_coll", MPI_MAX_INFO_VAL,
1101 +                        value, &flag);
1102 +           if (flag && (!strcmp(value, "enable") ||
1103 +                         !strcmp(value, "ENABLE"))) {
1104 +                tmp_val = int_val = 1;
1105 +               MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
1106 +               if (tmp_val != int_val) {
1107 +                   MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
1108 +                                                      "ds_in_coll",
1109 +                                                      error_code);
1110 +                   return;
1111 +               }
1112 +               MPI_Info_set(fd->info, "ds_in_coll", "enable");
1113 +           }
1114 +            /* same io size */
1115 +           MPI_Info_get(users_info, "same_io_size", MPI_MAX_INFO_VAL,
1116 +                        value, &flag);
1117 +           if (flag && (!strcmp(value, "yes") ||
1118 +                         !strcmp(value, "YES"))) {
1119 +                tmp_val = int_val = 1;
1120 +               MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
1121 +               if (tmp_val != int_val) {
1122 +                   MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
1123 +                                                      "same_io_size",
1124 +                                                      error_code);
1125 +                   return;
1126 +               }
1127 +               MPI_Info_set(fd->info, "same_io_size", "yes");
1128 +           }
1129 +            /* contiguous data */
1130 +           MPI_Info_get(users_info, "contiguous_data", MPI_MAX_INFO_VAL,
1131 +                        value, &flag);
1132 +           if (flag && (!strcmp(value, "yes") ||
1133 +                         !strcmp(value, "YES"))) {
1134 +                tmp_val = int_val = 1;
1135 +               MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
1136 +               if (tmp_val != int_val) {
1137 +                   MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
1138 +                                                      "contiguous_data",
1139 +                                                      error_code);
1140 +                   return;
1141 +               }
1142 +               MPI_Info_set(fd->info, "contiguous_data", "yes");
1143 +           }
1144 +           ADIOI_Free(value);
1145         }
1146 -       
1147 -       MPI_Barrier(fd->comm);
1148 -       /* set the values for collective I/O and data sieving parameters */
1149 -       ADIOI_GEN_SetInfo(fd, users_info, error_code);
1150 -    } else {
1151 -       /* The file has been opened previously and fd->fd_sys is a valid
1152 -           file descriptor. cannot set striping parameters now. */
1153 -       
1154 -       /* set the values for collective I/O and data sieving parameters */
1155 -       ADIOI_GEN_SetInfo(fd, users_info, error_code);
1156      }
1157
1158 +    /* set the values for collective I/O and data sieving parameters */
1159 +    ADIOI_GEN_SetInfo(fd, users_info, error_code);
1160 +
1161      if (ADIOI_Direct_read) fd->direct_read = 1;
1162      if (ADIOI_Direct_write) fd->direct_write = 1;
1163  
1164 -    ADIOI_Free(value);
1165 -
1166      *error_code = MPI_SUCCESS;
1167  }
1168 diff -ruN ad_lustre_orig/ad_lustre_open.c ad_lustre/ad_lustre_open.c
1169 --- ad_lustre_orig/ad_lustre_open.c     2008-09-17 14:36:57.000000000 +0800
1170 +++ ad_lustre/ad_lustre_open.c  2008-09-17 18:55:50.000000000 +0800
1171 @@ -1,18 +1,21 @@
1172  /* -*- Mode: C; c-basic-offset:4 ; -*- */
1173 -/* 
1174 - *   Copyright (C) 1997 University of Chicago. 
1175 +/*
1176 + *   Copyright (C) 1997 University of Chicago.
1177   *   See COPYRIGHT notice in top-level directory.
1178   *
1179   *   Copyright (C) 2007 Oak Ridge National Laboratory
1180 + *
1181 + *   Copyright (C) 2008 Sun Microsystems, Lustre group
1182   */
1183  
1184  #include "ad_lustre.h"
1185  
1186  void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code)
1187  {
1188 -    int perm, old_mask, amode, amode_direct;
1189 +    int perm, old_mask, amode = 0, amode_direct = 0, flag = 0, err, myrank;
1190 +    int stripe_size = 0, stripe_count = 0, stripe_offset = -1;
1191      struct lov_user_md lum = { 0 };
1192 -    char *value;
1193 +    char *value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
1194  
1195  #if defined(MPICH2) || !defined(PRINT_ERR_MSG)
1196      static char myname[] = "ADIOI_LUSTRE_OPEN";
1197 @@ -22,12 +25,57 @@
1198         old_mask = umask(022);
1199         umask(old_mask);
1200         perm = old_mask ^ 0666;
1201 -    }
1202 -    else perm = fd->perm;
1203 +    } else
1204 +       perm = fd->perm;
1205  
1206 -    amode = 0;
1207 -    if (fd->access_mode & ADIO_CREATE)
1208 +    if (fd->access_mode & ADIO_CREATE) {
1209         amode = amode | O_CREAT;
1210 +        /* Striping info: values stored by SetInfo() are passed to Lustre
1211 +         * through lum; otherwise the info keys are filled in from lum later.
1212 +         */
1213 +        MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value,
1214 +                    &flag);
1215 +        if (flag)
1216 +           stripe_size = atoi(value);
1217 +
1218 +        MPI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, value,
1219 +                    &flag);
1220 +        if (flag)
1221 +           stripe_count = atoi(value);
1222 +
1223 +        MPI_Info_get(fd->info, "start_iodevice", MPI_MAX_INFO_VAL, value,
1224 +                    &flag);
1225 +        if (flag)
1226 +           stripe_offset = atoi(value);
1227 +
1228 +        /* if user has specified striping info,
1229 +         * process 0 will try to check and set it.
1230 +         */
1231 +        if ((stripe_size > 0) || (stripe_count > 0) || (stripe_offset >= 0)) {
1232 +           MPI_Comm_rank(fd->comm, &myrank);
1233 +           if (myrank == 0) {
1234 +               int fd_sys = open(fd->filename, amode, perm);
1235 +               if (fd_sys == -1) {
1236 +                   if (errno != EEXIST)
1237 +                       FPRINTF(stderr, "Failure to open file %s %d %d\n",
1238 +                               strerror(errno), amode, perm);
1239 +               } else {
1240 +                   lum.lmm_magic = LOV_USER_MAGIC;
1241 +                   lum.lmm_pattern = 1;
1242 +                   lum.lmm_stripe_size = stripe_size;
1243 +                   lum.lmm_stripe_count = stripe_count;
1244 +                   lum.lmm_stripe_offset = stripe_offset;
1245 +
1246 +                   if (ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum))
1247 +                       FPRINTF(stderr,
1248 +                               "Failure to set striping info to Lustre!\n");
1249 +                   close(fd_sys);
1250 +               }
1251 +           }
1252 +           MPI_Barrier(fd->comm);
1253 +        }
1254 +    }
1255 +
1256      if (fd->access_mode & ADIO_RDONLY)
1257         amode = amode | O_RDONLY;
1258      if (fd->access_mode & ADIO_WRONLY)
1259 @@ -42,32 +90,36 @@
1260      fd->fd_sys = open(fd->filename, amode|O_CREAT, perm);
1261  
1262      if (fd->fd_sys != -1) {
1263 -        int err;
1264 -
1265 -        value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
1266 -
1267          /* get file striping information and set it in info */
1268 -        lum.lmm_magic = LOV_USER_MAGIC;
1269 -        err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum);
1270 -
1271 -        if (!err) {
1272 -            sprintf(value, "%d", lum.lmm_stripe_size);
1273 -            MPI_Info_set(fd->info, "striping_unit", value);
1274 -
1275 -            sprintf(value, "%d", lum.lmm_stripe_count);
1276 -            MPI_Info_set(fd->info, "striping_factor", value);
1277 -
1278 -            sprintf(value, "%d", lum.lmm_stripe_offset);
1279 -            MPI_Info_set(fd->info, "start_iodevice", value);
1280 -        }
1281 -        ADIOI_Free(value);
1282 +       lum.lmm_magic = LOV_USER_MAGIC;
1283 +       err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum);
1284  
1285 +       if (!err) {
1286 +           if (lum.lmm_stripe_size && lum.lmm_stripe_count &&
1287 +                (lum.lmm_stripe_offset >= 0)) {
1288 +               sprintf(value, "%d", lum.lmm_stripe_size);
1289 +               MPI_Info_set(fd->info, "striping_unit", value);
1290 +
1291 +               sprintf(value, "%d", lum.lmm_stripe_count);
1292 +               MPI_Info_set(fd->info, "striping_factor", value);
1293 +
1294 +               sprintf(value, "%d", lum.lmm_stripe_offset);
1295 +               MPI_Info_set(fd->info, "start_iodevice", value);
1296 +           } else {
1297 +               FPRINTF(stderr, "Striping info is invalid!\n");
1298 +               ADIOI_Free(value);
1299 +               MPI_Abort(MPI_COMM_WORLD, 1);
1300 +           }
1301 +       } else {
1302 +           FPRINTF(stderr, "Failed to get striping info from Lustre!\n");
1303 +            ADIOI_Free(value);
1304 +           MPI_Abort(MPI_COMM_WORLD, 1);
1305 +       }
1306          if (fd->access_mode & ADIO_APPEND)
1307              fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
1308 -    } 
1309 -
1310 +    }
1311      if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
1312 -       fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
1313 +        fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
1314  
1315      fd->fd_direct = -1;
1316      if (fd->direct_write || fd->direct_read) {
1317 @@ -81,20 +133,22 @@
1318      }
1319  
1320      /* --BEGIN ERROR HANDLING-- */
1321 -    if (fd->fd_sys == -1 || ((fd->fd_direct == -1) && 
1322 -               (fd->direct_write || fd->direct_read))) {
1323 +    if (fd->fd_sys == -1 || ((fd->fd_direct == -1) &&
1324 +       (fd->direct_write || fd->direct_read))) {
1325         if (errno == ENAMETOOLONG)
1326             *error_code = MPIO_Err_create_code(MPI_SUCCESS,
1327 -                                              MPIR_ERR_RECOVERABLE, myname,
1328 -                                              __LINE__, MPI_ERR_BAD_FILE,
1329 +                                              MPIR_ERR_RECOVERABLE,
1330 +                                              myname, __LINE__,
1331 +                                              MPI_ERR_BAD_FILE,
1332                                                "**filenamelong",
1333                                                "**filenamelong %s %d",
1334                                                fd->filename,
1335                                                strlen(fd->filename));
1336         else if (errno == ENOENT)
1337             *error_code = MPIO_Err_create_code(MPI_SUCCESS,
1338 -                                              MPIR_ERR_RECOVERABLE, myname,
1339 -                                              __LINE__, MPI_ERR_NO_SUCH_FILE,
1340 +                                              MPIR_ERR_RECOVERABLE,
1341 +                                              myname, __LINE__,
1342 +                                              MPI_ERR_NO_SUCH_FILE,
1343                                                "**filenoexist",
1344                                                "**filenoexist %s",
1345                                                fd->filename);
1346 @@ -108,27 +162,30 @@
1347                                                fd->filename);
1348         else if (errno == EACCES) {
1349             *error_code = MPIO_Err_create_code(MPI_SUCCESS,
1350 -                                              MPIR_ERR_RECOVERABLE, myname,
1351 -                                              __LINE__, MPI_ERR_ACCESS,
1352 +                                              MPIR_ERR_RECOVERABLE,
1353 +                                              myname, __LINE__,
1354 +                                              MPI_ERR_ACCESS,
1355                                                "**fileaccess",
1356 -                                              "**fileaccess %s", 
1357 -                                              fd->filename );
1358 -       }
1359 -       else if (errno == EROFS) {
1360 +                                              "**fileaccess %s",
1361 +                                              fd->filename);
1362 +       } else if (errno == EROFS) {
1363             /* Read only file or file system and write access requested */
1364             *error_code = MPIO_Err_create_code(MPI_SUCCESS,
1365 -                                              MPIR_ERR_RECOVERABLE, myname,
1366 -                                              __LINE__, MPI_ERR_READ_ONLY,
1367 -                                              "**ioneedrd", 0 );
1368 -       }
1369 -       else {
1370 +                                              MPIR_ERR_RECOVERABLE,
1371 +                                              myname, __LINE__,
1372 +                                              MPI_ERR_READ_ONLY,
1373 +                                              "**ioneedrd", 0);
1374 +       } else {
1375             *error_code = MPIO_Err_create_code(MPI_SUCCESS,
1376 -                                              MPIR_ERR_RECOVERABLE, myname,
1377 -                                              __LINE__, MPI_ERR_IO, "**io",
1378 +                                              MPIR_ERR_RECOVERABLE,
1379 +                                              myname, __LINE__,
1380 +                                              MPI_ERR_IO, "**io",
1381                                                "**io %s", strerror(errno));
1382         }
1383 -    }
1384 +    } else {
1385      /* --END ERROR HANDLING-- */
1386 -    else *error_code = MPI_SUCCESS;
1387 +        *error_code = MPI_SUCCESS;
1388 +    }
1389  
1390 +    ADIOI_Free(value);
1391  }
1392 diff -ruN ad_lustre_orig/ad_lustre_rwcontig.c ad_lustre/ad_lustre_rwcontig.c
1393 --- ad_lustre_orig/ad_lustre_rwcontig.c 2008-09-17 14:36:57.000000000 +0800
1394 +++ ad_lustre/ad_lustre_rwcontig.c      2008-09-17 18:52:01.000000000 +0800
1395 @@ -1,9 +1,11 @@
1396  /* -*- Mode: C; c-basic-offset:4 ; -*- */
1397 -/* 
1398 - *   Copyright (C) 1997 University of Chicago. 
1399 +/*
1400 + *   Copyright (C) 1997 University of Chicago.
1401   *   See COPYRIGHT notice in top-level directory.
1402   *
1403   *   Copyright (C) 2007 Oak Ridge National Laboratory
1404 + *
1405 + *   Copyright (C) 2008 Sun Microsystems, Lustre group
1406   */
1407  
1408  #define _XOPEN_SOURCE 600
1409 @@ -138,7 +140,7 @@
1410         
1411         if (io_mode)
1412             err = write(fd->fd_sys, buf, len);
1413 -       else 
1414 +       else
1415             err = read(fd->fd_sys, buf, len);
1416      } else {
1417         err = ADIOI_LUSTRE_Directio(fd, buf, len, offset, io_mode);
1418 diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
1419 --- ad_lustre_orig/ad_lustre_wrcoll.c   1970-01-01 08:00:00.000000000 +0800
1420 +++ ad_lustre/ad_lustre_wrcoll.c        2008-09-17 18:20:35.000000000 +0800
1421 @@ -0,0 +1,973 @@
1422 +/* -*- Mode: C; c-basic-offset:4 ; -*- */
1423 +/*
1424 + *   Copyright (C) 1997 University of Chicago.
1425 + *   See COPYRIGHT notice in top-level directory.
1426 + *
1427 + *   Copyright (C) 2007 Oak Ridge National Laboratory
1428 + *
1429 + *   Copyright (C) 2008 Sun Microsystems, Lustre group
1430 + */
1431 +
1432 +#include "ad_lustre.h"
1433 +#include "adio_extern.h"
1434 +
1435 +/* prototypes of functions used for collective writes only. */
1436 +static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf,
1437 +                                       MPI_Datatype datatype, int nprocs,
1438 +                                       int myrank,
1439 +                                       ADIOI_Access *others_req,
1440 +                                       ADIOI_Access *my_req,
1441 +                                       ADIO_Offset *offset_list,
1442 +                                       int *len_list,
1443 +                                       int contig_access_count,
1444 +                                       int * striping_info,
1445 +                                       int *buf_idx, int *error_code);
1446 +static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf,
1447 +                                         ADIOI_Flatlist_node * flat_buf,
1448 +                                         char **send_buf,
1449 +                                         ADIO_Offset * offset_list,
1450 +                                         int *len_list, int *send_size,
1451 +                                         MPI_Request * requests,
1452 +                                         int *sent_to_proc, int nprocs,
1453 +                                         int myrank, int contig_access_count,
1454 +                                         int * striping_info,
1455 +                                         int *send_buf_idx,
1456 +                                          int *curr_to_proc,
1457 +                                         int *done_to_proc, int iter,
1458 +                                         MPI_Aint buftype_extent);
1459 +static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
1460 +                                        char *write_buf,
1461 +                                        ADIOI_Flatlist_node * flat_buf,
1462 +                                        ADIO_Offset * offset_list,
1463 +                                        int *len_list, int *send_size,
1464 +                                        int *recv_size, ADIO_Offset off,
1465 +                                        int size, int *count,
1466 +                                        int *start_pos, int *partial_recv,
1467 +                                        int *sent_to_proc, int nprocs,
1468 +                                        int myrank, int buftype_is_contig,
1469 +                                        int contig_access_count,
1470 +                                        int * striping_info,
1471 +                                        ADIOI_Access * others_req,
1472 +                                        int *send_buf_idx,
1473 +                                        int *curr_to_proc,
1474 +                                        int *done_to_proc, int *hole,
1475 +                                        int iter, MPI_Aint buftype_extent,
1476 +                                        int *buf_idx, int *error_code);
1477 +static void ADIOI_Heap_merge(ADIOI_Access * others_req, int *count,
1478 +                            ADIO_Offset * srt_off, int *srt_len,
1479 +                            int *start_pos, int nprocs, int nprocs_recv,
1480 +                            int total_elements);
1481 +
1482 +void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count,
1483 +                                  MPI_Datatype datatype,
1484 +                                  int file_ptr_type, ADIO_Offset offset,
1485 +                                  ADIO_Status * status, int *error_code)
1486 +{
1487 +    ADIOI_Access *my_req;
1488 +    /* array of nprocs access structures, one for each other process that
1489 +       holds part of this process's request */
1490 +
1491 +    ADIOI_Access *others_req;
1492 +    /* array of nprocs access structures, one for each other process
1493 +       whose request is written by this process. */
1494 +
1495 +    int i, filetype_is_contig, nprocs, nprocs_for_coll, myrank,        do_collect = 0;
1496 +    int contig_access_count = 0, buftype_is_contig;
1497 +    int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs;
1498 +    ADIO_Offset orig_fp, start_offset, end_offset, off, *offset_list = NULL;
1499 +    int *buf_idx = NULL, *len_list = NULL, *striping_info = NULL;
1500 +    int old_error, tmp_error;
1501 +
1502 +    MPI_Comm_size(fd->comm, &nprocs);
1503 +    MPI_Comm_rank(fd->comm, &myrank);
1504 +
1505 +    nprocs_for_coll = fd->hints->cb_nodes;
1506 +    orig_fp = fd->fp_ind;
1507 +
1508 +    /* IO pattern identification if cb_write isn't disabled */
1509 +    if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
1510 +       /* For this process's request, calculate the list of offsets and
1511 +          lengths in the file and determine the start and end offsets. */
1512 +       ADIOI_LUSTRE_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
1513 +                                    &offset_list, &len_list, &start_offset,
1514 +                                    &end_offset, &contig_access_count);
1515 +       /* Get striping information */
1516 +       ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1, nprocs);
1517 +       /* check if the access pattern can benefit from collective write */
1518 +       do_collect = ADIOI_LUSTRE_Docollect(fd, contig_access_count,
1519 +                                           len_list, nprocs);
1520 +    }
1521 +    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
1522 +
1523 +    /* Decide if collective I/O should be done */
1524 +    if ((!do_collect && fd->hints->cb_write == ADIOI_HINT_AUTO) ||
1525 +        fd->hints->cb_write == ADIOI_HINT_DISABLE) {
1526 +
1527 +       int filerange_is_contig = 0;
1528 +
1529 +       /* use independent accesses */
1530 +       if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
1531 +           ADIOI_Free(offset_list);
1532 +           ADIOI_Free(len_list);
1533 +       }
1534 +
1535 +       fd->fp_ind = orig_fp;
1536 +       ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
1537 +       if (buftype_is_contig && filetype_is_contig) {
1538 +           if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
1539 +               off = fd->disp + (fd->etype_size) * offset;
1540 +               ADIO_WriteContig(fd, buf, count, datatype,
1541 +                                ADIO_EXPLICIT_OFFSET,
1542 +                                off, status, error_code);
1543 +           } else
1544 +               ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
1545 +                                0, status, error_code);
1546 +       } else {
1547 +           ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type,
1548 +                             offset, status, error_code);
1549 +       }
1550 +       return;
1551 +    }
1552 +
1553 +    /* calculate what portions of the access requests of this process are
1554 +     * located in which process
1555 +     */
1556 +    ADIOI_LUSTRE_Calc_my_req(fd, offset_list, len_list, contig_access_count,
1557 +                             striping_info, nprocs, &count_my_req_procs,
1558 +                             &count_my_req_per_proc, &my_req, &buf_idx);
1559 +    /* calculate what process's requests will be written by this process */
1560 +    ADIOI_LUSTRE_Calc_others_req(fd, count_my_req_procs,
1561 +                                 count_my_req_per_proc,
1562 +                                my_req, nprocs, myrank,
1563 +                                 start_offset, end_offset, striping_info,
1564 +                                 &count_others_req_procs, &others_req);
1565 +    ADIOI_Free(count_my_req_per_proc);
1566 +
1567 +    /* exchange data and write in sizes of no more than stripe_size. */
1568 +    ADIOI_LUSTRE_Exch_and_write(fd, buf, datatype, nprocs, myrank,
1569 +                                others_req, my_req,
1570 +                                offset_list, len_list, contig_access_count,
1571 +                               striping_info, buf_idx, error_code);
1572 +
1573 +    old_error = *error_code;
1574 +    if (*error_code != MPI_SUCCESS)
1575 +       *error_code = MPI_ERR_IO;
1576 +
1577 +    /* optimization: if only one process performing i/o, we can perform
1578 +     * a less-expensive Bcast  */
1579 +#ifdef ADIOI_MPE_LOGGING
1580 +    MPE_Log_event(ADIOI_MPE_postwrite_a, 0, NULL);
1581 +#endif
1582 +    if (fd->hints->cb_nodes == 1)
1583 +       MPI_Bcast(error_code, 1, MPI_INT,
1584 +                 fd->hints->ranklist[0], fd->comm);
1585 +    else {
1586 +       tmp_error = *error_code;
1587 +       MPI_Allreduce(&tmp_error, error_code, 1, MPI_INT,
1588 +                     MPI_MAX, fd->comm);
1589 +    }
1590 +#ifdef ADIOI_MPE_LOGGING
1591 +    MPE_Log_event(ADIOI_MPE_postwrite_b, 0, NULL);
1592 +#endif
1593 +
1594 +    if ((old_error != MPI_SUCCESS) && (old_error != MPI_ERR_IO))
1595 +       *error_code = old_error;
1596 +
1597 +
1598 +    if (!buftype_is_contig)
1599 +       ADIOI_Delete_flattened(datatype);
1600 +
1601 +    /* free all memory allocated for collective I/O */
1602 +    /* free others_req */
1603 +    for (i = 0; i < nprocs; i++) {
1604 +       if (others_req[i].count) {
1605 +           ADIOI_Free(others_req[i].offsets);
1606 +           ADIOI_Free(others_req[i].lens);
1607 +           ADIOI_Free(others_req[i].mem_ptrs);
1608 +       }
1609 +    }
1610 +    ADIOI_Free(others_req);
1611 +    /* free my_req here */
1612 +    for (i = 0; i < nprocs; i++) {
1613 +       if (my_req[i].count) {
1614 +           ADIOI_Free(my_req[i].offsets);
1615 +           ADIOI_Free(my_req[i].lens);
1616 +       }
1617 +    }
1618 +    ADIOI_Free(my_req);
1619 +    ADIOI_Free(buf_idx);
1620 +    ADIOI_Free(offset_list);
1621 +    ADIOI_Free(len_list);
1622 +    ADIOI_Free(striping_info);
1623 +
1624 +#ifdef HAVE_STATUS_SET_BYTES
1625 +    if (status) {
1626 +       int bufsize, size;
1627 +       /* Don't set status if it isn't needed */
1628 +       MPI_Type_size(datatype, &size);
1629 +       bufsize = size * count;
1630 +       MPIR_Status_set_bytes(status, datatype, bufsize);
1631 +    }
1632 +    /* This is a temporary way of filling in status. The right way is to
1633 +     * keep track of how much data was actually written during collective I/O.
1634 +     */
1635 +#endif
1636 +
1637 +    fd->fp_sys_posn = -1;      /* set it to null. */
1638 +}
1639 +
1640 +static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf,
1641 +                                       MPI_Datatype datatype, int nprocs,
1642 +                                       int myrank, ADIOI_Access *others_req,
1643 +                                        ADIOI_Access *my_req,
1644 +                                       ADIO_Offset *offset_list,
1645 +                                        int *len_list, int contig_access_count,
1646 +                                       int *striping_info, int *buf_idx,
1647 +                                        int *error_code)
1648 +{
1649 +    int hole, i, j, m, flag, ntimes = 1 , max_ntimes, buftype_is_contig;
1650 +    ADIO_Offset st_loc = -1, end_loc = -1, min_st_loc, max_end_loc;
1651 +    ADIO_Offset off, req_off, send_off, iter_st_off, *off_list;
1652 +    ADIO_Offset max_size, step_size = 0;
1653 +    int real_size, req_len, send_len;
1654 +    int *recv_curr_offlen_ptr, *recv_count, *recv_size;
1655 +    int *send_curr_offlen_ptr, *send_size;
1656 +    int *partial_recv, *sent_to_proc, *recv_start_pos;
1657 +    int *send_buf_idx, *curr_to_proc, *done_to_proc;
1658 +    char *write_buf = NULL, *value;
1659 +    MPI_Status status;
1660 +    ADIOI_Flatlist_node *flat_buf = NULL;
1661 +    MPI_Aint buftype_extent;
1662 +    int stripe_size = striping_info[0], lflag, data_sieving = 0;
1663 +    int stripe_count = striping_info[1], CO = striping_info[2];
1664 +    /* IO step size in each communication */
1665 +    static char myname[] = "ADIOI_EXCH_AND_WRITE";
1666 +
1667 +    *error_code = MPI_SUCCESS; /* changed below if error */
1668 +
1669 +    /* calculate the number of writes of stripe size
1670 +     * to be done by each process and the max among all processes.
1671 +     * That gives the no. of communication phases as well.
1672 +     */
1673 +
1674 +    for (i = 0; i < nprocs; i++) {
1675 +       if (others_req[i].count) {
1676 +           st_loc = others_req[i].offsets[0];
1677 +           end_loc = others_req[i].offsets[0];
1678 +           break;
1679 +       }
1680 +    }
1681 +
1682 +    for (i = 0; i < nprocs; i++) {
1683 +       for (j = 0; j < others_req[i].count; j++) {
1684 +           st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]);
1685 +           end_loc = ADIOI_MAX(end_loc, (others_req[i].offsets[j] +
1686 +                                          others_req[i].lens[j] - 1));
1687 +       }
1688 +    }
1689 +    /* this process does no writing. */
1690 +    if ((st_loc == -1) && (end_loc == -1))
1691 +       ntimes = 0;
1692 +    MPI_Allreduce(&end_loc, &max_end_loc, 1, MPI_LONG_LONG_INT, MPI_MAX, fd->comm);
1693 +    /* avoid min_st_loc being -1 when this process has no requests */
1694 +    if (st_loc == -1)
1695 +        st_loc = max_end_loc;
1696 +    MPI_Allreduce(&st_loc, &min_st_loc, 1, MPI_LONG_LONG_INT, MPI_MIN, fd->comm);
1697 +    /* align downward */
1698 +    min_st_loc -= min_st_loc % (ADIO_Offset)stripe_size;
1699 +    /* when nprocs < stripe_count, there will be trouble, because some client
1700 +     * would access more than one OST in one whole communication.
1701 +     */
1702 +    step_size = (ADIO_Offset)ADIOI_MIN(nprocs, stripe_count * CO) * stripe_size;
1703 +    max_ntimes = (int)((max_end_loc - min_st_loc) / step_size + 1);
1704 +    if (ntimes)
1705 +       write_buf = (char *) ADIOI_Malloc(stripe_size);
1706 +
1707 +    /* calculate the start offset for each iteration */
1708 +    off_list = (ADIO_Offset *) ADIOI_Malloc(max_ntimes * sizeof(ADIO_Offset));
1709 +    for (m = 0; m < max_ntimes; m ++)
1710 +        off_list[m] = max_end_loc;
1711 +    for (i = 0; i < nprocs; i++) {
1712 +        for (j = 0; j < others_req[i].count; j ++) {
1713 +            req_off = others_req[i].offsets[j];
1714 +            //m = (req_off - min_st_loc) / (stripe_size * stripe_count * CO);
1715 +            m = (int)((req_off - min_st_loc) / step_size);
1716 +            off_list[m] = ADIOI_MIN(off_list[m], req_off);
1717 +        }
1718 +    }
1719 +
1720 +    recv_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
1721 +    send_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
1722 +    /* their use is explained below. calloc initializes to 0. */
1723 +
1724 +    recv_count = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1725 +    /* to store count of how many off-len pairs per proc are satisfied
1726 +       in an iteration. */
1727 +
1728 +    send_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1729 +    /* total size of data to be sent to each proc. in an iteration.
1730 +       Of size nprocs so that I can use MPI_Alltoall later. */
1731 +
1732 +    recv_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1733 +    /* total size of data to be recd. from each proc. in an iteration. */
1734 +
1735 +    sent_to_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int));
1736 +    /* amount of data sent to each proc so far. Used in
1737 +       ADIOI_LUSTRE_Fill_send_buffer. initialized to 0 here. */
1738 +
1739 +    send_buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1740 +    curr_to_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1741 +    done_to_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1742 +    /* Above three are used in ADIOI_LUSTRE_Fill_send_buffer */
1743 +
1744 +    recv_start_pos = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1745 +    /* used to store the starting value of recv_curr_offlen_ptr[i] in
1746 +       this iteration */
1747 +
1748 +    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
1749 +    if (!buftype_is_contig) {
1750 +       ADIOI_Flatten_datatype(datatype);
1751 +       flat_buf = ADIOI_Flatlist;
1752 +       while (flat_buf->type != datatype)
1753 +           flat_buf = flat_buf->next;
1754 +    }
1755 +    MPI_Type_extent(datatype, &buftype_extent);
1756 +
1757 +    iter_st_off = min_st_loc;
1758 +
1759 +    /* Although we have recognized the data according to OST index,
1760 +     * a read-modify-write will be done if there is a hole between the data.
1761 +     * For example: if blocksize=60, transfersize=30 and stripe_size=100,
1762 +     * then process0 will collect data [0, 30] and [60, 90] then write. There
1763 +     * is a hole [30, 60], which will cause a read-modify-write in [0, 90].
1764 +     * It will degrade collective performance.
1765 +     * So we disable data sieving by default unless the hint "ds_in_coll"
1766 +     * is set to "enable".
1767 +     */
1768 +    /* check the hint for data sieving */
1769 +    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
1770 +    MPI_Info_get(fd->info, "ds_in_coll", MPI_MAX_INFO_VAL, value, &lflag);
1771 +    if (lflag && !strcmp(value, "enable"))
1772 +        data_sieving = 1;
1773 +    ADIOI_Free(value);
1774 +
1775 +    for (m = 0; m < max_ntimes; m++) {
1776 +       /* go through all others_req and my_req to check which will be received
1777 +         * and sent in this iteration.
1778 +         */
1779 +
1780 +       /* Note that MPI guarantees that displacements in filetypes are in
1781 +          monotonically nondecreasing order and that, for writes, the
1782 +          filetypes cannot specify overlapping regions in the file. This
1783 +          simplifies implementation a bit compared to reads. */
1784 +
1785 +       /*
1786 +           off         = start offset in the file for the data to be written in
1787 +                         this iteration
1788 +           iter_st_off = start offset of this iteration
1789 +           real_size   = size of data written (bytes) corresponding to off
1790 +           max_size    = possible maximum size of data written in this iteration
1791 +           req_off     = offset in the file for a particular contiguous request minus
1792 +                         what was satisfied in previous iteration
1793 +           send_off    = offset the request needed by other processes in this iteration
1794 +           req_len     = size corresponding to req_off
1795 +           send_len    = size corresponding to send_off
1796 +         */
1797 +
1798 +       /* first calculate what should be communicated */
1799 +       for (i = 0; i < nprocs; i++)
1800 +           recv_count[i] = recv_size[i] = send_size[i] = 0;
1801 +
1802 +        off = off_list[m];
1803 +        max_size = ADIOI_MIN(step_size, max_end_loc - iter_st_off + 1);
1804 +        real_size = (int) ADIOI_MIN((off / stripe_size + 1) * stripe_size - off,
1805 +                                    end_loc - off + 1);
1806 +
1807 +       for (i = 0; i < nprocs; i++) {
1808 +            if (my_req[i].count) {
1809 +                for (j = send_curr_offlen_ptr[i]; j < my_req[i].count; j++) {
1810 +                    send_off = my_req[i].offsets[j];
1811 +                    send_len = my_req[i].lens[j];
1812 +                    if (send_off < iter_st_off + max_size) {
1813 +                        send_size[i] += send_len;
1814 +                    } else {
1815 +                        break;
1816 +                    }
1817 +                }
1818 +                send_curr_offlen_ptr[i] = j;
1819 +            }
1820 +           if (others_req[i].count) {
1821 +               recv_start_pos[i] = recv_curr_offlen_ptr[i];
1822 +               for (j = recv_curr_offlen_ptr[i]; j < others_req[i].count; j++) {
1823 +                    req_off = others_req[i].offsets[j];
1824 +                    req_len = others_req[i].lens[j];
1825 +                   if (req_off < iter_st_off + max_size) {
1826 +                       recv_count[i]++;
1827 +                       MPI_Address(write_buf + req_off - off,
1828 +                                   &(others_req[i].mem_ptrs[j]));
1829 +                        recv_size[i] += req_len;
1830 +                   } else {
1831 +                       break;
1832 +                    }
1833 +               }
1834 +               recv_curr_offlen_ptr[i] = j;
1835 +           }
1836 +       }
1837 +        /* use hole to pass data_sieving flag into W_Exchange_data */
1838 +        hole = data_sieving;
1839 +       ADIOI_LUSTRE_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
1840 +                                     len_list, send_size, recv_size, off, real_size,
1841 +                                     recv_count, recv_start_pos, partial_recv,
1842 +                                     sent_to_proc, nprocs, myrank,
1843 +                                     buftype_is_contig, contig_access_count,
1844 +                                     striping_info, others_req, send_buf_idx,
1845 +                                     curr_to_proc, done_to_proc, &hole, m,
1846 +                                     buftype_extent, buf_idx, error_code);
1847 +       if (*error_code != MPI_SUCCESS)
1848 +           return;
1849 +
1850 +       flag = 0;
1851 +       for (i = 0; i < nprocs; i++)
1852 +           if (recv_count[i]) {
1853 +               flag = 1;
1854 +               break;
1855 +           }
1856 +       if (flag) {
1857 +            /* check whether to do data sieving */
1858 +            if(data_sieving) {
1859 +               ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
1860 +                                ADIO_EXPLICIT_OFFSET, off, &status,
1861 +                                error_code);
1862 +            } else {
1863 +                /* if there is no hole, write in one time;
1864 +                 * otherwise, write data separately */
1865 +                if (!hole) {
1866 +                    ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
1867 +                                     ADIO_EXPLICIT_OFFSET, off, &status,
1868 +                                     error_code);
1869 +                } else {
1870 +                    for (i = 0; i < nprocs; i++) {
1871 +                        if (others_req[i].count) {
1872 +                            for (j = 0; j < others_req[i].count; j++) {
1873 +                                if (others_req[i].offsets[j] < off + real_size &&
1874 +                                    others_req[i].offsets[j] >= off) {
1875 +                                    ADIO_WriteContig(fd,
1876 +                                                     write_buf + others_req[i].offsets[j] - off,
1877 +                                                     others_req[i].lens[j],
1878 +                                                     MPI_BYTE, ADIO_EXPLICIT_OFFSET,
1879 +                                                     others_req[i].offsets[j], &status,
1880 +                                                     error_code);
1881 +                                }
1882 +                            }
1883 +                        }
1884 +                    }
1885 +                }
1886 +            }
1887 +           if (*error_code != MPI_SUCCESS)
1888 +               return;
1889 +       }
1890 +
1891 +        iter_st_off += max_size;
1892 +    }
1893 +    if (*error_code != MPI_SUCCESS)
1894 +       return;
1895 +
1896 +    if (ntimes)
1897 +       ADIOI_Free(write_buf);
1898 +    ADIOI_Free(recv_curr_offlen_ptr);
1899 +    ADIOI_Free(send_curr_offlen_ptr);
1900 +    ADIOI_Free(recv_count);
1901 +    ADIOI_Free(send_size);
1902 +    ADIOI_Free(recv_size);
1903 +    ADIOI_Free(sent_to_proc);
1904 +    ADIOI_Free(recv_start_pos);
1905 +    ADIOI_Free(send_buf_idx);
1906 +    ADIOI_Free(curr_to_proc);
1907 +    ADIOI_Free(done_to_proc);
1908 +    ADIOI_Free(off_list);
1909 +}
1910 +
1911 +static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
1912 +                                        char *write_buf,
1913 +                                        ADIOI_Flatlist_node * flat_buf,
1914 +                                        ADIO_Offset * offset_list,
1915 +                                        int *len_list, int *send_size,
1916 +                                        int *recv_size, ADIO_Offset off,
1917 +                                        int size, int *count,
1918 +                                        int *start_pos, int *partial_recv,
1919 +                                        int *sent_to_proc, int nprocs,
1920 +                                        int myrank, int buftype_is_contig,
1921 +                                        int contig_access_count,
1922 +                                        int * striping_info,
1923 +                                        ADIOI_Access * others_req,
1924 +                                        int *send_buf_idx,
1925 +                                        int *curr_to_proc, int *done_to_proc,
1926 +                                         int *hole, int iter,
1927 +                                         MPI_Aint buftype_extent,
1928 +                                        int *buf_idx, int *error_code)
1929 +{
1930 +    int i, j, *tmp_len, nprocs_recv, nprocs_send, err;
1931 +    char **send_buf = NULL;
1932 +    MPI_Request *requests, *send_req;
1933 +    MPI_Datatype *recv_types;
1934 +    MPI_Status *statuses, status;
1935 +    int *srt_len, sum, sum_recv;
1936 +    ADIO_Offset *srt_off;
1937 +    int data_sieving = *hole;
1938 +    static char myname[] = "ADIOI_W_EXCHANGE_DATA";
1939 +
1940 +    /* create derived datatypes for recv */
1941 +    nprocs_recv = 0;
1942 +    for (i = 0; i < nprocs; i++)
1943 +       if (recv_size[i])
1944 +           nprocs_recv++;
1945 +
1946 +    recv_types = (MPI_Datatype *) ADIOI_Malloc((nprocs_recv + 1) *
1947 +                                              sizeof(MPI_Datatype));
1948 +    /* +1 to avoid a 0-size malloc */
1949 +
1950 +    tmp_len = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1951 +    j = 0;
1952 +    for (i = 0; i < nprocs; i++) {
1953 +       if (recv_size[i]) {
1954 +           MPI_Type_hindexed(count[i],
1955 +                             &(others_req[i].lens[start_pos[i]]),
1956 +                             &(others_req[i].mem_ptrs[start_pos[i]]),
1957 +                             MPI_BYTE, recv_types + j);
1958 +           /* absolute displacements; use MPI_BOTTOM in recv */
1959 +           MPI_Type_commit(recv_types + j);
1960 +           j++;
1961 +       }
1962 +    }
1963 +
1964 +    /* To avoid a read-modify-write,
1965 +     * check if there are holes in the data to be written.
1966 +     * For this, merge the (sorted) offset lists others_req using a heap-merge.
1967 +     */
1968 +
1969 +    sum = 0;
1970 +    for (i = 0; i < nprocs; i++)
1971 +       sum += count[i];
1972 +    srt_off = (ADIO_Offset *) ADIOI_Malloc((sum + 1) * sizeof(ADIO_Offset));
1973 +    srt_len = (int *) ADIOI_Malloc((sum + 1) * sizeof(int));
1974 +    /* +1 to avoid a 0-size malloc */
1975 +
1976 +    ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos,
1977 +                    nprocs, nprocs_recv, sum);
1978 +
1979 +    ADIOI_Free(tmp_len);
1980 +
1981 +    /* check if there are any holes */
1982 +    *hole = 0;
1983 +    for (i = 0; i < sum - 1; i++) {
1984 +        if (srt_off[i] + srt_len[i] < srt_off[i + 1]) {
1985 +            *hole = 1;
1986 +           break;
1987 +       }
1988 +    }
1989 +    /* In some cases (see John Bent ROMIO REQ # 835), an odd interaction
1990 +     * between aggregation, nominally contiguous regions, and cb_buffer_size
1991 +     * should be handled with a read-modify-write (otherwise we will write out
1992 +     * more data than we receive from everyone else (inclusive)), so override
1993 +     * hole detection.
1994 +     */
1995 +    if (*hole == 0) {
1996 +        sum_recv = 0;
1997 +        for (i = 0; i < nprocs; i++)
1998 +            sum_recv += recv_size[i];
1999 +       if (size > sum_recv)
2000 +           *hole = 1;
2001 +    }
2002 +    /* data sieving on and a hole found: pre-read for read-modify-write */
2003 +    if (data_sieving && nprocs_recv && *hole) {
2004 +        ADIO_ReadContig(fd, write_buf, size, MPI_BYTE,
2005 +                        ADIO_EXPLICIT_OFFSET, off, &status, &err);
2006 +        // --BEGIN ERROR HANDLING--
2007 +        if (err != MPI_SUCCESS) {
2008 +            *error_code = MPIO_Err_create_code(err,
2009 +                                               MPIR_ERR_RECOVERABLE,
2010 +                                               myname, __LINE__,
2011 +                                               MPI_ERR_IO,
2012 +                                               "**ioRMWrdwr", 0);
2013 +            return;
2014 +        }
2015 +        // --END ERROR HANDLING--
2016 +    }
2017 +    ADIOI_Free(srt_off);
2018 +    ADIOI_Free(srt_len);
2019 +
2020 +    nprocs_send = 0;
2021 +    for (i = 0; i < nprocs; i++)
2022 +       if (send_size[i])
2023 +           nprocs_send++;
2024 +
2025 +    if (fd->atomicity) {
2026 +       /* bug fix from Wei-keng Liao and Kenin Coloma */
2027 +       requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + 1) *
2028 +                                                sizeof(MPI_Request));
2029 +       send_req = requests;
2030 +    } else {
2031 +       requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1)*
2032 +                                                sizeof(MPI_Request));
2033 +       /* +1 to avoid a 0-size malloc */
2034 +
2035 +       /* post receives */
2036 +       j = 0;
2037 +       for (i = 0; i < nprocs; i++) {
2038 +           if (recv_size[i]) {
2039 +               MPI_Irecv(MPI_BOTTOM, 1, recv_types[j], i,
2040 +                         myrank + i + 100 * iter, fd->comm, requests + j);
2041 +               j++;
2042 +           }
2043 +       }
2044 +       send_req = requests + nprocs_recv;
2045 +    }
2046 +
2047 +    /* post sends.
2048 +     * if buftype_is_contig, data can be directly sent from
2049 +     * user buf at location given by buf_idx. else use send_buf.
2050 +     */
2051 +    if (buftype_is_contig) {
2052 +       j = 0;
2053 +       for (i = 0; i < nprocs; i++)
2054 +           if (send_size[i]) {
2055 +               MPI_Isend(((char *) buf) + buf_idx[i], send_size[i],
2056 +                         MPI_BYTE, i, myrank + i + 100 * iter, fd->comm,
2057 +                         send_req + j);
2058 +               j++;
2059 +               buf_idx[i] += send_size[i];
2060 +           }
2061 +    } else if (nprocs_send) {
2062 +       /* buftype is not contig */
2063 +       send_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char *));
2064 +       for (i = 0; i < nprocs; i++)
2065 +           if (send_size[i])
2066 +               send_buf[i] = (char *) ADIOI_Malloc(send_size[i]);
2067 +
2068 +       ADIOI_LUSTRE_Fill_send_buffer(fd, buf, flat_buf, send_buf, offset_list,
2069 +                                      len_list, send_size, send_req,
2070 +                                      sent_to_proc, nprocs, myrank,
2071 +                                      contig_access_count, striping_info,
2072 +                                      send_buf_idx, curr_to_proc, done_to_proc,
2073 +                                      iter, buftype_extent);
2074 +       /* the send is done in ADIOI_Fill_send_buffer */
2075 +    }
2076 +
2077 +       /* bug fix from Wei-keng Liao and Kenin Coloma */
2078 +    if (fd->atomicity) {
2079 +       j = 0;
2080 +       for (i = 0; i < nprocs; i++) {
2081 +           MPI_Status wkl_status;
2082 +           if (recv_size[i]) {
2083 +               MPI_Recv(MPI_BOTTOM, 1, recv_types[j], i,
2084 +                        myrank + i + 100 * iter, fd->comm, &wkl_status);
2085 +               j++;
2086 +           }
2087 +       }
2088 +    }
2089 +
2090 +    for (i = 0; i < nprocs_recv; i++)
2091 +       MPI_Type_free(recv_types + i);
2092 +    ADIOI_Free(recv_types);
2093 +
2094 +       /* bug fix from Wei-keng Liao and Kenin Coloma */
2095 +       /* +1 to avoid a 0-size malloc */
2096 +    if (fd->atomicity) {
2097 +       statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + 1) *
2098 +                                              sizeof(MPI_Status));
2099 +    } else {
2100 +       statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1) *
2101 +                                              sizeof(MPI_Status));
2102 +    }
2103 +
2104 +#ifdef NEEDS_MPI_TEST
2105 +    i = 0;
2106 +    if (fd->atomicity) {
2107 +       /* bug fix from Wei-keng Liao and Kenin Coloma */
2108 +       while (!i)
2109 +           MPI_Testall(nprocs_send, send_req, &i, statuses);
2110 +    } else {
2111 +       while (!i)
2112 +           MPI_Testall(nprocs_send + nprocs_recv, requests, &i, statuses);
2113 +    }
2114 +#else
2115 +       /* bug fix from Wei-keng Liao and Kenin Coloma */
2116 +    if (fd->atomicity)
2117 +       MPI_Waitall(nprocs_send, send_req, statuses);
2118 +    else
2119 +       MPI_Waitall(nprocs_send + nprocs_recv, requests, statuses);
2120 +#endif
2121 +    ADIOI_Free(statuses);
2122 +    ADIOI_Free(requests);
2123 +    if (!buftype_is_contig && nprocs_send) {
2124 +       for (i = 0; i < nprocs; i++)
2125 +           if (send_size[i])
2126 +               ADIOI_Free(send_buf[i]);
2127 +       ADIOI_Free(send_buf);
2128 +    }
2129 +}
2130 +
2131 +#define ADIOI_BUF_INCR /* skip buf_incr bytes of the flattened user buffer without copying */ \
2132 +{ \
2133 +    while (buf_incr) { /* uses the expanding function's locals throughout */ \
2134 +        size_in_buf = ADIOI_MIN(buf_incr, flat_buf_sz); \
2135 +        user_buf_idx += size_in_buf; \
2136 +        flat_buf_sz -= size_in_buf; \
2137 +        if (!flat_buf_sz) { /* current contiguous piece used up: step to the next one */ \
2138 +            if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \
2139 +            else { /* past the last piece: wrap to piece 0 of the next buftype instance */ \
2140 +                flat_buf_idx = 0; \
2141 +                n_buftypes++; \
2142 +            } \
2143 +            user_buf_idx = flat_buf->indices[flat_buf_idx] + \
2144 +                              n_buftypes*buftype_extent; \
2145 +            flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
2146 +        } \
2147 +        buf_incr -= size_in_buf; \
2148 +    } \
2149 +}
2150 +
2151 +
2152 +#define ADIOI_BUF_COPY /* copy size bytes from the user buffer into send_buf[p], then skip the rest of buf_incr */ \
2153 +{ \
2154 +    while (size) { /* uses the expanding function's locals throughout */ \
2155 +        size_in_buf = ADIOI_MIN(size, flat_buf_sz); \
2156 +        memcpy(&(send_buf[p][send_buf_idx[p]]), \
2157 +               ((char *) buf) + user_buf_idx, size_in_buf); \
2158 +        send_buf_idx[p] += size_in_buf; \
2159 +        user_buf_idx += size_in_buf; \
2160 +        flat_buf_sz -= size_in_buf; \
2161 +        if (!flat_buf_sz) { /* current contiguous piece used up: step to the next one */ \
2162 +            if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \
2163 +            else { /* past the last piece: wrap to piece 0 of the next buftype instance */ \
2164 +                flat_buf_idx = 0; \
2165 +                n_buftypes++; \
2166 +            } \
2167 +            user_buf_idx = flat_buf->indices[flat_buf_idx] + \
2168 +                              n_buftypes*buftype_extent; \
2169 +            flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
2170 +        } \
2171 +        size -= size_in_buf; \
2172 +        buf_incr -= size_in_buf; /* copied bytes also count against buf_incr */ \
2173 +    } \
2174 +    ADIOI_BUF_INCR \
2175 +}
2176 +
2177 +static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf,
2178 +                                         ADIOI_Flatlist_node * flat_buf,
2179 +                                         char **send_buf,
2180 +                                         ADIO_Offset * offset_list,
2181 +                                         int *len_list, int *send_size,
2182 +                                         MPI_Request * requests,
2183 +                                         int *sent_to_proc, int nprocs,
2184 +                                         int myrank,
2185 +                                         int contig_access_count,
2186 +                                         int * striping_info,
2187 +                                         int *send_buf_idx,
2188 +                                         int *curr_to_proc,
2189 +                                         int *done_to_proc, int iter,
2190 +                                         MPI_Aint buftype_extent)
2191 +{
2192 +    /* only called if buftype is not contig: packs send_buf[] and posts the sends */
2193 +    int i, p, flat_buf_idx, size;
2194 +    int flat_buf_sz, buf_incr, size_in_buf, jj, n_buftypes;
2195 +    ADIO_Offset off, len, rem_len, user_buf_idx;
2196 +
2197 +    /* curr_to_proc[p] = amount of data destined for proc. p that this
2198 +     * walk over the access list has accounted for so far
2199 +     * done_to_proc[p] = amount of data already sent to proc. p in
2200 +     * previous iterations (seeded from sent_to_proc[p] below)
2201 +     * user_buf_idx = current location in user buffer
2202 +     * send_buf_idx[p] = current location in send_buf of proc. p
2203 +     */
2204 +
2205 +    for (i = 0; i < nprocs; i++) {
2206 +       send_buf_idx[i] = curr_to_proc[i] = 0;
2207 +       done_to_proc[i] = sent_to_proc[i];
2208 +    }
2209 +    jj = 0; /* next free slot in requests[] */
2210 +
2211 +    user_buf_idx = flat_buf->indices[0];
2212 +    flat_buf_idx = 0;
2213 +    n_buftypes = 0;
2214 +    flat_buf_sz = flat_buf->blocklens[0];
2215 +
2216 +    /* flat_buf_idx = current index into flattened buftype
2217 +     * flat_buf_sz = bytes left in the current contiguous component
2218 +     */
2219 +    for (i = 0; i < contig_access_count; i++) {
2220 +       off = offset_list[i];
2221 +       rem_len = (ADIO_Offset) len_list[i];
2222 +
2223 +       /* this request may span more than one process */
2224 +       while (rem_len != 0) {
2225 +           len = rem_len;
2226 +           /* NOTE: len is passed by address to ADIOI_LUSTRE_Calc_aggregator(),
2227 +            * which is expected to shorten it to the single region that
2228 +            * processor "p" is responsible for.
2229 +            */
2230 +           p = ADIOI_LUSTRE_Calc_aggregator(fd, off, &len, nprocs, striping_info);
2231 +
2232 +           if (send_buf_idx[p] < send_size[p]) { /* send_buf[p] not yet full */
2233 +               if (curr_to_proc[p] + len > done_to_proc[p]) { /* piece reaches unsent data */
2234 +                   if (done_to_proc[p] > curr_to_proc[p]) { /* head of piece already sent */
2235 +                       size = (int) ADIOI_MIN(curr_to_proc[p] + len -
2236 +                                              done_to_proc[p],
2237 +                                              send_size[p] -
2238 +                                              send_buf_idx[p]);
2239 +                       buf_incr = done_to_proc[p] - curr_to_proc[p];
2240 +                       ADIOI_BUF_INCR /* skip the part sent earlier */
2241 +                           buf_incr = (int) (curr_to_proc[p] + len -
2242 +                                             done_to_proc[p]);
2243 +                       curr_to_proc[p] = done_to_proc[p] + size;
2244 +                       ADIOI_BUF_COPY
2245 +                    } else { /* nothing of this piece was sent before */
2246 +                       size = (int) ADIOI_MIN(len, send_size[p] -
2247 +                                              send_buf_idx[p]);
2248 +                       buf_incr = (int) len;
2249 +                       curr_to_proc[p] += size;
2250 +                       ADIOI_BUF_COPY
2251 +                    }
2252 +                   if (send_buf_idx[p] == send_size[p]) { /* buffer complete: post the send */
2253 +                       MPI_Isend(send_buf[p], send_size[p], MPI_BYTE, p,
2254 +                                 myrank + p + 100 * iter, fd->comm,
2255 +                                 requests + jj);
2256 +                       jj++;
2257 +                   }
2258 +               } else { /* whole piece was sent in a previous iteration: skip it */
2259 +                   curr_to_proc[p] += (int) len;
2260 +                   buf_incr = (int) len;
2261 +                   ADIOI_BUF_INCR
2262 +                }
2263 +           } else { /* send_buf[p] already complete (or send_size[p] is 0): skip */
2264 +               buf_incr = (int) len;
2265 +               ADIOI_BUF_INCR
2266 +            }
2267 +           off += len;
2268 +           rem_len -= len;
2269 +       }
2270 +    }
2271 +    for (i = 0; i < nprocs; i++) /* save progress; seeds done_to_proc[] on the next call */
2272 +       if (send_size[i])
2273 +           sent_to_proc[i] = curr_to_proc[i];
2274 +}
2275 +
2276 +static void ADIOI_Heap_merge(ADIOI_Access * others_req, int *count,
2277 +                            ADIO_Offset * srt_off, int *srt_len,
2278 +                            int *start_pos, int nprocs, int nprocs_recv,
2279 +                            int total_elements)
2280 +{ /* merges the per-process (offset, len) lists into srt_off[]/srt_len[] by ascending offset */
2281 +    typedef struct { /* heap node: cursor into one process's request list */
2282 +       ADIO_Offset *off_list; /* next unmerged offset */
2283 +       int *len_list; /* length matching *off_list */
2284 +       int nelem; /* entries left in this list */
2285 +    } heap_struct;
2286 +
2287 +    heap_struct *a, tmp;
2288 +    int i, j, heapsize, l, r, k, smallest;
2289 +
2290 +    a = (heap_struct *) ADIOI_Malloc((nprocs_recv + 1) *
2291 +                                    sizeof(heap_struct));
2292 +
2293 +    j = 0; /* one node per process with count[i] != 0 */
2294 +    for (i = 0; i < nprocs; i++)
2295 +       if (count[i]) {
2296 +           a[j].off_list = &(others_req[i].offsets[start_pos[i]]);
2297 +           a[j].len_list = &(others_req[i].lens[start_pos[i]]);
2298 +           a[j].nelem = count[i];
2299 +           j++;
2300 +       }
2301 +
2302 +    /* build a heap out of the first element from each list, with the smallest at the root.
2303 +       NOTE(review): assumes exactly nprocs_recv entries of count[] are nonzero - confirm */
2304 +
2305 +    heapsize = nprocs_recv;
2306 +    for (i = heapsize / 2 - 1; i >= 0; i--) {
2307 +       /* Heapify(a, i, heapsize); Algorithm from Cormen et al. pg. 143
2308 +          modified for a heap with smallest element at root. The
2309 +          recursion is replaced by the loop below, which sifts node k
2310 +          down, to avoid the cost of function calls. */
2311 +       k = i;
2312 +       for (;;) {
2313 +           l = 2 * (k + 1) - 1;
2314 +           r = 2 * (k + 1);
2315 +
2316 +           if ((l < heapsize) && (*(a[l].off_list) < *(a[k].off_list)))
2317 +               smallest = l;
2318 +           else
2319 +               smallest = k;
2320 +
2321 +           if ((r < heapsize) &&
2322 +               (*(a[r].off_list) < *(a[smallest].off_list)))
2323 +               smallest = r;
2324 +
2325 +           if (smallest != k) {
2326 +               tmp.off_list = a[k].off_list;
2327 +               tmp.len_list = a[k].len_list;
2328 +               tmp.nelem = a[k].nelem;
2329 +
2330 +               a[k].off_list = a[smallest].off_list;
2331 +               a[k].len_list = a[smallest].len_list;
2332 +               a[k].nelem = a[smallest].nelem;
2333 +
2334 +               a[smallest].off_list = tmp.off_list;
2335 +               a[smallest].len_list = tmp.len_list;
2336 +               a[smallest].nelem = tmp.nelem;
2337 +
2338 +               k = smallest;
2339 +           } else
2340 +               break;
2341 +       }
2342 +    }
2343 +
2344 +    for (i = 0; i < total_elements; i++) {
2345 +       /* extract the root, i.e. the smallest offset still in the heap */
2346 +       srt_off[i] = *(a[0].off_list);
2347 +       srt_len[i] = *(a[0].len_list);
2348 +       (a[0].nelem)--;
2349 +
2350 +       if (!a[0].nelem) { /* list exhausted: move the last node to the root, shrink the heap */
2351 +           a[0].off_list = a[heapsize - 1].off_list;
2352 +           a[0].len_list = a[heapsize - 1].len_list;
2353 +           a[0].nelem = a[heapsize - 1].nelem;
2354 +           heapsize--;
2355 +       } else { /* advance the root's cursor to its next entry */
2356 +           (a[0].off_list)++;
2357 +           (a[0].len_list)++;
2358 +       }
2359 +
2360 +       /* Heapify(a, 0, heapsize): sift the new root down to restore heap order */
2361 +       k = 0;
2362 +       for (;;) {
2363 +           l = 2 * (k + 1) - 1;
2364 +           r = 2 * (k + 1);
2365 +
2366 +           if ((l < heapsize) && (*(a[l].off_list) < *(a[k].off_list)))
2367 +               smallest = l;
2368 +           else
2369 +               smallest = k;
2370 +
2371 +           if ((r < heapsize) &&
2372 +               (*(a[r].off_list) < *(a[smallest].off_list)))
2373 +               smallest = r;
2374 +
2375 +           if (smallest != k) {
2376 +               tmp.off_list = a[k].off_list;
2377 +               tmp.len_list = a[k].len_list;
2378 +               tmp.nelem = a[k].nelem;
2379 +
2380 +               a[k].off_list = a[smallest].off_list;
2381 +               a[k].len_list = a[smallest].len_list;
2382 +               a[k].nelem = a[smallest].nelem;
2383 +
2384 +               a[smallest].off_list = tmp.off_list;
2385 +               a[smallest].len_list = tmp.len_list;
2386 +               a[smallest].nelem = tmp.nelem;
2387 +
2388 +               k = smallest;
2389 +           } else
2390 +               break;
2391 +       }
2392 +    }
2393 +    ADIOI_Free(a);
2394 +}
2395 diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c
2396 --- ad_lustre_orig/ad_lustre_wrstr.c    1970-01-01 08:00:00.000000000 +0800
2397 +++ ad_lustre/ad_lustre_wrstr.c 2008-09-17 18:20:35.000000000 +0800
2398 @@ -0,0 +1,463 @@
2399 +/* -*- Mode: C; c-basic-offset:4 ; -*- */
2400 +/*
2401 + *   Copyright (C) 1997 University of Chicago.
2402 + *   See COPYRIGHT notice in top-level directory.
2403 + *
2404 + *   Copyright (C) 2007 Oak Ridge National Laboratory
2405 + *
2406 + *   Copyright (C) 2008 Sun Microsystems, Lustre group
2407 + */
2408 +
2409 +#include "ad_lustre.h"
2410 +#include "adio_extern.h"
2411 +
2412 +#define ADIOI_BUFFERED_WRITE /* read-modify-write req_len bytes at req_off through writebuf */ \
2413 +{ \
2414 +    if (req_off >= writebuf_off + writebuf_len) { /* request starts past the buffered window */ \
2415 +        if (writebuf_len) { /* flush the previous window before moving it */ \
2416 +           ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
2417 +                  ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \
2418 +           if (!(fd->atomicity)) \
2419 +                ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
2420 +           if (*error_code != MPI_SUCCESS) { \
2421 +               *error_code = MPIO_Err_create_code(*error_code, \
2422 +                                                  MPIR_ERR_RECOVERABLE, myname, \
2423 +                                                  __LINE__, MPI_ERR_IO, \
2424 +                                                  "**iowswc", 0); \
2425 +               return; \
2426 +           } \
2427 +        } \
2428 +       writebuf_off = req_off; \
2429 +        /* new window runs to the next stripe_size boundary, capped at end_offset */ \
2430 +        writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \
2431 +                                       (writebuf_off / stripe_size + 1) * \
2432 +                                       stripe_size - writebuf_off);\
2433 +       if (!(fd->atomicity)) /* per-window lock only when atomic mode is off */ \
2434 +            ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
2435 +       ADIO_ReadContig(fd, writebuf, writebuf_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,\
2436 +                        writebuf_off, &status1, error_code); \
2437 +       if (*error_code != MPI_SUCCESS) { \
2438 +           *error_code = MPIO_Err_create_code(*error_code, \
2439 +                                              MPIR_ERR_RECOVERABLE, myname, \
2440 +                                              __LINE__, MPI_ERR_IO, \
2441 +                                              "**iowsrc", 0); \
2442 +           return; /* NOTE(review): when !atomicity the window lock taken above is still held */ \
2443 +       } \
2444 +    } \
2445 +    write_sz = (int) ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off); \
2446 +    memcpy(writebuf + req_off - writebuf_off, (char *)buf + userbuf_off, write_sz);\
2447 +    while (write_sz != req_len) {\
2448 +        ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, /* request spills over: flush this window */ \
2449 +                         ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \
2450 +        if (!(fd->atomicity)) \
2451 +            ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
2452 +        if (*error_code != MPI_SUCCESS) { \
2453 +            *error_code = MPIO_Err_create_code(*error_code, \
2454 +                                               MPIR_ERR_RECOVERABLE, myname, \
2455 +                                               __LINE__, MPI_ERR_IO, \
2456 +                                               "**iowswc", 0); \
2457 +            return; \
2458 +        } \
2459 +        req_len -= write_sz; \
2460 +        userbuf_off += write_sz; \
2461 +        writebuf_off += writebuf_len; /* slide the window to the bytes that follow */ \
2462 +        /* next window runs to the next stripe_size boundary, capped at end_offset */ \
2463 +        writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \
2464 +                                       (writebuf_off / stripe_size + 1) * \
2465 +                                       stripe_size - writebuf_off);\
2466 +       if (!(fd->atomicity)) \
2467 +            ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
2468 +        ADIO_ReadContig(fd, writebuf, writebuf_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,\
2469 +                        writebuf_off, &status1, error_code); \
2470 +       if (*error_code != MPI_SUCCESS) { \
2471 +           *error_code = MPIO_Err_create_code(*error_code, \
2472 +                                              MPIR_ERR_RECOVERABLE, myname, \
2473 +                                              __LINE__, MPI_ERR_IO, \
2474 +                                              "**iowsrc", 0); \
2475 +           return; /* NOTE(review): when !atomicity the window lock taken above is still held */ \
2476 +       } \
2477 +        write_sz = ADIOI_MIN(req_len, writebuf_len); \
2478 +        memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
2479 +    } \
2480 +}
2481 +
2482 +
2483 +/* this macro is used when the bytes to be written are contiguous in the file
2484 +   (buftype may be noncontig). it does no read-modify-write and no locking */
2485 +#define ADIOI_BUFFERED_WRITE_WITHOUT_READ \
2486 +{ \
2487 +    if (req_off >= writebuf_off + writebuf_len) { /* request starts past the window: restart it at req_off */ \
2488 +       writebuf_off = req_off; \
2489 +        /* window runs to the next stripe_size boundary, capped at end_offset */ \
2490 +        writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \
2491 +                                       (writebuf_off / stripe_size + 1) * \
2492 +                                       stripe_size - writebuf_off);\
2493 +    } \
2494 +    write_sz = (int) ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off); \
2495 +    memcpy(writebuf + req_off - writebuf_off, (char *)buf + userbuf_off, write_sz);\
2496 +    while (req_len) { /* NOTE(review): writes all writebuf_len bytes even if only write_sz were filled */ \
2497 +        ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
2498 +                         ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \
2499 +        if (*error_code != MPI_SUCCESS) { \
2500 +            *error_code = MPIO_Err_create_code(*error_code, \
2501 +                                               MPIR_ERR_RECOVERABLE, myname, \
2502 +                                               __LINE__, MPI_ERR_IO, \
2503 +                                               "**iowswc", 0); \
2504 +            return; \
2505 +        } \
2506 +        req_len -= write_sz; \
2507 +        userbuf_off += write_sz; \
2508 +        writebuf_off += writebuf_len; /* NOTE(review): a later req_off below this makes the memcpy offset negative - confirm */ \
2509 +        /* next window runs to the next stripe_size boundary, capped at end_offset */ \
2510 +        writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \
2511 +                                       (writebuf_off / stripe_size + 1) * \
2512 +                                       stripe_size - writebuf_off);\
2513 +        write_sz = ADIOI_MIN(req_len, writebuf_len); \
2514 +        memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
2515 +    } \
2516 +}
2517 +
2518 +void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count,
2519 +                              MPI_Datatype datatype, int file_ptr_type,
2520 +                              ADIO_Offset offset, ADIO_Status * status,
2521 +                              int *error_code)
2522 +{
2523 +    /* offset is in units of etype relative to the filetype. */
2524 +    ADIOI_Flatlist_node *flat_buf, *flat_file;
2525 +    int i, j, k, bwr_size, fwr_size = 0, st_index = 0;
2526 +    int bufsize, num, size, sum, n_etypes_in_filetype, size_in_filetype;
2527 +    int n_filetypes, etype_in_filetype;
2528 +    ADIO_Offset abs_off_in_filetype = 0;
2529 +    int filetype_size, etype_size, buftype_size, req_len;
2530 +    MPI_Aint filetype_extent, buftype_extent;
2531 +    int buf_count, buftype_is_contig, filetype_is_contig;
2532 +    ADIO_Offset userbuf_off;
2533 +    ADIO_Offset off, req_off, disp, end_offset = 0, writebuf_off, start_off;
2534 +    char *writebuf;
2535 +    int flag, st_fwr_size, st_n_filetypes, writebuf_len, write_sz;
2536 +    ADIO_Status status1;
2537 +    int new_bwr_size, new_fwr_size;
2538 +    char * value;
2539 +    int stripe_size, lflag = 0;
2540 +    static char myname[] = "ADIOI_LUSTRE_WriteStrided";
2541 +    int myrank;
2542 +    MPI_Comm_rank(fd->comm, &myrank);
2543 +
2544 +    if (fd->hints->ds_write == ADIOI_HINT_DISABLE) {
2545 +       /* if user has disabled data sieving on writes, use naive
2546 +        * approach instead.
2547 +        */
2548 +       ADIOI_GEN_WriteStrided_naive(fd,
2549 +                                    buf,
2550 +                                    count,
2551 +                                    datatype,
2552 +                                    file_ptr_type,
2553 +                                    offset, status, error_code);
2554 +       return;
2555 +    }
2556 +
2557 +    *error_code = MPI_SUCCESS; /* changed below if error */
2558 +
2559 +    ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
2560 +    ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
2561 +
2562 +    MPI_Type_size(fd->filetype, &filetype_size);
2563 +    if (!filetype_size) {
2564 +       *error_code = MPI_SUCCESS;
2565 +       return;
2566 +    }
2567 +
2568 +    MPI_Type_extent(fd->filetype, &filetype_extent);
2569 +    MPI_Type_size(datatype, &buftype_size);
2570 +    MPI_Type_extent(datatype, &buftype_extent);
2571 +    etype_size = fd->etype_size;
2572 +
2573 +    bufsize = buftype_size * count;
2574 +
2575 +    /* get striping info */
2576 +    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
2577 +    MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, &lflag);
2578 +    if (lflag)
2579 +       stripe_size = atoi(value);
2580 +    ADIOI_Free(value);
2581 +
2582 +    /* Different buftype to different filetype */
2583 +    if (!buftype_is_contig && filetype_is_contig) {
2584 +        /* noncontiguous in memory, contiguous in file. */
2585 +       ADIOI_Flatten_datatype(datatype);
2586 +       flat_buf = ADIOI_Flatlist;
2587 +       while (flat_buf->type != datatype)
2588 +           flat_buf = flat_buf->next;
2589 +
2590 +       off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
2591 +           fd->disp + etype_size * offset;
2592 +
2593 +       start_off = off;
2594 +       end_offset = start_off + bufsize - 1;
2595 +       writebuf_off = start_off;
2596 +        /* write stripe size buffer each time */
2597 +       writebuf = (char *) ADIOI_Malloc(ADIOI_MIN(bufsize, stripe_size));
2598 +        writebuf_len = (int) ADIOI_MIN(bufsize,
2599 +                                       (writebuf_off / stripe_size + 1) *
2600 +                                       stripe_size - writebuf_off);
2601 +
2602 +        /* if atomicity is true, lock the region to be accessed */
2603 +       if (fd->atomicity)
2604 +           ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, bufsize);
2605 +
2606 +       for (j = 0; j < count; j++) {
2607 +           for (i = 0; i < flat_buf->count; i++) {
2608 +               userbuf_off = j * buftype_extent + flat_buf->indices[i];
2609 +               req_off = off;
2610 +               req_len = flat_buf->blocklens[i];
2611 +               ADIOI_BUFFERED_WRITE_WITHOUT_READ
2612 +               off += flat_buf->blocklens[i];
2613 +           }
2614 +        }
2615 +
2616 +       /* write the buffer out finally */
2617 +       ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE,
2618 +                        ADIO_EXPLICIT_OFFSET, writebuf_off, &status1,
2619 +                        error_code);
2620 +
2621 +       if (fd->atomicity)
2622 +           ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize);
2623 +       if (*error_code != MPI_SUCCESS)
2624 +           return;
2625 +       ADIOI_Free(writebuf);
2626 +       if (file_ptr_type == ADIO_INDIVIDUAL)
2627 +           fd->fp_ind = off;
2628 +    } else {
2629 +        /* noncontiguous in file */
2630 +        /* filetype already flattened in ADIO_Open */
2631 +       flat_file = ADIOI_Flatlist;
2632 +       while (flat_file->type != fd->filetype)
2633 +           flat_file = flat_file->next;
2634 +       disp = fd->disp;
2635 +
2636 +       if (file_ptr_type == ADIO_INDIVIDUAL) {
2637 +           offset = fd->fp_ind;        /* in bytes */
2638 +           n_filetypes = -1;
2639 +           flag = 0;
2640 +           while (!flag) {
2641 +               n_filetypes++;
2642 +               for (i = 0; i < flat_file->count; i++) {
2643 +                   if (disp + flat_file->indices[i] +
2644 +                       (ADIO_Offset) n_filetypes * filetype_extent +
2645 +                       flat_file->blocklens[i] >= offset) {
2646 +                       st_index = i;
2647 +                       fwr_size = (int) (disp + flat_file->indices[i] +
2648 +                                         (ADIO_Offset) n_filetypes *
2649 +                                         filetype_extent +
2650 +                                         flat_file->blocklens[i] -
2651 +                                         offset);
2652 +                       flag = 1;
2653 +                       break;
2654 +                   }
2655 +               }
2656 +           }
2657 +       } else {
2658 +           n_etypes_in_filetype = filetype_size / etype_size;
2659 +           n_filetypes = (int) (offset / n_etypes_in_filetype);
2660 +           etype_in_filetype = (int) (offset % n_etypes_in_filetype);
2661 +           size_in_filetype = etype_in_filetype * etype_size;
2662 +
2663 +           sum = 0;
2664 +           for (i = 0; i < flat_file->count; i++) {
2665 +               sum += flat_file->blocklens[i];
2666 +               if (sum > size_in_filetype) {
2667 +                   st_index = i;
2668 +                   fwr_size = sum - size_in_filetype;
2669 +                   abs_off_in_filetype = flat_file->indices[i] +
2670 +                       size_in_filetype - (sum - flat_file->blocklens[i]);
2671 +                   break;
2672 +               }
2673 +           }
2674 +
2675 +           /* abs. offset in bytes in the file */
2676 +           offset = disp + (ADIO_Offset) n_filetypes *filetype_extent +
2677 +                    abs_off_in_filetype;
2678 +       }
2679 +
2680 +       start_off = offset;
2681 +
2682 +       /* If the file bytes is actually contiguous, we do not need data sieve at all */
2683 +       if (bufsize <= fwr_size) {
2684 +            req_off = start_off;
2685 +            req_len = bufsize;
2686 +            end_offset = start_off + bufsize - 1;
2687 +           writebuf = (char *) ADIOI_Malloc(ADIOI_MIN(bufsize, stripe_size));
2688 +           memset(writebuf, -1, ADIOI_MIN(bufsize, stripe_size));
2689 +            writebuf_off = 0;
2690 +            writebuf_len = 0;
2691 +            userbuf_off = 0;
2692 +            ADIOI_BUFFERED_WRITE_WITHOUT_READ
2693 +       } else {
2694 +           /* Calculate end_offset, the last byte-offset that will be accessed.
2695 +              e.g., if start_offset=0 and 100 bytes to be write, end_offset=99 */
2696 +           st_fwr_size = fwr_size;
2697 +           st_n_filetypes = n_filetypes;
2698 +           i = 0;
2699 +           j = st_index;
2700 +           off = offset;
2701 +           fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
2702 +           while (i < bufsize) {
2703 +               i += fwr_size;
2704 +               end_offset = off + fwr_size - 1;
2705 +
2706 +               if (j < (flat_file->count - 1))
2707 +                   j++;
2708 +               else {
2709 +                   j = 0;
2710 +                   n_filetypes++;
2711 +               }
2712 +
2713 +               off = disp + flat_file->indices[j] +
2714 +                     (ADIO_Offset) n_filetypes * filetype_extent;
2715 +               fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize - i);
2716 +           }
2717 +
2718 +           writebuf_off = 0;
2719 +           writebuf_len = 0;
2720 +           writebuf = (char *) ADIOI_Malloc(stripe_size);
2721 +           memset(writebuf, -1, stripe_size);
2722 +           /* if atomicity is true, lock the region to be accessed */
2723 +           if (fd->atomicity)
2724 +               ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, bufsize);
2725 +
2726 +           if (buftype_is_contig && !filetype_is_contig) {
2727 +               /* contiguous in memory, noncontiguous in file. should be the most
2728 +                  common case. */
2729 +               i = 0;
2730 +               j = st_index;
2731 +               off = offset;
2732 +               n_filetypes = st_n_filetypes;
2733 +               fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
2734 +               while (i < bufsize) {
2735 +                   if (fwr_size) {
2736 +                       /* TYPE_UB and TYPE_LB can result in
2737 +                          fwr_size = 0. save system call in such cases */
2738 +                       /*
2739 +                        lseek(fd->fd_sys, off, SEEK_SET);
2740 +                       err = write(fd->fd_sys, ((char *) buf) + i, fwr_size);
2741 +                        */
2742 +                       req_off = off;
2743 +                       req_len = fwr_size;
2744 +                       userbuf_off = i;
2745 +                       ADIOI_BUFFERED_WRITE
2746 +                    }
2747 +                   i += fwr_size;
2748 +
2749 +                   if (off + fwr_size < disp + flat_file->indices[j] +
2750 +                                        flat_file->blocklens[j] +
2751 +                           (ADIO_Offset) n_filetypes * filetype_extent)
2752 +                       off += fwr_size;
2753 +                   /* did not reach end of contiguous block in filetype.
2754 +                   no more I/O needed. off is incremented by fwr_size. */
2755 +                   else {
2756 +                       if (j < (flat_file->count - 1))
2757 +                           j++;
2758 +                       else {
2759 +                           j = 0;
2760 +                           n_filetypes++;
2761 +                       }
2762 +                       off = disp + flat_file->indices[j] +
2763 +                             (ADIO_Offset) n_filetypes * filetype_extent;
2764 +                       fwr_size = ADIOI_MIN(flat_file->blocklens[j],
2765 +                                             bufsize - i);
2766 +                   }
2767 +               }
2768 +           } else {
2769 +                   /* noncontiguous in memory as well as in file */
2770 +               ADIOI_Flatten_datatype(datatype);
2771 +               flat_buf = ADIOI_Flatlist;
2772 +               while (flat_buf->type != datatype)
2773 +                   flat_buf = flat_buf->next;
2774 +
2775 +               k = num = buf_count = 0;
2776 +               i = (int) (flat_buf->indices[0]);
2777 +               j = st_index;
2778 +               off = offset;
2779 +               n_filetypes = st_n_filetypes;
2780 +               fwr_size = st_fwr_size;
2781 +               bwr_size = flat_buf->blocklens[0];
2782 +
2783 +               while (num < bufsize) {
2784 +                   size = ADIOI_MIN(fwr_size, bwr_size);
2785 +                   if (size) {
2786 +                       /*
2787 +                        lseek(fd->fd_sys, off, SEEK_SET);
2788 +                        err = write(fd->fd_sys, ((char *) buf) + i, size);
2789 +                        */
2790 +                       req_off = off;
2791 +                       req_len = size;
2792 +                       userbuf_off = i;
2793 +                       ADIOI_BUFFERED_WRITE
2794 +                    }
2795 +
2796 +                   new_fwr_size = fwr_size;
2797 +                   new_bwr_size = bwr_size;
2798 +
2799 +                   if (size == fwr_size) {
2800 +                       /* reached end of contiguous block in file */
2801 +                       if (j < (flat_file->count - 1)) {
2802 +                           j++;
2803 +                        } else {
2804 +                           j = 0;
2805 +                           n_filetypes++;
2806 +                       }
2807 +                       off = disp + flat_file->indices[j] +
2808 +                             (ADIO_Offset) n_filetypes * filetype_extent;
2809 +                        new_fwr_size = flat_file->blocklens[j];
2810 +                       if (size != bwr_size) {
2811 +                           i += size;
2812 +                           new_bwr_size -= size;
2813 +                       }
2814 +                   }
2815 +                   if (size == bwr_size) {
2816 +                       /* reached end of contiguous block in memory */
2817 +                       k = (k + 1) % flat_buf->count;
2818 +                       buf_count++;
2819 +                       i = (int) (buftype_extent *
2820 +                                  (buf_count / flat_buf->count) +
2821 +                                 flat_buf->indices[k]);
2822 +                       new_bwr_size = flat_buf->blocklens[k];
2823 +                       if (size != fwr_size) {
2824 +                           off += size;
2825 +                           new_fwr_size -= size;
2826 +                       }
2827 +                   }
2828 +                   num += size;
2829 +                   fwr_size = new_fwr_size;
2830 +                   bwr_size = new_bwr_size;
2831 +               }
2832 +            }
2833 +
2834 +           /* write the buffer out finally */
2835 +           if (writebuf_len) {
2836 +               ADIO_WriteContig(fd, writebuf, writebuf_len,
2837 +                                MPI_BYTE, ADIO_EXPLICIT_OFFSET,
2838 +                                writebuf_off, &status1, error_code);
2839 +               if (!(fd->atomicity))
2840 +                   ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
2841 +               if (*error_code != MPI_SUCCESS)
2842 +                   return;
2843 +           }
2844 +           if (fd->atomicity)
2845 +               ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize);
2846 +       }
2847 +        ADIOI_Free(writebuf);
2848 +       if (file_ptr_type == ADIO_INDIVIDUAL)
2849 +           fd->fp_ind = off;
2850 +    }
2851 +    fd->fp_sys_posn = -1;      /* set it to null. */
2852 +
2853 +#ifdef HAVE_STATUS_SET_BYTES
2854 +    MPIR_Status_set_bytes(status, datatype, bufsize);
2855 +    /* This is a temporary way of filling in status. The right way is to
2856 +    keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
2857 +#endif
2858 +
2859 +    if (!buftype_is_contig)
2860 +        ADIOI_Delete_flattened(datatype);
2861 +}
2862 diff -ruN ad_lustre_orig/Makefile ad_lustre/Makefile
2863 --- ad_lustre_orig/Makefile     1970-01-01 08:00:00.000000000 +0800
2864 +++ ad_lustre/Makefile  2008-09-17 18:20:35.000000000 +0800
2865 @@ -0,0 +1,50 @@
2866 +CC          = gcc
2867 +AR          = ar cr
2868 +RANLIB      = ranlib
2869 +LIBNAME     = /work/download/mpich2-1.0.7-dev/lib/libmpich.a
2870 +srcdir      = /work/download/mpich2-1.0.7-dev/src/mpi/romio/adio/ad_lustre
2871 +CC_SHL      = true
2872 +SHLIBNAME   = /work/download/mpich2-1.0.7-dev/lib/libmpich
2873 +# Include path and flags used by the .c.o rule. NOTE(review): every path in this file is an absolute /work/download/... location, so it looks like configure output for one build tree -- confirm it belongs in the patch.
2874 +INCLUDE_DIR = -I. -I${srcdir}/../include -I../include -I../../include -I${srcdir}/../../../../include -I../../../../include
2875 +CFLAGS      =   -I/work/download/mpich2-1.0.7-dev/src/mpid/ch3/include -I/work/download/mpich2-1.0.7-dev/src/mpid/ch3/include -I/work/download/mpich2-1.0.7-dev/src/mpid/common/datatype -I/work/download/mpich2-1.0.7-dev/src/mpid/common/datatype -I/work/download/mpich2-1.0.7-dev/src/mpid/ch3/channels/sock/include -I/work/download/mpich2-1.0.7-dev/src/mpid/ch3/channels/sock/include -I/work/download/mpich2-1.0.7-dev/src/mpid/common/sock -I/work/download/mpich2-1.0.7-dev/src/mpid/common/sock -I/work/download/mpich2-1.0.7-dev/src/mpid/common/sock/poll -I/work/download/mpich2-1.0.7-dev/src/mpid/common/sock/poll -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -O2  -DFORTRANDOUBLEUNDERSCORE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -DHAVE_ROMIOCONF_H -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 $(INCLUDE_DIR)
2876 +# Settings for building .lo objects. CC_SHL is set to `true` above, so C_COMPILE_SHL begins with the command `true` rather than a compiler.
2877 +top_builddir  = /work/download/mpich2-1.0.7-dev
2878 +LIBTOOL       = 
2879 +C_COMPILE_SHL = $(CC_SHL) -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -O2  -DFORTRANDOUBLEUNDERSCORE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -DHAVE_ROMIOCONF_H -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 $(INCLUDE_DIR)
2880 +# Search for prerequisites in the current directory first, then in ${srcdir}.
2881 +VPATH = .:${srcdir}
2882 +# Object files that the $(LIBNAME) rule below adds to the archive.
2883 +AD_LUSTRE_OBJECTS = ad_lustre.o ad_lustre_open.o \
2884 +      ad_lustre_rwcontig.o ad_lustre_wrcoll.o ad_lustre_wrstr.o  \
2885 +      ad_lustre_fcntl.o ad_lustre_hints.o ad_lustre_close.o \
2886 +      ad_lustre_aggregate.o
2887 +
2888 +# default: build $(LIBNAME); the $(SHLIBNAME).la branch never runs here because the test "none" != "none" is always false.
2889 +default: $(LIBNAME)
2890 +       @if [ "none" != "none" ] ; then \
2891 +           $(MAKE) $(SHLIBNAME).la ;\
2892 +       fi
2893 +# Suffix rules: .c.o compiles one source with $(CC) $(CFLAGS); .c.lo runs $(C_COMPILE_SHL) with output _s<stem>.o and then renames that file to <stem>.lo.
2894 +.SUFFIXES: $(SUFFIXES) .p .lo
2895 +
2896 +.c.o:
2897 +       $(CC) $(CFLAGS) -c $<
2898 +.c.lo:
2899 +       $(C_COMPILE_SHL) -c $< -o _s$*.o
2900 +       @mv -f _s$*.o $*.lo
2901 +# Insert the driver objects into the archive $(LIBNAME) with $(AR), then run $(RANLIB) on the archive.
2902 +$(LIBNAME): $(AD_LUSTRE_OBJECTS)
2903 +       $(AR) $(LIBNAME) $(AD_LUSTRE_OBJECTS)
2904 +       $(RANLIB) $(LIBNAME)
2905 +# Shared-library variant: the same object list with each .o replaced by .lo, archived into $(SHLIBNAME).la.
2906 +AD_LUSTRE_LOOBJECTS=$(AD_LUSTRE_OBJECTS:.o=.lo)
2907 +$(SHLIBNAME).la: $(AD_LUSTRE_LOOBJECTS)
2908 +       $(AR) $(SHLIBNAME).la $(AD_LUSTRE_LOOBJECTS)
2909 +# coverage: run gcov -b -f on the .c file matching each object; the leading '-' tells make to ignore a failing command and '@' suppresses echoing it.
2910 +coverage:
2911 +       -@for file in  ${AD_LUSTRE_OBJECTS:.o=.c} ; do \
2912 +               gcov -b -f $$file ; done
2913 +# clean: delete every .o and .lo file in the current directory.
2914 +clean:
2915 +       @rm -f *.o *.lo
2916 diff -ruN ad_lustre_orig/Makefile.in ad_lustre/Makefile.in
2917 --- ad_lustre_orig/Makefile.in  2008-09-17 14:36:57.000000000 +0800
2918 +++ ad_lustre/Makefile.in       2008-09-17 18:20:35.000000000 +0800
2919 @@ -16,7 +16,9 @@
2920  @VPATH@
2921  
2922  AD_LUSTRE_OBJECTS = ad_lustre.o ad_lustre_open.o \
2923 -      ad_lustre_rwcontig.o ad_lustre_hints.o 
2924 +      ad_lustre_rwcontig.o ad_lustre_wrcoll.o ad_lustre_wrstr.o  \
2925 +      ad_lustre_hints.o ad_lustre_aggregate.o
2926 +
2927  
2928  default: $(LIBNAME)
2929         @if [ "@ENABLE_SHLIB@" != "none" ] ; then \
2930 diff -ruN ad_lustre_orig/README ad_lustre/README
2931 --- ad_lustre_orig/README       2008-09-17 14:36:57.000000000 +0800
2932 +++ ad_lustre/README    2008-09-17 18:20:35.000000000 +0800
2933 @@ -5,6 +5,22 @@
2934    o To post the code for ParColl (Partitioned collective IO)
2935   
2936  -----------------------------------------------------
2937 +V05: 
2938 +-----------------------------------------------------
2939 +  o Improved data redistribution
2940 +    - add I/O pattern identification. If the requested I/O size is
2941 +      big, collective I/O is not performed. The hint big_req_size
2942 +      can be used to define what counts as big.
2943 +    - provide the hint CO, which controls the number of IO clients
2944 +      for each OST, for load balancing
2945 +    - divide the IO clients into different OST groups to
2946 +      produce a stripe-contiguous I/O pattern
2947 +    - reduce collective overhead: the hints contiguous_data and
2948 +      same_io_size remove unnecessary MPI_Alltoall() calls
2949 +  o Control read-modify-write for data sieving in collective IO
2950 +    with the hint ds_in_coll.
2951 +
2952 +-----------------------------------------------------
2953  V04: 
2954  -----------------------------------------------------
2955    o Direct IO and Lockless IO support