1 diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
2 --- ad_lustre_orig/ad_lustre_aggregate.c 1970-01-01 08:00:00.000000000 +0800
3 +++ ad_lustre/ad_lustre_aggregate.c 2008-09-17 18:20:35.000000000 +0800
5 +/* -*- Mode: C; c-basic-offset:4 ; -*- */
7 + * Copyright (C) 1997 University of Chicago.
8 + * See COPYRIGHT notice in top-level directory.
10 + * Copyright (C) 2007 Oak Ridge National Laboratory
12 + * Copyright (C) 2008 Sun Microsystems, Lustre group
15 +#include "ad_lustre.h"
16 +#include "adio_extern.h"
18 +void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int ** striping_info_ptr,
19 + int mode, int nprocs)
21 + int *striping_info = NULL;
22 + /* get striping information:
23 + * striping_info[0] = stripe_size;
24 + * striping_info[1] = stripe_count;
25 + * striping_info[2] = CO;
27 + /* for easy understanding, we name some variables */
28 + int stripe_size, stripe_count, CO = 1, CO_max = 1, lflag;
29 + char *value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
31 + MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, &lflag);
33 + stripe_size = atoi(value);
35 + MPI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, value, &lflag);
37 + stripe_count = atoi(value);
38 + /* stripe_size and stripe_count have been validated in ADIOI_LUSTRE_Open() */
42 + /* for collective read,
43 + * if "CO" clients access the same OST simultaneously,
44 + * the OST disk seek time would be large. So, to avoid this,
45 + * it might be better if 1 client only accesses 1 OST.
46 + * So, we set CO = 1 to meet the above requirement.
49 + /* XXX: there may be other, better ways to handle collective read */
51 + /* CO_max: the largest number of IO clients for each ost group */
52 + CO_max = (nprocs - 1)/ stripe_count + 1;
53 + /* CO also has been validated in ADIOI_LUSTRE_Open(), >0 */
54 + MPI_Info_get(fd->info, "CO", MPI_MAX_INFO_VAL, value, &lflag);
57 + CO = ADIOI_MIN(CO_max, CO);
60 + /* although only 3 values are needed so far, we still malloc space here
61 + * instead of declaring an array[3] outside,
62 + * because on one hand in the future we probably need more hints, and
63 + * on the other hand this function can be called by
64 + * both collective read and write conveniently.
66 + *striping_info_ptr = (int *) ADIOI_Malloc(3 * sizeof(int));
67 + striping_info = *striping_info_ptr;
68 + striping_info[0] = stripe_size;
69 + striping_info[1] = stripe_count;
70 + striping_info[2] = CO;
73 +int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off,
74 + ADIO_Offset *len, int nprocs,
77 + /* please refer to the comments in the above function for the detailed algorithm */
79 + ADIO_Offset avail_bytes;
81 + int stripe_size = striping_info[0];
82 + int stripe_count = striping_info[1];
83 + int CO = striping_info[2];
84 + int avail_nprocs = ADIOI_MIN(stripe_count * CO, nprocs);
86 + /* calculate the rank by offset directly */
87 + rank_index = (int)((off / stripe_size) % avail_nprocs);
88 + /* XXX: the above method is so simple that the processes in top ranks are always
89 + * chosen to be I/O clients. we hope they are different each time.
92 + avail_bytes = (off / (ADIO_Offset)stripe_size + 1) *
93 + (ADIO_Offset)stripe_size - off;
94 + if (avail_bytes < *len) {
95 + /* this proc only has part of the requested contig. region */
102 +void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list,
103 + int *len_list, int contig_access_count,
104 + int *striping_info, int nprocs,
105 + int *count_my_req_procs_ptr,
106 + int **count_my_req_per_proc_ptr,
107 + ADIOI_Access ** my_req_ptr,
110 + int *count_my_req_per_proc, count_my_req_procs, *buf_idx;
112 + ADIO_Offset avail_len, rem_len, curr_idx, off;
113 + ADIOI_Access *my_req;
115 + *count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
116 + count_my_req_per_proc = *count_my_req_per_proc_ptr;
118 + /* buf_idx is relevant only if buftype_is_contig.
119 + * buf_idx[i] gives the index into user_buf where data received
120 + * from proc. i should be placed. This allows receives to be done
121 + * without extra buffer. This can't be done if buftype is not contig.
123 + buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));
124 + /* initialize buf_idx to -1 */
125 + for (i = 0; i < nprocs; i++)
128 + /* one pass just to calculate how much space to allocate for my_req;
129 + * contig_access_count was calculated way back in ADIOI_Calc_my_off_len()
131 + for (i = 0; i < contig_access_count; i++) {
132 + /* short circuit offset/len processing if len == 0
133 + * (zero-byte read/write
135 + if (len_list[i] == 0)
137 + off = offset_list[i];
138 + avail_len = len_list[i];
139 + /* we set avail_len to be the total size of the access.
140 + * then ADIOI_LUSTRE_Calc_aggregator() will modify the value to return
141 + * the amount that was available.
143 + proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, nprocs,
145 + count_my_req_per_proc[proc]++;
146 + /* figure out how much data is remaining in the access
147 + * we'll take care of this data (if there is any)
148 + * in the while loop below.
150 + rem_len = len_list[i] - avail_len;
152 + while (rem_len != 0) {
153 + off += avail_len; /* point to first remaining byte */
154 + avail_len = rem_len; /* save remaining size, pass to calc */
155 + proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, nprocs,
157 + count_my_req_per_proc[proc]++;
158 + rem_len -= avail_len; /* reduce remaining length by amount from fd */
162 + *my_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs * sizeof(ADIOI_Access));
163 + my_req = *my_req_ptr;
165 + count_my_req_procs = 0;
166 + for (i = 0; i < nprocs; i++) {
167 + if (count_my_req_per_proc[i]) {
168 + my_req[i].offsets = (ADIO_Offset *)
169 + ADIOI_Malloc(count_my_req_per_proc[i] *
170 + sizeof(ADIO_Offset));
171 + my_req[i].lens = (int *) ADIOI_Malloc(count_my_req_per_proc[i] *
173 + count_my_req_procs++;
175 + my_req[i].count = 0; /* will be incremented where needed later */
178 + /* now fill in my_req */
180 + for (i = 0; i < contig_access_count; i++) {
181 + if (len_list[i] == 0)
183 + off = offset_list[i];
184 + avail_len = len_list[i];
185 + proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, nprocs,
188 + /* for each separate contiguous access from this process */
189 + if (buf_idx[proc] == -1)
190 + buf_idx[proc] = (int) curr_idx;
192 + l = my_req[proc].count;
193 + curr_idx += (int) avail_len; /* NOTE: curr_idx is declared as ADIO_Offset, so this (int) cast looks unnecessary. Fix? */
195 + rem_len = len_list[i] - avail_len;
197 + /* store the proc, offset, and len information in an array
198 + * of structures, my_req. Each structure contains the
199 + * offsets and lengths located in that process's FD,
200 + * and the associated count.
202 + my_req[proc].offsets[l] = off;
203 + my_req[proc].lens[l] = (int) avail_len;
204 + my_req[proc].count++;
206 + while (rem_len != 0) {
208 + avail_len = rem_len;
209 + proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, nprocs,
211 + if (buf_idx[proc] == -1)
212 + buf_idx[proc] = (int) curr_idx;
214 + l = my_req[proc].count;
215 + curr_idx += avail_len;
216 + rem_len -= avail_len;
218 + my_req[proc].offsets[l] = off;
219 + my_req[proc].lens[l] = (int) avail_len;
220 + my_req[proc].count++;
225 + for (i = 0; i < nprocs; i++) {
226 + if (count_my_req_per_proc[i] > 0) {
227 + FPRINTF(stdout, "data needed from %d (count = %d):\n",
228 + i, my_req[i].count);
229 + for (l = 0; l < my_req[i].count; l++) {
230 + FPRINTF(stdout, " off[%d] = %lld, len[%d] = %d\n",
231 + l, my_req[i].offsets[l], l, my_req[i].lens[l]);
237 + for (i = 0; i < nprocs; i++) {
238 + FPRINTF(stdout, "buf_idx[%d] = 0x%x\n", i, buf_idx[i]);
242 + *count_my_req_procs_ptr = count_my_req_procs;
243 + *buf_idx_ptr = buf_idx;
246 +int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count,
247 + int *len_list, int nprocs)
250 + * So far, only one case is suitable for collective I/O
251 + * (1) request size <= big_req_size
253 + * if (avg_req_size > big_req_size) {
258 + int i, docollect = 1, lflag, big_req_size = 0;
259 + ADIO_Offset req_size = 0, total_req_size;
260 + int avg_req_size, total_access_count;
261 + char *value = NULL;
263 + /* calculate total_req_size and total_access_count */
264 + for (i = 0; i < contig_access_count; i++)
265 + req_size += len_list[i];
266 + MPI_Allreduce(&req_size, &total_req_size, 1, MPI_LONG_LONG_INT, MPI_SUM,
268 + MPI_Allreduce(&contig_access_count, &total_access_count, 1, MPI_INT, MPI_SUM,
270 + /* estimate average req_size */
271 + avg_req_size = (int)(total_req_size / total_access_count);
273 + /* get hint of big_req_size */
274 + value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
275 + MPI_Info_get(fd->info, "big_req_size", MPI_MAX_INFO_VAL, value, &lflag);
277 + big_req_size = atoi(value);
279 + if ((big_req_size > 0) && (avg_req_size > big_req_size))
287 +void ADIOI_LUSTRE_Calc_my_off_len(ADIO_File fd, int bufcount,
288 + MPI_Datatype datatype, int file_ptr_type,
289 + ADIO_Offset offset,
290 + ADIO_Offset **offset_list_ptr,
291 + int **len_list_ptr,
292 + ADIO_Offset *start_offset_ptr,
293 + ADIO_Offset *end_offset_ptr,
294 + int *contig_access_count_ptr)
296 + int filetype_size, buftype_size, etype_size;
297 + int i, j, k, frd_size = 0, old_frd_size = 0, st_index = 0;
298 + int n_filetypes, etype_in_filetype;
299 + ADIO_Offset abs_off_in_filetype = 0;
300 + int bufsize, sum, n_etypes_in_filetype, size_in_filetype;
301 + int contig_access_count, *len_list, flag, filetype_is_contig;
302 + MPI_Aint filetype_extent, filetype_lb;
303 + ADIOI_Flatlist_node *flat_file;
304 + ADIO_Offset *offset_list, off, end_offset = 0, disp;
306 + /* For this process's request, calculate the list of offsets and
307 + lengths in the file and determine the start and end offsets. */
309 + ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
311 + MPI_Type_size(fd->filetype, &filetype_size);
312 + MPI_Type_extent(fd->filetype, &filetype_extent);
313 + MPI_Type_lb(fd->filetype, &filetype_lb);
314 + MPI_Type_size(datatype, &buftype_size);
315 + etype_size = fd->etype_size;
317 + if (!filetype_size) {
318 + *contig_access_count_ptr = 0;
319 + *offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset));
320 + *len_list_ptr = (int *) ADIOI_Malloc(2 * sizeof(int));
321 + /* 2 is for consistency. everywhere I malloc one more than needed */
323 + offset_list = *offset_list_ptr;
324 + len_list = *len_list_ptr;
325 + offset_list[0] = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
326 + fd->disp + etype_size * offset;
328 + *start_offset_ptr = offset_list[0];
329 + *end_offset_ptr = offset_list[0] + len_list[0] - 1;
333 + if (filetype_is_contig) {
334 + *contig_access_count_ptr = 1;
335 + *offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset));
336 + *len_list_ptr = (int *) ADIOI_Malloc(2 * sizeof(int));
337 + /* 2 is for consistency. everywhere I malloc one more than needed */
339 + offset_list = *offset_list_ptr;
340 + len_list = *len_list_ptr;
341 + offset_list[0] = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
342 + fd->disp + etype_size * offset;
343 + len_list[0] = bufcount * buftype_size;
344 + *start_offset_ptr = offset_list[0];
345 + *end_offset_ptr = offset_list[0] + len_list[0] - 1;
347 + /* update file pointer */
348 + if (file_ptr_type == ADIO_INDIVIDUAL)
349 + fd->fp_ind = *end_offset_ptr + 1;
351 + /* First calculate what size of offset_list and len_list to allocate */
352 + /* filetype already flattened in ADIO_Open or ADIO_Fcntl */
353 + flat_file = ADIOI_Flatlist;
354 + while (flat_file->type != fd->filetype)
355 + flat_file = flat_file->next;
358 + if (file_ptr_type == ADIO_INDIVIDUAL) {
359 + offset = fd->fp_ind; /* in bytes */
364 + for (i = 0; i < flat_file->count; i++) {
365 + if (disp + flat_file->indices[i] +
366 + (ADIO_Offset) n_filetypes * filetype_extent +
367 + flat_file->blocklens[i] >= offset) {
369 + frd_size = (int) (disp + flat_file->indices[i] +
370 + (ADIO_Offset) n_filetypes *
372 + flat_file->blocklens[i] -
380 + n_etypes_in_filetype = filetype_size / etype_size;
381 + n_filetypes = (int) (offset / n_etypes_in_filetype);
382 + etype_in_filetype = (int) (offset % n_etypes_in_filetype);
383 + size_in_filetype = etype_in_filetype * etype_size;
386 + for (i = 0; i < flat_file->count; i++) {
387 + sum += flat_file->blocklens[i];
388 + if (sum > size_in_filetype) {
390 + frd_size = sum - size_in_filetype;
391 + abs_off_in_filetype = flat_file->indices[i] +
393 + (sum - flat_file->blocklens[i]);
398 + /* abs. offset in bytes in the file */
399 + offset = disp + (ADIO_Offset) n_filetypes *filetype_extent +
400 + abs_off_in_filetype;
403 + /* calculate how much space to allocate for offset_list, len_list */
405 + old_frd_size = frd_size;
406 + contig_access_count = i = 0;
408 + bufsize = buftype_size * bufcount;
409 + frd_size = ADIOI_MIN(frd_size, bufsize);
410 + while (i < bufsize) {
412 + contig_access_count++;
414 + j = (j + 1) % flat_file->count;
415 + frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize - i);
418 + /* allocate space for offset_list and len_list */
420 + *offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc((contig_access_count+1) *
421 + sizeof(ADIO_Offset));
422 + *len_list_ptr = (int *) ADIOI_Malloc((contig_access_count + 1) *
424 + /* +1 to avoid a 0-size malloc */
426 + offset_list = *offset_list_ptr;
427 + len_list = *len_list_ptr;
429 + /* find start offset, end offset, and fill in offset_list and len_list */
431 + *start_offset_ptr = offset; /* calculated above */
436 + frd_size = ADIOI_MIN(old_frd_size, bufsize);
437 + while (i < bufsize) {
439 + offset_list[k] = off;
440 + len_list[k] = frd_size;
444 + end_offset = off + frd_size - 1;
446 + /* Note: end_offset points to the last byte-offset that will be accessed.
447 + e.g., if start_offset=0 and 100 bytes to be read, end_offset=99 */
449 + if (off + frd_size < disp + flat_file->indices[j] +
450 + flat_file->blocklens[j] +
451 + (ADIO_Offset) n_filetypes * filetype_extent) {
453 + /* did not reach end of contiguous block in filetype.
454 + * no more I/O needed. off is incremented by frd_size.
457 + if (j < (flat_file->count - 1))
460 + /* hit end of flattened filetype;
461 + * start at beginning again
466 + off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes *
468 + frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize - i);
472 + /* update file pointer */
473 + if (file_ptr_type == ADIO_INDIVIDUAL)
476 + *contig_access_count_ptr = contig_access_count;
477 + *end_offset_ptr = end_offset;
481 +void ADIOI_LUSTRE_Calc_others_req(ADIO_File fd, int count_my_req_procs,
482 + int *count_my_req_per_proc,
483 + ADIOI_Access * my_req,
484 + int nprocs, int myrank,
485 + ADIO_Offset start_offset,
486 + ADIO_Offset end_offset,
487 + int *striping_info,
488 + int *count_others_req_procs_ptr,
489 + ADIOI_Access ** others_req_ptr)
491 + /* what requests of other processes will be written by this process */
493 + int *count_others_req_per_proc, count_others_req_procs;
494 + int i, j, lflag, samesize = 0, contiguous = 0;
495 + MPI_Request *send_requests, *recv_requests;
496 + MPI_Status *statuses;
497 + ADIOI_Access *others_req;
498 + char *value = NULL;
499 + int proc, avail_nprocs, stripe_count, CO;
500 + ADIO_Offset min_st_offset, off, req_len, avail_len, rem_len, *all_lens;
502 + value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
504 + MPI_Info_get(fd->info, "same_io_size", MPI_MAX_INFO_VAL, value, &lflag);
505 + if (lflag && !strcmp(value, "yes"))
507 + /* contiguous data */
508 + MPI_Info_get(fd->info, "contiguous_data", MPI_MAX_INFO_VAL, value, &lflag);
509 + if (lflag && !strcmp(value, "yes"))
512 + *others_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs *
513 + sizeof(ADIOI_Access));
514 + others_req = *others_req_ptr;
516 + /* if the data are contiguous, we don't need to do MPI_Alltoall */
518 + stripe_count = striping_info[1];
519 + CO = striping_info[2];
521 + for (i = 0; i < nprocs; i++) {
522 + others_req[i].count = 0;
524 + req_len = end_offset - start_offset + 1;
525 + all_lens = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
527 + if (samesize == 0) {/* different request size */
528 + /* calculate the min_st_offset */
529 + MPI_Allreduce(&start_offset, &min_st_offset, 1, MPI_LONG_LONG,
530 + MPI_MIN, fd->comm);
531 + /* exchange request length */
532 + MPI_Allgather(&req_len, 1, ADIO_OFFSET, all_lens, 1, ADIO_OFFSET,
534 + } else { /* same request size */
535 + /* calculate the min_st_offset */
536 + min_st_offset = start_offset - myrank * req_len;
537 + /* assign request length to all_lens[] */
538 + for (i = 0; i < nprocs; i ++)
539 + all_lens[i] = req_len;
541 + avail_nprocs = ADIOI_MIN(nprocs, stripe_count * CO);
542 + if (myrank < avail_nprocs) {
543 + off = min_st_offset;
544 + /* calculate others_req[i].count */
545 + for (i = 0; i < nprocs; i++) {
546 + avail_len = all_lens[i];
547 + rem_len = avail_len;
548 + while (rem_len > 0) {
549 + proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len,
550 + nprocs, striping_info);
551 + if (proc == myrank) {
552 + others_req[i].count ++;
555 + rem_len -= avail_len;
556 + avail_len = rem_len;
559 + /* calculate offset and len for each request */
560 + off = min_st_offset;
561 + for (i = 0; i < nprocs; i++) {
562 + if (others_req[i].count) {
563 + others_req[i].offsets = (ADIO_Offset *)
564 + ADIOI_Malloc(others_req[i].count *
565 + sizeof(ADIO_Offset));
566 + others_req[i].lens = (int *)
567 + ADIOI_Malloc(others_req[i].count *
569 + others_req[i].mem_ptrs = (MPI_Aint *)
570 + ADIOI_Malloc(others_req[i].count *
574 + avail_len = all_lens[i];
575 + rem_len = avail_len;
576 + while (rem_len > 0) {
577 + proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len,
578 + nprocs, striping_info);
579 + if (proc == myrank) {
580 + others_req[i].offsets[j] = off;
581 + others_req[i].lens[j] = (int)avail_len;
585 + rem_len -= avail_len;
586 + avail_len = rem_len;
591 + ADIOI_Free(all_lens);
593 + /* multiple non-contiguous requests */
594 + /* first find out how much to send/recv and from/to whom */
597 + * count_others_req_procs:
598 + * number of processes whose requests will be written by
599 + * this process (including this process itself)
600 + * count_others_req_per_proc[i]:
601 + * how many separate contiguous requests of proc[i] will be
602 + * written by this process.
605 + count_others_req_per_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
607 + MPI_Alltoall(count_my_req_per_proc, 1, MPI_INT,
608 + count_others_req_per_proc, 1, MPI_INT, fd->comm);
610 + count_others_req_procs = 0;
611 + for (i = 0; i < nprocs; i++) {
612 + if (count_others_req_per_proc[i]) {
613 + others_req[i].count = count_others_req_per_proc[i];
614 + others_req[i].offsets = (ADIO_Offset *)
615 + ADIOI_Malloc(others_req[i].count *
616 + sizeof(ADIO_Offset));
617 + others_req[i].lens = (int *)
618 + ADIOI_Malloc(others_req[i].count *
620 + others_req[i].mem_ptrs = (MPI_Aint *)
621 + ADIOI_Malloc(others_req[i].count *
623 + count_others_req_procs++;
625 + others_req[i].count = 0;
628 + /* now send the calculated offsets and lengths to respective processes */
630 + send_requests = (MPI_Request *) ADIOI_Malloc(2 * (count_my_req_procs + 1) *
631 + sizeof(MPI_Request));
632 + recv_requests = (MPI_Request *) ADIOI_Malloc(2 * (count_others_req_procs+1)*
633 + sizeof(MPI_Request));
634 + /* +1 to avoid a 0-size malloc */
637 + for (i = 0; i < nprocs; i++) {
638 + if (others_req[i].count) {
639 + MPI_Irecv(others_req[i].offsets, others_req[i].count,
640 + ADIO_OFFSET, i, i + myrank, fd->comm,
641 + &recv_requests[j]);
643 + MPI_Irecv(others_req[i].lens, others_req[i].count,
644 + MPI_INT, i, i + myrank + 1, fd->comm,
645 + &recv_requests[j]);
651 + for (i = 0; i < nprocs; i++) {
652 + if (my_req[i].count) {
653 + MPI_Isend(my_req[i].offsets, my_req[i].count,
654 + ADIO_OFFSET, i, i + myrank, fd->comm,
655 + &send_requests[j]);
657 + MPI_Isend(my_req[i].lens, my_req[i].count,
658 + MPI_INT, i, i + myrank + 1, fd->comm,
659 + &send_requests[j]);
664 + statuses = (MPI_Status *)
665 + ADIOI_Malloc((1 + 2 * ADIOI_MAX(count_my_req_procs,
666 + count_others_req_procs)) *
667 + sizeof(MPI_Status));
668 + /* +1 to avoid a 0-size malloc */
670 + MPI_Waitall(2 * count_my_req_procs, send_requests, statuses);
671 + MPI_Waitall(2 * count_others_req_procs, recv_requests, statuses);
673 + ADIOI_Free(send_requests);
674 + ADIOI_Free(recv_requests);
675 + ADIOI_Free(statuses);
676 + ADIOI_Free(count_others_req_per_proc);
678 + *count_others_req_procs_ptr = count_others_req_procs;
681 diff -ruN ad_lustre_orig/ad_lustre.c ad_lustre/ad_lustre.c
682 --- ad_lustre_orig/ad_lustre.c 2008-09-17 14:36:57.000000000 +0800
683 +++ ad_lustre/ad_lustre.c 2008-09-17 18:20:35.000000000 +0800
685 /* -*- Mode: C; c-basic-offset:4 ; -*- */
687 - * Copyright (C) 2001 University of Chicago.
689 + * Copyright (C) 2001 University of Chicago.
690 * See COPYRIGHT notice in top-level directory.
692 * Copyright (C) 2007 Oak Ridge National Laboratory
694 + * Copyright (C) 2008 Sun Microsystems, Lustre group
697 #include "ad_lustre.h"
699 ADIOI_LUSTRE_ReadContig, /* ReadContig */
700 ADIOI_LUSTRE_WriteContig, /* WriteContig */
701 ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
702 - ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */
703 + ADIOI_LUSTRE_WriteStridedColl, /* WriteStridedColl */
704 ADIOI_GEN_SeekIndividual, /* SeekIndividual */
705 - ADIOI_GEN_Fcntl, /* Fcntl */
706 + ADIOI_LUSTRE_Fcntl, /* Fcntl */
707 ADIOI_LUSTRE_SetInfo, /* SetInfo */
708 ADIOI_GEN_ReadStrided, /* ReadStrided */
709 - ADIOI_GEN_WriteStrided, /* WriteStrided */
710 - ADIOI_GEN_Close, /* Close */
711 + ADIOI_LUSTRE_WriteStrided, /* WriteStrided */
712 + ADIOI_LUSTRE_Close, /* Close */
713 #if defined(ROMIO_HAVE_WORKING_AIO) && !defined(CRAY_XT_LUSTRE)
714 ADIOI_GEN_IreadContig, /* IreadContig */
715 ADIOI_GEN_IwriteContig, /* IwriteContig */
716 diff -ruN ad_lustre_orig/ad_lustre_close.c ad_lustre/ad_lustre_close.c
717 --- ad_lustre_orig/ad_lustre_close.c 1970-01-01 08:00:00.000000000 +0800
718 +++ ad_lustre/ad_lustre_close.c 2008-09-17 18:20:35.000000000 +0800
720 +/* -*- Mode: C; c-basic-offset:4 ; -*- */
723 + * Copyright (C) 1997 University of Chicago.
724 + * See COPYRIGHT notice in top-level directory.
726 + * Copyright (C) 2007 Oak Ridge National Laboratory
728 + * Copyright (C) 2008 Sun Microsystems, Lustre group
731 +#include "ad_lustre.h"
737 +void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code)
740 + static char myname[] = "ADIOI_LUSTRE_CLOSE";
743 + MPE_Log_event(9, 0, "start close");
746 + err = close(fd->fd_sys);
749 + MPE_Log_event(10, 0, "end close");
754 + if (err == -1 || derr == -1) {
756 + MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname,
757 + __LINE__, MPI_ERR_IO, "**io", "**io %s",
760 + *error_code = MPI_SUCCESS;
762 diff -ruN ad_lustre_orig/ad_lustre.h ad_lustre/ad_lustre.h
763 --- ad_lustre_orig/ad_lustre.h 2008-09-17 14:36:57.000000000 +0800
764 +++ ad_lustre/ad_lustre.h 2008-09-17 18:20:35.000000000 +0800
766 /* -*- Mode: C; c-basic-offset:4 ; -*- */
768 - * Copyright (C) 1997 University of Chicago.
770 + * Copyright (C) 1997 University of Chicago.
771 * See COPYRIGHT notice in top-level directory.
773 * Copyright (C) 2007 Oak Ridge National Laboratory
775 + * Copyright (C) 2008 Sun Microsystems, Lustre group
778 #ifndef AD_UNIX_INCLUDE
781 void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code);
782 void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code);
783 -void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count,
784 - MPI_Datatype datatype, int file_ptr_type,
785 - ADIO_Offset offset, ADIO_Status *status, int
787 -void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count,
788 - MPI_Datatype datatype, int file_ptr_type,
789 - ADIO_Offset offset, ADIO_Status *status, int
791 +void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count,
792 + MPI_Datatype datatype, int file_ptr_type,
793 + ADIO_Offset offset, ADIO_Status *status,
795 +void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count,
796 + MPI_Datatype datatype, int file_ptr_type,
797 + ADIO_Offset offset, ADIO_Status *status,
799 +void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count,
800 + MPI_Datatype datatype, int file_ptr_type,
801 + ADIO_Offset offset, ADIO_Status *status,
803 void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count,
804 - MPI_Datatype datatype, int file_ptr_type,
805 - ADIO_Offset offset, ADIO_Status *status, int
807 + MPI_Datatype datatype, int file_ptr_type,
808 + ADIO_Offset offset, ADIO_Status *status,
810 void ADIOI_LUSTRE_ReadStridedColl(ADIO_File fd, void *buf, int count,
811 - MPI_Datatype datatype, int file_ptr_type,
812 - ADIO_Offset offset, ADIO_Status *status, int
814 + MPI_Datatype datatype, int file_ptr_type,
815 + ADIO_Offset offset, ADIO_Status *status,
817 +void ADIOI_LUSTRE_ReadStrided(ADIO_File fd, void *buf, int count,
818 + MPI_Datatype datatype, int file_ptr_type,
819 + ADIO_Offset offset, ADIO_Status *status,
821 void ADIOI_LUSTRE_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct,
823 void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
825 +void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int ** striping_info_ptr,
826 + int mode, int nprocs);
827 +int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off,
828 + ADIO_Offset *len, int nprocs,
829 + int *striping_info);
830 +void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list,
831 + int *len_list, int contig_access_count,
832 + int *striping_info, int nprocs,
833 + int *count_my_req_procs_ptr,
834 + int **count_my_req_per_proc_ptr,
835 + ADIOI_Access ** my_req_ptr,
836 + int **buf_idx_ptr);
837 +int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count,
838 + int *len_list, int nprocs);
839 +void ADIOI_LUSTRE_Calc_my_off_len(ADIO_File fd, int bufcount,
840 + MPI_Datatype datatype, int file_ptr_type,
841 + ADIO_Offset offset,
842 + ADIO_Offset **offset_list_ptr,
843 + int **len_list_ptr,
844 + ADIO_Offset *start_offset_ptr,
845 + ADIO_Offset *end_offset_ptr,
846 + int *contig_access_count_ptr);
847 +void ADIOI_LUSTRE_Calc_others_req(ADIO_File fd, int count_my_req_procs,
848 + int *count_my_req_per_proc,
849 + ADIOI_Access * my_req,
850 + int nprocs, int myrank,
851 + ADIO_Offset start_offset,
852 + ADIO_Offset end_offset,
853 + int *striping_info,
854 + int *count_others_req_procs_ptr,
855 + ADIOI_Access ** others_req_ptr);
856 #endif /* End of AD_UNIX_INCLUDE */
857 diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c
858 --- ad_lustre_orig/ad_lustre_hints.c 2008-09-17 14:36:57.000000000 +0800
859 +++ ad_lustre/ad_lustre_hints.c 2008-09-17 18:20:35.000000000 +0800
861 /* -*- Mode: C; c-basic-offset:4 ; -*- */
863 - * Copyright (C) 1997 University of Chicago.
865 + * Copyright (C) 1997 University of Chicago.
866 * See COPYRIGHT notice in top-level directory.
868 * Copyright (C) 2007 Oak Ridge National Laboratory
870 + * Copyright (C) 2008 Sun Microsystems, Lustre group
873 #include "ad_lustre.h"
874 @@ -11,130 +13,162 @@
876 void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
878 - char *value, *value_in_fd;
879 - int flag, tmp_val[3], str_factor=-1, str_unit=0, start_iodev=-1;
880 - struct lov_user_md lum = { 0 };
881 - int err, myrank, fd_sys, perm, amode, old_mask;
882 + char *value = NULL;
883 + int flag, tmp_val, int_val, str_factor, str_unit, start_iodev;
884 + static char myname[] = "ADIOI_LUSTRE_SETINFO";
886 - value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
887 if ( (fd->info) == MPI_INFO_NULL) {
888 - /* This must be part of the open call. can set striping parameters
890 + /* This must be part of the open call. can set striping parameters
892 MPI_Info_create(&(fd->info));
894 MPI_Info_set(fd->info, "direct_read", "false");
895 MPI_Info_set(fd->info, "direct_write", "false");
896 fd->direct_read = fd->direct_write = 0;
898 - /* has user specified striping or server buffering parameters
900 + /* has user specified striping or server buffering parameters
901 and do they have the same value on all processes? */
902 if (users_info != MPI_INFO_NULL) {
903 - MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
906 - str_unit=atoi(value);
908 - MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
911 - str_factor=atoi(value);
912 + value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
914 - MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL,
915 + /* direct read and write */
916 + MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL,
919 - start_iodev=atoi(value);
921 - MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL,
923 if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) {
924 MPI_Info_set(fd->info, "direct_read", "true");
928 - MPI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL,
929 + MPI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL,
931 if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) {
932 MPI_Info_set(fd->info, "direct_write", "true");
933 fd->direct_write = 1;
937 - MPI_Comm_rank(fd->comm, &myrank);
939 - tmp_val[0] = str_factor;
940 - tmp_val[1] = str_unit;
941 - tmp_val[2] = start_iodev;
943 - MPI_Bcast(tmp_val, 3, MPI_INT, 0, fd->comm);
945 - if (tmp_val[0] != str_factor
946 - || tmp_val[1] != str_unit
947 - || tmp_val[2] != start_iodev) {
948 - FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: All keys"
949 - "-striping_factor:striping_unit:start_iodevice "
950 - "need to be identical across all processes\n");
951 - MPI_Abort(MPI_COMM_WORLD, 1);
952 - } else if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0)) {
953 - /* if user has specified striping info, process 0 tries to set it */
955 - if (fd->perm == ADIO_PERM_NULL) {
956 - old_mask = umask(022);
958 - perm = old_mask ^ 0666;
960 - else perm = fd->perm;
963 - if (fd->access_mode & ADIO_CREATE)
964 - amode = amode | O_CREAT;
965 - if (fd->access_mode & ADIO_RDONLY)
966 - amode = amode | O_RDONLY;
967 - if (fd->access_mode & ADIO_WRONLY)
968 - amode = amode | O_WRONLY;
969 - if (fd->access_mode & ADIO_RDWR)
970 - amode = amode | O_RDWR;
971 - if (fd->access_mode & ADIO_EXCL)
972 - amode = amode | O_EXCL;
974 - /* we need to create file so ensure this is set */
975 - amode = amode | O_LOV_DELAY_CREATE | O_CREAT;
977 - fd_sys = open(fd->filename, amode, perm);
978 - if (fd_sys == -1) {
979 - if (errno != EEXIST)
981 - "Failure to open file %s %d %d\n",strerror(errno), amode, perm);
983 - lum.lmm_magic = LOV_USER_MAGIC;
984 - lum.lmm_pattern = 0;
985 - lum.lmm_stripe_size = str_unit;
986 - lum.lmm_stripe_count = str_factor;
987 - lum.lmm_stripe_offset = start_iodev;
989 - err = ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum);
990 - if (err == -1 && errno != EEXIST) {
991 - fprintf(stderr, "Failure to set stripe info %s \n", strerror(errno));
995 - } /* End of striping parameters validation */
997 + MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
999 + if (flag && (str_unit = atoi(value))) {
1000 + tmp_val = str_unit;
1001 + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
1002 + if (tmp_val != str_unit) {
1003 + MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
1008 + MPI_Info_set(fd->info, "striping_unit", value);
1010 + /* stripe count */
1011 + MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
1013 + if (flag && (str_factor = atoi(value))) {
1014 + tmp_val = str_factor;
1015 + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
1016 + if (tmp_val != str_factor) {
1017 + MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
1018 + "striping_factor",
1022 + MPI_Info_set(fd->info, "striping_factor", value);
1024 + /* stripe offset */
1025 + MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL,
1027 + if (flag && ((start_iodev = atoi(value)) >= 0)) {
1028 + tmp_val = start_iodev;
1029 + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
1030 + if (tmp_val != start_iodev) {
1031 + MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
1036 + MPI_Info_set(fd->info, "start_iodevice", value);
1039 + MPI_Info_get(users_info, "CO", MPI_MAX_INFO_VAL, value,
1041 + if (flag && (int_val = atoi(value)) > 0) {
1042 + tmp_val = int_val;
1043 + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
1044 + if (tmp_val != int_val) {
1045 + MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
1050 + MPI_Info_set(fd->info, "CO", value);
1052 + /* big_req_size */
1053 + MPI_Info_get(users_info, "big_req_size", MPI_MAX_INFO_VAL, value,
1055 + if (flag && (int_val = atoi(value)) > 0) {
1056 + tmp_val = int_val;
1057 + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
1058 + if (tmp_val != int_val) {
1059 + MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
1064 + MPI_Info_set(fd->info, "big_req_size", value);
1066 + /* hint for controlling data sieving when doing collective I/O */
1067 + MPI_Info_get(users_info, "ds_in_coll", MPI_MAX_INFO_VAL,
1069 + if (flag && (!strcmp(value, "enable") ||
1070 + !strcmp(value, "ENABLE"))) {
1071 + tmp_val = int_val = 1;
1072 + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
1073 + if (tmp_val != int_val) {
1074 + MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
1079 + MPI_Info_set(fd->info, "ds_in_coll", "enable");
1081 + /* same io size */
1082 + MPI_Info_get(users_info, "same_io_size", MPI_MAX_INFO_VAL,
1084 + if (flag && (!strcmp(value, "yes") ||
1085 + !strcmp(value, "YES"))) {
1086 + tmp_val = int_val = 1;
1087 + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
1088 + if (tmp_val != int_val) {
1089 + MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
1094 + MPI_Info_set(fd->info, "same_io_size", "yes");
1096 + /* contiguous data */
1097 + MPI_Info_get(users_info, "contiguous_data", MPI_MAX_INFO_VAL,
1099 + if (flag && (!strcmp(value, "yes") ||
1100 + !strcmp(value, "YES"))) {
1101 + tmp_val = int_val = 1;
1102 + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
1103 + if (tmp_val != int_val) {
1104 + MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
1105 + "contiguous_data",
1109 + MPI_Info_set(fd->info, "contiguous_data", "yes");
1111 + ADIOI_Free(value);
1114 - MPI_Barrier(fd->comm);
1115 - /* set the values for collective I/O and data sieving parameters */
1116 - ADIOI_GEN_SetInfo(fd, users_info, error_code);
1118 - /* The file has been opened previously and fd->fd_sys is a valid
1119 - file descriptor. cannot set striping parameters now. */
1121 - /* set the values for collective I/O and data sieving parameters */
1122 - ADIOI_GEN_SetInfo(fd, users_info, error_code);
1125 + /* set the values for collective I/O and data sieving parameters */
1126 + ADIOI_GEN_SetInfo(fd, users_info, error_code);
1128 if (ADIOI_Direct_read) fd->direct_read = 1;
1129 if (ADIOI_Direct_write) fd->direct_write = 1;
1131 - ADIOI_Free(value);
1133 *error_code = MPI_SUCCESS;
1135 diff -ruN ad_lustre_orig/ad_lustre_open.c ad_lustre/ad_lustre_open.c
1136 --- ad_lustre_orig/ad_lustre_open.c 2008-09-17 14:36:57.000000000 +0800
1137 +++ ad_lustre/ad_lustre_open.c 2008-09-17 18:55:50.000000000 +0800
1139 /* -*- Mode: C; c-basic-offset:4 ; -*- */
1141 - * Copyright (C) 1997 University of Chicago.
1143 + * Copyright (C) 1997 University of Chicago.
1144 * See COPYRIGHT notice in top-level directory.
1146 * Copyright (C) 2007 Oak Ridge National Laboratory
1148 + * Copyright (C) 2008 Sun Microsystems, Lustre group
1151 #include "ad_lustre.h"
1153 void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code)
1155 - int perm, old_mask, amode, amode_direct;
1156 + int perm, old_mask, amode = 0, amode_direct = 0, flag = 0, err, myrank;
1157 + int stripe_size = 0, stripe_count = 0, stripe_offset = -1;
1158 struct lov_user_md lum = { 0 };
1160 + char *value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
1162 #if defined(MPICH2) || !defined(PRINT_ERR_MSG)
1163 static char myname[] = "ADIOI_LUSTRE_OPEN";
1165 old_mask = umask(022);
1167 perm = old_mask ^ 0666;
1169 - else perm = fd->perm;
1174 - if (fd->access_mode & ADIO_CREATE)
1175 + if (fd->access_mode & ADIO_CREATE) {
1176 amode = amode | O_CREAT;
1177 + /* Check striping info:
1178 + * if already set by SetInfo(), copy it into lum; otherwise, it is taken from lum
1180 + MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value,
1183 + stripe_size = atoi(value);
1185 + MPI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, value,
1188 + stripe_count = atoi(value);
1190 + MPI_Info_get(fd->info, "start_iodevice", MPI_MAX_INFO_VAL, value,
1193 + stripe_offset = atoi(value);
1195 + /* if user has specified striping info,
1196 + * process 0 will try to check and set it.
1198 + if ((stripe_size > 0) || (stripe_count > 0) || (stripe_offset >= 0)) {
1199 + MPI_Comm_rank(fd->comm, &myrank);
1200 + if (myrank == 0) {
1201 + int fd_sys = open(fd->filename, amode, perm);
1202 + if (fd_sys == -1) {
1203 + if (errno != EEXIST)
1204 + FPRINTF(stderr, "Failure to open file %s %d %d\n",
1205 + strerror(errno), amode, perm);
1207 + lum.lmm_magic = LOV_USER_MAGIC;
1208 + lum.lmm_pattern = 1;
1209 + lum.lmm_stripe_size = stripe_size;
1210 + lum.lmm_stripe_count = stripe_count;
1211 + lum.lmm_stripe_offset = stripe_offset;
1213 + if (ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum))
1215 + "Failure to set striping info to Lustre!\n");
1219 + MPI_Barrier(fd->comm);
1223 if (fd->access_mode & ADIO_RDONLY)
1224 amode = amode | O_RDONLY;
1225 if (fd->access_mode & ADIO_WRONLY)
1227 fd->fd_sys = open(fd->filename, amode|O_CREAT, perm);
1229 if (fd->fd_sys != -1) {
1232 - value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
1234 /* get file striping information and set it in info */
1235 - lum.lmm_magic = LOV_USER_MAGIC;
1236 - err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum);
1239 - sprintf(value, "%d", lum.lmm_stripe_size);
1240 - MPI_Info_set(fd->info, "striping_unit", value);
1242 - sprintf(value, "%d", lum.lmm_stripe_count);
1243 - MPI_Info_set(fd->info, "striping_factor", value);
1245 - sprintf(value, "%d", lum.lmm_stripe_offset);
1246 - MPI_Info_set(fd->info, "start_iodevice", value);
1248 - ADIOI_Free(value);
1249 + lum.lmm_magic = LOV_USER_MAGIC;
1250 + err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum);
1253 + if (lum.lmm_stripe_size && lum.lmm_stripe_count &&
1254 + (lum.lmm_stripe_offset >= 0)) {
1255 + sprintf(value, "%d", lum.lmm_stripe_size);
1256 + MPI_Info_set(fd->info, "striping_unit", value);
1258 + sprintf(value, "%d", lum.lmm_stripe_count);
1259 + MPI_Info_set(fd->info, "striping_factor", value);
1261 + sprintf(value, "%d", lum.lmm_stripe_offset);
1262 + MPI_Info_set(fd->info, "start_iodevice", value);
1264 + FPRINTF(stderr, "Striping info is invalid!\n");
1265 + ADIOI_Free(value);
1266 + MPI_Abort(MPI_COMM_WORLD, 1);
1269 + FPRINTF(stderr, "Failed to get striping info from Lustre!\n");
1270 + ADIOI_Free(value);
1271 + MPI_Abort(MPI_COMM_WORLD, 1);
1273 if (fd->access_mode & ADIO_APPEND)
1274 fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
1278 if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
1279 - fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
1280 + fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
1283 if (fd->direct_write || fd->direct_read) {
1284 @@ -81,20 +133,22 @@
1287 /* --BEGIN ERROR HANDLING-- */
1288 - if (fd->fd_sys == -1 || ((fd->fd_direct == -1) &&
1289 - (fd->direct_write || fd->direct_read))) {
1290 + if (fd->fd_sys == -1 || ((fd->fd_direct == -1) &&
1291 + (fd->direct_write || fd->direct_read))) {
1292 if (errno == ENAMETOOLONG)
1293 *error_code = MPIO_Err_create_code(MPI_SUCCESS,
1294 - MPIR_ERR_RECOVERABLE, myname,
1295 - __LINE__, MPI_ERR_BAD_FILE,
1296 + MPIR_ERR_RECOVERABLE,
1300 "**filenamelong %s %d",
1302 strlen(fd->filename));
1303 else if (errno == ENOENT)
1304 *error_code = MPIO_Err_create_code(MPI_SUCCESS,
1305 - MPIR_ERR_RECOVERABLE, myname,
1306 - __LINE__, MPI_ERR_NO_SUCH_FILE,
1307 + MPIR_ERR_RECOVERABLE,
1309 + MPI_ERR_NO_SUCH_FILE,
1313 @@ -108,27 +162,30 @@
1315 else if (errno == EACCES) {
1316 *error_code = MPIO_Err_create_code(MPI_SUCCESS,
1317 - MPIR_ERR_RECOVERABLE, myname,
1318 - __LINE__, MPI_ERR_ACCESS,
1319 + MPIR_ERR_RECOVERABLE,
1323 - "**fileaccess %s",
1326 - else if (errno == EROFS) {
1327 + "**fileaccess %s",
1329 + } else if (errno == EROFS) {
1330 /* Read only file or file system and write access requested */
1331 *error_code = MPIO_Err_create_code(MPI_SUCCESS,
1332 - MPIR_ERR_RECOVERABLE, myname,
1333 - __LINE__, MPI_ERR_READ_ONLY,
1334 - "**ioneedrd", 0 );
1337 + MPIR_ERR_RECOVERABLE,
1339 + MPI_ERR_READ_ONLY,
1342 *error_code = MPIO_Err_create_code(MPI_SUCCESS,
1343 - MPIR_ERR_RECOVERABLE, myname,
1344 - __LINE__, MPI_ERR_IO, "**io",
1345 + MPIR_ERR_RECOVERABLE,
1347 + MPI_ERR_IO, "**io",
1348 "**io %s", strerror(errno));
1352 /* --END ERROR HANDLING-- */
1353 - else *error_code = MPI_SUCCESS;
1354 + *error_code = MPI_SUCCESS;
1357 + ADIOI_Free(value);
1359 diff -ruN ad_lustre_orig/ad_lustre_rwcontig.c ad_lustre/ad_lustre_rwcontig.c
1360 --- ad_lustre_orig/ad_lustre_rwcontig.c 2008-09-17 14:36:57.000000000 +0800
1361 +++ ad_lustre/ad_lustre_rwcontig.c 2008-09-17 18:52:01.000000000 +0800
1363 /* -*- Mode: C; c-basic-offset:4 ; -*- */
1365 - * Copyright (C) 1997 University of Chicago.
1367 + * Copyright (C) 1997 University of Chicago.
1368 * See COPYRIGHT notice in top-level directory.
1370 * Copyright (C) 2007 Oak Ridge National Laboratory
1372 + * Copyright (C) 2008 Sun Microsystems, Lustre group
1375 #define _XOPEN_SOURCE 600
1379 err = write(fd->fd_sys, buf, len);
1382 err = read(fd->fd_sys, buf, len);
1384 err = ADIOI_LUSTRE_Directio(fd, buf, len, offset, io_mode);
1385 diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
1386 --- ad_lustre_orig/ad_lustre_wrcoll.c 1970-01-01 08:00:00.000000000 +0800
1387 +++ ad_lustre/ad_lustre_wrcoll.c 2008-09-17 18:20:35.000000000 +0800
1389 +/* -*- Mode: C; c-basic-offset:4 ; -*- */
1391 + * Copyright (C) 1997 University of Chicago.
1392 + * See COPYRIGHT notice in top-level directory.
1394 + * Copyright (C) 2007 Oak Ridge National Laboratory
1396 + * Copyright (C) 2008 Sun Microsystems, Lustre group
1399 +#include "ad_lustre.h"
1400 +#include "adio_extern.h"
1402 +/* prototypes of functions used for collective writes only. */
1403 +static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf,
1404 + MPI_Datatype datatype, int nprocs,
1406 + ADIOI_Access *others_req,
1407 + ADIOI_Access *my_req,
1408 + ADIO_Offset *offset_list,
1410 + int contig_access_count,
1411 + int * striping_info,
1412 + int *buf_idx, int *error_code);
1413 +static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf,
1414 + ADIOI_Flatlist_node * flat_buf,
1416 + ADIO_Offset * offset_list,
1417 + int *len_list, int *send_size,
1418 + MPI_Request * requests,
1419 + int *sent_to_proc, int nprocs,
1420 + int myrank, int contig_access_count,
1421 + int * striping_info,
1422 + int *send_buf_idx,
1423 + int *curr_to_proc,
1424 + int *done_to_proc, int iter,
1425 + MPI_Aint buftype_extent);
1426 +static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
1428 + ADIOI_Flatlist_node * flat_buf,
1429 + ADIO_Offset * offset_list,
1430 + int *len_list, int *send_size,
1431 + int *recv_size, ADIO_Offset off,
1432 + int size, int *count,
1433 + int *start_pos, int *partial_recv,
1434 + int *sent_to_proc, int nprocs,
1435 + int myrank, int buftype_is_contig,
1436 + int contig_access_count,
1437 + int * striping_info,
1438 + ADIOI_Access * others_req,
1439 + int *send_buf_idx,
1440 + int *curr_to_proc,
1441 + int *done_to_proc, int *hole,
1442 + int iter, MPI_Aint buftype_extent,
1443 + int *buf_idx, int *error_code);
1444 +static void ADIOI_Heap_merge(ADIOI_Access * others_req, int *count,
1445 + ADIO_Offset * srt_off, int *srt_len,
1446 + int *start_pos, int nprocs, int nprocs_recv,
1447 + int total_elements);
1449 +void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count,
1450 + MPI_Datatype datatype,
1451 + int file_ptr_type, ADIO_Offset offset,
1452 + ADIO_Status * status, int *error_code)
1454 + ADIOI_Access *my_req;
1455 + /* array of nprocs access structures, one for each other process that has
1456 + this process's request */
1458 + ADIOI_Access *others_req;
1459 + /* array of nprocs access structures, one for each other process
1460 + whose request is written by this process. */
1462 + int i, filetype_is_contig, nprocs, nprocs_for_coll, myrank, do_collect = 0;
1463 + int contig_access_count = 0, buftype_is_contig;
1464 + int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs;
1465 + ADIO_Offset orig_fp, start_offset, end_offset, off, *offset_list = NULL;
1466 + int *buf_idx = NULL, *len_list = NULL, *striping_info = NULL;
1467 + int old_error, tmp_error;
1469 + MPI_Comm_size(fd->comm, &nprocs);
1470 + MPI_Comm_rank(fd->comm, &myrank);
1472 + nprocs_for_coll = fd->hints->cb_nodes;
1473 + orig_fp = fd->fp_ind;
1475 + /* I/O pattern identification if cb_write isn't disabled */
1476 + if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
1477 + /* For this process's request, calculate the list of offsets and
1478 + lengths in the file and determine the start and end offsets. */
1479 + ADIOI_LUSTRE_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
1480 + &offset_list, &len_list, &start_offset,
1481 + &end_offset, &contig_access_count);
1482 + /* Get striping information */
1483 + ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1, nprocs);
1484 + /* check if the access pattern can benefit from collective write */
1485 + do_collect = ADIOI_LUSTRE_Docollect(fd, contig_access_count,
1486 + len_list, nprocs);
1488 + ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
1490 + /* Decide if collective I/O should be done */
1491 + if ((!do_collect && fd->hints->cb_write == ADIOI_HINT_AUTO) ||
1492 + fd->hints->cb_write == ADIOI_HINT_DISABLE) {
1494 + int filerange_is_contig = 0;
1496 + /* use independent accesses */
1497 + if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
1498 + ADIOI_Free(offset_list);
1499 + ADIOI_Free(len_list);
1502 + fd->fp_ind = orig_fp;
1503 + ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
1504 + if (buftype_is_contig && filetype_is_contig) {
1505 + if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
1506 + off = fd->disp + (fd->etype_size) * offset;
1507 + ADIO_WriteContig(fd, buf, count, datatype,
1508 + ADIO_EXPLICIT_OFFSET,
1509 + off, status, error_code);
1511 + ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
1512 + 0, status, error_code);
1514 + ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type,
1515 + offset, status, error_code);
1520 + /* calculate what portions of the access requests of this process are
1521 + * located in which process
1523 + ADIOI_LUSTRE_Calc_my_req(fd, offset_list, len_list, contig_access_count,
1524 + striping_info, nprocs, &count_my_req_procs,
1525 + &count_my_req_per_proc, &my_req, &buf_idx);
1526 + /* calculate what process's requests will be written by this process */
1527 + ADIOI_LUSTRE_Calc_others_req(fd, count_my_req_procs,
1528 + count_my_req_per_proc,
1529 + my_req, nprocs, myrank,
1530 + start_offset, end_offset, striping_info,
1531 + &count_others_req_procs, &others_req);
1532 + ADIOI_Free(count_my_req_per_proc);
1534 + /* exchange data and write in sizes of no more than stripe_size. */
1535 + ADIOI_LUSTRE_Exch_and_write(fd, buf, datatype, nprocs, myrank,
1536 + others_req, my_req,
1537 + offset_list, len_list, contig_access_count,
1538 + striping_info, buf_idx, error_code);
1540 + old_error = *error_code;
1541 + if (*error_code != MPI_SUCCESS)
1542 + *error_code = MPI_ERR_IO;
1544 + /* optimization: if only one process performing i/o, we can perform
1545 + * a less-expensive Bcast */
1546 +#ifdef ADIOI_MPE_LOGGING
1547 + MPE_Log_event(ADIOI_MPE_postwrite_a, 0, NULL);
1549 + if (fd->hints->cb_nodes == 1)
1550 + MPI_Bcast(error_code, 1, MPI_INT,
1551 + fd->hints->ranklist[0], fd->comm);
1553 + tmp_error = *error_code;
1554 + MPI_Allreduce(&tmp_error, error_code, 1, MPI_INT,
1555 + MPI_MAX, fd->comm);
1557 +#ifdef ADIOI_MPE_LOGGING
1558 + MPE_Log_event(ADIOI_MPE_postwrite_b, 0, NULL);
1561 + if ((old_error != MPI_SUCCESS) && (old_error != MPI_ERR_IO))
1562 + *error_code = old_error;
1565 + if (!buftype_is_contig)
1566 + ADIOI_Delete_flattened(datatype);
1568 + /* free all memory allocated for collective I/O */
1569 + /* free others_req */
1570 + for (i = 0; i < nprocs; i++) {
1571 + if (others_req[i].count) {
1572 + ADIOI_Free(others_req[i].offsets);
1573 + ADIOI_Free(others_req[i].lens);
1574 + ADIOI_Free(others_req[i].mem_ptrs);
1577 + ADIOI_Free(others_req);
1578 + /* free my_req here */
1579 + for (i = 0; i < nprocs; i++) {
1580 + if (my_req[i].count) {
1581 + ADIOI_Free(my_req[i].offsets);
1582 + ADIOI_Free(my_req[i].lens);
1585 + ADIOI_Free(my_req);
1586 + ADIOI_Free(buf_idx);
1587 + ADIOI_Free(offset_list);
1588 + ADIOI_Free(len_list);
1589 + ADIOI_Free(striping_info);
1591 +#ifdef HAVE_STATUS_SET_BYTES
1593 + int bufsize, size;
1594 + /* Don't set status if it isn't needed */
1595 + MPI_Type_size(datatype, &size);
1596 + bufsize = size * count;
1597 + MPIR_Status_set_bytes(status, datatype, bufsize);
1599 + /* This is a temporary way of filling in status. The right way is to
1600 + * keep track of how much data was actually written during collective I/O.
1604 + fd->fp_sys_posn = -1; /* set it to null. */
1607 +static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf,
1608 + MPI_Datatype datatype, int nprocs,
1609 + int myrank, ADIOI_Access *others_req,
1610 + ADIOI_Access *my_req,
1611 + ADIO_Offset *offset_list,
1612 + int *len_list, int contig_access_count,
1613 + int *striping_info, int *buf_idx,
1616 + int hole, i, j, m, flag, ntimes = 1 , max_ntimes, buftype_is_contig;
1617 + ADIO_Offset st_loc = -1, end_loc = -1, min_st_loc, max_end_loc;
1618 + ADIO_Offset off, req_off, send_off, iter_st_off, *off_list;
1619 + ADIO_Offset max_size, step_size = 0;
1620 + int real_size, req_len, send_len;
1621 + int *recv_curr_offlen_ptr, *recv_count, *recv_size;
1622 + int *send_curr_offlen_ptr, *send_size;
1623 + int *partial_recv, *sent_to_proc, *recv_start_pos;
1624 + int *send_buf_idx, *curr_to_proc, *done_to_proc;
1625 + char *write_buf = NULL, *value;
1626 + MPI_Status status;
1627 + ADIOI_Flatlist_node *flat_buf = NULL;
1628 + MPI_Aint buftype_extent;
1629 + int stripe_size = striping_info[0], lflag, data_sieving = 0;
1630 + int stripe_count = striping_info[1], CO = striping_info[2];
1631 + /* IO step size in each communication */
1632 + static char myname[] = "ADIOI_EXCH_AND_WRITE";
1634 + *error_code = MPI_SUCCESS; /* changed below if error */
1636 + /* calculate the number of writes of stripe size
1637 + * to be done by each process and the max among all processes.
1638 + * That gives the no. of communication phases as well.
1641 + for (i = 0; i < nprocs; i++) {
1642 + if (others_req[i].count) {
1643 + st_loc = others_req[i].offsets[0];
1644 + end_loc = others_req[i].offsets[0];
1649 + for (i = 0; i < nprocs; i++) {
1650 + for (j = 0; j < others_req[i].count; j++) {
1651 + st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]);
1652 + end_loc = ADIOI_MAX(end_loc, (others_req[i].offsets[j] +
1653 + others_req[i].lens[j] - 1));
1656 + /* this process does no writing. */
1657 + if ((st_loc == -1) && (end_loc == -1))
1659 + MPI_Allreduce(&end_loc, &max_end_loc, 1, MPI_LONG_LONG_INT, MPI_MAX, fd->comm);
1660 + /* avoid min_st_loc being -1 */
1662 + st_loc = max_end_loc;
1663 + MPI_Allreduce(&st_loc, &min_st_loc, 1, MPI_LONG_LONG_INT, MPI_MIN, fd->comm);
1664 + /* align downward */
1665 + min_st_loc -= min_st_loc % (ADIO_Offset)stripe_size;
1666 + /* when nprocs < stripe_count, there will be trouble, because some client
1667 + * would access more than one OST in one whole communication.
1669 + step_size = (ADIO_Offset)ADIOI_MIN(nprocs, stripe_count * CO) * stripe_size;
1670 + max_ntimes = (int)((max_end_loc - min_st_loc) / step_size + 1);
1672 + write_buf = (char *) ADIOI_Malloc(stripe_size);
1674 + /* calculate the start offset for each iteration */
1675 + off_list = (ADIO_Offset *) ADIOI_Malloc(max_ntimes * sizeof(ADIO_Offset));
1676 + for (m = 0; m < max_ntimes; m ++)
1677 + off_list[m] = max_end_loc;
1678 + for (i = 0; i < nprocs; i++) {
1679 + for (j = 0; j < others_req[i].count; j ++) {
1680 + req_off = others_req[i].offsets[j];
1681 + //m = (req_off - min_st_loc) / (stripe_size * stripe_count * CO);
1682 + m = (int)((req_off - min_st_loc) / step_size);
1683 + off_list[m] = ADIOI_MIN(off_list[m], req_off);
1687 + recv_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
1688 + send_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
1689 + /* their use is explained below. calloc initializes to 0. */
1691 + recv_count = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1692 + /* to store count of how many off-len pairs per proc are satisfied
1693 + in an iteration. */
1695 + send_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1696 + /* total size of data to be sent to each proc. in an iteration.
1697 + Of size nprocs so that I can use MPI_Alltoall later. */
1699 + recv_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1700 + /* total size of data to be recd. from each proc. in an iteration. */
1702 + sent_to_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int));
1703 + /* amount of data sent to each proc so far. Used in
1704 + ADIOI_Fill_send_buffer. initialized to 0 here. */
1706 + send_buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1707 + curr_to_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1708 + done_to_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1709 + /* Above three are used in ADIOI_Fill_send_buffer */
1711 + recv_start_pos = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1712 + /* used to store the starting value of recv_curr_offlen_ptr[i] in
1715 + ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
1716 + if (!buftype_is_contig) {
1717 + ADIOI_Flatten_datatype(datatype);
1718 + flat_buf = ADIOI_Flatlist;
1719 + while (flat_buf->type != datatype)
1720 + flat_buf = flat_buf->next;
1722 + MPI_Type_extent(datatype, &buftype_extent);
1724 + iter_st_off = min_st_loc;
1726 + /* Although we have recognized the data according to OST index,
1727 + * a read-modify-write will be done if there is a hole between the data.
1728 + * For example: if blocksize=60, transfersize=30 and stripe_size=100,
1729 + * then process0 will collect data [0, 30] and [60, 90] then write. There
1730 + * is a hole [30, 60], which will cause a read-modify-write in [0, 90].
1731 + * It will degrade collective performance.
1732 + * So we disable data sieving by default unless the hint "ds_in_coll"
1733 + * is set to "enable".
1735 + /* check the hint for data sieving */
1736 + value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
1737 + MPI_Info_get(fd->info, "ds_in_coll", MPI_MAX_INFO_VAL, value, &lflag);
1738 + if (lflag && !strcmp(value, "enable"))
1740 + ADIOI_Free(value);
1742 + for (m = 0; m < max_ntimes; m++) {
1743 + /* go through all others_req and my_req to check which will be received
1744 + * and sent in this iteration.
1747 + /* Note that MPI guarantees that displacements in filetypes are in
1748 + monotonically nondecreasing order and that, for writes, the
1749 + filetypes cannot specify overlapping regions in the file. This
1750 + simplifies implementation a bit compared to reads. */
1753 + off = start offset in the file for the data to be written in
1755 + iter_st_off = start offset of this iteration
1756 + real_size = size of data written (bytes) corresponding to off
1757 + max_size = possible maximum size of data written in this iteration
1758 + req_off = offset in the file for a particular contiguous request minus
1759 + what was satisfied in previous iteration
1760 + send_off = offset the request needed by other processes in this iteration
1761 + req_len = size corresponding to req_off
1762 + send_len = size corresponding to send_off
1765 + /* first calculate what should be communicated */
1766 + for (i = 0; i < nprocs; i++)
1767 + recv_count[i] = recv_size[i] = send_size[i] = 0;
1769 + off = off_list[m];
1770 + max_size = ADIOI_MIN(step_size, max_end_loc - iter_st_off + 1);
1771 + real_size = (int) ADIOI_MIN((off / stripe_size + 1) * stripe_size - off,
1772 + end_loc - off + 1);
1774 + for (i = 0; i < nprocs; i++) {
1775 + if (my_req[i].count) {
1776 + for (j = send_curr_offlen_ptr[i]; j < my_req[i].count; j++) {
1777 + send_off = my_req[i].offsets[j];
1778 + send_len = my_req[i].lens[j];
1779 + if (send_off < iter_st_off + max_size) {
1780 + send_size[i] += send_len;
1785 + send_curr_offlen_ptr[i] = j;
1787 + if (others_req[i].count) {
1788 + recv_start_pos[i] = recv_curr_offlen_ptr[i];
1789 + for (j = recv_curr_offlen_ptr[i]; j < others_req[i].count; j++) {
1790 + req_off = others_req[i].offsets[j];
1791 + req_len = others_req[i].lens[j];
1792 + if (req_off < iter_st_off + max_size) {
1794 + MPI_Address(write_buf + req_off - off,
1795 + &(others_req[i].mem_ptrs[j]));
1796 + recv_size[i] += req_len;
1801 + recv_curr_offlen_ptr[i] = j;
1804 + /* use hole to pass data_sieving flag into W_Exchange_data */
1805 + hole = data_sieving;
1806 + ADIOI_LUSTRE_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
1807 + len_list, send_size, recv_size, off, real_size,
1808 + recv_count, recv_start_pos, partial_recv,
1809 + sent_to_proc, nprocs, myrank,
1810 + buftype_is_contig, contig_access_count,
1811 + striping_info, others_req, send_buf_idx,
1812 + curr_to_proc, done_to_proc, &hole, m,
1813 + buftype_extent, buf_idx, error_code);
1814 + if (*error_code != MPI_SUCCESS)
1818 + for (i = 0; i < nprocs; i++)
1819 + if (recv_count[i]) {
1824 + /* check whether to do data sieving */
1825 + if(data_sieving) {
1826 + ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
1827 + ADIO_EXPLICIT_OFFSET, off, &status,
1830 + /* if there is no hole, write everything in a single call;
1831 + * otherwise, write each piece of data separately */
1833 + ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
1834 + ADIO_EXPLICIT_OFFSET, off, &status,
1837 + for (i = 0; i < nprocs; i++) {
1838 + if (others_req[i].count) {
1839 + for (j = 0; j < others_req[i].count; j++) {
1840 + if (others_req[i].offsets[j] < off + real_size &&
1841 + others_req[i].offsets[j] >= off) {
1842 + ADIO_WriteContig(fd,
1843 + write_buf + others_req[i].offsets[j] - off,
1844 + others_req[i].lens[j],
1845 + MPI_BYTE, ADIO_EXPLICIT_OFFSET,
1846 + others_req[i].offsets[j], &status,
1854 + if (*error_code != MPI_SUCCESS)
1858 + iter_st_off += max_size;
1860 + if (*error_code != MPI_SUCCESS)
1864 + ADIOI_Free(write_buf);
1865 + ADIOI_Free(recv_curr_offlen_ptr);
1866 + ADIOI_Free(send_curr_offlen_ptr);
1867 + ADIOI_Free(recv_count);
1868 + ADIOI_Free(send_size);
1869 + ADIOI_Free(recv_size);
1870 + ADIOI_Free(sent_to_proc);
1871 + ADIOI_Free(recv_start_pos);
1872 + ADIOI_Free(send_buf_idx);
1873 + ADIOI_Free(curr_to_proc);
1874 + ADIOI_Free(done_to_proc);
1875 + ADIOI_Free(off_list);
1878 +static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
1880 + ADIOI_Flatlist_node * flat_buf,
1881 + ADIO_Offset * offset_list,
1882 + int *len_list, int *send_size,
1883 + int *recv_size, ADIO_Offset off,
1884 + int size, int *count,
1885 + int *start_pos, int *partial_recv,
1886 + int *sent_to_proc, int nprocs,
1887 + int myrank, int buftype_is_contig,
1888 + int contig_access_count,
1889 + int * striping_info,
1890 + ADIOI_Access * others_req,
1891 + int *send_buf_idx,
1892 + int *curr_to_proc, int *done_to_proc,
1893 + int *hole, int iter,
1894 + MPI_Aint buftype_extent,
1895 + int *buf_idx, int *error_code)
1897 + int i, j, *tmp_len, nprocs_recv, nprocs_send, err;
1898 + char **send_buf = NULL;
1899 + MPI_Request *requests, *send_req;
1900 + MPI_Datatype *recv_types;
1901 + MPI_Status *statuses, status;
1902 + int *srt_len, sum, sum_recv;
1903 + ADIO_Offset *srt_off;
1904 + int data_sieving = *hole;
1905 + static char myname[] = "ADIOI_W_EXCHANGE_DATA";
1907 + /* create derived datatypes for recv */
1909 + for (i = 0; i < nprocs; i++)
1913 + recv_types = (MPI_Datatype *) ADIOI_Malloc((nprocs_recv + 1) *
1914 + sizeof(MPI_Datatype));
1915 + /* +1 to avoid a 0-size malloc */
1917 + tmp_len = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1919 + for (i = 0; i < nprocs; i++) {
1920 + if (recv_size[i]) {
1921 + MPI_Type_hindexed(count[i],
1922 + &(others_req[i].lens[start_pos[i]]),
1923 + &(others_req[i].mem_ptrs[start_pos[i]]),
1924 + MPI_BYTE, recv_types + j);
1925 + /* absolute displacements; use MPI_BOTTOM in recv */
1926 + MPI_Type_commit(recv_types + j);
1931 + /* To avoid a read-modify-write,
1932 + * check if there are holes in the data to be written.
1933 + * For this, merge the (sorted) offset lists others_req using a heap-merge.
1937 + for (i = 0; i < nprocs; i++)
1939 + srt_off = (ADIO_Offset *) ADIOI_Malloc((sum + 1) * sizeof(ADIO_Offset));
1940 + srt_len = (int *) ADIOI_Malloc((sum + 1) * sizeof(int));
1941 + /* +1 to avoid a 0-size malloc */
1943 + ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos,
1944 + nprocs, nprocs_recv, sum);
1946 + ADIOI_Free(tmp_len);
1948 + /* check if there are any holes */
1950 + for (i = 0; i < sum - 1; i++) {
1951 + if (srt_off[i] + srt_len[i] < srt_off[i + 1]) {
1956 + /* In some cases (see John Bent ROMIO REQ # 835), an odd interaction
1957 + * between aggregation, nominally contiguous regions, and cb_buffer_size
1958 + * should be handled with a read-modify-write (otherwise we will write out
1959 + * more data than we receive from everyone else (inclusive), so override
1964 + for (i = 0; i < nprocs; i++)
1965 + sum_recv += recv_size[i];
1966 + if (size > sum_recv)
1969 + /* check the hint for data sieving */
1970 + if (data_sieving && nprocs_recv && *hole) {
1971 + ADIO_ReadContig(fd, write_buf, size, MPI_BYTE,
1972 + ADIO_EXPLICIT_OFFSET, off, &status, &err);
1973 + // --BEGIN ERROR HANDLING--
1974 + if (err != MPI_SUCCESS) {
1975 + *error_code = MPIO_Err_create_code(err,
1976 + MPIR_ERR_RECOVERABLE,
1979 + "**ioRMWrdwr", 0);
1982 + // --END ERROR HANDLING--
1984 + ADIOI_Free(srt_off);
1985 + ADIOI_Free(srt_len);
1988 + for (i = 0; i < nprocs; i++)
1992 + if (fd->atomicity) {
1993 + /* bug fix from Wei-keng Liao and Kenin Coloma */
1994 + requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + 1) *
1995 + sizeof(MPI_Request));
1996 + send_req = requests;
1998 + requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1)*
1999 + sizeof(MPI_Request));
2000 + /* +1 to avoid a 0-size malloc */
2002 + /* post receives */
2004 + for (i = 0; i < nprocs; i++) {
2005 + if (recv_size[i]) {
2006 + MPI_Irecv(MPI_BOTTOM, 1, recv_types[j], i,
2007 + myrank + i + 100 * iter, fd->comm, requests + j);
2011 + send_req = requests + nprocs_recv;
2015 + * if buftype_is_contig, data can be directly sent from
2016 + * user buf at location given by buf_idx. else use send_buf.
2018 + if (buftype_is_contig) {
2020 + for (i = 0; i < nprocs; i++)
2021 + if (send_size[i]) {
2022 + MPI_Isend(((char *) buf) + buf_idx[i], send_size[i],
2023 + MPI_BYTE, i, myrank + i + 100 * iter, fd->comm,
2026 + buf_idx[i] += send_size[i];
2028 + } else if (nprocs_send) {
2029 + /* buftype is not contig */
2030 + send_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char *));
2031 + for (i = 0; i < nprocs; i++)
2033 + send_buf[i] = (char *) ADIOI_Malloc(send_size[i]);
2035 + ADIOI_LUSTRE_Fill_send_buffer(fd, buf, flat_buf, send_buf, offset_list,
2036 + len_list, send_size, send_req,
2037 + sent_to_proc, nprocs, myrank,
2038 + contig_access_count, striping_info,
2039 + send_buf_idx, curr_to_proc, done_to_proc,
2040 + iter, buftype_extent);
2041 + /* the send is done in ADIOI_Fill_send_buffer */
2044 + /* bug fix from Wei-keng Liao and Kenin Coloma */
2045 + if (fd->atomicity) {
2047 + for (i = 0; i < nprocs; i++) {
2048 + MPI_Status wkl_status;
2049 + if (recv_size[i]) {
2050 + MPI_Recv(MPI_BOTTOM, 1, recv_types[j], i,
2051 + myrank + i + 100 * iter, fd->comm, &wkl_status);
2057 + for (i = 0; i < nprocs_recv; i++)
2058 + MPI_Type_free(recv_types + i);
2059 + ADIOI_Free(recv_types);
2061 + /* bug fix from Wei-keng Liao and Kenin Coloma */
2062 + /* +1 to avoid a 0-size malloc */
2063 + if (fd->atomicity) {
2064 + statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + 1) *
2065 + sizeof(MPI_Status));
2067 + statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1) *
2068 + sizeof(MPI_Status));
2071 +#ifdef NEEDS_MPI_TEST
2073 + if (fd->atomicity) {
2074 + /* bug fix from Wei-keng Liao and Kenin Coloma */
2076 + MPI_Testall(nprocs_send, send_req, &i, statuses);
2079 + MPI_Testall(nprocs_send + nprocs_recv, requests, &i, statuses);
2082 + /* bug fix from Wei-keng Liao and Kenin Coloma */
2083 + if (fd->atomicity)
2084 + MPI_Waitall(nprocs_send, send_req, statuses);
2086 + MPI_Waitall(nprocs_send + nprocs_recv, requests, statuses);
2088 + ADIOI_Free(statuses);
2089 + ADIOI_Free(requests);
2090 + if (!buftype_is_contig && nprocs_send) {
2091 + for (i = 0; i < nprocs; i++)
2093 + ADIOI_Free(send_buf[i]);
2094 + ADIOI_Free(send_buf);
2098 +#define ADIOI_BUF_INCR \
2100 + while (buf_incr) { \
2101 + size_in_buf = ADIOI_MIN(buf_incr, flat_buf_sz); \
2102 + user_buf_idx += size_in_buf; \
2103 + flat_buf_sz -= size_in_buf; \
2104 + if (!flat_buf_sz) { \
2105 + if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \
2107 + flat_buf_idx = 0; \
2110 + user_buf_idx = flat_buf->indices[flat_buf_idx] + \
2111 + n_buftypes*buftype_extent; \
2112 + flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
2114 + buf_incr -= size_in_buf; \
2119 +#define ADIOI_BUF_COPY \
2122 + size_in_buf = ADIOI_MIN(size, flat_buf_sz); \
2123 + memcpy(&(send_buf[p][send_buf_idx[p]]), \
2124 + ((char *) buf) + user_buf_idx, size_in_buf); \
2125 + send_buf_idx[p] += size_in_buf; \
2126 + user_buf_idx += size_in_buf; \
2127 + flat_buf_sz -= size_in_buf; \
2128 + if (!flat_buf_sz) { \
2129 + if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \
2131 + flat_buf_idx = 0; \
2134 + user_buf_idx = flat_buf->indices[flat_buf_idx] + \
2135 + n_buftypes*buftype_extent; \
2136 + flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
2138 + size -= size_in_buf; \
2139 + buf_incr -= size_in_buf; \
2144 +static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf,
2145 + ADIOI_Flatlist_node * flat_buf,
2147 + ADIO_Offset * offset_list,
2148 + int *len_list, int *send_size,
2149 + MPI_Request * requests,
2150 + int *sent_to_proc, int nprocs,
2152 + int contig_access_count,
2153 + int * striping_info,
2154 + int *send_buf_idx,
2155 + int *curr_to_proc,
2156 + int *done_to_proc, int iter,
2157 + MPI_Aint buftype_extent)
2159 + /* this function is only called if buftype is not contig */
2160 + int i, p, flat_buf_idx, size;
2161 + int flat_buf_sz, buf_incr, size_in_buf, jj, n_buftypes;
2162 + ADIO_Offset off, len, rem_len, user_buf_idx;
2164 + /* curr_to_proc[p] = amount of data sent to proc. p that has already
2165 + * been accounted for so far
2166 + * done_to_proc[p] = amount of data already sent to proc. p in
2167 + * previous iterations
2168 + * user_buf_idx = current location in user buffer
2169 + * send_buf_idx[p] = current location in send_buf of proc. p
2172 + for (i = 0; i < nprocs; i++) {
2173 + send_buf_idx[i] = curr_to_proc[i] = 0;
2174 + done_to_proc[i] = sent_to_proc[i];
2178 + user_buf_idx = flat_buf->indices[0];
2181 + flat_buf_sz = flat_buf->blocklens[0];
2183 + /* flat_buf_idx = current index into flattened buftype
2184 + * flat_buf_sz = size of current contiguous component in flattened buf
2186 + for (i = 0; i < contig_access_count; i++) {
2187 + off = offset_list[i];
2188 + rem_len = (ADIO_Offset) len_list[i];
2190 + /* this request may span more than one process */
2191 + while (rem_len != 0) {
2193 + /* NOTE: len value is modified by ADIOI_LUSTRE_Calc_aggregator() to be no
2194 + * longer than the single region that processor "p" is responsible
2197 + p = ADIOI_LUSTRE_Calc_aggregator(fd, off, &len, nprocs, striping_info);
2199 + if (send_buf_idx[p] < send_size[p]) {
2200 + if (curr_to_proc[p] + len > done_to_proc[p]) {
2201 + if (done_to_proc[p] > curr_to_proc[p]) {
2202 + size = (int) ADIOI_MIN(curr_to_proc[p] + len -
2206 + buf_incr = done_to_proc[p] - curr_to_proc[p];
2208 + buf_incr = (int) (curr_to_proc[p] + len -
2210 + curr_to_proc[p] = done_to_proc[p] + size;
2213 + size = (int) ADIOI_MIN(len, send_size[p] -
2215 + buf_incr = (int) len;
2216 + curr_to_proc[p] += size;
2219 + if (send_buf_idx[p] == send_size[p]) {
2220 + MPI_Isend(send_buf[p], send_size[p], MPI_BYTE, p,
2221 + myrank + p + 100 * iter, fd->comm,
2226 + curr_to_proc[p] += (int) len;
2227 + buf_incr = (int) len;
2231 + buf_incr = (int) len;
2238 + for (i = 0; i < nprocs; i++)
2240 + sent_to_proc[i] = curr_to_proc[i];
2243 +static void ADIOI_Heap_merge(ADIOI_Access * others_req, int *count,
2244 + ADIO_Offset * srt_off, int *srt_len,
2245 + int *start_pos, int nprocs, int nprocs_recv,
2246 + int total_elements)
2249 + ADIO_Offset *off_list;
2254 + heap_struct *a, tmp;
2255 + int i, j, heapsize, l, r, k, smallest;
2257 + a = (heap_struct *) ADIOI_Malloc((nprocs_recv + 1) *
2258 + sizeof(heap_struct));
2261 + for (i = 0; i < nprocs; i++)
2263 + a[j].off_list = &(others_req[i].offsets[start_pos[i]]);
2264 + a[j].len_list = &(others_req[i].lens[start_pos[i]]);
2265 + a[j].nelem = count[i];
2269 + /* build a heap out of the first element from each list, with
2270 + the smallest element of the heap at the root */
2272 + heapsize = nprocs_recv;
2273 + for (i = heapsize / 2 - 1; i >= 0; i--) {
2274 + /* Heapify(a, i, heapsize); Algorithm from Cormen et al. pg. 143
2275 + modified for a heap with smallest element at root. I have
2276 + removed the recursion so that there are no function calls.
2277 + Function calls are too expensive. */
2280 + l = 2 * (k + 1) - 1;
2283 + if ((l < heapsize) && (*(a[l].off_list) < *(a[k].off_list)))
2288 + if ((r < heapsize) &&
2289 + (*(a[r].off_list) < *(a[smallest].off_list)))
2292 + if (smallest != k) {
2293 + tmp.off_list = a[k].off_list;
2294 + tmp.len_list = a[k].len_list;
2295 + tmp.nelem = a[k].nelem;
2297 + a[k].off_list = a[smallest].off_list;
2298 + a[k].len_list = a[smallest].len_list;
2299 + a[k].nelem = a[smallest].nelem;
2301 + a[smallest].off_list = tmp.off_list;
2302 + a[smallest].len_list = tmp.len_list;
2303 + a[smallest].nelem = tmp.nelem;
2311 + for (i = 0; i < total_elements; i++) {
2312 + /* extract smallest element from heap, i.e. the root */
2313 + srt_off[i] = *(a[0].off_list);
2314 + srt_len[i] = *(a[0].len_list);
2317 + if (!a[0].nelem) {
2318 + a[0].off_list = a[heapsize - 1].off_list;
2319 + a[0].len_list = a[heapsize - 1].len_list;
2320 + a[0].nelem = a[heapsize - 1].nelem;
2323 + (a[0].off_list)++;
2324 + (a[0].len_list)++;
2327 + /* Heapify(a, 0, heapsize); */
2330 + l = 2 * (k + 1) - 1;
2333 + if ((l < heapsize) && (*(a[l].off_list) < *(a[k].off_list)))
2338 + if ((r < heapsize) &&
2339 + (*(a[r].off_list) < *(a[smallest].off_list)))
2342 + if (smallest != k) {
2343 + tmp.off_list = a[k].off_list;
2344 + tmp.len_list = a[k].len_list;
2345 + tmp.nelem = a[k].nelem;
2347 + a[k].off_list = a[smallest].off_list;
2348 + a[k].len_list = a[smallest].len_list;
2349 + a[k].nelem = a[smallest].nelem;
2351 + a[smallest].off_list = tmp.off_list;
2352 + a[smallest].len_list = tmp.len_list;
2353 + a[smallest].nelem = tmp.nelem;
2362 diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c
2363 --- ad_lustre_orig/ad_lustre_wrstr.c 1970-01-01 08:00:00.000000000 +0800
2364 +++ ad_lustre/ad_lustre_wrstr.c 2008-09-17 18:20:35.000000000 +0800
2366 +/* -*- Mode: C; c-basic-offset:4 ; -*- */
2368 + * Copyright (C) 1997 University of Chicago.
2369 + * See COPYRIGHT notice in top-level directory.
2371 + * Copyright (C) 2007 Oak Ridge National Laboratory
2373 + * Copyright (C) 2008 Sun Microsystems, Lustre group
2376 +#include "ad_lustre.h"
2377 +#include "adio_extern.h"
2379 +#define ADIOI_BUFFERED_WRITE \
2381 + if (req_off >= writebuf_off + writebuf_len) { \
2382 + if (writebuf_len) { \
2383 + ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
2384 + ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \
2385 + if (!(fd->atomicity)) \
2386 + ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
2387 + if (*error_code != MPI_SUCCESS) { \
2388 + *error_code = MPIO_Err_create_code(*error_code, \
2389 + MPIR_ERR_RECOVERABLE, myname, \
2390 + __LINE__, MPI_ERR_IO, \
2395 + writebuf_off = req_off; \
2396 + /* stripe_size alignment */ \
2397 + writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \
2398 + (writebuf_off / stripe_size + 1) * \
2399 + stripe_size - writebuf_off);\
2400 + if (!(fd->atomicity)) \
2401 + ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
2402 + ADIO_ReadContig(fd, writebuf, writebuf_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,\
2403 + writebuf_off, &status1, error_code); \
2404 + if (*error_code != MPI_SUCCESS) { \
2405 + *error_code = MPIO_Err_create_code(*error_code, \
2406 + MPIR_ERR_RECOVERABLE, myname, \
2407 + __LINE__, MPI_ERR_IO, \
2412 + write_sz = (int) ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off); \
2413 + memcpy(writebuf + req_off - writebuf_off, (char *)buf + userbuf_off, write_sz);\
2414 + while (write_sz != req_len) {\
2415 + ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
2416 + ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \
2417 + if (!(fd->atomicity)) \
2418 + ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
2419 + if (*error_code != MPI_SUCCESS) { \
2420 + *error_code = MPIO_Err_create_code(*error_code, \
2421 + MPIR_ERR_RECOVERABLE, myname, \
2422 + __LINE__, MPI_ERR_IO, \
2426 + req_len -= write_sz; \
2427 + userbuf_off += write_sz; \
2428 + writebuf_off += writebuf_len; \
2429 + /* stripe_size alignment */ \
2430 + writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \
2431 + (writebuf_off / stripe_size + 1) * \
2432 + stripe_size - writebuf_off);\
2433 + if (!(fd->atomicity)) \
2434 + ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
2435 + ADIO_ReadContig(fd, writebuf, writebuf_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,\
2436 + writebuf_off, &status1, error_code); \
2437 + if (*error_code != MPI_SUCCESS) { \
2438 + *error_code = MPIO_Err_create_code(*error_code, \
2439 + MPIR_ERR_RECOVERABLE, myname, \
2440 + __LINE__, MPI_ERR_IO, \
2444 + write_sz = ADIOI_MIN(req_len, writebuf_len); \
2445 + memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
2450 +/* This macro is used when filetype is contig and buftype is not contig.
2451 + It does not do a read-modify-write and does not lock. */
2452 +#define ADIOI_BUFFERED_WRITE_WITHOUT_READ \
2454 + if (req_off >= writebuf_off + writebuf_len) { \
2455 + writebuf_off = req_off; \
2456 + /* stripe_size alignment */ \
2457 + writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \
2458 + (writebuf_off / stripe_size + 1) * \
2459 + stripe_size - writebuf_off);\
2461 + write_sz = (int) ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off); \
2462 + memcpy(writebuf + req_off - writebuf_off, (char *)buf + userbuf_off, write_sz);\
2463 + while (req_len) { \
2464 + ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
2465 + ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \
2466 + if (*error_code != MPI_SUCCESS) { \
2467 + *error_code = MPIO_Err_create_code(*error_code, \
2468 + MPIR_ERR_RECOVERABLE, myname, \
2469 + __LINE__, MPI_ERR_IO, \
2473 + req_len -= write_sz; \
2474 + userbuf_off += write_sz; \
2475 + writebuf_off += writebuf_len; \
2476 + /* stripe_size alignment */ \
2477 + writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \
2478 + (writebuf_off / stripe_size + 1) * \
2479 + stripe_size - writebuf_off);\
2480 + write_sz = ADIOI_MIN(req_len, writebuf_len); \
2481 + memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
2485 +void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count,
2486 + MPI_Datatype datatype, int file_ptr_type,
2487 + ADIO_Offset offset, ADIO_Status * status,
2490 + /* offset is in units of etype relative to the filetype. */
2491 + ADIOI_Flatlist_node *flat_buf, *flat_file;
2492 + int i, j, k, bwr_size, fwr_size = 0, st_index = 0;
2493 + int bufsize, num, size, sum, n_etypes_in_filetype, size_in_filetype;
2494 + int n_filetypes, etype_in_filetype;
2495 + ADIO_Offset abs_off_in_filetype = 0;
2496 + int filetype_size, etype_size, buftype_size, req_len;
2497 + MPI_Aint filetype_extent, buftype_extent;
2498 + int buf_count, buftype_is_contig, filetype_is_contig;
2499 + ADIO_Offset userbuf_off;
2500 + ADIO_Offset off, req_off, disp, end_offset = 0, writebuf_off, start_off;
2502 + int flag, st_fwr_size, st_n_filetypes, writebuf_len, write_sz;
2503 + ADIO_Status status1;
2504 + int new_bwr_size, new_fwr_size;
2506 + int stripe_size, lflag = 0;
2507 + static char myname[] = "ADIOI_LUSTRE_WriteStrided";
2509 + MPI_Comm_rank(fd->comm, &myrank);
2511 + if (fd->hints->ds_write == ADIOI_HINT_DISABLE) {
2512 + /* if user has disabled data sieving on writes, use naive
2513 + * approach instead.
2515 + ADIOI_GEN_WriteStrided_naive(fd,
2520 + offset, status, error_code);
2524 + *error_code = MPI_SUCCESS; /* changed below if error */
2526 + ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
2527 + ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
2529 + MPI_Type_size(fd->filetype, &filetype_size);
2530 + if (!filetype_size) {
2531 + *error_code = MPI_SUCCESS;
2535 + MPI_Type_extent(fd->filetype, &filetype_extent);
2536 + MPI_Type_size(datatype, &buftype_size);
2537 + MPI_Type_extent(datatype, &buftype_extent);
2538 + etype_size = fd->etype_size;
2540 + bufsize = buftype_size * count;
2542 + /* get striping info */
2543 + value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
2544 + MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, &lflag);
2546 + stripe_size = atoi(value);
2547 + ADIOI_Free(value);
2549 + /* Different buftype to different filetype */
2550 + if (!buftype_is_contig && filetype_is_contig) {
2551 + /* noncontiguous in memory, contiguous in file. */
2552 + ADIOI_Flatten_datatype(datatype);
2553 + flat_buf = ADIOI_Flatlist;
2554 + while (flat_buf->type != datatype)
2555 + flat_buf = flat_buf->next;
2557 + off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
2558 + fd->disp + etype_size * offset;
2561 + end_offset = start_off + bufsize - 1;
2562 + writebuf_off = start_off;
2563 + /* write stripe size buffer each time */
2564 + writebuf = (char *) ADIOI_Malloc(ADIOI_MIN(bufsize, stripe_size));
2565 + writebuf_len = (int) ADIOI_MIN(bufsize,
2566 + (writebuf_off / stripe_size + 1) *
2567 + stripe_size - writebuf_off);
2569 + /* if atomicity is true, lock the region to be accessed */
2570 + if (fd->atomicity)
2571 + ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, bufsize);
2573 + for (j = 0; j < count; j++) {
2574 + for (i = 0; i < flat_buf->count; i++) {
2575 + userbuf_off = j * buftype_extent + flat_buf->indices[i];
2577 + req_len = flat_buf->blocklens[i];
2578 + ADIOI_BUFFERED_WRITE_WITHOUT_READ
2579 + off += flat_buf->blocklens[i];
2583 + /* write the buffer out finally */
2584 + ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE,
2585 + ADIO_EXPLICIT_OFFSET, writebuf_off, &status1,
2588 + if (fd->atomicity)
2589 + ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize);
2590 + if (*error_code != MPI_SUCCESS)
2592 + ADIOI_Free(writebuf);
2593 + if (file_ptr_type == ADIO_INDIVIDUAL)
2596 + /* noncontiguous in file */
2597 + /* filetype already flattened in ADIO_Open */
2598 + flat_file = ADIOI_Flatlist;
2599 + while (flat_file->type != fd->filetype)
2600 + flat_file = flat_file->next;
2603 + if (file_ptr_type == ADIO_INDIVIDUAL) {
2604 + offset = fd->fp_ind; /* in bytes */
2609 + for (i = 0; i < flat_file->count; i++) {
2610 + if (disp + flat_file->indices[i] +
2611 + (ADIO_Offset) n_filetypes * filetype_extent +
2612 + flat_file->blocklens[i] >= offset) {
2614 + fwr_size = (int) (disp + flat_file->indices[i] +
2615 + (ADIO_Offset) n_filetypes *
2617 + flat_file->blocklens[i] -
2625 + n_etypes_in_filetype = filetype_size / etype_size;
2626 + n_filetypes = (int) (offset / n_etypes_in_filetype);
2627 + etype_in_filetype = (int) (offset % n_etypes_in_filetype);
2628 + size_in_filetype = etype_in_filetype * etype_size;
2631 + for (i = 0; i < flat_file->count; i++) {
2632 + sum += flat_file->blocklens[i];
2633 + if (sum > size_in_filetype) {
2635 + fwr_size = sum - size_in_filetype;
2636 + abs_off_in_filetype = flat_file->indices[i] +
2637 + size_in_filetype - (sum - flat_file->blocklens[i]);
2642 + /* abs. offset in bytes in the file */
2643 + offset = disp + (ADIO_Offset) n_filetypes *filetype_extent +
2644 + abs_off_in_filetype;
2647 + start_off = offset;
2649 + /* If the file bytes are actually contiguous, we do not need data sieving at all */
2650 + if (bufsize <= fwr_size) {
2651 + req_off = start_off;
2652 + req_len = bufsize;
2653 + end_offset = start_off + bufsize - 1;
2654 + writebuf = (char *) ADIOI_Malloc(ADIOI_MIN(bufsize, stripe_size));
2655 + memset(writebuf, -1, ADIOI_MIN(bufsize, stripe_size));
2659 + ADIOI_BUFFERED_WRITE_WITHOUT_READ
2661 + /* Calculate end_offset, the last byte-offset that will be accessed.
2662 + e.g., if start_offset=0 and 100 bytes are to be written, end_offset=99 */
2663 + st_fwr_size = fwr_size;
2664 + st_n_filetypes = n_filetypes;
2668 + fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
2669 + while (i < bufsize) {
2671 + end_offset = off + fwr_size - 1;
2673 + if (j < (flat_file->count - 1))
2680 + off = disp + flat_file->indices[j] +
2681 + (ADIO_Offset) n_filetypes * filetype_extent;
2682 + fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize - i);
2687 + writebuf = (char *) ADIOI_Malloc(stripe_size);
2688 + memset(writebuf, -1, stripe_size);
2689 + /* if atomicity is true, lock the region to be accessed */
2690 + if (fd->atomicity)
2691 + ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, bufsize);
2693 + if (buftype_is_contig && !filetype_is_contig) {
2694 + /* contiguous in memory, noncontiguous in file. should be the most
2699 + n_filetypes = st_n_filetypes;
2700 + fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
2701 + while (i < bufsize) {
2703 + /* TYPE_UB and TYPE_LB can result in
2704 + fwr_size = 0. save system call in such cases */
2706 + lseek(fd->fd_sys, off, SEEK_SET);
2707 + err = write(fd->fd_sys, ((char *) buf) + i, fwr_size);
2710 + req_len = fwr_size;
2712 + ADIOI_BUFFERED_WRITE
2716 + if (off + fwr_size < disp + flat_file->indices[j] +
2717 + flat_file->blocklens[j] +
2718 + (ADIO_Offset) n_filetypes * filetype_extent)
2720 + /* did not reach end of contiguous block in filetype.
2721 + no more I/O needed. off is incremented by fwr_size. */
2723 + if (j < (flat_file->count - 1))
2729 + off = disp + flat_file->indices[j] +
2730 + (ADIO_Offset) n_filetypes * filetype_extent;
2731 + fwr_size = ADIOI_MIN(flat_file->blocklens[j],
2736 + /* noncontiguous in memory as well as in file */
2737 + ADIOI_Flatten_datatype(datatype);
2738 + flat_buf = ADIOI_Flatlist;
2739 + while (flat_buf->type != datatype)
2740 + flat_buf = flat_buf->next;
2742 + k = num = buf_count = 0;
2743 + i = (int) (flat_buf->indices[0]);
2746 + n_filetypes = st_n_filetypes;
2747 + fwr_size = st_fwr_size;
2748 + bwr_size = flat_buf->blocklens[0];
2750 + while (num < bufsize) {
2751 + size = ADIOI_MIN(fwr_size, bwr_size);
2754 + lseek(fd->fd_sys, off, SEEK_SET);
2755 + err = write(fd->fd_sys, ((char *) buf) + i, size);
2760 + ADIOI_BUFFERED_WRITE
2763 + new_fwr_size = fwr_size;
2764 + new_bwr_size = bwr_size;
2766 + if (size == fwr_size) {
2767 + /* reached end of contiguous block in file */
2768 + if (j < (flat_file->count - 1)) {
2774 + off = disp + flat_file->indices[j] +
2775 + (ADIO_Offset) n_filetypes * filetype_extent;
2776 + new_fwr_size = flat_file->blocklens[j];
2777 + if (size != bwr_size) {
2779 + new_bwr_size -= size;
2782 + if (size == bwr_size) {
2783 + /* reached end of contiguous block in memory */
2784 + k = (k + 1) % flat_buf->count;
2786 + i = (int) (buftype_extent *
2787 + (buf_count / flat_buf->count) +
2788 + flat_buf->indices[k]);
2789 + new_bwr_size = flat_buf->blocklens[k];
2790 + if (size != fwr_size) {
2792 + new_fwr_size -= size;
2796 + fwr_size = new_fwr_size;
2797 + bwr_size = new_bwr_size;
2801 + /* write the buffer out finally */
2802 + if (writebuf_len) {
2803 + ADIO_WriteContig(fd, writebuf, writebuf_len,
2804 + MPI_BYTE, ADIO_EXPLICIT_OFFSET,
2805 + writebuf_off, &status1, error_code);
2806 + if (!(fd->atomicity))
2807 + ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
2808 + if (*error_code != MPI_SUCCESS)
2811 + if (fd->atomicity)
2812 + ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize);
2814 + ADIOI_Free(writebuf);
2815 + if (file_ptr_type == ADIO_INDIVIDUAL)
2818 + fd->fp_sys_posn = -1; /* set it to null. */
2820 +#ifdef HAVE_STATUS_SET_BYTES
2821 + MPIR_Status_set_bytes(status, datatype, bufsize);
2822 + /* This is a temporary way of filling in status. The right way is to
2823 + keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
2826 + if (!buftype_is_contig)
2827 + ADIOI_Delete_flattened(datatype);
2829 diff -ruN ad_lustre_orig/Makefile ad_lustre/Makefile
2830 --- ad_lustre_orig/Makefile 1970-01-01 08:00:00.000000000 +0800
2831 +++ ad_lustre/Makefile 2008-09-17 18:20:35.000000000 +0800
2836 +LIBNAME = /work/download/mpich2-1.0.7-dev/lib/libmpich.a
2837 +srcdir = /work/download/mpich2-1.0.7-dev/src/mpi/romio/adio/ad_lustre
2839 +SHLIBNAME = /work/download/mpich2-1.0.7-dev/lib/libmpich
2841 +INCLUDE_DIR = -I. -I${srcdir}/../include -I../include -I../../include -I${srcdir}/../../../../include -I../../../../include
2842 +CFLAGS = -I/work/download/mpich2-1.0.7-dev/src/mpid/ch3/include -I/work/download/mpich2-1.0.7-dev/src/mpid/ch3/include -I/work/download/mpich2-1.0.7-dev/src/mpid/common/datatype -I/work/download/mpich2-1.0.7-dev/src/mpid/common/datatype -I/work/download/mpich2-1.0.7-dev/src/mpid/ch3/channels/sock/include -I/work/download/mpich2-1.0.7-dev/src/mpid/ch3/channels/sock/include -I/work/download/mpich2-1.0.7-dev/src/mpid/common/sock -I/work/download/mpich2-1.0.7-dev/src/mpid/common/sock -I/work/download/mpich2-1.0.7-dev/src/mpid/common/sock/poll -I/work/download/mpich2-1.0.7-dev/src/mpid/common/sock/poll -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -O2 -DFORTRANDOUBLEUNDERSCORE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -DHAVE_ROMIOCONF_H -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 $(INCLUDE_DIR)
2844 +top_builddir = /work/download/mpich2-1.0.7-dev
2846 +C_COMPILE_SHL = $(CC_SHL) -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -O2 -DFORTRANDOUBLEUNDERSCORE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -DHAVE_ROMIOCONF_H -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 $(INCLUDE_DIR)
2848 +VPATH = .:${srcdir}
2850 +AD_LUSTRE_OBJECTS = ad_lustre.o ad_lustre_open.o \
2851 + ad_lustre_rwcontig.o ad_lustre_wrcoll.o ad_lustre_wrstr.o \
2852 + ad_lustre_fcntl.o ad_lustre_hints.o ad_lustre_close.o \
2853 + ad_lustre_aggregate.o
2856 +default: $(LIBNAME)
2857 + @if [ "none" != "none" ] ; then \
2858 + $(MAKE) $(SHLIBNAME).la ;\
2861 +.SUFFIXES: $(SUFFIXES) .p .lo
2864 + $(CC) $(CFLAGS) -c $<
2866 + $(C_COMPILE_SHL) -c $< -o _s$*.o
2867 + @mv -f _s$*.o $*.lo
2869 +$(LIBNAME): $(AD_LUSTRE_OBJECTS)
2870 + $(AR) $(LIBNAME) $(AD_LUSTRE_OBJECTS)
2871 + $(RANLIB) $(LIBNAME)
2873 +AD_LUSTRE_LOOBJECTS=$(AD_LUSTRE_OBJECTS:.o=.lo)
2874 +$(SHLIBNAME).la: $(AD_LUSTRE_LOOBJECTS)
2875 + $(AR) $(SHLIBNAME).la $(AD_LUSTRE_LOOBJECTS)
2878 + -@for file in ${AD_LUSTRE_OBJECTS:.o=.c} ; do \
2879 + gcov -b -f $$file ; done
2883 diff -ruN ad_lustre_orig/Makefile.in ad_lustre/Makefile.in
2884 --- ad_lustre_orig/Makefile.in 2008-09-17 14:36:57.000000000 +0800
2885 +++ ad_lustre/Makefile.in 2008-09-17 18:20:35.000000000 +0800
2889 AD_LUSTRE_OBJECTS = ad_lustre.o ad_lustre_open.o \
2890 - ad_lustre_rwcontig.o ad_lustre_hints.o
2891 + ad_lustre_rwcontig.o ad_lustre_wrcoll.o ad_lustre_wrstr.o \
2892 + ad_lustre_hints.o ad_lustre_aggregate.o
2896 @if [ "@ENABLE_SHLIB@" != "none" ] ; then \
2897 diff -ruN ad_lustre_orig/README ad_lustre/README
2898 --- ad_lustre_orig/README 2008-09-17 14:36:57.000000000 +0800
2899 +++ ad_lustre/README 2008-09-17 18:20:35.000000000 +0800
2901 o To post the code for ParColl (Partitioned collective IO)
2903 -----------------------------------------------------
2905 +-----------------------------------------------------
2906 + o Improved data redistribution
2907 + - add I/O pattern identification. If the requested I/O size is big,
2908 + collective I/O won't be done. The hint big_req_size can be
2909 + used to set the size threshold.
2910 + - provide hint CO for load balancing to control the number
2911 + of IO clients for each OST
2912 + - divide the IO clients into different OST groups to
2913 + produce a stripe-contiguous I/O pattern
2914 + - reduce the collective overhead by hints contiguous_data and
2915 + same_io_size to remove unnecessary MPI_Alltoall()
2916 + o Control read-modify-write in data sieving in collective IO
2917 + with the hint ds_in_coll.
2919 +-----------------------------------------------------
2921 -----------------------------------------------------
2922 o Direct IO and Lockless IO support