1 diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
2 --- ad_lustre_orig/ad_lustre_aggregate.c 1970-01-01 08:00:00.000000000 +0800
3 +++ ad_lustre/ad_lustre_aggregate.c 2008-09-17 18:20:35.000000000 +0800
5 +/* -*- Mode: C; c-basic-offset:4 ; -*- */
7 + * Copyright (C) 1997 University of Chicago.
8 + * See COPYRIGHT notice in top-level directory.
10 + * Copyright (C) 2007 Oak Ridge National Laboratory
12 + * Copyright (C) 2008 Sun Microsystems, Lustre group
15 +#include "ad_lustre.h"
16 +#include "adio_extern.h"
18 +void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int ** striping_info_ptr,
19 + int mode, int nprocs)
21 + int *striping_info = NULL;
22 + /* get striping information:
23 + * striping_info[0] = stripe_size;
24 + * striping_info[1] = stripe_count;
25 + * striping_info[2] = CO;
27 + /* for easy understanding, we name some variables */
28 + int stripe_size, stripe_count, CO = 1, CO_max = 1, lflag;
29 + char *value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
31 + MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, &lflag);
33 + stripe_size = atoi(value);
35 + MPI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, value, &lflag);
37 + stripe_count = atoi(value);
38 + /* stripe_size and stripe_count have been validated in ADIOI_LUSTRE_Open() */
42 + /* for collective read,
43 + * if "CO" clients access the same OST simultaneously,
44 + * the OST disk seek time would be large. So, to avoid this,
45 + * it might be better if 1 client only accesses 1 OST.
46 + * So, we set CO = 1 to meet the above requirement.
49 + /* XXX: maybe there is a better way to do collective read */
51 + /* CO_max: the largest number of IO clients for each ost group */
52 + CO_max = (nprocs - 1)/ stripe_count + 1;
53 + /* CO also has been validated in ADIOI_LUSTRE_Open(), >0 */
54 + MPI_Info_get(fd->info, "CO", MPI_MAX_INFO_VAL, value, &lflag);
57 + CO = ADIOI_MIN(CO_max, CO);
60 + /* although there are known "N" hints so far, we still malloc space here
61 + * instead of declaring an array[3] outside,
62 + * because on one hand in the future we probably need more hints, and
63 + * on the other hand this function can be called by
64 + * both collective read and write conveniently.
66 + *striping_info_ptr = (int *) ADIOI_Malloc(3 * sizeof(int));
67 + striping_info = *striping_info_ptr;
68 + striping_info[0] = stripe_size;
69 + striping_info[1] = stripe_count;
70 + striping_info[2] = CO;
73 +int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off,
74 + ADIO_Offset *len, int nprocs,
77 + /* please refer to the comments in the above function for the detailed algorithm */
79 + ADIO_Offset avail_bytes;
81 + int stripe_size = striping_info[0];
82 + int stripe_count = striping_info[1];
83 + int CO = striping_info[2];
84 + int avail_nprocs = ADIOI_MIN(stripe_count * CO, nprocs);
86 + /* calculate the rank by offset directly */
87 + rank_index = (int)((off / stripe_size) % avail_nprocs);
88 + /* XXX: the above method is so simple that the processes in top ranks are always
89 + * chosen to be I/O clients. we hope they are different each time.
92 + avail_bytes = (off / (ADIO_Offset)stripe_size + 1) *
93 + (ADIO_Offset)stripe_size - off;
94 + if (avail_bytes < *len) {
95 + /* this proc only has part of the requested contig. region */
102 +void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list,
103 + int *len_list, int contig_access_count,
104 + int *striping_info, int nprocs,
105 + int *count_my_req_procs_ptr,
106 + int **count_my_req_per_proc_ptr,
107 + ADIOI_Access ** my_req_ptr,
110 + int *count_my_req_per_proc, count_my_req_procs, *buf_idx;
112 + ADIO_Offset avail_len, rem_len, curr_idx, off;
113 + ADIOI_Access *my_req;
115 + *count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
116 + count_my_req_per_proc = *count_my_req_per_proc_ptr;
118 + /* buf_idx is relevant only if buftype_is_contig.
119 + * buf_idx[i] gives the index into user_buf where data received
120 + * from proc. i should be placed. This allows receives to be done
121 + * without extra buffer. This can't be done if buftype is not contig.
123 + buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));
124 + /* initialize buf_idx to -1 */
125 + for (i = 0; i < nprocs; i++)
128 + /* one pass just to calculate how much space to allocate for my_req;
129 + * contig_access_count was calculated way back in ADIOI_Calc_my_off_len()
131 + for (i = 0; i < contig_access_count; i++) {
132 + /* short circuit offset/len processing if len == 0
133 + * (zero-byte read/write)
135 + if (len_list[i] == 0)
137 + off = offset_list[i];
138 + avail_len = len_list[i];
139 + /* we set avail_len to be the total size of the access.
140 + * then ADIOI_LUSTRE_Calc_aggregator() will modify the value to return
141 + * the amount that was available.
143 + proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, nprocs,
145 + count_my_req_per_proc[proc]++;
146 + /* figure out how much data is remaining in the access
147 + * we'll take care of this data (if there is any)
148 + * in the while loop below.
150 + rem_len = len_list[i] - avail_len;
152 + while (rem_len != 0) {
153 + off += avail_len; /* point to first remaining byte */
154 + avail_len = rem_len; /* save remaining size, pass to calc */
155 + proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, nprocs,
157 + count_my_req_per_proc[proc]++;
158 + rem_len -= avail_len; /* reduce remaining length by amount from fd */
162 + *my_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs * sizeof(ADIOI_Access));
163 + my_req = *my_req_ptr;
165 + count_my_req_procs = 0;
166 + for (i = 0; i < nprocs; i++) {
167 + if (count_my_req_per_proc[i]) {
168 + my_req[i].offsets = (ADIO_Offset *)
169 + ADIOI_Malloc(count_my_req_per_proc[i] *
170 + sizeof(ADIO_Offset));
171 + my_req[i].lens = (int *) ADIOI_Malloc(count_my_req_per_proc[i] *
173 + count_my_req_procs++;
175 + my_req[i].count = 0; /* will be incremented where needed later */
178 + /* now fill in my_req */
180 + for (i = 0; i < contig_access_count; i++) {
181 + if (len_list[i] == 0)
183 + off = offset_list[i];
184 + avail_len = len_list[i];
185 + proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, nprocs,
188 + /* for each separate contiguous access from this process */
189 + if (buf_idx[proc] == -1)
190 + buf_idx[proc] = (int) curr_idx;
192 + l = my_req[proc].count;
193 + curr_idx += (int) avail_len; /* NOTE: Why is curr_idx an int? Fix? */
195 + rem_len = len_list[i] - avail_len;
197 + /* store the proc, offset, and len information in an array
198 + * of structures, my_req. Each structure contains the
199 + * offsets and lengths located in that process's FD,
200 + * and the associated count.
202 + my_req[proc].offsets[l] = off;
203 + my_req[proc].lens[l] = (int) avail_len;
204 + my_req[proc].count++;
206 + while (rem_len != 0) {
208 + avail_len = rem_len;
209 + proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, nprocs,
211 + if (buf_idx[proc] == -1)
212 + buf_idx[proc] = (int) curr_idx;
214 + l = my_req[proc].count;
215 + curr_idx += avail_len;
216 + rem_len -= avail_len;
218 + my_req[proc].offsets[l] = off;
219 + my_req[proc].lens[l] = (int) avail_len;
220 + my_req[proc].count++;
225 + for (i = 0; i < nprocs; i++) {
226 + if (count_my_req_per_proc[i] > 0) {
227 + FPRINTF(stdout, "data needed from %d (count = %d):\n",
228 + i, my_req[i].count);
229 + for (l = 0; l < my_req[i].count; l++) {
230 + FPRINTF(stdout, " off[%d] = %lld, len[%d] = %d\n",
231 + l, my_req[i].offsets[l], l, my_req[i].lens[l]);
237 + for (i = 0; i < nprocs; i++) {
238 + FPRINTF(stdout, "buf_idx[%d] = 0x%x\n", i, buf_idx[i]);
242 + *count_my_req_procs_ptr = count_my_req_procs;
243 + *buf_idx_ptr = buf_idx;
246 +int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count,
247 + int *len_list, int nprocs)
250 + * So far, only one case is suitable for collective I/O
251 + * (1) request size <= big_req_size
253 + * if (avg_req_size > big_req_size) {
258 + int i, docollect = 1, lflag, big_req_size = 0;
259 + ADIO_Offset req_size = 0, total_req_size;
260 + int avg_req_size, total_access_count;
261 + char *value = NULL;
263 + /* calculate total_req_size and total_access_count */
264 + for (i = 0; i < contig_access_count; i++)
265 + req_size += len_list[i];
266 + MPI_Allreduce(&req_size, &total_req_size, 1, MPI_LONG_LONG_INT, MPI_SUM,
268 + MPI_Allreduce(&contig_access_count, &total_access_count, 1, MPI_INT, MPI_SUM,
270 + /* estimate average req_size */
271 + avg_req_size = (int)(total_req_size / total_access_count);
273 + /* get hint of big_req_size */
274 + value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
275 + MPI_Info_get(fd->info, "big_req_size", MPI_MAX_INFO_VAL, value, &lflag);
277 + big_req_size = atoi(value);
279 + if ((big_req_size > 0) && (avg_req_size > big_req_size))
287 +void ADIOI_LUSTRE_Calc_my_off_len(ADIO_File fd, int bufcount,
288 + MPI_Datatype datatype, int file_ptr_type,
289 + ADIO_Offset offset,
290 + ADIO_Offset **offset_list_ptr,
291 + int **len_list_ptr,
292 + ADIO_Offset *start_offset_ptr,
293 + ADIO_Offset *end_offset_ptr,
294 + int *contig_access_count_ptr)
296 + int filetype_size, buftype_size, etype_size;
297 + int i, j, k, frd_size = 0, old_frd_size = 0, st_index = 0;
298 + int n_filetypes, etype_in_filetype;
299 + ADIO_Offset abs_off_in_filetype = 0;
300 + int bufsize, sum, n_etypes_in_filetype, size_in_filetype;
301 + int contig_access_count, *len_list, flag, filetype_is_contig;
302 + MPI_Aint filetype_extent, filetype_lb;
303 + ADIOI_Flatlist_node *flat_file;
304 + ADIO_Offset *offset_list, off, end_offset = 0, disp;
306 + /* For this process's request, calculate the list of offsets and
307 + lengths in the file and determine the start and end offsets. */
309 + ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
311 + MPI_Type_size(fd->filetype, &filetype_size);
312 + MPI_Type_extent(fd->filetype, &filetype_extent);
313 + MPI_Type_lb(fd->filetype, &filetype_lb);
314 + MPI_Type_size(datatype, &buftype_size);
315 + etype_size = fd->etype_size;
317 + if (!filetype_size) {
318 + *contig_access_count_ptr = 0;
319 + *offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset));
320 + *len_list_ptr = (int *) ADIOI_Malloc(2 * sizeof(int));
321 + /* 2 is for consistency. everywhere I malloc one more than needed */
323 + offset_list = *offset_list_ptr;
324 + len_list = *len_list_ptr;
325 + offset_list[0] = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
326 + fd->disp + etype_size * offset;
328 + *start_offset_ptr = offset_list[0];
329 + *end_offset_ptr = offset_list[0] + len_list[0] - 1;
333 + if (filetype_is_contig) {
334 + *contig_access_count_ptr = 1;
335 + *offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset));
336 + *len_list_ptr = (int *) ADIOI_Malloc(2 * sizeof(int));
337 + /* 2 is for consistency. everywhere I malloc one more than needed */
339 + offset_list = *offset_list_ptr;
340 + len_list = *len_list_ptr;
341 + offset_list[0] = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
342 + fd->disp + etype_size * offset;
343 + len_list[0] = bufcount * buftype_size;
344 + *start_offset_ptr = offset_list[0];
345 + *end_offset_ptr = offset_list[0] + len_list[0] - 1;
347 + /* update file pointer */
348 + if (file_ptr_type == ADIO_INDIVIDUAL)
349 + fd->fp_ind = *end_offset_ptr + 1;
351 + /* First calculate what size of offset_list and len_list to allocate */
352 + /* filetype already flattened in ADIO_Open or ADIO_Fcntl */
353 + flat_file = ADIOI_Flatlist;
354 + while (flat_file->type != fd->filetype)
355 + flat_file = flat_file->next;
358 + if (file_ptr_type == ADIO_INDIVIDUAL) {
359 + offset = fd->fp_ind; /* in bytes */
364 + for (i = 0; i < flat_file->count; i++) {
365 + if (disp + flat_file->indices[i] +
366 + (ADIO_Offset) n_filetypes * filetype_extent +
367 + flat_file->blocklens[i] >= offset) {
369 + frd_size = (int) (disp + flat_file->indices[i] +
370 + (ADIO_Offset) n_filetypes *
372 + flat_file->blocklens[i] -
380 + n_etypes_in_filetype = filetype_size / etype_size;
381 + n_filetypes = (int) (offset / n_etypes_in_filetype);
382 + etype_in_filetype = (int) (offset % n_etypes_in_filetype);
383 + size_in_filetype = etype_in_filetype * etype_size;
386 + for (i = 0; i < flat_file->count; i++) {
387 + sum += flat_file->blocklens[i];
388 + if (sum > size_in_filetype) {
390 + frd_size = sum - size_in_filetype;
391 + abs_off_in_filetype = flat_file->indices[i] +
393 + (sum - flat_file->blocklens[i]);
398 + /* abs. offset in bytes in the file */
399 + offset = disp + (ADIO_Offset) n_filetypes *filetype_extent +
400 + abs_off_in_filetype;
403 + /* calculate how much space to allocate for offset_list, len_list */
405 + old_frd_size = frd_size;
406 + contig_access_count = i = 0;
408 + bufsize = buftype_size * bufcount;
409 + frd_size = ADIOI_MIN(frd_size, bufsize);
410 + while (i < bufsize) {
412 + contig_access_count++;
414 + j = (j + 1) % flat_file->count;
415 + frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize - i);
418 + /* allocate space for offset_list and len_list */
420 + *offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc((contig_access_count+1) *
421 + sizeof(ADIO_Offset));
422 + *len_list_ptr = (int *) ADIOI_Malloc((contig_access_count + 1) *
424 + /* +1 to avoid a 0-size malloc */
426 + offset_list = *offset_list_ptr;
427 + len_list = *len_list_ptr;
429 + /* find start offset, end offset, and fill in offset_list and len_list */
431 + *start_offset_ptr = offset; /* calculated above */
436 + frd_size = ADIOI_MIN(old_frd_size, bufsize);
437 + while (i < bufsize) {
439 + offset_list[k] = off;
440 + len_list[k] = frd_size;
444 + end_offset = off + frd_size - 1;
446 + /* Note: end_offset points to the last byte-offset that will be accessed.
447 + e.g., if start_offset=0 and 100 bytes to be read, end_offset=99 */
449 + if (off + frd_size < disp + flat_file->indices[j] +
450 + flat_file->blocklens[j] +
451 + (ADIO_Offset) n_filetypes * filetype_extent) {
453 + /* did not reach end of contiguous block in filetype.
454 + * no more I/O needed. off is incremented by frd_size.
457 + if (j < (flat_file->count - 1))
460 + /* hit end of flattened filetype;
461 + * start at beginning again
466 + off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes *
468 + frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize - i);
472 + /* update file pointer */
473 + if (file_ptr_type == ADIO_INDIVIDUAL)
476 + *contig_access_count_ptr = contig_access_count;
477 + *end_offset_ptr = end_offset;
481 +void ADIOI_LUSTRE_Calc_others_req(ADIO_File fd, int count_my_req_procs,
482 + int *count_my_req_per_proc,
483 + ADIOI_Access * my_req,
484 + int nprocs, int myrank,
485 + ADIO_Offset start_offset,
486 + ADIO_Offset end_offset,
487 + int *striping_info,
488 + int *count_others_req_procs_ptr,
489 + ADIOI_Access ** others_req_ptr)
491 + /* what requests of other processes will be written by this process */
493 + int *count_others_req_per_proc, count_others_req_procs;
494 + int i, j, lflag, samesize = 0, contiguous = 0;
495 + MPI_Request *send_requests, *recv_requests;
496 + MPI_Status *statuses;
497 + ADIOI_Access *others_req;
498 + char *value = NULL;
499 + int proc, avail_nprocs, stripe_count, CO;
500 + ADIO_Offset min_st_offset, off, req_len, avail_len, rem_len, *all_lens;
502 + value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
504 + MPI_Info_get(fd->info, "same_io_size", MPI_MAX_INFO_VAL, value, &lflag);
505 + if (lflag && !strcmp(value, "yes"))
507 + /* contiguous data */
508 + MPI_Info_get(fd->info, "contiguous_data", MPI_MAX_INFO_VAL, value, &lflag);
509 + if (lflag && !strcmp(value, "yes"))
512 + *others_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs *
513 + sizeof(ADIOI_Access));
514 + others_req = *others_req_ptr;
516 + /* if the data are contiguous, we don't need to do MPI_Alltoall */
518 + stripe_count = striping_info[1];
519 + CO = striping_info[2];
521 + for (i = 0; i < nprocs; i++) {
522 + others_req[i].count = 0;
524 + req_len = end_offset - start_offset + 1;
525 + all_lens = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
527 + if (samesize == 0) {/* different request size */
528 + /* calculate the min_st_offset */
529 + MPI_Allreduce(&start_offset, &min_st_offset, 1, MPI_LONG_LONG,
530 + MPI_MIN, fd->comm);
531 + /* exchange request length */
532 + MPI_Allgather(&req_len, 1, ADIO_OFFSET, all_lens, 1, ADIO_OFFSET,
534 + } else { /* same request size */
535 + /* calculate the min_st_offset */
536 + min_st_offset = start_offset - myrank * req_len;
537 + /* assign request length to all_lens[] */
538 + for (i = 0; i < nprocs; i ++)
539 + all_lens[i] = req_len;
541 + avail_nprocs = ADIOI_MIN(nprocs, stripe_count * CO);
542 + if (myrank < avail_nprocs) {
543 + off = min_st_offset;
544 + /* calculate others_req[i].count */
545 + for (i = 0; i < nprocs; i++) {
546 + avail_len = all_lens[i];
547 + rem_len = avail_len;
548 + while (rem_len > 0) {
549 + proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len,
550 + nprocs, striping_info);
551 + if (proc == myrank) {
552 + others_req[i].count ++;
555 + rem_len -= avail_len;
556 + avail_len = rem_len;
559 + /* calculate offset and len for each request */
560 + off = min_st_offset;
561 + for (i = 0; i < nprocs; i++) {
562 + if (others_req[i].count) {
563 + others_req[i].offsets = (ADIO_Offset *)
564 + ADIOI_Malloc(others_req[i].count *
565 + sizeof(ADIO_Offset));
566 + others_req[i].lens = (int *)
567 + ADIOI_Malloc(others_req[i].count *
569 + others_req[i].mem_ptrs = (MPI_Aint *)
570 + ADIOI_Malloc(others_req[i].count *
574 + avail_len = all_lens[i];
575 + rem_len = avail_len;
576 + while (rem_len > 0) {
577 + proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len,
578 + nprocs, striping_info);
579 + if (proc == myrank) {
580 + others_req[i].offsets[j] = off;
581 + others_req[i].lens[j] = (int)avail_len;
585 + rem_len -= avail_len;
586 + avail_len = rem_len;
591 + ADIOI_Free(all_lens);
593 + /* multiple non-contiguous requests */
594 + /* first find out how much to send/recv and from/to whom */
597 + * count_others_req_procs:
598 + * number of processes whose requests will be written by
599 + * this process (including this process itself)
600 + * count_others_req_per_proc[i]:
601 + * how many separate contiguous requests of proc[i] will be
602 + * written by this process.
605 + count_others_req_per_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
607 + MPI_Alltoall(count_my_req_per_proc, 1, MPI_INT,
608 + count_others_req_per_proc, 1, MPI_INT, fd->comm);
610 + count_others_req_procs = 0;
611 + for (i = 0; i < nprocs; i++) {
612 + if (count_others_req_per_proc[i]) {
613 + others_req[i].count = count_others_req_per_proc[i];
614 + others_req[i].offsets = (ADIO_Offset *)
615 + ADIOI_Malloc(others_req[i].count *
616 + sizeof(ADIO_Offset));
617 + others_req[i].lens = (int *)
618 + ADIOI_Malloc(others_req[i].count *
620 + others_req[i].mem_ptrs = (MPI_Aint *)
621 + ADIOI_Malloc(others_req[i].count *
623 + count_others_req_procs++;
625 + others_req[i].count = 0;
628 + /* now send the calculated offsets and lengths to respective processes */
630 + send_requests = (MPI_Request *) ADIOI_Malloc(2 * (count_my_req_procs + 1) *
631 + sizeof(MPI_Request));
632 + recv_requests = (MPI_Request *) ADIOI_Malloc(2 * (count_others_req_procs+1)*
633 + sizeof(MPI_Request));
634 + /* +1 to avoid a 0-size malloc */
637 + for (i = 0; i < nprocs; i++) {
638 + if (others_req[i].count) {
639 + MPI_Irecv(others_req[i].offsets, others_req[i].count,
640 + ADIO_OFFSET, i, i + myrank, fd->comm,
641 + &recv_requests[j]);
643 + MPI_Irecv(others_req[i].lens, others_req[i].count,
644 + MPI_INT, i, i + myrank + 1, fd->comm,
645 + &recv_requests[j]);
651 + for (i = 0; i < nprocs; i++) {
652 + if (my_req[i].count) {
653 + MPI_Isend(my_req[i].offsets, my_req[i].count,
654 + ADIO_OFFSET, i, i + myrank, fd->comm,
655 + &send_requests[j]);
657 + MPI_Isend(my_req[i].lens, my_req[i].count,
658 + MPI_INT, i, i + myrank + 1, fd->comm,
659 + &send_requests[j]);
664 + statuses = (MPI_Status *)
665 + ADIOI_Malloc((1 + 2 * ADIOI_MAX(count_my_req_procs,
666 + count_others_req_procs)) *
667 + sizeof(MPI_Status));
668 + /* +1 to avoid a 0-size malloc */
670 + MPI_Waitall(2 * count_my_req_procs, send_requests, statuses);
671 + MPI_Waitall(2 * count_others_req_procs, recv_requests, statuses);
673 + ADIOI_Free(send_requests);
674 + ADIOI_Free(recv_requests);
675 + ADIOI_Free(statuses);
676 + ADIOI_Free(count_others_req_per_proc);
678 + *count_others_req_procs_ptr = count_others_req_procs;
681 diff -ruN ad_lustre_orig/ad_lustre.c ad_lustre/ad_lustre.c
682 --- ad_lustre_orig/ad_lustre.c 2008-09-17 14:36:57.000000000 +0800
683 +++ ad_lustre/ad_lustre.c 2008-09-17 18:20:35.000000000 +0800
685 /* -*- Mode: C; c-basic-offset:4 ; -*- */
687 - * Copyright (C) 2001 University of Chicago.
689 + * Copyright (C) 2001 University of Chicago.
690 * See COPYRIGHT notice in top-level directory.
692 * Copyright (C) 2007 Oak Ridge National Laboratory
694 + * Copyright (C) 2008 Sun Microsystems, Lustre group
697 #include "ad_lustre.h"
699 ADIOI_LUSTRE_ReadContig, /* ReadContig */
700 ADIOI_LUSTRE_WriteContig, /* WriteContig */
701 ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
702 - ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */
703 + ADIOI_LUSTRE_WriteStridedColl, /* WriteStridedColl */
704 ADIOI_GEN_SeekIndividual, /* SeekIndividual */
705 - ADIOI_GEN_Fcntl, /* Fcntl */
706 + ADIOI_LUSTRE_Fcntl, /* Fcntl */
707 ADIOI_LUSTRE_SetInfo, /* SetInfo */
708 ADIOI_GEN_ReadStrided, /* ReadStrided */
709 - ADIOI_GEN_WriteStrided, /* WriteStrided */
710 - ADIOI_GEN_Close, /* Close */
711 + ADIOI_LUSTRE_WriteStrided, /* WriteStrided */
712 + ADIOI_LUSTRE_Close, /* Close */
713 #if defined(ROMIO_HAVE_WORKING_AIO) && !defined(CRAY_XT_LUSTRE)
714 ADIOI_GEN_IreadContig, /* IreadContig */
715 ADIOI_GEN_IwriteContig, /* IwriteContig */
716 diff -ruN ad_lustre_orig/ad_lustre_close.c ad_lustre/ad_lustre_close.c
717 --- ad_lustre_orig/ad_lustre_close.c 1970-01-01 08:00:00.000000000 +0800
718 +++ ad_lustre/ad_lustre_close.c 2008-09-17 18:20:35.000000000 +0800
720 +/* -*- Mode: C; c-basic-offset:4 ; -*- */
723 + * Copyright (C) 1997 University of Chicago.
724 + * See COPYRIGHT notice in top-level directory.
726 + * Copyright (C) 2007 Oak Ridge National Laboratory
728 + * Copyright (C) 2008 Sun Microsystems, Lustre group
731 +#include "ad_lustre.h"
737 +void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code)
740 + static char myname[] = "ADIOI_LUSTRE_CLOSE";
743 + MPE_Log_event(9, 0, "start close");
746 + err = close(fd->fd_sys);
749 + MPE_Log_event(10, 0, "end close");
754 + if (err == -1 || derr == -1) {
756 + MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname,
757 + __LINE__, MPI_ERR_IO, "**io", "**io %s",
760 + *error_code = MPI_SUCCESS;
762 diff -ruN ad_lustre_orig/ad_lustre.h ad_lustre/ad_lustre.h
763 --- ad_lustre_orig/ad_lustre.h 2008-09-17 14:36:57.000000000 +0800
764 +++ ad_lustre/ad_lustre.h 2008-10-06 16:07:21.000000000 +0800
766 /* -*- Mode: C; c-basic-offset:4 ; -*- */
768 - * Copyright (C) 1997 University of Chicago.
770 + * Copyright (C) 1997 University of Chicago.
771 * See COPYRIGHT notice in top-level directory.
773 * Copyright (C) 2007 Oak Ridge National Laboratory
775 + * Copyright (C) 2008 Sun Microsystems, Lustre group
778 #ifndef AD_UNIX_INCLUDE
781 /*#include <fcntl.h>*/
782 #include <sys/ioctl.h>
784 #include "lustre/lustre_user.h"
786 +/* copy something from lustre_user.h here */
787 +# define LOV_USER_MAGIC 0x0BD10BD0
788 +# define LL_IOC_LOV_SETSTRIPE _IOW ('f', 154, long)
789 +# define LL_IOC_LOV_GETSTRIPE _IOW ('f', 155, long)
790 +# define lov_user_ost_data lov_user_ost_data_v1
791 +struct lov_user_ost_data_v1 { /* per-stripe data structure */
792 + __u64 l_object_id; /* OST object ID */
793 + __u64 l_object_gr; /* OST object group (creating MDS number) */
794 + __u32 l_ost_gen; /* generation of this OST index */
795 + __u32 l_ost_idx; /* OST index in LOV */
796 +} __attribute__((packed));
797 +#define lov_user_md lov_user_md_v1
798 +struct lov_user_md_v1 { /* LOV EA user data (host-endian) */
799 + __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V1 */
800 + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
801 + __u64 lmm_object_id; /* LOV object ID */
802 + __u64 lmm_object_gr; /* LOV object group */
803 + __u32 lmm_stripe_size; /* size of stripe in bytes */
804 + __u16 lmm_stripe_count; /* num stripes in use for this object */
805 + __u16 lmm_stripe_offset; /* starting stripe offset in lmm_objects */
806 + struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */
807 +} __attribute__((packed));
810 /*#include "adioi.h"*/
814 void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code);
815 void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code);
816 -void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count,
817 - MPI_Datatype datatype, int file_ptr_type,
818 - ADIO_Offset offset, ADIO_Status *status, int
820 -void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count,
821 - MPI_Datatype datatype, int file_ptr_type,
822 - ADIO_Offset offset, ADIO_Status *status, int
824 +void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count,
825 + MPI_Datatype datatype, int file_ptr_type,
826 + ADIO_Offset offset, ADIO_Status *status,
828 +void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count,
829 + MPI_Datatype datatype, int file_ptr_type,
830 + ADIO_Offset offset, ADIO_Status *status,
832 +void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count,
833 + MPI_Datatype datatype, int file_ptr_type,
834 + ADIO_Offset offset, ADIO_Status *status,
836 void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count,
837 - MPI_Datatype datatype, int file_ptr_type,
838 - ADIO_Offset offset, ADIO_Status *status, int
840 + MPI_Datatype datatype, int file_ptr_type,
841 + ADIO_Offset offset, ADIO_Status *status,
843 void ADIOI_LUSTRE_ReadStridedColl(ADIO_File fd, void *buf, int count,
844 - MPI_Datatype datatype, int file_ptr_type,
845 - ADIO_Offset offset, ADIO_Status *status, int
847 + MPI_Datatype datatype, int file_ptr_type,
848 + ADIO_Offset offset, ADIO_Status *status,
850 +void ADIOI_LUSTRE_ReadStrided(ADIO_File fd, void *buf, int count,
851 + MPI_Datatype datatype, int file_ptr_type,
852 + ADIO_Offset offset, ADIO_Status *status,
854 void ADIOI_LUSTRE_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct,
856 void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
858 +void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int ** striping_info_ptr,
859 + int mode, int nprocs);
860 +int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off,
861 + ADIO_Offset *len, int nprocs,
862 + int *striping_info);
863 +void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list,
864 + int *len_list, int contig_access_count,
865 + int *striping_info, int nprocs,
866 + int *count_my_req_procs_ptr,
867 + int **count_my_req_per_proc_ptr,
868 + ADIOI_Access ** my_req_ptr,
869 + int **buf_idx_ptr);
870 +int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count,
871 + int *len_list, int nprocs);
872 +void ADIOI_LUSTRE_Calc_my_off_len(ADIO_File fd, int bufcount,
873 + MPI_Datatype datatype, int file_ptr_type,
874 + ADIO_Offset offset,
875 + ADIO_Offset **offset_list_ptr,
876 + int **len_list_ptr,
877 + ADIO_Offset *start_offset_ptr,
878 + ADIO_Offset *end_offset_ptr,
879 + int *contig_access_count_ptr);
880 +void ADIOI_LUSTRE_Calc_others_req(ADIO_File fd, int count_my_req_procs,
881 + int *count_my_req_per_proc,
882 + ADIOI_Access * my_req,
883 + int nprocs, int myrank,
884 + ADIO_Offset start_offset,
885 + ADIO_Offset end_offset,
886 + int *striping_info,
887 + int *count_others_req_procs_ptr,
888 + ADIOI_Access ** others_req_ptr);
889 #endif /* End of AD_UNIX_INCLUDE */
890 diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c
891 --- ad_lustre_orig/ad_lustre_hints.c 2008-09-17 14:36:57.000000000 +0800
892 +++ ad_lustre/ad_lustre_hints.c 2008-09-17 18:20:35.000000000 +0800
894 /* -*- Mode: C; c-basic-offset:4 ; -*- */
896 - * Copyright (C) 1997 University of Chicago.
898 + * Copyright (C) 1997 University of Chicago.
899 * See COPYRIGHT notice in top-level directory.
901 * Copyright (C) 2007 Oak Ridge National Laboratory
903 + * Copyright (C) 2008 Sun Microsystems, Lustre group
906 #include "ad_lustre.h"
907 @@ -11,130 +13,162 @@
909 void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
911 - char *value, *value_in_fd;
912 - int flag, tmp_val[3], str_factor=-1, str_unit=0, start_iodev=-1;
913 - struct lov_user_md lum = { 0 };
914 - int err, myrank, fd_sys, perm, amode, old_mask;
915 + char *value = NULL;
916 + int flag, tmp_val, int_val, str_factor, str_unit, start_iodev;
917 + static char myname[] = "ADIOI_LUSTRE_SETINFO";
919 - value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
920 if ( (fd->info) == MPI_INFO_NULL) {
921 - /* This must be part of the open call. can set striping parameters
923 + /* This must be part of the open call. can set striping parameters
925 MPI_Info_create(&(fd->info));
927 MPI_Info_set(fd->info, "direct_read", "false");
928 MPI_Info_set(fd->info, "direct_write", "false");
929 fd->direct_read = fd->direct_write = 0;
931 - /* has user specified striping or server buffering parameters
933 + /* has user specified striping or server buffering parameters
934 and do they have the same value on all processes? */
935 if (users_info != MPI_INFO_NULL) {
936 - MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
939 - str_unit=atoi(value);
941 - MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
944 - str_factor=atoi(value);
945 + value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
947 - MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL,
948 + /* direct read and write */
949 + MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL,
952 - start_iodev=atoi(value);
954 - MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL,
956 if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) {
957 MPI_Info_set(fd->info, "direct_read", "true");
961 - MPI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL,
962 + MPI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL,
964 if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) {
965 MPI_Info_set(fd->info, "direct_write", "true");
966 fd->direct_write = 1;
970 - MPI_Comm_rank(fd->comm, &myrank);
972 - tmp_val[0] = str_factor;
973 - tmp_val[1] = str_unit;
974 - tmp_val[2] = start_iodev;
976 - MPI_Bcast(tmp_val, 3, MPI_INT, 0, fd->comm);
978 - if (tmp_val[0] != str_factor
979 - || tmp_val[1] != str_unit
980 - || tmp_val[2] != start_iodev) {
981 - FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: All keys"
982 - "-striping_factor:striping_unit:start_iodevice "
983 - "need to be identical across all processes\n");
984 - MPI_Abort(MPI_COMM_WORLD, 1);
985 - } else if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0)) {
986 - /* if user has specified striping info, process 0 tries to set it */
988 - if (fd->perm == ADIO_PERM_NULL) {
989 - old_mask = umask(022);
991 - perm = old_mask ^ 0666;
993 - else perm = fd->perm;
996 - if (fd->access_mode & ADIO_CREATE)
997 - amode = amode | O_CREAT;
998 - if (fd->access_mode & ADIO_RDONLY)
999 - amode = amode | O_RDONLY;
1000 - if (fd->access_mode & ADIO_WRONLY)
1001 - amode = amode | O_WRONLY;
1002 - if (fd->access_mode & ADIO_RDWR)
1003 - amode = amode | O_RDWR;
1004 - if (fd->access_mode & ADIO_EXCL)
1005 - amode = amode | O_EXCL;
1007 - /* we need to create file so ensure this is set */
1008 - amode = amode | O_LOV_DELAY_CREATE | O_CREAT;
1010 - fd_sys = open(fd->filename, amode, perm);
1011 - if (fd_sys == -1) {
1012 - if (errno != EEXIST)
1014 - "Failure to open file %s %d %d\n",strerror(errno), amode, perm);
1016 - lum.lmm_magic = LOV_USER_MAGIC;
1017 - lum.lmm_pattern = 0;
1018 - lum.lmm_stripe_size = str_unit;
1019 - lum.lmm_stripe_count = str_factor;
1020 - lum.lmm_stripe_offset = start_iodev;
1022 - err = ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum);
1023 - if (err == -1 && errno != EEXIST) {
1024 - fprintf(stderr, "Failure to set stripe info %s \n", strerror(errno));
1028 - } /* End of striping parameters validation */
1030 + MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
1032 + if (flag && (str_unit = atoi(value))) {
1033 + tmp_val = str_unit;
1034 + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
1035 + if (tmp_val != str_unit) {
1036 + MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
1041 + MPI_Info_set(fd->info, "striping_unit", value);
1043 + /* stripe count */
1044 + MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
1046 + if (flag && (str_factor = atoi(value))) {
1047 + tmp_val = str_factor;
1048 + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
1049 + if (tmp_val != str_factor) {
1050 + MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
1051 + "striping_factor",
1055 + MPI_Info_set(fd->info, "striping_factor", value);
1057 + /* stripe offset */
1058 + MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL,
1060 + if (flag && ((start_iodev = atoi(value)) >= 0)) {
1061 + tmp_val = start_iodev;
1062 + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
1063 + if (tmp_val != start_iodev) {
1064 + MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
1069 + MPI_Info_set(fd->info, "start_iodevice", value);
1072 + MPI_Info_get(users_info, "CO", MPI_MAX_INFO_VAL, value,
1074 + if (flag && (int_val = atoi(value)) > 0) {
1075 + tmp_val = int_val;
1076 + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
1077 + if (tmp_val != int_val) {
1078 + MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
1083 + MPI_Info_set(fd->info, "CO", value);
1085 + /* big_req_size */
1086 + MPI_Info_get(users_info, "big_req_size", MPI_MAX_INFO_VAL, value,
1088 + if (flag && (int_val = atoi(value)) > 0) {
1089 + tmp_val = int_val;
1090 + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
1091 + if (tmp_val != int_val) {
1092 + MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
1097 + MPI_Info_set(fd->info, "big_req_size", value);
1099 + /* hint for disabling data sieving when do collective IO */
1100 + MPI_Info_get(users_info, "ds_in_coll", MPI_MAX_INFO_VAL,
1102 + if (flag && (!strcmp(value, "enable") ||
1103 + !strcmp(value, "ENABLE"))) {
1104 + tmp_val = int_val = 1;
1105 + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
1106 + if (tmp_val != int_val) {
1107 + MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
1112 + MPI_Info_set(fd->info, "ds_in_coll", "enable");
1114 + /* same io size */
1115 + MPI_Info_get(users_info, "same_io_size", MPI_MAX_INFO_VAL,
1117 + if (flag && (!strcmp(value, "yes") ||
1118 + !strcmp(value, "YES"))) {
1119 + tmp_val = int_val = 1;
1120 + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
1121 + if (tmp_val != int_val) {
1122 + MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
1127 + MPI_Info_set(fd->info, "same_io_size", "yes");
1129 + /* contiguous data */
1130 + MPI_Info_get(users_info, "contiguous_data", MPI_MAX_INFO_VAL,
1132 + if (flag && (!strcmp(value, "yes") ||
1133 + !strcmp(value, "YES"))) {
1134 + tmp_val = int_val = 1;
1135 + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
1136 + if (tmp_val != int_val) {
1137 + MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
1138 + "contiguous_data",
1142 + MPI_Info_set(fd->info, "contiguous_data", "yes");
1144 + ADIOI_Free(value);
1147 - MPI_Barrier(fd->comm);
1148 - /* set the values for collective I/O and data sieving parameters */
1149 - ADIOI_GEN_SetInfo(fd, users_info, error_code);
1151 - /* The file has been opened previously and fd->fd_sys is a valid
1152 - file descriptor. cannot set striping parameters now. */
1154 - /* set the values for collective I/O and data sieving parameters */
1155 - ADIOI_GEN_SetInfo(fd, users_info, error_code);
1158 + /* set the values for collective I/O and data sieving parameters */
1159 + ADIOI_GEN_SetInfo(fd, users_info, error_code);
1161 if (ADIOI_Direct_read) fd->direct_read = 1;
1162 if (ADIOI_Direct_write) fd->direct_write = 1;
1164 - ADIOI_Free(value);
1166 *error_code = MPI_SUCCESS;
1168 diff -ruN ad_lustre_orig/ad_lustre_open.c ad_lustre/ad_lustre_open.c
1169 --- ad_lustre_orig/ad_lustre_open.c 2008-09-17 14:36:57.000000000 +0800
1170 +++ ad_lustre/ad_lustre_open.c 2008-09-17 18:55:50.000000000 +0800
1172 /* -*- Mode: C; c-basic-offset:4 ; -*- */
1174 - * Copyright (C) 1997 University of Chicago.
1176 + * Copyright (C) 1997 University of Chicago.
1177 * See COPYRIGHT notice in top-level directory.
1179 * Copyright (C) 2007 Oak Ridge National Laboratory
1181 + * Copyright (C) 2008 Sun Microsystems, Lustre group
1184 #include "ad_lustre.h"
1186 void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code)
1188 - int perm, old_mask, amode, amode_direct;
1189 + int perm, old_mask, amode = 0, amode_direct = 0, flag = 0, err, myrank;
1190 + int stripe_size = 0, stripe_count = 0, stripe_offset = -1;
1191 struct lov_user_md lum = { 0 };
1193 + char *value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
1195 #if defined(MPICH2) || !defined(PRINT_ERR_MSG)
1196 static char myname[] = "ADIOI_LUSTRE_OPEN";
1198 old_mask = umask(022);
1200 perm = old_mask ^ 0666;
1202 - else perm = fd->perm;
1207 - if (fd->access_mode & ADIO_CREATE)
1208 + if (fd->access_mode & ADIO_CREATE) {
1209 amode = amode | O_CREAT;
1210 + /* Check the striping hints:
1211 + * values already stored in fd->info by SetInfo() are copied into lum; fd->info is refreshed from lum (GETSTRIPE) once the file is open
1213 + MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value,
1216 + stripe_size = atoi(value);
1218 + MPI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, value,
1221 + stripe_count = atoi(value);
1223 + MPI_Info_get(fd->info, "start_iodevice", MPI_MAX_INFO_VAL, value,
1226 + stripe_offset = atoi(value);
1228 + /* If the user has specified any striping hint,
1229 + * rank 0 tries to apply it with LL_IOC_LOV_SETSTRIPE.
1231 + if ((stripe_size > 0) || (stripe_count > 0) || (stripe_offset >= 0)) {
1232 + MPI_Comm_rank(fd->comm, &myrank);
1233 + if (myrank == 0) {
1234 + int fd_sys = open(fd->filename, amode, perm);
1235 + if (fd_sys == -1) {
1236 + if (errno != EEXIST)
1237 + FPRINTF(stderr, "Failure to open file %s %d %d\n",
1238 + strerror(errno), amode, perm);
1240 + lum.lmm_magic = LOV_USER_MAGIC;
1241 + lum.lmm_pattern = 1;
1242 + lum.lmm_stripe_size = stripe_size;
1243 + lum.lmm_stripe_count = stripe_count;
1244 + lum.lmm_stripe_offset = stripe_offset;
1246 + if (ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum))
1248 + "Failure to set striping info to Lustre!\n");
1252 + MPI_Barrier(fd->comm);
1256 if (fd->access_mode & ADIO_RDONLY)
1257 amode = amode | O_RDONLY;
1258 if (fd->access_mode & ADIO_WRONLY)
1260 fd->fd_sys = open(fd->filename, amode|O_CREAT, perm);
1262 if (fd->fd_sys != -1) {
1265 - value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
1267 /* get file striping information and set it in info */
1268 - lum.lmm_magic = LOV_USER_MAGIC;
1269 - err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum);
1272 - sprintf(value, "%d", lum.lmm_stripe_size);
1273 - MPI_Info_set(fd->info, "striping_unit", value);
1275 - sprintf(value, "%d", lum.lmm_stripe_count);
1276 - MPI_Info_set(fd->info, "striping_factor", value);
1278 - sprintf(value, "%d", lum.lmm_stripe_offset);
1279 - MPI_Info_set(fd->info, "start_iodevice", value);
1281 - ADIOI_Free(value);
1282 + lum.lmm_magic = LOV_USER_MAGIC;
1283 + err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum);
1286 + if (lum.lmm_stripe_size && lum.lmm_stripe_count &&
1287 + (lum.lmm_stripe_offset >= 0)) {
1288 + sprintf(value, "%d", lum.lmm_stripe_size);
1289 + MPI_Info_set(fd->info, "striping_unit", value);
1291 + sprintf(value, "%d", lum.lmm_stripe_count);
1292 + MPI_Info_set(fd->info, "striping_factor", value);
1294 + sprintf(value, "%d", lum.lmm_stripe_offset);
1295 + MPI_Info_set(fd->info, "start_iodevice", value);
1297 + FPRINTF(stderr, "Striping info is invalid!\n");
1298 + ADIOI_Free(value);
1299 + MPI_Abort(MPI_COMM_WORLD, 1);
1302 + FPRINTF(stderr, "Failed to get striping info from Lustre!\n");
1303 + ADIOI_Free(value);
1304 + MPI_Abort(MPI_COMM_WORLD, 1);
1306 if (fd->access_mode & ADIO_APPEND)
1307 fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
1311 if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
1312 - fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
1313 + fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
1316 if (fd->direct_write || fd->direct_read) {
1317 @@ -81,20 +133,22 @@
1320 /* --BEGIN ERROR HANDLING-- */
1321 - if (fd->fd_sys == -1 || ((fd->fd_direct == -1) &&
1322 - (fd->direct_write || fd->direct_read))) {
1323 + if (fd->fd_sys == -1 || ((fd->fd_direct == -1) &&
1324 + (fd->direct_write || fd->direct_read))) {
1325 if (errno == ENAMETOOLONG)
1326 *error_code = MPIO_Err_create_code(MPI_SUCCESS,
1327 - MPIR_ERR_RECOVERABLE, myname,
1328 - __LINE__, MPI_ERR_BAD_FILE,
1329 + MPIR_ERR_RECOVERABLE,
1333 "**filenamelong %s %d",
1335 strlen(fd->filename));
1336 else if (errno == ENOENT)
1337 *error_code = MPIO_Err_create_code(MPI_SUCCESS,
1338 - MPIR_ERR_RECOVERABLE, myname,
1339 - __LINE__, MPI_ERR_NO_SUCH_FILE,
1340 + MPIR_ERR_RECOVERABLE,
1342 + MPI_ERR_NO_SUCH_FILE,
1346 @@ -108,27 +162,30 @@
1348 else if (errno == EACCES) {
1349 *error_code = MPIO_Err_create_code(MPI_SUCCESS,
1350 - MPIR_ERR_RECOVERABLE, myname,
1351 - __LINE__, MPI_ERR_ACCESS,
1352 + MPIR_ERR_RECOVERABLE,
1356 - "**fileaccess %s",
1359 - else if (errno == EROFS) {
1360 + "**fileaccess %s",
1362 + } else if (errno == EROFS) {
1363 /* Read only file or file system and write access requested */
1364 *error_code = MPIO_Err_create_code(MPI_SUCCESS,
1365 - MPIR_ERR_RECOVERABLE, myname,
1366 - __LINE__, MPI_ERR_READ_ONLY,
1367 - "**ioneedrd", 0 );
1370 + MPIR_ERR_RECOVERABLE,
1372 + MPI_ERR_READ_ONLY,
1375 *error_code = MPIO_Err_create_code(MPI_SUCCESS,
1376 - MPIR_ERR_RECOVERABLE, myname,
1377 - __LINE__, MPI_ERR_IO, "**io",
1378 + MPIR_ERR_RECOVERABLE,
1380 + MPI_ERR_IO, "**io",
1381 "**io %s", strerror(errno));
1385 /* --END ERROR HANDLING-- */
1386 - else *error_code = MPI_SUCCESS;
1387 + *error_code = MPI_SUCCESS; /* NOTE(review): the removed line had a leading else; confirm the error branch above cannot fall through and overwrite its error code here */
1390 + ADIOI_Free(value);
1392 diff -ruN ad_lustre_orig/ad_lustre_rwcontig.c ad_lustre/ad_lustre_rwcontig.c
1393 --- ad_lustre_orig/ad_lustre_rwcontig.c 2008-09-17 14:36:57.000000000 +0800
1394 +++ ad_lustre/ad_lustre_rwcontig.c 2008-09-17 18:52:01.000000000 +0800
1396 /* -*- Mode: C; c-basic-offset:4 ; -*- */
1398 - * Copyright (C) 1997 University of Chicago.
1400 + * Copyright (C) 1997 University of Chicago.
1401 * See COPYRIGHT notice in top-level directory.
1403 * Copyright (C) 2007 Oak Ridge National Laboratory
1405 + * Copyright (C) 2008 Sun Microsystems, Lustre group
1408 #define _XOPEN_SOURCE 600
1412 err = write(fd->fd_sys, buf, len);
1415 err = read(fd->fd_sys, buf, len);
1417 err = ADIOI_LUSTRE_Directio(fd, buf, len, offset, io_mode);
1418 diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
1419 --- ad_lustre_orig/ad_lustre_wrcoll.c 1970-01-01 08:00:00.000000000 +0800
1420 +++ ad_lustre/ad_lustre_wrcoll.c 2008-09-17 18:20:35.000000000 +0800
1422 +/* -*- Mode: C; c-basic-offset:4 ; -*- */
1424 + * Copyright (C) 1997 University of Chicago.
1425 + * See COPYRIGHT notice in top-level directory.
1427 + * Copyright (C) 2007 Oak Ridge National Laboratory
1429 + * Copyright (C) 2008 Sun Microsystems, Lustre group
1432 +#include "ad_lustre.h"
1433 +#include "adio_extern.h"
1435 +/* Prototypes of the static helper functions used only for collective writes. */
1436 +static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf,
1437 + MPI_Datatype datatype, int nprocs,
1439 + ADIOI_Access *others_req,
1440 + ADIOI_Access *my_req,
1441 + ADIO_Offset *offset_list,
1443 + int contig_access_count,
1444 + int * striping_info,
1445 + int *buf_idx, int *error_code);
1446 +static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf,
1447 + ADIOI_Flatlist_node * flat_buf,
1449 + ADIO_Offset * offset_list,
1450 + int *len_list, int *send_size,
1451 + MPI_Request * requests,
1452 + int *sent_to_proc, int nprocs,
1453 + int myrank, int contig_access_count,
1454 + int * striping_info,
1455 + int *send_buf_idx,
1456 + int *curr_to_proc,
1457 + int *done_to_proc, int iter,
1458 + MPI_Aint buftype_extent);
1459 +static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
1461 + ADIOI_Flatlist_node * flat_buf,
1462 + ADIO_Offset * offset_list,
1463 + int *len_list, int *send_size,
1464 + int *recv_size, ADIO_Offset off,
1465 + int size, int *count,
1466 + int *start_pos, int *partial_recv,
1467 + int *sent_to_proc, int nprocs,
1468 + int myrank, int buftype_is_contig,
1469 + int contig_access_count,
1470 + int * striping_info,
1471 + ADIOI_Access * others_req,
1472 + int *send_buf_idx,
1473 + int *curr_to_proc,
1474 + int *done_to_proc, int *hole,
1475 + int iter, MPI_Aint buftype_extent,
1476 + int *buf_idx, int *error_code);
1477 +static void ADIOI_Heap_merge(ADIOI_Access * others_req, int *count,
1478 + ADIO_Offset * srt_off, int *srt_len,
1479 + int *start_pos, int nprocs, int nprocs_recv,
1480 + int total_elements);
1482 +void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count,
1483 + MPI_Datatype datatype,
1484 + int file_ptr_type, ADIO_Offset offset,
1485 + ADIO_Status * status, int *error_code) /* Collective strided write: falls back to independent writes or runs the exchange-and-write path below. */
1487 + ADIOI_Access *my_req;
1488 + /* array of nprocs access structures, one per process, describing the
1489 + part of this process's request located in that process */
1491 + ADIOI_Access *others_req;
1492 + /* array of nprocs access structures, one for each other process
1493 + whose request is written by this process. */
1495 + int i, filetype_is_contig, nprocs, nprocs_for_coll, myrank, do_collect = 0;
1496 + int contig_access_count = 0, buftype_is_contig;
1497 + int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs;
1498 + ADIO_Offset orig_fp, start_offset, end_offset, off, *offset_list = NULL;
1499 + int *buf_idx = NULL, *len_list = NULL, *striping_info = NULL;
1500 + int old_error, tmp_error;
1502 + MPI_Comm_size(fd->comm, &nprocs);
1503 + MPI_Comm_rank(fd->comm, &myrank);
1505 + nprocs_for_coll = fd->hints->cb_nodes;
1506 + orig_fp = fd->fp_ind;
1508 + /* I/O pattern identification if cb_write isn't disabled */
1509 + if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
1510 + /* For this process's request, calculate the list of offsets and
1511 + lengths in the file and determine the start and end offsets. */
1512 + ADIOI_LUSTRE_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
1513 + &offset_list, &len_list, &start_offset,
1514 + &end_offset, &contig_access_count);
1515 + /* Get striping information */
1516 + ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1, nprocs);
1517 + /* check if the access pattern can benefit from collective write */
1518 + do_collect = ADIOI_LUSTRE_Docollect(fd, contig_access_count,
1519 + len_list, nprocs);
1521 + ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
1523 + /* Decide if collective I/O should be done */
1524 + if ((!do_collect && fd->hints->cb_write == ADIOI_HINT_AUTO) ||
1525 + fd->hints->cb_write == ADIOI_HINT_DISABLE) {
1527 + int filerange_is_contig = 0;
1529 + /* use independent accesses */
1530 + if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
1531 + ADIOI_Free(offset_list);
1532 + ADIOI_Free(len_list); /* NOTE(review): no free of striping_info is visible on this independent-I/O path - confirm it is not leaked */
1535 + fd->fp_ind = orig_fp;
1536 + ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
1537 + if (buftype_is_contig && filetype_is_contig) {
1538 + if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
1539 + off = fd->disp + (fd->etype_size) * offset;
1540 + ADIO_WriteContig(fd, buf, count, datatype,
1541 + ADIO_EXPLICIT_OFFSET,
1542 + off, status, error_code);
1544 + ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
1545 + 0, status, error_code);
1547 + ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type,
1548 + offset, status, error_code);
1553 + /* calculate what portions of the access requests of this process are
1554 + * located in which process
1556 + ADIOI_LUSTRE_Calc_my_req(fd, offset_list, len_list, contig_access_count,
1557 + striping_info, nprocs, &count_my_req_procs,
1558 + &count_my_req_per_proc, &my_req, &buf_idx);
1559 + /* calculate which processes' requests will be written by this process */
1560 + ADIOI_LUSTRE_Calc_others_req(fd, count_my_req_procs,
1561 + count_my_req_per_proc,
1562 + my_req, nprocs, myrank,
1563 + start_offset, end_offset, striping_info,
1564 + &count_others_req_procs, &others_req);
1565 + ADIOI_Free(count_my_req_per_proc);
1567 + /* exchange data and write in sizes of no more than stripe_size. */
1568 + ADIOI_LUSTRE_Exch_and_write(fd, buf, datatype, nprocs, myrank,
1569 + others_req, my_req,
1570 + offset_list, len_list, contig_access_count,
1571 + striping_info, buf_idx, error_code);
1573 + old_error = *error_code;
1574 + if (*error_code != MPI_SUCCESS)
1575 + *error_code = MPI_ERR_IO;
1577 + /* optimization: if only one process performing i/o, we can perform
1578 + * a less-expensive Bcast */
1579 +#ifdef ADIOI_MPE_LOGGING
1580 + MPE_Log_event(ADIOI_MPE_postwrite_a, 0, NULL);
1582 + if (fd->hints->cb_nodes == 1)
1583 + MPI_Bcast(error_code, 1, MPI_INT,
1584 + fd->hints->ranklist[0], fd->comm);
1586 + tmp_error = *error_code;
1587 + MPI_Allreduce(&tmp_error, error_code, 1, MPI_INT,
1588 + MPI_MAX, fd->comm);
1590 +#ifdef ADIOI_MPE_LOGGING
1591 + MPE_Log_event(ADIOI_MPE_postwrite_b, 0, NULL);
1594 + if ((old_error != MPI_SUCCESS) && (old_error != MPI_ERR_IO))
1595 + *error_code = old_error;
1598 + if (!buftype_is_contig)
1599 + ADIOI_Delete_flattened(datatype);
1601 + /* free all memory allocated for collective I/O */
1602 + /* free others_req */
1603 + for (i = 0; i < nprocs; i++) {
1604 + if (others_req[i].count) {
1605 + ADIOI_Free(others_req[i].offsets);
1606 + ADIOI_Free(others_req[i].lens);
1607 + ADIOI_Free(others_req[i].mem_ptrs);
1610 + ADIOI_Free(others_req);
1611 + /* free my_req here */
1612 + for (i = 0; i < nprocs; i++) {
1613 + if (my_req[i].count) {
1614 + ADIOI_Free(my_req[i].offsets);
1615 + ADIOI_Free(my_req[i].lens);
1618 + ADIOI_Free(my_req);
1619 + ADIOI_Free(buf_idx);
1620 + ADIOI_Free(offset_list);
1621 + ADIOI_Free(len_list);
1622 + ADIOI_Free(striping_info);
1624 +#ifdef HAVE_STATUS_SET_BYTES
1626 + int bufsize, size;
1627 + /* Don't set status if it isn't needed */
1628 + MPI_Type_size(datatype, &size);
1629 + bufsize = size * count;
1630 + MPIR_Status_set_bytes(status, datatype, bufsize);
1632 + /* This is a temporary way of filling in status. The right way is to
1633 + * keep track of how much data was actually written during collective I/O.
1637 + fd->fp_sys_posn = -1; /* set it to null. */
1640 +static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf,
1641 + MPI_Datatype datatype, int nprocs,
1642 + int myrank, ADIOI_Access *others_req,
1643 + ADIOI_Access *my_req,
1644 + ADIO_Offset *offset_list,
1645 + int *len_list, int contig_access_count,
1646 + int *striping_info, int *buf_idx,
1649 + int hole, i, j, m, flag, ntimes = 1 , max_ntimes, buftype_is_contig;
1650 + ADIO_Offset st_loc = -1, end_loc = -1, min_st_loc, max_end_loc;
1651 + ADIO_Offset off, req_off, send_off, iter_st_off, *off_list;
1652 + ADIO_Offset max_size, step_size = 0;
1653 + int real_size, req_len, send_len;
1654 + int *recv_curr_offlen_ptr, *recv_count, *recv_size;
1655 + int *send_curr_offlen_ptr, *send_size;
1656 + int *partial_recv, *sent_to_proc, *recv_start_pos;
1657 + int *send_buf_idx, *curr_to_proc, *done_to_proc;
1658 + char *write_buf = NULL, *value;
1659 + MPI_Status status;
1660 + ADIOI_Flatlist_node *flat_buf = NULL;
1661 + MPI_Aint buftype_extent;
1662 + int stripe_size = striping_info[0], lflag, data_sieving = 0;
1663 + int stripe_count = striping_info[1], CO = striping_info[2];
1664 + /* step_size (declared above) is the IO step size of each communication round; this routine exchanges data with the other processes and writes it through a stripe_size buffer in max_ntimes rounds */
1665 + static char myname[] = "ADIOI_EXCH_AND_WRITE";
1667 + *error_code = MPI_SUCCESS; /* changed below if error */
1669 + /* calculate the number of writes of stripe size
1670 + * to be done by each process and the max among all processes.
1671 + * That gives the no. of communication phases as well.
1674 + for (i = 0; i < nprocs; i++) {
1675 + if (others_req[i].count) {
1676 + st_loc = others_req[i].offsets[0];
1677 + end_loc = others_req[i].offsets[0];
1682 + for (i = 0; i < nprocs; i++) {
1683 + for (j = 0; j < others_req[i].count; j++) {
1684 + st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]);
1685 + end_loc = ADIOI_MAX(end_loc, (others_req[i].offsets[j] +
1686 + others_req[i].lens[j] - 1));
1689 + /* this process does no writing. */
1690 + if ((st_loc == -1) && (end_loc == -1))
1692 + MPI_Allreduce(&end_loc, &max_end_loc, 1, MPI_LONG_LONG_INT, MPI_MAX, fd->comm);
1693 + /* keep min_st_loc from being -1 */
1695 + st_loc = max_end_loc;
1696 + MPI_Allreduce(&st_loc, &min_st_loc, 1, MPI_LONG_LONG_INT, MPI_MIN, fd->comm);
1697 + /* align min_st_loc downward to a stripe_size boundary */
1698 + min_st_loc -= min_st_loc % (ADIO_Offset)stripe_size;
1699 + /* when nprocs < stripe_count, there will be trouble, because some client
1700 + * would access more than one OST in one whole communication.
1702 + step_size = (ADIO_Offset)ADIOI_MIN(nprocs, stripe_count * CO) * stripe_size;
1703 + max_ntimes = (int)((max_end_loc - min_st_loc) / step_size + 1);
1705 + write_buf = (char *) ADIOI_Malloc(stripe_size);
1707 + /* calculate the start offset for each iteration */
1708 + off_list = (ADIO_Offset *) ADIOI_Malloc(max_ntimes * sizeof(ADIO_Offset));
1709 + for (m = 0; m < max_ntimes; m ++)
1710 + off_list[m] = max_end_loc;
1711 + for (i = 0; i < nprocs; i++) {
1712 + for (j = 0; j < others_req[i].count; j ++) {
1713 + req_off = others_req[i].offsets[j];
1714 + //m = (req_off - min_st_loc) / (stripe_size * stripe_count * CO);
1715 + m = (int)((req_off - min_st_loc) / step_size);
1716 + off_list[m] = ADIOI_MIN(off_list[m], req_off);
1720 + recv_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
1721 + send_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
1722 + /* their use is explained below. calloc initializes to 0. */
1724 + recv_count = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1725 + /* to store count of how many off-len pairs per proc are satisfied
1726 + in an iteration. */
1728 + send_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1729 + /* total size of data to be sent to each proc. in an iteration.
1730 + Of size nprocs so that I can use MPI_Alltoall later. */
1732 + recv_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1733 + /* total size of data to be recd. from each proc. in an iteration. */
1735 + sent_to_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int));
1736 + /* amount of data sent to each proc so far. Used in
1737 + ADIOI_Fill_send_buffer. initialized to 0 here. */
1739 + send_buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1740 + curr_to_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1741 + done_to_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1742 + /* Above three are used in ADIOI_Fill_send_buffer */
1744 + recv_start_pos = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1745 + /* used to store the starting value of recv_curr_offlen_ptr[i] in
1748 + ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
1749 + if (!buftype_is_contig) {
1750 + ADIOI_Flatten_datatype(datatype);
1751 + flat_buf = ADIOI_Flatlist;
1752 + while (flat_buf->type != datatype)
1753 + flat_buf = flat_buf->next;
1755 + MPI_Type_extent(datatype, &buftype_extent);
1757 + iter_st_off = min_st_loc;
1759 + /* Although we have recognized the data according to OST index,
1760 + * a read-modify-write will be done if there is a hole between the data.
1761 + * For example: if blocksize=60, transfersize=30 and stripe_size=100,
1762 + * then process0 will collect data [0, 30] and [60, 90] then write. There
1763 + * is a hole [30, 60], which will cause a read-modify-write in [0, 90].
1764 + * It will degrade collective performance.
1765 + * So we disable data sieving by default unless the hint "ds_in_coll"
1766 + * is set to "enable".
1768 + /* check the hint for data sieving */
1769 + value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
1770 + MPI_Info_get(fd->info, "ds_in_coll", MPI_MAX_INFO_VAL, value, &lflag);
1771 + if (lflag && !strcmp(value, "enable"))
1773 + ADIOI_Free(value);
1775 + for (m = 0; m < max_ntimes; m++) {
1776 + /* go through all others_req and my_req to check which will be received
1777 + * and sent in this iteration.
1780 + /* Note that MPI guarantees that displacements in filetypes are in
1781 + monotonically nondecreasing order and that, for writes, the
1782 + filetypes cannot specify overlapping regions in the file. This
1783 + simplifies implementation a bit compared to reads. */
1786 + off = start offset in the file for the data to be written in
1788 + iter_st_off = start offset of this iteration
1789 + real_size = size of data written (bytes) corresponding to off
1790 + max_size = possible maximum size of data written in this iteration
1791 + req_off = offset in the file for a particular contiguous request minus
1792 + what was satisfied in previous iteration
1793 + send_off = offset the request needed by other processes in this iteration
1794 + req_len = size corresponding to req_off
1795 + send_len = size corresponding to send_off
1798 + /* first calculate what should be communicated */
1799 + for (i = 0; i < nprocs; i++)
1800 + recv_count[i] = recv_size[i] = send_size[i] = 0;
1802 + off = off_list[m];
1803 + max_size = ADIOI_MIN(step_size, max_end_loc - iter_st_off + 1);
1804 + real_size = (int) ADIOI_MIN((off / stripe_size + 1) * stripe_size - off,
1805 + end_loc - off + 1);
1807 + for (i = 0; i < nprocs; i++) {
1808 + if (my_req[i].count) {
1809 + for (j = send_curr_offlen_ptr[i]; j < my_req[i].count; j++) {
1810 + send_off = my_req[i].offsets[j];
1811 + send_len = my_req[i].lens[j];
1812 + if (send_off < iter_st_off + max_size) {
1813 + send_size[i] += send_len;
1818 + send_curr_offlen_ptr[i] = j;
1820 + if (others_req[i].count) {
1821 + recv_start_pos[i] = recv_curr_offlen_ptr[i];
1822 + for (j = recv_curr_offlen_ptr[i]; j < others_req[i].count; j++) {
1823 + req_off = others_req[i].offsets[j];
1824 + req_len = others_req[i].lens[j];
1825 + if (req_off < iter_st_off + max_size) {
1827 + MPI_Address(write_buf + req_off - off,
1828 + &(others_req[i].mem_ptrs[j]));
1829 + recv_size[i] += req_len;
1834 + recv_curr_offlen_ptr[i] = j;
1837 + /* use hole to pass data_sieving flag into W_Exchange_data */
1838 + hole = data_sieving;
1839 + ADIOI_LUSTRE_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
1840 + len_list, send_size, recv_size, off, real_size,
1841 + recv_count, recv_start_pos, partial_recv,
1842 + sent_to_proc, nprocs, myrank,
1843 + buftype_is_contig, contig_access_count,
1844 + striping_info, others_req, send_buf_idx,
1845 + curr_to_proc, done_to_proc, &hole, m,
1846 + buftype_extent, buf_idx, error_code);
1847 + if (*error_code != MPI_SUCCESS)
1851 + for (i = 0; i < nprocs; i++)
1852 + if (recv_count[i]) {
1857 + /* check whether to do data sieving */
1858 + if(data_sieving) {
1859 + ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
1860 + ADIO_EXPLICIT_OFFSET, off, &status,
1863 + /* if there is no hole, write everything in a single call;
1864 + * otherwise, write each piece separately */
1866 + ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
1867 + ADIO_EXPLICIT_OFFSET, off, &status,
1870 + for (i = 0; i < nprocs; i++) {
1871 + if (others_req[i].count) {
1872 + for (j = 0; j < others_req[i].count; j++) {
1873 + if (others_req[i].offsets[j] < off + real_size &&
1874 + others_req[i].offsets[j] >= off) {
1875 + ADIO_WriteContig(fd,
1876 + write_buf + others_req[i].offsets[j] - off,
1877 + others_req[i].lens[j],
1878 + MPI_BYTE, ADIO_EXPLICIT_OFFSET,
1879 + others_req[i].offsets[j], &status,
1887 + if (*error_code != MPI_SUCCESS)
1891 + iter_st_off += max_size;
1893 + if (*error_code != MPI_SUCCESS)
1897 + ADIOI_Free(write_buf);
1898 + ADIOI_Free(recv_curr_offlen_ptr);
1899 + ADIOI_Free(send_curr_offlen_ptr);
1900 + ADIOI_Free(recv_count);
1901 + ADIOI_Free(send_size);
1902 + ADIOI_Free(recv_size);
1903 + ADIOI_Free(sent_to_proc);
1904 + ADIOI_Free(recv_start_pos);
1905 + ADIOI_Free(send_buf_idx);
1906 + ADIOI_Free(curr_to_proc);
1907 + ADIOI_Free(done_to_proc);
1908 + ADIOI_Free(off_list);
1911 +static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
1913 + ADIOI_Flatlist_node * flat_buf,
1914 + ADIO_Offset * offset_list,
1915 + int *len_list, int *send_size,
1916 + int *recv_size, ADIO_Offset off,
1917 + int size, int *count,
1918 + int *start_pos, int *partial_recv,
1919 + int *sent_to_proc, int nprocs,
1920 + int myrank, int buftype_is_contig,
1921 + int contig_access_count,
1922 + int * striping_info,
1923 + ADIOI_Access * others_req,
1924 + int *send_buf_idx,
1925 + int *curr_to_proc, int *done_to_proc,
1926 + int *hole, int iter,
1927 + MPI_Aint buftype_extent,
1928 + int *buf_idx, int *error_code)
1930 + int i, j, *tmp_len, nprocs_recv, nprocs_send, err;
1931 + char **send_buf = NULL;
1932 + MPI_Request *requests, *send_req;
1933 + MPI_Datatype *recv_types;
1934 + MPI_Status *statuses, status;
1935 + int *srt_len, sum, sum_recv;
1936 + ADIO_Offset *srt_off;
1937 + int data_sieving = *hole;
1938 + static char myname[] = "ADIOI_W_EXCHANGE_DATA";
1940 + /* create derived datatypes for recv */
1942 + for (i = 0; i < nprocs; i++)
1946 + recv_types = (MPI_Datatype *) ADIOI_Malloc((nprocs_recv + 1) *
1947 + sizeof(MPI_Datatype));
1948 + /* +1 to avoid a 0-size malloc */
1950 + tmp_len = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1952 + for (i = 0; i < nprocs; i++) {
1953 + if (recv_size[i]) {
1954 + MPI_Type_hindexed(count[i],
1955 + &(others_req[i].lens[start_pos[i]]),
1956 + &(others_req[i].mem_ptrs[start_pos[i]]),
1957 + MPI_BYTE, recv_types + j);
1958 + /* absolute displacements; use MPI_BOTTOM in recv */
1959 + MPI_Type_commit(recv_types + j);
1964 + /* To avoid a read-modify-write,
1965 + * check if there are holes in the data to be written.
1966 + * For this, merge the (sorted) offset lists others_req using a heap-merge.
1970 + for (i = 0; i < nprocs; i++)
1972 + srt_off = (ADIO_Offset *) ADIOI_Malloc((sum + 1) * sizeof(ADIO_Offset));
1973 + srt_len = (int *) ADIOI_Malloc((sum + 1) * sizeof(int));
1974 + /* +1 to avoid a 0-size malloc */
1976 + ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos,
1977 + nprocs, nprocs_recv, sum);
1979 + ADIOI_Free(tmp_len);
1981 + /* check if there are any holes */
1983 + for (i = 0; i < sum - 1; i++) {
1984 + if (srt_off[i] + srt_len[i] < srt_off[i + 1]) {
1989 + /* In some cases (see John Bent ROMIO REQ # 835), an odd interaction
1990 + * between aggregation, nominally contiguous regions, and cb_buffer_size
1991 + * should be handled with a read-modify-write (otherwise we will write out
1992 + * more data than we receive from everyone else (inclusive), so override
1997 + for (i = 0; i < nprocs; i++)
1998 + sum_recv += recv_size[i];
1999 + if (size > sum_recv)
2002 + /* check the hint for data sieving */
2003 + if (data_sieving && nprocs_recv && *hole) {
2004 + ADIO_ReadContig(fd, write_buf, size, MPI_BYTE,
2005 + ADIO_EXPLICIT_OFFSET, off, &status, &err);
2006 + // --BEGIN ERROR HANDLING--
2007 + if (err != MPI_SUCCESS) {
2008 + *error_code = MPIO_Err_create_code(err,
2009 + MPIR_ERR_RECOVERABLE,
2012 + "**ioRMWrdwr", 0);
2015 + // --END ERROR HANDLING--
2017 + ADIOI_Free(srt_off);
2018 + ADIOI_Free(srt_len);
2021 + for (i = 0; i < nprocs; i++)
2025 + if (fd->atomicity) {
2026 + /* bug fix from Wei-keng Liao and Kenin Coloma */
2027 + requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + 1) *
2028 + sizeof(MPI_Request));
2029 + send_req = requests;
2031 + requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1)*
2032 + sizeof(MPI_Request));
2033 + /* +1 to avoid a 0-size malloc */
2035 + /* post receives */
2037 + for (i = 0; i < nprocs; i++) {
2038 + if (recv_size[i]) {
2039 + MPI_Irecv(MPI_BOTTOM, 1, recv_types[j], i,
2040 + myrank + i + 100 * iter, fd->comm, requests + j);
2044 + send_req = requests + nprocs_recv;
2048 + * if buftype_is_contig, data can be directly sent from
2049 + * user buf at location given by buf_idx. else use send_buf.
2051 + if (buftype_is_contig) {
2053 + for (i = 0; i < nprocs; i++)
2054 + if (send_size[i]) {
2055 + MPI_Isend(((char *) buf) + buf_idx[i], send_size[i],
2056 + MPI_BYTE, i, myrank + i + 100 * iter, fd->comm,
2059 + buf_idx[i] += send_size[i];
2061 + } else if (nprocs_send) {
2062 + /* buftype is not contig */
2063 + send_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char *));
2064 + for (i = 0; i < nprocs; i++)
2066 + send_buf[i] = (char *) ADIOI_Malloc(send_size[i]);
2068 + ADIOI_LUSTRE_Fill_send_buffer(fd, buf, flat_buf, send_buf, offset_list,
2069 + len_list, send_size, send_req,
2070 + sent_to_proc, nprocs, myrank,
2071 + contig_access_count, striping_info,
2072 + send_buf_idx, curr_to_proc, done_to_proc,
2073 + iter, buftype_extent);
2074 + /* the send is done in ADIOI_LUSTRE_Fill_send_buffer */
2077 + /* bug fix from Wei-keng Liao and Kenin Coloma */
2078 + if (fd->atomicity) {
2080 + for (i = 0; i < nprocs; i++) {
2081 + MPI_Status wkl_status;
2082 + if (recv_size[i]) {
2083 + MPI_Recv(MPI_BOTTOM, 1, recv_types[j], i,
2084 + myrank + i + 100 * iter, fd->comm, &wkl_status);
2090 + for (i = 0; i < nprocs_recv; i++)
2091 + MPI_Type_free(recv_types + i);
2092 + ADIOI_Free(recv_types);
2094 + /* bug fix from Wei-keng Liao and Kenin Coloma */
2095 + /* +1 to avoid a 0-size malloc */
2096 + if (fd->atomicity) {
2097 + statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + 1) *
2098 + sizeof(MPI_Status));
2100 + statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1) *
2101 + sizeof(MPI_Status));
2104 +#ifdef NEEDS_MPI_TEST
2106 + if (fd->atomicity) {
2107 + /* bug fix from Wei-keng Liao and Kenin Coloma */
2109 + MPI_Testall(nprocs_send, send_req, &i, statuses);
2112 + MPI_Testall(nprocs_send + nprocs_recv, requests, &i, statuses);
2115 + /* bug fix from Wei-keng Liao and Kenin Coloma */
2116 + if (fd->atomicity)
2117 + MPI_Waitall(nprocs_send, send_req, statuses);
2119 + MPI_Waitall(nprocs_send + nprocs_recv, requests, statuses);
2121 + ADIOI_Free(statuses);
2122 + ADIOI_Free(requests);
2123 + if (!buftype_is_contig && nprocs_send) {
2124 + for (i = 0; i < nprocs; i++)
2126 + ADIOI_Free(send_buf[i]);
2127 + ADIOI_Free(send_buf);
2131 +#define ADIOI_BUF_INCR \
2133 + while (buf_incr) { \
2134 + size_in_buf = ADIOI_MIN(buf_incr, flat_buf_sz); \
2135 + user_buf_idx += size_in_buf; \
2136 + flat_buf_sz -= size_in_buf; \
2137 + if (!flat_buf_sz) { \
2138 + if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \
2140 + flat_buf_idx = 0; \
2143 + user_buf_idx = flat_buf->indices[flat_buf_idx] + \
2144 + n_buftypes*buftype_extent; \
2145 + flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
2147 + buf_incr -= size_in_buf; \
2152 +#define ADIOI_BUF_COPY \
2155 + size_in_buf = ADIOI_MIN(size, flat_buf_sz); \
2156 + memcpy(&(send_buf[p][send_buf_idx[p]]), \
2157 + ((char *) buf) + user_buf_idx, size_in_buf); \
2158 + send_buf_idx[p] += size_in_buf; \
2159 + user_buf_idx += size_in_buf; \
2160 + flat_buf_sz -= size_in_buf; \
2161 + if (!flat_buf_sz) { \
2162 + if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \
2164 + flat_buf_idx = 0; \
2167 + user_buf_idx = flat_buf->indices[flat_buf_idx] + \
2168 + n_buftypes*buftype_extent; \
2169 + flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
2171 + size -= size_in_buf; \
2172 + buf_incr -= size_in_buf; \
2177 +static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf,
2178 + ADIOI_Flatlist_node * flat_buf,
2180 + ADIO_Offset * offset_list,
2181 + int *len_list, int *send_size,
2182 + MPI_Request * requests,
2183 + int *sent_to_proc, int nprocs,
2185 + int contig_access_count,
2186 + int * striping_info,
2187 + int *send_buf_idx,
2188 + int *curr_to_proc,
2189 + int *done_to_proc, int iter,
2190 + MPI_Aint buftype_extent)
2192 + /* this function is only called if buftype is not contig */
2193 + int i, p, flat_buf_idx, size;
2194 + int flat_buf_sz, buf_incr, size_in_buf, jj, n_buftypes;
2195 + ADIO_Offset off, len, rem_len, user_buf_idx;
2197 + /* curr_to_proc[p] = amount of data sent to proc. p that has already
2198 + * been accounted for so far
2199 + * done_to_proc[p] = amount of data already sent to proc. p in
2200 + * previous iterations
2201 + * user_buf_idx = current location in user buffer
2202 + * send_buf_idx[p] = current location in send_buf of proc. p
2205 + for (i = 0; i < nprocs; i++) {
2206 + send_buf_idx[i] = curr_to_proc[i] = 0;
2207 + done_to_proc[i] = sent_to_proc[i];
2211 + user_buf_idx = flat_buf->indices[0];
2214 + flat_buf_sz = flat_buf->blocklens[0];
2216 + /* flat_buf_idx = current index into flattened buftype
2217 + * flat_buf_sz = size of current contiguous component in flattened buf
2219 + for (i = 0; i < contig_access_count; i++) {
2220 + off = offset_list[i];
2221 + rem_len = (ADIO_Offset) len_list[i];
2223 + /* this request may span more than one process */
2224 + while (rem_len != 0) {
2226 + /* NOTE: len value is modified by ADIOI_LUSTRE_Calc_aggregator() to be no
2227 + * longer than the single region that processor "p" is responsible
2230 + p = ADIOI_LUSTRE_Calc_aggregator(fd, off, &len, nprocs, striping_info);
2232 + if (send_buf_idx[p] < send_size[p]) {
2233 + if (curr_to_proc[p] + len > done_to_proc[p]) {
2234 + if (done_to_proc[p] > curr_to_proc[p]) {
2235 + size = (int) ADIOI_MIN(curr_to_proc[p] + len -
2239 + buf_incr = done_to_proc[p] - curr_to_proc[p];
2241 + buf_incr = (int) (curr_to_proc[p] + len -
2243 + curr_to_proc[p] = done_to_proc[p] + size;
2246 + size = (int) ADIOI_MIN(len, send_size[p] -
2248 + buf_incr = (int) len;
2249 + curr_to_proc[p] += size;
2252 + if (send_buf_idx[p] == send_size[p]) {
2253 + MPI_Isend(send_buf[p], send_size[p], MPI_BYTE, p,
2254 + myrank + p + 100 * iter, fd->comm,
2259 + curr_to_proc[p] += (int) len;
2260 + buf_incr = (int) len;
2264 + buf_incr = (int) len;
2271 + for (i = 0; i < nprocs; i++)
2273 + sent_to_proc[i] = curr_to_proc[i];
2276 +static void ADIOI_Heap_merge(ADIOI_Access * others_req, int *count,
2277 + ADIO_Offset * srt_off, int *srt_len,
2278 + int *start_pos, int nprocs, int nprocs_recv,
2279 + int total_elements)
2282 + ADIO_Offset *off_list;
2287 + heap_struct *a, tmp;
2288 + int i, j, heapsize, l, r, k, smallest;
2290 + a = (heap_struct *) ADIOI_Malloc((nprocs_recv + 1) *
2291 + sizeof(heap_struct));
2294 + for (i = 0; i < nprocs; i++)
2296 + a[j].off_list = &(others_req[i].offsets[start_pos[i]]);
2297 + a[j].len_list = &(others_req[i].lens[start_pos[i]]);
2298 + a[j].nelem = count[i];
2302 + /* build a heap out of the first element from each list, with
2303 + the smallest element of the heap at the root */
2305 + heapsize = nprocs_recv;
2306 + for (i = heapsize / 2 - 1; i >= 0; i--) {
2307 + /* Heapify(a, i, heapsize); Algorithm from Cormen et al. pg. 143
2308 + modified for a heap with smallest element at root. I have
2309 + removed the recursion so that there are no function calls.
2310 + Function calls are too expensive. */
2313 + l = 2 * (k + 1) - 1;
2316 + if ((l < heapsize) && (*(a[l].off_list) < *(a[k].off_list)))
2321 + if ((r < heapsize) &&
2322 + (*(a[r].off_list) < *(a[smallest].off_list)))
2325 + if (smallest != k) {
2326 + tmp.off_list = a[k].off_list;
2327 + tmp.len_list = a[k].len_list;
2328 + tmp.nelem = a[k].nelem;
2330 + a[k].off_list = a[smallest].off_list;
2331 + a[k].len_list = a[smallest].len_list;
2332 + a[k].nelem = a[smallest].nelem;
2334 + a[smallest].off_list = tmp.off_list;
2335 + a[smallest].len_list = tmp.len_list;
2336 + a[smallest].nelem = tmp.nelem;
2344 + for (i = 0; i < total_elements; i++) {
2345 + /* extract smallest element from heap, i.e. the root */
2346 + srt_off[i] = *(a[0].off_list);
2347 + srt_len[i] = *(a[0].len_list);
2350 + if (!a[0].nelem) {
2351 + a[0].off_list = a[heapsize - 1].off_list;
2352 + a[0].len_list = a[heapsize - 1].len_list;
2353 + a[0].nelem = a[heapsize - 1].nelem;
2356 + (a[0].off_list)++;
2357 + (a[0].len_list)++;
2360 + /* Heapify(a, 0, heapsize); */
2363 + l = 2 * (k + 1) - 1;
2366 + if ((l < heapsize) && (*(a[l].off_list) < *(a[k].off_list)))
2371 + if ((r < heapsize) &&
2372 + (*(a[r].off_list) < *(a[smallest].off_list)))
2375 + if (smallest != k) {
2376 + tmp.off_list = a[k].off_list;
2377 + tmp.len_list = a[k].len_list;
2378 + tmp.nelem = a[k].nelem;
2380 + a[k].off_list = a[smallest].off_list;
2381 + a[k].len_list = a[smallest].len_list;
2382 + a[k].nelem = a[smallest].nelem;
2384 + a[smallest].off_list = tmp.off_list;
2385 + a[smallest].len_list = tmp.len_list;
2386 + a[smallest].nelem = tmp.nelem;
2395 diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c
2396 --- ad_lustre_orig/ad_lustre_wrstr.c 1970-01-01 08:00:00.000000000 +0800
2397 +++ ad_lustre/ad_lustre_wrstr.c 2008-09-17 18:20:35.000000000 +0800
2399 +/* -*- Mode: C; c-basic-offset:4 ; -*- */
2401 + * Copyright (C) 1997 University of Chicago.
2402 + * See COPYRIGHT notice in top-level directory.
2404 + * Copyright (C) 2007 Oak Ridge National Laboratory
2406 + * Copyright (C) 2008 Sun Microsystems, Lustre group
2409 +#include "ad_lustre.h"
2410 +#include "adio_extern.h"
2412 +#define ADIOI_BUFFERED_WRITE \
2414 + if (req_off >= writebuf_off + writebuf_len) { \
2415 + if (writebuf_len) { \
2416 + ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
2417 + ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \
2418 + if (!(fd->atomicity)) \
2419 + ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
2420 + if (*error_code != MPI_SUCCESS) { \
2421 + *error_code = MPIO_Err_create_code(*error_code, \
2422 + MPIR_ERR_RECOVERABLE, myname, \
2423 + __LINE__, MPI_ERR_IO, \
2428 + writebuf_off = req_off; \
2429 + /* stripe_size alignment */ \
2430 + writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \
2431 + (writebuf_off / stripe_size + 1) * \
2432 + stripe_size - writebuf_off);\
2433 + if (!(fd->atomicity)) \
2434 + ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
2435 + ADIO_ReadContig(fd, writebuf, writebuf_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,\
2436 + writebuf_off, &status1, error_code); \
2437 + if (*error_code != MPI_SUCCESS) { \
2438 + *error_code = MPIO_Err_create_code(*error_code, \
2439 + MPIR_ERR_RECOVERABLE, myname, \
2440 + __LINE__, MPI_ERR_IO, \
2445 + write_sz = (int) ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off); \
2446 + memcpy(writebuf + req_off - writebuf_off, (char *)buf + userbuf_off, write_sz);\
2447 + while (write_sz != req_len) {\
2448 + ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
2449 + ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \
2450 + if (!(fd->atomicity)) \
2451 + ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
2452 + if (*error_code != MPI_SUCCESS) { \
2453 + *error_code = MPIO_Err_create_code(*error_code, \
2454 + MPIR_ERR_RECOVERABLE, myname, \
2455 + __LINE__, MPI_ERR_IO, \
2459 + req_len -= write_sz; \
2460 + userbuf_off += write_sz; \
2461 + writebuf_off += writebuf_len; \
2462 + /* stripe_size alignment */ \
2463 + writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \
2464 + (writebuf_off / stripe_size + 1) * \
2465 + stripe_size - writebuf_off);\
2466 + if (!(fd->atomicity)) \
2467 + ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
2468 + ADIO_ReadContig(fd, writebuf, writebuf_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,\
2469 + writebuf_off, &status1, error_code); \
2470 + if (*error_code != MPI_SUCCESS) { \
2471 + *error_code = MPIO_Err_create_code(*error_code, \
2472 + MPIR_ERR_RECOVERABLE, myname, \
2473 + __LINE__, MPI_ERR_IO, \
2477 + write_sz = ADIOI_MIN(req_len, writebuf_len); \
2478 + memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
2483 +/* this macro is used when filetype is contig and buftype is not contig.
2484 + it does not do a read-modify-write and does not lock*/
2485 +#define ADIOI_BUFFERED_WRITE_WITHOUT_READ \
2487 + if (req_off >= writebuf_off + writebuf_len) { \
2488 + writebuf_off = req_off; \
2489 + /* stripe_size alignment */ \
2490 + writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \
2491 + (writebuf_off / stripe_size + 1) * \
2492 + stripe_size - writebuf_off);\
2494 + write_sz = (int) ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off); \
2495 + memcpy(writebuf + req_off - writebuf_off, (char *)buf + userbuf_off, write_sz);\
2496 + while (req_len) { \
2497 + ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
2498 + ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \
2499 + if (*error_code != MPI_SUCCESS) { \
2500 + *error_code = MPIO_Err_create_code(*error_code, \
2501 + MPIR_ERR_RECOVERABLE, myname, \
2502 + __LINE__, MPI_ERR_IO, \
2506 + req_len -= write_sz; \
2507 + userbuf_off += write_sz; \
2508 + writebuf_off += writebuf_len; \
2509 + /* stripe_size alignment */ \
2510 + writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \
2511 + (writebuf_off / stripe_size + 1) * \
2512 + stripe_size - writebuf_off);\
2513 + write_sz = ADIOI_MIN(req_len, writebuf_len); \
2514 + memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
2518 +void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count,
2519 + MPI_Datatype datatype, int file_ptr_type,
2520 + ADIO_Offset offset, ADIO_Status * status,
2523 + /* offset is in units of etype relative to the filetype. */
2524 + ADIOI_Flatlist_node *flat_buf, *flat_file;
2525 + int i, j, k, bwr_size, fwr_size = 0, st_index = 0;
2526 + int bufsize, num, size, sum, n_etypes_in_filetype, size_in_filetype;
2527 + int n_filetypes, etype_in_filetype;
2528 + ADIO_Offset abs_off_in_filetype = 0;
2529 + int filetype_size, etype_size, buftype_size, req_len;
2530 + MPI_Aint filetype_extent, buftype_extent;
2531 + int buf_count, buftype_is_contig, filetype_is_contig;
2532 + ADIO_Offset userbuf_off;
2533 + ADIO_Offset off, req_off, disp, end_offset = 0, writebuf_off, start_off;
2535 + int flag, st_fwr_size, st_n_filetypes, writebuf_len, write_sz;
2536 + ADIO_Status status1;
2537 + int new_bwr_size, new_fwr_size;
2539 + int stripe_size, lflag = 0;
2540 + static char myname[] = "ADIOI_LUSTRE_WriteStrided";
2542 + MPI_Comm_rank(fd->comm, &myrank);
2544 + if (fd->hints->ds_write == ADIOI_HINT_DISABLE) {
2545 + /* if user has disabled data sieving on writes, use naive
2546 + * approach instead.
2548 + ADIOI_GEN_WriteStrided_naive(fd,
2553 + offset, status, error_code);
2557 + *error_code = MPI_SUCCESS; /* changed below if error */
2559 + ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
2560 + ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
2562 + MPI_Type_size(fd->filetype, &filetype_size);
2563 + if (!filetype_size) {
2564 + *error_code = MPI_SUCCESS;
2568 + MPI_Type_extent(fd->filetype, &filetype_extent);
2569 + MPI_Type_size(datatype, &buftype_size);
2570 + MPI_Type_extent(datatype, &buftype_extent);
2571 + etype_size = fd->etype_size;
2573 + bufsize = buftype_size * count;
2575 + /* get striping info */
2576 + value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
2577 + MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, &lflag);
2579 + stripe_size = atoi(value);
2580 + ADIOI_Free(value);
2582 + /* Different buftype to different filetype */
2583 + if (!buftype_is_contig && filetype_is_contig) {
2584 + /* noncontiguous in memory, contiguous in file. */
2585 + ADIOI_Flatten_datatype(datatype);
2586 + flat_buf = ADIOI_Flatlist;
2587 + while (flat_buf->type != datatype)
2588 + flat_buf = flat_buf->next;
2590 + off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
2591 + fd->disp + etype_size * offset;
2594 + end_offset = start_off + bufsize - 1;
2595 + writebuf_off = start_off;
2596 + /* write stripe size buffer each time */
2597 + writebuf = (char *) ADIOI_Malloc(ADIOI_MIN(bufsize, stripe_size));
2598 + writebuf_len = (int) ADIOI_MIN(bufsize,
2599 + (writebuf_off / stripe_size + 1) *
2600 + stripe_size - writebuf_off);
2602 + /* if atomicity is true, lock the region to be accessed */
2603 + if (fd->atomicity)
2604 + ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, bufsize);
2606 + for (j = 0; j < count; j++) {
2607 + for (i = 0; i < flat_buf->count; i++) {
2608 + userbuf_off = j * buftype_extent + flat_buf->indices[i];
2610 + req_len = flat_buf->blocklens[i];
2611 + ADIOI_BUFFERED_WRITE_WITHOUT_READ
2612 + off += flat_buf->blocklens[i];
2616 + /* write the buffer out finally */
2617 + ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE,
2618 + ADIO_EXPLICIT_OFFSET, writebuf_off, &status1,
2621 + if (fd->atomicity)
2622 + ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize);
2623 + if (*error_code != MPI_SUCCESS)
2625 + ADIOI_Free(writebuf);
2626 + if (file_ptr_type == ADIO_INDIVIDUAL)
2629 + /* noncontiguous in file */
2630 + /* filetype already flattened in ADIO_Open */
2631 + flat_file = ADIOI_Flatlist;
2632 + while (flat_file->type != fd->filetype)
2633 + flat_file = flat_file->next;
2636 + if (file_ptr_type == ADIO_INDIVIDUAL) {
2637 + offset = fd->fp_ind; /* in bytes */
2642 + for (i = 0; i < flat_file->count; i++) {
2643 + if (disp + flat_file->indices[i] +
2644 + (ADIO_Offset) n_filetypes * filetype_extent +
2645 + flat_file->blocklens[i] >= offset) {
2647 + fwr_size = (int) (disp + flat_file->indices[i] +
2648 + (ADIO_Offset) n_filetypes *
2650 + flat_file->blocklens[i] -
2658 + n_etypes_in_filetype = filetype_size / etype_size;
2659 + n_filetypes = (int) (offset / n_etypes_in_filetype);
2660 + etype_in_filetype = (int) (offset % n_etypes_in_filetype);
2661 + size_in_filetype = etype_in_filetype * etype_size;
2664 + for (i = 0; i < flat_file->count; i++) {
2665 + sum += flat_file->blocklens[i];
2666 + if (sum > size_in_filetype) {
2668 + fwr_size = sum - size_in_filetype;
2669 + abs_off_in_filetype = flat_file->indices[i] +
2670 + size_in_filetype - (sum - flat_file->blocklens[i]);
2675 + /* abs. offset in bytes in the file */
2676 + offset = disp + (ADIO_Offset) n_filetypes *filetype_extent +
2677 + abs_off_in_filetype;
2680 + start_off = offset;
2682 + /* If the file bytes are actually contiguous, we do not need data sieving at all */
2683 + if (bufsize <= fwr_size) {
2684 + req_off = start_off;
2685 + req_len = bufsize;
2686 + end_offset = start_off + bufsize - 1;
2687 + writebuf = (char *) ADIOI_Malloc(ADIOI_MIN(bufsize, stripe_size));
2688 + memset(writebuf, -1, ADIOI_MIN(bufsize, stripe_size));
2692 + ADIOI_BUFFERED_WRITE_WITHOUT_READ
2694 + /* Calculate end_offset, the last byte-offset that will be accessed.
2695 + e.g., if start_offset=0 and 100 bytes are to be written, end_offset=99 */
2696 + st_fwr_size = fwr_size;
2697 + st_n_filetypes = n_filetypes;
2701 + fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
2702 + while (i < bufsize) {
2704 + end_offset = off + fwr_size - 1;
2706 + if (j < (flat_file->count - 1))
2713 + off = disp + flat_file->indices[j] +
2714 + (ADIO_Offset) n_filetypes * filetype_extent;
2715 + fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize - i);
2720 + writebuf = (char *) ADIOI_Malloc(stripe_size);
2721 + memset(writebuf, -1, stripe_size);
2722 + /* if atomicity is true, lock the region to be accessed */
2723 + if (fd->atomicity)
2724 + ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, bufsize);
2726 + if (buftype_is_contig && !filetype_is_contig) {
2727 + /* contiguous in memory, noncontiguous in file. should be the most
2732 + n_filetypes = st_n_filetypes;
2733 + fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
2734 + while (i < bufsize) {
2736 + /* TYPE_UB and TYPE_LB can result in
2737 + fwr_size = 0. save system call in such cases */
2739 + lseek(fd->fd_sys, off, SEEK_SET);
2740 + err = write(fd->fd_sys, ((char *) buf) + i, fwr_size);
2743 + req_len = fwr_size;
2745 + ADIOI_BUFFERED_WRITE
2749 + if (off + fwr_size < disp + flat_file->indices[j] +
2750 + flat_file->blocklens[j] +
2751 + (ADIO_Offset) n_filetypes * filetype_extent)
2753 + /* did not reach end of contiguous block in filetype.
2754 + no more I/O needed. off is incremented by fwr_size. */
2756 + if (j < (flat_file->count - 1))
2762 + off = disp + flat_file->indices[j] +
2763 + (ADIO_Offset) n_filetypes * filetype_extent;
2764 + fwr_size = ADIOI_MIN(flat_file->blocklens[j],
2769 + /* noncontiguous in memory as well as in file */
2770 + ADIOI_Flatten_datatype(datatype);
2771 + flat_buf = ADIOI_Flatlist;
2772 + while (flat_buf->type != datatype)
2773 + flat_buf = flat_buf->next;
2775 + k = num = buf_count = 0;
2776 + i = (int) (flat_buf->indices[0]);
2779 + n_filetypes = st_n_filetypes;
2780 + fwr_size = st_fwr_size;
2781 + bwr_size = flat_buf->blocklens[0];
2783 + while (num < bufsize) {
2784 + size = ADIOI_MIN(fwr_size, bwr_size);
2787 + lseek(fd->fd_sys, off, SEEK_SET);
2788 + err = write(fd->fd_sys, ((char *) buf) + i, size);
2793 + ADIOI_BUFFERED_WRITE
2796 + new_fwr_size = fwr_size;
2797 + new_bwr_size = bwr_size;
2799 + if (size == fwr_size) {
2800 + /* reached end of contiguous block in file */
2801 + if (j < (flat_file->count - 1)) {
2807 + off = disp + flat_file->indices[j] +
2808 + (ADIO_Offset) n_filetypes * filetype_extent;
2809 + new_fwr_size = flat_file->blocklens[j];
2810 + if (size != bwr_size) {
2812 + new_bwr_size -= size;
2815 + if (size == bwr_size) {
2816 + /* reached end of contiguous block in memory */
2817 + k = (k + 1) % flat_buf->count;
2819 + i = (int) (buftype_extent *
2820 + (buf_count / flat_buf->count) +
2821 + flat_buf->indices[k]);
2822 + new_bwr_size = flat_buf->blocklens[k];
2823 + if (size != fwr_size) {
2825 + new_fwr_size -= size;
2829 + fwr_size = new_fwr_size;
2830 + bwr_size = new_bwr_size;
2834 + /* write the buffer out finally */
2835 + if (writebuf_len) {
2836 + ADIO_WriteContig(fd, writebuf, writebuf_len,
2837 + MPI_BYTE, ADIO_EXPLICIT_OFFSET,
2838 + writebuf_off, &status1, error_code);
2839 + if (!(fd->atomicity))
2840 + ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
2841 + if (*error_code != MPI_SUCCESS)
2844 + if (fd->atomicity)
2845 + ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize);
2847 + ADIOI_Free(writebuf);
2848 + if (file_ptr_type == ADIO_INDIVIDUAL)
2851 + fd->fp_sys_posn = -1; /* set it to null. */
2853 +#ifdef HAVE_STATUS_SET_BYTES
2854 + MPIR_Status_set_bytes(status, datatype, bufsize);
2855 + /* This is a temporary way of filling in status. The right way is to
2856 + keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
2859 + if (!buftype_is_contig)
2860 + ADIOI_Delete_flattened(datatype);
2862 diff -ruN ad_lustre_orig/Makefile ad_lustre/Makefile
2863 --- ad_lustre_orig/Makefile 1970-01-01 08:00:00.000000000 +0800
2864 +++ ad_lustre/Makefile 2008-09-17 18:20:35.000000000 +0800
2869 +LIBNAME = /work/download/mpich2-1.0.7-dev/lib/libmpich.a
2870 +srcdir = /work/download/mpich2-1.0.7-dev/src/mpi/romio/adio/ad_lustre
2872 +SHLIBNAME = /work/download/mpich2-1.0.7-dev/lib/libmpich
2874 +INCLUDE_DIR = -I. -I${srcdir}/../include -I../include -I../../include -I${srcdir}/../../../../include -I../../../../include
2875 +CFLAGS = -I/work/download/mpich2-1.0.7-dev/src/mpid/ch3/include -I/work/download/mpich2-1.0.7-dev/src/mpid/ch3/include -I/work/download/mpich2-1.0.7-dev/src/mpid/common/datatype -I/work/download/mpich2-1.0.7-dev/src/mpid/common/datatype -I/work/download/mpich2-1.0.7-dev/src/mpid/ch3/channels/sock/include -I/work/download/mpich2-1.0.7-dev/src/mpid/ch3/channels/sock/include -I/work/download/mpich2-1.0.7-dev/src/mpid/common/sock -I/work/download/mpich2-1.0.7-dev/src/mpid/common/sock -I/work/download/mpich2-1.0.7-dev/src/mpid/common/sock/poll -I/work/download/mpich2-1.0.7-dev/src/mpid/common/sock/poll -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -O2 -DFORTRANDOUBLEUNDERSCORE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -DHAVE_ROMIOCONF_H -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 $(INCLUDE_DIR)
2877 +top_builddir = /work/download/mpich2-1.0.7-dev
2879 +C_COMPILE_SHL = $(CC_SHL) -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -O2 -DFORTRANDOUBLEUNDERSCORE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -DHAVE_ROMIOCONF_H -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 $(INCLUDE_DIR)
2881 +VPATH = .:${srcdir}
2883 +AD_LUSTRE_OBJECTS = ad_lustre.o ad_lustre_open.o \
2884 + ad_lustre_rwcontig.o ad_lustre_wrcoll.o ad_lustre_wrstr.o \
2885 + ad_lustre_fcntl.o ad_lustre_hints.o ad_lustre_close.o \
2886 + ad_lustre_aggregate.o
2889 +default: $(LIBNAME)
2890 + @if [ "none" != "none" ] ; then \
2891 + $(MAKE) $(SHLIBNAME).la ;\
2894 +.SUFFIXES: $(SUFFIXES) .p .lo
2897 + $(CC) $(CFLAGS) -c $<
2899 + $(C_COMPILE_SHL) -c $< -o _s$*.o
2900 + @mv -f _s$*.o $*.lo
2902 +$(LIBNAME): $(AD_LUSTRE_OBJECTS)
2903 + $(AR) $(LIBNAME) $(AD_LUSTRE_OBJECTS)
2904 + $(RANLIB) $(LIBNAME)
2906 +AD_LUSTRE_LOOBJECTS=$(AD_LUSTRE_OBJECTS:.o=.lo)
2907 +$(SHLIBNAME).la: $(AD_LUSTRE_LOOBJECTS)
2908 + $(AR) $(SHLIBNAME).la $(AD_LUSTRE_LOOBJECTS)
2911 + -@for file in ${AD_LUSTRE_OBJECTS:.o=.c} ; do \
2912 + gcov -b -f $$file ; done
2916 diff -ruN ad_lustre_orig/Makefile.in ad_lustre/Makefile.in
2917 --- ad_lustre_orig/Makefile.in 2008-09-17 14:36:57.000000000 +0800
2918 +++ ad_lustre/Makefile.in 2008-09-17 18:20:35.000000000 +0800
2922 AD_LUSTRE_OBJECTS = ad_lustre.o ad_lustre_open.o \
2923 - ad_lustre_rwcontig.o ad_lustre_hints.o
2924 + ad_lustre_rwcontig.o ad_lustre_wrcoll.o ad_lustre_wrstr.o \
2925 + ad_lustre_hints.o ad_lustre_aggregate.o
2929 @if [ "@ENABLE_SHLIB@" != "none" ] ; then \
2930 diff -ruN ad_lustre_orig/README ad_lustre/README
2931 --- ad_lustre_orig/README 2008-09-17 14:36:57.000000000 +0800
2932 +++ ad_lustre/README 2008-09-17 18:20:35.000000000 +0800
2934 o To post the code for ParColl (Partitioned collective IO)
2936 -----------------------------------------------------
2938 +-----------------------------------------------------
2939 + o Improved data redistribution
2940 + - add I/O pattern identification. If the requested I/O size is
2941 + large, collective I/O is not performed. The hint big_req_size
2942 + can be used to define what counts as a large request.
2943 + - provide hint CO for load balancing to control the number
2944 + of IO clients for each OST
2945 + - divide the IO clients into different OST groups to
2946 + produce a stripe-contiguous I/O pattern
2947 + - reduce the collective overhead by hints contiguous_data and
2948 + same_io_size to remove unnecessary MPI_Alltoall()
2949 + o Control read-modify-write in data sieving in collective IO
2950 + by hint ds_in_coll.
2952 +-----------------------------------------------------
2954 -----------------------------------------------------
2955 o Direct IO and Lockless IO support