1 diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
2 --- ad_lustre_orig/ad_lustre_aggregate.c 1970-01-01 08:00:00.000000000 +0800
3 +++ ad_lustre/ad_lustre_aggregate.c 2008-10-15 22:26:35.000000000 +0800
5 +/* -*- Mode: C; c-basic-offset:4 ; -*- */
7 + * Copyright (C) 1997 University of Chicago.
8 + * See COPYRIGHT notice in top-level directory.
10 + * Copyright (C) 2007 Oak Ridge National Laboratory
12 + * Copyright (C) 2008 Sun Microsystems, Lustre group
15 +#include "ad_lustre.h"
16 +#include "adio_extern.h"
18 +void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int ** striping_info_ptr,
19 + int mode, int nprocs,
20 + ADIO_Offset *st_offsets,
21 + ADIO_Offset *end_offsets,
22 + ADIO_Offset *min_st_offset_ptr)
24 + int *striping_info = NULL;
25 + /* get striping information:
26 + * striping_info[0]: stripe_size
27 + * striping_info[1]: stripe_count
28 + * striping_info[2]: avail_cb_nodes
30 + int stripe_size, stripe_count, CO = 1, CO_max = 1, lflag, i;
31 + int user_cb_nodes = 0, avail_cb_nodes;
32 + int nprocs_for_coll = fd->hints->cb_nodes;
33 + ADIO_Offset min_st_offset, max_end_offset;
34 + char *value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
36 + /* Get hints value */
38 + MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, &lflag);
40 + stripe_size = atoi(value);
42 + /* stripe_size and stripe_count have been validated in ADIOI_LUSTRE_Open() */
43 + MPI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, value, &lflag);
45 + stripe_count = atoi(value);
47 + /* Calculate the available number of I/O clients, that is
48 + * avail_cb_nodes=min(cb_nodes, stripe_count*CO), where
52 + /* for collective read,
53 + * if "CO" clients access the same OST simultaneously,
54 + * the OST disk seek time would be high. So, to avoid this,
55 + * it might be better if 1 client only accesses 1 OST.
56 + * So, we set CO = 1 to meet the above requirement.
59 + /* XXX: maybe there is a better way for collective read */
61 + /* CO_max: the largest number of IO clients for each ost group */
62 + CO_max = (nprocs_for_coll - 1)/ stripe_count + 1;
63 + /* CO also has been validated in ADIOI_LUSTRE_Open(), >0 */
64 + MPI_Info_get(fd->info, "CO", MPI_MAX_INFO_VAL, value, &lflag);
67 + CO = ADIOI_MIN(CO_max, CO);
69 + avail_cb_nodes = ADIOI_MIN(nprocs_for_coll, stripe_count * CO);
72 + MPI_Info_get(fd->info, "user_cb_nodes", MPI_MAX_INFO_VAL, value, &lflag);
74 + user_cb_nodes = atoi(value);
75 + /* If the user doesn't change the cb_nodes and
76 + * the whole file access portion is no larger than stripe size,
77 + * we will perform the IO by the same process (rank0 by default).
79 + /* calculate the whole file access portion */
80 + min_st_offset = st_offsets[0];
81 + max_end_offset = end_offsets[0];
82 + for (i = 0; i < nprocs; i ++) {
83 + min_st_offset = ADIOI_MIN(min_st_offset, st_offsets[i]);
84 + max_end_offset = ADIOI_MAX(max_end_offset, end_offsets[i]);
86 + if (!user_cb_nodes) {
87 + /* Check the whole file access portion
88 + * if (whole_range <= stripe_size)
89 + * then always collect data to the same process;
90 + * set avail_cb_nodes=1; (rank0 by default).
91 + * This pattern can make good use of Lustre client cache and
92 + * avoid extent lock assigning and revoking.
94 + * The recent experiments show good performance. We still need more
97 + if ((max_end_offset > min_st_offset) &&
98 + (max_end_offset - min_st_offset) <= (ADIO_Offset) stripe_size)
104 + *striping_info_ptr = (int *) ADIOI_Malloc(3 * sizeof(int));
105 + striping_info = *striping_info_ptr;
106 + striping_info[0] = stripe_size;
107 + striping_info[1] = stripe_count;
108 + striping_info[2] = avail_cb_nodes;
110 + *min_st_offset_ptr = min_st_offset;
113 +int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off,
114 + ADIO_Offset *len, int *striping_info)
116 + int rank_index, rank;
117 + ADIO_Offset avail_bytes;
118 + int stripe_size = striping_info[0];
119 + int avail_cb_nodes = striping_info[2];
121 + /* Produce the stripe-contiguous pattern for Lustre */
122 + rank_index = (int)((off / stripe_size) % avail_cb_nodes);
124 + avail_bytes = (off / (ADIO_Offset)stripe_size + 1) *
125 + (ADIO_Offset)stripe_size - off;
126 + if (avail_bytes < *len) {
127 + /* this proc only has part of the requested contig. region */
128 + *len = avail_bytes;
130 + /* map our index to a rank */
131 + /* NOTE: FOR NOW WE DON'T HAVE A MAPPING...JUST DO 0..NPROCS_FOR_COLL */
132 + rank = fd->hints->ranklist[rank_index];
137 +void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list,
138 + int *len_list, int contig_access_count,
139 + int *striping_info, int nprocs,
140 + int *count_my_req_procs_ptr,
141 + int **count_my_req_per_proc_ptr,
142 + ADIOI_Access ** my_req_ptr,
145 + /* Nothing different from ADIOI_Calc_my_req(), except calling
146 + * ADIOI_LUSTRE_Calc_aggregator() instead of the old one */
147 + int *count_my_req_per_proc, count_my_req_procs, *buf_idx;
149 + ADIO_Offset avail_len, rem_len, curr_idx, off;
150 + ADIOI_Access *my_req;
152 + *count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
153 + count_my_req_per_proc = *count_my_req_per_proc_ptr;
155 + /* buf_idx is relevant only if buftype_is_contig.
156 + * buf_idx[i] gives the index into user_buf where data received
157 + * from proc. i should be placed. This allows receives to be done
158 + * without extra buffer. This can't be done if buftype is not contig.
160 + buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));
161 + /* initialize buf_idx to -1 */
162 + for (i = 0; i < nprocs; i++)
165 + /* one pass just to calculate how much space to allocate for my_req;
166 + * contig_access_count was calculated way back in ADIOI_Calc_my_off_len()
168 + for (i = 0; i < contig_access_count; i++) {
169 + /* short circuit offset/len processing if len == 0
170 + * (zero-byte read/write
172 + if (len_list[i] == 0)
174 + off = offset_list[i];
175 + avail_len = len_list[i];
176 + /* we set avail_len to be the total size of the access.
177 + * then ADIOI_LUSTRE_Calc_aggregator() will modify the value to return
178 + * the amount that was available.
180 + proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info);
181 + count_my_req_per_proc[proc]++;
182 + /* figure out how much data is remaining in the access
183 + * we'll take care of this data (if there is any)
184 + * in the while loop below.
186 + rem_len = len_list[i] - avail_len;
188 + while (rem_len != 0) {
189 + off += avail_len; /* point to first remaining byte */
190 + avail_len = rem_len; /* save remaining size, pass to calc */
191 + proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info);
192 + count_my_req_per_proc[proc]++;
193 + rem_len -= avail_len; /* reduce remaining length by amount from fd */
197 + *my_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs * sizeof(ADIOI_Access));
198 + my_req = *my_req_ptr;
200 + count_my_req_procs = 0;
201 + for (i = 0; i < nprocs; i++) {
202 + if (count_my_req_per_proc[i]) {
203 + my_req[i].offsets = (ADIO_Offset *)
204 + ADIOI_Malloc(count_my_req_per_proc[i] *
205 + sizeof(ADIO_Offset));
206 + my_req[i].lens = (int *) ADIOI_Malloc(count_my_req_per_proc[i] *
208 + count_my_req_procs++;
210 + my_req[i].count = 0; /* will be incremented where needed later */
213 + /* now fill in my_req */
215 + for (i = 0; i < contig_access_count; i++) {
216 + if (len_list[i] == 0)
218 + off = offset_list[i];
219 + avail_len = len_list[i];
220 + proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info);
222 + /* for each separate contiguous access from this process */
223 + if (buf_idx[proc] == -1)
224 + buf_idx[proc] = (int) curr_idx;
226 + l = my_req[proc].count;
227 + curr_idx += (int) avail_len; /* NOTE: Why is curr_idx an int? Fix? */
229 + rem_len = len_list[i] - avail_len;
231 + /* store the proc, offset, and len information in an array
232 + * of structures, my_req. Each structure contains the
233 + * offsets and lengths located in that process's FD,
234 + * and the associated count.
236 + my_req[proc].offsets[l] = off;
237 + my_req[proc].lens[l] = (int) avail_len;
238 + my_req[proc].count++;
240 + while (rem_len != 0) {
242 + avail_len = rem_len;
243 + proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len,
245 + if (buf_idx[proc] == -1)
246 + buf_idx[proc] = (int) curr_idx;
248 + l = my_req[proc].count;
249 + curr_idx += avail_len;
250 + rem_len -= avail_len;
252 + my_req[proc].offsets[l] = off;
253 + my_req[proc].lens[l] = (int) avail_len;
254 + my_req[proc].count++;
259 + for (i = 0; i < nprocs; i++) {
260 + if (count_my_req_per_proc[i] > 0) {
261 + FPRINTF(stdout, "data needed from %d (count = %d):\n",
262 + i, my_req[i].count);
263 + for (l = 0; l < my_req[i].count; l++) {
264 + FPRINTF(stdout, " off[%d] = %lld, len[%d] = %d\n",
265 + l, my_req[i].offsets[l], l, my_req[i].lens[l]);
271 + for (i = 0; i < nprocs; i++) {
272 + FPRINTF(stdout, "buf_idx[%d] = 0x%x\n", i, buf_idx[i]);
276 + *count_my_req_procs_ptr = count_my_req_procs;
277 + *buf_idx_ptr = buf_idx;
280 +int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count,
281 + int *len_list, int nprocs)
283 + /* If the processes are non-interleaved, we will check the req_size.
284 + * if (avg_req_size > big_req_size) {
289 + int i, docollect = 1, lflag, big_req_size = 0;
290 + ADIO_Offset req_size = 0, total_req_size;
291 + int avg_req_size, total_access_count;
292 + char *value = NULL;
294 + /* calculate total_req_size and total_access_count */
295 + for (i = 0; i < contig_access_count; i++)
296 + req_size += len_list[i];
297 + MPI_Allreduce(&req_size, &total_req_size, 1, MPI_LONG_LONG_INT, MPI_SUM,
299 + MPI_Allreduce(&contig_access_count, &total_access_count, 1, MPI_INT, MPI_SUM,
301 + /* estimate average req_size */
302 + avg_req_size = (int)(total_req_size / total_access_count);
304 + /* get hint of big_req_size */
305 + value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
306 + MPI_Info_get(fd->info, "big_req_size", MPI_MAX_INFO_VAL, value, &lflag);
308 + big_req_size = atoi(value);
309 + /* Don't perform collective I/O if there are big requests */
310 + if ((big_req_size > 0) && (avg_req_size > big_req_size))
318 +void ADIOI_LUSTRE_Calc_others_req(ADIO_File fd, int count_my_req_procs,
319 + int *count_my_req_per_proc,
320 + ADIOI_Access * my_req,
321 + int nprocs, int myrank,
322 + ADIO_Offset req_len,
323 + ADIO_Offset min_st_offset,
324 + int *striping_info,
325 + int *count_others_req_procs_ptr,
326 + ADIOI_Access ** others_req_ptr)
328 + /* what requests of other processes will be written by this process */
330 + int *count_others_req_per_proc, count_others_req_procs, proc;
331 + int i, j, lflag, samesize = 0, contiguous = 0;
332 + int avail_cb_nodes = striping_info[2];
333 + MPI_Request *send_requests, *recv_requests;
334 + MPI_Status *statuses;
335 + ADIOI_Access *others_req;
336 + char *value = NULL;
337 + ADIO_Offset off, avail_len, rem_len, *all_lens;
339 + /* There are two hints, which could reduce some MPI communication overhead,
340 + * if the user knows the I/O pattern and sets them correctly. */
342 + * contiguous_data: if the data are contiguous,
343 + * we don't need to do MPI_Alltoall().
344 + * same_io_size: And if the data req size is the same,
345 + * we can calculate the offset directly
347 + value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
348 + /* hint of contiguous data */
349 + MPI_Info_get(fd->info, "contiguous_data", MPI_MAX_INFO_VAL, value, &lflag);
350 + if (lflag && !strcmp(value, "yes"))
352 + /* hint of same io size */
353 + MPI_Info_get(fd->info, "same_io_size", MPI_MAX_INFO_VAL, value, &lflag);
354 + if (lflag && !strcmp(value, "yes"))
358 + *others_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs *
359 + sizeof(ADIOI_Access));
360 + others_req = *others_req_ptr;
362 + /* if the data are contiguous, we can calculate the offset and length
363 + * of the other requests simply, instead of MPI_Alltoall() */
365 + for (i = 0; i < nprocs; i++) {
366 + others_req[i].count = 0;
368 + all_lens = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
370 + /* same req size ? */
371 + if (samesize == 0) {
372 + /* exchange request length */
373 + MPI_Allgather(&req_len, 1, ADIO_OFFSET, all_lens, 1, ADIO_OFFSET,
375 + } else { /* same request size */
376 + /* assign request length to all_lens[] */
377 + for (i = 0; i < nprocs; i ++)
378 + all_lens[i] = req_len;
380 + if (myrank < avail_cb_nodes) {
381 + /* It's an IO client and it will receive data from others */
382 + off = min_st_offset;
383 + /* calculate others_req[i].count */
384 + for (i = 0; i < nprocs; i++) {
385 + avail_len = all_lens[i];
386 + rem_len = avail_len;
387 + while (rem_len > 0) {
388 + proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len,
390 + if (proc == myrank) {
391 + others_req[i].count ++;
394 + rem_len -= avail_len;
395 + avail_len = rem_len;
398 + /* calculate offset and len for each request */
399 + off = min_st_offset;
400 + for (i = 0; i < nprocs; i++) {
401 + if (others_req[i].count) {
402 + others_req[i].offsets = (ADIO_Offset *)
403 + ADIOI_Malloc(others_req[i].count *
404 + sizeof(ADIO_Offset));
405 + others_req[i].lens = (int *)
406 + ADIOI_Malloc(others_req[i].count *
408 + others_req[i].mem_ptrs = (MPI_Aint *)
409 + ADIOI_Malloc(others_req[i].count *
413 + avail_len = all_lens[i];
414 + rem_len = avail_len;
415 + while (rem_len > 0) {
416 + proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len,
418 + if (proc == myrank) {
419 + others_req[i].offsets[j] = off;
420 + others_req[i].lens[j] = (int)avail_len;
424 + rem_len -= avail_len;
425 + avail_len = rem_len;
429 + ADIOI_Free(all_lens);
431 + /* multiple non-contiguous requests */
432 + /* first find out how much to send/recv and from/to whom */
435 + * count_others_req_procs:
436 + * number of processes whose requests will be written by
437 + * this process (including this process itself)
438 + * count_others_req_per_proc[i]:
439 + * how many separate contiguous requests of proc[i] will be
440 + * written by this process.
443 + count_others_req_per_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
445 + MPI_Alltoall(count_my_req_per_proc, 1, MPI_INT,
446 + count_others_req_per_proc, 1, MPI_INT, fd->comm);
448 + count_others_req_procs = 0;
449 + for (i = 0; i < nprocs; i++) {
450 + if (count_others_req_per_proc[i]) {
451 + others_req[i].count = count_others_req_per_proc[i];
452 + others_req[i].offsets = (ADIO_Offset *)
453 + ADIOI_Malloc(others_req[i].count *
454 + sizeof(ADIO_Offset));
455 + others_req[i].lens = (int *)
456 + ADIOI_Malloc(others_req[i].count *
458 + others_req[i].mem_ptrs = (MPI_Aint *)
459 + ADIOI_Malloc(others_req[i].count *
461 + count_others_req_procs++;
463 + others_req[i].count = 0;
466 + /* now send the calculated offsets and lengths to respective processes */
468 + send_requests = (MPI_Request *) ADIOI_Malloc(2 * (count_my_req_procs + 1) *
469 + sizeof(MPI_Request));
470 + recv_requests = (MPI_Request *) ADIOI_Malloc(2 * (count_others_req_procs+1)*
471 + sizeof(MPI_Request));
472 + /* +1 to avoid a 0-size malloc */
475 + for (i = 0; i < nprocs; i++) {
476 + if (others_req[i].count) {
477 + MPI_Irecv(others_req[i].offsets, others_req[i].count,
478 + ADIO_OFFSET, i, i + myrank, fd->comm,
479 + &recv_requests[j]);
481 + MPI_Irecv(others_req[i].lens, others_req[i].count,
482 + MPI_INT, i, i + myrank + 1, fd->comm,
483 + &recv_requests[j]);
489 + for (i = 0; i < nprocs; i++) {
490 + if (my_req[i].count) {
491 + MPI_Isend(my_req[i].offsets, my_req[i].count,
492 + ADIO_OFFSET, i, i + myrank, fd->comm,
493 + &send_requests[j]);
495 + MPI_Isend(my_req[i].lens, my_req[i].count,
496 + MPI_INT, i, i + myrank + 1, fd->comm,
497 + &send_requests[j]);
502 + statuses = (MPI_Status *)
503 + ADIOI_Malloc((1 + 2 * ADIOI_MAX(count_my_req_procs,
504 + count_others_req_procs)) *
505 + sizeof(MPI_Status));
506 + /* +1 to avoid a 0-size malloc */
508 + MPI_Waitall(2 * count_my_req_procs, send_requests, statuses);
509 + MPI_Waitall(2 * count_others_req_procs, recv_requests, statuses);
511 + ADIOI_Free(send_requests);
512 + ADIOI_Free(recv_requests);
513 + ADIOI_Free(statuses);
514 + ADIOI_Free(count_others_req_per_proc);
516 + *count_others_req_procs_ptr = count_others_req_procs;
519 diff -ruN ad_lustre_orig/ad_lustre.c ad_lustre/ad_lustre.c
520 --- ad_lustre_orig/ad_lustre.c 2008-09-17 14:36:57.000000000 +0800
521 +++ ad_lustre/ad_lustre.c 2008-09-17 18:20:35.000000000 +0800
523 /* -*- Mode: C; c-basic-offset:4 ; -*- */
525 - * Copyright (C) 2001 University of Chicago.
527 + * Copyright (C) 2001 University of Chicago.
528 * See COPYRIGHT notice in top-level directory.
530 * Copyright (C) 2007 Oak Ridge National Laboratory
532 + * Copyright (C) 2008 Sun Microsystems, Lustre group
535 #include "ad_lustre.h"
537 ADIOI_LUSTRE_ReadContig, /* ReadContig */
538 ADIOI_LUSTRE_WriteContig, /* WriteContig */
539 ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
540 - ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */
541 + ADIOI_LUSTRE_WriteStridedColl, /* WriteStridedColl */
542 ADIOI_GEN_SeekIndividual, /* SeekIndividual */
543 - ADIOI_GEN_Fcntl, /* Fcntl */
544 + ADIOI_LUSTRE_Fcntl, /* Fcntl */
545 ADIOI_LUSTRE_SetInfo, /* SetInfo */
546 ADIOI_GEN_ReadStrided, /* ReadStrided */
547 - ADIOI_GEN_WriteStrided, /* WriteStrided */
548 - ADIOI_GEN_Close, /* Close */
549 + ADIOI_LUSTRE_WriteStrided, /* WriteStrided */
550 + ADIOI_LUSTRE_Close, /* Close */
551 #if defined(ROMIO_HAVE_WORKING_AIO) && !defined(CRAY_XT_LUSTRE)
552 ADIOI_GEN_IreadContig, /* IreadContig */
553 ADIOI_GEN_IwriteContig, /* IwriteContig */
554 diff -ruN ad_lustre_orig/ad_lustre_close.c ad_lustre/ad_lustre_close.c
555 --- ad_lustre_orig/ad_lustre_close.c 1970-01-01 08:00:00.000000000 +0800
556 +++ ad_lustre/ad_lustre_close.c 2008-09-17 18:20:35.000000000 +0800
558 +/* -*- Mode: C; c-basic-offset:4 ; -*- */
561 + * Copyright (C) 1997 University of Chicago.
562 + * See COPYRIGHT notice in top-level directory.
564 + * Copyright (C) 2007 Oak Ridge National Laboratory
566 + * Copyright (C) 2008 Sun Microsystems, Lustre group
569 +#include "ad_lustre.h"
575 +void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code)
578 + static char myname[] = "ADIOI_LUSTRE_CLOSE";
581 + MPE_Log_event(9, 0, "start close");
584 + err = close(fd->fd_sys);
587 + MPE_Log_event(10, 0, "end close");
592 + if (err == -1 || derr == -1) {
594 + MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname,
595 + __LINE__, MPI_ERR_IO, "**io", "**io %s",
598 + *error_code = MPI_SUCCESS;
600 diff -ruN ad_lustre_orig/ad_lustre.h ad_lustre/ad_lustre.h
601 --- ad_lustre_orig/ad_lustre.h 2008-09-17 14:36:57.000000000 +0800
602 +++ ad_lustre/ad_lustre.h 2008-10-15 21:22:52.000000000 +0800
604 /* -*- Mode: C; c-basic-offset:4 ; -*- */
606 - * Copyright (C) 1997 University of Chicago.
608 + * Copyright (C) 1997 University of Chicago.
609 * See COPYRIGHT notice in top-level directory.
611 * Copyright (C) 2007 Oak Ridge National Laboratory
613 + * Copyright (C) 2008 Sun Microsystems, Lustre group
616 #ifndef AD_UNIX_INCLUDE
619 /*#include <fcntl.h>*/
620 #include <sys/ioctl.h>
622 #include "lustre/lustre_user.h"
624 +/* copy something from lustre_user.h here */
625 +# define LOV_USER_MAGIC 0x0BD10BD0
626 +# define LL_IOC_LOV_SETSTRIPE _IOW ('f', 154, long)
627 +# define LL_IOC_LOV_GETSTRIPE _IOW ('f', 155, long)
628 +# define lov_user_ost_data lov_user_ost_data_v1
629 +struct lov_user_ost_data_v1 { /* per-stripe data structure */
630 + __u64 l_object_id; /* OST object ID */
631 + __u64 l_object_gr; /* OST object group (creating MDS number) */
632 + __u32 l_ost_gen; /* generation of this OST index */
633 + __u32 l_ost_idx; /* OST index in LOV */
634 +} __attribute__((packed));
635 +#define lov_user_md lov_user_md_v1
636 +struct lov_user_md_v1 { /* LOV EA user data (host-endian) */
637 + __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V1 */
638 + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
639 + __u64 lmm_object_id; /* LOV object ID */
640 + __u64 lmm_object_gr; /* LOV object group */
641 + __u32 lmm_stripe_size; /* size of stripe in bytes */
642 + __u16 lmm_stripe_count; /* num stripes in use for this object */
643 + __u16 lmm_stripe_offset; /* starting stripe offset in lmm_objects */
644 + struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */
645 +} __attribute__((packed));
648 /*#include "adioi.h"*/
652 void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code);
653 void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code);
654 -void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count,
655 - MPI_Datatype datatype, int file_ptr_type,
656 - ADIO_Offset offset, ADIO_Status *status, int
658 -void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count,
659 - MPI_Datatype datatype, int file_ptr_type,
660 - ADIO_Offset offset, ADIO_Status *status, int
662 +void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count,
663 + MPI_Datatype datatype, int file_ptr_type,
664 + ADIO_Offset offset, ADIO_Status *status,
666 +void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count,
667 + MPI_Datatype datatype, int file_ptr_type,
668 + ADIO_Offset offset, ADIO_Status *status,
670 +void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count,
671 + MPI_Datatype datatype, int file_ptr_type,
672 + ADIO_Offset offset, ADIO_Status *status,
674 void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count,
675 - MPI_Datatype datatype, int file_ptr_type,
676 - ADIO_Offset offset, ADIO_Status *status, int
678 + MPI_Datatype datatype, int file_ptr_type,
679 + ADIO_Offset offset, ADIO_Status *status,
681 void ADIOI_LUSTRE_ReadStridedColl(ADIO_File fd, void *buf, int count,
682 - MPI_Datatype datatype, int file_ptr_type,
683 - ADIO_Offset offset, ADIO_Status *status, int
685 + MPI_Datatype datatype, int file_ptr_type,
686 + ADIO_Offset offset, ADIO_Status *status,
688 +void ADIOI_LUSTRE_ReadStrided(ADIO_File fd, void *buf, int count,
689 + MPI_Datatype datatype, int file_ptr_type,
690 + ADIO_Offset offset, ADIO_Status *status,
692 void ADIOI_LUSTRE_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct,
694 void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
696 +void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int ** striping_info_ptr,
697 + int mode, int nprocs,
698 + ADIO_Offset *st_offsets,
699 + ADIO_Offset *end_offsets,
700 + ADIO_Offset *min_st_offset);
701 +int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off,
702 + ADIO_Offset *len, int *striping_info);
703 +void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list,
704 + int *len_list, int contig_access_count,
705 + int *striping_info, int nprocs,
706 + int *count_my_req_procs_ptr,
707 + int **count_my_req_per_proc_ptr,
708 + ADIOI_Access ** my_req_ptr,
709 + int **buf_idx_ptr);
710 +int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count,
711 + int *len_list, int nprocs);
712 +void ADIOI_LUSTRE_Calc_others_req(ADIO_File fd, int count_my_req_procs,
713 + int *count_my_req_per_proc,
714 + ADIOI_Access * my_req,
715 + int nprocs, int myrank,
716 + ADIO_Offset req_len,
717 + ADIO_Offset min_st_offset,
718 + int *striping_info,
719 + int *count_others_req_procs_ptr,
720 + ADIOI_Access ** others_req_ptr);
721 #endif /* End of AD_UNIX_INCLUDE */
722 diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c
723 --- ad_lustre_orig/ad_lustre_hints.c 2008-09-17 14:36:57.000000000 +0800
724 +++ ad_lustre/ad_lustre_hints.c 2008-10-15 21:31:00.000000000 +0800
726 /* -*- Mode: C; c-basic-offset:4 ; -*- */
728 - * Copyright (C) 1997 University of Chicago.
730 + * Copyright (C) 1997 University of Chicago.
731 * See COPYRIGHT notice in top-level directory.
733 * Copyright (C) 2007 Oak Ridge National Laboratory
735 + * Copyright (C) 2008 Sun Microsystems, Lustre group
738 #include "ad_lustre.h"
739 @@ -11,130 +13,189 @@
741 void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
743 - char *value, *value_in_fd;
744 - int flag, tmp_val[3], str_factor=-1, str_unit=0, start_iodev=-1;
745 - struct lov_user_md lum = { 0 };
746 - int err, myrank, fd_sys, perm, amode, old_mask;
747 + char *value = NULL;
748 + int flag, tmp_val, int_val, str_factor, str_unit, start_iodev;
749 + static char myname[] = "ADIOI_LUSTRE_SETINFO";
751 + *error_code = MPI_SUCCESS;
752 value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
754 if ( (fd->info) == MPI_INFO_NULL) {
755 - /* This must be part of the open call. can set striping parameters
757 + /* This must be part of the open call. can set striping parameters
759 MPI_Info_create(&(fd->info));
761 MPI_Info_set(fd->info, "direct_read", "false");
762 MPI_Info_set(fd->info, "direct_write", "false");
763 fd->direct_read = fd->direct_write = 0;
765 - /* has user specified striping or server buffering parameters
767 + /* has user specified striping or server buffering parameters
768 and do they have the same value on all processes? */
769 if (users_info != MPI_INFO_NULL) {
770 - MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
773 - str_unit=atoi(value);
775 - MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
778 - str_factor=atoi(value);
780 - MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL,
781 + /* direct read and write */
782 + MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL,
785 - start_iodev=atoi(value);
787 - MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL,
789 if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) {
790 MPI_Info_set(fd->info, "direct_read", "true");
794 - MPI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL,
795 + MPI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL,
797 if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) {
798 MPI_Info_set(fd->info, "direct_write", "true");
799 fd->direct_write = 1;
802 + MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
804 + if (flag && (str_unit = atoi(value))) {
805 + tmp_val = str_unit;
806 + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
807 + if (tmp_val != str_unit) {
808 + MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
814 + MPI_Info_set(fd->info, "striping_unit", value);
817 + MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
819 + if (flag && (str_factor = atoi(value))) {
820 + tmp_val = str_factor;
821 + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
822 + if (tmp_val != str_factor) {
823 + MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
829 + MPI_Info_set(fd->info, "striping_factor", value);
831 + /* stripe offset */
832 + MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL,
834 + if (flag && ((start_iodev = atoi(value)) >= 0)) {
835 + tmp_val = start_iodev;
836 + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
837 + if (tmp_val != start_iodev) {
838 + MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
844 + MPI_Info_set(fd->info, "start_iodevice", value);
848 - MPI_Comm_rank(fd->comm, &myrank);
850 - tmp_val[0] = str_factor;
851 - tmp_val[1] = str_unit;
852 - tmp_val[2] = start_iodev;
854 + if (users_info != MPI_INFO_NULL) {
855 + /* CO: IO Clients/OST,
856 + * to keep the load balancing between clients and OSTs */
857 + MPI_Info_get(users_info, "CO", MPI_MAX_INFO_VAL, value,
859 + if (flag && (int_val = atoi(value)) > 0) {
861 + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
862 + if (tmp_val != int_val) {
863 + MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
869 + MPI_Info_set(fd->info, "CO", value);
871 - MPI_Bcast(tmp_val, 3, MPI_INT, 0, fd->comm);
873 - if (tmp_val[0] != str_factor
874 - || tmp_val[1] != str_unit
875 - || tmp_val[2] != start_iodev) {
876 - FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: All keys"
877 - "-striping_factor:striping_unit:start_iodevice "
878 - "need to be identical across all processes\n");
879 - MPI_Abort(MPI_COMM_WORLD, 1);
880 - } else if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0)) {
881 - /* if user has specified striping info, process 0 tries to set it */
883 - if (fd->perm == ADIO_PERM_NULL) {
884 - old_mask = umask(022);
886 - perm = old_mask ^ 0666;
888 - else perm = fd->perm;
891 - if (fd->access_mode & ADIO_CREATE)
892 - amode = amode | O_CREAT;
893 - if (fd->access_mode & ADIO_RDONLY)
894 - amode = amode | O_RDONLY;
895 - if (fd->access_mode & ADIO_WRONLY)
896 - amode = amode | O_WRONLY;
897 - if (fd->access_mode & ADIO_RDWR)
898 - amode = amode | O_RDWR;
899 - if (fd->access_mode & ADIO_EXCL)
900 - amode = amode | O_EXCL;
902 - /* we need to create file so ensure this is set */
903 - amode = amode | O_LOV_DELAY_CREATE | O_CREAT;
905 - fd_sys = open(fd->filename, amode, perm);
906 - if (fd_sys == -1) {
907 - if (errno != EEXIST)
909 - "Failure to open file %s %d %d\n",strerror(errno), amode, perm);
911 - lum.lmm_magic = LOV_USER_MAGIC;
912 - lum.lmm_pattern = 0;
913 - lum.lmm_stripe_size = str_unit;
914 - lum.lmm_stripe_count = str_factor;
915 - lum.lmm_stripe_offset = start_iodev;
917 - err = ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum);
918 - if (err == -1 && errno != EEXIST) {
919 - fprintf(stderr, "Failure to set stripe info %s \n", strerror(errno));
923 - } /* End of striping parameters validation */
925 + * if the req size is bigger than this,
926 + * collective IO may not be performed.
928 + MPI_Info_get(users_info, "big_req_size", MPI_MAX_INFO_VAL, value,
930 + if (flag && (int_val = atoi(value)) > 0) {
932 + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
933 + if (tmp_val != int_val) {
934 + MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
940 + MPI_Info_set(fd->info, "big_req_size", value);
942 + /* ds_in_coll: disable data sieving in collective IO */
943 + MPI_Info_get(users_info, "ds_in_coll", MPI_MAX_INFO_VAL,
945 + if (flag && (!strcmp(value, "enable") ||
946 + !strcmp(value, "ENABLE"))) {
947 + tmp_val = int_val = 1;
948 + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
949 + if (tmp_val != int_val) {
950 + MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
956 + MPI_Info_set(fd->info, "ds_in_coll", "enable");
958 + /* contiguous_data: whether the data are contiguous */
959 + MPI_Info_get(users_info, "contiguous_data", MPI_MAX_INFO_VAL,
961 + if (flag && (!strcmp(value, "yes") ||
962 + !strcmp(value, "YES"))) {
963 + tmp_val = int_val = 1;
964 + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
965 + if (tmp_val != int_val) {
966 + MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
972 + MPI_Info_set(fd->info, "contiguous_data", "yes");
975 - MPI_Barrier(fd->comm);
976 - /* set the values for collective I/O and data sieving parameters */
977 - ADIOI_GEN_SetInfo(fd, users_info, error_code);
979 - /* The file has been opened previously and fd->fd_sys is a valid
980 - file descriptor. cannot set striping parameters now. */
982 - /* set the values for collective I/O and data sieving parameters */
983 - ADIOI_GEN_SetInfo(fd, users_info, error_code);
984 + /* same_io_size: whether the req size is the same */
985 + MPI_Info_get(users_info, "same_io_size", MPI_MAX_INFO_VAL,
987 + if (flag && (!strcmp(value, "yes") ||
988 + !strcmp(value, "YES"))) {
989 + tmp_val = int_val = 1;
990 + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
991 + if (tmp_val != int_val) {
992 + MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
998 + MPI_Info_set(fd->info, "same_io_size", "yes");
1000 + /* Remember the current cb_nodes that the user set.
1001 + * It would be used to improve collective I/O.
1003 + MPI_Info_get(users_info, "cb_nodes", MPI_MAX_INFO_VAL, value, &flag);
1004 + if (flag && (int_val = atoi(value)) > 0) {
1005 + tmp_val = int_val;
1006 + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
1007 + if (tmp_val != int_val) {
1008 + MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
1011 + ADIOI_Free(value);
1014 + MPI_Info_set(fd->info, "user_cb_nodes", value);
1018 - if (ADIOI_Direct_read) fd->direct_read = 1;
1019 - if (ADIOI_Direct_write) fd->direct_write = 1;
1022 + /* set the values for collective I/O and data sieving parameters */
1023 + ADIOI_GEN_SetInfo(fd, users_info, error_code);
1025 - *error_code = MPI_SUCCESS;
1026 + if (ADIOI_Direct_read) fd->direct_read = 1;
1027 + if (ADIOI_Direct_write) fd->direct_write = 1;
1029 diff -ruN ad_lustre_orig/ad_lustre_open.c ad_lustre/ad_lustre_open.c
1030 --- ad_lustre_orig/ad_lustre_open.c 2008-09-17 14:36:57.000000000 +0800
1031 +++ ad_lustre/ad_lustre_open.c 2008-09-17 18:55:50.000000000 +0800
1033 /* -*- Mode: C; c-basic-offset:4 ; -*- */
1035 - * Copyright (C) 1997 University of Chicago.
1037 + * Copyright (C) 1997 University of Chicago.
1038 * See COPYRIGHT notice in top-level directory.
1040 * Copyright (C) 2007 Oak Ridge National Laboratory
1042 + * Copyright (C) 2008 Sun Microsystems, Lustre group
1045 #include "ad_lustre.h"
1047 void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code)
1049 - int perm, old_mask, amode, amode_direct;
1050 + int perm, old_mask, amode = 0, amode_direct = 0, flag = 0, err, myrank;
1051 + int stripe_size = 0, stripe_count = 0, stripe_offset = -1;
1052 struct lov_user_md lum = { 0 };
1054 + char *value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
1056 #if defined(MPICH2) || !defined(PRINT_ERR_MSG)
1057 static char myname[] = "ADIOI_LUSTRE_OPEN";
1059 old_mask = umask(022);
1061 perm = old_mask ^ 0666;
1063 - else perm = fd->perm;
1068 - if (fd->access_mode & ADIO_CREATE)
1069 + if (fd->access_mode & ADIO_CREATE) {
1070 amode = amode | O_CREAT;
1071 + /* Check the striping info:
1072 + * if it was already set by SetInfo(), copy it into lum; otherwise it is set from lum
1074 + MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value,
1077 + stripe_size = atoi(value);
1079 + MPI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, value,
1082 + stripe_count = atoi(value);
1084 + MPI_Info_get(fd->info, "start_iodevice", MPI_MAX_INFO_VAL, value,
1087 + stripe_offset = atoi(value);
1089 + /* if user has specified striping info,
1090 + * process 0 will try to check and set it.
1092 + if ((stripe_size > 0) || (stripe_count > 0) || (stripe_offset >= 0)) {
1093 + MPI_Comm_rank(fd->comm, &myrank);
1094 + if (myrank == 0) {
1095 + int fd_sys = open(fd->filename, amode, perm);
1096 + if (fd_sys == -1) {
1097 + if (errno != EEXIST)
1098 + FPRINTF(stderr, "Failure to open file %s %d %d\n",
1099 + strerror(errno), amode, perm);
1101 + lum.lmm_magic = LOV_USER_MAGIC;
1102 + lum.lmm_pattern = 1;
1103 + lum.lmm_stripe_size = stripe_size;
1104 + lum.lmm_stripe_count = stripe_count;
1105 + lum.lmm_stripe_offset = stripe_offset;
1107 + if (ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum))
1109 + "Failure to set striping info to Lustre!\n");
1113 + MPI_Barrier(fd->comm);
1117 if (fd->access_mode & ADIO_RDONLY)
1118 amode = amode | O_RDONLY;
1119 if (fd->access_mode & ADIO_WRONLY)
1121 fd->fd_sys = open(fd->filename, amode|O_CREAT, perm);
1123 if (fd->fd_sys != -1) {
1126 - value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
1128 /* get file striping information and set it in info */
1129 - lum.lmm_magic = LOV_USER_MAGIC;
1130 - err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum);
1133 - sprintf(value, "%d", lum.lmm_stripe_size);
1134 - MPI_Info_set(fd->info, "striping_unit", value);
1136 - sprintf(value, "%d", lum.lmm_stripe_count);
1137 - MPI_Info_set(fd->info, "striping_factor", value);
1139 - sprintf(value, "%d", lum.lmm_stripe_offset);
1140 - MPI_Info_set(fd->info, "start_iodevice", value);
1142 - ADIOI_Free(value);
1143 + lum.lmm_magic = LOV_USER_MAGIC;
1144 + err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum);
1147 + if (lum.lmm_stripe_size && lum.lmm_stripe_count &&
1148 + (lum.lmm_stripe_offset >= 0)) {
1149 + sprintf(value, "%d", lum.lmm_stripe_size);
1150 + MPI_Info_set(fd->info, "striping_unit", value);
1152 + sprintf(value, "%d", lum.lmm_stripe_count);
1153 + MPI_Info_set(fd->info, "striping_factor", value);
1155 + sprintf(value, "%d", lum.lmm_stripe_offset);
1156 + MPI_Info_set(fd->info, "start_iodevice", value);
1158 + FPRINTF(stderr, "Striping info is invalid!\n");
1159 + ADIOI_Free(value);
1160 + MPI_Abort(MPI_COMM_WORLD, 1);
1163 + FPRINTF(stderr, "Failed to get striping info from Lustre!\n");
1164 + ADIOI_Free(value);
1165 + MPI_Abort(MPI_COMM_WORLD, 1);
1167 if (fd->access_mode & ADIO_APPEND)
1168 fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
1172 if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
1173 - fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
1174 + fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
1177 if (fd->direct_write || fd->direct_read) {
1178 @@ -81,20 +133,22 @@
1181 /* --BEGIN ERROR HANDLING-- */
1182 - if (fd->fd_sys == -1 || ((fd->fd_direct == -1) &&
1183 - (fd->direct_write || fd->direct_read))) {
1184 + if (fd->fd_sys == -1 || ((fd->fd_direct == -1) &&
1185 + (fd->direct_write || fd->direct_read))) {
1186 if (errno == ENAMETOOLONG)
1187 *error_code = MPIO_Err_create_code(MPI_SUCCESS,
1188 - MPIR_ERR_RECOVERABLE, myname,
1189 - __LINE__, MPI_ERR_BAD_FILE,
1190 + MPIR_ERR_RECOVERABLE,
1194 "**filenamelong %s %d",
1196 strlen(fd->filename));
1197 else if (errno == ENOENT)
1198 *error_code = MPIO_Err_create_code(MPI_SUCCESS,
1199 - MPIR_ERR_RECOVERABLE, myname,
1200 - __LINE__, MPI_ERR_NO_SUCH_FILE,
1201 + MPIR_ERR_RECOVERABLE,
1203 + MPI_ERR_NO_SUCH_FILE,
1207 @@ -108,27 +162,30 @@
1209 else if (errno == EACCES) {
1210 *error_code = MPIO_Err_create_code(MPI_SUCCESS,
1211 - MPIR_ERR_RECOVERABLE, myname,
1212 - __LINE__, MPI_ERR_ACCESS,
1213 + MPIR_ERR_RECOVERABLE,
1217 - "**fileaccess %s",
1220 - else if (errno == EROFS) {
1221 + "**fileaccess %s",
1223 + } else if (errno == EROFS) {
1224 /* Read only file or file system and write access requested */
1225 *error_code = MPIO_Err_create_code(MPI_SUCCESS,
1226 - MPIR_ERR_RECOVERABLE, myname,
1227 - __LINE__, MPI_ERR_READ_ONLY,
1228 - "**ioneedrd", 0 );
1231 + MPIR_ERR_RECOVERABLE,
1233 + MPI_ERR_READ_ONLY,
1236 *error_code = MPIO_Err_create_code(MPI_SUCCESS,
1237 - MPIR_ERR_RECOVERABLE, myname,
1238 - __LINE__, MPI_ERR_IO, "**io",
1239 + MPIR_ERR_RECOVERABLE,
1241 + MPI_ERR_IO, "**io",
1242 "**io %s", strerror(errno));
1246 /* --END ERROR HANDLING-- */
1247 - else *error_code = MPI_SUCCESS;
1248 + *error_code = MPI_SUCCESS;
1251 + ADIOI_Free(value);
1253 diff -ruN ad_lustre_orig/ad_lustre_rwcontig.c ad_lustre/ad_lustre_rwcontig.c
1254 --- ad_lustre_orig/ad_lustre_rwcontig.c 2008-09-17 14:36:57.000000000 +0800
1255 +++ ad_lustre/ad_lustre_rwcontig.c 2008-10-15 22:44:35.000000000 +0800
1257 /* -*- Mode: C; c-basic-offset:4 ; -*- */
1259 - * Copyright (C) 1997 University of Chicago.
1261 + * Copyright (C) 1997 University of Chicago.
1262 * See COPYRIGHT notice in top-level directory.
1264 * Copyright (C) 2007 Oak Ridge National Laboratory
1266 + * Copyright (C) 2008 Sun Microsystems, Lustre group
1269 #define _XOPEN_SOURCE 600
1270 diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
1271 --- ad_lustre_orig/ad_lustre_wrcoll.c 1970-01-01 08:00:00.000000000 +0800
1272 +++ ad_lustre/ad_lustre_wrcoll.c 2008-10-15 22:02:53.000000000 +0800
1274 +/* -*- Mode: C; c-basic-offset:4 ; -*- */
1276 + * Copyright (C) 1997 University of Chicago.
1277 + * See COPYRIGHT notice in top-level directory.
1279 + * Copyright (C) 2007 Oak Ridge National Laboratory
1281 + * Copyright (C) 2008 Sun Microsystems, Lustre group
1284 +#include "ad_lustre.h"
1285 +#include "adio_extern.h"
1287 +/* prototypes of functions used for collective writes only. */
1288 +static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf,
1289 + MPI_Datatype datatype, int nprocs,
1291 + ADIOI_Access *others_req,
1292 + ADIOI_Access *my_req,
1293 + ADIO_Offset *offset_list,
1295 + int contig_access_count,
1296 + int * striping_info,
1297 + int *buf_idx, int *error_code);
1298 +static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf,
1299 + ADIOI_Flatlist_node * flat_buf,
1301 + ADIO_Offset * offset_list,
1302 + int *len_list, int *send_size,
1303 + MPI_Request * requests,
1304 + int *sent_to_proc, int nprocs,
1305 + int myrank, int contig_access_count,
1306 + int * striping_info,
1307 + int *send_buf_idx,
1308 + int *curr_to_proc,
1309 + int *done_to_proc, int iter,
1310 + MPI_Aint buftype_extent);
1311 +static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
1313 + ADIOI_Flatlist_node * flat_buf,
1314 + ADIO_Offset * offset_list,
1315 + int *len_list, int *send_size,
1316 + int *recv_size, ADIO_Offset off,
1317 + int size, int *count,
1318 + int *start_pos, int *partial_recv,
1319 + int *sent_to_proc, int nprocs,
1320 + int myrank, int buftype_is_contig,
1321 + int contig_access_count,
1322 + int * striping_info,
1323 + ADIOI_Access * others_req,
1324 + int *send_buf_idx,
1325 + int *curr_to_proc,
1326 + int *done_to_proc, int *hole,
1327 + int iter, MPI_Aint buftype_extent,
1328 + int *buf_idx, int *error_code);
1329 +void ADIOI_Heap_merge(ADIOI_Access * others_req, int *count,
1330 + ADIO_Offset * srt_off, int *srt_len, int *start_pos,
1331 + int nprocs, int nprocs_recv, int total_elements);
1333 +void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count,
1334 + MPI_Datatype datatype,
1335 + int file_ptr_type, ADIO_Offset offset,
1336 + ADIO_Status * status, int *error_code)
1338 + ADIOI_Access *my_req;
1339 + /* array of nprocs access structures, one for each process in which
1340 + part of this process's request is located */
1342 + ADIOI_Access *others_req;
1343 + /* array of nprocs access structures, one for each other process
1344 + whose request is written by this process. */
1346 + int i, filetype_is_contig, nprocs, myrank, do_collect = 0;
1347 + int contig_access_count = 0, buftype_is_contig, interleave_count = 0;
1348 + int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs;
1349 + ADIO_Offset orig_fp, start_offset, end_offset, off, min_st_offset;
1350 + ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *end_offsets = NULL;
1351 + int *buf_idx = NULL, *len_list = NULL, *striping_info = NULL;
1352 + int old_error, tmp_error;
1354 + MPI_Comm_size(fd->comm, &nprocs);
1355 + MPI_Comm_rank(fd->comm, &myrank);
1357 + orig_fp = fd->fp_ind;
1359 + /* I/O pattern identification if cb_write isn't disabled */
1360 + if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
1361 + /* For this process's request, calculate the list of offsets and
1362 + lengths in the file and determine the start and end offsets. */
1363 + ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
1364 + &offset_list, &len_list, &start_offset,
1365 + &end_offset, &contig_access_count);
1367 + /* each process communicates its start and end offsets to other
1368 + processes. The result is an array each of start and end offsets stored
1369 + in order of process rank. */
1370 + st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
1371 + end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
1372 + MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1,
1373 + ADIO_OFFSET, fd->comm);
1374 + MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1,
1375 + ADIO_OFFSET, fd->comm);
1376 + /* are the accesses of different processes interleaved? */
1377 + for (i = 1; i < nprocs; i++)
1378 + if ((st_offsets[i] < end_offsets[i-1]) &&
1379 + (st_offsets[i] <= end_offsets[i]))
1380 + interleave_count++;
1381 + /* This is a rudimentary check for interleaving, but should suffice
1382 + for the moment. */
1384 + /* Two typical access patterns can benefit from collective write.
1385 + * 1) the processes are interleaved, and
1386 + * 2) the req size is small.
1388 + if (interleave_count > 0) {
1391 + do_collect = ADIOI_LUSTRE_Docollect(fd, contig_access_count,
1392 + len_list, nprocs);
1395 + ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
1397 + /* Decide if collective I/O should be done */
1398 + if ((!do_collect && fd->hints->cb_write == ADIOI_HINT_AUTO) ||
1399 + fd->hints->cb_write == ADIOI_HINT_DISABLE) {
1401 + int filerange_is_contig = 0;
1403 + /* use independent accesses */
1404 + if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
1405 + ADIOI_Free(offset_list);
1406 + ADIOI_Free(len_list);
1407 + ADIOI_Free(st_offsets);
1408 + ADIOI_Free(end_offsets);
1411 + fd->fp_ind = orig_fp;
1412 + ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
1413 + if (buftype_is_contig && filetype_is_contig) {
1414 + if (file_ptr_type == ADIO_EXPLICIT_OFFSET) {
1415 + off = fd->disp + (fd->etype_size) * offset;
1416 + ADIO_WriteContig(fd, buf, count, datatype,
1417 + ADIO_EXPLICIT_OFFSET,
1418 + off, status, error_code);
1420 + ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL,
1421 + 0, status, error_code);
1423 + ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type,
1424 + offset, status, error_code);
1429 + /* Get Lustre hints information */
1430 + ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1, nprocs,
1431 + st_offsets, end_offsets,
1433 + /* calculate what portions of the access requests of this process are
1434 + * located in which process
1436 + ADIOI_LUSTRE_Calc_my_req(fd, offset_list, len_list, contig_access_count,
1437 + striping_info, nprocs, &count_my_req_procs,
1438 + &count_my_req_per_proc, &my_req, &buf_idx);
1439 + /* calculate what process's requests will be written by this process */
1440 + ADIOI_LUSTRE_Calc_others_req(fd, count_my_req_procs,
1441 + count_my_req_per_proc,
1442 + my_req, nprocs, myrank,
1443 + end_offset - start_offset + 1,
1444 + min_st_offset, striping_info,
1445 + &count_others_req_procs, &others_req);
1446 + ADIOI_Free(count_my_req_per_proc);
1448 + /* exchange data and write in sizes of no more than stripe_size. */
1449 + ADIOI_LUSTRE_Exch_and_write(fd, buf, datatype, nprocs, myrank,
1450 + others_req, my_req,
1451 + offset_list, len_list, contig_access_count,
1452 + striping_info, buf_idx, error_code);
1454 + old_error = *error_code;
1455 + if (*error_code != MPI_SUCCESS)
1456 + *error_code = MPI_ERR_IO;
1458 + /* optimization: if only one process performing i/o, we can perform
1459 + * a less-expensive Bcast */
1460 +#ifdef ADIOI_MPE_LOGGING
1461 + MPE_Log_event(ADIOI_MPE_postwrite_a, 0, NULL);
1463 + if (fd->hints->cb_nodes == 1)
1464 + MPI_Bcast(error_code, 1, MPI_INT,
1465 + fd->hints->ranklist[0], fd->comm);
1467 + tmp_error = *error_code;
1468 + MPI_Allreduce(&tmp_error, error_code, 1, MPI_INT,
1469 + MPI_MAX, fd->comm);
1471 +#ifdef ADIOI_MPE_LOGGING
1472 + MPE_Log_event(ADIOI_MPE_postwrite_b, 0, NULL);
1475 + if ((old_error != MPI_SUCCESS) && (old_error != MPI_ERR_IO))
1476 + *error_code = old_error;
1479 + if (!buftype_is_contig)
1480 + ADIOI_Delete_flattened(datatype);
1482 + /* free all memory allocated for collective I/O */
1483 + /* free others_req */
1484 + for (i = 0; i < nprocs; i++) {
1485 + if (others_req[i].count) {
1486 + ADIOI_Free(others_req[i].offsets);
1487 + ADIOI_Free(others_req[i].lens);
1488 + ADIOI_Free(others_req[i].mem_ptrs);
1491 + ADIOI_Free(others_req);
1492 + /* free my_req here */
1493 + for (i = 0; i < nprocs; i++) {
1494 + if (my_req[i].count) {
1495 + ADIOI_Free(my_req[i].offsets);
1496 + ADIOI_Free(my_req[i].lens);
1499 + ADIOI_Free(my_req);
1500 + ADIOI_Free(buf_idx);
1501 + ADIOI_Free(offset_list);
1502 + ADIOI_Free(len_list);
1503 + ADIOI_Free(st_offsets);
1504 + ADIOI_Free(end_offsets);
1505 + ADIOI_Free(striping_info);
1507 +#ifdef HAVE_STATUS_SET_BYTES
1509 + int bufsize, size;
1510 + /* Don't set status if it isn't needed */
1511 + MPI_Type_size(datatype, &size);
1512 + bufsize = size * count;
1513 + MPIR_Status_set_bytes(status, datatype, bufsize);
1515 + /* This is a temporary way of filling in status. The right way is to
1516 + * keep track of how much data was actually written during collective I/O.
1520 + fd->fp_sys_posn = -1; /* set it to null. */
1523 +static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf,
1524 + MPI_Datatype datatype, int nprocs,
1525 + int myrank, ADIOI_Access *others_req,
1526 + ADIOI_Access *my_req,
1527 + ADIO_Offset *offset_list,
1528 + int *len_list, int contig_access_count,
1529 + int *striping_info, int *buf_idx,
1532 + int hole, i, j, m, flag, ntimes = 1 , max_ntimes, buftype_is_contig;
1533 + ADIO_Offset st_loc = -1, end_loc = -1, min_st_loc, max_end_loc;
1534 + ADIO_Offset off, req_off, send_off, iter_st_off, *off_list;
1535 + ADIO_Offset max_size, step_size = 0;
1536 + int real_size, req_len, send_len;
1537 + int *recv_curr_offlen_ptr, *recv_count, *recv_size;
1538 + int *send_curr_offlen_ptr, *send_size;
1539 + int *partial_recv, *sent_to_proc, *recv_start_pos;
1540 + int *send_buf_idx, *curr_to_proc, *done_to_proc;
1541 + char *write_buf = NULL, *value;
1542 + MPI_Status status;
1543 + ADIOI_Flatlist_node *flat_buf = NULL;
1544 + MPI_Aint buftype_extent;
1545 + int stripe_size = striping_info[0], avail_cb_nodes = striping_info[2];
1546 + int lflag, data_sieving = 0;
1548 + *error_code = MPI_SUCCESS; /* changed below if error */
1550 + /* calculate the number of writes of stripe size to be done.
1551 + * That gives the no. of communication phases as well.
1553 + * Because we redistribute data in stripe-contiguous pattern for Lustre,
1554 + * each process has the same no. of communication phases.
1557 + for (i = 0; i < nprocs; i++) {
1558 + if (others_req[i].count) {
1559 + st_loc = others_req[i].offsets[0];
1560 + end_loc = others_req[i].offsets[0];
1564 + for (i = 0; i < nprocs; i++) {
1565 + for (j = 0; j < others_req[i].count; j++) {
1566 + st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]);
1567 + end_loc = ADIOI_MAX(end_loc, (others_req[i].offsets[j] +
1568 + others_req[i].lens[j] - 1));
1571 + /* this process does no writing. */
1572 + if ((st_loc == -1) && (end_loc == -1))
1574 + MPI_Allreduce(&end_loc, &max_end_loc, 1, MPI_LONG_LONG_INT, MPI_MAX, fd->comm);
1575 + /* avoid min_st_loc being -1 */
1577 + st_loc = max_end_loc;
1578 + MPI_Allreduce(&st_loc, &min_st_loc, 1, MPI_LONG_LONG_INT, MPI_MIN, fd->comm);
1579 + /* align downward */
1580 + min_st_loc -= min_st_loc % (ADIO_Offset)stripe_size;
1582 + /* In each iteration, only avail_cb_nodes I/O clients perform I/O,
1583 + * so at most step_size = avail_cb_nodes * stripe_size of I/O is performed,
1584 + * and ntimes = whole_file_portion / step_size
1586 + step_size = (ADIO_Offset) avail_cb_nodes * stripe_size;
1587 + max_ntimes = (int)((max_end_loc - min_st_loc) / step_size + 1);
1589 + write_buf = (char *) ADIOI_Malloc(stripe_size);
1591 + /* calculate the start offset for each iteration */
1592 + off_list = (ADIO_Offset *) ADIOI_Malloc(max_ntimes * sizeof(ADIO_Offset));
1593 + for (m = 0; m < max_ntimes; m ++)
1594 + off_list[m] = max_end_loc;
1595 + for (i = 0; i < nprocs; i++) {
1596 + for (j = 0; j < others_req[i].count; j ++) {
1597 + req_off = others_req[i].offsets[j];
1598 + m = (int)((req_off - min_st_loc) / step_size);
1599 + off_list[m] = ADIOI_MIN(off_list[m], req_off);
1603 + recv_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
1604 + send_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
1605 + /* their use is explained below. calloc initializes to 0. */
1607 + recv_count = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1608 + /* to store count of how many off-len pairs per proc are satisfied
1609 + in an iteration. */
1611 + send_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1612 + /* total size of data to be sent to each proc. in an iteration.
1613 + Of size nprocs so that I can use MPI_Alltoall later. */
1615 + recv_size = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1616 + /* total size of data to be recd. from each proc. in an iteration. */
1618 + sent_to_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int));
1619 + /* amount of data sent to each proc so far. Used in
1620 + ADIOI_Fill_send_buffer. initialized to 0 here. */
1622 + send_buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1623 + curr_to_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1624 + done_to_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1625 + /* Above three are used in ADIOI_Fill_send_buffer */
1627 + recv_start_pos = (int *) ADIOI_Malloc(nprocs * sizeof(int));
1628 + /* used to store the starting value of recv_curr_offlen_ptr[i] in
1631 + ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
1632 + if (!buftype_is_contig) {
1633 + ADIOI_Flatten_datatype(datatype);
1634 + flat_buf = ADIOI_Flatlist;
1635 + while (flat_buf->type != datatype)
1636 + flat_buf = flat_buf->next;
1638 + MPI_Type_extent(datatype, &buftype_extent);
1640 + iter_st_off = min_st_loc;
1642 + /* Although we have redistributed the data according to OST index,
1643 + * a read-modify-write will be done if there is a hole between the data.
1644 + * For example: if blocksize=60, xfersize=30 and stripe_size=100,
1645 + * then rank0 will collect data [0, 30] and [60, 90] then write. There
1646 + * is a hole in [30, 60], which will cause a read-modify-write in [0, 90].
1648 + * To reduce its impact on the performance, we disable data sieving
1649 + * by default, unless the hint "ds_in_coll" is enabled.
1651 + /* check the hint for data sieving */
1652 + value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
1653 + MPI_Info_get(fd->info, "ds_in_coll", MPI_MAX_INFO_VAL, value, &lflag);
1654 + if (lflag && !strcmp(value, "enable"))
1656 + ADIOI_Free(value);
1658 + for (m = 0; m < max_ntimes; m++) {
1659 + /* go through all others_req and my_req to check which will be received
1660 + * and sent in this iteration.
1663 + /* Note that MPI guarantees that displacements in filetypes are in
1664 + monotonically nondecreasing order and that, for writes, the
1665 + filetypes cannot specify overlapping regions in the file. This
1666 + simplifies implementation a bit compared to reads. */
1669 + off = start offset in the file for the data to be written in
1671 + iter_st_off = start offset of this iteration
1672 + real_size = size of data written (bytes) corresponding to off
1673 + max_size = possible maximum size of data written in this iteration
1674 + req_off = offset in the file for a particular contiguous request minus
1675 + what was satisfied in previous iteration
1676 + send_off = offset the request needed by other processes in this iteration
1677 + req_len = size corresponding to req_off
1678 + send_len = size corresponding to send_off
1681 + /* first calculate what should be communicated */
1682 + for (i = 0; i < nprocs; i++)
1683 + recv_count[i] = recv_size[i] = send_size[i] = 0;
1685 + off = off_list[m];
1686 + max_size = ADIOI_MIN(step_size, max_end_loc - iter_st_off + 1);
1687 + real_size = (int) ADIOI_MIN((off / stripe_size + 1) * stripe_size - off,
1688 + end_loc - off + 1);
1690 + for (i = 0; i < nprocs; i++) {
1691 + if (my_req[i].count) {
1692 + for (j = send_curr_offlen_ptr[i]; j < my_req[i].count; j++) {
1693 + send_off = my_req[i].offsets[j];
1694 + send_len = my_req[i].lens[j];
1695 + if (send_off < iter_st_off + max_size) {
1696 + send_size[i] += send_len;
1701 + send_curr_offlen_ptr[i] = j;
1703 + if (others_req[i].count) {
1704 + recv_start_pos[i] = recv_curr_offlen_ptr[i];
1705 + for (j = recv_curr_offlen_ptr[i]; j < others_req[i].count; j++) {
1706 + req_off = others_req[i].offsets[j];
1707 + req_len = others_req[i].lens[j];
1708 + if (req_off < iter_st_off + max_size) {
1710 + MPI_Address(write_buf + req_off - off,
1711 + &(others_req[i].mem_ptrs[j]));
1712 + recv_size[i] += req_len;
1717 + recv_curr_offlen_ptr[i] = j;
1720 + /* use variable "hole" to pass data_sieving flag into W_Exchange_data */
1721 + hole = data_sieving;
1722 + ADIOI_LUSTRE_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list,
1723 + len_list, send_size, recv_size, off, real_size,
1724 + recv_count, recv_start_pos, partial_recv,
1725 + sent_to_proc, nprocs, myrank,
1726 + buftype_is_contig, contig_access_count,
1727 + striping_info, others_req, send_buf_idx,
1728 + curr_to_proc, done_to_proc, &hole, m,
1729 + buftype_extent, buf_idx, error_code);
1730 + if (*error_code != MPI_SUCCESS)
1734 + for (i = 0; i < nprocs; i++)
1735 + if (recv_count[i]) {
1740 + /* check whether to do data sieving */
1741 + if(data_sieving) {
1742 + ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
1743 + ADIO_EXPLICIT_OFFSET, off, &status,
1746 + /* if there is no hole, write the data in a single call;
1747 + * otherwise, write it in several separate calls */
1749 + ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
1750 + ADIO_EXPLICIT_OFFSET, off, &status,
1753 + for (i = 0; i < nprocs; i++) {
1754 + if (others_req[i].count) {
1755 + for (j = 0; j < others_req[i].count; j++) {
1756 + if (others_req[i].offsets[j] < off + real_size &&
1757 + others_req[i].offsets[j] >= off) {
1758 + ADIO_WriteContig(fd,
1759 + write_buf + others_req[i].offsets[j] - off,
1760 + others_req[i].lens[j],
1761 + MPI_BYTE, ADIO_EXPLICIT_OFFSET,
1762 + others_req[i].offsets[j], &status,
1764 + if (*error_code != MPI_SUCCESS)
1772 + if (*error_code != MPI_SUCCESS)
1775 + iter_st_off += max_size;
1779 + ADIOI_Free(write_buf);
1780 + ADIOI_Free(recv_curr_offlen_ptr);
1781 + ADIOI_Free(send_curr_offlen_ptr);
1782 + ADIOI_Free(recv_count);
1783 + ADIOI_Free(send_size);
1784 + ADIOI_Free(recv_size);
1785 + ADIOI_Free(sent_to_proc);
1786 + ADIOI_Free(recv_start_pos);
1787 + ADIOI_Free(send_buf_idx);
1788 + ADIOI_Free(curr_to_proc);
1789 + ADIOI_Free(done_to_proc);
1790 + ADIOI_Free(off_list);
1793 +static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
1795 + ADIOI_Flatlist_node * flat_buf,
1796 + ADIO_Offset * offset_list,
1797 + int *len_list, int *send_size,
1798 + int *recv_size, ADIO_Offset off,
1799 + int size, int *count,
1800 + int *start_pos, int *partial_recv,
1801 + int *sent_to_proc, int nprocs,
1802 + int myrank, int buftype_is_contig,
1803 + int contig_access_count,
1804 + int * striping_info,
1805 + ADIOI_Access * others_req,
1806 + int *send_buf_idx,
1807 + int *curr_to_proc, int *done_to_proc,
1808 + int *hole, int iter,
1809 + MPI_Aint buftype_extent,
1810 + int *buf_idx, int *error_code)
1812 + int i, j, nprocs_recv, nprocs_send, err;
1813 + char **send_buf = NULL;
1814 + MPI_Request *requests, *send_req;
1815 + MPI_Datatype *recv_types;
1816 + MPI_Status *statuses, status;
1817 + int *srt_len, sum, sum_recv;
1818 + ADIO_Offset *srt_off;
1819 + int data_sieving = *hole;
1820 + static char myname[] = "ADIOI_W_EXCHANGE_DATA";
1822 + /* create derived datatypes for recv */
1824 + for (i = 0; i < nprocs; i++)
1828 + recv_types = (MPI_Datatype *) ADIOI_Malloc((nprocs_recv + 1) *
1829 + sizeof(MPI_Datatype));
1830 + /* +1 to avoid a 0-size malloc */
1833 + for (i = 0; i < nprocs; i++) {
1834 + if (recv_size[i]) {
1835 + MPI_Type_hindexed(count[i],
1836 + &(others_req[i].lens[start_pos[i]]),
1837 + &(others_req[i].mem_ptrs[start_pos[i]]),
1838 + MPI_BYTE, recv_types + j);
1839 + /* absolute displacements; use MPI_BOTTOM in recv */
1840 + MPI_Type_commit(recv_types + j);
1845 + /* To avoid a read-modify-write,
1846 + * check if there are holes in the data to be written.
1847 + * For this, merge the (sorted) offset lists others_req using a heap-merge.
1851 + for (i = 0; i < nprocs; i++)
1853 + srt_off = (ADIO_Offset *) ADIOI_Malloc((sum + 1) * sizeof(ADIO_Offset));
1854 + srt_len = (int *) ADIOI_Malloc((sum + 1) * sizeof(int));
1855 + /* +1 to avoid a 0-size malloc */
1857 + ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos,
1858 + nprocs, nprocs_recv, sum);
1860 + /* check if there are any holes */
1862 + for (i = 0; i < sum - 1; i++) {
1863 + if (srt_off[i] + srt_len[i] < srt_off[i + 1]) {
1868 + /* In some cases (see John Bent ROMIO REQ # 835), an odd interaction
1869 + * between aggregation, nominally contiguous regions, and cb_buffer_size
1870 + * should be handled with a read-modify-write (otherwise we will write out
1871 + * more data than we receive from everyone else (inclusive), so override
1876 + for (i = 0; i < nprocs; i++)
1877 + sum_recv += recv_size[i];
1878 + if (size > sum_recv)
1881 + /* check the hint for data sieving */
1882 + if (data_sieving && nprocs_recv && *hole) {
1883 + ADIO_ReadContig(fd, write_buf, size, MPI_BYTE,
1884 + ADIO_EXPLICIT_OFFSET, off, &status, &err);
1885 + /* --BEGIN ERROR HANDLING-- */
1886 + if (err != MPI_SUCCESS) {
1887 + *error_code = MPIO_Err_create_code(err,
1888 + MPIR_ERR_RECOVERABLE,
1891 + "**ioRMWrdwr", 0);
1892 + ADIOI_Free(recv_types);
1893 + ADIOI_Free(srt_off);
1894 + ADIOI_Free(srt_len);
1897 + /* --END ERROR HANDLING-- */
1899 + ADIOI_Free(srt_off);
1900 + ADIOI_Free(srt_len);
1903 + for (i = 0; i < nprocs; i++)
1907 + if (fd->atomicity) {
1908 + /* bug fix from Wei-keng Liao and Kenin Coloma */
1909 + requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + 1) *
1910 + sizeof(MPI_Request));
1911 + send_req = requests;
1913 + requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1)*
1914 + sizeof(MPI_Request));
1915 + /* +1 to avoid a 0-size malloc */
1917 + /* post receives */
1919 + for (i = 0; i < nprocs; i++) {
1920 + if (recv_size[i]) {
1921 + MPI_Irecv(MPI_BOTTOM, 1, recv_types[j], i,
1922 + myrank + i + 100 * iter, fd->comm, requests + j);
1926 + send_req = requests + nprocs_recv;
1930 + * if buftype_is_contig, data can be directly sent from
1931 + * user buf at location given by buf_idx. else use send_buf.
1933 + if (buftype_is_contig) {
1935 + for (i = 0; i < nprocs; i++)
1936 + if (send_size[i]) {
1937 + MPI_Isend(((char *) buf) + buf_idx[i], send_size[i],
1938 + MPI_BYTE, i, myrank + i + 100 * iter, fd->comm,
1941 + buf_idx[i] += send_size[i];
1943 + } else if (nprocs_send) {
1944 + /* buftype is not contig */
1945 + send_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char *));
1946 + for (i = 0; i < nprocs; i++)
1948 + send_buf[i] = (char *) ADIOI_Malloc(send_size[i]);
1950 + ADIOI_LUSTRE_Fill_send_buffer(fd, buf, flat_buf, send_buf, offset_list,
1951 + len_list, send_size, send_req,
1952 + sent_to_proc, nprocs, myrank,
1953 + contig_access_count, striping_info,
1954 + send_buf_idx, curr_to_proc, done_to_proc,
1955 + iter, buftype_extent);
1956 + /* the send is done in ADIOI_Fill_send_buffer */
1959 + /* bug fix from Wei-keng Liao and Kenin Coloma */
1960 + if (fd->atomicity) {
1962 + for (i = 0; i < nprocs; i++) {
1963 + MPI_Status wkl_status;
1964 + if (recv_size[i]) {
1965 + MPI_Recv(MPI_BOTTOM, 1, recv_types[j], i,
1966 + myrank + i + 100 * iter, fd->comm, &wkl_status);
1972 + for (i = 0; i < nprocs_recv; i++)
1973 + MPI_Type_free(recv_types + i);
1974 + ADIOI_Free(recv_types);
1976 + /* bug fix from Wei-keng Liao and Kenin Coloma */
1977 + /* +1 to avoid a 0-size malloc */
1978 + if (fd->atomicity) {
1979 + statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + 1) *
1980 + sizeof(MPI_Status));
1982 + statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1) *
1983 + sizeof(MPI_Status));
1986 +#ifdef NEEDS_MPI_TEST
1988 + if (fd->atomicity) {
1989 + /* bug fix from Wei-keng Liao and Kenin Coloma */
1991 + MPI_Testall(nprocs_send, send_req, &i, statuses);
1994 + MPI_Testall(nprocs_send + nprocs_recv, requests, &i, statuses);
1997 + /* bug fix from Wei-keng Liao and Kenin Coloma */
1998 + if (fd->atomicity)
1999 + MPI_Waitall(nprocs_send, send_req, statuses);
2001 + MPI_Waitall(nprocs_send + nprocs_recv, requests, statuses);
2003 + ADIOI_Free(statuses);
2004 + ADIOI_Free(requests);
2005 + if (!buftype_is_contig && nprocs_send) {
2006 + for (i = 0; i < nprocs; i++)
2008 + ADIOI_Free(send_buf[i]);
2009 + ADIOI_Free(send_buf);
2013 +#define ADIOI_BUF_INCR \
2015 + while (buf_incr) { \
2016 + size_in_buf = ADIOI_MIN(buf_incr, flat_buf_sz); \
2017 + user_buf_idx += size_in_buf; \
2018 + flat_buf_sz -= size_in_buf; \
2019 + if (!flat_buf_sz) { \
2020 + if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \
2022 + flat_buf_idx = 0; \
2025 + user_buf_idx = flat_buf->indices[flat_buf_idx] + \
2026 + n_buftypes*buftype_extent; \
2027 + flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
2029 + buf_incr -= size_in_buf; \
2034 +#define ADIOI_BUF_COPY \
2037 + size_in_buf = ADIOI_MIN(size, flat_buf_sz); \
2038 + memcpy(&(send_buf[p][send_buf_idx[p]]), \
2039 + ((char *) buf) + user_buf_idx, size_in_buf); \
2040 + send_buf_idx[p] += size_in_buf; \
2041 + user_buf_idx += size_in_buf; \
2042 + flat_buf_sz -= size_in_buf; \
2043 + if (!flat_buf_sz) { \
2044 + if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \
2046 + flat_buf_idx = 0; \
2049 + user_buf_idx = flat_buf->indices[flat_buf_idx] + \
2050 + n_buftypes*buftype_extent; \
2051 + flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \
2053 + size -= size_in_buf; \
2054 + buf_incr -= size_in_buf; \
2059 +static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf,
2060 + ADIOI_Flatlist_node * flat_buf,
2062 + ADIO_Offset * offset_list,
2063 + int *len_list, int *send_size,
2064 + MPI_Request * requests,
2065 + int *sent_to_proc, int nprocs,
2067 + int contig_access_count,
2068 + int * striping_info,
2069 + int *send_buf_idx,
2070 + int *curr_to_proc,
2071 + int *done_to_proc, int iter,
2072 + MPI_Aint buftype_extent)
2074 + /* this function is only called if buftype is not contig */
2075 + int i, p, flat_buf_idx, size;
2076 + int flat_buf_sz, buf_incr, size_in_buf, jj, n_buftypes;
2077 + ADIO_Offset off, len, rem_len, user_buf_idx;
2079 + /* curr_to_proc[p] = amount of data sent to proc. p that has already
2080 + * been accounted for so far
2081 + * done_to_proc[p] = amount of data already sent to proc. p in
2082 + * previous iterations
2083 + * user_buf_idx = current location in user buffer
2084 + * send_buf_idx[p] = current location in send_buf of proc. p
2087 + for (i = 0; i < nprocs; i++) {
2088 + send_buf_idx[i] = curr_to_proc[i] = 0;
2089 + done_to_proc[i] = sent_to_proc[i];
2093 + user_buf_idx = flat_buf->indices[0];
2096 + flat_buf_sz = flat_buf->blocklens[0];
2098 + /* flat_buf_idx = current index into flattened buftype
2099 + * flat_buf_sz = size of current contiguous component in flattened buf
2101 + for (i = 0; i < contig_access_count; i++) {
2102 + off = offset_list[i];
2103 + rem_len = (ADIO_Offset) len_list[i];
2105 + /* this request may span more than one process */
2106 + while (rem_len != 0) {
2108 + /* NOTE: len value is modified by ADIOI_LUSTRE_Calc_aggregator() to be no
2109 + * longer than the single region that processor "p" is responsible
2112 + p = ADIOI_LUSTRE_Calc_aggregator(fd, off, &len, striping_info);
2114 + if (send_buf_idx[p] < send_size[p]) {
2115 + if (curr_to_proc[p] + len > done_to_proc[p]) {
2116 + if (done_to_proc[p] > curr_to_proc[p]) {
2117 + size = (int) ADIOI_MIN(curr_to_proc[p] + len -
2121 + buf_incr = done_to_proc[p] - curr_to_proc[p];
2123 + buf_incr = (int) (curr_to_proc[p] + len -
2125 + curr_to_proc[p] = done_to_proc[p] + size;
2128 + size = (int) ADIOI_MIN(len, send_size[p] -
2130 + buf_incr = (int) len;
2131 + curr_to_proc[p] += size;
2134 + if (send_buf_idx[p] == send_size[p]) {
2135 + MPI_Isend(send_buf[p], send_size[p], MPI_BYTE, p,
2136 + myrank + p + 100 * iter, fd->comm,
2141 + curr_to_proc[p] += (int) len;
2142 + buf_incr = (int) len;
2146 + buf_incr = (int) len;
2153 + for (i = 0; i < nprocs; i++)
2155 + sent_to_proc[i] = curr_to_proc[i];
2157 diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c
2158 --- ad_lustre_orig/ad_lustre_wrstr.c 1970-01-01 08:00:00.000000000 +0800
2159 +++ ad_lustre/ad_lustre_wrstr.c 2008-10-13 15:34:53.000000000 +0800
2161 +/* -*- Mode: C; c-basic-offset:4 ; -*- */
2163 + * Copyright (C) 1997 University of Chicago.
2164 + * See COPYRIGHT notice in top-level directory.
2166 + * Copyright (C) 2007 Oak Ridge National Laboratory
2168 + * Copyright (C) 2008 Sun Microsystems, Lustre group
2171 +#include "ad_lustre.h"
2172 +#include "adio_extern.h"
2174 +#define ADIOI_BUFFERED_WRITE \
2176 + if (req_off >= writebuf_off + writebuf_len) { \
2177 + if (writebuf_len) { \
2178 + ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
2179 + ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \
2180 + if (!(fd->atomicity)) \
2181 + ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
2182 + if (*error_code != MPI_SUCCESS) { \
2183 + *error_code = MPIO_Err_create_code(*error_code, \
2184 + MPIR_ERR_RECOVERABLE, myname, \
2185 + __LINE__, MPI_ERR_IO, \
2187 + ADIOI_Free(writebuf); \
2191 + writebuf_off = req_off; \
2192 + /* stripe_size alignment */ \
2193 + writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \
2194 + (writebuf_off / stripe_size + 1) * \
2195 + stripe_size - writebuf_off);\
2196 + if (!(fd->atomicity)) \
2197 + ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
2198 + ADIO_ReadContig(fd, writebuf, writebuf_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,\
2199 + writebuf_off, &status1, error_code); \
2200 + if (*error_code != MPI_SUCCESS) { \
2201 + *error_code = MPIO_Err_create_code(*error_code, \
2202 + MPIR_ERR_RECOVERABLE, myname, \
2203 + __LINE__, MPI_ERR_IO, \
2205 + ADIOI_Free(writebuf); \
2209 + write_sz = (int) ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off); \
2210 + memcpy(writebuf + req_off - writebuf_off, (char *)buf + userbuf_off, write_sz);\
2211 + while (write_sz != req_len) {\
2212 + ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
2213 + ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \
2214 + if (!(fd->atomicity)) \
2215 + ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
2216 + if (*error_code != MPI_SUCCESS) { \
2217 + *error_code = MPIO_Err_create_code(*error_code, \
2218 + MPIR_ERR_RECOVERABLE, myname, \
2219 + __LINE__, MPI_ERR_IO, \
2221 + ADIOI_Free(writebuf); \
2224 + req_len -= write_sz; \
2225 + userbuf_off += write_sz; \
2226 + writebuf_off += writebuf_len; \
2227 + /* stripe_size alignment */ \
2228 + writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \
2229 + (writebuf_off / stripe_size + 1) * \
2230 + stripe_size - writebuf_off);\
2231 + if (!(fd->atomicity)) \
2232 + ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
2233 + ADIO_ReadContig(fd, writebuf, writebuf_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,\
2234 + writebuf_off, &status1, error_code); \
2235 + if (*error_code != MPI_SUCCESS) { \
2236 + *error_code = MPIO_Err_create_code(*error_code, \
2237 + MPIR_ERR_RECOVERABLE, myname, \
2238 + __LINE__, MPI_ERR_IO, \
2240 + ADIOI_Free(writebuf); \
2243 + write_sz = ADIOI_MIN(req_len, writebuf_len); \
2244 + memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
2249 +/* This macro is used when the bytes to be written are contiguous in the file
2250 + (e.g. filetype is contig and buftype is not). It does not do a read-modify-write and does not lock. */
2251 +#define ADIOI_BUFFERED_WRITE_WITHOUT_READ \
2253 + if (req_off >= writebuf_off + writebuf_len) { \
2254 + writebuf_off = req_off; \
2255 + /* stripe_size alignment */ \
2256 + writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \
2257 + (writebuf_off / stripe_size + 1) * \
2258 + stripe_size - writebuf_off);\
2260 + write_sz = (int) ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off); \
2261 + memcpy(writebuf + req_off - writebuf_off, (char *)buf + userbuf_off, write_sz);\
2262 + while (req_len) { \
2263 + ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \
2264 + ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \
2265 + if (*error_code != MPI_SUCCESS) { \
2266 + *error_code = MPIO_Err_create_code(*error_code, \
2267 + MPIR_ERR_RECOVERABLE, myname, \
2268 + __LINE__, MPI_ERR_IO, \
2270 + ADIOI_Free(writebuf); \
2273 + req_len -= write_sz; \
2274 + userbuf_off += write_sz; \
2275 + writebuf_off += writebuf_len; \
2276 + /* stripe_size alignment */ \
2277 + writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \
2278 + (writebuf_off / stripe_size + 1) * \
2279 + stripe_size - writebuf_off);\
2280 + write_sz = ADIOI_MIN(req_len, writebuf_len); \
2281 + memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\
2285 +void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count,
2286 + MPI_Datatype datatype, int file_ptr_type,
2287 + ADIO_Offset offset, ADIO_Status * status,
2290 + /* offset is in units of etype relative to the filetype. */
2291 + ADIOI_Flatlist_node *flat_buf, *flat_file;
2292 + int i, j, k, bwr_size, fwr_size = 0, st_index = 0;
2293 + int bufsize, num, size, sum, n_etypes_in_filetype, size_in_filetype;
2294 + int n_filetypes, etype_in_filetype;
2295 + ADIO_Offset abs_off_in_filetype = 0;
2296 + int filetype_size, etype_size, buftype_size, req_len;
2297 + MPI_Aint filetype_extent, buftype_extent;
2298 + int buf_count, buftype_is_contig, filetype_is_contig;
2299 + ADIO_Offset userbuf_off;
2300 + ADIO_Offset off, req_off, disp, end_offset = 0, writebuf_off, start_off;
2302 + int flag, st_fwr_size, st_n_filetypes, writebuf_len, write_sz;
2303 + ADIO_Status status1;
2304 + int new_bwr_size, new_fwr_size;
2306 + int stripe_size, lflag = 0;
2307 + static char myname[] = "ADIOI_LUSTRE_WriteStrided";
2309 + MPI_Comm_rank(fd->comm, &myrank);
2311 + if (fd->hints->ds_write == ADIOI_HINT_DISABLE) {
2312 + /* if user has disabled data sieving on writes, use naive
2313 + * approach instead.
2315 + ADIOI_GEN_WriteStrided_naive(fd,
2320 + offset, status, error_code);
2324 + *error_code = MPI_SUCCESS; /* changed below if error */
2326 + ADIOI_Datatype_iscontig(datatype, &buftype_is_contig);
2327 + ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig);
2329 + MPI_Type_size(fd->filetype, &filetype_size);
2330 + if (!filetype_size) {
2331 + *error_code = MPI_SUCCESS;
2335 + MPI_Type_extent(fd->filetype, &filetype_extent);
2336 + MPI_Type_size(datatype, &buftype_size);
2337 + MPI_Type_extent(datatype, &buftype_extent);
2338 + etype_size = fd->etype_size;
2340 + bufsize = buftype_size * count;
2342 + /* get striping info */
2343 + value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
2344 + MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, &lflag);
2346 + stripe_size = atoi(value);
2347 + ADIOI_Free(value);
2349 + /* Handle each combination of contiguous/noncontiguous buftype and filetype */
2350 + if (!buftype_is_contig && filetype_is_contig) {
2351 + /* noncontiguous in memory, contiguous in file. */
2352 + ADIOI_Flatten_datatype(datatype);
2353 + flat_buf = ADIOI_Flatlist;
2354 + while (flat_buf->type != datatype)
2355 + flat_buf = flat_buf->next;
2357 + off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind :
2358 + fd->disp + etype_size * offset;
2361 + end_offset = start_off + bufsize - 1;
2362 + writebuf_off = start_off;
2363 + /* write stripe size buffer each time */
2364 + writebuf = (char *) ADIOI_Malloc(ADIOI_MIN(bufsize, stripe_size));
2365 + writebuf_len = (int) ADIOI_MIN(bufsize,
2366 + (writebuf_off / stripe_size + 1) *
2367 + stripe_size - writebuf_off);
2369 + /* if atomicity is true, lock the region to be accessed */
2370 + if (fd->atomicity)
2371 + ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, bufsize);
2373 + for (j = 0; j < count; j++) {
2374 + for (i = 0; i < flat_buf->count; i++) {
2375 + userbuf_off = j * buftype_extent + flat_buf->indices[i];
2377 + req_len = flat_buf->blocklens[i];
2378 + ADIOI_BUFFERED_WRITE_WITHOUT_READ
2379 + off += flat_buf->blocklens[i];
2383 + /* write the buffer out finally */
2384 + ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE,
2385 + ADIO_EXPLICIT_OFFSET, writebuf_off, &status1,
2388 + if (fd->atomicity)
2389 + ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize);
2390 + if (*error_code != MPI_SUCCESS) {
2391 + ADIOI_Free(writebuf);
2394 + ADIOI_Free(writebuf);
2395 + if (file_ptr_type == ADIO_INDIVIDUAL)
2398 + /* noncontiguous in file */
2399 + /* filetype already flattened in ADIO_Open */
2400 + flat_file = ADIOI_Flatlist;
2401 + while (flat_file->type != fd->filetype)
2402 + flat_file = flat_file->next;
2405 + if (file_ptr_type == ADIO_INDIVIDUAL) {
2406 + offset = fd->fp_ind; /* in bytes */
2411 + for (i = 0; i < flat_file->count; i++) {
2412 + if (disp + flat_file->indices[i] +
2413 + (ADIO_Offset) n_filetypes * filetype_extent +
2414 + flat_file->blocklens[i] >= offset) {
2416 + fwr_size = (int) (disp + flat_file->indices[i] +
2417 + (ADIO_Offset) n_filetypes *
2419 + flat_file->blocklens[i] -
2427 + n_etypes_in_filetype = filetype_size / etype_size;
2428 + n_filetypes = (int) (offset / n_etypes_in_filetype);
2429 + etype_in_filetype = (int) (offset % n_etypes_in_filetype);
2430 + size_in_filetype = etype_in_filetype * etype_size;
2433 + for (i = 0; i < flat_file->count; i++) {
2434 + sum += flat_file->blocklens[i];
2435 + if (sum > size_in_filetype) {
2437 + fwr_size = sum - size_in_filetype;
2438 + abs_off_in_filetype = flat_file->indices[i] +
2439 + size_in_filetype - (sum - flat_file->blocklens[i]);
2444 + /* abs. offset in bytes in the file */
2445 + offset = disp + (ADIO_Offset) n_filetypes *filetype_extent +
2446 + abs_off_in_filetype;
2449 + start_off = offset;
2451 + /* If the file bytes are actually contiguous, we do not need data sieving at all */
2452 + if (bufsize <= fwr_size) {
2453 + req_off = start_off;
2454 + req_len = bufsize;
2455 + end_offset = start_off + bufsize - 1;
2456 + writebuf = (char *) ADIOI_Malloc(ADIOI_MIN(bufsize, stripe_size));
2457 + memset(writebuf, -1, ADIOI_MIN(bufsize, stripe_size));
2461 + ADIOI_BUFFERED_WRITE_WITHOUT_READ
2463 + /* Calculate end_offset, the last byte-offset that will be accessed.
2464 + e.g., if start_off=0 and 100 bytes are to be written, end_offset=99 */
2465 + st_fwr_size = fwr_size;
2466 + st_n_filetypes = n_filetypes;
2470 + fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
2471 + while (i < bufsize) {
2473 + end_offset = off + fwr_size - 1;
2475 + if (j < (flat_file->count - 1))
2482 + off = disp + flat_file->indices[j] +
2483 + (ADIO_Offset) n_filetypes * filetype_extent;
2484 + fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize - i);
2489 + writebuf = (char *) ADIOI_Malloc(stripe_size);
2490 + memset(writebuf, -1, stripe_size);
2491 + /* if atomicity is true, lock the region to be accessed */
2492 + if (fd->atomicity)
2493 + ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, bufsize);
2495 + if (buftype_is_contig && !filetype_is_contig) {
2496 + /* contiguous in memory, noncontiguous in file. should be the most
2501 + n_filetypes = st_n_filetypes;
2502 + fwr_size = ADIOI_MIN(st_fwr_size, bufsize);
2503 + while (i < bufsize) {
2505 + /* TYPE_UB and TYPE_LB can result in
2506 + fwr_size = 0. save system call in such cases */
2508 + lseek(fd->fd_sys, off, SEEK_SET);
2509 + err = write(fd->fd_sys, ((char *) buf) + i, fwr_size);
2512 + req_len = fwr_size;
2514 + ADIOI_BUFFERED_WRITE
2518 + if (off + fwr_size < disp + flat_file->indices[j] +
2519 + flat_file->blocklens[j] +
2520 + (ADIO_Offset) n_filetypes * filetype_extent)
2522 + /* did not reach end of contiguous block in filetype.
2523 + no more I/O needed. off is incremented by fwr_size. */
2525 + if (j < (flat_file->count - 1))
2531 + off = disp + flat_file->indices[j] +
2532 + (ADIO_Offset) n_filetypes * filetype_extent;
2533 + fwr_size = ADIOI_MIN(flat_file->blocklens[j],
2538 + /* noncontiguous in memory as well as in file */
2539 + ADIOI_Flatten_datatype(datatype);
2540 + flat_buf = ADIOI_Flatlist;
2541 + while (flat_buf->type != datatype)
2542 + flat_buf = flat_buf->next;
2544 + k = num = buf_count = 0;
2545 + i = (int) (flat_buf->indices[0]);
2548 + n_filetypes = st_n_filetypes;
2549 + fwr_size = st_fwr_size;
2550 + bwr_size = flat_buf->blocklens[0];
2552 + while (num < bufsize) {
2553 + size = ADIOI_MIN(fwr_size, bwr_size);
2556 + lseek(fd->fd_sys, off, SEEK_SET);
2557 + err = write(fd->fd_sys, ((char *) buf) + i, size);
2562 + ADIOI_BUFFERED_WRITE
2565 + new_fwr_size = fwr_size;
2566 + new_bwr_size = bwr_size;
2568 + if (size == fwr_size) {
2569 + /* reached end of contiguous block in file */
2570 + if (j < (flat_file->count - 1)) {
2576 + off = disp + flat_file->indices[j] +
2577 + (ADIO_Offset) n_filetypes * filetype_extent;
2578 + new_fwr_size = flat_file->blocklens[j];
2579 + if (size != bwr_size) {
2581 + new_bwr_size -= size;
2584 + if (size == bwr_size) {
2585 + /* reached end of contiguous block in memory */
2586 + k = (k + 1) % flat_buf->count;
2588 + i = (int) (buftype_extent *
2589 + (buf_count / flat_buf->count) +
2590 + flat_buf->indices[k]);
2591 + new_bwr_size = flat_buf->blocklens[k];
2592 + if (size != fwr_size) {
2594 + new_fwr_size -= size;
2598 + fwr_size = new_fwr_size;
2599 + bwr_size = new_bwr_size;
2603 + /* write the buffer out finally */
2604 + if (writebuf_len) {
2605 + ADIO_WriteContig(fd, writebuf, writebuf_len,
2606 + MPI_BYTE, ADIO_EXPLICIT_OFFSET,
2607 + writebuf_off, &status1, error_code);
2608 + if (!(fd->atomicity))
2609 + ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
2610 + if (*error_code != MPI_SUCCESS) {
2611 + ADIOI_Free(writebuf);
2615 + if (fd->atomicity)
2616 + ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize);
2618 + ADIOI_Free(writebuf);
2619 + if (file_ptr_type == ADIO_INDIVIDUAL)
2622 + fd->fp_sys_posn = -1; /* set it to null. */
2624 +#ifdef HAVE_STATUS_SET_BYTES
2625 + MPIR_Status_set_bytes(status, datatype, bufsize);
2626 + /* This is a temporary way of filling in status. The right way is to
2627 + keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */
2630 + if (!buftype_is_contig)
2631 + ADIOI_Delete_flattened(datatype);
2633 diff -ruN ad_lustre_orig/Makefile.in ad_lustre/Makefile.in
2634 --- ad_lustre_orig/Makefile.in 2008-09-17 14:36:57.000000000 +0800
2635 +++ ad_lustre/Makefile.in 2008-09-17 18:20:35.000000000 +0800
2639 AD_LUSTRE_OBJECTS = ad_lustre.o ad_lustre_open.o \
2640 - ad_lustre_rwcontig.o ad_lustre_hints.o
2641 + ad_lustre_rwcontig.o ad_lustre_wrcoll.o ad_lustre_wrstr.o \
2642 + ad_lustre_hints.o ad_lustre_aggregate.o
2646 @if [ "@ENABLE_SHLIB@" != "none" ] ; then \
2647 diff -ruN ad_lustre_orig/README ad_lustre/README
2648 --- ad_lustre_orig/README 2008-09-17 14:36:57.000000000 +0800
2649 +++ ad_lustre/README 2008-10-15 22:43:07.000000000 +0800
2651 o To post the code for ParColl (Partitioned collective IO)
2653 -----------------------------------------------------
2655 +-----------------------------------------------------
2656 +Improved data redistribution
2657 + o Improve I/O pattern identification. Besides checking for interleaving,
2658 + collective I/O is also performed if the requested I/O size is small.
2659 + The hint big_req_size can be used to set the request size value.
2660 + o Provide hint CO for load balancing to control the number of
2661 + IO clients for each OST
2662 + o Produce stripe-contiguous I/O pattern that Lustre prefers
2663 + o Reduce the collective overhead by hints contiguous_data and
2664 + same_io_size to remove unnecessary MPI_Alltoall()
2665 + o Control read-modify-write in data sieving in collective IO
2666 + by hint ds_in_coll.
2667 + o Optimize the IO pattern.
2668 + - If the whole access size <= stripe size, we suggest that all of
2669 + the I/O be performed by the same client, to avoid extent lock
2670 + revocation and reassignment.
2672 +-----------------------------------------------------
2674 -----------------------------------------------------
2675 o Direct IO and Lockless IO support
2676 --- common/ad_write_coll_orig.c 2008-10-15 11:24:31.000000000 +0800
2677 +++ common/ad_write_coll.c 2008-10-15 11:25:39.000000000 +0800
2679 int *send_buf_idx, int *curr_to_proc,
2680 int *done_to_proc, int iter,
2681 MPI_Aint buftype_extent);
2682 -static void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count,
2683 +void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count,
2684 ADIO_Offset *srt_off, int *srt_len, int *start_pos,
2685 int nprocs, int nprocs_recv, int total_elements);
2691 -static void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count,
2692 +void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count,
2693 ADIO_Offset *srt_off, int *srt_len, int *start_pos,
2694 int nprocs, int nprocs_recv, int total_elements)