X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fcontrib%2Fadio_driver_mpich2-1.0.7.patch;h=6b3387216cffdf9677efd8fe9f986bae7c403c46;hb=ad5c8ce3c3b561c3913cfe28703652329ec6c4dc;hp=eb919cbe8108ef6b7b1b1889b0e6a3058baabdc3;hpb=25d4f73bfbbd6191f35f5ad9c7ed776ee9adf731;p=fs%2Flustre-release.git diff --git a/lustre/contrib/adio_driver_mpich2-1.0.7.patch b/lustre/contrib/adio_driver_mpich2-1.0.7.patch index eb919cb..6b33872 100644 --- a/lustre/contrib/adio_driver_mpich2-1.0.7.patch +++ b/lustre/contrib/adio_driver_mpich2-1.0.7.patch @@ -1,7 +1,7 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c --- ad_lustre_orig/ad_lustre_aggregate.c 1970-01-01 08:00:00.000000000 +0800 -+++ ad_lustre/ad_lustre_aggregate.c 2008-09-17 18:20:35.000000000 +0800 -@@ -0,0 +1,676 @@ ++++ ad_lustre/ad_lustre_aggregate.c 2008-10-17 17:30:00.000000000 +0800 +@@ -0,0 +1,502 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (C) 1997 University of Chicago. @@ -16,78 +16,92 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c +#include "adio_extern.h" + +void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int ** striping_info_ptr, -+ int mode, int nprocs) ++ int mode) +{ + int *striping_info = NULL; + /* get striping information: -+ * striping_info[0] = stripe_size; -+ * striping_info[1] = stripe_count; -+ * striping_info[2] = CO; ++ * striping_info[0]: stripe_size ++ * striping_info[1]: stripe_count ++ * striping_info[2]: avail_cb_nodes + */ -+ /* for easy understanding, we name some variables */ -+ int stripe_size, stripe_count, CO = 1, CO_max = 1, lflag; ++ int stripe_size, stripe_count, CO = 1, CO_max = 1, CO_nodes, lflag; ++ int avail_cb_nodes, divisor, nprocs_for_coll = fd->hints->cb_nodes; + char *value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); ++ ++ /* Get hints value */ + /* stripe size */ + MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, &lflag); + if (lflag) + stripe_size = atoi(value); + /* stripe count */ ++ /* stripe_size and stripe_count have been validated in ADIOI_LUSTRE_Open() */ + MPI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, value, &lflag); + if (lflag) + stripe_count = atoi(value); -+ /* stripe_size and stripe_count have been validated in ADIOI_LUSTRE_Open() */ + -+ /* calculate CO */ ++ /* Calculate the available number of I/O clients, that is ++ * avail_cb_nodes=min(cb_nodes, stripe_count*CO), where ++ * CO=1 by default ++ */ + if (!mode) { -+ /* for collective read, ++ /* for collective read, + * if "CO" clients access the same OST simultaneously, -+ * the OST disk seek time would be large. So, to avoid this, ++ * the OST disk seek time would be much. So, to avoid this, + * it might be better if 1 client only accesses 1 OST. + * So, we set CO = 1 to meet the above requirement. + */ + CO = 1; + /*XXX: maybe there are other better way for collective read */ + } else { -+ /* CO_max: the largest number of IO clients for each ost group */ -+ CO_max = (nprocs - 1)/ stripe_count + 1; ++ /* CO_max: the largest number of IO clients for each ost group */ ++ CO_max = (nprocs_for_coll - 1)/ stripe_count + 1; + /* CO also has been validated in ADIOI_LUSTRE_Open(), >0 */ + MPI_Info_get(fd->info, "CO", MPI_MAX_INFO_VAL, value, &lflag); + if (lflag) + CO = atoi(value); + CO = ADIOI_MIN(CO_max, CO); + } -+ ADIOI_Free(value); -+ /* although there are known "N" hints so far, we still malloc space here -+ * instead of declaring an array[3] outside, -+ * because on one hand in the future we probably need more hints, and -+ * on the other hand this function can be called by -+ * both collective read and write conveniently. -+ */ ++ /* Calculate how many IO clients we need */ ++ /* To avoid extent lock conflicts, ++ * avail_cb_nodes should divide (stripe_count*CO) exactly, ++ * so that each OST is accessed by only one or more constant clients. */ ++ avail_cb_nodes = ADIOI_MIN(nprocs_for_coll, stripe_count * CO); ++ if (avail_cb_nodes == nprocs_for_coll) { ++ CO_nodes = stripe_count * CO; ++ do { ++ /* find the divisor of CO_nodes */ ++ divisor = 1; ++ do { ++ divisor ++; ++ } while (CO_nodes % divisor); ++ CO_nodes = CO_nodes / divisor; ++ /* if stripe_count*CO is a prime number, change nothing */ ++ if ((CO_nodes <= avail_cb_nodes) && (CO_nodes != 1)) { ++ avail_cb_nodes = CO_nodes; ++ break; ++ } ++ } while (CO_nodes != 1); ++ } ++ + *striping_info_ptr = (int *) ADIOI_Malloc(3 * sizeof(int)); + striping_info = *striping_info_ptr; + striping_info[0] = stripe_size; + striping_info[1] = stripe_count; -+ striping_info[2] = CO; ++ striping_info[2] = avail_cb_nodes; ++ ++ ADIOI_Free(value); +} + +int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off, -+ ADIO_Offset *len, int nprocs, -+ int *striping_info) ++ ADIO_Offset *len, int *striping_info) +{ -+ /* please refer the comments in above function for the detailed algorithm */ -+ int rank_index; ++ int rank_index, rank; + ADIO_Offset avail_bytes; -+ + int stripe_size = striping_info[0]; -+ int stripe_count = striping_info[1]; -+ int CO = striping_info[2]; -+ int avail_nprocs = ADIOI_MIN(stripe_count * CO, nprocs); -+ -+ /* calculate the rank by offset directly */ -+ rank_index = (int)((off / stripe_size) % avail_nprocs); -+ /* XXX: the above method is so simple that the processes in top ranks are always -+ * chosen to be I/O clients. we hope they are different each time. -+ */ ++ int avail_cb_nodes = striping_info[2]; ++ ++ /* Produce the stripe-contiguous pattern for Lustre */ ++ rank_index = (int)((off / stripe_size) % avail_cb_nodes); + + avail_bytes = (off / (ADIO_Offset)stripe_size + 1) * + (ADIO_Offset)stripe_size - off; @@ -95,8 +109,11 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + /* this proc only has part of the requested contig. region */ + *len = avail_bytes; + } ++ /* map our index to a rank */ ++ /* NOTE: FOR NOW WE DON'T HAVE A MAPPING...JUST DO 0..NPROCS_FOR_COLL */ ++ rank = fd->hints->ranklist[rank_index]; + -+ return rank_index; ++ return rank; +} + +void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, @@ -107,6 +124,8 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + ADIOI_Access ** my_req_ptr, + int **buf_idx_ptr) +{ ++ /* Nothing different from ADIOI_Calc_my_req(), except calling ++ * ADIOI_Lustre_Calc_aggregator() instead of the old one */ + int *count_my_req_per_proc, count_my_req_procs, *buf_idx; + int i, l, proc; + ADIO_Offset avail_len, rem_len, curr_idx, off; @@ -140,8 +159,7 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + * then ADIOI_LUSTRE_Calc_aggregator() will modify the value to return + * the amount that was available. + */ -+ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, nprocs, -+ striping_info); ++ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info); + count_my_req_per_proc[proc]++; + /* figure out how many data is remaining in the access + * we'll take care of this data (if there is any) @@ -152,8 +170,7 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + while (rem_len != 0) { + off += avail_len; /* point to first remaining byte */ + avail_len = rem_len; /* save remaining size, pass to calc */ -+ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, nprocs, -+ striping_info); ++ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info); + count_my_req_per_proc[proc]++; + rem_len -= avail_len; /* reduce remaining length by amount from fd */ + } @@ -182,8 +199,7 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + continue; + off = offset_list[i]; + avail_len = len_list[i]; -+ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, nprocs, -+ striping_info); ++ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info); + + /* for each separate contiguous access from this process */ + if (buf_idx[proc] == -1) @@ -206,7 +222,7 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + while (rem_len != 0) { + off += avail_len; + avail_len = rem_len; -+ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, nprocs, ++ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, + striping_info); + if (buf_idx[proc] == -1) + buf_idx[proc] = (int) curr_idx; @@ -246,13 +262,10 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c +int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count, + int *len_list, int nprocs) +{ -+ /* Algorithm: -+ * So far, only one case is suitable for collective I/O -+ * (1) request size <= big_req_size -+ * -+ * if (avg_req_size > big_req_size) { -+ * docollect = 0; -+ * } ++ /* If the processes are non-interleaved, we will check the req_size. ++ * if (avg_req_size > big_req_size) { ++ * docollect = 0; ++ * } + */ + + int i, docollect = 1, lflag, big_req_size = 0; @@ -270,12 +283,12 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + /* estimate average req_size */ + avg_req_size = (int)(total_req_size / total_access_count); + -+ /* get hint of hole_ratio */ ++ /* get hint of big_req_size */ + value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); + MPI_Info_get(fd->info, "big_req_size", MPI_MAX_INFO_VAL, value, &lflag); + if (lflag) + big_req_size = atoi(value); -+ ++ /* Don't perform collective I/O if there are big requests */ + if ((big_req_size > 0) && (avg_req_size > big_req_size)) + docollect = 0; + @@ -284,200 +297,6 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + return docollect; +} + -+void ADIOI_LUSTRE_Calc_my_off_len(ADIO_File fd, int bufcount, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, -+ ADIO_Offset **offset_list_ptr, -+ int **len_list_ptr, -+ ADIO_Offset *start_offset_ptr, -+ ADIO_Offset *end_offset_ptr, -+ int *contig_access_count_ptr) -+{ -+ int filetype_size, buftype_size, etype_size; -+ int i, j, k, frd_size = 0, old_frd_size = 0, st_index = 0; -+ int n_filetypes, etype_in_filetype; -+ ADIO_Offset abs_off_in_filetype = 0; -+ int bufsize, sum, n_etypes_in_filetype, size_in_filetype; -+ int contig_access_count, *len_list, flag, filetype_is_contig; -+ MPI_Aint filetype_extent, filetype_lb; -+ ADIOI_Flatlist_node *flat_file; -+ ADIO_Offset *offset_list, off, end_offset = 0, disp; -+ -+ /* For this process's request, calculate the list of offsets and -+ lengths in the file and determine the start and end offsets. */ -+ -+ ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); -+ -+ MPI_Type_size(fd->filetype, &filetype_size); -+ MPI_Type_extent(fd->filetype, &filetype_extent); -+ MPI_Type_lb(fd->filetype, &filetype_lb); -+ MPI_Type_size(datatype, &buftype_size); -+ etype_size = fd->etype_size; -+ -+ if (!filetype_size) { -+ *contig_access_count_ptr = 0; -+ *offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset)); -+ *len_list_ptr = (int *) ADIOI_Malloc(2 * sizeof(int)); -+ /* 2 is for consistency. everywhere I malloc one more than needed */ -+ -+ offset_list = *offset_list_ptr; -+ len_list = *len_list_ptr; -+ offset_list[0] = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : -+ fd->disp + etype_size * offset; -+ len_list[0] = 0; -+ *start_offset_ptr = offset_list[0]; -+ *end_offset_ptr = offset_list[0] + len_list[0] - 1; -+ return; -+ } -+ -+ if (filetype_is_contig) { -+ *contig_access_count_ptr = 1; -+ *offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset)); -+ *len_list_ptr = (int *) ADIOI_Malloc(2 * sizeof(int)); -+ /* 2 is for consistency. everywhere I malloc one more than needed */ -+ -+ offset_list = *offset_list_ptr; -+ len_list = *len_list_ptr; -+ offset_list[0] = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : -+ fd->disp + etype_size * offset; -+ len_list[0] = bufcount * buftype_size; -+ *start_offset_ptr = offset_list[0]; -+ *end_offset_ptr = offset_list[0] + len_list[0] - 1; -+ -+ /* update file pointer */ -+ if (file_ptr_type == ADIO_INDIVIDUAL) -+ fd->fp_ind = *end_offset_ptr + 1; -+ } else { -+ /* First calculate what size of offset_list and len_list to allocate */ -+ /* filetype already flattened in ADIO_Open or ADIO_Fcntl */ -+ flat_file = ADIOI_Flatlist; -+ while (flat_file->type != fd->filetype) -+ flat_file = flat_file->next; -+ disp = fd->disp; -+ -+ if (file_ptr_type == ADIO_INDIVIDUAL) { -+ offset = fd->fp_ind; /* in bytes */ -+ n_filetypes = -1; -+ flag = 0; -+ while (!flag) { -+ n_filetypes++; -+ for (i = 0; i < flat_file->count; i++) { -+ if (disp + flat_file->indices[i] + -+ (ADIO_Offset) n_filetypes * filetype_extent + -+ flat_file->blocklens[i] >= offset) { -+ st_index = i; -+ frd_size = (int) (disp + flat_file->indices[i] + -+ (ADIO_Offset) n_filetypes * -+ filetype_extent + -+ flat_file->blocklens[i] - -+ offset); -+ flag = 1; -+ break; -+ } -+ } -+ } -+ } else { -+ n_etypes_in_filetype = filetype_size / etype_size; -+ n_filetypes = (int) (offset / n_etypes_in_filetype); -+ etype_in_filetype = (int) (offset % n_etypes_in_filetype); -+ size_in_filetype = etype_in_filetype * etype_size; -+ -+ sum = 0; -+ for (i = 0; i < flat_file->count; i++) { -+ sum += flat_file->blocklens[i]; -+ if (sum > size_in_filetype) { -+ st_index = i; -+ frd_size = sum - size_in_filetype; -+ abs_off_in_filetype = flat_file->indices[i] + -+ size_in_filetype - -+ (sum - flat_file->blocklens[i]); -+ break; -+ } -+ } -+ -+ /* abs. offset in bytes in the file */ -+ offset = disp + (ADIO_Offset) n_filetypes *filetype_extent + -+ abs_off_in_filetype; -+ } -+ -+ /* calculate how much space to allocate for offset_list, len_list */ -+ -+ old_frd_size = frd_size; -+ contig_access_count = i = 0; -+ j = st_index; -+ bufsize = buftype_size * bufcount; -+ frd_size = ADIOI_MIN(frd_size, bufsize); -+ while (i < bufsize) { -+ if (frd_size) -+ contig_access_count++; -+ i += frd_size; -+ j = (j + 1) % flat_file->count; -+ frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize - i); -+ } -+ -+ /* allocate space for offset_list and len_list */ -+ -+ *offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc((contig_access_count+1) * -+ sizeof(ADIO_Offset)); -+ *len_list_ptr = (int *) ADIOI_Malloc((contig_access_count + 1) * -+ sizeof(int)); -+ /* +1 to avoid a 0-size malloc */ -+ -+ offset_list = *offset_list_ptr; -+ len_list = *len_list_ptr; -+ -+ /* find start offset, end offset, and fill in offset_list and len_list */ -+ -+ *start_offset_ptr = offset; /* calculated above */ -+ -+ i = k = 0; -+ j = st_index; -+ off = offset; -+ frd_size = ADIOI_MIN(old_frd_size, bufsize); -+ while (i < bufsize) { -+ if (frd_size) { -+ offset_list[k] = off; -+ len_list[k] = frd_size; -+ k++; -+ } -+ i += frd_size; -+ end_offset = off + frd_size - 1; -+ -+ /* Note: end_offset points to the last byte-offset that will be accessed. -+ e.g., if start_offset=0 and 100 bytes to be read, end_offset=99 */ -+ -+ if (off + frd_size < disp + flat_file->indices[j] + -+ flat_file->blocklens[j] + -+ (ADIO_Offset) n_filetypes * filetype_extent) { -+ off += frd_size; -+ /* did not reach end of contiguous block in filetype. -+ * no more I/O needed. off is incremented by frd_size. -+ */ -+ } else { -+ if (j < (flat_file->count - 1)) -+ j++; -+ else { -+ /* hit end of flattened filetype; -+ * start at beginning again -+ */ -+ j = 0; -+ n_filetypes++; -+ } -+ off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes * -+ filetype_extent; -+ frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize - i); -+ } -+ } -+ -+ /* update file pointer */ -+ if (file_ptr_type == ADIO_INDIVIDUAL) -+ fd->fp_ind = off; -+ -+ *contig_access_count_ptr = contig_access_count; -+ *end_offset_ptr = end_offset; -+ } -+} -+ +void ADIOI_LUSTRE_Calc_others_req(ADIO_File fd, int count_my_req_procs, + int *count_my_req_per_proc, + ADIOI_Access * my_req, @@ -490,41 +309,49 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c +{ + /* what requests of other processes will be written by this process */ + -+ int *count_others_req_per_proc, count_others_req_procs; ++ int *count_others_req_per_proc, count_others_req_procs, proc; + int i, j, lflag, samesize = 0, contiguous = 0; ++ int avail_cb_nodes = striping_info[2]; + MPI_Request *send_requests, *recv_requests; + MPI_Status *statuses; + ADIOI_Access *others_req; + char *value = NULL; -+ int proc, avail_nprocs, stripe_count, CO; + ADIO_Offset min_st_offset, off, req_len, avail_len, rem_len, *all_lens; + ++ /* There are two hints, which could reduce some MPI communication overhead, ++ * if the users knows the I/O pattern and set them correctly. */ ++ /* They are ++ * contiguous_data: if the data are contiguous, ++ * we don't need to do MPI_Alltoall(). ++ * same_io_size: And if the data req size is same, ++ * we can calculate the offset directly ++ */ + value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); -+ /* same io size */ -+ MPI_Info_get(fd->info, "same_io_size", MPI_MAX_INFO_VAL, value, &lflag); -+ if (lflag && !strcmp(value, "yes")) -+ samesize = 1; -+ /* contiguous data */ ++ /* hint of contiguous data */ + MPI_Info_get(fd->info, "contiguous_data", MPI_MAX_INFO_VAL, value, &lflag); + if (lflag && !strcmp(value, "yes")) + contiguous = 1; ++ /* hint of same io size */ ++ MPI_Info_get(fd->info, "same_io_size", MPI_MAX_INFO_VAL, value, &lflag); ++ if (lflag && !strcmp(value, "yes")) ++ samesize = 1; ++ ADIOI_Free(value); + + *others_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs * + sizeof(ADIOI_Access)); + others_req = *others_req_ptr; + -+ /* if the data are contiguous, we don't need to do MPI_Alltoall */ ++ /* if the data are contiguous, we can calulate the offset and length ++ * of the other requests simply, instead of MPI_Alltoall() */ + if (contiguous) { -+ stripe_count = striping_info[1]; -+ CO = striping_info[2]; -+ + for (i = 0; i < nprocs; i++) { + others_req[i].count = 0; + } + req_len = end_offset - start_offset + 1; + all_lens = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset)); + -+ if (samesize == 0) {/* different request size */ ++ /* same req size ? */ ++ if (samesize == 0) { + /* calculate the min_st_offset */ + MPI_Allreduce(&start_offset, &min_st_offset, 1, MPI_LONG_LONG, + MPI_MIN, fd->comm); @@ -532,14 +359,14 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + MPI_Allgather(&req_len, 1, ADIO_OFFSET, all_lens, 1, ADIO_OFFSET, + fd->comm); + } else { /* same request size */ -+ /* calculate the min_st_offset */ ++ /* calculate the 1st request's offset */ + min_st_offset = start_offset - myrank * req_len; + /* assign request length to all_lens[] */ + for (i = 0; i < nprocs; i ++) + all_lens[i] = req_len; + } -+ avail_nprocs = ADIOI_MIN(nprocs, stripe_count * CO); -+ if (myrank < avail_nprocs) { ++ if (myrank < avail_cb_nodes) { ++ /* This is a IO client and it will receive data from others */ + off = min_st_offset; + /* calcaulte other_req[i].count */ + for (i = 0; i < nprocs; i++) { @@ -547,7 +374,7 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + rem_len = avail_len; + while (rem_len > 0) { + proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, -+ nprocs, striping_info); ++ striping_info); + if (proc == myrank) { + others_req[i].count ++; + } @@ -575,7 +402,7 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + rem_len = avail_len; + while (rem_len > 0) { + proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, -+ nprocs, striping_info); ++ striping_info); + if (proc == myrank) { + others_req[i].offsets[j] = off; + others_req[i].lens[j] = (int)avail_len; @@ -587,7 +414,6 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + } + } + } -+ ADIOI_Free(value); + ADIOI_Free(all_lens); + } else { + /* multiple non-contiguous requests */ @@ -680,7 +506,7 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c +} diff -ruN ad_lustre_orig/ad_lustre.c ad_lustre/ad_lustre.c --- ad_lustre_orig/ad_lustre.c 2008-09-17 14:36:57.000000000 +0800 -+++ ad_lustre/ad_lustre.c 2008-09-17 18:20:35.000000000 +0800 ++++ ad_lustre/ad_lustre.c 2008-10-17 17:03:42.000000000 +0800 @@ -1,9 +1,11 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* @@ -695,73 +521,24 @@ diff -ruN ad_lustre_orig/ad_lustre.c ad_lustre/ad_lustre.c */ #include "ad_lustre.h" -@@ -13,13 +15,13 @@ +@@ -13,12 +15,12 @@ ADIOI_LUSTRE_ReadContig, /* ReadContig */ ADIOI_LUSTRE_WriteContig, /* WriteContig */ ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */ - ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */ + ADIOI_LUSTRE_WriteStridedColl, /* WriteStridedColl */ ADIOI_GEN_SeekIndividual, /* SeekIndividual */ -- ADIOI_GEN_Fcntl, /* Fcntl */ -+ ADIOI_LUSTRE_Fcntl, /* Fcntl */ + ADIOI_GEN_Fcntl, /* Fcntl */ ADIOI_LUSTRE_SetInfo, /* SetInfo */ ADIOI_GEN_ReadStrided, /* ReadStrided */ - ADIOI_GEN_WriteStrided, /* WriteStrided */ -- ADIOI_GEN_Close, /* Close */ + ADIOI_LUSTRE_WriteStrided, /* WriteStrided */ -+ ADIOI_LUSTRE_Close, /* Close */ + ADIOI_GEN_Close, /* Close */ #if defined(ROMIO_HAVE_WORKING_AIO) && !defined(CRAY_XT_LUSTRE) ADIOI_GEN_IreadContig, /* IreadContig */ - ADIOI_GEN_IwriteContig, /* IwriteContig */ -diff -ruN ad_lustre_orig/ad_lustre_close.c ad_lustre/ad_lustre_close.c ---- ad_lustre_orig/ad_lustre_close.c 1970-01-01 08:00:00.000000000 +0800 -+++ ad_lustre/ad_lustre_close.c 2008-09-17 18:20:35.000000000 +0800 -@@ -0,0 +1,42 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. -+ * -+ * Copyright (C) 2007 Oak Ridge National Laboratory -+ * -+ * Copyright (C) 2008 Sun Microsystems, Lustre group -+ */ -+ -+#include "ad_lustre.h" -+ -+#ifdef PROFILE -+#include "mpe.h" -+#endif -+ -+void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code) -+{ -+ int err, derr = 0; -+ static char myname[] = "ADIOI_LUSTRE_CLOSE"; -+ -+#ifdef PROFILE -+ MPE_Log_event(9, 0, "start close"); -+#endif -+ -+ err = close(fd->fd_sys); -+ -+#ifdef PROFILE -+ MPE_Log_event(10, 0, "end close"); -+#endif -+ -+ fd->fd_sys = -1; -+ -+ if (err == -1 || derr == -1) { -+ *error_code = -+ MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, -+ __LINE__, MPI_ERR_IO, "**io", "**io %s", -+ strerror(errno)); -+ } else -+ *error_code = MPI_SUCCESS; -+} diff -ruN ad_lustre_orig/ad_lustre.h ad_lustre/ad_lustre.h --- ad_lustre_orig/ad_lustre.h 2008-09-17 14:36:57.000000000 +0800 -+++ ad_lustre/ad_lustre.h 2008-10-06 16:07:21.000000000 +0800 ++++ ad_lustre/ad_lustre.h 2008-10-17 17:11:11.000000000 +0800 @@ -1,9 +1,11 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* @@ -809,7 +586,7 @@ diff -ruN ad_lustre_orig/ad_lustre.h ad_lustre/ad_lustre.h #include "adio.h" /*#include "adioi.h"*/ -@@ -41,24 +68,62 @@ +@@ -41,24 +68,31 @@ void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code); void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code); @@ -855,41 +632,10 @@ diff -ruN ad_lustre_orig/ad_lustre.h ad_lustre/ad_lustre.h int *error_code); void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code); - -+void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int ** striping_info_ptr, -+ int mode, int nprocs); -+int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off, -+ ADIO_Offset *len, int nprocs, -+ int *striping_info); -+void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, -+ int *len_list, int contig_access_count, -+ int *striping_info, int nprocs, -+ int *count_my_req_procs_ptr, -+ int **count_my_req_per_proc_ptr, -+ ADIOI_Access ** my_req_ptr, -+ int **buf_idx_ptr); -+int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count, -+ int *len_list, int nprocs); -+void ADIOI_LUSTRE_Calc_my_off_len(ADIO_File fd, int bufcount, -+ MPI_Datatype datatype, int file_ptr_type, -+ ADIO_Offset offset, -+ ADIO_Offset **offset_list_ptr, -+ int **len_list_ptr, -+ ADIO_Offset *start_offset_ptr, -+ ADIO_Offset *end_offset_ptr, -+ int *contig_access_count_ptr); -+void ADIOI_LUSTRE_Calc_others_req(ADIO_File fd, int count_my_req_procs, -+ int *count_my_req_per_proc, -+ ADIOI_Access * my_req, -+ int nprocs, int myrank, -+ ADIO_Offset start_offset, -+ ADIO_Offset end_offset, -+ int *striping_info, -+ int *count_others_req_procs_ptr, -+ ADIOI_Access ** others_req_ptr); #endif /* End of AD_UNIX_INCLUDE */ diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c --- ad_lustre_orig/ad_lustre_hints.c 2008-09-17 14:36:57.000000000 +0800 -+++ ad_lustre/ad_lustre_hints.c 2008-09-17 18:20:35.000000000 +0800 ++++ ad_lustre/ad_lustre_hints.c 2008-10-20 14:36:48.000000000 +0800 @@ -1,9 +1,11 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* @@ -904,7 +650,7 @@ diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c */ #include "ad_lustre.h" -@@ -11,130 +13,162 @@ +@@ -11,130 +13,173 @@ void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code) { @@ -916,7 +662,9 @@ diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c + int flag, tmp_val, int_val, str_factor, str_unit, start_iodev; + static char myname[] = "ADIOI_LUSTRE_SETINFO"; -- value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); ++ *error_code = MPI_SUCCESS; + value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); ++ if ( (fd->info) == MPI_INFO_NULL) { - /* This must be part of the open call. can set striping parameters - if necessary. */ @@ -942,8 +690,7 @@ diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c - value, &flag); - if (flag) - str_factor=atoi(value); -+ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); - +- - MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL, + /* direct read and write */ + MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL, @@ -957,7 +704,7 @@ diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c MPI_Info_set(fd->info, "direct_read", "true"); fd->direct_read = 1; } - +- - MPI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL, + MPI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL, value, &flag); @@ -965,14 +712,76 @@ diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c MPI_Info_set(fd->info, "direct_write", "true"); fd->direct_write = 1; } -- } - ++ /* stripe size */ ++ MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, ++ value, &flag); ++ if (flag && (str_unit = atoi(value))) { ++ tmp_val = str_unit; ++ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); ++ if (tmp_val != str_unit) { ++ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, ++ "striping_unit", ++ error_code); ++ ADIOI_Free(value); ++ return; ++ } ++ MPI_Info_set(fd->info, "striping_unit", value); ++ } ++ /* stripe count */ ++ MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, ++ value, &flag); ++ if (flag && (str_factor = atoi(value))) { ++ tmp_val = str_factor; ++ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); ++ if (tmp_val != str_factor) { ++ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, ++ "striping_factor", ++ error_code); ++ ADIOI_Free(value); ++ return; ++ } ++ MPI_Info_set(fd->info, "striping_factor", value); ++ } ++ /* stripe offset */ ++ MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL, ++ value, &flag); ++ if (flag && ((start_iodev = atoi(value)) >= 0)) { ++ tmp_val = start_iodev; ++ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); ++ if (tmp_val != start_iodev) { ++ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, ++ "start_iodevice", ++ error_code); ++ ADIOI_Free(value); ++ return; ++ } ++ MPI_Info_set(fd->info, "start_iodevice", value); ++ } + } +- - MPI_Comm_rank(fd->comm, &myrank); - if (myrank == 0) { - tmp_val[0] = str_factor; - tmp_val[1] = str_unit; - tmp_val[2] = start_iodev; -- } ++ } ++ if (users_info != MPI_INFO_NULL) { ++ /* CO: IO Clients/OST, ++ * to keep the load balancing between clients and OSTs */ ++ MPI_Info_get(users_info, "CO", MPI_MAX_INFO_VAL, value, ++ &flag); ++ if (flag && (int_val = atoi(value)) > 0) { ++ tmp_val = int_val; ++ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); ++ if (tmp_val != int_val) { ++ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, ++ "CO", ++ error_code); ++ ADIOI_Free(value); ++ return; ++ } ++ MPI_Info_set(fd->info, "CO", value); + } - MPI_Bcast(tmp_val, 3, MPI_INT, 0, fd->comm); - - if (tmp_val[0] != str_factor @@ -1026,122 +835,71 @@ diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c - close(fd_sys); - } - } /* End of striping parameters validation */ -+ /* stripe size */ -+ MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, -+ value, &flag); -+ if (flag && (str_unit = atoi(value))) { -+ tmp_val = str_unit; -+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); -+ if (tmp_val != str_unit) { -+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, -+ "striping_unit", -+ error_code); -+ return; -+ } -+ MPI_Info_set(fd->info, "striping_unit", value); -+ } -+ /* stripe count */ -+ MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, -+ value, &flag); -+ if (flag && (str_factor = atoi(value))) { -+ tmp_val = str_factor; -+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); -+ if (tmp_val != str_factor) { -+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, -+ "striping_factor", -+ error_code); -+ return; -+ } -+ MPI_Info_set(fd->info, "striping_factor", value); -+ } -+ /* stripe offset */ -+ MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL, -+ value, &flag); -+ if (flag && ((start_iodev = atoi(value)) >= 0)) { -+ tmp_val = start_iodev; -+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); -+ if (tmp_val != start_iodev) { -+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, -+ "start_iodevice", -+ error_code); -+ return; -+ } -+ MPI_Info_set(fd->info, "start_iodevice", value); -+ } -+ /* CO */ -+ MPI_Info_get(users_info, "CO", MPI_MAX_INFO_VAL, value, -+ &flag); -+ if (flag && (int_val = atoi(value)) > 0) { -+ tmp_val = int_val; -+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); -+ if (tmp_val != int_val) { -+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, -+ "CO", -+ error_code); -+ return; -+ } -+ MPI_Info_set(fd->info, "CO", value); -+ } -+ /* big_req_size */ -+ MPI_Info_get(users_info, "big_req_size", MPI_MAX_INFO_VAL, value, -+ &flag); -+ if (flag && (int_val = atoi(value)) > 0) { -+ tmp_val = int_val; -+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); -+ if (tmp_val != int_val) { -+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, -+ "big_req_size", -+ error_code); -+ return; -+ } -+ MPI_Info_set(fd->info, "big_req_size", value); ++ /* big_req_size: ++ * if the req size is bigger than this, ++ * collective IO may not be performed. ++ */ ++ MPI_Info_get(users_info, "big_req_size", MPI_MAX_INFO_VAL, value, ++ &flag); ++ if (flag && (int_val = atoi(value)) > 0) { ++ tmp_val = int_val; ++ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); ++ if (tmp_val != int_val) { ++ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, ++ "big_req_size", ++ error_code); ++ ADIOI_Free(value); ++ return; + } -+ /* hint for disabling data sieving when do collective IO */ -+ MPI_Info_get(users_info, "ds_in_coll", MPI_MAX_INFO_VAL, -+ value, &flag); -+ if (flag && (!strcmp(value, "enable") || -+ !strcmp(value, "ENABLE"))) { -+ tmp_val = int_val = 1; -+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); -+ if (tmp_val != int_val) { -+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, -+ "ds_in_coll", -+ error_code); -+ return; -+ } -+ MPI_Info_set(fd->info, "ds_in_coll", "enable"); ++ MPI_Info_set(fd->info, "big_req_size", value); ++ } ++ /* ds_in_coll: disable data sieving in collective IO */ ++ MPI_Info_get(users_info, "ds_in_coll", MPI_MAX_INFO_VAL, ++ value, &flag); ++ if (flag && (!strcmp(value, "enable") || ++ !strcmp(value, "ENABLE"))) { ++ tmp_val = int_val = 1; ++ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); ++ if (tmp_val != int_val) { ++ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, ++ "ds_in_coll", ++ error_code); ++ ADIOI_Free(value); ++ return; + } -+ /* same io size */ -+ MPI_Info_get(users_info, "same_io_size", MPI_MAX_INFO_VAL, -+ value, &flag); -+ if (flag && (!strcmp(value, "yes") || -+ !strcmp(value, "YES"))) { -+ tmp_val = int_val = 1; -+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); -+ if (tmp_val != int_val) { -+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, -+ "same_io_size", -+ error_code); -+ return; -+ } -+ MPI_Info_set(fd->info, "same_io_size", "yes"); ++ MPI_Info_set(fd->info, "ds_in_coll", "enable"); ++ } ++ /* contiguous_data: whether the data are contiguous */ ++ MPI_Info_get(users_info, "contiguous_data", MPI_MAX_INFO_VAL, ++ value, &flag); ++ if (flag && (!strcmp(value, "yes") || ++ !strcmp(value, "YES"))) { ++ tmp_val = int_val = 1; ++ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); ++ if (tmp_val != int_val) { ++ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, ++ "contiguous_data", ++ error_code); ++ ADIOI_Free(value); ++ return; + } -+ /* contiguous data */ -+ MPI_Info_get(users_info, "contiguous_data", MPI_MAX_INFO_VAL, -+ value, &flag); -+ if (flag && (!strcmp(value, "yes") || -+ !strcmp(value, "YES"))) { -+ tmp_val = int_val = 1; -+ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); -+ if (tmp_val != int_val) { -+ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, -+ "contiguous_data", -+ error_code); -+ return; -+ } -+ MPI_Info_set(fd->info, "contiguous_data", "yes"); ++ MPI_Info_set(fd->info, "contiguous_data", "yes"); ++ } ++ /* same_io_size: whether the req size is same */ ++ MPI_Info_get(users_info, "same_io_size", MPI_MAX_INFO_VAL, ++ value, &flag); ++ if (flag && (!strcmp(value, "yes") || ++ !strcmp(value, "YES"))) { ++ tmp_val = int_val = 1; ++ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); ++ if (tmp_val != int_val) { ++ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, ++ "same_io_size", ++ error_code); ++ ADIOI_Free(value); ++ return; + } -+ ADIOI_Free(value); ++ MPI_Info_set(fd->info, "same_io_size", "yes"); } - - MPI_Barrier(fd->comm); @@ -1155,15 +913,16 @@ diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c - ADIOI_GEN_SetInfo(fd, users_info, error_code); } - +- if (ADIOI_Direct_read) fd->direct_read = 1; +- if (ADIOI_Direct_write) fd->direct_write = 1; +- + ADIOI_Free(value); + /* set the values for collective I/O and data sieving parameters */ + ADIOI_GEN_SetInfo(fd, users_info, error_code); -+ - if (ADIOI_Direct_read) fd->direct_read = 1; - if (ADIOI_Direct_write) fd->direct_write = 1; -- ADIOI_Free(value); -- - *error_code = MPI_SUCCESS; +- *error_code = MPI_SUCCESS; ++ if (ADIOI_Direct_read) fd->direct_read = 1; ++ if (ADIOI_Direct_write) fd->direct_write = 1; } diff -ruN ad_lustre_orig/ad_lustre_open.c ad_lustre/ad_lustre_open.c --- ad_lustre_orig/ad_lustre_open.c 2008-09-17 14:36:57.000000000 +0800 @@ -1391,7 +1150,7 @@ diff -ruN ad_lustre_orig/ad_lustre_open.c ad_lustre/ad_lustre_open.c } diff -ruN ad_lustre_orig/ad_lustre_rwcontig.c ad_lustre/ad_lustre_rwcontig.c --- ad_lustre_orig/ad_lustre_rwcontig.c 2008-09-17 14:36:57.000000000 +0800 -+++ ad_lustre/ad_lustre_rwcontig.c 2008-09-17 18:52:01.000000000 +0800 ++++ ad_lustre/ad_lustre_rwcontig.c 2008-10-15 22:44:35.000000000 +0800 @@ -1,9 +1,11 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* @@ -1406,19 +1165,10 @@ diff -ruN ad_lustre_orig/ad_lustre_rwcontig.c ad_lustre/ad_lustre_rwcontig.c */ #define _XOPEN_SOURCE 600 -@@ -138,7 +140,7 @@ - - if (io_mode) - err = write(fd->fd_sys, buf, len); -- else -+ else - err = read(fd->fd_sys, buf, len); - } else { - err = ADIOI_LUSTRE_Directio(fd, buf, len, offset, io_mode); diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c --- ad_lustre_orig/ad_lustre_wrcoll.c 1970-01-01 08:00:00.000000000 +0800 -+++ ad_lustre/ad_lustre_wrcoll.c 2008-09-17 18:20:35.000000000 +0800 -@@ -0,0 +1,973 @@ ++++ ad_lustre/ad_lustre_wrcoll.c 2008-10-17 16:34:36.000000000 +0800 +@@ -0,0 +1,880 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (C) 1997 University of Chicago. @@ -1474,10 +1224,9 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + int *done_to_proc, int *hole, + int iter, MPI_Aint buftype_extent, + int *buf_idx, int *error_code); -+static void ADIOI_Heap_merge(ADIOI_Access * others_req, int *count, -+ ADIO_Offset * srt_off, int *srt_len, -+ int *start_pos, int nprocs, int nprocs_recv, -+ int total_elements); ++void ADIOI_Heap_merge(ADIOI_Access * others_req, int *count, ++ ADIO_Offset * srt_off, int *srt_len, int *start_pos, ++ int nprocs, int nprocs_recv, int total_elements); + +void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count, + MPI_Datatype datatype, @@ -1492,31 +1241,54 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + /* array of nprocs access structures, one for each other process + whose request is written by this process. */ + -+ int i, filetype_is_contig, nprocs, nprocs_for_coll, myrank, do_collect = 0; -+ int contig_access_count = 0, buftype_is_contig; ++ int i, filetype_is_contig, nprocs, myrank, do_collect = 0; ++ int contig_access_count = 0, buftype_is_contig, interleave_count = 0; + int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs; -+ ADIO_Offset orig_fp, start_offset, end_offset, off, *offset_list = NULL; ++ ADIO_Offset orig_fp, start_offset, end_offset, off; ++ ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *end_offsets = NULL; + int *buf_idx = NULL, *len_list = NULL, *striping_info = NULL; + int old_error, tmp_error; + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &myrank); + -+ nprocs_for_coll = fd->hints->cb_nodes; + orig_fp = fd->fp_ind; + + /* IO patten identification if cb_write isn't disabled */ + if (fd->hints->cb_write != ADIOI_HINT_DISABLE) { + /* For this process's request, calculate the list of offsets and + lengths in the file and determine the start and end offsets. */ -+ ADIOI_LUSTRE_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset, -+ &offset_list, &len_list, &start_offset, -+ &end_offset, &contig_access_count); -+ /* Get striping information */ -+ ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1, nprocs); -+ /* check if the access pattern can benefit from collective write */ -+ do_collect = ADIOI_LUSTRE_Docollect(fd, contig_access_count, -+ len_list, nprocs); ++ ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset, ++ &offset_list, &len_list, &start_offset, ++ &end_offset, &contig_access_count); ++ ++ /* each process communicates its start and end offsets to other ++ processes. The result is an array each of start and end offsets stored ++ in order of process rank. */ ++ st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset)); ++ end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset)); ++ MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1, ++ ADIO_OFFSET, fd->comm); ++ MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1, ++ ADIO_OFFSET, fd->comm); ++ /* are the accesses of different processes interleaved? */ ++ for (i = 1; i < nprocs; i++) ++ if ((st_offsets[i] < end_offsets[i-1]) && ++ (st_offsets[i] <= end_offsets[i])) ++ interleave_count++; ++ /* This is a rudimentary check for interleaving, but should suffice ++ for the moment. */ ++ ++ /* Two typical access patterns can benefit from collective write. ++ * 1) the processes are interleaved, and ++ * 2) the req size is small. ++ */ ++ if (interleave_count > 0) { ++ do_collect = 1; ++ } else { ++ do_collect = ADIOI_LUSTRE_Docollect(fd, contig_access_count, ++ len_list, nprocs); ++ } + } + ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); + @@ -1530,6 +1302,8 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + if (fd->hints->cb_write != ADIOI_HINT_DISABLE) { + ADIOI_Free(offset_list); + ADIOI_Free(len_list); ++ ADIOI_Free(st_offsets); ++ ADIOI_Free(end_offsets); + } + + fd->fp_ind = orig_fp; @@ -1550,6 +1324,8 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + return; + } + ++ /* Get Lustre hints information */ ++ ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1); + /* calculate what portions of the access requests of this process are + * located in which process + */ @@ -1619,6 +1395,8 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + ADIOI_Free(buf_idx); + ADIOI_Free(offset_list); + ADIOI_Free(len_list); ++ ADIOI_Free(st_offsets); ++ ADIOI_Free(end_offsets); + ADIOI_Free(striping_info); + +#ifdef HAVE_STATUS_SET_BYTES @@ -1659,16 +1437,16 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + MPI_Status status; + ADIOI_Flatlist_node *flat_buf = NULL; + MPI_Aint buftype_extent; -+ int stripe_size = striping_info[0], lflag, data_sieving = 0; -+ int stripe_count = striping_info[1], CO = striping_info[2]; -+ /* IO step size in each communication */ -+ static char myname[] = "ADIOI_EXCH_AND_WRITE"; ++ int stripe_size = striping_info[0], avail_cb_nodes = striping_info[2]; ++ int lflag, data_sieving = 0; + + *error_code = MPI_SUCCESS; /* changed below if error */ + -+ /* calculate the number of writes of stripe size -+ * to be done by each process and the max among all processes. ++ /* calculate the number of writes of stripe size to be done. + * That gives the no. of communication phases as well. ++ * Note: ++ * Because we redistribute data in stripe-contiguous pattern for Lustre, ++ * each process has the same no. of communication phases. + */ + + for (i = 0; i < nprocs; i++) { @@ -1678,7 +1456,6 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + break; + } + } -+ + for (i = 0; i < nprocs; i++) { + for (j = 0; j < others_req[i].count; j++) { + st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]); @@ -1696,10 +1473,12 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + MPI_Allreduce(&st_loc, &min_st_loc, 1, MPI_LONG_LONG_INT, MPI_MIN, fd->comm); + /* align downward */ + min_st_loc -= min_st_loc % (ADIO_Offset)stripe_size; -+ /* when nprocs < stripe_count, there will be trouble, because some client -+ * would access more than one OST in one whole communication. ++ ++ /* Each time, only avail_cb_nodes number of IO clients perform IO, ++ * so, step_size=avail_cb_nodes*stripe_size IO will be performed at most, ++ * and ntimes=whole_file_portion/step_size + */ -+ step_size = (ADIO_Offset)ADIOI_MIN(nprocs, stripe_count * CO) * stripe_size; ++ step_size = (ADIO_Offset) avail_cb_nodes * stripe_size; + max_ntimes = (int)((max_end_loc - min_st_loc) / step_size + 1); + if (ntimes) + write_buf = (char *) ADIOI_Malloc(stripe_size); @@ -1711,7 +1490,6 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + for (i = 0; i < nprocs; i++) { + for (j = 0; j < others_req[i].count; j ++) { + req_off = others_req[i].offsets[j]; -+ //m = (req_off - min_st_loc) / (stripe_size * stripe_count * CO); + m = (int)((req_off - min_st_loc) / step_size); + off_list[m] = ADIOI_MIN(off_list[m], req_off); + } @@ -1758,12 +1536,12 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + + /* Although we have recognized the data according to OST index, + * a read-modify-write will be done if there is a hole between the data. -+ * For example: if blocksize=60, transfersize=30 and stripe_size=100, -+ * then process0 will collect data [0, 30] and [60, 90] then write. There -+ * is a hole [30, 60], which will cause a read-modify-write in [0, 90]. -+ * It will degrade collective performance. -+ * So we disable data sieving by default unless the hint "ds_in_coll" -+ * is set to "enable". ++ * For example: if blocksize=60, xfersize=30 and stripe_size=100, ++ * then rank0 will collect data [0, 30] and [60, 90] then write. There ++ * is a hole in [30, 60], which will cause a read-modify-write in [0, 90]. ++ * ++ * To reduce its impact on the performance, we disable data sieving ++ * by default, unless the hint "ds_in_coll" is enabled. + */ + /* check the hint for data sieving */ + value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); @@ -1834,7 +1612,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + recv_curr_offlen_ptr[i] = j; + } + } -+ /* use hole to pass data_sieving flag into W_Exchange_data */ ++ /* use variable "hole" to pass data_sieving flag into W_Exchange_data */ + hole = data_sieving; + ADIOI_LUSTRE_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list, + len_list, send_size, recv_size, off, real_size, @@ -1845,7 +1623,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + curr_to_proc, done_to_proc, &hole, m, + buftype_extent, buf_idx, error_code); + if (*error_code != MPI_SUCCESS) -+ return; ++ goto over; + + flag = 0; + for (i = 0; i < nprocs; i++) @@ -1860,8 +1638,8 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + ADIO_EXPLICIT_OFFSET, off, &status, + error_code); + } else { -+ /* if there is no hole, write in one time; -+ * otherwise, write data separately */ ++ /* if there is no hole, write data in one time; ++ * otherwise, write data in several times */ + if (!hole) { + ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE, + ADIO_EXPLICIT_OFFSET, off, &status, @@ -1878,6 +1656,8 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + MPI_BYTE, ADIO_EXPLICIT_OFFSET, + others_req[i].offsets[j], &status, + error_code); ++ if (*error_code != MPI_SUCCESS) ++ goto over; + } + } + } @@ -1885,14 +1665,11 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + } + } + if (*error_code != MPI_SUCCESS) -+ return; ++ goto over; + } -+ + iter_st_off += max_size; + } -+ if (*error_code != MPI_SUCCESS) -+ return; -+ ++over: + if (ntimes) + ADIOI_Free(write_buf); + ADIOI_Free(recv_curr_offlen_ptr); @@ -1927,7 +1704,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + MPI_Aint buftype_extent, + int *buf_idx, int *error_code) +{ -+ int i, j, *tmp_len, nprocs_recv, nprocs_send, err; ++ int i, j, nprocs_recv, nprocs_send, err; + char **send_buf = NULL; + MPI_Request *requests, *send_req; + MPI_Datatype *recv_types; @@ -1947,7 +1724,6 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + sizeof(MPI_Datatype)); + /* +1 to avoid a 0-size malloc */ + -+ tmp_len = (int *) ADIOI_Malloc(nprocs * sizeof(int)); + j = 0; + for (i = 0; i < nprocs; i++) { + if (recv_size[i]) { @@ -1976,8 +1752,6 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos, + nprocs, nprocs_recv, sum); + -+ ADIOI_Free(tmp_len); -+ + /* check if there are any holes */ + *hole = 0; + for (i = 0; i < sum - 1; i++) { @@ -2010,6 +1784,9 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + myname, __LINE__, + MPI_ERR_IO, + "**ioRMWrdwr", 0); ++ ADIOI_Free(recv_types); ++ ADIOI_Free(srt_off); ++ ADIOI_Free(srt_len); + return; + } + // --END ERROR HANDLING-- @@ -2227,7 +2004,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + * longer than the single region that processor "p" is responsible + * for. + */ -+ p = ADIOI_LUSTRE_Calc_aggregator(fd, off, &len, nprocs, striping_info); ++ p = ADIOI_LUSTRE_Calc_aggregator(fd, off, &len, striping_info); + + if (send_buf_idx[p] < send_size[p]) { + if (curr_to_proc[p] + len > done_to_proc[p]) { @@ -2272,130 +2049,10 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + if (send_size[i]) + sent_to_proc[i] = curr_to_proc[i]; +} -+ -+static void ADIOI_Heap_merge(ADIOI_Access * others_req, int *count, -+ ADIO_Offset * srt_off, int *srt_len, -+ int *start_pos, int nprocs, int nprocs_recv, -+ int total_elements) -+{ -+ typedef struct { -+ ADIO_Offset *off_list; -+ int *len_list; -+ int nelem; -+ } heap_struct; -+ -+ heap_struct *a, tmp; -+ int i, j, heapsize, l, r, k, smallest; -+ -+ a = (heap_struct *) ADIOI_Malloc((nprocs_recv + 1) * -+ sizeof(heap_struct)); -+ -+ j = 0; -+ for (i = 0; i < nprocs; i++) -+ if (count[i]) { -+ a[j].off_list = &(others_req[i].offsets[start_pos[i]]); -+ a[j].len_list = &(others_req[i].lens[start_pos[i]]); -+ a[j].nelem = count[i]; -+ j++; -+ } -+ -+ /* build a heap out of the first element from each list, with -+ the smallest element of the heap at the root */ -+ -+ heapsize = nprocs_recv; -+ for (i = heapsize / 2 - 1; i >= 0; i--) { -+ /* Heapify(a, i, heapsize); Algorithm from Cormen et al. pg. 143 -+ modified for a heap with smallest element at root. I have -+ removed the recursion so that there are no function calls. -+ Function calls are too expensive. */ -+ k = i; -+ for (;;) { -+ l = 2 * (k + 1) - 1; -+ r = 2 * (k + 1); -+ -+ if ((l < heapsize) && (*(a[l].off_list) < *(a[k].off_list))) -+ smallest = l; -+ else -+ smallest = k; -+ -+ if ((r < heapsize) && -+ (*(a[r].off_list) < *(a[smallest].off_list))) -+ smallest = r; -+ -+ if (smallest != k) { -+ tmp.off_list = a[k].off_list; -+ tmp.len_list = a[k].len_list; -+ tmp.nelem = a[k].nelem; -+ -+ a[k].off_list = a[smallest].off_list; -+ a[k].len_list = a[smallest].len_list; -+ a[k].nelem = a[smallest].nelem; -+ -+ a[smallest].off_list = tmp.off_list; -+ a[smallest].len_list = tmp.len_list; -+ a[smallest].nelem = tmp.nelem; -+ -+ k = smallest; -+ } else -+ break; -+ } -+ } -+ -+ for (i = 0; i < total_elements; i++) { -+ /* extract smallest element from heap, i.e. the root */ -+ srt_off[i] = *(a[0].off_list); -+ srt_len[i] = *(a[0].len_list); -+ (a[0].nelem)--; -+ -+ if (!a[0].nelem) { -+ a[0].off_list = a[heapsize - 1].off_list; -+ a[0].len_list = a[heapsize - 1].len_list; -+ a[0].nelem = a[heapsize - 1].nelem; -+ heapsize--; -+ } else { -+ (a[0].off_list)++; -+ (a[0].len_list)++; -+ } -+ -+ /* Heapify(a, 0, heapsize); */ -+ k = 0; -+ for (;;) { -+ l = 2 * (k + 1) - 1; -+ r = 2 * (k + 1); -+ -+ if ((l < heapsize) && (*(a[l].off_list) < *(a[k].off_list))) -+ smallest = l; -+ else -+ smallest = k; -+ -+ if ((r < heapsize) && -+ (*(a[r].off_list) < *(a[smallest].off_list))) -+ smallest = r; -+ -+ if (smallest != k) { -+ tmp.off_list = a[k].off_list; -+ tmp.len_list = a[k].len_list; -+ tmp.nelem = a[k].nelem; -+ -+ a[k].off_list = a[smallest].off_list; -+ a[k].len_list = a[smallest].len_list; -+ a[k].nelem = a[smallest].nelem; -+ -+ a[smallest].off_list = tmp.off_list; -+ a[smallest].len_list = tmp.len_list; -+ a[smallest].nelem = tmp.nelem; -+ -+ k = smallest; -+ } else -+ break; -+ } -+ } -+ ADIOI_Free(a); -+} diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c --- ad_lustre_orig/ad_lustre_wrstr.c 1970-01-01 08:00:00.000000000 +0800 -+++ ad_lustre/ad_lustre_wrstr.c 2008-09-17 18:20:35.000000000 +0800 -@@ -0,0 +1,463 @@ ++++ ad_lustre/ad_lustre_wrstr.c 2008-10-13 15:34:53.000000000 +0800 +@@ -0,0 +1,472 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (C) 1997 University of Chicago. @@ -2422,6 +2079,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c + MPIR_ERR_RECOVERABLE, myname, \ + __LINE__, MPI_ERR_IO, \ + "**iowswc", 0); \ ++ ADIOI_Free(writebuf); \ + return; \ + } \ + } \ @@ -2439,6 +2097,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c + MPIR_ERR_RECOVERABLE, myname, \ + __LINE__, MPI_ERR_IO, \ + "**iowsrc", 0); \ ++ ADIOI_Free(writebuf); \ + return; \ + } \ + } \ @@ -2454,6 +2113,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c + MPIR_ERR_RECOVERABLE, myname, \ + __LINE__, MPI_ERR_IO, \ + "**iowswc", 0); \ ++ ADIOI_Free(writebuf); \ + return; \ + } \ + req_len -= write_sz; \ @@ -2472,6 +2132,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c + MPIR_ERR_RECOVERABLE, myname, \ + __LINE__, MPI_ERR_IO, \ + "**iowsrc", 0); \ ++ ADIOI_Free(writebuf); \ + return; \ + } \ + write_sz = ADIOI_MIN(req_len, writebuf_len); \ @@ -2501,6 +2162,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c + MPIR_ERR_RECOVERABLE, myname, \ + __LINE__, MPI_ERR_IO, \ + "**iowswc", 0); \ ++ ADIOI_Free(writebuf); \ + return; \ + } \ + req_len -= write_sz; \ @@ -2620,8 +2282,10 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c + + if (fd->atomicity) + ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize); -+ if (*error_code != MPI_SUCCESS) ++ if (*error_code != MPI_SUCCESS) { ++ ADIOI_Free(writebuf); + return; ++ } + ADIOI_Free(writebuf); + if (file_ptr_type == ADIO_INDIVIDUAL) + fd->fp_ind = off; @@ -2838,8 +2502,10 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c + writebuf_off, &status1, error_code); + if (!(fd->atomicity)) + ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); -+ if (*error_code != MPI_SUCCESS) ++ if (*error_code != MPI_SUCCESS) { ++ ADIOI_Free(writebuf); + return; ++ } + } + if (fd->atomicity) + ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize); @@ -2859,63 +2525,9 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c + if (!buftype_is_contig) + ADIOI_Delete_flattened(datatype); +} -diff -ruN ad_lustre_orig/Makefile ad_lustre/Makefile ---- ad_lustre_orig/Makefile 1970-01-01 08:00:00.000000000 +0800 -+++ ad_lustre/Makefile 2008-09-17 18:20:35.000000000 +0800 -@@ -0,0 +1,50 @@ -+CC = gcc -+AR = ar cr -+RANLIB = ranlib -+LIBNAME = /work/download/mpich2-1.0.7-dev/lib/libmpich.a -+srcdir = /work/download/mpich2-1.0.7-dev/src/mpi/romio/adio/ad_lustre -+CC_SHL = true -+SHLIBNAME = /work/download/mpich2-1.0.7-dev/lib/libmpich -+ -+INCLUDE_DIR = -I. -I${srcdir}/../include -I../include -I../../include -I${srcdir}/../../../../include -I../../../../include -+CFLAGS = -I/work/download/mpich2-1.0.7-dev/src/mpid/ch3/include -I/work/download/mpich2-1.0.7-dev/src/mpid/ch3/include -I/work/download/mpich2-1.0.7-dev/src/mpid/common/datatype -I/work/download/mpich2-1.0.7-dev/src/mpid/common/datatype -I/work/download/mpich2-1.0.7-dev/src/mpid/ch3/channels/sock/include -I/work/download/mpich2-1.0.7-dev/src/mpid/ch3/channels/sock/include -I/work/download/mpich2-1.0.7-dev/src/mpid/common/sock -I/work/download/mpich2-1.0.7-dev/src/mpid/common/sock -I/work/download/mpich2-1.0.7-dev/src/mpid/common/sock/poll -I/work/download/mpich2-1.0.7-dev/src/mpid/common/sock/poll -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -O2 -DFORTRANDOUBLEUNDERSCORE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -DHAVE_ROMIOCONF_H -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 $(INCLUDE_DIR) -+ -+top_builddir = /work/download/mpich2-1.0.7-dev -+LIBTOOL = -+C_COMPILE_SHL = $(CC_SHL) -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -O2 -DFORTRANDOUBLEUNDERSCORE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -DHAVE_ROMIOCONF_H -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 $(INCLUDE_DIR) -+ -+VPATH = .:${srcdir} -+ -+AD_LUSTRE_OBJECTS = ad_lustre.o ad_lustre_open.o \ -+ ad_lustre_rwcontig.o ad_lustre_wrcoll.o ad_lustre_wrstr.o \ -+ ad_lustre_fcntl.o ad_lustre_hints.o ad_lustre_close.o \ -+ ad_lustre_aggregate.o -+ -+ -+default: $(LIBNAME) -+ @if [ "none" != "none" ] ; then \ -+ $(MAKE) $(SHLIBNAME).la ;\ -+ fi -+ -+.SUFFIXES: $(SUFFIXES) .p .lo -+ -+.c.o: -+ $(CC) $(CFLAGS) -c $< -+.c.lo: -+ $(C_COMPILE_SHL) -c $< -o _s$*.o -+ @mv -f _s$*.o $*.lo -+ -+$(LIBNAME): $(AD_LUSTRE_OBJECTS) -+ $(AR) $(LIBNAME) $(AD_LUSTRE_OBJECTS) -+ $(RANLIB) $(LIBNAME) -+ -+AD_LUSTRE_LOOBJECTS=$(AD_LUSTRE_OBJECTS:.o=.lo) -+$(SHLIBNAME).la: $(AD_LUSTRE_LOOBJECTS) -+ $(AR) $(SHLIBNAME).la $(AD_LUSTRE_LOOBJECTS) -+ -+coverage: -+ -@for file in ${AD_LUSTRE_OBJECTS:.o=.c} ; do \ -+ gcov -b -f $$file ; done -+ -+clean: -+ @rm -f *.o *.lo diff -ruN ad_lustre_orig/Makefile.in ad_lustre/Makefile.in --- ad_lustre_orig/Makefile.in 2008-09-17 14:36:57.000000000 +0800 -+++ ad_lustre/Makefile.in 2008-09-17 18:20:35.000000000 +0800 ++++ ad_lustre/Makefile.in 2008-10-17 17:03:06.000000000 +0800 @@ -16,7 +16,9 @@ @VPATH@ @@ -2929,27 +2541,48 @@ diff -ruN ad_lustre_orig/Makefile.in ad_lustre/Makefile.in @if [ "@ENABLE_SHLIB@" != "none" ] ; then \ diff -ruN ad_lustre_orig/README ad_lustre/README --- ad_lustre_orig/README 2008-09-17 14:36:57.000000000 +0800 -+++ ad_lustre/README 2008-09-17 18:20:35.000000000 +0800 -@@ -5,6 +5,22 @@ ++++ ad_lustre/README 2008-10-17 16:50:15.000000000 +0800 +@@ -5,6 +5,23 @@ o To post the code for ParColl (Partitioned collective IO) ----------------------------------------------------- +V05: +----------------------------------------------------- -+ o Improved data redistribution -+ - add I/O pattern identification. If request I/O size is big, -+ collective I/O won't be done. The hint big_req_size can be -+ used to define this. -+ - provide hint CO for load balancing to control the number -+ of IO clients for each OST -+ - divide the IO clients into the different OST groups to -+ produce stripe-contiguous I/O pattern -+ - reduce the collective overhead by hints contiguous_data and -+ same_io_size to remove unnecessary MPI_Alltoall() ++Improved data redistribution ++ o Improve I/O pattern identification. Besides checking interleaving, ++ if request I/O size is small, collective I/O will be performed. ++ The hint big_req_size can be used to define the req size value. ++ o Provide hint CO for load balancing to control the number of ++ IO clients for each OST ++ o Produce stripe-contiguous I/O pattern that Lustre prefers ++ o Reduce the collective overhead by hints contiguous_data and ++ same_io_size to remove unnecessary MPI_Alltoall() + o Control read-modify-write in data sieving in collective IO + by hint ds_in_coll. ++ o Reduce extent lock conflicts by make each OST accessed by one or ++ more constant clients. + +----------------------------------------------------- V04: ----------------------------------------------------- o Direct IO and Lockless IO support +--- common/ad_write_coll_orig.c 2008-10-15 11:24:31.000000000 +0800 ++++ common/ad_write_coll.c 2008-10-15 11:25:39.000000000 +0800 +@@ -42,7 +42,7 @@ + int *send_buf_idx, int *curr_to_proc, + int *done_to_proc, int iter, + MPI_Aint buftype_extent); +-static void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count, ++void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count, + ADIO_Offset *srt_off, int *srt_len, int *start_pos, + int nprocs, int nprocs_recv, int total_elements); + +@@ -921,7 +921,7 @@ + + + +-static void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count, ++void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count, + ADIO_Offset *srt_off, int *srt_len, int *start_pos, + int nprocs, int nprocs_recv, int total_elements) + {