removed cwd "./" (refer to Bugzilla 14399).
* File join has been disabled in this release, refer to Bugzilla 16929.
+Severity : normal
+Bugzilla : 18645
+Description: Reduce small-size read RPCs
+Details    : Set a read-ahead limit for every file, and only do read-ahead
+             when the available read-ahead pages are bigger than 1M, to
+             avoid small-size read RPCs.
+
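The core of this change is the grant check in ll_ra_count_get(): a
read-ahead budget is handed out only when it is large enough to fill a
whole bulk RPC. A minimal sketch of that rule, with illustrative names
(ra_pages_to_grant() and its parameters are not from the Lustre tree):

	/* Grant read-ahead pages only if the grant can fill one full
	 * bulk RPC (1M with 256 x 4K pages); anything smaller would
	 * become a small read RPC on the wire, so grant nothing and
	 * let the read proceed without read-ahead. */
	static unsigned long ra_pages_to_grant(unsigned long budget_left,
					       unsigned long wanted,
					       unsigned long rpc_pages)
	{
		unsigned long grant = budget_left < wanted ? budget_left
							   : wanted;

		if (grant < (wanted < rpc_pages ? wanted : rpc_pages))
			return 0;
		return grant;
	}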
Severity : enhancement
Bugzilla : 17974
Description: add lazystatfs mount option to allow statfs(2) to skip down OSTs
Description: Add state history info file, enhance import info file
Details : Track import connection state changes in a new osc/mdc proc file;
add overview-type data to the osc/mdc import proc file.
-
+
Severity : enhancement
Bugzilla : 17536
Description: MDS create should not wait for statfs RPC while holding DLM lock.
Severity : enhancement
Bugzilla : 15899
Description: File striping can now be set to use an arbitrary pool of OSTs.
-
+
Severity : enhancement
Bugzilla : 16573
Description: Export bytes_read/bytes_write count on OSC/OST.
Description: Add lockdep annotations to llog code.
Details : Use appropriately tagged _nested() locking calls in the places
where llog takes more than one ->lgh_lock lock.
-
+
Severity : minor
Bugzilla : 16450
Description: Add loi_kms_set().
this on patchless clients the deathrow inode reaper is turned
off, and we depend on the VM to clean up old inodes. This
dependency was introduced via the fix for bug 12181.
-
+
--------------------------------------------------------------------------------
2007-04-19 Cluster File Systems, Inc. <info@clusterfs.com>
Description: startup order invariance
Details : MDTs and OSTs can be started in any order. Clients only
require the MDT to complete startup.
-
+
Severity : enhancement
Bugzilla : 4899
Description: parallel, asynchronous orphan cleanup
Details    : stripe assignments are now made based on OST space available,
             OST previous usage, and OSS previous usage, in order to try
to optimize storage space and networking resources.
-
+
Severity : enhancement
Bugzilla : 4226
Description: Permanently set tunables
Details : All writable /proc/fs/lustre tunables can now be permanently
set on a per-server basis, at mkfs time or on a live system.
-
+
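For example (command syntax as in Lustre releases of this vintage;
"testfs" is a placeholder filesystem name), a tunable can be set at
format time or changed on a running system:

	mkfs.lustre --param="sys.timeout=40" <other options> <device>
	lctl conf_param testfs.sys.timeout=40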
Severity : enhancement
Bugzilla : 10547
Description: Lustre message v2
Description: SPEC SFS validation failure on NFS v2 over Lustre.
Details : Changes the blocksize for regular files to be 2x RPC size,
and not depend on stripe size.
-
+
Severity : enhancement
Bugzilla : 9293
Description: Multiple MD RPCs in flight.
Rather --with-portals=<path-to-portals-includes> is used to
enable building on the XT3. In addition, to enable XT3-specific
features, the option --enable-cray-xt3 must be used.
-
+
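For example (the include path is illustrative):

	./configure --with-portals=/opt/portals/include --enable-cray-xt3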
Severity : major
Frequency : rare
Bugzilla : 7407
* add hard link support
* change obdfile creation method
* kernel patch changed
-
+
2002-09-19 Peter Braam <braam@clusterfs.com>
* version 0_5_9
* bug fix
struct ll_ra_info {
atomic_t ra_cur_pages;
unsigned long ra_max_pages;
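+        /* per-file read-ahead limit, capped by ra_max_pages above */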
+ unsigned long ra_max_pages_per_file;
unsigned long ra_max_read_ahead_whole_pages;
};
/*
* The following 3 items are used for detecting the stride I/O
* mode.
- * In stride I/O mode,
+ * In stride I/O mode,
* ...............|-----data-----|****gap*****|--------|******|....
* offset |-stride_pages-|-stride_gap-|
* ras_stride_offset = offset;
sbi->ll_async_page_max = (pages / 4) * 3;
}
- sbi->ll_ra_info.ra_max_pages = min(pages / 32,
+ sbi->ll_ra_info.ra_max_pages_per_file = min(pages / 32,
SBI_DEFAULT_READAHEAD_MAX);
+ sbi->ll_ra_info.ra_max_pages = sbi->ll_ra_info.ra_max_pages_per_file;
sbi->ll_ra_info.ra_max_read_ahead_whole_pages =
SBI_DEFAULT_READAHEAD_WHOLE_MAX;
INIT_LIST_HEAD(&sbi->ll_conn_chain);
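With 4K pages this default comes to 1/32 of RAM, capped at
SBI_DEFAULT_READAHEAD_MAX: for example, a client with 4 GiB of RAM has
pages = 1048576, so pages / 32 = 32768 pages, i.e. 128 MiB, before the
cap is applied; the per-mount total ra_max_pages starts out equal to
the per-file limit.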
rc = class_process_proc_param(PARAM_LLITE, lvars.obd_vars,
lcfg, sb);
if (rc > 0)
- rc = 0;
+ rc = 0;
return(rc);
}
return count;
}
+static int ll_rd_max_readahead_per_file_mb(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ struct super_block *sb = data;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ long pages_number;
+ int mult;
+
+ spin_lock(&sbi->ll_lock);
+ pages_number = sbi->ll_ra_info.ra_max_pages_per_file;
+ spin_unlock(&sbi->ll_lock);
+
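+	/* pages per megabyte: 2^20 bytes divided by the page size */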
+ mult = 1 << (20 - CFS_PAGE_SHIFT);
+ return lprocfs_read_frac_helper(page, count, pages_number, mult);
+}
+
+static int ll_wr_max_readahead_per_file_mb(struct file *file,
+					    const char *buffer,
+					    unsigned long count, void *data)
+{
+ struct super_block *sb = data;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ int mult, rc, pages_number;
+
+ mult = 1 << (20 - CFS_PAGE_SHIFT);
+ rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
+ if (rc)
+ return rc;
+
+ if (pages_number < 0 ||
+ pages_number > sbi->ll_ra_info.ra_max_pages) {
+		CERROR("can't set file readahead more than "
+		       "max_read_ahead_mb %lu MB\n",
+		       sbi->ll_ra_info.ra_max_pages >> (20 - CFS_PAGE_SHIFT));
+ return -ERANGE;
+ }
+
+ spin_lock(&sbi->ll_lock);
+ sbi->ll_ra_info.ra_max_pages_per_file = pages_number;
+ spin_unlock(&sbi->ll_lock);
+
+ return count;
+}
+
static int ll_rd_max_read_ahead_whole_mb(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
/* Cap this at the current max readahead window size, the readahead
* algorithm does this anyway so it's pointless to set it larger. */
- if (pages_number < 0 || pages_number > sbi->ll_ra_info.ra_max_pages) {
+ if (pages_number < 0 ||
+ pages_number > sbi->ll_ra_info.ra_max_pages_per_file) {
CERROR("can't set max_read_ahead_whole_mb more than "
- "max_read_ahead_mb: %lu\n",
- sbi->ll_ra_info.ra_max_pages >> (20 - CFS_PAGE_SHIFT));
+ "max_read_ahead_per_file_mb: %lu\n",
+ sbi->ll_ra_info.ra_max_pages_per_file >> (20 - CFS_PAGE_SHIFT));
return -ERANGE;
}
//{ "filegroups", lprocfs_rd_filegroups, 0, 0 },
{ "max_read_ahead_mb", ll_rd_max_readahead_mb,
ll_wr_max_readahead_mb, 0 },
+ { "max_read_ahead_per_file_mb", ll_rd_max_readahead_per_file_mb,
+ ll_wr_max_readahead_per_file_mb, 0 },
{ "max_read_ahead_whole_mb", ll_rd_max_read_ahead_whole_mb,
ll_wr_max_read_ahead_whole_mb, 0 },
{ "max_cached_mb", ll_rd_max_cached_mb, ll_wr_max_cached_mb, 0 },
unsigned long ret;
ENTRY;
+	/**
+	 * If the read-ahead pages left are less than 1M, do not do
+	 * read-ahead at all: otherwise it will form small read RPCs
+	 * (< 1M), which hurt server performance a lot.
+	 */
ret = min(ra->ra_max_pages - atomic_read(&ra->ra_cur_pages), len);
- if ((int)ret < 0)
+ if ((int)ret < 0 || ret < min((unsigned long)PTLRPC_MAX_BRW_PAGES, len))
GOTO(out, ret = 0);
if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) {
#define RAS_CDEBUG(ras) \
CDEBUG(D_READA, \
"lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu" \
- "csr %lu sf %lu sp %lu sl %lu \n", \
+ "csr %lu sf %lu sp %lu sl %lu \n", \
ras->ras_last_readpage, ras->ras_consecutive_requests, \
ras->ras_consecutive_pages, ras->ras_window_start, \
ras->ras_window_len, ras->ras_next_readahead, \
- ras->ras_requests, ras->ras_request_index, \
+ ras->ras_requests, ras->ras_request_index, \
ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \
ras->ras_stride_pages, ras->ras_stride_length)
unsigned long stride_len;
LASSERT(ras->ras_stride_length > 0);
- LASSERTF(ras->ras_window_start + ras->ras_window_len
+ LASSERTF(ras->ras_window_start + ras->ras_window_len
>= ras->ras_stride_offset, "window_start %lu, window_len %lu"
" stride_offset %lu\n", ras->ras_window_start,
ras->ras_window_len, ras->ras_stride_offset);
window_len += step * ras->ras_stride_length + left;
- if (stride_page_count(ras, window_len) <= ra->ra_max_pages)
+ if (stride_page_count(ras, window_len) <= ra->ra_max_pages_per_file)
ras->ras_window_len = window_len;
RAS_CDEBUG(ras);
index < ras->ras_next_readahead &&
index_in_window(index, ras->ras_window_start, 0,
ras->ras_window_len)) {
- ra_miss = 1;
+ ra_miss = 1;
ll_ra_stats_inc_sbi(sbi, RA_STAT_MISS_IN_WINDOW);
}
/* On the second access to a file smaller than the tunable
* ra_max_read_ahead_whole_pages trigger RA on all pages in the
- * file up to ra_max_pages. This is simply a best effort and
- * only occurs once per open file. Normal RA behavior is reverted
+ * file up to ra_max_pages_per_file. This is simply a best effort
+ * and only occurs once per open file. Normal RA behavior is reverted
* to for subsequent IO. The mmap case does not increment
* ras_requests and thus can never trigger this behavior. */
if (ras->ras_requests == 2 && !ras->ras_request_index) {
CFS_PAGE_SHIFT;
CDEBUG(D_READA, "kmsp "LPU64" mwp %lu mp %lu\n", kms_pages,
- ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages);
+ ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages_per_file);
if (kms_pages &&
kms_pages <= ra->ra_max_read_ahead_whole_pages) {
ras->ras_window_start = 0;
ras->ras_last_readpage = 0;
ras->ras_next_readahead = 0;
- ras->ras_window_len = min(ra->ra_max_pages,
+ ras->ras_window_len = min(ra->ra_max_pages_per_file,
ra->ra_max_read_ahead_whole_pages);
GOTO(out_unlock, 0);
}
}
if (zero) {
- /* check whether it is in stride I/O mode*/
+ /* check whether it is in stride I/O mode*/
if (!index_in_stride_window(index, ras, inode)) {
ras_reset(ras, index);
ras->ras_consecutive_pages++;
ras_stride_reset(ras);
GOTO(out_unlock, 0);
} else {
- ras->ras_consecutive_requests = 0;
+ ras->ras_consecutive_requests = 0;
if (++ras->ras_consecutive_stride_requests > 1)
stride_detect = 1;
RAS_CDEBUG(ras);
} else if (stride_io_mode(ras)) {
/* If this is contiguous read but in stride I/O mode
* currently, check whether stride step still is valid,
- * if invalid, it will reset the stride ra window*/
+ * if invalid, it will reset the stride ra window*/
if (!index_in_stride_window(index, ras, inode)) {
/* Shrink stride read-ahead window to be zero */
ras_stride_reset(ras);
else
ras->ras_window_len = min(ras->ras_window_len +
RAS_INCREASE_STEP,
- ra->ra_max_pages);
+ ra->ra_max_pages_per_file);
}
EXIT;
out_unlock:
ENTRY;
- if (sbi->ll_ra_info.ra_max_pages)
+ if (sbi->ll_ra_info.ra_max_pages_per_file)
ras_update(sbi, inode, ras, page->cp_index,
cp->cpg_defer_uptodate);
* this will unlock it automatically as part of cl_page_list_disown().
*/
cl_2queue_add(queue, page);
- if (sbi->ll_ra_info.ra_max_pages)
+ if (sbi->ll_ra_info.ra_max_pages_per_file)
ll_readahead(env, io, ras,
vmpage->mapping, &queue->c2_qin, fd->fd_flags);