struct ll_dir_entry {
/* number of inode, referenced by this entry */
- __le32 lde_inode;
+ __le32 lde_inode;
/* total record length, multiple of LL_DIR_PAD */
- __le16 lde_rec_len;
+ __le16 lde_rec_len;
/* length of name */
- __u8 lde_name_len;
+ __u8 lde_name_len;
/* file type: regular, directory, device, etc. */
- __u8 lde_file_type;
+ __u8 lde_file_type;
/* name. NOT NUL-terminated */
- char lde_name[LL_DIR_NAME_LEN];
+ char lde_name[LL_DIR_NAME_LEN];
};
struct ll_dentry_data {
* dir statahead.
*/
pid_t lli_opendir_pid;
- /*
+ /*
* since parent-child threads can share the same @file struct,
* "opendir_key" is the token when dir close for case of parent exit
* before child -- it is me should cleanup the dir readahead. */
struct ll_ra_info {
unsigned long ra_cur_pages;
unsigned long ra_max_pages;
+ unsigned long ra_max_pages_per_file;
unsigned long ra_max_read_ahead_whole_pages;
unsigned long ra_stats[_NR_RA_STAT];
};
unsigned long ras_consecutive_pages;
/*
* number of read requests after the last read-ahead window reset
- * As window is reset on each seek, this is effectively the number
+ * As window is reset on each seek, this is effectively the number
* on consecutive read request and is used to trigger read-ahead.
*/
unsigned long ras_consecutive_requests;
*/
unsigned long ras_requests;
/*
- * Page index with respect to the current request, these value
+ * Page index with respect to the current request, these values
* will not be accurate when dealing with reads issued via mmap.
*/
unsigned long ras_request_index;
* protected by ->ras_lock.
*/
struct list_head ras_read_beads;
- /*
+ /*
* The following 3 items are used for detecting the stride I/O
- * mode.
- * In stride I/O mode,
- * ...............|-----data-----|****gap*****|--------|******|....
- * offset |-stride_pages-|-stride_gap-|
+ * mode.
+ * In stride I/O mode,
+ * ...............|-----data-----|****gap*****|--------|******|....
+ * offset |-stride_pages-|-stride_gap-|
* ras_stride_offset = offset;
* ras_stride_length = stride_pages + stride_gap;
* ras_stride_pages = stride_pages;
unsigned long ras_stride_length;
unsigned long ras_stride_pages;
pgoff_t ras_stride_offset;
- /*
+ /*
* number of consecutive stride request count, and it is similar as
* ras_consecutive_requests, but used for stride I/O mode.
* Note: only more than 2 consecutive stride request are detected,
#define ll_unregister_cache(cache) do {} while (0)
#endif
-void ll_ra_read_init(struct file *f, struct ll_ra_read *rar,
+void ll_ra_read_init(struct file *f, struct ll_ra_read *rar,
loff_t offset, size_t count);
void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar);
struct ll_ra_read *ll_ra_read_get(struct file *f);
int ll_file_open(struct inode *inode, struct file *file);
int ll_file_release(struct inode *inode, struct file *file);
int ll_lsm_getattr(struct obd_export *, struct lov_stripe_md *, struct obdo *);
-int ll_glimpse_ioctl(struct ll_sb_info *sbi,
+int ll_glimpse_ioctl(struct ll_sb_info *sbi,
struct lov_stripe_md *lsm, lstat_t *st);
int ll_glimpse_size(struct inode *inode, int ast_flags);
int ll_local_open(struct file *file,
struct ptlrpc_request **request);
int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
int set_default);
-int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmm,
+int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmm,
int *lmm_size, struct ptlrpc_request **request);
int ll_fsync(struct file *file, struct dentry *dentry, int data);
int ll_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
* "IT_GETATTR" for the first time, and the subsequent "IT_GETATTR"
* will bypass interacting with statahead thread for checking:
* "lld_sa_generation == lli_sai->sai_generation"
- */
+ */
if (ldd && lli->lli_sai &&
ldd->lld_sa_generation == lli->lli_sai->sai_generation)
return -EAGAIN;
* Parameters:
* @magic: Dynamic ioctl call routine will feed this vaule with the pointer
* returned to ll_iocontrol_register. Callback functions should use this
- * data to check the potential collasion of ioctl cmd. If collasion is
+ * data to check the potential collision of ioctl cmd. If a collision is
* found, callback function should return LLIOC_CONT.
* @rcp: The result of ioctl command.
*
* Return values:
- * If @magic matches the pointer returned by ll_iocontrol_data, the
+ * If @magic matches the pointer returned by ll_iocontrol_data, the
* callback should return LLIOC_STOP; return LLIOC_STOP otherwise.
*/
-typedef enum llioc_iter (*llioc_callback_t)(struct inode *inode,
+typedef enum llioc_iter (*llioc_callback_t)(struct inode *inode,
struct file *file, unsigned int cmd, unsigned long arg,
void *magic, int *rcp);
-enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
+enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
unsigned int cmd, unsigned long arg, int *rcp);
/* export functions */
-/* Register ioctl block dynamatically for a regular file.
+/* Register ioctl block dynamically for a regular file.
*
* @cmd: the array of ioctl command set
* @count: number of commands in the @cmd
- * @cb: callback function, it will be called if an ioctl command is found to
+ * @cb: callback function; it will be called if an ioctl command is found to
* belong to the command list @cmd.
*
* Return vaule:
- * A magic pointer will be returned if success;
- * otherwise, NULL will be returned.
+ * A magic pointer will be returned on success;
+ * otherwise, NULL will be returned.
* */
void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd);
void ll_iocontrol_unregister(void *magic);
oa.o_id = lli->lli_smd->lsm_object_id;
oa.o_valid = OBD_MD_FLID;
if (srvlock) {
- /* set OBD_MD_FLFLAGS in o_valid, only if we
+ /* set OBD_MD_FLFLAGS in o_valid, only if we
* set OBD_FL_TRUNCLOCK, otherwise ost_punch
* and filter_setattr get confused, see the comment
* in ost_punch */
int srvlock = test_bit(LLI_F_SRVLOCK, &lli->lli_flags);
loff_t new_size;
ENTRY;
- CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) to %Lu=%#Lx\n",inode->i_ino,
- inode->i_generation, inode, i_size_read(inode), i_size_read(inode));
+ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) to %Lu=%#Lx\n",
+ inode->i_ino, inode->i_generation, inode, i_size_read(inode),
+ i_size_read(inode));
ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_TRUNC, 1);
if (lli->lli_size_sem_owner != current) {
struct ost_lvb lvb;
int rc;
- /* XXX I'm pretty sure this is a hack to paper over a more fundamental
- * race condition. */
+ /* XXX I'm pretty sure this is a hack to paper over a more
+ * fundamental race condition. */
lov_stripe_lock(lli->lli_smd);
inode_init_lvb(inode, &lvb);
rc = obd_merge_lvb(ll_i2obdexp(inode), lli->lli_smd, &lvb, 0);
inode->i_blocks = lvb.lvb_blocks;
if (lvb.lvb_size == i_size_read(inode) && rc == 0) {
- CDEBUG(D_VFSTRACE, "skipping punch for obj "LPX64", %Lu=%#Lx\n",
+ CDEBUG(D_VFSTRACE, "skipping punch for obj "LPX64
+ ", %Lu=%#Lx\n",
lli->lli_smd->lsm_object_id, i_size_read(inode),
i_size_read(inode));
lov_stripe_unlock(lli->lli_smd);
* with the removepage path which gets the page lock then the
* cli lock */
if(!clear_page_dirty_for_io(page)) {
- unlock_page(page);
- RETURN(-EAGAIN);
- }
+ unlock_page(page);
+ RETURN(-EAGAIN);
+ }
/* This actually clears the dirty bit in the radix tree.*/
set_page_writeback(page);
static unsigned long ll_ra_count_get(struct ll_sb_info *sbi, unsigned long len)
{
struct ll_ra_info *ra = &sbi->ll_ra_info;
- unsigned long ret;
+ unsigned long ret = 0;
ENTRY;
+ /**
+ * If read-ahead pages left are less than 1M, do not do read-ahead,
+ * otherwise it will form small read RPCs (< 1M), which hurt server
+ * performance a lot.
+ */
spin_lock(&sbi->ll_lock);
- ret = min(ra->ra_max_pages - ra->ra_cur_pages, len);
- ra->ra_cur_pages += ret;
+ if (ra->ra_max_pages - ra->ra_cur_pages >=
+ min((unsigned long)PTLRPC_MAX_BRW_PAGES, len)) {
+ ret = min(ra->ra_max_pages - ra->ra_cur_pages, len);
+ ra->ra_cur_pages += ret;
+ }
spin_unlock(&sbi->ll_lock);
RETURN(ret);
llap->llap_ra_used = 0;
rc = obd_queue_group_io(exp, ll_i2info(page->mapping->host)->lli_smd,
NULL, oig, llap->llap_cookie, OBD_BRW_READ, 0,
- CFS_PAGE_SIZE, 0, ASYNC_COUNT_STABLE | ASYNC_READY |
- ASYNC_URGENT);
+ CFS_PAGE_SIZE, 0, ASYNC_COUNT_STABLE |
+ ASYNC_READY | ASYNC_URGENT);
if (rc) {
LL_CDEBUG_PAGE(D_ERROR, page, "read queue failed: rc %d\n", rc);
page_cache_release(page);
#define RAS_CDEBUG(ras) \
CDEBUG(D_READA, \
"lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu" \
- "csr %lu sf %lu sp %lu sl %lu \n", \
+ "csr %lu sf %lu sp %lu sl %lu \n", \
ras->ras_last_readpage, ras->ras_consecutive_requests, \
ras->ras_consecutive_pages, ras->ras_window_start, \
ras->ras_window_len, ras->ras_next_readahead, \
- ras->ras_requests, ras->ras_request_index, \
+ ras->ras_requests, ras->ras_request_index, \
ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \
ras->ras_stride_pages, ras->ras_stride_length)
return &fd->fd_ras;
}
-void ll_ra_read_init(struct file *f, struct ll_ra_read *rar,
+void ll_ra_read_init(struct file *f, struct ll_ra_read *rar,
loff_t offset, size_t count)
{
struct ll_readahead_state *ras;
if (page->mapping != mapping) {
ll_ra_stats_inc(mapping, RA_STAT_WRONG_GRAB_PAGE);
CDEBUG(D_READA, "g_c_p_n returned invalid page\n");
- GOTO(unlock_page, rc = 0);
+ GOTO(unlock_page, rc = 0);
}
/* we do this first so that we can see the page in the /proc
GOTO(unlock_page, rc = -ENOLCK);
}
CDEBUG(D_READA, "read-ahead page\n");
- GOTO(unlock_page, rc = 0);
+ GOTO(unlock_page, rc = 0);
}
/* skip completed pages */
if (Page_Uptodate(page))
- GOTO(unlock_page, rc = 0);
+ GOTO(unlock_page, rc = 0);
/* bail out when we hit the end of the lock. */
rc = ll_issue_page_read(exp, llap, oig, 1);
LL_CDEBUG_PAGE(D_READA | D_PAGE, page, "started read-ahead\n");
rc = 1;
} else {
-unlock_page:
+unlock_page:
unlock_page(page);
LL_CDEBUG_PAGE(D_READA | D_PAGE, page, "skipping read-ahead\n");
}
unsigned long ria_pages;
};
-#define RIA_DEBUG(ria) \
+#define RIA_DEBUG(ria) \
CDEBUG(D_READA, "rs %lu re %lu ro %lu rl %lu rp %lu\n", \
ria->ria_start, ria->ria_end, ria->ria_stoff, ria->ria_length,\
ria->ria_pages)
static int ll_read_ahead_pages(struct obd_export *exp,
struct obd_io_group *oig,
- struct ra_io_arg *ria,
+ struct ra_io_arg *ria,
unsigned long *reserved_pages,
struct address_space *mapping,
unsigned long *ra_end)
if (ras_inside_ra_window(page_idx, ria)) {
/* If the page is inside the read-ahead window*/
rc = ll_read_ahead_page(exp, oig, page_idx, mapping);
- if (rc == 1) {
- (*reserved_pages)--;
- count ++;
- } else if (rc == -ENOLCK)
- break;
+ if (rc == 1) {
+ (*reserved_pages)--;
+ count++;
+ } else if (rc == -ENOLCK)
+ break;
} else if (stride_ria) {
/* If it is not in the read-ahead window, and it is
* read-ahead mode, then check whether it should skip
* the stride gap */
- pgoff_t offset;
+ pgoff_t offset;
/* FIXME: This assertion only is valid when it is for
* forward read-ahead, it will be fixed when backward
* read-ahead is implemented */
" offset %lu \n", page_idx, ria->ria_stoff);
offset = page_idx - ria->ria_stoff;
- offset = offset % (ria->ria_length);
- if (offset > ria->ria_pages) {
- page_idx += ria->ria_length - offset;
+ offset = offset % (ria->ria_length);
+ if (offset > ria->ria_pages) {
+ page_idx += ria->ria_length - offset;
CDEBUG(D_READA, "i %lu skip %lu \n", page_idx,
ria->ria_length - offset);
continue;
/* Enlarge the RA window to encompass the full read */
if (bead != NULL && ras->ras_window_start + ras->ras_window_len <
bead->lrr_start + bead->lrr_count) {
- obd_off read_end = (bead->lrr_start + bead->lrr_count) <<
+ obd_off read_end = (bead->lrr_start + bead->lrr_count) <<
CFS_PAGE_SHIFT;
- obd_extent_calc(exp, lsm, OBD_CALC_STRIPE_RPC_END_ALIGN,
+ obd_extent_calc(exp, lsm, OBD_CALC_STRIPE_RPC_END_ALIGN,
&read_end);
- ras->ras_window_len = ((read_end + 1) >> CFS_PAGE_SHIFT) -
+ ras->ras_window_len = ((read_end + 1) >> CFS_PAGE_SHIFT) -
ras->ras_window_start;
}
- /* Reserve a part of the read-ahead window that we'll be issuing */
+ /* Reserve a part of the read-ahead window that we'll be issuing */
if (ras->ras_window_len) {
start = ras->ras_next_readahead;
end = ras->ras_window_start + ras->ras_window_len - 1;
if (ra_end < ras->ras_next_readahead &&
index_in_window(ra_end, ras->ras_window_start, 0,
ras->ras_window_len)) {
- ras->ras_next_readahead = ra_end;
- RAS_CDEBUG(ras);
+ ras->ras_next_readahead = ra_end;
+ RAS_CDEBUG(ras);
}
spin_unlock(&ras->ras_lock);
}
INIT_LIST_HEAD(&ras->ras_read_beads);
}
-/*
+/*
* Check whether the read request is in the stride window.
* If it is in the stride window, return 1, otherwise return 0.
*/
struct inode *inode)
{
unsigned long stride_gap = index - ras->ras_last_readpage - 1;
-
+
if (ras->ras_stride_length == 0 || ras->ras_stride_pages == 0)
return 0;
/* If it is contiguous read */
- if (stride_gap == 0)
+ if (stride_gap == 0)
return ras->ras_consecutive_pages + 1 <= ras->ras_stride_pages;
-
+
/*Otherwise check the stride by itself */
return (ras->ras_stride_length - ras->ras_stride_pages) == stride_gap &&
ras->ras_consecutive_pages == ras->ras_stride_pages;
{
unsigned long stride_gap = index - ras->ras_last_readpage - 1;
- if (!stride_io_mode(ras) && (stride_gap != 0 ||
+ if (!stride_io_mode(ras) && (stride_gap != 0 ||
ras->ras_consecutive_stride_requests == 0)) {
ras->ras_stride_pages = ras->ras_consecutive_pages;
ras->ras_stride_length = stride_gap +ras->ras_consecutive_pages;
unsigned long stride_len;
LASSERT(ras->ras_stride_length > 0);
- LASSERTF(ras->ras_window_start + ras->ras_window_len
+ LASSERTF(ras->ras_window_start + ras->ras_window_len
>= ras->ras_stride_offset, "window_start %lu, window_len %lu"
" stride_offset %lu\n", ras->ras_window_start,
ras->ras_window_len, ras->ras_stride_offset);
window_len += step * ras->ras_stride_length + left;
- if (stride_page_count(ras, window_len) <= ra->ra_max_pages)
+ if (stride_page_count(ras, window_len) <= ra->ra_max_pages_per_file)
ras->ras_window_len = window_len;
RAS_CDEBUG(ras);
RAS_CDEBUG(ras);
}
-static void ras_increase_window(struct ll_readahead_state *ras,
- struct ll_ra_info *ra, struct inode *inode)
+static void ras_increase_window(struct ll_readahead_state *ras,
+ struct ll_ra_info *ra, struct inode *inode)
{
- __u64 step;
- __u32 size;
- int rc;
-
- step = ((loff_t)(ras->ras_window_start +
- ras->ras_window_len)) << CFS_PAGE_SHIFT;
- size = sizeof(step);
- /*Get rpc_size for this offset (step) */
- rc = obd_get_info(ll_i2obdexp(inode), sizeof(KEY_OFF_RPCSIZE),
- KEY_OFF_RPCSIZE, &size, &step,
- ll_i2info(inode)->lli_smd);
- if (rc)
- step = INIT_RAS_WINDOW_PAGES;
-
- if (stride_io_mode(ras))
- ras_stride_increase_window(ras, ra, (unsigned long)step);
- else
- ras->ras_window_len = min(ras->ras_window_len + (unsigned long)step,
- ra->ra_max_pages);
+ __u64 step;
+ __u32 size;
+ int rc;
+
+ step = ((loff_t)(ras->ras_window_start +
+ ras->ras_window_len)) << CFS_PAGE_SHIFT;
+ size = sizeof(step);
+ /*Get rpc_size for this offset (step) */
+ rc = obd_get_info(ll_i2obdexp(inode), sizeof(KEY_OFF_RPCSIZE),
+ KEY_OFF_RPCSIZE, &size, &step,
+ ll_i2info(inode)->lli_smd);
+ if (rc)
+ step = INIT_RAS_WINDOW_PAGES;
+
+ if (stride_io_mode(ras))
+ ras_stride_increase_window(ras, ra, (unsigned long)step);
+ else
+ ras->ras_window_len = min(ras->ras_window_len +
+ (unsigned long)step,
+ ra->ra_max_pages_per_file);
}
static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
index < ras->ras_next_readahead &&
index_in_window(index, ras->ras_window_start, 0,
ras->ras_window_len)) {
- ra_miss = 1;
+ ra_miss = 1;
ll_ra_stats_inc_unlocked(ra, RA_STAT_MISS_IN_WINDOW);
}
/* On the second access to a file smaller than the tunable
* ra_max_read_ahead_whole_pages trigger RA on all pages in the
- * file up to ra_max_pages. This is simply a best effort and
- * only occurs once per open file. Normal RA behavior is reverted
+ * file up to ra_max_pages_per_file. This is simply a best effort
+ * and only occurs once per open file. Normal RA behavior is reverted
* to for subsequent IO. The mmap case does not increment
* ras_requests and thus can never trigger this behavior. */
if (ras->ras_requests == 2 && !ras->ras_request_index) {
CFS_PAGE_SHIFT;
CDEBUG(D_READA, "kmsp "LPU64" mwp %lu mp %lu\n", kms_pages,
- ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages);
+ ra->ra_max_read_ahead_whole_pages,
+ ra->ra_max_pages_per_file);
if (kms_pages &&
kms_pages <= ra->ra_max_read_ahead_whole_pages) {
ras->ras_window_start = 0;
ras->ras_last_readpage = 0;
ras->ras_next_readahead = 0;
- ras->ras_window_len = min(ra->ra_max_pages,
+ ras->ras_window_len = min(ra->ra_max_pages_per_file,
ra->ra_max_read_ahead_whole_pages);
GOTO(out_unlock, 0);
}
}
if (zero) {
- /* check whether it is in stride I/O mode*/
+ /* check whether it is in stride I/O mode*/
if (!index_in_stride_window(index, ras, inode)) {
ras_reset(ras, index);
ras->ras_consecutive_pages++;
ras_stride_reset(ras);
GOTO(out_unlock, 0);
} else {
- ras->ras_consecutive_requests = 0;
+ ras->ras_consecutive_requests = 0;
if (++ras->ras_consecutive_stride_requests > 1)
stride_detect = 1;
RAS_CDEBUG(ras);
if (ra_miss) {
if (index_in_stride_window(index, ras, inode) &&
stride_io_mode(ras)) {
- /*If stride-RA hit cache miss, the stride dector
+ /*If stride-RA hit cache miss, the stride detector
*will not be reset to avoid the overhead of
*redetecting read-ahead mode */
if (index != ras->ras_last_readpage + 1)
ras->ras_consecutive_pages = 0;
RAS_CDEBUG(ras);
} else {
- /*Reset both stride window and normal RA window*/
+ /*Reset both stride window and normal RA window*/
ras_reset(ras, index);
ras->ras_consecutive_pages++;
ras_stride_reset(ras);
} else if (stride_io_mode(ras)) {
/* If this is contiguous read but in stride I/O mode
* currently, check whether stride step still is valid,
- * if invalid, it will reset the stride ra window*/
+ * if invalid, it will reset the stride ra window*/
if (!index_in_stride_window(index, ras, inode)) {
/*Shrink stride read-ahead window to be zero*/
ras_stride_reset(ras);
* uselessly reading and discarding pages for random IO the window is
* only increased once per consecutive request received. */
if ((ras->ras_consecutive_requests > 1 &&
- !ras->ras_request_index) || stride_detect)
- ras_increase_window(ras, ra, inode);
+ !ras->ras_request_index) || stride_detect)
+ ras_increase_window(ras, ra, inode);
EXIT;
out_unlock:
RAS_CDEBUG(ras);
GOTO(out, rc = PTR_ERR(llap));
}
- if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages)
+ if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages_per_file)
ras_update(ll_i2sbi(inode), inode, &fd->fd_ras, page->index,
llap->llap_defer_uptodate);
LL_CDEBUG_PAGE(D_PAGE, page, "queued readpage\n");
/* We have just requested the actual page we want, see if we can tack
* on some readahead to that page's RPC before it is sent. */
- if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages)
+ if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages_per_file)
ll_readahead(&fd->fd_ras, exp, page->mapping, oig,
fd->fd_flags);
CERROR("the llap wasn't freed\n");
(*pp)->mapping = NULL;
if (page_count(*pp) != 1)
- CERROR("page %p, flags %#lx, count %i, private %p\n",
- (*pp), (unsigned long)(*pp)->flags, page_count(*pp),
- (void*)page_private(*pp));
+ CERROR("page %p, flags %#lx, count %i, "
+ "private %p\n", (*pp),
+ (unsigned long)(*pp)->flags,
+ page_count(*pp),
+ (void*)page_private(*pp));
__free_pages(*pp, 0);
}
}