From f00c55df4dc92aac21bdcc0cab7e37ddc78053e8 Mon Sep 17 00:00:00 2001 From: adilger Date: Fri, 26 Nov 2004 06:02:06 +0000 Subject: [PATCH] Land b1_4_smallfix onto b1_4 (20041125_1716) - don't keep a lock reference when lock is not granted (b=4238) - unsafe list practices (rarely) led to infinite eviction loop (b=4908) - add per-fs limit of Lustre pages in page cache, avoid OOM (b=4699) - drop import inflight refcount on signal_completed_replay error (b=5255) - unlock page after async write error during send (b=3677) - service request history (b=4965) - put {ll,lov,osc}_async_page structs in a single slab (b=4699) - create an "evict_client" /proc entry on OSTs, like the MDS has - handle missing objects in filter_preprw_read properly (b=5265) --- .../patches/ext3-ea-in-inode-2.6-suse.patch | 2 +- lustre/ChangeLog | 9 + lustre/configure.in | 2 +- lustre/include/linux/lprocfs_status.h | 9 +- lustre/include/linux/lustre_compat25.h | 1 + lustre/include/linux/lustre_mds.h | 2 +- lustre/include/linux/lustre_net.h | 57 +- lustre/include/linux/obd.h | 4 +- lustre/include/linux/obd_support.h | 2 + .../patches/ext3-ea-in-inode-2.4.20.patch | 2 +- .../patches/ext3-ea-in-inode-2.4.21-chaos.patch | 2 +- .../patches/ext3-ea-in-inode-2.4.21-suse2.patch | 760 --------------------- .../patches/ext3-ea-in-inode-2.4.22-rh.patch | 2 +- .../patches/ext3-ea-in-inode-2.6-suse.patch | 2 +- .../patches/ext3-mballoc2-2.6.7.patch | 6 +- .../patches/vfs_intent-2.4.21-chaos.patch | 6 +- lustre/kernel_patches/series/chaos-2.4.21 | 1 + lustre/kernel_patches/series/rhel-2.4.21 | 1 + lustre/kernel_patches/series/suse-2.4.21-2 | 2 +- lustre/ldlm/ldlm_lib.c | 51 +- lustre/ldlm/ldlm_lockd.c | 53 +- lustre/ldlm/ldlm_request.c | 4 + lustre/liblustre/llite_lib.h | 6 +- lustre/liblustre/rw.c | 22 +- lustre/llite/llite_close.c | 3 +- lustre/llite/llite_internal.h | 20 +- lustre/llite/llite_lib.c | 50 +- lustre/llite/lproc_llite.c | 166 +++-- lustre/llite/rw.c | 142 +++- lustre/llite/rw24.c | 2 + 
lustre/llite/super.c | 3 + lustre/llite/super25.c | 4 + lustre/lov/lov_obd.c | 18 +- lustre/lvfs/fsfilt_ext3.c | 4 +- lustre/mds/handler.c | 11 +- lustre/mds/lproc_mds.c | 52 +- lustre/mds/mds_open.c | 12 +- lustre/mds/mds_reint.c | 2 +- lustre/mgmt/mgmt_svc.c | 2 +- lustre/obdfilter/filter.c | 6 + lustre/obdfilter/filter_io.c | 16 +- lustre/obdfilter/filter_log.c | 2 +- lustre/obdfilter/filter_san.c | 6 - lustre/obdfilter/lproc_obdfilter.c | 3 +- lustre/osc/osc_request.c | 28 +- lustre/ost/autoMakefile.am | 2 +- lustre/ost/lproc_ost.c | 36 + lustre/ost/ost_handler.c | 7 +- lustre/portals/knals/lonal/lonal_cb.c | 4 +- lustre/ptlbd/server.c | 2 +- lustre/ptlrpc/events.c | 12 +- lustre/ptlrpc/import.c | 4 +- lustre/ptlrpc/lproc_ptlrpc.c | 281 ++++++++ lustre/ptlrpc/niobuf.c | 12 +- lustre/ptlrpc/pack_generic.c | 2 + lustre/ptlrpc/service.c | 125 +++- lustre/tests/recovery-small.sh | 7 +- lustre/tests/replay-single.sh | 13 + lustre/tests/sanity.sh | 20 +- lustre/utils/lconf | 34 +- 60 files changed, 997 insertions(+), 1124 deletions(-) delete mode 100644 lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.21-suse2.patch diff --git a/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-suse.patch b/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-suse.patch index 997cc1e..f984067 100644 --- a/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-suse.patch +++ b/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-suse.patch @@ -94,7 +94,7 @@ Index: linux-2.6.0/fs/ext3/xattr.c /* - * ext3_xattr_list() -+ * ext3_xattr_ibode_get() ++ * ext3_xattr_ibody_get() * - * Copy a list of attribute names into the buffer + * routine looks for attribute in inode body and returns it's value and size diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 21d3576..27ad84f 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -5,6 +5,11 @@ - flock/lockf fixes (but it's still disabled, pending 5135) - don't use EXT3 constants in llite code (5094) - memory shortage at startup could cause 
assertion (5176) + - don't keep a lock reference when lock is not granted (4238) + - unsafe list practices (rarely) led to infinite eviction loop (4908) + - add per-fs limit of Lustre pages in page cache, avoid OOM (4699) + - drop import inflight refcount on signal_completed_replay error (5255) + - unlock page after async write error during send (3677) * miscellania - reorganization of lov code - single portals codebase @@ -12,6 +17,9 @@ - add extents/mballoc support (5025) - direct I/O reads in the obdfilter (4048) - kernel patches from LNXI for 2.6 (bluesmoke, perfctr, mtd, kexec) + - service request history (4965) + - put {ll,lov,osc}_async_page structs in a single slab (4699) + - create an "evict_client" /proc entry on OSTs, like the MDS has tbd Cluster File Systems, Inc. * version 1.2.9 @@ -19,6 +27,7 @@ tbd Cluster File Systems, Inc. - don't ASSERT in ptl_send_rpc() if we run out of memory (5119) - lock /proc/sys/portals/routes internal state, avoiding oops (4827) - the watchdog thread now runs as interruptible (5246) + - handle missing objects in filter_preprw_read properly (5265) * miscellania - add pid to ldlm debugging output (4922) diff --git a/lustre/configure.in b/lustre/configure.in index 0b9c3c3..96116b8 100644 --- a/lustre/configure.in +++ b/lustre/configure.in @@ -5,7 +5,7 @@ AC_INIT AC_CANONICAL_SYSTEM -AM_INIT_AUTOMAKE(lustre, 1.4.0) +AM_INIT_AUTOMAKE(lustre, 1.4.0.1) # AM_MAINTAINER_MODE # Four main targets: lustre kernel modules, utilities, tests, and liblustre diff --git a/lustre/include/linux/lprocfs_status.h b/lustre/include/linux/lprocfs_status.h index 47f6d6e..f552d61 100644 --- a/lustre/include/linux/lprocfs_status.h +++ b/lustre/include/linux/lprocfs_status.h @@ -230,6 +230,8 @@ extern int lprocfs_rd_num_exports(char *page, char **start, off_t off, int count, int *eof, void *data); extern int lprocfs_rd_numrefs(char *page, char **start, off_t off, int count, int *eof, void *data); +extern int lprocfs_wr_evict_client(struct file *file, 
const char *buffer, + unsigned long count, void *data); /* Statfs helpers */ extern int lprocfs_rd_blksize(char *page, char **start, off_t off, @@ -324,7 +326,12 @@ static inline int lprocfs_rd_num_exports(char *page, char **start, off_t off, int count, int *eof, void *data) { return 0; } static inline int lprocfs_rd_numrefs(char *page, char **start, off_t off, - int count, int *eof, void *data) { return 0; } + int count, int *eof, void *data) +{ return 0; } +static inline int lprocfs_wr_evict_client(struct file *file, const char *buffer, + unsigned long count, void *data) +{ return 0; }; + /* Statfs helpers */ static inline diff --git a/lustre/include/linux/lustre_compat25.h b/lustre/include/linux/lustre_compat25.h index 44e1a57..119eb59 100644 --- a/lustre/include/linux/lustre_compat25.h +++ b/lustre/include/linux/lustre_compat25.h @@ -47,6 +47,7 @@ #define TryLockPage(page) TestSetPageLocked(page) #define filemap_fdatasync(mapping) filemap_fdatawrite(mapping) #define Page_Uptodate(page) PageUptodate(page) +#define ClearPageLaunder(page) do {} while(0) #define KDEVT_INIT(val) (val) diff --git a/lustre/include/linux/lustre_mds.h b/lustre/include/linux/lustre_mds.h index b78978d..541f67f 100644 --- a/lustre/include/linux/lustre_mds.h +++ b/lustre/include/linux/lustre_mds.h @@ -273,7 +273,7 @@ static inline void mdc_pack_fid(struct ll_fid *fid, obd_id ino, __u32 gen, req->rq_export->exp_mds_data.med_mcd; \ if (mcd->mcd_last_xid == req->rq_xid) { \ reconstruct; \ - RETURN(0); \ + RETURN(req->rq_repmsg->status); \ } \ DEBUG_REQ(D_HA, req, "no reply for RESENT req (have "LPD64")", \ mcd->mcd_last_xid); \ diff --git a/lustre/include/linux/lustre_net.h b/lustre/include/linux/lustre_net.h index 335cbc5..7fe611b 100644 --- a/lustre/include/linux/lustre_net.h +++ b/lustre/include/linux/lustre_net.h @@ -262,6 +262,7 @@ struct ptlrpc_reply_state { struct obd_export *rs_export; struct ptlrpc_srv_ni *rs_srv_ni; ptl_handle_md_t rs_md_h; + atomic_t rs_refcount; /* locks awaiting 
client reply ACK */ int rs_nlocks; @@ -274,6 +275,8 @@ struct ptlrpc_reply_state { struct ptlrpc_request { int rq_type; /* one of PTL_RPC_MSG_* */ struct list_head rq_list; + struct list_head rq_history_list; /* server-side history */ + __u64 rq_history_seq; /* history sequence # */ int rq_status; spinlock_t rq_lock; /* client-side flags */ @@ -350,14 +353,30 @@ struct ptlrpc_request { #define RQ_PHASE_INTERPRET 0xebc0de03 #define RQ_PHASE_COMPLETE 0xebc0de04 +static inline const char * +ptlrpc_rqphase2str(struct ptlrpc_request *req) +{ + switch (req->rq_phase) { + case RQ_PHASE_NEW: + return "New"; + case RQ_PHASE_RPC: + return "Rpc"; + case RQ_PHASE_BULK: + return "Bulk"; + case RQ_PHASE_INTERPRET: + return "Interpret"; + case RQ_PHASE_COMPLETE: + return "Complete"; + default: + return "?Phase?"; + } +} + /* Spare the preprocessor, spoil the bugs. */ #define FLAG(field, str) (field ? str : "") #define DEBUG_REQ_FLAGS(req) \ - ((req->rq_phase == RQ_PHASE_NEW) ? "New" : \ - (req->rq_phase == RQ_PHASE_RPC) ? "Rpc" : \ - (req->rq_phase == RQ_PHASE_INTERPRET) ? "Interpret" : \ - (req->rq_phase == RQ_PHASE_COMPLETE) ? 
"Complete" : "?phase?"), \ + ptlrpc_rqphase2str(req), \ FLAG(req->rq_intr, "I"), FLAG(req->rq_replied, "R"), \ FLAG(req->rq_err, "E"), \ FLAG(req->rq_timedout, "X") /* eXpired */, FLAG(req->rq_resend, "S"), \ @@ -437,6 +456,7 @@ struct ptlrpc_thread { struct ptlrpc_request_buffer_desc { struct list_head rqbd_list; + struct list_head rqbd_reqs; struct ptlrpc_srv_ni *rqbd_srv_ni; ptl_handle_md_t rqbd_md_h; int rqbd_refcount; @@ -466,6 +486,7 @@ struct ptlrpc_srv_ni { }; typedef int (*svc_handler_t)(struct ptlrpc_request *req); +typedef void (*svcreq_printfn_t)(void *, struct ptlrpc_request *); struct ptlrpc_service { struct list_head srv_list; /* chain thru all services */ @@ -485,8 +506,16 @@ struct ptlrpc_service { int srv_n_queued_reqs; /* # reqs waiting to be served */ struct list_head srv_request_queue; /* reqs waiting for service */ - struct list_head srv_idle_rqbds; /* request buffers to be reposted */ + struct list_head srv_request_history; /* request history */ + __u64 srv_request_seq; /* next request sequence # */ + __u64 srv_request_max_cull_seq; /* highest seq culled from history */ + svcreq_printfn_t srv_request_history_print_fn; /* service-specific print fn */ + struct list_head srv_idle_rqbds; /* request buffers to be reposted */ + struct list_head srv_history_rqbds; /* request buffer history */ + int srv_n_history_rqbds; /* # request buffers in history */ + int srv_max_history_rqbds; /* max # request buffers in history */ + atomic_t srv_outstanding_replies; struct list_head srv_reply_queue; /* replies waiting for service */ @@ -650,7 +679,8 @@ struct ptlrpc_service *ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size, int req_portal, int rep_portal, int watchdog_timeout, /* in ms */ svc_handler_t, char *name, - struct proc_dir_entry *proc_entry); + struct proc_dir_entry *proc_entry, + svcreq_printfn_t); void ptlrpc_stop_all_threads(struct ptlrpc_service *svc); int ptlrpc_start_n_threads(struct obd_device *dev, struct ptlrpc_service *svc, int cnt, 
char *base_name); @@ -691,6 +721,21 @@ void *lustre_swab_reqbuf (struct ptlrpc_request *req, int n, int minlen, void *lustre_swab_repbuf (struct ptlrpc_request *req, int n, int minlen, void *swabber); +static inline void +ptlrpc_rs_addref(struct ptlrpc_reply_state *rs) +{ + LASSERT(atomic_read(&rs->rs_refcount) > 0); + atomic_inc(&rs->rs_refcount); +} + +static inline void +ptlrpc_rs_decref(struct ptlrpc_reply_state *rs) +{ + LASSERT(atomic_read(&rs->rs_refcount) > 0); + if (atomic_dec_and_test(&rs->rs_refcount)) + lustre_free_reply_state(rs); +} + /* ldlm/ldlm_lib.c */ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf); int client_obd_cleanup(struct obd_device * obddev, int flags); diff --git a/lustre/include/linux/obd.h b/lustre/include/linux/obd.h index 141d5a2..8e4abbf 100644 --- a/lustre/include/linux/obd.h +++ b/lustre/include/linux/obd.h @@ -156,7 +156,7 @@ struct oig_callback_context { * callees of this method are encouraged to abort their state * in the oig. This may be called multiple times. 
*/ void (*occ_interrupted)(struct oig_callback_context *occ); - int interrupted; + unsigned interrupted:1; }; /* if we find more consumers this could be generalized */ @@ -322,8 +322,6 @@ struct mds_obd { struct file *mds_lov_objid_filp; unsigned long *mds_client_bitmap; struct semaphore mds_orphan_recovery_sem; - - atomic_t mds_open_count; }; struct echo_obd { diff --git a/lustre/include/linux/obd_support.h b/lustre/include/linux/obd_support.h index 26d95f6..8ddc85b 100644 --- a/lustre/include/linux/obd_support.h +++ b/lustre/include/linux/obd_support.h @@ -88,6 +88,7 @@ extern wait_queue_head_t obd_race_waitq; #define OBD_FAIL_MDS_PAUSE_OPEN 0x129 #endif #define OBD_FAIL_MDS_STATFS_LCW_SLEEP 0x12a +#define OBD_FAIL_MDS_OPEN_CREATE 0x12b #define OBD_FAIL_OST 0x200 #define OBD_FAIL_OST_CONNECT_NET 0x201 @@ -133,6 +134,7 @@ extern wait_queue_head_t obd_race_waitq; #define OBD_FAIL_OSC_LOCK_BL_AST 0x403 #define OBD_FAIL_OSC_LOCK_CP_AST 0x404 #define OBD_FAIL_OSC_MATCH 0x405 +#define OBD_FAIL_OSC_BRW_PREP_REQ 0x406 #define OBD_FAIL_PTLRPC 0x500 #define OBD_FAIL_PTLRPC_ACK 0x501 diff --git a/lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.20.patch b/lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.20.patch index aeff645..c21d851 100644 --- a/lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.20.patch +++ b/lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.20.patch @@ -93,7 +93,7 @@ } /* -+ * ext3_xattr_ibode_get() ++ * ext3_xattr_ibody_get() + * + * routine looks for attribute in inode body and returns it's value and size + */ diff --git a/lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.21-chaos.patch b/lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.21-chaos.patch index c1b1150..aaf543f 100644 --- a/lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.21-chaos.patch +++ b/lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.21-chaos.patch @@ -99,7 +99,7 @@ Index: linux-2.4.21-chaos/fs/ext3/xattr.c } /* -+ * ext3_xattr_ibode_get() ++ * ext3_xattr_ibody_get() + 
* + * routine looks for attribute in inode body and returns it's value and size + */ diff --git a/lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.21-suse2.patch b/lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.21-suse2.patch deleted file mode 100644 index edcf826..0000000 --- a/lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.21-suse2.patch +++ /dev/null @@ -1,760 +0,0 @@ - fs/ext3/ialloc.c | 6 - fs/ext3/inode.c | 12 - fs/ext3/super.c | 6 - fs/ext3/xattr.c | 597 +++++++++++++++++++++++++++++++++++++++++++++- - include/linux/ext3_fs.h | 2 - include/linux/ext3_fs_i.h | 3 - 6 files changed, 615 insertions(+), 11 deletions(-) - -Index: linux-2.4.21-chaos/fs/ext3/ialloc.c -=================================================================== ---- linux-2.4.21-chaos.orig/fs/ext3/ialloc.c 2003-12-12 17:39:10.000000000 +0300 -+++ linux-2.4.21-chaos/fs/ext3/ialloc.c 2003-12-12 17:39:55.000000000 +0300 -@@ -580,6 +580,12 @@ - insert_inode_hash(inode); - inode->i_generation = sbi->s_next_generation++; - -+ if (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) { -+ EXT3_I(inode)->i_extra_isize = sizeof(__u16) /* i_extra_isize */ -+ + sizeof(__u16); /* i_pad1 */ -+ } else -+ EXT3_I(inode)->i_extra_isize = 0; -+ - inode->u.ext3_i.i_state = EXT3_STATE_NEW; - err = ext3_get_inode_loc_new(inode, &iloc, 1); - if (err) goto fail; -Index: linux-2.4.21-chaos/fs/ext3/inode.c -=================================================================== ---- linux-2.4.21-chaos.orig/fs/ext3/inode.c 2003-12-12 17:39:11.000000000 +0300 -+++ linux-2.4.21-chaos/fs/ext3/inode.c 2003-12-12 17:39:55.000000000 +0300 -@@ -2502,6 +2502,12 @@ - ei->i_data[block] = iloc.raw_inode->i_block[block]; - INIT_LIST_HEAD(&ei->i_orphan); - -+ if (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) -+ EXT3_I(inode)->i_extra_isize = -+ le16_to_cpu(raw_inode->i_extra_isize); -+ else -+ EXT3_I(inode)->i_extra_isize = 0; -+ - if (S_ISREG(inode->i_mode)) { - inode->i_op = &ext3_file_inode_operations; - 
inode->i_fop = &ext3_file_operations; -@@ -2564,6 +2570,8 @@ - if (err) - goto out_brelse; - } -+ if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) -+ memset(raw_inode, 0, EXT3_INODE_SIZE(inode->i_sb)); - raw_inode->i_mode = cpu_to_le16(inode->i_mode); - if(!(test_opt(inode->i_sb, NO_UID32))) { - raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); -@@ -2646,6 +2654,10 @@ - else for (block = 0; block < EXT3_N_BLOCKS; block++) - raw_inode->i_block[block] = ei->i_data[block]; - -+ if (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) -+ raw_inode->i_extra_isize = -+ cpu_to_le16(EXT3_I(inode)->i_extra_isize); -+ - BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); - rc = ext3_journal_dirty_metadata(handle, bh); - if (!err) -Index: linux-2.4.21-chaos/fs/ext3/xattr.c -=================================================================== ---- linux-2.4.21-chaos.orig/fs/ext3/xattr.c 2003-12-12 17:38:44.000000000 +0300 -+++ linux-2.4.21-chaos/fs/ext3/xattr.c 2003-12-12 17:42:58.000000000 +0300 -@@ -88,6 +88,9 @@ - struct buffer_head *, - struct ext3_xattr_header *); - -+int ext3_xattr_block_set(handle_t *, struct inode *, int, const char *, -+ const void *, size_t, int); -+ - #ifdef CONFIG_EXT3_FS_XATTR_SHARING - - static int ext3_xattr_cache_insert(struct buffer_head *); -@@ -256,17 +259,12 @@ - } - - /* -- * ext3_xattr_get() -- * -- * Copy an extended attribute into the buffer -- * provided, or compute the buffer size required. -- * Buffer is NULL to compute the size of the buffer required. -+ * ext3_xattr_block_get() - * -- * Returns a negative error number on failure, or the number of bytes -- * used / required on success. 
-+ * routine looks for attribute in EA block and returns it's value and size - */ - int --ext3_xattr_get(struct inode *inode, int name_index, const char *name, -+ext3_xattr_block_get(struct inode *inode, int name_index, const char *name, - void *buffer, size_t buffer_size) - { - struct buffer_head *bh = NULL; -@@ -359,6 +357,94 @@ - } - - /* -+ * ext3_xattr_ibode_get() -+ * -+ * routine looks for attribute in inode body and returns it's value and size -+ */ -+int -+ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name, -+ void *buffer, size_t buffer_size) -+{ -+ int size, name_len = strlen(name), storage_size; -+ struct ext3_xattr_entry *last; -+ struct ext3_inode *raw_inode; -+ struct ext3_iloc iloc; -+ char *start, *end; -+ int ret = -ENOENT; -+ -+ if (EXT3_SB(inode->i_sb)->s_inode_size <= EXT3_GOOD_OLD_INODE_SIZE) -+ return -ENOENT; -+ -+ ret = ext3_get_inode_loc(inode, &iloc); -+ if (ret) -+ return ret; -+ raw_inode = iloc.raw_inode; -+ -+ storage_size = EXT3_SB(inode->i_sb)->s_inode_size - -+ EXT3_GOOD_OLD_INODE_SIZE - -+ EXT3_I(inode)->i_extra_isize - -+ sizeof(__u32); -+ start = (char *) raw_inode + EXT3_GOOD_OLD_INODE_SIZE + -+ EXT3_I(inode)->i_extra_isize; -+ if (le32_to_cpu((*(__u32*) start)) != EXT3_XATTR_MAGIC) { -+ brelse(iloc.bh); -+ return -ENOENT; -+ } -+ start += sizeof(__u32); -+ end = (char *) raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; -+ -+ last = (struct ext3_xattr_entry *) start; -+ while (!IS_LAST_ENTRY(last)) { -+ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); -+ if (le32_to_cpu(last->e_value_size) > storage_size || -+ (char *) next >= end) { -+ ext3_error(inode->i_sb, "ext3_xattr_ibody_get", -+ "inode %ld", inode->i_ino); -+ brelse(iloc.bh); -+ return -EIO; -+ } -+ if (name_index == last->e_name_index && -+ name_len == last->e_name_len && -+ !memcmp(name, last->e_name, name_len)) -+ goto found; -+ last = next; -+ } -+ -+ /* can't find EA */ -+ brelse(iloc.bh); -+ return -ENOENT; -+ -+found: -+ size = 
le32_to_cpu(last->e_value_size); -+ if (buffer) { -+ ret = -ERANGE; -+ if (buffer_size >= size) { -+ memcpy(buffer, start + le16_to_cpu(last->e_value_offs), -+ size); -+ ret = size; -+ } -+ } else -+ ret = size; -+ brelse(iloc.bh); -+ return ret; -+} -+ -+int ext3_xattr_get(struct inode *inode, int name_index, const char *name, -+ void *buffer, size_t buffer_size) -+{ -+ int err; -+ -+ /* try to find attribute in inode body */ -+ err = ext3_xattr_ibody_get(inode, name_index, name, -+ buffer, buffer_size); -+ if (err < 0) -+ /* search was unsuccessful, try to find EA in dedicated block */ -+ err = ext3_xattr_block_get(inode, name_index, name, -+ buffer, buffer_size); -+ return err; -+} -+ -+/* - * ext3_xattr_list() - * - * Copy a list of attribute names into the buffer -@@ -369,7 +455,7 @@ - * used / required on success. - */ - int --ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) -+ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) - { - struct buffer_head *bh = NULL; - struct ext3_xattr_entry *entry; -@@ -446,6 +532,131 @@ - return error; - } - -+/* ext3_xattr_ibody_list() -+ * -+ * generate list of attributes stored in inode body -+ */ -+int -+ext3_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size) -+{ -+ struct ext3_xattr_entry *last; -+ struct ext3_inode *raw_inode; -+ char *start, *end, *buf; -+ struct ext3_iloc iloc; -+ int storage_size; -+ int ret; -+ int size = 0; -+ -+ if (EXT3_SB(inode->i_sb)->s_inode_size <= EXT3_GOOD_OLD_INODE_SIZE) -+ return 0; -+ -+ ret = ext3_get_inode_loc(inode, &iloc); -+ if (ret) -+ return ret; -+ raw_inode = iloc.raw_inode; -+ -+ storage_size = EXT3_SB(inode->i_sb)->s_inode_size - -+ EXT3_GOOD_OLD_INODE_SIZE - -+ EXT3_I(inode)->i_extra_isize - -+ sizeof(__u32); -+ start = (char *) raw_inode + EXT3_GOOD_OLD_INODE_SIZE + -+ EXT3_I(inode)->i_extra_isize; -+ if (le32_to_cpu((*(__u32*) start)) != EXT3_XATTR_MAGIC) { -+ brelse(iloc.bh); -+ return 0; -+ } -+ start += 
sizeof(__u32); -+ end = (char *) raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; -+ -+ last = (struct ext3_xattr_entry *) start; -+ while (!IS_LAST_ENTRY(last)) { -+ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); -+ struct ext3_xattr_handler *handler; -+ if (le32_to_cpu(last->e_value_size) > storage_size || -+ (char *) next >= end) { -+ ext3_error(inode->i_sb, "ext3_xattr_ibody_list", -+ "inode %ld", inode->i_ino); -+ brelse(iloc.bh); -+ return -EIO; -+ } -+ handler = ext3_xattr_handler(last->e_name_index); -+ if (handler) -+ size += handler->list(NULL, inode, last->e_name, -+ last->e_name_len); -+ last = next; -+ } -+ -+ if (!buffer) { -+ ret = size; -+ goto cleanup; -+ } else { -+ ret = -ERANGE; -+ if (size > buffer_size) -+ goto cleanup; -+ } -+ -+ last = (struct ext3_xattr_entry *) start; -+ buf = buffer; -+ while (!IS_LAST_ENTRY(last)) { -+ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); -+ struct ext3_xattr_handler *handler; -+ handler = ext3_xattr_handler(last->e_name_index); -+ if (handler) -+ buf += handler->list(buf, inode, last->e_name, -+ last->e_name_len); -+ last = next; -+ } -+ ret = size; -+cleanup: -+ brelse(iloc.bh); -+ return ret; -+} -+ -+/* -+ * ext3_xattr_list() -+ * -+ * Copy a list of attribute names into the buffer -+ * provided, or compute the buffer size required. -+ * Buffer is NULL to compute the size of the buffer required. -+ * -+ * Returns a negative error number on failure, or the number of bytes -+ * used / required on success. 
-+ */ -+int -+ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) -+{ -+ int error; -+ int size = buffer_size; -+ -+ /* get list of attributes stored in inode body */ -+ error = ext3_xattr_ibody_list(inode, buffer, buffer_size); -+ if (error < 0) { -+ /* some error occured while collecting -+ * attributes in inode body */ -+ size = 0; -+ goto cleanup; -+ } -+ size = error; -+ -+ /* get list of attributes stored in dedicated block */ -+ if (buffer) { -+ buffer_size -= error; -+ if (buffer_size <= 0) { -+ buffer = NULL; -+ buffer_size = 0; -+ } else -+ buffer += error; -+ } -+ -+ error = ext3_xattr_block_list(inode, buffer, buffer_size); -+ if (error < 0) -+ /* listing was successful, so we return len */ -+ size = 0; -+ -+cleanup: -+ return error + size; -+} -+ - /* - * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is - * not set, set it. -@@ -480,6 +691,102 @@ - */ - int - ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, -+ const char *name, const void *value, size_t value_len, -+ int flags) -+{ -+ struct ext3_xattr_entry entry; -+ int err, where = 0, found = 0, total; -+ int free1 = -1, free2 = -1; -+ int name_len; -+ -+ ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", -+ name_index, name, value, (long)value_len); -+ -+ if (IS_RDONLY(inode)) -+ return -EROFS; -+ if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) -+ return -EPERM; -+ if (value == NULL) -+ value_len = 0; -+ if (name == NULL) -+ return -EINVAL; -+ name_len = strlen(name); -+ if (name_len > 255 || value_len > inode->i_sb->s_blocksize) -+ return -ERANGE; -+ -+ /* try to find attribute in inode body */ -+ err = ext3_xattr_ibody_find(inode, name_index, name, &entry, &free1); -+ if (err == 0) { -+ /* found EA in inode */ -+ found = 1; -+ where = 0; -+ } else if (err == -ENOENT) { -+ /* there is no such attribute in inode body */ -+ /* try to find attribute in dedicated block */ -+ err = ext3_xattr_block_find(inode, name_index, name, -+ 
&entry, &free2); -+ if (err != 0 && err != -ENOENT) { -+ /* not found EA in block */ -+ goto finish; -+ } else if (err == 0) { -+ /* found EA in block */ -+ where = 1; -+ found = 1; -+ } -+ } else -+ goto finish; -+ -+ /* check flags: may replace? may create ? */ -+ if (found && (flags & XATTR_CREATE)) { -+ err = -EEXIST; -+ goto finish; -+ } else if (!found && (flags & XATTR_REPLACE)) { -+ err = -ENODATA; -+ goto finish; -+ } -+ -+ /* check if we have enough space to store attribute */ -+ total = EXT3_XATTR_LEN(strlen(name)) + value_len; -+ if (free1 >= 0 && total > free1 && free2 >= 0 && total > free2) { -+ /* have no enough space */ -+ err = -ENOSPC; -+ goto finish; -+ } -+ -+ /* time to remove attribute */ -+ if (found) { -+ if (where == 0) { -+ /* EA is stored in inode body */ -+ ext3_xattr_ibody_set(handle, inode, name_index, name, -+ NULL, 0, flags); -+ } else { -+ /* EA is stored in separated block */ -+ ext3_xattr_block_set(handle, inode, name_index, name, -+ NULL, 0, flags); -+ } -+ } -+ -+ /* try to store EA in inode body */ -+ err = ext3_xattr_ibody_set(handle, inode, name_index, name, -+ value, value_len, flags); -+ if (err) { -+ /* can't store EA in inode body */ -+ /* try to store in block */ -+ err = ext3_xattr_block_set(handle, inode, name_index, -+ name, value, value_len, flags); -+ } -+ -+finish: -+ return err; -+} -+ -+/* -+ * ext3_xattr_block_set() -+ * -+ * this routine add/remove/replace attribute in EA block -+ */ -+int -+ext3_xattr_block_set(handle_t *handle, struct inode *inode, int name_index, - const char *name, const void *value, size_t value_len, - int flags) - { -@@ -868,6 +1174,279 @@ - } - - /* -+ * ext3_xattr_ibody_find() -+ * -+ * search attribute and calculate free space in inode body -+ * NOTE: free space includes space our attribute hold -+ */ -+int -+ext3_xattr_ibody_find(struct inode *inode, int name_index, -+ const char *name, struct ext3_xattr_entry *rentry, int *free) -+{ -+ struct ext3_xattr_entry *last; -+ struct 
ext3_inode *raw_inode; -+ int name_len = strlen(name); -+ int err, storage_size; -+ struct ext3_iloc iloc; -+ char *start, *end; -+ int ret = -ENOENT; -+ -+ if (EXT3_SB(inode->i_sb)->s_inode_size <= EXT3_GOOD_OLD_INODE_SIZE) -+ return ret; -+ -+ err = ext3_get_inode_loc(inode, &iloc); -+ if (err) -+ return -EIO; -+ raw_inode = iloc.raw_inode; -+ -+ storage_size = EXT3_SB(inode->i_sb)->s_inode_size - -+ EXT3_GOOD_OLD_INODE_SIZE - -+ EXT3_I(inode)->i_extra_isize - -+ sizeof(__u32); -+ *free = storage_size - sizeof(__u32); -+ start = (char *) raw_inode + EXT3_GOOD_OLD_INODE_SIZE + -+ EXT3_I(inode)->i_extra_isize; -+ if (le32_to_cpu((*(__u32*) start)) != EXT3_XATTR_MAGIC) { -+ brelse(iloc.bh); -+ return -ENOENT; -+ } -+ start += sizeof(__u32); -+ end = (char *) raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; -+ -+ last = (struct ext3_xattr_entry *) start; -+ while (!IS_LAST_ENTRY(last)) { -+ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); -+ if (le32_to_cpu(last->e_value_size) > storage_size || -+ (char *) next >= end) { -+ ext3_error(inode->i_sb, "ext3_xattr_ibody_find", -+ "inode %ld", inode->i_ino); -+ brelse(iloc.bh); -+ return -EIO; -+ } -+ -+ if (name_index == last->e_name_index && -+ name_len == last->e_name_len && -+ !memcmp(name, last->e_name, name_len)) { -+ memcpy(rentry, last, sizeof(struct ext3_xattr_entry)); -+ ret = 0; -+ } else { -+ *free -= EXT3_XATTR_LEN(last->e_name_len); -+ *free -= le32_to_cpu(last->e_value_size); -+ } -+ last = next; -+ } -+ -+ brelse(iloc.bh); -+ return ret; -+} -+ -+/* -+ * ext3_xattr_block_find() -+ * -+ * search attribute and calculate free space in EA block (if it allocated) -+ * NOTE: free space includes space our attribute hold -+ */ -+int -+ext3_xattr_block_find(struct inode *inode, int name_index, const char *name, -+ struct ext3_xattr_entry *rentry, int *free) -+{ -+ struct buffer_head *bh = NULL; -+ struct ext3_xattr_entry *entry; -+ char *end; -+ int name_len, error = -ENOENT; -+ -+ if 
(!EXT3_I(inode)->i_file_acl) { -+ *free = inode->i_sb->s_blocksize - -+ sizeof(struct ext3_xattr_header) - -+ sizeof(__u32); -+ return -ENOENT; -+ } -+ ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl); -+ bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); -+ if (!bh) -+ return -EIO; -+ ea_bdebug(bh, "b_count=%d, refcount=%d", -+ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); -+ end = bh->b_data + bh->b_size; -+ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || -+ HDR(bh)->h_blocks != cpu_to_le32(1)) { -+bad_block: ext3_error(inode->i_sb, "ext3_xattr_get", -+ "inode %ld: bad block %d", inode->i_ino, -+ EXT3_I(inode)->i_file_acl); -+ brelse(bh); -+ return -EIO; -+ } -+ /* find named attribute */ -+ name_len = strlen(name); -+ *free = bh->b_size - sizeof(__u32); -+ -+ entry = FIRST_ENTRY(bh); -+ while (!IS_LAST_ENTRY(entry)) { -+ struct ext3_xattr_entry *next = -+ EXT3_XATTR_NEXT(entry); -+ if ((char *)next >= end) -+ goto bad_block; -+ if (name_index == entry->e_name_index && -+ name_len == entry->e_name_len && -+ memcmp(name, entry->e_name, name_len) == 0) { -+ memcpy(rentry, entry, sizeof(struct ext3_xattr_entry)); -+ error = 0; -+ } else { -+ *free -= EXT3_XATTR_LEN(entry->e_name_len); -+ *free -= le32_to_cpu(entry->e_value_size); -+ } -+ entry = next; -+ } -+ brelse(bh); -+ -+ return error; -+} -+ -+/* -+ * ext3_xattr_inode_set() -+ * -+ * this routine add/remove/replace attribute in inode body -+ */ -+int -+ext3_xattr_ibody_set(handle_t *handle, struct inode *inode, int name_index, -+ const char *name, const void *value, size_t value_len, -+ int flags) -+{ -+ struct ext3_xattr_entry *last, *next, *here = NULL; -+ struct ext3_inode *raw_inode; -+ int name_len = strlen(name); -+ int esize = EXT3_XATTR_LEN(name_len); -+ struct buffer_head *bh; -+ int err, storage_size; -+ struct ext3_iloc iloc; -+ int free, min_offs; -+ char *start, *end; -+ -+ if (EXT3_SB(inode->i_sb)->s_inode_size <= EXT3_GOOD_OLD_INODE_SIZE) -+ return 
-ENOSPC; -+ -+ err = ext3_get_inode_loc(inode, &iloc); -+ if (err) -+ return err; -+ raw_inode = iloc.raw_inode; -+ bh = iloc.bh; -+ -+ storage_size = EXT3_SB(inode->i_sb)->s_inode_size - -+ EXT3_GOOD_OLD_INODE_SIZE - -+ EXT3_I(inode)->i_extra_isize - -+ sizeof(__u32); -+ start = (char *) raw_inode + EXT3_GOOD_OLD_INODE_SIZE + -+ EXT3_I(inode)->i_extra_isize; -+ if ((*(__u32*) start) != EXT3_XATTR_MAGIC) { -+ /* inode had no attributes before */ -+ *((__u32*) start) = cpu_to_le32(EXT3_XATTR_MAGIC); -+ } -+ start += sizeof(__u32); -+ end = (char *) raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; -+ min_offs = storage_size; -+ free = storage_size - sizeof(__u32); -+ -+ last = (struct ext3_xattr_entry *) start; -+ while (!IS_LAST_ENTRY(last)) { -+ next = EXT3_XATTR_NEXT(last); -+ if (le32_to_cpu(last->e_value_size) > storage_size || -+ (char *) next >= end) { -+ ext3_error(inode->i_sb, "ext3_xattr_ibody_set", -+ "inode %ld", inode->i_ino); -+ brelse(bh); -+ return -EIO; -+ } -+ -+ if (last->e_value_size) { -+ int offs = le16_to_cpu(last->e_value_offs); -+ if (offs < min_offs) -+ min_offs = offs; -+ } -+ if (name_index == last->e_name_index && -+ name_len == last->e_name_len && -+ !memcmp(name, last->e_name, name_len)) -+ here = last; -+ else { -+ /* we calculate all but our attribute -+ * because it will be removed before changing */ -+ free -= EXT3_XATTR_LEN(last->e_name_len); -+ free -= le32_to_cpu(last->e_value_size); -+ } -+ last = next; -+ } -+ -+ if (value && (esize + value_len > free)) { -+ brelse(bh); -+ return -ENOSPC; -+ } -+ -+ err = ext3_reserve_inode_write(handle, inode, &iloc); -+ if (err) { -+ brelse(bh); -+ return err; -+ } -+ -+ if (here) { -+ /* time to remove old value */ -+ struct ext3_xattr_entry *e; -+ int size = le32_to_cpu(here->e_value_size); -+ int border = le16_to_cpu(here->e_value_offs); -+ char *src; -+ -+ /* move tail */ -+ memmove(start + min_offs + size, start + min_offs, -+ border - min_offs); -+ -+ /* recalculate offsets */ -+ e = 
(struct ext3_xattr_entry *) start; -+ while (!IS_LAST_ENTRY(e)) { -+ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(e); -+ int offs = le16_to_cpu(e->e_value_offs); -+ if (offs < border) -+ e->e_value_offs = -+ cpu_to_le16(offs + size); -+ e = next; -+ } -+ min_offs += size; -+ -+ /* remove entry */ -+ border = EXT3_XATTR_LEN(here->e_name_len); -+ src = (char *) here + EXT3_XATTR_LEN(here->e_name_len); -+ size = (char *) last - src; -+ if ((char *) here + size > end) -+ printk("ALERT at %s:%d: 0x%p + %d > 0x%p\n", -+ __FILE__, __LINE__, here, size, end); -+ memmove(here, src, size); -+ last = (struct ext3_xattr_entry *) ((char *) last - border); -+ *((__u32 *) last) = 0; -+ } -+ -+ if (value) { -+ int offs = min_offs - value_len; -+ /* use last to create new entry */ -+ last->e_name_len = strlen(name); -+ last->e_name_index = name_index; -+ last->e_value_offs = cpu_to_le16(offs); -+ last->e_value_size = cpu_to_le32(value_len); -+ last->e_hash = last->e_value_block = 0; -+ memset(last->e_name, 0, esize); -+ memcpy(last->e_name, name, last->e_name_len); -+ if (start + offs + value_len > end) -+ printk("ALERT at %s:%d: 0x%p + %d + %d > 0x%p\n", -+ __FILE__, __LINE__, start, offs, -+ value_len, end); -+ memcpy(start + offs, value, value_len); -+ last = EXT3_XATTR_NEXT(last); -+ *((__u32 *) last) = 0; -+ } -+ -+ ext3_mark_iloc_dirty(handle, inode, &iloc); -+ brelse(bh); -+ -+ return 0; -+} -+ -+/* - * ext3_xattr_set_trans() - * - * Like ext3_xattr_set_handle, but start from an inode. 
This extended -Index: linux-2.4.21-chaos/fs/ext3/super.c -=================================================================== ---- linux-2.4.21-chaos.orig/fs/ext3/super.c 2003-12-12 17:39:11.000000000 +0300 -+++ linux-2.4.21-chaos/fs/ext3/super.c 2003-12-12 17:39:55.000000000 +0300 -@@ -1354,8 +1354,10 @@ - } else { - sbi->s_inode_size = le16_to_cpu(es->s_inode_size); - sbi->s_first_ino = le32_to_cpu(es->s_first_ino); -- if (sbi->s_inode_size != EXT3_GOOD_OLD_INODE_SIZE) { -- printk (KERN_ERR -+ if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) || -+ (sbi->s_inode_size & (sbi->s_inode_size - 1)) || -+ (sbi->s_inode_size > blocksize)) { -+ printk (KERN_ERR - "EXT3-fs: unsupported inode size: %d\n", - sbi->s_inode_size); - goto failed_mount; -Index: linux-2.4.21-chaos/include/linux/ext3_fs.h -=================================================================== ---- linux-2.4.21-chaos.orig/include/linux/ext3_fs.h 2003-12-12 17:39:10.000000000 +0300 -+++ linux-2.4.21-chaos/include/linux/ext3_fs.h 2003-12-12 17:39:55.000000000 +0300 -@@ -268,6 +268,8 @@ - __u32 m_i_reserved2[2]; - } masix2; - } osd2; /* OS dependent 2 */ -+ __u16 i_extra_isize; -+ __u16 i_pad1; - }; - - #define i_size_high i_dir_acl -Index: linux-2.4.21-chaos/include/linux/ext3_fs_i.h -=================================================================== ---- linux-2.4.21-chaos.orig/include/linux/ext3_fs_i.h 2003-12-05 16:54:33.000000000 +0300 -+++ linux-2.4.21-chaos/include/linux/ext3_fs_i.h 2003-12-12 17:39:55.000000000 +0300 -@@ -76,6 +76,9 @@ - */ - loff_t i_disksize; - -+ /* on-disk additional length */ -+ __u16 i_extra_isize; -+ - /* - * truncate_sem is for serialising ext3_truncate() against - * ext3_getblock(). 
In the 2.4 ext2 design, great chunks of inode's diff --git a/lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.22-rh.patch b/lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.22-rh.patch index e6eaaf8..18604ef 100644 --- a/lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.22-rh.patch +++ b/lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.22-rh.patch @@ -108,7 +108,7 @@ } /* -+ * ext3_xattr_ibode_get() ++ * ext3_xattr_ibody_get() + * + * routine looks for attribute in inode body and returns it's value and size + */ diff --git a/lustre/kernel_patches/patches/ext3-ea-in-inode-2.6-suse.patch b/lustre/kernel_patches/patches/ext3-ea-in-inode-2.6-suse.patch index 997cc1e..f984067 100644 --- a/lustre/kernel_patches/patches/ext3-ea-in-inode-2.6-suse.patch +++ b/lustre/kernel_patches/patches/ext3-ea-in-inode-2.6-suse.patch @@ -94,7 +94,7 @@ Index: linux-2.6.0/fs/ext3/xattr.c /* - * ext3_xattr_list() -+ * ext3_xattr_ibode_get() ++ * ext3_xattr_ibody_get() * - * Copy a list of attribute names into the buffer + * routine looks for attribute in inode body and returns it's value and size diff --git a/lustre/kernel_patches/patches/ext3-mballoc2-2.6.7.patch b/lustre/kernel_patches/patches/ext3-mballoc2-2.6.7.patch index 9d782c4..1c8b8d9 100644 --- a/lustre/kernel_patches/patches/ext3-mballoc2-2.6.7.patch +++ b/lustre/kernel_patches/patches/ext3-mballoc2-2.6.7.patch @@ -1681,9 +1681,9 @@ Index: linux-2.6.7/include/linux/ext3_fs.h */ @@ -335,6 +337,7 @@ #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ - #define EXT3_MOUNT_EXTENTS 0x10000 /* Extents support */ - #define EXT3_MOUNT_EXTDEBUG 0x20000 /* Extents debug */ -+#define EXT3_MOUNT_MBALLOC 0x100000/* Buddy allocation support */ + #define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */ + #define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */ ++#define EXT3_MOUNT_MBALLOC 0x400000/* Buddy allocation support */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef 
clear_opt diff --git a/lustre/kernel_patches/patches/vfs_intent-2.4.21-chaos.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.21-chaos.patch index 29dc451..f6d9f9d 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.4.21-chaos.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.4.21-chaos.patch @@ -561,7 +561,7 @@ Index: linux-ia64/fs/namei.c if (!IS_POSIXACL(dir->d_inode)) mode &= ~current->fs->umask; - error = vfs_create(dir->d_inode, dentry, mode); -+ error = vfs_create_it(dir->d_inode, dentry, mode, it); ++ error = vfs_create_it(dir->d_inode, dentry, mode, it); up(&dir->d_inode->i_sem); dput(nd->dentry); nd->dentry = dentry; @@ -1209,7 +1209,7 @@ Index: linux-ia64/fs/open.c - error = __user_walk(filename, LOOKUP_POSITIVE | LOOKUP_FOLLOW | - LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); + error = __user_walk_it(filename, LOOKUP_POSITIVE | LOOKUP_FOLLOW | -+ LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd, &it); ++ LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd, &it); if (error) goto out; @@ -1668,7 +1668,7 @@ Index: linux-ia64/include/linux/fs.h #define ATTR_ATTR_FLAG 1024 +#define ATTR_RAW 0x0800 /* file system, not vfs will massage attrs */ +#define ATTR_FROM_OPEN 0x1000 /* called from open path, ie O_TRUNC */ -+#define ATTR_CTIME_SET 0x2000 ++#define ATTR_CTIME_SET 0x2000 /* * This is the Inode Attributes structure, used for notify_change(). 
It diff --git a/lustre/kernel_patches/series/chaos-2.4.21 b/lustre/kernel_patches/series/chaos-2.4.21 index 97190b5..3e69962 100644 --- a/lustre/kernel_patches/series/chaos-2.4.21 +++ b/lustre/kernel_patches/series/chaos-2.4.21 @@ -1,5 +1,6 @@ revert-76chaos.patch configurable-x86-stack-2.4.21-chaos.patch +configurable-x86_64-2.4.21.patch dev_read_only_2.4.21-chaos.patch exports_2.4.19-suse.patch lustre_version.patch diff --git a/lustre/kernel_patches/series/rhel-2.4.21 b/lustre/kernel_patches/series/rhel-2.4.21 index cf623d5..817319b 100644 --- a/lustre/kernel_patches/series/rhel-2.4.21 +++ b/lustre/kernel_patches/series/rhel-2.4.21 @@ -1,4 +1,5 @@ configurable-x86-stack-2.4.21-chaos.patch +configurable-x86_64-2.4.21.patch dev_read_only_2.4.21-chaos.patch exports_2.4.19-suse.patch lustre_version.patch diff --git a/lustre/kernel_patches/series/suse-2.4.21-2 b/lustre/kernel_patches/series/suse-2.4.21-2 index 27928ea..52337b9 100644 --- a/lustre/kernel_patches/series/suse-2.4.21-2 +++ b/lustre/kernel_patches/series/suse-2.4.21-2 @@ -24,7 +24,7 @@ ext3-no-write-super-chaos.patch add_page_private.patch nfs_export_kernel-2.4.21-suse2.patch ext3-raw-lookup.patch -ext3-ea-in-inode-2.4.21-suse2.patch +ext3-ea-in-inode-2.4.21-chaos.patch listman-2.4.20.patch ext3-xattr-ptr-arith-fix.patch procfs-ndynamic-2.4.21-suse2.patch diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 3dcaa90..c5f39a3 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -113,8 +113,16 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf) spin_lock_init(&cli->cl_write_rpc_hist.oh_lock); spin_lock_init(&cli->cl_read_page_hist.oh_lock); spin_lock_init(&cli->cl_write_page_hist.oh_lock); - cli->cl_max_pages_per_rpc = PTLRPC_MAX_BRW_PAGES; - cli->cl_max_rpcs_in_flight = OSC_MAX_RIF_DEFAULT; + if (num_physpages >> (20 - PAGE_SHIFT) <= 128) { /* <= 128 MB */ + cli->cl_max_pages_per_rpc = PTLRPC_MAX_BRW_PAGES / 4; + cli->cl_max_rpcs_in_flight = 
OSC_MAX_RIF_DEFAULT / 4; + } else if (num_physpages >> (20 - PAGE_SHIFT) <= 512) { /* <= 512 MB */ + cli->cl_max_pages_per_rpc = PTLRPC_MAX_BRW_PAGES / 2; + cli->cl_max_rpcs_in_flight = OSC_MAX_RIF_DEFAULT / 2; + } else { + cli->cl_max_pages_per_rpc = PTLRPC_MAX_BRW_PAGES; + cli->cl_max_rpcs_in_flight = OSC_MAX_RIF_DEFAULT; + } rc = ldlm_get_ref(); if (rc) { @@ -596,6 +604,9 @@ void target_destroy_export(struct obd_export *exp) static void target_release_saved_req(struct ptlrpc_request *req) { + if (req->rq_reply_state != NULL) + ptlrpc_rs_decref(req->rq_reply_state); + class_export_put(req->rq_export); OBD_FREE(req->rq_reqmsg, req->rq_reqlen); OBD_FREE(req, sizeof *req); @@ -684,15 +695,12 @@ void target_cleanup_recovery(struct obd_device *obd) list_for_each_safe(tmp, n, &obd->obd_delayed_reply_queue) { req = list_entry(tmp, struct ptlrpc_request, rq_list); list_del(&req->rq_list); - LASSERT (req->rq_reply_state); - lustre_free_reply_state(req->rq_reply_state); target_release_saved_req(req); } list_for_each_safe(tmp, n, &obd->obd_recovery_queue) { req = list_entry(tmp, struct ptlrpc_request, rq_list); list_del(&req->rq_list); - LASSERT (req->rq_reply_state == 0); target_release_saved_req(req); } } @@ -1005,8 +1013,7 @@ int target_queue_final_reply(struct ptlrpc_request *req, int rc) LBUG(); memcpy(saved_req, req, sizeof *saved_req); memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen); - /* the copied req takes over the reply state */ - req->rq_reply_state = NULL; + ptlrpc_rs_addref(req->rq_reply_state); /* +1 ref for saved reply */ req = saved_req; req->rq_reqmsg = reqmsg; class_export_get(req->rq_export); @@ -1045,18 +1052,6 @@ target_send_reply_msg (struct ptlrpc_request *req, int rc, int fail_id) if (OBD_FAIL_CHECK(fail_id | OBD_FAIL_ONCE)) { obd_fail_loc |= OBD_FAIL_ONCE | OBD_FAILED; DEBUG_REQ(D_ERROR, req, "dropping reply"); - /* NB this does _not_ send with ACK disabled, to simulate - * sending OK, but timing out for the ACK */ - if (req->rq_reply_state != NULL) 
{ - if (!req->rq_reply_state->rs_difficult) { - lustre_free_reply_state (req->rq_reply_state); - req->rq_reply_state = NULL; - } else { - struct ptlrpc_service *svc = - req->rq_rqbd->rqbd_srv_ni->sni_service; - atomic_inc(&svc->srv_outstanding_replies); - } - } return (-ECOMM); } @@ -1087,12 +1082,8 @@ target_send_reply(struct ptlrpc_request *req, int rc, int fail_id) rs = req->rq_reply_state; if (rs == NULL || !rs->rs_difficult) { - /* The easy case; no notifiers and reply_out_callback() - * cleans up (i.e. we can't look inside rs after a - * successful send) */ - netrc = target_send_reply_msg (req, rc, fail_id); - - LASSERT (netrc == 0 || req->rq_reply_state == NULL); + /* no notifiers */ + target_send_reply_msg (req, rc, fail_id); return; } @@ -1141,8 +1132,16 @@ target_send_reply(struct ptlrpc_request *req, int rc, int fail_id) svc->srv_n_difficult_replies++; - if (netrc != 0) /* error sending: reply is off the net */ + if (netrc != 0) { + /* error sending: reply is off the net. Also we need +1 + * reply ref until ptlrpc_server_handle_reply() is done + * with the reply state (if the send was successful, there + * would have been +1 ref for the net, which + * reply_out_callback leaves alone) */ rs->rs_on_net = 0; + ptlrpc_rs_addref(rs); + atomic_inc (&svc->srv_outstanding_replies); + } if (!rs->rs_on_net || /* some notifier */ list_empty(&rs->rs_exp_list) || /* completed already */ diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index 39d24ac..67ab95a 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -58,16 +58,15 @@ inline unsigned long round_timeout(unsigned long timeout) } #ifdef __KERNEL__ -/* XXX should this be per-ldlm? 
*/ -static struct list_head waiting_locks_list; +/* w_l_spinlock protects both waiting_locks_list and expired_lock_thread */ static spinlock_t waiting_locks_spinlock; +static struct list_head waiting_locks_list; static struct timer_list waiting_locks_timer; static struct expired_lock_thread { wait_queue_head_t elt_waitq; int elt_state; struct list_head elt_expired_locks; - spinlock_t elt_lock; } expired_lock_thread; #endif @@ -96,9 +95,9 @@ static inline int have_expired_locks(void) { int need_to_run; - spin_lock_bh(&expired_lock_thread.elt_lock); + spin_lock_bh(&waiting_locks_spinlock); need_to_run = !list_empty(&expired_lock_thread.elt_expired_locks); - spin_unlock_bh(&expired_lock_thread.elt_lock); + spin_unlock_bh(&waiting_locks_spinlock); RETURN(need_to_run); } @@ -129,7 +128,7 @@ static int expired_lock_main(void *arg) expired_lock_thread.elt_state == ELT_TERMINATE, &lwi); - spin_lock_bh(&expired_lock_thread.elt_lock); + spin_lock_bh(&waiting_locks_spinlock); while (!list_empty(expired)) { struct obd_export *export; struct ldlm_lock *lock; @@ -151,13 +150,13 @@ static int expired_lock_main(void *arg) continue; } export = class_export_get(lock->l_export); - spin_unlock_bh(&expired_lock_thread.elt_lock); + spin_unlock_bh(&waiting_locks_spinlock); ptlrpc_fail_export(export); class_export_put(export); - spin_lock_bh(&expired_lock_thread.elt_lock); + spin_lock_bh(&waiting_locks_spinlock); } - spin_unlock_bh(&expired_lock_thread.elt_lock); + spin_unlock_bh(&waiting_locks_spinlock); if (expired_lock_thread.elt_state == ELT_TERMINATE) break; @@ -175,6 +174,7 @@ static void waiting_locks_callback(unsigned long unused) spin_lock_bh(&waiting_locks_spinlock); while (!list_empty(&waiting_locks_list)) { + lock = list_entry(waiting_locks_list.next, struct ldlm_lock, l_pending_chain); @@ -182,10 +182,9 @@ static void waiting_locks_callback(unsigned long unused) break; LDLM_ERROR(lock, "lock callback timer expired: evicting client " - "%s@%s nid %s ", - 
lock->l_export->exp_client_uuid.uuid, + "%s@%s nid %s ",lock->l_export->exp_client_uuid.uuid, lock->l_export->exp_connection->c_remote_uuid.uuid, - ptlrpc_peernid2str(&lock->l_export->exp_connection->c_peer, str)); + ptlrpc_peernid2str(&lock->l_export->exp_connection->c_peer,str)); if (lock == last) { LDLM_ERROR(lock, "waiting on lock multiple times"); @@ -193,16 +192,15 @@ static void waiting_locks_callback(unsigned long unused) waiting_locks_list.next, waiting_locks_list.prev, lock->l_pending_chain.next, lock->l_pending_chain.prev); - spin_unlock(&waiting_locks_spinlock); + spin_unlock_bh(&waiting_locks_spinlock); LBUG(); } last = lock; - spin_lock_bh(&expired_lock_thread.elt_lock); list_del(&lock->l_pending_chain); list_add(&lock->l_pending_chain, &expired_lock_thread.elt_expired_locks); - spin_unlock_bh(&expired_lock_thread.elt_lock); + wake_up(&expired_lock_thread.elt_waitq); } @@ -232,11 +230,10 @@ static int ldlm_add_waiting_lock(struct ldlm_lock *lock) spin_lock_bh(&waiting_locks_spinlock); if (!list_empty(&lock->l_pending_chain)) { - LDLM_DEBUG(lock, "not re-adding to wait list"); spin_unlock_bh(&waiting_locks_spinlock); + LDLM_DEBUG(lock, "not re-adding to wait list"); return 0; } - LDLM_DEBUG(lock, "adding to wait list"); lock->l_callback_timeout = jiffies + (obd_timeout * HZ / 2); @@ -248,6 +245,7 @@ static int ldlm_add_waiting_lock(struct ldlm_lock *lock) } list_add_tail(&lock->l_pending_chain, &waiting_locks_list); /* FIFO */ spin_unlock_bh(&waiting_locks_spinlock); + LDLM_DEBUG(lock, "adding to wait list"); return 1; } @@ -288,11 +286,7 @@ int ldlm_del_waiting_lock(struct ldlm_lock *lock) round_timeout(next->l_callback_timeout)); } } - - /* the lock could already be expired, get the elt_lock also */ - spin_lock_bh(&expired_lock_thread.elt_lock); list_del_init(&lock->l_pending_chain); - spin_unlock_bh(&expired_lock_thread.elt_lock); spin_unlock_bh(&waiting_locks_spinlock); LDLM_DEBUG(lock, "removed"); @@ -1313,7 +1307,7 @@ static int ldlm_setup(void) 
ptlrpc_init_svc(LDLM_NBUFS, LDLM_BUFSIZE, LDLM_MAXREQSIZE, LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL, 1500, ldlm_callback_handler, "ldlm_cbd", - ldlm_svc_proc_dir); + ldlm_svc_proc_dir, NULL); if (!ldlm_state->ldlm_cb_service) { CERROR("failed to start service\n"); @@ -1325,7 +1319,7 @@ static int ldlm_setup(void) LDLM_CANCEL_REQUEST_PORTAL, LDLM_CANCEL_REPLY_PORTAL, 30000, ldlm_cancel_handler, "ldlm_canceld", - ldlm_svc_proc_dir); + ldlm_svc_proc_dir, NULL); if (!ldlm_state->ldlm_cancel_service) { CERROR("failed to start service\n"); @@ -1369,10 +1363,15 @@ static int ldlm_setup(void) GOTO(out_thread, rc); INIT_LIST_HEAD(&expired_lock_thread.elt_expired_locks); - spin_lock_init(&expired_lock_thread.elt_lock); expired_lock_thread.elt_state = ELT_STOPPED; init_waitqueue_head(&expired_lock_thread.elt_waitq); + INIT_LIST_HEAD(&waiting_locks_list); + spin_lock_init(&waiting_locks_spinlock); + waiting_locks_timer.function = waiting_locks_callback; + waiting_locks_timer.data = 0; + init_timer(&waiting_locks_timer); + rc = kernel_thread(expired_lock_main, NULL, CLONE_VM | CLONE_FS); if (rc < 0) { CERROR("Cannot start ldlm expired-lock thread: %d\n", rc); @@ -1381,12 +1380,6 @@ static int ldlm_setup(void) wait_event(expired_lock_thread.elt_waitq, expired_lock_thread.elt_state == ELT_READY); - - INIT_LIST_HEAD(&waiting_locks_list); - spin_lock_init(&waiting_locks_spinlock); - waiting_locks_timer.function = waiting_locks_callback; - waiting_locks_timer.data = 0; - init_timer(&waiting_locks_timer); #endif RETURN(0); diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index 0981545..fae8d40 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -401,6 +401,10 @@ int ldlm_cli_enqueue(struct obd_export *exp, int err = lock->l_completion_ast(lock, *flags, NULL); if (!rc) rc = err; + if (lock->l_destroyed || + lock->l_flags & LDLM_FL_FAILED) + cleanup_phase = 2; + } } diff --git a/lustre/liblustre/llite_lib.h b/lustre/liblustre/llite_lib.h 
index 7fffce6..9d66c89 100644 --- a/lustre/liblustre/llite_lib.h +++ b/lustre/liblustre/llite_lib.h @@ -109,10 +109,11 @@ struct llu_inode_info { unsigned long lli_st_generation; }; -#define LLU_SYSIO_COOKIE_SIZE(x) \ +#define LLU_SYSIO_COOKIE_SIZE(exp, x) \ (sizeof(struct llu_sysio_cookie) + \ sizeof(struct ll_async_page) * (x) + \ - sizeof(struct page) * (x)) + sizeof(struct page) * (x) + \ + llap_cookie_size * (x)) struct llu_sysio_cookie { struct obd_io_group *lsc_oig; @@ -121,6 +122,7 @@ struct llu_sysio_cookie { int lsc_npages; struct ll_async_page *lsc_llap; struct page *lsc_pages; + void *lsc_llap_cookie; __u64 lsc_rwcount; }; diff --git a/lustre/liblustre/rw.c b/lustre/liblustre/rw.c index 43e75c5..a59dba0 100644 --- a/lustre/liblustre/rw.c +++ b/lustre/liblustre/rw.c @@ -41,7 +41,9 @@ #include "llite_lib.h" -static int llu_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock) +size_t llap_cookie_size; + +static int llu_lock_to_stripe_offset(struct inode *inode,struct ldlm_lock *lock) { struct llu_inode_info *lli = llu_i2info(inode); struct lov_stripe_md *lsm = lli->lli_smd; @@ -418,12 +420,17 @@ static struct obd_async_page_ops llu_async_page_ops = { }; static -struct llu_sysio_cookie* get_sysio_cookie(struct inode *inode, int maxpages) +struct llu_sysio_cookie* get_sysio_cookie(struct inode *inode, + struct obd_export *exp, int maxpages) { struct llu_sysio_cookie *cookie; int rc; - OBD_ALLOC(cookie, LLU_SYSIO_COOKIE_SIZE(maxpages)); + if (!llap_cookie_size) + llap_cookie_size = obd_prep_async_page(llu_i2obdexp(inode), + NULL, NULL, NULL, 0, + NULL, NULL, NULL); + OBD_ALLOC(cookie, LLU_SYSIO_COOKIE_SIZE(exp, maxpages)); if (cookie == NULL) goto out; @@ -432,10 +439,11 @@ struct llu_sysio_cookie* get_sysio_cookie(struct inode *inode, int maxpages) cookie->lsc_maxpages = maxpages; cookie->lsc_llap = (struct ll_async_page *)(cookie + 1); cookie->lsc_pages = (struct page *) (cookie->lsc_llap + maxpages); + cookie->lsc_llap_cookie = (void 
*)(cookie->lsc_pages + maxpages); rc = oig_init(&cookie->lsc_oig); if (rc) { - OBD_FREE(cookie, LLU_SYSIO_COOKIE_SIZE(maxpages)); + OBD_FREE(cookie, LLU_SYSIO_COOKIE_SIZE(exp, maxpages)); cookie = NULL; } @@ -469,7 +477,7 @@ void put_sysio_cookie(struct llu_sysio_cookie *cookie) I_RELE(cookie->lsc_inode); oig_release(cookie->lsc_oig); - OBD_FREE(cookie, LLU_SYSIO_COOKIE_SIZE(cookie->lsc_maxpages)); + OBD_FREE(cookie, LLU_SYSIO_COOKIE_SIZE(exp, cookie->lsc_maxpages)); } #ifdef LIBLUSTRE_HANDLE_UNALIGNED_PAGE @@ -560,6 +568,7 @@ int llu_prep_async_io(struct llu_sysio_cookie *cookie, int cmd, struct obd_export *exp = llu_i2obdexp(cookie->lsc_inode); struct page *pages = cookie->lsc_pages; struct ll_async_page *llap = cookie->lsc_llap; + void *llap_cookie = cookie->lsc_llap_cookie; int i, rc, npages = 0; ENTRY; @@ -611,6 +620,7 @@ int llu_prep_async_io(struct llu_sysio_cookie *cookie, int cmd, for (i = 0; i < npages; i++) { llap[i].llap_magic = LLAP_MAGIC; + llap[i].llap_cookie = llap_cookie + i * llap_cookie_size; rc = obd_prep_async_page(exp, lsm, NULL, &pages[i], (obd_off)pages[i].index << PAGE_SHIFT, &llu_async_page_ops, @@ -661,7 +671,7 @@ llu_rw(int cmd, struct inode *inode, char *buf, size_t count, loff_t pos) max_pages = (count >> PAGE_SHIFT) + 2; - cookie = get_sysio_cookie(inode, max_pages); + cookie = get_sysio_cookie(inode, llu_i2obdexp(inode), max_pages); if (!cookie) RETURN(ERR_PTR(-ENOMEM)); diff --git a/lustre/llite/llite_close.c b/lustre/llite/llite_close.c index 09d80be..c218fa4 100644 --- a/lustre/llite/llite_close.c +++ b/lustre/llite/llite_close.c @@ -43,8 +43,7 @@ void llap_write_complete(struct inode *inode, struct ll_async_page *llap) { struct ll_inode_info *lli = ll_i2info(inode); spin_lock(&lli->lli_lock); - if (!list_empty(&llap->llap_pending_write)) - list_del_init(&llap->llap_pending_write); + list_del_init(&llap->llap_pending_write); spin_unlock(&lli->lli_lock); } diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h 
index 6d2fb9c..4075d6b 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -14,7 +14,7 @@ /* default to about 40meg of readahead on a given system. That much tied * up in 512k readahead requests serviced at 40ms each is about 1GB/s. */ -#define SBI_DEFAULT_RA_MAX ((40 << 20) >> PAGE_CACHE_SHIFT) +#define SBI_DEFAULT_READAHEAD_MAX ((40UL << 20) >> PAGE_CACHE_SHIFT) enum ra_stat { RA_STAT_HIT = 0, RA_STAT_MISS, @@ -56,8 +56,10 @@ struct ll_sb_info { struct lprocfs_stats *ll_stats; /* lprocfs stats counter */ + unsigned long ll_async_page_max; + unsigned long ll_async_page_count; unsigned long ll_pglist_gen; - struct list_head ll_pglist; + struct list_head ll_pglist; /* all pages (llap_pglist_item) */ struct ll_ra_info ll_ra_info; unsigned int ll_namelen; @@ -127,6 +129,8 @@ struct it_cb_data { #define LLAP_MAGIC 98764321 +extern kmem_cache_t *ll_async_page_slab; +extern size_t ll_async_page_slab_size; struct ll_async_page { int llap_magic; void *llap_cookie; @@ -137,7 +141,7 @@ struct ll_async_page { llap_defer_uptodate:1, llap_origin:3, llap_ra_used:1; - struct list_head llap_proc_item; + struct list_head llap_pglist_item; }; enum { @@ -148,6 +152,7 @@ enum { LLAP_ORIGIN_WRITEPAGE, LLAP__ORIGIN_MAX, }; +extern char *llap_origins[]; /* llite/lproc_llite.c */ int lprocfs_register_mountpoint(struct proc_dir_entry *parent, @@ -177,6 +182,7 @@ int ll_commit_write(struct file *, struct page *, unsigned from, unsigned to); int ll_writepage(struct page *page); void ll_inode_fill_obdo(struct inode *inode, int cmd, struct obdo *oa); void ll_ap_completion(void *data, int cmd, struct obdo *oa, int rc); +int llap_shrink_cache(struct ll_sb_info *sbi, int shrink_fraction); void ll_removepage(struct page *page); int ll_readpage(struct file *file, struct page *page); struct ll_async_page *llap_from_cookie(void *cookie); @@ -228,7 +234,6 @@ int lustre_fill_super(struct super_block *sb, void *data, int silent); void lustre_put_super(struct super_block 
*sb); struct inode *ll_inode_from_lock(struct ldlm_lock *lock); void ll_clear_inode(struct inode *inode); -int ll_attr2inode(struct inode *inode, struct iattr *attr, int trunc); int ll_setattr_raw(struct inode *inode, struct iattr *attr); int ll_setattr(struct dentry *de, struct iattr *attr); int ll_statfs(struct super_block *sb, struct kstatfs *sfs); @@ -236,14 +241,16 @@ int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs, unsigned long maxage); void ll_update_inode(struct inode *inode, struct mds_body *body, struct lov_stripe_md *lsm); -int it_disposition(struct lookup_intent *it, int flag); -void it_set_disposition(struct lookup_intent *it, int flag); void ll_read_inode2(struct inode *inode, void *opaque); int ll_iocontrol(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg); void ll_umount_begin(struct super_block *sb); int ll_prep_inode(struct obd_export *exp, struct inode **inode, struct ptlrpc_request *req, int offset, struct super_block *); +struct ll_async_page *llite_pglist_next_llap(struct ll_sb_info *sbi, + struct list_head *list); + +/* llite/llite_nfs.c */ __u32 get_uuid2int(const char *name, int len); struct dentry *ll_fh_to_dentry(struct super_block *sb, __u32 *data, int len, int fhtype, int parent); @@ -279,7 +286,6 @@ void ll_close_thread_shutdown(struct ll_close_queue *lcq); int ll_close_thread_start(struct ll_close_queue **lcq_ret); #define LL_SBI_NOLCK 0x1 -#define LL_SBI_READAHEAD 0x2 #define LL_MAX_BLKSIZE (4UL * 1024 * 1024) diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index 2d09beb..fe911da 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -56,7 +56,13 @@ struct ll_sb_info *lustre_init_sbi(struct super_block *sb) spin_lock_init(&sbi->ll_lock); INIT_LIST_HEAD(&sbi->ll_pglist); sbi->ll_pglist_gen = 0; - sbi->ll_ra_info.ra_max_pages = SBI_DEFAULT_RA_MAX; + if (num_physpages >> (20 - PAGE_SHIFT) < 512) + sbi->ll_async_page_max = num_physpages / 2; + else + 
sbi->ll_async_page_max = (num_physpages / 4) * 3; + sbi->ll_ra_info.ra_max_pages = min(num_physpages / 8, + SBI_DEFAULT_READAHEAD_MAX); + INIT_LIST_HEAD(&sbi->ll_conn_chain); INIT_HLIST_HEAD(&sbi->ll_orphan_dentry_list); ll_s2sbi(sb) = sbi; @@ -151,6 +157,18 @@ int lustre_common_fill_super(struct super_block *sb, char *mdc, char *osc) } sbi->ll_osc_exp = class_conn2export(&osc_conn); + if (!ll_async_page_slab) { + ll_async_page_slab_size = + size_round(sizeof(struct ll_async_page)) + + obd_prep_async_page(sbi->ll_osc_exp, NULL, NULL, NULL, + 0, NULL, NULL, NULL); + ll_async_page_slab = kmem_cache_create("ll_async_page", + ll_async_page_slab_size, + 0, 0, NULL, NULL); + if (!ll_async_page_slab) + GOTO(out_osc, -ENOMEM); + } + err = mdc_getstatus(sbi->ll_mdc_exp, &rootfid); if (err) { CERROR("cannot mds_connect: rc = %d\n", err); @@ -341,7 +359,6 @@ int ll_fill_super(struct super_block *sb, void *data, int silent) if (!sbi) RETURN(-ENOMEM); - sbi->ll_flags |= LL_SBI_READAHEAD; ll_options(data, &osc, &mdc, &sbi->ll_flags); if (!osc) { @@ -556,8 +573,6 @@ int lustre_fill_super(struct super_block *sb, void *data, int silent) if (!sbi) RETURN(-ENOMEM); - sbi->ll_flags |= LL_SBI_READAHEAD; - if (lmd->lmd_profile) { struct lustre_profile *lprof; struct config_llog_instance cfg; @@ -1357,3 +1372,30 @@ int ll_prep_inode(struct obd_export *exp, struct inode **inode, RETURN(rc); } + +char *llap_origins[] = { + [LLAP_ORIGIN_UNKNOWN] = "--", + [LLAP_ORIGIN_READPAGE] = "rp", + [LLAP_ORIGIN_READAHEAD] = "ra", + [LLAP_ORIGIN_COMMIT_WRITE] = "cw", + [LLAP_ORIGIN_WRITEPAGE] = "wp", +}; + +struct ll_async_page *llite_pglist_next_llap(struct ll_sb_info *sbi, + struct list_head *list) +{ + struct ll_async_page *llap; + struct list_head *pos; + + list_for_each(pos, list) { + if (pos == &sbi->ll_pglist) + return NULL; + llap = list_entry(pos, struct ll_async_page, llap_pglist_item); + if (llap->llap_page == NULL) + continue; + return llap; + } + LBUG(); + return NULL; +} + diff --git 
a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c index e09a703..d39803c 100644 --- a/lustre/llite/lproc_llite.c +++ b/lustre/llite/lproc_llite.c @@ -182,55 +182,60 @@ static int ll_rd_sb_uuid(char *page, char **start, off_t off, int count, return snprintf(page, count, "%s\n", ll_s2sbi(sb)->ll_sb_uuid.uuid); } -static int ll_rd_read_ahead(char *page, char **start, off_t off, int count, - int *eof, void *data) +static int ll_rd_max_readahead_mb(char *page, char **start, off_t off, + int count, int *eof, void *data) { - struct super_block *sb = (struct super_block*)data; + struct super_block *sb = data; struct ll_sb_info *sbi = ll_s2sbi(sb); - int val, rc; - ENTRY; + unsigned val; - *eof = 1; - val = (sbi->ll_flags & LL_SBI_READAHEAD) ? 1 : 0; - rc = snprintf(page, count, "%d\n", val); - RETURN(rc); + spin_lock(&sbi->ll_lock); + val = sbi->ll_ra_info.ra_max_pages >> (20 - PAGE_CACHE_SHIFT); + spin_unlock(&sbi->ll_lock); + + return snprintf(page, count, "%u\n", val); } -static int ll_wr_read_ahead(struct file *file, const char *buffer, - unsigned long count, void *data) +static int ll_wr_max_readahead_mb(struct file *file, const char *buffer, + unsigned long count, void *data) { - struct super_block *sb = (struct super_block*)data; + struct super_block *sb = data; struct ll_sb_info *sbi = ll_s2sbi(sb); - int readahead; - ENTRY; + int val, rc; - if (sscanf(buffer, "%d", &readahead) != 1) - RETURN(-EINVAL); + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; - if (readahead) - sbi->ll_flags |= LL_SBI_READAHEAD; - else - sbi->ll_flags &= ~LL_SBI_READAHEAD; + if (val < 0 || val > (num_physpages >> (20 - PAGE_CACHE_SHIFT - 1))) { + CERROR("can't set readahead more than %lu MB\n", + num_physpages >> (20 - PAGE_CACHE_SHIFT - 1)); + return -ERANGE; + } + + spin_lock(&sbi->ll_lock); + sbi->ll_ra_info.ra_max_pages = val << (20 - PAGE_CACHE_SHIFT); + spin_unlock(&sbi->ll_lock); - RETURN(count); + return count; } -static int 
ll_rd_max_read_ahead_mb(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int ll_rd_max_cached_mb(char *page, char **start, off_t off, + int count, int *eof, void *data) { struct super_block *sb = data; struct ll_sb_info *sbi = ll_s2sbi(sb); unsigned val; spin_lock(&sbi->ll_lock); - val = (sbi->ll_ra_info.ra_max_pages << PAGE_CACHE_SHIFT) >> 20; + val = sbi->ll_async_page_max >> (20 - PAGE_CACHE_SHIFT); spin_unlock(&sbi->ll_lock); return snprintf(page, count, "%u\n", val); } -static int ll_wr_max_read_ahead_mb(struct file *file, const char *buffer, - unsigned long count, void *data) +static int ll_wr_max_cached_mb(struct file *file, const char *buffer, + unsigned long count, void *data) { struct super_block *sb = data; struct ll_sb_info *sbi = ll_s2sbi(sb); @@ -240,13 +245,19 @@ static int ll_wr_max_read_ahead_mb(struct file *file, const char *buffer, if (rc) return rc; - if (val < 0 || val > (num_physpages << PAGE_SHIFT) >> 20) + if (val < 0 || val > (num_physpages >> (20 - PAGE_CACHE_SHIFT))) { + CERROR("can't set max cache more than %lu MB\n", + num_physpages >> (20 - PAGE_CACHE_SHIFT)); return -ERANGE; + } spin_lock(&sbi->ll_lock); - sbi->ll_ra_info.ra_max_pages = (val << 20) >> PAGE_CACHE_SHIFT; + sbi->ll_async_page_max = val << (20 - PAGE_CACHE_SHIFT); spin_unlock(&sbi->ll_lock); + if (sbi->ll_async_page_count >= sbi->ll_async_page_max) + llap_shrink_cache(sbi, 0); + return count; } @@ -261,9 +272,9 @@ static struct lprocfs_vars lprocfs_obd_vars[] = { { "filestotal", ll_rd_filestotal, 0, 0 }, { "filesfree", ll_rd_filesfree, 0, 0 }, //{ "filegroups", lprocfs_rd_filegroups, 0, 0 }, - { "read_ahead", ll_rd_read_ahead, ll_wr_read_ahead, 0 }, - { "max_read_ahead_mb", ll_rd_max_read_ahead_mb, - ll_wr_max_read_ahead_mb, 0 }, + { "max_read_ahead_mb", ll_rd_max_readahead_mb, + ll_wr_max_readahead_mb, 0 }, + { "max_cached_mb", ll_rd_max_cached_mb, ll_wr_max_cached_mb, 0 }, { 0 } }; @@ -458,24 +469,6 @@ void 
lprocfs_unregister_mountpoint(struct ll_sb_info *sbi) } #undef MAX_STRING_SIZE -static struct ll_async_page *llite_pglist_next_llap(struct ll_sb_info *sbi, - struct list_head *list) -{ - struct ll_async_page *llap; - struct list_head *pos; - - list_for_each(pos, list) { - if (pos == &sbi->ll_pglist) - return NULL; - llap = list_entry(pos, struct ll_async_page, llap_proc_item); - if (llap->llap_page == NULL) - continue; - return llap; - } - LBUG(); - return NULL; -} - #define seq_page_flag(seq, page, flag, has_flags) do { \ if (test_bit(PG_##flag, &(page)->flags)) { \ if (!has_flags) \ @@ -486,6 +479,16 @@ static struct ll_async_page *llite_pglist_next_llap(struct ll_sb_info *sbi, } \ } while(0); +static void *llite_dump_pgcache_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct ll_async_page *dummy_llap = seq->private; + + if (dummy_llap->llap_magic == 2) + return NULL; + + return (void *)1; +} + static int llite_dump_pgcache_seq_show(struct seq_file *seq, void *v) { struct ll_async_page *llap, *dummy_llap = seq->private; @@ -494,32 +497,27 @@ static int llite_dump_pgcache_seq_show(struct seq_file *seq, void *v) /* 2.4 doesn't seem to have SEQ_START_TOKEN, so we implement * it in our own state */ if (dummy_llap->llap_magic == 0) { - seq_printf(seq, "generation | llap cookie origin | page "); - seq_printf(seq, "inode index count [ page flags ]\n"); + seq_printf(seq, "gener | llap cookie origin wq du | page " + "inode index count [ page flags ]\n"); return 0; } spin_lock(&sbi->ll_lock); - llap = llite_pglist_next_llap(sbi, &dummy_llap->llap_proc_item); + llap = llite_pglist_next_llap(sbi, &dummy_llap->llap_pglist_item); if (llap != NULL) { int has_flags = 0; struct page *page = llap->llap_page; - static char *origins[] = { - [LLAP_ORIGIN_UNKNOWN] = "--", - [LLAP_ORIGIN_READPAGE] = "rp", - [LLAP_ORIGIN_READAHEAD] = "ra", - [LLAP_ORIGIN_COMMIT_WRITE] = "cw", - [LLAP_ORIGIN_WRITEPAGE] = "wp", - }; LASSERTF(llap->llap_origin < LLAP__ORIGIN_MAX, "%u\n", 
llap->llap_origin); - seq_printf(seq, "%lu | %p %p %s | %p %p %lu %u [", + seq_printf(seq, "%5lu | %p %p %s %s %s | %p %p %lu %u [", sbi->ll_pglist_gen, llap, llap->llap_cookie, - origins[llap->llap_origin], + llap_origins[llap->llap_origin], + llap->llap_write_queued ? "wq" : "- ", + llap->llap_defer_uptodate ? "du" : "- ", page, page->mapping->host, page->index, page_count(page)); seq_page_flag(seq, page, locked, has_flags); @@ -539,16 +537,6 @@ static int llite_dump_pgcache_seq_show(struct seq_file *seq, void *v) return 0; } -static void *llite_dump_pgcache_seq_start(struct seq_file *seq, loff_t *pos) -{ - struct ll_async_page *llap = seq->private; - - if (llap->llap_magic == 2) - return NULL; - - return (void *)1; -} - static void *llite_dump_pgcache_seq_next(struct seq_file *seq, void *v, loff_t *pos) { @@ -565,11 +553,11 @@ static void *llite_dump_pgcache_seq_next(struct seq_file *seq, void *v, * we advance to a position beyond it, returning null if there * isn't another llap in the list beyond that new position. 
*/ spin_lock(&sbi->ll_lock); - llap = llite_pglist_next_llap(sbi, &dummy_llap->llap_proc_item); - list_del_init(&dummy_llap->llap_proc_item); + llap = llite_pglist_next_llap(sbi, &dummy_llap->llap_pglist_item); + list_del_init(&dummy_llap->llap_pglist_item); if (llap) { - list_add(&dummy_llap->llap_proc_item, &llap->llap_proc_item); - llap = llite_pglist_next_llap(sbi, &dummy_llap->llap_proc_item); + list_add(&dummy_llap->llap_pglist_item,&llap->llap_pglist_item); + llap =llite_pglist_next_llap(sbi,&dummy_llap->llap_pglist_item); } spin_unlock(&sbi->ll_lock); @@ -606,28 +594,28 @@ struct seq_operations llite_dump_pgcache_seq_sops = { static int llite_dump_pgcache_seq_open(struct inode *inode, struct file *file) { struct proc_dir_entry *dp = PDE(inode); - struct ll_async_page *llap; + struct ll_async_page *dummy_llap; struct seq_file *seq; struct ll_sb_info *sbi = dp->data; int rc; - OBD_ALLOC_GFP(llap, sizeof(*llap), GFP_KERNEL); - if (llap == NULL) + OBD_ALLOC_GFP(dummy_llap, sizeof(*dummy_llap), GFP_KERNEL); + if (dummy_llap == NULL) return -ENOMEM; - llap->llap_page = NULL; - llap->llap_cookie = sbi; - llap->llap_magic = 0; + dummy_llap->llap_page = NULL; + dummy_llap->llap_cookie = sbi; + dummy_llap->llap_magic = 0; rc = seq_open(file, &llite_dump_pgcache_seq_sops); if (rc) { - OBD_FREE(llap, sizeof(*llap)); + OBD_FREE(dummy_llap, sizeof(*dummy_llap)); return rc; } seq = file->private_data; - seq->private = llap; + seq->private = dummy_llap; spin_lock(&sbi->ll_lock); - list_add(&llap->llap_proc_item, &sbi->ll_pglist); + list_add(&dummy_llap->llap_pglist_item, &sbi->ll_pglist); spin_unlock(&sbi->ll_lock); return 0; @@ -637,14 +625,14 @@ static int llite_dump_pgcache_seq_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; - struct ll_async_page *llap = seq->private; - struct ll_sb_info *sbi = llap->llap_cookie; + struct ll_async_page *dummy_llap = seq->private; + struct ll_sb_info *sbi = dummy_llap->llap_cookie; 
spin_lock(&sbi->ll_lock); - if (!list_empty(&llap->llap_proc_item)) - list_del_init(&llap->llap_proc_item); + if (!list_empty(&dummy_llap->llap_pglist_item)) + list_del_init(&dummy_llap->llap_pglist_item); spin_unlock(&sbi->ll_lock); - OBD_FREE(llap, sizeof(*llap)); + OBD_FREE(dummy_llap, sizeof(*dummy_llap)); return seq_release(inode, file); } diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c index 9798c3e..369ff51 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -54,6 +54,9 @@ pos = n, n = pos->prev ) #endif +kmem_cache_t *ll_async_page_slab = NULL; +size_t ll_async_page_slab_size = 0; + /* SYNCHRONOUS I/O to object storage for an inode */ static int ll_brw(int cmd, struct inode *inode, struct obdo *oa, struct page *page, int flags) @@ -359,7 +362,103 @@ struct ll_async_page *llap_cast_private(struct page *page) return llap; } -/* XXX have the exp be an argument? */ +/* Try to shrink the page cache for the @sbi filesystem by 1/@shrink_fraction. + * + * There is an llap attached onto every page in lustre, linked off @sbi. + * We add an llap to the list so we don't lose our place during list walking. + * If llaps in the list are being moved they will only move to the end + * of the LRU, and we aren't terribly interested in those pages here (we + * start at the beginning of the list where the least-used llaps are. + */ +int llap_shrink_cache(struct ll_sb_info *sbi, int shrink_fraction) +{ + struct ll_async_page *llap, dummy_llap = { .llap_magic = 0xd11ad11a }; + unsigned long total, want, count = 0; + + total = sbi->ll_async_page_count; + + /* There can be a large number of llaps (600k or more in a large + * memory machine) so the VM 1/6 shrink ratio is likely too much. + * Since we are freeing pages also, we don't necessarily want to + * shrink so much. Limit to 40MB of pages + llaps per call. 
*/ + if (shrink_fraction == 0) + want = sbi->ll_async_page_count - sbi->ll_async_page_max + 32; + else + want = (total + shrink_fraction - 1) / shrink_fraction; + + if (want > 40 << (20 - PAGE_CACHE_SHIFT)) + want = 40 << (20 - PAGE_CACHE_SHIFT); + + CDEBUG(D_CACHE, "shrinking %lu of %lu pages (1/%d)\n", + want, total, shrink_fraction); + + spin_lock(&sbi->ll_lock); + list_add(&dummy_llap.llap_pglist_item, &sbi->ll_pglist); + + while (--total >= 0 && count < want) { + struct page *page; + + if (unlikely(need_resched())) { + spin_unlock(&sbi->ll_lock); + cond_resched(); + spin_lock(&sbi->ll_lock); + } + + llap = llite_pglist_next_llap(sbi,&dummy_llap.llap_pglist_item); + list_del_init(&dummy_llap.llap_pglist_item); + if (llap == NULL) + break; + + page = llap->llap_page; + LASSERT(page != NULL); + + list_add(&dummy_llap.llap_pglist_item, &llap->llap_pglist_item); + + /* Page needs/undergoing IO */ + if (TryLockPage(page)) { + LL_CDEBUG_PAGE(D_PAGE, page, "can't lock\n"); + continue; + } + + /* If page is dirty or undergoing IO don't discard it */ + if (llap->llap_write_queued || PageDirty(page) || + (!PageUptodate(page) && + llap->llap_origin != LLAP_ORIGIN_READAHEAD)) { + unlock_page(page); + LL_CDEBUG_PAGE(D_PAGE, page, "can't drop from cache: " + "%s%s%s%s origin %s\n", + llap->llap_write_queued ? "wq " : "", + PageDirty(page) ? "pd " : "", + PageUptodate(page) ? "" : "!pu", + llap->llap_defer_uptodate ? 
"" : "!du", + llap_origins[llap->llap_origin]); + continue; + } + + page_cache_get(page); + spin_unlock(&sbi->ll_lock); + + ++count; + LL_CDEBUG_PAGE(D_PAGE, page, "drop from cache %lu/%lu\n", + count, want); + if (page->mapping != NULL) { + ll_ra_accounting(page, page->mapping); + ll_truncate_complete_page(page); + } + unlock_page(page); + page_cache_release(page); + + spin_lock(&sbi->ll_lock); + } + list_del(&dummy_llap.llap_pglist_item); + spin_unlock(&sbi->ll_lock); + + CDEBUG(D_CACHE, "shrank %lu/%lu and left %lu unscanned\n", + count, want, total); + + return count; +} + struct ll_async_page *llap_from_page(struct page *page, unsigned origin) { struct ll_async_page *llap; @@ -369,25 +468,40 @@ struct ll_async_page *llap_from_page(struct page *page, unsigned origin) int rc; ENTRY; + LASSERT(ll_async_page_slab); LASSERTF(origin < LLAP__ORIGIN_MAX, "%u\n", origin); llap = llap_cast_private(page); - if (llap != NULL) + if (llap != NULL) { + /* move to end of LRU list */ + spin_lock(&sbi->ll_lock); + sbi->ll_pglist_gen++; + list_del_init(&llap->llap_pglist_item); + list_add_tail(&llap->llap_pglist_item, &sbi->ll_pglist); + spin_unlock(&sbi->ll_lock); GOTO(out, llap); + } exp = ll_i2obdexp(page->mapping->host); if (exp == NULL) RETURN(ERR_PTR(-EINVAL)); - OBD_ALLOC(llap, sizeof(*llap)); + /* limit the number of lustre-cached pages */ + if (sbi->ll_async_page_count >= sbi->ll_async_page_max) + llap_shrink_cache(sbi, 0); + + OBD_SLAB_ALLOC(llap, ll_async_page_slab, SLAB_KERNEL, + ll_async_page_slab_size); if (llap == NULL) RETURN(ERR_PTR(-ENOMEM)); llap->llap_magic = LLAP_MAGIC; + llap->llap_cookie = (void *)llap + size_round(sizeof(*llap)); rc = obd_prep_async_page(exp, ll_i2info(inode)->lli_smd, NULL, page, (obd_off)page->index << PAGE_SHIFT, &ll_async_page_ops, llap, &llap->llap_cookie); if (rc) { - OBD_FREE(llap, sizeof(*llap)); + OBD_SLAB_FREE(llap, ll_async_page_slab, + ll_async_page_slab_size); RETURN(ERR_PTR(rc)); } @@ -399,7 +513,8 @@ struct ll_async_page 
*llap_from_page(struct page *page, unsigned origin) spin_lock(&sbi->ll_lock); sbi->ll_pglist_gen++; - list_add_tail(&llap->llap_proc_item, &sbi->ll_pglist); + sbi->ll_async_page_count++; + list_add_tail(&llap->llap_pglist_item, &sbi->ll_pglist); spin_unlock(&sbi->ll_lock); out: @@ -594,8 +709,12 @@ void ll_ap_completion(void *data, int cmd, struct obdo *oa, int rc) } ClearPageError(page); } else { - if (cmd == OBD_BRW_READ) + if (cmd == OBD_BRW_READ) { llap->llap_defer_uptodate = 0; + } else { + SetPageDirty(page); + ClearPageLaunder(page); + } SetPageError(page); } @@ -663,11 +782,12 @@ void ll_removepage(struct page *page) __clear_page_ll_data(page); spin_lock(&sbi->ll_lock); - if (!list_empty(&llap->llap_proc_item)) - list_del_init(&llap->llap_proc_item); + if (!list_empty(&llap->llap_pglist_item)) + list_del_init(&llap->llap_pglist_item); sbi->ll_pglist_gen++; + sbi->ll_async_page_count--; spin_unlock(&sbi->ll_lock); - OBD_FREE(llap, sizeof(*llap)); + OBD_SLAB_FREE(llap, ll_async_page_slab, ll_async_page_slab_size); EXIT; } @@ -991,7 +1111,7 @@ int ll_readpage(struct file *filp, struct page *page) if (IS_ERR(llap)) GOTO(out, rc = PTR_ERR(llap)); - if (ll_i2sbi(inode)->ll_flags & LL_SBI_READAHEAD) + if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages) ras_update(ll_i2sbi(inode), &fd->fd_ras, page->index, llap->llap_defer_uptodate); @@ -1028,7 +1148,7 @@ int ll_readpage(struct file *filp, struct page *page) GOTO(out, rc); LL_CDEBUG_PAGE(D_PAGE, page, "queued readpage\n"); - if (ll_i2sbi(inode)->ll_flags & LL_SBI_READAHEAD) + if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages) ll_readahead(&fd->fd_ras, exp, page->mapping, oig, fd->fd_flags); diff --git a/lustre/llite/rw24.c b/lustre/llite/rw24.c index 736caf3..ec30746 100644 --- a/lustre/llite/rw24.c +++ b/lustre/llite/rw24.c @@ -91,6 +91,8 @@ out: if (rc) { if (!lli->lli_async_rc) lli->lli_async_rc = rc; + SetPageDirty(page); + ClearPageLaunder(page); unlock_page(page); } RETURN(rc); diff --git a/lustre/llite/super.c 
b/lustre/llite/super.c index 0b3ef4f..2625872 100644 --- a/lustre/llite/super.c +++ b/lustre/llite/super.c @@ -117,6 +117,9 @@ static void __exit exit_lustre_lite(void) LASSERTF(kmem_cache_destroy(ll_file_data_slab) == 0, "couldn't destroy ll_file_data slab\n"); + if (ll_async_page_slab) + LASSERTF(kmem_cache_destroy(ll_async_page_slab) == 0, + "couldn't destroy ll_async_page slab\n"); if (proc_lustre_fs_root) { lprocfs_remove(proc_lustre_fs_root); diff --git a/lustre/llite/super25.c b/lustre/llite/super25.c index 8df4dea..bc34d5f 100644 --- a/lustre/llite/super25.c +++ b/lustre/llite/super25.c @@ -160,6 +160,10 @@ static void __exit exit_lustre_lite(void) ll_destroy_inodecache(); LASSERTF(kmem_cache_destroy(ll_file_data_slab) == 0, "couldn't destroy ll_file_data slab\n"); + if (ll_async_page_slab) + LASSERTF(kmem_cache_destroy(ll_async_page_slab) == 0, + "couldn't destroy ll_async_page slab\n"); + if (proc_lustre_fs_root) { lprocfs_remove(proc_lustre_fs_root); proc_lustre_fs_root = NULL; diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index 8afa23d..f8707f8 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -1024,13 +1024,15 @@ int lov_prep_async_page(struct obd_export *exp, struct lov_stripe_md *lsm, int rc; ENTRY; + if (!page) + return size_round(sizeof(*lap)) + + obd_prep_async_page(lov->tgts[0].ltd_exp, NULL, NULL, + NULL, 0, NULL, NULL, NULL); + ASSERT_LSM_MAGIC(lsm); LASSERT(loi == NULL); - OBD_ALLOC(lap, sizeof(*lap)); - if (lap == NULL) - RETURN(-ENOMEM); - + lap = *res; lap->lap_magic = LAP_MAGIC; lap->lap_caller_ops = ops; lap->lap_caller_data = data; @@ -1043,17 +1045,16 @@ int lov_prep_async_page(struct obd_export *exp, struct lov_stripe_md *lsm, /* so the callback doesn't need the lsm */ lap->lap_loi_id = loi->loi_id; + lap->lap_sub_cookie = (void *)lap + size_round(sizeof(*lap)); + rc = obd_prep_async_page(lov->tgts[loi->loi_ost_idx].ltd_exp, lsm, loi, page, lap->lap_sub_offset, &lov_async_page_ops, lap, &lap->lap_sub_cookie); - 
if (rc) { - OBD_FREE(lap, sizeof(*lap)); + if (rc) RETURN(rc); - } CDEBUG(D_CACHE, "lap %p page %p cookie %p off "LPU64"\n", lap, page, lap->lap_sub_cookie, offset); - *res = lap; RETURN(0); } @@ -1172,7 +1173,6 @@ static int lov_teardown_async_page(struct obd_export *exp, lap->lap_sub_cookie, rc); RETURN(rc); } - OBD_FREE(lap, sizeof(*lap)); RETURN(rc); } diff --git a/lustre/lvfs/fsfilt_ext3.c b/lustre/lvfs/fsfilt_ext3.c index 279c040..39aace8 100644 --- a/lustre/lvfs/fsfilt_ext3.c +++ b/lustre/lvfs/fsfilt_ext3.c @@ -1027,7 +1027,7 @@ int fsfilt_ext3_map_bm_inode_pages(struct inode *inode, struct page **page, inode->i_ino, *b, *cr, create, rc); break; } - + b += blocks_per_page; cr += blocks_per_page; } @@ -1053,7 +1053,7 @@ int fsfilt_ext3_map_inode_pages(struct inode *inode, struct page **page, created, create); if (optional_sem != NULL) up(optional_sem); - + return rc; } diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index 807773d..ee65ca4 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -1187,10 +1187,6 @@ int mds_handle(struct ptlrpc_request *req) rc = mds_readpage(req); if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_MDS_SENDPAGE)) { - if (req->rq_reply_state) { - lustre_free_reply_state (req->rq_reply_state); - req->rq_reply_state = NULL; - } RETURN(0); } @@ -1433,7 +1429,6 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) spin_lock_init(&mds->mds_transno_lock); mds->mds_max_mdsize = sizeof(struct lov_mds_md); mds->mds_max_cookiesize = sizeof(struct llog_cookie); - atomic_set(&mds->mds_open_count, 0); sprintf(ns_name, "mds-%s", obd->obd_uuid.uuid); obd->obd_namespace = ldlm_namespace_new(ns_name, LDLM_NAMESPACE_SERVER); @@ -1904,7 +1899,7 @@ static int mdt_setup(struct obd_device *obd, obd_count len, void *buf) ptlrpc_init_svc(MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE, MDS_REQUEST_PORTAL, MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_TIMEOUT, - mds_handle, "mds", obd->obd_proc_entry); + mds_handle, "mds", obd->obd_proc_entry, NULL); 
if (!mds->mds_service) { CERROR("failed to start service\n"); @@ -1921,7 +1916,7 @@ static int mdt_setup(struct obd_device *obd, obd_count len, void *buf) MDS_SETATTR_PORTAL, MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_TIMEOUT, mds_handle, "mds_setattr", - obd->obd_proc_entry); + obd->obd_proc_entry, NULL); if (!mds->mds_setattr_service) { CERROR("failed to start getattr service\n"); GOTO(err_thread, rc = -ENOMEM); @@ -1937,7 +1932,7 @@ static int mdt_setup(struct obd_device *obd, obd_count len, void *buf) MDS_READPAGE_PORTAL, MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_TIMEOUT, mds_handle, "mds_readpage", - obd->obd_proc_entry); + obd->obd_proc_entry, NULL); if (!mds->mds_readpage_service) { CERROR("failed to start readpage service\n"); GOTO(err_thread2, rc = -ENOMEM); diff --git a/lustre/mds/lproc_mds.c b/lustre/mds/lproc_mds.c index 9c9bd12..589297a 100644 --- a/lustre/mds/lproc_mds.c +++ b/lustre/mds/lproc_mds.c @@ -49,65 +49,19 @@ static int lprocfs_mds_rd_mntdev(char *page, char **start, off_t off, int count, return snprintf(page, count, "%s\n",obd->u.mds.mds_vfsmnt->mnt_devname); } -static int lprocfs_mds_rd_filesopen(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - struct obd_device *obd = data; - LASSERT(obd != NULL); - *eof = 1; - - return snprintf(page, count, "%d\n", - atomic_read(&obd->u.mds.mds_open_count)); -} - -static int lprocfs_mds_wr_evict_client(struct file *file, const char *buffer, - unsigned long count, void *data) -{ - struct obd_device *obd = data; - struct obd_export *doomed_exp = NULL; - struct obd_uuid doomed; - struct list_head *p; - char tmpbuf[sizeof(doomed)]; - - sscanf(buffer, "%40s", tmpbuf); - obd_str2uuid(&doomed, tmpbuf); - - spin_lock(&obd->obd_dev_lock); - list_for_each(p, &obd->obd_exports) { - doomed_exp = list_entry(p, struct obd_export, exp_obd_chain); - if (obd_uuid_equals(&doomed, &doomed_exp->exp_client_uuid)) { - class_export_get(doomed_exp); - break; - } - doomed_exp = NULL; - } - 
spin_unlock(&obd->obd_dev_lock); - - if (doomed_exp == NULL) { - CERROR("can't disconnect %s: no export found\n", - doomed.uuid); - } else { - CERROR("evicting %s at adminstrative request\n", - doomed.uuid); - ptlrpc_fail_export(doomed_exp); - class_export_put(doomed_exp); - } - return count; -} - struct lprocfs_vars lprocfs_mds_obd_vars[] = { { "uuid", lprocfs_rd_uuid, 0, 0 }, { "blocksize", lprocfs_rd_blksize, 0, 0 }, { "kbytestotal", lprocfs_rd_kbytestotal, 0, 0 }, { "kbytesfree", lprocfs_rd_kbytesfree, 0, 0 }, { "kbytesavail", lprocfs_rd_kbytesavail, 0, 0 }, - { "fstype", lprocfs_rd_fstype, 0, 0 }, { "filestotal", lprocfs_rd_filestotal, 0, 0 }, { "filesfree", lprocfs_rd_filesfree, 0, 0 }, - { "filesopen", lprocfs_mds_rd_filesopen, 0, 0 }, + { "fstype", lprocfs_rd_fstype, 0, 0 }, { "mntdev", lprocfs_mds_rd_mntdev, 0, 0 }, { "recovery_status", lprocfs_obd_rd_recovery_status, 0, 0 }, - { "evict_client", 0, lprocfs_mds_wr_evict_client, 0 }, + { "num_exports", lprocfs_rd_num_exports, 0, 0 }, + { "evict_client", 0, lprocfs_wr_evict_client, 0 }, { "num_exports", lprocfs_rd_num_exports, 0, 0 }, { 0 } }; diff --git a/lustre/mds/mds_open.c b/lustre/mds/mds_open.c index 08191ff..dcd45d3 100644 --- a/lustre/mds/mds_open.c +++ b/lustre/mds/mds_open.c @@ -1000,6 +1000,11 @@ int mds_open(struct mds_update_record *rec, int offset, } } + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OPEN_CREATE)) { + obd_fail_loc = OBD_FAIL_LDLM_REPLY | OBD_FAIL_ONCE; + GOTO(cleanup, rc = -EAGAIN); + } + /* Step 5: mds_open it */ rc = mds_finish_open(req, dchild, body, rec->ur_flags, &handle, rec, rep); @@ -1032,8 +1037,6 @@ int mds_open(struct mds_update_record *rec, int offset, else ptlrpc_save_lock (req, &parent_lockh, parent_mode); } - if (rc == 0) - atomic_inc(&mds->mds_open_count); RETURN(rc); } @@ -1207,7 +1210,6 @@ out: mds_mfd_destroy(mfd); cleanup: - atomic_dec(&mds->mds_open_count); if (req != NULL && reply_body != NULL) { rc = mds_finish_transno(mds, pending_dir, handle, req, rc, 0); } else if 
(handle) { @@ -1276,7 +1278,7 @@ int mds_close(struct ptlrpc_request *req) mds_pack_inode2fid(&body->fid1, inode); mds_pack_inode2body(body, inode); - mds_pack_md(obd, req->rq_repmsg, 1, body, inode, MDS_PACK_MD_LOCK); + mds_pack_md(obd, req->rq_repmsg, 1,body,inode,MDS_PACK_MD_LOCK); } spin_lock(&med->med_open_lock); list_del(&mfd->mfd_list); @@ -1293,7 +1295,7 @@ int mds_close(struct ptlrpc_request *req) RETURN(-ENOMEM); } - RETURN(0); + RETURN(rc); } int mds_done_writing(struct ptlrpc_request *req) diff --git a/lustre/mds/mds_reint.c b/lustre/mds/mds_reint.c index 4730c58c..b58c07a 100644 --- a/lustre/mds/mds_reint.c +++ b/lustre/mds/mds_reint.c @@ -264,7 +264,7 @@ int mds_fix_attr(struct inode *inode, struct mds_update_record *rec) int mode = attr->ia_mode; /* chmod */ if (attr->ia_mode == (mode_t) -1) - attr->ia_mode = inode->i_mode; + mode = inode->i_mode; attr->ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); } diff --git a/lustre/mgmt/mgmt_svc.c b/lustre/mgmt/mgmt_svc.c index fdeee93..db21485 100644 --- a/lustre/mgmt/mgmt_svc.c +++ b/lustre/mgmt/mgmt_svc.c @@ -91,7 +91,7 @@ static int mgmt_setup(struct obd_device *obd, obd_count len, void *buf) mgmt_service = ptlrpc_init_svc(MGMT_NBUFS, MGMT_BUFSIZE, MGMT_MAXREQSIZE, MGMT_REQUEST_PORTAL, MGMT_REPLY_PORTAL, 30000, - mgmt_handler, "mgmt", obd->obd_proc_entry); + mgmt_handler, "mgmt", obd->obd_proc_entry, NULL); if (!mgmt_service) { CERROR("Failed to start mgmt service\n"); RETURN(-ENOMEM); diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index ddeba4d..419d4c03 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -2135,6 +2135,12 @@ static int filter_destroy(struct obd_export *exp, struct obdo *oa, if (dchild->d_inode == NULL) { CDEBUG(D_INODE, "destroying non-existent object "LPU64"\n", oa->o_id); + /* If object already gone, cancel cookie right now */ + if (oa->o_valid & OBD_MD_FLCOOKIE) { + fcc = obdo_logcookie(oa); + llog_cancel(llog_get_context(obd, 
fcc->lgc_subsys + 1), + NULL, 1, fcc, 0); + } GOTO(cleanup, rc = -ENOENT); } diff --git a/lustre/obdfilter/filter_io.c b/lustre/obdfilter/filter_io.c index 599e9dc..c3574de 100644 --- a/lustre/obdfilter/filter_io.c +++ b/lustre/obdfilter/filter_io.c @@ -304,17 +304,15 @@ static int filter_preprw_read(int cmd, struct obd_export *exp, struct obdo *oa, GOTO(cleanup, rc); dentry = filter_oa2dentry(obd, oa); - if (IS_ERR(dentry)) - GOTO(cleanup, rc = PTR_ERR(dentry)); - - if (dentry->d_inode == NULL) { - CERROR("trying to BRW to non-existent file "LPU64"\n", - obj->ioo_id); - GOTO(cleanup, rc = -ENOENT); + if (IS_ERR(dentry)) { + rc = PTR_ERR(dentry); + dentry = NULL; + GOTO(cleanup, rc); } + inode = dentry->d_inode; - obdo_to_inode(dentry->d_inode, oa, OBD_MD_FLATIME); + obdo_to_inode(inode, oa, OBD_MD_FLATIME); fsfilt_check_slow(now, obd_timeout, "preprw_read setup"); @@ -370,8 +368,6 @@ static int filter_preprw_read(int cmd, struct obd_export *exp, struct obdo *oa, if (dentry != NULL) f_dput(dentry); - else - CERROR("NULL dentry in cleanup -- tell CFS\n"); } if (iobuf != NULL) diff --git a/lustre/obdfilter/filter_log.c b/lustre/obdfilter/filter_log.c index 5f7fc14..d2dd8b04b 100644 --- a/lustre/obdfilter/filter_log.c +++ b/lustre/obdfilter/filter_log.c @@ -109,7 +109,7 @@ void filter_cancel_cookies_cb(struct obd_device *obd, __u64 transno, NULL, 1, cookie, 0); if (rc) CERROR("error cancelling log cookies: rc = %d\n", rc); - OBD_FREE(cb_data, sizeof(struct llog_cookie)); + OBD_FREE(cookie, sizeof(*cookie)); } /* Callback for processing the unlink log record received from MDS by diff --git a/lustre/obdfilter/filter_san.c b/lustre/obdfilter/filter_san.c index f6d8d06..64ddf68 100644 --- a/lustre/obdfilter/filter_san.c +++ b/lustre/obdfilter/filter_san.c @@ -80,12 +80,6 @@ int filter_san_preprw(int cmd, struct obd_export *exp, struct obdo *oa, GOTO(out, rc = PTR_ERR(dentry)); inode = dentry->d_inode; - if (!inode) { - CERROR("trying to BRW to non-existent file 
"LPU64"\n", - o->ioo_id); - f_dput(dentry); - GOTO(out, rc = -ENOENT); - } fs_bmap = inode->i_mapping->a_ops->bmap; for (j = 0; j < o->ioo_bufcnt; j++, rnb++) { diff --git a/lustre/obdfilter/lproc_obdfilter.c b/lustre/obdfilter/lproc_obdfilter.c index c2d7286..3617e96 100644 --- a/lustre/obdfilter/lproc_obdfilter.c +++ b/lustre/obdfilter/lproc_obdfilter.c @@ -136,11 +136,12 @@ static struct lprocfs_vars lprocfs_obd_vars[] = { { "tot_dirty", lprocfs_filter_rd_tot_dirty, 0, 0 }, { "tot_pending", lprocfs_filter_rd_tot_pending, 0, 0 }, { "tot_granted", lprocfs_filter_rd_tot_granted, 0, 0 }, + { "recovery_status", lprocfs_obd_rd_recovery_status, 0, 0 }, + { "evict_client", 0, lprocfs_wr_evict_client, 0 }, { "num_exports", lprocfs_rd_num_exports, 0, 0 }, { "readcache_max_filesize", lprocfs_filter_rd_readcache, lprocfs_filter_wr_readcache, 0 }, - { "recovery_status", lprocfs_obd_rd_recovery_status, 0, 0 }, { 0 } }; diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index ff1dca9..adfa52d 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -747,6 +747,7 @@ static int osc_brw_prep_request(int cmd, struct obd_import *imp,struct obdo *oa, size[1] = sizeof(*ioobj); size[2] = niocount * sizeof(*niobuf); + OBD_FAIL_RETURN(OBD_FAIL_OSC_BRW_PREP_REQ, -ENOMEM); req = ptlrpc_prep_req(imp, opc, 3, size, NULL); if (req == NULL) return (-ENOMEM); @@ -1196,9 +1197,9 @@ static void osc_process_ar(struct osc_async_rc *ar, struct ptlrpc_request *req, ar->ar_min_xid = ptlrpc_sample_next_xid(); return; - } - - if (ar->ar_force_sync && (ptlrpc_req_xid(req) >= ar->ar_min_xid)) + } + + if (ar->ar_force_sync && req && (ptlrpc_req_xid(req) >= ar->ar_min_xid)) ar->ar_force_sync = 0; } @@ -1211,13 +1212,12 @@ static void osc_ap_completion(struct client_obd *cli, struct obdo *oa, oap->oap_async_flags = 0; oap->oap_interrupted = 0; - if (oap->oap_request != NULL) { - if (sent && oap->oap_cmd == OBD_BRW_WRITE) { - osc_process_ar(&cli->cl_ar, oap->oap_request, rc); - 
osc_process_ar(&oap->oap_loi->loi_ar, - oap->oap_request, rc); - } + if (oap->oap_cmd == OBD_BRW_WRITE) { + osc_process_ar(&cli->cl_ar, oap->oap_request, rc); + osc_process_ar(&oap->oap_loi->loi_ar, oap->oap_request, rc); + } + if (oap->oap_request != NULL) { ptlrpc_req_finished(oap->oap_request); oap->oap_request = NULL; } @@ -1485,6 +1485,7 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi, oap->oap_count); continue; } + osc_ap_completion(cli, NULL, oap, 0, PTR_ERR(request)); /* put the page back in the loi/lop lists */ list_add_tail(&oap->oap_pending_item, @@ -1831,10 +1832,10 @@ int osc_prep_async_page(struct obd_export *exp, struct lov_stripe_md *lsm, struct osc_async_page *oap; ENTRY; - OBD_ALLOC(oap, sizeof(*oap)); - if (oap == NULL) - return -ENOMEM; + if (!page) + return size_round(sizeof(*oap)); + oap = *res; oap->oap_magic = OAP_MAGIC; oap->oap_cli = &exp->exp_obd->u.cli; oap->oap_loi = loi; @@ -1852,7 +1853,6 @@ int osc_prep_async_page(struct obd_export *exp, struct lov_stripe_md *lsm, oap->oap_occ.occ_interrupted = osc_occ_interrupted; CDEBUG(D_CACHE, "oap %p page %p obj off "LPU64"\n", oap, page, offset); - *res = oap; RETURN(0); } @@ -2116,8 +2116,6 @@ static int osc_teardown_async_page(struct obd_export *exp, LOI_DEBUG(loi, "oap %p page %p torn down\n", oap, oap->oap_page); out: spin_unlock(&cli->cl_loi_list_lock); - if (rc == 0) - OBD_FREE(oap, sizeof(*oap)); RETURN(rc); } diff --git a/lustre/ost/autoMakefile.am b/lustre/ost/autoMakefile.am index 37e7583..72ef6bb 100644 --- a/lustre/ost/autoMakefile.am +++ b/lustre/ost/autoMakefile.am @@ -8,4 +8,4 @@ modulefs_DATA = ost$(KMODEXT) endif MOSTLYCLEANFILES = *.o *.ko *.mod.c -DIST_SOURCES = $(ost-objs:%.o=%.c) +DIST_SOURCES = $(ost-objs:%.o=%.c) ost_internal.h diff --git a/lustre/ost/lproc_ost.c b/lustre/ost/lproc_ost.c index 936706d..4ceec0d 100644 --- a/lustre/ost/lproc_ost.c +++ b/lustre/ost/lproc_ost.c @@ -23,6 +23,8 @@ #include #include +#include +#include 
"ost_internal.h" #ifndef LPROCFS static struct lprocfs_vars lprocfs_obd_vars[] = { {0} }; @@ -38,5 +40,39 @@ static struct lprocfs_vars lprocfs_module_vars[] = { { 0 } }; +void +ost_print_req(void *seq_file, struct ptlrpc_request *req) +{ + /* Called holding srv_lock with irqs disabled. + * Print specific req contents and a newline. + * CAVEAT EMPTOR: check request message length before printing!!! + * You might have received any old crap so you must be just as + * careful here as the service's request parser!!! */ + struct seq_file *sf = seq_file; + + switch (req->rq_phase) { + case RQ_PHASE_NEW: + /* still awaiting a service thread's attention, or rejected + * because the generic request message didn't unpack */ + seq_printf(sf, "\n"); + break; + + case RQ_PHASE_INTERPRET: + /* being handled, so basic msg swabbed, and opc is valid + * but racing with ost_handle() */ + seq_printf(sf, "opc %d\n", req->rq_reqmsg->opc); + break; + + case RQ_PHASE_COMPLETE: + /* been handled by ost_handle() reply state possibly still + * volatile */ + seq_printf(sf, "opc %d\n", req->rq_reqmsg->opc); + break; + + default: + LBUG(); + } +} + #endif /* LPROCFS */ LPROCFS_INIT_VARS(ost, lprocfs_module_vars, lprocfs_obd_vars) diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index badf82d..ebc5d64 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -45,6 +45,7 @@ #include #include #include +#include "ost_internal.h" void oti_init(struct obd_trans_info *oti, struct ptlrpc_request *req) { @@ -1136,7 +1137,8 @@ static int ost_setup(struct obd_device *obd, obd_count len, void *buf) ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE, OST_REQUEST_PORTAL, OSC_REPLY_PORTAL, 30000, ost_handle, "ost", - obd->obd_proc_entry); + obd->obd_proc_entry, + ost_print_req); if (ost->ost_service == NULL) { CERROR("failed to start service\n"); GOTO(out_lprocfs, rc = -ENOMEM); @@ -1151,7 +1153,8 @@ static int ost_setup(struct obd_device *obd, obd_count len, void *buf) 
ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE, OST_CREATE_PORTAL, OSC_REPLY_PORTAL, 30000, ost_handle, "ost_create", - obd->obd_proc_entry); + obd->obd_proc_entry, + ost_print_req); if (ost->ost_create_service == NULL) { CERROR("failed to start OST create service\n"); GOTO(out_service, rc = -ENOMEM); diff --git a/lustre/portals/knals/lonal/lonal_cb.c b/lustre/portals/knals/lonal/lonal_cb.c index 6de9ea4..cf5df0d 100644 --- a/lustre/portals/knals/lonal/lonal_cb.c +++ b/lustre/portals/knals/lonal/lonal_cb.c @@ -50,7 +50,7 @@ klonal_send (lib_nal_t *nal, .klod_niov = payload_niov, .klod_offset = payload_offset, .klod_nob = payload_nob, - .klod_iov.iov = payload_iov}; + .klod_iov = { .iov = payload_iov } }; ptl_err_t rc; LASSERT(nid == klonal_lib.libnal_ni.ni_pid.nid); @@ -80,7 +80,7 @@ klonal_send_pages (lib_nal_t *nal, .klod_niov = payload_niov, .klod_offset = payload_offset, .klod_nob = payload_nob, - .klod_iov.kiov = payload_kiov}; + .klod_iov = { .kiov = payload_kiov } }; ptl_err_t rc; LASSERT(nid == klonal_lib.libnal_ni.ni_pid.nid); diff --git a/lustre/ptlbd/server.c b/lustre/ptlbd/server.c index 0f06912..5798d4c 100644 --- a/lustre/ptlbd/server.c +++ b/lustre/ptlbd/server.c @@ -56,7 +56,7 @@ static int ptlbd_sv_setup(struct obd_device *obd, obd_count len, void *buf) ptlrpc_init_svc(PTLBD_NBUFS, PTLBD_BUFSIZE, PTLBD_MAXREQSIZE, PTLBD_REQUEST_PORTAL, PTLBD_REPLY_PORTAL, 30000, ptlbd_handle, "ptlbd_sv", - obd->obd_proc_entry); + obd->obd_proc_entry, NULL); if (ptlbd->ptlbd_service == NULL) GOTO(out_filp, rc = -ENOMEM); diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index dd3489d..5c269d0 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -216,9 +216,13 @@ void request_in_callback(ptl_event_t *ev) req->rq_peer.peer_ni = rqbd->rqbd_srv_ni->sni_ni; ptlrpc_id2str(&req->rq_peer, req->rq_peerstr); req->rq_rqbd = rqbd; + req->rq_phase = RQ_PHASE_NEW; spin_lock_irqsave (&service->srv_lock, flags); + req->rq_history_seq = 
service->srv_request_seq++; + list_add_tail(&req->rq_history_list, &service->srv_request_history); + if (ev->unlinked) { srv_ni->sni_nrqbd_receiving--; if (ev->type != PTL_EVENT_UNLINK && @@ -265,9 +269,10 @@ void reply_out_callback(ptl_event_t *ev) ev->type == PTL_EVENT_UNLINK); if (!rs->rs_difficult) { - /* I'm totally responsible for freeing "easy" replies */ + /* 'Easy' replies have no further processing so I drop the + * net's ref on 'rs' */ LASSERT (ev->unlinked); - lustre_free_reply_state (rs); + ptlrpc_rs_decref(rs); atomic_dec (&svc->srv_outstanding_replies); EXIT; return; @@ -276,7 +281,8 @@ void reply_out_callback(ptl_event_t *ev) LASSERT (rs->rs_on_net); if (ev->unlinked) { - /* Last network callback */ + /* Last network callback. The net's ref on 'rs' stays put + * until ptlrpc_server_handle_reply() is done with it */ spin_lock_irqsave (&svc->srv_lock, flags); rs->rs_on_net = 0; ptlrpc_schedule_difficult_reply (rs); diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 0f389db..d10f12f 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -476,8 +476,10 @@ static int signal_completed_replay(struct obd_import *imp) atomic_inc(&imp->imp_replay_inflight); req = ptlrpc_prep_req(imp, OBD_PING, 0, NULL, NULL); - if (!req) + if (!req) { + atomic_dec(&imp->imp_replay_inflight); RETURN(-ENOMEM); + } req->rq_replen = lustre_msg_size(0, NULL); req->rq_send_state = LUSTRE_IMP_REPLAY_WAIT; diff --git a/lustre/ptlrpc/lproc_ptlrpc.c b/lustre/ptlrpc/lproc_ptlrpc.c index 68a17bc..8c30c70 100644 --- a/lustre/ptlrpc/lproc_ptlrpc.c +++ b/lustre/ptlrpc/lproc_ptlrpc.c @@ -26,6 +26,8 @@ #include #include #include +#include +#include #include "ptlrpc_internal.h" @@ -155,12 +157,257 @@ void ptlrpc_lprocfs_register(struct proc_dir_entry *root, char *dir, } } +static int +ptlrpc_lprocfs_read_req_history_len(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct ptlrpc_service *svc = data; + + *eof = 1; + return 
snprintf(page, count, "%d\n", svc->srv_n_history_rqbds); +} + +static int +ptlrpc_lprocfs_read_req_history_max(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct ptlrpc_service *svc = data; + + *eof = 1; + return snprintf(page, count, "%d\n", svc->srv_max_history_rqbds); +} + +static int +ptlrpc_lprocfs_write_req_history_max(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct ptlrpc_service *svc = data; + int bufpages; + unsigned long flags; + int val; + int rc = lprocfs_write_helper(buffer, count, &val); + + if (rc < 0) + return rc; + + if (val < 0) + return -ERANGE; + + /* This sanity check is more of an insanity check; we can still + * hose a kernel by allowing the request history to grow too + * far. */ + bufpages = (svc->srv_buf_size + PAGE_SIZE - 1)/PAGE_SIZE; + if (val > num_physpages/(2*bufpages)) + return -ERANGE; + + spin_lock_irqsave(&svc->srv_lock, flags); + svc->srv_max_history_rqbds = val; + spin_unlock_irqrestore(&svc->srv_lock, flags); + + return count; +} + +struct ptlrpc_srh_iterator { + __u64 srhi_seq; + struct ptlrpc_request *srhi_req; +}; + +int +ptlrpc_lprocfs_svc_req_history_seek(struct ptlrpc_service *svc, + struct ptlrpc_srh_iterator *srhi, + __u64 seq) +{ + struct list_head *e; + struct ptlrpc_request *req; + + if (srhi->srhi_req != NULL && + srhi->srhi_seq > svc->srv_request_max_cull_seq && + srhi->srhi_seq <= seq) { + /* If srhi_req was set previously, hasn't been culled and + * we're searching for a seq on or after it (i.e. more + * recent), search from it onwards. + * Since the service history is LRU (i.e. 
culled reqs will + * be near the head), we shouldn't have to do long + * re-scans */ + LASSERT (srhi->srhi_seq == srhi->srhi_req->rq_history_seq); + LASSERT (!list_empty(&svc->srv_request_history)); + e = &srhi->srhi_req->rq_history_list; + } else { + /* search from start */ + e = svc->srv_request_history.next; + } + + while (e != &svc->srv_request_history) { + req = list_entry(e, struct ptlrpc_request, rq_history_list); + + if (req->rq_history_seq >= seq) { + srhi->srhi_seq = req->rq_history_seq; + srhi->srhi_req = req; + return 0; + } + e = e->next; + } + + return -ENOENT; +} + +static void * +ptlrpc_lprocfs_svc_req_history_start(struct seq_file *s, loff_t *pos) +{ + struct ptlrpc_service *svc = s->private; + struct ptlrpc_srh_iterator *srhi; + unsigned long flags; + int rc; + + OBD_ALLOC(srhi, sizeof(*srhi)); + if (srhi == NULL) + return NULL; + + srhi->srhi_seq = 0; + srhi->srhi_req = NULL; + + spin_lock_irqsave(&svc->srv_lock, flags); + rc = ptlrpc_lprocfs_svc_req_history_seek(svc, srhi, *pos); + spin_unlock_irqrestore(&svc->srv_lock, flags); + + if (rc == 0) { + *pos = srhi->srhi_seq; + return srhi; + } + + OBD_FREE(srhi, sizeof(*srhi)); + return NULL; +} + +static void +ptlrpc_lprocfs_svc_req_history_stop(struct seq_file *s, void *iter) +{ + struct ptlrpc_srh_iterator *srhi = iter; + + if (srhi != NULL) + OBD_FREE(srhi, sizeof(*srhi)); +} + +static void * +ptlrpc_lprocfs_svc_req_history_next(struct seq_file *s, + void *iter, loff_t *pos) +{ + struct ptlrpc_service *svc = s->private; + struct ptlrpc_srh_iterator *srhi = iter; + unsigned long flags; + int rc; + + spin_lock_irqsave(&svc->srv_lock, flags); + rc = ptlrpc_lprocfs_svc_req_history_seek(svc, srhi, *pos + 1); + spin_unlock_irqrestore(&svc->srv_lock, flags); + + if (rc != 0) { + OBD_FREE(srhi, sizeof(*srhi)); + return NULL; + } + + *pos = srhi->srhi_seq; + return srhi; +} + +static int ptlrpc_lprocfs_svc_req_history_show(struct seq_file *s, void *iter) +{ + struct ptlrpc_service *svc = s->private; + 
struct ptlrpc_srh_iterator *srhi = iter; + struct ptlrpc_request *req; + unsigned long flags; + int rc; + + spin_lock_irqsave(&svc->srv_lock, flags); + + rc = ptlrpc_lprocfs_svc_req_history_seek(svc, srhi, srhi->srhi_seq); + + if (rc == 0) { + req = srhi->srhi_req; + + /* Print common req fields. + * CAVEAT EMPTOR: we're racing with the service handler + * here. The request could contain any old crap, so you + * must be just as careful as the service's request + * parser. Currently I only print stuff here I know is OK + * to look at coz it was set up in request_in_callback()!!! */ + seq_printf(s, LPD64":%s:%s:"LPD64":%d:%s ", + req->rq_history_seq, + req->rq_peer.peer_ni->pni_name, req->rq_peerstr, + req->rq_xid, req->rq_reqlen,ptlrpc_rqphase2str(req)); + + if (svc->srv_request_history_print_fn == NULL) + seq_printf(s, "\n"); + else + svc->srv_request_history_print_fn(s, srhi->srhi_req); + } + + spin_unlock_irqrestore(&svc->srv_lock, flags); + + return rc; +} + +static int +ptlrpc_lprocfs_svc_req_history_open(struct inode *inode, struct file *file) +{ + static struct seq_operations sops = { + .start = ptlrpc_lprocfs_svc_req_history_start, + .stop = ptlrpc_lprocfs_svc_req_history_stop, + .next = ptlrpc_lprocfs_svc_req_history_next, + .show = ptlrpc_lprocfs_svc_req_history_show, + }; + struct proc_dir_entry *dp = PDE(inode); + struct seq_file *seqf; + int rc; + + rc = seq_open(file, &sops); + + if (rc == 0) { + seqf = file->private_data; + seqf->private = dp->data; + } + + return rc; +} + void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry, struct ptlrpc_service *svc) { + struct lprocfs_vars lproc_vars[] = { + {.name = "req_buffer_history_len", + .write_fptr = NULL, + .read_fptr = ptlrpc_lprocfs_read_req_history_len, + .data = svc}, + {.name = "req_buffer_history_max", + .write_fptr = ptlrpc_lprocfs_write_req_history_max, + .read_fptr = ptlrpc_lprocfs_read_req_history_max, + .data = svc}, + {NULL} + }; + static struct file_operations req_history_fops = 
{ + .owner = THIS_MODULE, + .open = ptlrpc_lprocfs_svc_req_history_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + }; + struct proc_dir_entry *req_history; + ptlrpc_lprocfs_register(entry, svc->srv_name, "stats", &svc->srv_procroot, &svc->srv_stats); + + if (svc->srv_procroot == NULL) + return; + + lprocfs_add_vars(svc->srv_procroot, lproc_vars, NULL); + + req_history = create_proc_entry("req_history", 0400, + svc->srv_procroot); + if (req_history != NULL) { + req_history->data = svc; + req_history->proc_fops = &req_history_fops; + } } void ptlrpc_lprocfs_register_obd(struct obd_device *obddev) @@ -204,4 +451,38 @@ void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd) obd->obd_svc_stats = NULL; } } + +int lprocfs_wr_evict_client(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = data; + struct obd_export *doomed_exp = NULL; + struct obd_uuid doomed; + struct list_head *p; + char tmpbuf[sizeof(doomed)]; + + sscanf(buffer, "%40s", tmpbuf); + obd_str2uuid(&doomed, tmpbuf); + + spin_lock(&obd->obd_dev_lock); + list_for_each(p, &obd->obd_exports) { + doomed_exp = list_entry(p, struct obd_export, exp_obd_chain); + if (obd_uuid_equals(&doomed, &doomed_exp->exp_client_uuid)) { + class_export_get(doomed_exp); + break; + } + doomed_exp = NULL; + } + spin_unlock(&obd->obd_dev_lock); + + if (doomed_exp == NULL) { + CERROR("can't disconnect %s: no export found\n", doomed.uuid); + } else { + CERROR("evicting %s at adminstrative request\n", doomed.uuid); + ptlrpc_fail_export(doomed_exp); + class_export_put(doomed_exp); + } + return count; +} +EXPORT_SYMBOL(lprocfs_wr_evict_client); #endif /* LPROCFS */ diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index df2110d..637e546 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -337,6 +337,7 @@ int ptlrpc_send_reply (struct ptlrpc_request *req, int may_be_difficult) conn = 
ptlrpc_connection_addref(req->rq_export->exp_connection); atomic_inc (&svc->srv_outstanding_replies); + ptlrpc_rs_addref(rs); /* +1 ref for the network */ rc = ptl_send_buf (&rs->rs_md_h, req->rq_repmsg, req->rq_replen, rs->rs_difficult ? PTL_ACK_REQ : PTL_NOACK_REQ, @@ -344,14 +345,7 @@ int ptlrpc_send_reply (struct ptlrpc_request *req, int may_be_difficult) svc->srv_rep_portal, req->rq_xid); if (rc != 0) { atomic_dec (&svc->srv_outstanding_replies); - - if (!rs->rs_difficult) { - /* Callers other than target_send_reply() expect me - * to clean up on a comms error */ - lustre_free_reply_state (rs); - req->rq_reply_state = NULL; - req->rq_repmsg = NULL; - } + ptlrpc_rs_decref(rs); } ptlrpc_put_connection(conn); return rc; @@ -406,7 +400,7 @@ int ptl_send_rpc(struct ptlrpc_request *request) request->rq_reqmsg->handle = request->rq_import->imp_remote_handle; request->rq_reqmsg->type = PTL_RPC_MSG_REQUEST; request->rq_reqmsg->conn_cnt = request->rq_import->imp_conn_cnt; - + LASSERT (request->rq_replen != 0); if (request->rq_repmsg == NULL) OBD_ALLOC(request->rq_repmsg, request->rq_replen); diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c index f369733..bbfb9bd 100644 --- a/lustre/ptlrpc/pack_generic.c +++ b/lustre/ptlrpc/pack_generic.c @@ -121,6 +121,7 @@ int lustre_pack_reply (struct ptlrpc_request *req, if (rs == NULL) RETURN (-ENOMEM); + atomic_set(&rs->rs_refcount, 1); /* 1 ref for rq_reply_state */ rs->rs_cb_id.cbid_fn = reply_out_callback; rs->rs_cb_id.cbid_arg = rs; rs->rs_srv_ni = req->rq_rqbd->rqbd_srv_ni; @@ -142,6 +143,7 @@ void lustre_free_reply_state (struct ptlrpc_reply_state *rs) { PTLRPC_RS_DEBUG_LRU_DEL(rs); + LASSERT (atomic_read(&rs->rs_refcount) == 0); LASSERT (!rs->rs_difficult || rs->rs_handled); LASSERT (!rs->rs_on_net); LASSERT (!rs->rs_scheduled); diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index 8abf7dc..dbb7aa75 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -37,18 +37,6 @@ 
static int ptlrpc_server_post_idle_rqbds (struct ptlrpc_service *svc); static LIST_HEAD (ptlrpc_all_services); static spinlock_t ptlrpc_all_services_lock = SPIN_LOCK_UNLOCKED; -static void -ptlrpc_free_server_req (struct ptlrpc_request *req) -{ - /* The last request to be received into a request buffer uses space - * in the request buffer descriptor, otherwise requests are - * allocated dynamically in the incoming reply event handler */ - if (req == &req->rq_rqbd->rqbd_req) - return; - - OBD_FREE(req, sizeof(*req)); -} - static char * ptlrpc_alloc_request_buffer (int size) { @@ -86,6 +74,7 @@ ptlrpc_alloc_rqbd (struct ptlrpc_srv_ni *srv_ni) rqbd->rqbd_refcount = 0; rqbd->rqbd_cbid.cbid_fn = request_in_callback; rqbd->rqbd_cbid.cbid_arg = rqbd; + INIT_LIST_HEAD(&rqbd->rqbd_reqs); rqbd->rqbd_buffer = ptlrpc_alloc_request_buffer(svc->srv_buf_size); if (rqbd->rqbd_buffer == NULL) { @@ -109,6 +98,7 @@ ptlrpc_free_rqbd (struct ptlrpc_request_buffer_desc *rqbd) unsigned long flags; LASSERT (rqbd->rqbd_refcount == 0); + LASSERT (list_empty(&rqbd->rqbd_reqs)); spin_lock_irqsave(&svc->srv_lock, flags); list_del(&rqbd->rqbd_list); @@ -280,7 +270,8 @@ struct ptlrpc_service * ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size, int req_portal, int rep_portal, int watchdog_timeout, svc_handler_t handler, char *name, - struct proc_dir_entry *proc_entry) + struct proc_dir_entry *proc_entry, + svcreq_printfn_t svcreq_printfn) { int i; int rc; @@ -311,9 +302,14 @@ ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size, service->srv_req_portal = req_portal; service->srv_watchdog_timeout = watchdog_timeout; service->srv_handler = handler; + service->srv_request_history_print_fn = svcreq_printfn; + service->srv_request_seq = 1; /* valid seq #s start at 1 */ + service->srv_request_max_cull_seq = 0; INIT_LIST_HEAD(&service->srv_request_queue); INIT_LIST_HEAD(&service->srv_idle_rqbds); + INIT_LIST_HEAD(&service->srv_history_rqbds); + INIT_LIST_HEAD(&service->srv_request_history); 
INIT_LIST_HEAD(&service->srv_reply_queue); /* First initialise enough for early teardown */ @@ -357,23 +353,83 @@ failed: } static void -ptlrpc_server_free_request(struct ptlrpc_service *svc, struct ptlrpc_request *req) +ptlrpc_server_free_request(struct ptlrpc_request *req) { - unsigned long flags; - int refcount; - + struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd; + struct ptlrpc_srv_ni *srv_ni = rqbd->rqbd_srv_ni; + struct ptlrpc_service *svc = srv_ni->sni_service; + unsigned long flags; + int refcount; + struct list_head *tmp; + struct list_head *nxt; + spin_lock_irqsave(&svc->srv_lock, flags); + svc->srv_n_active_reqs--; - refcount = --(req->rq_rqbd->rqbd_refcount); + list_add (&req->rq_list, &rqbd->rqbd_reqs); + + refcount = --(rqbd->rqbd_refcount); if (refcount == 0) { - /* request buffer is now idle */ - list_del(&req->rq_rqbd->rqbd_list); - list_add_tail(&req->rq_rqbd->rqbd_list, - &svc->srv_idle_rqbds); + /* request buffer is now idle: add to history */ + list_del(&rqbd->rqbd_list); + list_add_tail(&rqbd->rqbd_list, &svc->srv_history_rqbds); + svc->srv_n_history_rqbds++; + + /* cull some history? 
+ * I expect only about 1 or 2 rqbds need to be recycled here */ + while (svc->srv_n_history_rqbds > svc->srv_max_history_rqbds) { + rqbd = list_entry(svc->srv_history_rqbds.next, + struct ptlrpc_request_buffer_desc, + rqbd_list); + + list_del(&rqbd->rqbd_list); + svc->srv_n_history_rqbds--; + + /* remove rqbd's reqs from svc's req history while + * I've got the service lock */ + list_for_each(tmp, &rqbd->rqbd_reqs) { + req = list_entry(tmp, struct ptlrpc_request, + rq_list); + /* Track the highest culled req seq */ + if (req->rq_history_seq > + svc->srv_request_max_cull_seq) + svc->srv_request_max_cull_seq = + req->rq_history_seq; + list_del(&req->rq_history_list); + } + + spin_unlock_irqrestore(&svc->srv_lock, flags); + + list_for_each_safe(tmp, nxt, &rqbd->rqbd_reqs) { + req = list_entry(rqbd->rqbd_reqs.next, + struct ptlrpc_request, + rq_list); + + list_del(&req->rq_list); + + if (req->rq_reply_state != NULL) { + ptlrpc_rs_decref(req->rq_reply_state); + req->rq_reply_state = NULL; + } + + if (req != &rqbd->rqbd_req) { + /* NB request buffers use an embedded + * req if the incoming req unlinked the + * MD; this isn't one of them! */ + OBD_FREE(req, sizeof(*req)); + } + } + + spin_lock_irqsave(&svc->srv_lock, flags); + + /* schedule request buffer for re-use. + * NB I can only do this after I've disposed of their + * reqs; particularly the embedded req */ + list_add_tail(&rqbd->rqbd_list, &svc->srv_idle_rqbds); + } } + spin_unlock_irqrestore(&svc->srv_lock, flags); - - ptlrpc_free_server_req(req); } static int @@ -464,6 +520,8 @@ ptlrpc_server_handle_request (struct ptlrpc_service *svc) request->rq_export->exp_last_request_time = CURRENT_SECONDS; } + request->rq_phase = RQ_PHASE_INTERPRET; + CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:ni:nid:opc " "%s:%s+%d:%d:"LPU64":%s:%s:%d\n", current->comm, (request->rq_export ? 
@@ -477,6 +535,8 @@ ptlrpc_server_handle_request (struct ptlrpc_service *svc) rc = svc->srv_handler(request); + request->rq_phase = RQ_PHASE_COMPLETE; + CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:ni:nid:opc " "%s:%s+%d:%d:"LPU64":%s:%s:%d\n", current->comm, (request->rq_export ? @@ -513,7 +573,7 @@ put_conn: } } - ptlrpc_server_free_request(svc, request); + ptlrpc_server_free_request(request); RETURN(1); } @@ -605,7 +665,7 @@ ptlrpc_server_handle_reply (struct ptlrpc_service *svc) class_export_put (exp); rs->rs_export = NULL; - lustre_free_reply_state (rs); + ptlrpc_rs_decref (rs); atomic_dec (&svc->srv_outstanding_replies); RETURN(1); } @@ -678,6 +738,12 @@ ptlrpc_check_rqbd_pools(struct ptlrpc_service *svc) avail += sni->sni_nrqbd_receiving; /* NB I'm not locking; just looking. */ + + /* CAVEAT EMPTOR: We might be allocating buffers here + * because we've allowed the request history to grow out of + * control. We could put a sanity check on that here and + * cull some history if we need the space. 
*/ + if (sni->sni_nrqbd_receiving <= low_water) ptlrpc_grow_req_bufs(sni); } @@ -918,6 +984,10 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service) ptlrpc_lprocfs_unregister_service(service); + /* All history will be culled when the next request buffer is + * freed */ + service->srv_max_history_rqbds = 0; + for (i = 0; i < ptlrpc_ninterfaces; i++) { srv_ni = &service->srv_interfaces[i]; CDEBUG(D_NET, "%s: tearing down interface %s\n", @@ -982,10 +1052,11 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service) service->srv_n_queued_reqs--; service->srv_n_active_reqs++; - ptlrpc_server_free_request(service, req); + ptlrpc_server_free_request(req); } LASSERT(service->srv_n_queued_reqs == 0); LASSERT(service->srv_n_active_reqs == 0); + LASSERT(service->srv_n_history_rqbds == 0); for (i = 0; i < ptlrpc_ninterfaces; i++) { srv_ni = &service->srv_interfaces[i]; diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 157bca1..837f328 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -209,15 +209,16 @@ test_15() { } run_test 15 "failed open (-ENOMEM)" +READ_AHEAD=`cat /proc/fs/lustre/llite/*/max_read_ahead_mb | head -n 1` stop_read_ahead() { - for f in /proc/fs/lustre/llite/*/read_ahead; do + for f in /proc/fs/lustre/llite/*/max_read_ahead_mb; do echo 0 > $f done } start_read_ahead() { - for f in /proc/fs/lustre/llite/*/read_ahead; do - echo 1 > $f + for f in /proc/fs/lustre/llite/*/max_read_ahead_mb; do + echo $READ_AHEAD > $f done } diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 9d40aea..9f25515 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -978,5 +978,18 @@ test_52() { } run_test 52 "time out lock replay (3764)" +#b3761 ASSERTION(hash != 0) failed +test_53() { +# OBD_FAIL_MDS_OPEN_CREATE | OBD_FAIL_ONCE + do_facet mds "sysctl -w lustre.fail_loc=0x8000012b" + touch $DIR/$tfile & + # give touch a chance to run + sleep 5 + 
do_facet mds "sysctl -w lustre.fail_loc=0x0" + rm $DIR/$tfile + return 0 +} +run_test 53 "let MDS_CHECK_RESENT return the original return code instead of 0" + equals_msg test complete, cleaning up $CLEANUP diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 4d45bc1..e2cda44 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -673,7 +673,8 @@ run_test 24n "Statting the old file after renameing (Posix rename 2)" test_24o() { check_kernel_version 37 || return 0 - rename_many -s random -v -n 10 $DIR + mkdir -p $DIR/d24o + rename_many -s random -v -n 10 $DIR/d24o } run_test 24o "rename of files during htree split ===============" @@ -2102,7 +2103,22 @@ test_63() { done true } -run_test 63 "Verify oig_wait interruption does not crash ======" +run_test 63 "Verify oig_wait interruption does not crash =======" + +# bug 2248 - async write errors didn't return to application on sync +# bug 3677 - async write errors left page locked +test_63b() { + # ensure we have a grant to do async writes + dd if=/dev/zero of=/mnt/lustre/f63b bs=4k count=1 + rm /mnt/lustre/f63b + + #define OBD_FAIL_OSC_BRW_PREP_REQ 0x406 + sysctl -w lustre.fail_loc=0x80000406 + multiop /mnt/lustre/f63b Owy && error "sync didn't return ENOMEM" + grep -q locked /proc/fs/lustre/llite/fs*/dump_page_cache && \ + error "locked page left in cache after async error" || true +} +run_test 63b "async write errors should be returned to fsync ===" test_64a () { df $DIR diff --git a/lustre/utils/lconf b/lustre/utils/lconf index 9fe97fa..6b796ab 100755 --- a/lustre/utils/lconf +++ b/lustre/utils/lconf @@ -210,7 +210,10 @@ class DaemonHandler: if not self.path: panic(self.command, "not found.") ret, out = runcmd(self.path +' '+ self.command_line()) - if ret: + + # FIXME: add this check can only narrow the race but can not avoid it + # completely, so I don't apply this method on inserting module. 
+ if ret and not self.running(): raise CommandError(self.path, out, ret) def stop(self): @@ -244,6 +247,11 @@ class DaemonHandler: return pid except IOError: return 0 + except ValueError: + print "WARNING: invalid pid in %s, removed" % self.pidfile() + print "WARNING: You may need to stop acceptor by yourself and then unload the module libcfs" + os.unlink(self.pidfile()) + return 0 def clean_pidfile(self): """ Remove a stale pidfile """ @@ -1094,13 +1102,14 @@ class kmod: if not module: panic('module not found:', mod) (rc, out) = run('/sbin/insmod', module) - if rc: + if rc and not mod_loaded(mod): raise CommandError('insmod', out, rc) else: - (rc, out) = run('/sbin/modprobe', mod) - if rc: + (rc, out) = run('/sbin/modprobe', mod) + if rc and not mod_loaded(mod): raise CommandError('modprobe', out, rc) + def cleanup_module(self): """Unload the modules in the list in reverse order.""" rev = self.kmodule_list @@ -1634,6 +1643,12 @@ class MDSDEV(Module): "--record_device", self.name, "--node", client_name, config_options) + if ret: + lctl.clear_log(self.name, client_name) + print out + self.cleanup() + panic("Record client log %s on %s failed" %( + client_name, self.name)) if config.verbose: for s in out: log("record> ", string.strip(s)) ret, out = run (sys.argv[0], @@ -1643,6 +1658,17 @@ class MDSDEV(Module): "--record_device", self.name, "--node", client_name, config_options) + if ret: + # In this case 0-conf mount works but 0-conf umount does not, + # so the user would be forced to clean up the client service + # manually again and again. It is better to delete these two + # llogs together and let the user write_conf. + lctl.clear_log(self.name, client_name) + lctl.clear_log(self.name, client_name + '-clean') + print out + self.cleanup() + panic("Record client log %s on %s failed" %( + client_name + '-clean', self.name)) if config.verbose: for s in out: log("record> ", string.strip(s)) config.noexec = old_noexec -- 1.8.3.1