From e0576be940d7af30b2ba6e219891a4413670325d Mon Sep 17 00:00:00 2001
From: rread
Date: Sat, 14 Feb 2004 03:16:22 +0000
Subject: [PATCH] land b_eq on HEAD

---
 lnet/archdep.m4 | 1 +
 lnet/include/lnet/errno.h | 3 +-
 lnet/include/lnet/lib-lnet.h | 131 ++--
 lnet/include/lnet/lib-nal.h | 67 +-
 lnet/include/lnet/lib-p30.h | 131 ++--
 lnet/include/lnet/lib-types.h | 16 +-
 lnet/include/lnet/lnet.h | 1 -
 lnet/include/lnet/p30.h | 1 -
 lnet/include/lnet/types.h | 27 +-
 lnet/klnds/gmlnd/gmlnd.h | 4 +-
 lnet/klnds/gmlnd/gmlnd_cb.c | 13 -
 lnet/klnds/gmlnd/gmlnd_comm.c | 24 +-
 lnet/klnds/iblnd/ibnal_cb.c | 21 +-
 lnet/klnds/qswlnd/qswlnd_cb.c | 222 ++++--
 lnet/klnds/scimaclnd/scimacnal_cb.c | 34 +-
 lnet/klnds/socklnd/socklnd.c | 14 +-
 lnet/klnds/socklnd/socklnd_cb.c | 126 ++--
 lnet/libcfs/module.c | 2 +
 lnet/lnet/Makefile.am | 6 +-
 lnet/lnet/api-eq.c | 49 +-
 lnet/lnet/api-errno.c | 1 -
 lnet/lnet/api-ni.c | 2 +-
 lnet/lnet/api-wrap.c | 4 +-
 lnet/lnet/lib-init.c | 18 +-
 lnet/lnet/lib-md.c | 83 +-
 lnet/lnet/lib-move.c | 788 +++++++++----------
 lnet/lnet/lib-msg.c | 172 ++---
 lnet/ulnds/Makefile.am | 6 +-
 lnet/ulnds/bridge.h | 5 +
 lnet/ulnds/connection.c | 7 +-
 lnet/ulnds/connection.h | 4 +-
 lnet/ulnds/procapi.c | 56 +-
 lnet/ulnds/procbridge.h | 4 +
 lnet/ulnds/proclib.c | 24 +-
 lnet/ulnds/select.c | 9 -
 lnet/ulnds/socklnd/Makefile.am | 6 +-
 lnet/ulnds/socklnd/bridge.h | 5 +
 lnet/ulnds/socklnd/connection.c | 7 +-
 lnet/ulnds/socklnd/connection.h | 4 +-
 lnet/ulnds/socklnd/procapi.c | 56 +-
 lnet/ulnds/socklnd/procbridge.h | 4 +
 lnet/ulnds/socklnd/proclib.c | 24 +-
 lnet/ulnds/socklnd/select.c | 9 -
 lnet/ulnds/socklnd/tcplnd.c | 94 +--
 lnet/ulnds/tcplnd.c | 94 +--
 lnet/utils/Makefile.am | 12 +-
 lnet/utils/l_ioctl.c | 160 ++--
 lnet/utils/portals.c | 18 +
 lustre/ChangeLog | 12 +-
 lustre/Makefile.am | 2 +-
 lustre/configure.in | 2 +-
 lustre/include/liblustre.h | 21 +-
 lustre/include/linux/lustre_dlm.h | 3 -
 lustre/include/linux/lustre_export.h | 10 +-
 lustre/include/linux/lustre_idl.h | 1 +
 lustre/include/linux/lustre_net.h | 290 ++++---
 lustre/include/linux/obd.h | 5 +-
 .../kernel_patches/patches/2.6.0-test6-mm4.patch | 46 +-
 lustre/kernel_patches/patches/bproc-patch-2.4.20 | 6 +-
 .../patches/ext3-xattr-ptr-arith-fix.patch | 4 +-
 lustre/ldlm/Makefile.am | 1 +
 lustre/ldlm/ldlm_lib.c | 270 +++----
 lustre/ldlm/ldlm_lock.c | 23 +-
 lustre/ldlm/ldlm_lockd.c | 38 +-
 lustre/liblustre/Makefile.am | 69 +-
 lustre/liblustre/dir.c | 220 ++++++
 lustre/liblustre/file.c | 168 +----
 lustre/liblustre/genlib.sh | 47 +-
 lustre/liblustre/libtest.c | 246 ------
 lustre/liblustre/llite_lib.c | 41 +-
 lustre/liblustre/llite_lib.h | 172 +++--
 lustre/liblustre/namei.c | 206 ++---
 lustre/liblustre/rw.c | 347 ++++-----
 lustre/liblustre/super.c | 155 +---
 lustre/liblustre/tests/.cvsignore | 3 +
 lustre/liblustre/tests/Makefile.am | 45 ++
 lustre/liblustre/tests/echo_test.c | 368 +++++++++
 lustre/liblustre/{ => tests}/recovery_small.c | 2 +
 lustre/liblustre/tests/replay_ost_single.c | 338 +++++++++
 lustre/liblustre/{ => tests}/replay_single.c | 2 +
 lustre/liblustre/{lltest.c => tests/sanity.c} | 97 ++-
 lustre/liblustre/{ => tests}/test_common.c | 25 +-
 lustre/liblustre/{ => tests}/test_common.h | 3 +-
 lustre/liblustre/{ => tests}/test_lock_cancel.c | 0
 lustre/llite/llite_lib.c | 2 +-
 lustre/lov/Makefile.am | 3 +-
 lustre/lvfs/Makefile.am | 3 +-
 lustre/mdc/Makefile.am | 3 +-
 lustre/mdc/mdc_request.c | 17 +-
 lustre/mds/handler.c | 90 ++-
 lustre/mds/mds_internal.h | 3 +-
 lustre/mds/mds_log.c | 26 +-
 lustre/mds/mds_open.c | 17 +-
 lustre/mds/mds_reint.c | 90 ++-
 lustre/mds/mds_unlink_open.c | 133 ++--
 lustre/mgmt/mgmt_svc.c | 10 +-
 lustre/obdclass/Makefile.am | 13 +-
 lustre/obdclass/class_obd.c | 11 +-
 lustre/obdclass/genops.c | 2 +
 lustre/obdclass/llog_lvfs.c | 2 +-
 lustre/obdclass/lprocfs_status.c | 7 +
 lustre/obdclass/obd_config.c | 19 +-
 lustre/obdecho/Makefile.am | 3 +-
 lustre/obdecho/echo.c | 9 +-
 lustre/obdecho/echo_client.c | 124 +--
 lustre/osc/Makefile.am | 3 +-
 lustre/osc/osc_internal.h | 2 +-
 lustre/osc/osc_request.c | 62 +-
 lustre/ost/ost_handler.c | 107 +--
 lustre/portals/archdep.m4 | 1 +
 lustre/portals/include/portals/errno.h | 3 +-
 lustre/portals/include/portals/lib-nal.h | 67 +-
 lustre/portals/include/portals/lib-p30.h | 131 ++--
 lustre/portals/include/portals/lib-types.h | 16 +-
 lustre/portals/include/portals/p30.h | 1 -
 lustre/portals/include/portals/types.h | 27 +-
 lustre/portals/knals/gmnal/gmnal.h | 4 +-
 lustre/portals/knals/gmnal/gmnal_cb.c | 13 -
 lustre/portals/knals/gmnal/gmnal_comm.c | 24 +-
 lustre/portals/knals/ibnal/ibnal_cb.c | 21 +-
 lustre/portals/knals/qswnal/qswnal_cb.c | 222 ++++--
 lustre/portals/knals/scimacnal/scimacnal_cb.c | 34 +-
 lustre/portals/knals/socknal/socknal.c | 14 +-
 lustre/portals/knals/socknal/socknal_cb.c | 126 ++--
 lustre/portals/libcfs/module.c | 2 +
 lustre/portals/portals/Makefile.am | 6 +-
 lustre/portals/portals/api-eq.c | 49 +-
 lustre/portals/portals/api-errno.c | 1 -
 lustre/portals/portals/api-ni.c | 2 +-
 lustre/portals/portals/api-wrap.c | 4 +-
 lustre/portals/portals/lib-init.c | 18 +-
 lustre/portals/portals/lib-md.c | 83 +-
 lustre/portals/portals/lib-move.c | 788 +++++++++----------
 lustre/portals/portals/lib-msg.c | 172 ++---
 lustre/portals/unals/Makefile.am | 6 +-
 lustre/portals/unals/bridge.h | 5 +
 lustre/portals/unals/connection.c | 7 +-
 lustre/portals/unals/connection.h | 4 +-
 lustre/portals/unals/procapi.c | 56 +-
 lustre/portals/unals/procbridge.h | 4 +
 lustre/portals/unals/proclib.c | 24 +-
 lustre/portals/unals/select.c | 9 -
 lustre/portals/unals/tcpnal.c | 94 +--
 lustre/portals/utils/Makefile.am | 12 +-
 lustre/portals/utils/l_ioctl.c | 160 ++--
 lustre/portals/utils/portals.c | 18 +
 lustre/ptlbd/rpc.c | 45 +-
 lustre/ptlbd/server.c | 7 +-
 lustre/ptlrpc/Makefile.am | 25 +-
 lustre/ptlrpc/client.c | 256 +++----
 lustre/ptlrpc/events.c | 626 +++++++--------
 lustre/ptlrpc/import.c | 9 +-
 lustre/ptlrpc/llog_net.c | 3 +-
 lustre/ptlrpc/llog_server.c | 35 +
 lustre/ptlrpc/lproc_ptlrpc.c | 25 +-
 lustre/ptlrpc/niobuf.c | 716 +++++++-----------
 lustre/ptlrpc/pack_generic.c | 127 +++-
 lustre/ptlrpc/pinger.c | 209 +++++-
 lustre/ptlrpc/ptlrpc_internal.h | 6 +-
 lustre/ptlrpc/ptlrpc_module.c | 10 +-
 lustre/ptlrpc/ptlrpcd.c | 4 +-
 lustre/ptlrpc/recover.c | 13 +-
 lustre/ptlrpc/service.c | 835 ++++++++++++-------
 lustre/scripts/lustre.spec.in | 4 +-
 lustre/tests/conf-sanity.sh | 45 +-
 lustre/tests/replay-ost-single.sh | 2 +-
 lustre/utils/Makefile.am | 15 +-
 lustre/utils/lrun | 2 +-
 168 files changed, 6894 insertions(+), 4971 deletions(-)
 create mode 100644 lustre/liblustre/dir.c
 delete mode 100644 lustre/liblustre/libtest.c
 create mode 100644 lustre/liblustre/tests/.cvsignore
 create mode 100644 lustre/liblustre/tests/Makefile.am
 create mode 100644 lustre/liblustre/tests/echo_test.c
 rename lustre/liblustre/{ => tests}/recovery_small.c (99%)
 create mode 100644 lustre/liblustre/tests/replay_ost_single.c
 rename lustre/liblustre/{ => tests}/replay_single.c (99%)
 mode change 100755 => 100644
 rename lustre/liblustre/{lltest.c => tests/sanity.c} (81%)
 rename lustre/liblustre/{ => tests}/test_common.c (91%)
rename lustre/liblustre/{ => tests}/test_common.h (91%) rename lustre/liblustre/{ => tests}/test_lock_cancel.c (100%) diff --git a/lnet/archdep.m4 b/lnet/archdep.m4 index e955c33..c06bc8a 100644 --- a/lnet/archdep.m4 +++ b/lnet/archdep.m4 @@ -333,6 +333,7 @@ AC_SUBST(SCIMACNAL) CFLAGS="$KCFLAGS" CPPFLAGS="$KINCFLAGS $KCPPFLAGS $MFLAGS $enable_zerocopy $enable_affinity $with_quadrics $with_gm $with_scamac $with_ib" +AM_CONDITIONAL(LIBLUSTRE, test x$host_cpu = xlib) AC_SUBST(MOD_LINK) AC_SUBST(LINUX25) AM_CONDITIONAL(LIBLUSTRE, test x$host_cpu = xlib) diff --git a/lnet/include/lnet/errno.h b/lnet/include/lnet/errno.h index 817936a..08f084a 100644 --- a/lnet/include/lnet/errno.h +++ b/lnet/include/lnet/errno.h @@ -50,9 +50,8 @@ typedef enum { PTL_IOV_TOO_SMALL = 31, PTL_EQ_INUSE = 32, - PTL_MD_INUSE = 33, - PTL_MAX_ERRNO = 33 + PTL_MAX_ERRNO = 32 } ptl_err_t; /* If you change these, you must update the string table in api-errno.c */ diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index 3582b94..e9e4635 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -42,7 +41,7 @@ do { \ nal->cb_sti(nal, flagsp); \ } -#ifdef PTL_USE_DESC_LISTS +#ifdef PTL_USE_LIB_FREELIST #define MAX_MES 2048 #define MAX_MDS 2048 @@ -98,7 +97,7 @@ lib_eq_free (nal_cb_t *nal, lib_eq_t *eq) } static inline lib_md_t * -lib_md_alloc (nal_cb_t *nal) +lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd) { /* NEVER called with statelock held */ unsigned long flags; @@ -142,8 +141,20 @@ lib_me_free (nal_cb_t *nal, lib_me_t *me) static inline lib_msg_t * lib_msg_alloc (nal_cb_t *nal) { - /* ALWAYS called with statelock held */ - return ((lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs)); + /* NEVER called with statelock held */ + unsigned long flags; + lib_msg_t *msg; + + state_lock (nal, &flags); + msg = (lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs); + state_unlock (nal, &flags); + + if (msg != NULL) { + /* NULL pointers, clear flags etc */ + memset (msg, 0, sizeof (*msg)); + msg->ack_wmd = PTL_WIRE_HANDLE_NONE; + } + return(msg); } static inline void @@ -155,22 +166,13 @@ lib_msg_free (nal_cb_t *nal, lib_msg_t *msg) #else -extern atomic_t md_in_use_count; -extern atomic_t msg_in_use_count; -extern atomic_t me_in_use_count; -extern atomic_t eq_in_use_count; - static inline lib_eq_t * lib_eq_alloc (nal_cb_t *nal) { /* NEVER called with statelock held */ lib_eq_t *eq; - PORTAL_ALLOC(eq, sizeof(*eq)); - - if (eq == NULL) - return (NULL); - atomic_inc (&eq_in_use_count); + PORTAL_ALLOC(eq, sizeof(*eq)); return (eq); } @@ -178,21 +180,34 @@ static inline void lib_eq_free (nal_cb_t *nal, lib_eq_t *eq) { /* ALWAYS called with statelock held */ - atomic_dec (&eq_in_use_count); PORTAL_FREE(eq, sizeof(*eq)); } static inline lib_md_t * -lib_md_alloc (nal_cb_t *nal) +lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd) { /* NEVER called with statelock held */ lib_md_t *md; - PORTAL_ALLOC(md, sizeof(*md)); - - if (md == NULL) - return (NULL); - - atomic_inc (&md_in_use_count); + int size; + int niov; + + if ((umd->options & PTL_MD_KIOV) != 0) { + niov = umd->niov; + size = offsetof(lib_md_t, md_iov.kiov[niov]); + } else { + niov = ((umd->options & PTL_MD_IOV) != 0) ? 
+ umd->niov : 1; + size = offsetof(lib_md_t, md_iov.iov[niov]); + } + + PORTAL_ALLOC(md, size); + + if (md != NULL) { + /* Set here in case of early free */ + md->options = umd->options; + md->md_niov = niov; + } + return (md); } @@ -200,8 +215,14 @@ static inline void lib_md_free (nal_cb_t *nal, lib_md_t *md) { /* ALWAYS called with statelock held */ - atomic_dec (&md_in_use_count); - PORTAL_FREE(md, sizeof(*md)); + int size; + + if ((md->options & PTL_MD_KIOV) != 0) + size = offsetof(lib_md_t, md_iov.kiov[md->md_niov]); + else + size = offsetof(lib_md_t, md_iov.iov[md->md_niov]); + + PORTAL_FREE(md, size); } static inline lib_me_t * @@ -209,12 +230,8 @@ lib_me_alloc (nal_cb_t *nal) { /* NEVER called with statelock held */ lib_me_t *me; - PORTAL_ALLOC(me, sizeof(*me)); - - if (me == NULL) - return (NULL); - atomic_inc (&me_in_use_count); + PORTAL_ALLOC(me, sizeof(*me)); return (me); } @@ -222,21 +239,21 @@ static inline void lib_me_free(nal_cb_t *nal, lib_me_t *me) { /* ALWAYS called with statelock held */ - atomic_dec (&me_in_use_count); PORTAL_FREE(me, sizeof(*me)); } static inline lib_msg_t * lib_msg_alloc(nal_cb_t *nal) { - /* ALWAYS called with statelock held */ + /* NEVER called with statelock held */ lib_msg_t *msg; - PORTAL_ALLOC_ATOMIC(msg, sizeof(*msg)); - if (msg == NULL) - return (NULL); - - atomic_inc (&msg_in_use_count); + PORTAL_ALLOC(msg, sizeof(*msg)); + if (msg != NULL) { + /* NULL pointers, clear flags etc */ + memset (msg, 0, sizeof (*msg)); + msg->ack_wmd = PTL_WIRE_HANDLE_NONE; + } return (msg); } @@ -244,7 +261,6 @@ static inline void lib_msg_free(nal_cb_t *nal, lib_msg_t *msg) { /* ALWAYS called with statelock held */ - atomic_dec (&msg_in_use_count); PORTAL_FREE(msg, sizeof(*msg)); } #endif @@ -344,26 +360,41 @@ extern char *dispatch_name(int index); * Call backs will be made to write events, send acks or * replies and so on. 
*/ -extern int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private); -extern int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t * msg); +extern void lib_enq_event_locked (nal_cb_t *nal, void *private, + lib_eq_t *eq, ptl_event_t *ev); +extern void lib_finalize (nal_cb_t *nal, void *private, lib_msg_t *msg, + ptl_err_t status); +extern void lib_parse (nal_cb_t *nal, ptl_hdr_t *hdr, void *private); extern lib_msg_t *lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, lib_md_t *getmd); -extern void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr); +extern void print_hdr (nal_cb_t * nal, ptl_hdr_t * hdr); + extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov); -extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len); -extern void lib_copy_buf2iov (int niov, struct iovec *iov, char *dest, ptl_size_t len); +extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, + ptl_size_t offset, ptl_size_t len); +extern void lib_copy_buf2iov (int niov, struct iovec *iov, ptl_size_t offset, + char *src, ptl_size_t len); +extern int lib_extract_iov (int dst_niov, struct iovec *dst, + int src_niov, struct iovec *src, + ptl_size_t offset, ptl_size_t len); extern ptl_size_t lib_kiov_nob (int niov, ptl_kiov_t *iov); -extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *iov, ptl_size_t len); -extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *iov, char *src, ptl_size_t len); +extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, + ptl_size_t offset, ptl_size_t len); +extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset, + char *src, ptl_size_t len); +extern int lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, + int src_niov, ptl_kiov_t *src, + ptl_size_t offset, ptl_size_t len); + extern void lib_assert_wire_constants (void); -extern void lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, - ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen); -extern int lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - lib_md_t *md, ptl_size_t offset, ptl_size_t len); +extern ptl_err_t lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, + ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen); +extern ptl_err_t lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + lib_md_t *md, ptl_size_t offset, ptl_size_t len); extern void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md_in, ptl_md_t * md_out); diff --git a/lnet/include/lnet/lib-nal.h b/lnet/include/lnet/lib-nal.h index 4052c0c..0bf557e 100644 --- a/lnet/include/lnet/lib-nal.h +++ b/lnet/include/lnet/lib-nal.h @@ -18,47 +18,60 @@ struct nal_cb_t { lib_ni_t ni; void *nal_data; /* - * send: Sends a preformatted header and user data to a - * specified remote process. - * Can overwrite iov. + * send: Sends a preformatted header and payload data to a + * specified remote process. The payload is scattered over 'niov' + * fragments described by iov, starting at 'offset' for 'mlen' + * bytes. + * NB the NAL may NOT overwrite iov. 
+ * PTL_OK on success => NAL has committed to send and will call + * lib_finalize on completion */ - int (*cb_send) (nal_cb_t * nal, void *private, lib_msg_t * cookie, - ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int niov, struct iovec *iov, size_t mlen); + ptl_err_t (*cb_send) (nal_cb_t * nal, void *private, lib_msg_t * cookie, + ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int niov, struct iovec *iov, + size_t offset, size_t mlen); /* as send, but with a set of page fragments (NULL if not supported) */ - int (*cb_send_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie, - ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int niov, ptl_kiov_t *iov, size_t mlen); + ptl_err_t (*cb_send_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie, + ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int niov, ptl_kiov_t *iov, + size_t offset, size_t mlen); /* - * recv: Receives an incoming message from a remote process - * Type of iov depends on options. Can overwrite iov. + * recv: Receives an incoming message from a remote process. The + * payload is to be received into the scattered buffer of 'niov' + * fragments described by iov, starting at 'offset' for 'mlen' + * bytes. Payload bytes after 'mlen' up to 'rlen' are to be + * discarded. + * NB the NAL may NOT overwrite iov. + * PTL_OK on success => NAL has committed to receive and will call + * lib_finalize on completion */ - int (*cb_recv) (nal_cb_t * nal, void *private, lib_msg_t * cookie, - unsigned int niov, struct iovec *iov, size_t mlen, - size_t rlen); + ptl_err_t (*cb_recv) (nal_cb_t * nal, void *private, lib_msg_t * cookie, + unsigned int niov, struct iovec *iov, + size_t offset, size_t mlen, size_t rlen); /* as recv, but with a set of page fragments (NULL if not supported) */ - int (*cb_recv_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie, - unsigned int niov, ptl_kiov_t *iov, size_t mlen, - size_t rlen); + ptl_err_t (*cb_recv_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie, + unsigned int niov, ptl_kiov_t *iov, + size_t offset, size_t mlen, size_t rlen); /* * read: Reads a block of data from a specified user address */ - int (*cb_read) (nal_cb_t * nal, void *private, void *dst_addr, - user_ptr src_addr, size_t len); + ptl_err_t (*cb_read) (nal_cb_t * nal, void *private, void *dst_addr, + user_ptr src_addr, size_t len); /* * write: Writes a block of data into a specified user address */ - int (*cb_write) (nal_cb_t * nal, void *private, user_ptr dsr_addr, - void *src_addr, size_t len); + ptl_err_t (*cb_write) (nal_cb_t * nal, void *private, user_ptr dsr_addr, + void *src_addr, size_t len); /* * callback: Calls an event callback + * NULL => lib calls eq's callback (if any) directly. */ - int (*cb_callback) (nal_cb_t * nal, void *private, lib_eq_t *eq, - ptl_event_t *ev); + void (*cb_callback) (nal_cb_t * nal, void *private, lib_eq_t *eq, + ptl_event_t *ev); /* * malloc: Acquire a block of memory in a system independent @@ -74,14 +87,14 @@ struct nal_cb_t { * type of *iov depends on options. * Set to NULL if not required. 
*/ - int (*cb_map) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, - void **addrkey); + ptl_err_t (*cb_map) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, + void **addrkey); void (*cb_unmap) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, void **addrkey); /* as (un)map, but with a set of page fragments */ - int (*cb_map_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, - void **addrkey); + ptl_err_t (*cb_map_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, + void **addrkey); void (*cb_unmap_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, void **addrkey); diff --git a/lnet/include/lnet/lib-p30.h b/lnet/include/lnet/lib-p30.h index 3582b94..e9e4635 100644 --- a/lnet/include/lnet/lib-p30.h +++ b/lnet/include/lnet/lib-p30.h @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -42,7 +41,7 @@ do { \ nal->cb_sti(nal, flagsp); \ } -#ifdef PTL_USE_DESC_LISTS +#ifdef PTL_USE_LIB_FREELIST #define MAX_MES 2048 #define MAX_MDS 2048 @@ -98,7 +97,7 @@ lib_eq_free (nal_cb_t *nal, lib_eq_t *eq) } static inline lib_md_t * -lib_md_alloc (nal_cb_t *nal) +lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd) { /* NEVER called with statelock held */ unsigned long flags; @@ -142,8 +141,20 @@ lib_me_free (nal_cb_t *nal, lib_me_t *me) static inline lib_msg_t * lib_msg_alloc (nal_cb_t *nal) { - /* ALWAYS called with statelock held */ - return ((lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs)); + /* NEVER called with statelock held */ + unsigned long flags; + lib_msg_t *msg; + + state_lock (nal, &flags); + msg = (lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs); + state_unlock (nal, &flags); + + if (msg != NULL) { + /* NULL pointers, clear flags etc */ + memset (msg, 0, sizeof (*msg)); + msg->ack_wmd = PTL_WIRE_HANDLE_NONE; + } + return(msg); } static inline void @@ -155,22 +166,13 @@ lib_msg_free (nal_cb_t *nal, lib_msg_t *msg) #else -extern atomic_t md_in_use_count; -extern atomic_t msg_in_use_count; -extern atomic_t me_in_use_count; -extern atomic_t eq_in_use_count; - static inline lib_eq_t * lib_eq_alloc (nal_cb_t *nal) { /* NEVER called with statelock held */ lib_eq_t *eq; - PORTAL_ALLOC(eq, sizeof(*eq)); - - if (eq == NULL) - return (NULL); - atomic_inc (&eq_in_use_count); + PORTAL_ALLOC(eq, sizeof(*eq)); return (eq); } @@ -178,21 +180,34 @@ static inline void lib_eq_free (nal_cb_t *nal, lib_eq_t *eq) { /* ALWAYS called with statelock held */ - atomic_dec (&eq_in_use_count); PORTAL_FREE(eq, sizeof(*eq)); } static inline lib_md_t * -lib_md_alloc (nal_cb_t *nal) +lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd) { /* NEVER called with statelock held */ lib_md_t *md; - PORTAL_ALLOC(md, sizeof(*md)); - - if (md == NULL) - return (NULL); - - atomic_inc (&md_in_use_count); + int size; + int niov; + + if ((umd->options & PTL_MD_KIOV) != 0) { + niov = umd->niov; + size = offsetof(lib_md_t, md_iov.kiov[niov]); + } else { + niov = ((umd->options & PTL_MD_IOV) != 0) ? 
+ umd->niov : 1; + size = offsetof(lib_md_t, md_iov.iov[niov]); + } + + PORTAL_ALLOC(md, size); + + if (md != NULL) { + /* Set here in case of early free */ + md->options = umd->options; + md->md_niov = niov; + } + return (md); } @@ -200,8 +215,14 @@ static inline void lib_md_free (nal_cb_t *nal, lib_md_t *md) { /* ALWAYS called with statelock held */ - atomic_dec (&md_in_use_count); - PORTAL_FREE(md, sizeof(*md)); + int size; + + if ((md->options & PTL_MD_KIOV) != 0) + size = offsetof(lib_md_t, md_iov.kiov[md->md_niov]); + else + size = offsetof(lib_md_t, md_iov.iov[md->md_niov]); + + PORTAL_FREE(md, size); } static inline lib_me_t * @@ -209,12 +230,8 @@ lib_me_alloc (nal_cb_t *nal) { /* NEVER called with statelock held */ lib_me_t *me; - PORTAL_ALLOC(me, sizeof(*me)); - - if (me == NULL) - return (NULL); - atomic_inc (&me_in_use_count); + PORTAL_ALLOC(me, sizeof(*me)); return (me); } @@ -222,21 +239,21 @@ static inline void lib_me_free(nal_cb_t *nal, lib_me_t *me) { /* ALWAYS called with statelock held */ - atomic_dec (&me_in_use_count); PORTAL_FREE(me, sizeof(*me)); } static inline lib_msg_t * lib_msg_alloc(nal_cb_t *nal) { - /* ALWAYS called with statelock held */ + /* NEVER called with statelock held */ lib_msg_t *msg; - PORTAL_ALLOC_ATOMIC(msg, sizeof(*msg)); - if (msg == NULL) - return (NULL); - - atomic_inc (&msg_in_use_count); + PORTAL_ALLOC(msg, sizeof(*msg)); + if (msg != NULL) { + /* NULL pointers, clear flags etc */ + memset (msg, 0, sizeof (*msg)); + msg->ack_wmd = PTL_WIRE_HANDLE_NONE; + } return (msg); } @@ -244,7 +261,6 @@ static inline void lib_msg_free(nal_cb_t *nal, lib_msg_t *msg) { /* ALWAYS called with statelock held */ - atomic_dec (&msg_in_use_count); PORTAL_FREE(msg, sizeof(*msg)); } #endif @@ -344,26 +360,41 @@ extern char *dispatch_name(int index); * Call backs will be made to write events, send acks or * replies and so on. 
*/ -extern int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private); -extern int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t * msg); +extern void lib_enq_event_locked (nal_cb_t *nal, void *private, + lib_eq_t *eq, ptl_event_t *ev); +extern void lib_finalize (nal_cb_t *nal, void *private, lib_msg_t *msg, + ptl_err_t status); +extern void lib_parse (nal_cb_t *nal, ptl_hdr_t *hdr, void *private); extern lib_msg_t *lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, lib_md_t *getmd); -extern void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr); +extern void print_hdr (nal_cb_t * nal, ptl_hdr_t * hdr); + extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov); -extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len); -extern void lib_copy_buf2iov (int niov, struct iovec *iov, char *dest, ptl_size_t len); +extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, + ptl_size_t offset, ptl_size_t len); +extern void lib_copy_buf2iov (int niov, struct iovec *iov, ptl_size_t offset, + char *src, ptl_size_t len); +extern int lib_extract_iov (int dst_niov, struct iovec *dst, + int src_niov, struct iovec *src, + ptl_size_t offset, ptl_size_t len); extern ptl_size_t lib_kiov_nob (int niov, ptl_kiov_t *iov); -extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *iov, ptl_size_t len); -extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *iov, char *src, ptl_size_t len); +extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, + ptl_size_t offset, ptl_size_t len); +extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset, + char *src, ptl_size_t len); +extern int lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, + int src_niov, ptl_kiov_t *src, + ptl_size_t offset, ptl_size_t len); + extern void lib_assert_wire_constants (void); -extern void lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, - ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen); -extern int lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - lib_md_t *md, ptl_size_t offset, ptl_size_t len); +extern ptl_err_t lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, + ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen); +extern ptl_err_t lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + lib_md_t *md, ptl_size_t offset, ptl_size_t len); extern void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md_in, ptl_md_t * md_out); diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index d9e3c11..904204b 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -16,7 +16,7 @@ # include # include #else -# define PTL_USE_DESC_LISTS +# define PTL_USE_LIB_FREELIST # include #endif @@ -139,16 +139,9 @@ typedef struct { struct lib_msg_t { struct list_head msg_list; - int send_ack; lib_md_t *md; - ptl_nid_t nid; - ptl_pid_t pid; - ptl_event_t ev; ptl_handle_wire_t ack_wmd; - union { - struct iovec iov[PTL_MD_MAX_IOV]; - ptl_kiov_t kiov[PTL_MD_MAX_IOV]; - } msg_iov; + ptl_event_t ev; }; struct lib_ptl_t { @@ -212,9 +205,8 @@ struct lib_md_t { }; #define PTL_MD_FLAG_UNLINK (1 << 0) -#define PTL_MD_FLAG_AUTO_UNLINKED (1 << 1) -#ifdef PTL_USE_DESC_LISTS +#ifdef PTL_USE_LIB_FREELIST typedef struct { void *fl_objs; /* single contiguous array of objects */ @@ -262,7 +254,7 @@ typedef struct { struct list_head ni_test_peers; -#ifdef PTL_USE_DESC_LISTS 
+#ifdef PTL_USE_LIB_FREELIST lib_freelist_t ni_free_mes; lib_freelist_t ni_free_msgs; lib_freelist_t ni_free_mds; diff --git a/lnet/include/lnet/lnet.h b/lnet/include/lnet/lnet.h index a4ea39b..8b1495e 100644 --- a/lnet/include/lnet/lnet.h +++ b/lnet/include/lnet/lnet.h @@ -21,7 +21,6 @@ #include #include #include -#include #include extern int __p30_initialized; /* for libraries & test codes */ diff --git a/lnet/include/lnet/p30.h b/lnet/include/lnet/p30.h index a4ea39b..8b1495e 100644 --- a/lnet/include/lnet/p30.h +++ b/lnet/include/lnet/p30.h @@ -21,7 +21,6 @@ #include #include #include -#include #include extern int __p30_initialized; /* for libraries & test codes */ diff --git a/lnet/include/lnet/types.h b/lnet/include/lnet/types.h index e4ccebf..7ffe797 100644 --- a/lnet/include/lnet/types.h +++ b/lnet/include/lnet/types.h @@ -17,6 +17,8 @@ typedef u_int64_t __u64; # define do_gettimeofday(tv) gettimeofday(tv, NULL) #endif +#include + typedef __u64 ptl_nid_t; typedef __u32 ptl_pid_t; typedef __u32 ptl_pt_index_t; @@ -97,7 +99,8 @@ typedef enum { PTL_EVENT_PUT, PTL_EVENT_REPLY, PTL_EVENT_ACK, - PTL_EVENT_SENT + PTL_EVENT_SENT, + PTL_EVENT_UNLINK, } ptl_event_kind_t; #define PTL_SEQ_BASETYPE long @@ -112,15 +115,19 @@ typedef unsigned PTL_SEQ_BASETYPE ptl_seq_t; #pragma pack(push, 4) #endif typedef struct { - ptl_event_kind_t type; - ptl_process_id_t initiator; - ptl_pt_index_t portal; - ptl_match_bits_t match_bits; - ptl_size_t rlength, mlength, offset; - ptl_handle_me_t unlinked_me; - ptl_md_t mem_desc; - ptl_hdr_data_t hdr_data; - struct timeval arrival_time; + ptl_event_kind_t type; + ptl_err_t status; + int unlinked; + ptl_process_id_t initiator; + ptl_pt_index_t portal; + ptl_match_bits_t match_bits; + ptl_size_t rlength; + ptl_size_t mlength; + ptl_size_t offset; + ptl_md_t mem_desc; + ptl_hdr_data_t hdr_data; + struct timeval arrival_time; + volatile ptl_seq_t sequence; } ptl_event_t; #ifdef __CYGWIN__ diff --git a/lnet/klnds/gmlnd/gmlnd.h b/lnet/klnds/gmlnd/gmlnd.h index 53757ab..cdde5b7 100644 --- a/lnet/klnds/gmlnd/gmlnd.h +++ b/lnet/klnds/gmlnd/gmlnd.h @@ -353,8 +353,6 @@ int gmnal_cb_read(nal_cb_t *, void *private, void *, user_ptr, size_t); int gmnal_cb_write(nal_cb_t *, void *private, user_ptr, void *, size_t); -int gmnal_cb_callback(nal_cb_t *, void *, lib_eq_t *, ptl_event_t *); - void *gmnal_cb_malloc(nal_cb_t *, size_t); void gmnal_cb_free(nal_cb_t *, void *, size_t); @@ -384,7 +382,7 @@ void gmnal_fini(void); a->cb_recv_pages = gmnal_cb_recv_pages; \ a->cb_read = gmnal_cb_read; \ a->cb_write = gmnal_cb_write; \ - a->cb_callback = gmnal_cb_callback; \ + a->cb_callback = NULL; \ a->cb_malloc = gmnal_cb_malloc; \ a->cb_free = gmnal_cb_free; \ a->cb_map = NULL; \ diff --git a/lnet/klnds/gmlnd/gmlnd_cb.c b/lnet/klnds/gmlnd/gmlnd_cb.c index 6ae91db..e055242 100644 --- a/lnet/klnds/gmlnd/gmlnd_cb.c +++ b/lnet/klnds/gmlnd/gmlnd_cb.c @@ -126,7 +126,6 @@ int gmnal_cb_send(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, niov, iov, len); } else { CDEBUG(D_ERROR, "Large message send it is not supported\n"); - lib_finalize(nal_cb, private, cookie); return(PTL_FAIL); gmnal_large_tx(nal_cb, private, cookie, hdr, type, nid, pid, niov, iov, len); @@ -200,18 +199,6 @@ int gmnal_cb_write(nal_cb_t *nal_cb, void *private, user_ptr dst, return(PTL_OK); } -int gmnal_cb_callback(nal_cb_t *nal_cb, void *private, lib_eq_t *eq, - ptl_event_t *ev) -{ - - if (eq->event_callback != NULL) { - CDEBUG(D_INFO, "found callback\n"); - eq->event_callback(ev); - } - - return(PTL_OK); -} - void 
*gmnal_cb_malloc(nal_cb_t *nal_cb, size_t len) { void *ptr = NULL; diff --git a/lnet/klnds/gmlnd/gmlnd_comm.c b/lnet/klnds/gmlnd/gmlnd_comm.c index 4171df6..a0d3530 100644 --- a/lnet/klnds/gmlnd/gmlnd_comm.c +++ b/lnet/klnds/gmlnd/gmlnd_comm.c @@ -321,7 +321,6 @@ gmnal_small_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, if (!private) { CDEBUG(D_ERROR, "gmnal_small_rx no context\n"); - lib_finalize(nal_cb, private, cookie); return(PTL_FAIL); } @@ -343,10 +342,8 @@ gmnal_small_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, * let portals library know receive is complete */ CDEBUG(D_PORTALS, "calling lib_finalize\n"); - if (lib_finalize(nal_cb, private, cookie) != PTL_OK) { - /* TO DO what to do with failed lib_finalise? */ - CDEBUG(D_INFO, "lib_finalize failed\n"); - } + lib_finalize(nal_cb, private, cookie, PTL_OK); + /* * return buffer so it can be used again */ @@ -590,10 +587,8 @@ gmnal_small_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status) return; } gmnal_return_stxd(nal_data, stxd); - if (lib_finalize(nal_cb, stxd, cookie) != PTL_OK) { - CDEBUG(D_INFO, "Call to lib_finalize failed for stxd [%p]\n", - stxd); - } + lib_finalize(nal_cb, stxd, cookie, PTL_OK); + return; } @@ -817,7 +812,6 @@ gmnal_large_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, if (!srxd) { CDEBUG(D_ERROR, "gmnal_large_rx no context\n"); - lib_finalize(nal_cb, private, cookie); return(PTL_FAIL); } @@ -1114,10 +1108,7 @@ gmnal_remote_get_callback(gm_port_t *gm_port, void *context, * Let our client application proceed */ CDEBUG(D_ERROR, "final callback context[%p]\n", srxd); - if (lib_finalize(nal_cb, srxd, srxd->cookie) != PTL_OK) { - CDEBUG(D_INFO, "Call to lib_finalize failed for srxd [%p]\n", - srxd); - } + lib_finalize(nal_cb, srxd, srxd->cookie, PTL_OK); /* * send an ack to the sender to let him know we got the data @@ -1282,10 +1273,7 @@ gmnal_large_tx_ack_received(gmnal_data_t *nal_data, gmnal_srxd_t *srxd) CDEBUG(D_INFO, "gmnal_large_tx_ack_received stxd [%p]\n", stxd); - if (lib_finalize(nal_cb, stxd, stxd->cookie) != PTL_OK) { - CDEBUG(D_INFO, "Call to lib_finalize failed for stxd [%p]\n", - stxd); - } + lib_finalize(nal_cb, stxd, stxd->cookie, PTL_OK); /* * extract the iovec from the stxd, deregister the memory. diff --git a/lnet/klnds/iblnd/ibnal_cb.c b/lnet/klnds/iblnd/ibnal_cb.c index 2c07cc4..0688062 100644 --- a/lnet/klnds/iblnd/ibnal_cb.c +++ b/lnet/klnds/iblnd/ibnal_cb.c @@ -306,7 +306,7 @@ kibnal_send(nal_cb_t *nal, if(buf_length > MAX_MSG_SIZE) { CERROR("kibnal_send:request exceeds Transmit data size (%d).\n", MAX_MSG_SIZE); - rc = -1; + rc = PTL_FAIL; return rc; } else { @@ -363,7 +363,7 @@ kibnal_send(nal_cb_t *nal, PROF_FINISH(kibnal_send); // time stapm of send operation - rc = 1; + rc = PTL_OK; return rc; } @@ -386,7 +386,7 @@ int kibnal_send_pages(nal_cb_t * nal, ptl_kiov_t *iov, size_t mlen) { - int rc = 1; + int rc = PTL_FAIL; CDEBUG(D_NET, "kibnal_send_pages\n"); @@ -420,7 +420,7 @@ void kibnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) // // do you need this // -int kibnal_callback(nal_cb_t * nal, +void kibnal_callback(nal_cb_t * nal, void *private, lib_eq_t *eq, ptl_event_t *ev) @@ -507,7 +507,7 @@ kibnal_recv_pages(nal_cb_t * nal, { CDEBUG(D_NET, "recv_pages not implemented\n"); - return PTL_OK; + return PTL_FAIL; } @@ -526,11 +526,12 @@ kibnal_recv(nal_cb_t *nal, CDEBUG(D_NET,"kibnal_recv: mlen=%d, rlen=%d\n", mlen, rlen); /* What was actually received must be >= what sender claims to - * have sent. 
This is an LASSERT, since lib-move doesn't - * check cb return code yet. */ - LASSERT (krx->krx_len >= sizeof (ptl_hdr_t) + rlen); + * have sent. */ LASSERT (mlen <= rlen); + if (krx->krx_len < sizeof (ptl_hdr_t) + rlen) + return (PTL_FAIL); + PROF_START(kibnal_recv); if(mlen != 0) { @@ -542,12 +543,12 @@ kibnal_recv(nal_cb_t *nal, PROF_START(lib_finalize); - lib_finalize(nal, private, cookie); + lib_finalize(nal, private, cookie, PTL_OK); PROF_FINISH(lib_finalize); PROF_FINISH(kibnal_recv); - return rlen; + return PTL_OK; } // diff --git a/lnet/klnds/qswlnd/qswlnd_cb.c b/lnet/klnds/qswlnd/qswlnd_cb.c index 96749cd..4c2bd6a 100644 --- a/lnet/klnds/qswlnd/qswlnd_cb.c +++ b/lnet/klnds/qswlnd/qswlnd_cb.c @@ -33,7 +33,7 @@ EP_STATUSBLK kqswnal_rpc_failed; * LIB functions follow * */ -static int +static ptl_err_t kqswnal_read(nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr, size_t len) { @@ -41,10 +41,10 @@ kqswnal_read(nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr, nal->ni.nid, len, src_addr, dst_addr ); memcpy( dst_addr, src_addr, len ); - return (0); + return (PTL_OK); } -static int +static ptl_err_t kqswnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr, size_t len) { @@ -52,7 +52,7 @@ kqswnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr, nal->ni.nid, len, src_addr, dst_addr ); memcpy( dst_addr, src_addr, len ); - return (0); + return (PTL_OK); } static void * @@ -157,13 +157,12 @@ kqswnal_unmap_tx (kqswnal_tx_t *ktx) elan3_dvma_unload(kqswnal_data.kqn_ep->DmaState, kqswnal_data.kqn_eptxdmahandle, ktx->ktx_basepage, ktx->ktx_nmappedpages); - #endif ktx->ktx_nmappedpages = 0; } int -kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int nob, int niov, ptl_kiov_t *kiov) +kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int offset, int nob, int niov, ptl_kiov_t *kiov) { int nfrags = ktx->ktx_nfrag; int nmapped = ktx->ktx_nmappedpages; @@ -188,8 +187,16 @@ kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int nob, int niov, ptl_kiov_t *kiov) LASSERT (niov > 0); LASSERT (nob > 0); + /* skip complete frags before 'offset' */ + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + kiov++; + niov--; + LASSERT (niov > 0); + } + do { - int fraglen = kiov->kiov_len; + int fraglen = kiov->kiov_len - offset; /* nob exactly spans the iovs */ LASSERT (fraglen <= nob); @@ -212,7 +219,7 @@ kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int nob, int niov, ptl_kiov_t *kiov) /* XXX this is really crap, but we'll have to kmap until * EKC has a page (rather than vaddr) mapping interface */ - ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset; + ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset; CDEBUG(D_NET, "%p[%d] loading %p for %d, page %d, %d total\n", @@ -257,6 +264,7 @@ kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int nob, int niov, ptl_kiov_t *kiov) kiov++; niov--; nob -= fraglen; + offset = 0; /* iov must not run out before end of data */ LASSERT (nob == 0 || niov > 0); @@ -271,7 +279,8 @@ kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int nob, int niov, ptl_kiov_t *kiov) } int -kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int nob, int niov, struct iovec *iov) +kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int offset, int nob, + int niov, struct iovec *iov) { int nfrags = ktx->ktx_nfrag; int nmapped = ktx->ktx_nmappedpages; @@ -295,8 +304,16 @@ kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int nob, int niov, struct iovec *iov) LASSERT (niov > 0); LASSERT (nob > 0); + /* skip complete frags before offset */ + while (offset >= iov->iov_len) { + offset -= 
iov->iov_len; + iov++; + niov--; + LASSERT (niov > 0); + } + do { - int fraglen = iov->iov_len; + int fraglen = iov->iov_len - offset; long npages = kqswnal_pages_spanned (iov->iov_base, fraglen); /* nob exactly spans the iovs */ @@ -317,12 +334,12 @@ kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int nob, int niov, struct iovec *iov) CDEBUG(D_NET, "%p[%d] loading %p for %d, pages %d for %ld, %d total\n", - ktx, nfrags, iov->iov_base, fraglen, basepage, npages, - nmapped); + ktx, nfrags, iov->iov_base + offset, fraglen, + basepage, npages, nmapped); #if MULTIRAIL_EKC ep_dvma_load(kqswnal_data.kqn_ep, NULL, - iov->iov_base, fraglen, + iov->iov_base + offset, fraglen, kqswnal_data.kqn_ep_tx_nmh, basepage, &railmask, &ktx->ktx_frags[nfrags]); @@ -336,7 +353,7 @@ kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int nob, int niov, struct iovec *iov) #else elan3_dvma_kaddr_load (kqswnal_data.kqn_ep->DmaState, kqswnal_data.kqn_eptxdmahandle, - iov->iov_base, fraglen, + iov->iov_base + offset, fraglen, basepage, &ktx->ktx_frags[nfrags].Base); if (nfrags > 0 && /* previous frag mapped */ @@ -357,6 +374,7 @@ kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int nob, int niov, struct iovec *iov) iov++; niov--; nob -= fraglen; + offset = 0; /* iov must not run out before end of data */ LASSERT (nob == 0 || niov > 0); @@ -483,7 +501,7 @@ void kqswnal_tx_done (kqswnal_tx_t *ktx, int error) { lib_msg_t *msg; - lib_msg_t *repmsg; + lib_msg_t *repmsg = NULL; switch (ktx->ktx_state) { case KTX_FORWARDING: /* router asked me to forward this packet */ @@ -493,21 +511,29 @@ kqswnal_tx_done (kqswnal_tx_t *ktx, int error) case KTX_SENDING: /* packet sourced locally */ lib_finalize (&kqswnal_lib, ktx->ktx_args[0], - (lib_msg_t *)ktx->ktx_args[1]); + (lib_msg_t *)ktx->ktx_args[1], + (error == 0) ? PTL_OK : + (error == -ENOMEM) ? PTL_NOSPACE : PTL_FAIL); break; case KTX_GETTING: /* Peer has DMA-ed direct? */ msg = (lib_msg_t *)ktx->ktx_args[1]; - repmsg = NULL; - if (error == 0) + if (error == 0) { repmsg = lib_fake_reply_msg (&kqswnal_lib, ktx->ktx_nid, msg->md); + if (repmsg == NULL) + error = -ENOMEM; + } - lib_finalize (&kqswnal_lib, ktx->ktx_args[0], msg); - - if (repmsg != NULL) - lib_finalize (&kqswnal_lib, NULL, repmsg); + if (error == 0) { + lib_finalize (&kqswnal_lib, ktx->ktx_args[0], + msg, PTL_OK); + lib_finalize (&kqswnal_lib, NULL, repmsg, PTL_OK); + } else { + lib_finalize (&kqswnal_lib, ktx->ktx_args[0], msg, + (error == -ENOMEM) ? PTL_NOSPACE : PTL_FAIL); + } break; default: @@ -533,7 +559,7 @@ kqswnal_txhandler(EP_TXD *txd, void *arg, int status) ktx->ktx_nid, status); kqswnal_notify_peer_down(ktx); - status = -EIO; + status = -EHOSTDOWN; } else if (ktx->ktx_state == KTX_GETTING) { /* RPC completed OK; what did our peer put in the status @@ -745,7 +771,8 @@ kqswnal_eiovs2datav (int ndv, EP_DATAVEC *dv, int kqswnal_dma_reply (kqswnal_tx_t *ktx, int nfrag, - struct iovec *iov, ptl_kiov_t *kiov, int nob) + struct iovec *iov, ptl_kiov_t *kiov, + int offset, int nob) { kqswnal_rx_t *krx = (kqswnal_rx_t *)ktx->ktx_args[0]; char *buffer = (char *)page_address(krx->krx_pages[0]); @@ -779,9 +806,9 @@ kqswnal_dma_reply (kqswnal_tx_t *ktx, int nfrag, /* Map the source data... 
*/ ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 0; if (kiov != NULL) - rc = kqswnal_map_tx_kiov (ktx, nob, nfrag, kiov); + rc = kqswnal_map_tx_kiov (ktx, offset, nob, nfrag, kiov); else - rc = kqswnal_map_tx_iov (ktx, nob, nfrag, iov); + rc = kqswnal_map_tx_iov (ktx, offset, nob, nfrag, iov); if (rc != 0) { CERROR ("Can't map source data: %d\n", rc); @@ -846,7 +873,7 @@ kqswnal_dma_reply (kqswnal_tx_t *ktx, int nfrag, return (-ECONNABORTED); } -static int +static ptl_err_t kqswnal_sendmsg (nal_cb_t *nal, void *private, lib_msg_t *libmsg, @@ -857,6 +884,7 @@ kqswnal_sendmsg (nal_cb_t *nal, unsigned int payload_niov, struct iovec *payload_iov, ptl_kiov_t *payload_kiov, + size_t payload_offset, size_t payload_nob) { kqswnal_tx_t *ktx; @@ -865,6 +893,7 @@ kqswnal_sendmsg (nal_cb_t *nal, #if KQSW_CHECKSUM int i; kqsw_csum_t csum; + int sumoff; int sumnob; #endif @@ -928,9 +957,9 @@ kqswnal_sendmsg (nal_cb_t *nal, } /* peer expects RPC completion with GET data */ - rc = kqswnal_dma_reply (ktx, - payload_niov, payload_iov, - payload_kiov, payload_nob); + rc = kqswnal_dma_reply (ktx, payload_niov, + payload_iov, payload_kiov, + payload_offset, payload_nob); if (rc == 0) return (PTL_OK); @@ -945,22 +974,39 @@ kqswnal_sendmsg (nal_cb_t *nal, #if KQSW_CHECKSUM csum = kqsw_csum (0, (char *)hdr, sizeof (*hdr)); memcpy (ktx->ktx_buffer + sizeof (*hdr), &csum, sizeof (csum)); - for (csum = 0, i = 0, sumnob = payload_nob; sumnob > 0; i++) { + for (csum = 0, i = 0, sumoff = payload_offset, sumnob = payload_nob; sumnob > 0; i++) { + LASSERT(i < niov); if (payload_kiov != NULL) { ptl_kiov_t *kiov = &payload_kiov[i]; - char *addr = ((char *)kmap (kiov->kiov_page)) + - kiov->kiov_offset; - - csum = kqsw_csum (csum, addr, MIN (sumnob, kiov->kiov_len)); - sumnob -= kiov->kiov_len; + + if (sumoff >= kiov->kiov_len) { + sumoff -= kiov->kiov_len; + } else { + char *addr = ((char *)kmap (kiov->kiov_page)) + + kiov->kiov_offset + sumoff; + int fragnob = kiov->kiov_len - sumoff; + + csum = kqsw_csum(csum, addr, MIN(sumnob, fragnob)); + sumnob -= fragnob; + sumoff = 0; + kunmap(kiov->kiov_page); + } } else { struct iovec *iov = &payload_iov[i]; - csum = kqsw_csum (csum, iov->iov_base, MIN (sumnob, kiov->iov_len)); - sumnob -= iov->iov_len; + if (sumoff > iov->iov_len) { + sumoff -= iov->iov_len; + } else { + char *addr = iov->iov_base + sumoff; + int fragnob = iov->iov_len - sumoff; + + csum = kqsw_csum(csum, addr, MIN(sumnob, fragnob)); + sumnob -= fragnob; + sumoff = 0; + } } } - memcpy(ktx->ktx_buffer +sizeof(*hdr) +sizeof(csum), &csum,sizeof(csum)); + memcpy(ktx->ktx_buffer + sizeof(*hdr) + sizeof(csum), &csum, sizeof(csum)); #endif if (kqswnal_data.kqn_optimized_gets && @@ -987,10 +1033,10 @@ kqswnal_sendmsg (nal_cb_t *nal, ktx->ktx_state = KTX_GETTING; if ((libmsg->md->options & PTL_MD_KIOV) != 0) - rc = kqswnal_map_tx_kiov (ktx, md->length, + rc = kqswnal_map_tx_kiov (ktx, 0, md->length, md->md_niov, md->md_iov.kiov); else - rc = kqswnal_map_tx_iov (ktx, md->length, + rc = kqswnal_map_tx_iov (ktx, 0, md->length, md->md_niov, md->md_iov.iov); if (rc < 0) { @@ -1033,10 +1079,12 @@ kqswnal_sendmsg (nal_cb_t *nal, if (payload_nob > 0) { if (payload_kiov != NULL) lib_copy_kiov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE, - payload_niov, payload_kiov, payload_nob); + payload_niov, payload_kiov, + payload_offset, payload_nob); else lib_copy_iov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE, - payload_niov, payload_iov, payload_nob); + payload_niov, payload_iov, + payload_offset, payload_nob); } } else { @@ -1052,10 +1100,10 @@ 
kqswnal_sendmsg (nal_cb_t *nal, ktx->ktx_frags[0].Len = KQSW_HDR_SIZE; #endif if (payload_kiov != NULL) - rc = kqswnal_map_tx_kiov (ktx, payload_nob, + rc = kqswnal_map_tx_kiov (ktx, payload_offset, payload_nob, payload_niov, payload_kiov); else - rc = kqswnal_map_tx_iov (ktx, payload_nob, + rc = kqswnal_map_tx_iov (ktx, payload_offset, payload_nob, payload_niov, payload_iov); if (rc != 0) { kqswnal_put_idle_tx (ktx); @@ -1078,7 +1126,7 @@ kqswnal_sendmsg (nal_cb_t *nal, return (PTL_OK); } -static int +static ptl_err_t kqswnal_send (nal_cb_t *nal, void *private, lib_msg_t *libmsg, @@ -1088,13 +1136,15 @@ kqswnal_send (nal_cb_t *nal, ptl_pid_t pid, unsigned int payload_niov, struct iovec *payload_iov, + size_t payload_offset, size_t payload_nob) { return (kqswnal_sendmsg (nal, private, libmsg, hdr, type, nid, pid, - payload_niov, payload_iov, NULL, payload_nob)); + payload_niov, payload_iov, NULL, + payload_offset, payload_nob)); } -static int +static ptl_err_t kqswnal_send_pages (nal_cb_t *nal, void *private, lib_msg_t *libmsg, @@ -1104,10 +1154,12 @@ kqswnal_send_pages (nal_cb_t *nal, ptl_pid_t pid, unsigned int payload_niov, ptl_kiov_t *payload_kiov, + size_t payload_offset, size_t payload_nob) { return (kqswnal_sendmsg (nal, private, libmsg, hdr, type, nid, pid, - payload_niov, NULL, payload_kiov, payload_nob)); + payload_niov, NULL, payload_kiov, + payload_offset, payload_nob)); } void @@ -1161,7 +1213,7 @@ kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) nob <= KQSW_TX_BUFFER_SIZE) { /* send from ktx's pre-mapped contiguous buffer? */ - lib_copy_iov2buf (ktx->ktx_buffer, niov, iov, nob); + lib_copy_iov2buf (ktx->ktx_buffer, niov, iov, 0, nob); #if MULTIRAIL_EKC ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, 0, nob); @@ -1176,7 +1228,7 @@ kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) { /* zero copy */ ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 0; - rc = kqswnal_map_tx_iov (ktx, nob, niov, iov); + rc = kqswnal_map_tx_iov (ktx, 0, nob, niov, iov); if (rc != 0) goto failed; @@ -1231,7 +1283,8 @@ kqswnal_dma_reply_complete (EP_RXD *rxd) krx->krx_rpc_reply_needed = 0; kqswnal_rx_done (krx); - lib_finalize (&kqswnal_lib, NULL, msg); + lib_finalize (&kqswnal_lib, NULL, msg, + (status == EP_SUCCESS) ? PTL_OK : PTL_FAIL); kqswnal_put_idle_tx (ktx); } @@ -1461,13 +1514,14 @@ kqswnal_csum_error (kqswnal_rx_t *krx, int ishdr) } #endif -static int +static ptl_err_t kqswnal_recvmsg (nal_cb_t *nal, void *private, lib_msg_t *libmsg, unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov, + size_t offset, size_t mlen, size_t rlen) { @@ -1498,10 +1552,13 @@ kqswnal_recvmsg (nal_cb_t *nal, #endif CDEBUG(D_NET,"kqswnal_recv, mlen="LPSZ", rlen="LPSZ"\n", mlen, rlen); - /* What was actually received must be >= payload. - * This is an LASSERT, as lib_finalize() doesn't have a completion status. */ - LASSERT (krx->krx_nob >= KQSW_HDR_SIZE + mlen); + /* What was actually received must be >= payload. 
*/ LASSERT (mlen <= rlen); + if (krx->krx_nob < KQSW_HDR_SIZE + mlen) { + CERROR("Bad message size: have %d, need %d + %d\n", + krx->krx_nob, KQSW_HDR_SIZE, mlen); + return (PTL_FAIL); + } /* It must be OK to kmap() if required */ LASSERT (kiov == NULL || !in_interrupt ()); @@ -1516,20 +1573,37 @@ kqswnal_recvmsg (nal_cb_t *nal, page_nob = PAGE_SIZE - KQSW_HDR_SIZE; LASSERT (niov > 0); + if (kiov != NULL) { - iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset; - iov_nob = kiov->kiov_len; + /* skip complete frags */ + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + kiov++; + niov--; + LASSERT (niov > 0); + } + iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset; + iov_nob = kiov->kiov_len - offset; } else { - iov_ptr = iov->iov_base; - iov_nob = iov->iov_len; + /* skip complete frags */ + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + iov++; + niov--; + LASSERT (niov > 0); + } + iov_ptr = iov->iov_base + offset; + iov_nob = iov->iov_len - offset; } - + for (;;) { - /* We expect the iov to exactly match mlen */ - LASSERT (iov_nob <= mlen); - - frag = MIN (page_nob, iov_nob); + frag = mlen; + if (frag > page_nob) + frag = page_nob; + if (frag > iov_nob) + frag = iov_nob; + memcpy (iov_ptr, page_ptr, frag); #if KQSW_CHECKSUM payload_csum = kqsw_csum (payload_csum, iov_ptr, frag); @@ -1588,33 +1662,39 @@ kqswnal_recvmsg (nal_cb_t *nal, "csum_nob %d\n", hdr_csum, payload_csum, csum_frags, csum_nob); #endif - lib_finalize(nal, private, libmsg); + lib_finalize(nal, private, libmsg, PTL_OK); - return (rlen); + return (PTL_OK); } -static int +static ptl_err_t kqswnal_recv(nal_cb_t *nal, void *private, lib_msg_t *libmsg, unsigned int niov, struct iovec *iov, + size_t offset, size_t mlen, size_t rlen) { - return (kqswnal_recvmsg (nal, private, libmsg, niov, iov, NULL, mlen, rlen)); + return (kqswnal_recvmsg(nal, private, libmsg, + niov, iov, NULL, + offset, mlen, rlen)); } -static int +static ptl_err_t kqswnal_recv_pages (nal_cb_t *nal, void *private, lib_msg_t *libmsg, unsigned int niov, ptl_kiov_t *kiov, + size_t offset, size_t mlen, size_t rlen) { - return (kqswnal_recvmsg (nal, private, libmsg, niov, NULL, kiov, mlen, rlen)); + return (kqswnal_recvmsg(nal, private, libmsg, + niov, NULL, kiov, + offset, mlen, rlen)); } int diff --git a/lnet/klnds/scimaclnd/scimacnal_cb.c b/lnet/klnds/scimaclnd/scimacnal_cb.c index b31c2ea..52afb98 100644 --- a/lnet/klnds/scimaclnd/scimacnal_cb.c +++ b/lnet/klnds/scimaclnd/scimacnal_cb.c @@ -176,7 +176,8 @@ kscimacnal_txrelease(mac_mblk_t *msg, mac_msg_status_t status, void *context) break; } - lib_finalize(ktx->ktx_nal, ktx->ktx_private, ktx->ktx_cookie); + lib_finalize(ktx->ktx_nal, ktx->ktx_private, ktx->ktx_cookie, + (err == 0) ? 
PTL_OK : PTL_FAIL); PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t))); } @@ -225,14 +226,14 @@ kscimacnal_sendmsg(nal_cb_t *nal, if (buf_len > mac_get_mtusize(ksci->ksci_machandle)) { CERROR("kscimacnal:request exceeds TX MTU size (%ld).\n", mac_get_mtusize(ksci->ksci_machandle)); - return -EINVAL; + return PTL_FAIL; } /* save transaction info for later finalize and cleanup */ PORTAL_ALLOC(ktx, (sizeof(kscimacnal_tx_t))); if (!ktx) { - return -ENOMEM; + return PTL_NOSPACE; } ktx->ktx_nmapped = 0; /* Start with no mapped pages :) */ @@ -247,7 +248,7 @@ kscimacnal_sendmsg(nal_cb_t *nal, kscimacnal_txrelease, ktx); if (!msg) { PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t))); - return -ENOMEM; + return PTL_NOSPACE; } mac_put_mblk(msg, sizeof(ptl_hdr_t)); lastblk=msg; @@ -284,7 +285,7 @@ kscimacnal_sendmsg(nal_cb_t *nal, if(!newblk) { mac_free_msg(msg); PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t))); - return -ENOMEM; + return PTL_NOSPACE; } mac_put_mblk(newblk, nob); mac_link_mblk(lastblk, newblk); @@ -315,10 +316,10 @@ kscimacnal_sendmsg(nal_cb_t *nal, CERROR("kscimacnal: mac_send() failed, rc=%d\n", rc); mac_free_msg(msg); PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t))); - return rc; + return PTL_FAIL; } - return 0; + return PTL_OK; } @@ -463,12 +464,15 @@ kscimacnal_recvmsg(nal_cb_t *nal, krx->msg, mlen, rlen, niov); /* What was actually received must be >= what sender claims to have - * sent. This is an LASSERT, since lib-move doesn't check cb return - * code yet. Also, rlen seems to be negative when mlen==0 so don't - * assert on that. - */ - LASSERT (mlen==0 || mac_msg_size(krx->msg) >= sizeof(ptl_hdr_t)+rlen); - LASSERT (mlen==0 || mlen <= rlen); + * sent. */ + LASSERT (mlen <= rlen); /* something is wrong if this isn't true */ + if (mac_msg_size(krx->msg) < sizeof(ptl_hdr_t)+mlen) { + /* We didn't receive everything lib thinks we did */ + CERROR("Bad message size: have %d, need %d + %d\n", + mac_msg_size(krx->msg), sizeof(ptl_hdr_t), mlen); + return (PTL_FAIL); + } + /* It must be OK to kmap() if required */ LASSERT (kiov == NULL || !in_interrupt ()); /* Either all pages or all vaddrs */ @@ -545,12 +549,12 @@ kscimacnal_recvmsg(nal_cb_t *nal, CDEBUG(D_NET, "Calling lib_finalize.\n"); PROF_START(lib_finalize); - lib_finalize(nal, private, cookie); + lib_finalize(nal, private, cookie, PTL_OK); PROF_FINISH(lib_finalize); CDEBUG(D_NET, "Done.\n"); - return rlen; + return PTL_OK; } diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index 9ae1c87..c47dcb4 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -993,15 +993,11 @@ ksocknal_destroy_conn (ksock_conn_t *conn) /* complete current receive if any */ switch (conn->ksnc_rx_state) { case SOCKNAL_RX_BODY: -#if 0 - lib_finalize (&ksocknal_lib, NULL, conn->ksnc_cookie); -#else - CERROR ("Refusing to complete a partial receive from " - LPX64", ip %d.%d.%d.%d:%d\n", conn->ksnc_peer->ksnp_nid, - HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port); - CERROR ("This may hang communications and " - "prevent modules from unloading\n"); -#endif + CERROR("Completing partial receive from "LPX64 + ", ip %d.%d.%d.%d:%d, with error\n", + conn->ksnc_peer->ksnp_nid, + HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port); + lib_finalize (&ksocknal_lib, NULL, conn->ksnc_cookie, PTL_FAIL); break; case SOCKNAL_RX_BODY_FWD: ksocknal_fmb_callback (conn->ksnc_cookie, -ECONNABORTED); diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index 82d4c64..3ecead1 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ 
b/lnet/klnds/socklnd/socklnd_cb.c @@ -29,7 +29,7 @@ * LIB functions follow * */ -int +ptl_err_t ksocknal_read(nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr, size_t len) { @@ -37,10 +37,10 @@ ksocknal_read(nal_cb_t *nal, void *private, void *dst_addr, nal->ni.nid, (long)len, src_addr, dst_addr); memcpy( dst_addr, src_addr, len ); - return 0; + return PTL_OK; } -int +ptl_err_t ksocknal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr, size_t len) { @@ -48,20 +48,7 @@ ksocknal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, nal->ni.nid, (long)len, src_addr, dst_addr); memcpy( dst_addr, src_addr, len ); - return 0; -} - -int -ksocknal_callback (nal_cb_t * nal, void *private, lib_eq_t *eq, - ptl_event_t *ev) -{ - CDEBUG(D_NET, LPX64": callback eq %p ev %p\n", - nal->ni.nid, eq, ev); - - if (eq->event_callback != NULL) - eq->event_callback(ev); - - return 0; + return PTL_OK; } void * @@ -617,7 +604,8 @@ ksocknal_tx_done (ksock_tx_t *tx, int asynch) if (tx->tx_isfwd) { /* was a forwarded packet? */ kpr_fwd_done (&ksocknal_data.ksnd_router, - KSOCK_TX_2_KPR_FWD_DESC (tx), 0); + KSOCK_TX_2_KPR_FWD_DESC (tx), + (tx->tx_resid == 0) ? 0 : -ECONNABORTED); EXIT; return; } @@ -625,7 +613,8 @@ ksocknal_tx_done (ksock_tx_t *tx, int asynch) /* local send */ ltx = KSOCK_TX_2_KSOCK_LTX (tx); - lib_finalize (&ksocknal_lib, ltx->ltx_private, ltx->ltx_cookie); + lib_finalize (&ksocknal_lib, ltx->ltx_private, ltx->ltx_cookie, + (tx->tx_resid == 0) ? PTL_OK : PTL_FAIL); ksocknal_free_ltx (ltx); EXIT; @@ -694,17 +683,17 @@ ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx) LASSERT (rc < 0); if (!conn->ksnc_closing) - CERROR ("[%p] Error %d on write to "LPX64 - " ip %d.%d.%d.%d:%d\n",conn, rc, - conn->ksnc_peer->ksnp_nid, - HIPQUAD(conn->ksnc_ipaddr), - conn->ksnc_port); + CERROR("[%p] Error %d on write to "LPX64 + " ip %d.%d.%d.%d:%d\n", conn, rc, + conn->ksnc_peer->ksnp_nid, + HIPQUAD(conn->ksnc_ipaddr), + conn->ksnc_port); ksocknal_close_conn_and_siblings (conn, rc); ksocknal_tx_launched (tx); - + return (rc); -} +} void ksocknal_launch_autoconnect_locked (ksock_route_t *route) @@ -742,21 +731,21 @@ ksocknal_find_target_peer_locked (ksock_tx_t *tx, ptl_nid_t nid) ptl_nid_t target_nid; int rc; ksock_peer_t *peer = ksocknal_find_peer_locked (nid); - + if (peer != NULL) return (peer); - + if (tx->tx_isfwd) { CERROR ("Can't send packet to "LPX64 - " %s: routed target is not a peer\n", + " %s: routed target is not a peer\n", nid, portals_nid2str(SOCKNAL, nid, ipbuf)); return (NULL); } - + rc = kpr_lookup (&ksocknal_data.ksnd_router, nid, tx->tx_nob, &target_nid); if (rc != 0) { - CERROR ("Can't route to "LPX64" %s: router error %d\n", + CERROR ("Can't route to "LPX64" %s: router error %d\n", nid, portals_nid2str(SOCKNAL, nid, ipbuf), rc); return (NULL); } @@ -1018,7 +1007,7 @@ ksocknal_launch_packet (ksock_tx_t *tx, ptl_nid_t nid) return (-EHOSTUNREACH); } -int +ptl_err_t ksocknal_sendmsg(nal_cb_t *nal, void *private, lib_msg_t *cookie, @@ -1029,6 +1018,7 @@ ksocknal_sendmsg(nal_cb_t *nal, unsigned int payload_niov, struct iovec *payload_iov, ptl_kiov_t *payload_kiov, + size_t payload_offset, size_t payload_nob) { ksock_ltx_t *ltx; @@ -1091,20 +1081,19 @@ ksocknal_sendmsg(nal_cb_t *nal, ltx->ltx_tx.tx_kiov = NULL; ltx->ltx_tx.tx_nkiov = 0; - ltx->ltx_tx.tx_niov = 1 + payload_niov; - - memcpy(ltx->ltx_iov + 1, payload_iov, - payload_niov * sizeof (*payload_iov)); - + ltx->ltx_tx.tx_niov = + 1 + lib_extract_iov(payload_niov, <x->ltx_iov[1], + payload_niov, 
payload_iov, + payload_offset, payload_nob); } else { /* payload is all pages */ - ltx->ltx_tx.tx_kiov = ltx->ltx_kiov; - ltx->ltx_tx.tx_nkiov = payload_niov; - ltx->ltx_tx.tx_niov = 1; - memcpy(ltx->ltx_kiov, payload_kiov, - payload_niov * sizeof (*payload_kiov)); + ltx->ltx_tx.tx_kiov = ltx->ltx_kiov; + ltx->ltx_tx.tx_nkiov = + lib_extract_kiov(payload_niov, ltx->ltx_kiov, + payload_niov, payload_kiov, + payload_offset, payload_nob); } rc = ksocknal_launch_packet(<x->ltx_tx, nid); @@ -1115,28 +1104,28 @@ ksocknal_sendmsg(nal_cb_t *nal, return (PTL_FAIL); } -int +ptl_err_t ksocknal_send (nal_cb_t *nal, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, unsigned int payload_niov, struct iovec *payload_iov, - size_t payload_len) + size_t payload_offset, size_t payload_len) { return (ksocknal_sendmsg(nal, private, cookie, hdr, type, nid, pid, payload_niov, payload_iov, NULL, - payload_len)); + payload_offset, payload_len)); } -int +ptl_err_t ksocknal_send_pages (nal_cb_t *nal, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, unsigned int payload_niov, ptl_kiov_t *payload_kiov, - size_t payload_len) + size_t payload_offset, size_t payload_len) { return (ksocknal_sendmsg(nal, private, cookie, hdr, type, nid, pid, payload_niov, NULL, payload_kiov, - payload_len)); + payload_offset, payload_len)); } void @@ -1208,7 +1197,7 @@ ksocknal_fmb_callback (void *arg, int error) /* drop peer ref taken on init */ ksocknal_put_peer (fmb->fmb_peer); - + spin_lock_irqsave (&fmp->fmp_lock, flags); list_add (&fmb->fmb_list, &fmp->fmp_idle_fmbs); @@ -1591,7 +1580,7 @@ ksocknal_process_receive (ksock_conn_t *conn) case SOCKNAL_RX_BODY: /* payload all received */ - lib_finalize(&ksocknal_lib, NULL, conn->ksnc_cookie); + lib_finalize(&ksocknal_lib, NULL, conn->ksnc_cookie, PTL_OK); /* Fall through */ case SOCKNAL_RX_SLOP: @@ -1627,9 +1616,10 @@ ksocknal_process_receive (ksock_conn_t *conn) return (-EINVAL); /* keep gcc happy */ } -int +ptl_err_t ksocknal_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, - unsigned int niov, struct iovec *iov, size_t mlen, size_t rlen) + unsigned int niov, struct iovec *iov, + size_t offset, size_t mlen, size_t rlen) { ksock_conn_t *conn = (ksock_conn_t *)private; @@ -1642,20 +1632,22 @@ ksocknal_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, conn->ksnc_rx_nkiov = 0; conn->ksnc_rx_kiov = NULL; - conn->ksnc_rx_niov = niov; conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov; - memcpy (conn->ksnc_rx_iov, iov, niov * sizeof (*iov)); + conn->ksnc_rx_niov = + lib_extract_iov(PTL_MD_MAX_IOV, conn->ksnc_rx_iov, + niov, iov, offset, mlen); LASSERT (mlen == lib_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) + lib_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov)); - return (rlen); + return (PTL_OK); } -int +ptl_err_t ksocknal_recv_pages (nal_cb_t *nal, void *private, lib_msg_t *msg, - unsigned int niov, ptl_kiov_t *kiov, size_t mlen, size_t rlen) + unsigned int niov, ptl_kiov_t *kiov, + size_t offset, size_t mlen, size_t rlen) { ksock_conn_t *conn = (ksock_conn_t *)private; @@ -1668,15 +1660,16 @@ ksocknal_recv_pages (nal_cb_t *nal, void *private, lib_msg_t *msg, conn->ksnc_rx_niov = 0; conn->ksnc_rx_iov = NULL; - conn->ksnc_rx_nkiov = niov; conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov; - memcpy (conn->ksnc_rx_kiov, kiov, niov * sizeof (*kiov)); + conn->ksnc_rx_nkiov = + lib_extract_kiov(PTL_MD_MAX_IOV, conn->ksnc_rx_kiov, + niov, kiov, offset, mlen); LASSERT (mlen == lib_iov_nob (conn->ksnc_rx_niov, 
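
[Both the send and receive paths above stop memcpy()ing whole fragment descriptor arrays and instead clip the MD's fragments to (offset, len) with lib_extract_iov(). A self-contained user-space restatement of that routine, with assert() standing in for LASSERT; note it never modifies the source list and returns the fragment count the caller stores in tx_niov/rx_niov:]

#include <assert.h>
#include <stdio.h>
#include <sys/uio.h>

static int
extract_iov(int dst_niov, struct iovec *dst,
            int src_niov, const struct iovec *src,
            size_t offset, size_t len)
{
        size_t frag_len;
        int    niov;

        if (len == 0)                   /* no data => no frags */
                return 0;

        /* skip complete frags before 'offset' */
        assert(src_niov > 0);
        while (offset >= src->iov_len) {
                offset -= src->iov_len;
                src++;
                src_niov--;
                assert(src_niov > 0);
        }

        niov = 1;
        for (;;) {
                assert(src_niov > 0);
                assert(niov <= dst_niov);

                frag_len = src->iov_len - offset;
                dst->iov_base = (char *)src->iov_base + offset;

                if (len <= frag_len) {  /* final frag clipped to 'len' */
                        dst->iov_len = len;
                        return niov;
                }

                dst->iov_len = frag_len;
                len -= frag_len;
                dst++;
                src++;
                niov++;
                src_niov--;
                offset = 0;
        }
}

int main(void)
{
        char a[] = "abcd", b[] = "efgh";
        struct iovec src[2] = {
                { .iov_base = a, .iov_len = 4 },
                { .iov_base = b, .iov_len = 4 },
        };
        struct iovec dst[2];
        int n = extract_iov(2, dst, 2, src, 3, 3);   /* bytes "def" */

        assert(n == 2 && dst[0].iov_len == 1 && dst[1].iov_len == 2);
        printf("extracted %d frags\n", n);
        return 0;
}
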
conn->ksnc_rx_iov) + lib_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov)); - return (rlen); + return (PTL_OK); } int ksocknal_scheduler (void *arg) @@ -2064,7 +2057,7 @@ ksocknal_hello (struct socket *sock, ptl_nid_t *nid, int *type, __u64 *incarnati rc, *nid, portals_nid2str(SOCKNAL, *nid, ipbuf)); return (rc); } - + if (hmv->magic != __le32_to_cpu (PORTALS_PROTO_MAGIC)) { CERROR ("Bad magic %#08x (%#08x expected) from "LPX64" %s\n", __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC, *nid, @@ -2118,7 +2111,7 @@ ksocknal_hello (struct socket *sock, ptl_nid_t *nid, int *type, __u64 *incarnati } else if (*nid != __le64_to_cpu (hdr.src_nid)) { CERROR ("Connected to nid "LPX64" %s, but expecting "LPX64" %s\n", __le64_to_cpu (hdr.src_nid), - portals_nid2str(SOCKNAL, + portals_nid2str(SOCKNAL, __le64_to_cpu(hdr.src_nid), ipbuf), *nid, portals_nid2str(SOCKNAL, *nid, ipbuf)); @@ -2139,7 +2132,7 @@ ksocknal_hello (struct socket *sock, ptl_nid_t *nid, int *type, __u64 *incarnati *type = SOCKNAL_CONN_BULK_IN; break; default: - CERROR ("Unexpected type %d from "LPX64" %s\n", + CERROR ("Unexpected type %d from "LPX64" %s\n", *type, *nid, portals_nid2str(SOCKNAL, *nid, ipbuf)); return (-EPROTO); @@ -2346,8 +2339,8 @@ ksocknal_connect_peer (ksock_route_t *route, int type) if (rc != 0) { CERROR ("Error %d connecting to "LPX64" %s\n", rc, route->ksnr_peer->ksnp_nid, - portals_nid2str(SOCKNAL, - route->ksnr_peer->ksnp_nid, + portals_nid2str(SOCKNAL, + route->ksnr_peer->ksnp_nid, ipbuf)); goto out; } @@ -2432,7 +2425,7 @@ ksocknal_autoconnect (ksock_route_t *route) while (!list_empty (&zombies)) { char ipbuf[PTL_NALFMT_SIZE]; tx = list_entry (zombies.next, ksock_tx_t, tx_list); - + CERROR ("Deleting packet type %d len %d ("LPX64" %s->"LPX64" %s)\n", NTOH__u32 (tx->tx_hdr->type), NTOH__u32 (tx->tx_hdr->payload_length), @@ -2719,7 +2712,6 @@ nal_cb_t ksocknal_lib = { cb_recv_pages: ksocknal_recv_pages, cb_read: ksocknal_read, cb_write: ksocknal_write, - cb_callback: ksocknal_callback, cb_malloc: ksocknal_malloc, cb_free: ksocknal_free, cb_printf: ksocknal_printf, diff --git a/lnet/libcfs/module.c b/lnet/libcfs/module.c index 2768c8d..2f5a852 100644 --- a/lnet/libcfs/module.c +++ b/lnet/libcfs/module.c @@ -812,9 +812,11 @@ EXPORT_SYMBOL(PtlMDBind); EXPORT_SYMBOL(lib_iov_nob); EXPORT_SYMBOL(lib_copy_iov2buf); EXPORT_SYMBOL(lib_copy_buf2iov); +EXPORT_SYMBOL(lib_extract_iov); EXPORT_SYMBOL(lib_kiov_nob); EXPORT_SYMBOL(lib_copy_kiov2buf); EXPORT_SYMBOL(lib_copy_buf2kiov); +EXPORT_SYMBOL(lib_extract_kiov); EXPORT_SYMBOL(lib_finalize); EXPORT_SYMBOL(lib_parse); EXPORT_SYMBOL(lib_fake_reply_msg); diff --git a/lnet/lnet/Makefile.am b/lnet/lnet/Makefile.am index 8c03749..d17db61 100644 --- a/lnet/lnet/Makefile.am +++ b/lnet/lnet/Makefile.am @@ -6,5 +6,9 @@ CPPFLAGS= INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include -lib_LIBRARIES= libportals.a +noinst_LIBRARIES= libportals.a libportals_a_SOURCES= api-eq.c api-init.c api-me.c api-errno.c api-ni.c api-wrap.c lib-dispatch.c lib-init.c lib-me.c lib-msg.c lib-eq.c lib-md.c lib-move.c lib-ni.c lib-pid.c + +if LIBLUSTRE +libportals_a_CFLAGS= -fPIC +endif diff --git a/lnet/lnet/api-eq.c b/lnet/lnet/api-eq.c index 9bc9c36..964b9d8 100644 --- a/lnet/lnet/api-eq.c +++ b/lnet/lnet/api-eq.c @@ -81,12 +81,6 @@ int PtlEQGet(ptl_handle_eq_t eventq, ptl_event_t * ev) *ev = *new_event; - /* Set the unlinked_me interface number if there is one to pass - * back, since the NAL hasn't a clue what it is and therefore can't - * set it. 
*/ - if (!PtlHandleEqual (ev->unlinked_me, PTL_HANDLE_NONE)) - ev->unlinked_me.nal_idx = eventq.nal_idx; - /* ensure event is delivered correctly despite possible races with lib_finalize */ if (eq->sequence != new_event->sequence) { @@ -119,6 +113,7 @@ int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t *event_out) } #ifndef __KERNEL__ +#if 0 static jmp_buf eq_jumpbuf; static void eq_timeout(int signal) @@ -162,6 +157,46 @@ int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out, return rc; } +#else +#include -#endif +/* FIXME + * Here timeout need a trick with tcpnal, definitely unclean but OK for + * this moment. + */ + +/* global variables defined by tcpnal */ +extern int __tcpnal_eqwait_timeout_value; +extern int __tcpnal_eqwait_timedout; + +int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out, + int timeout) +{ + int rc; + if (!timeout) + return PtlEQWait(eventq_in, event_out); + + __tcpnal_eqwait_timeout_value = timeout; + + while ((rc = PtlEQGet(eventq_in, event_out)) == PTL_EQ_EMPTY) { + nal_t *nal = ptl_hndl2nal(&eventq_in); + + if (nal->yield) + nal->yield(nal); + + if (__tcpnal_eqwait_timedout) { + if (__tcpnal_eqwait_timedout != ETIMEDOUT) + printf("Warning: yield return error %d\n", + __tcpnal_eqwait_timedout); + rc = PTL_EQ_EMPTY; + break; + } + } + + __tcpnal_eqwait_timeout_value = 0; + + return rc; +} +#endif +#endif /* __KERNEL__ */ diff --git a/lnet/lnet/api-errno.c b/lnet/lnet/api-errno.c index 026c93b..b5e7aa1 100644 --- a/lnet/lnet/api-errno.c +++ b/lnet/lnet/api-errno.c @@ -50,6 +50,5 @@ const char *ptl_err_str[] = { "PTL_IOV_TOO_SMALL", "PTL_EQ_INUSE", - "PTL_MD_INUSE" }; /* If you change these, you must update the number table in portals/errno.h */ diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index b2e069e..18eea91 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -125,7 +125,7 @@ int PtlNIInit(ptl_interface_t interface, ptl_pt_index_t ptl_size, if (ptl_interfaces[i] == nal) { nal->refct++; handle->nal_idx = (NI_HANDLE_MAGIC & ~NI_HANDLE_MASK) | i; - fprintf(stderr, "Returning existing NAL (%d)\n", i); + CDEBUG(D_OTHER, "Returning existing NAL (%d)\n", i); ptl_ni_init_mutex_exit (); return PTL_OK; } diff --git a/lnet/lnet/api-wrap.c b/lnet/lnet/api-wrap.c index e54707f..d23a6aa 100644 --- a/lnet/lnet/api-wrap.c +++ b/lnet/lnet/api-wrap.c @@ -32,7 +32,7 @@ static int do_forward(ptl_handle_any_t any_h, int cmd, void *argbuf, nal_t *nal; if (!ptl_init) { - fprintf(stderr, "PtlGetId: Not initialized\n"); + CERROR("Not initialized\n"); return PTL_NOINIT; } @@ -262,7 +262,7 @@ static int validate_md(ptl_handle_any_t current_in, ptl_md_t md_in) int i; if (!ptl_init) { - fprintf(stderr, "PtlMDAttach/Bind/Update: Not initialized\n"); + CERROR("PtlMDAttach/Bind/Update: Not initialized\n"); return PTL_NOINIT; } diff --git a/lnet/lnet/lib-init.c b/lnet/lnet/lib-init.c index 0765498..d4d8860 100644 --- a/lnet/lnet/lib-init.c +++ b/lnet/lnet/lib-init.c @@ -38,31 +38,17 @@ # include #endif -#ifndef PTL_USE_DESC_LISTS -static int ptl_slab_users; - -atomic_t md_in_use_count = ATOMIC_INIT(0); -atomic_t msg_in_use_count = ATOMIC_INIT(0); -atomic_t me_in_use_count = ATOMIC_INIT(0); -atomic_t eq_in_use_count = ATOMIC_INIT(0); +#ifndef PTL_USE_LIB_FREELIST int kportal_descriptor_setup (nal_cb_t *nal) { - ptl_slab_users++; - RETURN(PTL_OK); + return PTL_OK; } void kportal_descriptor_cleanup (nal_cb_t *nal) { - if (--ptl_slab_users != 0) - return; - - LASSERT (atomic_read (&md_in_use_count) == 0); - LASSERT (atomic_read 
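
[The replacement PtlEQWait_timeout() above polls the event queue and yields between polls, bailing out when the NAL-side timed wait reports ETIMEDOUT through a shared global -- a workaround the patch itself flags as unclean. A runnable toy with stand-ins for PtlEQGet() and nal->yield(), showing just the loop's shape:]

#include <errno.h>
#include <stdio.h>

enum { PTL_EQ_EMPTY = 0, PTL_EQ_OK = 1 };

static int timedout;                    /* models __tcpnal_eqwait_timedout */

static int poll_eq(void)                /* stand-in for PtlEQGet(): always empty */
{
        return PTL_EQ_EMPTY;
}

static void yield_with_timeout(void)    /* stand-in for nal->yield(): give up */
{
        timedout = ETIMEDOUT;
}

static int eq_wait_timeout(void)
{
        int rc;

        while ((rc = poll_eq()) == PTL_EQ_EMPTY) {
                yield_with_timeout();
                if (timedout) {
                        if (timedout != ETIMEDOUT)
                                printf("yield error %d\n", timedout);
                        rc = PTL_EQ_EMPTY;      /* report as a timeout */
                        break;
                }
        }
        return rc;
}

int main(void)
{
        printf("rc=%d (empty => timed out)\n", eq_wait_timeout());
        return 0;
}
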
(&me_in_use_count) == 0); - LASSERT (atomic_read (&eq_in_use_count) == 0); - LASSERT (atomic_read (&msg_in_use_count) == 0); } #else diff --git a/lnet/lnet/lib-md.c b/lnet/lnet/lib-md.c index be6949c..a1ed583 100644 --- a/lnet/lnet/lib-md.c +++ b/lnet/lnet/lib-md.c @@ -83,7 +83,7 @@ static int lib_md_build(nal_cb_t *nal, lib_md_t *new, void *private, int rc; int i; - /* NB we are passes an allocated, but uninitialised/active md. + /* NB we are passed an allocated, but uninitialised/active md. * if we return success, caller may lib_md_unlink() it. * otherwise caller may only lib_md_free() it. */ @@ -94,9 +94,10 @@ static int lib_md_build(nal_cb_t *nal, lib_md_t *new, void *private, return PTL_INV_EQ; } - if ((md->options & PTL_MD_IOV) != 0 && /* discontiguous MD */ - md->niov > PTL_MD_MAX_IOV) /* too many fragments */ - return PTL_IOV_TOO_MANY; + /* Must check this _before_ allocation. Also, note that non-iov + * MDs must set md_niov to 0. */ + LASSERT((md->options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0 || + md->niov <= PTL_MD_MAX_IOV); if ((md->options & max_size_opts) != 0 && /* max size used */ (md->max_size < 0 || md->max_size > md->length)) // illegal max_size @@ -239,7 +240,11 @@ int do_PtlMDAttach(nal_cb_t * nal, void *private, void *v_args, void *v_ret) lib_md_t *md; unsigned long flags; - md = lib_md_alloc (nal); + if ((args->md_in.options & (PTL_MD_KIOV | PTL_MD_IOV)) != 0 && + args->md_in.niov > PTL_MD_MAX_IOV) /* too many fragments */ + return (ret->rc = PTL_IOV_TOO_MANY); + + md = lib_md_alloc(nal, &args->md_in); if (md == NULL) return (ret->rc = PTL_NOSPACE); @@ -287,7 +292,11 @@ int do_PtlMDBind(nal_cb_t * nal, void *private, void *v_args, void *v_ret) lib_md_t *md; unsigned long flags; - md = lib_md_alloc (nal); + if ((args->md_in.options & (PTL_MD_KIOV | PTL_MD_IOV)) != 0 && + args->md_in.niov > PTL_MD_MAX_IOV) /* too many fragments */ + return (ret->rc = PTL_IOV_TOO_MANY); + + md = lib_md_alloc(nal, &args->md_in); if (md == NULL) return (ret->rc = PTL_NOSPACE); @@ -311,34 +320,43 @@ int do_PtlMDBind(nal_cb_t * nal, void *private, void *v_args, void *v_ret) int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret) { - PtlMDUnlink_in *args = v_args; + PtlMDUnlink_in *args = v_args; PtlMDUnlink_out *ret = v_ret; - - lib_md_t *md; - unsigned long flags; + ptl_event_t ev; + lib_md_t *md; + unsigned long flags; state_lock(nal, &flags); md = ptl_handle2md(&args->md_in, nal); if (md == NULL) { - ret->rc = PTL_INV_MD; - } else if (md->pending != 0) { /* being filled/spilled */ - ret->rc = PTL_MD_INUSE; - } else { - /* Callers attempting to unlink a busy MD which will get - * unlinked once the net op completes should see INUSE, - * before completion and INV_MD thereafter. LASSERT we've - * got that right... */ - LASSERT ((md->md_flags & PTL_MD_FLAG_UNLINK) == 0); - - lib_md_deconstruct(nal, md, &ret->status_out); - lib_md_unlink(nal, md); - ret->rc = PTL_OK; + state_unlock(nal, &flags); + return (ret->rc = PTL_INV_MD); + } + + /* If the MD is busy, lib_md_unlink just marks it for deletion, and + * when the NAL is done, the completion event flags that the MD was + * unlinked. Otherwise, we enqueue an event now... 
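
[do_PtlMDAttach()/do_PtlMDBind() above now reject an oversized fragment count before calling lib_md_alloc(nal, &args->md_in). The hunks do not show lib_md_alloc() itself; assuming it sizes the allocation from md_in.niov -- which is what would make the check-before-alloc ordering matter -- the pattern looks like this sketch with a flexible array member:]

#include <stdlib.h>
#include <sys/uio.h>

#define MAX_IOV 16                      /* stands in for PTL_MD_MAX_IOV */

struct md_sketch {
        int          niov;
        struct iovec iov[];             /* fragments allocated inline */
};

static struct md_sketch *
md_alloc_sketch(int niov)
{
        if (niov > MAX_IOV)             /* caller maps this to PTL_IOV_TOO_MANY */
                return NULL;

        return malloc(sizeof(struct md_sketch) +
                      (size_t)niov * sizeof(struct iovec));
}

int main(void)
{
        struct md_sketch *md = md_alloc_sketch(4);

        if (md != NULL) {
                md->niov = 4;
                free(md);
        }
        return md_alloc_sketch(MAX_IOV + 1) == NULL ? 0 : 1;
}
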
*/ + + if (md->eq != NULL && + md->pending == 0) { + memset(&ev, 0, sizeof(ev)); + + ev.type = PTL_EVENT_UNLINK; + ev.status = PTL_OK; + ev.unlinked = 1; + lib_md_deconstruct(nal, md, &ev.mem_desc); + + lib_enq_event_locked(nal, private, md->eq, &ev); } + lib_md_deconstruct(nal, md, &ret->status_out); + lib_md_unlink(nal, md); + ret->rc = PTL_OK; + state_unlock(nal, &flags); - return (ret->rc); + return (PTL_OK); } int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *v_args, @@ -379,6 +397,23 @@ int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *v_args, goto out; } + /* XXX fttb, the new MD must be the same type wrt fragmentation */ + if (((new->options ^ md->options) & + (PTL_MD_IOV | PTL_MD_KIOV)) != 0) { + ret->rc = PTL_INV_MD; + goto out; + } + + if (new->niov > md->md_niov) { + ret->rc = PTL_IOV_TOO_MANY; + goto out; + } + + if (new->niov < md->md_niov) { + ret->rc = PTL_IOV_TOO_SMALL; + goto out; + } + if (!PtlHandleEqual (args->testq_in, PTL_EQ_NONE)) { test_eq = ptl_handle2eq(&args->testq_in, nal); if (test_eq == NULL) { diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index d844a7a..ecd543c 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -258,55 +258,78 @@ lib_iov_nob (int niov, struct iovec *iov) } void -lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len) +lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, + ptl_size_t offset, ptl_size_t len) { ptl_size_t nob; - while (len > 0) - { + if (len == 0) + return; + + /* skip complete frags before 'offset' */ + LASSERT (niov > 0); + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + iov++; + niov--; + LASSERT (niov > 0); + } + + do { LASSERT (niov > 0); - nob = MIN (iov->iov_len, len); - memcpy (dest, iov->iov_base, nob); + nob = MIN (iov->iov_len - offset, len); + memcpy (dest, iov->iov_base + offset, nob); len -= nob; dest += nob; niov--; iov++; - } + offset = 0; + } while (len > 0); } void -lib_copy_buf2iov (int niov, struct iovec *iov, char *src, ptl_size_t len) +lib_copy_buf2iov (int niov, struct iovec *iov, ptl_size_t offset, + char *src, ptl_size_t len) { ptl_size_t nob; - while (len > 0) - { + if (len == 0) + return; + + /* skip complete frags before 'offset' */ + LASSERT (niov > 0); + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + iov++; + niov--; LASSERT (niov > 0); - nob = MIN (iov->iov_len, len); - memcpy (iov->iov_base, src, nob); + } + + do { + LASSERT (niov > 0); + nob = MIN (iov->iov_len - offset, len); + memcpy (iov->iov_base + offset, src, nob); len -= nob; src += nob; niov--; iov++; - } + offset = 0; + } while (len > 0); } -static int -lib_extract_iov (struct iovec *dst, lib_md_t *md, +int +lib_extract_iov (int dst_niov, struct iovec *dst, + int src_niov, struct iovec *src, ptl_size_t offset, ptl_size_t len) { /* Initialise 'dst' to the subset of 'src' starting at 'offset', * for exactly 'len' bytes, and return the number of entries. 
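
[The do_PtlMDUpdate() guard above uses an XOR-and-mask to demand that the old and new MDs agree on their fragmentation kind. A tiny demonstration of the idiom; the bit values are illustrative, not the real header constants:]

#include <assert.h>

#define PTL_MD_IOV   0x1                /* illustrative values */
#define PTL_MD_KIOV  0x2

/* (a ^ b) has a bit set exactly where a and b disagree, so masking the
 * XOR with the two fragmentation bits asks "same fragmentation kind?"
 * in one expression. */
static int
same_frag_kind(unsigned old_opts, unsigned new_opts)
{
        return ((old_opts ^ new_opts) & (PTL_MD_IOV | PTL_MD_KIOV)) == 0;
}

int main(void)
{
        assert( same_frag_kind(PTL_MD_IOV,  PTL_MD_IOV));
        assert(!same_frag_kind(PTL_MD_IOV,  PTL_MD_KIOV));
        assert(!same_frag_kind(PTL_MD_KIOV, 0));
        return 0;
}
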
* NB not destructive to 'src' */ - int src_niov = md->md_niov; - struct iovec *src = md->md_iov.iov; ptl_size_t frag_len; - int dst_niov; + int niov; - LASSERT (offset + len <= md->length); - if (len == 0) /* no data => */ return (0); /* no frags */ @@ -318,17 +341,17 @@ lib_extract_iov (struct iovec *dst, lib_md_t *md, LASSERT (src_niov > 0); } - dst_niov = 1; + niov = 1; for (;;) { LASSERT (src_niov > 0); - LASSERT (dst_niov <= PTL_MD_MAX_IOV); + LASSERT (niov <= dst_niov); frag_len = src->iov_len - offset; dst->iov_base = ((char *)src->iov_base) + offset; if (len <= frag_len) { dst->iov_len = len; - return (dst_niov); + return (niov); } dst->iov_len = frag_len; @@ -336,7 +359,7 @@ lib_extract_iov (struct iovec *dst, lib_md_t *md, len -= frag_len; dst++; src++; - dst_niov++; + niov++; src_niov--; offset = 0; } @@ -351,19 +374,22 @@ lib_kiov_nob (int niov, ptl_kiov_t *kiov) } void -lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len) +lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, + ptl_size_t offset, ptl_size_t len) { LASSERT (0); } void -lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *dest, ptl_size_t len) +lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset, + char *src, ptl_size_t len) { LASSERT (0); } -static int -lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md, +int +lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, + int src_niov, ptl_kiov_t *src, ptl_size_t offset, ptl_size_t len) { LASSERT (0); @@ -383,18 +409,30 @@ lib_kiov_nob (int niov, ptl_kiov_t *kiov) } void -lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len) +lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, + ptl_size_t offset, ptl_size_t len) { ptl_size_t nob; char *addr; + + if (len == 0) + return; LASSERT (!in_interrupt ()); - while (len > 0) - { + + LASSERT (niov > 0); + while (offset > kiov->kiov_len) { + offset -= kiov->kiov_len; + kiov++; + niov--; + LASSERT (niov > 0); + } + + do{ LASSERT (niov > 0); - nob = MIN (kiov->kiov_len, len); + nob = MIN (kiov->kiov_len - offset, len); - addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset; + addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset; memcpy (dest, addr, nob); kunmap (kiov->kiov_page); @@ -402,22 +440,35 @@ lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len) dest += nob; niov--; kiov++; - } + offset = 0; + } while (len > 0); } void -lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *src, ptl_size_t len) +lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset, + char *src, ptl_size_t len) { ptl_size_t nob; char *addr; + if (len == 0) + return; + LASSERT (!in_interrupt ()); - while (len > 0) - { + + LASSERT (niov > 0); + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + kiov++; + niov--; + LASSERT (niov > 0); + } + + do { LASSERT (niov > 0); - nob = MIN (kiov->kiov_len, len); + nob = MIN (kiov->kiov_len - offset, len); - addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset; + addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset; memcpy (addr, src, nob); kunmap (kiov->kiov_page); @@ -425,23 +476,21 @@ lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *src, ptl_size_t len) src += nob; niov--; kiov++; - } + offset = 0; + } while (len > 0); } -static int -lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md, +int +lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, + int src_niov, ptl_kiov_t *src, ptl_size_t offset, ptl_size_t len) { /* Initialise 'dst' to the subset of 'src' starting at 
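
[The kernel kiov copies above gain the same skip-then-copy offset handling as the iovec versions, with kmap()/kunmap() around each page. One nit worth flagging: lib_copy_kiov2buf() skips with "offset > kiov->kiov_len" while lib_copy_buf2kiov() uses ">="; ">=" looks like the intended test, since a fragment is fully consumed when the offset equals its length, and the user-space analogue below uses it. kmap()/kunmap() have no user-space meaning and are elided:]

#include <assert.h>
#include <string.h>

struct kiov_sketch {                    /* models ptl_kiov_t */
        char   *kiov_page;
        size_t  kiov_len;
        size_t  kiov_offset;
};

static void
copy_kiov2buf(char *dest, int niov, const struct kiov_sketch *kiov,
              size_t offset, size_t len)
{
        size_t nob;

        if (len == 0)
                return;

        /* skip complete frags before 'offset' */
        assert(niov > 0);
        while (offset >= kiov->kiov_len) {
                offset -= kiov->kiov_len;
                kiov++;
                niov--;
                assert(niov > 0);
        }

        do {
                assert(niov > 0);
                nob = kiov->kiov_len - offset;
                if (nob > len)
                        nob = len;
                memcpy(dest, kiov->kiov_page + kiov->kiov_offset + offset, nob);
                len  -= nob;
                dest += nob;
                niov--;
                kiov++;
                offset = 0;             /* only the first frag is partial */
        } while (len > 0);
}

int main(void)
{
        char p0[] = "01234567", p1[] = "89abcdef";
        struct kiov_sketch kiov[2] = {
                { p0, 4, 2 },           /* bytes "2345" */
                { p1, 4, 0 },           /* bytes "89ab" */
        };
        char out[6];

        copy_kiov2buf(out, 2, kiov, 3, 5);
        assert(memcmp(out, "589ab", 5) == 0);
        return 0;
}
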
'offset', * for exactly 'len' bytes, and return the number of entries. * NB not destructive to 'src' */ - int src_niov = md->md_niov; - ptl_kiov_t *src = md->md_iov.kiov; ptl_size_t frag_len; - int dst_niov; + int niov; - LASSERT (offset + len <= md->length); - if (len == 0) /* no data => */ return (0); /* no frags */ @@ -453,10 +502,10 @@ lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md, LASSERT (src_niov > 0); } - dst_niov = 1; + niov = 1; for (;;) { LASSERT (src_niov > 0); - LASSERT (dst_niov <= PTL_MD_MAX_IOV); + LASSERT (niov <= dst_niov); frag_len = src->kiov_len - offset; dst->kiov_page = src->kiov_page; @@ -465,7 +514,7 @@ lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md, if (len <= frag_len) { dst->kiov_len = len; LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_SIZE); - return (dst_niov); + return (niov); } dst->kiov_len = frag_len; @@ -474,73 +523,66 @@ lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md, len -= frag_len; dst++; src++; - dst_niov++; + niov++; src_niov--; offset = 0; } } #endif -void +ptl_err_t lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen) { - int niov; - if (mlen == 0) - nal->cb_recv (nal, private, msg, 0, NULL, 0, rlen); - else if ((md->options & PTL_MD_KIOV) == 0) { - niov = lib_extract_iov (msg->msg_iov.iov, md, offset, mlen); - nal->cb_recv (nal, private, msg, - niov, msg->msg_iov.iov, mlen, rlen); - } else { - niov = lib_extract_kiov (msg->msg_iov.kiov, md, offset, mlen); - nal->cb_recv_pages (nal, private, msg, - niov, msg->msg_iov.kiov, mlen, rlen); - } + return (nal->cb_recv(nal, private, msg, + 0, NULL, + offset, mlen, rlen)); + + if ((md->options & PTL_MD_KIOV) == 0) + return (nal->cb_recv(nal, private, msg, + md->md_niov, md->md_iov.iov, + offset, mlen, rlen)); + + return (nal->cb_recv_pages(nal, private, msg, + md->md_niov, md->md_iov.kiov, + offset, mlen, rlen)); } -int +ptl_err_t lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg, ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, lib_md_t *md, ptl_size_t offset, ptl_size_t len) { - int niov; - if (len == 0) - return (nal->cb_send (nal, private, msg, - hdr, type, nid, pid, - 0, NULL, 0)); + return (nal->cb_send(nal, private, msg, + hdr, type, nid, pid, + 0, NULL, + offset, len)); - if ((md->options & PTL_MD_KIOV) == 0) { - niov = lib_extract_iov (msg->msg_iov.iov, md, offset, len); - return (nal->cb_send (nal, private, msg, - hdr, type, nid, pid, - niov, msg->msg_iov.iov, len)); - } - - niov = lib_extract_kiov (msg->msg_iov.kiov, md, offset, len); - return (nal->cb_send_pages (nal, private, msg, - hdr, type, nid, pid, - niov, msg->msg_iov.kiov, len)); + if ((md->options & PTL_MD_KIOV) == 0) + return (nal->cb_send(nal, private, msg, + hdr, type, nid, pid, + md->md_niov, md->md_iov.iov, + offset, len)); + + return (nal->cb_send_pages(nal, private, msg, + hdr, type, nid, pid, + md->md_niov, md->md_iov.kiov, + offset, len)); } -static lib_msg_t * -get_new_msg (nal_cb_t *nal, lib_md_t *md) +static void +lib_commit_md (nal_cb_t *nal, lib_md_t *md, lib_msg_t *msg) { /* ALWAYS called holding the state_lock */ lib_counters_t *counters = &nal->ni.counters; - lib_msg_t *msg = lib_msg_alloc (nal); - - if (msg == NULL) - return (NULL); - - memset (msg, 0, sizeof (*msg)); - - msg->send_ack = 0; + /* Here, we commit the MD to a network OP by marking it busy and + * decrementing its threshold. Come what may, the network "owns" + * the MD until a call to lib_finalize() signals completion. 
*/ msg->md = md; - do_gettimeofday(&msg->ev.arrival_time); + md->pending++; if (md->threshold != PTL_MD_THRESH_INF) { LASSERT (md->threshold > 0); @@ -552,8 +594,24 @@ get_new_msg (nal_cb_t *nal, lib_md_t *md) counters->msgs_max = counters->msgs_alloc; list_add (&msg->msg_list, &nal->ni.ni_active_msgs); +} - return (msg); +static void +lib_drop_message (nal_cb_t *nal, void *private, ptl_hdr_t *hdr) +{ + unsigned long flags; + + /* CAVEAT EMPTOR: this only drops messages that we've not committed + * to receive (init_msg() not called) and therefore can't cause an + * event. */ + + state_lock(nal, &flags); + nal->ni.counters.drop_count++; + nal->ni.counters.drop_length += hdr->payload_length; + state_unlock(nal, &flags); + + /* NULL msg => if NAL calls lib_finalize it will be a noop */ + (void) lib_recv(nal, private, NULL, NULL, 0, 0, hdr->payload_length); } /* @@ -563,17 +621,18 @@ get_new_msg (nal_cb_t *nal, lib_md_t *md) * of long messages. * */ -static int parse_put(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) +static ptl_err_t +parse_put(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) { lib_ni_t *ni = &nal->ni; ptl_size_t mlength = 0; ptl_size_t offset = 0; int unlink = 0; + ptl_err_t rc; lib_me_t *me; lib_md_t *md; - lib_msg_t *msg; unsigned long flags; - + /* Convert put fields to host byte order */ hdr->msg.put.match_bits = NTOH__u64 (hdr->msg.put.match_bits); hdr->msg.put.ptl_index = NTOH__u32 (hdr->msg.put.ptl_index); @@ -586,8 +645,10 @@ static int parse_put(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) hdr->payload_length, hdr->msg.put.offset, hdr->msg.put.match_bits, &mlength, &offset, &unlink); - if (me == NULL) - goto drop; + if (me == NULL) { + state_unlock(nal, &flags); + return (PTL_FAIL); + } md = me->md; CDEBUG(D_NET, "Incoming put index %x from "LPU64"/%u of length %d/%d " @@ -595,69 +656,46 @@ static int parse_put(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) hdr->src_nid, hdr->src_pid, mlength, hdr->payload_length, md->md_lh.lh_cookie, md->md_niov, offset); - msg = get_new_msg (nal, md); - if (msg == NULL) { - CERROR(LPU64": Dropping PUT from "LPU64": can't allocate msg\n", - ni->nid, hdr->src_nid); - goto drop; - } + lib_commit_md(nal, md, msg); + + msg->ev.type = PTL_EVENT_PUT; + msg->ev.initiator.nid = hdr->src_nid; + msg->ev.initiator.pid = hdr->src_pid; + msg->ev.portal = hdr->msg.put.ptl_index; + msg->ev.match_bits = hdr->msg.put.match_bits; + msg->ev.rlength = hdr->payload_length; + msg->ev.mlength = mlength; + msg->ev.offset = offset; + msg->ev.hdr_data = hdr->msg.put.hdr_data; + + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); if (!ptl_is_wire_handle_none(&hdr->msg.put.ack_wmd) && !(md->options & PTL_MD_ACK_DISABLE)) { - msg->send_ack = 1; msg->ack_wmd = hdr->msg.put.ack_wmd; - msg->nid = hdr->src_nid; - msg->pid = hdr->src_pid; - msg->ev.match_bits = hdr->msg.put.match_bits; - } - - if (md->eq) { - msg->ev.type = PTL_EVENT_PUT; - msg->ev.initiator.nid = hdr->src_nid; - msg->ev.initiator.pid = hdr->src_pid; - msg->ev.portal = hdr->msg.put.ptl_index; - msg->ev.match_bits = hdr->msg.put.match_bits; - msg->ev.rlength = hdr->payload_length; - msg->ev.mlength = mlength; - msg->ev.offset = offset; - msg->ev.hdr_data = hdr->msg.put.hdr_data; - - /* NB if this match has exhausted the MD, we can't be sure - * that this event will the the last one associated with - * this MD in the event queue (another message already - * matching this ME/MD could end up being last). 
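
[lib_drop_message() above charges the drop counters and then issues a zero-iov lib_recv() with a NULL msg, so the NAL still consumes payload_length bytes off the wire. Over a plain socket, servicing that amounts to read-and-discard, as in this sketch:]

#include <unistd.h>

static int
discard_bytes(int fd, size_t rlen)
{
        char    buf[256];
        ssize_t n;

        while (rlen > 0) {
                n = read(fd, buf, rlen < sizeof(buf) ? rlen : sizeof(buf));
                if (n <= 0)
                        return -1;      /* EOF or error */
                rlen -= (size_t)n;
        }
        return 0;
}
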
So we - * remember the ME handle anyway and check again when we're - * allocating our slot in the event queue. - */ - ptl_me2handle (&msg->ev.unlinked_me, me); - - lib_md_deconstruct(nal, md, &msg->ev.mem_desc); } ni->counters.recv_count++; ni->counters.recv_length += mlength; - /* only unlink after MD's pending count has been bumped - * in get_new_msg() otherwise lib_me_unlink() will nuke it */ - if (unlink) { - md->md_flags |= PTL_MD_FLAG_AUTO_UNLINKED; + /* only unlink after MD's pending count has been bumped in + * lib_commit_md() otherwise lib_me_unlink() will nuke it */ + if (unlink) lib_me_unlink (nal, me); - } state_unlock(nal, &flags); - lib_recv (nal, private, msg, md, offset, mlength, hdr->payload_length); - return 0; + rc = lib_recv(nal, private, msg, md, offset, mlength, + hdr->payload_length); + if (rc != PTL_OK) + CERROR(LPU64": error on receiving PUT from "LPU64": %d\n", + ni->nid, hdr->src_nid, rc); - drop: - nal->ni.counters.drop_count++; - nal->ni.counters.drop_length += hdr->payload_length; - state_unlock (nal, &flags); - lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length); - return -1; + return (rc); } -static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) +static ptl_err_t +parse_get(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) { lib_ni_t *ni = &nal->ni; ptl_size_t mlength = 0; @@ -665,7 +703,6 @@ static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) int unlink = 0; lib_me_t *me; lib_md_t *md; - lib_msg_t *msg; ptl_hdr_t reply; unsigned long flags; int rc; @@ -683,8 +720,10 @@ static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) hdr->msg.get.sink_length, hdr->msg.get.src_offset, hdr->msg.get.match_bits, &mlength, &offset, &unlink); - if (me == NULL) - goto drop; + if (me == NULL) { + state_unlock(nal, &flags); + return (PTL_FAIL); + } md = me->md; CDEBUG(D_NET, "Incoming get index %d from "LPU64".%u of length %d/%d " @@ -692,45 +731,27 @@ static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) hdr->src_nid, hdr->src_pid, mlength, hdr->payload_length, md->md_lh.lh_cookie, md->md_niov, offset); - msg = get_new_msg (nal, md); - if (msg == NULL) { - CERROR(LPU64": Dropping GET from "LPU64": can't allocate msg\n", - ni->nid, hdr->src_nid); - goto drop; - } + lib_commit_md(nal, md, msg); - if (md->eq) { - msg->ev.type = PTL_EVENT_GET; - msg->ev.initiator.nid = hdr->src_nid; - msg->ev.initiator.pid = hdr->src_pid; - msg->ev.portal = hdr->msg.get.ptl_index; - msg->ev.match_bits = hdr->msg.get.match_bits; - msg->ev.rlength = hdr->payload_length; - msg->ev.mlength = mlength; - msg->ev.offset = offset; - msg->ev.hdr_data = 0; - - /* NB if this match has exhausted the MD, we can't be sure - * that this event will the the last one associated with - * this MD in the event queue (another message already - * matching this ME/MD could end up being last). So we - * remember the ME handle anyway and check again when we're - * allocating our slot in the event queue. 
- */ - ptl_me2handle (&msg->ev.unlinked_me, me); - - lib_md_deconstruct(nal, md, &msg->ev.mem_desc); - } + msg->ev.type = PTL_EVENT_GET; + msg->ev.initiator.nid = hdr->src_nid; + msg->ev.initiator.pid = hdr->src_pid; + msg->ev.portal = hdr->msg.get.ptl_index; + msg->ev.match_bits = hdr->msg.get.match_bits; + msg->ev.rlength = hdr->payload_length; + msg->ev.mlength = mlength; + msg->ev.offset = offset; + msg->ev.hdr_data = 0; + + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); ni->counters.send_count++; ni->counters.send_length += mlength; - /* only unlink after MD's refcount has been bumped - * in get_new_msg() otherwise lib_me_unlink() will nuke it */ - if (unlink) { - md->md_flags |= PTL_MD_FLAG_AUTO_UNLINKED; + /* only unlink after MD's refcount has been bumped in + * lib_commit_md() otherwise lib_me_unlink() will nuke it */ + if (unlink) lib_me_unlink (nal, me); - } state_unlock(nal, &flags); @@ -749,36 +770,25 @@ static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) rc = lib_send (nal, private, msg, &reply, PTL_MSG_REPLY, hdr->src_nid, hdr->src_pid, md, offset, mlength); - if (rc != PTL_OK) { - CERROR(LPU64": Dropping GET from "LPU64": send REPLY failed\n", - ni->nid, hdr->src_nid); - /* Hmm, this will create a GET event and make believe - * the reply completed, which it kind of did, only the - * source won't get her reply */ - lib_finalize (nal, private, msg); - state_lock (nal, &flags); - goto drop; - } + if (rc != PTL_OK) + CERROR(LPU64": Unable to send REPLY for GET from "LPU64": %d\n", + ni->nid, hdr->src_nid, rc); + + /* Discard any junk after the hdr */ + (void) lib_recv(nal, private, NULL, NULL, 0, 0, hdr->payload_length); - /* Complete the incoming message */ - lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length); return (rc); - drop: - ni->counters.drop_count++; - ni->counters.drop_length += hdr->msg.get.sink_length; - state_unlock(nal, &flags); - lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length); - return -1; } -static int parse_reply(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) +static ptl_err_t +parse_reply(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) { lib_ni_t *ni = &nal->ni; lib_md_t *md; int rlength; int length; - lib_msg_t *msg; unsigned long flags; + ptl_err_t rc; state_lock(nal, &flags); @@ -790,7 +800,9 @@ static int parse_reply(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) md == NULL ? 
"invalid" : "inactive", hdr->msg.reply.dst_wmd.wh_interface_cookie, hdr->msg.reply.dst_wmd.wh_object_cookie); - goto drop; + + state_unlock(nal, &flags); + return (PTL_FAIL); } LASSERT (md->offset == 0); @@ -804,7 +816,8 @@ static int parse_reply(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) ni->nid, hdr->src_nid, length, hdr->msg.reply.dst_wmd.wh_object_cookie, md->length); - goto drop; + state_unlock(nal, &flags); + return (PTL_FAIL); } length = md->length; } @@ -813,46 +826,36 @@ static int parse_reply(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) hdr->src_nid, length, rlength, hdr->msg.reply.dst_wmd.wh_object_cookie); - msg = get_new_msg (nal, md); - if (msg == NULL) { - CERROR(LPU64": Dropping REPLY from "LPU64": can't " - "allocate msg\n", ni->nid, hdr->src_nid); - goto drop; - } + lib_commit_md(nal, md, msg); - if (md->eq) { - msg->ev.type = PTL_EVENT_REPLY; - msg->ev.initiator.nid = hdr->src_nid; - msg->ev.initiator.pid = hdr->src_pid; - msg->ev.rlength = rlength; - msg->ev.mlength = length; - msg->ev.offset = 0; + msg->ev.type = PTL_EVENT_REPLY; + msg->ev.initiator.nid = hdr->src_nid; + msg->ev.initiator.pid = hdr->src_pid; + msg->ev.rlength = rlength; + msg->ev.mlength = length; + msg->ev.offset = 0; - lib_md_deconstruct(nal, md, &msg->ev.mem_desc); - } + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); ni->counters.recv_count++; ni->counters.recv_length += length; state_unlock(nal, &flags); - lib_recv (nal, private, msg, md, 0, length, rlength); - return 0; + rc = lib_recv(nal, private, msg, md, 0, length, rlength); + if (rc != PTL_OK) + CERROR(LPU64": error on receiving REPLY from "LPU64": %d\n", + ni->nid, hdr->src_nid, rc); - drop: - nal->ni.counters.drop_count++; - nal->ni.counters.drop_length += hdr->payload_length; - state_unlock (nal, &flags); - lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length); - return -1; + return (rc); } -static int parse_ack(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) +static ptl_err_t +parse_ack(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) { - lib_ni_t *ni = &nal->ni; - lib_md_t *md; - lib_msg_t *msg = NULL; - unsigned long flags; + lib_ni_t *ni = &nal->ni; + lib_md_t *md; + unsigned long flags; /* Convert ack fields to host byte order */ hdr->msg.ack.match_bits = NTOH__u64 (hdr->msg.ack.match_bits); @@ -868,40 +871,37 @@ static int parse_ack(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) (md == NULL) ? 
"invalid" : "inactive", hdr->msg.ack.dst_wmd.wh_interface_cookie, hdr->msg.ack.dst_wmd.wh_object_cookie); - goto drop; + + state_unlock(nal, &flags); + return (PTL_FAIL); } CDEBUG(D_NET, LPU64": ACK from "LPU64" into md "LPX64"\n", ni->nid, hdr->src_nid, hdr->msg.ack.dst_wmd.wh_object_cookie); - msg = get_new_msg (nal, md); - if (msg == NULL) { - CERROR(LPU64": Dropping ACK from "LPU64": can't allocate msg\n", - ni->nid, hdr->src_nid); - goto drop; - } + lib_commit_md(nal, md, msg); - if (md->eq) { - msg->ev.type = PTL_EVENT_ACK; - msg->ev.initiator.nid = hdr->src_nid; - msg->ev.initiator.pid = hdr->src_pid; - msg->ev.mlength = hdr->msg.ack.mlength; - msg->ev.match_bits = hdr->msg.ack.match_bits; + msg->ev.type = PTL_EVENT_ACK; + msg->ev.initiator.nid = hdr->src_nid; + msg->ev.initiator.pid = hdr->src_pid; + msg->ev.mlength = hdr->msg.ack.mlength; + msg->ev.match_bits = hdr->msg.ack.match_bits; - lib_md_deconstruct(nal, md, &msg->ev.mem_desc); - } + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); ni->counters.recv_count++; - state_unlock(nal, &flags); - lib_recv (nal, private, msg, NULL, 0, 0, hdr->payload_length); - return 0; - drop: - nal->ni.counters.drop_count++; - state_unlock (nal, &flags); - lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length); - return -1; + state_unlock(nal, &flags); + + /* We have received and matched up the ack OK, create the + * completion event now... */ + lib_finalize(nal, private, msg, PTL_OK); + + /* ...and now discard any junk after the hdr */ + (void) lib_recv(nal, private, NULL, NULL, 0, 0, hdr->payload_length); + + return (PTL_OK); } static char * @@ -983,10 +983,13 @@ void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr) } /* end of print_hdr() */ -int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) +void +lib_parse(nal_cb_t *nal, ptl_hdr_t *hdr, void *private) { unsigned long flags; - + ptl_err_t rc; + lib_msg_t *msg; + /* convert common fields to host byte order */ hdr->dest_nid = NTOH__u64 (hdr->dest_nid); hdr->src_nid = NTOH__u64 (hdr->src_nid); @@ -1008,22 +1011,16 @@ int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) nal->ni.nid, mv->magic, mv->version_major, mv->version_minor, hdr->src_nid); - lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length); - return (-1); + lib_drop_message(nal, private, hdr); + return; } if (hdr->dest_nid != nal->ni.nid) { CERROR(LPU64": Dropping %s message from "LPU64" to "LPU64 " (not me)\n", nal->ni.nid, hdr_type_string (hdr), hdr->src_nid, hdr->dest_nid); - - state_lock (nal, &flags); - nal->ni.counters.drop_count++; - nal->ni.counters.drop_length += hdr->payload_length; - state_unlock (nal, &flags); - - lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length); - return (-1); + lib_drop_message(nal, private, hdr); + return; } if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */ @@ -1033,34 +1030,59 @@ int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) ": simulated failure\n", nal->ni.nid, hdr_type_string (hdr), hdr->src_nid); - lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length); - return (-1); + lib_drop_message(nal, private, hdr); + return; } - + + msg = lib_msg_alloc(nal); + if (msg == NULL) { + CERROR(LPU64": Dropping incoming %s from "LPU64 + ": can't allocate a lib_msg_t\n", + nal->ni.nid, hdr_type_string (hdr), + hdr->src_nid); + lib_drop_message(nal, private, hdr); + return; + } + + do_gettimeofday(&msg->ev.arrival_time); + switch (hdr->type) { case PTL_MSG_ACK: - return (parse_ack(nal, hdr, private)); + rc = parse_ack(nal, hdr, 
private, msg); + break; case PTL_MSG_PUT: - return (parse_put(nal, hdr, private)); + rc = parse_put(nal, hdr, private, msg); break; case PTL_MSG_GET: - return (parse_get(nal, hdr, private)); + rc = parse_get(nal, hdr, private, msg); break; case PTL_MSG_REPLY: - return (parse_reply(nal, hdr, private)); + rc = parse_reply(nal, hdr, private, msg); break; default: CERROR(LPU64": Dropping message from "LPU64 ": Bad type=0x%x\n", nal->ni.nid, hdr->src_nid, hdr->type); - - lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length); - return (-1); + rc = PTL_FAIL; + break; + } + + if (rc != PTL_OK) { + if (msg->md != NULL) { + /* committed... */ + lib_finalize(nal, private, msg, rc); + } else { + state_lock(nal, &flags); + lib_msg_free(nal, msg); /* expects state_lock held */ + state_unlock(nal, &flags); + + lib_drop_message(nal, private, hdr); + } } } - -int do_PtlPut(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +int +do_PtlPut(nal_cb_t *nal, void *private, void *v_args, void *v_ret) { /* * Incoming: @@ -1075,16 +1097,15 @@ int do_PtlPut(nal_cb_t * nal, void *private, void *v_args, void *v_ret) * Outgoing: */ - PtlPut_in *args = v_args; - PtlPut_out *ret = v_ret; - ptl_hdr_t hdr; - - lib_ni_t *ni = &nal->ni; - lib_md_t *md; - lib_msg_t *msg = NULL; + PtlPut_in *args = v_args; ptl_process_id_t *id = &args->target_in; - unsigned long flags; - int rc; + PtlPut_out *ret = v_ret; + lib_ni_t *ni = &nal->ni; + lib_msg_t *msg; + ptl_hdr_t hdr; + lib_md_t *md; + unsigned long flags; + int rc; if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */ fail_peer (nal, id->nid, 1)) /* shall we now? */ @@ -1093,13 +1114,22 @@ int do_PtlPut(nal_cb_t * nal, void *private, void *v_args, void *v_ret) nal->ni.nid, id->nid); return (ret->rc = PTL_INV_PROC); } - - ret->rc = PTL_OK; + + msg = lib_msg_alloc(nal); + if (msg == NULL) { + CERROR(LPU64": Dropping PUT to "LPU64": ENOMEM on lib_msg_t\n", + ni->nid, id->nid); + return (ret->rc = PTL_NOSPACE); + } + state_lock(nal, &flags); + md = ptl_handle2md(&args->md_in, nal); - if (md == NULL || !md->threshold) { + if (md == NULL || md->threshold == 0) { + lib_msg_free(nal, msg); state_unlock(nal, &flags); - return ret->rc = PTL_INV_MD; + + return (ret->rc = PTL_INV_MD); } CDEBUG(D_NET, "PtlPut -> %Lu: %lu\n", (unsigned long long)id->nid, @@ -1126,57 +1156,39 @@ int do_PtlPut(nal_cb_t * nal, void *private, void *v_args, void *v_ret) hdr.msg.put.offset = HTON__u32 (args->offset_in); hdr.msg.put.hdr_data = args->hdr_data_in; + lib_commit_md(nal, md, msg); + + msg->ev.type = PTL_EVENT_SENT; + msg->ev.initiator.nid = ni->nid; + msg->ev.initiator.pid = ni->pid; + msg->ev.portal = args->portal_in; + msg->ev.match_bits = args->match_bits_in; + msg->ev.rlength = md->length; + msg->ev.mlength = md->length; + msg->ev.offset = args->offset_in; + msg->ev.hdr_data = args->hdr_data_in; + + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + ni->counters.send_count++; ni->counters.send_length += md->length; - msg = get_new_msg (nal, md); - if (msg == NULL) { - CERROR("BAD: could not allocate msg!\n"); - state_unlock(nal, &flags); - return ret->rc = PTL_NOSPACE; - } - - /* - * If this memory descriptor has an event queue associated with - * it we need to allocate a message state object and record the - * information about this operation that will be recorded into - * event queue once the message has been completed. - * - * NB. We're now committed to the GET, since we just marked the MD - * busy. 
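
[Note the ordering in the rewritten do_PtlPut()/do_PtlGet(): the lib_msg_t is allocated before the state lock is taken, so the only failure left inside the lock -- an invalid or exhausted MD -- unwinds with a free and an unlock. A threads-and-malloc sketch of that shape; the error values are stand-ins for PTL_NOSPACE and PTL_INV_MD:]

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;

struct msg_sketch { int unused; };

static int
put_sketch(int md_is_valid)
{
        struct msg_sketch *msg = malloc(sizeof(*msg));

        if (msg == NULL)
                return -1;              /* PTL_NOSPACE in the patch */

        pthread_mutex_lock(&state_lock);
        if (!md_is_valid) {
                free(msg);              /* lib_msg_free() is called locked */
                pthread_mutex_unlock(&state_lock);
                return -2;              /* PTL_INV_MD in the patch */
        }
        /* ... commit the MD, fill msg->ev ... */
        pthread_mutex_unlock(&state_lock);

        /* send outside the lock; on send failure the patch calls
         * lib_finalize(nal, private, msg, rc) rather than free() */
        return 0;
}
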
Callers who observe this (by getting PTL_MD_INUSE from - * PtlMDUnlink()) expect a completion event to tell them when the - * MD becomes idle. - */ - if (md->eq) { - msg->ev.type = PTL_EVENT_SENT; - msg->ev.initiator.nid = ni->nid; - msg->ev.initiator.pid = ni->pid; - msg->ev.portal = args->portal_in; - msg->ev.match_bits = args->match_bits_in; - msg->ev.rlength = md->length; - msg->ev.mlength = md->length; - msg->ev.offset = args->offset_in; - msg->ev.hdr_data = args->hdr_data_in; - - lib_md_deconstruct(nal, md, &msg->ev.mem_desc); - } - state_unlock(nal, &flags); rc = lib_send (nal, private, msg, &hdr, PTL_MSG_PUT, id->nid, id->pid, md, 0, md->length); if (rc != PTL_OK) { - /* get_new_msg() committed us to sending by decrementing - * md->threshold, so we have to act like we did send, but - * the network dropped it. */ - lib_finalize (nal, private, msg); + CERROR(LPU64": error sending PUT to "LPU64": %d\n", + ni->nid, id->nid, rc); + lib_finalize (nal, private, msg, rc); } + /* completion will be signalled by an event */ return ret->rc = PTL_OK; } -lib_msg_t * lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, - lib_md_t *getmd) +lib_msg_t * +lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, lib_md_t *getmd) { /* The NAL can DMA direct to the GET md (i.e. no REPLY msg). This * returns a msg the NAL can pass to lib_finalize() so that a REPLY @@ -1188,39 +1200,38 @@ lib_msg_t * lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, * lib_finalize() of the original GET. */ lib_ni_t *ni = &nal->ni; - lib_msg_t *msg; + lib_msg_t *msg = lib_msg_alloc(nal); unsigned long flags; state_lock(nal, &flags); LASSERT (getmd->pending > 0); + if (msg == NULL) { + CERROR ("Dropping REPLY from "LPU64": can't allocate msg\n", + peer_nid); + goto drop; + } + if (getmd->threshold == 0) { CERROR ("Dropping REPLY from "LPU64" for inactive MD %p\n", peer_nid, getmd); - goto drop; + goto drop_msg; } LASSERT (getmd->offset == 0); CDEBUG(D_NET, "Reply from "LPU64" md %p\n", peer_nid, getmd); - msg = get_new_msg (nal, getmd); - if (msg == NULL) { - CERROR("Dropping REPLY from "LPU64" md %p: can't allocate msg\n", - peer_nid, getmd); - goto drop; - } + lib_commit_md (nal, getmd, msg); - if (getmd->eq) { - msg->ev.type = PTL_EVENT_REPLY; - msg->ev.initiator.nid = peer_nid; - msg->ev.initiator.pid = 0; /* XXX FIXME!!! */ - msg->ev.rlength = msg->ev.mlength = getmd->length; - msg->ev.offset = 0; + msg->ev.type = PTL_EVENT_REPLY; + msg->ev.initiator.nid = peer_nid; + msg->ev.initiator.pid = 0; /* XXX FIXME!!! 
*/ + msg->ev.rlength = msg->ev.mlength = getmd->length; + msg->ev.offset = 0; - lib_md_deconstruct(nal, getmd, &msg->ev.mem_desc); - } + lib_md_deconstruct(nal, getmd, &msg->ev.mem_desc); ni->counters.recv_count++; ni->counters.recv_length += getmd->length; @@ -1228,7 +1239,9 @@ lib_msg_t * lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, state_unlock(nal, &flags); return msg; - + + drop_msg: + lib_msg_free(nal, msg); drop: nal->ni.counters.drop_count++; nal->ni.counters.drop_length += getmd->length; @@ -1238,7 +1251,8 @@ lib_msg_t * lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, return NULL; } -int do_PtlGet(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +int +do_PtlGet(nal_cb_t *nal, void *private, void *v_args, void *v_ret) { /* * Incoming: @@ -1252,15 +1266,15 @@ int do_PtlGet(nal_cb_t * nal, void *private, void *v_args, void *v_ret) * Outgoing: */ - PtlGet_in *args = v_args; - PtlGet_out *ret = v_ret; - ptl_hdr_t hdr; - lib_msg_t *msg = NULL; - lib_ni_t *ni = &nal->ni; + PtlGet_in *args = v_args; ptl_process_id_t *id = &args->target_in; - lib_md_t *md; - unsigned long flags; - int rc; + PtlGet_out *ret = v_ret; + lib_ni_t *ni = &nal->ni; + lib_msg_t *msg; + ptl_hdr_t hdr; + lib_md_t *md; + unsigned long flags; + int rc; if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */ fail_peer (nal, id->nid, 1)) /* shall we now? */ @@ -1269,16 +1283,24 @@ int do_PtlGet(nal_cb_t * nal, void *private, void *v_args, void *v_ret) nal->ni.nid, id->nid); return (ret->rc = PTL_INV_PROC); } - + + msg = lib_msg_alloc(nal); + if (msg == NULL) { + CERROR(LPU64": Dropping GET to "LPU64": ENOMEM on lib_msg_t\n", + ni->nid, id->nid); + return (ret->rc = PTL_NOSPACE); + } + state_lock(nal, &flags); + md = ptl_handle2md(&args->md_in, nal); if (md == NULL || !md->threshold) { + lib_msg_free(nal, msg); state_unlock(nal, &flags); + return ret->rc = PTL_INV_MD; } - LASSERT (md->offset == 0); - CDEBUG(D_NET, "PtlGet -> %Lu: %lu\n", (unsigned long long)id->nid, (unsigned long)id->pid); @@ -1299,51 +1321,33 @@ int do_PtlGet(nal_cb_t * nal, void *private, void *v_args, void *v_ret) hdr.msg.get.src_offset = HTON__u32 (args->offset_in); hdr.msg.get.sink_length = HTON__u32 (md->length); - ni->counters.send_count++; + lib_commit_md(nal, md, msg); - msg = get_new_msg (nal, md); - if (msg == NULL) { - CERROR("do_PtlGet: BAD - could not allocate cookie!\n"); - state_unlock(nal, &flags); - return ret->rc = PTL_NOSPACE; - } + msg->ev.type = PTL_EVENT_SENT; + msg->ev.initiator.nid = ni->nid; + msg->ev.initiator.pid = ni->pid; + msg->ev.portal = args->portal_in; + msg->ev.match_bits = args->match_bits_in; + msg->ev.rlength = md->length; + msg->ev.mlength = md->length; + msg->ev.offset = args->offset_in; + msg->ev.hdr_data = 0; - /* - * If this memory descriptor has an event queue associated with - * it we must allocate a message state object that will record - * the information to be filled in once the message has been - * completed. More information is in the do_PtlPut() comments. - * - * NB. We're now committed to the GET, since we just marked the MD - * busy. Callers who observe this (by getting PTL_MD_INUSE from - * PtlMDUnlink()) expect a completion event to tell them when the - * MD becomes idle. 
- */ - if (md->eq) { - msg->ev.type = PTL_EVENT_SENT; - msg->ev.initiator.nid = ni->nid; - msg->ev.initiator.pid = ni->pid; - msg->ev.portal = args->portal_in; - msg->ev.match_bits = args->match_bits_in; - msg->ev.rlength = md->length; - msg->ev.mlength = md->length; - msg->ev.offset = args->offset_in; - msg->ev.hdr_data = 0; - - lib_md_deconstruct(nal, md, &msg->ev.mem_desc); - } + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + + ni->counters.send_count++; state_unlock(nal, &flags); rc = lib_send (nal, private, msg, &hdr, PTL_MSG_GET, id->nid, id->pid, NULL, 0, 0); if (rc != PTL_OK) { - /* get_new_msg() committed us to sending by decrementing - * md->threshold, so we have to act like we did send, but - * the network dropped it. */ - lib_finalize (nal, private, msg); + CERROR(LPU64": error sending GET to "LPU64": %d\n", + ni->nid, id->nid, rc); + lib_finalize (nal, private, msg, rc); } + /* completion will be signalled by an event */ return ret->rc = PTL_OK; } diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c index 9840ff5..04c69b1 100644 --- a/lnet/lnet/lib-msg.c +++ b/lnet/lnet/lib-msg.c @@ -32,32 +32,81 @@ #include -int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t *msg) +void +lib_enq_event_locked (nal_cb_t *nal, void *private, + lib_eq_t *eq, ptl_event_t *ev) { - lib_md_t *md; - lib_eq_t *eq; + ptl_event_t *eq_slot; int rc; + + ev->sequence = eq->sequence++; /* Allocate the next queue slot */ + + /* size must be a power of 2 to handle a wrapped sequence # */ + LASSERT (eq->size != 0 && + eq->size == LOWEST_BIT_SET (eq->size)); + eq_slot = eq->base + (ev->sequence & (eq->size - 1)); + + /* Copy the event into the allocated slot, ensuring all the rest of + * the event's contents have been copied _before_ the sequence + * number gets updated. A processes 'getting' an event waits on + * the next queue slot's sequence to be 'new'. When it is, _all_ + * other event fields had better be consistent. I assert + * 'sequence' is the last member, so I only need a 2 stage copy. 
*/ + + LASSERT(sizeof (ptl_event_t) == + offsetof(ptl_event_t, sequence) + sizeof(ev->sequence)); + + rc = nal->cb_write (nal, private, (user_ptr)eq_slot, ev, + offsetof (ptl_event_t, sequence)); + LASSERT (rc == PTL_OK); + +#ifdef __KERNEL__ + barrier(); +#endif + /* Updating the sequence number is what makes the event 'new' NB if + * the cb_write below isn't atomic, this could cause a race with + * PtlEQGet */ + rc = nal->cb_write(nal, private, (user_ptr)&eq_slot->sequence, + (void *)&ev->sequence,sizeof (ev->sequence)); + LASSERT (rc == PTL_OK); + +#ifdef __KERNEL__ + barrier(); +#endif + + if (nal->cb_callback != NULL) + nal->cb_callback(nal, private, eq, ev); + else if (eq->event_callback != NULL) + eq->event_callback(ev); +} + +void +lib_finalize(nal_cb_t *nal, void *private, lib_msg_t *msg, ptl_err_t status) +{ + lib_md_t *md; + int unlink; unsigned long flags; + int rc; + ptl_hdr_t ack; /* ni went down while processing this message */ - if (nal->ni.up == 0) { - return -1; - } + if (nal->ni.up == 0) + return; if (msg == NULL) - return 0; + return; - rc = 0; - if (msg->send_ack) { - ptl_hdr_t ack; + /* Only send an ACK if the PUT completed successfully */ + if (status == PTL_OK && + !ptl_is_wire_handle_none(&msg->ack_wmd)) { - LASSERT (!ptl_is_wire_handle_none (&msg->ack_wmd)); + LASSERT(msg->ev.type == PTL_EVENT_PUT); memset (&ack, 0, sizeof (ack)); ack.type = HTON__u32 (PTL_MSG_ACK); - ack.dest_nid = HTON__u64 (msg->nid); + ack.dest_nid = HTON__u64 (msg->ev.initiator.nid); ack.src_nid = HTON__u64 (nal->ni.nid); - ack.dest_pid = HTON__u32 (msg->pid); + ack.dest_pid = HTON__u32 (msg->ev.initiator.pid); ack.src_pid = HTON__u32 (nal->ni.pid); ack.payload_length = 0; @@ -66,92 +115,35 @@ int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t *msg) ack.msg.ack.mlength = HTON__u32 (msg->ev.mlength); rc = lib_send (nal, private, NULL, &ack, PTL_MSG_ACK, - msg->nid, msg->pid, NULL, 0, 0); - /* If this send fails, there's nothing else to clean up */ + msg->ev.initiator.nid, msg->ev.initiator.pid, + NULL, 0, 0); + if (rc != PTL_OK) { + /* send failed: there's nothing else to clean up. */ + CERROR("Error %d sending ACK to "LPX64"\n", + rc, msg->ev.initiator.nid); + } } md = msg->md; - LASSERT (md->pending > 0); /* I've not dropped my ref yet */ - eq = md->eq; state_lock(nal, &flags); - if (eq != NULL) { - ptl_event_t *ev = &msg->ev; - ptl_event_t *eq_slot; - - /* I have to hold the lock while I bump the sequence number - * and copy the event into the queue. If not, and I was - * interrupted after bumping the sequence number, other - * events could fill the queue, including the slot I just - * allocated to this event. On resuming, I would overwrite - * a more 'recent' event with old event state, and - * processes taking events off the queue would not detect - * overflow correctly. - */ - - ev->sequence = eq->sequence++;/* Allocate the next queue slot */ - - /* size must be a power of 2 to handle a wrapped sequence # */ - LASSERT (eq->size != 0 && - eq->size == LOWEST_BIT_SET (eq->size)); - eq_slot = eq->base + (ev->sequence & (eq->size - 1)); - - /* Invalidate unlinked_me unless this is the last - * event for an auto-unlinked MD. 
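
[lib_enq_event_locked() above publishes each event in two stages: copy every field except the sequence number, barrier, then write the sequence, which is what a consumer polls to recognize a fresh slot. A user-space sketch of the producer side; the compiler barrier stands in for the kernel barrier(), and a real SMP consumer would need matching read ordering:]

#include <stddef.h>
#include <stdio.h>
#include <string.h>

typedef struct {
        int           payload;
        unsigned long sequence;         /* must be the last member */
} event_t;

#define BARRIER() __asm__ __volatile__("" ::: "memory")
#define RING_SIZE 4                     /* power of 2, like eq->size */

static event_t       ring[RING_SIZE];
static unsigned long next_seq;

static void
publish(int payload)
{
        event_t  ev;
        event_t *slot;

        ev.sequence = next_seq++;       /* allocate the next queue slot */
        ev.payload  = payload;
        slot = &ring[ev.sequence & (RING_SIZE - 1)];

        /* stage 1: everything but the sequence number */
        memcpy(slot, &ev, offsetof(event_t, sequence));
        BARRIER();
        /* stage 2: writing the sequence is what makes the slot 'new' */
        slot->sequence = ev.sequence;
        BARRIER();
}

int main(void)
{
        int i;

        for (i = 0; i < 6; i++)         /* wraps the 4-slot ring */
                publish(100 + i);
        for (i = 0; i < RING_SIZE; i++)
                printf("slot %d: seq %lu payload %d\n",
                       i, ring[i].sequence, ring[i].payload);
        return 0;
}
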
Note that if md was - * auto-unlinked, md->pending can only decrease - */ - if ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINKED) == 0 || /* not auto-unlinked */ - md->pending != 1) /* not last ref */ - ev->unlinked_me = PTL_HANDLE_NONE; - - /* Copy the event into the allocated slot, ensuring all the - * rest of the event's contents have been copied _before_ - * the sequence number gets updated. A processes 'getting' - * an event waits on the next queue slot's sequence to be - * 'new'. When it is, _all_ other event fields had better - * be consistent. I assert 'sequence' is the last member, - * so I only need a 2 stage copy. - */ - LASSERT(sizeof (ptl_event_t) == - offsetof(ptl_event_t, sequence) + sizeof(ev->sequence)); - - rc = nal->cb_write (nal, private, (user_ptr)eq_slot, ev, - offsetof (ptl_event_t, sequence)); - LASSERT (rc == 0); - -#ifdef __KERNEL__ - barrier(); -#endif - /* Updating the sequence number is what makes the event 'new' */ - - /* cb_write is not necessarily atomic, so this could - cause a race with PtlEQGet */ - rc = nal->cb_write(nal, private, (user_ptr)&eq_slot->sequence, - (void *)&ev->sequence,sizeof (ev->sequence)); - LASSERT (rc == 0); + /* Now it's safe to drop my caller's ref */ + md->pending--; + LASSERT (md->pending >= 0); -#ifdef __KERNEL__ - barrier(); -#endif + /* Should I unlink this MD? */ + unlink = (md->pending == 0 && /* No other refs */ + (md->threshold == 0 || /* All ops done */ + md->md_flags & PTL_MD_FLAG_UNLINK) != 0); /* black spot */ - /* I must also ensure that (a) callbacks are made in the - * same order as the events land in the queue, and (b) the - * callback occurs before the event can be removed from the - * queue, so I can't drop the lock during the callback. */ - if (nal->cb_callback != NULL) - nal->cb_callback(nal, private, eq, ev); - else if (eq->event_callback != NULL) - (void)((eq->event_callback) (ev)); - } + msg->ev.status = status; + msg->ev.unlinked = unlink; - LASSERT ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINKED) == 0 || - (md->md_flags & PTL_MD_FLAG_UNLINK) != 0); + if (md->eq != NULL) + lib_enq_event_locked(nal, private, md->eq, &msg->ev); - md->pending--; - if (md->pending == 0 && /* no more outstanding operations on this md */ - (md->threshold == 0 || /* done its business */ - (md->md_flags & PTL_MD_FLAG_UNLINK) != 0)) /* marked for death */ + if (unlink) lib_md_unlink(nal, md); list_del (&msg->msg_list); @@ -159,6 +151,4 @@ int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t *msg) lib_msg_free(nal, msg); state_unlock(nal, &flags); - - return rc; } diff --git a/lnet/ulnds/Makefile.am b/lnet/ulnds/Makefile.am index dc427b0..6035ca1 100644 --- a/lnet/ulnds/Makefile.am +++ b/lnet/ulnds/Makefile.am @@ -1,5 +1,9 @@ CPPFLAGS= INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include -I$(srcdir) -lib_LIBRARIES = libtcpnal.a +noinst_LIBRARIES = libtcpnal.a pkginclude_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h ipmap.h bridge.h procbridge.h libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h + +if LIBLUSTRE +libtcpnal_a_CFLAGS = -fPIC +endif diff --git a/lnet/ulnds/bridge.h b/lnet/ulnds/bridge.h index 0b4940f..9a90ab8 100644 --- a/lnet/ulnds/bridge.h +++ b/lnet/ulnds/bridge.h @@ -6,6 +6,9 @@ * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ */ +#ifndef TCPNAL_PROCBRIDGE_H +#define TCPNAL_PROCBRIDGE_H + #include typedef struct bridge { @@ -27,3 +30,5 @@ nal_t 
*bridge_init(ptl_interface_t nal, typedef int (*nal_initialize)(bridge); extern nal_initialize nal_table[PTL_IFACE_MAX]; + +#endif diff --git a/lnet/ulnds/connection.c b/lnet/ulnds/connection.c index 29e75be..ca6999a 100644 --- a/lnet/ulnds/connection.c +++ b/lnet/ulnds/connection.c @@ -309,7 +309,8 @@ tcpnal_hello (int sockfd, ptl_nid_t *nid, int type, __u64 incarnation) */ connection force_tcp_connection(manager m, unsigned int ip, - unsigned short port) + unsigned short port, + procbridge pb) { connection conn; struct sockaddr_in addr; @@ -357,6 +358,10 @@ connection force_tcp_connection(manager m, exit(-1); conn = allocate_connection(m, ip, port, fd); + + /* let nal thread know this event right away */ + if (conn) + procbridge_wakeup_nal(pb); } pthread_mutex_unlock(&m->conn_lock); diff --git a/lnet/ulnds/connection.h b/lnet/ulnds/connection.h index fb1eaab..343ffa6 100644 --- a/lnet/ulnds/connection.h +++ b/lnet/ulnds/connection.h @@ -7,6 +7,7 @@ */ #include +#include typedef struct manager { table connections; @@ -26,7 +27,8 @@ typedef struct connection { manager m; } *connection; -connection force_tcp_connection(manager m, unsigned int ip, unsigned int short); +connection force_tcp_connection(manager m, unsigned int ip, unsigned int short, + procbridge pb); manager init_connections(unsigned short, int (*f)(void *, void *), void *); void remove_connection(void *arg); void shutdown_connections(manager m); diff --git a/lnet/ulnds/procapi.c b/lnet/ulnds/procapi.c index 2a3fbd8..bddfe9a 100644 --- a/lnet/ulnds/procapi.c +++ b/lnet/ulnds/procapi.c @@ -32,12 +32,34 @@ #include #include #include +#ifndef __CYGWIN__ +#include +#endif +#include #include #include #include #include +/* XXX CFS workaround, to give a chance to let nal thread wake up + * from waiting in select + */ +static int procbridge_notifier_handler(void *arg) +{ + static char buf[8]; + procbridge p = (procbridge) arg; + + syscall(SYS_read, p->notifier[1], buf, sizeof(buf)); + return 1; +} + +void procbridge_wakeup_nal(procbridge p) +{ + static char buf[8]; + syscall(SYS_write, p->notifier[0], buf, sizeof(buf)); +} + /* Function: forward * Arguments: nal_t *nal: pointer to my top-side nal structure * id: the command to pass to the lower layer @@ -79,6 +101,7 @@ static int procbridge_shutdown(nal_t *n, int ni) procbridge p=(procbridge)b->local; p->nal_flags |= NAL_FLAG_STOPPING; + procbridge_wakeup_nal(p); do { pthread_mutex_lock(&p->mutex); @@ -104,6 +127,12 @@ static int procbridge_validate(nal_t *nal, void *base, size_t extent) } +/* FIXME cfs temporary workaround! 
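
[The procbridge changes introduce a self-wakeup channel: one end of a socketpair sits in the select() read set, and procbridge_wakeup_nal() writes a byte to the other end so the nal thread notices shutdown or a new connection immediately. A runnable sketch of the trick, using plain read()/write() where the patch goes through syscall():]

#include <stdio.h>
#include <sys/select.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        int    notifier[2];
        fd_set rfds;
        char   buf[8] = "";

        if (socketpair(AF_UNIX, SOCK_STREAM, 0, notifier) != 0) {
                perror("socketpair");
                return 1;
        }

        (void)write(notifier[0], buf, 1);       /* procbridge_wakeup_nal() */

        FD_ZERO(&rfds);
        FD_SET(notifier[1], &rfds);
        select(notifier[1] + 1, &rfds, NULL, NULL, NULL);  /* returns at once */

        (void)read(notifier[1], buf, sizeof(buf));  /* drain, like the handler */
        printf("woken\n");
        return 0;
}
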
FIXME + * global time out value + */ +int __tcpnal_eqwait_timeout_value = 0; +int __tcpnal_eqwait_timedout = 0; + /* Function: yield * Arguments: pid: * @@ -118,7 +147,19 @@ static void procbridge_yield(nal_t *n) procbridge p=(procbridge)b->local; pthread_mutex_lock(&p->mutex); - pthread_cond_wait(&p->cond,&p->mutex); + if (!__tcpnal_eqwait_timeout_value) { + pthread_cond_wait(&p->cond,&p->mutex); + } else { + struct timeval now; + struct timespec timeout; + + gettimeofday(&now, NULL); + timeout.tv_sec = now.tv_sec + __tcpnal_eqwait_timeout_value; + timeout.tv_nsec = now.tv_usec * 1000; + + __tcpnal_eqwait_timedout = + pthread_cond_timedwait(&p->cond, &p->mutex, &timeout); + } pthread_mutex_unlock(&p->mutex); } @@ -194,6 +235,19 @@ nal_t *procbridge_interface(int num_interface, p->nal_flags = 0; pthread_mutex_init(&p->nal_cb_lock, 0); + /* initialize notifier */ + if (socketpair(AF_UNIX, SOCK_STREAM, 0, p->notifier)) { + perror("socketpair failed"); + return NULL; + } + + if (!register_io_handler(p->notifier[1], READ_HANDLER, + procbridge_notifier_handler, p)) { + perror("fail to register notifier handler"); + return NULL; + } + + /* create nal thread */ if (pthread_create(&p->t, NULL, nal_thread, &args)) { perror("nal_init: pthread_create"); return(NULL); diff --git a/lnet/ulnds/procbridge.h b/lnet/ulnds/procbridge.h index 317e22f..965f83d 100644 --- a/lnet/ulnds/procbridge.h +++ b/lnet/ulnds/procbridge.h @@ -25,6 +25,9 @@ typedef struct procbridge { pthread_cond_t cond; pthread_mutex_t mutex; + /* socket pair used to notify nal thread */ + int notifier[2]; + int nal_flags; pthread_mutex_t nal_cb_lock; @@ -51,5 +54,6 @@ extern nal_t *procbridge_interface(int num_interface, ptl_pt_index_t ptl_size, ptl_ac_index_t acl_size, ptl_pid_t requested_pid); +extern void procbridge_wakeup_nal(procbridge p); #endif diff --git a/lnet/ulnds/proclib.c b/lnet/ulnds/proclib.c index 2627253..2a5ba0d 100644 --- a/lnet/ulnds/proclib.c +++ b/lnet/ulnds/proclib.c @@ -43,24 +43,24 @@ /* the following functions are stubs to satisfy the nal definition without doing anything particularily useful*/ -static int nal_write(nal_cb_t *nal, - void *private, - user_ptr dst_addr, - void *src_addr, - size_t len) +static ptl_err_t nal_write(nal_cb_t *nal, + void *private, + user_ptr dst_addr, + void *src_addr, + size_t len) { memcpy(dst_addr, src_addr, len); - return 0; + return PTL_OK; } -static int nal_read(nal_cb_t * nal, - void *private, - void *dst_addr, - user_ptr src_addr, - size_t len) +static ptl_err_t nal_read(nal_cb_t * nal, + void *private, + void *dst_addr, + user_ptr src_addr, + size_t len) { memcpy(dst_addr, src_addr, len); - return 0; + return PTL_OK; } static void *nal_malloc(nal_cb_t *nal, diff --git a/lnet/ulnds/select.c b/lnet/ulnds/select.c index fe24efc..c4ccae1 100644 --- a/lnet/ulnds/select.c +++ b/lnet/ulnds/select.c @@ -126,15 +126,6 @@ void select_timer_block(when until) timeout_pointer=&timeout; } else timeout_pointer=0; - - /* FIXME - * temporarily add timer for endless waiting problem. 
- * FIXME - */ - timeout.tv_sec = 1; - timeout.tv_usec = 0; - timeout_pointer=&timeout; - FD_ZERO(&fds[0]); FD_ZERO(&fds[1]); FD_ZERO(&fds[2]); diff --git a/lnet/ulnds/socklnd/Makefile.am b/lnet/ulnds/socklnd/Makefile.am index dc427b0..6035ca1 100644 --- a/lnet/ulnds/socklnd/Makefile.am +++ b/lnet/ulnds/socklnd/Makefile.am @@ -1,5 +1,9 @@ CPPFLAGS= INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include -I$(srcdir) -lib_LIBRARIES = libtcpnal.a +noinst_LIBRARIES = libtcpnal.a pkginclude_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h ipmap.h bridge.h procbridge.h libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h + +if LIBLUSTRE +libtcpnal_a_CFLAGS = -fPIC +endif diff --git a/lnet/ulnds/socklnd/bridge.h b/lnet/ulnds/socklnd/bridge.h index 0b4940f..9a90ab8 100644 --- a/lnet/ulnds/socklnd/bridge.h +++ b/lnet/ulnds/socklnd/bridge.h @@ -6,6 +6,9 @@ * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ */ +#ifndef TCPNAL_PROCBRIDGE_H +#define TCPNAL_PROCBRIDGE_H + #include typedef struct bridge { @@ -27,3 +30,5 @@ nal_t *bridge_init(ptl_interface_t nal, typedef int (*nal_initialize)(bridge); extern nal_initialize nal_table[PTL_IFACE_MAX]; + +#endif diff --git a/lnet/ulnds/socklnd/connection.c b/lnet/ulnds/socklnd/connection.c index 29e75be..ca6999a 100644 --- a/lnet/ulnds/socklnd/connection.c +++ b/lnet/ulnds/socklnd/connection.c @@ -309,7 +309,8 @@ tcpnal_hello (int sockfd, ptl_nid_t *nid, int type, __u64 incarnation) */ connection force_tcp_connection(manager m, unsigned int ip, - unsigned short port) + unsigned short port, + procbridge pb) { connection conn; struct sockaddr_in addr; @@ -357,6 +358,10 @@ connection force_tcp_connection(manager m, exit(-1); conn = allocate_connection(m, ip, port, fd); + + /* let nal thread know this event right away */ + if (conn) + procbridge_wakeup_nal(pb); } pthread_mutex_unlock(&m->conn_lock); diff --git a/lnet/ulnds/socklnd/connection.h b/lnet/ulnds/socklnd/connection.h index fb1eaab..343ffa6 100644 --- a/lnet/ulnds/socklnd/connection.h +++ b/lnet/ulnds/socklnd/connection.h @@ -7,6 +7,7 @@ */ #include +#include typedef struct manager { table connections; @@ -26,7 +27,8 @@ typedef struct connection { manager m; } *connection; -connection force_tcp_connection(manager m, unsigned int ip, unsigned int short); +connection force_tcp_connection(manager m, unsigned int ip, unsigned int short, + procbridge pb); manager init_connections(unsigned short, int (*f)(void *, void *), void *); void remove_connection(void *arg); void shutdown_connections(manager m); diff --git a/lnet/ulnds/socklnd/procapi.c b/lnet/ulnds/socklnd/procapi.c index 2a3fbd8..bddfe9a 100644 --- a/lnet/ulnds/socklnd/procapi.c +++ b/lnet/ulnds/socklnd/procapi.c @@ -32,12 +32,34 @@ #include #include #include +#ifndef __CYGWIN__ +#include +#endif +#include #include #include #include #include +/* XXX CFS workaround, to give a chance to let nal thread wake up + * from waiting in select + */ +static int procbridge_notifier_handler(void *arg) +{ + static char buf[8]; + procbridge p = (procbridge) arg; + + syscall(SYS_read, p->notifier[1], buf, sizeof(buf)); + return 1; +} + +void procbridge_wakeup_nal(procbridge p) +{ + static char buf[8]; + syscall(SYS_write, p->notifier[0], buf, sizeof(buf)); +} + /* Function: forward * Arguments: nal_t *nal: pointer to my top-side nal structure * id: the command to pass to the lower layer @@ -79,6 +101,7 @@ static int 
procbridge_shutdown(nal_t *n, int ni) procbridge p=(procbridge)b->local; p->nal_flags |= NAL_FLAG_STOPPING; + procbridge_wakeup_nal(p); do { pthread_mutex_lock(&p->mutex); @@ -104,6 +127,12 @@ static int procbridge_validate(nal_t *nal, void *base, size_t extent) } +/* FIXME cfs temporary workaround! FIXME + * global time out value + */ +int __tcpnal_eqwait_timeout_value = 0; +int __tcpnal_eqwait_timedout = 0; + /* Function: yield * Arguments: pid: * @@ -118,7 +147,19 @@ static void procbridge_yield(nal_t *n) procbridge p=(procbridge)b->local; pthread_mutex_lock(&p->mutex); - pthread_cond_wait(&p->cond,&p->mutex); + if (!__tcpnal_eqwait_timeout_value) { + pthread_cond_wait(&p->cond,&p->mutex); + } else { + struct timeval now; + struct timespec timeout; + + gettimeofday(&now, NULL); + timeout.tv_sec = now.tv_sec + __tcpnal_eqwait_timeout_value; + timeout.tv_nsec = now.tv_usec * 1000; + + __tcpnal_eqwait_timedout = + pthread_cond_timedwait(&p->cond, &p->mutex, &timeout); + } pthread_mutex_unlock(&p->mutex); } @@ -194,6 +235,19 @@ nal_t *procbridge_interface(int num_interface, p->nal_flags = 0; pthread_mutex_init(&p->nal_cb_lock, 0); + /* initialize notifier */ + if (socketpair(AF_UNIX, SOCK_STREAM, 0, p->notifier)) { + perror("socketpair failed"); + return NULL; + } + + if (!register_io_handler(p->notifier[1], READ_HANDLER, + procbridge_notifier_handler, p)) { + perror("fail to register notifier handler"); + return NULL; + } + + /* create nal thread */ if (pthread_create(&p->t, NULL, nal_thread, &args)) { perror("nal_init: pthread_create"); return(NULL); diff --git a/lnet/ulnds/socklnd/procbridge.h b/lnet/ulnds/socklnd/procbridge.h index 317e22f..965f83d 100644 --- a/lnet/ulnds/socklnd/procbridge.h +++ b/lnet/ulnds/socklnd/procbridge.h @@ -25,6 +25,9 @@ typedef struct procbridge { pthread_cond_t cond; pthread_mutex_t mutex; + /* socket pair used to notify nal thread */ + int notifier[2]; + int nal_flags; pthread_mutex_t nal_cb_lock; @@ -51,5 +54,6 @@ extern nal_t *procbridge_interface(int num_interface, ptl_pt_index_t ptl_size, ptl_ac_index_t acl_size, ptl_pid_t requested_pid); +extern void procbridge_wakeup_nal(procbridge p); #endif diff --git a/lnet/ulnds/socklnd/proclib.c b/lnet/ulnds/socklnd/proclib.c index 2627253..2a5ba0d 100644 --- a/lnet/ulnds/socklnd/proclib.c +++ b/lnet/ulnds/socklnd/proclib.c @@ -43,24 +43,24 @@ /* the following functions are stubs to satisfy the nal definition without doing anything particularily useful*/ -static int nal_write(nal_cb_t *nal, - void *private, - user_ptr dst_addr, - void *src_addr, - size_t len) +static ptl_err_t nal_write(nal_cb_t *nal, + void *private, + user_ptr dst_addr, + void *src_addr, + size_t len) { memcpy(dst_addr, src_addr, len); - return 0; + return PTL_OK; } -static int nal_read(nal_cb_t * nal, - void *private, - void *dst_addr, - user_ptr src_addr, - size_t len) +static ptl_err_t nal_read(nal_cb_t * nal, + void *private, + void *dst_addr, + user_ptr src_addr, + size_t len) { memcpy(dst_addr, src_addr, len); - return 0; + return PTL_OK; } static void *nal_malloc(nal_cb_t *nal, diff --git a/lnet/ulnds/socklnd/select.c b/lnet/ulnds/socklnd/select.c index fe24efc..c4ccae1 100644 --- a/lnet/ulnds/socklnd/select.c +++ b/lnet/ulnds/socklnd/select.c @@ -126,15 +126,6 @@ void select_timer_block(when until) timeout_pointer=&timeout; } else timeout_pointer=0; - - /* FIXME - * temporarily add timer for endless waiting problem. 
- * FIXME - */ - timeout.tv_sec = 1; - timeout.tv_usec = 0; - timeout_pointer=&timeout; - FD_ZERO(&fds[0]); FD_ZERO(&fds[1]); FD_ZERO(&fds[2]); diff --git a/lnet/ulnds/socklnd/tcplnd.c b/lnet/ulnds/socklnd/tcplnd.c index 1041d1d..0c47f42 100644 --- a/lnet/ulnds/socklnd/tcplnd.c +++ b/lnet/ulnds/socklnd/tcplnd.c @@ -55,69 +55,69 @@ * * sends a packet to the peer, after insuring that a connection exists */ -int tcpnal_send(nal_cb_t *n, - void *private, - lib_msg_t *cookie, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int niov, - struct iovec *iov, - size_t len) +ptl_err_t tcpnal_send(nal_cb_t *n, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int niov, + struct iovec *iov, + size_t offset, + size_t len) { connection c; bridge b=(bridge)n->nal_data; struct iovec tiov[257]; static pthread_mutex_t send_lock = PTHREAD_MUTEX_INITIALIZER; - int rc; + ptl_err_t rc = PTL_OK; + int sysrc; int total; + int ntiov; int i; if (!(c=force_tcp_connection((manager)b->lower, PNAL_IP(nid,b), - PNAL_PORT(nid,pid)))) - return(1); + PNAL_PORT(nid,pid), + b->local))) + return(PTL_FAIL); -#if 0 /* TODO: these results should be checked. furthermore, provision must be made for the SIGPIPE which is delivered when writing on a tcp socket which has closed underneath the application. there is a linux flag in the sendmsg call which turns off the signally behaviour, but its nonstandard */ - syscall(SYS_write, c->fd,hdr,sizeof(ptl_hdr_t)); - LASSERT (niov <= 1); - if (len) syscall(SYS_write, c->fd,iov[0].iov_base,len); -#else + LASSERT (niov <= 256); tiov[0].iov_base = hdr; tiov[0].iov_len = sizeof(ptl_hdr_t); + ntiov = 1 + lib_extract_iov(256, &tiov[1], niov, iov, offset, len); - if (niov > 0) - memcpy(&tiov[1], iov, niov * sizeof(struct iovec)); pthread_mutex_lock(&send_lock); #if 1 - for (i = total = 0; i <= niov; i++) + for (i = total = 0; i < ntiov; i++) total += tiov[i].iov_len; - rc = syscall(SYS_writev, c->fd, tiov, niov+1); - if (rc != total) { + sysrc = syscall(SYS_writev, c->fd, tiov, ntiov); + if (sysrc != total) { fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n", rc, total, errno); - abort(); + rc = PTL_FAIL; } #else - for (i = total = 0; i <= niov; i++) { + for (i = total = 0; i <= ntiov; i++) { rc = send(c->fd, tiov[i].iov_base, tiov[i].iov_len, 0); if (rc != tiov[i].iov_len) { fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n", rc, tiov[i].iov_len, errno); - abort(); + rc = PTL_FAIL; + break; } - total != rc; + total += rc; } #endif #if 0 @@ -130,10 +130,14 @@ int tcpnal_send(nal_cb_t *n, total, niov + 1); #endif pthread_mutex_unlock(&send_lock); -#endif - lib_finalize(n, private, cookie); - - return(0); + + if (rc == PTL_OK) { + /* NB the NAL only calls lib_finalize() if it returns PTL_OK + * from cb_send() */ + lib_finalize(n, private, cookie, PTL_OK); + } + + return(rc); } @@ -150,15 +154,18 @@ int tcpnal_send(nal_cb_t *n, * blocking read of the requested data. 
must drain out the * difference of mainpulated and requested lengths from the network */ -int tcpnal_recv(nal_cb_t *n, - void *private, - lib_msg_t *cookie, - unsigned int niov, - struct iovec *iov, - size_t mlen, - size_t rlen) +ptl_err_t tcpnal_recv(nal_cb_t *n, + void *private, + lib_msg_t *cookie, + unsigned int niov, + struct iovec *iov, + size_t offset, + size_t mlen, + size_t rlen) { + struct iovec tiov[256]; + int ntiov; int i; if (!niov) @@ -168,16 +175,19 @@ int tcpnal_recv(nal_cb_t *n, LASSERT(rlen); LASSERT(rlen >= mlen); + ntiov = lib_extract_iov(256, tiov, niov, iov, offset, mlen); + /* FIXME * 1. Is this effecient enough? change to use readv() directly? * 2. need check return from read_connection() * - MeiJia */ - for (i = 0; i < niov; i++) - read_connection(private, iov[i].iov_base, iov[i].iov_len); + for (i = 0; i < ntiov; i++) + read_connection(private, tiov[i].iov_base, tiov[i].iov_len); finalize: - lib_finalize(n, private, cookie); + /* FIXME; we always assume success here... */ + lib_finalize(n, private, cookie, PTL_OK); if (mlen!=rlen){ char *trash=malloc(rlen-mlen); @@ -187,7 +197,7 @@ finalize: free(trash); } - return(rlen); + return(PTL_OK); } diff --git a/lnet/ulnds/tcplnd.c b/lnet/ulnds/tcplnd.c index 1041d1d..0c47f42 100644 --- a/lnet/ulnds/tcplnd.c +++ b/lnet/ulnds/tcplnd.c @@ -55,69 +55,69 @@ * * sends a packet to the peer, after insuring that a connection exists */ -int tcpnal_send(nal_cb_t *n, - void *private, - lib_msg_t *cookie, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int niov, - struct iovec *iov, - size_t len) +ptl_err_t tcpnal_send(nal_cb_t *n, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int niov, + struct iovec *iov, + size_t offset, + size_t len) { connection c; bridge b=(bridge)n->nal_data; struct iovec tiov[257]; static pthread_mutex_t send_lock = PTHREAD_MUTEX_INITIALIZER; - int rc; + ptl_err_t rc = PTL_OK; + int sysrc; int total; + int ntiov; int i; if (!(c=force_tcp_connection((manager)b->lower, PNAL_IP(nid,b), - PNAL_PORT(nid,pid)))) - return(1); + PNAL_PORT(nid,pid), + b->local))) + return(PTL_FAIL); -#if 0 /* TODO: these results should be checked. furthermore, provision must be made for the SIGPIPE which is delivered when writing on a tcp socket which has closed underneath the application. 
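           one portable defence, assuming it is acceptable to disable
           the signal process-wide, is to install

               signal(SIGPIPE, SIG_IGN);  /* failed writes then return EPIPE */

           at startup and treat EPIPE like any other send error;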
           there is a linux flag in the sendmsg call which turns off the
           signally behaviour, but its nonstandard */
-        syscall(SYS_write, c->fd,hdr,sizeof(ptl_hdr_t));
-        LASSERT (niov <= 1);
-        if (len) syscall(SYS_write, c->fd,iov[0].iov_base,len);
-#else
+        LASSERT (niov <= 256);
 
         tiov[0].iov_base = hdr;
         tiov[0].iov_len = sizeof(ptl_hdr_t);
+        ntiov = 1 + lib_extract_iov(256, &tiov[1], niov, iov, offset, len);
 
-        if (niov > 0)
-                memcpy(&tiov[1], iov, niov * sizeof(struct iovec));
         pthread_mutex_lock(&send_lock);
 #if 1
-        for (i = total = 0; i <= niov; i++)
+        for (i = total = 0; i < ntiov; i++)
                 total += tiov[i].iov_len;
 
-        rc = syscall(SYS_writev, c->fd, tiov, niov+1);
-        if (rc != total) {
+        sysrc = syscall(SYS_writev, c->fd, tiov, ntiov);
+        if (sysrc != total) {
                 fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n",
-                         rc, total, errno);
-                abort();
+                         sysrc, total, errno);
+                rc = PTL_FAIL;
         }
 #else
-        for (i = total = 0; i <= niov; i++) {
+        for (i = total = 0; i < ntiov; i++) {
                 rc = send(c->fd, tiov[i].iov_base, tiov[i].iov_len, 0);
 
                 if (rc != tiov[i].iov_len) {
                         fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n",
                                  rc, tiov[i].iov_len, errno);
-                        abort();
+                        rc = PTL_FAIL;
+                        break;
                 }
-                total != rc;
+                total += rc;
         }
 #endif
 #if 0
@@ -130,10 +130,14 @@ int tcpnal_send(nal_cb_t *n,
                 total, niov + 1);
 #endif
         pthread_mutex_unlock(&send_lock);
-#endif
-        lib_finalize(n, private, cookie);
-
-        return(0);
+
+        if (rc == PTL_OK) {
+                /* NB the NAL only calls lib_finalize() if it returns PTL_OK
+                 * from cb_send() */
+                lib_finalize(n, private, cookie, PTL_OK);
+        }
+
+        return(rc);
 }
 
 
@@ -150,15 +154,18 @@ int tcpnal_send(nal_cb_t *n,
  * blocking read of the requested data.  must drain out the
  * difference of mainpulated and requested lengths from the network
  */
-int tcpnal_recv(nal_cb_t *n,
-                void *private,
-                lib_msg_t *cookie,
-                unsigned int niov,
-                struct iovec *iov,
-                size_t mlen,
-                size_t rlen)
+ptl_err_t tcpnal_recv(nal_cb_t *n,
+                      void *private,
+                      lib_msg_t *cookie,
+                      unsigned int niov,
+                      struct iovec *iov,
+                      size_t offset,
+                      size_t mlen,
+                      size_t rlen)
 {
+        struct iovec tiov[256];
+        int ntiov;
         int i;
 
         if (!niov)
@@ -168,16 +175,19 @@ int tcpnal_recv(nal_cb_t *n,
         LASSERT(mlen);
         LASSERT(rlen);
         LASSERT(rlen >= mlen);
 
+        ntiov = lib_extract_iov(256, tiov, niov, iov, offset, mlen);
+
         /* FIXME
          * 1. Is this effecient enough? change to use readv() directly?
          * 2. need check return from read_connection()
          * - MeiJia
          */
-        for (i = 0; i < niov; i++)
-                read_connection(private, iov[i].iov_base, iov[i].iov_len);
+        for (i = 0; i < ntiov; i++)
+                read_connection(private, tiov[i].iov_base, tiov[i].iov_len);
 
 finalize:
-        lib_finalize(n, private, cookie);
+        /* FIXME; we always assume success here... */
+        lib_finalize(n, private, cookie, PTL_OK);
 
         if (mlen!=rlen){
                 char *trash=malloc(rlen-mlen);
@@ -187,7 +197,7 @@ finalize:
                 free(trash);
         }
 
-        return(rlen);
+        return(PTL_OK);
 }
diff --git a/lnet/utils/Makefile.am b/lnet/utils/Makefile.am
index f1878df..6c31b3d 100644
--- a/lnet/utils/Makefile.am
+++ b/lnet/utils/Makefile.am
@@ -3,17 +3,18 @@
 # This code is issued under the GNU General Public License.
# See the file COPYING in this distribution - COMPILE = $(CC) -Wall -g -I$(srcdir)/../include LINK = $(CC) -o $@ if LIBLUSTRE -tmp= + +noinst_LIBRARIES = libuptlctl.a +libuptlctl_a_SOURCES = portals.c debug.c l_ioctl.c parser.c parser.h +libuptlctl_a_CFLAGS = -fPIC + else -tmp=gmnalnid -endif -sbin_PROGRAMS = acceptor ptlctl debugctl routerstat wirecheck $(tmp) +sbin_PROGRAMS = acceptor ptlctl debugctl routerstat wirecheck gmnalnid lib_LIBRARIES = libptlctl.a acceptor_SOURCES = acceptor.c # -lefence @@ -33,3 +34,4 @@ debugctl_LDADD = -L. -lptlctl -lncurses # -lefence debugctl_DEPENDENCIES = libptlctl.a routerstat_SOURCES = routerstat.c +endif diff --git a/lnet/utils/l_ioctl.c b/lnet/utils/l_ioctl.c index c6628ff..58a408a 100644 --- a/lnet/utils/l_ioctl.c +++ b/lnet/utils/l_ioctl.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -35,6 +34,16 @@ #include #include +#ifndef __CYGWIN__ + #include +#else + #include + #include +#endif + +static ioc_handler_t do_ioctl; /* forward ref */ +static ioc_handler_t *current_ioc_handler = &do_ioctl; + struct ioc_dev { const char * dev_name; int dev_fd; @@ -48,7 +57,16 @@ struct dump_hdr { int opc; }; -char * dump_filename; +char *dump_filename; + +void +set_ioc_handler (ioc_handler_t *handler) +{ + if (handler == NULL) + current_ioc_handler = do_ioctl; + else + current_ioc_handler = handler; +} static int open_ioc_dev(int dev_id) @@ -115,7 +133,7 @@ dump(int dev_id, int opc, void *buf) { FILE *fp; struct dump_hdr dump_hdr; - struct portal_ioctl_hdr * ioc_hdr = (struct portal_ioctl_hdr *) buf; + struct portal_ioctl_hdr * ioc_hdr = (struct portal_ioctl_hdr *) buf; int rc; printf("dumping opc %x to %s\n", opc, dump_filename); @@ -132,17 +150,17 @@ dump(int dev_id, int opc, void *buf) return -EINVAL; } - rc = fwrite(&dump_hdr, sizeof(dump_hdr), 1, fp); - if (rc == 1) - rc = fwrite(buf, ioc_hdr->ioc_len, 1, fp); - fclose(fp); - if (rc != 1) { - fprintf(stderr, "%s: %s\n", dump_filename, - strerror(errno)); - return -EINVAL; - } - - return 0; + rc = fwrite(&dump_hdr, sizeof(dump_hdr), 1, fp); + if (rc == 1) + rc = fwrite(buf, ioc_hdr->ioc_len, 1, fp); + fclose(fp); + if (rc != 1) { + fprintf(stderr, "%s: %s\n", dump_filename, + strerror(errno)); + return -EINVAL; + } + + return 0; } /* register a device to send ioctls to. 
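 * (a usage sketch: assuming the registration call below keeps the
 * (dev_id, device path) shape implied by struct ioc_dev, and with a
 * hypothetical device node, calling
 *
 *     register_ioc_dev(PORTALS_DEV_ID, "/dev/portals");
 *
 * once at startup lets l_ioctl(PORTALS_DEV_ID, opc, buf) open the
 * descriptor lazily via open_ioc_dev())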
*/ @@ -184,16 +202,17 @@ set_ioctl_dump(char * file) free(dump_filename); dump_filename = strdup(file); + if (dump_filename == NULL) + abort(); + + set_ioc_handler(&dump); return 0; } int l_ioctl(int dev_id, int opc, void *buf) { - if (dump_filename) - return dump(dev_id, opc, buf); - else - return do_ioctl(dev_id, opc, buf); + return current_ioc_handler(dev_id, opc, buf); } /* Read an ioctl dump file, and call the ioc_func for each ioctl buffer @@ -207,16 +226,28 @@ l_ioctl(int dev_id, int opc, void *buf) int parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *)) { - int fd, line =0; + int line =0; struct stat st; - char *buf, *end; + char *start, *buf, *end; +#ifndef __CYGWIN__ + int fd; +#else + HANDLE fd, hmap; + DWORD size; +#endif +#ifndef __CYGWIN__ fd = syscall(SYS_open, dump_file, O_RDONLY); + if (fd < 0) { + fprintf(stderr, "couldn't open %s: %s\n", dump_file, + strerror(errno)); + exit(1); + } #ifndef SYS_fstat64 -#define __SYS_fstat__ SYS_fstat +# define __SYS_fstat__ SYS_fstat #else -#define __SYS_fstat__ SYS_fstat64 +# define __SYS_fstat__ SYS_fstat64 #endif if (syscall(__SYS_fstat__, fd, &st)) { perror("stat fails"); @@ -228,41 +259,72 @@ parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *)) exit(1); } - buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE , fd, 0); - end = buf + st.st_size; + start = buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE , fd, 0); + end = start + st.st_size; close(fd); - while (buf < end) { - struct dump_hdr *dump_hdr = (struct dump_hdr *) buf; - struct portal_ioctl_hdr * data; - char tmp[8096]; - int rc; - - line++; + if (start == MAP_FAILED) { + fprintf(stderr, "can't create file mapping\n"); + exit(1); + } +#else + fd = CreateFile(dump_file, GENERIC_READ, FILE_SHARE_READ, NULL, + OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); + size = GetFileSize(fd, NULL); + if (size < 1) { + fprintf(stderr, "KML is empty\n"); + exit(1); + } - data = (struct portal_ioctl_hdr *) (buf + sizeof(*dump_hdr)); - if (buf + data->ioc_len > end ) { - fprintf(stderr, "dump file overflow, %p + %d > %p\n", buf, - data->ioc_len, end); - return -1; - } + hmap = CreateFileMapping(fd, NULL, PAGE_READONLY, 0,0, NULL); + start = buf = MapViewOfFile(hmap, FILE_MAP_READ, 0, 0, 0); + end = buf + size; + CloseHandle(fd); + if (start == NULL) { + fprintf(stderr, "can't create file mapping\n"); + exit(1); + } +#endif /* __CYGWIN__ */ + + while (buf < end) { + struct dump_hdr *dump_hdr = (struct dump_hdr *) buf; + struct portal_ioctl_hdr * data; + char tmp[8096]; + int rc; + + line++; + + data = (struct portal_ioctl_hdr *) (buf + sizeof(*dump_hdr)); + if (buf + data->ioc_len > end ) { + fprintf(stderr, "dump file overflow, %p + %d > %p\n", buf, + data->ioc_len, end); + return -1; + } #if 0 - printf ("dump_hdr: %lx data: %lx\n", - (unsigned long)dump_hdr - (unsigned long)buf, (unsigned long)data - (unsigned long)buf); - - printf("%d: opcode %x len: %d ver: %x ", line, dump_hdr->opc, - data->ioc_len, data->ioc_version); + printf ("dump_hdr: %lx data: %lx\n", + (unsigned long)dump_hdr - (unsigned long)buf, (unsigned long)data - (unsigned long)buf); + + printf("%d: opcode %x len: %d ver: %x ", line, dump_hdr->opc, + data->ioc_len, data->ioc_version); #endif - memcpy(tmp, data, data->ioc_len); + memcpy(tmp, data, data->ioc_len); - rc = ioc_func(dump_hdr->dev_id, dump_hdr->opc, tmp); - if (rc) { - printf("failed: %d\n", rc); - exit(1); - } + rc = ioc_func(dump_hdr->dev_id, dump_hdr->opc, tmp); + if (rc) { + printf("failed: %d\n", rc); + exit(1); + } 
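+                /* records are packed back to back: a dump_hdr directly
+                 * followed by ioc_len bytes of ioctl payload, so stepping
+                 * over both lands on the next record's header */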
- buf += data->ioc_len + sizeof(*dump_hdr); + buf += data->ioc_len + sizeof(*dump_hdr); } + +#ifndef __CYGWIN__ + munmap(start, end - start); +#else + UnmapViewOfFile(start); + CloseHandle(hmap); +#endif + return 0; } diff --git a/lnet/utils/portals.c b/lnet/utils/portals.c index 3c7ec20..fb031ae 100644 --- a/lnet/utils/portals.c +++ b/lnet/utils/portals.c @@ -36,6 +36,21 @@ #include #include +#ifdef __CYGWIN__ + +#include + +#warning assuming little endian + +#define __cpu_to_le64(x) ((__u64)(x)) +#define __le64_to_cpu(x) ((__u64)(x)) +#define __cpu_to_le32(x) ((__u32)(x)) +#define __le32_to_cpu(x) ((__u32)(x)) +#define __cpu_to_le16(x) ((__u16)(x)) +#define __le16_to_cpu(x) ((__u16)(x)) + +#endif /* __CYGWIN__ */ + #include #include #include @@ -94,6 +109,9 @@ pcfg_ioctl(struct portals_cfg *pcfg) PORTAL_IOC_INIT (data); data.ioc_pbuf1 = (char*)pcfg; data.ioc_plen1 = sizeof(*pcfg); + /* XXX liblustre hack XXX */ + data.ioc_nal_cmd = pcfg->pcfg_command; + data.ioc_nid = pcfg->pcfg_nid; rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); } diff --git a/lustre/ChangeLog b/lustre/ChangeLog index cbdcb10..54abc71 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -8,13 +8,23 @@ tbd Cluster File Systems, Inc. - ptlrpcd can be blocked, stopping ALL progress (2477) - recovery for initial connections (2355) - fixes for mds_cleanup_orphans (1934) + - abort_recovery crashes MDS in b_eq (mds_unlink_orphan) (2584) - block all file creations until orphan recovery completes (1901) - client remove rq_connection from request struct (2423) - conf-sanity test_5, proper cleanup in umount log not availale (2640) - recovery timer race (2670) - mdc_close recovey bug (2532) + - ptlrpc cleanup bug (2710) + - mds timeout on local locks (2588) + - namespace lock held during RPCs (2431) - don't try to handle a message that hasn't been replied to (2699) - - don't fail assertion if in recovery during cleanup (2701) + - client assert failure during cleanup after abort recovery (2701) + - leak mdc device after failed mount (2712) + - ptlrpc_check_set allows timedout requests to complete (2714) + - wait for inflight reqs when ptlrpcd finishes (2710) + - make sure unregistered services are removed from the srv_list + - reset bulk XID's when resending them (caught by 1138 test) + - unregister_bulk after timeout - fix lconf error (2694) * miscellania - return LL_SUPER_MAGIC from statfs for the filesystem type (1972) diff --git a/lustre/Makefile.am b/lustre/Makefile.am index 1582666..045bace5 100644 --- a/lustre/Makefile.am +++ b/lustre/Makefile.am @@ -12,7 +12,7 @@ DIRS24 = ptlbd endif if LIBLUSTRE -SUBDIRS = portals obdclass lov ptlrpc obdecho osc utils mdc lvfs #liblustre +SUBDIRS = portals obdclass lov ptlrpc obdecho osc utils mdc lvfs liblustre else SUBDIRS = lvfs portals obdclass include $(DIRS24) mds utils obdfilter mdc osc ost SUBDIRS+= llite obdecho lov cobd tests doc scripts conf ptlrpc diff --git a/lustre/configure.in b/lustre/configure.in index 4107a0c..2f023db 100644 --- a/lustre/configure.in +++ b/lustre/configure.in @@ -73,7 +73,7 @@ AC_OUTPUT([Makefile lvfs/Makefile portals/Makefile portals/Kernelenv \ portals/knals/scimacnal/Makefile \ portals/knals/ibnal/Makefile \ portals/utils/Makefile portals/tests/Makefile portals/doc/Makefile \ - obdecho/Makefile ptlrpc/Makefile liblustre/Makefile \ + obdecho/Makefile ptlrpc/Makefile liblustre/Makefile liblustre/tests/Makefile \ lov/Makefile osc/Makefile mdc/Makefile mds/Makefile ost/Makefile \ cobd/Makefile ptlbd/Makefile conf/Makefile tests/Makefile \ 
	utils/Makefile utils/Lustre/Makefile obdfilter/Makefile \
diff --git a/lustre/include/liblustre.h b/lustre/include/liblustre.h
index 6c6ac1d..0b6da9f 100644
--- a/lustre/include/liblustre.h
+++ b/lustre/include/liblustre.h
@@ -111,9 +111,9 @@ static inline void *kmalloc(int size, int prot)
 #define GFP_HIGHUSER 1
 #define GFP_ATOMIC 1
 #define GFP_NOFS 1
-#define IS_ERR(a) (((a) && abs((int)(a)) < 500) ? 1 : 0)
-#define PTR_ERR(a) ((int)(a))
-#define ERR_PTR(a) ((void*)(a))
+#define IS_ERR(a) (((a) && abs((long)(a)) < 500) ? 1 : 0)
+#define PTR_ERR(a) ((long)(a))
+#define ERR_PTR(a) ((void*)((long)(a)))
 
 #define capable(foo) 1
 #define CAP_SYS_ADMIN 1
@@ -415,6 +415,11 @@ static inline int kmem_cache_destroy(kmem_cache_t *a)
 #define PAGE_CACHE_SHIFT 12
 #define PAGE_CACHE_MASK PAGE_MASK
 
+/* XXX
+ * for the moment, liblustre will not rely on the OST for non-page-aligned writes
+ */
+#define LIBLUSTRE_HANDLE_UNALIGNED_PAGE
+
 struct page {
         void   *addr;
         unsigned long index;
@@ -424,6 +429,9 @@ struct page {
         /* internally used by liblustre file i/o */
         int     _offset;
         int     _count;
+#ifdef LIBLUSTRE_HANDLE_UNALIGNED_PAGE
+        int     _managed;
+#endif
 };
 
 #define kmap(page) (page)->addr
@@ -461,6 +469,7 @@ static inline void __free_pages(struct page *pg, int what)
 }
 
 #define __free_page(page) __free_pages((page), 0)
+#define free_page(page) __free_page(page)
 
 static inline struct page* __grab_cache_page(unsigned long index)
 {
@@ -706,6 +715,12 @@ static inline void del_timer(struct timer_list *l)
         free(l);
 }
 
+#define time_after(a, b) \
+({ \
+        printf("Error: inappropriate call to time_after()\n"); \
+        1; \
+})
+
 typedef struct { volatile int counter; } atomic_t;
 
 #define atomic_read(a) ((a)->counter)
diff --git a/lustre/include/linux/lustre_dlm.h b/lustre/include/linux/lustre_dlm.h
index 331e8f8..99c1785 100644
--- a/lustre/include/linux/lustre_dlm.h
+++ b/lustre/include/linux/lustre_dlm.h
@@ -388,9 +388,6 @@ void ldlm_lock_remove_from_lru(struct ldlm_lock *);
 struct ldlm_lock *ldlm_handle2lock_ns(struct ldlm_namespace *,
                                       struct lustre_handle *);
 
-void *ldlm_put_lock_into_req(struct ptlrpc_request *,
-                             struct lustre_handle *, int);
-
 static inline struct ldlm_lock *ldlm_handle2lock(struct lustre_handle *h)
 {
         return __ldlm_handle2lock(h, 0);
 }
diff --git a/lustre/include/linux/lustre_export.h b/lustre/include/linux/lustre_export.h
index b949fe1..218807c 100644
--- a/lustre/include/linux/lustre_export.h
+++ b/lustre/include/linux/lustre_export.h
@@ -68,19 +68,21 @@ struct obd_export {
         struct obd_uuid           exp_client_uuid;
         struct list_head          exp_obd_chain;
         struct obd_device        *exp_obd;
-        struct obd_import        *exp_imp_reverse; /* to make rpc's backwards */
+        struct obd_import        *exp_imp_reverse; /* to make RPCs backwards */
         struct ptlrpc_connection *exp_connection;
         __u32                     exp_conn_cnt;
         struct ldlm_export_data   exp_ldlm_data;
-        struct ptlrpc_request    *exp_outstanding_reply;
+        struct list_head          exp_outstanding_replies;
         time_t                    exp_last_request_time;
         spinlock_t                exp_lock; /* protects flags int below */
-        int                       exp_failed:1;
+        /* ^ protects exp_outstanding_replies too */
         int                       exp_flags;
+        int                       exp_failed:1;
+        int                       exp_libclient:1; /* liblustre client?
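+                                                     (set when the connect
+                                                      flags carry
+                                                      MSG_CONNECT_LIBCLIENT)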
*/ union { struct mds_export_data eu_mds_data; struct filter_export_data eu_filter_data; - struct ec_export_data eu_ec_data; + struct ec_export_data eu_ec_data; struct osc_export_data eu_osc_data; } u; }; diff --git a/lustre/include/linux/lustre_idl.h b/lustre/include/linux/lustre_idl.h index 9428296..3fa0a61 100644 --- a/lustre/include/linux/lustre_idl.h +++ b/lustre/include/linux/lustre_idl.h @@ -226,6 +226,7 @@ static inline void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags) #define MSG_CONNECT_RECONNECT 0x2 #define MSG_CONNECT_REPLAYABLE 0x4 //#define MSG_CONNECT_PEER 0x8 +#define MSG_CONNECT_LIBCLIENT 0x10 /* * OST requests: OBDO & OBD request records diff --git a/lustre/include/linux/lustre_net.h b/lustre/include/linux/lustre_net.h index bb8900e..03a011a 100644 --- a/lustre/include/linux/lustre_net.h +++ b/lustre/include/linux/lustre_net.h @@ -41,6 +41,10 @@ #include #include +/* Size over which to OBD_VMALLOC() rather than OBD_ALLOC() service request + * buffers */ +#define SVC_BUF_VMALLOC_THRESHOLD (2*PAGE_SIZE) + /* The following constants determine how much memory is devoted to * buffering in the lustre services. * @@ -51,28 +55,22 @@ * total memory = ?_NBUFS * ?_BUFSIZE * * ?_MAXREQSIZE # maximum request service will receive - * larger messages will get dropped. + * messages larger than ?_MAXREQSIZE are dropped. * request buffers are auto-unlinked when less than ?_MAXREQSIZE * is left in them. */ #define LDLM_NUM_THREADS min(smp_num_cpus * smp_num_cpus * 8, 64) -#define LDLM_NEVENT_MAX 8192UL -#define LDLM_NEVENTS min_t(unsigned long, num_physpages / 64, \ - LDLM_NEVENT_MAX) #define LDLM_NBUF_MAX 256UL -#define LDLM_NBUFS min(LDLM_NEVENTS / 16, LDLM_NBUF_MAX) #define LDLM_BUFSIZE (8 * 1024) #define LDLM_MAXREQSIZE (5 * 1024) +#define LDLM_MAXMEM (num_physpages*(PAGE_SIZE/1024)) +#define LDLM_NBUFS min(LDLM_MAXMEM/LDLM_BUFSIZE, LDLM_NBUF_MAX) #define MDT_MAX_THREADS 32UL #define MDT_NUM_THREADS max(min_t(unsigned long, num_physpages / 8192, \ MDT_MAX_THREADS), 2UL) -#define MDS_NEVENT_MAX 8192UL -#define MDS_NEVENTS min_t(unsigned long, num_physpages / 64, \ - MDS_NEVENT_MAX) #define MDS_NBUF_MAX 512UL -#define MDS_NBUFS min(MDS_NEVENTS / 16, MDS_NBUF_MAX) #define MDS_BUFSIZE (8 * 1024) /* Assume file name length = FNAME_MAX = 256 (true for extN). * path name length = PATH_MAX = 4096 @@ -89,15 +87,13 @@ * except in the open case where there are a large number of OSTs in a LOV. 
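 * A worked example of the buffer-count arithmetic defined just below
 * (illustrative figures only): with 4K pages and 128MB of RAM,
 * num_physpages = 32768, so MDS_MAXMEM = 32768 * (4096/512) = 262144
 * and MDS_NBUFS = min(262144 / 8192, 512) = 32 request buffers.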
*/ #define MDS_MAXREQSIZE (5 * 1024) +#define MDS_MAXMEM (num_physpages*(PAGE_SIZE/512)) +#define MDS_NBUFS min(MDS_MAXMEM/MDS_BUFSIZE, MDS_NBUF_MAX) #define OST_MAX_THREADS 36UL #define OST_NUM_THREADS max(min_t(unsigned long, num_physpages / 8192, \ OST_MAX_THREADS), 2UL) -#define OST_NEVENT_MAX 16384UL -#define OST_NEVENTS min_t(unsigned long, num_physpages / 16, \ - OST_NEVENT_MAX) #define OST_NBUF_MAX 5000UL -#define OST_NBUFS min(OST_NEVENTS / 2, OST_NBUF_MAX) #define OST_BUFSIZE (8 * 1024) /* OST_MAXREQSIZE ~= 1640 bytes = * lustre_msg + obdo + 16 * obd_ioobj + 64 * niobuf_remote @@ -106,9 +102,10 @@ * - OST_MAXREQSIZE must be at least 1 page of cookies plus some spillover */ #define OST_MAXREQSIZE (5 * 1024) +#define OST_MAXMEM (num_physpages*(PAGE_SIZE/512)) +#define OST_NBUFS min(OST_MAXMEM/OST_BUFSIZE, OST_NBUF_MAX) #define PTLBD_NUM_THREADS 4 -#define PTLBD_NEVENTS 1024 #define PTLBD_NBUFS 20 #define PTLBD_BUFSIZE (32 * 1024) #define PTLBD_MAXREQSIZE 1024 @@ -198,21 +195,66 @@ struct ptlrpc_request_set { struct ptlrpc_bulk_desc; +/* + * ptlrpc callback & work item stuff + */ +struct ptlrpc_cb_id { + void (*cbid_fn)(ptl_event_t *ev); /* specific callback fn */ + void *cbid_arg; /* additional arg */ +}; + +#define RS_MAX_LOCKS 4 +#define RS_DEBUG 1 + +struct ptlrpc_reply_state { + struct ptlrpc_cb_id rs_cb_id; + struct list_head rs_list; + struct list_head rs_exp_list; + struct list_head rs_obd_list; +#if RS_DEBUG + struct list_head rs_debug_list; +#endif + /* updates to following flag serialised by srv_request_lock */ + unsigned int rs_difficult:1; /* ACK/commit stuff */ + unsigned int rs_scheduled:1; /* being handled? */ + unsigned int rs_scheduled_ever:1; /* any schedule attempts? */ + unsigned int rs_handled:1; /* been handled yet? */ + unsigned int rs_on_net:1; /* reply_out_callback pending? */ + + int rs_size; + __u64 rs_transno; + __u64 rs_xid; + struct obd_export *rs_export; + struct ptlrpc_srv_ni *rs_srv_ni; + ptl_handle_md_t rs_md_h; + + /* locks awaiting client reply ACK */ + int rs_nlocks; + struct lustre_handle rs_locks[RS_MAX_LOCKS]; + ldlm_mode_t rs_modes[RS_MAX_LOCKS]; + /* last member: variable sized reply message */ + struct lustre_msg rs_msg; +}; + struct ptlrpc_request { int rq_type; /* one of PTL_RPC_MSG_* */ struct list_head rq_list; int rq_status; spinlock_t rq_lock; - unsigned int rq_intr:1, rq_replied:1, rq_want_ack:1, rq_err:1, + /* client-side flags */ + unsigned int rq_intr:1, rq_replied:1, rq_err:1, rq_timedout:1, rq_resend:1, rq_restart:1, rq_replay:1, - rq_no_resend:1, rq_resent:1, rq_waiting:1, rq_receiving_reply:1; + rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1; int rq_phase; - + /* client-side refcount for SENT race */ atomic_t rq_refcount; int rq_request_portal; /* XXX FIXME bug 249 */ int rq_reply_portal; /* XXX FIXME bug 249 */ + /* client-side # reply bytes actually received */ + int rq_nob_received; + int rq_reqlen; struct lustre_msg *rq_reqmsg; @@ -230,20 +272,25 @@ struct ptlrpc_request { int rq_import_generation; enum lustre_imp_state rq_send_state; - wait_queue_head_t rq_reply_waitq; /* XXX also _for_ack */ - /* incoming reply */ - ptl_md_t rq_reply_md; - ptl_handle_md_t rq_reply_md_h; - - /* outgoing req/rep */ - ptl_md_t rq_req_md; + /* client+server request */ + ptl_handle_md_t rq_req_md_h; + struct ptlrpc_cb_id rq_req_cbid; + /* server-side... 
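+         * request state: the arrival timestamp, the reply state (split
+         * into its own structure so a 'difficult' reply can outlive the
+         * request buffer it answers) and the descriptor of the buffer
+         * the request arrived in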
*/ + struct timeval rq_arrival_time; /* request arrival time */ + struct ptlrpc_reply_state *rq_reply_state; /* separated reply state */ + struct ptlrpc_request_buffer_desc *rq_rqbd; /* incoming request buffer */ + + /* client-only incoming reply */ + ptl_handle_md_t rq_reply_md_h; + wait_queue_head_t rq_reply_waitq; + struct ptlrpc_cb_id rq_reply_cbid; + struct ptlrpc_peer rq_peer; /* XXX see service.c can this be factored away? */ struct obd_export *rq_export; struct obd_import *rq_import; - struct ptlrpc_service *rq_svc; - + void (*rq_replay_cb)(struct ptlrpc_request *); void (*rq_commit_cb)(struct ptlrpc_request *); void *rq_cb_data; @@ -256,17 +303,11 @@ struct ptlrpc_request { struct ptlrpc_request_set *rq_set; void *rq_interpret_reply; /* Async completion handler */ union ptlrpc_async_args rq_async_args; /* Async completion context */ - - /* Only used on the server side for tracking acks. */ - struct ptlrpc_req_ack_lock { - struct lustre_handle lock; - __u32 mode; - } rq_ack_locks[REQ_MAX_ACK_LOCKS]; }; #define RQ_PHASE_NEW 0xebc0de00 -#define RQ_PHASE_RPC 0xebc0de01 +#define RQ_PHASE_RPC 0xebc0de01 #define RQ_PHASE_BULK 0xebc0de02 #define RQ_PHASE_INTERPRET 0xebc0de03 #define RQ_PHASE_COMPLETE 0xebc0de04 @@ -276,20 +317,19 @@ struct ptlrpc_request { #define PTLRPC_REQUEST_COMPLETE(req) ((req)->rq_phase > RQ_PHASE_RPC) -#define DEBUG_REQ_FLAGS(req) \ - ((req->rq_phase == RQ_PHASE_NEW) ? "New" : \ - (req->rq_phase == RQ_PHASE_RPC) ? "RPC" : \ - (req->rq_phase == RQ_PHASE_INTERPRET) ? "Interpret" : \ - (req->rq_phase == RQ_PHASE_COMPLETE) ? "Complete" : \ - (req->rq_phase == RQ_PHASE_BULK) ? "Bulk" : "?phase?"), \ - FLAG(req->rq_intr, "I"), FLAG(req->rq_replied, "R"), \ - FLAG(req->rq_want_ack, "A"), FLAG(req->rq_err, "E"), \ - FLAG(req->rq_timedout, "X") /* eXpired */, FLAG(req->rq_resend, "S"), \ - FLAG(req->rq_restart, "T"), FLAG(req->rq_replay, "P"), \ - FLAG(req->rq_no_resend, "N"), FLAG(req->rq_resent, "s"), \ +#define DEBUG_REQ_FLAGS(req) \ + ((req->rq_phase == RQ_PHASE_NEW) ? "New" : \ + (req->rq_phase == RQ_PHASE_RPC) ? "Rpc" : \ + (req->rq_phase == RQ_PHASE_INTERPRET) ? "Interpret" : \ + (req->rq_phase == RQ_PHASE_COMPLETE) ? "Complete" : "?phase?"), \ + FLAG(req->rq_intr, "I"), FLAG(req->rq_replied, "R"), \ + FLAG(req->rq_err, "E"), \ + FLAG(req->rq_timedout, "X") /* eXpired */, FLAG(req->rq_resend, "S"), \ + FLAG(req->rq_restart, "T"), FLAG(req->rq_replay, "P"), \ + FLAG(req->rq_no_resend, "N"), \ FLAG(req->rq_waiting, "W") -#define REQ_FLAGS_FMT "%s:%s%s%s%s%s%s%s%s%s%s%s" +#define REQ_FLAGS_FMT "%s:%s%s%s%s%s%s%s%s%s" #define DEBUG_REQ(level, req, fmt, args...) 
\ do { \ @@ -312,20 +352,19 @@ CDEBUG(level, "@@@ " fmt \ } while (0) struct ptlrpc_bulk_page { - struct ptlrpc_bulk_desc *bp_desc; struct list_head bp_link; int bp_buflen; int bp_pageoffset; /* offset within a page */ struct page *bp_page; }; -#define BULK_GET_SOURCE 0 +#define BULK_GET_SOURCE 0 #define BULK_PUT_SINK 1 #define BULK_GET_SINK 2 #define BULK_PUT_SOURCE 3 struct ptlrpc_bulk_desc { - unsigned int bd_complete:1; + unsigned int bd_success:1; /* completed successfully */ unsigned int bd_network_rw:1; /* accessible to the network */ unsigned int bd_type:2; /* {put,get}{source,sink} */ unsigned int bd_registered:1; /* client side */ @@ -335,17 +374,17 @@ struct ptlrpc_bulk_desc { struct obd_import *bd_import; __u32 bd_portal; struct ptlrpc_request *bd_req; /* associated request */ - wait_queue_head_t bd_waitq; /* server side only WQ */ - struct list_head bd_page_list; - __u32 bd_page_count; - __u32 bd_last_xid; - - ptl_md_t bd_md; - ptl_handle_md_t bd_md_h; - ptl_handle_me_t bd_me_h; + wait_queue_head_t bd_waitq; /* server side only WQ */ + int bd_page_count; /* # pages (== entries in bd_iov) */ + int bd_max_pages; /* allocated size of bd_iov */ + int bd_nob; /* # bytes covered */ + int bd_nob_transferred; /* # bytes GOT/PUT */ - int bd_callback_count; /* server side callbacks */ + __u64 bd_last_xid; + struct ptlrpc_cb_id bd_cbid; /* network callback info */ + ptl_handle_md_t bd_md_h; /* associated MD */ + #ifdef __KERNEL__ ptl_kiov_t bd_iov[PTL_MD_MAX_IOV]; #else @@ -363,9 +402,12 @@ struct ptlrpc_thread { struct ptlrpc_request_buffer_desc { struct list_head rqbd_list; struct ptlrpc_srv_ni *rqbd_srv_ni; - ptl_handle_me_t rqbd_me_h; - atomic_t rqbd_refcount; + ptl_handle_md_t rqbd_md_h; + int rqbd_refcount; + int rqbd_eventcount; char *rqbd_buffer; + struct ptlrpc_cb_id rqbd_cbid; + struct ptlrpc_request rqbd_req; }; /* event queues are per-ni, because one day we may get a hardware @@ -376,57 +418,64 @@ struct ptlrpc_ni { /* Generic interface state */ char *pni_name; int pni_number; ptl_handle_ni_t pni_ni_h; - ptl_handle_eq_t pni_request_out_eq_h; - ptl_handle_eq_t pni_reply_in_eq_h; - ptl_handle_eq_t pni_reply_out_eq_h; - ptl_handle_eq_t pni_bulk_put_source_eq_h; - ptl_handle_eq_t pni_bulk_put_sink_eq_h; - ptl_handle_eq_t pni_bulk_get_source_eq_h; - ptl_handle_eq_t pni_bulk_get_sink_eq_h; + ptl_handle_eq_t pni_eq_h; }; struct ptlrpc_srv_ni { /* Interface-specific service state */ struct ptlrpc_service *sni_service; /* owning service */ struct ptlrpc_ni *sni_ni; /* network interface */ - ptl_handle_eq_t sni_eq_h; /* event queue handle */ - struct list_head sni_rqbds; /* all the request buffer descriptors */ - __u32 sni_nrqbds; /* # request buffers */ - atomic_t sni_nrqbds_receiving; /* # request buffers posted */ + struct list_head sni_rqbds; /* all the request buffers */ + struct list_head sni_active_replies; /* all the active replies */ + int sni_nrqbd_receiving; /* # posted request buffers */ }; -struct ptlrpc_service { - time_t srv_time; - time_t srv_timeout; - - struct list_head srv_ni_list; /* list of interfaces */ - __u32 srv_max_req_size; /* biggest request to receive */ - __u32 srv_buf_size; /* # bytes in a request buffer */ +typedef int (*svc_handler_t)(struct ptlrpc_request *req); +struct ptlrpc_service { + struct list_head srv_list; /* chain thru all services */ + int srv_max_req_size; /* biggest request to receive */ + int srv_buf_size; /* size of individual buffers */ + int srv_nbufs; /* total # req buffer descs allocated */ + int srv_nthreads; /* # running threads */ 
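+        /* NB a reply is 'difficult' when it carries ACK/commit callbacks
+         * (see ptlrpc_reply_state above); such replies are counted
+         * separately because they outlive request processing */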
+ int srv_n_difficult_replies; /* # 'difficult' replies */ + int srv_n_active_reqs; /* # reqs being served */ + __u32 srv_req_portal; __u32 srv_rep_portal; - __u32 srv_xid; + int srv_n_queued_reqs; /* # reqs waiting to be served */ + struct list_head srv_request_queue; /* reqs waiting for service */ + + atomic_t srv_outstanding_replies; + struct list_head srv_reply_queue; /* replies waiting for service */ wait_queue_head_t srv_waitq; /* all threads sleep on this */ - spinlock_t srv_lock; - struct list_head srv_threads; - int (*srv_handler)(struct ptlrpc_request *req); + struct list_head srv_threads; + struct obd_device *srv_obddev; + svc_handler_t srv_handler; + char *srv_name; /* only statically allocated strings here; we don't clean them */ - struct proc_dir_entry *srv_procroot; - struct lprocfs_stats *srv_stats; - int srv_interface_rover; + spinlock_t srv_lock; + + struct proc_dir_entry *srv_procroot; + struct lprocfs_stats *srv_stats; + struct ptlrpc_srv_ni srv_interfaces[0]; }; -typedef int (*svc_handler_t)(struct ptlrpc_request *req); - /* ptlrpc/events.c */ extern struct ptlrpc_ni ptlrpc_interfaces[]; extern int ptlrpc_ninterfaces; extern int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, struct ptlrpc_peer *peer); +extern void request_out_callback (ptl_event_t *ev); +extern void reply_in_callback(ptl_event_t *ev); +extern void client_bulk_callback (ptl_event_t *ev); +extern void request_in_callback(ptl_event_t *ev); +extern void reply_out_callback(ptl_event_t *ev); +extern void server_bulk_callback (ptl_event_t *ev); /* ptlrpc/connection.c */ void ptlrpc_dump_connections(void); @@ -439,28 +488,28 @@ void ptlrpc_init_connection(void); void ptlrpc_cleanup_connection(void); /* ptlrpc/niobuf.c */ -int ptlrpc_bulk_put(struct ptlrpc_bulk_desc *); -int ptlrpc_bulk_get(struct ptlrpc_bulk_desc *); -void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *bulk); +int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc); +void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc); int ptlrpc_register_bulk(struct ptlrpc_request *req); void ptlrpc_unregister_bulk (struct ptlrpc_request *req); -static inline int ptlrpc_bulk_complete (struct ptlrpc_bulk_desc *desc) +static inline int ptlrpc_bulk_active (struct ptlrpc_bulk_desc *desc) { unsigned long flags; int rc; spin_lock_irqsave (&desc->bd_lock, flags); - rc = desc->bd_complete; + rc = desc->bd_network_rw; spin_unlock_irqrestore (&desc->bd_lock, flags); return (rc); } +int ptlrpc_send_reply(struct ptlrpc_request *req, int); int ptlrpc_reply(struct ptlrpc_request *req); int ptlrpc_error(struct ptlrpc_request *req); void ptlrpc_resend_req(struct ptlrpc_request *request); int ptl_send_rpc(struct ptlrpc_request *request); -void ptlrpc_link_svc_me(struct ptlrpc_request_buffer_desc *rqbd); +void ptlrpc_register_rqbd (struct ptlrpc_request_buffer_desc *rqbd); /* ptlrpc/client.c */ void ptlrpc_init_client(int req_portal, int rep_portal, char *name, @@ -468,6 +517,39 @@ void ptlrpc_init_client(int req_portal, int rep_portal, char *name, void ptlrpc_cleanup_client(struct obd_import *imp); struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid); +static inline int +ptlrpc_client_receiving_reply (struct ptlrpc_request *req) +{ + unsigned long flags; + int rc; + + spin_lock_irqsave(&req->rq_lock, flags); + rc = req->rq_receiving_reply; + spin_unlock_irqrestore(&req->rq_lock, flags); + return (rc); +} + +static inline int +ptlrpc_client_replied (struct ptlrpc_request *req) +{ + unsigned long flags; + int rc; + + spin_lock_irqsave(&req->rq_lock, 
flags); + rc = req->rq_replied; + spin_unlock_irqrestore(&req->rq_lock, flags); + return (rc); +} + +static inline void +ptlrpc_wake_client_req (struct ptlrpc_request *req) +{ + if (req->rq_set == NULL) + wake_up(&req->rq_reply_waitq); + else + wake_up(&req->rq_set->set_waitq); +} + int ptlrpc_queue_wait(struct ptlrpc_request *req); int ptlrpc_replay_req(struct ptlrpc_request *req); void ptlrpc_unregister_reply(struct ptlrpc_request *req); @@ -493,28 +575,32 @@ void ptlrpc_req_finished(struct ptlrpc_request *request); void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request); struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req); struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp (struct ptlrpc_request *req, - int type, int portal); + int npages, int type, int portal); struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp(struct ptlrpc_request *req, - int type, int portal); + int npages, int type, int portal); void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk); -int ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, - struct page *page, int pageoffset, int len); -void ptlrpc_free_bulk_page(struct ptlrpc_bulk_page *page); +void ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, int len); void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, struct obd_import *imp); __u64 ptlrpc_next_xid(void); /* ptlrpc/service.c */ -struct ptlrpc_service * -ptlrpc_init_svc(__u32 nevents, __u32 nbufs, __u32 bufsize, __u32 max_req_size, - int req_portal, int rep_portal, svc_handler_t, char *name, - struct proc_dir_entry *proc_entry); +void ptlrpc_save_lock (struct ptlrpc_request *req, + struct lustre_handle *lock, int mode); +void ptlrpc_commit_replies (struct obd_device *obd); +void ptlrpc_schedule_difficult_reply (struct ptlrpc_reply_state *rs); +struct ptlrpc_service *ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size, + int req_portal, int rep_portal, + svc_handler_t, char *name, + struct proc_dir_entry *proc_entry); void ptlrpc_stop_all_threads(struct ptlrpc_service *svc); int ptlrpc_start_n_threads(struct obd_device *dev, struct ptlrpc_service *svc, int cnt, char *base_name); int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc, char *name); int ptlrpc_unregister_service(struct ptlrpc_service *service); +int liblustre_check_services (void *arg); struct ptlrpc_svc_data { char *name; @@ -535,6 +621,7 @@ int lustre_pack_request(struct ptlrpc_request *, int count, int *lens, char **bufs); int lustre_pack_reply(struct ptlrpc_request *, int count, int *lens, char **bufs); +void lustre_free_reply_state(struct ptlrpc_reply_state *rs); int lustre_msg_size(int count, int *lengths); int lustre_unpack_msg(struct lustre_msg *m, int len); void *lustre_msg_buf(struct lustre_msg *m, int n, int minlen); @@ -571,7 +658,6 @@ void ptlrpc_lprocfs_unregister_obd(struct obd_device *obddev); #endif /* ptlrpc/llog_server.c */ -struct llog_obd_ctxt; int llog_origin_handle_create(struct ptlrpc_request *req); int llog_origin_handle_next_block(struct ptlrpc_request *req); int llog_origin_handle_read_header(struct ptlrpc_request *req); diff --git a/lustre/include/linux/obd.h b/lustre/include/linux/obd.h index 619010b..ec90c84 100644 --- a/lustre/include/linux/obd.h +++ b/lustre/include/linux/obd.h @@ -480,7 +480,8 @@ struct obd_device { int obd_replayed_requests; int obd_requests_queued_for_recovery; wait_queue_head_t obd_next_transno_waitq; - wait_queue_head_t obd_commit_waitq; + struct list_head obd_uncommitted_replies; + 
spinlock_t obd_uncommitted_replies_lock; struct timer_list obd_recovery_timer; struct list_head obd_recovery_queue; struct list_head obd_delayed_reply_queue; @@ -666,7 +667,7 @@ static inline void obd_transno_commit_cb(struct obd_device *obd, __u64 transno, obd->obd_name, transno); if (transno > obd->obd_last_committed) { obd->obd_last_committed = transno; - wake_up(&obd->obd_commit_waitq); + ptlrpc_commit_replies (obd); } } diff --git a/lustre/kernel_patches/patches/2.6.0-test6-mm4.patch b/lustre/kernel_patches/patches/2.6.0-test6-mm4.patch index 6293972..ff8d63b 100644 --- a/lustre/kernel_patches/patches/2.6.0-test6-mm4.patch +++ b/lustre/kernel_patches/patches/2.6.0-test6-mm4.patch @@ -14430,7 +14430,7 @@ +++ 25/arch/parisc/lib/checksum.c 2003-10-05 00:33:23.000000000 -0700 @@ -16,8 +16,10 @@ * - * $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $ + * $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $ */ -#include +#include @@ -31511,8 +31511,8 @@ --- linux-2.6.0-test6/drivers/char/ftape/compressor/zftape-compress.c 2003-06-14 12:18:32.000000000 -0700 +++ 25/drivers/char/ftape/compressor/zftape-compress.c 2003-10-05 00:33:24.000000000 -0700 @@ -31,6 +31,7 @@ - char zftc_rev[] = "$Revision: 1.3 $"; - char zftc_dat[] = "$Date: 2003/12/03 05:12:20 $"; + char zftc_rev[] = "$Revision: 1.4 $"; + char zftc_dat[] = "$Date: 2004/02/14 03:14:33 $"; +#include #include @@ -37169,8 +37169,8 @@ --- linux-2.6.0-test6/drivers/isdn/hardware/eicon/divamnt.c 2003-09-27 18:57:44.000000000 -0700 +++ 25/drivers/isdn/hardware/eicon/divamnt.c 2003-10-05 00:33:24.000000000 -0700 @@ -1,4 +1,4 @@ --/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $ -+/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $ +-/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $ ++/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $ * * Driver for Eicon DIVA Server ISDN cards. * Maint module @@ -37181,16 +37181,16 @@ -#include "di_defs.h" #include "debug_if.h" --static char *main_revision = "$Revision: 1.3 $"; -+static char *main_revision = "$Revision: 1.3 $"; +-static char *main_revision = "$Revision: 1.4 $"; ++static char *main_revision = "$Revision: 1.4 $"; static int major; --- linux-2.6.0-test6/drivers/isdn/hardware/eicon/divasmain.c 2003-09-27 18:57:44.000000000 -0700 +++ 25/drivers/isdn/hardware/eicon/divasmain.c 2003-10-05 00:33:24.000000000 -0700 @@ -1,4 +1,4 @@ --/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $ -+/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $ +-/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $ ++/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $ * * Low level driver for Eicon DIVA Server ISDN cards. 
* @@ -37212,16 +37212,16 @@ #include "diva_dma.h" #include "diva_pci.h" --static char *main_revision = "$Revision: 1.3 $"; -+static char *main_revision = "$Revision: 1.3 $"; +-static char *main_revision = "$Revision: 1.4 $"; ++static char *main_revision = "$Revision: 1.4 $"; static int major; --- linux-2.6.0-test6/drivers/isdn/hardware/eicon/dqueue.c 2003-06-14 12:18:22.000000000 -0700 +++ 25/drivers/isdn/hardware/eicon/dqueue.c 2003-10-05 00:33:24.000000000 -0700 @@ -1,10 +1,10 @@ --/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $ -+/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $ +-/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $ ++/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $ * * Driver for Eicon DIVA Server ISDN cards. * User Mode IDI Interface @@ -37236,8 +37236,8 @@ --- linux-2.6.0-test6/drivers/isdn/hardware/eicon/mntfunc.c 2003-09-27 18:57:44.000000000 -0700 +++ 25/drivers/isdn/hardware/eicon/mntfunc.c 2003-10-05 00:33:24.000000000 -0700 @@ -1,4 +1,4 @@ --/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $ -+/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $ +-/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $ ++/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $ * * Driver for Eicon DIVA Server ISDN cards. * Maint module @@ -37252,8 +37252,8 @@ --- linux-2.6.0-test6/drivers/isdn/hardware/eicon/os_capi.h 2003-06-14 12:18:25.000000000 -0700 +++ 25/drivers/isdn/hardware/eicon/os_capi.h 2003-10-05 00:33:24.000000000 -0700 @@ -1,10 +1,10 @@ --/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $ -+/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $ +-/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $ ++/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $ * * ISDN interface module for Eicon active cards DIVA. 
* CAPI Interface OS include files @@ -37268,8 +37268,8 @@ --- linux-2.6.0-test6/drivers/isdn/hardware/eicon/platform.h 2003-09-27 18:57:44.000000000 -0700 +++ 25/drivers/isdn/hardware/eicon/platform.h 2003-10-05 00:33:24.000000000 -0700 @@ -1,4 +1,4 @@ --/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $ -+/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $ +-/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $ ++/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $ * * platform.h * @@ -37754,7 +37754,7 @@ +++ 25/drivers/media/video/planb.c 2003-10-05 00:33:24.000000000 -0700 @@ -27,7 +27,6 @@ - /* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $ */ + /* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $ */ -#include #include @@ -38069,7 +38069,7 @@ --- linux-2.6.0-test6/drivers/mtd/chips/map_rom.c 2003-06-14 12:18:24.000000000 -0700 +++ 25/drivers/mtd/chips/map_rom.c 2003-10-05 00:33:24.000000000 -0700 @@ -4,7 +4,6 @@ - * $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $ + * $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $ */ -#include @@ -42159,8 +42159,8 @@ #include /* Version */ --static const char version[] = "$Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $ for Linux\n"; -+static const char version[] = "$Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $ for Linux\n"; +-static const char version[] = "$Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $ for Linux\n"; ++static const char version[] = "$Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $ for Linux\n"; static int debug; static int quartz; diff --git a/lustre/kernel_patches/patches/bproc-patch-2.4.20 b/lustre/kernel_patches/patches/bproc-patch-2.4.20 index 5411d9c..54d1f68 100644 --- a/lustre/kernel_patches/patches/bproc-patch-2.4.20 +++ b/lustre/kernel_patches/patches/bproc-patch-2.4.20 @@ -1,4 +1,4 @@ -$Id: bproc-patch-2.4.20,v 1.3 2003/12/03 05:12:25 phil Exp $ +$Id: bproc-patch-2.4.20,v 1.4 2004/02/14 03:14:37 rread Exp $ Index: linux/fs/exec.c =================================================================== @@ -764,7 +764,7 @@ Index: linux/kernel/bproc_hook.c + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * -+ * $Id: bproc-patch-2.4.20,v 1.3 2003/12/03 05:12:25 phil Exp $ ++ * $Id: bproc-patch-2.4.20,v 1.4 2004/02/14 03:14:37 rread Exp $ + *-----------------------------------------------------------------------*/ +#include +#include @@ -832,7 +832,7 @@ Index: linux/include/linux/bproc.h + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * -+ * $Id: bproc-patch-2.4.20,v 1.3 2003/12/03 05:12:25 phil Exp $ ++ * $Id: bproc-patch-2.4.20,v 1.4 2004/02/14 03:14:37 rread Exp $ + *-----------------------------------------------------------------------*/ +#ifndef _LINUX_BPROC_H +#define _LINUX_BPROC_H diff --git a/lustre/kernel_patches/patches/ext3-xattr-ptr-arith-fix.patch b/lustre/kernel_patches/patches/ext3-xattr-ptr-arith-fix.patch index 05fcf61..818596c 100644 --- a/lustre/kernel_patches/patches/ext3-xattr-ptr-arith-fix.patch +++ b/lustre/kernel_patches/patches/ext3-xattr-ptr-arith-fix.patch @@ -1,7 +1,7 @@ Index: linux-2.4.20/fs/ext3/xattr.c =================================================================== ---- linux-2.4.20.orig/fs/ext3/xattr.c 2003-11-13 
17:14:52.000000000 +0300 -+++ linux-2.4.20/fs/ext3/xattr.c 2003-11-21 16:43:48.000000000 +0300 +--- linux-2.4.20.orig/fs/ext3/xattr.c 2003-11-13 10:59:33.000000000 +0800 ++++ linux-2.4.20/fs/ext3/xattr.c 2003-11-25 21:16:51.000000000 +0800 @@ -1293,9 +1293,10 @@ goto cleanup; memcpy(header, HDR(bh), bh->b_size); diff --git a/lustre/ldlm/Makefile.am b/lustre/ldlm/Makefile.am index 768f778..0da12fc 100644 --- a/lustre/ldlm/Makefile.am +++ b/lustre/ldlm/Makefile.am @@ -13,6 +13,7 @@ if LIBLUSTRE lib_LIBRARIES = libldlm.a libldlm_a_SOURCES = l_lock.c ldlm_lock.c ldlm_resource.c ldlm_lib.c \ ldlm_plain.c ldlm_extent.c ldlm_request.c ldlm_lockd.c ldlm_internal.h +libldlm_a_CFLAGS = -fPIC endif include $(top_srcdir)/Rules diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 4b7eb3b..d1e2b49 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -497,6 +497,10 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) export = req->rq_export = class_conn2export(&conn); LASSERT(export != NULL); + /* request from liblustre? */ + if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) + export->exp_libclient = 1; + if (export->exp_connection != NULL) ptlrpc_put_connection(export->exp_connection); export->exp_connection = ptlrpc_get_connection(&req->rq_peer, @@ -888,6 +892,8 @@ int target_queue_final_reply(struct ptlrpc_request *req, int rc) int recovery_done = 0; int rc2; + LASSERT ((rc == 0) == (req->rq_reply_state != NULL)); + if (rc) { /* Just like ptlrpc_error, but without the sending. */ rc = lustre_pack_reply(req, 0, NULL, NULL); @@ -895,6 +901,7 @@ int target_queue_final_reply(struct ptlrpc_request *req, int rc) req->rq_type = PTL_RPC_MSG_ERR; } + LASSERT (!req->rq_reply_state->rs_difficult); LASSERT(list_empty(&req->rq_list)); /* XXX a bit like the request-dup code in queue_recovery_request */ OBD_ALLOC(saved_req, sizeof *saved_req); @@ -905,6 +912,8 @@ int target_queue_final_reply(struct ptlrpc_request *req, int rc) LBUG(); memcpy(saved_req, req, sizeof *saved_req); memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen); + /* the copied req takes over the reply state */ + req->rq_reply_state = NULL; req = saved_req; req->rq_reqmsg = reqmsg; class_export_get(req->rq_export); @@ -954,180 +963,131 @@ int target_queue_final_reply(struct ptlrpc_request *req, int rc) return 1; } -static void ptlrpc_abort_reply (struct ptlrpc_request *req) -{ - /* On return, we must be sure that the ACK callback has either - * happened or will not happen. Note that the SENT callback will - * happen come what may since we successfully posted the PUT. */ - int rc; - struct l_wait_info lwi; - unsigned long flags; - - again: - /* serialise with ACK callback */ - spin_lock_irqsave (&req->rq_lock, flags); - if (!req->rq_want_ack) { - spin_unlock_irqrestore (&req->rq_lock, flags); - /* The ACK callback has happened already. Although the - * SENT callback might still be outstanding (yes really) we - * don't care; this is just like normal completion. */ - return; - } - spin_unlock_irqrestore (&req->rq_lock, flags); - - /* Have a bash at unlinking the MD. This will fail until the SENT - * callback has happened since the MD is busy from the PUT. If the - * ACK still hasn't arrived after then, a successful unlink will - * ensure the ACK callback never happens. 
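The target_queue_final_reply() change above hands the reply state from the original request to its saved copy by clearing the original pointer right after the memcpy, so exactly one owner remains to free it. A minimal userspace sketch of that move idiom; the names here are hypothetical stand-ins, not the real ptlrpc types:

#include <stdlib.h>
#include <string.h>

struct reply_state { int refcount; };

struct request {
        struct reply_state *rq_reply_state;     /* owned; freed on finish */
};

static struct request *dup_request(struct request *req)
{
        struct request *saved = malloc(sizeof(*saved));

        if (saved == NULL)
                return NULL;
        memcpy(saved, req, sizeof(*saved));
        /* the copy takes over the reply state; clear the original so
         * only one of the two requests will ever free it */
        req->rq_reply_state = NULL;
        return saved;
}

int main(void)
{
        struct reply_state rs = { 1 };
        struct request req = { &rs };
        struct request *saved = dup_request(&req);

        /* req.rq_reply_state == NULL; saved->rq_reply_state == &rs */
        free(saved);
        return 0;
}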
*/ - rc = PtlMDUnlink (req->rq_reply_md_h); - switch (rc) { - default: - LBUG (); - case PTL_OK: - /* SENT callback happened; ACK callback preempted */ - LASSERT (req->rq_want_ack); - spin_lock_irqsave (&req->rq_lock, flags); - req->rq_want_ack = 0; - spin_unlock_irqrestore (&req->rq_lock, flags); - return; - case PTL_INV_MD: - return; - case PTL_MD_INUSE: - /* Still sending or ACK callback in progress: wait until - * either callback has completed and try again. - * Actually we can't wait for the SENT callback because - * there's no state the SENT callback can touch that will - * allow it to communicate with us! So we just wait here - * for a short time, effectively polling for the SENT - * callback by calling PtlMDUnlink() again, to see if it - * has finished. Note that if the ACK does arrive, its - * callback wakes us in short order. --eeb */ - lwi = LWI_TIMEOUT (HZ/4, NULL, NULL); - rc = l_wait_event(req->rq_reply_waitq, !req->rq_want_ack, - &lwi); - CDEBUG (D_HA, "Retrying req %p: %d\n", req, rc); - /* NB go back and test rq_want_ack with locking, to ensure - * if ACK callback happened, it has completed stopped - * referencing this req. */ - goto again; - } -} - -void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id) +int +target_send_reply_msg (struct ptlrpc_request *req, int rc, int fail_id) { - int i; - int netrc; - unsigned long flags; - struct ptlrpc_req_ack_lock *ack_lock; - struct l_wait_info lwi = { 0 }; - wait_queue_t commit_wait; - struct obd_device *obd = - req->rq_export ? req->rq_export->exp_obd : NULL; - struct obd_export *exp = NULL; - - if (req->rq_export) { - for (i = 0; i < REQ_MAX_ACK_LOCKS; i++) { - if (req->rq_ack_locks[i].mode) { - exp = req->rq_export; - break; + if (OBD_FAIL_CHECK(fail_id | OBD_FAIL_ONCE)) { + obd_fail_loc |= OBD_FAIL_ONCE | OBD_FAILED; + DEBUG_REQ(D_ERROR, req, "dropping reply"); + /* NB this does _not_ send with ACK disabled, to simulate + * sending OK, but timing out for the ACK */ + if (req->rq_reply_state != NULL) { + if (!req->rq_reply_state->rs_difficult) { + lustre_free_reply_state (req->rq_reply_state); + req->rq_reply_state = NULL; + } else { + struct ptlrpc_service *svc = + req->rq_rqbd->rqbd_srv_ni->sni_service; + atomic_inc(&svc->srv_outstanding_replies); } } + return (-ECOMM); } - if (exp) { - exp->exp_outstanding_reply = req; - spin_lock_irqsave (&req->rq_lock, flags); - req->rq_want_ack = 1; - spin_unlock_irqrestore (&req->rq_lock, flags); - } - - if (!OBD_FAIL_CHECK(fail_id | OBD_FAIL_ONCE)) { - if (rc == 0) { - DEBUG_REQ(D_NET, req, "sending reply"); - netrc = ptlrpc_reply(req); - } else if (rc == -ENOTCONN) { - DEBUG_REQ(D_HA, req, "processing error (%d)", rc); - netrc = ptlrpc_error(req); - } else { - DEBUG_REQ(D_ERROR, req, "processing error (%d)", rc); - netrc = ptlrpc_error(req); + if (rc) { + DEBUG_REQ(D_ERROR, req, "processing error (%d)", rc); + if (req->rq_reply_state == NULL) { + rc = lustre_pack_reply (req, 0, NULL, NULL); + if (rc != 0) { + CERROR ("can't allocate reply\n"); + return (rc); + } } + req->rq_type = PTL_RPC_MSG_ERR; } else { - obd_fail_loc |= OBD_FAIL_ONCE | OBD_FAILED; - DEBUG_REQ(D_ERROR, req, "dropping reply"); - if (req->rq_repmsg) { - OBD_FREE(req->rq_repmsg, req->rq_replen); - req->rq_repmsg = NULL; - } - init_waitqueue_head(&req->rq_reply_waitq); - netrc = 0; + DEBUG_REQ(D_NET, req, "sending reply"); } + + return (ptlrpc_send_reply(req, 1)); +} - /* a failed send simulates the callbacks */ - LASSERT(netrc == 0 || req->rq_want_ack == 0); - if (exp == NULL) { - 
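target_send_reply_msg() above begins with a fail_id check that lets tests drop a reply exactly once, to exercise the client's resend path. A hedged sketch of such a one-shot fail-loc mechanism; the flag values and matching rule are assumptions, not the real obd_fail_loc semantics:

#include <stdio.h>

#define OBD_FAIL_ONCE   0x80000000u
#define OBD_FAILED      0x40000000u
#define OBD_FAIL_MASK   0x0fffffffu

static unsigned int obd_fail_loc;

static int obd_fail_check(unsigned int id)
{
        if ((obd_fail_loc & OBD_FAIL_MASK) != (id & OBD_FAIL_MASK))
                return 0;                       /* different site armed */
        if ((obd_fail_loc & (OBD_FAIL_ONCE | OBD_FAILED)) ==
            (OBD_FAIL_ONCE | OBD_FAILED))
                return 0;                       /* one-shot already fired */
        return 1;
}

int main(void)
{
        obd_fail_loc = 0x123;                   /* arm drop-reply site */

        if (obd_fail_check(0x123 | OBD_FAIL_ONCE)) {
                obd_fail_loc |= OBD_FAIL_ONCE | OBD_FAILED;
                printf("dropping reply once (simulated -ECOMM)\n");
        }
        if (!obd_fail_check(0x123 | OBD_FAIL_ONCE))
                printf("subsequent replies go through\n");
        return 0;
}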
LASSERT(req->rq_want_ack == 0); +void +target_send_reply(struct ptlrpc_request *req, int rc, int fail_id) +{ + int netrc; + unsigned long flags; + struct ptlrpc_reply_state *rs; + struct obd_device *obd; + struct obd_export *exp; + struct ptlrpc_srv_ni *sni; + struct ptlrpc_service *svc; + + sni = req->rq_rqbd->rqbd_srv_ni; + svc = sni->sni_service; + + rs = req->rq_reply_state; + if (rs == NULL || !rs->rs_difficult) { + /* The easy case; no notifiers and reply_out_callback() + * cleans up (i.e. we can't look inside rs after a + * successful send) */ + netrc = target_send_reply_msg (req, rc, fail_id); + + LASSERT (netrc == 0 || req->rq_reply_state == NULL); return; } - LASSERT(obd != NULL); - - init_waitqueue_entry(&commit_wait, current); - add_wait_queue(&obd->obd_commit_waitq, &commit_wait); - rc = l_wait_event(req->rq_reply_waitq, - !req->rq_want_ack || req->rq_resent || - req->rq_transno <= obd->obd_last_committed, &lwi); - remove_wait_queue(&obd->obd_commit_waitq, &commit_wait); - - spin_lock_irqsave (&req->rq_lock, flags); - /* If we got here because the ACK callback ran, this acts as a - * barrier to ensure the callback completed the wakeup. */ - spin_unlock_irqrestore (&req->rq_lock, flags); - - /* If we committed the transno already, then we might wake up before - * the ack arrives. We need to stop waiting for the ack before we can - * reuse this request structure. We are guaranteed by this point that - * this cannot abort the sending of the actual reply.*/ - ptlrpc_abort_reply(req); - - if (req->rq_resent) { - DEBUG_REQ(D_HA, req, "resent: not cancelling locks"); - return; + + /* must be an export if locks saved */ + LASSERT (req->rq_export != NULL); + /* req/reply consistent */ + LASSERT (rs->rs_srv_ni == sni); + + /* "fresh" reply */ + LASSERT (!rs->rs_scheduled); + LASSERT (!rs->rs_scheduled_ever); + LASSERT (!rs->rs_handled); + LASSERT (!rs->rs_on_net); + LASSERT (rs->rs_export == NULL); + LASSERT (list_empty(&rs->rs_obd_list)); + LASSERT (list_empty(&rs->rs_exp_list)); + + exp = class_export_get (req->rq_export); + obd = exp->exp_obd; + + /* disable reply scheduling onto srv_reply_queue while I'm setting up */ + rs->rs_scheduled = 1; + rs->rs_on_net = 1; + rs->rs_xid = req->rq_xid; + rs->rs_transno = req->rq_transno; + rs->rs_export = exp; + + spin_lock_irqsave (&obd->obd_uncommitted_replies_lock, flags); + + if (rs->rs_transno > obd->obd_last_committed) { + /* not committed already */ + list_add_tail (&rs->rs_obd_list, + &obd->obd_uncommitted_replies); } - LASSERT(rc == 0); - DEBUG_REQ(D_HA, req, "cancelling locks for %s", - req->rq_want_ack ? 
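The rewritten target_send_reply() above keeps a "difficult" reply pinned until two independent events have completed: the network is finished with the buffer (rs_on_net) and the transaction has committed (the reply leaves the obd uncommitted list). A condensed model of that rendezvous, assuming a single flag per event in place of the real list-membership tests:

#include <stdio.h>

struct reply_state {
        int on_net;             /* reply buffer still posted to the net */
        int uncommitted;        /* transno not yet committed to disk */
        int scheduled;          /* already queued for final handling */
};

/* called from both the network callback and the commit callback */
static int reply_try_schedule(struct reply_state *rs)
{
        if (rs->scheduled || rs->on_net || rs->uncommitted)
                return 0;       /* queued already, or an event pending */
        rs->scheduled = 1;      /* both events done: release the reply */
        return 1;
}

int main(void)
{
        struct reply_state rs = { 1, 1, 0 };

        rs.on_net = 0;          /* net callback: buffer unlinked */
        printf("after net event:    %d\n", reply_try_schedule(&rs));
        rs.uncommitted = 0;     /* commit callback: transno stable */
        printf("after commit event: %d\n", reply_try_schedule(&rs));
        return 0;
}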
"commit" : "ack"); + spin_unlock (&obd->obd_uncommitted_replies_lock); + spin_lock (&exp->exp_lock); - exp->exp_outstanding_reply = NULL; + list_add_tail (&rs->rs_exp_list, &exp->exp_outstanding_replies); - for (ack_lock = req->rq_ack_locks, i = 0; - i < REQ_MAX_ACK_LOCKS; i++, ack_lock++) { - if (!ack_lock->mode) - continue; - ldlm_lock_decref(&ack_lock->lock, ack_lock->mode); + spin_unlock_irqrestore (&exp->exp_lock, flags); + + netrc = target_send_reply_msg (req, rc, fail_id); + + spin_lock_irqsave (&svc->srv_lock, flags); + + svc->srv_n_difficult_replies++; + + if (netrc != 0) /* error sending: reply is off the net */ + rs->rs_on_net = 0; + + if (!rs->rs_on_net || /* some notifier */ + list_empty(&rs->rs_exp_list) || /* completed already */ + list_empty(&rs->rs_obd_list)) { + list_add_tail (&rs->rs_list, &svc->srv_reply_queue); + wake_up (&svc->srv_waitq); + } else { + list_add (&rs->rs_list, &sni->sni_active_replies); + rs->rs_scheduled = 0; /* allow notifier to schedule */ } + + spin_unlock_irqrestore (&svc->srv_lock, flags); } int target_handle_ping(struct ptlrpc_request *req) { return lustre_pack_reply(req, 0, NULL, NULL); } - -void *ldlm_put_lock_into_req(struct ptlrpc_request *req, - struct lustre_handle *lock, int mode) -{ - int i; - - for (i = 0; i < REQ_MAX_ACK_LOCKS; i++) { - if (req->rq_ack_locks[i].mode) - continue; - CDEBUG(D_HA, "saving lock "LPX64" in req %p ack_lock[%d]\n", - lock->cookie, req, i); - memcpy(&req->rq_ack_locks[i].lock, lock, sizeof(*lock)); - req->rq_ack_locks[i].mode = mode; - return &req->rq_ack_locks[i]; - } - CERROR("no space for lock in struct ptlrpc_request\n"); - LBUG(); - return NULL; -} diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c index 9ed2684..5fde33e 100644 --- a/lustre/ldlm/ldlm_lock.c +++ b/lustre/ldlm/ldlm_lock.c @@ -890,7 +890,28 @@ static int reprocess_one_queue(struct ldlm_resource *res, void *closure) void ldlm_reprocess_all_ns(struct ldlm_namespace *ns) { - (void)ldlm_namespace_foreach_res(ns, reprocess_one_queue, NULL); + int i, rc; + + l_lock(&ns->ns_lock); + for (i = 0; i < RES_HASH_SIZE; i++) { + struct list_head *tmp, *next; + list_for_each_safe(tmp, next, &(ns->ns_hash[i])) { + struct ldlm_resource *res = + list_entry(tmp, struct ldlm_resource, lr_hash); + + ldlm_resource_getref(res); + l_unlock(&ns->ns_lock); + rc = reprocess_one_queue(res, NULL); + l_lock(&ns->ns_lock); + next = tmp->next; + ldlm_resource_putref(res); + if (rc == LDLM_ITER_STOP) + GOTO(out, rc); + } + } + out: + l_unlock(&ns->ns_lock); + EXIT; } void ldlm_reprocess_all(struct ldlm_resource *res) diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index e1fe658..2d7946b 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -389,23 +389,17 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock, req->rq_timeout = 2; /* 2 second timeout for initial AST reply */ rc = ptlrpc_queue_wait(req); if (rc == -ETIMEDOUT || rc == -EINTR) { -#ifdef __KERNEL__ - ldlm_del_waiting_lock(lock); - ldlm_failed_ast(lock, rc, "blocking"); -#else - /* XXX - * Here we treat all clients as liblustre. When BLOCKING AST - * timeout we don't evicting the client and only cancel - * the lock. - * restore to orignial implementation later!!! 
- * XXX - */ - CERROR("BLOCKING AST to client (nid "LPU64") timeout, " - "simply cancel lock 0x%p\n", - req->rq_peer.peer_nid, lock); - ldlm_lock_cancel(lock); - rc = -ERESTART; -#endif + LASSERT(lock->l_export); + if (lock->l_export->exp_libclient) { + CDEBUG(D_HA, "BLOCKING AST to liblustre client (nid " + LPU64") timeout, simply cancel lock 0x%p\n", + req->rq_peer.peer_nid, lock); + ldlm_lock_cancel(lock); + rc = -ERESTART; + } else { + ldlm_del_waiting_lock(lock); + ldlm_failed_ast(lock, rc, "blocking"); + } } else if (rc) { if (rc == -EINVAL) CDEBUG(D_DLMTRACE, "client (nid "LPU64") returned %d " @@ -1145,9 +1139,8 @@ static int ldlm_setup(void) #endif ldlm->ldlm_cb_service = - ptlrpc_init_svc(LDLM_NEVENTS, LDLM_NBUFS, LDLM_BUFSIZE, - LDLM_MAXREQSIZE, LDLM_CB_REQUEST_PORTAL, - LDLM_CB_REPLY_PORTAL, + ptlrpc_init_svc(LDLM_NBUFS, LDLM_BUFSIZE, LDLM_MAXREQSIZE, + LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL, ldlm_callback_handler, "ldlm_cbd", ldlm_svc_proc_dir); @@ -1157,8 +1150,8 @@ static int ldlm_setup(void) } ldlm->ldlm_cancel_service = - ptlrpc_init_svc(LDLM_NEVENTS, LDLM_NBUFS, LDLM_BUFSIZE, - LDLM_MAXREQSIZE, LDLM_CANCEL_REQUEST_PORTAL, + ptlrpc_init_svc(LDLM_NBUFS, LDLM_BUFSIZE, LDLM_MAXREQSIZE, + LDLM_CANCEL_REQUEST_PORTAL, LDLM_CANCEL_REPLY_PORTAL, ldlm_cancel_handler, "ldlm_canceld", ldlm_svc_proc_dir); @@ -1404,4 +1397,3 @@ EXPORT_SYMBOL(target_queue_recovery_request); EXPORT_SYMBOL(target_handle_ping); EXPORT_SYMBOL(target_handle_disconnect); EXPORT_SYMBOL(target_queue_final_reply); -EXPORT_SYMBOL(ldlm_put_lock_into_req); diff --git a/lustre/liblustre/Makefile.am b/lustre/liblustre/Makefile.am index ef4fa2f..6622485 100644 --- a/lustre/liblustre/Makefile.am +++ b/lustre/liblustre/Makefile.am @@ -1,63 +1,46 @@ ## Liblustre excecutables & libraries Makefile DEFS= +SUBDIRS = . 
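The ldlm_server_blocking_ast() hunk above replaces the compile-time #ifdef with a per-export decision: when a blocking AST to a liblustre client times out, the server just cancels the lock and retries rather than evicting the client. A decision sketch with illustrative types:

#include <stdio.h>

struct obd_export {
        unsigned int exp_libclient:1;   /* set from MSG_CONNECT_LIBCLIENT */
};

/* what to do when a blocking AST times out */
static const char *blocking_ast_timeout(const struct obd_export *exp)
{
        if (exp->exp_libclient)
                return "cancel the lock, return -ERESTART";
        return "evict the client (failed AST)";
}

int main(void)
{
        struct obd_export lib = { 1 }, krn = { 0 };

        printf("liblustre export: %s\n", blocking_ast_timeout(&lib));
        printf("kernel export:    %s\n", blocking_ast_timeout(&krn));
        return 0;
}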
tests + CFLAGS := -g -Wall -I$(top_srcdir)/utils -I$(top_srcdir)/portals/include \ - -I$(top_srcdir)/portals/unals -I$(SYSIO)/include \ - -I/opt/lam/include -L/opt/lam/lib + -I$(top_srcdir)/portals/unals -I$(SYSIO)/include -KFLAGS:= CPPFLAGS = $(HAVE_EFENCE) -D_LARGEFILE64_SOURCE=1 LIBS = $(LIBEFENCE) -## lustre components libs -LLIBS := ./libllite.a \ - ../lov/liblov.a \ - ../obdecho/libobdecho.a \ - ../osc/libosc.a \ - ../mdc/libmdc.a \ - ../ldlm/libldlm.a \ - ../ptlrpc/libptlrpc.a \ - ../obdclass/liblustreclass.a \ - ../lvfs/liblvfs.a - -## portals components libs -PTLLIBS := ../portals/utils/libptlctl.a \ - ../portals/unals/libtcpnal.a \ - ../portals/portals/libportals.a - -## sysio components libs -SYSIOLIBS := $(SYSIO)/drivers/native/libsysio_native.a \ +LUSTRE_LIBS = libllite.a \ + $(top_srcdir)/lov/liblov.a \ + $(top_srcdir)/obdecho/libobdecho.a \ + $(top_srcdir)/osc/libosc.a \ + $(top_srcdir)/mdc/libmdc.a \ + $(top_srcdir)/ptlrpc/libptlrpc.a \ + $(top_srcdir)/obdclass/liblustreclass.a \ + $(top_srcdir)/lvfs/liblvfs.a + +PTL_LIBS = $(top_srcdir)/portals/utils/libuptlctl.a \ + $(top_srcdir)/portals/unals/libtcpnal.a \ + $(top_srcdir)/portals/portals/libportals.a + +SYSIO_LIBS = $(SYSIO)/drivers/native/libsysio_native.a \ $(SYSIO)/drivers/sockets/libsysio_sockets.a \ $(SYSIO)/src/libsysio.a \ $(SYSIO)/dev/stdfd/libsysio_stdfd.a -LLIB_EXEC= $(PTLLIBS) $(SYSIOLIBS) -lpthread +#SYSIO_LIBS = $(SYSIO)/lib/libsysio.a -lib_LIBRARIES = -noinst_LIBRARIES = libllite.a libtestcommon.a -libllite_a_SOURCES = llite_lib.c super.c namei.c rw.c file.c -libtestcommon_a_SOURCES = test_common.c +lib_LIBRARIES = liblustre.a +noinst_LIBRARIES = libllite.a -bin_PROGRAMS = libtest lltest recovery_small replay_single #test_lock_cancel +libllite_a_SOURCES = llite_lib.c super.c namei.c rw.c file.c dir.c +libllite_a_CFLAGS = -fPIC -libtest_SOURCES = libtest.c ../utils/parser.c ../utils/obd.c ../utils/lustre_cfg.c -libtest_LDADD := $(LLIBS) $(PTLLIBS) \ - $(LIBREADLINE) -lpthread +# for make rpms -- need cleanup +liblustre_a_SOURCES = llite_lib.c super.c namei.c rw.c file.c dir.c +liblustre_a_CFLAGS = -fPIC -liblustre.a : libllite.a +liblustre.a : $(LUSTRE_LIBS) $(PTL_LIBS) $(SYSIO_LIBS) $(shell ./genlib.sh $(SYSIO) $(AR) $(LINK)) -lltest_SOURCES = lltest.c -lltest_LDADD := ./libtestcommon.a $(LLIBS) $(LLIB_EXEC) $(LIBREADLINE) - -recovery_small_SOURCES = recovery_small.c -recovery_small_LDADD := ./libtestcommon.a $(LLIBS) $(LLIB_EXEC) $(LIBREADLINE) - -replay_single_SOURCES = replay_single.c -replay_single_LDADD := ./libtestcommon.a $(LLIBS) $(LLIB_EXEC) $(LIBREADLINE) - -#test_lock_cancel_SOURCES = test_lock_cancel.c -#test_lock_cancel_LDADD := $(LLIBS) $(LLIB_EXEC) -lmpi -llam - include $(top_srcdir)/Rules diff --git a/lustre/liblustre/dir.c b/lustre/liblustre/dir.c new file mode 100644 index 0000000..cceb1e0 --- /dev/null +++ b/lustre/liblustre/dir.c @@ -0,0 +1,220 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Lustre Light Super operations + * + * Copyright (c) 2002, 2003 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#undef LIST_HEAD + +#include +#include +#include + +#include "llite_lib.h" + +static int llu_dir_do_readpage(struct inode *inode, struct page *page) +{ + struct llu_inode_info *lli = llu_i2info(inode); + struct llu_sb_info *sbi = llu_i2sbi(inode); + struct ll_fid mdc_fid; + __u64 offset; + int rc = 0; + struct ptlrpc_request *request; + struct lustre_handle lockh; + struct mds_body *body; + struct lookup_intent it = { .it_op = IT_READDIR }; + struct mdc_op_data data; + struct obd_device *obddev = class_exp2obd(sbi->ll_mdc_exp); + struct ldlm_res_id res_id = + { .name = {lli->lli_st_ino, (__u64)lli->lli_st_generation} }; + ENTRY; + + if ((lli->lli_st_size + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT <= page->index) { + /* XXX why do we need this exactly, and why do we think that + * an all-zero directory page is useful? + */ + CERROR("memsetting dir page %lu to zero (size %lld)\n", + page->index, lli->lli_st_size); + memset(page->addr, 0, PAGE_CACHE_SIZE); + GOTO(readpage_out, rc); + } + + rc = ldlm_lock_match(obddev->obd_namespace, LDLM_FL_BLOCK_GRANTED, + &res_id, LDLM_PLAIN, NULL, 0, LCK_PR, &lockh); + if (!rc) { + llu_prepare_mdc_op_data(&data, inode, NULL, NULL, 0, 0); + + rc = mdc_enqueue(sbi->ll_mdc_exp, LDLM_PLAIN, &it, LCK_PR, + &data, &lockh, NULL, 0, + ldlm_completion_ast, llu_mdc_blocking_ast, + inode); + request = (struct ptlrpc_request *)it.d.lustre.it_data; + if (request) + ptlrpc_req_finished(request); + if (rc < 0) { + CERROR("lock enqueue: err: %d\n", rc); + RETURN(rc); + } + } + ldlm_lock_dump_handle(D_OTHER, &lockh); + + mdc_pack_fid(&mdc_fid, lli->lli_st_ino, lli->lli_st_generation, S_IFDIR); + + offset = page->index << PAGE_SHIFT; + rc = mdc_readpage(sbi->ll_mdc_exp, &mdc_fid, + offset, page, &request); + if (!rc) { + body = lustre_msg_buf(request->rq_repmsg, 0, sizeof (*body)); + LASSERT (body != NULL); /* checked by mdc_readpage() */ + LASSERT_REPSWABBED (request, 0); /* swabbed by mdc_readpage() */ + + lli->lli_st_size = body->size; + } + ptlrpc_req_finished(request); + EXIT; + + readpage_out: + ldlm_lock_decref(&lockh, LCK_PR); + return rc; +} + +static struct page *llu_dir_read_page(struct inode *ino, int pgidx) +{ + struct page *page; + int rc; + ENTRY; + + page = alloc_page(0); + if (!page) { + CERROR("alloc page failed\n"); + RETURN(ERR_PTR(-ENOMEM)); + } + page->index = pgidx; + + rc = llu_dir_do_readpage(ino, page); + if (rc) { + free_page(page); + RETURN(ERR_PTR(rc)); + } + + return page; +} + +#define NAME_OFFSET(de) ((int) ((de)->d_name - (char *) (de))) +#define ROUND_UP64(x) (((x)+sizeof(__u64)-1) & ~(sizeof(__u64)-1)) + +static int filldir(char *buf, int buflen, + const char *name, int namelen, loff_t offset, + ino_t ino, unsigned int d_type, int *filled) +{ + struct dirent64 *dirent = (struct dirent64 *) (buf + *filled); + int reclen = ROUND_UP64(NAME_OFFSET(dirent) + namelen + 1); + + /* check overflow */ + if ((*filled + reclen) > buflen) + return 1; + + dirent->d_ino = ino; + dirent->d_off = offset, + dirent->d_reclen = reclen; + dirent->d_type = (unsigned short) d_type; + memcpy(dirent->d_name, name, namelen); + 
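llu_dir_do_readpage() in the new dir.c first tries ldlm_lock_match() for an already-granted PR lock on the directory and only falls back to mdc_enqueue() on a miss; both paths end in a single ldlm_lock_decref(). The shape of that match-or-enqueue idiom, with stand-in lock functions (the real calls take many more arguments):

#include <stdio.h>

/* stand-ins: both leave one reference on the handle on success;
 * lock_match() returns nonzero only when a cached lock was found */
static int lock_match(unsigned long *h)   { *h = 0;  return 0; }
static int lock_enqueue(unsigned long *h) { *h = 42; return 0; }
static void lock_decref(unsigned long h)  { (void)h; }

static int read_dir_page(void)
{
        unsigned long lockh;
        int rc;

        if (!lock_match(&lockh)) {              /* no cached lock */
                rc = lock_enqueue(&lockh);      /* ask the server */
                if (rc < 0)
                        return rc;              /* nothing to decref */
        }

        /* ... fetch and parse the directory page under the lock ... */

        lock_decref(lockh);                     /* balanced on both paths */
        return 0;
}

int main(void)
{
        printf("read_dir_page: %d\n", read_dir_page());
        return 0;
}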
dirent->d_name[namelen] = 0; + + *filled += reclen; + + return 0; +} + +ssize_t llu_iop_getdirentries(struct inode *ino, char *buf, size_t nbytes, + _SYSIO_OFF_T *basep) +{ + struct llu_inode_info *lli = llu_i2info(ino); + loff_t pos = *basep, offset; + int maxpages, pgidx, filled = 0; + ENTRY; + + if (pos == -1) + pos = lli->lli_dir_pos; + + maxpages = lli->lli_st_size >> PAGE_CACHE_SHIFT; + pgidx = pos >> PAGE_CACHE_SHIFT; + offset = pos & ~PAGE_CACHE_MASK; + + for ( ; pgidx < maxpages ; pgidx++, offset = 0) { + struct page *page; + struct ext2_dirent *de; + char *addr, *limit; + + page = llu_dir_read_page(ino, pgidx); + if (IS_ERR(page)) + continue; + + /* size might have been updated by mdc_readpage */ + maxpages = lli->lli_st_size >> PAGE_CACHE_SHIFT; + + /* fill in buffer */ + addr = page->addr; + limit = addr + PAGE_CACHE_SIZE - EXT2_DIR_REC_LEN(1); + de = (struct ext2_dirent *) (addr + offset); + + for ( ; (char*) de <= limit; de = ext2_next_entry(de)) { + if (de->inode) { + int over; + unsigned char d_type = 0; + + /* XXX handle type, etc here */ + + offset = (char*) de - addr; + over = filldir(buf, nbytes, de->name, de->name_len, + (pgidx << PAGE_CACHE_SHIFT) | offset, + le32_to_cpu(de->inode), d_type, &filled); + if (over) { + free_page(page); + GOTO(done, 0); + } + } + } + + free_page(page); + } +done: + lli->lli_dir_pos = pgidx << PAGE_CACHE_SHIFT | offset; + *basep = lli->lli_dir_pos; + RETURN(filled); +} diff --git a/lustre/liblustre/file.c b/lustre/liblustre/file.c index 58339c8..de3f35e 100644 --- a/lustre/liblustre/file.c +++ b/lustre/liblustre/file.c @@ -36,6 +36,8 @@ #include #include +#undef LIST_HEAD + #include "llite_lib.h" void llu_prepare_mdc_op_data(struct mdc_op_data *data, @@ -89,105 +91,6 @@ void obdo_refresh_inode(struct inode *dst, lli->lli_st_blocks = src->o_blocks; } -#if 0 -static int llu_create_obj(struct lustre_handle *conn, struct inode *inode, - struct lov_stripe_md *lsm) -{ - struct ptlrpc_request *req = NULL; - struct llu_inode_info *lli = llu_i2info(inode); - struct lov_mds_md *lmm = NULL; - struct obdo *oa; - struct iattr iattr; - struct mdc_op_data op_data; - struct obd_trans_info oti = { 0 }; - int rc, err, lmm_size = 0;; - ENTRY; - - oa = obdo_alloc(); - if (!oa) - RETURN(-ENOMEM); - - LASSERT(S_ISREG(inode->i_mode)); - oa->o_mode = S_IFREG | 0600; - oa->o_id = lli->lli_st_ino; - oa->o_generation = lli->lli_st_generation; - /* Keep these 0 for now, because chown/chgrp does not change the - * ownership on the OST, and we don't want to allow BA OST NFS - * users to access these objects by mistake. - */ - oa->o_uid = 0; - oa->o_gid = 0; - oa->o_valid = OBD_MD_FLID | OBD_MD_FLGENER | OBD_MD_FLTYPE | - OBD_MD_FLMODE | OBD_MD_FLUID | OBD_MD_FLGID; - - obdo_from_inode(oa, inode, OBD_MD_FLTYPE|OBD_MD_FLATIME|OBD_MD_FLMTIME| - OBD_MD_FLCTIME | - (llu_i2info(inode)->lli_st_size ? 
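The filldir() helper above packs variable-length dirent64 records into the caller's buffer; each record length is the name offset plus the name and its NUL, rounded up to 8 bytes so the following record stays aligned. A small worked example of the rounding (the struct layout is a sketch, not the libc definition):

#include <stdio.h>
#include <stddef.h>

#define ROUND_UP64(x) (((x) + sizeof(unsigned long long) - 1) & \
                       ~(sizeof(unsigned long long) - 1))

struct dirent64_sketch {                /* layout sketch only */
        unsigned long long d_ino;
        long long          d_off;
        unsigned short     d_reclen;
        unsigned char      d_type;
        char               d_name[];    /* name bytes plus NUL follow */
};

int main(void)
{
        size_t namelen = 5;             /* e.g. "hello" */
        size_t reclen = ROUND_UP64(offsetof(struct dirent64_sketch, d_name)
                                   + namelen + 1);

        printf("reclen for a 5-byte name: %zu\n", reclen);  /* 25 -> 32 */
        return 0;
}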
OBD_MD_FLSIZE : 0)); - - rc = obd_create(conn, oa, &lsm, &oti); - if (rc) { - CERROR("error creating objects for inode %lu: rc = %d\n", - lli->lli_st_ino, rc); - if (rc > 0) { - CERROR("obd_create returned invalid rc %d\n", rc); - rc = -EIO; - } - GOTO(out_oa, rc); - } - obdo_refresh_inode(inode, oa, OBD_MD_FLBLKSZ); - - LASSERT(lsm && lsm->lsm_object_id); - rc = obd_packmd(conn, &lmm, lsm); - if (rc < 0) - GOTO(out_destroy, rc); - - lmm_size = rc; - - /* Save the stripe MD with this file on the MDS */ - memset(&iattr, 0, sizeof(iattr)); - iattr.ia_valid = ATTR_FROM_OPEN; - - llu_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0); - - rc = mdc_setattr(&llu_i2sbi(inode)->ll_mdc_conn, &op_data, - &iattr, lmm, lmm_size, oti.oti_logcookies, - oti.oti_numcookies * sizeof(oti.oti_onecookie), &req); - ptlrpc_req_finished(req); - - obd_free_diskmd(conn, &lmm); - - /* If we couldn't complete mdc_open() and store the stripe MD on the - * MDS, we need to destroy the objects now or they will be leaked. - */ - if (rc) { - CERROR("error: storing stripe MD for %lu: rc %d\n", - lli->lli_st_ino, rc); - GOTO(out_destroy, rc); - } - lli->lli_smd = lsm; - lli->lli_maxbytes = lsm->lsm_maxbytes; - - EXIT; -out_oa: - oti_free_cookies(&oti); - obdo_free(oa); - return rc; - -out_destroy: - oa->o_id = lsm->lsm_object_id; - oa->o_valid = OBD_MD_FLID; - obdo_from_inode(oa, inode, OBD_MD_FLTYPE); - - err = obd_destroy(conn, oa, lsm, NULL); - obd_free_memmd(conn, &lsm); - if (err) { - CERROR("error uncreating inode %lu objects: rc %d\n", - lli->lli_st_ino, err); - } - goto out_oa; -} -#endif - static int llu_local_open(struct llu_inode_info *lli, struct lookup_intent *it) { struct ptlrpc_request *req = it->d.lustre.it_data; @@ -210,8 +113,6 @@ static int llu_local_open(struct llu_inode_info *lli, struct lookup_intent *it) * ll_mdc_close, so don't even try right now. 
*/ LASSERT(fd != NULL); - memset(fd, 0, sizeof(*fd)); - memcpy(&fd->fd_mds_och.och_fh, &body->handle, sizeof(body->handle)); fd->fd_mds_och.och_magic = OBD_CLIENT_HANDLE_MAGIC; lli->lli_file_data = fd; @@ -221,38 +122,6 @@ static int llu_local_open(struct llu_inode_info *lli, struct lookup_intent *it) RETURN(0); } -#if 0 -static int llu_osc_open(struct lustre_handle *conn, struct inode *inode, - struct lov_stripe_md *lsm) -{ - struct ll_file_data *fd = llu_i2info(inode)->lli_file_data; - struct obdo *oa; - int rc; - ENTRY; - - oa = obdo_alloc(); - if (!oa) - RETURN(-ENOMEM); - oa->o_id = lsm->lsm_object_id; - oa->o_mode = S_IFREG; - oa->o_valid = (OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLBLOCKS | - OBD_MD_FLMTIME | OBD_MD_FLCTIME); - rc = obd_open(conn, oa, lsm, NULL, &fd->fd_ost_och); - if (rc) - GOTO(out, rc); - - /* file->f_flags &= ~O_LOV_DELAY_CREATE; */ - obdo_to_inode(inode, oa, OBD_MD_FLBLOCKS | OBD_MD_FLMTIME | - OBD_MD_FLCTIME); - - EXIT; -out: - obdo_free(oa); - return rc; -} -#endif - - int llu_iop_open(struct pnode *pnode, int flags, mode_t mode) { struct inode *inode = pnode->p_base->pb_ino; @@ -264,19 +133,15 @@ int llu_iop_open(struct pnode *pnode, int flags, mode_t mode) int rc = 0; ENTRY; + /* don't do anything for '/' */ + if (llu_is_root_inode(inode)) + RETURN(0); + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu\n", lli->lli_st_ino); LL_GET_INTENT(inode, it); if (!it->d.lustre.it_disposition) { -#if 0 - struct lookup_intent oit = { .it_op = IT_OPEN, - .it_flags = file->f_flags }; - it = &oit; - rc = ll_intent_file_open(file, NULL, 0, it); - if (rc) - GOTO(out_release, rc); -#endif - CERROR("fixme!!\n"); + LBUG(); } rc = it_open_error(DISP_OPEN_OPEN, it); @@ -298,17 +163,6 @@ int llu_iop_open(struct pnode *pnode, int flags, mode_t mode) CDEBUG(D_INODE, "object creation was delayed\n"); GOTO(out_release, rc); } -#if 0 - if (!lli->lli_smd) { - rc = llu_create_obj(conn, inode, NULL); - if (rc) - GOTO(out_close, rc); - } else { - CERROR("warning: stripe already set on ino %lu\n", - lli->lli_st_ino); - } - lsm = lli->lli_smd; -#endif } fd->fd_flags &= ~O_LOV_DELAY_CREATE; @@ -451,12 +305,8 @@ int llu_file_release(struct inode *inode) CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%lu\n", lli->lli_st_ino, lli->lli_st_generation); - /* FIXME need add this check later. how to find the root pnode? */ -#if 0 - /* don't do anything for / */ - if (inode->i_sb->s_root == file->f_dentry) - RETURN(0); -#endif + /* XXX don't do anything for '/'. but how to find the root pnode? */ + /* still opened by others? */ if (--lli->lli_open_count) RETURN(0); diff --git a/lustre/liblustre/genlib.sh b/lustre/liblustre/genlib.sh index 52b4b88..f371650 100755 --- a/lustre/liblustre/genlib.sh +++ b/lustre/liblustre/genlib.sh @@ -1,4 +1,5 @@ #!/bin/bash +#set -xv # # This script is to generate lib lustre library as a whole. It will leave @@ -8,48 +9,80 @@ AR=/usr/bin/ar LD=/usr/bin/ld +RANLIB=/usr/bin/ranlib CWD=`pwd` SYSIO=$1 +#if [ ! 
-f $SYSIO/lib/libsysio.a ]; then +# echo "ERROR: $SYSIO/lib/libsysio.a dosen't exist" +# exit 1 +#fi +# +# do cleanup at first +#rm -f liblustre.so + ALL_OBJS= build_obj_list() { _objs=`$AR -t $1/$2` for _lib in $_objs; do - ALL_OBJS=$ALL_OBJS"$1/$_lib "; + ALL_OBJS=$ALL_OBJS"$1/$_lib "; done; } +# +# special treatment for libsysio +# +#sysio_tmp=$CWD/sysio_tmp_`date +%s` +#build_sysio_obj_list() { +# _objs=`$AR -t $1` +# mkdir -p $sysio_tmp +# $AR -x $1 +# mv $_objs $sysio_tmp +# for _lib in $_objs; do +# ALL_OBJS=$ALL_OBJS"$sysio_tmp/$_lib "; +# done +#} + # lustre components libs build_obj_list . libllite.a build_obj_list ../lov liblov.a build_obj_list ../obdecho libobdecho.a build_obj_list ../osc libosc.a build_obj_list ../mdc libmdc.a -build_obj_list ../ldlm libldlm.a build_obj_list ../ptlrpc libptlrpc.a build_obj_list ../obdclass liblustreclass.a build_obj_list ../lvfs liblvfs.a # portals components libs -build_obj_list ../portals/utils libptlctl.a +build_obj_list ../portals/utils libuptlctl.a build_obj_list ../portals/unals libtcpnal.a build_obj_list ../portals/portals libportals.a +# create static lib lsupport +rm -f $CWD/liblsupport.a +$AR -cru $CWD/liblsupport.a $ALL_OBJS +$RANLIB $CWD/liblsupport.a + # libsysio components libs build_obj_list $SYSIO/drivers/native libsysio_native.a build_obj_list $SYSIO/drivers/sockets libsysio_sockets.a build_obj_list $SYSIO/src libsysio.a build_obj_list $SYSIO/dev/stdfd libsysio_stdfd.a +# +#build_sysio_obj_list $SYSIO/lib/libsysio.a +# - -# create static lib +# create static lib lustre rm -f $CWD/liblustre.a -$AR -r $CWD/liblustre.a $ALL_OBJS +$AR -cru $CWD/liblustre.a $ALL_OBJS +$RANLIB $CWD/liblustre.a -# create shared lib +# create shared lib lustre rm -f $CWD/liblustre.so $LD -shared -o $CWD/liblustre.so -init __liblustre_setup_ -fini __liblustre_cleanup_ \ $ALL_OBJS -lpthread + +#rm -rf $sysio_tmp diff --git a/lustre/liblustre/libtest.c b/lustre/liblustre/libtest.c deleted file mode 100644 index b956347..0000000 --- a/lustre/liblustre/libtest.c +++ /dev/null @@ -1,246 +0,0 @@ -#include -#include -#include -#include - -#include /* needed for ptpctl.h */ -#include /* needed for parse_dump */ - - -#include -#include -#include -#include - -#define LIBLUSTRE_TEST 1 -#include "../utils/lctl.c" - -struct ldlm_namespace; -struct ldlm_res_id; -struct obd_import; - -void *inter_module_get(char *arg) -{ - if (!strcmp(arg, "tcpnal_ni")) - return &tcpnal_ni; - else if (!strcmp(arg, "ldlm_cli_cancel_unused")) - return ldlm_cli_cancel_unused; - else if (!strcmp(arg, "ldlm_namespace_cleanup")) - return ldlm_namespace_cleanup; - else if (!strcmp(arg, "ldlm_replay_locks")) - return ldlm_replay_locks; - else - return NULL; -} - -/* XXX move to proper place */ -char *portals_nid2str(int nal, ptl_nid_t nid, char *str) -{ - switch(nal){ - case TCPNAL: - /* userspace NAL */ - case SOCKNAL: - sprintf(str, "%u:%d.%d.%d.%d", (__u32)(nid >> 32), - HIPQUAD(nid)); - break; - case QSWNAL: - case GMNAL: - case IBNAL: - case SCIMACNAL: - sprintf(str, "%u:%u", (__u32)(nid >> 32), (__u32)nid); - break; - default: - return NULL; - } - return str; -} - -ptl_handle_ni_t tcpnal_ni; - -struct pingcli_args { - ptl_nid_t mynid; - ptl_nid_t nid; - ptl_pid_t port; - int count; - int size; -}; - -struct task_struct *current; - -struct obd_class_user_state ocus; - -/* portals interfaces */ -ptl_handle_ni_t * -kportal_get_ni (int nal) -{ - switch (nal) - { - case SOCKNAL: - return &tcpnal_ni; - default: - return NULL; - } -} - -inline void -kportal_put_ni (int nal) -{ - return; -} - 
-int -kportal_nal_cmd(struct portals_cfg *pcfg) -{ -#if 0 - __u32 nal = pcfg->pcfg_nal; - int rc = -EINVAL; - - ENTRY; - - down(&nal_cmd_sem); - if (nal > 0 && nal <= NAL_MAX_NR && nal_cmd[nal].nch_handler) { - CDEBUG(D_IOCTL, "calling handler nal: %d, cmd: %d\n", nal, - pcfg->pcfg_command); - rc = nal_cmd[nal].nch_handler(pcfg, nal_cmd[nal].nch_private); - } - up(&nal_cmd_sem); - RETURN(rc); -#else - CERROR("empty function!!!\n"); - return 0; -#endif -} - -int init_current(int argc, char **argv) -{ - current = malloc(sizeof(*current)); - strncpy(current->comm, argv[0], sizeof(current->comm)); - current->pid = getpid(); - return 0; -} - -ptl_nid_t tcpnal_mynid; - -int init_lib_portals() -{ - int rc; - - PtlInit(); - rc = PtlNIInit(procbridge_interface, 0, 0, 0, &tcpnal_ni); - if (rc != 0) { - CERROR("ksocknal: PtlNIInit failed: error %d\n", rc); - PtlFini(); - RETURN (rc); - } - PtlNIDebug(tcpnal_ni, ~0); - return rc; -} - -extern int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, unsigned long arg); - - -int lib_ioctl_nalcmd(int dev_id, int opc, void * ptr) -{ - struct portal_ioctl_data *ptldata; - - if (opc == IOC_PORTAL_NAL_CMD) { - ptldata = (struct portal_ioctl_data *) ptr; - - if (ptldata->ioc_nal_cmd == NAL_CMD_REGISTER_MYNID) { - tcpnal_mynid = ptldata->ioc_nid; - printf("mynid: %u.%u.%u.%u\n", - (unsigned)(tcpnal_mynid>>24) & 0xFF, - (unsigned)(tcpnal_mynid>>16) & 0xFF, - (unsigned)(tcpnal_mynid>>8) & 0xFF, - (unsigned)(tcpnal_mynid) & 0xFF); - } - } - - return (0); -} - -int lib_ioctl(int dev_id, int opc, void * ptr) -{ - - if (dev_id == OBD_DEV_ID) { - class_handle_ioctl(&ocus, opc, (unsigned long)ptr); - - /* you _may_ need to call obd_ioctl_unpack or some - other verification function if you want to use ioc - directly here */ -#if 0 - printf ("processing ioctl cmd: %x buf len: %d\n", - opc, ioc->ioc_len); -#endif - } - return (0); -} - -int liblustre_ioctl(int dev_id, int opc, void *ptr) -{ - int rc = -EINVAL; - - switch (dev_id) { - default: - fprintf(stderr, "Unexpected device id %d\n", dev_id); - abort(); - break; - - case OBD_DEV_ID: - rc = class_handle_ioctl(&ocus, opc, (unsigned long)ptr); - break; - } - - return rc; -} - -extern int time_ptlwait1; -extern int time_ptlwait2; -extern int time_ptlselect; -int main(int argc, char **argv) -{ - char *config_file; - - if (argc > 2) { - printf("Usage: %s [config_file]\n", argv[0]); - return 1; - } - - if (argc == 2) { - config_file = argv[1]; - argc--; - argv++; - } else - config_file = "/tmp/DUMP_FILE"; - - srand(time(NULL)); - - INIT_LIST_HEAD(&ocus.ocus_conns); -#if 1 - portal_debug = 0; - portal_subsystem_debug = 0; -#endif - parse_dump(config_file, lib_ioctl_nalcmd); - - if (init_current(argc, argv) || - init_obdclass() || init_lib_portals() || - ptlrpc_init() || - ldlm_init() || - mdc_init() || - lov_init() || - osc_init() || - echo_client_init()) { - printf("error\n"); - return 1; - } - - parse_dump(config_file, lib_ioctl); - - set_ioc_handler(liblustre_ioctl); -#if 0 - portal_debug = -1; - portal_subsystem_debug = -1; -#endif - return lctl_main(argc, argv); -} - diff --git a/lustre/liblustre/llite_lib.c b/lustre/liblustre/llite_lib.c index baf564a..1cb6a37 100644 --- a/lustre/liblustre/llite_lib.c +++ b/lustre/liblustre/llite_lib.c @@ -37,6 +37,11 @@ #include #include +/* both sys/queue.h (libsysio require it) and portals/lists.h have definition + * of 'LIST_HEAD'. 
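The #undef LIST_HEAD added throughout these files resolves a genuine macro collision: BSD's <sys/queue.h> (pulled in for libsysio) defines LIST_HEAD(name, type), while the portals headers define the Linux-style LIST_HEAD(name). The collision and its fix in miniature, with a simplified stand-in for the portals macro:

#include <sys/queue.h>          /* BSD macro: LIST_HEAD(name, type) */

#undef LIST_HEAD                /* discard it in favour of the other form */

struct list_head { struct list_head *next, *prev; };
#define LIST_HEAD(name) \
        struct list_head name = { &(name), &(name) }

LIST_HEAD(my_list);             /* now expands to the Linux-style form */

int main(void)
{
        return my_list.next != &my_list;        /* 0: list is empty */
}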
undef it to suppress warnings + */ +#undef LIST_HEAD + #include /* needed for ptpctl.h */ #include /* needed for parse_dump */ #include @@ -45,8 +50,7 @@ ptl_handle_ni_t tcpnal_ni; -struct task_struct *current; -struct obd_class_user_state ocus; +struct task_struct *current; /* portals interfaces */ ptl_handle_ni_t * @@ -141,7 +145,7 @@ int init_lib_portals() PtlInit(); rc = PtlNIInit(procbridge_interface, 0, 0, 0, &tcpnal_ni); if (rc != 0) { - CERROR("ksocknal: PtlNIInit failed: error %d\n", rc); + CERROR("TCPNAL: PtlNIInit failed: error %d\n", rc); PtlFini(); RETURN (rc); } @@ -156,7 +160,7 @@ kportal_nal_cmd(struct portals_cfg *pcfg) return 0; } -extern int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, unsigned long arg); +extern int class_handle_ioctl(unsigned int cmd, unsigned long arg); int lib_ioctl_nalcmd(int dev_id, int opc, void * ptr) { @@ -190,7 +194,7 @@ int lib_ioctl(int dev_id, int opc, void * ptr) ioc->ioc_pbuf1 = ioc->ioc_bulk; //XXX - rc = class_handle_ioctl(&ocus, opc, (unsigned long)ptr); + rc = class_handle_ioctl(opc, (unsigned long)ptr); printf ("proccssing ioctl cmd: %x, rc %d\n", opc, rc); @@ -202,8 +206,6 @@ int lib_ioctl(int dev_id, int opc, void * ptr) int lllib_init(char *dumpfile) { - INIT_LIST_HEAD(&ocus.ocus_conns); - if (!g_zconf) { /* this parse only get my nid from config file * before initialize portals @@ -213,7 +215,7 @@ int lllib_init(char *dumpfile) } else { /* XXX need setup mynid before tcpnal initialize */ tcpnal_mynid = ((uint64_t)getpid() << 32) | time(0); - printf("set tcpnal mynid: %016llx\n", tcpnal_mynid); + printf("LibLustre: TCPNAL NID: %016llx\n", tcpnal_mynid); } init_current("dummy"); @@ -239,7 +241,7 @@ static void llu_check_request() } #endif -int liblustre_process_log(struct config_llog_instance *cfg) +int liblustre_process_log(struct config_llog_instance *cfg, int allow_recov) { struct lustre_cfg lcfg; char *peer = "MDS_PEER_UUID"; @@ -297,6 +299,11 @@ int liblustre_process_log(struct config_llog_instance *cfg) if (obd == NULL) GOTO(out_cleanup, err = -EINVAL); + /* Disable initial recovery on this import */ + err = obd_set_info(obd->obd_self_export, + strlen("initial_recov"), "initial_recov", + sizeof(allow_recov), &allow_recov); + err = obd_connect(&mdc_conn, obd, &mdc_uuid); if (err) { CERROR("cannot connect to %s: rc = %d\n", @@ -374,10 +381,13 @@ int ll_parse_mount_target(const char *target, char **mdsnid, /* env variables */ #define ENV_LUSTRE_MNTPNT "LIBLUSTRE_MOUNT_POINT" #define ENV_LUSTRE_MNTTGT "LIBLUSTRE_MOUNT_TARGET" +#define ENV_LUSTRE_TIMEOUT "LIBLUSTRE_TIMEOUT" #define ENV_LUSTRE_DUMPFILE "LIBLUSTRE_DUMPFILE" extern int _sysio_native_init(); +extern unsigned int obd_timeout; + /* global variables */ int g_zconf = 0; /* zeroconf or dumpfile */ char *g_zconf_mdsname = NULL; /* mdsname, for zeroconf */ @@ -389,6 +399,7 @@ void __liblustre_setup_(void) { char *lustre_path = NULL; char *target = NULL; + char *timeout = NULL; char *dumpfile = NULL; char *root_driver = "native"; char *lustre_driver = "llite"; @@ -397,7 +408,10 @@ void __liblustre_setup_(void) int err; - srand(time(NULL)); + /* consider tha case of starting multiple liblustre instances + * at a same time on single node. 
+ */ + srand(time(NULL) + getpid()); signal(SIGUSR1, sighandler_USR1); @@ -429,6 +443,13 @@ void __liblustre_setup_(void) lustre_path, target); } + timeout = getenv(ENV_LUSTRE_TIMEOUT); + if (timeout) { + obd_timeout = (unsigned int) atoi(timeout); + printf("LibLustre: set obd timeout as %u seconds\n", + obd_timeout); + } + if (_sysio_init() != 0) { perror("init sysio"); exit(1); diff --git a/lustre/liblustre/llite_lib.h b/lustre/liblustre/llite_lib.h index 9e4340d..043be49 100644 --- a/lustre/liblustre/llite_lib.h +++ b/lustre/liblustre/llite_lib.h @@ -1,3 +1,26 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Lustre Light Super operations + * + * Copyright (c) 2002, 2003 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + #ifndef __LLU_H_ #define __LLU_H_ @@ -20,7 +43,7 @@ struct ll_file_data { struct llu_sb_info { struct obd_uuid ll_sb_uuid; - struct obd_export *ll_mdc_exp; + struct obd_export *ll_mdc_exp; struct obd_export *ll_osc_exp; obd_id ll_rootino; int ll_flags; @@ -31,6 +54,9 @@ struct llu_sb_info char *ll_instance; }; +#define LL_SBI_NOLCK 0x1 +#define LL_SBI_READAHEAD 0x2 + #define LLI_F_HAVE_OST_SIZE_LOCK 0 #define LLI_F_HAVE_MDS_SIZE_LOCK 1 #define LLI_F_PREFER_EXTENDED_SIZE 2 @@ -42,7 +68,7 @@ struct llu_inode_info { struct lov_stripe_md *lli_smd; char *lli_symlink_name; struct semaphore lli_open_sem; - __u64 lli_maxbytes; + __u64 lli_maxbytes; unsigned long lli_flags; /* for libsysio */ @@ -50,8 +76,10 @@ struct llu_inode_info { struct lookup_intent *lli_it; - /* XXX workaround for libsysio */ + /* XXX workaround for libsysio unlink */ int lli_stale_flag; + /* XXX workaround for libsysio readdir */ + loff_t lli_dir_pos; /* in libsysio we have no chance to store data in file, * so place it here. 
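The setup hunk above adds two small robustness tweaks: an optional LIBLUSTRE_TIMEOUT environment variable overrides obd_timeout, and the random seed mixes in the pid so several liblustre instances started within the same second diverge. A self-contained sketch (the default timeout value is an assumption):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>

static unsigned int obd_timeout = 100;  /* default is an assumption */

int main(void)
{
        const char *s = getenv("LIBLUSTRE_TIMEOUT");

        if (s != NULL)
                obd_timeout = (unsigned int)atoi(s);

        /* mix in the pid so instances started in the same second
         * still get distinct sequences */
        srand((unsigned)time(NULL) + (unsigned)getpid());

        printf("obd_timeout=%u, first rand()=%d\n", obd_timeout, rand());
        return 0;
}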
since it's possible that an file @@ -60,24 +88,24 @@ struct llu_inode_info { struct ll_file_data *lli_file_data; int lli_open_count; - /* stat FIXME not 64 bit clean */ - dev_t lli_st_dev; - ino_t lli_st_ino; - mode_t lli_st_mode; - nlink_t lli_st_nlink; - uid_t lli_st_uid; - gid_t lli_st_gid; - dev_t lli_st_rdev; - loff_t lli_st_size; - unsigned int lli_st_blksize; - unsigned int lli_st_blocks; - time_t lli_st_atime; - time_t lli_st_mtime; - time_t lli_st_ctime; - - /* not for stat, change it later */ - int lli_st_flags; - unsigned long lli_st_generation; + /* stat FIXME not 64 bit clean */ + dev_t lli_st_dev; + ino_t lli_st_ino; + mode_t lli_st_mode; + nlink_t lli_st_nlink; + uid_t lli_st_uid; + gid_t lli_st_gid; + dev_t lli_st_rdev; + loff_t lli_st_size; + unsigned int lli_st_blksize; + unsigned int lli_st_blocks; + time_t lli_st_atime; + time_t lli_st_mtime; + time_t lli_st_ctime; + + /* not for stat, change it later */ + int lli_st_flags; + unsigned long lli_st_generation; }; #define LLU_SYSIO_COOKIE_SIZE(x) \ @@ -87,8 +115,9 @@ struct llu_inode_info { struct llu_sysio_cookie { struct obd_sync_io_container *lsc_osic; - struct inode *lsc_inode; - int lsc_npages; + struct inode *lsc_inode; + int lsc_maxpages; + int lsc_npages; struct ll_async_page *lsc_llap; struct page *lsc_pages; __u64 lsc_rwcount; @@ -99,18 +128,18 @@ struct llu_sysio_cookie { struct llu_sysio_callback_args { - int ncookies; - struct llu_sysio_cookie *cookies[MAX_IOVEC]; + int ncookies; + struct llu_sysio_cookie *cookies[MAX_IOVEC]; }; static inline struct llu_sb_info *llu_fs2sbi(struct filesys *fs) { - return (struct llu_sb_info*)(fs->fs_private); + return (struct llu_sb_info*)(fs->fs_private); } static inline struct llu_inode_info *llu_i2info(struct inode *inode) { - return (struct llu_inode_info*)(inode->i_private); + return (struct llu_inode_info*)(inode->i_private); } static inline struct llu_sb_info *llu_i2sbi(struct inode *inode) @@ -118,16 +147,6 @@ static inline struct llu_sb_info *llu_i2sbi(struct inode *inode) return llu_i2info(inode)->lli_sbi; } -#if 0 -static inline struct client_obd *sbi2mdc(struct llu_sb_info *sbi) -{ - struct obd_device *obd = class_conn2obd(&sbi->ll_mdc_conn); - if (obd == NULL) - LBUG(); - return &obd->u.cli; -} -#endif - static inline struct obd_export *llu_i2obdexp(struct inode *inode) { return llu_i2info(inode)->lli_sbi->ll_osc_exp; @@ -138,16 +157,21 @@ static inline struct obd_export *llu_i2mdcexp(struct inode *inode) return llu_i2info(inode)->lli_sbi->ll_mdc_exp; } +static inline int llu_is_root_inode(struct inode *inode) +{ + return (llu_i2info(inode)->lli_fid.id == + llu_i2info(inode)->lli_sbi->ll_rootino); +} #define LL_SAVE_INTENT(inode, it) \ do { \ - struct lookup_intent *temp; \ + struct lookup_intent *temp; \ LASSERT(llu_i2info(inode)->lli_it == NULL); \ OBD_ALLOC(temp, sizeof(*temp)); \ memcpy(temp, it, sizeof(*temp)); \ llu_i2info(inode)->lli_it = temp; \ CDEBUG(D_DENTRY, "alloc intent %p to inode %p(ino %lu)\n", \ - temp, inode, llu_i2info(inode)->lli_st_ino); \ + temp, inode, llu_i2info(inode)->lli_st_ino); \ } while(0) @@ -158,7 +182,7 @@ do { \ LASSERT(it); \ llu_i2info(inode)->lli_it = NULL; \ CDEBUG(D_DENTRY, "dettach intent %p from inode %p(ino %lu)\n", \ - it, inode, llu_i2info(inode)->lli_st_ino); \ + it, inode, llu_i2info(inode)->lli_st_ino); \ } while(0) /* interpet return codes from intent lookup */ @@ -167,20 +191,20 @@ do { \ static inline void ll_inode2fid(struct ll_fid *fid, struct inode *inode) { - *fid = llu_i2info(inode)->lli_fid; + *fid = 
llu_i2info(inode)->lli_fid; } struct it_cb_data { - struct inode *icbd_parent; - struct pnode *icbd_child; - obd_id hash; + struct inode *icbd_parent; + struct pnode *icbd_child; + obd_id hash; }; static inline void ll_i2uctxt(struct ll_uctxt *ctxt, struct inode *i1, struct inode *i2) { - struct llu_inode_info *lli1 = llu_i2info(i1); - struct llu_inode_info *lli2; + struct llu_inode_info *lli1 = llu_i2info(i1); + struct llu_inode_info *lli2; LASSERT(i1); LASSERT(ctxt); @@ -191,7 +215,7 @@ static inline void ll_i2uctxt(struct ll_uctxt *ctxt, struct inode *i1, ctxt->gid1 = -1; if (i2) { - lli2 = llu_i2info(i2); + lli2 = llu_i2info(i2); if (in_group_p(lli2->lli_st_gid)) ctxt->gid2 = lli2->lli_st_gid; else @@ -210,21 +234,9 @@ int llu_intent_lock(struct inode *parent, struct pnode *pnode, /* FIXME */ static inline int ll_permission(struct inode *inode, int flag, void * unused) { - return 0; -} - -#if 0 -static inline int it_disposition(struct lookup_intent *it, int flag) -{ - return it->d.lustre.it_disposition & flag; + return 0; } -static inline void it_set_disposition(struct lookup_intent *it, int flag) -{ - it->d.lustre.it_disposition |= flag; -} -#endif - static inline __u64 ll_file_maxbytes(struct inode *inode) { return llu_i2info(inode)->lli_maxbytes; @@ -232,17 +244,17 @@ static inline __u64 ll_file_maxbytes(struct inode *inode) struct mount_option_s { - char *mdc_uuid; - char *osc_uuid; + char *mdc_uuid; + char *osc_uuid; }; /* llite_lib.c */ void generate_random_uuid(unsigned char uuid_out[16]); -int liblustre_process_log(struct config_llog_instance *cfg); +int liblustre_process_log(struct config_llog_instance *cfg, int allow_recov); int ll_parse_mount_target(const char *target, char **mdsnid, char **mdsname, char **profile); -extern int g_zconf; +extern int g_zconf; extern char *g_zconf_mdsnid; extern char *g_zconf_mdsname; extern char *g_zconf_profile; @@ -253,8 +265,6 @@ void llu_update_inode(struct inode *inode, struct mds_body *body, struct lov_stripe_md *lmm); void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid); void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid); -//struct inode* llu_new_inode(struct filesys *fs, ino_t ino, mode_t mode); -//int llu_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm, void *ostdata); int ll_it_open_error(int phase, struct lookup_intent *it); struct inode *llu_iget(struct filesys *fs, struct lustre_md *md); int llu_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm); @@ -282,7 +292,7 @@ int llu_objects_destroy(struct ptlrpc_request *request, struct inode *dir); int llu_iop_iodone(struct ioctx *ioctxp __IS_UNUSED); struct llu_sysio_callback_args* llu_file_write(struct inode *inode, const struct iovec *iovec, - size_t iovlen, loff_t pos); + size_t iovlen, loff_t pos); struct llu_sysio_callback_args* llu_file_read(struct inode *inode, const struct iovec *iovec, size_t iovlen, loff_t pos); @@ -308,5 +318,33 @@ int llu_iop_lookup(struct pnode *pnode, const char *path); void unhook_stale_inode(struct pnode *pno); struct inode *llu_inode_from_lock(struct ldlm_lock *lock); +int llu_mdc_blocking_ast(struct ldlm_lock *lock, + struct ldlm_lock_desc *desc, + void *data, int flag); + +/* dir.c */ +ssize_t llu_iop_getdirentries(struct inode *ino, char *buf, size_t nbytes, + _SYSIO_OFF_T *basep); + +/* ext2 related */ +#define EXT2_NAME_LEN (255) + +struct ext2_dirent { + __u32 inode; + __u16 rec_len; + __u8 name_len; + __u8 file_type; + char name[EXT2_NAME_LEN]; +}; + +#define EXT2_DIR_PAD 4 +#define 
EXT2_DIR_ROUND (EXT2_DIR_PAD - 1) +#define EXT2_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT2_DIR_ROUND) & \ + ~EXT2_DIR_ROUND) + +static inline struct ext2_dirent *ext2_next_entry(struct ext2_dirent *p) +{ + return (struct ext2_dirent*)((char*) p + le16_to_cpu(p->rec_len)); +} #endif diff --git a/lustre/liblustre/namei.c b/lustre/liblustre/namei.c index 1dceb8b..1c00634 100644 --- a/lustre/liblustre/namei.c +++ b/lustre/liblustre/namei.c @@ -38,112 +38,38 @@ #include #include +#undef LIST_HEAD + #include "llite_lib.h" -static void ll_intent_release(struct lookup_intent *it) +static void ll_intent_drop_lock(struct lookup_intent *it) { struct lustre_handle *handle; - ENTRY; - /* LASSERT(ll_d2d(de) != NULL); */ - - if (it->d.lustre.it_lock_mode) { + if (it->it_op && it->d.lustre.it_lock_mode) { handle = (struct lustre_handle *)&it->d.lustre.it_lock_handle; CDEBUG(D_DLMTRACE, "releasing lock with cookie "LPX64 - " from it %p\n", - handle->cookie, it); + " from it %p\n", handle->cookie, it); ldlm_lock_decref(handle, it->d.lustre.it_lock_mode); - /* intent_release may be called multiple times, from - this thread and we don't want to double-decref this - lock (see bug 494) */ + /* bug 494: intent_release may be called multiple times, from + * this thread and we don't want to double-decref this lock */ it->d.lustre.it_lock_mode = 0; } - it->it_magic = 0; - it->it_op_release = 0; - EXIT; } -#if 0 -static void llu_mdc_lock_set_inode(struct lustre_handle *lockh, - struct inode *inode) +static void ll_intent_release(struct lookup_intent *it) { - struct ldlm_lock *lock = ldlm_handle2lock(lockh); ENTRY; - LASSERT(lock != NULL); - lock->l_data = inode; - LDLM_LOCK_PUT(lock); + ll_intent_drop_lock(it); + it->it_magic = 0; + it->it_op_release = 0; + it->d.lustre.it_disposition = 0; + it->d.lustre.it_data = NULL; EXIT; } -static int pnode_revalidate_finish(struct ptlrpc_request *request, - struct inode *parent, struct pnode *pnode, - struct lookup_intent *it, int offset, - obd_id ino) -{ - struct llu_sb_info *sbi = llu_i2sbi(parent); - struct pnode_base *pb = pnode->p_base; - struct mds_body *body; - struct lov_stripe_md *lsm = NULL; - struct lov_mds_md *lmm; - int lmmsize; - int rc = 0; - ENTRY; - - /* NB 1 request reference will be taken away by ll_intent_lock() - * when I return */ - - if (it_disposition(it, DISP_LOOKUP_NEG)) - RETURN(-ENOENT); - - /* We only get called if the mdc_enqueue() called from - * ll_intent_lock() was successful. Therefore the mds_body is - * present and correct, and the eadata is present (but still - * opaque, so only obd_unpackmd() can check the size) */ - body = lustre_msg_buf(request->rq_repmsg, offset, sizeof (*body)); - LASSERT (body != NULL); - LASSERT_REPSWABBED (request, offset); - - if (body->valid & OBD_MD_FLEASIZE) { - /* Only bother with this if inodes's LSM not set? */ - - if (body->eadatasize == 0) { - CERROR ("OBD_MD_FLEASIZE set, but eadatasize 0\n"); - GOTO (out, rc = -EPROTO); - } - lmmsize = body->eadatasize; - lmm = lustre_msg_buf (request->rq_repmsg, offset + 1, lmmsize); - LASSERT (lmm != NULL); - LASSERT_REPSWABBED (request, offset + 1); - - rc = obd_unpackmd (&sbi->ll_osc_conn, - &lsm, lmm, lmmsize); - if (rc < 0) { - CERROR ("Error %d unpacking eadata\n", rc); - LBUG(); - /* XXX don't know if I should do this... 
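Splitting ll_intent_release() into ll_intent_drop_lock() plus a reset, as above, makes the lock decref callable early and safely repeatable: zeroing it_lock_mode turns any later call into a no-op (the bug 494 double-decref guard). The idiom reduced to its core, with hypothetical fields:

struct intent {
        int lock_mode;          /* nonzero while a lock ref is held */
        unsigned long cookie;   /* identifies the lock to decref */
};

static void intent_drop_lock(struct intent *it)
{
        if (it->lock_mode) {
                /* decref the lock named by it->cookie here */
                it->lock_mode = 0;      /* makes a repeat call a no-op */
        }
}

static void intent_release(struct intent *it)
{
        intent_drop_lock(it);           /* idempotent */
        it->cookie = 0;                 /* full reset for reuse */
}

int main(void)
{
        struct intent it = { 1, 0xabcd };

        intent_drop_lock(&it);  /* early drop, e.g. before an OST enqueue */
        intent_release(&it);    /* second drop is harmless */
        return 0;
}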
*/ - GOTO (out, rc); - /* or skip the ll_update_inode but still do - * mdc_lock_set_inode() */ - } - LASSERT (rc >= sizeof (*lsm)); - rc = 0; - } - - llu_update_inode(pb->pb_ino, body, lsm); - - if (lsm != NULL && - llu_i2info(pb->pb_ino)->lli_smd != lsm) - obd_free_memmd (&sbi->ll_osc_conn, &lsm); - - llu_mdc_lock_set_inode((struct lustre_handle *)&it->d.lustre.it_lock_handle, - pb->pb_ino); - out: - RETURN(rc); -} -#endif - /* * remove the stale inode from pnode */ @@ -156,11 +82,11 @@ void unhook_stale_inode(struct pnode *pno) LASSERT(llu_i2info(inode)->lli_stale_flag); pno->p_base->pb_ino = NULL; + I_RELE(inode); if (!llu_i2info(inode)->lli_open_count) { CDEBUG(D_INODE, "unhook inode %p (ino %lu) from pno %p\n", inode, llu_i2info(inode)->lli_st_ino, pno); - I_RELE(inode); if (!inode->i_ref) _sysio_i_gone(inode); } @@ -188,14 +114,14 @@ void llu_lookup_finish_locks(struct lookup_intent *it, struct pnode *pnode) } -static inline void ll_invalidate_inode_pages(struct inode * inode) +static inline void llu_invalidate_inode_pages(struct inode * inode) { /* do nothing */ } -static int llu_mdc_blocking_ast(struct ldlm_lock *lock, - struct ldlm_lock_desc *desc, - void *data, int flag) +int llu_mdc_blocking_ast(struct ldlm_lock *lock, + struct ldlm_lock_desc *desc, + void *data, int flag) { int rc; struct lustre_handle lockh; @@ -232,7 +158,7 @@ static int llu_mdc_blocking_ast(struct ldlm_lock *lock, CDEBUG(D_INODE, "invalidating inode %lu\n", lli->lli_st_ino); - ll_invalidate_inode_pages(inode); + llu_invalidate_inode_pages(inode); } /* @@ -250,6 +176,33 @@ static int llu_mdc_blocking_ast(struct ldlm_lock *lock, RETURN(0); } +static int pnode_revalidate_finish(struct ptlrpc_request *req, + int offset, + struct lookup_intent *it, + struct pnode *pnode) +{ + struct inode *inode = pnode->p_base->pb_ino; + struct lustre_md md; + int rc = 0; + ENTRY; + + LASSERT(inode); + + if (!req) + RETURN(0); + + if (it_disposition(it, DISP_LOOKUP_NEG)) + RETURN(-ENOENT); + + rc = mdc_req2lustre_md(req, offset, llu_i2sbi(inode)->ll_osc_exp, &md); + if (rc) + RETURN(rc); + + llu_update_inode(inode, md.body, md.lsm); + + RETURN(rc); +} + int llu_pb_revalidate(struct pnode *pnode, int flags, struct lookup_intent *it) { struct pnode_base *pb = pnode->p_base; @@ -285,7 +238,7 @@ int llu_pb_revalidate(struct pnode *pnode, int flags, struct lookup_intent *it) } /* This is due to bad interaction with libsysio. remove this when we - * switched to libbsdio + * switched to libbsdio XXX */ { struct llu_inode_info *lli = llu_i2info(pb->pb_ino); @@ -322,14 +275,13 @@ int llu_pb_revalidate(struct pnode *pnode, int flags, struct lookup_intent *it) if (req == NULL && rc >= 0) GOTO(out, rc); - /* unfortunately ll_intent_lock may cause a callback and revoke our - dentry */ - /* - spin_lock(&dcache_lock); - list_del_init(&de->d_hash); - spin_unlock(&dcache_lock); - d_rehash(de); - */ + if (rc < 0) + GOTO(out, rc = 0); + + rc = pnode_revalidate_finish(req, 1, it, pnode); + + /* Note: ll_intent_lock may cause a callback, check this! 
*/ + if (it->it_op & (IT_OPEN | IT_GETATTR)) LL_SAVE_INTENT(pb->pb_ino, it); RETURN(1); @@ -339,7 +291,7 @@ int llu_pb_revalidate(struct pnode *pnode, int flags, struct lookup_intent *it) if (rc == 0) { LASSERT(pb->pb_ino); if (S_ISDIR(llu_i2info(pb->pb_ino)->lli_st_mode)) - ll_invalidate_inode_pages(pb->pb_ino); + llu_invalidate_inode_pages(pb->pb_ino); llu_i2info(pb->pb_ino)->lli_stale_flag = 1; unhook_stale_inode(pnode); } else { @@ -362,8 +314,9 @@ static int lookup_it_finish(struct ptlrpc_request *request, int offset, int rc; /* NB 1 request reference will be taken away by ll_intent_lock() - * when I return */ - /* XXX libsysio require the inode must be generated here XXX */ + * when I return + * Note: libsysio require the inode must be generated here + */ if ((it->it_op & IT_CREAT) || !it_disposition(it, DISP_LOOKUP_NEG)) { struct lustre_md md; struct llu_inode_info *lli; @@ -396,6 +349,9 @@ static int lookup_it_finish(struct ptlrpc_request *request, int offset, LASSERT(lsm->lsm_object_id != 0); + /* bug 2334: drop MDS lock before acquiring OST lock */ + ll_intent_drop_lock(it); + rc = llu_extent_lock(NULL, inode, lsm, LCK_PR, &extent, &lockh); if (rc != ELDLM_OK) { @@ -408,15 +364,10 @@ static int lookup_it_finish(struct ptlrpc_request *request, int offset, ENTRY; } + /* intent will be further used in cases of open()/getattr() */ if (inode && (it->it_op & (IT_OPEN | IT_GETATTR))) LL_SAVE_INTENT(inode, it); -/* - dentry->d_op = &ll_d_ops; - ll_set_dd(dentry); - if (dentry == saved) - d_add(dentry, inode); -*/ child->p_base->pb_ino = inode; RETURN(0); @@ -437,9 +388,6 @@ struct inode *llu_inode_from_lock(struct ldlm_lock *lock) return inode; } -/* XXX */ -#define EXT2_NAME_LEN (255) - static int llu_lookup_it(struct inode *parent, struct pnode *pnode, struct lookup_intent *it, int flags) { @@ -454,18 +402,6 @@ static int llu_lookup_it(struct inode *parent, struct pnode *pnode, if (pnode->p_base->pb_name.len > EXT2_NAME_LEN) RETURN(-ENAMETOOLONG); - -/* - CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p),intent=%s\n", - dentry->d_name.name, parent->i_ino, parent->i_generation, - parent, LL_IT2STR(it)); - - if (d_mountpoint(dentry)) - CERROR("Tell Peter, lookup on mtpt, it %s\n", LL_IT2STR(it)); - - ll_frob_intent(&it, &lookup_it); -*/ - if (!it) { it = &lookup_it; it->it_op_release = ll_intent_release; @@ -493,12 +429,6 @@ static int llu_lookup_it(struct inode *parent, struct pnode *pnode, llu_lookup_finish_locks(it, pnode); -/* - if (dentry == save) - GOTO(out, retval = NULL); - else - GOTO(out, retval = dentry); -*/ out: if (req) ptlrpc_req_finished(req); @@ -553,12 +483,7 @@ translate_lookup_intent(struct intent *intent, const char *path) it->it_flags |= fmode; } - /* - else if (intent->int_opmask & INT_CREAT) - it->it_op |= IT_LOOKUP; - */ - - /* FIXME libsysio has strange code on intent handling, + /* XXX libsysio has strange code on intent handling, * more check later */ if (it->it_flags & O_CREAT) { it->it_op |= IT_CREAT; @@ -567,9 +492,8 @@ translate_lookup_intent(struct intent *intent, const char *path) if (intent->int_opmask & INT_GETATTR) it->it_op |= IT_GETATTR; - /* XXX */ - if (intent->int_opmask & INT_SETATTR) - LBUG(); + + LASSERT(!(intent->int_opmask & INT_SETATTR)); /* libsysio is different to linux vfs when doing unlink/rmdir, * INT_UPDPARENT was passed down during name resolution. 
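The "bug 2334" hunk above establishes a lock ordering: the MDS intent lock is dropped before the OST extent lock is enqueued, so a thread never holds both and cannot deadlock against a peer acquiring them in the opposite order. The same discipline sketched with two mutexes:

#include <pthread.h>

static pthread_mutex_t mds_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t ost_lock = PTHREAD_MUTEX_INITIALIZER;

static void lookup_then_read(void)
{
        pthread_mutex_lock(&mds_lock);          /* metadata lookup */
        /* ... consume the lookup result ... */
        pthread_mutex_unlock(&mds_lock);        /* drop before data lock */

        pthread_mutex_lock(&ost_lock);          /* data extent access */
        /* ... read file data ... */
        pthread_mutex_unlock(&ost_lock);
}

int main(void)
{
        lookup_then_read();
        return 0;
}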
Here diff --git a/lustre/liblustre/rw.c b/lustre/liblustre/rw.c index c05a7c9..c07409e 100644 --- a/lustre/liblustre/rw.c +++ b/lustre/liblustre/rw.c @@ -36,177 +36,9 @@ #include #include -#include "llite_lib.h" - -#if 0 -void llu_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm, - struct ldlm_lock *lock) -{ - clear_bit(LLI_F_HAVE_SIZE_LOCK, &(llu_i2info(inode)->lli_flags)); -#if 0 - struct ldlm_extent *extent = &lock->l_extent; - unsigned long start, end, count, skip, i, j; - struct page *page; - int ret; - ENTRY; - - CDEBUG(D_INODE, "obdo %lu inode %p ["LPU64"->"LPU64"] size: %llu\n", - inode->i_ino, inode, extent->start, extent->end, inode->i_size); - - start = extent->start >> PAGE_CACHE_SHIFT; - count = ~0; - skip = 0; - end = (extent->end >> PAGE_CACHE_SHIFT) + 1; - if ((end << PAGE_CACHE_SHIFT) < extent->end) - end = ~0; - if (lsm->lsm_stripe_count > 1) { - struct { - char name[16]; - struct ldlm_lock *lock; - struct lov_stripe_md *lsm; - } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm }; - __u32 stripe; - __u32 vallen = sizeof(stripe); - int rc; - - /* get our offset in the lov */ - rc = obd_get_info(ll_i2obdconn(inode), sizeof(key), - &key, &vallen, &stripe); - if (rc != 0) { - CERROR("obd_get_info: rc = %d\n", rc); - LBUG(); - } - LASSERT(stripe < lsm->lsm_stripe_count); - - count = lsm->lsm_stripe_size >> PAGE_CACHE_SHIFT; - skip = (lsm->lsm_stripe_count - 1) * count; - start += (start/count * skip) + (stripe * count); - if (end != ~0) - end += (end/count * skip) + (stripe * count); - } - - i = (inode->i_size + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; - if (end >= i) - clear_bit(LLI_F_HAVE_SIZE_LOCK, &(ll_i2info(inode)->lli_flags)); - if (i < end) - end = i; - - CDEBUG(D_INODE, "start: %lu j: %lu count: %lu skip: %lu end: %lu\n", - start, start % count, count, skip, end); - - /* start writeback on dirty pages in the extent when its PW */ - for (i = start, j = start % count; - lock->l_granted_mode == LCK_PW && i < end; j++, i++) { - if (j == count) { - i += skip; - j = 0; - } - /* its unlikely, but give us a chance to bail when we're out */ - PGCACHE_WRLOCK(inode->i_mapping); - if (list_empty(&inode->i_mapping->dirty_pages)) { - CDEBUG(D_INODE, "dirty list empty\n"); - PGCACHE_WRUNLOCK(inode->i_mapping); - break; - } - PGCACHE_WRUNLOCK(inode->i_mapping); - - if (need_resched()) - schedule(); - - /* always do a getattr for the first person to pop out of lock - * acquisition.. the DID_GETATTR flag and semaphore serialize - * this initial race. we used to make a decision based on whether - * the lock was matched or acquired, but the matcher could win the - * waking race with the first issuer so that was no good.. - */ - if (test_bit(LLI_F_DID_GETATTR, &lli->lli_flags)) - RETURN(ELDLM_OK); - - down(&lli->lli_getattr_sem); - - if (!test_bit(LLI_F_DID_GETATTR, &lli->lli_flags)) { - rc = ll_inode_getattr(inode, lsm); - if (rc == 0) { - set_bit(LLI_F_DID_GETATTR, &lli->lli_flags); - } else { - unlock_page(page); - } - page_cache_release(page); - - } - - /* our locks are page granular thanks to osc_enqueue, we invalidate the - * whole page. 
*/ - LASSERT((extent->start & ~PAGE_CACHE_MASK) == 0); - LASSERT(((extent->end+1) & ~PAGE_CACHE_MASK) == 0); - for (i = start, j = start % count ; i < end ; j++, i++) { - if ( j == count ) { - i += skip; - j = 0; - } - PGCACHE_WRLOCK(inode->i_mapping); - if (list_empty(&inode->i_mapping->dirty_pages) && - list_empty(&inode->i_mapping->clean_pages) && - list_empty(&inode->i_mapping->locked_pages)) { - CDEBUG(D_INODE, "nothing left\n"); - PGCACHE_WRUNLOCK(inode->i_mapping); - break; - } - PGCACHE_WRUNLOCK(inode->i_mapping); - if (need_resched()) - schedule(); - page = find_get_page(inode->i_mapping, i); - if (page == NULL) - continue; - CDEBUG(D_INODE, "dropping page %p at %lu\n", page, page->index); - lock_page(page); - if (page->mapping) /* might have raced */ -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - truncate_complete_page(page); -#else - truncate_complete_page(page->mapping, page); -#endif - unlock_page(page); - page_cache_release(page); - } - EXIT; -#endif -} - -int llu_lock_callback(struct ldlm_lock *lock, struct ldlm_lock_desc *new, - void *data, int flag) -{ - struct inode *inode = data; - struct llu_inode_info *lli = llu_i2info(inode); - struct lustre_handle lockh = {0}; - int rc; - ENTRY; - - if (inode == NULL) - LBUG(); - - switch (flag) { - case LDLM_CB_BLOCKING: - ldlm_lock2handle(lock, &lockh); - rc = ldlm_cli_cancel(&lockh); - if (rc != ELDLM_OK) - CERROR("ldlm_cli_cancel failed: %d\n", rc); - break; - case LDLM_CB_CANCELING: { - /* FIXME: we could be given 'canceling intents' so that we - * could know to write-back or simply throw away the pages - * based on if the cancel comes from a desire to, say, - * read or truncate.. */ - llu_pgcache_remove_extent(inode, lli->lli_smd, lock); - break; - } - default: - LBUG(); - } +#undef LIST_HEAD - RETURN(0); -} -#endif +#include "llite_lib.h" static int llu_extent_lock_callback(struct ldlm_lock *lock, struct ldlm_lock_desc *new, void *data, @@ -274,12 +106,10 @@ int llu_extent_lock_no_validate(struct ll_file_data *fd, LASSERT(lockh->cookie == 0); -#if 0 /* XXX phil: can we do this? won't it screw the file size up? 
*/ if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) || (sbi->ll_flags & LL_SBI_NOLCK)) RETURN(0); -#endif CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n", lli->lli_st_ino, extent->start, extent->end); @@ -432,17 +262,17 @@ static struct obd_async_page_ops llu_async_page_ops = { }; static -struct llu_sysio_cookie* get_sysio_cookie(struct inode *inode, int npages) +struct llu_sysio_cookie* get_sysio_cookie(struct inode *inode, int maxpages) { struct llu_sysio_cookie *cookie; - OBD_ALLOC(cookie, LLU_SYSIO_COOKIE_SIZE(npages)); + OBD_ALLOC(cookie, LLU_SYSIO_COOKIE_SIZE(maxpages)); if (cookie) { I_REF(inode); cookie->lsc_inode = inode; - cookie->lsc_npages = npages; + cookie->lsc_maxpages = maxpages; cookie->lsc_llap = (struct ll_async_page *)(cookie + 1); - cookie->lsc_pages = (struct page *) (cookie->lsc_llap + npages); + cookie->lsc_pages = (struct page *) (cookie->lsc_llap + maxpages); osic_init(&cookie->lsc_osic); } @@ -456,25 +286,114 @@ void put_sysio_cookie(struct llu_sysio_cookie *cookie) { struct lov_stripe_md *lsm = llu_i2info(cookie->lsc_inode)->lli_smd; struct obd_export *exp = llu_i2obdexp(cookie->lsc_inode); struct ll_async_page *llap = cookie->lsc_llap; +#ifdef LIBLUSTRE_HANDLE_UNALIGNED_PAGE + struct page *pages = cookie->lsc_pages; +#endif int i; - for (i = 0; i< cookie->lsc_npages; i++) { + for (i = 0; i < cookie->lsc_maxpages; i++) { if (llap[i].llap_cookie) obd_teardown_async_page(exp, lsm, NULL, llap[i].llap_cookie); +#ifdef LIBLUSTRE_HANDLE_UNALIGNED_PAGE + if (pages[i]._managed) { + free(pages[i].addr); + pages[i]._managed = 0; + } +#endif } I_RELE(cookie->lsc_inode); osic_release(&cookie->lsc_osic); - OBD_FREE(cookie, LLU_SYSIO_COOKIE_SIZE(cookie->lsc_npages)); + OBD_FREE(cookie, LLU_SYSIO_COOKIE_SIZE(cookie->lsc_maxpages)); +} + +#ifdef LIBLUSTRE_HANDLE_UNALIGNED_PAGE +/* Note: this code should eventually be removed; it doesn't need + * any more cleanup + */ +static +int prepare_unaligned_write(struct llu_sysio_cookie *cookie) +{ + struct inode *inode = cookie->lsc_inode; + struct llu_inode_info *lli = llu_i2info(inode); + struct lov_stripe_md *lsm = lli->lli_smd; + struct obdo oa; + struct page *pages = cookie->lsc_pages; + int i, pgidx[2] = {0, cookie->lsc_npages-1}; + int rc; + ENTRY; + + for (i = 0; i < 2; i++) { + struct page *oldpage = &pages[pgidx[i]]; + struct page newpage; + struct brw_page pg; + char *newbuf; + + if (i == 0 && pgidx[0] == pgidx[1]) + continue; + + LASSERT(oldpage->_offset + oldpage->_count <= PAGE_CACHE_SIZE); + + if (oldpage->_count == PAGE_CACHE_SIZE) + continue; + + if (oldpage->index << PAGE_CACHE_SHIFT >= + lli->lli_st_size) + continue; + + newbuf = malloc(PAGE_CACHE_SIZE); + if (!newbuf) + return -ENOMEM; + + newpage.index = oldpage->index; + newpage.addr = newbuf; + + pg.pg = &newpage; + pg.off = ((obd_off)newpage.index << PAGE_CACHE_SHIFT); + if (pg.off + PAGE_CACHE_SIZE > lli->lli_st_size) + pg.count = lli->lli_st_size % PAGE_CACHE_SIZE; + else + pg.count = PAGE_CACHE_SIZE; + pg.flag = 0; + + oa.o_id = lsm->lsm_object_id; + oa.o_mode = lli->lli_st_mode; + oa.o_valid = OBD_MD_FLID | OBD_MD_FLMODE | OBD_MD_FLTYPE; + + /* issue read */ + rc = obd_brw(OBD_BRW_READ, llu_i2obdexp(inode), &oa, lsm, 1, &pg, NULL); + if (rc) { + free(newbuf); + RETURN(rc); + } + + /* copy page content, and reset page params */ + memcpy(newbuf + oldpage->_offset, + (char*)oldpage->addr + oldpage->_offset, + oldpage->_count); + + oldpage->addr = newbuf; + if ((((obd_off)oldpage->index << PAGE_CACHE_SHIFT) + + oldpage->_offset + oldpage->_count) >
lli->lli_st_size) + oldpage->_count += oldpage->_offset; + else + oldpage->_count = PAGE_CACHE_SIZE; + oldpage->_offset = 0; + oldpage->_managed = 1; + } + + RETURN(0); } +#endif static int llu_prep_async_io(struct llu_sysio_cookie *cookie, int cmd, char *buf, loff_t pos, size_t count) { - struct lov_stripe_md *lsm = llu_i2info(cookie->lsc_inode)->lli_smd; + struct llu_inode_info *lli = llu_i2info(cookie->lsc_inode); + struct lov_stripe_md *lsm = lli->lli_smd; struct obd_export *exp = llu_i2obdexp(cookie->lsc_inode); struct page *pages = cookie->lsc_pages; struct ll_async_page *llap = cookie->lsc_llap; @@ -484,8 +403,6 @@ int llu_prep_async_io(struct llu_sysio_cookie *cookie, int cmd, if (!exp) RETURN(-EINVAL); - cookie->lsc_rwcount = count; - /* prepare the pages array */ do { unsigned long index, offset, bytes; @@ -496,6 +413,14 @@ int llu_prep_async_io(struct llu_sysio_cookie *cookie, int cmd, if (bytes > count) bytes = count; + /* prevent read beyond file range */ + if ((cmd == OBD_BRW_READ) && + (pos + bytes) >= lli->lli_st_size) { + if (pos >= lli->lli_st_size) + break; + bytes = lli->lli_st_size - pos; + } + /* prepare page for this index */ pages[npages].index = index; pages[npages].addr = buf - offset; @@ -507,8 +432,20 @@ int llu_prep_async_io(struct llu_sysio_cookie *cookie, int cmd, count -= bytes; pos += bytes; buf += bytes; + + cookie->lsc_rwcount += bytes; } while (count); + cookie->lsc_npages = npages; + +#ifdef LIBLUSTRE_HANDLE_UNALIGNED_PAGE + if (cmd == OBD_BRW_WRITE) { + rc = prepare_unaligned_write(cookie); + if (rc) + RETURN(rc); + } +#endif + for (i = 0; i < npages; i++) { llap[i].llap_magic = LLAP_MAGIC; rc = obd_prep_async_page(exp, lsm, NULL, &pages[i], @@ -612,28 +549,30 @@ llu_file_write(struct inode *inode, const struct iovec *iovec, /* FIXME optimize the following extent locking */ for (iovidx = 0; iovidx < iovlen; iovidx++) { - char *buf = iovec[iovidx].iov_base; + char *buf = (char*)iovec[iovidx].iov_base; size_t count = iovec[iovidx].iov_len; if (count == 0) continue; - /* FIXME libsysio haven't consider the open flags - * such as O_APPEND */ -#if 0 - if (!S_ISBLK(lli->lli_st_mode) && file->f_flags & O_APPEND) { - extent.start = 0; - extent.end = OBD_OBJECT_EOF; - } else { - extent.start = *ppos; - extent.end = *ppos + count - 1; - } -#else + /* FIXME libsysio does not handle O_APPEND yet */ extent.start = pos; extent.end = pos + count - 1; -#endif - err = llu_extent_lock(fd, inode, lsm, LCK_PW, &extent, &lockh); +#ifdef LIBLUSTRE_HANDLE_UNALIGNED_PAGE + if ((pos & ~PAGE_CACHE_MASK) == 0 && + (count & ~PAGE_CACHE_MASK) == 0) + err = llu_extent_lock_no_validate(fd, inode, lsm, + LCK_PW, &extent, &lockh, 0); + else + err = llu_extent_lock(fd, inode, lsm, LCK_PW, + &extent, &lockh); +#else + /* the server will handle partial writes, so we don't + * care about the file size here */ + err = llu_extent_lock_no_validate(fd, inode, lsm, LCK_PW, + &extent, &lockh, 0); +#endif if (err != ELDLM_OK) GOTO(err_out, err = -ENOLCK); @@ -737,6 +676,11 @@ llu_file_read(struct inode *inode, const struct iovec *iovec, CDEBUG(D_INFO, "Reading inode %lu, "LPSZ" bytes, offset %Ld\n", lli->lli_st_ino, count, pos); + if (pos >= lli->lli_st_size) { + llu_extent_unlock(fd, inode, lsm, LCK_PR, &lockh); + break; + } + cookie = llu_rw(OBD_BRW_READ, inode, buf, count, pos); if (!IS_ERR(cookie)) { /* save cookie */ @@ -776,8 +720,10 @@ int llu_iop_iodone(struct ioctx *ioctxp) ENTRY; /* write/read(fd, buf, 0) */ - if (!lsca) - return 1; + if (!lsca) { + ioctxp->ioctx_cc = 0; + RETURN(1); + }
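The llu_file_write() hunk above picks the extent lock by alignment: a fully page-aligned write overwrites whole pages, so it never needs the current file size and can take the cheaper no-validate lock, while an unaligned write takes the validating path so the read-modify-write in prepare_unaligned_write() sees a correct size (without LIBLUSTRE_HANDLE_UNALIGNED_PAGE the partial page is simply left to the server). A minimal sketch of that decision, reusing the two lock helpers from this file; the wrapper itself is illustrative, not part of the patch:

static int lock_extent_for_write(struct ll_file_data *fd, struct inode *inode,
                                 struct lov_stripe_md *lsm, loff_t pos,
                                 size_t count, struct ldlm_extent *extent,
                                 struct lustre_handle *lockh)
{
        /* do both the start offset and the length sit on page boundaries? */
        int aligned = (pos & ~PAGE_CACHE_MASK) == 0 &&
                      (count & ~PAGE_CACHE_MASK) == 0;

        extent->start = pos;
        extent->end = pos + count - 1;

        if (aligned)
                /* whole pages only: no read-modify-write, skip validation */
                return llu_extent_lock_no_validate(fd, inode, lsm, LCK_PW,
                                                   extent, lockh, 0);
        return llu_extent_lock(fd, inode, lsm, LCK_PW, extent, lockh);
}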
LASSERT(!IS_ERR(lsca)); @@ -793,8 +739,11 @@ int llu_iop_iodone(struct ioctx *ioctxp) } } - if (rc) - ioctxp->ioctx_cc = rc; + if (rc) { + LASSERT(rc < 0); + ioctxp->ioctx_cc = -1; + ioctxp->ioctx_errno = -rc; + } OBD_FREE(lsca, sizeof(*lsca)); ioctxp->ioctx_private = NULL; diff --git a/lustre/liblustre/super.c b/lustre/liblustre/super.c index 2bd8248..1e6a230 100644 --- a/lustre/liblustre/super.c +++ b/lustre/liblustre/super.c @@ -43,6 +43,8 @@ #include #include +#undef LIST_HEAD + #include "llite_lib.h" static void llu_fsop_gone(struct filesys *fs) @@ -293,14 +295,6 @@ int llu_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm) obdo_refresh_inode(inode, &oa, refresh_valid); -/* - if (inode->i_blksize < PAGE_CACHE_SIZE) - inode->i_blksize = PAGE_CACHE_SIZE; - - CDEBUG(D_INODE, "objid "LPX64" size %Lu, blocks %lu, blksize %lu\n", - lsm->lsm_object_id, inode->i_size, inode->i_blocks, - inode->i_blksize); -*/ RETURN(0); } @@ -343,49 +337,6 @@ static struct inode* llu_new_inode(struct filesys *fs, return inode; } -#if 0 -static int ll_intent_to_lock_mode(struct lookup_intent *it) -{ - /* CREAT needs to be tested before open (both could be set) */ - if (it->it_op & IT_CREAT) - return LCK_PW; - else if (it->it_op & (IT_READDIR | IT_GETATTR | IT_OPEN | IT_LOOKUP)) - return LCK_PR; - - LBUG(); - RETURN(-EINVAL); -} -#endif - -#if 0 -int ll_it_open_error(int phase, struct lookup_intent *it) -{ - if (it_disposition(it, DISP_OPEN_OPEN)) { - if (phase == DISP_OPEN_OPEN) - return it->d.lustre.it_status; - else - return 0; - } - - if (it_disposition(it, DISP_OPEN_CREATE)) { - if (phase == DISP_OPEN_CREATE) - return it->d.lustre.it_status; - else - return 0; - } - - if (it_disposition(it, DISP_LOOKUP_EXECD)) { - if (phase == DISP_LOOKUP_EXECD) - return it->d.lustre.it_status; - else - return 0; - } - CERROR("it disp: %X, status: %d\n", it->d.lustre.it_disposition, it->d.lustre.it_status); - LBUG(); - return 0; -} -#endif - static int llu_have_md_lock(struct inode *inode) { struct llu_sb_info *sbi = llu_i2sbi(inode); @@ -653,8 +604,6 @@ out: * I don't believe it is possible to get e.g. ATTR_MTIME_SET and ATTR_SIZE * at the same time. */ -#define OST_ATTR (ATTR_MTIME | ATTR_MTIME_SET | ATTR_CTIME | \ - ATTR_ATIME | ATTR_ATIME_SET | ATTR_SIZE) int llu_setattr_raw(struct inode *inode, struct iattr *attr) { struct lov_stripe_md *lsm = llu_i2info(inode)->lli_smd; @@ -702,7 +651,7 @@ int llu_setattr_raw(struct inode *inode, struct iattr *attr) /* If only OST attributes being set on objects, don't do MDS RPC. * In that case, we need to check permissions and update the local * inode ourselves so we can call obdo_from_inode() always. */ - if (ia_valid & (lsm ? ~(OST_ATTR | ATTR_FROM_OPEN | ATTR_RAW) : ~0)) { + if (ia_valid & (lsm ? 
~(ATTR_SIZE | ATTR_FROM_OPEN | ATTR_RAW) : ~0)) { struct lustre_md md; llu_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0); @@ -810,7 +759,7 @@ int llu_setattr_raw(struct inode *inode, struct iattr *attr) RETURN(rc); } -/* FIXME here we simply act as a thin layer to glue it with +/* here we simply act as a thin layer to glue it with * llu_setattr_raw(), which is copy from kernel */ static int llu_iop_setattr(struct pnode *pno, @@ -844,7 +793,7 @@ static int llu_iop_setattr(struct pnode *pno, iattr.ia_valid |= ATTR_GID; } if (mask & SETATTR_LEN) { - iattr.ia_size = stbuf->st_size; /* FIXME signed expansion problem */ + iattr.ia_size = stbuf->st_size; /* XXX signed expansion problem */ iattr.ia_valid |= ATTR_SIZE; } @@ -950,10 +899,6 @@ static int llu_iop_readlink(struct pnode *pno, char *data, size_t bufsize) int rc; ENTRY; - /* on symlinks lli_open_sem protects lli_symlink_name allocation/data */ -/* - down(&lli->lli_open_sem); -*/ rc = llu_readlink_internal(inode, &request, &symname); if (rc) GOTO(out, rc); @@ -963,9 +908,6 @@ static int llu_iop_readlink(struct pnode *pno, char *data, size_t bufsize) ptlrpc_req_finished(request); out: -/* - up(&lli->lli_open_sem); -*/ RETURN(rc); } @@ -1013,80 +955,6 @@ static int llu_iop_mknod_raw(struct pnode *pno, RETURN(err); } -#if 0 -static int llu_mdc_unlink(struct inode *dir, struct inode *child, __u32 mode, - const char *name, int len) -{ - struct ptlrpc_request *request = NULL; - struct mds_body *body; - struct lov_mds_md *eadata; - struct lov_stripe_md *lsm = NULL; - struct obd_trans_info oti = { 0 }; - struct mdc_op_data op_data; - struct obdo *oa; - int rc; - ENTRY; - - llu_prepare_mdc_op_data(&op_data, dir, child, name, len, mode); - rc = mdc_unlink(&llu_i2sbi(dir)->ll_mdc_conn, &op_data, &request); - if (rc) - GOTO(out, rc); - /* req is swabbed so this is safe */ - body = lustre_msg_buf(request->rq_repmsg, 0, sizeof(*body)); - - if (!(body->valid & OBD_MD_FLEASIZE)) - GOTO(out, rc = 0); - - if (body->eadatasize == 0) { - CERROR("OBD_MD_FLEASIZE set but eadatasize zero\n"); - GOTO(out, rc = -EPROTO); - } - - /* The MDS sent back the EA because we unlinked the last reference - * to this file. Use this EA to unlink the objects on the OST. - * It's opaque so we don't swab here; we leave it to obd_unpackmd() to - * check it is complete and sensible. 
*/ - eadata = lustre_swab_repbuf(request, 1, body->eadatasize, NULL); - LASSERT(eadata != NULL); - if (eadata == NULL) { - CERROR("Can't unpack MDS EA data\n"); - GOTO(out, rc = -EPROTO); - } - - rc = obd_unpackmd(llu_i2obdconn(dir), &lsm, eadata, body->eadatasize); - if (rc < 0) { - CERROR("obd_unpackmd: %d\n", rc); - GOTO(out, rc); - } - LASSERT(rc >= sizeof(*lsm)); - - oa = obdo_alloc(); - if (oa == NULL) - GOTO(out_free_memmd, rc = -ENOMEM); - - oa->o_id = lsm->lsm_object_id; - oa->o_mode = body->mode & S_IFMT; - oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE; - - if (body->valid & OBD_MD_FLCOOKIE) { - oa->o_valid |= OBD_MD_FLCOOKIE; - oti.oti_logcookies = lustre_msg_buf(request->rq_repmsg, 3, - body->eadatasize); - } - - rc = obd_destroy(llu_i2obdconn(dir), oa, lsm, &oti); - obdo_free(oa); - if (rc) - CERROR("obd destroy objid 0x"LPX64" error %d\n", - lsm->lsm_object_id, rc); - out_free_memmd: - obd_free_memmd(llu_i2obdconn(dir), &lsm); - out: - ptlrpc_req_finished(request); - return rc; -} -#endif - static int llu_iop_link_raw(struct pnode *old, struct pnode *new) { struct inode *src = old->p_base->pb_ino; @@ -1176,7 +1044,7 @@ static int llu_iop_rename_raw(struct pnode *old, struct pnode *new) RETURN(rc); } -#if 0 +#ifdef _HAVE_STATVFS static int llu_statfs_internal(struct llu_sb_info *sbi, struct obd_statfs *osfs, unsigned long max_age) @@ -1221,7 +1089,7 @@ static int llu_statfs_internal(struct llu_sb_info *sbi, RETURN(rc); } -static int llu_statfs(struct llu_sb_info *sbi, struct kstatfs *sfs) +static int llu_statfs(struct llu_sb_info *sbi, struct statfs *sfs) { struct obd_statfs osfs; int rc; @@ -1284,7 +1152,7 @@ static int llu_iop_statvfs(struct pnode *pno, RETURN(0); } -#endif +#endif /* _HAVE_STATVFS */ static int llu_iop_mkdir_raw(struct pnode *pno, mode_t mode) { @@ -1449,7 +1317,6 @@ llu_fsswop_mount(const char *source, GOTO(out_free, err = -EINVAL); } - /* XXX */ /* generate a string unique to this super, let's try the address of the super itself.*/ len = (sizeof(sbi) * 2) + 1; @@ -1460,7 +1327,7 @@ llu_fsswop_mount(const char *source, cfg.cfg_instance = sbi->ll_instance; cfg.cfg_uuid = sbi->ll_sb_uuid; - err = liblustre_process_log(&cfg); + err = liblustre_process_log(&cfg, 1); if (err < 0) { CERROR("Unable to process log: %s\n", g_zconf_profile); @@ -1622,7 +1489,7 @@ static struct inode_ops llu_inode_ops = { inop_lookup: llu_iop_lookup, inop_getattr: llu_iop_getattr, inop_setattr: llu_iop_setattr, - inop_getdirentries: NULL, + inop_getdirentries: llu_iop_getdirentries, inop_mkdir: llu_iop_mkdir_raw, inop_rmdir: llu_iop_rmdir_raw, inop_symlink: llu_iop_symlink_raw, @@ -1640,7 +1507,7 @@ static struct inode_ops llu_inode_ops = { inop_datasync: llu_iop_datasync, inop_ioctl: llu_iop_ioctl, inop_mknod: llu_iop_mknod_raw, -#if 0 +#ifdef _HAVE_STATVFS inop_statvfs: llu_iop_statvfs, #endif inop_gone: llu_iop_gone, diff --git a/lustre/liblustre/tests/.cvsignore b/lustre/liblustre/tests/.cvsignore new file mode 100644 index 0000000..e995588 --- /dev/null +++ b/lustre/liblustre/tests/.cvsignore @@ -0,0 +1,3 @@ +.deps +Makefile +Makefile.in diff --git a/lustre/liblustre/tests/Makefile.am b/lustre/liblustre/tests/Makefile.am new file mode 100644 index 0000000..a99a4bb --- /dev/null +++ b/lustre/liblustre/tests/Makefile.am @@ -0,0 +1,45 @@ +## Liblustre excecutables & libraries Makefile +DEFS= + +CFLAGS := -g -Wall -I$(top_srcdir)/utils -I$(top_srcdir)/portals/include \ + -I$(top_srcdir)/portals/unals -I$(SYSIO)/include \ + -I/opt/lam/include -L/opt/lam/lib + +KFLAGS:= +CPPFLAGS = 
$(HAVE_EFENCE) -D_LARGEFILE64_SOURCE=1 +LIBS = $(LIBEFENCE) + + +LLIB_EXEC= ../liblustre.a -lpthread + +noinst_LIBRARIES = libtestcommon.a +libtestcommon_a_SOURCES = test_common.c + +bin_PROGRAMS = echo_test sanity recovery_small replay_single test_lock_cancel \ + replay_ost_single + +echo_test_SOURCES = echo_test.c ../../utils/parser.c ../../utils/obd.c ../../utils/lustre_cfg.c +echo_test_LDADD = ../liblsupport.a $(LIBREADLINE) -lpthread +echo_test_DEPENDENCIES=$(top_srcdir)/liblustre/liblsupport.a + +sanity_SOURCES = sanity.c +sanity_LDADD := ./libtestcommon.a $(LLIB_EXEC) +sanity_DEPENDENCIES = $(top_srcdir)/liblustre/liblustre.a ./libtestcommon.a + +recovery_small_SOURCES = recovery_small.c +recovery_small_LDADD := ./libtestcommon.a $(LLIB_EXEC) +recovery_small_DEPENDENCIES = $(top_srcdir)/liblustre/liblustre.a + +replay_single_SOURCES = replay_single.c +replay_single_LDADD := ./libtestcommon.a $(LLIB_EXEC) +replay_single_DEPENDENCIES = $(top_srcdir)/liblustre/liblustre.a + +test_lock_cancel_SOURCES = test_lock_cancel.c +test_lock_cancel_LDADD := $(LLIB_EXEC) -lmpi -llam + +replay_ost_single_SOURCES = replay_ost_single.c +replay_ost_single_LDADD := ./libtestcommon.a $(LLIB_EXEC) +replay_ost_single_DEPENDENCIES = $(top_srcdir)/liblustre/liblustre.a + +include $(top_srcdir)/Rules + diff --git a/lustre/liblustre/tests/echo_test.c b/lustre/liblustre/tests/echo_test.c new file mode 100644 index 0000000..51bf60fc --- /dev/null +++ b/lustre/liblustre/tests/echo_test.c @@ -0,0 +1,368 @@ +#include +#include +#include +#include + +#include /* needed for ptpctl.h */ +#include /* needed for parse_dump */ + + +#include +#include +#include +#include + +#define LIBLUSTRE_TEST 1 +#include "../utils/lctl.c" + +struct ldlm_namespace; +struct ldlm_res_id; +struct obd_import; + +void *inter_module_get(char *arg) +{ + if (!strcmp(arg, "tcpnal_ni")) + return &tcpnal_ni; + else if (!strcmp(arg, "ldlm_cli_cancel_unused")) + return ldlm_cli_cancel_unused; + else if (!strcmp(arg, "ldlm_namespace_cleanup")) + return ldlm_namespace_cleanup; + else if (!strcmp(arg, "ldlm_replay_locks")) + return ldlm_replay_locks; + else + return NULL; +} + +/* XXX move to proper place */ +char *portals_nid2str(int nal, ptl_nid_t nid, char *str) +{ + switch(nal){ + case TCPNAL: + /* userspace NAL */ + case SOCKNAL: + sprintf(str, "%u:%d.%d.%d.%d", (__u32)(nid >> 32), + HIPQUAD(nid)); + break; + case QSWNAL: + case GMNAL: + case IBNAL: + case SCIMACNAL: + sprintf(str, "%u:%u", (__u32)(nid >> 32), (__u32)nid); + break; + default: + return NULL; + } + return str; +} + +ptl_handle_ni_t tcpnal_ni; + +struct pingcli_args { + ptl_nid_t mynid; + ptl_nid_t nid; + ptl_pid_t port; + int count; + int size; +}; + +struct task_struct *current; + +/* portals interfaces */ +ptl_handle_ni_t * +kportal_get_ni (int nal) +{ + switch (nal) + { + case SOCKNAL: + return &tcpnal_ni; + default: + return NULL; + } +} + +inline void +kportal_put_ni (int nal) +{ + return; +} + +int +kportal_nal_cmd(struct portals_cfg *pcfg) +{ +#if 0 + __u32 nal = pcfg->pcfg_nal; + int rc = -EINVAL; + + ENTRY; + + down(&nal_cmd_sem); + if (nal > 0 && nal <= NAL_MAX_NR && nal_cmd[nal].nch_handler) { + CDEBUG(D_IOCTL, "calling handler nal: %d, cmd: %d\n", nal, + pcfg->pcfg_command); + rc = nal_cmd[nal].nch_handler(pcfg, nal_cmd[nal].nch_private); + } + up(&nal_cmd_sem); + RETURN(rc); +#else + CERROR("empty function!!!\n"); + return 0; +#endif +} + +int init_current(int argc, char **argv) +{ + current = malloc(sizeof(*current)); + strncpy(current->comm, argv[0], 
sizeof(current->comm)); + current->pid = getpid(); + return 0; +} + +ptl_nid_t tcpnal_mynid; + +int init_lib_portals() +{ + int rc; + + PtlInit(); + rc = PtlNIInit(procbridge_interface, 0, 0, 0, &tcpnal_ni); + if (rc != 0) { + CERROR("ksocknal: PtlNIInit failed: error %d\n", rc); + PtlFini(); + RETURN (rc); + } + PtlNIDebug(tcpnal_ni, ~0); + return rc; +} + +extern int class_handle_ioctl(unsigned int cmd, unsigned long arg); + +int liblustre_ioctl(int dev_id, int opc, void *ptr) +{ + int rc = -EINVAL; + + switch (dev_id) { + default: + fprintf(stderr, "Unexpected device id %d\n", dev_id); + abort(); + break; + + case OBD_DEV_ID: + rc = class_handle_ioctl(opc, (unsigned long)ptr); + break; + } + + return rc; +} + +static void generate_random_uuid(unsigned char uuid_out[16]) +{ + int *arr = (int*)uuid_out; + int i; + + for (i = 0; i < sizeof(uuid_out)/sizeof(int); i++) + arr[i] = rand(); +} + +static char *echo_server_nid = NULL; +static char *echo_server_ostname = "obd1"; +static char *osc_dev_name = "OSC_DEV_NAME"; +static char *echo_dev_name = "ECHO_CLIENT_DEV_NAME"; + +static int connect_echo_client(void) +{ + struct lustre_cfg lcfg; + ptl_nid_t nid; + char *peer = "ECHO_PEER_NID"; + class_uuid_t osc_uuid, echo_uuid; + struct obd_uuid osc_uuid_str, echo_uuid_str; + int nal, err; + ENTRY; + + generate_random_uuid(osc_uuid); + class_uuid_unparse(osc_uuid, &osc_uuid_str); + generate_random_uuid(echo_uuid); + class_uuid_unparse(echo_uuid, &echo_uuid_str); + + if (ptl_parse_nid(&nid, echo_server_nid)) { + CERROR("Can't parse NID %s\n", echo_server_nid); + RETURN(-EINVAL); + } + nal = ptl_name2nal("tcp"); + if (nal <= 0) { + CERROR("Can't parse NAL tcp\n"); + RETURN(-EINVAL); + } + + /* add uuid */ + LCFG_INIT(lcfg, LCFG_ADD_UUID, NULL); + lcfg.lcfg_nid = nid; + lcfg.lcfg_inllen1 = strlen(peer) + 1; + lcfg.lcfg_inlbuf1 = peer; + lcfg.lcfg_nal = nal; + err = class_process_config(&lcfg); + if (err < 0) { + CERROR("failed add_uuid\n"); + RETURN(-EINVAL); + } + + /* attach osc */ + LCFG_INIT(lcfg, LCFG_ATTACH, osc_dev_name); + lcfg.lcfg_inlbuf1 = "osc"; + lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1; + lcfg.lcfg_inlbuf2 = osc_uuid_str.uuid; + lcfg.lcfg_inllen2 = strlen(lcfg.lcfg_inlbuf2) + 1; + err = class_process_config(&lcfg); + if (err < 0) { + CERROR("failed attach osc\n"); + RETURN(-EINVAL); + } + + /* setup osc */ + LCFG_INIT(lcfg, LCFG_SETUP, osc_dev_name); + lcfg.lcfg_inlbuf1 = echo_server_ostname; + lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1; + lcfg.lcfg_inlbuf2 = peer; + lcfg.lcfg_inllen2 = strlen(lcfg.lcfg_inlbuf2) + 1; + err = class_process_config(&lcfg); + if (err < 0) { + CERROR("failed setup osc\n"); + RETURN(-EINVAL); + } + + /* attach echo_client */ + LCFG_INIT(lcfg, LCFG_ATTACH, echo_dev_name); + lcfg.lcfg_inlbuf1 = "echo_client"; + lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1; + lcfg.lcfg_inlbuf2 = echo_uuid_str.uuid; + lcfg.lcfg_inllen2 = strlen(lcfg.lcfg_inlbuf2) + 1; + err = class_process_config(&lcfg); + if (err < 0) { + CERROR("failed attach echo_client\n"); + RETURN(-EINVAL); + } + + /* setup echo_client */ + LCFG_INIT(lcfg, LCFG_SETUP, echo_dev_name); + lcfg.lcfg_inlbuf1 = osc_dev_name; + lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1; + lcfg.lcfg_inlbuf2 = NULL; + lcfg.lcfg_inllen2 = 0; + err = class_process_config(&lcfg); + if (err < 0) { + CERROR("failed setup echo_client\n"); + RETURN(-EINVAL); + } + + RETURN(0); +} + +static int disconnect_echo_client(void) +{ + struct lustre_cfg lcfg; + int err; + ENTRY; + + /* cleanup echo_client */ + 
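/* Teardown mirrors connect_echo_client() in reverse: the echo_client
+ * stacked on the OSC must be cleaned up and detached before the OSC
+ * underneath it:
+ *   setup:    ADD_UUID, ATTACH/SETUP osc, ATTACH/SETUP echo_client
+ *   teardown: CLEANUP/DETACH echo_client, CLEANUP/DETACH osc
+ * (a summary of the calls below; no extra commands are issued) */
+ 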
LCFG_INIT(lcfg, LCFG_CLEANUP, echo_dev_name); + err = class_process_config(&lcfg); + if (err < 0) { + CERROR("failed cleanup echo_client\n"); + RETURN(-EINVAL); + } + + /* detach echo_client */ + LCFG_INIT(lcfg, LCFG_DETACH, echo_dev_name); + err = class_process_config(&lcfg); + if (err < 0) { + CERROR("failed detach echo_client\n"); + RETURN(-EINVAL); + } + + /* cleanup osc */ + LCFG_INIT(lcfg, LCFG_CLEANUP, osc_dev_name); + err = class_process_config(&lcfg); + if (err < 0) { + CERROR("failed cleanup osc device\n"); + RETURN(-EINVAL); + } + + /* detach osc */ + LCFG_INIT(lcfg, LCFG_DETACH, osc_dev_name); + err = class_process_config(&lcfg); + if (err < 0) { + CERROR("failed detach osc device\n"); + RETURN(-EINVAL); + } + + RETURN(0); +} + +static void usage(const char *s) +{ + printf("Usage: %s -s ost_host_name [-n ost_name]\n", s); + printf(" ost_host_name: the host name of echo server\n"); + printf(" ost_name: ost name, default is \"obd1\"\n"); +} + +extern int time_ptlwait1; +extern int time_ptlwait2; +extern int time_ptlselect; + +int main(int argc, char **argv) +{ + int c, rc; + + while ((c = getopt(argc, argv, "s:n:")) != -1) { + switch (c) { + case 's': + echo_server_nid = optarg; + break; + case 'n': + echo_server_ostname = optarg; + break; + default: + usage(argv[0]); + return 1; + } + } + + if (optind != argc) + usage(argv[0]); + + if (!echo_server_nid) { + usage(argv[0]); + return 1; + } + + srand(time(NULL)); + + tcpnal_mynid = rand(); +#if 1 + portal_debug = 0; + portal_subsystem_debug = 0; +#endif + + if (init_current(argc, argv) || + init_obdclass() || init_lib_portals() || + ptlrpc_init() || + ldlm_init() || + mdc_init() || + lov_init() || + osc_init() || + echo_client_init()) { + printf("error\n"); + return 1; + } + + rc = connect_echo_client(); + if (rc) + return rc; + + set_ioc_handler(liblustre_ioctl); + + rc = lctl_main(1, &argv[0]); + + rc |= disconnect_echo_client(); + + return rc; +} diff --git a/lustre/liblustre/recovery_small.c b/lustre/liblustre/tests/recovery_small.c similarity index 99% rename from lustre/liblustre/recovery_small.c rename to lustre/liblustre/tests/recovery_small.c index b1292c2..5aed06c 100644 --- a/lustre/liblustre/recovery_small.c +++ b/lustre/liblustre/tests/recovery_small.c @@ -353,6 +353,8 @@ int main(int argc, char * argv[]) exit(-1); } + setenv(ENV_LUSTRE_TIMEOUT, "10", 1); + __liblustre_setup_(); while (drop_arr[drop_index].name) { diff --git a/lustre/liblustre/tests/replay_ost_single.c b/lustre/liblustre/tests/replay_ost_single.c new file mode 100644 index 0000000..2897807 --- /dev/null +++ b/lustre/liblustre/tests/replay_ost_single.c @@ -0,0 +1,338 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Lustre Light user test program + * + * Copyright (c) 2002, 2003 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#define _BSD_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "test_common.h" + + + +static char mds_server[1024] = {0,}; +static char barrier_script[1024] = {0,}; +static char failover_script[1024] = {0,}; +static char barrier_cmd[1024] = {0,}; +static char failover_cmd[1024] = {0,}; + +static void replay_barrier() +{ + int rc; + + if ((rc = system(barrier_cmd))) { + printf("execute barrier error: %d\n", rc); + exit(rc); + } +} + +static void mds_failover() +{ + int rc; + + if ((rc = system(failover_cmd))) { + printf("execute failover error: %d\n", rc); + exit(rc); + } +} + + +#define ENTRY(str) \ + do { \ + char buf[100]; \ + int len; \ + sprintf(buf, "===== START: %s ", (str)); \ + len = strlen(buf); \ + if (len < 79) { \ + memset(buf+len, '=', 100-len); \ + buf[79] = '\n'; \ + buf[80] = 0; \ + } \ + printf("%s", buf); \ + } while (0) + +#define LEAVE() \ + do { \ + printf("----- END TEST successfully ---"); \ + printf("-----------------------------"); \ + printf("-------------------\n"); \ + } while (0) + +void t0() +{ + const int bufsize = 4096; + char *path = "/mnt/lustre/rp_ost_t0_file"; + char buf[bufsize]; + int fd, i, j, rc; + ENTRY("open-failover-write-verification (no ping involved)"); + + printf("create/open file...\n"); + t_touch(path); + fd = t_open(path); + printf("OST failover...\n"); + replay_barrier(); + mds_failover(); + + printf("write file...\n"); + for (i = 0; i < 20; i++) { + memset(buf, i, bufsize); + if ((rc = write(fd, buf, bufsize)) != bufsize) { + perror("write error after failover"); + printf("i = %d, rc = %d\n", i, rc); + exit(-1); + } + } + + /* verify */ + printf("read & verify...\n"); + lseek(fd, 0, SEEK_SET); + for (i = 0; i < 20; i++) { + memset(buf, -1, bufsize); + if ((rc = read(fd, buf, bufsize)) != bufsize) { + perror("read error rc"); + printf("i = %d, rc = %d\n", i, rc); + exit(-1); + } + for (j = 0; j < bufsize; j++) { + if (buf[j] != i) { + printf("verify error!\n"); + exit(-1); + } + } + } + t_close(fd); + t_unlink(path); + LEAVE(); +} + +void t1() +{ + const int bufsize = 4096; + char *path = "/mnt/lustre/rp_ost_t1_file"; + char buf[bufsize]; + int fd, i, j; + ENTRY("open-write-close-open-failover-read (no ping involved)"); + + printf("create/open file...\n"); + t_touch(path); + fd = t_open(path); + printf("write file...\n"); + for (i = 0; i < 20; i++) { + memset(buf, i, bufsize); + if (write(fd, buf, bufsize) != bufsize) { + perror("write error"); + exit(-1); + } + } + printf("close/reopen...\n"); + t_close(fd); + fd = t_open(path); + lseek(fd, 0, SEEK_SET); + + printf("OST failover...\n"); + replay_barrier(); + mds_failover(); + + printf("read & verify...\n"); + for (i = 0; i < 20; i++) { + memset(buf, -1, bufsize); + if (read(fd, buf, bufsize) != bufsize) { + perror("read error after failover"); + exit(-1); + } + for (j = 0; j < bufsize; j++) { + if (buf[j] != i) { + printf("verify error after failover\n"); + exit(-1); + } + } + } + + t_close(fd); + t_unlink(path); + LEAVE(); +} + +void t2() +{ + char *path = "/mnt/lustre/rp_ost_t2_file"; + char *str = "xxxxjoiwlsdf98lsjdfsjfoajflsjfajfoaidfojaj08eorje;"; + ENTRY("empty replay"); + + replay_barrier(); + mds_failover(); + + t_echo_create(path, str); + t_grep(path, str); + t_unlink(path); +} + +void t3() +{ + char *path = "/mnt/lustre/rp_ost_t3_file"; + char *str = "xxxxjoiwlsdf98lsjdfsjfoajflsjfajfoaidfojaj08eorje;"; + ENTRY("touch"); + + printf("touch to create a file\n"); + 
t_echo_create(path, str); + replay_barrier(); + mds_failover(); + + printf("read & verify\n"); + t_grep(path, str); + t_unlink(path); + /* XXX we see problems without this sleep; it looks like a server-side issue XXX */ + sleep(5); +} + +void t4() +{ + char *path = "/mnt/lustre/rp_ost_t4_file"; + char namebuf[1024]; + char str[1024]; + int count = 10, i; + ENTRY("|X| 10 open(CREAT)s (ping involved)"); + + printf("create %d files\n", count); + for (i = 0; i < count; i++) { + sprintf(namebuf, "%s%02d", path, i); + sprintf(str, "%s-%08d-%08x-AAAAA", "content", i, i); + t_echo_create(namebuf, str); + } + replay_barrier(); + mds_failover(); + + printf("read & verify\n"); + for (i = 0; i < count; i++) { + sprintf(namebuf, "%s%02d", path, i); + sprintf(str, "%s-%08d-%08x-AAAAA", "content", i, i); + t_grep(namebuf, str); + t_unlink(namebuf); + } +} + +extern int portal_debug; +extern int portal_subsystem_debug; + +extern void __liblustre_setup_(void); +extern void __liblustre_cleanup_(void); + +void usage(const char *cmd) +{ + printf("Usage: \t%s --target mdsnid:/mdsname/profile -s ost_hostname " + "-b \"barrier cmd\" -f \"failover cmd\"\n", cmd); + printf(" \t%s --dumpfile dumpfile -s ost_hostname -b \"barrier cmd\" " + "-f \"failover cmd\"\n", cmd); + exit(-1); +} + +void test_ssh() +{ + char cmd[1024]; + + sprintf(cmd, "ssh %s cat /dev/null", mds_server); + if (system(cmd)) { + printf("ssh can't access server node: %s\n", mds_server); + exit(-1); + } +} + +int main(int argc, char * const argv[]) +{ + int opt_index, c; + static struct option long_opts[] = { + {"target", 1, 0, 0}, + {"dumpfile", 1, 0, 0}, + {0, 0, 0, 0} + }; + + if (argc < 4) + usage(argv[0]); + + while ((c = getopt_long(argc, argv, "s:b:f:", long_opts, &opt_index)) != -1) { + switch (c) { + case 0: { + if (!optarg[0]) + usage(argv[0]); + + if (!strcmp(long_opts[opt_index].name, "target")) { + setenv(ENV_LUSTRE_MNTTGT, optarg, 1); + } else if (!strcmp(long_opts[opt_index].name, "dumpfile")) { + setenv(ENV_LUSTRE_DUMPFILE, optarg, 1); + } else + usage(argv[0]); + break; + } + case 's': + strcpy(mds_server, optarg); + break; + case 'b': + strcpy(barrier_script, optarg); + break; + case 'f': + strcpy(failover_script, optarg); + break; + default: + usage(argv[0]); + } + } + + if (optind != argc) + usage(argv[0]); + if (!strlen(mds_server) || !strlen(barrier_script) || + !strlen(failover_script)) + usage(argv[0]); + + test_ssh(); + + /* prepare remote command */ + sprintf(barrier_cmd, "ssh %s \"%s\"", mds_server, barrier_script); + sprintf(failover_cmd, "ssh %s \"%s\"", mds_server, failover_script); + + setenv(ENV_LUSTRE_TIMEOUT, "5", 1); + + __liblustre_setup_(); + + t0(); + t1(); + t2(); + t3(); + t4(); + + printf("liblustre is about to shut down\n"); + __liblustre_cleanup_(); + + printf("completed successfully\n"); + return 0; +} diff --git a/lustre/liblustre/replay_single.c b/lustre/liblustre/tests/replay_single.c old mode 100755 new mode 100644 similarity index 99% rename from lustre/liblustre/replay_single.c rename to lustre/liblustre/tests/replay_single.c index 1602a7c..6645056 --- a/lustre/liblustre/replay_single.c +++ b/lustre/liblustre/tests/replay_single.c @@ -384,6 +384,8 @@ int main(int argc, char * const argv[]) sprintf(barrier_cmd, "ssh %s \"%s\"", mds_server, barrier_script); sprintf(failover_cmd, "ssh %s \"%s\"", mds_server, failover_script); + setenv(ENV_LUSTRE_TIMEOUT, "10", 1); + __liblustre_setup_(); t0(); diff --git a/lustre/liblustre/lltest.c b/lustre/liblustre/tests/sanity.c similarity index 81% rename from lustre/liblustre/lltest.c
rename to lustre/liblustre/tests/sanity.c index ac6e8ad..391dd3d 100644 --- a/lustre/liblustre/lltest.c +++ b/lustre/liblustre/tests/sanity.c @@ -180,7 +180,7 @@ static void pages_io(int xfer, loff_t pos) void t5() { char text[256]; - loff_t off_array[] = {1, 17, 255, 257, 4095, 4097, 8191, 1024*1024*1024}; + loff_t off_array[] = {1, 4, 17, 255, 258, 4095, 4097, 8191, 1024*1024*1024}; int np = 1, i; loff_t offset = 0; @@ -280,7 +280,7 @@ void t10() LEAVE(); } -void t100() +void t11() { char *base="/mnt/lustre"; char path[4096], path2[4096]; @@ -316,6 +316,91 @@ void t100() LEAVE(); } +void t12() +{ + char *dir="/mnt/lustre/test_t12_dir"; + char buf[1024*128]; + int fd; + ENTRY("empty directory readdir"); + + t_mkdir(dir); + fd = t_open(dir); + t_ls(fd, buf, sizeof(buf)); + t_close(fd); + t_rmdir(dir); + LEAVE(); +} + +void t13() +{ + char *dir="/mnt/lustre/test_t13_dir/"; + char name[1024]; + char buf[1024]; + const int nfiles = 20; + char *prefix = "test13_filename_prefix_"; + int fd, i; + ENTRY("multiple entries directory readdir"); + + t_mkdir(dir); + printf("Creating %d files...\n", nfiles); + for (i = 0; i < nfiles; i++) { + sprintf(name, "%s%s%05d", dir, prefix, i); + t_touch(name); + } + fd = t_open(dir); + t_ls(fd, buf, sizeof(buf)); + t_close(fd); + printf("Cleanup...\n"); + for (i = 0; i < nfiles; i++) { + sprintf(name, "%s%s%05d", dir, prefix, i); + t_unlink(name); + } + t_rmdir(dir); + LEAVE(); +} + +void t14() +{ + char *dir="/mnt/lustre/test_t14_dir/"; + char name[1024]; + char buf[1024]; + const int nfiles = 256; + char *prefix = "test14_filename_long_prefix_AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA___"; + int fd, i; + ENTRY(">1 block(4k) directory readdir"); + + t_mkdir(dir); + printf("Creating %d files...\n", nfiles); + for (i = 0; i < nfiles; i++) { + sprintf(name, "%s%s%05d", dir, prefix, i); + t_touch(name); + } + fd = t_open(dir); + t_ls(fd, buf, sizeof(buf)); + t_close(fd); + printf("Cleanup...\n"); + for (i = 0; i < nfiles; i++) { + sprintf(name, "%s%s%05d", dir, prefix, i); + t_unlink(name); + } + t_rmdir(dir); + LEAVE(); +} + +void t15() +{ + char *file = "/mnt/lustre/test_t15_file"; + int fd; + ENTRY("open-stat-close"); + + t_touch(file); + fd = t_open(file); + t_check_stat(file, NULL); + t_close(fd); + t_unlink(file); + LEAVE(); +} + extern void __liblustre_setup_(void); extern void __liblustre_cleanup_(void); @@ -341,7 +426,6 @@ int main(int argc, char * const argv[]) while ((c = getopt_long(argc, argv, "", long_opts, &opt_index)) != -1) { switch (c) { case 0: { - printf("optindex %d\n", opt_index); if (!optarg[0]) usage(argv[0]); @@ -374,8 +458,11 @@ int main(int argc, char * const argv[]) t8(); t9(); t10(); - - t100(); + t11(); + t12(); + t13(); + t14(); + t15(); #endif printf("liblustre is about shutdown\n"); diff --git a/lustre/liblustre/test_common.c b/lustre/liblustre/tests/test_common.c similarity index 91% rename from lustre/liblustre/test_common.c rename to lustre/liblustre/tests/test_common.c index 210d57e..6f6676e 100644 --- a/lustre/liblustre/test_common.c +++ b/lustre/liblustre/tests/test_common.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "test_common.h" @@ -243,7 +244,7 @@ void t_echo_create(const char *path, const char *str) } } -void _t_grep(const char *path, char *str, int should_contain) +static void _t_grep(const char *path, char *str, int should_contain) { char buf[1024]; int fd; @@ -278,3 +279,25 @@ void t_grep_v(const char *path, char *str) { _t_grep(path, str, 0); } + +void t_ls(int fd, char *buf, int size) +{ + struct dirent64 
*ent; + int rc, pos; + loff_t base = 0; + + printf("dir entries listing...\n"); + while ((rc = getdirentries64(fd, buf, size, &base)) > 0) { + pos = 0; + while (pos < rc) { + ent = (struct dirent64 *) ((char*) buf + pos); + printf("%s\n", ent->d_name); + pos += ent->d_reclen; + } + } + + if (rc < 0) { + printf("getdents error %d\n", rc); + EXIT(-1); + } +} diff --git a/lustre/liblustre/test_common.h b/lustre/liblustre/tests/test_common.h similarity index 91% rename from lustre/liblustre/test_common.h rename to lustre/liblustre/tests/test_common.h index af638f2..9d537cc 100644 --- a/lustre/liblustre/test_common.h +++ b/lustre/liblustre/tests/test_common.h @@ -3,6 +3,7 @@ #define ENV_LUSTRE_MNTPNT "LIBLUSTRE_MOUNT_POINT" #define ENV_LUSTRE_MNTTGT "LIBLUSTRE_MOUNT_TARGET" +#define ENV_LUSTRE_TIMEOUT "LIBLUSTRE_TIMEOUT" #define ENV_LUSTRE_DUMPFILE "LIBLUSTRE_DUMPFILE" extern int exit_on_err; @@ -24,8 +25,8 @@ void t_close(int fd); int t_check_stat(const char *name, struct stat *buf); int t_check_stat_fail(const char *name); void t_echo_create(const char *path, const char *str); -//int t_pread_once(const char *path, char *buf, size_t size, off_t offset); void t_grep(const char *path, char *str); void t_grep_v(const char *path, char *str); +void t_ls(int fd, char *buf, int size); #endif diff --git a/lustre/liblustre/test_lock_cancel.c b/lustre/liblustre/tests/test_lock_cancel.c similarity index 100% rename from lustre/liblustre/test_lock_cancel.c rename to lustre/liblustre/tests/test_lock_cancel.c diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index 21555c2..e5801a0 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -542,7 +542,7 @@ int lustre_fill_super(struct super_block *sb, void *data, int silent) cfg.cfg_instance = sbi->ll_instance; cfg.cfg_uuid = sbi->ll_sb_uuid; cfg.cfg_local_nid = lmd->lmd_local_nid; - err = lustre_process_log(lmd, lmd->lmd_profile, &cfg, 1); + err = lustre_process_log(lmd, lmd->lmd_profile, &cfg, 0); if (err < 0) { CERROR("Unable to process log: %s\n", lmd->lmd_profile); diff --git a/lustre/lov/Makefile.am b/lustre/lov/Makefile.am index d6d35d8..468d064 100644 --- a/lustre/lov/Makefile.am +++ b/lustre/lov/Makefile.am @@ -6,8 +6,9 @@ DEFS= if LIBLUSTRE -lib_LIBRARIES = liblov.a +noinst_LIBRARIES = liblov.a liblov_a_SOURCES = lov_log.c lov_obd.c lov_pack.c lov_internal.h +liblov_a_CFLAGS = -fPIC else MODULE = lov modulefs_DATA = lov.o diff --git a/lustre/lvfs/Makefile.am b/lustre/lvfs/Makefile.am index 3006306..1fd7dd1 100644 --- a/lustre/lvfs/Makefile.am +++ b/lustre/lvfs/Makefile.am @@ -14,8 +14,9 @@ endif if LIBLUSTRE -lib_LIBRARIES = liblvfs.a +noinst_LIBRARIES = liblvfs.a liblvfs_a_SOURCES = lvfs_userfs.c +liblvfs_a_CFLAGS = -fPIC #if MYSQL #liblvfs_a_SOURCES += lvfs_user_mysql.c diff --git a/lustre/mdc/Makefile.am b/lustre/mdc/Makefile.am index c254e76..0200532 100644 --- a/lustre/mdc/Makefile.am +++ b/lustre/mdc/Makefile.am @@ -6,8 +6,9 @@ DEFS= if LIBLUSTRE -lib_LIBRARIES = libmdc.a +noinst_LIBRARIES = libmdc.a libmdc_a_SOURCES = mdc_request.c mdc_reint.c mdc_lib.c mdc_internal.h mdc_locks.c +libmdc_a_CFLAGS = -fPIC else MODULE = mdc modulefs_DATA = mdc.o diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c index bcac2e3..8ceb655 100644 --- a/lustre/mdc/mdc_request.c +++ b/lustre/mdc/mdc_request.c @@ -550,14 +550,12 @@ int mdc_readpage(struct obd_export *exp, struct ll_fid *mdc_fid, __u64 offset, /* XXX FIXME bug 249 */ req->rq_request_portal = MDS_READPAGE_PORTAL; - desc = ptlrpc_prep_bulk_imp(req, BULK_PUT_SINK, 
MDS_BULK_PORTAL); + desc = ptlrpc_prep_bulk_imp(req, 1, BULK_PUT_SINK, MDS_BULK_PORTAL); if (desc == NULL) GOTO(out, rc = -ENOMEM); /* NB req now owns desc and will free it when it gets freed */ - rc = ptlrpc_prep_bulk_page(desc, page, 0, PAGE_CACHE_SIZE); - if (rc != 0) - GOTO(out, rc); + ptlrpc_prep_bulk_page(desc, page, 0, PAGE_CACHE_SIZE); mdc_readdir_pack(req, offset, PAGE_CACHE_SIZE, mdc_fid); @@ -565,13 +563,20 @@ int mdc_readpage(struct obd_export *exp, struct ll_fid *mdc_fid, __u64 offset, rc = ptlrpc_queue_wait(req); if (rc == 0) { - LASSERT(desc->bd_page_count == 1); - body = lustre_swab_repbuf(req, 0, sizeof(*body), + body = lustre_swab_repbuf(req, 0, sizeof (*body), lustre_swab_mds_body); if (body == NULL) { CERROR("Can't unpack mds_body\n"); GOTO(out, rc = -EPROTO); } + + if (req->rq_bulk->bd_nob_transferred != PAGE_CACHE_SIZE) { + CERROR ("Unexpected # bytes transferred: %d" + " (%ld expected)\n", + req->rq_bulk->bd_nob_transferred, + PAGE_CACHE_SIZE); + GOTO (out, rc = -EPROTO); + } } EXIT; diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index c512293..fbb3a3c 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -59,19 +59,6 @@ static int mds_postsetup(struct obd_device *obd); static int mds_cleanup(struct obd_device *obd, int flags); -static int mds_bulk_timeout(void *data) -{ - struct ptlrpc_bulk_desc *desc = data; - struct obd_export *exp = desc->bd_export; - - DEBUG_REQ(D_ERROR, desc->bd_req,"bulk send timed out: evicting %s@%s\n", - exp->exp_client_uuid.uuid, - exp->exp_connection->c_remote_uuid.uuid); - ptlrpc_fail_export(exp); - ptlrpc_abort_bulk (desc); - RETURN(1); -} - /* Assumes caller has already pushed into the kernel filesystem context */ static int mds_sendpage(struct ptlrpc_request *req, struct file *file, loff_t offset, int count) @@ -89,7 +76,7 @@ static int mds_sendpage(struct ptlrpc_request *req, struct file *file, if (!pages) GOTO(out, rc = -ENOMEM); - desc = ptlrpc_prep_bulk_exp (req, BULK_PUT_SOURCE, MDS_BULK_PORTAL); + desc = ptlrpc_prep_bulk_exp (req, 1, BULK_PUT_SOURCE, MDS_BULK_PORTAL); if (desc == NULL) GOTO(out_free, rc = -ENOMEM); @@ -100,9 +87,7 @@ static int mds_sendpage(struct ptlrpc_request *req, struct file *file, if (pages[i] == NULL) GOTO(cleanup_buf, rc = -ENOMEM); - rc = ptlrpc_prep_bulk_page(desc, pages[i], 0, tmpsize); - if (rc != 0) - GOTO(cleanup_buf, rc); + ptlrpc_prep_bulk_page(desc, pages[i], 0, tmpsize); } for (i = 0, tmpcount = count; i < npages; i++, tmpcount -= tmpsize) { @@ -118,25 +103,41 @@ static int mds_sendpage(struct ptlrpc_request *req, struct file *file, GOTO(cleanup_buf, rc = -EIO); } - rc = ptlrpc_bulk_put(desc); + LASSERT(desc->bd_nob == count); + + rc = ptlrpc_start_bulk_transfer(desc); if (rc) GOTO(cleanup_buf, rc); if (OBD_FAIL_CHECK(OBD_FAIL_MDS_SENDPAGE)) { CERROR("obd_fail_loc=%x, fail operation rc=%d\n", OBD_FAIL_MDS_SENDPAGE, rc); - ptlrpc_abort_bulk(desc); - GOTO(cleanup_buf, rc); + GOTO(abort_bulk, rc); } - lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, mds_bulk_timeout, desc); - rc = l_wait_event(desc->bd_waitq, ptlrpc_bulk_complete (desc), &lwi); - if (rc) { - LASSERT (rc == -ETIMEDOUT); - GOTO(cleanup_buf, rc); + lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, NULL, NULL); + rc = l_wait_event(desc->bd_waitq, !ptlrpc_bulk_active(desc), &lwi); + LASSERT (rc == 0 || rc == -ETIMEDOUT); + + if (rc == 0) { + if (desc->bd_success && + desc->bd_nob_transferred == count) + GOTO(cleanup_buf, rc); + + rc = -ETIMEDOUT; /* XXX should this be a different errno? 
*/ } + + DEBUG_REQ(D_ERROR, req, "bulk failed: %s %d(%d), evicting %s@%s\n", + (rc == -ETIMEDOUT) ? "timeout" : "network error", + desc->bd_nob_transferred, count, + req->rq_export->exp_client_uuid.uuid, + req->rq_export->exp_connection->c_remote_uuid.uuid); + + ptlrpc_fail_export(req->rq_export); EXIT; + abort_bulk: + ptlrpc_abort_bulk (desc); cleanup_buf: for (i = 0; i < npages; i++) if (pages[i]) @@ -358,6 +359,21 @@ static int mds_disconnect(struct obd_export *export, int flags) ldlm_cancel_locks_for_export(export); + /* complete all outstanding replies */ + spin_lock_irqsave (&export->exp_lock, irqflags); + while (!list_empty (&export->exp_outstanding_replies)) { + struct ptlrpc_reply_state *rs = + list_entry (export->exp_outstanding_replies.next, + struct ptlrpc_reply_state, rs_exp_list); + struct ptlrpc_service *svc = rs->rs_srv_ni->sni_service; + + spin_lock (&svc->srv_lock); + list_del_init (&rs->rs_exp_list); + ptlrpc_schedule_difficult_reply (rs); + spin_unlock (&svc->srv_lock); + } + spin_unlock_irqrestore (&export->exp_lock, irqflags); + spin_lock_irqsave(&export->exp_lock, irqflags); export->exp_flags = flags; spin_unlock_irqrestore(&export->exp_lock, irqflags); @@ -1100,7 +1116,13 @@ int mds_handle(struct ptlrpc_request *req) OBD_FAIL_RETURN(OBD_FAIL_MDS_READPAGE_NET, 0); rc = mds_readpage(req); - OBD_FAIL_RETURN(OBD_FAIL_MDS_SENDPAGE, 0); + if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_MDS_SENDPAGE)) { + if (req->rq_reply_state) { + lustre_free_reply_state (req->rq_reply_state); + req->rq_reply_state = NULL; + } + RETURN(0); + } break; @@ -1789,11 +1811,11 @@ static int mdt_setup(struct obd_device *obddev, obd_count len, void *buf) int rc = 0; ENTRY; - mds->mds_service = ptlrpc_init_svc(MDS_NEVENTS, MDS_NBUFS, - MDS_BUFSIZE, MDS_MAXREQSIZE, - MDS_REQUEST_PORTAL, MDC_REPLY_PORTAL, - mds_handle, "mds", - obddev->obd_proc_entry); + mds->mds_service = + ptlrpc_init_svc(MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE, + MDS_REQUEST_PORTAL, MDC_REPLY_PORTAL, + mds_handle, "mds", + obddev->obd_proc_entry); if (!mds->mds_service) { CERROR("failed to start service\n"); @@ -1806,8 +1828,7 @@ static int mdt_setup(struct obd_device *obddev, obd_count len, void *buf) GOTO(err_thread, rc); mds->mds_setattr_service = - ptlrpc_init_svc(MDS_NEVENTS, MDS_NBUFS, - MDS_BUFSIZE, MDS_MAXREQSIZE, + ptlrpc_init_svc(MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE, MDS_SETATTR_PORTAL, MDC_REPLY_PORTAL, mds_handle, "mds_setattr", obddev->obd_proc_entry); @@ -1822,8 +1843,7 @@ static int mdt_setup(struct obd_device *obddev, obd_count len, void *buf) GOTO(err_thread2, rc); mds->mds_readpage_service = - ptlrpc_init_svc(MDS_NEVENTS, MDS_NBUFS, - MDS_BUFSIZE, MDS_MAXREQSIZE, + ptlrpc_init_svc(MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE, MDS_READPAGE_PORTAL, MDC_REPLY_PORTAL, mds_handle, "mds_readpage", obddev->obd_proc_entry); diff --git a/lustre/mds/mds_internal.h b/lustre/mds/mds_internal.h index 48d739c..a6bba27 100644 --- a/lustre/mds/mds_internal.h +++ b/lustre/mds/mds_internal.h @@ -52,7 +52,8 @@ int mds_cleanup_orphans(struct obd_device *obd); /* mds/mds_log.c */ int mds_log_op_unlink(struct obd_device *obd, struct inode *inode, - struct lustre_msg *repmsg, int offset); + struct lov_mds_md *lmm, int lmm_size, + struct llog_cookie *logcookies, int cookies_size); int mds_llog_init(struct obd_device *obd, struct obd_device *tgt, int count, struct llog_logid *logid); int mds_llog_finish(struct obd_device *obd, int count); diff --git a/lustre/mds/mds_log.c b/lustre/mds/mds_log.c index 549c760..c4d5690 100644 --- a/lustre/mds/mds_log.c +++ 
b/lustre/mds/mds_log.c @@ -82,7 +82,8 @@ static int mds_llog_repl_cancel(struct llog_ctxt *ctxt, struct lov_stripe_md *ls } int mds_log_op_unlink(struct obd_device *obd, struct inode *inode, - struct lustre_msg *repmsg, int offset) + struct lov_mds_md *lmm, int lmm_size, + struct llog_cookie *logcookies, int cookies_size) { struct mds_obd *mds = &obd->u.mds; struct lov_stripe_md *lsm = NULL; @@ -94,14 +95,13 @@ int mds_log_op_unlink(struct obd_device *obd, struct inode *inode, RETURN(PTR_ERR(mds->mds_osc_obd)); rc = obd_unpackmd(mds->mds_osc_exp, &lsm, - lustre_msg_buf(repmsg, offset, 0), - repmsg->buflens[offset]); + lmm, lmm_size); if (rc < 0) RETURN(rc); ctxt = llog_get_context(obd, LLOG_UNLINK_ORIG_CTXT); - rc = llog_add(ctxt, NULL, lsm, lustre_msg_buf(repmsg, offset + 1, 0), - repmsg->buflens[offset + 1] / sizeof(struct llog_cookie)); + rc = llog_add(ctxt, NULL, lsm, logcookies, + cookies_size / sizeof(struct llog_cookie)); obd_free_memmd(mds->mds_osc_exp, &lsm); @@ -123,7 +123,7 @@ int mds_llog_init(struct obd_device *obd, struct obd_device *tgt, struct obd_device *lov_obd = obd->u.mds.mds_osc_obd; int rc; ENTRY; - + rc = llog_setup(obd, LLOG_UNLINK_ORIG_CTXT, tgt, 0, NULL, &mds_unlink_orig_logops); if (rc) @@ -134,9 +134,9 @@ int mds_llog_init(struct obd_device *obd, struct obd_device *tgt, if (rc) RETURN(rc); - rc = obd_llog_init(lov_obd, tgt, count, logid); - if (rc) - CERROR("error lov_llog_init\n"); + rc = obd_llog_init(lov_obd, tgt, count, logid); + if (rc) + CERROR("error lov_llog_init\n"); RETURN(rc); } @@ -146,7 +146,7 @@ int mds_llog_finish(struct obd_device *obd, int count) struct obd_device *lov_obd = obd->u.mds.mds_osc_obd; int rc; ENTRY; - + rc = llog_cleanup(llog_get_context(obd, LLOG_UNLINK_ORIG_CTXT)); if (rc) RETURN(rc); @@ -155,9 +155,9 @@ int mds_llog_finish(struct obd_device *obd, int count) if (rc) RETURN(rc); - rc = obd_llog_finish(lov_obd, count); - if (rc) - CERROR("error lov_llog_finish\n"); + rc = obd_llog_finish(lov_obd, count); + if (rc) + CERROR("error lov_llog_finish\n"); RETURN(rc); } diff --git a/lustre/mds/mds_open.c b/lustre/mds/mds_open.c index 80728da..88724c0 100644 --- a/lustre/mds/mds_open.c +++ b/lustre/mds/mds_open.c @@ -455,7 +455,6 @@ static void reconstruct_open(struct mds_update_record *rec, int offset, struct ptlrpc_request *req, struct lustre_handle *child_lockh) { - struct ptlrpc_request *oldreq = req->rq_export->exp_outstanding_reply; struct mds_export_data *med = &req->rq_export->exp_mds_data; struct mds_client_data *mcd = med->med_mcd; struct mds_obd *mds = mds_req2mds(req); @@ -553,10 +552,11 @@ static void reconstruct_open(struct mds_update_record *rec, int offset, mfd = NULL; } - if (oldreq != NULL) { - /* if we're not recovering, it had better be found */ - LASSERT(mfd != NULL); - } else if (mfd == NULL) { +#warning "XXX fixme" + /* Here it used to LASSERT(mfd) if exp_outstanding_reply != NULL. 
+ * Now that exp_outstanding_reply is a list, it's just using mfd != NULL + * to detect a re-open */ + if (mfd == NULL) { mntget(mds->mds_vfsmnt); CERROR("Re-opened file \n"); mfd = mds_dentry_open(child, mds->mds_vfsmnt, @@ -969,7 +969,7 @@ int mds_open(struct mds_update_record *rec, int offset, if (rc) ldlm_lock_decref(&parent_lockh, parent_mode); else - ldlm_put_lock_into_req(req, &parent_lockh, parent_mode); + ptlrpc_save_lock (req, &parent_lockh, parent_mode); } if (rc == 0) atomic_inc(&mds->mds_open_count); @@ -1048,7 +1048,10 @@ int mds_mfd_close(struct ptlrpc_request *req, struct obd_device *obd, if (req != NULL && (reply_body->valid & OBD_MD_FLEASIZE) && mds_log_op_unlink(obd, pending_child->d_inode, - req->rq_repmsg, 1) > 0) { + lustre_msg_buf(req->rq_repmsg, 1, 0), + req->rq_repmsg->buflens[1], + lustre_msg_buf(req->rq_repmsg, 2, 0), + req->rq_repmsg->buflens[2]) > 0) { reply_body->valid |= OBD_MD_FLCOOKIE; } diff --git a/lustre/mds/mds_reint.c b/lustre/mds/mds_reint.c index dcacdcf..b44dc22 100644 --- a/lustre/mds/mds_reint.c +++ b/lustre/mds/mds_reint.c @@ -270,21 +270,53 @@ int mds_fix_attr(struct inode *inode, struct mds_update_record *rec) RETURN(0); } -void mds_steal_ack_locks(struct obd_export *exp, struct ptlrpc_request *req) +void mds_steal_ack_locks(struct ptlrpc_request *req) { - unsigned long flags; - struct ptlrpc_request *oldrep = exp->exp_outstanding_reply; - - if (oldrep == NULL) + struct obd_export *exp = req->rq_export; + struct list_head *tmp; + struct ptlrpc_reply_state *oldrep; + struct ptlrpc_service *svc; + unsigned long flags; + int i; + + /* CAVEAT EMPTOR: spinlock order */ + spin_lock_irqsave (&exp->exp_lock, flags); + list_for_each (tmp, &exp->exp_outstanding_replies) { + oldrep = list_entry(tmp, struct ptlrpc_reply_state,rs_exp_list); + + if (oldrep->rs_xid != req->rq_xid) + continue; + + if (oldrep->rs_msg.opc != req->rq_reqmsg->opc) + CERROR ("Resent req xid "LPX64" has mismatched opc: " + "new %d old %d\n", req->rq_xid, + req->rq_reqmsg->opc, oldrep->rs_msg.opc); + + svc = oldrep->rs_srv_ni->sni_service; + spin_lock (&svc->srv_lock); + + list_del_init (&oldrep->rs_exp_list); + + CWARN("Stealing %d locks from rs %p x"LPD64".t"LPD64 + " o%d NID"LPX64"\n", + oldrep->rs_nlocks, oldrep, + oldrep->rs_xid, oldrep->rs_transno, oldrep->rs_msg.opc, + exp->exp_connection->c_peer.peer_nid); + + for (i = 0; i < oldrep->rs_nlocks; i++) + ptlrpc_save_lock(req, + &oldrep->rs_locks[i], + oldrep->rs_modes[i]); + oldrep->rs_nlocks = 0; + + DEBUG_REQ(D_HA, req, "stole locks for"); + ptlrpc_schedule_difficult_reply (oldrep); + + spin_unlock (&svc->srv_lock); + spin_unlock_irqrestore (&exp->exp_lock, flags); return; - memcpy(req->rq_ack_locks, oldrep->rq_ack_locks, - sizeof req->rq_ack_locks); - spin_lock_irqsave(&req->rq_lock, flags); - oldrep->rq_resent = 1; - wake_up(&oldrep->rq_reply_waitq); - spin_unlock_irqrestore(&req->rq_lock, flags); - DEBUG_REQ(D_HA, oldrep, "stole locks from"); - DEBUG_REQ(D_HA, req, "stole locks for"); + } + spin_unlock_irqrestore (&exp->exp_lock, flags); } void mds_req_from_mcd(struct ptlrpc_request *req, struct mds_client_data *mcd) @@ -294,8 +326,7 @@ void mds_req_from_mcd(struct ptlrpc_request *req, struct mds_client_data *mcd) req->rq_repmsg->transno = req->rq_transno = mcd->mcd_last_transno; req->rq_repmsg->status = req->rq_status = mcd->mcd_last_result; - if (req->rq_export->exp_outstanding_reply) - mds_steal_ack_locks(req->rq_export, req); + mds_steal_ack_locks(req); } static void reconstruct_reint_setattr(struct mds_update_record 
*rec, @@ -444,7 +475,7 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset, if (rc) { ldlm_lock_decref(&lockh, LCK_PW); } else { - ldlm_put_lock_into_req(req, &lockh, LCK_PW); + ptlrpc_save_lock (req, &lockh, LCK_PW); } } case 0: @@ -695,7 +726,7 @@ cleanup: if (rc) { ldlm_lock_decref(&lockh, LCK_PW); } else { - ldlm_put_lock_into_req(req, &lockh, LCK_PW); + ptlrpc_save_lock (req, &lockh, LCK_PW); } l_dput(dparent); case 0: @@ -1181,8 +1212,11 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset, rc = vfs_unlink(dparent->d_inode, dchild); if (!rc && log_unlink) - if (mds_log_op_unlink(obd, child_inode, req->rq_repmsg, - offset + 1) > 0) + if (mds_log_op_unlink(obd, child_inode, + lustre_msg_buf(req->rq_repmsg, offset + 1, 0), + req->rq_repmsg->buflens[offset + 1], + lustre_msg_buf(req->rq_repmsg, offset + 2, 0), + req->rq_repmsg->buflens[offset + 2]) > 0) body->valid |= OBD_MD_FLCOOKIE; break; } @@ -1234,14 +1268,14 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset, if (rc) ldlm_lock_decref(&child_reuse_lockh, LCK_EX); else - ldlm_put_lock_into_req(req, &child_reuse_lockh, LCK_EX); + ptlrpc_save_lock(req, &child_reuse_lockh, LCK_EX); case 2: /* child lock */ ldlm_lock_decref(&child_lockh, LCK_EX); case 1: /* child and parent dentry, parent lock */ if (rc) ldlm_lock_decref(&parent_lockh, LCK_PW); else - ldlm_put_lock_into_req(req, &parent_lockh, LCK_PW); + ptlrpc_save_lock(req, &parent_lockh, LCK_PW); l_dput(dchild); l_dput(dparent); case 0: @@ -1353,8 +1387,8 @@ cleanup: ldlm_lock_decref(&src_lockh, LCK_EX); ldlm_lock_decref(&tgt_dir_lockh, LCK_EX); } else { - ldlm_put_lock_into_req(req, &src_lockh, LCK_EX); - ldlm_put_lock_into_req(req, &tgt_dir_lockh, LCK_EX); + ptlrpc_save_lock(req, &src_lockh, LCK_EX); + ptlrpc_save_lock(req, &tgt_dir_lockh, LCK_EX); } case 2: /* target dentry */ l_dput(de_tgt_dir); @@ -1720,11 +1754,11 @@ cleanup: ldlm_lock_decref(&(dlm_handles[0]), LCK_PW); } else { if (lock_count == 4) - ldlm_put_lock_into_req(req, - &(dlm_handles[3]), LCK_EX); - ldlm_put_lock_into_req(req, &(dlm_handles[2]), LCK_EX); - ldlm_put_lock_into_req(req, &(dlm_handles[1]), LCK_PW); - ldlm_put_lock_into_req(req, &(dlm_handles[0]), LCK_PW); + ptlrpc_save_lock(req, + &(dlm_handles[3]), LCK_EX); + ptlrpc_save_lock(req, &(dlm_handles[2]), LCK_EX); + ptlrpc_save_lock(req, &(dlm_handles[1]), LCK_PW); + ptlrpc_save_lock(req, &(dlm_handles[0]), LCK_PW); } l_dput(de_new); l_dput(de_old); diff --git a/lustre/mds/mds_unlink_open.c b/lustre/mds/mds_unlink_open.c index 56e6dcc..330be73 100644 --- a/lustre/mds/mds_unlink_open.c +++ b/lustre/mds/mds_unlink_open.c @@ -101,28 +101,22 @@ out_lock: } static int mds_osc_destroy_orphan(struct mds_obd *mds, - struct ptlrpc_request *request) + struct inode *inode, + struct lov_mds_md *lmm, + int lmm_size, + struct llog_cookie *logcookies, + int log_unlink) { - struct mds_body *body; - struct lov_mds_md *lmm = NULL; struct lov_stripe_md *lsm = NULL; struct obd_trans_info oti = { 0 }; struct obdo *oa; int rc; ENTRY; - body = lustre_msg_buf(request->rq_repmsg, 0, sizeof(*body)); - if (!(body->valid & OBD_MD_FLEASIZE)) + if (lmm_size == 0) RETURN(0); - if (body->eadatasize == 0) { - CERROR("OBD_MD_FLEASIZE set but eadatasize zero\n"); - RETURN(rc = -EPROTO); - } - lmm = lustre_msg_buf(request->rq_repmsg, 1, body->eadatasize); - LASSERT(lmm != NULL); - - rc = obd_unpackmd(mds->mds_osc_exp, &lsm, lmm, body->eadatasize); + rc = obd_unpackmd(mds->mds_osc_exp, &lsm, lmm, lmm_size); if (rc < 0) { 
CERROR("Error unpack md %p\n", lmm); RETURN(rc); @@ -135,18 +129,12 @@ static int mds_osc_destroy_orphan(struct mds_obd *mds, if (oa == NULL) GOTO(out_free_memmd, rc = -ENOMEM); oa->o_id = lsm->lsm_object_id; - oa->o_mode = body->mode & S_IFMT; + oa->o_mode = inode->i_mode & S_IFMT; oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE; - if (body->valid & OBD_MD_FLCOOKIE) { + if (log_unlink && logcookies) { oa->o_valid |= OBD_MD_FLCOOKIE; - oti.oti_logcookies = - lustre_msg_buf(request->rq_repmsg, 2, - sizeof(struct llog_cookie) * - lsm->lsm_stripe_count); - if (oti.oti_logcookies == NULL) - oa->o_valid &= ~OBD_MD_FLCOOKIE; - body->valid &= ~OBD_MD_FLCOOKIE; + oti.oti_logcookies = logcookies; } rc = obd_destroy(mds->mds_osc_exp, oa, lsm, &oti); @@ -163,69 +151,88 @@ static int mds_unlink_orphan(struct obd_device *obd, struct dentry *dchild, struct inode *inode, struct inode *pending_dir) { struct mds_obd *mds = &obd->u.mds; - struct mds_body *body; + struct lov_mds_md *lmm = NULL; + struct llog_cookie *logcookies = NULL; + int lmm_size = 0, log_unlink = 0; void *handle = NULL; - struct ptlrpc_request *req; - int lengths[3] = {sizeof(struct mds_body), - mds->mds_max_mdsize, - mds->mds_max_cookiesize}; - int rc; + int rc, err; ENTRY; LASSERT(mds->mds_osc_obd != NULL); - OBD_ALLOC(req, sizeof(*req)); - if (!req) { - CERROR("request allocation out of memory\n"); - GOTO(err_alloc_req, rc = -ENOMEM); - } - rc = lustre_pack_reply(req, 3, lengths, NULL); - if (rc) { - CERROR("cannot pack request %d\n", rc); - GOTO(out_free_req, rc); - } - body = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*body)); - LASSERT(body != NULL); - mds_pack_inode2body(body, inode); - mds_pack_md(obd, req->rq_repmsg, 1, body, inode, 1); + OBD_ALLOC(lmm, mds->mds_max_mdsize); + if (lmm == NULL) + RETURN(-ENOMEM); + + down(&inode->i_sem); + rc = fsfilt_get_md(obd, inode, lmm, mds->mds_max_mdsize); + up(&inode->i_sem); + + if (rc < 0) { + CERROR("Error %d reading eadata for ino %lu\n", + rc, inode->i_ino); + GOTO(out_free_lmm, rc); + } else if (rc > 0) { + lmm_size = rc; + rc = mds_convert_lov_ea(obd, inode, lmm, lmm_size); + if (rc > 0) + lmm_size = rc; + rc = 0; + } handle = fsfilt_start(obd, pending_dir, FSFILT_OP_UNLINK_LOG, NULL); if (IS_ERR(handle)) { rc = PTR_ERR(handle); CERROR("error fsfilt_start: %d\n", rc); handle = NULL; - GOTO(out_free_msg, rc); + GOTO(out_free_lmm, rc); } - if (S_ISDIR(inode->i_mode)) { + down(&inode->i_sem); + rc = fsfilt_get_md(obd, inode, lmm, mds->mds_max_mdsize); + up(&inode->i_sem); + + if (rc < 0) { + CERROR("Error %d reading eadata for ino %lu\n", + rc, inode->i_ino); + GOTO(out_free_lmm, rc); + } else if (rc > 0) { + lmm_size = rc; + rc = 0; + } + + if (S_ISDIR(inode->i_mode)) rc = vfs_rmdir(pending_dir, dchild); - } else { + else rc = vfs_unlink(pending_dir, dchild); - } + if (rc) CERROR("error %d unlinking orphan %*s from PENDING directory\n", rc, dchild->d_name.len, dchild->d_name.name); - if ((body->valid & OBD_MD_FLEASIZE)) { - if (mds_log_op_unlink(obd, inode, req->rq_repmsg, 1) > 0) - body->valid |= OBD_MD_FLCOOKIE; + if (!rc && lmm_size) { + OBD_ALLOC(logcookies, mds->mds_max_cookiesize); + if (logcookies == NULL) + rc = -ENOMEM; + else if (mds_log_op_unlink(obd, inode, lmm,lmm_size,logcookies, + mds->mds_max_cookiesize) > 0) + log_unlink = 1; } - - if (handle) { - int err = fsfilt_commit(obd, pending_dir, handle, 0); - if (err) { - CERROR("error committing orphan unlink: %d\n", err); + err = fsfilt_commit(obd, pending_dir, handle, 0); + if (err) { + CERROR("error committing orphan unlink: 
%d\n", err); + if (!rc) rc = err; - GOTO(out_free_msg, rc); - } } - rc = mds_osc_destroy_orphan(mds, req); -out_free_msg: - OBD_FREE(req->rq_repmsg, req->rq_replen); - req->rq_repmsg = NULL; -out_free_req: - OBD_FREE(req, sizeof(*req)); -err_alloc_req: + if (!rc) { + rc = mds_osc_destroy_orphan(mds, inode, lmm, lmm_size, + logcookies, log_unlink); + } + + if (logcookies != NULL) + OBD_FREE(logcookies, mds->mds_max_cookiesize); +out_free_lmm: + OBD_FREE(lmm, mds->mds_max_mdsize); RETURN(rc); } diff --git a/lustre/mgmt/mgmt_svc.c b/lustre/mgmt/mgmt_svc.c index 51abb74..8743e72 100644 --- a/lustre/mgmt/mgmt_svc.c +++ b/lustre/mgmt/mgmt_svc.c @@ -32,7 +32,6 @@ #include #include -#define MGMT_NEVENTS 1024UL #define MGMT_NBUFS 128UL #define MGMT_BUFSIZE 8192 #define MGMT_MAXREQSIZE 512 @@ -89,10 +88,11 @@ static int mgmt_setup(struct obd_device *obd, obd_count len, void *buf) if (mgmt_initialized) RETURN(-EALREADY); - mgmt_service = ptlrpc_init_svc(MGMT_NEVENTS, MGMT_NBUFS, MGMT_BUFSIZE, - MGMT_MAXREQSIZE, MGMT_REQUEST_PORTAL, - MGMT_REPLY_PORTAL, mgmt_handler, - "mgmt", obd->obd_proc_entry); + mgmt_service = + ptlrpc_init_svc(MGMT_NBUFS, MGMT_BUFSIZE, MGMT_MAXREQSIZE, + MGMT_REQUEST_PORTAL, MGMT_REPLY_PORTAL, + mgmt_handler, "mgmt", + obd->obd_proc_entry); if (!mgmt_service) { CERROR("Failed to start mgmt service\n"); RETURN(-ENOMEM); diff --git a/lustre/obdclass/Makefile.am b/lustre/obdclass/Makefile.am index d7c3c1e..9fc783b 100644 --- a/lustre/obdclass/Makefile.am +++ b/lustre/obdclass/Makefile.am @@ -1,20 +1,25 @@ DEFS= MODULE = obdclass -class_obd.o: lustre_build_version - if LIBLUSTRE -lib_LIBRARIES = liblustreclass.a + +noinst_LIBRARIES = liblustreclass.a liblustreclass_a_SOURCES = class_obd.c debug.c genops.c statfs_pack.c uuid.c liblustreclass_a_SOURCES += lustre_handles.c lustre_peer.c lprocfs_status.c liblustreclass_a_SOURCES += obdo.c obd_config.c llog.c llog_obd.c llog_cat.c liblustreclass_a_SOURCES += llog_lvfs.c #llog_ioctl.c rbtree.c +liblustreclass_a_CFLAGS = -fPIC + +class_obd.c: lustre_build_version lustre_build_version: - echo '#define LUSTRE_VERSION 31' > $(top_builddir)/include/linux/lustre_build_version.h + echo '#define LUSTRE_VERSION 32' > $(top_builddir)/include/linux/lustre_build_version.h echo '#define BUILD_VERSION "1"' >> $(top_builddir)/include/linux/lustre_build_version.h else + +class_obd.o: lustre_build_version + modulefs_DATA = lustre_build_version obdclass.o llog_test.o EXTRA_PROGRAMS = obdclass llog_test diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index a807fd8..17beed2 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -67,6 +67,11 @@ #include #include "llog_internal.h" +#ifndef __KERNEL__ +/* liblustre workaround */ +atomic_t portal_kmemory = {0}; +#endif + struct semaphore obd_conf_sem; /* serialize configuration commands */ struct obd_device obd_dev[MAX_OBD_DEVICES]; struct list_head obd_types; @@ -199,12 +204,6 @@ int class_handle_ioctl(unsigned int cmd, unsigned long arg) char *buf; struct lustre_cfg *lcfg; - /* FIXME hack to liblustre dump, remove when switch - to zeroconf */ -#ifndef __KERNEL__ - data->ioc_pbuf1 = data->ioc_inlbuf1; - data->ioc_plen1 = data->ioc_inllen1; -#endif if (!data->ioc_plen1 || !data->ioc_pbuf1) { CERROR("No config buffer passed!\n"); GOTO(out, err = -EINVAL); diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index 615c102..d3b86bf 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -413,6 +413,7 @@ void __class_export_put(struct 
obd_export *exp) if (exp->exp_connection) ptlrpc_put_connection_superhack(exp->exp_connection); + LASSERT(list_empty(&exp->exp_outstanding_replies)); LASSERT(list_empty(&exp->exp_handle.h_link)); obd_destroy_export(exp); @@ -440,6 +441,7 @@ struct obd_export *class_new_export(struct obd_device *obd) export->exp_conn_cnt = 0; atomic_set(&export->exp_refcount, 2); export->exp_obd = obd; + INIT_LIST_HEAD(&export->exp_outstanding_replies); /* XXX this should be in LDLM init */ INIT_LIST_HEAD(&export->exp_ldlm_data.led_held_locks); diff --git a/lustre/obdclass/llog_lvfs.c b/lustre/obdclass/llog_lvfs.c index 3271968..ec32b11 100644 --- a/lustre/obdclass/llog_lvfs.c +++ b/lustre/obdclass/llog_lvfs.c @@ -672,7 +672,7 @@ static int llog_lvfs_next_block(struct llog_handle *loghandle, int *cur_idx, return 0; } -static int llog_lvfs_create(struct llog_obd_ctxt *ctxt,struct llog_handle **res, +static int llog_lvfs_create(struct llog_ctxt *ctxt, struct llog_handle **res, struct llog_logid *logid, char *name) { LBUG(); diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index fadf05b..4a96820 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -145,6 +145,13 @@ void lprocfs_remove(struct proc_dir_entry *root) rm_entry = temp; temp = temp->parent; + + /* Memory corruption once caused this to fail, and + without this LASSERT we would loop here forever. */ + LASSERTF(strlen(rm_entry->name) == rm_entry->namelen, + "0x%p %s/%s len %d\n", rm_entry, + temp->name, rm_entry->name, strlen(rm_entry->name)); + remove_proc_entry(rm_entry->name, rm_entry->parent); if (temp == parent) break; diff --git a/lustre/obdclass/obd_config.c b/lustre/obdclass/obd_config.c index 46710aa..9156dc8 100644 --- a/lustre/obdclass/obd_config.c +++ b/lustre/obdclass/obd_config.c @@ -133,7 +133,8 @@ int class_attach(struct lustre_cfg *lcfg) INIT_LIST_HEAD(&obd->obd_recovery_queue); INIT_LIST_HEAD(&obd->obd_delayed_reply_queue); - init_waitqueue_head(&obd->obd_commit_waitq); + spin_lock_init (&obd->obd_uncommitted_replies_lock); + INIT_LIST_HEAD (&obd->obd_uncommitted_replies); len = strlen(name) + 1; OBD_ALLOC(obd->obd_name, len); @@ -257,10 +258,22 @@ static void dump_exports(struct obd_device *obd) struct obd_export *exp, *n; list_for_each_entry_safe(exp, n, &obd->obd_exports, exp_obd_chain) { - CERROR("%s: %p %s %d %d %p\n", + struct ptlrpc_reply_state *rs; + struct ptlrpc_reply_state *first_reply = NULL; + int nreplies = 0; + + list_for_each_entry (rs, &exp->exp_outstanding_replies, + rs_exp_list) { + if (nreplies == 0) + first_reply = rs; + nreplies++; + } + + CERROR("%s: %p %s %d %d %d: %p %s\n", obd->obd_name, exp, exp->exp_client_uuid.uuid, atomic_read(&exp->exp_refcount), - exp->exp_failed, exp->exp_outstanding_reply ); + exp->exp_failed, nreplies, first_reply, + nreplies > 3 ? "..." 
: ""); } } diff --git a/lustre/obdecho/Makefile.am b/lustre/obdecho/Makefile.am index 08136d7..b9fa3b8 100644 --- a/lustre/obdecho/Makefile.am +++ b/lustre/obdecho/Makefile.am @@ -6,8 +6,9 @@ DEFS= if LIBLUSTRE -lib_LIBRARIES = libobdecho.a +noinst_LIBRARIES = libobdecho.a libobdecho_a_SOURCES = echo_client.c +libobdecho_a_CFLAGS = -fPIC else MODULE = obdecho modulefs_DATA = obdecho.o diff --git a/lustre/obdecho/echo.c b/lustre/obdecho/echo.c index c627f82..a922517 100644 --- a/lustre/obdecho/echo.c +++ b/lustre/obdecho/echo.c @@ -351,15 +351,14 @@ int echo_commitrw(int cmd, struct obd_export *export, struct obdo *oa, struct page *page = r->page; void *addr; - if (!page || !(addr = kmap(page)) || - !kern_addr_valid((unsigned long)addr)) { - - CERROR("bad page objid "LPU64":%p, buf %d/%d\n", + if (page == NULL) { + CERROR("null page objid "LPU64":%p, buf %d/%d\n", obj->ioo_id, page, j, obj->ioo_bufcnt); - kunmap(page); GOTO(commitrw_cleanup, rc = -EFAULT); } + addr = kmap(page); + CDEBUG(D_PAGE, "$$$$ use page %p, addr %p@"LPU64"\n", r->page, addr, r->offset); diff --git a/lustre/obdecho/echo_client.c b/lustre/obdecho/echo_client.c index d3d79ad..2f15e62 100644 --- a/lustre/obdecho/echo_client.c +++ b/lustre/obdecho/echo_client.c @@ -423,6 +423,51 @@ echo_get_stripe_off_id (struct lov_stripe_md *lsm, obd_off *offp, obd_id *idp) *offp = offset * stripe_size + woffset % stripe_size; } +static void echo_page_debug_setup(struct lov_stripe_md *lsm, + struct page *page, int rw, obd_id id, + obd_off offset, obd_off count) +{ + void *addr; + obd_off stripe_off; + obd_id stripe_id; + + if (id == 0) + return; + + addr = kmap(page); + + if (rw == OBD_BRW_WRITE) { + stripe_off = offset; + stripe_id = id; + echo_get_stripe_off_id(lsm, &stripe_off, &stripe_id); + } else { + stripe_off = 0xdeadbeef00c0ffeeULL; + stripe_id = 0xdeadbeef00c0ffeeULL; + } + page_debug_setup(addr, count, stripe_off, stripe_id); + + kunmap(page); +} + +static int echo_page_debug_check(struct lov_stripe_md *lsm, + struct page *page, obd_id id, + obd_off offset, obd_off count) +{ + obd_off stripe_off = offset; + obd_id stripe_id = id; + void *addr; + int rc; + + if (id == 0) + return 0; + + addr = kmap(page); + echo_get_stripe_off_id (lsm, &stripe_off, &stripe_id); + rc = page_debug_check("test_brw", addr, count, stripe_off, stripe_id); + kunmap(page); + return rc; +} + static int echo_client_kbrw(struct obd_device *obd, int rw, struct obdo *oa, struct lov_stripe_md *lsm, obd_off offset, obd_size count, struct obd_trans_info *oti) @@ -434,13 +479,12 @@ static int echo_client_kbrw(struct obd_device *obd, int rw, struct obdo *oa, obd_off off; int i; int rc; - int verify; + int verify = 0; int gfp_mask; /* oa_id == 0 => speed test (no verification) else... * oa & 1 => use HIGHMEM */ - verify = (oa->o_id != 0); gfp_mask = ((oa->o_id & 1) == 0) ? 
GFP_KERNEL : GFP_HIGHUSER; LASSERT(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ); @@ -473,48 +517,26 @@ static int echo_client_kbrw(struct obd_device *obd, int rw, struct obdo *oa, pgp->off = off; pgp->flag = 0; - if (verify) { - void *addr = kmap(pgp->pg); - obd_off stripe_off = off; - obd_id stripe_id = oa->o_id; - - if (rw == OBD_BRW_WRITE) { - echo_get_stripe_off_id(lsm, &stripe_off, - &stripe_id); - page_debug_setup(addr, pgp->count, - stripe_off, stripe_id); - } else { - page_debug_setup(addr, pgp->count, - 0xdeadbeef00c0ffeeULL, - 0xdeadbeef00c0ffeeULL); - } - kunmap(pgp->pg); - } + echo_page_debug_setup(lsm, pgp->pg, rw, oa->o_id, off, + pgp->count); } rc = obd_brw(rw, ec->ec_exp, oa, lsm, npages, pga, oti); out: - if (rc != 0) - verify = 0; + if (rc == 0 && rw == OBD_BRW_READ) + verify = 1; for (i = 0, pgp = pga; i < npages; i++, pgp++) { if (pgp->pg == NULL) continue; if (verify) { - void *addr = kmap(pgp->pg); - obd_off stripe_off = pgp->off; - obd_id stripe_id = oa->o_id; - int vrc; - - echo_get_stripe_off_id (lsm, &stripe_off, &stripe_id); - vrc = page_debug_check("test_brw", addr, pgp->count, - stripe_off, stripe_id); + int vrc; + vrc = echo_page_debug_check(lsm, pgp->pg, oa->o_id, + pgp->off, pgp->count); if (vrc != 0 && rc == 0) rc = vrc; - - kunmap(pgp->pg); } __free_pages(pgp->pg, 0); } @@ -623,6 +645,7 @@ struct echo_async_state { wait_queue_head_t eas_waitq; struct list_head eas_avail; struct obdo eas_oa; + struct lov_stripe_md *eas_lsm; }; static int eas_should_wake(struct echo_async_state *eas) @@ -675,6 +698,11 @@ static void ec_ap_completion(void *data, int cmd, int rc) return; eas = eap->eap_eas; + if (cmd == OBD_BRW_READ) + echo_page_debug_check(eas->eas_lsm, eap->eap_page, + eas->eas_oa.o_id, eap->eap_off, + PAGE_SIZE); + spin_lock_irqsave(&eas->eas_lock, flags); if (rc && !eas->eas_rc) eas->eas_rc = rc; @@ -731,6 +759,7 @@ static int echo_client_async_page(struct obd_export *exp, int rw, init_waitqueue_head(&eas.eas_waitq); eas.eas_in_flight = 0; eas.eas_rc = 0; + eas.eas_lsm = lsm; INIT_LIST_HEAD(&eas.eas_avail); /* prepare the group of pages that we're going to be keeping @@ -740,6 +769,7 @@ static int echo_client_async_page(struct obd_export *exp, int rw, if (page == NULL) GOTO(out, rc = -ENOMEM); + page->private = 0; list_add_tail(&page->list, &pages); OBD_ALLOC(eap, sizeof(*eap)); @@ -749,7 +779,7 @@ static int echo_client_async_page(struct obd_export *exp, int rw, eap->eap_magic = EAP_MAGIC; eap->eap_page = page; eap->eap_eas = &eas; - eap->eap_cookie = ERR_PTR(-ENOENT); + page->private = (unsigned long)eap; list_add_tail(&eap->eap_item, &eas.eas_avail); } @@ -775,10 +805,10 @@ static int echo_client_async_page(struct obd_export *exp, int rw, spin_unlock_irqrestore(&eas.eas_lock, flags); /* unbind the eap from its old page offset */ - if (!IS_ERR(eap->eap_cookie)) { + if (eap->eap_cookie != NULL) { obd_teardown_async_page(exp, lsm, NULL, eap->eap_cookie); - eap->eap_cookie = ERR_PTR(-ENOENT); + eap->eap_cookie = NULL; } eas.eas_next_offset += PAGE_SIZE; @@ -793,6 +823,10 @@ static int echo_client_async_page(struct obd_export *exp, int rw, break; } + if (rw == OBD_BRW_WRITE) + echo_page_debug_setup(lsm, eap->eap_page, rw, oa->o_id, + eap->eap_off, PAGE_SIZE); + /* always asserts urgent, which isn't quite right */ rc = obd_queue_async_io(exp, lsm, NULL, eap->eap_cookie, rw, 0, PAGE_SIZE, 0, @@ -824,9 +858,9 @@ out: struct page *page = list_entry(pos, struct page, list); list_del(&page->list); - if (page->private) { + if (page->private != 0) { eap = (struct 
echo_async_page *)page->private; - if (!IS_ERR(eap->eap_cookie)) + if (eap->eap_cookie != NULL) obd_teardown_async_page(exp, lsm, NULL, eap->eap_cookie); OBD_FREE(eap, sizeof(*eap)); @@ -886,23 +920,19 @@ static int echo_client_prep_commit(struct obd_export *exp, int rw, for (i = 0; i < npages; i++) { struct page *page = lnb[i].page; - void *addr; /* read past eof? */ if (page == NULL && lnb[i].rc == 0) continue; - addr = kmap(lnb[i].page); - if (rw == OBD_BRW_WRITE) - page_debug_setup(addr, PAGE_SIZE, - rnb[i].offset, oa->o_id); - else - err = page_debug_check("prep_commit", addr, - PAGE_SIZE, rnb[i].offset, - oa->o_id); - - kunmap(lnb[i].page); + echo_page_debug_setup(lsm, page, rw, oa->o_id, + rnb[i].offset, + rnb[i].len); + else + echo_page_debug_check(lsm, page, oa->o_id, + rnb[i].offset, + rnb[i].len); } ret = obd_commitrw(rw, exp, oa, 1, &ioo, npages, lnb, oti); diff --git a/lustre/osc/Makefile.am b/lustre/osc/Makefile.am index 90a5d11..0cb3bcd 100644 --- a/lustre/osc/Makefile.am +++ b/lustre/osc/Makefile.am @@ -6,8 +6,9 @@ DEFS= if LIBLUSTRE -lib_LIBRARIES = libosc.a +noinst_LIBRARIES = libosc.a libosc_a_SOURCES = osc_request.c osc_lib.c osc_create.c osc_internal.h +libosc_a_CFLAGS = -fPIC else MODULE = osc modulefs_DATA = osc.o diff --git a/lustre/osc/osc_internal.h b/lustre/osc/osc_internal.h index b5f6392..68a2d35 100644 --- a/lustre/osc/osc_internal.h +++ b/lustre/osc/osc_internal.h @@ -65,7 +65,7 @@ void osc_wake_cache_waiters(struct client_obd *cli); #ifdef __KERNEL__ int lproc_osc_attach_seqstat(struct obd_device *dev); #else -static inline int lproc_osc_attach_seqstat(struct obd_device *dev) {} +static inline int lproc_osc_attach_seqstat(struct obd_device *dev) {return 0;} #endif #endif /* OSC_INTERNAL_H */ diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index e8dd043..666de06 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -537,7 +537,12 @@ static void osc_announce_cached(struct client_obd *cli, struct obdo *oa, { obd_flag bits = OBD_MD_FLBLOCKS|OBD_MD_FLGRANT; - LASSERT(!(oa->o_valid & bits)); + /* XXX obd_brw_internal() might reuse obdo in it's loop thus + * hit the following assert. any actual meaning of this? temporarily + * disable it. + * in kernel mode, probably VFS will prevent it happen. 
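
[To make the suspected failure mode concrete, a purely illustrative fragment; the loop shape is assumed, not quoted from obd_brw_internal():

    obd_flag bits = OBD_MD_FLBLOCKS | OBD_MD_FLGRANT;

    LASSERT(!(oa->o_valid & bits));  /* passes on a fresh obdo */
    oa->o_valid |= bits;             /* first trip through this path */

    LASSERT(!(oa->o_valid & bits));  /* a reused oa fires here */
]
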
+ */ + //LASSERT(!(oa->o_valid & bits)); oa->o_valid |= bits; spin_lock(&cli->cl_loi_list_lock); @@ -645,34 +650,40 @@ static void handle_short_read(int nob_read, obd_count page_count, } } -static int check_write_rcs(struct ptlrpc_request *request, int niocount, +static int check_write_rcs(struct ptlrpc_request *request, + int requested_nob, int niocount, obd_count page_count, struct brw_page *pga) { - int i; - int *remote_rcs; + int *remote_rcs, i; /* return error if any niobuf was in error */ remote_rcs = lustre_swab_repbuf(request, 1, sizeof(*remote_rcs) * niocount, NULL); if (remote_rcs == NULL) { - CERROR ("Missing/short RC vector on BRW_WRITE reply\n"); - return (-EPROTO); + CERROR("Missing/short RC vector on BRW_WRITE reply\n"); + return(-EPROTO); } - if (lustre_msg_swabbed (request->rq_repmsg)) + if (lustre_msg_swabbed(request->rq_repmsg)) for (i = 0; i < niocount; i++) - __swab32s (&remote_rcs[i]); + __swab32s(&remote_rcs[i]); for (i = 0; i < niocount; i++) { if (remote_rcs[i] < 0) - return (remote_rcs[i]); + return(remote_rcs[i]); if (remote_rcs[i] != 0) { - CERROR ("rc[%d] invalid (%d) req %p\n", + CERROR("rc[%d] invalid (%d) req %p\n", i, remote_rcs[i], request); - return (-EPROTO); + return(-EPROTO); } } + if (request->rq_bulk->bd_nob_transferred != requested_nob) { + CERROR("Unexpected # bytes transferred: %d (requested %d)\n", + requested_nob, request->rq_bulk->bd_nob_transferred); + return(-EPROTO); + } + return (0); } @@ -750,11 +761,11 @@ static int osc_brw_prep_request(int cmd, struct obd_import *imp,struct obdo *oa, return (-ENOMEM); if (opc == OST_WRITE) - desc = ptlrpc_prep_bulk_imp(req, BULK_GET_SOURCE, - OST_BULK_PORTAL); + desc = ptlrpc_prep_bulk_imp (req, page_count, + BULK_GET_SOURCE, OST_BULK_PORTAL); else - desc = ptlrpc_prep_bulk_imp(req, BULK_PUT_SINK, - OST_BULK_PORTAL); + desc = ptlrpc_prep_bulk_imp (req, page_count, + BULK_PUT_SINK, OST_BULK_PORTAL); if (desc == NULL) GOTO(out, rc = -ENOMEM); /* NB request now owns desc and will free it when it gets freed */ @@ -783,11 +794,8 @@ static int osc_brw_prep_request(int cmd, struct obd_import *imp,struct obdo *oa, pg_prev->pg, pg_prev->pg->private, pg_prev->pg->index, pg_prev->off); - rc = ptlrpc_prep_bulk_page(desc, pg->pg, pg->off & ~PAGE_MASK, - pg->count); - if (rc != 0) - GOTO(out, rc); - + ptlrpc_prep_bulk_page(desc, pg->pg, pg->off & ~PAGE_MASK, + pg->count); requested_nob += pg->count; if (i > 0 && can_merge_pages(pg_prev, pg)) { @@ -856,8 +864,10 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, struct obdo *oa, CERROR ("Unexpected +ve rc %d\n", rc); RETURN(-EPROTO); } + LASSERT (req->rq_bulk->bd_nob == requested_nob); - RETURN(check_write_rcs(req, niocount, page_count, pga)); + RETURN(check_write_rcs(req, requested_nob, niocount, + page_count, pga)); } if (rc > requested_nob) { @@ -865,6 +875,12 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, struct obdo *oa, RETURN(-EPROTO); } + if (rc != req->rq_bulk->bd_nob_transferred) { + CERROR ("Unexpected rc %d (%d transferred)\n", + rc, req->rq_bulk->bd_nob_transferred); + return (-EPROTO); + } + if (rc < requested_nob) handle_short_read(rc, page_count, pga); @@ -1361,6 +1377,8 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi, oap = list_entry(pos, struct osc_async_page, oap_pending_item); ops = oap->oap_caller_ops; + LASSERT(oap->oap_magic == OAP_MAGIC); + /* in llite being 'ready' equates to the page being locked * until completion unlocks it. 
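
[Condensing the osc_brw_prep_request() change above into a sketch: the descriptor is now sized for page_count at creation, so ptlrpc_prep_bulk_page() can no longer fail, returns void, and the old per-page error check disappears:

    desc = ptlrpc_prep_bulk_imp(req, page_count,
                                BULK_GET_SOURCE, OST_BULK_PORTAL);
    if (desc == NULL)
            GOTO(out, rc = -ENOMEM);

    for (i = 0, pg = pga; i < page_count; i++, pg++)
            ptlrpc_prep_bulk_page(desc, pg->pg,
                                  pg->off & ~PAGE_MASK, pg->count);
]
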
commit_write submits a page * as not ready because its unlock will happen unconditionally @@ -1472,6 +1490,7 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi, list_splice(&rpc_list, &aa->aa_oaps); INIT_LIST_HEAD(&rpc_list); +#ifdef __KERNEL__ if (cmd == OBD_BRW_READ) { lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count); lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_brw_in_flight); @@ -1480,6 +1499,7 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi, lprocfs_oh_tally(&cli->cl_write_rpc_hist, cli->cl_brw_in_flight); } +#endif spin_lock(&cli->cl_loi_list_lock); diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index dfdcf1c..f5c5579 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -70,7 +70,8 @@ void oti_to_request(struct obd_trans_info *oti, struct ptlrpc_request *req) for (ack_lock = oti->oti_ack_locks, i = 0; i < 4; i++, ack_lock++) { if (!ack_lock->mode) break; - ldlm_put_lock_into_req(req, &ack_lock->lock, ack_lock->mode); + /* XXX not even calling target_send_reply in some cases... */ + ptlrpc_save_lock (req, &ack_lock->lock, ack_lock->mode); } } @@ -417,7 +418,8 @@ static int ost_brw_read(struct ptlrpc_request *req) if (local_nb == NULL) GOTO(out_pp_rnb, rc = -ENOMEM); - desc = ptlrpc_prep_bulk_exp(req, BULK_PUT_SOURCE, OST_BULK_PORTAL); + desc = ptlrpc_prep_bulk_exp (req, npages, + BULK_PUT_SOURCE, OST_BULK_PORTAL); if (desc == NULL) GOTO(out_local, rc = -ENOMEM); @@ -439,11 +441,9 @@ static int ost_brw_read(struct ptlrpc_request *req) nob += page_rc; if (page_rc != 0) { /* some data! */ LASSERT (local_nb[i].page != NULL); - rc = ptlrpc_prep_bulk_page(desc, local_nb[i].page, - pp_rnb[i].offset& ~PAGE_MASK, - page_rc); - if (rc != 0) - break; + ptlrpc_prep_bulk_page(desc, local_nb[i].page, + pp_rnb[i].offset & (PAGE_SIZE - 1), + page_rc); } if (page_rc != pp_rnb[i].len) { /* short read */ @@ -455,16 +455,25 @@ static int ost_brw_read(struct ptlrpc_request *req) } if (rc == 0) { - rc = ptlrpc_bulk_put(desc); + rc = ptlrpc_start_bulk_transfer(desc); if (rc == 0) { lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, ost_bulk_timeout, desc); rc = l_wait_event(desc->bd_waitq, - ptlrpc_bulk_complete(desc), &lwi); - if (rc) { - LASSERT(rc == -ETIMEDOUT); + !ptlrpc_bulk_active(desc), &lwi); + LASSERT(rc == 0 || rc == -ETIMEDOUT); + if (rc == -ETIMEDOUT) { DEBUG_REQ(D_ERROR, req, "timeout on bulk PUT"); ptlrpc_abort_bulk(desc); + } else if (!desc->bd_success || + desc->bd_nob_transferred != desc->bd_nob) { + DEBUG_REQ(D_ERROR, req, "%s bulk PUT %d(%d)", + desc->bd_success ? + "truncated" : "network error on", + desc->bd_nob_transferred, + desc->bd_nob); + /* XXX should this be a different errno? 
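
[Distilled, the new completion protocol for a server-side bulk transfer, with fields and calls exactly as used above: start the transfer, wait for it to go inactive, then verify both the event status and the byte count:

    rc = ptlrpc_start_bulk_transfer(desc);
    if (rc == 0) {
            lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, ost_bulk_timeout, desc);
            rc = l_wait_event(desc->bd_waitq,
                              !ptlrpc_bulk_active(desc), &lwi);
            LASSERT(rc == 0 || rc == -ETIMEDOUT);
            if (rc == -ETIMEDOUT)
                    ptlrpc_abort_bulk(desc);
            else if (!desc->bd_success ||
                     desc->bd_nob_transferred != desc->bd_nob)
                    rc = -ETIMEDOUT;  /* see the XXX: errno choice is open */
    }
]
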
*/ + rc = -ETIMEDOUT; } } else { DEBUG_REQ(D_ERROR, req, "bulk PUT failed: rc %d\n", rc); @@ -502,9 +511,9 @@ static int ost_brw_read(struct ptlrpc_request *req) req->rq_status = rc; ptlrpc_error(req); } else { - if (req->rq_repmsg != NULL) { + if (req->rq_reply_state != NULL) { /* reply out callback would free */ - OBD_FREE(req->rq_repmsg, req->rq_replen); + lustre_free_reply_state (req->rq_reply_state); } if (req->rq_reqmsg->conn_cnt == req->rq_export->exp_conn_cnt) { CERROR("bulk IO comms error: " @@ -545,7 +554,7 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) int objcount, niocount, npages; int comms_error = 0; int rc, rc2, swab, i, j; - char str[PTL_NALFMT_SIZE]; + char str[PTL_NALFMT_SIZE]; ENTRY; if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_WRITE_BULK)) @@ -607,7 +616,8 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) if (local_nb == NULL) GOTO(out_pp_rnb, rc = -ENOMEM); - desc = ptlrpc_prep_bulk_exp(req, BULK_GET_SINK, OST_BULK_PORTAL); + desc = ptlrpc_prep_bulk_exp (req, npages, + BULK_GET_SINK, OST_BULK_PORTAL); if (desc == NULL) GOTO(out_local, rc = -ENOMEM); @@ -618,31 +628,34 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) /* NB Having prepped, we must commit... */ - for (i = 0; i < npages; i++) { - rc = ptlrpc_prep_bulk_page(desc, local_nb[i].page, - pp_rnb[i].offset & (PAGE_SIZE - 1), - pp_rnb[i].len); - if (rc != 0) - break; - } + for (i = 0; i < npages; i++) + ptlrpc_prep_bulk_page(desc, local_nb[i].page, + pp_rnb[i].offset & (PAGE_SIZE - 1), + pp_rnb[i].len); + rc = ptlrpc_start_bulk_transfer (desc); if (rc == 0) { - rc = ptlrpc_bulk_get(desc); - if (rc == 0) { - lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, - ost_bulk_timeout, desc); - rc = l_wait_event(desc->bd_waitq, - ptlrpc_bulk_complete(desc), &lwi); - if (rc) { - LASSERT(rc == -ETIMEDOUT); - DEBUG_REQ(D_ERROR, req, "timeout on bulk GET"); - ptlrpc_abort_bulk(desc); - } - } else { - DEBUG_REQ(D_ERROR, req, "bulk GET failed: rc %d\n", rc); + lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, + ost_bulk_timeout, desc); + rc = l_wait_event(desc->bd_waitq, !ptlrpc_bulk_active(desc), + &lwi); + LASSERT(rc == 0 || rc == -ETIMEDOUT); + if (rc == -ETIMEDOUT) { + DEBUG_REQ(D_ERROR, req, "timeout on bulk GET"); + ptlrpc_abort_bulk(desc); + } else if (!desc->bd_success || + desc->bd_nob_transferred != desc->bd_nob) { + DEBUG_REQ(D_ERROR, req, "%s bulk GET %d(%d)", + desc->bd_success ? + "truncated" : "network error on", + desc->bd_nob_transferred, desc->bd_nob); + /* XXX should this be a different errno? 
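
[Worth noting from the error paths above: with reply states, an unsent reply is now released through lustre_free_reply_state() rather than by freeing rq_repmsg directly:

    if (req->rq_reply_state != NULL) {
            /* the reply-out callback would otherwise free this */
            lustre_free_reply_state(req->rq_reply_state);
    }
]
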
*/ + rc = -ETIMEDOUT; } - comms_error = rc != 0; + } else { + DEBUG_REQ(D_ERROR, req, "ptlrpc_bulk_get failed: rc %d\n", rc); } + comms_error = rc != 0; repbody = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*repbody)); memcpy(&repbody->oa, &body->oa, sizeof(repbody->oa)); @@ -710,9 +723,9 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) req->rq_status = rc; ptlrpc_error(req); } else { - if (req->rq_repmsg != NULL) { + if (req->rq_reply_state != NULL) { /* reply out callback would free */ - OBD_FREE (req->rq_repmsg, req->rq_replen); + lustre_free_reply_state (req->rq_reply_state); } if (req->rq_reqmsg->conn_cnt == req->rq_export->exp_conn_cnt) { CERROR("bulk IO comms error: " @@ -806,8 +819,6 @@ out_pp_rnb: free_per_page_niobufs(npages, pp_rnb, remote_nb); out: if (rc) { - OBD_FREE(req->rq_repmsg, req->rq_replen); - req->rq_repmsg = NULL; req->rq_status = rc; ptlrpc_error(req); } else @@ -1122,11 +1133,11 @@ static int ost_setup(struct obd_device *obddev, obd_count len, void *buf) if (rc < 0) RETURN(rc); - ost->ost_service = ptlrpc_init_svc(OST_NEVENTS, OST_NBUFS, - OST_BUFSIZE, OST_MAXREQSIZE, - OST_REQUEST_PORTAL, OSC_REPLY_PORTAL, - ost_handle, "ost", - obddev->obd_proc_entry); + ost->ost_service = + ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE, + OST_REQUEST_PORTAL, OSC_REPLY_PORTAL, + ost_handle, "ost", + obddev->obd_proc_entry); if (ost->ost_service == NULL) { CERROR("failed to start service\n"); RETURN(-ENOMEM); @@ -1138,9 +1149,9 @@ static int ost_setup(struct obd_device *obddev, obd_count len, void *buf) GOTO(out, rc = -EINVAL); ost->ost_create_service = - ptlrpc_init_svc(OST_NEVENTS, OST_NBUFS, OST_BUFSIZE, - OST_MAXREQSIZE, OST_CREATE_PORTAL, - OSC_REPLY_PORTAL, ost_handle, "ost_create", + ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE, + OST_CREATE_PORTAL, OSC_REPLY_PORTAL, + ost_handle, "ost_create", obddev->obd_proc_entry); if (ost->ost_create_service == NULL) { CERROR("failed to start OST create service\n"); diff --git a/lustre/portals/archdep.m4 b/lustre/portals/archdep.m4 index e955c33..c06bc8a 100644 --- a/lustre/portals/archdep.m4 +++ b/lustre/portals/archdep.m4 @@ -333,6 +333,7 @@ AC_SUBST(SCIMACNAL) CFLAGS="$KCFLAGS" CPPFLAGS="$KINCFLAGS $KCPPFLAGS $MFLAGS $enable_zerocopy $enable_affinity $with_quadrics $with_gm $with_scamac $with_ib" +AM_CONDITIONAL(LIBLUSTRE, test x$host_cpu = xlib) AC_SUBST(MOD_LINK) AC_SUBST(LINUX25) AM_CONDITIONAL(LIBLUSTRE, test x$host_cpu = xlib) diff --git a/lustre/portals/include/portals/errno.h b/lustre/portals/include/portals/errno.h index 817936a..08f084a 100644 --- a/lustre/portals/include/portals/errno.h +++ b/lustre/portals/include/portals/errno.h @@ -50,9 +50,8 @@ typedef enum { PTL_IOV_TOO_SMALL = 31, PTL_EQ_INUSE = 32, - PTL_MD_INUSE = 33, - PTL_MAX_ERRNO = 33 + PTL_MAX_ERRNO = 32 } ptl_err_t; /* If you change these, you must update the string table in api-errno.c */ diff --git a/lustre/portals/include/portals/lib-nal.h b/lustre/portals/include/portals/lib-nal.h index 4052c0c..0bf557e 100644 --- a/lustre/portals/include/portals/lib-nal.h +++ b/lustre/portals/include/portals/lib-nal.h @@ -18,47 +18,60 @@ struct nal_cb_t { lib_ni_t ni; void *nal_data; /* - * send: Sends a preformatted header and user data to a - * specified remote process. - * Can overwrite iov. + * send: Sends a preformatted header and payload data to a + * specified remote process. The payload is scattered over 'niov' + * fragments described by iov, starting at 'offset' for 'mlen' + * bytes. 
+ * NB the NAL may NOT overwrite iov. + * PTL_OK on success => NAL has committed to send and will call + * lib_finalize on completion */ - int (*cb_send) (nal_cb_t * nal, void *private, lib_msg_t * cookie, - ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int niov, struct iovec *iov, size_t mlen); + ptl_err_t (*cb_send) (nal_cb_t * nal, void *private, lib_msg_t * cookie, + ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int niov, struct iovec *iov, + size_t offset, size_t mlen); /* as send, but with a set of page fragments (NULL if not supported) */ - int (*cb_send_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie, - ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int niov, ptl_kiov_t *iov, size_t mlen); + ptl_err_t (*cb_send_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie, + ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int niov, ptl_kiov_t *iov, + size_t offset, size_t mlen); /* - * recv: Receives an incoming message from a remote process - * Type of iov depends on options. Can overwrite iov. + * recv: Receives an incoming message from a remote process. The + * payload is to be received into the scattered buffer of 'niov' + * fragments described by iov, starting at 'offset' for 'mlen' + * bytes. Payload bytes after 'mlen' up to 'rlen' are to be + * discarded. + * NB the NAL may NOT overwrite iov. + * PTL_OK on success => NAL has committed to receive and will call + * lib_finalize on completion */ - int (*cb_recv) (nal_cb_t * nal, void *private, lib_msg_t * cookie, - unsigned int niov, struct iovec *iov, size_t mlen, - size_t rlen); + ptl_err_t (*cb_recv) (nal_cb_t * nal, void *private, lib_msg_t * cookie, + unsigned int niov, struct iovec *iov, + size_t offset, size_t mlen, size_t rlen); /* as recv, but with a set of page fragments (NULL if not supported) */ - int (*cb_recv_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie, - unsigned int niov, ptl_kiov_t *iov, size_t mlen, - size_t rlen); + ptl_err_t (*cb_recv_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie, + unsigned int niov, ptl_kiov_t *iov, + size_t offset, size_t mlen, size_t rlen); /* * read: Reads a block of data from a specified user address */ - int (*cb_read) (nal_cb_t * nal, void *private, void *dst_addr, - user_ptr src_addr, size_t len); + ptl_err_t (*cb_read) (nal_cb_t * nal, void *private, void *dst_addr, + user_ptr src_addr, size_t len); /* * write: Writes a block of data into a specified user address */ - int (*cb_write) (nal_cb_t * nal, void *private, user_ptr dsr_addr, - void *src_addr, size_t len); + ptl_err_t (*cb_write) (nal_cb_t * nal, void *private, user_ptr dsr_addr, + void *src_addr, size_t len); /* * callback: Calls an event callback + * NULL => lib calls eq's callback (if any) directly. */ - int (*cb_callback) (nal_cb_t * nal, void *private, lib_eq_t *eq, - ptl_event_t *ev); + void (*cb_callback) (nal_cb_t * nal, void *private, lib_eq_t *eq, + ptl_event_t *ev); /* * malloc: Acquire a block of memory in a system independent @@ -74,14 +87,14 @@ struct nal_cb_t { * type of *iov depends on options. * Set to NULL if not required. 
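
[A minimal receive callback honouring the new contract might look like the following; the NAL and its rx structure are invented for illustration, while the library calls are the ones declared in this patch:

    static ptl_err_t
    mynal_cb_recv(nal_cb_t *nal, void *private, lib_msg_t *cookie,
                  unsigned int niov, struct iovec *iov,
                  size_t offset, size_t mlen, size_t rlen)
    {
            mynal_rx_t *rx = (mynal_rx_t *)private;  /* hypothetical */

            if (rx->rx_nob < rlen)       /* short on the wire */
                    return (PTL_FAIL);   /* no commitment => no lib_finalize */

            lib_copy_buf2iov(niov, iov, offset, rx->rx_payload, mlen);
            /* bytes in (mlen, rlen] are simply discarded, per the contract */
            lib_finalize(nal, private, cookie, PTL_OK);
            return (PTL_OK);
    }
]
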
*/ - int (*cb_map) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, - void **addrkey); + ptl_err_t (*cb_map) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, + void **addrkey); void (*cb_unmap) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, void **addrkey); /* as (un)map, but with a set of page fragments */ - int (*cb_map_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, - void **addrkey); + ptl_err_t (*cb_map_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, + void **addrkey); void (*cb_unmap_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, void **addrkey); diff --git a/lustre/portals/include/portals/lib-p30.h b/lustre/portals/include/portals/lib-p30.h index 3582b94..e9e4635 100644 --- a/lustre/portals/include/portals/lib-p30.h +++ b/lustre/portals/include/portals/lib-p30.h @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -42,7 +41,7 @@ do { \ nal->cb_sti(nal, flagsp); \ } -#ifdef PTL_USE_DESC_LISTS +#ifdef PTL_USE_LIB_FREELIST #define MAX_MES 2048 #define MAX_MDS 2048 @@ -98,7 +97,7 @@ lib_eq_free (nal_cb_t *nal, lib_eq_t *eq) } static inline lib_md_t * -lib_md_alloc (nal_cb_t *nal) +lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd) { /* NEVER called with statelock held */ unsigned long flags; @@ -142,8 +141,20 @@ lib_me_free (nal_cb_t *nal, lib_me_t *me) static inline lib_msg_t * lib_msg_alloc (nal_cb_t *nal) { - /* ALWAYS called with statelock held */ - return ((lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs)); + /* NEVER called with statelock held */ + unsigned long flags; + lib_msg_t *msg; + + state_lock (nal, &flags); + msg = (lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs); + state_unlock (nal, &flags); + + if (msg != NULL) { + /* NULL pointers, clear flags etc */ + memset (msg, 0, sizeof (*msg)); + msg->ack_wmd = PTL_WIRE_HANDLE_NONE; + } + return(msg); } static inline void @@ -155,22 +166,13 @@ lib_msg_free (nal_cb_t *nal, lib_msg_t *msg) #else -extern atomic_t md_in_use_count; -extern atomic_t msg_in_use_count; -extern atomic_t me_in_use_count; -extern atomic_t eq_in_use_count; - static inline lib_eq_t * lib_eq_alloc (nal_cb_t *nal) { /* NEVER called with statelock held */ lib_eq_t *eq; - PORTAL_ALLOC(eq, sizeof(*eq)); - - if (eq == NULL) - return (NULL); - atomic_inc (&eq_in_use_count); + PORTAL_ALLOC(eq, sizeof(*eq)); return (eq); } @@ -178,21 +180,34 @@ static inline void lib_eq_free (nal_cb_t *nal, lib_eq_t *eq) { /* ALWAYS called with statelock held */ - atomic_dec (&eq_in_use_count); PORTAL_FREE(eq, sizeof(*eq)); } static inline lib_md_t * -lib_md_alloc (nal_cb_t *nal) +lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd) { /* NEVER called with statelock held */ lib_md_t *md; - PORTAL_ALLOC(md, sizeof(*md)); - - if (md == NULL) - return (NULL); - - atomic_inc (&md_in_use_count); + int size; + int niov; + + if ((umd->options & PTL_MD_KIOV) != 0) { + niov = umd->niov; + size = offsetof(lib_md_t, md_iov.kiov[niov]); + } else { + niov = ((umd->options & PTL_MD_IOV) != 0) ? 
+ umd->niov : 1; + size = offsetof(lib_md_t, md_iov.iov[niov]); + } + + PORTAL_ALLOC(md, size); + + if (md != NULL) { + /* Set here in case of early free */ + md->options = umd->options; + md->md_niov = niov; + } + return (md); } @@ -200,8 +215,14 @@ static inline void lib_md_free (nal_cb_t *nal, lib_md_t *md) { /* ALWAYS called with statelock held */ - atomic_dec (&md_in_use_count); - PORTAL_FREE(md, sizeof(*md)); + int size; + + if ((md->options & PTL_MD_KIOV) != 0) + size = offsetof(lib_md_t, md_iov.kiov[md->md_niov]); + else + size = offsetof(lib_md_t, md_iov.iov[md->md_niov]); + + PORTAL_FREE(md, size); } static inline lib_me_t * @@ -209,12 +230,8 @@ lib_me_alloc (nal_cb_t *nal) { /* NEVER called with statelock held */ lib_me_t *me; - PORTAL_ALLOC(me, sizeof(*me)); - - if (me == NULL) - return (NULL); - atomic_inc (&me_in_use_count); + PORTAL_ALLOC(me, sizeof(*me)); return (me); } @@ -222,21 +239,21 @@ static inline void lib_me_free(nal_cb_t *nal, lib_me_t *me) { /* ALWAYS called with statelock held */ - atomic_dec (&me_in_use_count); PORTAL_FREE(me, sizeof(*me)); } static inline lib_msg_t * lib_msg_alloc(nal_cb_t *nal) { - /* ALWAYS called with statelock held */ + /* NEVER called with statelock held */ lib_msg_t *msg; - PORTAL_ALLOC_ATOMIC(msg, sizeof(*msg)); - if (msg == NULL) - return (NULL); - - atomic_inc (&msg_in_use_count); + PORTAL_ALLOC(msg, sizeof(*msg)); + if (msg != NULL) { + /* NULL pointers, clear flags etc */ + memset (msg, 0, sizeof (*msg)); + msg->ack_wmd = PTL_WIRE_HANDLE_NONE; + } return (msg); } @@ -244,7 +261,6 @@ static inline void lib_msg_free(nal_cb_t *nal, lib_msg_t *msg) { /* ALWAYS called with statelock held */ - atomic_dec (&msg_in_use_count); PORTAL_FREE(msg, sizeof(*msg)); } #endif @@ -344,26 +360,41 @@ extern char *dispatch_name(int index); * Call backs will be made to write events, send acks or * replies and so on. 
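
[The allocator change above, isolated: an MD is sized for exactly the fragment array the user MD declares, and options/md_niov are recorded at allocation so lib_md_free() can recompute the same size even on an early free:

    niov = ((umd->options & PTL_MD_KIOV) != 0) ? umd->niov :
           ((umd->options & PTL_MD_IOV)  != 0) ? umd->niov : 1;
    size = ((umd->options & PTL_MD_KIOV) != 0) ?
           offsetof(lib_md_t, md_iov.kiov[niov]) :
           offsetof(lib_md_t, md_iov.iov[niov]);

    PORTAL_ALLOC(md, size);
    if (md != NULL) {
            md->options = umd->options;  /* consulted again in lib_md_free() */
            md->md_niov = niov;
    }
]
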
*/ -extern int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private); -extern int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t * msg); +extern void lib_enq_event_locked (nal_cb_t *nal, void *private, + lib_eq_t *eq, ptl_event_t *ev); +extern void lib_finalize (nal_cb_t *nal, void *private, lib_msg_t *msg, + ptl_err_t status); +extern void lib_parse (nal_cb_t *nal, ptl_hdr_t *hdr, void *private); extern lib_msg_t *lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, lib_md_t *getmd); -extern void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr); +extern void print_hdr (nal_cb_t * nal, ptl_hdr_t * hdr); + extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov); -extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len); -extern void lib_copy_buf2iov (int niov, struct iovec *iov, char *dest, ptl_size_t len); +extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, + ptl_size_t offset, ptl_size_t len); +extern void lib_copy_buf2iov (int niov, struct iovec *iov, ptl_size_t offset, + char *src, ptl_size_t len); +extern int lib_extract_iov (int dst_niov, struct iovec *dst, + int src_niov, struct iovec *src, + ptl_size_t offset, ptl_size_t len); extern ptl_size_t lib_kiov_nob (int niov, ptl_kiov_t *iov); -extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *iov, ptl_size_t len); -extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *iov, char *src, ptl_size_t len); +extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, + ptl_size_t offset, ptl_size_t len); +extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset, + char *src, ptl_size_t len); +extern int lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, + int src_niov, ptl_kiov_t *src, + ptl_size_t offset, ptl_size_t len); + extern void lib_assert_wire_constants (void); -extern void lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, - ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen); -extern int lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - lib_md_t *md, ptl_size_t offset, ptl_size_t len); +extern ptl_err_t lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, + ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen); +extern ptl_err_t lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + lib_md_t *md, ptl_size_t offset, ptl_size_t len); extern void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md_in, ptl_md_t * md_out); diff --git a/lustre/portals/include/portals/lib-types.h b/lustre/portals/include/portals/lib-types.h index d9e3c11..904204b 100644 --- a/lustre/portals/include/portals/lib-types.h +++ b/lustre/portals/include/portals/lib-types.h @@ -16,7 +16,7 @@ # include # include #else -# define PTL_USE_DESC_LISTS +# define PTL_USE_LIB_FREELIST # include #endif @@ -139,16 +139,9 @@ typedef struct { struct lib_msg_t { struct list_head msg_list; - int send_ack; lib_md_t *md; - ptl_nid_t nid; - ptl_pid_t pid; - ptl_event_t ev; ptl_handle_wire_t ack_wmd; - union { - struct iovec iov[PTL_MD_MAX_IOV]; - ptl_kiov_t kiov[PTL_MD_MAX_IOV]; - } msg_iov; + ptl_event_t ev; }; struct lib_ptl_t { @@ -212,9 +205,8 @@ struct lib_md_t { }; #define PTL_MD_FLAG_UNLINK (1 << 0) -#define PTL_MD_FLAG_AUTO_UNLINKED (1 << 1) -#ifdef PTL_USE_DESC_LISTS +#ifdef PTL_USE_LIB_FREELIST typedef struct { void *fl_objs; /* single contiguous array of objects */ @@ -262,7 +254,7 @@ typedef struct { struct 
list_head ni_test_peers; -#ifdef PTL_USE_DESC_LISTS +#ifdef PTL_USE_LIB_FREELIST lib_freelist_t ni_free_mes; lib_freelist_t ni_free_msgs; lib_freelist_t ni_free_mds; diff --git a/lustre/portals/include/portals/p30.h b/lustre/portals/include/portals/p30.h index a4ea39b..8b1495e 100644 --- a/lustre/portals/include/portals/p30.h +++ b/lustre/portals/include/portals/p30.h @@ -21,7 +21,6 @@ #include #include #include -#include #include extern int __p30_initialized; /* for libraries & test codes */ diff --git a/lustre/portals/include/portals/types.h b/lustre/portals/include/portals/types.h index e4ccebf..7ffe797 100644 --- a/lustre/portals/include/portals/types.h +++ b/lustre/portals/include/portals/types.h @@ -17,6 +17,8 @@ typedef u_int64_t __u64; # define do_gettimeofday(tv) gettimeofday(tv, NULL) #endif +#include + typedef __u64 ptl_nid_t; typedef __u32 ptl_pid_t; typedef __u32 ptl_pt_index_t; @@ -97,7 +99,8 @@ typedef enum { PTL_EVENT_PUT, PTL_EVENT_REPLY, PTL_EVENT_ACK, - PTL_EVENT_SENT + PTL_EVENT_SENT, + PTL_EVENT_UNLINK, } ptl_event_kind_t; #define PTL_SEQ_BASETYPE long @@ -112,15 +115,19 @@ typedef unsigned PTL_SEQ_BASETYPE ptl_seq_t; #pragma pack(push, 4) #endif typedef struct { - ptl_event_kind_t type; - ptl_process_id_t initiator; - ptl_pt_index_t portal; - ptl_match_bits_t match_bits; - ptl_size_t rlength, mlength, offset; - ptl_handle_me_t unlinked_me; - ptl_md_t mem_desc; - ptl_hdr_data_t hdr_data; - struct timeval arrival_time; + ptl_event_kind_t type; + ptl_err_t status; + int unlinked; + ptl_process_id_t initiator; + ptl_pt_index_t portal; + ptl_match_bits_t match_bits; + ptl_size_t rlength; + ptl_size_t mlength; + ptl_size_t offset; + ptl_md_t mem_desc; + ptl_hdr_data_t hdr_data; + struct timeval arrival_time; + volatile ptl_seq_t sequence; } ptl_event_t; #ifdef __CYGWIN__ diff --git a/lustre/portals/knals/gmnal/gmnal.h b/lustre/portals/knals/gmnal/gmnal.h index 53757ab..cdde5b7 100644 --- a/lustre/portals/knals/gmnal/gmnal.h +++ b/lustre/portals/knals/gmnal/gmnal.h @@ -353,8 +353,6 @@ int gmnal_cb_read(nal_cb_t *, void *private, void *, user_ptr, size_t); int gmnal_cb_write(nal_cb_t *, void *private, user_ptr, void *, size_t); -int gmnal_cb_callback(nal_cb_t *, void *, lib_eq_t *, ptl_event_t *); - void *gmnal_cb_malloc(nal_cb_t *, size_t); void gmnal_cb_free(nal_cb_t *, void *, size_t); @@ -384,7 +382,7 @@ void gmnal_fini(void); a->cb_recv_pages = gmnal_cb_recv_pages; \ a->cb_read = gmnal_cb_read; \ a->cb_write = gmnal_cb_write; \ - a->cb_callback = gmnal_cb_callback; \ + a->cb_callback = NULL; \ a->cb_malloc = gmnal_cb_malloc; \ a->cb_free = gmnal_cb_free; \ a->cb_map = NULL; \ diff --git a/lustre/portals/knals/gmnal/gmnal_cb.c b/lustre/portals/knals/gmnal/gmnal_cb.c index 6ae91db..e055242 100644 --- a/lustre/portals/knals/gmnal/gmnal_cb.c +++ b/lustre/portals/knals/gmnal/gmnal_cb.c @@ -126,7 +126,6 @@ int gmnal_cb_send(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, niov, iov, len); } else { CDEBUG(D_ERROR, "Large message send it is not supported\n"); - lib_finalize(nal_cb, private, cookie); return(PTL_FAIL); gmnal_large_tx(nal_cb, private, cookie, hdr, type, nid, pid, niov, iov, len); @@ -200,18 +199,6 @@ int gmnal_cb_write(nal_cb_t *nal_cb, void *private, user_ptr dst, return(PTL_OK); } -int gmnal_cb_callback(nal_cb_t *nal_cb, void *private, lib_eq_t *eq, - ptl_event_t *ev) -{ - - if (eq->event_callback != NULL) { - CDEBUG(D_INFO, "found callback\n"); - eq->event_callback(ev); - } - - return(PTL_OK); -} - void *gmnal_cb_malloc(nal_cb_t *nal_cb, size_t len) { void 
*ptr = NULL; diff --git a/lustre/portals/knals/gmnal/gmnal_comm.c b/lustre/portals/knals/gmnal/gmnal_comm.c index 4171df6..a0d3530 100644 --- a/lustre/portals/knals/gmnal/gmnal_comm.c +++ b/lustre/portals/knals/gmnal/gmnal_comm.c @@ -321,7 +321,6 @@ gmnal_small_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, if (!private) { CDEBUG(D_ERROR, "gmnal_small_rx no context\n"); - lib_finalize(nal_cb, private, cookie); return(PTL_FAIL); } @@ -343,10 +342,8 @@ gmnal_small_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, * let portals library know receive is complete */ CDEBUG(D_PORTALS, "calling lib_finalize\n"); - if (lib_finalize(nal_cb, private, cookie) != PTL_OK) { - /* TO DO what to do with failed lib_finalise? */ - CDEBUG(D_INFO, "lib_finalize failed\n"); - } + lib_finalize(nal_cb, private, cookie, PTL_OK); + /* * return buffer so it can be used again */ @@ -590,10 +587,8 @@ gmnal_small_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status) return; } gmnal_return_stxd(nal_data, stxd); - if (lib_finalize(nal_cb, stxd, cookie) != PTL_OK) { - CDEBUG(D_INFO, "Call to lib_finalize failed for stxd [%p]\n", - stxd); - } + lib_finalize(nal_cb, stxd, cookie, PTL_OK); + return; } @@ -817,7 +812,6 @@ gmnal_large_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, if (!srxd) { CDEBUG(D_ERROR, "gmnal_large_rx no context\n"); - lib_finalize(nal_cb, private, cookie); return(PTL_FAIL); } @@ -1114,10 +1108,7 @@ gmnal_remote_get_callback(gm_port_t *gm_port, void *context, * Let our client application proceed */ CDEBUG(D_ERROR, "final callback context[%p]\n", srxd); - if (lib_finalize(nal_cb, srxd, srxd->cookie) != PTL_OK) { - CDEBUG(D_INFO, "Call to lib_finalize failed for srxd [%p]\n", - srxd); - } + lib_finalize(nal_cb, srxd, srxd->cookie, PTL_OK); /* * send an ack to the sender to let him know we got the data @@ -1282,10 +1273,7 @@ gmnal_large_tx_ack_received(gmnal_data_t *nal_data, gmnal_srxd_t *srxd) CDEBUG(D_INFO, "gmnal_large_tx_ack_received stxd [%p]\n", stxd); - if (lib_finalize(nal_cb, stxd, stxd->cookie) != PTL_OK) { - CDEBUG(D_INFO, "Call to lib_finalize failed for stxd [%p]\n", - stxd); - } + lib_finalize(nal_cb, stxd, stxd->cookie, PTL_OK); /* * extract the iovec from the stxd, deregister the memory. diff --git a/lustre/portals/knals/ibnal/ibnal_cb.c b/lustre/portals/knals/ibnal/ibnal_cb.c index 2c07cc4..0688062 100644 --- a/lustre/portals/knals/ibnal/ibnal_cb.c +++ b/lustre/portals/knals/ibnal/ibnal_cb.c @@ -306,7 +306,7 @@ kibnal_send(nal_cb_t *nal, if(buf_length > MAX_MSG_SIZE) { CERROR("kibnal_send:request exceeds Transmit data size (%d).\n", MAX_MSG_SIZE); - rc = -1; + rc = PTL_FAIL; return rc; } else { @@ -363,7 +363,7 @@ kibnal_send(nal_cb_t *nal, PROF_FINISH(kibnal_send); // time stapm of send operation - rc = 1; + rc = PTL_OK; return rc; } @@ -386,7 +386,7 @@ int kibnal_send_pages(nal_cb_t * nal, ptl_kiov_t *iov, size_t mlen) { - int rc = 1; + int rc = PTL_FAIL; CDEBUG(D_NET, "kibnal_send_pages\n"); @@ -420,7 +420,7 @@ void kibnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) // // do you need this // -int kibnal_callback(nal_cb_t * nal, +void kibnal_callback(nal_cb_t * nal, void *private, lib_eq_t *eq, ptl_event_t *ev) @@ -507,7 +507,7 @@ kibnal_recv_pages(nal_cb_t * nal, { CDEBUG(D_NET, "recv_pages not implemented\n"); - return PTL_OK; + return PTL_FAIL; } @@ -526,11 +526,12 @@ kibnal_recv(nal_cb_t *nal, CDEBUG(D_NET,"kibnal_recv: mlen=%d, rlen=%d\n", mlen, rlen); /* What was actually received must be >= what sender claims to - * have sent. 
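
[The conversion applied throughout the NALs here is mechanical; lib_finalize() now carries the completion status and returns void, so the old checked form collapses:

    /* before: return code checked, but no way to report I/O status */
    if (lib_finalize(nal_cb, private, cookie) != PTL_OK)
            CDEBUG(D_INFO, "lib_finalize failed\n");

    /* after: the status travels with the completion */
    lib_finalize(nal_cb, private, cookie, PTL_OK);
]
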
This is an LASSERT, since lib-move doesn't - * check cb return code yet. */ - LASSERT (krx->krx_len >= sizeof (ptl_hdr_t) + rlen); + * have sent. */ LASSERT (mlen <= rlen); + if (krx->krx_len < sizeof (ptl_hdr_t) + rlen) + return (PTL_FAIL); + PROF_START(kibnal_recv); if(mlen != 0) { @@ -542,12 +543,12 @@ kibnal_recv(nal_cb_t *nal, PROF_START(lib_finalize); - lib_finalize(nal, private, cookie); + lib_finalize(nal, private, cookie, PTL_OK); PROF_FINISH(lib_finalize); PROF_FINISH(kibnal_recv); - return rlen; + return PTL_OK; } // diff --git a/lustre/portals/knals/qswnal/qswnal_cb.c b/lustre/portals/knals/qswnal/qswnal_cb.c index 96749cd..4c2bd6a 100644 --- a/lustre/portals/knals/qswnal/qswnal_cb.c +++ b/lustre/portals/knals/qswnal/qswnal_cb.c @@ -33,7 +33,7 @@ EP_STATUSBLK kqswnal_rpc_failed; * LIB functions follow * */ -static int +static ptl_err_t kqswnal_read(nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr, size_t len) { @@ -41,10 +41,10 @@ kqswnal_read(nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr, nal->ni.nid, len, src_addr, dst_addr ); memcpy( dst_addr, src_addr, len ); - return (0); + return (PTL_OK); } -static int +static ptl_err_t kqswnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr, size_t len) { @@ -52,7 +52,7 @@ kqswnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr, nal->ni.nid, len, src_addr, dst_addr ); memcpy( dst_addr, src_addr, len ); - return (0); + return (PTL_OK); } static void * @@ -157,13 +157,12 @@ kqswnal_unmap_tx (kqswnal_tx_t *ktx) elan3_dvma_unload(kqswnal_data.kqn_ep->DmaState, kqswnal_data.kqn_eptxdmahandle, ktx->ktx_basepage, ktx->ktx_nmappedpages); - #endif ktx->ktx_nmappedpages = 0; } int -kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int nob, int niov, ptl_kiov_t *kiov) +kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int offset, int nob, int niov, ptl_kiov_t *kiov) { int nfrags = ktx->ktx_nfrag; int nmapped = ktx->ktx_nmappedpages; @@ -188,8 +187,16 @@ kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int nob, int niov, ptl_kiov_t *kiov) LASSERT (niov > 0); LASSERT (nob > 0); + /* skip complete frags before 'offset' */ + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + kiov++; + niov--; + LASSERT (niov > 0); + } + do { - int fraglen = kiov->kiov_len; + int fraglen = kiov->kiov_len - offset; /* nob exactly spans the iovs */ LASSERT (fraglen <= nob); @@ -212,7 +219,7 @@ kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int nob, int niov, ptl_kiov_t *kiov) /* XXX this is really crap, but we'll have to kmap until * EKC has a page (rather than vaddr) mapping interface */ - ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset; + ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset; CDEBUG(D_NET, "%p[%d] loading %p for %d, page %d, %d total\n", @@ -257,6 +264,7 @@ kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int nob, int niov, ptl_kiov_t *kiov) kiov++; niov--; nob -= fraglen; + offset = 0; /* iov must not run out before end of data */ LASSERT (nob == 0 || niov > 0); @@ -271,7 +279,8 @@ kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int nob, int niov, ptl_kiov_t *kiov) } int -kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int nob, int niov, struct iovec *iov) +kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int offset, int nob, + int niov, struct iovec *iov) { int nfrags = ktx->ktx_nfrag; int nmapped = ktx->ktx_nmappedpages; @@ -295,8 +304,16 @@ kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int nob, int niov, struct iovec *iov) LASSERT (niov > 0); LASSERT (nob > 0); + /* skip complete frags before offset */ + while 
(offset >= iov->iov_len) { + offset -= iov->iov_len; + iov++; + niov--; + LASSERT (niov > 0); + } + do { - int fraglen = iov->iov_len; + int fraglen = iov->iov_len - offset; long npages = kqswnal_pages_spanned (iov->iov_base, fraglen); /* nob exactly spans the iovs */ @@ -317,12 +334,12 @@ kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int nob, int niov, struct iovec *iov) CDEBUG(D_NET, "%p[%d] loading %p for %d, pages %d for %ld, %d total\n", - ktx, nfrags, iov->iov_base, fraglen, basepage, npages, - nmapped); + ktx, nfrags, iov->iov_base + offset, fraglen, + basepage, npages, nmapped); #if MULTIRAIL_EKC ep_dvma_load(kqswnal_data.kqn_ep, NULL, - iov->iov_base, fraglen, + iov->iov_base + offset, fraglen, kqswnal_data.kqn_ep_tx_nmh, basepage, &railmask, &ktx->ktx_frags[nfrags]); @@ -336,7 +353,7 @@ kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int nob, int niov, struct iovec *iov) #else elan3_dvma_kaddr_load (kqswnal_data.kqn_ep->DmaState, kqswnal_data.kqn_eptxdmahandle, - iov->iov_base, fraglen, + iov->iov_base + offset, fraglen, basepage, &ktx->ktx_frags[nfrags].Base); if (nfrags > 0 && /* previous frag mapped */ @@ -357,6 +374,7 @@ kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int nob, int niov, struct iovec *iov) iov++; niov--; nob -= fraglen; + offset = 0; /* iov must not run out before end of data */ LASSERT (nob == 0 || niov > 0); @@ -483,7 +501,7 @@ void kqswnal_tx_done (kqswnal_tx_t *ktx, int error) { lib_msg_t *msg; - lib_msg_t *repmsg; + lib_msg_t *repmsg = NULL; switch (ktx->ktx_state) { case KTX_FORWARDING: /* router asked me to forward this packet */ @@ -493,21 +511,29 @@ kqswnal_tx_done (kqswnal_tx_t *ktx, int error) case KTX_SENDING: /* packet sourced locally */ lib_finalize (&kqswnal_lib, ktx->ktx_args[0], - (lib_msg_t *)ktx->ktx_args[1]); + (lib_msg_t *)ktx->ktx_args[1], + (error == 0) ? PTL_OK : + (error == -ENOMEM) ? PTL_NOSPACE : PTL_FAIL); break; case KTX_GETTING: /* Peer has DMA-ed direct? */ msg = (lib_msg_t *)ktx->ktx_args[1]; - repmsg = NULL; - if (error == 0) + if (error == 0) { repmsg = lib_fake_reply_msg (&kqswnal_lib, ktx->ktx_nid, msg->md); + if (repmsg == NULL) + error = -ENOMEM; + } - lib_finalize (&kqswnal_lib, ktx->ktx_args[0], msg); - - if (repmsg != NULL) - lib_finalize (&kqswnal_lib, NULL, repmsg); + if (error == 0) { + lib_finalize (&kqswnal_lib, ktx->ktx_args[0], + msg, PTL_OK); + lib_finalize (&kqswnal_lib, NULL, repmsg, PTL_OK); + } else { + lib_finalize (&kqswnal_lib, ktx->ktx_args[0], msg, + (error == -ENOMEM) ? PTL_NOSPACE : PTL_FAIL); + } break; default: @@ -533,7 +559,7 @@ kqswnal_txhandler(EP_TXD *txd, void *arg, int status) ktx->ktx_nid, status); kqswnal_notify_peer_down(ktx); - status = -EIO; + status = -EHOSTDOWN; } else if (ktx->ktx_state == KTX_GETTING) { /* RPC completed OK; what did our peer put in the status @@ -745,7 +771,8 @@ kqswnal_eiovs2datav (int ndv, EP_DATAVEC *dv, int kqswnal_dma_reply (kqswnal_tx_t *ktx, int nfrag, - struct iovec *iov, ptl_kiov_t *kiov, int nob) + struct iovec *iov, ptl_kiov_t *kiov, + int offset, int nob) { kqswnal_rx_t *krx = (kqswnal_rx_t *)ktx->ktx_args[0]; char *buffer = (char *)page_address(krx->krx_pages[0]); @@ -779,9 +806,9 @@ kqswnal_dma_reply (kqswnal_tx_t *ktx, int nfrag, /* Map the source data... 
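
[Both mapping routines gain the same offset idiom, shown here for the iovec case with field names as in the patch: fragments wholly covered by 'offset' are skipped, the first mapped fragment starts partway in, and 'offset' is zeroed afterwards so only that fragment is partial:

    /* skip complete fragments before 'offset' */
    while (offset >= iov->iov_len) {
            offset -= iov->iov_len;
            iov++;
            niov--;
            LASSERT(niov > 0);
    }

    do {
            int fraglen = iov->iov_len - offset;

            LASSERT(fraglen <= nob);   /* nob exactly spans the iovs */
            /* ... map iov->iov_base + offset for fraglen bytes ... */
            iov++;
            niov--;
            nob -= fraglen;
            offset = 0;   /* later fragments start at their base */
    } while (nob > 0);
]
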
*/ ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 0; if (kiov != NULL) - rc = kqswnal_map_tx_kiov (ktx, nob, nfrag, kiov); + rc = kqswnal_map_tx_kiov (ktx, offset, nob, nfrag, kiov); else - rc = kqswnal_map_tx_iov (ktx, nob, nfrag, iov); + rc = kqswnal_map_tx_iov (ktx, offset, nob, nfrag, iov); if (rc != 0) { CERROR ("Can't map source data: %d\n", rc); @@ -846,7 +873,7 @@ kqswnal_dma_reply (kqswnal_tx_t *ktx, int nfrag, return (-ECONNABORTED); } -static int +static ptl_err_t kqswnal_sendmsg (nal_cb_t *nal, void *private, lib_msg_t *libmsg, @@ -857,6 +884,7 @@ kqswnal_sendmsg (nal_cb_t *nal, unsigned int payload_niov, struct iovec *payload_iov, ptl_kiov_t *payload_kiov, + size_t payload_offset, size_t payload_nob) { kqswnal_tx_t *ktx; @@ -865,6 +893,7 @@ kqswnal_sendmsg (nal_cb_t *nal, #if KQSW_CHECKSUM int i; kqsw_csum_t csum; + int sumoff; int sumnob; #endif @@ -928,9 +957,9 @@ kqswnal_sendmsg (nal_cb_t *nal, } /* peer expects RPC completion with GET data */ - rc = kqswnal_dma_reply (ktx, - payload_niov, payload_iov, - payload_kiov, payload_nob); + rc = kqswnal_dma_reply (ktx, payload_niov, + payload_iov, payload_kiov, + payload_offset, payload_nob); if (rc == 0) return (PTL_OK); @@ -945,22 +974,39 @@ kqswnal_sendmsg (nal_cb_t *nal, #if KQSW_CHECKSUM csum = kqsw_csum (0, (char *)hdr, sizeof (*hdr)); memcpy (ktx->ktx_buffer + sizeof (*hdr), &csum, sizeof (csum)); - for (csum = 0, i = 0, sumnob = payload_nob; sumnob > 0; i++) { + for (csum = 0, i = 0, sumoff = payload_offset, sumnob = payload_nob; sumnob > 0; i++) { + LASSERT(i < payload_niov); if (payload_kiov != NULL) { ptl_kiov_t *kiov = &payload_kiov[i]; - char *addr = ((char *)kmap (kiov->kiov_page)) + - kiov->kiov_offset; - - csum = kqsw_csum (csum, addr, MIN (sumnob, kiov->kiov_len)); - sumnob -= kiov->kiov_len; + + if (sumoff >= kiov->kiov_len) { + sumoff -= kiov->kiov_len; + } else { + char *addr = ((char *)kmap (kiov->kiov_page)) + + kiov->kiov_offset + sumoff; + int fragnob = kiov->kiov_len - sumoff; + + csum = kqsw_csum(csum, addr, MIN(sumnob, fragnob)); + sumnob -= fragnob; + sumoff = 0; + kunmap(kiov->kiov_page); + } } else { struct iovec *iov = &payload_iov[i]; - csum = kqsw_csum (csum, iov->iov_base, MIN (sumnob, kiov->iov_len)); - sumnob -= iov->iov_len; + if (sumoff >= iov->iov_len) { + sumoff -= iov->iov_len; + } else { + char *addr = iov->iov_base + sumoff; + int fragnob = iov->iov_len - sumoff; + + csum = kqsw_csum(csum, addr, MIN(sumnob, fragnob)); + sumnob -= fragnob; + sumoff = 0; + } } } - memcpy(ktx->ktx_buffer +sizeof(*hdr) +sizeof(csum), &csum,sizeof(csum)); + memcpy(ktx->ktx_buffer + sizeof(*hdr) + sizeof(csum), &csum, sizeof(csum)); #endif if (kqswnal_data.kqn_optimized_gets && @@ -987,10 +1033,10 @@ kqswnal_sendmsg (nal_cb_t *nal, ktx->ktx_state = KTX_GETTING; if ((libmsg->md->options & PTL_MD_KIOV) != 0) - rc = kqswnal_map_tx_kiov (ktx, md->length, + rc = kqswnal_map_tx_kiov (ktx, 0, md->length, md->md_niov, md->md_iov.kiov); else - rc = kqswnal_map_tx_iov (ktx, md->length, + rc = kqswnal_map_tx_iov (ktx, 0, md->length, md->md_niov, md->md_iov.iov); if (rc < 0) { @@ -1033,10 +1079,12 @@ kqswnal_sendmsg (nal_cb_t *nal, if (payload_nob > 0) { if (payload_kiov != NULL) lib_copy_kiov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE, - payload_niov, payload_kiov, payload_nob); + payload_niov, payload_kiov, + payload_offset, payload_nob); else lib_copy_iov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE, - payload_niov, payload_iov, - payload_niov, payload_iov, payload_nob); + payload_niov, payload_iov, + payload_offset, payload_nob); } } else { @@ -1052,10 +1100,10 @@
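Every mapping and copy routine touched by this patch gains the same preamble: step over whole fragments until 'offset' lands inside one, then treat the remainder of that fragment as the first piece to map. A minimal standalone rendering of the idiom, for illustration only (the helper name is not part of the patch):

    #include <assert.h>
    #include <stddef.h>
    #include <sys/uio.h>

    /* Advance (iov, niov, offset) so that offset falls inside the first
     * remaining fragment.  Afterwards iov->iov_base + offset is the first
     * payload byte and that fragment contributes iov_len - offset bytes. */
    static void
    iov_skip_offset(struct iovec **iovp, unsigned int *niovp, size_t *offsetp)
    {
            struct iovec *iov = *iovp;
            unsigned int  niov = *niovp;
            size_t        offset = *offsetp;

            while (offset >= iov->iov_len) {
                    offset -= iov->iov_len;
                    iov++;
                    niov--;
                    assert(niov > 0);   /* offset must lie within the iov */
            }

            *iovp = iov;
            *niovp = niov;
            *offsetp = offset;
    }

Threading 'offset' through cb_send()/cb_recv() this way lets lib-move pass the MD's own fragment list untouched instead of copying a trimmed iovec array for every transfer.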
kqswnal_sendmsg (nal_cb_t *nal, ktx->ktx_frags[0].Len = KQSW_HDR_SIZE; #endif if (payload_kiov != NULL) - rc = kqswnal_map_tx_kiov (ktx, payload_nob, + rc = kqswnal_map_tx_kiov (ktx, payload_offset, payload_nob, payload_niov, payload_kiov); else - rc = kqswnal_map_tx_iov (ktx, payload_nob, + rc = kqswnal_map_tx_iov (ktx, payload_offset, payload_nob, payload_niov, payload_iov); if (rc != 0) { kqswnal_put_idle_tx (ktx); @@ -1078,7 +1126,7 @@ kqswnal_sendmsg (nal_cb_t *nal, return (PTL_OK); } -static int +static ptl_err_t kqswnal_send (nal_cb_t *nal, void *private, lib_msg_t *libmsg, @@ -1088,13 +1136,15 @@ kqswnal_send (nal_cb_t *nal, ptl_pid_t pid, unsigned int payload_niov, struct iovec *payload_iov, + size_t payload_offset, size_t payload_nob) { return (kqswnal_sendmsg (nal, private, libmsg, hdr, type, nid, pid, - payload_niov, payload_iov, NULL, payload_nob)); + payload_niov, payload_iov, NULL, + payload_offset, payload_nob)); } -static int +static ptl_err_t kqswnal_send_pages (nal_cb_t *nal, void *private, lib_msg_t *libmsg, @@ -1104,10 +1154,12 @@ kqswnal_send_pages (nal_cb_t *nal, ptl_pid_t pid, unsigned int payload_niov, ptl_kiov_t *payload_kiov, + size_t payload_offset, size_t payload_nob) { return (kqswnal_sendmsg (nal, private, libmsg, hdr, type, nid, pid, - payload_niov, NULL, payload_kiov, payload_nob)); + payload_niov, NULL, payload_kiov, + payload_offset, payload_nob)); } void @@ -1161,7 +1213,7 @@ kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) nob <= KQSW_TX_BUFFER_SIZE) { /* send from ktx's pre-mapped contiguous buffer? */ - lib_copy_iov2buf (ktx->ktx_buffer, niov, iov, nob); + lib_copy_iov2buf (ktx->ktx_buffer, niov, iov, 0, nob); #if MULTIRAIL_EKC ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, 0, nob); @@ -1176,7 +1228,7 @@ kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) { /* zero copy */ ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 0; - rc = kqswnal_map_tx_iov (ktx, nob, niov, iov); + rc = kqswnal_map_tx_iov (ktx, 0, nob, niov, iov); if (rc != 0) goto failed; @@ -1231,7 +1283,8 @@ kqswnal_dma_reply_complete (EP_RXD *rxd) krx->krx_rpc_reply_needed = 0; kqswnal_rx_done (krx); - lib_finalize (&kqswnal_lib, NULL, msg); + lib_finalize (&kqswnal_lib, NULL, msg, + (status == EP_SUCCESS) ? PTL_OK : PTL_FAIL); kqswnal_put_idle_tx (ktx); } @@ -1461,13 +1514,14 @@ kqswnal_csum_error (kqswnal_rx_t *krx, int ishdr) } #endif -static int +static ptl_err_t kqswnal_recvmsg (nal_cb_t *nal, void *private, lib_msg_t *libmsg, unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov, + size_t offset, size_t mlen, size_t rlen) { @@ -1498,10 +1552,13 @@ kqswnal_recvmsg (nal_cb_t *nal, #endif CDEBUG(D_NET,"kqswnal_recv, mlen="LPSZ", rlen="LPSZ"\n", mlen, rlen); - /* What was actually received must be >= payload. - * This is an LASSERT, as lib_finalize() doesn't have a completion status. */ - LASSERT (krx->krx_nob >= KQSW_HDR_SIZE + mlen); + /* What was actually received must be >= payload. 
*/ LASSERT (mlen <= rlen); + if (krx->krx_nob < KQSW_HDR_SIZE + mlen) { + CERROR("Bad message size: have %d, need %d + %d\n", + krx->krx_nob, KQSW_HDR_SIZE, mlen); + return (PTL_FAIL); + } /* It must be OK to kmap() if required */ LASSERT (kiov == NULL || !in_interrupt ()); @@ -1516,20 +1573,37 @@ kqswnal_recvmsg (nal_cb_t *nal, page_nob = PAGE_SIZE - KQSW_HDR_SIZE; LASSERT (niov > 0); + if (kiov != NULL) { - iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset; - iov_nob = kiov->kiov_len; + /* skip complete frags */ + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + kiov++; + niov--; + LASSERT (niov > 0); + } + iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset; + iov_nob = kiov->kiov_len - offset; } else { - iov_ptr = iov->iov_base; - iov_nob = iov->iov_len; + /* skip complete frags */ + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + iov++; + niov--; + LASSERT (niov > 0); + } + iov_ptr = iov->iov_base + offset; + iov_nob = iov->iov_len - offset; } - + for (;;) { - /* We expect the iov to exactly match mlen */ - LASSERT (iov_nob <= mlen); - - frag = MIN (page_nob, iov_nob); + frag = mlen; + if (frag > page_nob) + frag = page_nob; + if (frag > iov_nob) + frag = iov_nob; + memcpy (iov_ptr, page_ptr, frag); #if KQSW_CHECKSUM payload_csum = kqsw_csum (payload_csum, iov_ptr, frag); @@ -1588,33 +1662,39 @@ kqswnal_recvmsg (nal_cb_t *nal, "csum_nob %d\n", hdr_csum, payload_csum, csum_frags, csum_nob); #endif - lib_finalize(nal, private, libmsg); + lib_finalize(nal, private, libmsg, PTL_OK); - return (rlen); + return (PTL_OK); } -static int +static ptl_err_t kqswnal_recv(nal_cb_t *nal, void *private, lib_msg_t *libmsg, unsigned int niov, struct iovec *iov, + size_t offset, size_t mlen, size_t rlen) { - return (kqswnal_recvmsg (nal, private, libmsg, niov, iov, NULL, mlen, rlen)); + return (kqswnal_recvmsg(nal, private, libmsg, + niov, iov, NULL, + offset, mlen, rlen)); } -static int +static ptl_err_t kqswnal_recv_pages (nal_cb_t *nal, void *private, lib_msg_t *libmsg, unsigned int niov, ptl_kiov_t *kiov, + size_t offset, size_t mlen, size_t rlen) { - return (kqswnal_recvmsg (nal, private, libmsg, niov, NULL, kiov, mlen, rlen)); + return (kqswnal_recvmsg(nal, private, libmsg, + niov, NULL, kiov, + offset, mlen, rlen)); } int diff --git a/lustre/portals/knals/scimacnal/scimacnal_cb.c b/lustre/portals/knals/scimacnal/scimacnal_cb.c index b31c2ea..52afb98 100644 --- a/lustre/portals/knals/scimacnal/scimacnal_cb.c +++ b/lustre/portals/knals/scimacnal/scimacnal_cb.c @@ -176,7 +176,8 @@ kscimacnal_txrelease(mac_mblk_t *msg, mac_msg_status_t status, void *context) break; } - lib_finalize(ktx->ktx_nal, ktx->ktx_private, ktx->ktx_cookie); + lib_finalize(ktx->ktx_nal, ktx->ktx_private, ktx->ktx_cookie, + (err == 0) ? 
PTL_OK : PTL_FAIL); PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t))); } @@ -225,14 +226,14 @@ kscimacnal_sendmsg(nal_cb_t *nal, if (buf_len > mac_get_mtusize(ksci->ksci_machandle)) { CERROR("kscimacnal:request exceeds TX MTU size (%ld).\n", mac_get_mtusize(ksci->ksci_machandle)); - return -EINVAL; + return PTL_FAIL; } /* save transaction info for later finalize and cleanup */ PORTAL_ALLOC(ktx, (sizeof(kscimacnal_tx_t))); if (!ktx) { - return -ENOMEM; + return PTL_NOSPACE; } ktx->ktx_nmapped = 0; /* Start with no mapped pages :) */ @@ -247,7 +248,7 @@ kscimacnal_sendmsg(nal_cb_t *nal, kscimacnal_txrelease, ktx); if (!msg) { PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t))); - return -ENOMEM; + return PTL_NOSPACE; } mac_put_mblk(msg, sizeof(ptl_hdr_t)); lastblk=msg; @@ -284,7 +285,7 @@ kscimacnal_sendmsg(nal_cb_t *nal, if(!newblk) { mac_free_msg(msg); PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t))); - return -ENOMEM; + return PTL_NOSPACE; } mac_put_mblk(newblk, nob); mac_link_mblk(lastblk, newblk); @@ -315,10 +316,10 @@ kscimacnal_sendmsg(nal_cb_t *nal, CERROR("kscimacnal: mac_send() failed, rc=%d\n", rc); mac_free_msg(msg); PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t))); - return rc; + return PTL_FAIL; } - return 0; + return PTL_OK; } @@ -463,12 +464,15 @@ kscimacnal_recvmsg(nal_cb_t *nal, krx->msg, mlen, rlen, niov); /* What was actually received must be >= what sender claims to have - * sent. This is an LASSERT, since lib-move doesn't check cb return - * code yet. Also, rlen seems to be negative when mlen==0 so don't - * assert on that. - */ - LASSERT (mlen==0 || mac_msg_size(krx->msg) >= sizeof(ptl_hdr_t)+rlen); - LASSERT (mlen==0 || mlen <= rlen); + * sent. */ + LASSERT (mlen <= rlen); /* something is wrong if this isn't true */ + if (mac_msg_size(krx->msg) < sizeof(ptl_hdr_t)+mlen) { + /* We didn't receive everything lib thinks we did */ + CERROR("Bad message size: have %d, need %d + %d\n", + mac_msg_size(krx->msg), sizeof(ptl_hdr_t), mlen); + return (PTL_FAIL); + } + /* It must be OK to kmap() if required */ LASSERT (kiov == NULL || !in_interrupt ()); /* Either all pages or all vaddrs */ @@ -545,12 +549,12 @@ kscimacnal_recvmsg(nal_cb_t *nal, CDEBUG(D_NET, "Calling lib_finalize.\n"); PROF_START(lib_finalize); - lib_finalize(nal, private, cookie); + lib_finalize(nal, private, cookie, PTL_OK); PROF_FINISH(lib_finalize); CDEBUG(D_NET, "Done.\n"); - return rlen; + return PTL_OK; } diff --git a/lustre/portals/knals/socknal/socknal.c b/lustre/portals/knals/socknal/socknal.c index 9ae1c87..c47dcb4 100644 --- a/lustre/portals/knals/socknal/socknal.c +++ b/lustre/portals/knals/socknal/socknal.c @@ -993,15 +993,11 @@ ksocknal_destroy_conn (ksock_conn_t *conn) /* complete current receive if any */ switch (conn->ksnc_rx_state) { case SOCKNAL_RX_BODY: -#if 0 - lib_finalize (&ksocknal_lib, NULL, conn->ksnc_cookie); -#else - CERROR ("Refusing to complete a partial receive from " - LPX64", ip %d.%d.%d.%d:%d\n", conn->ksnc_peer->ksnp_nid, - HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port); - CERROR ("This may hang communications and " - "prevent modules from unloading\n"); -#endif + CERROR("Completing partial receive from "LPX64 + ", ip %d.%d.%d.%d:%d, with error\n", + conn->ksnc_peer->ksnp_nid, + HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port); + lib_finalize (&ksocknal_lib, NULL, conn->ksnc_cookie, PTL_FAIL); break; case SOCKNAL_RX_BODY_FWD: ksocknal_fmb_callback (conn->ksnc_cookie, -ECONNABORTED); diff --git a/lustre/portals/knals/socknal/socknal_cb.c b/lustre/portals/knals/socknal/socknal_cb.c index 82d4c64..3ecead1 
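With the NAL callbacks now returning ptl_err_t and lib_finalize() taking a completion status, each NAL folds its native error codes into the small portals set before handing the message back. A compilable sketch of that folding, patterned on kqswnal_tx_done() above; the abridged enum is a stand-in for the real definitions in portals' errno.h:

    #include <errno.h>

    typedef enum { PTL_OK = 0, PTL_FAIL, PTL_NOSPACE } ptl_err_t;  /* abridged */

    /* Fold a kernel-style negative errno into a portals completion
     * status, as the qswnal and socknal completion paths now do. */
    static ptl_err_t
    ptl_status_from_errno(int error)
    {
            if (error == 0)
                    return PTL_OK;
            if (error == -ENOMEM)
                    return PTL_NOSPACE;
            return PTL_FAIL;
    }

Anything the NAL cannot classify collapses to PTL_FAIL; the receive paths above do the same when the wire carries less than lib expects, returning PTL_FAIL instead of tripping an LASSERT.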
100644 --- a/lustre/portals/knals/socknal/socknal_cb.c +++ b/lustre/portals/knals/socknal/socknal_cb.c @@ -29,7 +29,7 @@ * LIB functions follow * */ -int +ptl_err_t ksocknal_read(nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr, size_t len) { @@ -37,10 +37,10 @@ ksocknal_read(nal_cb_t *nal, void *private, void *dst_addr, nal->ni.nid, (long)len, src_addr, dst_addr); memcpy( dst_addr, src_addr, len ); - return 0; + return PTL_OK; } -int +ptl_err_t ksocknal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr, size_t len) { @@ -48,20 +48,7 @@ ksocknal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, nal->ni.nid, (long)len, src_addr, dst_addr); memcpy( dst_addr, src_addr, len ); - return 0; -} - -int -ksocknal_callback (nal_cb_t * nal, void *private, lib_eq_t *eq, - ptl_event_t *ev) -{ - CDEBUG(D_NET, LPX64": callback eq %p ev %p\n", - nal->ni.nid, eq, ev); - - if (eq->event_callback != NULL) - eq->event_callback(ev); - - return 0; + return PTL_OK; } void * @@ -617,7 +604,8 @@ ksocknal_tx_done (ksock_tx_t *tx, int asynch) if (tx->tx_isfwd) { /* was a forwarded packet? */ kpr_fwd_done (&ksocknal_data.ksnd_router, - KSOCK_TX_2_KPR_FWD_DESC (tx), 0); + KSOCK_TX_2_KPR_FWD_DESC (tx), + (tx->tx_resid == 0) ? 0 : -ECONNABORTED); EXIT; return; } @@ -625,7 +613,8 @@ ksocknal_tx_done (ksock_tx_t *tx, int asynch) /* local send */ ltx = KSOCK_TX_2_KSOCK_LTX (tx); - lib_finalize (&ksocknal_lib, ltx->ltx_private, ltx->ltx_cookie); + lib_finalize (&ksocknal_lib, ltx->ltx_private, ltx->ltx_cookie, + (tx->tx_resid == 0) ? PTL_OK : PTL_FAIL); ksocknal_free_ltx (ltx); EXIT; @@ -694,17 +683,17 @@ ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx) LASSERT (rc < 0); if (!conn->ksnc_closing) - CERROR ("[%p] Error %d on write to "LPX64 - " ip %d.%d.%d.%d:%d\n",conn, rc, - conn->ksnc_peer->ksnp_nid, - HIPQUAD(conn->ksnc_ipaddr), - conn->ksnc_port); + CERROR("[%p] Error %d on write to "LPX64 + " ip %d.%d.%d.%d:%d\n", conn, rc, + conn->ksnc_peer->ksnp_nid, + HIPQUAD(conn->ksnc_ipaddr), + conn->ksnc_port); ksocknal_close_conn_and_siblings (conn, rc); ksocknal_tx_launched (tx); - + return (rc); -} +} void ksocknal_launch_autoconnect_locked (ksock_route_t *route) @@ -742,21 +731,21 @@ ksocknal_find_target_peer_locked (ksock_tx_t *tx, ptl_nid_t nid) ptl_nid_t target_nid; int rc; ksock_peer_t *peer = ksocknal_find_peer_locked (nid); - + if (peer != NULL) return (peer); - + if (tx->tx_isfwd) { CERROR ("Can't send packet to "LPX64 - " %s: routed target is not a peer\n", + " %s: routed target is not a peer\n", nid, portals_nid2str(SOCKNAL, nid, ipbuf)); return (NULL); } - + rc = kpr_lookup (&ksocknal_data.ksnd_router, nid, tx->tx_nob, &target_nid); if (rc != 0) { - CERROR ("Can't route to "LPX64" %s: router error %d\n", + CERROR ("Can't route to "LPX64" %s: router error %d\n", nid, portals_nid2str(SOCKNAL, nid, ipbuf), rc); return (NULL); } @@ -1018,7 +1007,7 @@ ksocknal_launch_packet (ksock_tx_t *tx, ptl_nid_t nid) return (-EHOSTUNREACH); } -int +ptl_err_t ksocknal_sendmsg(nal_cb_t *nal, void *private, lib_msg_t *cookie, @@ -1029,6 +1018,7 @@ ksocknal_sendmsg(nal_cb_t *nal, unsigned int payload_niov, struct iovec *payload_iov, ptl_kiov_t *payload_kiov, + size_t payload_offset, size_t payload_nob) { ksock_ltx_t *ltx; @@ -1091,20 +1081,19 @@ ksocknal_sendmsg(nal_cb_t *nal, ltx->ltx_tx.tx_kiov = NULL; ltx->ltx_tx.tx_nkiov = 0; - ltx->ltx_tx.tx_niov = 1 + payload_niov; - - memcpy(ltx->ltx_iov + 1, payload_iov, - payload_niov * sizeof (*payload_iov)); - + ltx->ltx_tx.tx_niov = + 1 + 
lib_extract_iov(payload_niov, &ltx->ltx_iov[1], + payload_niov, payload_iov, + payload_offset, payload_nob); } else { /* payload is all pages */ - ltx->ltx_tx.tx_kiov = ltx->ltx_kiov; - ltx->ltx_tx.tx_nkiov = payload_niov; - ltx->ltx_tx.tx_niov = 1; - memcpy(ltx->ltx_kiov, payload_kiov, - payload_niov * sizeof (*payload_kiov)); + ltx->ltx_tx.tx_kiov = ltx->ltx_kiov; + ltx->ltx_tx.tx_nkiov = + lib_extract_kiov(payload_niov, ltx->ltx_kiov, + payload_niov, payload_kiov, + payload_offset, payload_nob); } rc = ksocknal_launch_packet(&ltx->ltx_tx, nid); @@ -1115,28 +1104,28 @@ return (PTL_FAIL); } -int +ptl_err_t ksocknal_send (nal_cb_t *nal, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, unsigned int payload_niov, struct iovec *payload_iov, - size_t payload_len) + size_t payload_offset, size_t payload_len) { return (ksocknal_sendmsg(nal, private, cookie, hdr, type, nid, pid, payload_niov, payload_iov, NULL, - payload_len)); + payload_offset, payload_len)); } -int +ptl_err_t ksocknal_send_pages (nal_cb_t *nal, void *private, lib_msg_t *cookie, ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, unsigned int payload_niov, ptl_kiov_t *payload_kiov, - size_t payload_len) + size_t payload_offset, size_t payload_len) { return (ksocknal_sendmsg(nal, private, cookie, hdr, type, nid, pid, payload_niov, NULL, payload_kiov, - payload_len)); + payload_offset, payload_len)); } void @@ -1208,7 +1197,7 @@ ksocknal_fmb_callback (void *arg, int error) /* drop peer ref taken on init */ ksocknal_put_peer (fmb->fmb_peer); - + spin_lock_irqsave (&fmp->fmp_lock, flags); list_add (&fmb->fmb_list, &fmp->fmp_idle_fmbs); @@ -1591,7 +1580,7 @@ ksocknal_process_receive (ksock_conn_t *conn) case SOCKNAL_RX_BODY: /* payload all received */ - lib_finalize(&ksocknal_lib, NULL, conn->ksnc_cookie); + lib_finalize(&ksocknal_lib, NULL, conn->ksnc_cookie, PTL_OK); /* Fall through */ case SOCKNAL_RX_SLOP: @@ -1627,9 +1616,10 @@ ksocknal_process_receive (ksock_conn_t *conn) return (-EINVAL); /* keep gcc happy */ } -int +ptl_err_t ksocknal_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, - unsigned int niov, struct iovec *iov, size_t mlen, size_t rlen) + unsigned int niov, struct iovec *iov, + size_t offset, size_t mlen, size_t rlen) { ksock_conn_t *conn = (ksock_conn_t *)private; @@ -1642,20 +1632,22 @@ ksocknal_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, conn->ksnc_rx_nkiov = 0; conn->ksnc_rx_kiov = NULL; - conn->ksnc_rx_niov = niov; conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov; - memcpy (conn->ksnc_rx_iov, iov, niov * sizeof (*iov)); + conn->ksnc_rx_niov = + lib_extract_iov(PTL_MD_MAX_IOV, conn->ksnc_rx_iov, + niov, iov, offset, mlen); LASSERT (mlen == lib_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) + lib_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov)); - return (rlen); + return (PTL_OK); } -int +ptl_err_t ksocknal_recv_pages (nal_cb_t *nal, void *private, lib_msg_t *msg, - unsigned int niov, ptl_kiov_t *kiov, size_t mlen, size_t rlen) + unsigned int niov, ptl_kiov_t *kiov, + size_t offset, size_t mlen, size_t rlen) { ksock_conn_t *conn = (ksock_conn_t *)private; @@ -1668,15 +1660,16 @@ ksocknal_recv_pages (nal_cb_t *nal, void *private, lib_msg_t *msg, conn->ksnc_rx_niov = 0; conn->ksnc_rx_iov = NULL; - conn->ksnc_rx_nkiov = niov; conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov; - memcpy (conn->ksnc_rx_kiov, kiov, niov * sizeof (*kiov)); + conn->ksnc_rx_nkiov = + lib_extract_kiov(PTL_MD_MAX_IOV, conn->ksnc_rx_kiov, + niov, kiov,
offset, mlen); LASSERT (mlen == lib_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) + lib_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov)); - return (rlen); + return (PTL_OK); } int ksocknal_scheduler (void *arg) @@ -2064,7 +2057,7 @@ ksocknal_hello (struct socket *sock, ptl_nid_t *nid, int *type, __u64 *incarnati rc, *nid, portals_nid2str(SOCKNAL, *nid, ipbuf)); return (rc); } - + if (hmv->magic != __le32_to_cpu (PORTALS_PROTO_MAGIC)) { CERROR ("Bad magic %#08x (%#08x expected) from "LPX64" %s\n", __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC, *nid, @@ -2118,7 +2111,7 @@ ksocknal_hello (struct socket *sock, ptl_nid_t *nid, int *type, __u64 *incarnati } else if (*nid != __le64_to_cpu (hdr.src_nid)) { CERROR ("Connected to nid "LPX64" %s, but expecting "LPX64" %s\n", __le64_to_cpu (hdr.src_nid), - portals_nid2str(SOCKNAL, + portals_nid2str(SOCKNAL, __le64_to_cpu(hdr.src_nid), ipbuf), *nid, portals_nid2str(SOCKNAL, *nid, ipbuf)); @@ -2139,7 +2132,7 @@ ksocknal_hello (struct socket *sock, ptl_nid_t *nid, int *type, __u64 *incarnati *type = SOCKNAL_CONN_BULK_IN; break; default: - CERROR ("Unexpected type %d from "LPX64" %s\n", + CERROR ("Unexpected type %d from "LPX64" %s\n", *type, *nid, portals_nid2str(SOCKNAL, *nid, ipbuf)); return (-EPROTO); @@ -2346,8 +2339,8 @@ ksocknal_connect_peer (ksock_route_t *route, int type) if (rc != 0) { CERROR ("Error %d connecting to "LPX64" %s\n", rc, route->ksnr_peer->ksnp_nid, - portals_nid2str(SOCKNAL, - route->ksnr_peer->ksnp_nid, + portals_nid2str(SOCKNAL, + route->ksnr_peer->ksnp_nid, ipbuf)); goto out; } @@ -2432,7 +2425,7 @@ ksocknal_autoconnect (ksock_route_t *route) while (!list_empty (&zombies)) { char ipbuf[PTL_NALFMT_SIZE]; tx = list_entry (zombies.next, ksock_tx_t, tx_list); - + CERROR ("Deleting packet type %d len %d ("LPX64" %s->"LPX64" %s)\n", NTOH__u32 (tx->tx_hdr->type), NTOH__u32 (tx->tx_hdr->payload_length), @@ -2719,7 +2712,6 @@ nal_cb_t ksocknal_lib = { cb_recv_pages: ksocknal_recv_pages, cb_read: ksocknal_read, cb_write: ksocknal_write, - cb_callback: ksocknal_callback, cb_malloc: ksocknal_malloc, cb_free: ksocknal_free, cb_printf: ksocknal_printf, diff --git a/lustre/portals/libcfs/module.c b/lustre/portals/libcfs/module.c index 2768c8d..2f5a852 100644 --- a/lustre/portals/libcfs/module.c +++ b/lustre/portals/libcfs/module.c @@ -812,9 +812,11 @@ EXPORT_SYMBOL(PtlMDBind); EXPORT_SYMBOL(lib_iov_nob); EXPORT_SYMBOL(lib_copy_iov2buf); EXPORT_SYMBOL(lib_copy_buf2iov); +EXPORT_SYMBOL(lib_extract_iov); EXPORT_SYMBOL(lib_kiov_nob); EXPORT_SYMBOL(lib_copy_kiov2buf); EXPORT_SYMBOL(lib_copy_buf2kiov); +EXPORT_SYMBOL(lib_extract_kiov); EXPORT_SYMBOL(lib_finalize); EXPORT_SYMBOL(lib_parse); EXPORT_SYMBOL(lib_fake_reply_msg); diff --git a/lustre/portals/portals/Makefile.am b/lustre/portals/portals/Makefile.am index 8c03749..d17db61 100644 --- a/lustre/portals/portals/Makefile.am +++ b/lustre/portals/portals/Makefile.am @@ -6,5 +6,9 @@ CPPFLAGS= INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include -lib_LIBRARIES= libportals.a +noinst_LIBRARIES= libportals.a libportals_a_SOURCES= api-eq.c api-init.c api-me.c api-errno.c api-ni.c api-wrap.c lib-dispatch.c lib-init.c lib-me.c lib-msg.c lib-eq.c lib-md.c lib-move.c lib-ni.c lib-pid.c + +if LIBLUSTRE +libportals_a_CFLAGS= -fPIC +endif diff --git a/lustre/portals/portals/api-eq.c b/lustre/portals/portals/api-eq.c index 9bc9c36..964b9d8 100644 --- a/lustre/portals/portals/api-eq.c +++ b/lustre/portals/portals/api-eq.c @@ -81,12 +81,6 @@ int PtlEQGet(ptl_handle_eq_t eventq, ptl_event_t * 
ev) *ev = *new_event; - /* Set the unlinked_me interface number if there is one to pass - * back, since the NAL hasn't a clue what it is and therefore can't - * set it. */ - if (!PtlHandleEqual (ev->unlinked_me, PTL_HANDLE_NONE)) - ev->unlinked_me.nal_idx = eventq.nal_idx; - /* ensure event is delivered correctly despite possible races with lib_finalize */ if (eq->sequence != new_event->sequence) { @@ -119,6 +113,7 @@ int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t *event_out) } #ifndef __KERNEL__ +#if 0 static jmp_buf eq_jumpbuf; static void eq_timeout(int signal) @@ -162,6 +157,46 @@ int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out, return rc; } +#else +#include -#endif +/* FIXME + * Here timeout need a trick with tcpnal, definitely unclean but OK for + * this moment. + */ + +/* global variables defined by tcpnal */ +extern int __tcpnal_eqwait_timeout_value; +extern int __tcpnal_eqwait_timedout; + +int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out, + int timeout) +{ + int rc; + if (!timeout) + return PtlEQWait(eventq_in, event_out); + + __tcpnal_eqwait_timeout_value = timeout; + + while ((rc = PtlEQGet(eventq_in, event_out)) == PTL_EQ_EMPTY) { + nal_t *nal = ptl_hndl2nal(&eventq_in); + + if (nal->yield) + nal->yield(nal); + + if (__tcpnal_eqwait_timedout) { + if (__tcpnal_eqwait_timedout != ETIMEDOUT) + printf("Warning: yield return error %d\n", + __tcpnal_eqwait_timedout); + rc = PTL_EQ_EMPTY; + break; + } + } + + __tcpnal_eqwait_timeout_value = 0; + + return rc; +} +#endif +#endif /* __KERNEL__ */ diff --git a/lustre/portals/portals/api-errno.c b/lustre/portals/portals/api-errno.c index 026c93b..b5e7aa1 100644 --- a/lustre/portals/portals/api-errno.c +++ b/lustre/portals/portals/api-errno.c @@ -50,6 +50,5 @@ const char *ptl_err_str[] = { "PTL_IOV_TOO_SMALL", "PTL_EQ_INUSE", - "PTL_MD_INUSE" }; /* If you change these, you must update the number table in portals/errno.h */ diff --git a/lustre/portals/portals/api-ni.c b/lustre/portals/portals/api-ni.c index b2e069e..18eea91 100644 --- a/lustre/portals/portals/api-ni.c +++ b/lustre/portals/portals/api-ni.c @@ -125,7 +125,7 @@ int PtlNIInit(ptl_interface_t interface, ptl_pt_index_t ptl_size, if (ptl_interfaces[i] == nal) { nal->refct++; handle->nal_idx = (NI_HANDLE_MAGIC & ~NI_HANDLE_MASK) | i; - fprintf(stderr, "Returning existing NAL (%d)\n", i); + CDEBUG(D_OTHER, "Returning existing NAL (%d)\n", i); ptl_ni_init_mutex_exit (); return PTL_OK; } diff --git a/lustre/portals/portals/api-wrap.c b/lustre/portals/portals/api-wrap.c index e54707f..d23a6aa 100644 --- a/lustre/portals/portals/api-wrap.c +++ b/lustre/portals/portals/api-wrap.c @@ -32,7 +32,7 @@ static int do_forward(ptl_handle_any_t any_h, int cmd, void *argbuf, nal_t *nal; if (!ptl_init) { - fprintf(stderr, "PtlGetId: Not initialized\n"); + CERROR("Not initialized\n"); return PTL_NOINIT; } @@ -262,7 +262,7 @@ static int validate_md(ptl_handle_any_t current_in, ptl_md_t md_in) int i; if (!ptl_init) { - fprintf(stderr, "PtlMDAttach/Bind/Update: Not initialized\n"); + CERROR("PtlMDAttach/Bind/Update: Not initialized\n"); return PTL_NOINIT; } diff --git a/lustre/portals/portals/lib-init.c b/lustre/portals/portals/lib-init.c index 0765498..d4d8860 100644 --- a/lustre/portals/portals/lib-init.c +++ b/lustre/portals/portals/lib-init.c @@ -38,31 +38,17 @@ # include #endif -#ifndef PTL_USE_DESC_LISTS -static int ptl_slab_users; - -atomic_t md_in_use_count = ATOMIC_INIT(0); -atomic_t msg_in_use_count = ATOMIC_INIT(0); -atomic_t 
me_in_use_count = ATOMIC_INIT(0); -atomic_t eq_in_use_count = ATOMIC_INIT(0); +#ifndef PTL_USE_LIB_FREELIST int kportal_descriptor_setup (nal_cb_t *nal) { - ptl_slab_users++; - RETURN(PTL_OK); + return PTL_OK; } void kportal_descriptor_cleanup (nal_cb_t *nal) { - if (--ptl_slab_users != 0) - return; - - LASSERT (atomic_read (&md_in_use_count) == 0); - LASSERT (atomic_read (&me_in_use_count) == 0); - LASSERT (atomic_read (&eq_in_use_count) == 0); - LASSERT (atomic_read (&msg_in_use_count) == 0); } #else diff --git a/lustre/portals/portals/lib-md.c b/lustre/portals/portals/lib-md.c index be6949c..a1ed583 100644 --- a/lustre/portals/portals/lib-md.c +++ b/lustre/portals/portals/lib-md.c @@ -83,7 +83,7 @@ static int lib_md_build(nal_cb_t *nal, lib_md_t *new, void *private, int rc; int i; - /* NB we are passes an allocated, but uninitialised/active md. + /* NB we are passed an allocated, but uninitialised/active md. * if we return success, caller may lib_md_unlink() it. * otherwise caller may only lib_md_free() it. */ @@ -94,9 +94,10 @@ static int lib_md_build(nal_cb_t *nal, lib_md_t *new, void *private, return PTL_INV_EQ; } - if ((md->options & PTL_MD_IOV) != 0 && /* discontiguous MD */ - md->niov > PTL_MD_MAX_IOV) /* too many fragments */ - return PTL_IOV_TOO_MANY; + /* Must check this _before_ allocation. Also, note that non-iov + * MDs must set md_niov to 0. */ + LASSERT((md->options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0 || + md->niov <= PTL_MD_MAX_IOV); if ((md->options & max_size_opts) != 0 && /* max size used */ (md->max_size < 0 || md->max_size > md->length)) // illegal max_size @@ -239,7 +240,11 @@ int do_PtlMDAttach(nal_cb_t * nal, void *private, void *v_args, void *v_ret) lib_md_t *md; unsigned long flags; - md = lib_md_alloc (nal); + if ((args->md_in.options & (PTL_MD_KIOV | PTL_MD_IOV)) != 0 && + args->md_in.niov > PTL_MD_MAX_IOV) /* too many fragments */ + return (ret->rc = PTL_IOV_TOO_MANY); + + md = lib_md_alloc(nal, &args->md_in); if (md == NULL) return (ret->rc = PTL_NOSPACE); @@ -287,7 +292,11 @@ int do_PtlMDBind(nal_cb_t * nal, void *private, void *v_args, void *v_ret) lib_md_t *md; unsigned long flags; - md = lib_md_alloc (nal); + if ((args->md_in.options & (PTL_MD_KIOV | PTL_MD_IOV)) != 0 && + args->md_in.niov > PTL_MD_MAX_IOV) /* too many fragments */ + return (ret->rc = PTL_IOV_TOO_MANY); + + md = lib_md_alloc(nal, &args->md_in); if (md == NULL) return (ret->rc = PTL_NOSPACE); @@ -311,34 +320,43 @@ int do_PtlMDBind(nal_cb_t * nal, void *private, void *v_args, void *v_ret) int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret) { - PtlMDUnlink_in *args = v_args; + PtlMDUnlink_in *args = v_args; PtlMDUnlink_out *ret = v_ret; - - lib_md_t *md; - unsigned long flags; + ptl_event_t ev; + lib_md_t *md; + unsigned long flags; state_lock(nal, &flags); md = ptl_handle2md(&args->md_in, nal); if (md == NULL) { - ret->rc = PTL_INV_MD; - } else if (md->pending != 0) { /* being filled/spilled */ - ret->rc = PTL_MD_INUSE; - } else { - /* Callers attempting to unlink a busy MD which will get - * unlinked once the net op completes should see INUSE, - * before completion and INV_MD thereafter. LASSERT we've - * got that right... 
*/ - LASSERT ((md->md_flags & PTL_MD_FLAG_UNLINK) == 0); - - lib_md_deconstruct(nal, md, &ret->status_out); - lib_md_unlink(nal, md); - ret->rc = PTL_OK; + state_unlock(nal, &flags); + return (ret->rc = PTL_INV_MD); + } + + /* If the MD is busy, lib_md_unlink just marks it for deletion, and + * when the NAL is done, the completion event flags that the MD was + * unlinked. Otherwise, we enqueue an event now... */ + + if (md->eq != NULL && + md->pending == 0) { + memset(&ev, 0, sizeof(ev)); + + ev.type = PTL_EVENT_UNLINK; + ev.status = PTL_OK; + ev.unlinked = 1; + lib_md_deconstruct(nal, md, &ev.mem_desc); + + lib_enq_event_locked(nal, private, md->eq, &ev); } + lib_md_deconstruct(nal, md, &ret->status_out); + lib_md_unlink(nal, md); + ret->rc = PTL_OK; + state_unlock(nal, &flags); - return (ret->rc); + return (PTL_OK); } int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *v_args, @@ -379,6 +397,23 @@ int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *v_args, goto out; } + /* XXX fttb, the new MD must be the same type wrt fragmentation */ + if (((new->options ^ md->options) & + (PTL_MD_IOV | PTL_MD_KIOV)) != 0) { + ret->rc = PTL_INV_MD; + goto out; + } + + if (new->niov > md->md_niov) { + ret->rc = PTL_IOV_TOO_MANY; + goto out; + } + + if (new->niov < md->md_niov) { + ret->rc = PTL_IOV_TOO_SMALL; + goto out; + } + if (!PtlHandleEqual (args->testq_in, PTL_EQ_NONE)) { test_eq = ptl_handle2eq(&args->testq_in, nal); if (test_eq == NULL) { diff --git a/lustre/portals/portals/lib-move.c b/lustre/portals/portals/lib-move.c index d844a7a..ecd543c 100644 --- a/lustre/portals/portals/lib-move.c +++ b/lustre/portals/portals/lib-move.c @@ -258,55 +258,78 @@ lib_iov_nob (int niov, struct iovec *iov) } void -lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len) +lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, + ptl_size_t offset, ptl_size_t len) { ptl_size_t nob; - while (len > 0) - { + if (len == 0) + return; + + /* skip complete frags before 'offset' */ + LASSERT (niov > 0); + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + iov++; + niov--; + LASSERT (niov > 0); + } + + do { LASSERT (niov > 0); - nob = MIN (iov->iov_len, len); - memcpy (dest, iov->iov_base, nob); + nob = MIN (iov->iov_len - offset, len); + memcpy (dest, iov->iov_base + offset, nob); len -= nob; dest += nob; niov--; iov++; - } + offset = 0; + } while (len > 0); } void -lib_copy_buf2iov (int niov, struct iovec *iov, char *src, ptl_size_t len) +lib_copy_buf2iov (int niov, struct iovec *iov, ptl_size_t offset, + char *src, ptl_size_t len) { ptl_size_t nob; - while (len > 0) - { + if (len == 0) + return; + + /* skip complete frags before 'offset' */ + LASSERT (niov > 0); + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + iov++; + niov--; LASSERT (niov > 0); - nob = MIN (iov->iov_len, len); - memcpy (iov->iov_base, src, nob); + } + + do { + LASSERT (niov > 0); + nob = MIN (iov->iov_len - offset, len); + memcpy (iov->iov_base + offset, src, nob); len -= nob; src += nob; niov--; iov++; - } + offset = 0; + } while (len > 0); } -static int -lib_extract_iov (struct iovec *dst, lib_md_t *md, +int +lib_extract_iov (int dst_niov, struct iovec *dst, + int src_niov, struct iovec *src, ptl_size_t offset, ptl_size_t len) { /* Initialise 'dst' to the subset of 'src' starting at 'offset', * for exactly 'len' bytes, and return the number of entries. 
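Since both socknal receive paths above now call it directly, the contract just stated is worth pinning down in runnable form. A userspace rendering of lib_extract_iov() under exactly that contract, with plain assert() standing in for LASSERT:

    #include <assert.h>
    #include <stddef.h>
    #include <sys/uio.h>

    /* Fill 'dst' with the subset of 'src' covering [offset, offset+len)
     * and return the number of dst entries used; 'src' is not modified. */
    static int
    extract_iov(int dst_niov, struct iovec *dst,
                int src_niov, const struct iovec *src,
                size_t offset, size_t len)
    {
            int niov;

            if (len == 0)                     /* no data => no fragments */
                    return 0;

            while (offset >= src->iov_len) {  /* skip whole fragments */
                    offset -= src->iov_len;
                    src++;
                    src_niov--;
                    assert(src_niov > 0);
            }

            for (niov = 1;; niov++, dst++, src++, src_niov--, offset = 0) {
                    size_t frag = src->iov_len - offset;

                    assert(src_niov > 0);
                    assert(niov <= dst_niov);

                    dst->iov_base = (char *)src->iov_base + offset;
                    if (len <= frag) {
                            dst->iov_len = len;   /* last, partial fragment */
                            return niov;
                    }
                    dst->iov_len = frag;
                    len -= frag;
            }
    }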
* NB not destructive to 'src' */ - int src_niov = md->md_niov; - struct iovec *src = md->md_iov.iov; ptl_size_t frag_len; - int dst_niov; + int niov; - LASSERT (offset + len <= md->length); - if (len == 0) /* no data => */ return (0); /* no frags */ @@ -318,17 +341,17 @@ lib_extract_iov (struct iovec *dst, lib_md_t *md, LASSERT (src_niov > 0); } - dst_niov = 1; + niov = 1; for (;;) { LASSERT (src_niov > 0); - LASSERT (dst_niov <= PTL_MD_MAX_IOV); + LASSERT (niov <= dst_niov); frag_len = src->iov_len - offset; dst->iov_base = ((char *)src->iov_base) + offset; if (len <= frag_len) { dst->iov_len = len; - return (dst_niov); + return (niov); } dst->iov_len = frag_len; @@ -336,7 +359,7 @@ lib_extract_iov (struct iovec *dst, lib_md_t *md, len -= frag_len; dst++; src++; - dst_niov++; + niov++; src_niov--; offset = 0; } @@ -351,19 +374,22 @@ lib_kiov_nob (int niov, ptl_kiov_t *kiov) } void -lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len) +lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, + ptl_size_t offset, ptl_size_t len) { LASSERT (0); } void -lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *dest, ptl_size_t len) +lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset, + char *src, ptl_size_t len) { LASSERT (0); } -static int -lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md, +int +lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, + int src_niov, ptl_kiov_t *src, ptl_size_t offset, ptl_size_t len) { LASSERT (0); @@ -383,18 +409,30 @@ lib_kiov_nob (int niov, ptl_kiov_t *kiov) } void -lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len) +lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, + ptl_size_t offset, ptl_size_t len) { ptl_size_t nob; char *addr; + + if (len == 0) + return; LASSERT (!in_interrupt ()); - while (len > 0) - { + + LASSERT (niov > 0); + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + kiov++; + niov--; + LASSERT (niov > 0); + } + + do { LASSERT (niov > 0); - nob = MIN (kiov->kiov_len, len); + nob = MIN (kiov->kiov_len - offset, len); - addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset; + addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset; memcpy (dest, addr, nob); kunmap (kiov->kiov_page); @@ -402,22 +440,35 @@ lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len) dest += nob; niov--; kiov++; - } + offset = 0; + } while (len > 0); } void -lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *src, ptl_size_t len) +lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset, + char *src, ptl_size_t len) { ptl_size_t nob; char *addr; + if (len == 0) + return; + LASSERT (!in_interrupt ()); - while (len > 0) - { + + LASSERT (niov > 0); + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + kiov++; + niov--; + LASSERT (niov > 0); + } + + do { LASSERT (niov > 0); - nob = MIN (kiov->kiov_len, len); + nob = MIN (kiov->kiov_len - offset, len); - addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset; + addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset; memcpy (addr, src, nob); kunmap (kiov->kiov_page); @@ -425,23 +476,21 @@ lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *src, ptl_size_t len) src += nob; niov--; kiov++; - } + offset = 0; + } while (len > 0); } -static int -lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md, +int +lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, + int src_niov, ptl_kiov_t *src, ptl_size_t offset, ptl_size_t len) { /* Initialise 'dst' to the subset of 'src' starting at
'offset', * for exactly 'len' bytes, and return the number of entries. * NB not destructive to 'src' */ - int src_niov = md->md_niov; - ptl_kiov_t *src = md->md_iov.kiov; ptl_size_t frag_len; - int dst_niov; + int niov; - LASSERT (offset + len <= md->length); - if (len == 0) /* no data => */ return (0); /* no frags */ @@ -453,10 +502,10 @@ lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md, LASSERT (src_niov > 0); } - dst_niov = 1; + niov = 1; for (;;) { LASSERT (src_niov > 0); - LASSERT (dst_niov <= PTL_MD_MAX_IOV); + LASSERT (niov <= dst_niov); frag_len = src->kiov_len - offset; dst->kiov_page = src->kiov_page; @@ -465,7 +514,7 @@ lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md, if (len <= frag_len) { dst->kiov_len = len; LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_SIZE); - return (dst_niov); + return (niov); } dst->kiov_len = frag_len; @@ -474,73 +523,66 @@ lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md, len -= frag_len; dst++; src++; - dst_niov++; + niov++; src_niov--; offset = 0; } } #endif -void +ptl_err_t lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen) { - int niov; - if (mlen == 0) - nal->cb_recv (nal, private, msg, 0, NULL, 0, rlen); - else if ((md->options & PTL_MD_KIOV) == 0) { - niov = lib_extract_iov (msg->msg_iov.iov, md, offset, mlen); - nal->cb_recv (nal, private, msg, - niov, msg->msg_iov.iov, mlen, rlen); - } else { - niov = lib_extract_kiov (msg->msg_iov.kiov, md, offset, mlen); - nal->cb_recv_pages (nal, private, msg, - niov, msg->msg_iov.kiov, mlen, rlen); - } + return (nal->cb_recv(nal, private, msg, + 0, NULL, + offset, mlen, rlen)); + + if ((md->options & PTL_MD_KIOV) == 0) + return (nal->cb_recv(nal, private, msg, + md->md_niov, md->md_iov.iov, + offset, mlen, rlen)); + + return (nal->cb_recv_pages(nal, private, msg, + md->md_niov, md->md_iov.kiov, + offset, mlen, rlen)); } -int +ptl_err_t lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg, ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, lib_md_t *md, ptl_size_t offset, ptl_size_t len) { - int niov; - if (len == 0) - return (nal->cb_send (nal, private, msg, - hdr, type, nid, pid, - 0, NULL, 0)); + return (nal->cb_send(nal, private, msg, + hdr, type, nid, pid, + 0, NULL, + offset, len)); - if ((md->options & PTL_MD_KIOV) == 0) { - niov = lib_extract_iov (msg->msg_iov.iov, md, offset, len); - return (nal->cb_send (nal, private, msg, - hdr, type, nid, pid, - niov, msg->msg_iov.iov, len)); - } - - niov = lib_extract_kiov (msg->msg_iov.kiov, md, offset, len); - return (nal->cb_send_pages (nal, private, msg, - hdr, type, nid, pid, - niov, msg->msg_iov.kiov, len)); + if ((md->options & PTL_MD_KIOV) == 0) + return (nal->cb_send(nal, private, msg, + hdr, type, nid, pid, + md->md_niov, md->md_iov.iov, + offset, len)); + + return (nal->cb_send_pages(nal, private, msg, + hdr, type, nid, pid, + md->md_niov, md->md_iov.kiov, + offset, len)); } -static lib_msg_t * -get_new_msg (nal_cb_t *nal, lib_md_t *md) +static void +lib_commit_md (nal_cb_t *nal, lib_md_t *md, lib_msg_t *msg) { /* ALWAYS called holding the state_lock */ lib_counters_t *counters = &nal->ni.counters; - lib_msg_t *msg = lib_msg_alloc (nal); - - if (msg == NULL) - return (NULL); - - memset (msg, 0, sizeof (*msg)); - - msg->send_ack = 0; + /* Here, we commit the MD to a network OP by marking it busy and + * decrementing its threshold. Come what may, the network "owns" + * the MD until a call to lib_finalize() signals completion. 
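Spelled out, the commit step that replaces get_new_msg() does exactly two things to the MD; the event fill-in and counters hang off it separately. A stand-in sketch, with field and constant names abridged from lib-types.h:

    #include <assert.h>

    #define MD_THRESH_INF (-1)   /* stands in for PTL_MD_THRESH_INF */

    struct md_counts {
            int pending;         /* network ops in flight on this MD */
            int threshold;       /* ops still allowed, or MD_THRESH_INF */
    };

    /* Commit an MD to one network operation: bump 'pending' so unlink
     * is deferred until completion, and burn one threshold unit unless
     * the threshold is infinite. */
    static void
    commit_md(struct md_counts *md)
    {
            md->pending++;
            if (md->threshold != MD_THRESH_INF) {
                    assert(md->threshold > 0);
                    md->threshold--;
            }
    }

Because 'pending' is raised before the state lock is dropped, do_PtlMDUnlink() above can treat a nonzero count as a promise that a completion event is still owed, which is what lets the old PTL_MD_INUSE return go away.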
*/ msg->md = md; - do_gettimeofday(&msg->ev.arrival_time); + md->pending++; if (md->threshold != PTL_MD_THRESH_INF) { LASSERT (md->threshold > 0); @@ -552,8 +594,24 @@ get_new_msg (nal_cb_t *nal, lib_md_t *md) counters->msgs_max = counters->msgs_alloc; list_add (&msg->msg_list, &nal->ni.ni_active_msgs); +} - return (msg); +static void +lib_drop_message (nal_cb_t *nal, void *private, ptl_hdr_t *hdr) +{ + unsigned long flags; + + /* CAVEAT EMPTOR: this only drops messages that we've not committed + * to receive (init_msg() not called) and therefore can't cause an + * event. */ + + state_lock(nal, &flags); + nal->ni.counters.drop_count++; + nal->ni.counters.drop_length += hdr->payload_length; + state_unlock(nal, &flags); + + /* NULL msg => if NAL calls lib_finalize it will be a noop */ + (void) lib_recv(nal, private, NULL, NULL, 0, 0, hdr->payload_length); } /* @@ -563,17 +621,18 @@ get_new_msg (nal_cb_t *nal, lib_md_t *md) * of long messages. * */ -static int parse_put(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) +static ptl_err_t +parse_put(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) { lib_ni_t *ni = &nal->ni; ptl_size_t mlength = 0; ptl_size_t offset = 0; int unlink = 0; + ptl_err_t rc; lib_me_t *me; lib_md_t *md; - lib_msg_t *msg; unsigned long flags; - + /* Convert put fields to host byte order */ hdr->msg.put.match_bits = NTOH__u64 (hdr->msg.put.match_bits); hdr->msg.put.ptl_index = NTOH__u32 (hdr->msg.put.ptl_index); @@ -586,8 +645,10 @@ static int parse_put(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) hdr->payload_length, hdr->msg.put.offset, hdr->msg.put.match_bits, &mlength, &offset, &unlink); - if (me == NULL) - goto drop; + if (me == NULL) { + state_unlock(nal, &flags); + return (PTL_FAIL); + } md = me->md; CDEBUG(D_NET, "Incoming put index %x from "LPU64"/%u of length %d/%d " @@ -595,69 +656,46 @@ static int parse_put(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) hdr->src_nid, hdr->src_pid, mlength, hdr->payload_length, md->md_lh.lh_cookie, md->md_niov, offset); - msg = get_new_msg (nal, md); - if (msg == NULL) { - CERROR(LPU64": Dropping PUT from "LPU64": can't allocate msg\n", - ni->nid, hdr->src_nid); - goto drop; - } + lib_commit_md(nal, md, msg); + + msg->ev.type = PTL_EVENT_PUT; + msg->ev.initiator.nid = hdr->src_nid; + msg->ev.initiator.pid = hdr->src_pid; + msg->ev.portal = hdr->msg.put.ptl_index; + msg->ev.match_bits = hdr->msg.put.match_bits; + msg->ev.rlength = hdr->payload_length; + msg->ev.mlength = mlength; + msg->ev.offset = offset; + msg->ev.hdr_data = hdr->msg.put.hdr_data; + + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); if (!ptl_is_wire_handle_none(&hdr->msg.put.ack_wmd) && !(md->options & PTL_MD_ACK_DISABLE)) { - msg->send_ack = 1; msg->ack_wmd = hdr->msg.put.ack_wmd; - msg->nid = hdr->src_nid; - msg->pid = hdr->src_pid; - msg->ev.match_bits = hdr->msg.put.match_bits; - } - - if (md->eq) { - msg->ev.type = PTL_EVENT_PUT; - msg->ev.initiator.nid = hdr->src_nid; - msg->ev.initiator.pid = hdr->src_pid; - msg->ev.portal = hdr->msg.put.ptl_index; - msg->ev.match_bits = hdr->msg.put.match_bits; - msg->ev.rlength = hdr->payload_length; - msg->ev.mlength = mlength; - msg->ev.offset = offset; - msg->ev.hdr_data = hdr->msg.put.hdr_data; - - /* NB if this match has exhausted the MD, we can't be sure - * that this event will the the last one associated with - * this MD in the event queue (another message already - * matching this ME/MD could end up being last). 
So we - * remember the ME handle anyway and check again when we're - * allocating our slot in the event queue. - */ - ptl_me2handle (&msg->ev.unlinked_me, me); - - lib_md_deconstruct(nal, md, &msg->ev.mem_desc); } ni->counters.recv_count++; ni->counters.recv_length += mlength; - /* only unlink after MD's pending count has been bumped - * in get_new_msg() otherwise lib_me_unlink() will nuke it */ - if (unlink) { - md->md_flags |= PTL_MD_FLAG_AUTO_UNLINKED; + /* only unlink after MD's pending count has been bumped in + * lib_commit_md() otherwise lib_me_unlink() will nuke it */ + if (unlink) lib_me_unlink (nal, me); - } state_unlock(nal, &flags); - lib_recv (nal, private, msg, md, offset, mlength, hdr->payload_length); - return 0; + rc = lib_recv(nal, private, msg, md, offset, mlength, + hdr->payload_length); + if (rc != PTL_OK) + CERROR(LPU64": error on receiving PUT from "LPU64": %d\n", + ni->nid, hdr->src_nid, rc); - drop: - nal->ni.counters.drop_count++; - nal->ni.counters.drop_length += hdr->payload_length; - state_unlock (nal, &flags); - lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length); - return -1; + return (rc); } -static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) +static ptl_err_t +parse_get(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) { lib_ni_t *ni = &nal->ni; ptl_size_t mlength = 0; @@ -665,7 +703,6 @@ static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) int unlink = 0; lib_me_t *me; lib_md_t *md; - lib_msg_t *msg; ptl_hdr_t reply; unsigned long flags; int rc; @@ -683,8 +720,10 @@ static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) hdr->msg.get.sink_length, hdr->msg.get.src_offset, hdr->msg.get.match_bits, &mlength, &offset, &unlink); - if (me == NULL) - goto drop; + if (me == NULL) { + state_unlock(nal, &flags); + return (PTL_FAIL); + } md = me->md; CDEBUG(D_NET, "Incoming get index %d from "LPU64".%u of length %d/%d " @@ -692,45 +731,27 @@ static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) hdr->src_nid, hdr->src_pid, mlength, hdr->payload_length, md->md_lh.lh_cookie, md->md_niov, offset); - msg = get_new_msg (nal, md); - if (msg == NULL) { - CERROR(LPU64": Dropping GET from "LPU64": can't allocate msg\n", - ni->nid, hdr->src_nid); - goto drop; - } + lib_commit_md(nal, md, msg); - if (md->eq) { - msg->ev.type = PTL_EVENT_GET; - msg->ev.initiator.nid = hdr->src_nid; - msg->ev.initiator.pid = hdr->src_pid; - msg->ev.portal = hdr->msg.get.ptl_index; - msg->ev.match_bits = hdr->msg.get.match_bits; - msg->ev.rlength = hdr->payload_length; - msg->ev.mlength = mlength; - msg->ev.offset = offset; - msg->ev.hdr_data = 0; - - /* NB if this match has exhausted the MD, we can't be sure - * that this event will the the last one associated with - * this MD in the event queue (another message already - * matching this ME/MD could end up being last). So we - * remember the ME handle anyway and check again when we're - * allocating our slot in the event queue. 
- */ - ptl_me2handle (&msg->ev.unlinked_me, me); - - lib_md_deconstruct(nal, md, &msg->ev.mem_desc); - } + msg->ev.type = PTL_EVENT_GET; + msg->ev.initiator.nid = hdr->src_nid; + msg->ev.initiator.pid = hdr->src_pid; + msg->ev.portal = hdr->msg.get.ptl_index; + msg->ev.match_bits = hdr->msg.get.match_bits; + msg->ev.rlength = hdr->payload_length; + msg->ev.mlength = mlength; + msg->ev.offset = offset; + msg->ev.hdr_data = 0; + + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); ni->counters.send_count++; ni->counters.send_length += mlength; - /* only unlink after MD's refcount has been bumped - * in get_new_msg() otherwise lib_me_unlink() will nuke it */ - if (unlink) { - md->md_flags |= PTL_MD_FLAG_AUTO_UNLINKED; + /* only unlink after MD's refcount has been bumped in + * lib_commit_md() otherwise lib_me_unlink() will nuke it */ + if (unlink) lib_me_unlink (nal, me); - } state_unlock(nal, &flags); @@ -749,36 +770,25 @@ static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) rc = lib_send (nal, private, msg, &reply, PTL_MSG_REPLY, hdr->src_nid, hdr->src_pid, md, offset, mlength); - if (rc != PTL_OK) { - CERROR(LPU64": Dropping GET from "LPU64": send REPLY failed\n", - ni->nid, hdr->src_nid); - /* Hmm, this will create a GET event and make believe - * the reply completed, which it kind of did, only the - * source won't get her reply */ - lib_finalize (nal, private, msg); - state_lock (nal, &flags); - goto drop; - } + if (rc != PTL_OK) + CERROR(LPU64": Unable to send REPLY for GET from "LPU64": %d\n", + ni->nid, hdr->src_nid, rc); + + /* Discard any junk after the hdr */ + (void) lib_recv(nal, private, NULL, NULL, 0, 0, hdr->payload_length); - /* Complete the incoming message */ - lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length); return (rc); - drop: - ni->counters.drop_count++; - ni->counters.drop_length += hdr->msg.get.sink_length; - state_unlock(nal, &flags); - lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length); - return -1; } -static int parse_reply(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) +static ptl_err_t +parse_reply(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) { lib_ni_t *ni = &nal->ni; lib_md_t *md; int rlength; int length; - lib_msg_t *msg; unsigned long flags; + ptl_err_t rc; state_lock(nal, &flags); @@ -790,7 +800,9 @@ static int parse_reply(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) md == NULL ? 
"invalid" : "inactive", hdr->msg.reply.dst_wmd.wh_interface_cookie, hdr->msg.reply.dst_wmd.wh_object_cookie); - goto drop; + + state_unlock(nal, &flags); + return (PTL_FAIL); } LASSERT (md->offset == 0); @@ -804,7 +816,8 @@ static int parse_reply(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) ni->nid, hdr->src_nid, length, hdr->msg.reply.dst_wmd.wh_object_cookie, md->length); - goto drop; + state_unlock(nal, &flags); + return (PTL_FAIL); } length = md->length; } @@ -813,46 +826,36 @@ static int parse_reply(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) hdr->src_nid, length, rlength, hdr->msg.reply.dst_wmd.wh_object_cookie); - msg = get_new_msg (nal, md); - if (msg == NULL) { - CERROR(LPU64": Dropping REPLY from "LPU64": can't " - "allocate msg\n", ni->nid, hdr->src_nid); - goto drop; - } + lib_commit_md(nal, md, msg); - if (md->eq) { - msg->ev.type = PTL_EVENT_REPLY; - msg->ev.initiator.nid = hdr->src_nid; - msg->ev.initiator.pid = hdr->src_pid; - msg->ev.rlength = rlength; - msg->ev.mlength = length; - msg->ev.offset = 0; + msg->ev.type = PTL_EVENT_REPLY; + msg->ev.initiator.nid = hdr->src_nid; + msg->ev.initiator.pid = hdr->src_pid; + msg->ev.rlength = rlength; + msg->ev.mlength = length; + msg->ev.offset = 0; - lib_md_deconstruct(nal, md, &msg->ev.mem_desc); - } + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); ni->counters.recv_count++; ni->counters.recv_length += length; state_unlock(nal, &flags); - lib_recv (nal, private, msg, md, 0, length, rlength); - return 0; + rc = lib_recv(nal, private, msg, md, 0, length, rlength); + if (rc != PTL_OK) + CERROR(LPU64": error on receiving REPLY from "LPU64": %d\n", + ni->nid, hdr->src_nid, rc); - drop: - nal->ni.counters.drop_count++; - nal->ni.counters.drop_length += hdr->payload_length; - state_unlock (nal, &flags); - lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length); - return -1; + return (rc); } -static int parse_ack(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) +static ptl_err_t +parse_ack(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg) { - lib_ni_t *ni = &nal->ni; - lib_md_t *md; - lib_msg_t *msg = NULL; - unsigned long flags; + lib_ni_t *ni = &nal->ni; + lib_md_t *md; + unsigned long flags; /* Convert ack fields to host byte order */ hdr->msg.ack.match_bits = NTOH__u64 (hdr->msg.ack.match_bits); @@ -868,40 +871,37 @@ static int parse_ack(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) (md == NULL) ? 
"invalid" : "inactive", hdr->msg.ack.dst_wmd.wh_interface_cookie, hdr->msg.ack.dst_wmd.wh_object_cookie); - goto drop; + + state_unlock(nal, &flags); + return (PTL_FAIL); } CDEBUG(D_NET, LPU64": ACK from "LPU64" into md "LPX64"\n", ni->nid, hdr->src_nid, hdr->msg.ack.dst_wmd.wh_object_cookie); - msg = get_new_msg (nal, md); - if (msg == NULL) { - CERROR(LPU64": Dropping ACK from "LPU64": can't allocate msg\n", - ni->nid, hdr->src_nid); - goto drop; - } + lib_commit_md(nal, md, msg); - if (md->eq) { - msg->ev.type = PTL_EVENT_ACK; - msg->ev.initiator.nid = hdr->src_nid; - msg->ev.initiator.pid = hdr->src_pid; - msg->ev.mlength = hdr->msg.ack.mlength; - msg->ev.match_bits = hdr->msg.ack.match_bits; + msg->ev.type = PTL_EVENT_ACK; + msg->ev.initiator.nid = hdr->src_nid; + msg->ev.initiator.pid = hdr->src_pid; + msg->ev.mlength = hdr->msg.ack.mlength; + msg->ev.match_bits = hdr->msg.ack.match_bits; - lib_md_deconstruct(nal, md, &msg->ev.mem_desc); - } + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); ni->counters.recv_count++; - state_unlock(nal, &flags); - lib_recv (nal, private, msg, NULL, 0, 0, hdr->payload_length); - return 0; - drop: - nal->ni.counters.drop_count++; - state_unlock (nal, &flags); - lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length); - return -1; + state_unlock(nal, &flags); + + /* We have received and matched up the ack OK, create the + * completion event now... */ + lib_finalize(nal, private, msg, PTL_OK); + + /* ...and now discard any junk after the hdr */ + (void) lib_recv(nal, private, NULL, NULL, 0, 0, hdr->payload_length); + + return (PTL_OK); } static char * @@ -983,10 +983,13 @@ void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr) } /* end of print_hdr() */ -int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) +void +lib_parse(nal_cb_t *nal, ptl_hdr_t *hdr, void *private) { unsigned long flags; - + ptl_err_t rc; + lib_msg_t *msg; + /* convert common fields to host byte order */ hdr->dest_nid = NTOH__u64 (hdr->dest_nid); hdr->src_nid = NTOH__u64 (hdr->src_nid); @@ -1008,22 +1011,16 @@ int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) nal->ni.nid, mv->magic, mv->version_major, mv->version_minor, hdr->src_nid); - lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length); - return (-1); + lib_drop_message(nal, private, hdr); + return; } if (hdr->dest_nid != nal->ni.nid) { CERROR(LPU64": Dropping %s message from "LPU64" to "LPU64 " (not me)\n", nal->ni.nid, hdr_type_string (hdr), hdr->src_nid, hdr->dest_nid); - - state_lock (nal, &flags); - nal->ni.counters.drop_count++; - nal->ni.counters.drop_length += hdr->payload_length; - state_unlock (nal, &flags); - - lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length); - return (-1); + lib_drop_message(nal, private, hdr); + return; } if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */ @@ -1033,34 +1030,59 @@ int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) ": simulated failure\n", nal->ni.nid, hdr_type_string (hdr), hdr->src_nid); - lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length); - return (-1); + lib_drop_message(nal, private, hdr); + return; } - + + msg = lib_msg_alloc(nal); + if (msg == NULL) { + CERROR(LPU64": Dropping incoming %s from "LPU64 + ": can't allocate a lib_msg_t\n", + nal->ni.nid, hdr_type_string (hdr), + hdr->src_nid); + lib_drop_message(nal, private, hdr); + return; + } + + do_gettimeofday(&msg->ev.arrival_time); + switch (hdr->type) { case PTL_MSG_ACK: - return (parse_ack(nal, hdr, private)); + rc = parse_ack(nal, hdr, 
private, msg); + break; case PTL_MSG_PUT: - return (parse_put(nal, hdr, private)); + rc = parse_put(nal, hdr, private, msg); break; case PTL_MSG_GET: - return (parse_get(nal, hdr, private)); + rc = parse_get(nal, hdr, private, msg); break; case PTL_MSG_REPLY: - return (parse_reply(nal, hdr, private)); + rc = parse_reply(nal, hdr, private, msg); break; default: CERROR(LPU64": Dropping message from "LPU64 ": Bad type=0x%x\n", nal->ni.nid, hdr->src_nid, hdr->type); - - lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length); - return (-1); + rc = PTL_FAIL; + break; + } + + if (rc != PTL_OK) { + if (msg->md != NULL) { + /* committed... */ + lib_finalize(nal, private, msg, rc); + } else { + state_lock(nal, &flags); + lib_msg_free(nal, msg); /* expects state_lock held */ + state_unlock(nal, &flags); + + lib_drop_message(nal, private, hdr); + } } } - -int do_PtlPut(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +int +do_PtlPut(nal_cb_t *nal, void *private, void *v_args, void *v_ret) { /* * Incoming: @@ -1075,16 +1097,15 @@ int do_PtlPut(nal_cb_t * nal, void *private, void *v_args, void *v_ret) * Outgoing: */ - PtlPut_in *args = v_args; - PtlPut_out *ret = v_ret; - ptl_hdr_t hdr; - - lib_ni_t *ni = &nal->ni; - lib_md_t *md; - lib_msg_t *msg = NULL; + PtlPut_in *args = v_args; ptl_process_id_t *id = &args->target_in; - unsigned long flags; - int rc; + PtlPut_out *ret = v_ret; + lib_ni_t *ni = &nal->ni; + lib_msg_t *msg; + ptl_hdr_t hdr; + lib_md_t *md; + unsigned long flags; + int rc; if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */ fail_peer (nal, id->nid, 1)) /* shall we now? */ @@ -1093,13 +1114,22 @@ int do_PtlPut(nal_cb_t * nal, void *private, void *v_args, void *v_ret) nal->ni.nid, id->nid); return (ret->rc = PTL_INV_PROC); } - - ret->rc = PTL_OK; + + msg = lib_msg_alloc(nal); + if (msg == NULL) { + CERROR(LPU64": Dropping PUT to "LPU64": ENOMEM on lib_msg_t\n", + ni->nid, id->nid); + return (ret->rc = PTL_NOSPACE); + } + state_lock(nal, &flags); + md = ptl_handle2md(&args->md_in, nal); - if (md == NULL || !md->threshold) { + if (md == NULL || md->threshold == 0) { + lib_msg_free(nal, msg); state_unlock(nal, &flags); - return ret->rc = PTL_INV_MD; + + return (ret->rc = PTL_INV_MD); } CDEBUG(D_NET, "PtlPut -> %Lu: %lu\n", (unsigned long long)id->nid, @@ -1126,57 +1156,39 @@ int do_PtlPut(nal_cb_t * nal, void *private, void *v_args, void *v_ret) hdr.msg.put.offset = HTON__u32 (args->offset_in); hdr.msg.put.hdr_data = args->hdr_data_in; + lib_commit_md(nal, md, msg); + + msg->ev.type = PTL_EVENT_SENT; + msg->ev.initiator.nid = ni->nid; + msg->ev.initiator.pid = ni->pid; + msg->ev.portal = args->portal_in; + msg->ev.match_bits = args->match_bits_in; + msg->ev.rlength = md->length; + msg->ev.mlength = md->length; + msg->ev.offset = args->offset_in; + msg->ev.hdr_data = args->hdr_data_in; + + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + ni->counters.send_count++; ni->counters.send_length += md->length; - msg = get_new_msg (nal, md); - if (msg == NULL) { - CERROR("BAD: could not allocate msg!\n"); - state_unlock(nal, &flags); - return ret->rc = PTL_NOSPACE; - } - - /* - * If this memory descriptor has an event queue associated with - * it we need to allocate a message state object and record the - * information about this operation that will be recorded into - * event queue once the message has been completed. - * - * NB. We're now committed to the GET, since we just marked the MD - * busy. 
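One consequence of committing via lib_commit_md() shows in the restructured do_PtlPut()/do_PtlGet() entry points: the lib_msg_t is now allocated before the state lock is taken, and freed again under the lock if the MD check fails, so the out-of-memory case returns PTL_NOSPACE before anything is committed. The shape of that pattern as a self-contained userspace sketch, with a plain mutex and malloc standing in for state_lock and lib_msg_alloc:

    #include <pthread.h>
    #include <stdlib.h>

    static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Returns 0 on success, -1 for no memory, -2 for an unusable MD
     * (mirroring PTL_OK / PTL_NOSPACE / PTL_INV_MD). */
    static int
    send_op(int md_usable)
    {
            void *msg = malloc(64);        /* lib_msg_alloc() stand-in */

            if (msg == NULL)
                    return -1;             /* nothing committed yet */

            pthread_mutex_lock(&state_lock);
            if (!md_usable) {              /* md == NULL || threshold == 0 */
                    free(msg);             /* lib_msg_free() stand-in */
                    pthread_mutex_unlock(&state_lock);
                    return -2;
            }
            /* ... lib_commit_md() and event fill-in happen here ... */
            pthread_mutex_unlock(&state_lock);

            /* the send happens outside the lock; on failure the committed
             * msg is finalized rather than freed, so an event still fires */
            return 0;
    }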
Callers who observe this (by getting PTL_MD_INUSE from - * PtlMDUnlink()) expect a completion event to tell them when the - * MD becomes idle. - */ - if (md->eq) { - msg->ev.type = PTL_EVENT_SENT; - msg->ev.initiator.nid = ni->nid; - msg->ev.initiator.pid = ni->pid; - msg->ev.portal = args->portal_in; - msg->ev.match_bits = args->match_bits_in; - msg->ev.rlength = md->length; - msg->ev.mlength = md->length; - msg->ev.offset = args->offset_in; - msg->ev.hdr_data = args->hdr_data_in; - - lib_md_deconstruct(nal, md, &msg->ev.mem_desc); - } - state_unlock(nal, &flags); rc = lib_send (nal, private, msg, &hdr, PTL_MSG_PUT, id->nid, id->pid, md, 0, md->length); if (rc != PTL_OK) { - /* get_new_msg() committed us to sending by decrementing - * md->threshold, so we have to act like we did send, but - * the network dropped it. */ - lib_finalize (nal, private, msg); + CERROR(LPU64": error sending PUT to "LPU64": %d\n", + ni->nid, id->nid, rc); + lib_finalize (nal, private, msg, rc); } + /* completion will be signalled by an event */ return ret->rc = PTL_OK; } -lib_msg_t * lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, - lib_md_t *getmd) +lib_msg_t * +lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, lib_md_t *getmd) { /* The NAL can DMA direct to the GET md (i.e. no REPLY msg). This * returns a msg the NAL can pass to lib_finalize() so that a REPLY @@ -1188,39 +1200,38 @@ lib_msg_t * lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, * lib_finalize() of the original GET. */ lib_ni_t *ni = &nal->ni; - lib_msg_t *msg; + lib_msg_t *msg = lib_msg_alloc(nal); unsigned long flags; state_lock(nal, &flags); LASSERT (getmd->pending > 0); + if (msg == NULL) { + CERROR ("Dropping REPLY from "LPU64": can't allocate msg\n", + peer_nid); + goto drop; + } + if (getmd->threshold == 0) { CERROR ("Dropping REPLY from "LPU64" for inactive MD %p\n", peer_nid, getmd); - goto drop; + goto drop_msg; } LASSERT (getmd->offset == 0); CDEBUG(D_NET, "Reply from "LPU64" md %p\n", peer_nid, getmd); - msg = get_new_msg (nal, getmd); - if (msg == NULL) { - CERROR("Dropping REPLY from "LPU64" md %p: can't allocate msg\n", - peer_nid, getmd); - goto drop; - } + lib_commit_md (nal, getmd, msg); - if (getmd->eq) { - msg->ev.type = PTL_EVENT_REPLY; - msg->ev.initiator.nid = peer_nid; - msg->ev.initiator.pid = 0; /* XXX FIXME!!! */ - msg->ev.rlength = msg->ev.mlength = getmd->length; - msg->ev.offset = 0; + msg->ev.type = PTL_EVENT_REPLY; + msg->ev.initiator.nid = peer_nid; + msg->ev.initiator.pid = 0; /* XXX FIXME!!! 
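
do_PtlPut() now follows a strict commit-before-send discipline: the MD is committed and the SENT event fully staged while the state lock is held, the wire send happens outside the lock, and a failed send is finalized with the error status so the commitment still produces its event. A sketch of the shape of that flow; every ex_* name is an illustrative stand-in, not the library's API:

    struct ex_state;
    struct ex_md;
    struct ex_msg;

    extern void ex_lock(struct ex_state *s);
    extern void ex_unlock(struct ex_state *s);
    extern void ex_commit_md(struct ex_state *s, struct ex_md *md,
                             struct ex_msg *msg);
    extern void ex_stage_sent_event(struct ex_msg *msg);
    extern int  ex_send(struct ex_state *s, struct ex_msg *msg);
    extern void ex_finalize(struct ex_state *s, struct ex_msg *msg, int rc);

    static int ex_do_put(struct ex_state *s, struct ex_md *md,
                         struct ex_msg *msg)
    {
            ex_lock(s);
            ex_commit_md(s, md, msg);    /* MD now owns a ref on msg */
            ex_stage_sent_event(msg);    /* event content fixed before sending */
            ex_unlock(s);

            if (ex_send(s, msg) != 0)
                    ex_finalize(s, msg, -1); /* error event, never silent loss */

            return 0;  /* outcome is reported via the event, not here */
    }
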
*/ + msg->ev.rlength = msg->ev.mlength = getmd->length; + msg->ev.offset = 0; - lib_md_deconstruct(nal, getmd, &msg->ev.mem_desc); - } + lib_md_deconstruct(nal, getmd, &msg->ev.mem_desc); ni->counters.recv_count++; ni->counters.recv_length += getmd->length; @@ -1228,7 +1239,9 @@ lib_msg_t * lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, state_unlock(nal, &flags); return msg; - + + drop_msg: + lib_msg_free(nal, msg); drop: nal->ni.counters.drop_count++; nal->ni.counters.drop_length += getmd->length; @@ -1238,7 +1251,8 @@ lib_msg_t * lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, return NULL; } -int do_PtlGet(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +int +do_PtlGet(nal_cb_t *nal, void *private, void *v_args, void *v_ret) { /* * Incoming: @@ -1252,15 +1266,15 @@ int do_PtlGet(nal_cb_t * nal, void *private, void *v_args, void *v_ret) * Outgoing: */ - PtlGet_in *args = v_args; - PtlGet_out *ret = v_ret; - ptl_hdr_t hdr; - lib_msg_t *msg = NULL; - lib_ni_t *ni = &nal->ni; + PtlGet_in *args = v_args; ptl_process_id_t *id = &args->target_in; - lib_md_t *md; - unsigned long flags; - int rc; + PtlGet_out *ret = v_ret; + lib_ni_t *ni = &nal->ni; + lib_msg_t *msg; + ptl_hdr_t hdr; + lib_md_t *md; + unsigned long flags; + int rc; if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */ fail_peer (nal, id->nid, 1)) /* shall we now? */ @@ -1269,16 +1283,24 @@ int do_PtlGet(nal_cb_t * nal, void *private, void *v_args, void *v_ret) nal->ni.nid, id->nid); return (ret->rc = PTL_INV_PROC); } - + + msg = lib_msg_alloc(nal); + if (msg == NULL) { + CERROR(LPU64": Dropping GET to "LPU64": ENOMEM on lib_msg_t\n", + ni->nid, id->nid); + return (ret->rc = PTL_NOSPACE); + } + state_lock(nal, &flags); + md = ptl_handle2md(&args->md_in, nal); if (md == NULL || !md->threshold) { + lib_msg_free(nal, msg); state_unlock(nal, &flags); + return ret->rc = PTL_INV_MD; } - LASSERT (md->offset == 0); - CDEBUG(D_NET, "PtlGet -> %Lu: %lu\n", (unsigned long long)id->nid, (unsigned long)id->pid); @@ -1299,51 +1321,33 @@ int do_PtlGet(nal_cb_t * nal, void *private, void *v_args, void *v_ret) hdr.msg.get.src_offset = HTON__u32 (args->offset_in); hdr.msg.get.sink_length = HTON__u32 (md->length); - ni->counters.send_count++; + lib_commit_md(nal, md, msg); - msg = get_new_msg (nal, md); - if (msg == NULL) { - CERROR("do_PtlGet: BAD - could not allocate cookie!\n"); - state_unlock(nal, &flags); - return ret->rc = PTL_NOSPACE; - } + msg->ev.type = PTL_EVENT_SENT; + msg->ev.initiator.nid = ni->nid; + msg->ev.initiator.pid = ni->pid; + msg->ev.portal = args->portal_in; + msg->ev.match_bits = args->match_bits_in; + msg->ev.rlength = md->length; + msg->ev.mlength = md->length; + msg->ev.offset = args->offset_in; + msg->ev.hdr_data = 0; - /* - * If this memory descriptor has an event queue associated with - * it we must allocate a message state object that will record - * the information to be filled in once the message has been - * completed. More information is in the do_PtlPut() comments. - * - * NB. We're now committed to the GET, since we just marked the MD - * busy. Callers who observe this (by getting PTL_MD_INUSE from - * PtlMDUnlink()) expect a completion event to tell them when the - * MD becomes idle. 
- */ - if (md->eq) { - msg->ev.type = PTL_EVENT_SENT; - msg->ev.initiator.nid = ni->nid; - msg->ev.initiator.pid = ni->pid; - msg->ev.portal = args->portal_in; - msg->ev.match_bits = args->match_bits_in; - msg->ev.rlength = md->length; - msg->ev.mlength = md->length; - msg->ev.offset = args->offset_in; - msg->ev.hdr_data = 0; - - lib_md_deconstruct(nal, md, &msg->ev.mem_desc); - } + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + + ni->counters.send_count++; state_unlock(nal, &flags); rc = lib_send (nal, private, msg, &hdr, PTL_MSG_GET, id->nid, id->pid, NULL, 0, 0); if (rc != PTL_OK) { - /* get_new_msg() committed us to sending by decrementing - * md->threshold, so we have to act like we did send, but - * the network dropped it. */ - lib_finalize (nal, private, msg); + CERROR(LPU64": error sending GET to "LPU64": %d\n", + ni->nid, id->nid, rc); + lib_finalize (nal, private, msg, rc); } + /* completion will be signalled by an event */ return ret->rc = PTL_OK; } diff --git a/lustre/portals/portals/lib-msg.c b/lustre/portals/portals/lib-msg.c index 9840ff5..04c69b1 100644 --- a/lustre/portals/portals/lib-msg.c +++ b/lustre/portals/portals/lib-msg.c @@ -32,32 +32,81 @@ #include -int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t *msg) +void +lib_enq_event_locked (nal_cb_t *nal, void *private, + lib_eq_t *eq, ptl_event_t *ev) { - lib_md_t *md; - lib_eq_t *eq; + ptl_event_t *eq_slot; int rc; + + ev->sequence = eq->sequence++; /* Allocate the next queue slot */ + + /* size must be a power of 2 to handle a wrapped sequence # */ + LASSERT (eq->size != 0 && + eq->size == LOWEST_BIT_SET (eq->size)); + eq_slot = eq->base + (ev->sequence & (eq->size - 1)); + + /* Copy the event into the allocated slot, ensuring all the rest of + * the event's contents have been copied _before_ the sequence + * number gets updated. A processes 'getting' an event waits on + * the next queue slot's sequence to be 'new'. When it is, _all_ + * other event fields had better be consistent. I assert + * 'sequence' is the last member, so I only need a 2 stage copy. 
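
The slot arithmetic above is why an event queue's size must be a power of two: masking the free-running sequence number keeps producing valid, contiguous slot indices even after the counter wraps, which a modulo by a non-power-of-two size would not. A self-contained illustration, assuming nothing beyond the standard library:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    static unsigned slot_of(uint64_t seq, unsigned size)
    {
            /* size must be a power of 2, exactly as the LASSERT demands */
            assert(size != 0 && (size & (size - 1)) == 0);
            return (unsigned)(seq & (size - 1));
    }

    int main(void)
    {
            uint64_t seq = UINT64_MAX - 1;  /* straddle the wrap point */
            int i;

            for (i = 0; i < 4; i++, seq++)
                    printf("seq %llu -> slot %u\n",
                           (unsigned long long)seq, slot_of(seq, 8));
            return 0;  /* slots stay contiguous: 6, 7, 0, 1 */
    }
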
*/ + + LASSERT(sizeof (ptl_event_t) == + offsetof(ptl_event_t, sequence) + sizeof(ev->sequence)); + + rc = nal->cb_write (nal, private, (user_ptr)eq_slot, ev, + offsetof (ptl_event_t, sequence)); + LASSERT (rc == PTL_OK); + +#ifdef __KERNEL__ + barrier(); +#endif + /* Updating the sequence number is what makes the event 'new' NB if + * the cb_write below isn't atomic, this could cause a race with + * PtlEQGet */ + rc = nal->cb_write(nal, private, (user_ptr)&eq_slot->sequence, + (void *)&ev->sequence,sizeof (ev->sequence)); + LASSERT (rc == PTL_OK); + +#ifdef __KERNEL__ + barrier(); +#endif + + if (nal->cb_callback != NULL) + nal->cb_callback(nal, private, eq, ev); + else if (eq->event_callback != NULL) + eq->event_callback(ev); +} + +void +lib_finalize(nal_cb_t *nal, void *private, lib_msg_t *msg, ptl_err_t status) +{ + lib_md_t *md; + int unlink; unsigned long flags; + int rc; + ptl_hdr_t ack; /* ni went down while processing this message */ - if (nal->ni.up == 0) { - return -1; - } + if (nal->ni.up == 0) + return; if (msg == NULL) - return 0; + return; - rc = 0; - if (msg->send_ack) { - ptl_hdr_t ack; + /* Only send an ACK if the PUT completed successfully */ + if (status == PTL_OK && + !ptl_is_wire_handle_none(&msg->ack_wmd)) { - LASSERT (!ptl_is_wire_handle_none (&msg->ack_wmd)); + LASSERT(msg->ev.type == PTL_EVENT_PUT); memset (&ack, 0, sizeof (ack)); ack.type = HTON__u32 (PTL_MSG_ACK); - ack.dest_nid = HTON__u64 (msg->nid); + ack.dest_nid = HTON__u64 (msg->ev.initiator.nid); ack.src_nid = HTON__u64 (nal->ni.nid); - ack.dest_pid = HTON__u32 (msg->pid); + ack.dest_pid = HTON__u32 (msg->ev.initiator.pid); ack.src_pid = HTON__u32 (nal->ni.pid); ack.payload_length = 0; @@ -66,92 +115,35 @@ int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t *msg) ack.msg.ack.mlength = HTON__u32 (msg->ev.mlength); rc = lib_send (nal, private, NULL, &ack, PTL_MSG_ACK, - msg->nid, msg->pid, NULL, 0, 0); - /* If this send fails, there's nothing else to clean up */ + msg->ev.initiator.nid, msg->ev.initiator.pid, + NULL, 0, 0); + if (rc != PTL_OK) { + /* send failed: there's nothing else to clean up. */ + CERROR("Error %d sending ACK to "LPX64"\n", + rc, msg->ev.initiator.nid); + } } md = msg->md; - LASSERT (md->pending > 0); /* I've not dropped my ref yet */ - eq = md->eq; state_lock(nal, &flags); - if (eq != NULL) { - ptl_event_t *ev = &msg->ev; - ptl_event_t *eq_slot; - - /* I have to hold the lock while I bump the sequence number - * and copy the event into the queue. If not, and I was - * interrupted after bumping the sequence number, other - * events could fill the queue, including the slot I just - * allocated to this event. On resuming, I would overwrite - * a more 'recent' event with old event state, and - * processes taking events off the queue would not detect - * overflow correctly. - */ - - ev->sequence = eq->sequence++;/* Allocate the next queue slot */ - - /* size must be a power of 2 to handle a wrapped sequence # */ - LASSERT (eq->size != 0 && - eq->size == LOWEST_BIT_SET (eq->size)); - eq_slot = eq->base + (ev->sequence & (eq->size - 1)); - - /* Invalidate unlinked_me unless this is the last - * event for an auto-unlinked MD. 
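
The two-stage copy relies on ordering rather than locking on the consumer side: everything but `sequence` is written first, a barrier keeps the stores from reordering, and the final store to `sequence` is what publishes the slot. A minimal sketch of the same discipline in portable C11, using a release/acquire pair where the kernel code uses barrier():

    #include <stdatomic.h>
    #include <string.h>

    typedef struct {
            int payload[4];
            _Atomic unsigned long sequence;  /* written last: publishes slot */
    } ex_event_t;

    static void ex_publish(ex_event_t *slot, const int *payload,
                           unsigned long seq)
    {
            memcpy(slot->payload, payload, sizeof(slot->payload));
            /* release: payload is visible before the new sequence is */
            atomic_store_explicit(&slot->sequence, seq, memory_order_release);
    }

    /* Returns 1 and copies the payload out once 'seq' is published. */
    static int ex_try_consume(ex_event_t *slot, unsigned long seq, int *out)
    {
            if (atomic_load_explicit(&slot->sequence,
                                     memory_order_acquire) != seq)
                    return 0;
            memcpy(out, slot->payload, sizeof(slot->payload));
            return 1;
    }
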
Note that if md was - * auto-unlinked, md->pending can only decrease - */ - if ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINKED) == 0 || /* not auto-unlinked */ - md->pending != 1) /* not last ref */ - ev->unlinked_me = PTL_HANDLE_NONE; - - /* Copy the event into the allocated slot, ensuring all the - * rest of the event's contents have been copied _before_ - * the sequence number gets updated. A processes 'getting' - * an event waits on the next queue slot's sequence to be - * 'new'. When it is, _all_ other event fields had better - * be consistent. I assert 'sequence' is the last member, - * so I only need a 2 stage copy. - */ - LASSERT(sizeof (ptl_event_t) == - offsetof(ptl_event_t, sequence) + sizeof(ev->sequence)); - - rc = nal->cb_write (nal, private, (user_ptr)eq_slot, ev, - offsetof (ptl_event_t, sequence)); - LASSERT (rc == 0); - -#ifdef __KERNEL__ - barrier(); -#endif - /* Updating the sequence number is what makes the event 'new' */ - - /* cb_write is not necessarily atomic, so this could - cause a race with PtlEQGet */ - rc = nal->cb_write(nal, private, (user_ptr)&eq_slot->sequence, - (void *)&ev->sequence,sizeof (ev->sequence)); - LASSERT (rc == 0); + /* Now it's safe to drop my caller's ref */ + md->pending--; + LASSERT (md->pending >= 0); -#ifdef __KERNEL__ - barrier(); -#endif + /* Should I unlink this MD? */ + unlink = (md->pending == 0 && /* No other refs */ + (md->threshold == 0 || /* All ops done */ + md->md_flags & PTL_MD_FLAG_UNLINK) != 0); /* black spot */ - /* I must also ensure that (a) callbacks are made in the - * same order as the events land in the queue, and (b) the - * callback occurs before the event can be removed from the - * queue, so I can't drop the lock during the callback. */ - if (nal->cb_callback != NULL) - nal->cb_callback(nal, private, eq, ev); - else if (eq->event_callback != NULL) - (void)((eq->event_callback) (ev)); - } + msg->ev.status = status; + msg->ev.unlinked = unlink; - LASSERT ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINKED) == 0 || - (md->md_flags & PTL_MD_FLAG_UNLINK) != 0); + if (md->eq != NULL) + lib_enq_event_locked(nal, private, md->eq, &msg->ev); - md->pending--; - if (md->pending == 0 && /* no more outstanding operations on this md */ - (md->threshold == 0 || /* done its business */ - (md->md_flags & PTL_MD_FLAG_UNLINK) != 0)) /* marked for death */ + if (unlink) lib_md_unlink(nal, md); list_del (&msg->msg_list); @@ -159,6 +151,4 @@ int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t *msg) lib_msg_free(nal, msg); state_unlock(nal, &flags); - - return rc; } diff --git a/lustre/portals/unals/Makefile.am b/lustre/portals/unals/Makefile.am index dc427b0..6035ca1 100644 --- a/lustre/portals/unals/Makefile.am +++ b/lustre/portals/unals/Makefile.am @@ -1,5 +1,9 @@ CPPFLAGS= INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include -I$(srcdir) -lib_LIBRARIES = libtcpnal.a +noinst_LIBRARIES = libtcpnal.a pkginclude_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h ipmap.h bridge.h procbridge.h libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h + +if LIBLUSTRE +libtcpnal_a_CFLAGS = -fPIC +endif diff --git a/lustre/portals/unals/bridge.h b/lustre/portals/unals/bridge.h index 0b4940f..9a90ab8 100644 --- a/lustre/portals/unals/bridge.h +++ b/lustre/portals/unals/bridge.h @@ -6,6 +6,9 @@ * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ */ +#ifndef TCPNAL_PROCBRIDGE_H +#define TCPNAL_PROCBRIDGE_H + 
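
The unlink decision in lib_finalize() above collapses to a single predicate: the MD disappears only when the last outstanding operation drops its reference and the MD is either exhausted or explicitly marked for unlink. Restated as a standalone helper with stand-in field names (not the real lib_md_t):

    struct ex_md {
            int      pending;    /* operations still holding a reference */
            int      threshold;  /* remaining permitted operations; 0 = done */
            unsigned flags;
    };
    #define EX_MD_FLAG_UNLINK 0x1

    static int ex_should_unlink(const struct ex_md *md)
    {
            return md->pending == 0 &&                 /* no other refs    */
                   (md->threshold == 0 ||              /* all ops done     */
                    (md->flags & EX_MD_FLAG_UNLINK));  /* marked for death */
    }
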
#include typedef struct bridge { @@ -27,3 +30,5 @@ nal_t *bridge_init(ptl_interface_t nal, typedef int (*nal_initialize)(bridge); extern nal_initialize nal_table[PTL_IFACE_MAX]; + +#endif diff --git a/lustre/portals/unals/connection.c b/lustre/portals/unals/connection.c index 29e75be..ca6999a 100644 --- a/lustre/portals/unals/connection.c +++ b/lustre/portals/unals/connection.c @@ -309,7 +309,8 @@ tcpnal_hello (int sockfd, ptl_nid_t *nid, int type, __u64 incarnation) */ connection force_tcp_connection(manager m, unsigned int ip, - unsigned short port) + unsigned short port, + procbridge pb) { connection conn; struct sockaddr_in addr; @@ -357,6 +358,10 @@ connection force_tcp_connection(manager m, exit(-1); conn = allocate_connection(m, ip, port, fd); + + /* let nal thread know this event right away */ + if (conn) + procbridge_wakeup_nal(pb); } pthread_mutex_unlock(&m->conn_lock); diff --git a/lustre/portals/unals/connection.h b/lustre/portals/unals/connection.h index fb1eaab..343ffa6 100644 --- a/lustre/portals/unals/connection.h +++ b/lustre/portals/unals/connection.h @@ -7,6 +7,7 @@ */ #include +#include typedef struct manager { table connections; @@ -26,7 +27,8 @@ typedef struct connection { manager m; } *connection; -connection force_tcp_connection(manager m, unsigned int ip, unsigned int short); +connection force_tcp_connection(manager m, unsigned int ip, unsigned int short, + procbridge pb); manager init_connections(unsigned short, int (*f)(void *, void *), void *); void remove_connection(void *arg); void shutdown_connections(manager m); diff --git a/lustre/portals/unals/procapi.c b/lustre/portals/unals/procapi.c index 2a3fbd8..bddfe9a 100644 --- a/lustre/portals/unals/procapi.c +++ b/lustre/portals/unals/procapi.c @@ -32,12 +32,34 @@ #include #include #include +#ifndef __CYGWIN__ +#include +#endif +#include #include #include #include #include +/* XXX CFS workaround, to give a chance to let nal thread wake up + * from waiting in select + */ +static int procbridge_notifier_handler(void *arg) +{ + static char buf[8]; + procbridge p = (procbridge) arg; + + syscall(SYS_read, p->notifier[1], buf, sizeof(buf)); + return 1; +} + +void procbridge_wakeup_nal(procbridge p) +{ + static char buf[8]; + syscall(SYS_write, p->notifier[0], buf, sizeof(buf)); +} + /* Function: forward * Arguments: nal_t *nal: pointer to my top-side nal structure * id: the command to pass to the lower layer @@ -79,6 +101,7 @@ static int procbridge_shutdown(nal_t *n, int ni) procbridge p=(procbridge)b->local; p->nal_flags |= NAL_FLAG_STOPPING; + procbridge_wakeup_nal(p); do { pthread_mutex_lock(&p->mutex); @@ -104,6 +127,12 @@ static int procbridge_validate(nal_t *nal, void *base, size_t extent) } +/* FIXME cfs temporary workaround! 
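
The notifier pair added here is the classic self-pipe trick: the NAL thread parks in select() with one end of an AF_UNIX socketpair in its read set, and any thread that wants it to re-evaluate state writes a byte to the other end, making select() return immediately. A self-contained, single-threaded demonstration of the mechanism:

    #include <stdio.h>
    #include <sys/select.h>
    #include <sys/socket.h>
    #include <unistd.h>

    int main(void)
    {
            int notifier[2];
            char buf[8];
            fd_set rfds;

            if (socketpair(AF_UNIX, SOCK_STREAM, 0, notifier) != 0) {
                    perror("socketpair");
                    return 1;
            }

            /* what procbridge_wakeup_nal() would do from another thread */
            (void)write(notifier[0], buf, 1);

            FD_ZERO(&rfds);
            FD_SET(notifier[1], &rfds);
            select(notifier[1] + 1, &rfds, NULL, NULL, NULL); /* returns at once */
            (void)read(notifier[1], buf, sizeof(buf));        /* drain the byte */
            puts("select loop woken");
            return 0;
    }
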
FIXME + * global time out value + */ +int __tcpnal_eqwait_timeout_value = 0; +int __tcpnal_eqwait_timedout = 0; + /* Function: yield * Arguments: pid: * @@ -118,7 +147,19 @@ static void procbridge_yield(nal_t *n) procbridge p=(procbridge)b->local; pthread_mutex_lock(&p->mutex); - pthread_cond_wait(&p->cond,&p->mutex); + if (!__tcpnal_eqwait_timeout_value) { + pthread_cond_wait(&p->cond,&p->mutex); + } else { + struct timeval now; + struct timespec timeout; + + gettimeofday(&now, NULL); + timeout.tv_sec = now.tv_sec + __tcpnal_eqwait_timeout_value; + timeout.tv_nsec = now.tv_usec * 1000; + + __tcpnal_eqwait_timedout = + pthread_cond_timedwait(&p->cond, &p->mutex, &timeout); + } pthread_mutex_unlock(&p->mutex); } @@ -194,6 +235,19 @@ nal_t *procbridge_interface(int num_interface, p->nal_flags = 0; pthread_mutex_init(&p->nal_cb_lock, 0); + /* initialize notifier */ + if (socketpair(AF_UNIX, SOCK_STREAM, 0, p->notifier)) { + perror("socketpair failed"); + return NULL; + } + + if (!register_io_handler(p->notifier[1], READ_HANDLER, + procbridge_notifier_handler, p)) { + perror("fail to register notifier handler"); + return NULL; + } + + /* create nal thread */ if (pthread_create(&p->t, NULL, nal_thread, &args)) { perror("nal_init: pthread_create"); return(NULL); diff --git a/lustre/portals/unals/procbridge.h b/lustre/portals/unals/procbridge.h index 317e22f..965f83d 100644 --- a/lustre/portals/unals/procbridge.h +++ b/lustre/portals/unals/procbridge.h @@ -25,6 +25,9 @@ typedef struct procbridge { pthread_cond_t cond; pthread_mutex_t mutex; + /* socket pair used to notify nal thread */ + int notifier[2]; + int nal_flags; pthread_mutex_t nal_cb_lock; @@ -51,5 +54,6 @@ extern nal_t *procbridge_interface(int num_interface, ptl_pt_index_t ptl_size, ptl_ac_index_t acl_size, ptl_pid_t requested_pid); +extern void procbridge_wakeup_nal(procbridge p); #endif diff --git a/lustre/portals/unals/proclib.c b/lustre/portals/unals/proclib.c index 2627253..2a5ba0d 100644 --- a/lustre/portals/unals/proclib.c +++ b/lustre/portals/unals/proclib.c @@ -43,24 +43,24 @@ /* the following functions are stubs to satisfy the nal definition without doing anything particularily useful*/ -static int nal_write(nal_cb_t *nal, - void *private, - user_ptr dst_addr, - void *src_addr, - size_t len) +static ptl_err_t nal_write(nal_cb_t *nal, + void *private, + user_ptr dst_addr, + void *src_addr, + size_t len) { memcpy(dst_addr, src_addr, len); - return 0; + return PTL_OK; } -static int nal_read(nal_cb_t * nal, - void *private, - void *dst_addr, - user_ptr src_addr, - size_t len) +static ptl_err_t nal_read(nal_cb_t * nal, + void *private, + void *dst_addr, + user_ptr src_addr, + size_t len) { memcpy(dst_addr, src_addr, len); - return 0; + return PTL_OK; } static void *nal_malloc(nal_cb_t *nal, diff --git a/lustre/portals/unals/select.c b/lustre/portals/unals/select.c index fe24efc..c4ccae1 100644 --- a/lustre/portals/unals/select.c +++ b/lustre/portals/unals/select.c @@ -126,15 +126,6 @@ void select_timer_block(when until) timeout_pointer=&timeout; } else timeout_pointer=0; - - /* FIXME - * temporarily add timer for endless waiting problem. 
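
pthread_cond_timedwait() takes an absolute deadline, so the timed variant of the yield above has to convert "now plus timeout" from gettimeofday()'s timeval into a timespec, scaling microseconds to nanoseconds. A sketch of that conversion; no tv_nsec carry is needed because tv_usec * 1000 is always below one second:

    #include <pthread.h>
    #include <sys/time.h>

    /* Caller holds 'mtx'; returns 0 if signalled, ETIMEDOUT otherwise. */
    static int ex_wait_with_timeout(pthread_cond_t *cond,
                                    pthread_mutex_t *mtx, int secs)
    {
            struct timeval now;
            struct timespec deadline;

            gettimeofday(&now, NULL);
            deadline.tv_sec  = now.tv_sec + secs;
            deadline.tv_nsec = now.tv_usec * 1000;

            return pthread_cond_timedwait(cond, mtx, &deadline);
    }
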
- * FIXME - */ - timeout.tv_sec = 1; - timeout.tv_usec = 0; - timeout_pointer=&timeout; - FD_ZERO(&fds[0]); FD_ZERO(&fds[1]); FD_ZERO(&fds[2]); diff --git a/lustre/portals/unals/tcpnal.c b/lustre/portals/unals/tcpnal.c index 1041d1d..0c47f42 100644 --- a/lustre/portals/unals/tcpnal.c +++ b/lustre/portals/unals/tcpnal.c @@ -55,69 +55,69 @@ * * sends a packet to the peer, after insuring that a connection exists */ -int tcpnal_send(nal_cb_t *n, - void *private, - lib_msg_t *cookie, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int niov, - struct iovec *iov, - size_t len) +ptl_err_t tcpnal_send(nal_cb_t *n, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int niov, + struct iovec *iov, + size_t offset, + size_t len) { connection c; bridge b=(bridge)n->nal_data; struct iovec tiov[257]; static pthread_mutex_t send_lock = PTHREAD_MUTEX_INITIALIZER; - int rc; + ptl_err_t rc = PTL_OK; + int sysrc; int total; + int ntiov; int i; if (!(c=force_tcp_connection((manager)b->lower, PNAL_IP(nid,b), - PNAL_PORT(nid,pid)))) - return(1); + PNAL_PORT(nid,pid), + b->local))) + return(PTL_FAIL); -#if 0 /* TODO: these results should be checked. furthermore, provision must be made for the SIGPIPE which is delivered when writing on a tcp socket which has closed underneath the application. there is a linux flag in the sendmsg call which turns off the signally behaviour, but its nonstandard */ - syscall(SYS_write, c->fd,hdr,sizeof(ptl_hdr_t)); - LASSERT (niov <= 1); - if (len) syscall(SYS_write, c->fd,iov[0].iov_base,len); -#else + LASSERT (niov <= 256); tiov[0].iov_base = hdr; tiov[0].iov_len = sizeof(ptl_hdr_t); + ntiov = 1 + lib_extract_iov(256, &tiov[1], niov, iov, offset, len); - if (niov > 0) - memcpy(&tiov[1], iov, niov * sizeof(struct iovec)); pthread_mutex_lock(&send_lock); #if 1 - for (i = total = 0; i <= niov; i++) + for (i = total = 0; i < ntiov; i++) total += tiov[i].iov_len; - rc = syscall(SYS_writev, c->fd, tiov, niov+1); - if (rc != total) { + sysrc = syscall(SYS_writev, c->fd, tiov, ntiov); + if (sysrc != total) { fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n", rc, total, errno); - abort(); + rc = PTL_FAIL; } #else - for (i = total = 0; i <= niov; i++) { + for (i = total = 0; i <= ntiov; i++) { rc = send(c->fd, tiov[i].iov_base, tiov[i].iov_len, 0); if (rc != tiov[i].iov_len) { fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n", rc, tiov[i].iov_len, errno); - abort(); + rc = PTL_FAIL; + break; } - total != rc; + total += rc; } #endif #if 0 @@ -130,10 +130,14 @@ int tcpnal_send(nal_cb_t *n, total, niov + 1); #endif pthread_mutex_unlock(&send_lock); -#endif - lib_finalize(n, private, cookie); - - return(0); + + if (rc == PTL_OK) { + /* NB the NAL only calls lib_finalize() if it returns PTL_OK + * from cb_send() */ + lib_finalize(n, private, cookie, PTL_OK); + } + + return(rc); } @@ -150,15 +154,18 @@ int tcpnal_send(nal_cb_t *n, * blocking read of the requested data. 
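
tcpnal_send() now assembles one iovec array, header first and up to 256 extracted payload fragments after it, then pushes the whole message with a single writev(); a short write is reported as a failure instead of calling abort(). The matching receive path extracts fragments the same way and then reads and discards the rlen - mlen remainder so the byte stream stays aligned. A reduced sketch of the gather-write, with the fragment extraction elided and hypothetical names:

    #include <sys/types.h>
    #include <sys/uio.h>

    /* Send hdr then payload fragments in one syscall.
     * Returns 0 on success, -1 on a short or failed write. */
    static int ex_gather_send(int fd, void *hdr, size_t hdrlen,
                              const struct iovec *frags, int nfrags)
    {
            struct iovec tiov[1 + 256];
            size_t total = hdrlen;
            int i;

            if (nfrags > 256)
                    return -1;
            tiov[0].iov_base = hdr;
            tiov[0].iov_len  = hdrlen;
            for (i = 0; i < nfrags; i++) {
                    tiov[1 + i] = frags[i];
                    total += frags[i].iov_len;
            }
            return writev(fd, tiov, 1 + nfrags) == (ssize_t)total ? 0 : -1;
    }
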
must drain out the * difference of mainpulated and requested lengths from the network */ -int tcpnal_recv(nal_cb_t *n, - void *private, - lib_msg_t *cookie, - unsigned int niov, - struct iovec *iov, - size_t mlen, - size_t rlen) +ptl_err_t tcpnal_recv(nal_cb_t *n, + void *private, + lib_msg_t *cookie, + unsigned int niov, + struct iovec *iov, + size_t offset, + size_t mlen, + size_t rlen) { + struct iovec tiov[256]; + int ntiov; int i; if (!niov) @@ -168,16 +175,19 @@ int tcpnal_recv(nal_cb_t *n, LASSERT(rlen); LASSERT(rlen >= mlen); + ntiov = lib_extract_iov(256, tiov, niov, iov, offset, mlen); + /* FIXME * 1. Is this effecient enough? change to use readv() directly? * 2. need check return from read_connection() * - MeiJia */ - for (i = 0; i < niov; i++) - read_connection(private, iov[i].iov_base, iov[i].iov_len); + for (i = 0; i < ntiov; i++) + read_connection(private, tiov[i].iov_base, tiov[i].iov_len); finalize: - lib_finalize(n, private, cookie); + /* FIXME; we always assume success here... */ + lib_finalize(n, private, cookie, PTL_OK); if (mlen!=rlen){ char *trash=malloc(rlen-mlen); @@ -187,7 +197,7 @@ finalize: free(trash); } - return(rlen); + return(PTL_OK); } diff --git a/lustre/portals/utils/Makefile.am b/lustre/portals/utils/Makefile.am index f1878df..6c31b3d 100644 --- a/lustre/portals/utils/Makefile.am +++ b/lustre/portals/utils/Makefile.am @@ -3,17 +3,18 @@ # This code is issued under the GNU General Public License. # See the file COPYING in this distribution - COMPILE = $(CC) -Wall -g -I$(srcdir)/../include LINK = $(CC) -o $@ if LIBLUSTRE -tmp= + +noinst_LIBRARIES = libuptlctl.a +libuptlctl_a_SOURCES = portals.c debug.c l_ioctl.c parser.c parser.h +libuptlctl_a_CFLAGS = -fPIC + else -tmp=gmnalnid -endif -sbin_PROGRAMS = acceptor ptlctl debugctl routerstat wirecheck $(tmp) +sbin_PROGRAMS = acceptor ptlctl debugctl routerstat wirecheck gmnalnid lib_LIBRARIES = libptlctl.a acceptor_SOURCES = acceptor.c # -lefence @@ -33,3 +34,4 @@ debugctl_LDADD = -L. 
-lptlctl -lncurses # -lefence debugctl_DEPENDENCIES = libptlctl.a routerstat_SOURCES = routerstat.c +endif diff --git a/lustre/portals/utils/l_ioctl.c b/lustre/portals/utils/l_ioctl.c index c6628ff..58a408a 100644 --- a/lustre/portals/utils/l_ioctl.c +++ b/lustre/portals/utils/l_ioctl.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -35,6 +34,16 @@ #include #include +#ifndef __CYGWIN__ + #include +#else + #include + #include +#endif + +static ioc_handler_t do_ioctl; /* forward ref */ +static ioc_handler_t *current_ioc_handler = &do_ioctl; + struct ioc_dev { const char * dev_name; int dev_fd; @@ -48,7 +57,16 @@ struct dump_hdr { int opc; }; -char * dump_filename; +char *dump_filename; + +void +set_ioc_handler (ioc_handler_t *handler) +{ + if (handler == NULL) + current_ioc_handler = do_ioctl; + else + current_ioc_handler = handler; +} static int open_ioc_dev(int dev_id) @@ -115,7 +133,7 @@ dump(int dev_id, int opc, void *buf) { FILE *fp; struct dump_hdr dump_hdr; - struct portal_ioctl_hdr * ioc_hdr = (struct portal_ioctl_hdr *) buf; + struct portal_ioctl_hdr * ioc_hdr = (struct portal_ioctl_hdr *) buf; int rc; printf("dumping opc %x to %s\n", opc, dump_filename); @@ -132,17 +150,17 @@ dump(int dev_id, int opc, void *buf) return -EINVAL; } - rc = fwrite(&dump_hdr, sizeof(dump_hdr), 1, fp); - if (rc == 1) - rc = fwrite(buf, ioc_hdr->ioc_len, 1, fp); - fclose(fp); - if (rc != 1) { - fprintf(stderr, "%s: %s\n", dump_filename, - strerror(errno)); - return -EINVAL; - } - - return 0; + rc = fwrite(&dump_hdr, sizeof(dump_hdr), 1, fp); + if (rc == 1) + rc = fwrite(buf, ioc_hdr->ioc_len, 1, fp); + fclose(fp); + if (rc != 1) { + fprintf(stderr, "%s: %s\n", dump_filename, + strerror(errno)); + return -EINVAL; + } + + return 0; } /* register a device to send ioctls to. 
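
l_ioctl() is reduced to a single indirect call: current_ioc_handler defaults to do_ioctl(), set_ioctl_dump() swaps in dump(), and passing NULL to set_ioc_handler() restores the default, presumably so that liblustre-style embedders can install a handler of their own. The idiom, boiled down to a self-contained example with hypothetical names:

    #include <stdio.h>

    typedef int (ex_handler_t)(int dev_id, int opc, void *buf);

    static int ex_default_handler(int dev_id, int opc, void *buf)
    {
            printf("real ioctl: dev %d opc %#x\n", dev_id, opc);
            return 0;
    }

    static ex_handler_t *ex_current = ex_default_handler;

    /* NULL restores the default, as in set_ioc_handler() */
    static void ex_set_handler(ex_handler_t *h)
    {
            ex_current = (h != NULL) ? h : ex_default_handler;
    }

    static int ex_ioctl(int dev_id, int opc, void *buf)
    {
            return ex_current(dev_id, opc, buf);
    }
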
*/ @@ -184,16 +202,17 @@ set_ioctl_dump(char * file) free(dump_filename); dump_filename = strdup(file); + if (dump_filename == NULL) + abort(); + + set_ioc_handler(&dump); return 0; } int l_ioctl(int dev_id, int opc, void *buf) { - if (dump_filename) - return dump(dev_id, opc, buf); - else - return do_ioctl(dev_id, opc, buf); + return current_ioc_handler(dev_id, opc, buf); } /* Read an ioctl dump file, and call the ioc_func for each ioctl buffer @@ -207,16 +226,28 @@ l_ioctl(int dev_id, int opc, void *buf) int parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *)) { - int fd, line =0; + int line =0; struct stat st; - char *buf, *end; + char *start, *buf, *end; +#ifndef __CYGWIN__ + int fd; +#else + HANDLE fd, hmap; + DWORD size; +#endif +#ifndef __CYGWIN__ fd = syscall(SYS_open, dump_file, O_RDONLY); + if (fd < 0) { + fprintf(stderr, "couldn't open %s: %s\n", dump_file, + strerror(errno)); + exit(1); + } #ifndef SYS_fstat64 -#define __SYS_fstat__ SYS_fstat +# define __SYS_fstat__ SYS_fstat #else -#define __SYS_fstat__ SYS_fstat64 +# define __SYS_fstat__ SYS_fstat64 #endif if (syscall(__SYS_fstat__, fd, &st)) { perror("stat fails"); @@ -228,41 +259,72 @@ parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *)) exit(1); } - buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE , fd, 0); - end = buf + st.st_size; + start = buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE , fd, 0); + end = start + st.st_size; close(fd); - while (buf < end) { - struct dump_hdr *dump_hdr = (struct dump_hdr *) buf; - struct portal_ioctl_hdr * data; - char tmp[8096]; - int rc; - - line++; + if (start == MAP_FAILED) { + fprintf(stderr, "can't create file mapping\n"); + exit(1); + } +#else + fd = CreateFile(dump_file, GENERIC_READ, FILE_SHARE_READ, NULL, + OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); + size = GetFileSize(fd, NULL); + if (size < 1) { + fprintf(stderr, "KML is empty\n"); + exit(1); + } - data = (struct portal_ioctl_hdr *) (buf + sizeof(*dump_hdr)); - if (buf + data->ioc_len > end ) { - fprintf(stderr, "dump file overflow, %p + %d > %p\n", buf, - data->ioc_len, end); - return -1; - } + hmap = CreateFileMapping(fd, NULL, PAGE_READONLY, 0,0, NULL); + start = buf = MapViewOfFile(hmap, FILE_MAP_READ, 0, 0, 0); + end = buf + size; + CloseHandle(fd); + if (start == NULL) { + fprintf(stderr, "can't create file mapping\n"); + exit(1); + } +#endif /* __CYGWIN__ */ + + while (buf < end) { + struct dump_hdr *dump_hdr = (struct dump_hdr *) buf; + struct portal_ioctl_hdr * data; + char tmp[8096]; + int rc; + + line++; + + data = (struct portal_ioctl_hdr *) (buf + sizeof(*dump_hdr)); + if (buf + data->ioc_len > end ) { + fprintf(stderr, "dump file overflow, %p + %d > %p\n", buf, + data->ioc_len, end); + return -1; + } #if 0 - printf ("dump_hdr: %lx data: %lx\n", - (unsigned long)dump_hdr - (unsigned long)buf, (unsigned long)data - (unsigned long)buf); - - printf("%d: opcode %x len: %d ver: %x ", line, dump_hdr->opc, - data->ioc_len, data->ioc_version); + printf ("dump_hdr: %lx data: %lx\n", + (unsigned long)dump_hdr - (unsigned long)buf, (unsigned long)data - (unsigned long)buf); + + printf("%d: opcode %x len: %d ver: %x ", line, dump_hdr->opc, + data->ioc_len, data->ioc_version); #endif - memcpy(tmp, data, data->ioc_len); + memcpy(tmp, data, data->ioc_len); - rc = ioc_func(dump_hdr->dev_id, dump_hdr->opc, tmp); - if (rc) { - printf("failed: %d\n", rc); - exit(1); - } + rc = ioc_func(dump_hdr->dev_id, dump_hdr->opc, tmp); + if (rc) { + printf("failed: %d\n", rc); + exit(1); + } 
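
parse_dump() maps the whole dump read-only and walks it record by record: each record is a dump_hdr immediately followed by its ioctl buffer, the embedded length field says how far to advance, and the bounds check guards against a truncated or corrupt file. The traversal pattern, reduced to a self-contained helper over a hypothetical record layout:

    #include <stddef.h>
    #include <stdio.h>

    struct ex_rec_hdr {
            unsigned len;   /* payload length in bytes */
            int      opc;
    };

    /* Walk [buf, end), calling fn() on each payload; -1 on overrun. */
    static int ex_walk_records(char *buf, char *end,
                               int (*fn)(int opc, void *payload,
                                         unsigned len))
    {
            while (buf < end) {
                    struct ex_rec_hdr *h = (struct ex_rec_hdr *)buf;
                    char *payload = buf + sizeof(*h);

                    if (end - buf < (ptrdiff_t)sizeof(*h) ||
                        payload + h->len > end) {
                            fprintf(stderr, "record overruns the dump\n");
                            return -1;
                    }
                    if (fn(h->opc, payload, h->len) != 0)
                            return -1;
                    buf = payload + h->len;
            }
            return 0;
    }
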
- buf += data->ioc_len + sizeof(*dump_hdr); + buf += data->ioc_len + sizeof(*dump_hdr); } + +#ifndef __CYGWIN__ + munmap(start, end - start); +#else + UnmapViewOfFile(start); + CloseHandle(hmap); +#endif + return 0; } diff --git a/lustre/portals/utils/portals.c b/lustre/portals/utils/portals.c index 3c7ec20..fb031ae 100644 --- a/lustre/portals/utils/portals.c +++ b/lustre/portals/utils/portals.c @@ -36,6 +36,21 @@ #include #include +#ifdef __CYGWIN__ + +#include + +#warning assuming little endian + +#define __cpu_to_le64(x) ((__u64)(x)) +#define __le64_to_cpu(x) ((__u64)(x)) +#define __cpu_to_le32(x) ((__u32)(x)) +#define __le32_to_cpu(x) ((__u32)(x)) +#define __cpu_to_le16(x) ((__u16)(x)) +#define __le16_to_cpu(x) ((__u16)(x)) + +#endif /* __CYGWIN__ */ + #include #include #include @@ -94,6 +109,9 @@ pcfg_ioctl(struct portals_cfg *pcfg) PORTAL_IOC_INIT (data); data.ioc_pbuf1 = (char*)pcfg; data.ioc_plen1 = sizeof(*pcfg); + /* XXX liblustre hack XXX */ + data.ioc_nal_cmd = pcfg->pcfg_command; + data.ioc_nid = pcfg->pcfg_nid; rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); } diff --git a/lustre/ptlbd/rpc.c b/lustre/ptlbd/rpc.c index 51883f2..5af3249 100644 --- a/lustre/ptlbd/rpc.c +++ b/lustre/ptlbd/rpc.c @@ -69,19 +69,19 @@ int ptlbd_send_rw_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd, op->op_block_cnt = page_count; if (cmd == PTLBD_READ) - desc = ptlrpc_prep_bulk_imp (req, BULK_PUT_SINK, PTLBD_BULK_PORTAL); + desc = ptlrpc_prep_bulk_imp (req, page_count, + BULK_PUT_SINK, PTLBD_BULK_PORTAL); else - desc = ptlrpc_prep_bulk_imp (req, BULK_GET_SOURCE, PTLBD_BULK_PORTAL); + desc = ptlrpc_prep_bulk_imp (req, page_count, + BULK_GET_SOURCE, PTLBD_BULK_PORTAL); if ( desc == NULL ) GOTO(out, rc = 1); /* need to return error cnt */ /* NB req now owns desc, and frees it when she frees herself */ for ( niob = niobs, bh = first_bh ; bh ; bh = bh->b_reqnext, niob++ ) { - rc = ptlrpc_prep_bulk_page(desc, bh->b_page, - bh_offset (bh) & (PAGE_SIZE - 1), - bh->b_size); - if (rc != 0) - GOTO(out, rc = 1); /* need to return error cnt */ + ptlrpc_prep_bulk_page(desc, bh->b_page, + bh_offset (bh) & (PAGE_SIZE - 1), + bh->b_size); niob->n_block_nr = bh->b_blocknr; niob->n_offset = bh_offset(bh); @@ -221,6 +221,7 @@ int ptlbd_srv_rw_req(ptlbd_cmd_t cmd, __u16 index, if ( rsp == NULL ) GOTO (out, rc = -EFAULT); + /* FIXME: assumes each niobuf fits in 1 page */ page_count = req->rq_reqmsg->buflens[1] / sizeof(struct ptlbd_niob); if (swab) { /* swab remaining niobs */ for (i = 1; i < page_count; i++) @@ -232,9 +233,11 @@ int ptlbd_srv_rw_req(ptlbd_cmd_t cmd, __u16 index, } if (cmd == PTLBD_READ) - desc = ptlrpc_prep_bulk_exp (req, BULK_PUT_SOURCE, PTLBD_BULK_PORTAL); + desc = ptlrpc_prep_bulk_exp (req, page_count, + BULK_PUT_SOURCE, PTLBD_BULK_PORTAL); else - desc = ptlrpc_prep_bulk_exp (req, BULK_GET_SINK, PTLBD_BULK_PORTAL); + desc = ptlrpc_prep_bulk_exp (req, page_count, + BULK_GET_SINK, PTLBD_BULK_PORTAL); if (desc == NULL) { error_cnt++; GOTO(out_reply, rc = -ENOMEM); @@ -250,25 +253,20 @@ int ptlbd_srv_rw_req(ptlbd_cmd_t cmd, __u16 index, } list_add_tail(&page->list, &tmp_pages); - rc = ptlrpc_prep_bulk_page(desc, page, - niob->n_offset & (PAGE_SIZE - 1), - niob->n_length); - if (rc != 0) { - error_cnt++; - GOTO(out_reply, rc); - } + ptlrpc_prep_bulk_page(desc, page, + niob->n_offset & (PAGE_SIZE - 1), + niob->n_length); } if ( cmd == PTLBD_READ ) { - if ((rc = ptlbd_do_filp(filp, PTLBD_READ, niobs, - page_count, &tmp_pages)) < 0) { + rc = ptlbd_do_filp(filp, PTLBD_READ, niobs, + page_count, 
&tmp_pages); + if (rc < 0) { error_cnt++; GOTO(out_reply, rc); } - rc = ptlrpc_bulk_put(desc); - } else { - rc = ptlrpc_bulk_get(desc); } + rc = ptlrpc_start_bulk_transfer(desc); if ( rc ) { error_cnt++; @@ -276,13 +274,16 @@ int ptlbd_srv_rw_req(ptlbd_cmd_t cmd, __u16 index, } lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, NULL, desc); - rc = l_wait_event(desc->bd_waitq, ptlrpc_bulk_complete(desc), &lwi); + rc = l_wait_event(desc->bd_waitq, !ptlrpc_bulk_active(desc), &lwi); if (rc != 0) { LASSERT(rc == -ETIMEDOUT); ptlrpc_abort_bulk(desc); error_cnt++; GOTO(out_reply, rc); } + + /* XXX do some error handling */ + LASSERT(desc->bd_success && desc->bd_nob_transferred == desc->bd_nob); if ( cmd == PTLBD_WRITE ) { if ((rc = ptlbd_do_filp(filp, PTLBD_WRITE, niobs, diff --git a/lustre/ptlbd/server.c b/lustre/ptlbd/server.c index e159df0..c7ac53b 100644 --- a/lustre/ptlbd/server.c +++ b/lustre/ptlbd/server.c @@ -49,10 +49,9 @@ static int ptlbd_sv_setup(struct obd_device *obddev, obd_count len, void *buf) RETURN(PTR_ERR(ptlbd->filp)); ptlbd->ptlbd_service = - ptlrpc_init_svc(PTLBD_NEVENTS, PTLBD_NBUFS, PTLBD_BUFSIZE, - PTLBD_MAXREQSIZE, PTLBD_REQUEST_PORTAL, - PTLBD_REPLY_PORTAL, - ptlbd_handle, "ptlbd_sv", + ptlrpc_init_svc(PTLBD_NBUFS, PTLBD_BUFSIZE, PTLBD_MAXREQSIZE, + PTLBD_REQUEST_PORTAL, PTLBD_REPLY_PORTAL, + ptlbd_handle, "ptlbd_sv", obddev->obd_proc_entry); if (ptlbd->ptlbd_service == NULL) diff --git a/lustre/ptlrpc/Makefile.am b/lustre/ptlrpc/Makefile.am index 2c6de45..4822d33 100644 --- a/lustre/ptlrpc/Makefile.am +++ b/lustre/ptlrpc/Makefile.am @@ -5,19 +5,21 @@ DEFS= -LDLMSOURCES= $(top_srcdir)/ldlm/l_lock.c $(top_srcdir)/ldlm/ldlm_lock.c \ - $(top_srcdir)/ldlm/ldlm_resource.c $(top_srcdir)/ldlm/ldlm_lib.c \ - $(top_srcdir)/ldlm/ldlm_plain.c $(top_srcdir)/ldlm/ldlm_extent.c \ - $(top_srcdir)/ldlm/ldlm_flock.c $(top_srcdir)/ldlm/ldlm_request.c \ - $(top_srcdir)/ldlm/ldlm_lockd.c $(top_srcdir)/ldlm/ldlm_internal.h +LDLM_COMM_SOURCES= $(top_srcdir)/ldlm/l_lock.c $(top_srcdir)/ldlm/ldlm_lock.c \ + $(top_srcdir)/ldlm/ldlm_resource.c $(top_srcdir)/ldlm/ldlm_lib.c \ + $(top_srcdir)/ldlm/ldlm_plain.c $(top_srcdir)/ldlm/ldlm_extent.c \ + $(top_srcdir)/ldlm/ldlm_request.c $(top_srcdir)/ldlm/ldlm_lockd.c \ + $(top_srcdir)/ldlm/ldlm_internal.h -COMMON_SOURCES = client.c recover.c connection.c niobuf.c pack_generic.c \ - events.c ptlrpc_module.c service.c pinger.c recov_thread.c llog_net.c \ - llog_client.c import.c ptlrpcd.c $(LDLMSOURCES) +COMMON_SOURCES = client.c recover.c connection.c niobuf.c pack_generic.c \ + events.c ptlrpc_module.c service.c pinger.c recov_thread.c llog_net.c \ + llog_client.c llog_server.c import.c ptlrpcd.c ptlrpc_internal.h \ + $(LDLM_COMM_SOURCES) if LIBLUSTRE -lib_LIBRARIES = libptlrpc.a +noinst_LIBRARIES = libptlrpc.a +libptlrpc_a_CFLAGS = -fPIC libptlrpc_a_SOURCES = $(COMMON_SOURCES) else @@ -26,8 +28,9 @@ MODULE = ptlrpc modulefs_DATA = ptlrpc.o EXTRA_PROGRAMS = ptlrpc -ptlrpc_SOURCES = $(COMMON_SOURCES) lproc_ptlrpc.c ptlrpc_internal.h \ - llog_server.c +ptlrpc_SOURCES = $(top_srcdir)/ldlm/ldlm_flock.c $(COMMON_SOURCES) \ + lproc_ptlrpc.c + endif ptlrpc_DEPENDENCIES=symlinks diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index fdc1b37..84c781d 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -82,40 +82,42 @@ void ptlrpc_readdress_connection(struct ptlrpc_connection *conn, return; } -static inline struct ptlrpc_bulk_desc *new_bulk(void) +static inline struct ptlrpc_bulk_desc *new_bulk(int npages, int type, int portal) { struct 
ptlrpc_bulk_desc *desc; - OBD_ALLOC(desc, sizeof(*desc)); + OBD_ALLOC(desc, offsetof (struct ptlrpc_bulk_desc, bd_iov[npages])); if (!desc) return NULL; spin_lock_init(&desc->bd_lock); init_waitqueue_head(&desc->bd_waitq); - INIT_LIST_HEAD(&desc->bd_page_list); + desc->bd_max_pages = npages; + desc->bd_page_count = 0; desc->bd_md_h = PTL_HANDLE_NONE; - desc->bd_me_h = PTL_HANDLE_NONE; - + desc->bd_portal = portal; + desc->bd_type = type; + return desc; } struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp (struct ptlrpc_request *req, - int type, int portal) + int npages, int type, int portal) { struct obd_import *imp = req->rq_import; struct ptlrpc_bulk_desc *desc; LASSERT(type == BULK_PUT_SINK || type == BULK_GET_SOURCE); - - desc = new_bulk(); + desc = new_bulk(npages, type, portal); if (desc == NULL) RETURN(NULL); desc->bd_import_generation = req->rq_import_generation; desc->bd_import = class_import_get(imp); desc->bd_req = req; - desc->bd_type = type; - desc->bd_portal = portal; + + desc->bd_cbid.cbid_fn = client_bulk_callback; + desc->bd_cbid.cbid_arg = desc; /* This makes req own desc, and free it when she frees herself */ req->rq_bulk = desc; @@ -124,21 +126,22 @@ struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp (struct ptlrpc_request *req, } struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp (struct ptlrpc_request *req, - int type, int portal) + int npages, int type, int portal) { struct obd_export *exp = req->rq_export; struct ptlrpc_bulk_desc *desc; LASSERT(type == BULK_PUT_SOURCE || type == BULK_GET_SINK); - desc = new_bulk(); + desc = new_bulk(npages, type, portal); if (desc == NULL) RETURN(NULL); desc->bd_export = class_export_get(exp); desc->bd_req = req; - desc->bd_type = type; - desc->bd_portal = portal; + + desc->bd_cbid.cbid_fn = server_bulk_callback; + desc->bd_cbid.cbid_arg = desc; /* NB we don't assign rq_bulk here; server-side requests are * re-used, and the handler frees the bulk desc explicitly. 
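
new_bulk() replaces the old list of separately allocated ptlrpc_bulk_page structs with a page vector embedded at the tail of the descriptor, so a descriptor for npages pages is one allocation of offsetof(..., bd_iov[npages]) bytes; the free path must repeat the identical computation. The sizing idiom in isolation, with stand-in types:

    #include <stddef.h>
    #include <stdlib.h>

    struct ex_frag { void *base; size_t len; };

    struct ex_desc {
            int            max_pages;
            int            page_count;
            struct ex_frag iov[];      /* trailing variable-length vector */
    };

    static struct ex_desc *ex_desc_alloc(int npages)
    {
            /* one allocation covers header + npages vector entries */
            struct ex_desc *d =
                    calloc(1, offsetof(struct ex_desc, iov[npages]));

            if (d != NULL)
                    d->max_pages = npages;
            return d;
    }

    static void ex_desc_free(struct ex_desc *d)
    {
            /* a sized allocator (like OBD_FREE) would be passed
             * offsetof(struct ex_desc, iov[d->max_pages]) here */
            free(d);
    }

Besides saving an allocation per page, this makes ptlrpc_prep_bulk_page() infallible: it just fills the next vector slot, which is why its return type changes to void below.
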
*/ @@ -146,66 +149,50 @@ struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp (struct ptlrpc_request *req, return desc; } -int ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, - struct page *page, int pageoffset, int len) +void ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, int len) { - struct ptlrpc_bulk_page *bulk; - - OBD_ALLOC(bulk, sizeof(*bulk)); - if (bulk == NULL) - return -ENOMEM; - +#ifdef __KERNEL__ + ptl_kiov_t *kiov = &desc->bd_iov[desc->bd_page_count]; +#else + struct iovec *iov = &desc->bd_iov[desc->bd_page_count]; +#endif + LASSERT(desc->bd_page_count < desc->bd_max_pages); LASSERT(page != NULL); LASSERT(pageoffset >= 0); LASSERT(len > 0); LASSERT(pageoffset + len <= PAGE_SIZE); - bulk->bp_page = page; - bulk->bp_pageoffset = pageoffset; - bulk->bp_buflen = len; - - bulk->bp_desc = desc; - list_add_tail(&bulk->bp_link, &desc->bd_page_list); +#ifdef __KERNEL__ + kiov->kiov_page = page; + kiov->kiov_offset = pageoffset; + kiov->kiov_len = len; +#else + iov->iov_base = page->addr + pageoffset; + iov->iov_len = len; +#endif desc->bd_page_count++; - return 0; + desc->bd_nob += len; } void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc) { - struct list_head *tmp, *next; ENTRY; LASSERT(desc != NULL); LASSERT(desc->bd_page_count != 0x5a5a5a5a); /* not freed already */ LASSERT(!desc->bd_network_rw); /* network hands off or */ - - list_for_each_safe(tmp, next, &desc->bd_page_list) { - struct ptlrpc_bulk_page *bulk; - bulk = list_entry(tmp, struct ptlrpc_bulk_page, bp_link); - ptlrpc_free_bulk_page(bulk); - } - - LASSERT(desc->bd_page_count == 0); LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL)); - if (desc->bd_export) class_export_put(desc->bd_export); else class_import_put(desc->bd_import); - OBD_FREE(desc, sizeof(*desc)); + OBD_FREE(desc, offsetof(struct ptlrpc_bulk_desc, + bd_iov[desc->bd_max_pages])); EXIT; } -void ptlrpc_free_bulk_page(struct ptlrpc_bulk_page *bulk) -{ - LASSERT(bulk != NULL); - - list_del(&bulk->bp_link); - bulk->bp_desc->bd_page_count--; - OBD_FREE(bulk, sizeof(*bulk)); -} - struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, int opcode, int count, int *lengths, char **bufs) { @@ -235,6 +222,13 @@ struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, int opcode, request->rq_send_state = LUSTRE_IMP_FULL; request->rq_type = PTL_RPC_MSG_REQUEST; request->rq_import = class_import_get(imp); + + request->rq_req_cbid.cbid_fn = request_out_callback; + request->rq_req_cbid.cbid_arg = request; + + request->rq_reply_cbid.cbid_fn = reply_in_callback; + request->rq_reply_cbid.cbid_arg = request; + request->rq_phase = RQ_PHASE_NEW; /* XXX FIXME bug 249 */ @@ -462,7 +456,6 @@ static int after_reply(struct ptlrpc_request *req) ENTRY; LASSERT(!req->rq_receiving_reply); - LASSERT(req->rq_replied); /* NB Until this point, the whole of the incoming message, * including buflens, status etc is in the sender's byte order. 
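
The two callback ids assigned here follow a pattern that recurs throughout this patch: every posted network buffer's user pointer holds a ptlrpc_cb_id, a function pointer plus its argument, so a single generic event handler can dispatch any completion without knowing what kind of object it belongs to. The pattern in miniature; the ex_* names are stand-ins, not the ptlrpc structures:

    struct ex_event;

    struct ex_cbid {
            void (*cbid_fn)(struct ex_event *ev, void *arg);
            void  *cbid_arg;
    };

    struct ex_event {
            void *user_ptr;   /* points at the owner's ex_cbid */
    };

    static void ex_event_dispatch(struct ex_event *ev)
    {
            struct ex_cbid *cbid = ev->user_ptr;

            cbid->cbid_fn(ev, cbid->cbid_arg);
    }

Embedding the cbid in the owning object costs no extra allocation per buffer and needs no lookup table from events back to objects.
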
*/ @@ -471,7 +464,8 @@ static int after_reply(struct ptlrpc_request *req) /* Clear reply swab mask; this is a new reply in sender's byte order */ req->rq_rep_swab_mask = 0; #endif - rc = lustre_unpack_msg(req->rq_repmsg, req->rq_replen); + LASSERT (req->rq_nob_received <= req->rq_replen); + rc = lustre_unpack_msg(req->rq_repmsg, req->rq_nob_received); if (rc) { CERROR("unpack_rep failed: %d\n", rc); RETURN(-EPROTO); @@ -658,6 +652,11 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set) if (req->rq_phase == RQ_PHASE_RPC) { if (req->rq_waiting || req->rq_resend) { int status; + + LASSERT (!ptlrpc_client_receiving_reply(req)); + LASSERT (req->rq_bulk == NULL || + !ptlrpc_bulk_active(req->rq_bulk)); + spin_lock_irqsave(&imp->imp_lock, flags); if (ptlrpc_import_delay_req(imp, req, &status)) { @@ -686,7 +685,7 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set) ptlrpc_unregister_reply(req); if (req->rq_bulk) { __u64 old_xid = req->rq_xid; - ptlrpc_unregister_bulk(req); + /* ensure previous bulk fails */ req->rq_xid = ptlrpc_next_xid(); CDEBUG(D_HA, "resend bulk " @@ -707,13 +706,13 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set) force_timer_recalc = 1; } - /* Ensure the network callback returned */ - spin_lock_irqsave (&req->rq_lock, flags); - if (!req->rq_replied) { - spin_unlock_irqrestore (&req->rq_lock, flags); + /* Still waiting for a reply? */ + if (ptlrpc_client_receiving_reply(req)) + continue; + + /* Did we actually receive a reply? */ + if (!ptlrpc_client_replied(req)) continue; - } - spin_unlock_irqrestore (&req->rq_lock, flags); spin_lock_irqsave(&imp->imp_lock, flags); list_del_init(&req->rq_list); @@ -745,9 +744,18 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set) } LASSERT(req->rq_phase == RQ_PHASE_BULK); - if (!ptlrpc_bulk_complete (req->rq_bulk)) + if (ptlrpc_bulk_active(req->rq_bulk)) continue; + if (!req->rq_bulk->bd_success) { + /* The RPC reply arrived OK, but the bulk screwed + * up! Dead wierd since the server told us the RPC + * was good after getting the REPLY for her GET or + * the ACK for her PUT. */ + DEBUG_REQ(D_ERROR, req, "bulk transfer failed"); + LBUG(); + } + req->rq_phase = RQ_PHASE_INTERPRET; interpret: @@ -796,6 +804,9 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req) ptlrpc_unregister_reply (req); + if (req->rq_bulk != NULL) + ptlrpc_unregister_bulk (req); + if (imp == NULL) { DEBUG_REQ(D_HA, req, "NULL import: already cleaned up?"); RETURN(1); @@ -926,7 +937,8 @@ int ptlrpc_set_wait(struct ptlrpc_request_set *set) LASSERT(!list_empty(&set->set_requests)); list_for_each(tmp, &set->set_requests) { req = list_entry(tmp, struct ptlrpc_request, rq_set_chain); - (void)ptlrpc_send_new_req(req); + if (req->rq_phase == RQ_PHASE_NEW) + (void)ptlrpc_send_new_req(req); } do { @@ -981,6 +993,7 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked) } LASSERT(!request->rq_receiving_reply); + LASSERT(request->rq_rqbd == NULL); /* client-side */ /* We must take it off the imp_replay_list first. Otherwise, we'll set * request->rq_reqmsg to NULL while osc_close is dereferencing it. 
*/ @@ -1073,67 +1086,39 @@ void ptlrpc_req_finished(struct ptlrpc_request *request) */ void ptlrpc_unregister_reply (struct ptlrpc_request *request) { - unsigned long flags; - int rc; - ENTRY; + int rc; + wait_queue_head_t *wq; + struct l_wait_info lwi; LASSERT(!in_interrupt ()); /* might sleep */ - spin_lock_irqsave (&request->rq_lock, flags); - if (!request->rq_receiving_reply) { /* not waiting for a reply */ - spin_unlock_irqrestore (&request->rq_lock, flags); - EXIT; - /* NB reply buffer not freed here */ + if (!ptlrpc_client_receiving_reply(request)) return; - } - - LASSERT(!request->rq_replied); /* callback hasn't completed */ - spin_unlock_irqrestore (&request->rq_lock, flags); rc = PtlMDUnlink (request->rq_reply_md_h); - switch (rc) { - default: - LBUG (); - - case PTL_OK: /* unlinked before completion */ - LASSERT(request->rq_receiving_reply); - LASSERT(!request->rq_replied); - spin_lock_irqsave (&request->rq_lock, flags); - request->rq_receiving_reply = 0; - spin_unlock_irqrestore (&request->rq_lock, flags); - OBD_FREE(request->rq_repmsg, request->rq_replen); - request->rq_repmsg = NULL; - EXIT; + if (rc == PTL_INV_MD) { + LASSERT (!ptlrpc_client_receiving_reply(request)); return; + } + + LASSERT (rc == PTL_OK); - case PTL_MD_INUSE: /* callback in progress */ - for (;;) { - /* Network access will complete in finite time but - * the timeout lets us CERROR for visibility */ - struct l_wait_info lwi = LWI_TIMEOUT(10*HZ, NULL, NULL); - - rc = l_wait_event (request->rq_reply_waitq, - request->rq_replied, &lwi); - LASSERT(rc == 0 || rc == -ETIMEDOUT); - if (rc == 0) { - spin_lock_irqsave (&request->rq_lock, flags); - /* Ensure the callback has completed scheduling - * me and taken its hands off the request */ - spin_unlock_irqrestore(&request->rq_lock,flags); - break; - } - - CERROR ("Unexpectedly long timeout: req %p\n", request); - } - /* fall through */ - - case PTL_INV_MD: /* callback completed */ - LASSERT(!request->rq_receiving_reply); - LASSERT(request->rq_replied); - EXIT; - return; + if (request->rq_set == NULL) + wq = &request->rq_set->set_waitq; + else + wq = &request->rq_reply_waitq; + + for (;;) { + /* Network access will complete in finite time but the HUGE + * timeout lets us CWARN for visibility of sluggish NALs */ + lwi = LWI_TIMEOUT(300 * HZ, NULL, NULL); + rc = l_wait_event (*wq, !ptlrpc_client_receiving_reply(request), &lwi); + if (rc == 0) + return; + + LASSERT (rc == -ETIMEDOUT); + DEBUG_REQ(D_WARNING, request, "Unexpectedly long timeout"); } - /* Not Reached */ } /* caller must hold imp->imp_lock */ @@ -1207,11 +1192,17 @@ void ptlrpc_resend_req(struct ptlrpc_request *req) spin_lock_irqsave (&req->rq_lock, flags); req->rq_resend = 1; req->rq_timedout = 0; - if (req->rq_set != NULL) - wake_up (&req->rq_set->set_waitq); - else - wake_up(&req->rq_reply_waitq); + if (req->rq_bulk) { + __u64 old_xid = req->rq_xid; + + /* ensure previous bulk fails */ + req->rq_xid = ptlrpc_next_xid(); + CDEBUG(D_HA, "resend bulk old x"LPU64" new x"LPU64"\n", + old_xid, req->rq_xid); + } + ptlrpc_wake_client_req(req); spin_unlock_irqrestore (&req->rq_lock, flags); + } /* XXX: this function and rq_status are currently unused */ @@ -1225,10 +1216,7 @@ void ptlrpc_restart_req(struct ptlrpc_request *req) spin_lock_irqsave (&req->rq_lock, flags); req->rq_restart = 1; req->rq_timedout = 0; - if (req->rq_set != NULL) - wake_up (&req->rq_set->set_waitq); - else - wake_up(&req->rq_reply_waitq); + ptlrpc_wake_client_req(req); spin_unlock_irqrestore (&req->rq_lock, flags); } @@ -1456,15 +1444,24 
@@ restart: out: if (req->rq_bulk != NULL) { - if (rc >= 0) { /* success so far */ + if (rc >= 0) { + /* success so far. Note that anything going wrong + * with bulk now, is EXTREMELY strange, since the + * server must have believed that the bulk + * tranferred OK before she replied with success to + * me. */ lwi = LWI_TIMEOUT(timeout, NULL, NULL); brc = l_wait_event(req->rq_reply_waitq, - ptlrpc_bulk_complete(req->rq_bulk), + !ptlrpc_bulk_active(req->rq_bulk), &lwi); + LASSERT(brc == 0 || brc == -ETIMEDOUT); if (brc != 0) { LASSERT(brc == -ETIMEDOUT); - CERROR ("Timed out waiting for bulk\n"); + DEBUG_REQ(D_ERROR, req, "bulk timed out"); rc = brc; + } else if (!req->rq_bulk->bd_success) { + DEBUG_REQ(D_ERROR, req, "bulk transfer failed"); + rc = -EIO; } } if (rc < 0) @@ -1499,7 +1496,8 @@ static int ptlrpc_replay_interpret(struct ptlrpc_request *req, /* Clear reply swab mask; this is a new reply in sender's byte order */ req->rq_rep_swab_mask = 0; #endif - rc = lustre_unpack_msg(req->rq_repmsg, req->rq_replen); + LASSERT (req->rq_nob_received <= req->rq_replen); + rc = lustre_unpack_msg(req->rq_repmsg, req->rq_nob_received); if (rc) { CERROR("unpack_rep failed: %d\n", rc); GOTO(out, rc = -EPROTO); @@ -1607,10 +1605,7 @@ void ptlrpc_abort_inflight(struct obd_import *imp) spin_lock (&req->rq_lock); if (req->rq_import_generation < imp->imp_generation) { req->rq_err = 1; - if (req->rq_set != NULL) - wake_up(&req->rq_set->set_waitq); - else - wake_up(&req->rq_reply_waitq); + ptlrpc_wake_client_req(req); } spin_unlock (&req->rq_lock); } @@ -1624,10 +1619,7 @@ void ptlrpc_abort_inflight(struct obd_import *imp) spin_lock (&req->rq_lock); if (req->rq_import_generation < imp->imp_generation) { req->rq_err = 1; - if (req->rq_set != NULL) - wake_up(&req->rq_set->set_waitq); - else - wake_up(&req->rq_reply_waitq); + ptlrpc_wake_client_req(req); } spin_unlock (&req->rq_lock); } diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index e91d7a3..b2aa6b9 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -33,311 +33,308 @@ struct ptlrpc_ni ptlrpc_interfaces[NAL_MAX_NR]; int ptlrpc_ninterfaces; -/* - * Free the packet when it has gone out +/* + * Client's outgoing request callback */ -static int request_out_callback(ptl_event_t *ev) +void request_out_callback(ptl_event_t *ev) { - struct ptlrpc_request *req = ev->mem_desc.user_ptr; + struct ptlrpc_cb_id *cbid = ev->mem_desc.user_ptr; + struct ptlrpc_request *req = cbid->cbid_arg; + unsigned long flags; ENTRY; - /* requests always contiguous */ - LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0); - - if (ev->type != PTL_EVENT_SENT) { - // XXX make sure we understand all events, including ACK's - CERROR("Unknown event %d\n", ev->type); - LBUG(); - } + LASSERT (ev->type == PTL_EVENT_SENT || + ev->type == PTL_EVENT_UNLINK); + LASSERT (ev->unlinked); - /* this balances the atomic_inc in ptl_send_rpc() */ - ptlrpc_req_finished(req); - RETURN(1); -} + DEBUG_REQ((ev->status == PTL_OK) ? D_NET : D_ERROR, req, + "type %d, status %d", ev->type, ev->status); -/* - * Free the packet when it has gone out - */ -static int reply_out_callback(ptl_event_t *ev) -{ - struct ptlrpc_request *req = ev->mem_desc.user_ptr; - unsigned long flags; - ENTRY; + if (ev->type == PTL_EVENT_UNLINK || + ev->status != PTL_OK) { - /* replies always contiguous */ - LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0); + /* Failed send: make it seem like the reply timed out, just + * like failing sends in client.c does currently... 
*/ - if (ev->type == PTL_EVENT_SENT) { - /* NB don't even know if this is the current reply! In fact - * we can't touch any state in the request, since the - * service handler zeros it on each incoming request. */ - OBD_FREE(ev->mem_desc.start, ev->mem_desc.length); - } else if (ev->type == PTL_EVENT_ACK) { - LASSERT(req->rq_want_ack); spin_lock_irqsave(&req->rq_lock, flags); - req->rq_want_ack = 0; - wake_up(&req->rq_reply_waitq); + req->rq_timeout = 0; spin_unlock_irqrestore(&req->rq_lock, flags); - } else { - // XXX make sure we understand all events - CERROR("Unknown event %d\n", ev->type); - LBUG(); + + ptlrpc_wake_client_req(req); } - RETURN(1); + /* this balances the atomic_inc in ptl_send_rpc() */ + ptlrpc_req_finished(req); + EXIT; } /* - * Wake up the thread waiting for the reply once it comes in. + * Client's incoming reply callback */ -int reply_in_callback(ptl_event_t *ev) +void reply_in_callback(ptl_event_t *ev) { - struct ptlrpc_request *req = ev->mem_desc.user_ptr; + struct ptlrpc_cb_id *cbid = ev->mem_desc.user_ptr; + struct ptlrpc_request *req = cbid->cbid_arg; unsigned long flags; ENTRY; - /* replies always contiguous */ - LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0); - - if (req->rq_xid == 0x5a5a5a5a5a5a5a5aULL) { - CERROR("Reply received for freed request! Probably a missing " - "ptlrpc_abort()\n"); - LBUG(); - } + LASSERT (ev->type == PTL_EVENT_PUT || + ev->type == PTL_EVENT_UNLINK); + LASSERT (ev->unlinked); + LASSERT (ev->mem_desc.start == req->rq_repmsg); + LASSERT (ev->offset == 0); + LASSERT (ev->mlength <= req->rq_replen); + + DEBUG_REQ((ev->status == PTL_OK) ? D_NET : D_ERROR, req, + "type %d, status %d", ev->type, ev->status); - if (req->rq_xid != ev->match_bits) { - CERROR("Reply packet for wrong request\n"); - LBUG(); - } + spin_lock_irqsave (&req->rq_lock, flags); - if (ev->type == PTL_EVENT_PUT) { - /* Bug 1190: should handle non-zero offset as a protocol - * error */ - LASSERT (ev->offset == 0); + LASSERT (req->rq_receiving_reply); + req->rq_receiving_reply = 0; - spin_lock_irqsave (&req->rq_lock, flags); - LASSERT (req->rq_receiving_reply); - req->rq_receiving_reply = 0; + if (ev->type == PTL_EVENT_PUT && + ev->status == PTL_OK) { req->rq_replied = 1; - if (req->rq_set != NULL) - wake_up(&req->rq_set->set_waitq); - else - wake_up(&req->rq_reply_waitq); - spin_unlock_irqrestore (&req->rq_lock, flags); - } else { - // XXX make sure we understand all events, including ACKs - CERROR("Unknown event %d\n", ev->type); - LBUG(); - } - - RETURN(1); -} - -int request_in_callback(ptl_event_t *ev) -{ - struct ptlrpc_request_buffer_desc *rqbd = ev->mem_desc.user_ptr; - struct ptlrpc_srv_ni *srv_ni = rqbd->rqbd_srv_ni; - struct ptlrpc_service *service = srv_ni->sni_service; - - /* requests always contiguous */ - LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0); - /* we only enable puts */ - LASSERT(ev->type == PTL_EVENT_PUT); - LASSERT(atomic_read(&srv_ni->sni_nrqbds_receiving) > 0); - LASSERT(atomic_read(&rqbd->rqbd_refcount) > 0); - - if (ev->rlength != ev->mlength) - CERROR("Warning: Possibly truncated rpc (%d/%d)\n", - ev->mlength, ev->rlength); - - if (!PtlHandleEqual (ev->unlinked_me, PTL_HANDLE_NONE)) { - /* This is the last request to be received into this - * request buffer. We don't bump the refcount, since the - * thread servicing this event is effectively taking over - * portals' reference. 
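
request_out_callback() folds a failed or unlinked send into the existing timeout machinery rather than growing a separate error path: zeroing rq_timeout makes the request expire immediately, and the woken waiter cleans up through the same code it would use for a genuine timeout. The shape of that trick, with pthread stand-ins for the kernel locking and wait queue:

    #include <pthread.h>

    struct ex_req {
            pthread_mutex_t lock;
            pthread_cond_t  waitq;
            int             timeout;   /* 0 => expires immediately */
    };

    static void ex_send_done(struct ex_req *req, int status)
    {
            if (status == 0)
                    return;            /* clean send: nothing to do */

            pthread_mutex_lock(&req->lock);
            req->timeout = 0;          /* force the normal expiry path */
            pthread_cond_broadcast(&req->waitq);
            pthread_mutex_unlock(&req->lock);
    }
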
- */ - /* NB ev->unlinked_me.nal_idx is not set properly in a callback */ - LASSERT(ev->unlinked_me.cookie==rqbd->rqbd_me_h.cookie); - - /* we're off the air */ - /* we'll probably start dropping packets in portals soon */ - if (atomic_dec_and_test(&srv_ni->sni_nrqbds_receiving)) - CERROR("All request buffers busy\n"); - } else { - /* +1 ref for service thread */ - atomic_inc(&rqbd->rqbd_refcount); + req->rq_nob_received = ev->mlength; } - wake_up(&service->srv_waitq); + /* NB don't unlock till after wakeup; req can disappear under us + * since we don't have our own ref */ + ptlrpc_wake_client_req(req); - return 0; + spin_unlock_irqrestore (&req->rq_lock, flags); + EXIT; } -static int bulk_put_source_callback(ptl_event_t *ev) +/* + * Client's bulk has been written/read + */ +void client_bulk_callback (ptl_event_t *ev) { + struct ptlrpc_cb_id *cbid = ev->mem_desc.user_ptr; + struct ptlrpc_bulk_desc *desc = cbid->cbid_arg; unsigned long flags; - struct ptlrpc_bulk_desc *desc = ev->mem_desc.user_ptr; ENTRY; - CDEBUG(D_NET, "got %s event %d\n", - (ev->type == PTL_EVENT_SENT) ? "SENT" : - (ev->type == PTL_EVENT_ACK) ? "ACK" : "UNEXPECTED", ev->type); + LASSERT ((desc->bd_type == BULK_PUT_SINK && + ev->type == PTL_EVENT_PUT) || + (desc->bd_type == BULK_GET_SOURCE && + ev->type == PTL_EVENT_GET) || + ev->type == PTL_EVENT_UNLINK); + LASSERT (ev->unlinked); - LASSERT(ev->type == PTL_EVENT_SENT || ev->type == PTL_EVENT_ACK); - - /* 1 fragment for each page always */ - LASSERT(ev->mem_desc.niov == desc->bd_page_count); + CDEBUG((ev->status == PTL_OK) ? D_NET : D_ERROR, + "event type %d, status %d, desc %p\n", + ev->type, ev->status, desc); spin_lock_irqsave (&desc->bd_lock, flags); - - LASSERT(desc->bd_callback_count > 0 && - desc->bd_callback_count <= 2); - - if (--desc->bd_callback_count == 0) { - desc->bd_network_rw = 0; - desc->bd_complete = 1; - wake_up(&desc->bd_waitq); + + LASSERT(desc->bd_network_rw); + desc->bd_network_rw = 0; + + if (ev->type != PTL_EVENT_UNLINK && + ev->status == PTL_OK) { + desc->bd_success = 1; + desc->bd_nob_transferred = ev->mlength; } + /* NB don't unlock till after wakeup; desc can disappear under us + * otherwise */ + ptlrpc_wake_client_req(desc->bd_req); + spin_unlock_irqrestore (&desc->bd_lock, flags); - RETURN(0); + EXIT; } -struct ptlrpc_bulk_desc ptlrpc_bad_desc; -ptl_event_t ptlrpc_bad_event; - -static int bulk_put_sink_callback(ptl_event_t *ev) +/* + * Server's incoming request callback + */ +void request_in_callback(ptl_event_t *ev) { - struct ptlrpc_bulk_desc *desc = ev->mem_desc.user_ptr; - unsigned long flags; + struct ptlrpc_cb_id *cbid = ev->mem_desc.user_ptr; + struct ptlrpc_request_buffer_desc *rqbd = cbid->cbid_arg; + struct ptlrpc_srv_ni *srv_ni = rqbd->rqbd_srv_ni; + struct ptlrpc_service *service = srv_ni->sni_service; + struct ptlrpc_request *req; + long flags; ENTRY; - LASSERT(ev->type == PTL_EVENT_PUT); - - /* used iovs */ - LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == - PTL_MD_KIOV); - /* Honestly, it's best to find out early. 
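
Both reply_in_callback() and client_bulk_callback() wake the waiter before dropping the lock, and their comments say why: the callback holds no reference of its own, so the instant the waiter can observe completion it may free the object. Waking under the lock makes the unlock the callback's final touch. The rule as a tiny pthread analogue:

    #include <pthread.h>

    struct ex_obj {
            pthread_mutex_t lock;
            pthread_cond_t  waitq;
            int             done;
    };

    /* No reference held: obj may be freed as soon as the waiter runs,
     * so signal while locked and never touch obj after the unlock. */
    static void ex_complete(struct ex_obj *obj)
    {
            pthread_mutex_lock(&obj->lock);
            obj->done = 1;
            pthread_cond_broadcast(&obj->waitq);  /* wake while locked */
            pthread_mutex_unlock(&obj->lock);     /* last touch of obj */
    }
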
*/ - if (desc->bd_page_count == 0x5a5a5a5a || - desc->bd_page_count != ev->mem_desc.niov || - ev->mem_desc.start != &desc->bd_iov) { - /* not guaranteed (don't LASSERT) but good for this bug hunt */ - ptlrpc_bad_event = *ev; - ptlrpc_bad_desc = *desc; - CERROR ("XXX ev %p type %d portal %d match "LPX64", seq %ld\n", - ev, ev->type, ev->portal, ev->match_bits, ev->sequence); - CERROR ("XXX desc %p, export %p import %p gen %d " - " portal %d\n", - desc, desc->bd_export, - desc->bd_import, desc->bd_import_generation, - desc->bd_portal); - RETURN (0); + LASSERT (ev->type == PTL_EVENT_PUT || + ev->type == PTL_EVENT_UNLINK); + LASSERT ((char *)ev->mem_desc.start >= rqbd->rqbd_buffer); + LASSERT ((char *)ev->mem_desc.start + ev->offset + ev->mlength <= + rqbd->rqbd_buffer + service->srv_buf_size); + + CDEBUG((ev->status == PTL_OK) ? D_NET : D_ERROR, + "event type %d, status %d, service %s\n", + ev->type, ev->status, service->srv_name); + + if (ev->unlinked) { + /* If this is the last request message to fit in the + * request buffer we can use the request object embedded in + * rqbd. Note that if we failed to allocate a request, + * we'd have to re-post the rqbd, which we can't do in this + * context. */ + req = &rqbd->rqbd_req; + memset(req, 0, sizeof (*req)); + } else { + LASSERT (ev->type == PTL_EVENT_PUT); + if (ev->status != PTL_OK) { + /* We moaned above already... */ + return; + } + OBD_ALLOC_GFP(req, sizeof(*req), GFP_ATOMIC); + if (req == NULL) { + CERROR("Can't allocate incoming request descriptor: " + "Dropping %s RPC from "LPX64"\n", + service->srv_name, ev->initiator.nid); + return; + } } - - LASSERT(desc->bd_page_count != 0x5a5a5a5a); - /* 1 fragment for each page always */ - LASSERT(ev->mem_desc.niov == desc->bd_page_count); - LASSERT(ev->match_bits == desc->bd_req->rq_xid); - - /* peer must put with zero offset */ - if (ev->offset != 0) { - /* Bug 1190: handle this as a protocol failure */ - CERROR ("Bad offset %d\n", ev->offset); - LBUG (); + + /* NB we ABSOLUTELY RELY on req being zeroed, so pointers are NULL, + * flags are reset and scalars are zero. We only set the message + * size to non-zero if this was a successful receive. */ + req->rq_xid = ev->match_bits; + req->rq_reqmsg = ev->mem_desc.start + ev->offset; + if (ev->type == PTL_EVENT_PUT && + ev->status == PTL_OK) + req->rq_reqlen = ev->mlength; + req->rq_arrival_time = ev->arrival_time; + req->rq_peer.peer_nid = ev->initiator.nid; + req->rq_peer.peer_ni = rqbd->rqbd_srv_ni->sni_ni; + req->rq_rqbd = rqbd; + + spin_lock_irqsave (&service->srv_lock, flags); + + if (ev->unlinked) { + srv_ni->sni_nrqbd_receiving--; + if (ev->type != PTL_EVENT_UNLINK && + srv_ni->sni_nrqbd_receiving == 0) { + /* This service is off-air on this interface because + * all its request buffers are busy. Portals will + * start dropping incoming requests until more buffers + * get posted. NB don't moan if it's because we're + * tearing down the service. 
*/ + CWARN("All %s %s request buffers busy\n", + service->srv_name, srv_ni->sni_ni->pni_name); + } + /* req takes over the network's ref on rqbd */ + } else { + /* req takes a ref on rqbd */ + rqbd->rqbd_refcount++; } - /* No check for total # bytes; this could be a short read */ + list_add_tail(&req->rq_list, &service->srv_request_queue); + service->srv_n_queued_reqs++; + rqbd->rqbd_eventcount++; - spin_lock_irqsave (&desc->bd_lock, flags); - desc->bd_network_rw = 0; - desc->bd_complete = 1; - if (desc->bd_req->rq_set != NULL) - wake_up (&desc->bd_req->rq_set->set_waitq); - else - wake_up (&desc->bd_req->rq_reply_waitq); - spin_unlock_irqrestore (&desc->bd_lock, flags); + /* NB everything can disappear under us once the request + * has been queued and we unlock, so do the wake now... */ + wake_up(&service->srv_waitq); - RETURN(1); + spin_unlock_irqrestore(&service->srv_lock, flags); + EXIT; } -static int bulk_get_source_callback(ptl_event_t *ev) +/* + * Server's outgoing reply callback + */ +void reply_out_callback(ptl_event_t *ev) { - struct ptlrpc_bulk_desc *desc = ev->mem_desc.user_ptr; - struct ptlrpc_bulk_page *bulk; - struct list_head *tmp; - unsigned long flags; - ptl_size_t total = 0; + struct ptlrpc_cb_id *cbid = ev->mem_desc.user_ptr; + struct ptlrpc_reply_state *rs = cbid->cbid_arg; + struct ptlrpc_srv_ni *sni = rs->rs_srv_ni; + struct ptlrpc_service *svc = sni->sni_service; + unsigned long flags; ENTRY; - LASSERT(ev->type == PTL_EVENT_GET); - - /* used iovs */ - LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == - PTL_MD_KIOV); - /* 1 fragment for each page always */ - LASSERT(ev->mem_desc.niov == desc->bd_page_count); - LASSERT(ev->match_bits == desc->bd_req->rq_xid); - - /* peer must get with zero offset */ - if (ev->offset != 0) { - /* Bug 1190: handle this as a protocol failure */ - CERROR ("Bad offset %d\n", ev->offset); - LBUG (); + LASSERT (ev->type == PTL_EVENT_SENT || + ev->type == PTL_EVENT_ACK || + ev->type == PTL_EVENT_UNLINK); + + if (!rs->rs_difficult) { + /* I'm totally responsible for freeing "easy" replies */ + LASSERT (ev->unlinked); + lustre_free_reply_state (rs); + atomic_dec (&svc->srv_outstanding_replies); + EXIT; + return; } - - list_for_each (tmp, &desc->bd_page_list) { - bulk = list_entry(tmp, struct ptlrpc_bulk_page, bp_link); - total += bulk->bp_buflen; - } + LASSERT (rs->rs_on_net); - /* peer must get everything */ - if (ev->mem_desc.length != total) { - /* Bug 1190: handle this as a protocol failure */ - CERROR ("Bad length/total %d/%d\n", ev->mem_desc.length, total); - LBUG (); + if (ev->unlinked) { + /* Last network callback */ + spin_lock_irqsave (&svc->srv_lock, flags); + rs->rs_on_net = 0; + ptlrpc_schedule_difficult_reply (rs); + spin_unlock_irqrestore (&svc->srv_lock, flags); } - spin_lock_irqsave (&desc->bd_lock, flags); - desc->bd_network_rw = 0; - desc->bd_complete = 1; - if (desc->bd_req->rq_set != NULL) - wake_up (&desc->bd_req->rq_set->set_waitq); - else - wake_up (&desc->bd_req->rq_reply_waitq); - spin_unlock_irqrestore (&desc->bd_lock, flags); - - RETURN(1); + EXIT; } -static int bulk_get_sink_callback(ptl_event_t *ev) +/* + * Server's bulk completion callback + */ +void server_bulk_callback (ptl_event_t *ev) { - struct ptlrpc_bulk_desc *desc = ev->mem_desc.user_ptr; + struct ptlrpc_cb_id *cbid = ev->mem_desc.user_ptr; + struct ptlrpc_bulk_desc *desc = cbid->cbid_arg; unsigned long flags; ENTRY; - CDEBUG(D_NET, "got %s event %d desc %p\n", - (ev->type == PTL_EVENT_SENT) ? "SENT" : - (ev->type == PTL_EVENT_REPLY) ? 
"REPLY" : "UNEXPECTED", - ev->type, desc); + LASSERT (ev->type == PTL_EVENT_SENT || + ev->type == PTL_EVENT_UNLINK || + (desc->bd_type == BULK_PUT_SOURCE && + ev->type == PTL_EVENT_ACK) || + (desc->bd_type == BULK_GET_SINK && + ev->type == PTL_EVENT_REPLY)); - LASSERT(ev->type == PTL_EVENT_SENT || ev->type == PTL_EVENT_REPLY); - - /* 1 fragment for each page always */ - LASSERT(ev->mem_desc.niov == desc->bd_page_count); + CDEBUG((ev->status == PTL_OK) ? D_NET : D_ERROR, + "event type %d, status %d, desc %p\n", + ev->type, ev->status, desc); spin_lock_irqsave (&desc->bd_lock, flags); - LASSERT(desc->bd_callback_count > 0 && - desc->bd_callback_count <= 2); + + if ((ev->type == PTL_EVENT_ACK || + ev->type == PTL_EVENT_REPLY) && + ev->status == PTL_OK) { + /* We heard back from the peer, so even if we get this + * before the SENT event (oh yes we can), we know we + * read/wrote the peer buffer and how much... */ + desc->bd_success = 1; + desc->bd_nob_transferred = ev->mlength; + } - if (--desc->bd_callback_count == 0) { + if (ev->unlinked) { + /* This is the last callback no matter what... */ desc->bd_network_rw = 0; - desc->bd_complete = 1; wake_up(&desc->bd_waitq); } + spin_unlock_irqrestore (&desc->bd_lock, flags); + EXIT; +} + +static int ptlrpc_master_callback(ptl_event_t *ev) +{ + struct ptlrpc_cb_id *cbid = ev->mem_desc.user_ptr; + void (*callback)(ptl_event_t *ev) = cbid->cbid_fn; - RETURN(0); + /* Honestly, it's best to find out early. */ + LASSERT (cbid->cbid_arg != (void *)0x5a5a5a5a5a5a5a5a); + LASSERT (callback == request_out_callback || + callback == reply_in_callback || + callback == client_bulk_callback || + callback == request_in_callback || + callback == reply_out_callback || + callback == server_bulk_callback); + + callback (ev); + return (0); } int ptlrpc_uuid_to_peer (struct obd_uuid *uuid, struct ptlrpc_peer *peer) @@ -368,14 +365,7 @@ int ptlrpc_uuid_to_peer (struct obd_uuid *uuid, struct ptlrpc_peer *peer) void ptlrpc_ni_fini(struct ptlrpc_ni *pni) { - PtlEQFree(pni->pni_request_out_eq_h); - PtlEQFree(pni->pni_reply_out_eq_h); - PtlEQFree(pni->pni_reply_in_eq_h); - PtlEQFree(pni->pni_bulk_put_source_eq_h); - PtlEQFree(pni->pni_bulk_put_sink_eq_h); - PtlEQFree(pni->pni_bulk_get_source_eq_h); - PtlEQFree(pni->pni_bulk_get_sink_eq_h); - + PtlEQFree(pni->pni_eq_h); kportal_put_ni (pni->pni_number); } @@ -395,51 +385,18 @@ int ptlrpc_ni_init(int number, char *name, struct ptlrpc_ni *pni) pni->pni_number = number; pni->pni_ni_h = *nip; - pni->pni_request_out_eq_h = PTL_HANDLE_NONE; - pni->pni_reply_out_eq_h = PTL_HANDLE_NONE; - pni->pni_reply_in_eq_h = PTL_HANDLE_NONE; - pni->pni_bulk_put_source_eq_h = PTL_HANDLE_NONE; - pni->pni_bulk_put_sink_eq_h = PTL_HANDLE_NONE; - pni->pni_bulk_get_source_eq_h = PTL_HANDLE_NONE; - pni->pni_bulk_get_sink_eq_h = PTL_HANDLE_NONE; - - /* NB We never actually PtlEQGet() out of these events queues since - * we're only interested in the event callback, so we can just let - * them wrap. Their sizes aren't a big deal, apart from providing - * a little history for debugging... 
*/ - - rc = PtlEQAlloc(pni->pni_ni_h, 1024, request_out_callback, - &pni->pni_request_out_eq_h); - if (rc != PTL_OK) - GOTO (fail, rc = -ENOMEM); - - rc = PtlEQAlloc(pni->pni_ni_h, 1024, reply_out_callback, - &pni->pni_reply_out_eq_h); - if (rc != PTL_OK) - GOTO (fail, rc = -ENOMEM); - - rc = PtlEQAlloc(pni->pni_ni_h, 1024, reply_in_callback, - &pni->pni_reply_in_eq_h); - if (rc != PTL_OK) - GOTO (fail, rc = -ENOMEM); - - rc = PtlEQAlloc(pni->pni_ni_h, 1024, bulk_put_source_callback, - &pni->pni_bulk_put_source_eq_h); - if (rc != PTL_OK) - GOTO (fail, rc = -ENOMEM); + pni->pni_eq_h = PTL_HANDLE_NONE; - rc = PtlEQAlloc(pni->pni_ni_h, 1024, bulk_put_sink_callback, - &pni->pni_bulk_put_sink_eq_h); - if (rc != PTL_OK) - GOTO (fail, rc = -ENOMEM); - - rc = PtlEQAlloc(pni->pni_ni_h, 1024, bulk_get_source_callback, - &pni->pni_bulk_get_source_eq_h); - if (rc != PTL_OK) - GOTO (fail, rc = -ENOMEM); - - rc = PtlEQAlloc(pni->pni_ni_h, 1024, bulk_get_sink_callback, - &pni->pni_bulk_get_sink_eq_h); +#ifdef __KERNEL__ + /* kernel: portals calls the callback when the event is added to the + * queue, so we don't care if we lose events */ + rc = PtlEQAlloc(pni->pni_ni_h, 1024, ptlrpc_master_callback, + &pni->pni_eq_h); +#else + /* liblustre: no asynchronous callback and allocate a nice big event + * queue so we don't drop any events... */ + rc = PtlEQAlloc(pni->pni_ni_h, 10240, NULL, &pni->pni_eq_h); +#endif if (rc != PTL_OK) GOTO (fail, rc = -ENOMEM); @@ -454,18 +411,42 @@ int ptlrpc_ni_init(int number, char *name, struct ptlrpc_ni *pni) } #ifndef __KERNEL__ +LIST_HEAD(liblustre_wait_callbacks); +void *liblustre_services_callback; + +void * +liblustre_register_wait_callback (int (*fn)(void *arg), void *arg) +{ + struct liblustre_wait_callback *llwc; + + OBD_ALLOC(llwc, sizeof(*llwc)); + LASSERT (llwc != NULL); + + llwc->llwc_fn = fn; + llwc->llwc_arg = arg; + list_add_tail(&llwc->llwc_list, &liblustre_wait_callbacks); + + return (llwc); +} + +void +liblustre_deregister_wait_callback (void *opaque) +{ + struct liblustre_wait_callback *llwc = opaque; + + list_del(&llwc->llwc_list); + OBD_FREE(llwc, sizeof(*llwc)); +} + int -liblustre_check_events (int block) +liblustre_check_events (int timeout) { ptl_event_t ev; int rc; ENTRY; - if (block) { - /* XXX to accelerate recovery tests XXX */ - if (block > 10) - block = 10; - rc = PtlEQWait_timeout(ptlrpc_interfaces[0].pni_eq_h, &ev, block); + if (timeout) { + rc = PtlEQWait_timeout(ptlrpc_interfaces[0].pni_eq_h, &ev, timeout); } else { rc = PtlEQGet (ptlrpc_interfaces[0].pni_eq_h, &ev); } @@ -474,36 +455,58 @@ liblustre_check_events (int block) LASSERT (rc == PTL_EQ_DROPPED || rc == PTL_OK); -#if PORTALS_DOES_NOT_SUPPORT_CALLBACKS - if (rc == PTL_EQ_DROPPED) +#ifndef __KERNEL__ + /* liblustre: no asynchronous callback so we can't afford to miss any + * events... 
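A hypothetical liblustre consumer of the new hooks; the progress function and its queue argument are invented for illustration, only the three entry points come from this patch:

static int
my_progress_cb (void *arg)
{
        /* must return non-zero if it found any work to do */
        return (drain_my_queue(arg));
}

        void *h = liblustre_register_wait_callback(my_progress_cb, &my_queue);
        liblustre_wait_event(0);    /* poll once: the EQ plus all callbacks */
        liblustre_wait_event(10);   /* or block up to 10s for an event */
        liblustre_deregister_wait_callback(h);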
*/ + if (rc == PTL_EQ_DROPPED) { CERROR ("Dropped an event!!!\n"); + abort(); + } ptlrpc_master_callback (&ev); #endif RETURN(1); } -int liblustre_wait_event(struct l_wait_info *lwi) +int +liblustre_wait_event (int timeout) { - ENTRY; - - /* non-blocking checks (actually we might block in a service for - * bulk but we won't block in a blocked service) - */ - if (liblustre_check_events(0) || - liblustre_check_services()) { - /* the condition the caller is waiting for may now hold */ - RETURN(0); + struct list_head *tmp; + struct liblustre_wait_callback *llwc; + int found_something = 0; + + /* First check for any new events */ + if (liblustre_check_events(0)) + found_something = 1; + + /* Now give all registered callbacks a bite at the cherry */ + list_for_each(tmp, &liblustre_wait_callbacks) { + llwc = list_entry(tmp, struct liblustre_wait_callback, + llwc_list); + + if (llwc->llwc_fn(llwc->llwc_arg)) + found_something = 1; } - - /* block for an event */ - liblustre_check_events(lwi->lwi_timeout); - /* check it's not for some service */ - liblustre_check_services (); + /* return to caller if something happened */ + if (found_something) + return 1; + + /* block for an event, returning immediately on timeout */ + if (!liblustre_check_events(timeout)) + return 0; + + /* an event occurred; let all registered callbacks progress... */ + list_for_each(tmp, &liblustre_wait_callbacks) { + llwc = list_entry(tmp, struct liblustre_wait_callback, + llwc_list); + + if (llwc->llwc_fn(llwc->llwc_arg)) + found_something = 1; + } - /* XXX check this */ - RETURN(0); + /* ...and tell caller something happened */ + return 1; } #endif @@ -541,11 +544,18 @@ int ptlrpc_init_portals(void) "loaded?\n"); return -EIO; } +#ifndef __KERNEL__ + liblustre_services_callback = + liblustre_register_wait_callback(&liblustre_check_services, NULL); +#endif return 0; } void ptlrpc_exit_portals(void) { +#ifndef __KERNEL__ + liblustre_deregister_wait_callback(liblustre_services_callback); +#endif while (ptlrpc_ninterfaces > 0) ptlrpc_ni_fini (&ptlrpc_interfaces[--ptlrpc_ninterfaces]); } diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 2fd25ec..5bc9e3f 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -215,6 +215,10 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid) if (!request) GOTO(out, rc = -ENOMEM); +#ifndef __KERNEL__ + lustre_msg_add_op_flags(request->rq_reqmsg, MSG_CONNECT_LIBCLIENT); +#endif + request->rq_send_state = LUSTRE_IMP_CONNECTING; request->rq_replen = lustre_msg_size(0, NULL); request->rq_interpret_reply = ptlrpc_connect_interpret; @@ -229,6 +233,7 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid) if (aa->pcaa_initial_connect) imp->imp_replayable = 1; + ptlrpcd_add_req(request); rc = 0; out: @@ -349,8 +354,10 @@ finish: out: if (rc != 0) { IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON); - if (aa->pcaa_initial_connect && !imp->imp_initial_recov) + if (aa->pcaa_initial_connect && !imp->imp_initial_recov) { + ptlrpc_set_import_active(imp, 0); GOTO(norecov, rc); + } CDEBUG(D_ERROR, "recovery of %s on %s failed (%d); restarting\n", imp->imp_target_uuid.uuid, diff --git a/lustre/ptlrpc/llog_net.c b/lustre/ptlrpc/llog_net.c index 7fc27fc..cdd70e2 100644 --- a/lustre/ptlrpc/llog_net.c +++ b/lustre/ptlrpc/llog_net.c @@ -137,8 +137,7 @@ EXPORT_SYMBOL(llog_initiator_connect); #else /* !__KERNEL__ */ int llog_origin_connect(struct llog_ctxt *ctxt, int count, - struct llog_logid *logid, - struct llog_ctxt_gen *gen) + struct llog_logid *logid, struct llog_gen 
*gen) { return 0; } diff --git a/lustre/ptlrpc/llog_server.c b/lustre/ptlrpc/llog_server.c index 4d9e68c..a15f67c 100644 --- a/lustre/ptlrpc/llog_server.c +++ b/lustre/ptlrpc/llog_server.c @@ -29,13 +29,20 @@ #define EXPORT_SYMTAB #endif +#ifndef __KERNEL__ +#include +#else #include +#endif + #include #include #include #include #include +#ifdef __KERNEL__ + int llog_origin_handle_create(struct ptlrpc_request *req) { struct obd_export *exp = req->rq_export; @@ -525,3 +532,31 @@ out_free: OBD_FREE(buf, buf_len); return rc; } + +#else /* !__KERNEL__ */ +int llog_origin_handle_create(struct ptlrpc_request *req) +{ + LBUG(); + return 0; +} +int llog_origin_handle_next_block(struct ptlrpc_request *req) +{ + LBUG(); + return 0; +} +int llog_origin_handle_read_header(struct ptlrpc_request *req) +{ + LBUG(); + return 0; +} +int llog_origin_handle_close(struct ptlrpc_request *req) +{ + LBUG(); + return 0; +} +int llog_origin_handle_cancel(struct ptlrpc_request *req) +{ + LBUG(); + return 0; +} +#endif diff --git a/lustre/ptlrpc/lproc_ptlrpc.c b/lustre/ptlrpc/lproc_ptlrpc.c index 1559403b..f783ebf 100644 --- a/lustre/ptlrpc/lproc_ptlrpc.c +++ b/lustre/ptlrpc/lproc_ptlrpc.c @@ -107,8 +107,8 @@ void ptlrpc_lprocfs_register(struct proc_dir_entry *root, char *dir, struct proc_dir_entry *svc_procroot; struct lprocfs_stats *svc_stats; int i, rc; - unsigned int svc_counter_config = LPROCFS_CNTR_EXTERNALLOCK | - LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV; + unsigned int svc_counter_config = LPROCFS_CNTR_AVGMINMAX | + LPROCFS_CNTR_STDDEV; LASSERT(*procroot_ret == NULL); LASSERT(*stats_ret == NULL); @@ -123,19 +123,16 @@ void ptlrpc_lprocfs_register(struct proc_dir_entry *root, char *dir, lprocfs_free_stats(svc_stats); return; } - } else + } else { svc_procroot = root; + } lprocfs_counter_init(svc_stats, PTLRPC_REQWAIT_CNTR, svc_counter_config, "req_waittime", "usec"); - /* Wait for b_eq branch - lprocfs_counter_init(svc_stats, PTLRPC_SVCEQDEPTH_CNTR, - svc_counter_config, "svc_eqdepth", "reqs"); - */ - /* no stddev on idletime */ - lprocfs_counter_init(svc_stats, PTLRPC_SVCIDLETIME_CNTR, - (LPROCFS_CNTR_EXTERNALLOCK|LPROCFS_CNTR_AVGMINMAX), - "svc_idletime", "usec"); + lprocfs_counter_init(svc_stats, PTLRPC_REQQDEPTH_CNTR, + svc_counter_config, "req_qdepth", "reqs"); + lprocfs_counter_init(svc_stats, PTLRPC_REQACTIVE_CNTR, + svc_counter_config, "req_active", "reqs"); for (i = 0; i < LUSTRE_MAX_OPCODES; i++) { __u32 opcode = ll_rpc_opcode_table[i].opcode; lprocfs_counter_init(svc_stats, PTLRPC_LAST_CNTR + i, @@ -159,14 +156,14 @@ void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry, struct ptlrpc_service *svc) { ptlrpc_lprocfs_register(entry, svc->srv_name, - "stats", &svc->srv_procroot, + "stats", &svc->srv_procroot, &svc->srv_stats); } void ptlrpc_lprocfs_register_obd(struct obd_device *obddev) { - ptlrpc_lprocfs_register(obddev->obd_proc_entry, NULL, "stats", - &obddev->obd_svc_procroot, + ptlrpc_lprocfs_register(obddev->obd_proc_entry, NULL, "stats", + &obddev->obd_svc_procroot, &obddev->obd_svc_stats); } diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index ab6684a..c25db89 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -31,15 +31,16 @@ #include #include "ptlrpc_internal.h" -static int ptl_send_buf(struct ptlrpc_request *request, - struct ptlrpc_connection *conn, int portal) +static int ptl_send_buf (ptl_handle_md_t *mdh, void *base, int len, + ptl_ack_req_t ack, struct ptlrpc_cb_id *cbid, + struct ptlrpc_connection *conn, int portal, __u64 xid) { - int rc; 
- int rc2; ptl_process_id_t remote_id; - ptl_handle_md_t md_h; - ptl_ack_req_t ack_req; + int rc; + int rc2; + ptl_md_t md; char str[PTL_NALFMT_SIZE]; + ENTRY; LASSERT (portal != 0); LASSERT (conn != NULL); @@ -50,156 +51,82 @@ static int ptl_send_buf(struct ptlrpc_request *request, conn->c_peer.peer_nid, str), conn->c_peer.peer_ni->pni_name); - request->rq_req_md.user_ptr = request; - - switch (request->rq_type) { - case PTL_RPC_MSG_REQUEST: - request->rq_reqmsg->type = request->rq_type; - request->rq_req_md.start = request->rq_reqmsg; - request->rq_req_md.length = request->rq_reqlen; - request->rq_req_md.eventq = - conn->c_peer.peer_ni->pni_request_out_eq_h; - LASSERT (!request->rq_want_ack); - break; - case PTL_RPC_MSG_ERR: - case PTL_RPC_MSG_REPLY: - request->rq_repmsg->type = request->rq_type; - request->rq_req_md.start = request->rq_repmsg; - request->rq_req_md.length = request->rq_replen; - request->rq_req_md.eventq = - conn->c_peer.peer_ni->pni_reply_out_eq_h; - break; - default: - LBUG(); - return -1; /* notreached */ - } - if (request->rq_want_ack) { - request->rq_req_md.threshold = 2; /* SENT and ACK */ - ack_req = PTL_ACK_REQ; - } else { - request->rq_req_md.threshold = 1; - ack_req = PTL_NOACK_REQ; - } - request->rq_req_md.options = PTL_MD_OP_PUT; - request->rq_req_md.user_ptr = request; + remote_id.nid = conn->c_peer.peer_nid, + remote_id.pid = 0; - if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_ACK | OBD_FAIL_ONCE)) { - request->rq_req_md.options |= PTL_MD_ACK_DISABLE; + md.start = base; + md.length = len; + md.threshold = (ack == PTL_ACK_REQ) ? 2 : 1; + md.options = 0; + md.user_ptr = cbid; + md.eventq = conn->c_peer.peer_ni->pni_eq_h; + + if (ack == PTL_ACK_REQ && + OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_ACK | OBD_FAIL_ONCE)) { + /* don't ask for the ack to simulate failing client */ + ack = PTL_NOACK_REQ; obd_fail_loc |= OBD_FAIL_ONCE | OBD_FAILED; } - /* NB if the send fails, we back out of the send and return - * failure; it's down to the caller to handle missing callbacks */ - - rc = PtlMDBind(conn->c_peer.peer_ni->pni_ni_h, request->rq_req_md, - &md_h); + rc = PtlMDBind (conn->c_peer.peer_ni->pni_ni_h, md, mdh); if (rc != PTL_OK) { - CERROR("PtlMDBind failed: %d\n", rc); + CERROR ("PtlMDBind failed: %d\n", rc); LASSERT (rc == PTL_NOSPACE); RETURN (-ENOMEM); } - if (request->rq_type != PTL_RPC_MSG_REQUEST) - memcpy(&request->rq_reply_md_h, &md_h, sizeof(md_h)); - - remote_id.nid = conn->c_peer.peer_nid; - remote_id.pid = 0; CDEBUG(D_NET, "Sending %d bytes to portal %d, xid "LPD64"\n", - request->rq_req_md.length, portal, request->rq_xid); + len, portal, xid); - rc = PtlPut(md_h, ack_req, remote_id, portal, 0, request->rq_xid, 0, 0); + rc = PtlPut (*mdh, ack, remote_id, portal, 0, xid, 0, 0); if (rc != PTL_OK) { + /* We're going to get an UNLINK event when I unlink below, + * which will complete just like any other failed send, so + * I fall through and return success here! */ CERROR("PtlPut("LPU64", %d, "LPD64") failed: %d\n", - remote_id.nid, portal, request->rq_xid, rc); - rc2 = PtlMDUnlink(md_h); + remote_id.nid, portal, xid, rc); + rc2 = PtlMDUnlink(*mdh); LASSERT (rc2 == PTL_OK); - RETURN ((rc == PTL_NOSPACE) ? 
-ENOMEM : -ECOMM); } - return 0; + RETURN (0); } -static inline ptl_kiov_t * -ptlrpc_get_bulk_iov (struct ptlrpc_bulk_desc *desc) +int ptlrpc_start_bulk_transfer (struct ptlrpc_bulk_desc *desc) { - ptl_kiov_t *iov; - - if (desc->bd_page_count <= sizeof (desc->bd_iov)/sizeof (*iov)) - return (desc->bd_iov); - - OBD_ALLOC (iov, desc->bd_page_count * sizeof (*iov)); - if (iov == NULL) - LBUG(); - - return (iov); -} - -static inline void -ptlrpc_put_bulk_iov (struct ptlrpc_bulk_desc *desc, ptl_kiov_t *iov) -{ - if (desc->bd_page_count <= sizeof (desc->bd_iov)/sizeof (*iov)) - return; - - OBD_FREE (iov, desc->bd_page_count * sizeof (*iov)); -} - -int ptlrpc_bulk_put(struct ptlrpc_bulk_desc *desc) -{ - int rc; - int rc2; + int rc; + int rc2; struct ptlrpc_peer *peer; - struct list_head *tmp, *next; - ptl_process_id_t remote_id; - ptl_kiov_t *iov; - __u64 xid; + ptl_process_id_t remote_id; + ptl_md_t md; + __u64 xid; ENTRY; /* NB no locking required until desc is on the network */ LASSERT (!desc->bd_network_rw); - LASSERT (desc->bd_type == BULK_PUT_SOURCE); - desc->bd_complete = 0; - - iov = ptlrpc_get_bulk_iov (desc); - if (iov == NULL) - RETURN (-ENOMEM); - + LASSERT (desc->bd_type == BULK_PUT_SOURCE || + desc->bd_type == BULK_GET_SINK); + desc->bd_success = 0; peer = &desc->bd_export->exp_connection->c_peer; - desc->bd_md.start = iov; - desc->bd_md.niov = 0; - desc->bd_md.length = 0; - desc->bd_md.eventq = peer->peer_ni->pni_bulk_put_source_eq_h; - desc->bd_md.threshold = 2; /* SENT and ACK */ - desc->bd_md.options = PTL_MD_OP_PUT | PTL_MD_KIOV; - desc->bd_md.user_ptr = desc; - - desc->bd_callback_count = 2; - - list_for_each_safe(tmp, next, &desc->bd_page_list) { - struct ptlrpc_bulk_page *bulk; - bulk = list_entry(tmp, struct ptlrpc_bulk_page, bp_link); - - LASSERT(desc->bd_md.niov < desc->bd_page_count); - - iov[desc->bd_md.niov].kiov_page = bulk->bp_page; - iov[desc->bd_md.niov].kiov_offset = bulk->bp_pageoffset; - iov[desc->bd_md.niov].kiov_len = bulk->bp_buflen; - - LASSERT (iov[desc->bd_md.niov].kiov_offset + - iov[desc->bd_md.niov].kiov_len <= PAGE_SIZE); - desc->bd_md.niov++; - desc->bd_md.length += bulk->bp_buflen; - } + md.start = &desc->bd_iov[0]; + md.niov = desc->bd_page_count; + md.length = desc->bd_nob; + md.eventq = peer->peer_ni->pni_eq_h; + md.threshold = 2; /* SENT and ACK/REPLY */ +#ifdef __KERNEL__ + md.options = PTL_MD_KIOV; +#else + md.options = PTL_MD_IOV; +#endif + md.user_ptr = &desc->bd_cbid; + LASSERT (desc->bd_cbid.cbid_fn == server_bulk_callback); + LASSERT (desc->bd_cbid.cbid_arg == desc); /* NB total length may be 0 for a read past EOF, so we send a 0 * length bulk, since the client expects a bulk event. 
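For orientation, a sketch of how a server-side caller is expected to drive this function; the handler shape and timeout policy are illustrative, not part of the patch:

        /* e.g. in a read handler, once desc->bd_iov has been filled in */
        rc = ptlrpc_start_bulk_transfer(desc);
        if (rc == 0) {
                /* server_bulk_callback() clears bd_network_rw and wakes us */
                lwi = LWI_TIMEOUT(obd_timeout * HZ, NULL, NULL);
                rc = l_wait_event(desc->bd_waitq,
                                  !ptlrpc_bulk_active(desc), &lwi);
                if (rc == -ETIMEDOUT)
                        ptlrpc_abort_bulk(desc);
        }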
*/ - LASSERT(desc->bd_md.niov == desc->bd_page_count); - - rc = PtlMDBind(peer->peer_ni->pni_ni_h, desc->bd_md, - &desc->bd_md_h); - - ptlrpc_put_bulk_iov (desc, iov); /*move down to reduce latency to send*/ + rc = PtlMDBind(peer->peer_ni->pni_ni_h, md, &desc->bd_md_h); if (rc != PTL_OK) { CERROR("PtlMDBind failed: %d\n", rc); LASSERT (rc == PTL_NOSPACE); @@ -211,109 +138,29 @@ int ptlrpc_bulk_put(struct ptlrpc_bulk_desc *desc) remote_id.nid = peer->peer_nid; remote_id.pid = 0; - CDEBUG(D_NET, "Sending %u pages %u bytes to portal %d on %s " - "nid "LPX64" pid %d xid "LPX64"\n", - desc->bd_md.niov, desc->bd_md.length, - desc->bd_portal, peer->peer_ni->pni_name, + CDEBUG(D_NET, "Transferring %u pages %u bytes via portal %d on %s " + "nid "LPX64" pid %d xid "LPX64"\n", + md.niov, md.length, desc->bd_portal, peer->peer_ni->pni_name, remote_id.nid, remote_id.pid, xid); + /* Network is about to get at the memory */ desc->bd_network_rw = 1; - rc = PtlPut(desc->bd_md_h, PTL_ACK_REQ, remote_id, - desc->bd_portal, 0, xid, 0, 0); - if (rc != PTL_OK) { - desc->bd_network_rw = 0; - CERROR("PtlPut("LPU64", %d, "LPX64") failed: %d\n", - remote_id.nid, desc->bd_portal, xid, rc); - rc2 = PtlMDUnlink(desc->bd_md_h); - LASSERT (rc2 == PTL_OK); - RETURN((rc == PTL_NOSPACE) ? -ENOMEM : -ECOMM); - } - - RETURN(0); -} - -int ptlrpc_bulk_get(struct ptlrpc_bulk_desc *desc) -{ - int rc; - int rc2; - struct ptlrpc_peer *peer; - struct list_head *tmp, *next; - ptl_process_id_t remote_id; - ptl_kiov_t *iov; - __u64 xid; - ENTRY; - - /* NB no locking required until desc is on the network */ - LASSERT (!desc->bd_network_rw); - LASSERT (desc->bd_type == BULK_GET_SINK); - desc->bd_complete = 0; - - iov = ptlrpc_get_bulk_iov (desc); - if (iov == NULL) - RETURN(-ENOMEM); - - peer = &desc->bd_export->exp_connection->c_peer; - - desc->bd_md.start = iov; - desc->bd_md.niov = 0; - desc->bd_md.length = 0; - desc->bd_md.eventq = peer->peer_ni->pni_bulk_get_sink_eq_h; - desc->bd_md.threshold = 2; /* SENT and REPLY */ - desc->bd_md.options = PTL_MD_OP_GET | PTL_MD_KIOV; - desc->bd_md.user_ptr = desc; - - desc->bd_callback_count = 2; - - list_for_each_safe(tmp, next, &desc->bd_page_list) { - struct ptlrpc_bulk_page *bulk; - bulk = list_entry(tmp, struct ptlrpc_bulk_page, bp_link); - LASSERT(desc->bd_md.niov < desc->bd_page_count); - - iov[desc->bd_md.niov].kiov_page = bulk->bp_page; - iov[desc->bd_md.niov].kiov_len = bulk->bp_buflen; - iov[desc->bd_md.niov].kiov_offset = bulk->bp_pageoffset; - - LASSERT (iov[desc->bd_md.niov].kiov_offset + - iov[desc->bd_md.niov].kiov_len <= PAGE_SIZE); - desc->bd_md.niov++; - desc->bd_md.length += bulk->bp_buflen; - } - - LASSERT(desc->bd_md.niov == desc->bd_page_count); - LASSERT(desc->bd_md.niov != 0); - - rc = PtlMDBind(peer->peer_ni->pni_ni_h, desc->bd_md, &desc->bd_md_h); - - ptlrpc_put_bulk_iov(desc, iov); /*move down to reduce latency to send*/ - - if (rc != PTL_OK) { - CERROR("PtlMDBind failed: %d\n", rc); - LASSERT (rc == PTL_NOSPACE); - RETURN(-ENOMEM); - } - - /* Client's bulk and reply matchbits are the same */ - xid = desc->bd_req->rq_xid; - remote_id.nid = desc->bd_export->exp_connection->c_peer.peer_nid; - remote_id.pid = 0; - - CDEBUG(D_NET, "Fetching %u pages %u bytes from portal %d on %s " - "nid "LPX64" pid %d xid "LPX64"\n", - desc->bd_md.niov, desc->bd_md.length, desc->bd_portal, - peer->peer_ni->pni_name, remote_id.nid, remote_id.pid, - xid); - - desc->bd_network_rw = 1; - rc = PtlGet(desc->bd_md_h, remote_id, desc->bd_portal, 0, - xid, 0); + if (desc->bd_type == 
BULK_PUT_SOURCE) + rc = PtlPut (desc->bd_md_h, PTL_ACK_REQ, remote_id, + desc->bd_portal, 0, xid, 0, 0); + else + rc = PtlGet (desc->bd_md_h, remote_id, + desc->bd_portal, 0, xid, 0); + if (rc != PTL_OK) { - desc->bd_network_rw = 0; - CERROR("PtlGet("LPU64", %d, "LPX64") failed: %d\n", + /* Can't send, so we unlink the MD bound above. The UNLINK + * event this creates will signal completion with failure, + * so we return SUCCESS here! */ + CERROR("Transfer("LPU64", %d, "LPX64") failed: %d\n", remote_id.nid, desc->bd_portal, xid, rc); rc2 = PtlMDUnlink(desc->bd_md_h); LASSERT (rc2 == PTL_OK); - RETURN((rc == PTL_NOSPACE) ? -ENOMEM : -ECOMM); } RETURN(0); @@ -323,166 +170,116 @@ void ptlrpc_abort_bulk (struct ptlrpc_bulk_desc *desc) { /* Server side bulk abort. Idempotent. Not thread-safe (i.e. only * serialises with completion callback) */ - unsigned long flags; struct l_wait_info lwi; - int callback_count; int rc; LASSERT (!in_interrupt ()); /* might sleep */ - /* NB. server-side bulk gets 2 events, so we have to keep trying to - * unlink the MD until all callbacks have happened, or - * PtlMDUnlink() returns OK or INVALID */ - again: - spin_lock_irqsave (&desc->bd_lock, flags); - if (!desc->bd_network_rw) { - /* completed or never even registered. NB holding bd_lock - * guarantees callback has completed if it ran. */ - spin_unlock_irqrestore (&desc->bd_lock, flags); - return; - } - - /* sample callback count while we have the lock */ - callback_count = desc->bd_callback_count; - spin_unlock_irqrestore (&desc->bd_lock, flags); + if (!ptlrpc_bulk_active(desc)) /* completed or */ + return; /* never started */ + + /* The unlink ensures the callback happens ASAP and is the last + * one. If it fails, it must be because completion just + * happened. */ rc = PtlMDUnlink (desc->bd_md_h); - switch (rc) { - default: - CERROR("PtlMDUnlink returned %d\n", rc); - LBUG (); - case PTL_OK: /* Won the race with the network */ - LASSERT (!desc->bd_complete); /* Not all callbacks ran */ - desc->bd_network_rw = 0; - return; - - case PTL_MD_INUSE: /* MD is being accessed right now */ - for (;;) { - /* Network access will complete in finite time but the - * timeout lets us CERROR for visibility */ - lwi = LWI_TIMEOUT (10 * HZ, NULL, NULL); - rc = l_wait_event(desc->bd_waitq, - desc->bd_callback_count != - callback_count, &lwi); - if (rc == -ETIMEDOUT) { - CERROR("Unexpectedly long timeout: desc %p\n", - desc); - continue; - } - LASSERT (rc == 0); - break; - } - /* go back and try again... 
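Completion tracking now rests entirely on bd_network_rw under bd_lock; ptlrpc_bulk_active() is not shown in this hunk, but given its uses here it is presumably just:

static inline int
ptlrpc_bulk_active (struct ptlrpc_bulk_desc *desc)
{
        unsigned long flags;
        int           rc;

        spin_lock_irqsave(&desc->bd_lock, flags);
        rc = desc->bd_network_rw;  /* set at send, cleared by the final event */
        spin_unlock_irqrestore(&desc->bd_lock, flags);
        return (rc);
}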
*/ - goto again; - - case PTL_INV_MD: /* Lost the race with completion */ - LASSERT (desc->bd_complete); /* Callbacks all ran */ - LASSERT (!desc->bd_network_rw); + if (rc == PTL_INV_MD) { + LASSERT(!ptlrpc_bulk_active(desc)); return; } + + LASSERT (rc == PTL_OK); + + for (;;) { + /* Network access will complete in finite time but the HUGE + * timeout lets us CWARN for visibility of sluggish NALs */ + lwi = LWI_TIMEOUT (300 * HZ, NULL, NULL); + rc = l_wait_event(desc->bd_waitq, + !ptlrpc_bulk_active(desc), &lwi); + if (rc == 0) + return; + + LASSERT(rc == -ETIMEDOUT); + CWARN("Unexpectedly long timeout: desc %p\n", desc); + } } int ptlrpc_register_bulk (struct ptlrpc_request *req) { struct ptlrpc_bulk_desc *desc = req->rq_bulk; struct ptlrpc_peer *peer; - struct list_head *tmp, *next; int rc; int rc2; - ptl_kiov_t *iov; ptl_process_id_t source_id; + ptl_handle_me_t me_h; + ptl_md_t md; ENTRY; /* NB no locking required until desc is on the network */ + LASSERT (desc->bd_nob > 0); LASSERT (!desc->bd_network_rw); LASSERT (desc->bd_page_count <= PTL_MD_MAX_PAGES); LASSERT (desc->bd_req != NULL); LASSERT (desc->bd_type == BULK_PUT_SINK || desc->bd_type == BULK_GET_SOURCE); - desc->bd_complete = 0; - - iov = ptlrpc_get_bulk_iov (desc); - if (iov == NULL) - return (-ENOMEM); + desc->bd_success = 0; peer = &desc->bd_import->imp_connection->c_peer; - desc->bd_md.start = iov; - desc->bd_md.niov = 0; - desc->bd_md.length = 0; - desc->bd_md.threshold = 1; - desc->bd_md.user_ptr = desc; - - if (desc->bd_type == BULK_GET_SOURCE) { - desc->bd_md.options = PTL_MD_OP_GET | PTL_MD_KIOV; - desc->bd_md.eventq = peer->peer_ni->pni_bulk_get_source_eq_h; - } else { - desc->bd_md.options = PTL_MD_OP_PUT | PTL_MD_KIOV; - desc->bd_md.eventq = peer->peer_ni->pni_bulk_put_sink_eq_h; - } - - list_for_each_safe(tmp, next, &desc->bd_page_list) { - struct ptlrpc_bulk_page *bulk; - bulk = list_entry(tmp, struct ptlrpc_bulk_page, bp_link); - - LASSERT(desc->bd_md.niov < desc->bd_page_count); - - iov[desc->bd_md.niov].kiov_page = bulk->bp_page; - iov[desc->bd_md.niov].kiov_len = bulk->bp_buflen; - iov[desc->bd_md.niov].kiov_offset = bulk->bp_pageoffset; - - LASSERT (bulk->bp_pageoffset + bulk->bp_buflen <= PAGE_SIZE); - desc->bd_md.niov++; - desc->bd_md.length += bulk->bp_buflen; - } - - LASSERT(desc->bd_md.niov == desc->bd_page_count); - LASSERT(desc->bd_md.niov != 0); + md.start = &desc->bd_iov[0]; + md.niov = desc->bd_page_count; + md.length = desc->bd_nob; + md.eventq = peer->peer_ni->pni_eq_h; + md.threshold = 1; /* PUT or GET */ + md.options = (desc->bd_type == BULK_GET_SOURCE) ? 
+ PTL_MD_OP_GET : PTL_MD_OP_PUT; +#ifdef __KERNEL__ + md.options |= PTL_MD_KIOV; +#else + md.options |= PTL_MD_IOV; +#endif + md.user_ptr = &desc->bd_cbid; + LASSERT (desc->bd_cbid.cbid_fn == client_bulk_callback); + LASSERT (desc->bd_cbid.cbid_arg == desc); /* XXX Registering the same xid on retried bulk makes my head * explode trying to understand how the original request's bulk * might interfere with the retried request -eeb */ LASSERT (!desc->bd_registered || req->rq_xid != desc->bd_last_xid); desc->bd_registered = 1; - desc->bd_last_xid = desc->bd_last_xid; + desc->bd_last_xid = req->rq_xid; source_id.nid = desc->bd_import->imp_connection->c_peer.peer_nid; source_id.pid = PTL_PID_ANY; rc = PtlMEAttach(peer->peer_ni->pni_ni_h, desc->bd_portal, source_id, req->rq_xid, 0, - PTL_UNLINK, PTL_INS_AFTER, &desc->bd_me_h); - + PTL_UNLINK, PTL_INS_AFTER, &me_h); if (rc != PTL_OK) { CERROR("PtlMEAttach failed: %d\n", rc); LASSERT (rc == PTL_NOSPACE); - GOTO(out, rc = -ENOMEM); + RETURN (-ENOMEM); } /* About to let the network at it... */ desc->bd_network_rw = 1; - rc = PtlMDAttach(desc->bd_me_h, desc->bd_md, PTL_UNLINK, - &desc->bd_md_h); + rc = PtlMDAttach(me_h, md, PTL_UNLINK, &desc->bd_md_h); if (rc != PTL_OK) { CERROR("PtlMDAttach failed: %d\n", rc); LASSERT (rc == PTL_NOSPACE); desc->bd_network_rw = 0; - rc2 = PtlMEUnlink (desc->bd_me_h); + rc2 = PtlMEUnlink (me_h); LASSERT (rc2 == PTL_OK); - GOTO(out, rc = -ENOMEM); + RETURN (-ENOMEM); } - rc = 0; CDEBUG(D_NET, "Setup bulk %s buffers: %u pages %u bytes, xid "LPX64", " "portal %u on %s\n", desc->bd_type == BULK_GET_SOURCE ? "get-source" : "put-sink", - desc->bd_md.niov, desc->bd_md.length, + md.niov, md.length, req->rq_xid, desc->bd_portal, peer->peer_ni->pni_name); - - out: - ptlrpc_put_bulk_iov (desc, iov); - RETURN(rc); + RETURN(0); } void ptlrpc_unregister_bulk (struct ptlrpc_request *req) @@ -491,101 +288,104 @@ void ptlrpc_unregister_bulk (struct ptlrpc_request *req) * thread-safe (i.e. only interlocks with completion callback). */ struct ptlrpc_bulk_desc *desc = req->rq_bulk; wait_queue_head_t *wq; - unsigned long flags; struct l_wait_info lwi; int rc; LASSERT (!in_interrupt ()); /* might sleep */ - spin_lock_irqsave (&desc->bd_lock, flags); - if (!desc->bd_network_rw) { /* completed or never even registered */ - spin_unlock_irqrestore (&desc->bd_lock, flags); - return; - } - spin_unlock_irqrestore (&desc->bd_lock, flags); + if (!ptlrpc_bulk_active(desc)) /* completed or */ + return; /* never registered */ + + LASSERT (desc->bd_req == req); /* bd_req NULL until registered */ - LASSERT (desc->bd_req == req); /* NB bd_req NULL until registered */ + /* the unlink ensures the callback happens ASAP and is the last + * one. If it fails, it must be because completion just + * happened. */ - /* NB... - * 1. If the MD unlink is successful, the ME gets unlinked too. - * 2. Since client-side bulk only gets a single event and a - * .. threshold of 1. If the MD was inuse at the first link - * .. attempt, the callback is due any minute, and the MD/ME will - * .. unlink themselves. 
- */ rc = PtlMDUnlink (desc->bd_md_h); - switch (rc) { - default: - CERROR("PtlMDUnlink returned %d\n", rc); - LBUG (); - case PTL_OK: /* Won the race with completion */ - LASSERT (!desc->bd_complete); /* Callback hasn't happened */ - desc->bd_network_rw = 0; - return; - case PTL_MD_INUSE: /* MD is being accessed right now */ - for (;;) { - /* Network access will complete in finite time but the - * timeout lets us CERROR for visibility */ - if (desc->bd_req->rq_set != NULL) - wq = &req->rq_set->set_waitq; - else - wq = &req->rq_reply_waitq; - lwi = LWI_TIMEOUT (10 * HZ, NULL, NULL); - rc = l_wait_event(*wq, ptlrpc_bulk_complete(desc), &lwi); - LASSERT (rc == 0 || rc == -ETIMEDOUT); - if (rc == 0) - break; - CERROR ("Unexpectedly long timeout: desc %p\n", desc); - LBUG(); - } - /* Fall through */ - case PTL_INV_MD: /* Lost the race with completion */ - LASSERT (desc->bd_complete);/* Callback has run to completion */ - LASSERT (!desc->bd_network_rw); + if (rc == PTL_INV_MD) { + LASSERT(!ptlrpc_bulk_active(desc)); return; } + + LASSERT (rc == PTL_OK); + + if (desc->bd_req->rq_set != NULL) + wq = &req->rq_set->set_waitq; + else + wq = &req->rq_reply_waitq; + + for (;;) { + /* Network access will complete in finite time but the HUGE + * timeout lets us CWARN for visibility of sluggish NALs */ + lwi = LWI_TIMEOUT (300 * HZ, NULL, NULL); + rc = l_wait_event(*wq, !ptlrpc_bulk_active(desc), &lwi); + if (rc == 0) + return; + + LASSERT (rc == -ETIMEDOUT); + CWARN("Unexpectedly long timeout: desc %p\n", desc); + } } -int ptlrpc_reply(struct ptlrpc_request *req) +int ptlrpc_send_reply (struct ptlrpc_request *req, int may_be_difficult) { - struct ptlrpc_connection *conn; - unsigned long flags; - int rc; + struct ptlrpc_service *svc = req->rq_rqbd->rqbd_srv_ni->sni_service; + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct ptlrpc_connection *conn; + int rc; /* We must already have a reply buffer (only ptlrpc_error() may be * called without one). We must also have a request buffer which * is either the actual (swabbed) incoming request, or a saved copy * if this is a req saved in target_queue_final_reply(). */ - LASSERT (req->rq_repmsg != NULL); LASSERT (req->rq_reqmsg != NULL); + LASSERT (rs != NULL); + LASSERT (req->rq_repmsg != NULL); + LASSERT (may_be_difficult || !rs->rs_difficult); + LASSERT (req->rq_repmsg == &rs->rs_msg); + LASSERT (rs->rs_cb_id.cbid_fn == reply_out_callback); + LASSERT (rs->rs_cb_id.cbid_arg == rs); - /* FIXME: we need to increment the count of handled events */ + LASSERT (req->rq_repmsg != NULL); if (req->rq_type != PTL_RPC_MSG_ERR) req->rq_type = PTL_RPC_MSG_REPLY; + req->rq_repmsg->type = req->rq_type; req->rq_repmsg->status = req->rq_status; - req->rq_repmsg->opc = req->rq_reqmsg->opc; + req->rq_repmsg->opc = req->rq_reqmsg->opc; if (req->rq_export == NULL) conn = ptlrpc_get_connection(&req->rq_peer, NULL); else conn = ptlrpc_connection_addref(req->rq_export->exp_connection); - init_waitqueue_head(&req->rq_reply_waitq); - rc = ptl_send_buf(req, conn, - req->rq_svc->srv_rep_portal); - if (rc != 0) { - /* Do what the callback handler would have done */ - OBD_FREE (req->rq_repmsg, req->rq_replen); + atomic_inc (&svc->srv_outstanding_replies); - spin_lock_irqsave (&req->rq_lock, flags); - req->rq_want_ack = 0; - spin_unlock_irqrestore (&req->rq_lock, flags); + rc = ptl_send_buf (&rs->rs_md_h, req->rq_repmsg, req->rq_replen, + rs->rs_difficult ? 
PTL_ACK_REQ : PTL_NOACK_REQ, + &rs->rs_cb_id, conn, + svc->srv_rep_portal, req->rq_xid); if (rc != 0) { + atomic_dec (&svc->srv_outstanding_replies); + + if (!rs->rs_difficult) { + /* Callers other than target_send_reply() expect me + * to clean up on a comms error */ + lustre_free_reply_state (rs); + req->rq_reply_state = NULL; + req->rq_repmsg = NULL; + } } ptlrpc_put_connection(conn); return rc; } +int ptlrpc_reply (struct ptlrpc_request *req) +{ + return (ptlrpc_send_reply (req, 0)); +} + int ptlrpc_error(struct ptlrpc_request *req) { int rc; @@ -597,10 +397,9 @@ int ptlrpc_error(struct ptlrpc_request *req) RETURN(rc); } - req->rq_type = PTL_RPC_MSG_ERR; - rc = ptlrpc_reply(req); + rc = ptlrpc_send_reply (req, 0); RETURN(rc); } @@ -612,6 +411,7 @@ int ptl_send_rpc(struct ptlrpc_request *request) unsigned long flags; ptl_process_id_t source_id; ptl_handle_me_t reply_me_h; + ptl_md_t reply_md; ENTRY; LASSERT (request->rq_type == PTL_RPC_MSG_REQUEST); @@ -629,6 +429,7 @@ int ptl_send_rpc(struct ptlrpc_request *request) } request->rq_reqmsg->handle = request->rq_import->imp_remote_handle; + request->rq_reqmsg->type = PTL_RPC_MSG_REQUEST; request->rq_reqmsg->conn_cnt = request->rq_import->imp_conn_cnt; source_id.nid = connection->c_peer.peer_nid; @@ -639,7 +440,7 @@ int ptl_send_rpc(struct ptlrpc_request *request) OBD_ALLOC(request->rq_repmsg, request->rq_replen); if (request->rq_repmsg == NULL) { LBUG(); - RETURN(-ENOMEM); + GOTO(cleanup_bulk, rc = -ENOMEM); } rc = PtlMEAttach(connection->c_peer.peer_ni->pni_ni_h, @@ -650,24 +451,34 @@ int ptl_send_rpc(struct ptlrpc_request *request) CERROR("PtlMEAttach failed: %d\n", rc); LASSERT (rc == PTL_NOSPACE); LBUG(); - GOTO(cleanup, rc = -ENOMEM); + GOTO(cleanup_repmsg, rc = -ENOMEM); } - request->rq_reply_md.start = request->rq_repmsg; - request->rq_reply_md.length = request->rq_replen; - request->rq_reply_md.threshold = 1; - request->rq_reply_md.options = PTL_MD_OP_PUT; - request->rq_reply_md.user_ptr = request; - request->rq_reply_md.eventq = - connection->c_peer.peer_ni->pni_reply_in_eq_h; + spin_lock_irqsave (&request->rq_lock, flags); + /* If the MD attach succeeds, there _will_ be a reply_in callback */ + request->rq_receiving_reply = 1; + /* Clear any flags that may be present from previous sends. */ + request->rq_replied = 0; + request->rq_err = 0; + request->rq_timedout = 0; + request->rq_resend = 0; + request->rq_restart = 0; + spin_unlock_irqrestore (&request->rq_lock, flags); - rc = PtlMDAttach(reply_me_h, request->rq_reply_md, - PTL_UNLINK, &request->rq_reply_md_h); + reply_md.start = request->rq_repmsg; + reply_md.length = request->rq_replen; + reply_md.threshold = 1; + reply_md.options = PTL_MD_OP_PUT; + reply_md.user_ptr = &request->rq_reply_cbid; + reply_md.eventq = connection->c_peer.peer_ni->pni_eq_h; + + rc = PtlMDAttach(reply_me_h, reply_md, PTL_UNLINK, + &request->rq_reply_md_h); if (rc != PTL_OK) { CERROR("PtlMDAttach failed: %d\n", rc); LASSERT (rc == PTL_NOSPACE); LBUG(); - GOTO(cleanup2, rc -ENOMEM); + GOTO(cleanup_me, rc = -ENOMEM); } CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid "LPU64 @@ -676,87 +487,102 @@ int ptl_send_rpc(struct ptlrpc_request *request) request->rq_reply_portal, connection->c_peer.peer_ni->pni_name); - ptlrpc_request_addref(request); /* 1 ref for the SENT callback */ - - spin_lock_irqsave (&request->rq_lock, flags); - request->rq_receiving_reply = 1; - /* Clear any flags that may be present from previous sends. 
*/ - request->rq_replied = 0; - request->rq_err = 0; - request->rq_timedout = 0; - request->rq_resend = 0; - request->rq_restart = 0; - spin_unlock_irqrestore (&request->rq_lock, flags); + ptlrpc_request_addref(request); /* +1 ref for the SENT callback */ request->rq_sent = LTIME_S(CURRENT_TIME); ptlrpc_pinger_sending_on_import(request->rq_import); - rc = ptl_send_buf(request, connection, request->rq_request_portal); + rc = ptl_send_buf(&request->rq_req_md_h, + request->rq_reqmsg, request->rq_reqlen, + PTL_NOACK_REQ, &request->rq_req_cbid, + connection, + request->rq_request_portal, + request->rq_xid); if (rc == 0) { ptlrpc_lprocfs_rpc_sent(request); RETURN(rc); } - spin_lock_irqsave (&request->rq_lock, flags); - request->rq_receiving_reply = 0; - spin_unlock_irqrestore (&request->rq_lock, flags); ptlrpc_req_finished (request); /* drop callback ref */ - cleanup2: + + cleanup_me: /* MEUnlink is safe; the PUT didn't even get off the ground, and * nobody apart from the PUT's target has the right nid+XID to * access the reply buffer. */ rc2 = PtlMEUnlink(reply_me_h); LASSERT (rc2 == PTL_OK); - cleanup: + /* UNLINKED callback called synchronously */ + LASSERT (!request->rq_receiving_reply); + + cleanup_repmsg: OBD_FREE(request->rq_repmsg, request->rq_replen); request->rq_repmsg = NULL; + + cleanup_bulk: + if (request->rq_bulk != NULL) + ptlrpc_unregister_bulk(request); + return rc; } -void ptlrpc_link_svc_me(struct ptlrpc_request_buffer_desc *rqbd) +void ptlrpc_register_rqbd (struct ptlrpc_request_buffer_desc *rqbd) { - struct ptlrpc_srv_ni *srv_ni = rqbd->rqbd_srv_ni; - struct ptlrpc_service *service = srv_ni->sni_service; - static ptl_process_id_t match_id = {PTL_NID_ANY, PTL_PID_ANY}; - int rc; - ptl_md_t dummy; - ptl_handle_md_t md_h; - - LASSERT(atomic_read(&rqbd->rqbd_refcount) == 0); + struct ptlrpc_srv_ni *srv_ni = rqbd->rqbd_srv_ni; + struct ptlrpc_service *service = srv_ni->sni_service; + static ptl_process_id_t match_id = {PTL_NID_ANY, PTL_PID_ANY}; + int rc; + ptl_md_t md; + ptl_handle_me_t me_h; + unsigned long flags; CDEBUG(D_NET, "PtlMEAttach: portal %d on %s h %lx."LPX64"\n", service->srv_req_portal, srv_ni->sni_ni->pni_name, srv_ni->sni_ni->pni_ni_h.nal_idx, srv_ni->sni_ni->pni_ni_h.cookie); - /* Attach the leading ME on which we build the ring */ rc = PtlMEAttach(srv_ni->sni_ni->pni_ni_h, service->srv_req_portal, - match_id, 0, ~0, - PTL_UNLINK, PTL_INS_AFTER, &rqbd->rqbd_me_h); + match_id, 0, ~0, PTL_UNLINK, PTL_INS_AFTER, &me_h); if (rc != PTL_OK) { CERROR("PtlMEAttach failed: %d\n", rc); - /* BUG 1191 */ - LBUG(); + GOTO (failed, NULL); } - dummy.start = rqbd->rqbd_buffer; - dummy.length = service->srv_buf_size; - dummy.max_size = service->srv_max_req_size; - dummy.threshold = PTL_MD_THRESH_INF; - dummy.options = PTL_MD_OP_PUT | PTL_MD_MAX_SIZE | PTL_MD_AUTO_UNLINK; - dummy.user_ptr = rqbd; - dummy.eventq = srv_ni->sni_eq_h; - - atomic_inc(&srv_ni->sni_nrqbds_receiving); - atomic_set(&rqbd->rqbd_refcount, 1); /* 1 ref for portals */ - - rc = PtlMDAttach(rqbd->rqbd_me_h, dummy, PTL_UNLINK, &md_h); - if (rc != PTL_OK) { - CERROR("PtlMDAttach failed: %d\n", rc); - LASSERT (rc == PTL_NOSPACE); - LBUG(); - /* BUG 1191 */ - PtlMEUnlink (rqbd->rqbd_me_h); - atomic_set(&rqbd->rqbd_refcount, 0); - atomic_dec(&srv_ni->sni_nrqbds_receiving); + LASSERT(rqbd->rqbd_refcount == 0); + rqbd->rqbd_refcount = 1; + + md.start = rqbd->rqbd_buffer; + md.length = service->srv_buf_size; + md.max_size = service->srv_max_req_size; + md.threshold = PTL_MD_THRESH_INF; + md.options = PTL_MD_OP_PUT | 
PTL_MD_MAX_SIZE | PTL_MD_AUTO_UNLINK; + md.user_ptr = &rqbd->rqbd_cbid; + md.eventq = srv_ni->sni_ni->pni_eq_h; + + spin_lock_irqsave (&service->srv_lock, flags); + srv_ni->sni_nrqbd_receiving++; + spin_unlock_irqrestore (&service->srv_lock, flags); + + rc = PtlMDAttach(me_h, md, PTL_UNLINK, &rqbd->rqbd_md_h); + if (rc == PTL_OK) + return; + + CERROR("PtlMDAttach failed: %d\n", rc); + LASSERT (rc == PTL_NOSPACE); + rc = PtlMEUnlink (me_h); + LASSERT (rc == PTL_OK); + + spin_lock_irqsave (&service->srv_lock, flags); + srv_ni->sni_nrqbd_receiving--; + if (srv_ni->sni_nrqbd_receiving == 0) { + /* This service is off-air on this interface because all + * its request buffers are busy. Portals will have started + * dropping incoming requests until more buffers get + * posted */ + CERROR("All %s %s request buffers busy\n", + service->srv_name, srv_ni->sni_ni->pni_name); } + spin_unlock_irqrestore (&service->srv_lock, flags); + + failed: + LBUG(); /* BUG 1191 */ + /* put req on a retry list? */ } diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c index d29fe39..16ca32a 100644 --- a/lustre/ptlrpc/pack_generic.c +++ b/lustre/ptlrpc/pack_generic.c @@ -31,6 +31,7 @@ #endif #include +#include #include @@ -42,54 +43,114 @@ int lustre_msg_swabbed(struct lustre_msg *msg) return (msg->magic == __swab32(PTLRPC_MSG_MAGIC)); } -static int lustre_pack_msg(int count, int *lens, char **bufs, int *len, - struct lustre_msg **msg) +static void +lustre_init_msg (struct lustre_msg *msg, int count, int *lens, char **bufs) { char *ptr; - struct lustre_msg *m; - int size = 0, i; - - size = HDR_SIZE (count); + int i; + + msg->magic = PTLRPC_MSG_MAGIC; + msg->version = PTLRPC_MSG_VERSION; + msg->bufcount = count; for (i = 0; i < count; i++) - size += size_round(lens[i]); - - *len = size; + msg->buflens[i] = lens[i]; - OBD_ALLOC(*msg, *len); - if (!*msg) - RETURN(-ENOMEM); - - m = *msg; - m->magic = PTLRPC_MSG_MAGIC; - m->version = PTLRPC_MSG_VERSION; - m->bufcount = count; - for (i = 0; i < count; i++) - m->buflens[i] = lens[i]; + if (bufs == NULL) + return; - ptr = (char *)m + HDR_SIZE(count); + ptr = (char *)msg + HDR_SIZE(count); for (i = 0; i < count; i++) { - char *tmp = NULL; - if (bufs) - tmp = bufs[i]; + char *tmp = bufs[i]; LOGL(tmp, lens[i], ptr); - } +} + +int lustre_pack_request (struct ptlrpc_request *req, + int count, int *lens, char **bufs) +{ + ENTRY; + + req->rq_reqlen = lustre_msg_size (count, lens); + OBD_ALLOC(req->rq_reqmsg, req->rq_reqlen); + if (req->rq_reqmsg == NULL) + RETURN(-ENOMEM); - return 0; + lustre_init_msg (req->rq_reqmsg, count, lens, bufs); + RETURN (0); } -int lustre_pack_request(struct ptlrpc_request *req, int count, int *lens, - char **bufs) +#if RS_DEBUG +LIST_HEAD(ptlrpc_rs_debug_lru); +spinlock_t ptlrpc_rs_debug_lock = SPIN_LOCK_UNLOCKED; + +#define PTLRPC_RS_DEBUG_LRU_ADD(rs) \ +do { \ + unsigned long __flags; \ + \ + spin_lock_irqsave(&ptlrpc_rs_debug_lock, __flags); \ + list_add_tail(&(rs)->rs_debug_list, &ptlrpc_rs_debug_lru); \ + spin_unlock_irqrestore(&ptlrpc_rs_debug_lock, __flags); \ +} while (0) + +#define PTLRPC_RS_DEBUG_LRU_DEL(rs) \ +do { \ + unsigned long __flags; \ + \ + spin_lock_irqsave(&ptlrpc_rs_debug_lock, __flags); \ + list_del(&(rs)->rs_debug_list); \ + spin_unlock_irqrestore(&ptlrpc_rs_debug_lock, __flags); \ +} while (0) +#else +# define PTLRPC_RS_DEBUG_LRU_ADD(rs) do {} while(0) +# define PTLRPC_RS_DEBUG_LRU_DEL(rs) do {} while(0) +#endif + +int lustre_pack_reply (struct ptlrpc_request *req, + int count, int *lens, char **bufs) { - 
return lustre_pack_msg(count, lens, bufs, &req->rq_reqlen, - &req->rq_reqmsg); + struct ptlrpc_reply_state *rs; + int msg_len; + int size; + ENTRY; + + LASSERT (req->rq_reply_state == NULL); + + msg_len = lustre_msg_size (count, lens); + size = offsetof (struct ptlrpc_reply_state, rs_msg) + msg_len; + OBD_ALLOC (rs, size); + if (rs == NULL) + RETURN (-ENOMEM); + + rs->rs_cb_id.cbid_fn = reply_out_callback; + rs->rs_cb_id.cbid_arg = rs; + rs->rs_srv_ni = req->rq_rqbd->rqbd_srv_ni; + rs->rs_size = size; + INIT_LIST_HEAD(&rs->rs_exp_list); + INIT_LIST_HEAD(&rs->rs_obd_list); + + req->rq_replen = msg_len; + req->rq_reply_state = rs; + req->rq_repmsg = &rs->rs_msg; + lustre_init_msg (&rs->rs_msg, count, lens, bufs); + + PTLRPC_RS_DEBUG_LRU_ADD(rs); + + RETURN (0); } -int lustre_pack_reply(struct ptlrpc_request *req, int count, int *lens, - char **bufs) +void lustre_free_reply_state (struct ptlrpc_reply_state *rs) { - return lustre_pack_msg(count, lens, bufs, &req->rq_replen, - &req->rq_repmsg); + PTLRPC_RS_DEBUG_LRU_DEL(rs); + + LASSERT (!rs->rs_difficult || rs->rs_handled); + LASSERT (!rs->rs_on_net); + LASSERT (!rs->rs_scheduled); + LASSERT (rs->rs_export == NULL); + LASSERT (rs->rs_nlocks == 0); + LASSERT (list_empty(&rs->rs_exp_list)); + LASSERT (list_empty(&rs->rs_obd_list)); + + OBD_FREE (rs, rs->rs_size); } /* This returns the size of the buffer that is required to hold a lustre_msg diff --git a/lustre/ptlrpc/pinger.c b/lustre/ptlrpc/pinger.c index 3caf74e..ab85900 100644 --- a/lustre/ptlrpc/pinger.c +++ b/lustre/ptlrpc/pinger.c @@ -35,12 +35,12 @@ #include #include "ptlrpc_internal.h" -#ifdef __KERNEL__ - -static struct ptlrpc_thread *pinger_thread = NULL; static DECLARE_MUTEX(pinger_sem); static struct list_head pinger_imports = LIST_HEAD_INIT(pinger_imports); +#ifdef __KERNEL__ +static struct ptlrpc_thread *pinger_thread = NULL; + static int ptlrpc_pinger_main(void *arg) { struct ptlrpc_svc_data *data = (struct ptlrpc_svc_data *)arg; @@ -307,30 +307,219 @@ int ptlrpc_pinger_del_import(struct obd_import *imp) RETURN(0); } -#else /* !__KERNEL__ */ +#else +/* XXX + * the current implementation of pinger in liblustre is not optimized + */ + +static struct pinger_data { + int pd_recursion; + unsigned long pd_this_ping; + unsigned long pd_next_ping; + struct ptlrpc_request_set *pd_set; +} pinger_args; + +static int pinger_check_rpcs(void *arg) +{ + unsigned long curtime = time(NULL); + struct ptlrpc_request *req; + struct ptlrpc_request_set *set; + struct list_head *iter; + struct pinger_data *pd = &pinger_args; + int rc; + + /* prevent recursion */ + if (pd->pd_recursion++) { + CDEBUG(D_HA, "pinger: recursion! quit\n"); + LASSERT(pd->pd_set); + pd->pd_recursion--; + return 0; + } + + /* have we reached ping point? */ + if (!pd->pd_set && pd->pd_next_ping > curtime) { + pd->pd_recursion--; + return 0; + } + + /* if we have rpc_set already, continue processing it */ + if (pd->pd_set) { + LASSERT(pd->pd_this_ping); + set = pd->pd_set; + goto do_check_set; + } + + pd->pd_this_ping = curtime; + pd->pd_set = ptlrpc_prep_set(); + set = pd->pd_set; + + /* add rpcs into set */ + down(&pinger_sem); + list_for_each(iter, &pinger_imports) { + struct obd_import *imp = + list_entry(iter, struct obd_import, + imp_pinger_chain); + int generation, level; + unsigned long flags; + + if (imp->imp_next_ping <= pd->pd_this_ping) { + /* Add a ping. 
*/ + spin_lock_irqsave(&imp->imp_lock, flags); + generation = imp->imp_generation; + level = imp->imp_state; + spin_unlock_irqrestore(&imp->imp_lock, flags); + + if (level != LUSTRE_IMP_FULL) { + CDEBUG(D_HA, + "not pinging %s (in recovery)\n", + imp->imp_target_uuid.uuid); + continue; + } + + req = ptlrpc_prep_req(imp, OBD_PING, 0, NULL, + NULL); + if (!req) { + CERROR("out of memory\n"); + break; + } + req->rq_no_resend = 1; + req->rq_replen = lustre_msg_size(0, NULL); + req->rq_send_state = LUSTRE_IMP_FULL; + req->rq_phase = RQ_PHASE_RPC; + req->rq_import_generation = generation; + ptlrpc_set_add_req(set, req); + } else { + CDEBUG(D_HA, "don't need to ping %s (%lu > " + "%lu)\n", imp->imp_target_uuid.uuid, + imp->imp_next_ping, pd->pd_this_ping); + } + } + pd->pd_this_ping = curtime; + up(&pinger_sem); + + /* Might be empty, that's OK. */ + if (set->set_remaining == 0) + CDEBUG(D_HA, "nothing to ping\n"); + + list_for_each(iter, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(iter, struct ptlrpc_request, + rq_set_chain); + DEBUG_REQ(D_HA, req, "pinging %s->%s", + req->rq_import->imp_obd->obd_uuid.uuid, + req->rq_import->imp_target_uuid.uuid); + (void)ptl_send_rpc(req); + } + +do_check_set: + rc = ptlrpc_check_set(set); + + /* not finished, and we are not expired, simply return */ + if (!rc && curtime < pd->pd_this_ping + obd_timeout) { + CDEBUG(D_HA, "not finished, but also not expired\n"); + pd->pd_recursion--; + return 0; + } + + /* Expire all the requests that didn't come back. */ + down(&pinger_sem); + list_for_each(iter, &set->set_requests) { + req = list_entry(iter, struct ptlrpc_request, + rq_set_chain); + + if (req->rq_replied) + continue; + + req->rq_phase = RQ_PHASE_COMPLETE; + set->set_remaining--; + /* If it was disconnected, don't sweat it. 
*/ + if (list_empty(&req->rq_import->imp_pinger_chain)) { + ptlrpc_unregister_reply(req); + continue; + } + + CDEBUG(D_HA, "pinger initiate expire_one_request\n"); + ptlrpc_expire_one_request(req); + } + up(&pinger_sem); + + ptlrpc_set_destroy(set); + pd->pd_set = NULL; + + pd->pd_next_ping = pd->pd_this_ping + obd_timeout; + pd->pd_this_ping = 0; /* XXX for debug */ + + CDEBUG(D_HA, "finished a round ping\n"); + pd->pd_recursion--; + return 0; +} + +static void *pinger_callback = NULL; int ptlrpc_start_pinger(void) { + memset(&pinger_args, 0, sizeof(pinger_args)); +#ifdef ENABLE_PINGER + pinger_callback = + liblustre_register_wait_callback(&pinger_check_rpcs, &pinger_args); +#endif + obd_timeout = 10; return 0; } int ptlrpc_stop_pinger(void) { +#ifdef ENABLE_PINGER + if (pinger_callback) + liblustre_deregister_wait_callback(pinger_callback); +#endif return 0; } -int ptlrpc_pinger_add_import(struct obd_import *imp) +void ptlrpc_pinger_sending_on_import(struct obd_import *imp) { - return 0; + down(&pinger_sem); + imp->imp_next_ping = time(NULL) + obd_timeout; + if (pinger_args.pd_set == NULL && + pinger_args.pd_next_ping > imp->imp_next_ping) { + CDEBUG(D_HA, "set next ping to %ld(cur %ld)\n", + imp->imp_next_ping, time(NULL)); + pinger_args.pd_next_ping = imp->imp_next_ping; + } + up(&pinger_sem); } -int ptlrpc_pinger_del_import(struct obd_import *imp) +int ptlrpc_pinger_add_import(struct obd_import *imp) { - return 0; + ENTRY; + if (!list_empty(&imp->imp_pinger_chain)) + RETURN(-EALREADY); + + CDEBUG(D_HA, "adding pingable import %s->%s\n", + imp->imp_obd->obd_uuid.uuid, imp->imp_target_uuid.uuid); + ptlrpc_pinger_sending_on_import(imp); + + down(&pinger_sem); + list_add_tail(&imp->imp_pinger_chain, &pinger_imports); + class_import_get(imp); + up(&pinger_sem); + + RETURN(0); } -void ptlrpc_pinger_sending_on_import(struct obd_import *imp) +int ptlrpc_pinger_del_import(struct obd_import *imp) { + ENTRY; + if (list_empty(&imp->imp_pinger_chain)) + RETURN(-ENOENT); + + down(&pinger_sem); + list_del_init(&imp->imp_pinger_chain); + CDEBUG(D_HA, "removing pingable import %s->%s\n", + imp->imp_obd->obd_uuid.uuid, imp->imp_target_uuid.uuid); + class_import_put(imp); + up(&pinger_sem); + RETURN(0); } -#endif +#endif /* !__KERNEL__ */ diff --git a/lustre/ptlrpc/ptlrpc_internal.h b/lustre/ptlrpc/ptlrpc_internal.h index 7ec9bbe..d42eb65 100644 --- a/lustre/ptlrpc/ptlrpc_internal.h +++ b/lustre/ptlrpc/ptlrpc_internal.h @@ -99,9 +99,9 @@ static inline int opcode_offset(__u32 opc) { (OBD_LAST_OPC - OBD_FIRST_OPC)) enum { - PTLRPC_REQWAIT_CNTR = 0, - PTLRPC_SVCIDLETIME_CNTR = 1, - //PTLRPC_SVCEQDEPTH_CNTR, + PTLRPC_REQWAIT_CNTR = 0, + PTLRPC_REQQDEPTH_CNTR, + PTLRPC_REQACTIVE_CNTR, PTLRPC_LAST_CNTR }; diff --git a/lustre/ptlrpc/ptlrpc_module.c b/lustre/ptlrpc/ptlrpc_module.c index bfe525c..519b434 100644 --- a/lustre/ptlrpc/ptlrpc_module.c +++ b/lustre/ptlrpc/ptlrpc_module.c @@ -83,16 +83,15 @@ EXPORT_SYMBOL(ptlrpc_init_connection); EXPORT_SYMBOL(ptlrpc_cleanup_connection); /* niobuf.c */ -EXPORT_SYMBOL(ptlrpc_bulk_put); -EXPORT_SYMBOL(ptlrpc_bulk_get); +EXPORT_SYMBOL(ptlrpc_start_bulk_transfer); EXPORT_SYMBOL(ptlrpc_abort_bulk); EXPORT_SYMBOL(ptlrpc_register_bulk); EXPORT_SYMBOL(ptlrpc_unregister_bulk); +EXPORT_SYMBOL(ptlrpc_send_reply); EXPORT_SYMBOL(ptlrpc_reply); EXPORT_SYMBOL(ptlrpc_error); EXPORT_SYMBOL(ptlrpc_resend_req); EXPORT_SYMBOL(ptl_send_rpc); -EXPORT_SYMBOL(ptlrpc_link_svc_me); /* client.c */ EXPORT_SYMBOL(ptlrpc_init_client); @@ -111,7 +110,6 @@ EXPORT_SYMBOL(ptlrpc_prep_bulk_imp); 
EXPORT_SYMBOL(ptlrpc_prep_bulk_exp); EXPORT_SYMBOL(ptlrpc_free_bulk); EXPORT_SYMBOL(ptlrpc_prep_bulk_page); -EXPORT_SYMBOL(ptlrpc_free_bulk_page); EXPORT_SYMBOL(ptlrpc_abort_inflight); EXPORT_SYMBOL(ptlrpc_retain_replayable_request); EXPORT_SYMBOL(ptlrpc_next_xid); @@ -128,6 +126,9 @@ EXPORT_SYMBOL(ptlrpc_interrupted_set); EXPORT_SYMBOL(ptlrpc_mark_interrupted); /* service.c */ +EXPORT_SYMBOL(ptlrpc_save_lock); +EXPORT_SYMBOL(ptlrpc_schedule_difficult_reply); +EXPORT_SYMBOL(ptlrpc_commit_replies); EXPORT_SYMBOL(ptlrpc_init_svc); EXPORT_SYMBOL(ptlrpc_stop_all_threads); EXPORT_SYMBOL(ptlrpc_start_n_threads); @@ -138,6 +139,7 @@ EXPORT_SYMBOL(ptlrpc_unregister_service); EXPORT_SYMBOL(lustre_msg_swabbed); EXPORT_SYMBOL(lustre_pack_request); EXPORT_SYMBOL(lustre_pack_reply); +EXPORT_SYMBOL(lustre_free_reply_state); EXPORT_SYMBOL(lustre_msg_size); EXPORT_SYMBOL(lustre_unpack_msg); EXPORT_SYMBOL(lustre_msg_buf); diff --git a/lustre/ptlrpc/ptlrpcd.c b/lustre/ptlrpc/ptlrpcd.c index 7b56097..4e688a8 100644 --- a/lustre/ptlrpc/ptlrpcd.c +++ b/lustre/ptlrpc/ptlrpcd.c @@ -162,7 +162,9 @@ static int ptlrpcd(void *arg) if (test_bit(LIOD_STOP, &pc->pc_flags)) break; } - /* XXX should be making sure we don't have anything in flight */ + /* wait for inflight requests to drain */ + if (!list_empty(&pc->pc_set->set_requests)) + ptlrpc_set_wait(pc->pc_set); complete(&pc->pc_finishing); return 0; } diff --git a/lustre/ptlrpc/recover.c b/lustre/ptlrpc/recover.c index 6b069a5..76469cb 100644 --- a/lustre/ptlrpc/recover.c +++ b/lustre/ptlrpc/recover.c @@ -113,6 +113,10 @@ void ptlrpc_run_failed_import_upcall(struct obd_import* imp) argv[0], argv[1], argv[2], argv[3], argv[4]); } #else + if (imp->imp_state == LUSTRE_IMP_CLOSED) { + EXIT; + return; + } ptlrpc_recover_import(imp, NULL); #endif } @@ -215,13 +219,8 @@ void ptlrpc_wake_delayed(struct obd_import *imp) list_for_each_safe(tmp, pos, &imp->imp_delayed_list) { req = list_entry(tmp, struct ptlrpc_request, rq_list); - if (req->rq_set) { - DEBUG_REQ(D_HA, req, "waking (set %p):", req->rq_set); - wake_up(&req->rq_set->set_waitq); - } else { - DEBUG_REQ(D_HA, req, "waking:"); - wake_up(&req->rq_reply_waitq); - } + DEBUG_REQ(D_HA, req, "waking (set %p):", req->rq_set); + ptlrpc_wake_client_req(req); } spin_unlock_irqrestore(&imp->imp_lock, flags); } diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index 979355c..e07cae9 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -31,70 +31,184 @@ #include #include "ptlrpc_internal.h" -extern int request_in_callback(ptl_event_t *ev); +static LIST_HEAD (ptlrpc_all_services); +static spinlock_t ptlrpc_all_services_lock = SPIN_LOCK_UNLOCKED; -static int ptlrpc_check_event(struct ptlrpc_service *svc, - struct ptlrpc_thread *thread, ptl_event_t *event) +static void +ptlrpc_free_server_req (struct ptlrpc_request *req) { - struct ptlrpc_srv_ni *srv_ni; - int i, idx, rc; - ENTRY; + /* The last request to be received into a request buffer uses space + * in the request buffer descriptor, otherwise requests are + * allocated dynamically in the incoming request event handler */ + if (req == &req->rq_rqbd->rqbd_req) + return; - spin_lock(&svc->srv_lock); + OBD_FREE(req, sizeof(*req)); +} + +static char * +ptlrpc_alloc_request_buffer (int size) +{ + char *ptr; + + if (size > SVC_BUF_VMALLOC_THRESHOLD) + OBD_VMALLOC(ptr, size); + else + OBD_ALLOC(ptr, size); + + return (ptr); } - if (thread->t_flags & SVC_STOPPING) - GOTO(out, rc = 1); +static void +ptlrpc_free_request_buffer (char *ptr, int size) +{ + if
(size > SVC_BUF_VMALLOC_THRESHOLD) + OBD_VFREE(ptr, size); + else + OBD_FREE(ptr, size); +} - LASSERT ((thread->t_flags & SVC_EVENT) == 0); - LASSERT (ptlrpc_ninterfaces > 0); +struct ptlrpc_request_buffer_desc * +ptlrpc_alloc_rqbd (struct ptlrpc_srv_ni *srv_ni) +{ + struct ptlrpc_service *svc = srv_ni->sni_service; + unsigned long flags; + struct ptlrpc_request_buffer_desc *rqbd; + + OBD_ALLOC(rqbd, sizeof (*rqbd)); + if (rqbd == NULL) + return (NULL); + + rqbd->rqbd_srv_ni = srv_ni; + rqbd->rqbd_refcount = 0; + rqbd->rqbd_cbid.cbid_fn = request_in_callback; + rqbd->rqbd_cbid.cbid_arg = rqbd; + rqbd->rqbd_buffer = ptlrpc_alloc_request_buffer(svc->srv_buf_size); + + if (rqbd->rqbd_buffer == NULL) { + OBD_FREE(rqbd, sizeof (*rqbd)); + return (NULL); + } - for (i = 0; i < ptlrpc_ninterfaces; i++) { - idx = (svc->srv_interface_rover + i) % ptlrpc_ninterfaces; - srv_ni = &svc->srv_interfaces[idx]; + spin_lock_irqsave (&svc->srv_lock, flags); + list_add(&rqbd->rqbd_list, &srv_ni->sni_rqbds); + svc->srv_nbufs++; + spin_unlock_irqrestore (&svc->srv_lock, flags); - LASSERT (!PtlHandleEqual (srv_ni->sni_eq_h, PTL_HANDLE_NONE)); + return (rqbd); +} - rc = PtlEQGet(srv_ni->sni_eq_h, event); - switch (rc) { - case PTL_OK: - /* next time start with the next interface */ - svc->srv_interface_rover = (idx+1) % ptlrpc_ninterfaces; - thread->t_flags |= SVC_EVENT; - GOTO(out, rc = 1); +void +ptlrpc_free_rqbd (struct ptlrpc_request_buffer_desc *rqbd) +{ + struct ptlrpc_srv_ni *sni = rqbd->rqbd_srv_ni; + struct ptlrpc_service *svc = sni->sni_service; + unsigned long flags; + + LASSERT (rqbd->rqbd_refcount == 0); + + spin_lock_irqsave(&svc->srv_lock, flags); + list_del(&rqbd->rqbd_list); + svc->srv_nbufs--; + spin_unlock_irqrestore(&svc->srv_lock, flags); + + ptlrpc_free_request_buffer (rqbd->rqbd_buffer, svc->srv_buf_size); + OBD_FREE (rqbd, sizeof (*rqbd)); +} - case PTL_EQ_EMPTY: - continue; +void +ptlrpc_save_lock (struct ptlrpc_request *req, + struct lustre_handle *lock, int mode) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + int idx; - case PTL_EQ_DROPPED: - CWARN("Event queue overflow (bug 2125): timeouts will " - "follow.\n"); - continue; + LASSERT (rs != NULL); + LASSERT (rs->rs_nlocks < RS_MAX_LOCKS); + + idx = rs->rs_nlocks++; + rs->rs_locks[idx] = *lock; + rs->rs_modes[idx] = mode; + rs->rs_difficult = 1; +} + +void +ptlrpc_schedule_difficult_reply (struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_service *svc = rs->rs_srv_ni->sni_service; + +#ifdef CONFIG_SMP + LASSERT (spin_is_locked (&svc->srv_lock)); +#endif + LASSERT (rs->rs_difficult); + rs->rs_scheduled_ever = 1; /* flag any notification attempt */ + + if (rs->rs_scheduled) /* being set up or already notified */ + return; + + rs->rs_scheduled = 1; + list_del (&rs->rs_list); + list_add (&rs->rs_list, &svc->srv_reply_queue); + wake_up (&svc->srv_waitq); +} + +void +ptlrpc_commit_replies (struct obd_device *obd) +{ + struct list_head *tmp; + struct list_head *nxt; + unsigned long flags; + + /* Find any replies that have been committed and get their service + * to attend to complete them. */ + + /* CAVEAT EMPTOR: spinlock ordering!!! 
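obd_uncommitted_replies_lock nests outside svc->srv_lock: it is taken first here, and ptlrpc_server_handle_reply() drops srv_lock before it touches the uncommitted-replies list.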
*/ + spin_lock_irqsave (&obd->obd_uncommitted_replies_lock, flags); + + list_for_each_safe (tmp, nxt, &obd->obd_uncommitted_replies) { + struct ptlrpc_reply_state *rs = + list_entry (tmp, struct ptlrpc_reply_state, rs_obd_list); + + LASSERT (rs->rs_difficult); - default: - CERROR("BUG: PtlEQGet returned %d\n", rc); - LBUG(); + if (rs->rs_transno <= obd->obd_last_committed) { + struct ptlrpc_service *svc = rs->rs_srv_ni->sni_service; + + spin_lock (&svc->srv_lock); + list_del_init (&rs->rs_obd_list); + ptlrpc_schedule_difficult_reply (rs); + spin_unlock (&svc->srv_lock); } } - rc = 0; - EXIT; - out: - spin_unlock(&svc->srv_lock); - return rc; + + spin_unlock_irqrestore (&obd->obd_uncommitted_replies_lock, flags); +} + +static long +timeval_sub(struct timeval *large, struct timeval *small) +{ + return (large->tv_sec - small->tv_sec) * 1000000 + + (large->tv_usec - small->tv_usec); } -struct ptlrpc_service * ptlrpc_init_svc(__u32 nevents, __u32 nbufs, - __u32 bufsize, __u32 max_req_size, - int req_portal, int rep_portal, - svc_handler_t handler, char *name, - struct proc_dir_entry *proc_entry) +struct ptlrpc_service * +ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size, + int req_portal, int rep_portal, + svc_handler_t handler, char *name, + struct proc_dir_entry *proc_entry) { - int i, j, ssize, rc; - struct ptlrpc_service *service; - struct ptlrpc_srv_ni *srv_ni; + int i; + int j; + int ssize; + struct ptlrpc_service *service; + struct ptlrpc_srv_ni *srv_ni; + struct ptlrpc_request_buffer_desc *rqbd; ENTRY; LASSERT (ptlrpc_ninterfaces > 0); - + LASSERT (nbufs > 0); + LASSERT (bufsize >= max_req_size); + ssize = offsetof (struct ptlrpc_service, srv_interfaces[ptlrpc_ninterfaces]); OBD_ALLOC(service, ssize); @@ -108,11 +222,12 @@ struct ptlrpc_service * ptlrpc_init_svc(__u32 nevents, __u32 nbufs, service->srv_max_req_size = max_req_size; service->srv_buf_size = bufsize; - service->srv_rep_portal = rep_portal; service->srv_req_portal = req_portal; service->srv_handler = handler; - service->srv_interface_rover = 0; + + INIT_LIST_HEAD(&service->srv_request_queue); + INIT_LIST_HEAD(&service->srv_reply_queue); /* First initialise enough for early teardown */ for (i = 0; i < ptlrpc_ninterfaces; i++) { @@ -120,56 +235,31 @@ struct ptlrpc_service * ptlrpc_init_svc(__u32 nevents, __u32 nbufs, srv_ni->sni_service = service; srv_ni->sni_ni = &ptlrpc_interfaces[i]; - srv_ni->sni_eq_h = PTL_HANDLE_NONE; INIT_LIST_HEAD(&srv_ni->sni_rqbds); - srv_ni->sni_nrqbds = 0; - atomic_set(&srv_ni->sni_nrqbds_receiving, 0); + INIT_LIST_HEAD(&srv_ni->sni_active_replies); } - /* Now allocate the event queue and request buffers, assuming all - * interfaces require the same level of buffering. */ + spin_lock (&ptlrpc_all_services_lock); + list_add (&service->srv_list, &ptlrpc_all_services); + spin_unlock (&ptlrpc_all_services_lock); + + /* Now allocate the request buffers, assuming all interfaces require + * the same number. 
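Each interface gets nbufs buffers of srv_buf_size bytes each, posted to the network via ptlrpc_register_rqbd().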
*/ for (i = 0; i < ptlrpc_ninterfaces; i++) { srv_ni = &service->srv_interfaces[i]; CDEBUG (D_NET, "%s: initialising interface %s\n", name, srv_ni->sni_ni->pni_name); - rc = PtlEQAlloc(srv_ni->sni_ni->pni_ni_h, nevents, - request_in_callback, &(srv_ni->sni_eq_h)); - if (rc != PTL_OK) { - CERROR("%s.%d: PtlEQAlloc on %s failed: %d\n", - name, i, srv_ni->sni_ni->pni_name, rc); - GOTO (failed, NULL); - } - for (j = 0; j < nbufs; j++) { - struct ptlrpc_request_buffer_desc *rqbd; - - OBD_ALLOC_WAIT(rqbd, sizeof(*rqbd)); + rqbd = ptlrpc_alloc_rqbd (srv_ni); + if (rqbd == NULL) { - CERROR ("%s.%d: Can't allocate request " - "descriptor %d on %s\n", - name, i, srv_ni->sni_nrqbds, + CERROR ("%s.%d: Can't allocate request %d " + "on %s\n", name, i, j, srv_ni->sni_ni->pni_name); GOTO(failed, NULL); } - - rqbd->rqbd_srv_ni = srv_ni; - rqbd->rqbd_me_h = PTL_HANDLE_NONE; - atomic_set(&rqbd->rqbd_refcount, 0); - - OBD_ALLOC_WAIT(rqbd->rqbd_buffer, service->srv_buf_size); - if (rqbd->rqbd_buffer == NULL) { - CERROR ("%s.%d: Can't allocate request " - "buffer %d on %s\n", - name, i, srv_ni->sni_nrqbds, - srv_ni->sni_ni->pni_name); - OBD_FREE(rqbd, sizeof(*rqbd)); - GOTO(failed, NULL); - } - list_add(&rqbd->rqbd_list, &srv_ni->sni_rqbds); - srv_ni->sni_nrqbds++; - - ptlrpc_link_svc_me(rqbd); + ptlrpc_register_rqbd (rqbd); } } @@ -185,30 +275,47 @@ failed: return NULL; } -static int handle_incoming_request(struct obd_device *obddev, - struct ptlrpc_service *svc, - ptl_event_t *event, - struct ptlrpc_request *request) +static int +ptlrpc_server_handle_request (struct ptlrpc_service *svc) { - struct ptlrpc_request_buffer_desc *rqbd = event->mem_desc.user_ptr; - int rc; - - /* FIXME: If we move to an event-driven model, we should put the request - * on the stack of mds_handle instead. 
*/ + struct ptlrpc_request *request; + unsigned long flags; + struct timeval work_start; + struct timeval work_end; + long timediff; + int refcount; + int rc; + ENTRY; - LASSERT (atomic_read (&rqbd->rqbd_refcount) > 0); - LASSERT ((event->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0); - LASSERT (rqbd->rqbd_srv_ni->sni_service == svc); - LASSERT (rqbd->rqbd_buffer == event->mem_desc.start); - LASSERT (event->offset + event->mlength <= svc->srv_buf_size); + spin_lock_irqsave (&svc->srv_lock, flags); + if (list_empty (&svc->srv_request_queue) || + (svc->srv_n_difficult_replies != 0 && + svc->srv_n_active_reqs >= (svc->srv_nthreads - 1))) { + /* If all the other threads are handling requests, I must + * remain free to handle any 'difficult' reply that might + * block them */ + spin_unlock_irqrestore (&svc->srv_lock, flags); + RETURN(0); + } - memset(request, 0, sizeof(*request)); - spin_lock_init (&request->rq_lock); - INIT_LIST_HEAD(&request->rq_list); - request->rq_svc = svc; - request->rq_xid = event->match_bits; - request->rq_reqmsg = event->mem_desc.start + event->offset; - request->rq_reqlen = event->mlength; + request = list_entry (svc->srv_request_queue.next, + struct ptlrpc_request, rq_list); + list_del_init (&request->rq_list); + svc->srv_n_queued_reqs--; + svc->srv_n_active_reqs++; + + spin_unlock_irqrestore (&svc->srv_lock, flags); + + do_gettimeofday(&work_start); + timediff = timeval_sub(&work_start, &request->rq_arrival_time); + if (svc->srv_stats != NULL) { + lprocfs_counter_add(svc->srv_stats, PTLRPC_REQWAIT_CNTR, + timediff); + lprocfs_counter_add(svc->srv_stats, PTLRPC_REQQDEPTH_CNTR, + svc->srv_n_queued_reqs); + lprocfs_counter_add(svc->srv_stats, PTLRPC_REQACTIVE_CNTR, + svc->srv_n_active_reqs); + } #if SWAB_PARANOIA /* Clear request swab mask; this is a new request */ @@ -218,26 +325,34 @@ static int handle_incoming_request(struct obd_device *obddev, if (rc != 0) { CERROR ("error unpacking request: ptl %d from "LPX64 " xid "LPU64"\n", svc->srv_req_portal, - event->initiator.nid, request->rq_xid); + request->rq_peer.peer_nid, request->rq_xid); goto out; } + rc = -EINVAL; if (request->rq_reqmsg->type != PTL_RPC_MSG_REQUEST) { - CERROR("wrong packet type received (type=%u)\n", - request->rq_reqmsg->type); + CERROR("wrong packet type received (type=%u) from " + LPX64"\n", request->rq_reqmsg->type, + request->rq_peer.peer_nid); goto out; } - CDEBUG(D_NET, "got req "LPD64" (md: %p + %d)\n", request->rq_xid, - event->mem_desc.start, event->offset); + CDEBUG(D_NET, "got req "LPD64"\n", request->rq_xid); - request->rq_peer.peer_nid = event->initiator.nid; - request->rq_peer.peer_ni = rqbd->rqbd_srv_ni->sni_ni; + /* Discard requests queued for longer than my timeout. 
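timediff, computed above from rq_arrival_time, is in microseconds, hence the 1000000 divisor below.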
If the + * client's timeout is similar to mine, she'll be timing out this + * REQ anyway (bug 1502) */ + if (timediff / 1000000 > (long)obd_timeout) { + CERROR("Dropping timed-out request from "LPX64 + ": %ld seconds old\n", + request->rq_peer.peer_nid, timediff / 1000000); + goto out; + } request->rq_export = class_conn2export(&request->rq_reqmsg->handle); if (request->rq_export) { - if (request->rq_reqmsg->conn_cnt < + if (request->rq_reqmsg->conn_cnt < request->rq_export->exp_conn_cnt) { DEBUG_REQ(D_ERROR, request, "DROPPING req from old connection %d < %d", @@ -248,7 +363,7 @@ static int handle_incoming_request(struct obd_device *obddev, request->rq_export->exp_last_request_time = LTIME_S(CURRENT_TIME); - } + } CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:ni:nid:opc " "%s:%s+%d:%d:"LPU64":%s:"LPX64":%d\n", current->comm, @@ -257,7 +372,8 @@ static int handle_incoming_request(struct obd_device *obddev, (request->rq_export ? atomic_read(&request->rq_export->exp_refcount) : -99), request->rq_reqmsg->status, request->rq_xid, - rqbd->rqbd_srv_ni->sni_ni->pni_name, event->initiator.nid, + request->rq_peer.peer_ni->pni_name, + request->rq_peer.peer_nid, request->rq_reqmsg->opc); rc = svc->srv_handler(request); @@ -268,7 +384,8 @@ static int handle_incoming_request(struct obd_device *obddev, (request->rq_export ? atomic_read(&request->rq_export->exp_refcount) : -99), request->rq_reqmsg->status, request->rq_xid, - rqbd->rqbd_srv_ni->sni_ni->pni_name, event->initiator.nid, + request->rq_peer.peer_ni->pni_name, + request->rq_peer.peer_nid, request->rq_reqmsg->opc); put_conn: @@ -276,12 +393,175 @@ put_conn: class_export_put(request->rq_export); out: - if (atomic_dec_and_test (&rqbd->rqbd_refcount)) /* last reference? */ - ptlrpc_link_svc_me (rqbd); + do_gettimeofday(&work_end); + + timediff = timeval_sub(&work_end, &work_start); + + CDEBUG((timediff / 1000000 > (long)obd_timeout) ? D_ERROR : D_HA, + "request "LPU64" opc %u from NID "LPX64" processed in %ldus " + "(%ldus total)\n", request->rq_xid, request->rq_reqmsg->opc, + request->rq_peer.peer_nid, + timediff, timeval_sub(&work_end, &request->rq_arrival_time)); + + if (svc->srv_stats != NULL) { + int opc = opcode_offset(request->rq_reqmsg->opc); + if (opc > 0) { + LASSERT(opc < LUSTRE_MAX_OPCODES); + lprocfs_counter_add(svc->srv_stats, + opc + PTLRPC_LAST_CNTR, + timediff); + } + } + + spin_lock_irqsave(&svc->srv_lock, flags); + svc->srv_n_active_reqs--; + refcount = --(request->rq_rqbd->rqbd_refcount); + spin_unlock_irqrestore(&svc->srv_lock, flags); + + if (refcount == 0) { + /* rqbd now idle: repost */ + ptlrpc_register_rqbd(request->rq_rqbd); + } + + ptlrpc_free_server_req(request); + + RETURN(1); +} + +static int +ptlrpc_server_handle_reply (struct ptlrpc_service *svc) +{ + struct ptlrpc_reply_state *rs; + unsigned long flags; + struct obd_export *exp; + struct obd_device *obd; + int nlocks; + int been_handled; + ENTRY; + + spin_lock_irqsave (&svc->srv_lock, flags); + if (list_empty (&svc->srv_reply_queue)) { + spin_unlock_irqrestore (&svc->srv_lock, flags); + RETURN(0); + } + + rs = list_entry (svc->srv_reply_queue.next, + struct ptlrpc_reply_state, rs_list); + + exp = rs->rs_export; + obd = exp->exp_obd; + + LASSERT (rs->rs_difficult); + LASSERT (rs->rs_scheduled); + + list_del_init (&rs->rs_list); + + /* Disengage from notifiers carefully (lock ordering!) 
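srv_lock is dropped first; the obd and export locks below must never be acquired while it is held.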
*/ + spin_unlock(&svc->srv_lock); + + spin_lock (&obd->obd_uncommitted_replies_lock); + /* Noop if removed already */ + list_del_init (&rs->rs_obd_list); + spin_unlock (&obd->obd_uncommitted_replies_lock); + + spin_lock (&exp->exp_lock); + /* Noop if removed already */ + list_del_init (&rs->rs_exp_list); + spin_unlock (&exp->exp_lock); + + spin_lock(&svc->srv_lock); + + been_handled = rs->rs_handled; + rs->rs_handled = 1; + + nlocks = rs->rs_nlocks; /* atomic "steal", but */ + rs->rs_nlocks = 0; /* locks still on rs_locks! */ + + if (nlocks == 0 && !been_handled) { + /* If we see this, we should already have seen the warning + * in mds_steal_ack_locks() */ + CWARN("All locks stolen from rs %p x"LPD64".t"LPD64 + " o%d NID"LPX64"\n", + rs, + rs->rs_xid, rs->rs_transno, + rs->rs_msg.opc, exp->exp_connection->c_peer.peer_nid); + } + + if ((!been_handled && rs->rs_on_net) || + nlocks > 0) { + spin_unlock_irqrestore(&svc->srv_lock, flags); + + if (!been_handled && rs->rs_on_net) { + PtlMDUnlink(rs->rs_md_h); + /* Ignore return code; we're racing with + * completion... */ + } + + while (nlocks-- > 0) + ldlm_lock_decref(&rs->rs_locks[nlocks], + rs->rs_modes[nlocks]); + + spin_lock_irqsave(&svc->srv_lock, flags); + } + + rs->rs_scheduled = 0; + + if (!rs->rs_on_net) { + /* Off the net */ + svc->srv_n_difficult_replies--; + spin_unlock_irqrestore(&svc->srv_lock, flags); + + class_export_put (exp); + rs->rs_export = NULL; + lustre_free_reply_state (rs); + atomic_dec (&svc->srv_outstanding_replies); + RETURN(1); + } + + /* still on the net; callback will schedule */ + spin_unlock_irqrestore (&svc->srv_lock, flags); + RETURN(1); +} + +#ifndef __KERNEL__ +/* FIXME make use of timeout later */ +int +liblustre_check_services (void *arg) +{ + int did_something = 0; + struct list_head *tmp, *nxt; + ENTRY; + + /* I'm relying on being single threaded, not to have to lock + * ptlrpc_all_services etc */ + list_for_each_safe (tmp, nxt, &ptlrpc_all_services) { + struct ptlrpc_service *svc = + list_entry (tmp, struct ptlrpc_service, srv_list); + + if (svc->srv_nthreads != 0) /* I've recursed */ + continue; + + /* service threads can block for bulk, so this limits us + * (arbitrarily) to recursing 1 stack frame per service. + * Note that the problem with recursion is that we have to + * unwind completely before our caller can resume. 
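Bumping srv_nthreads below is what makes that re-entry visible to the srv_nthreads != 0 check above.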
*/ + + svc->srv_nthreads++; + + while (ptlrpc_server_handle_reply (svc)) + did_something++; + + while (ptlrpc_server_handle_request (svc)) + did_something++; + + svc->srv_nthreads--; + } - return rc; + RETURN(did_something); } +#else /* __KERNEL__ */ + /* Don't use daemonize, it removes fs struct from new thread (bug 418) */ void ptlrpc_daemonize(void) { @@ -291,24 +571,12 @@ void ptlrpc_daemonize(void) reparent_to_init(); } -static long timeval_sub(struct timeval *large, struct timeval *small) -{ - return (large->tv_sec - small->tv_sec) * 1000000 + - (large->tv_usec - small->tv_usec); -} - static int ptlrpc_main(void *arg) { - struct ptlrpc_svc_data *data = arg; - struct obd_device *obddev = data->dev; - struct ptlrpc_service *svc = data->svc; - struct ptlrpc_thread *thread = data->thread; - struct ptlrpc_request *request; - ptl_event_t *event; - unsigned long flags; - struct timeval start_time, finish_time; - long total; - int rc = 0; + struct ptlrpc_svc_data *data = (struct ptlrpc_svc_data *)arg; + struct ptlrpc_service *svc = data->svc; + struct ptlrpc_thread *thread = data->thread; + unsigned long flags; ENTRY; lock_kernel(); @@ -322,134 +590,91 @@ static int ptlrpc_main(void *arg) THREAD_NAME(current->comm, "%s", data->name); unlock_kernel(); - OBD_ALLOC(event, sizeof(*event)); - if (event == NULL) - GOTO(out, rc = -ENOMEM); - OBD_ALLOC(request, sizeof(*request)); - if (request == NULL) - GOTO(out_event, rc = -ENOMEM); - /* Record that the thread is running */ thread->t_flags = SVC_RUNNING; wake_up(&thread->t_ctl_waitq); + spin_lock_irqsave(&svc->srv_lock, flags); + svc->srv_nthreads++; + spin_unlock_irqrestore(&svc->srv_lock, flags); + /* XXX maintain a list of all managed devices: insert here */ - do_gettimeofday(&finish_time); - /* And now, loop forever on requests */ - while (1) { + while ((thread->t_flags & SVC_STOPPING) == 0 || + svc->srv_n_difficult_replies != 0) { + /* Don't exit while there are replies to be handled */ struct l_wait_info lwi = { 0 }; - l_wait_event_exclusive(svc->srv_waitq, - ptlrpc_check_event(svc, thread, event), - &lwi); - - spin_lock(&svc->srv_lock); - if (thread->t_flags & SVC_STOPPING) { - thread->t_flags &= ~SVC_STOPPING; - spin_unlock(&svc->srv_lock); - - EXIT; - break; - } - - if (!(thread->t_flags & SVC_EVENT)) { - CERROR("unknown flag in service"); - spin_unlock(&svc->srv_lock); - LBUG(); - EXIT; - break; - } - - thread->t_flags &= ~SVC_EVENT; - spin_unlock(&svc->srv_lock); - - do_gettimeofday(&start_time); - total = timeval_sub(&start_time, &event->arrival_time); - if (svc->srv_stats != NULL) { - lprocfs_counter_add(svc->srv_stats, PTLRPC_REQWAIT_CNTR, - total); - lprocfs_counter_add(svc->srv_stats, - PTLRPC_SVCIDLETIME_CNTR, - timeval_sub(&start_time, - &finish_time)); -#if 0 /* Wait for b_eq branch */ - lprocfs_counter_add(svc->srv_stats, - PTLRPC_SVCEQDEPTH_CNTR, 0); -#endif - } - if (total / 1000000 > (long)obd_timeout) { - CERROR("Dropping request from NID "LPX64" because it's " - "%ld seconds old.\n", event->initiator.nid, - total / 1000000); /* bug 1502 */ - } else { - CDEBUG(D_HA, "request from NID "LPX64" noticed after " - "%ldus\n", event->initiator.nid, total); - rc = handle_incoming_request(obddev, svc, event, - request); - } - do_gettimeofday(&finish_time); - total = timeval_sub(&finish_time, &start_time); - - CDEBUG((total / 1000000 > (long)obd_timeout) ? 
D_ERROR : D_HA, - "request "LPU64" from NID "LPX64" processed in %ldus " - "(%ldus total)\n", request->rq_xid, event->initiator.nid, - total, timeval_sub(&finish_time, &event->arrival_time)); - - if (svc->srv_stats != NULL) { - int opc = opcode_offset(request->rq_reqmsg->opc); - if (opc > 0) { - LASSERT(opc < LUSTRE_MAX_OPCODES); - lprocfs_counter_add(svc->srv_stats, - opc + PTLRPC_LAST_CNTR, - total); - } - } + l_wait_event_exclusive (svc->srv_waitq, + (thread->t_flags & SVC_STOPPING) != 0 || + !list_empty (&svc->srv_reply_queue) || + (!list_empty (&svc->srv_request_queue) && + (svc->srv_n_difficult_replies == 0 || + svc->srv_n_active_reqs < + (svc->srv_nthreads - 1))), + &lwi); + + if (!list_empty (&svc->srv_reply_queue)) + ptlrpc_server_handle_reply (svc); + + /* only handle requests if there are no difficult replies + * outstanding, or I'm not the last thread handling + * requests */ + if (!list_empty (&svc->srv_request_queue) && + (svc->srv_n_difficult_replies == 0 || + svc->srv_n_active_reqs < (svc->srv_nthreads - 1))) + ptlrpc_server_handle_request (svc); } - /* NB should wait for all SENT callbacks to complete before exiting - * here. Unfortunately at this time there is no way to track this - * state. */ - OBD_FREE(request, sizeof(*request)); -out_event: - OBD_FREE(event, sizeof(*event)); -out: + spin_lock_irqsave(&svc->srv_lock, flags); + + svc->srv_nthreads--; /* must know immediately */ thread->t_flags = SVC_STOPPED; wake_up(&thread->t_ctl_waitq); - CDEBUG(D_NET, "service thread exiting, process %d: rc = %d\n", - current->pid, rc); - return rc; + spin_unlock_irqrestore(&svc->srv_lock, flags); + + CDEBUG(D_NET, "service thread exiting, process %d\n", current->pid); + return 0; } static void ptlrpc_stop_thread(struct ptlrpc_service *svc, struct ptlrpc_thread *thread) { struct l_wait_info lwi = { 0 }; + unsigned long flags; - spin_lock(&svc->srv_lock); + spin_lock_irqsave(&svc->srv_lock, flags); thread->t_flags = SVC_STOPPING; - spin_unlock(&svc->srv_lock); + spin_unlock_irqrestore(&svc->srv_lock, flags); wake_up_all(&svc->srv_waitq); l_wait_event(thread->t_ctl_waitq, (thread->t_flags & SVC_STOPPED), &lwi); + + spin_lock_irqsave(&svc->srv_lock, flags); + list_del(&thread->t_link); + spin_unlock_irqrestore(&svc->srv_lock, flags); + + OBD_FREE(thread, sizeof(*thread)); } void ptlrpc_stop_all_threads(struct ptlrpc_service *svc) { - spin_lock(&svc->srv_lock); + unsigned long flags; + struct ptlrpc_thread *thread; + + spin_lock_irqsave(&svc->srv_lock, flags); while (!list_empty(&svc->srv_threads)) { - struct ptlrpc_thread *thread; - thread = list_entry(svc->srv_threads.next, struct ptlrpc_thread, - t_link); - spin_unlock(&svc->srv_lock); + thread = list_entry(svc->srv_threads.next, + struct ptlrpc_thread, t_link); + + spin_unlock_irqrestore(&svc->srv_lock, flags); ptlrpc_stop_thread(svc, thread); - spin_lock(&svc->srv_lock); - list_del(&thread->t_link); - OBD_FREE(thread, sizeof(*thread)); + spin_lock_irqsave(&svc->srv_lock, flags); } - spin_unlock(&svc->srv_lock); + + spin_unlock_irqrestore(&svc->srv_lock, flags); } int ptlrpc_start_n_threads(struct obd_device *dev, struct ptlrpc_service *svc, @@ -477,6 +702,7 @@ int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc, struct l_wait_info lwi = { 0 }; struct ptlrpc_svc_data d; struct ptlrpc_thread *thread; + unsigned long flags; int rc; ENTRY; @@ -484,15 +710,15 @@ int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc, if (thread == NULL) RETURN(-ENOMEM); init_waitqueue_head(&thread->t_ctl_waitq); 
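/* Editor's note -- illustrative sketch, not part of this patch.  The
 * wake-up condition in ptlrpc_main() above encodes a reservation rule:
 * while 'difficult' replies are outstanding, the last service thread
 * stays free of request handling so reply processing can never be
 * starved.  Restated as a hypothetical helper, assuming only the
 * ptlrpc_service fields this patch already uses: */
#include <linux/lustre_net.h>

static int ptlrpc_thread_may_handle_request(struct ptlrpc_service *svc)
{
        if (list_empty(&svc->srv_request_queue))
                return 0;                       /* nothing queued */

        if (svc->srv_n_difficult_replies == 0)
                return 1;                       /* no reserve needed */

        /* keep at least one thread free to drain srv_reply_queue */
        return svc->srv_n_active_reqs < (svc->srv_nthreads - 1);
}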
- + d.dev = dev; d.svc = svc; d.name = name; d.thread = thread; - spin_lock(&svc->srv_lock); + spin_lock_irqsave(&svc->srv_lock, flags); list_add(&thread->t_link, &svc->srv_threads); - spin_unlock(&svc->srv_lock); + spin_unlock_irqrestore(&svc->srv_lock, flags); /* CLONE_VM and CLONE_FILES just avoid a needless copy, because we * just drop the VM and FILES in ptlrpc_daemonize() right away. @@ -507,65 +733,126 @@ int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc, RETURN(0); } +#endif int ptlrpc_unregister_service(struct ptlrpc_service *service) { - int i, rc; + int i; + int rc; + unsigned long flags; struct ptlrpc_srv_ni *srv_ni; + struct l_wait_info lwi; + struct list_head *tmp; - LASSERT (list_empty (&service->srv_threads)); + LASSERT(list_empty(&service->srv_threads)); - /* XXX We could reply (with failure) to all buffered requests - * _after_ unlinking _all_ the request buffers, but _before_ - * freeing them. - */ + spin_lock (&ptlrpc_all_services_lock); + list_del_init (&service->srv_list); + spin_unlock (&ptlrpc_all_services_lock); + + for (i = 0; i < ptlrpc_ninterfaces; i++) { + srv_ni = &service->srv_interfaces[i]; + CDEBUG(D_NET, "%s: tearing down interface %s\n", + service->srv_name, srv_ni->sni_ni->pni_name); + + /* Unlink all the request buffers. This forces a 'final' + * event with its 'unlink' flag set for each rqbd */ + list_for_each(tmp, &srv_ni->sni_rqbds) { + struct ptlrpc_request_buffer_desc *rqbd = + list_entry(tmp, struct ptlrpc_request_buffer_desc, + rqbd_list); + + rc = PtlMDUnlink(rqbd->rqbd_md_h); + LASSERT (rc == PTL_OK || rc == PTL_INV_MD); + } + + /* Wait for the network to release any buffers it's + * currently filling */ + for (;;) { + spin_lock_irqsave(&service->srv_lock, flags); + rc = srv_ni->sni_nrqbd_receiving; + spin_unlock_irqrestore(&service->srv_lock, flags); + + if (rc == 0) + break; + + /* Network access will complete in finite time but + * the HUGE timeout lets us CWARN for visibility of + * sluggish NALs */ + lwi = LWI_TIMEOUT(300 * HZ, NULL, NULL); + rc = l_wait_event(service->srv_waitq, + srv_ni->sni_nrqbd_receiving == 0, + &lwi); + if (rc == -ETIMEDOUT) + CWARN("Waiting for request buffers on " + "service %s on interface %s\n", + service->srv_name, srv_ni->sni_ni->pni_name); + } + + /* schedule all outstanding replies to terminate them */ + spin_lock_irqsave(&service->srv_lock, flags); + while (!list_empty(&srv_ni->sni_active_replies)) { + struct ptlrpc_reply_state *rs = + list_entry(srv_ni->sni_active_replies.next, + struct ptlrpc_reply_state, + rs_list); + ptlrpc_schedule_difficult_reply(rs); + } + spin_unlock_irqrestore(&service->srv_lock, flags); + } + + /* purge the request queue. NB No new replies (rqbds all unlinked) + * and no service threads, so I'm the only thread noodling the + * request queue now */ + while (!list_empty(&service->srv_request_queue)) { + struct ptlrpc_request *req = + list_entry(service->srv_request_queue.next, + struct ptlrpc_request, + rq_list); + + list_del(&req->rq_list); + service->srv_n_queued_reqs--; + req->rq_rqbd->rqbd_refcount--; + + ptlrpc_free_server_req(req); + } + LASSERT(service->srv_n_queued_reqs == 0); + /* Now free all the request buffers since nothing references them + * any more...
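(the queue purge above dropped each rqbd_refcount reference it held, and the earlier unlink/wait ensured the network is done with them).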
*/ for (i = 0; i < ptlrpc_ninterfaces; i++) { srv_ni = &service->srv_interfaces[i]; - CDEBUG (D_NET, "%s: tearing down interface %s\n", - service->srv_name, srv_ni->sni_ni->pni_name); - while (!list_empty (&srv_ni->sni_rqbds)) { + while (!list_empty(&srv_ni->sni_rqbds)) { struct ptlrpc_request_buffer_desc *rqbd = - list_entry (srv_ni->sni_rqbds.next, - struct ptlrpc_request_buffer_desc, - rqbd_list); - - list_del (&rqbd->rqbd_list); - - LASSERT (atomic_read (&rqbd->rqbd_refcount) > 0); - /* refcount could be anything; it's possible for - * the buffers to continued to get filled after all - * the server threads exited. But we know they - * _have_ exited. - */ - - (void) PtlMEUnlink(rqbd->rqbd_me_h); - /* The callback handler could have unlinked this ME - * already (we're racing with her) but it's safe to - * ensure it _has_ been unlinked. - */ - - OBD_FREE (rqbd->rqbd_buffer, service->srv_buf_size); - OBD_FREE (rqbd, sizeof (*rqbd)); - srv_ni->sni_nrqbds--; + list_entry(srv_ni->sni_rqbds.next, + struct ptlrpc_request_buffer_desc, + rqbd_list); + + ptlrpc_free_rqbd(rqbd); } + } - LASSERT (srv_ni->sni_nrqbds == 0); + /* wait for all outstanding replies to complete (they were + * scheduled having been flagged to abort above) */ + while (atomic_read(&service->srv_outstanding_replies) != 0) { + struct l_wait_info lwi = LWI_TIMEOUT(10 * HZ, NULL, NULL); - if (!PtlHandleEqual (srv_ni->sni_eq_h, PTL_HANDLE_NONE)) { - rc = PtlEQFree(srv_ni->sni_eq_h); - if (rc) - CERROR("%s.%d: PtlEQFree failed on %s: %d\n", - service->srv_name, i, - srv_ni->sni_ni->pni_name, rc); + rc = l_wait_event(service->srv_waitq, + !list_empty(&service->srv_reply_queue), &lwi); + LASSERT(rc == 0 || rc == -ETIMEDOUT); + + if (rc == 0) { + ptlrpc_server_handle_reply(service); + continue; } + CWARN("Unexpectedly long timeout %p\n", service); } ptlrpc_lprocfs_unregister_service(service); OBD_FREE(service, - offsetof (struct ptlrpc_service, - srv_interfaces[ptlrpc_ninterfaces])); + offsetof(struct ptlrpc_service, + srv_interfaces[ptlrpc_ninterfaces])); return 0; } diff --git a/lustre/scripts/lustre.spec.in b/lustre/scripts/lustre.spec.in index b7f2b83..9d02148 100644 --- a/lustre/scripts/lustre.spec.in +++ b/lustre/scripts/lustre.spec.in @@ -1,5 +1,5 @@ # lustre.spec -%define version HEAD +%define version b_eq %define kversion @LINUXRELEASE@ %define linuxdir @LINUX@ %define enable_doc @ENABLE_DOC@ @@ -143,9 +143,7 @@ mkdir -p $RPM_BUILD_ROOT/var/lib/ldap/lustre %attr(-, root, root) /usr/lib/lustre/examples/llechocleanup.sh %attr(-, root, root) /etc/init.d/lustre -%attr(-, root, root) /lib/libportals.a %attr(-, root, root) /lib/libptlctl.a -%attr(-, root, root) /lib/libtcpnal.a %attr(-, root, root) /lib/liblustreapi.a %attr(-, root, root) /usr/include/lustre/*.h diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index 34ba231..9380a2d 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -183,9 +183,10 @@ test_5() { # if all the modules have unloaded. umount $MOUNT & UMOUNT_PID=$! 
- sleep $TIMEOUT + sleep 2 echo "killing umount" kill -TERM $UMOUNT_PID + echo "waiting for umount to finish" wait $UMOUNT_PID # cleanup client modules @@ -200,6 +201,48 @@ test_5() { } run_test 5 "force cleanup mds, then cleanup" +test_5b() { + start_ost + start_mds + stop_mds + + [ -d $MOUNT ] || mkdir -p $MOUNT + $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null + llmount $mds_HOST://mds_svc/client_facet $MOUNT && exit 1 + + # cleanup client modules + $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null + + # stop_mds is a no-op here, and should not fail + stop_mds || return 2 + stop_ost || return 3 + + lsmod | grep -q portals && return 4 + return 0 + +} +run_test 5b "mds down, cleanup after failed mount (bug 2712)" + +test_5c() { + start_ost + start_mds + + [ -d $MOUNT ] || mkdir -p $MOUNT + $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null + llmount $mds_HOST://wrong_mds_svc/client_facet $MOUNT && exit 1 + + # cleanup client modules + $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null + + stop_mds || return 2 + stop_ost || return 3 + + lsmod | grep -q portals && return 4 + return 0 + +} +run_test 5c "cleanup after failed mount (bug 2712)" + test_6() { setup manual_umount_client diff --git a/lustre/tests/replay-ost-single.sh b/lustre/tests/replay-ost-single.sh index 1aabb7d..0861045 100755 --- a/lustre/tests/replay-ost-single.sh +++ b/lustre/tests/replay-ost-single.sh @@ -80,7 +80,7 @@ test_2() { done fail ost for i in `seq 10`; do - grep -q "tag-$i" $DIR/$tfile-$i || error "f1c-$i" + grep -q "tag-$i" $DIR/$tfile-$i || error "f2-$i" done } run_test 2 "|x| 10 open(O_CREAT)s" diff --git a/lustre/utils/Makefile.am b/lustre/utils/Makefile.am index 6e3aad9..0a3f785 100644 --- a/lustre/utils/Makefile.am +++ b/lustre/utils/Makefile.am @@ -3,8 +3,19 @@ DEFS= SUBDIRS = Lustre CFLAGS:=-g -O2 -I$(top_srcdir)/utils -I$(top_srcdir)/portals/include -I$(srcdir)/../include -Wall -L../portals/utils -KFLAGS:= CPPFLAGS = $(HAVE_LIBREADLINE) + +if LIBLUSTRE + +bin_SCRIPTS = lrun + +EXTRA_DIST = $(bin_SCRIPTS) + +include $(top_srcdir)/Rules + +else + +KFLAGS:= lctl_LDADD := $(LIBREADLINE) -lptlctl lfs_LDADD := $(LIBREADLINE) parser.o liblustreapi.a -lptlctl obd.o lload_LDADD := -lptlctl @@ -37,3 +48,5 @@ newwiretest: wirehdr.c wirecheck mount.lustre$(EXEEXT): llmount cp llmount mount.lustre + +endif diff --git a/lustre/utils/lrun b/lustre/utils/lrun index 193a062..56d3d04 100755 --- a/lustre/utils/lrun +++ b/lustre/utils/lrun @@ -1,7 +1,7 @@ #!/bin/sh LIBLUSTRE_MOUNT_POINT=${LIBLUSTRE_MOUNT_POINT:-"/mnt/lustre"} -LIBLUSTRE_MOUNT_TARGET=${LIBLUSTRE_MOUNT_TARGET:-""} +LIBLUSTRE_MOUNT_TARGET=${LIBLUSTRE_MOUNT_TARGET:-"TARGET_NOT_SET"} LIBLUSTRE_DUMPFILE=${LIBLUSTRE_DUMPFILE:-"/tmp/DUMP_FILE"} LD_PRELOAD=${LD_PRELOAD:-"/usr/lib/liblustre.so"} -- 1.8.3.1
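Editor's note, not part of the patch: the new timeval_sub() helper in service.c returns elapsed microseconds in a long, which wraps past roughly 2147 seconds (about 35 minutes) where long is 32 bits. That is ample for the request-latency accounting above, which only compares against obd_timeout, but a caller measuring longer spans would want wider arithmetic. A minimal sketch of an overflow-safe variant, with a hypothetical name:

    #include <sys/time.h>

    /* Same semantics as the patch's timeval_sub(), but the arithmetic is
     * done in 64 bits so intervals beyond ~35 minutes cannot wrap. */
    static long long timeval_sub64(struct timeval *large, struct timeval *small)
    {
            return (long long)(large->tv_sec - small->tv_sec) * 1000000 +
                   (large->tv_usec - small->tv_usec);
    }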