CFLAGS="$KCFLAGS"
CPPFLAGS="$KINCFLAGS $KCPPFLAGS $MFLAGS $enable_zerocopy $enable_affinity $with_quadrics $with_gm $with_scamac $with_ib"
+AM_CONDITIONAL(LIBLUSTRE, test x$host_cpu = xlib)
AC_SUBST(MOD_LINK)
AC_SUBST(LINUX25)
AM_CONDITIONAL(LIBLUSTRE, test x$host_cpu = xlib)
PTL_IOV_TOO_SMALL = 31,
PTL_EQ_INUSE = 32,
- PTL_MD_INUSE = 33,
- PTL_MAX_ERRNO = 33
+ PTL_MAX_ERRNO = 32
} ptl_err_t;
/* If you change these, you must update the string table in api-errno.c */
#include <portals/types.h>
#include <linux/kp30.h>
#include <portals/p30.h>
-#include <portals/errno.h>
#include <portals/lib-types.h>
#include <portals/lib-nal.h>
#include <portals/lib-dispatch.h>
nal->cb_sti(nal, flagsp); \
}
-#ifdef PTL_USE_DESC_LISTS
+#ifdef PTL_USE_LIB_FREELIST
#define MAX_MES 2048
#define MAX_MDS 2048
}
static inline lib_md_t *
-lib_md_alloc (nal_cb_t *nal)
+lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd)
{
/* NEVER called with statelock held */
unsigned long flags;
static inline lib_msg_t *
lib_msg_alloc (nal_cb_t *nal)
{
- /* ALWAYS called with statelock held */
- return ((lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs));
+ /* NEVER called with statelock held */
+ unsigned long flags;
+ lib_msg_t *msg;
+
+ state_lock (nal, &flags);
+ msg = (lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs);
+ state_unlock (nal, &flags);
+
+ if (msg != NULL) {
+ /* NULL pointers, clear flags etc */
+ memset (msg, 0, sizeof (*msg));
+ msg->ack_wmd = PTL_WIRE_HANDLE_NONE;
+ }
+ return(msg);
}
static inline void
#else
-extern atomic_t md_in_use_count;
-extern atomic_t msg_in_use_count;
-extern atomic_t me_in_use_count;
-extern atomic_t eq_in_use_count;
-
static inline lib_eq_t *
lib_eq_alloc (nal_cb_t *nal)
{
/* NEVER called with statelock held */
lib_eq_t *eq;
- PORTAL_ALLOC(eq, sizeof(*eq));
-
- if (eq == NULL)
- return (NULL);
- atomic_inc (&eq_in_use_count);
+ PORTAL_ALLOC(eq, sizeof(*eq));
return (eq);
}
lib_eq_free (nal_cb_t *nal, lib_eq_t *eq)
{
/* ALWAYS called with statelock held */
- atomic_dec (&eq_in_use_count);
PORTAL_FREE(eq, sizeof(*eq));
}
static inline lib_md_t *
-lib_md_alloc (nal_cb_t *nal)
+lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd)
{
/* NEVER called with statelock held */
lib_md_t *md;
- PORTAL_ALLOC(md, sizeof(*md));
-
- if (md == NULL)
- return (NULL);
-
- atomic_inc (&md_in_use_count);
+ int size;
+ int niov;
+
+ if ((umd->options & PTL_MD_KIOV) != 0) {
+ niov = umd->niov;
+ size = offsetof(lib_md_t, md_iov.kiov[niov]);
+ } else {
+ niov = ((umd->options & PTL_MD_IOV) != 0) ?
+ umd->niov : 1;
+ size = offsetof(lib_md_t, md_iov.iov[niov]);
+ }
+
+ PORTAL_ALLOC(md, size);
+
+ if (md != NULL) {
+ /* Set here in case of early free */
+ md->options = umd->options;
+ md->md_niov = niov;
+ }
+
return (md);
}
lib_md_free (nal_cb_t *nal, lib_md_t *md)
{
/* ALWAYS called with statelock held */
- atomic_dec (&md_in_use_count);
- PORTAL_FREE(md, sizeof(*md));
+ int size;
+
+ if ((md->options & PTL_MD_KIOV) != 0)
+ size = offsetof(lib_md_t, md_iov.kiov[md->md_niov]);
+ else
+ size = offsetof(lib_md_t, md_iov.iov[md->md_niov]);
+
+ PORTAL_FREE(md, size);
}
static inline lib_me_t *
{
/* NEVER called with statelock held */
lib_me_t *me;
- PORTAL_ALLOC(me, sizeof(*me));
-
- if (me == NULL)
- return (NULL);
- atomic_inc (&me_in_use_count);
+ PORTAL_ALLOC(me, sizeof(*me));
return (me);
}
lib_me_free(nal_cb_t *nal, lib_me_t *me)
{
/* ALWAYS called with statelock held */
- atomic_dec (&me_in_use_count);
PORTAL_FREE(me, sizeof(*me));
}
static inline lib_msg_t *
lib_msg_alloc(nal_cb_t *nal)
{
- /* ALWAYS called with statelock held */
+ /* NEVER called with statelock held */
lib_msg_t *msg;
- PORTAL_ALLOC_ATOMIC(msg, sizeof(*msg));
- if (msg == NULL)
- return (NULL);
-
- atomic_inc (&msg_in_use_count);
+ PORTAL_ALLOC(msg, sizeof(*msg));
+ if (msg != NULL) {
+ /* NULL pointers, clear flags etc */
+ memset (msg, 0, sizeof (*msg));
+ msg->ack_wmd = PTL_WIRE_HANDLE_NONE;
+ }
return (msg);
}
lib_msg_free(nal_cb_t *nal, lib_msg_t *msg)
{
/* ALWAYS called with statelock held */
- atomic_dec (&msg_in_use_count);
PORTAL_FREE(msg, sizeof(*msg));
}
#endif
* Call backs will be made to write events, send acks or
* replies and so on.
*/
-extern int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private);
-extern int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t * msg);
+extern void lib_enq_event_locked (nal_cb_t *nal, void *private,
+ lib_eq_t *eq, ptl_event_t *ev);
+extern void lib_finalize (nal_cb_t *nal, void *private, lib_msg_t *msg,
+ ptl_err_t status);
+extern void lib_parse (nal_cb_t *nal, ptl_hdr_t *hdr, void *private);
extern lib_msg_t *lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid,
lib_md_t *getmd);
-extern void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr);
+extern void print_hdr (nal_cb_t * nal, ptl_hdr_t * hdr);
+
extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov);
-extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len);
-extern void lib_copy_buf2iov (int niov, struct iovec *iov, char *dest, ptl_size_t len);
+extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov,
+ ptl_size_t offset, ptl_size_t len);
+extern void lib_copy_buf2iov (int niov, struct iovec *iov, ptl_size_t offset,
+ char *src, ptl_size_t len);
+extern int lib_extract_iov (int dst_niov, struct iovec *dst,
+ int src_niov, struct iovec *src,
+ ptl_size_t offset, ptl_size_t len);
extern ptl_size_t lib_kiov_nob (int niov, ptl_kiov_t *iov);
-extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *iov, ptl_size_t len);
-extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *iov, char *src, ptl_size_t len);
+extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov,
+ ptl_size_t offset, ptl_size_t len);
+extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset,
+ char *src, ptl_size_t len);
+extern int lib_extract_kiov (int dst_niov, ptl_kiov_t *dst,
+ int src_niov, ptl_kiov_t *src,
+ ptl_size_t offset, ptl_size_t len);
+
extern void lib_assert_wire_constants (void);
-extern void lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
- ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen);
-extern int lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg,
- ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
- lib_md_t *md, ptl_size_t offset, ptl_size_t len);
+extern ptl_err_t lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
+ ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen);
+extern ptl_err_t lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg,
+ ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+ lib_md_t *md, ptl_size_t offset, ptl_size_t len);
extern void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md_in,
ptl_md_t * md_out);
lib_ni_t ni;
void *nal_data;
/*
- * send: Sends a preformatted header and user data to a
- * specified remote process.
- * Can overwrite iov.
+ * send: Sends a preformatted header and payload data to a
+ * specified remote process. The payload is scattered over 'niov'
+ * fragments described by iov, starting at 'offset' for 'mlen'
+ * bytes.
+ * NB the NAL may NOT overwrite iov.
+ * PTL_OK on success => NAL has committed to send and will call
+ * lib_finalize on completion
*/
- int (*cb_send) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
- ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
- unsigned int niov, struct iovec *iov, size_t mlen);
+ ptl_err_t (*cb_send) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
+ ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+ unsigned int niov, struct iovec *iov,
+ size_t offset, size_t mlen);
/* as send, but with a set of page fragments (NULL if not supported) */
- int (*cb_send_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
- ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
- unsigned int niov, ptl_kiov_t *iov, size_t mlen);
+ ptl_err_t (*cb_send_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
+ ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+ unsigned int niov, ptl_kiov_t *iov,
+ size_t offset, size_t mlen);
/*
- * recv: Receives an incoming message from a remote process
- * Type of iov depends on options. Can overwrite iov.
+ * recv: Receives an incoming message from a remote process. The
+ * payload is to be received into the scattered buffer of 'niov'
+ * fragments described by iov, starting at 'offset' for 'mlen'
+ * bytes. Payload bytes after 'mlen' up to 'rlen' are to be
+ * discarded.
+ * NB the NAL may NOT overwrite iov.
+ * PTL_OK on success => NAL has committed to receive and will call
+ * lib_finalize on completion
*/
- int (*cb_recv) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
- unsigned int niov, struct iovec *iov, size_t mlen,
- size_t rlen);
+ ptl_err_t (*cb_recv) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
+ unsigned int niov, struct iovec *iov,
+ size_t offset, size_t mlen, size_t rlen);
/* as recv, but with a set of page fragments (NULL if not supported) */
- int (*cb_recv_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
- unsigned int niov, ptl_kiov_t *iov, size_t mlen,
- size_t rlen);
+ ptl_err_t (*cb_recv_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
+ unsigned int niov, ptl_kiov_t *iov,
+ size_t offset, size_t mlen, size_t rlen);
/*
* read: Reads a block of data from a specified user address
*/
- int (*cb_read) (nal_cb_t * nal, void *private, void *dst_addr,
- user_ptr src_addr, size_t len);
+ ptl_err_t (*cb_read) (nal_cb_t * nal, void *private, void *dst_addr,
+ user_ptr src_addr, size_t len);
/*
* write: Writes a block of data into a specified user address
*/
- int (*cb_write) (nal_cb_t * nal, void *private, user_ptr dsr_addr,
- void *src_addr, size_t len);
+ ptl_err_t (*cb_write) (nal_cb_t * nal, void *private, user_ptr dsr_addr,
+ void *src_addr, size_t len);
/*
* callback: Calls an event callback
+ * NULL => lib calls eq's callback (if any) directly.
*/
- int (*cb_callback) (nal_cb_t * nal, void *private, lib_eq_t *eq,
- ptl_event_t *ev);
+ void (*cb_callback) (nal_cb_t * nal, void *private, lib_eq_t *eq,
+ ptl_event_t *ev);
/*
* malloc: Acquire a block of memory in a system independent
* type of *iov depends on options.
* Set to NULL if not required.
*/
- int (*cb_map) (nal_cb_t * nal, unsigned int niov, struct iovec *iov,
- void **addrkey);
+ ptl_err_t (*cb_map) (nal_cb_t * nal, unsigned int niov, struct iovec *iov,
+ void **addrkey);
void (*cb_unmap) (nal_cb_t * nal, unsigned int niov, struct iovec *iov,
void **addrkey);
/* as (un)map, but with a set of page fragments */
- int (*cb_map_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov,
- void **addrkey);
+ ptl_err_t (*cb_map_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov,
+ void **addrkey);
void (*cb_unmap_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov,
void **addrkey);
#include <portals/types.h>
#include <linux/kp30.h>
#include <portals/p30.h>
-#include <portals/errno.h>
#include <portals/lib-types.h>
#include <portals/lib-nal.h>
#include <portals/lib-dispatch.h>
nal->cb_sti(nal, flagsp); \
}
-#ifdef PTL_USE_DESC_LISTS
+#ifdef PTL_USE_LIB_FREELIST
#define MAX_MES 2048
#define MAX_MDS 2048
}
static inline lib_md_t *
-lib_md_alloc (nal_cb_t *nal)
+lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd)
{
/* NEVER called with statelock held */
unsigned long flags;
static inline lib_msg_t *
lib_msg_alloc (nal_cb_t *nal)
{
- /* ALWAYS called with statelock held */
- return ((lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs));
+ /* NEVER called with statelock held */
+ unsigned long flags;
+ lib_msg_t *msg;
+
+ state_lock (nal, &flags);
+ msg = (lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs);
+ state_unlock (nal, &flags);
+
+ if (msg != NULL) {
+ /* NULL pointers, clear flags etc */
+ memset (msg, 0, sizeof (*msg));
+ msg->ack_wmd = PTL_WIRE_HANDLE_NONE;
+ }
+ return(msg);
}
static inline void
#else
-extern atomic_t md_in_use_count;
-extern atomic_t msg_in_use_count;
-extern atomic_t me_in_use_count;
-extern atomic_t eq_in_use_count;
-
static inline lib_eq_t *
lib_eq_alloc (nal_cb_t *nal)
{
/* NEVER called with statelock held */
lib_eq_t *eq;
- PORTAL_ALLOC(eq, sizeof(*eq));
-
- if (eq == NULL)
- return (NULL);
- atomic_inc (&eq_in_use_count);
+ PORTAL_ALLOC(eq, sizeof(*eq));
return (eq);
}
lib_eq_free (nal_cb_t *nal, lib_eq_t *eq)
{
/* ALWAYS called with statelock held */
- atomic_dec (&eq_in_use_count);
PORTAL_FREE(eq, sizeof(*eq));
}
static inline lib_md_t *
-lib_md_alloc (nal_cb_t *nal)
+lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd)
{
/* NEVER called with statelock held */
lib_md_t *md;
- PORTAL_ALLOC(md, sizeof(*md));
-
- if (md == NULL)
- return (NULL);
-
- atomic_inc (&md_in_use_count);
+ int size;
+ int niov;
+
+ if ((umd->options & PTL_MD_KIOV) != 0) {
+ niov = umd->niov;
+ size = offsetof(lib_md_t, md_iov.kiov[niov]);
+ } else {
+ niov = ((umd->options & PTL_MD_IOV) != 0) ?
+ umd->niov : 1;
+ size = offsetof(lib_md_t, md_iov.iov[niov]);
+ }
+
+ PORTAL_ALLOC(md, size);
+
+ if (md != NULL) {
+ /* Set here in case of early free */
+ md->options = umd->options;
+ md->md_niov = niov;
+ }
+
return (md);
}
lib_md_free (nal_cb_t *nal, lib_md_t *md)
{
/* ALWAYS called with statelock held */
- atomic_dec (&md_in_use_count);
- PORTAL_FREE(md, sizeof(*md));
+ int size;
+
+ if ((md->options & PTL_MD_KIOV) != 0)
+ size = offsetof(lib_md_t, md_iov.kiov[md->md_niov]);
+ else
+ size = offsetof(lib_md_t, md_iov.iov[md->md_niov]);
+
+ PORTAL_FREE(md, size);
}
static inline lib_me_t *
{
/* NEVER called with statelock held */
lib_me_t *me;
- PORTAL_ALLOC(me, sizeof(*me));
-
- if (me == NULL)
- return (NULL);
- atomic_inc (&me_in_use_count);
+ PORTAL_ALLOC(me, sizeof(*me));
return (me);
}
lib_me_free(nal_cb_t *nal, lib_me_t *me)
{
/* ALWAYS called with statelock held */
- atomic_dec (&me_in_use_count);
PORTAL_FREE(me, sizeof(*me));
}
static inline lib_msg_t *
lib_msg_alloc(nal_cb_t *nal)
{
- /* ALWAYS called with statelock held */
+ /* NEVER called with statelock held */
lib_msg_t *msg;
- PORTAL_ALLOC_ATOMIC(msg, sizeof(*msg));
- if (msg == NULL)
- return (NULL);
-
- atomic_inc (&msg_in_use_count);
+ PORTAL_ALLOC(msg, sizeof(*msg));
+ if (msg != NULL) {
+ /* NULL pointers, clear flags etc */
+ memset (msg, 0, sizeof (*msg));
+ msg->ack_wmd = PTL_WIRE_HANDLE_NONE;
+ }
return (msg);
}
lib_msg_free(nal_cb_t *nal, lib_msg_t *msg)
{
/* ALWAYS called with statelock held */
- atomic_dec (&msg_in_use_count);
PORTAL_FREE(msg, sizeof(*msg));
}
#endif
* Call backs will be made to write events, send acks or
* replies and so on.
*/
-extern int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private);
-extern int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t * msg);
+extern void lib_enq_event_locked (nal_cb_t *nal, void *private,
+ lib_eq_t *eq, ptl_event_t *ev);
+extern void lib_finalize (nal_cb_t *nal, void *private, lib_msg_t *msg,
+ ptl_err_t status);
+extern void lib_parse (nal_cb_t *nal, ptl_hdr_t *hdr, void *private);
extern lib_msg_t *lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid,
lib_md_t *getmd);
-extern void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr);
+extern void print_hdr (nal_cb_t * nal, ptl_hdr_t * hdr);
+
extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov);
-extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len);
-extern void lib_copy_buf2iov (int niov, struct iovec *iov, char *dest, ptl_size_t len);
+extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov,
+ ptl_size_t offset, ptl_size_t len);
+extern void lib_copy_buf2iov (int niov, struct iovec *iov, ptl_size_t offset,
+ char *src, ptl_size_t len);
+extern int lib_extract_iov (int dst_niov, struct iovec *dst,
+ int src_niov, struct iovec *src,
+ ptl_size_t offset, ptl_size_t len);
extern ptl_size_t lib_kiov_nob (int niov, ptl_kiov_t *iov);
-extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *iov, ptl_size_t len);
-extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *iov, char *src, ptl_size_t len);
+extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov,
+ ptl_size_t offset, ptl_size_t len);
+extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset,
+ char *src, ptl_size_t len);
+extern int lib_extract_kiov (int dst_niov, ptl_kiov_t *dst,
+ int src_niov, ptl_kiov_t *src,
+ ptl_size_t offset, ptl_size_t len);
+
extern void lib_assert_wire_constants (void);
-extern void lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
- ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen);
-extern int lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg,
- ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
- lib_md_t *md, ptl_size_t offset, ptl_size_t len);
+extern ptl_err_t lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
+ ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen);
+extern ptl_err_t lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg,
+ ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+ lib_md_t *md, ptl_size_t offset, ptl_size_t len);
extern void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md_in,
ptl_md_t * md_out);
# include <linux/smp_lock.h>
# include <linux/types.h>
#else
-# define PTL_USE_DESC_LISTS
+# define PTL_USE_LIB_FREELIST
# include <sys/types.h>
#endif
struct lib_msg_t {
struct list_head msg_list;
- int send_ack;
lib_md_t *md;
- ptl_nid_t nid;
- ptl_pid_t pid;
- ptl_event_t ev;
ptl_handle_wire_t ack_wmd;
- union {
- struct iovec iov[PTL_MD_MAX_IOV];
- ptl_kiov_t kiov[PTL_MD_MAX_IOV];
- } msg_iov;
+ ptl_event_t ev;
};
struct lib_ptl_t {
};
#define PTL_MD_FLAG_UNLINK (1 << 0)
-#define PTL_MD_FLAG_AUTO_UNLINKED (1 << 1)
-#ifdef PTL_USE_DESC_LISTS
+#ifdef PTL_USE_LIB_FREELIST
typedef struct
{
void *fl_objs; /* single contiguous array of objects */
struct list_head ni_test_peers;
-#ifdef PTL_USE_DESC_LISTS
+#ifdef PTL_USE_LIB_FREELIST
lib_freelist_t ni_free_mes;
lib_freelist_t ni_free_msgs;
lib_freelist_t ni_free_mds;
#include <portals/types.h>
#include <portals/nal.h>
#include <portals/api.h>
-#include <portals/errno.h>
#include <portals/nalids.h>
extern int __p30_initialized; /* for libraries & test codes */
#include <portals/types.h>
#include <portals/nal.h>
#include <portals/api.h>
-#include <portals/errno.h>
#include <portals/nalids.h>
extern int __p30_initialized; /* for libraries & test codes */
# define do_gettimeofday(tv) gettimeofday(tv, NULL)
#endif
+#include <portals/errno.h>
+
typedef __u64 ptl_nid_t;
typedef __u32 ptl_pid_t;
typedef __u32 ptl_pt_index_t;
PTL_EVENT_PUT,
PTL_EVENT_REPLY,
PTL_EVENT_ACK,
- PTL_EVENT_SENT
+ PTL_EVENT_SENT,
+ PTL_EVENT_UNLINK,
} ptl_event_kind_t;
#define PTL_SEQ_BASETYPE long
#pragma pack(push, 4)
#endif
typedef struct {
- ptl_event_kind_t type;
- ptl_process_id_t initiator;
- ptl_pt_index_t portal;
- ptl_match_bits_t match_bits;
- ptl_size_t rlength, mlength, offset;
- ptl_handle_me_t unlinked_me;
- ptl_md_t mem_desc;
- ptl_hdr_data_t hdr_data;
- struct timeval arrival_time;
+ ptl_event_kind_t type;
+ ptl_err_t status;
+ int unlinked;
+ ptl_process_id_t initiator;
+ ptl_pt_index_t portal;
+ ptl_match_bits_t match_bits;
+ ptl_size_t rlength;
+ ptl_size_t mlength;
+ ptl_size_t offset;
+ ptl_md_t mem_desc;
+ ptl_hdr_data_t hdr_data;
+ struct timeval arrival_time;
+
volatile ptl_seq_t sequence;
} ptl_event_t;
#ifdef __CYGWIN__
int gmnal_cb_write(nal_cb_t *, void *private, user_ptr, void *, size_t);
-int gmnal_cb_callback(nal_cb_t *, void *, lib_eq_t *, ptl_event_t *);
-
void *gmnal_cb_malloc(nal_cb_t *, size_t);
void gmnal_cb_free(nal_cb_t *, void *, size_t);
a->cb_recv_pages = gmnal_cb_recv_pages; \
a->cb_read = gmnal_cb_read; \
a->cb_write = gmnal_cb_write; \
- a->cb_callback = gmnal_cb_callback; \
+ a->cb_callback = NULL; \
a->cb_malloc = gmnal_cb_malloc; \
a->cb_free = gmnal_cb_free; \
a->cb_map = NULL; \
niov, iov, len);
} else {
CDEBUG(D_ERROR, "Large message send it is not supported\n");
- lib_finalize(nal_cb, private, cookie);
return(PTL_FAIL);
gmnal_large_tx(nal_cb, private, cookie, hdr, type, nid, pid,
niov, iov, len);
return(PTL_OK);
}
-int gmnal_cb_callback(nal_cb_t *nal_cb, void *private, lib_eq_t *eq,
- ptl_event_t *ev)
-{
-
- if (eq->event_callback != NULL) {
- CDEBUG(D_INFO, "found callback\n");
- eq->event_callback(ev);
- }
-
- return(PTL_OK);
-}
-
void *gmnal_cb_malloc(nal_cb_t *nal_cb, size_t len)
{
void *ptr = NULL;
if (!private) {
CDEBUG(D_ERROR, "gmnal_small_rx no context\n");
- lib_finalize(nal_cb, private, cookie);
return(PTL_FAIL);
}
* let portals library know receive is complete
*/
CDEBUG(D_PORTALS, "calling lib_finalize\n");
- if (lib_finalize(nal_cb, private, cookie) != PTL_OK) {
- /* TO DO what to do with failed lib_finalise? */
- CDEBUG(D_INFO, "lib_finalize failed\n");
- }
+ lib_finalize(nal_cb, private, cookie, PTL_OK);
+
/*
* return buffer so it can be used again
*/
return;
}
gmnal_return_stxd(nal_data, stxd);
- if (lib_finalize(nal_cb, stxd, cookie) != PTL_OK) {
- CDEBUG(D_INFO, "Call to lib_finalize failed for stxd [%p]\n",
- stxd);
- }
+ lib_finalize(nal_cb, stxd, cookie, PTL_OK);
+
return;
}
if (!srxd) {
CDEBUG(D_ERROR, "gmnal_large_rx no context\n");
- lib_finalize(nal_cb, private, cookie);
return(PTL_FAIL);
}
* Let our client application proceed
*/
CDEBUG(D_ERROR, "final callback context[%p]\n", srxd);
- if (lib_finalize(nal_cb, srxd, srxd->cookie) != PTL_OK) {
- CDEBUG(D_INFO, "Call to lib_finalize failed for srxd [%p]\n",
- srxd);
- }
+ lib_finalize(nal_cb, srxd, srxd->cookie, PTL_OK);
/*
* send an ack to the sender to let him know we got the data
CDEBUG(D_INFO, "gmnal_large_tx_ack_received stxd [%p]\n", stxd);
- if (lib_finalize(nal_cb, stxd, stxd->cookie) != PTL_OK) {
- CDEBUG(D_INFO, "Call to lib_finalize failed for stxd [%p]\n",
- stxd);
- }
+ lib_finalize(nal_cb, stxd, stxd->cookie, PTL_OK);
/*
* extract the iovec from the stxd, deregister the memory.
if(buf_length > MAX_MSG_SIZE) {
CERROR("kibnal_send:request exceeds Transmit data size (%d).\n",
MAX_MSG_SIZE);
- rc = -1;
+ rc = PTL_FAIL;
return rc;
}
else {
PROF_FINISH(kibnal_send); // time stapm of send operation
- rc = 1;
+ rc = PTL_OK;
return rc;
}
ptl_kiov_t *iov,
size_t mlen)
{
- int rc = 1;
+ int rc = PTL_FAIL;
CDEBUG(D_NET, "kibnal_send_pages\n");
//
// do you need this
//
-int kibnal_callback(nal_cb_t * nal,
+void kibnal_callback(nal_cb_t * nal,
void *private,
lib_eq_t *eq,
ptl_event_t *ev)
{
CDEBUG(D_NET, "recv_pages not implemented\n");
- return PTL_OK;
+ return PTL_FAIL;
}
CDEBUG(D_NET,"kibnal_recv: mlen=%d, rlen=%d\n", mlen, rlen);
/* What was actually received must be >= what sender claims to
- * have sent. This is an LASSERT, since lib-move doesn't
- * check cb return code yet. */
- LASSERT (krx->krx_len >= sizeof (ptl_hdr_t) + rlen);
+ * have sent. */
LASSERT (mlen <= rlen);
+ if (krx->krx_len < sizeof (ptl_hdr_t) + rlen)
+ return (PTL_FAIL);
+
PROF_START(kibnal_recv);
if(mlen != 0) {
PROF_START(lib_finalize);
- lib_finalize(nal, private, cookie);
+ lib_finalize(nal, private, cookie, PTL_OK);
PROF_FINISH(lib_finalize);
PROF_FINISH(kibnal_recv);
- return rlen;
+ return PTL_OK;
}
//
* LIB functions follow
*
*/
-static int
+static ptl_err_t
kqswnal_read(nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr,
size_t len)
{
nal->ni.nid, len, src_addr, dst_addr );
memcpy( dst_addr, src_addr, len );
- return (0);
+ return (PTL_OK);
}
-static int
+static ptl_err_t
kqswnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr,
size_t len)
{
nal->ni.nid, len, src_addr, dst_addr );
memcpy( dst_addr, src_addr, len );
- return (0);
+ return (PTL_OK);
}
static void *
elan3_dvma_unload(kqswnal_data.kqn_ep->DmaState,
kqswnal_data.kqn_eptxdmahandle,
ktx->ktx_basepage, ktx->ktx_nmappedpages);
-
#endif
ktx->ktx_nmappedpages = 0;
}
int
-kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int nob, int niov, ptl_kiov_t *kiov)
+kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int offset, int nob, int niov, ptl_kiov_t *kiov)
{
int nfrags = ktx->ktx_nfrag;
int nmapped = ktx->ktx_nmappedpages;
LASSERT (niov > 0);
LASSERT (nob > 0);
+ /* skip complete frags before 'offset' */
+ while (offset >= kiov->kiov_len) {
+ offset -= kiov->kiov_len;
+ kiov++;
+ niov--;
+ LASSERT (niov > 0);
+ }
+
do {
- int fraglen = kiov->kiov_len;
+ int fraglen = kiov->kiov_len - offset;
/* nob exactly spans the iovs */
LASSERT (fraglen <= nob);
/* XXX this is really crap, but we'll have to kmap until
* EKC has a page (rather than vaddr) mapping interface */
- ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
+ ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset;
CDEBUG(D_NET,
"%p[%d] loading %p for %d, page %d, %d total\n",
kiov++;
niov--;
nob -= fraglen;
+ offset = 0;
/* iov must not run out before end of data */
LASSERT (nob == 0 || niov > 0);
}
int
-kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int nob, int niov, struct iovec *iov)
+kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int offset, int nob,
+ int niov, struct iovec *iov)
{
int nfrags = ktx->ktx_nfrag;
int nmapped = ktx->ktx_nmappedpages;
LASSERT (niov > 0);
LASSERT (nob > 0);
+ /* skip complete frags before offset */
+ while (offset >= iov->iov_len) {
+ offset -= iov->iov_len;
+ iov++;
+ niov--;
+ LASSERT (niov > 0);
+ }
+
do {
- int fraglen = iov->iov_len;
+ int fraglen = iov->iov_len - offset;
long npages = kqswnal_pages_spanned (iov->iov_base, fraglen);
/* nob exactly spans the iovs */
CDEBUG(D_NET,
"%p[%d] loading %p for %d, pages %d for %ld, %d total\n",
- ktx, nfrags, iov->iov_base, fraglen, basepage, npages,
- nmapped);
+ ktx, nfrags, iov->iov_base + offset, fraglen,
+ basepage, npages, nmapped);
#if MULTIRAIL_EKC
ep_dvma_load(kqswnal_data.kqn_ep, NULL,
- iov->iov_base, fraglen,
+ iov->iov_base + offset, fraglen,
kqswnal_data.kqn_ep_tx_nmh, basepage,
&railmask, &ktx->ktx_frags[nfrags]);
#else
elan3_dvma_kaddr_load (kqswnal_data.kqn_ep->DmaState,
kqswnal_data.kqn_eptxdmahandle,
- iov->iov_base, fraglen,
+ iov->iov_base + offset, fraglen,
basepage, &ktx->ktx_frags[nfrags].Base);
if (nfrags > 0 && /* previous frag mapped */
iov++;
niov--;
nob -= fraglen;
+ offset = 0;
/* iov must not run out before end of data */
LASSERT (nob == 0 || niov > 0);
kqswnal_tx_done (kqswnal_tx_t *ktx, int error)
{
lib_msg_t *msg;
- lib_msg_t *repmsg;
+ lib_msg_t *repmsg = NULL;
switch (ktx->ktx_state) {
case KTX_FORWARDING: /* router asked me to forward this packet */
case KTX_SENDING: /* packet sourced locally */
lib_finalize (&kqswnal_lib, ktx->ktx_args[0],
- (lib_msg_t *)ktx->ktx_args[1]);
+ (lib_msg_t *)ktx->ktx_args[1],
+ (error == 0) ? PTL_OK :
+ (error == -ENOMEM) ? PTL_NOSPACE : PTL_FAIL);
break;
case KTX_GETTING: /* Peer has DMA-ed direct? */
msg = (lib_msg_t *)ktx->ktx_args[1];
- repmsg = NULL;
- if (error == 0)
+ if (error == 0) {
repmsg = lib_fake_reply_msg (&kqswnal_lib,
ktx->ktx_nid, msg->md);
+ if (repmsg == NULL)
+ error = -ENOMEM;
+ }
- lib_finalize (&kqswnal_lib, ktx->ktx_args[0], msg);
-
- if (repmsg != NULL)
- lib_finalize (&kqswnal_lib, NULL, repmsg);
+ if (error == 0) {
+ lib_finalize (&kqswnal_lib, ktx->ktx_args[0],
+ msg, PTL_OK);
+ lib_finalize (&kqswnal_lib, NULL, repmsg, PTL_OK);
+ } else {
+ lib_finalize (&kqswnal_lib, ktx->ktx_args[0], msg,
+ (error == -ENOMEM) ? PTL_NOSPACE : PTL_FAIL);
+ }
break;
default:
ktx->ktx_nid, status);
kqswnal_notify_peer_down(ktx);
- status = -EIO;
+ status = -EHOSTDOWN;
} else if (ktx->ktx_state == KTX_GETTING) {
/* RPC completed OK; what did our peer put in the status
int
kqswnal_dma_reply (kqswnal_tx_t *ktx, int nfrag,
- struct iovec *iov, ptl_kiov_t *kiov, int nob)
+ struct iovec *iov, ptl_kiov_t *kiov,
+ int offset, int nob)
{
kqswnal_rx_t *krx = (kqswnal_rx_t *)ktx->ktx_args[0];
char *buffer = (char *)page_address(krx->krx_pages[0]);
/* Map the source data... */
ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 0;
if (kiov != NULL)
- rc = kqswnal_map_tx_kiov (ktx, nob, nfrag, kiov);
+ rc = kqswnal_map_tx_kiov (ktx, offset, nob, nfrag, kiov);
else
- rc = kqswnal_map_tx_iov (ktx, nob, nfrag, iov);
+ rc = kqswnal_map_tx_iov (ktx, offset, nob, nfrag, iov);
if (rc != 0) {
CERROR ("Can't map source data: %d\n", rc);
return (-ECONNABORTED);
}
-static int
+static ptl_err_t
kqswnal_sendmsg (nal_cb_t *nal,
void *private,
lib_msg_t *libmsg,
unsigned int payload_niov,
struct iovec *payload_iov,
ptl_kiov_t *payload_kiov,
+ size_t payload_offset,
size_t payload_nob)
{
kqswnal_tx_t *ktx;
#if KQSW_CHECKSUM
int i;
kqsw_csum_t csum;
+ int sumoff;
int sumnob;
#endif
}
/* peer expects RPC completion with GET data */
- rc = kqswnal_dma_reply (ktx,
- payload_niov, payload_iov,
- payload_kiov, payload_nob);
+ rc = kqswnal_dma_reply (ktx, payload_niov,
+ payload_iov, payload_kiov,
+ payload_offset, payload_nob);
if (rc == 0)
return (PTL_OK);
#if KQSW_CHECKSUM
csum = kqsw_csum (0, (char *)hdr, sizeof (*hdr));
memcpy (ktx->ktx_buffer + sizeof (*hdr), &csum, sizeof (csum));
- for (csum = 0, i = 0, sumnob = payload_nob; sumnob > 0; i++) {
+ for (csum = 0, i = 0, sumoff = payload_offset, sumnob = payload_nob; sumnob > 0; i++) {
+ LASSERT(i < niov);
if (payload_kiov != NULL) {
ptl_kiov_t *kiov = &payload_kiov[i];
- char *addr = ((char *)kmap (kiov->kiov_page)) +
- kiov->kiov_offset;
-
- csum = kqsw_csum (csum, addr, MIN (sumnob, kiov->kiov_len));
- sumnob -= kiov->kiov_len;
+
+ if (sumoff >= kiov->kiov_len) {
+ sumoff -= kiov->kiov_len;
+ } else {
+ char *addr = ((char *)kmap (kiov->kiov_page)) +
+ kiov->kiov_offset + sumoff;
+ int fragnob = kiov->kiov_len - sumoff;
+
+ csum = kqsw_csum(csum, addr, MIN(sumnob, fragnob));
+ sumnob -= fragnob;
+ sumoff = 0;
+ kunmap(kiov->kiov_page);
+ }
} else {
struct iovec *iov = &payload_iov[i];
- csum = kqsw_csum (csum, iov->iov_base, MIN (sumnob, kiov->iov_len));
- sumnob -= iov->iov_len;
+ if (sumoff > iov->iov_len) {
+ sumoff -= iov->iov_len;
+ } else {
+ char *addr = iov->iov_base + sumoff;
+ int fragnob = iov->iov_len - sumoff;
+
+ csum = kqsw_csum(csum, addr, MIN(sumnob, fragnob));
+ sumnob -= fragnob;
+ sumoff = 0;
+ }
}
}
- memcpy(ktx->ktx_buffer +sizeof(*hdr) +sizeof(csum), &csum,sizeof(csum));
+ memcpy(ktx->ktx_buffer + sizeof(*hdr) + sizeof(csum), &csum, sizeof(csum));
#endif
if (kqswnal_data.kqn_optimized_gets &&
ktx->ktx_state = KTX_GETTING;
if ((libmsg->md->options & PTL_MD_KIOV) != 0)
- rc = kqswnal_map_tx_kiov (ktx, md->length,
+ rc = kqswnal_map_tx_kiov (ktx, 0, md->length,
md->md_niov, md->md_iov.kiov);
else
- rc = kqswnal_map_tx_iov (ktx, md->length,
+ rc = kqswnal_map_tx_iov (ktx, 0, md->length,
md->md_niov, md->md_iov.iov);
if (rc < 0) {
if (payload_nob > 0) {
if (payload_kiov != NULL)
lib_copy_kiov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE,
- payload_niov, payload_kiov, payload_nob);
+ payload_niov, payload_kiov,
+ payload_offset, payload_nob);
else
lib_copy_iov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE,
- payload_niov, payload_iov, payload_nob);
+ payload_niov, payload_iov,
+ payload_offset, payload_nob);
}
} else {
ktx->ktx_frags[0].Len = KQSW_HDR_SIZE;
#endif
if (payload_kiov != NULL)
- rc = kqswnal_map_tx_kiov (ktx, payload_nob,
+ rc = kqswnal_map_tx_kiov (ktx, payload_offset, payload_nob,
payload_niov, payload_kiov);
else
- rc = kqswnal_map_tx_iov (ktx, payload_nob,
+ rc = kqswnal_map_tx_iov (ktx, payload_offset, payload_nob,
payload_niov, payload_iov);
if (rc != 0) {
kqswnal_put_idle_tx (ktx);
return (PTL_OK);
}
-static int
+static ptl_err_t
kqswnal_send (nal_cb_t *nal,
void *private,
lib_msg_t *libmsg,
ptl_pid_t pid,
unsigned int payload_niov,
struct iovec *payload_iov,
+ size_t payload_offset,
size_t payload_nob)
{
return (kqswnal_sendmsg (nal, private, libmsg, hdr, type, nid, pid,
- payload_niov, payload_iov, NULL, payload_nob));
+ payload_niov, payload_iov, NULL,
+ payload_offset, payload_nob));
}
-static int
+static ptl_err_t
kqswnal_send_pages (nal_cb_t *nal,
void *private,
lib_msg_t *libmsg,
ptl_pid_t pid,
unsigned int payload_niov,
ptl_kiov_t *payload_kiov,
+ size_t payload_offset,
size_t payload_nob)
{
return (kqswnal_sendmsg (nal, private, libmsg, hdr, type, nid, pid,
- payload_niov, NULL, payload_kiov, payload_nob));
+ payload_niov, NULL, payload_kiov,
+ payload_offset, payload_nob));
}
void
nob <= KQSW_TX_BUFFER_SIZE)
{
/* send from ktx's pre-mapped contiguous buffer? */
- lib_copy_iov2buf (ktx->ktx_buffer, niov, iov, nob);
+ lib_copy_iov2buf (ktx->ktx_buffer, niov, iov, 0, nob);
#if MULTIRAIL_EKC
ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer,
0, nob);
{
/* zero copy */
ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 0;
- rc = kqswnal_map_tx_iov (ktx, nob, niov, iov);
+ rc = kqswnal_map_tx_iov (ktx, 0, nob, niov, iov);
if (rc != 0)
goto failed;
krx->krx_rpc_reply_needed = 0;
kqswnal_rx_done (krx);
- lib_finalize (&kqswnal_lib, NULL, msg);
+ lib_finalize (&kqswnal_lib, NULL, msg,
+ (status == EP_SUCCESS) ? PTL_OK : PTL_FAIL);
kqswnal_put_idle_tx (ktx);
}
}
#endif
-static int
+static ptl_err_t
kqswnal_recvmsg (nal_cb_t *nal,
void *private,
lib_msg_t *libmsg,
unsigned int niov,
struct iovec *iov,
ptl_kiov_t *kiov,
+ size_t offset,
size_t mlen,
size_t rlen)
{
#endif
CDEBUG(D_NET,"kqswnal_recv, mlen="LPSZ", rlen="LPSZ"\n", mlen, rlen);
- /* What was actually received must be >= payload.
- * This is an LASSERT, as lib_finalize() doesn't have a completion status. */
- LASSERT (krx->krx_nob >= KQSW_HDR_SIZE + mlen);
+ /* What was actually received must be >= payload. */
LASSERT (mlen <= rlen);
+ if (krx->krx_nob < KQSW_HDR_SIZE + mlen) {
+ CERROR("Bad message size: have %d, need %d + %d\n",
+ krx->krx_nob, KQSW_HDR_SIZE, (int)mlen);
+ return (PTL_FAIL);
+ }
/* It must be OK to kmap() if required */
LASSERT (kiov == NULL || !in_interrupt ());
page_nob = PAGE_SIZE - KQSW_HDR_SIZE;
LASSERT (niov > 0);
+
if (kiov != NULL) {
- iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
- iov_nob = kiov->kiov_len;
+ /* skip complete frags */
+ while (offset >= kiov->kiov_len) {
+ offset -= kiov->kiov_len;
+ kiov++;
+ niov--;
+ LASSERT (niov > 0);
+ }
+ iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset;
+ iov_nob = kiov->kiov_len - offset;
} else {
- iov_ptr = iov->iov_base;
- iov_nob = iov->iov_len;
+ /* skip complete frags */
+ while (offset >= iov->iov_len) {
+ offset -= iov->iov_len;
+ iov++;
+ niov--;
+ LASSERT (niov > 0);
+ }
+ iov_ptr = iov->iov_base + offset;
+ iov_nob = iov->iov_len - offset;
}
-
+
for (;;)
{
- /* We expect the iov to exactly match mlen */
- LASSERT (iov_nob <= mlen);
-
- frag = MIN (page_nob, iov_nob);
+ frag = mlen;
+ if (frag > page_nob)
+ frag = page_nob;
+ if (frag > iov_nob)
+ frag = iov_nob;
+
memcpy (iov_ptr, page_ptr, frag);
#if KQSW_CHECKSUM
payload_csum = kqsw_csum (payload_csum, iov_ptr, frag);
"csum_nob %d\n",
hdr_csum, payload_csum, csum_frags, csum_nob);
#endif
- lib_finalize(nal, private, libmsg);
+ lib_finalize(nal, private, libmsg, PTL_OK);
- return (rlen);
+ return (PTL_OK);
}
-static int
+static ptl_err_t
kqswnal_recv(nal_cb_t *nal,
void *private,
lib_msg_t *libmsg,
unsigned int niov,
struct iovec *iov,
+ size_t offset,
size_t mlen,
size_t rlen)
{
- return (kqswnal_recvmsg (nal, private, libmsg, niov, iov, NULL, mlen, rlen));
+ return (kqswnal_recvmsg(nal, private, libmsg,
+ niov, iov, NULL,
+ offset, mlen, rlen));
}
-static int
+static ptl_err_t
kqswnal_recv_pages (nal_cb_t *nal,
void *private,
lib_msg_t *libmsg,
unsigned int niov,
ptl_kiov_t *kiov,
+ size_t offset,
size_t mlen,
size_t rlen)
{
- return (kqswnal_recvmsg (nal, private, libmsg, niov, NULL, kiov, mlen, rlen));
+ return (kqswnal_recvmsg(nal, private, libmsg,
+ niov, NULL, kiov,
+ offset, mlen, rlen));
}
int
break;
}
- lib_finalize(ktx->ktx_nal, ktx->ktx_private, ktx->ktx_cookie);
+ lib_finalize(ktx->ktx_nal, ktx->ktx_private, ktx->ktx_cookie,
+ (err == 0) ? PTL_OK : PTL_FAIL);
PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
}
if (buf_len > mac_get_mtusize(ksci->ksci_machandle)) {
CERROR("kscimacnal:request exceeds TX MTU size (%ld).\n",
mac_get_mtusize(ksci->ksci_machandle));
- return -EINVAL;
+ return PTL_FAIL;
}
/* save transaction info for later finalize and cleanup */
PORTAL_ALLOC(ktx, (sizeof(kscimacnal_tx_t)));
if (!ktx) {
- return -ENOMEM;
+ return PTL_NOSPACE;
}
ktx->ktx_nmapped = 0; /* Start with no mapped pages :) */
kscimacnal_txrelease, ktx);
if (!msg) {
PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
- return -ENOMEM;
+ return PTL_NOSPACE;
}
mac_put_mblk(msg, sizeof(ptl_hdr_t));
lastblk=msg;
if(!newblk) {
mac_free_msg(msg);
PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
- return -ENOMEM;
+ return PTL_NOSPACE;
}
mac_put_mblk(newblk, nob);
mac_link_mblk(lastblk, newblk);
CERROR("kscimacnal: mac_send() failed, rc=%d\n", rc);
mac_free_msg(msg);
PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
- return rc;
+ return PTL_FAIL;
}
- return 0;
+ return PTL_OK;
}
krx->msg, mlen, rlen, niov);
/* What was actually received must be >= what sender claims to have
- * sent. This is an LASSERT, since lib-move doesn't check cb return
- * code yet. Also, rlen seems to be negative when mlen==0 so don't
- * assert on that.
- */
- LASSERT (mlen==0 || mac_msg_size(krx->msg) >= sizeof(ptl_hdr_t)+rlen);
- LASSERT (mlen==0 || mlen <= rlen);
+ * sent. */
+ LASSERT (mlen <= rlen); /* something is wrong if this isn't true */
+ if (mac_msg_size(krx->msg) < sizeof(ptl_hdr_t)+mlen) {
+ /* We didn't receive everything lib thinks we did */
+ CERROR("Bad message size: have %d, need %d + %d\n",
+ mac_msg_size(krx->msg), (int)sizeof(ptl_hdr_t), (int)mlen);
+ return (PTL_FAIL);
+ }
+
/* It must be OK to kmap() if required */
LASSERT (kiov == NULL || !in_interrupt ());
/* Either all pages or all vaddrs */
CDEBUG(D_NET, "Calling lib_finalize.\n");
PROF_START(lib_finalize);
- lib_finalize(nal, private, cookie);
+ lib_finalize(nal, private, cookie, PTL_OK);
PROF_FINISH(lib_finalize);
CDEBUG(D_NET, "Done.\n");
- return rlen;
+ return PTL_OK;
}
/* complete current receive if any */
switch (conn->ksnc_rx_state) {
case SOCKNAL_RX_BODY:
-#if 0
- lib_finalize (&ksocknal_lib, NULL, conn->ksnc_cookie);
-#else
- CERROR ("Refusing to complete a partial receive from "
- LPX64", ip %d.%d.%d.%d:%d\n", conn->ksnc_peer->ksnp_nid,
- HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
- CERROR ("This may hang communications and "
- "prevent modules from unloading\n");
-#endif
+ CERROR("Completing partial receive from "LPX64
+ ", ip %d.%d.%d.%d:%d, with error\n",
+ conn->ksnc_peer->ksnp_nid,
+ HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+ lib_finalize (&ksocknal_lib, NULL, conn->ksnc_cookie, PTL_FAIL);
break;
case SOCKNAL_RX_BODY_FWD:
ksocknal_fmb_callback (conn->ksnc_cookie, -ECONNABORTED);
* LIB functions follow
*
*/
-int
+ptl_err_t
ksocknal_read(nal_cb_t *nal, void *private, void *dst_addr,
user_ptr src_addr, size_t len)
{
nal->ni.nid, (long)len, src_addr, dst_addr);
memcpy( dst_addr, src_addr, len );
- return 0;
+ return PTL_OK;
}
-int
+ptl_err_t
ksocknal_write(nal_cb_t *nal, void *private, user_ptr dst_addr,
void *src_addr, size_t len)
{
nal->ni.nid, (long)len, src_addr, dst_addr);
memcpy( dst_addr, src_addr, len );
- return 0;
-}
-
-int
-ksocknal_callback (nal_cb_t * nal, void *private, lib_eq_t *eq,
- ptl_event_t *ev)
-{
- CDEBUG(D_NET, LPX64": callback eq %p ev %p\n",
- nal->ni.nid, eq, ev);
-
- if (eq->event_callback != NULL)
- eq->event_callback(ev);
-
- return 0;
+ return PTL_OK;
}
void *
if (tx->tx_isfwd) { /* was a forwarded packet? */
kpr_fwd_done (&ksocknal_data.ksnd_router,
- KSOCK_TX_2_KPR_FWD_DESC (tx), 0);
+ KSOCK_TX_2_KPR_FWD_DESC (tx),
+ (tx->tx_resid == 0) ? 0 : -ECONNABORTED);
EXIT;
return;
}
/* local send */
ltx = KSOCK_TX_2_KSOCK_LTX (tx);
- lib_finalize (&ksocknal_lib, ltx->ltx_private, ltx->ltx_cookie);
+ lib_finalize (&ksocknal_lib, ltx->ltx_private, ltx->ltx_cookie,
+ (tx->tx_resid == 0) ? PTL_OK : PTL_FAIL);
ksocknal_free_ltx (ltx);
EXIT;
LASSERT (rc < 0);
if (!conn->ksnc_closing)
- CERROR ("[%p] Error %d on write to "LPX64
- " ip %d.%d.%d.%d:%d\n",conn, rc,
- conn->ksnc_peer->ksnp_nid,
- HIPQUAD(conn->ksnc_ipaddr),
- conn->ksnc_port);
+ CERROR("[%p] Error %d on write to "LPX64
+ " ip %d.%d.%d.%d:%d\n", conn, rc,
+ conn->ksnc_peer->ksnp_nid,
+ HIPQUAD(conn->ksnc_ipaddr),
+ conn->ksnc_port);
ksocknal_close_conn_and_siblings (conn, rc);
ksocknal_tx_launched (tx);
-
+
return (rc);
-}
+}
void
ksocknal_launch_autoconnect_locked (ksock_route_t *route)
ptl_nid_t target_nid;
int rc;
ksock_peer_t *peer = ksocknal_find_peer_locked (nid);
-
+
if (peer != NULL)
return (peer);
-
+
if (tx->tx_isfwd) {
CERROR ("Can't send packet to "LPX64
- " %s: routed target is not a peer\n",
+ " %s: routed target is not a peer\n",
nid, portals_nid2str(SOCKNAL, nid, ipbuf));
return (NULL);
}
-
+
rc = kpr_lookup (&ksocknal_data.ksnd_router, nid, tx->tx_nob,
&target_nid);
if (rc != 0) {
- CERROR ("Can't route to "LPX64" %s: router error %d\n",
+ CERROR ("Can't route to "LPX64" %s: router error %d\n",
nid, portals_nid2str(SOCKNAL, nid, ipbuf), rc);
return (NULL);
}
return (-EHOSTUNREACH);
}
-int
+ptl_err_t
ksocknal_sendmsg(nal_cb_t *nal,
void *private,
lib_msg_t *cookie,
unsigned int payload_niov,
struct iovec *payload_iov,
ptl_kiov_t *payload_kiov,
+ size_t payload_offset,
size_t payload_nob)
{
ksock_ltx_t *ltx;
ltx->ltx_tx.tx_kiov = NULL;
ltx->ltx_tx.tx_nkiov = 0;
- ltx->ltx_tx.tx_niov = 1 + payload_niov;
-
- memcpy(ltx->ltx_iov + 1, payload_iov,
- payload_niov * sizeof (*payload_iov));
-
+ ltx->ltx_tx.tx_niov =
+ 1 + lib_extract_iov(payload_niov, &ltx->ltx_iov[1],
+ payload_niov, payload_iov,
+ payload_offset, payload_nob);
} else {
/* payload is all pages */
- ltx->ltx_tx.tx_kiov = ltx->ltx_kiov;
- ltx->ltx_tx.tx_nkiov = payload_niov;
-
ltx->ltx_tx.tx_niov = 1;
- memcpy(ltx->ltx_kiov, payload_kiov,
- payload_niov * sizeof (*payload_kiov));
+ ltx->ltx_tx.tx_kiov = ltx->ltx_kiov;
+ ltx->ltx_tx.tx_nkiov =
+ lib_extract_kiov(payload_niov, ltx->ltx_kiov,
+ payload_niov, payload_kiov,
+ payload_offset, payload_nob);
}
 rc = ksocknal_launch_packet(&ltx->ltx_tx, nid);
return (PTL_FAIL);
}
-int
+ptl_err_t
ksocknal_send (nal_cb_t *nal, void *private, lib_msg_t *cookie,
ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
unsigned int payload_niov, struct iovec *payload_iov,
- size_t payload_len)
+ size_t payload_offset, size_t payload_len)
{
return (ksocknal_sendmsg(nal, private, cookie,
hdr, type, nid, pid,
payload_niov, payload_iov, NULL,
- payload_len));
+ payload_offset, payload_len));
}
-int
+ptl_err_t
ksocknal_send_pages (nal_cb_t *nal, void *private, lib_msg_t *cookie,
ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
unsigned int payload_niov, ptl_kiov_t *payload_kiov,
- size_t payload_len)
+ size_t payload_offset, size_t payload_len)
{
return (ksocknal_sendmsg(nal, private, cookie,
hdr, type, nid, pid,
payload_niov, NULL, payload_kiov,
- payload_len));
+ payload_offset, payload_len));
}
void
/* drop peer ref taken on init */
ksocknal_put_peer (fmb->fmb_peer);
-
+
spin_lock_irqsave (&fmp->fmp_lock, flags);
list_add (&fmb->fmb_list, &fmp->fmp_idle_fmbs);
case SOCKNAL_RX_BODY:
/* payload all received */
- lib_finalize(&ksocknal_lib, NULL, conn->ksnc_cookie);
+ lib_finalize(&ksocknal_lib, NULL, conn->ksnc_cookie, PTL_OK);
/* Fall through */
case SOCKNAL_RX_SLOP:
return (-EINVAL); /* keep gcc happy */
}
-int
+ptl_err_t
ksocknal_recv (nal_cb_t *nal, void *private, lib_msg_t *msg,
- unsigned int niov, struct iovec *iov, size_t mlen, size_t rlen)
+ unsigned int niov, struct iovec *iov,
+ size_t offset, size_t mlen, size_t rlen)
{
ksock_conn_t *conn = (ksock_conn_t *)private;
conn->ksnc_rx_nkiov = 0;
conn->ksnc_rx_kiov = NULL;
- conn->ksnc_rx_niov = niov;
conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov;
- memcpy (conn->ksnc_rx_iov, iov, niov * sizeof (*iov));
+ conn->ksnc_rx_niov =
+ lib_extract_iov(PTL_MD_MAX_IOV, conn->ksnc_rx_iov,
+ niov, iov, offset, mlen);
LASSERT (mlen ==
lib_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) +
lib_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov));
- return (rlen);
+ return (PTL_OK);
}
-int
+ptl_err_t
ksocknal_recv_pages (nal_cb_t *nal, void *private, lib_msg_t *msg,
- unsigned int niov, ptl_kiov_t *kiov, size_t mlen, size_t rlen)
+ unsigned int niov, ptl_kiov_t *kiov,
+ size_t offset, size_t mlen, size_t rlen)
{
ksock_conn_t *conn = (ksock_conn_t *)private;
conn->ksnc_rx_niov = 0;
conn->ksnc_rx_iov = NULL;
- conn->ksnc_rx_nkiov = niov;
conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov;
- memcpy (conn->ksnc_rx_kiov, kiov, niov * sizeof (*kiov));
+ conn->ksnc_rx_nkiov =
+ lib_extract_kiov(PTL_MD_MAX_IOV, conn->ksnc_rx_kiov,
+ niov, kiov, offset, mlen);
LASSERT (mlen ==
lib_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) +
lib_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov));
- return (rlen);
+ return (PTL_OK);
}
int ksocknal_scheduler (void *arg)
rc, *nid, portals_nid2str(SOCKNAL, *nid, ipbuf));
return (rc);
}
-
+
if (hmv->magic != __le32_to_cpu (PORTALS_PROTO_MAGIC)) {
CERROR ("Bad magic %#08x (%#08x expected) from "LPX64" %s\n",
__cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC, *nid,
} else if (*nid != __le64_to_cpu (hdr.src_nid)) {
CERROR ("Connected to nid "LPX64" %s, but expecting "LPX64" %s\n",
__le64_to_cpu (hdr.src_nid),
- portals_nid2str(SOCKNAL,
+ portals_nid2str(SOCKNAL,
__le64_to_cpu(hdr.src_nid),
ipbuf),
*nid, portals_nid2str(SOCKNAL, *nid, ipbuf));
*type = SOCKNAL_CONN_BULK_IN;
break;
default:
- CERROR ("Unexpected type %d from "LPX64" %s\n",
+ CERROR ("Unexpected type %d from "LPX64" %s\n",
*type, *nid,
portals_nid2str(SOCKNAL, *nid, ipbuf));
return (-EPROTO);
if (rc != 0) {
CERROR ("Error %d connecting to "LPX64" %s\n", rc,
route->ksnr_peer->ksnp_nid,
- portals_nid2str(SOCKNAL,
- route->ksnr_peer->ksnp_nid,
+ portals_nid2str(SOCKNAL,
+ route->ksnr_peer->ksnp_nid,
ipbuf));
goto out;
}
while (!list_empty (&zombies)) {
char ipbuf[PTL_NALFMT_SIZE];
tx = list_entry (zombies.next, ksock_tx_t, tx_list);
-
+
CERROR ("Deleting packet type %d len %d ("LPX64" %s->"LPX64" %s)\n",
NTOH__u32 (tx->tx_hdr->type),
NTOH__u32 (tx->tx_hdr->payload_length),
cb_recv_pages: ksocknal_recv_pages,
cb_read: ksocknal_read,
cb_write: ksocknal_write,
- cb_callback: ksocknal_callback,
cb_malloc: ksocknal_malloc,
cb_free: ksocknal_free,
cb_printf: ksocknal_printf,
EXPORT_SYMBOL(lib_iov_nob);
EXPORT_SYMBOL(lib_copy_iov2buf);
EXPORT_SYMBOL(lib_copy_buf2iov);
+EXPORT_SYMBOL(lib_extract_iov);
EXPORT_SYMBOL(lib_kiov_nob);
EXPORT_SYMBOL(lib_copy_kiov2buf);
EXPORT_SYMBOL(lib_copy_buf2kiov);
+EXPORT_SYMBOL(lib_extract_kiov);
EXPORT_SYMBOL(lib_finalize);
EXPORT_SYMBOL(lib_parse);
EXPORT_SYMBOL(lib_fake_reply_msg);
CPPFLAGS=
INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include
-lib_LIBRARIES= libportals.a
+noinst_LIBRARIES= libportals.a
libportals_a_SOURCES= api-eq.c api-init.c api-me.c api-errno.c api-ni.c api-wrap.c lib-dispatch.c lib-init.c lib-me.c lib-msg.c lib-eq.c lib-md.c lib-move.c lib-ni.c lib-pid.c
+
+if LIBLUSTRE
+libportals_a_CFLAGS= -fPIC
+endif
*ev = *new_event;
- /* Set the unlinked_me interface number if there is one to pass
- * back, since the NAL hasn't a clue what it is and therefore can't
- * set it. */
- if (!PtlHandleEqual (ev->unlinked_me, PTL_HANDLE_NONE))
- ev->unlinked_me.nal_idx = eventq.nal_idx;
-
/* ensure event is delivered correctly despite possible
races with lib_finalize */
if (eq->sequence != new_event->sequence) {
}
#ifndef __KERNEL__
+#if 0
static jmp_buf eq_jumpbuf;
static void eq_timeout(int signal)
return rc;
}
+#else
+#include <errno.h>
-#endif
+/* FIXME
+ * Here timeout need a trick with tcpnal, definitely unclean but OK for
+ * this moment.
+ */
+
+/* global variables defined by tcpnal */
+extern int __tcpnal_eqwait_timeout_value;
+extern int __tcpnal_eqwait_timedout;
+
+int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out,
+ int timeout)
+{
+ int rc;
+ if (!timeout)
+ return PtlEQWait(eventq_in, event_out);
+
+ __tcpnal_eqwait_timeout_value = timeout;
+
+ while ((rc = PtlEQGet(eventq_in, event_out)) == PTL_EQ_EMPTY) {
+ nal_t *nal = ptl_hndl2nal(&eventq_in);
+
+ if (nal->yield)
+ nal->yield(nal);
+
+ if (__tcpnal_eqwait_timedout) {
+ if (__tcpnal_eqwait_timedout != ETIMEDOUT)
+ printf("Warning: yield return error %d\n",
+ __tcpnal_eqwait_timedout);
+ rc = PTL_EQ_EMPTY;
+ break;
+ }
+ }
+
+ __tcpnal_eqwait_timeout_value = 0;
+
+ return rc;
+}
+#endif
+#endif /* __KERNEL__ */
"PTL_IOV_TOO_SMALL",
"PTL_EQ_INUSE",
- "PTL_MD_INUSE"
};
/* If you change these, you must update the number table in portals/errno.h */
if (ptl_interfaces[i] == nal) {
nal->refct++;
handle->nal_idx = (NI_HANDLE_MAGIC & ~NI_HANDLE_MASK) | i;
- fprintf(stderr, "Returning existing NAL (%d)\n", i);
+ CDEBUG(D_OTHER, "Returning existing NAL (%d)\n", i);
ptl_ni_init_mutex_exit ();
return PTL_OK;
}
nal_t *nal;
if (!ptl_init) {
- fprintf(stderr, "PtlGetId: Not initialized\n");
+ CERROR("Not initialized\n");
return PTL_NOINIT;
}
int i;
if (!ptl_init) {
- fprintf(stderr, "PtlMDAttach/Bind/Update: Not initialized\n");
+ CERROR("PtlMDAttach/Bind/Update: Not initialized\n");
return PTL_NOINIT;
}
# include <sys/time.h>
#endif
-#ifndef PTL_USE_DESC_LISTS
-static int ptl_slab_users;
-
-atomic_t md_in_use_count = ATOMIC_INIT(0);
-atomic_t msg_in_use_count = ATOMIC_INIT(0);
-atomic_t me_in_use_count = ATOMIC_INIT(0);
-atomic_t eq_in_use_count = ATOMIC_INIT(0);
+#ifndef PTL_USE_LIB_FREELIST
int
kportal_descriptor_setup (nal_cb_t *nal)
{
- ptl_slab_users++;
- RETURN(PTL_OK);
+ return PTL_OK;
}
void
kportal_descriptor_cleanup (nal_cb_t *nal)
{
- if (--ptl_slab_users != 0)
- return;
-
- LASSERT (atomic_read (&md_in_use_count) == 0);
- LASSERT (atomic_read (&me_in_use_count) == 0);
- LASSERT (atomic_read (&eq_in_use_count) == 0);
- LASSERT (atomic_read (&msg_in_use_count) == 0);
}
#else
int rc;
int i;
- /* NB we are passes an allocated, but uninitialised/active md.
+ /* NB we are passed an allocated, but uninitialised/active md.
* if we return success, caller may lib_md_unlink() it.
* otherwise caller may only lib_md_free() it.
*/
return PTL_INV_EQ;
}
- if ((md->options & PTL_MD_IOV) != 0 && /* discontiguous MD */
- md->niov > PTL_MD_MAX_IOV) /* too many fragments */
- return PTL_IOV_TOO_MANY;
+ /* Must check this _before_ allocation. Also, note that non-iov
+ * MDs must set md_niov to 0. */
+ LASSERT((md->options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0 ||
+ md->niov <= PTL_MD_MAX_IOV);
if ((md->options & max_size_opts) != 0 && /* max size used */
(md->max_size < 0 || md->max_size > md->length)) // illegal max_size
lib_md_t *md;
unsigned long flags;
- md = lib_md_alloc (nal);
+ if ((args->md_in.options & (PTL_MD_KIOV | PTL_MD_IOV)) != 0 &&
+ args->md_in.niov > PTL_MD_MAX_IOV) /* too many fragments */
+ return (ret->rc = PTL_IOV_TOO_MANY);
+
+ md = lib_md_alloc(nal, &args->md_in);
if (md == NULL)
return (ret->rc = PTL_NOSPACE);
lib_md_t *md;
unsigned long flags;
- md = lib_md_alloc (nal);
+ if ((args->md_in.options & (PTL_MD_KIOV | PTL_MD_IOV)) != 0 &&
+ args->md_in.niov > PTL_MD_MAX_IOV) /* too many fragments */
+ return (ret->rc = PTL_IOV_TOO_MANY);
+
+ md = lib_md_alloc(nal, &args->md_in);
if (md == NULL)
return (ret->rc = PTL_NOSPACE);
int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
{
- PtlMDUnlink_in *args = v_args;
+ PtlMDUnlink_in *args = v_args;
PtlMDUnlink_out *ret = v_ret;
-
- lib_md_t *md;
- unsigned long flags;
+ ptl_event_t ev;
+ lib_md_t *md;
+ unsigned long flags;
state_lock(nal, &flags);
md = ptl_handle2md(&args->md_in, nal);
if (md == NULL) {
- ret->rc = PTL_INV_MD;
- } else if (md->pending != 0) { /* being filled/spilled */
- ret->rc = PTL_MD_INUSE;
- } else {
- /* Callers attempting to unlink a busy MD which will get
- * unlinked once the net op completes should see INUSE,
- * before completion and INV_MD thereafter. LASSERT we've
- * got that right... */
- LASSERT ((md->md_flags & PTL_MD_FLAG_UNLINK) == 0);
-
- lib_md_deconstruct(nal, md, &ret->status_out);
- lib_md_unlink(nal, md);
- ret->rc = PTL_OK;
+ state_unlock(nal, &flags);
+ return (ret->rc = PTL_INV_MD);
+ }
+
+ /* If the MD is busy, lib_md_unlink just marks it for deletion, and
+ * when the NAL is done, the completion event flags that the MD was
+ * unlinked. Otherwise, we enqueue an event now... */
+
+ if (md->eq != NULL &&
+ md->pending == 0) {
+ memset(&ev, 0, sizeof(ev));
+
+ ev.type = PTL_EVENT_UNLINK;
+ ev.status = PTL_OK;
+ ev.unlinked = 1;
+ lib_md_deconstruct(nal, md, &ev.mem_desc);
+
+ lib_enq_event_locked(nal, private, md->eq, &ev);
}
+ lib_md_deconstruct(nal, md, &ret->status_out);
+ lib_md_unlink(nal, md);
+ ret->rc = PTL_OK;
+
state_unlock(nal, &flags);
- return (ret->rc);
+ return (PTL_OK);
}
int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *v_args,
goto out;
}
+ /* XXX fttb, the new MD must be the same type wrt fragmentation */
+ if (((new->options ^ md->options) &
+ (PTL_MD_IOV | PTL_MD_KIOV)) != 0) {
+ ret->rc = PTL_INV_MD;
+ goto out;
+ }
+
+ if (new->niov > md->md_niov) {
+ ret->rc = PTL_IOV_TOO_MANY;
+ goto out;
+ }
+
+ if (new->niov < md->md_niov) {
+ ret->rc = PTL_IOV_TOO_SMALL;
+ goto out;
+ }
+
if (!PtlHandleEqual (args->testq_in, PTL_EQ_NONE)) {
test_eq = ptl_handle2eq(&args->testq_in, nal);
if (test_eq == NULL) {
}
void
-lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len)
+lib_copy_iov2buf (char *dest, int niov, struct iovec *iov,
+ ptl_size_t offset, ptl_size_t len)
{
ptl_size_t nob;
- while (len > 0)
- {
+ if (len == 0)
+ return;
+
+ /* skip complete frags before 'offset' */
+ LASSERT (niov > 0);
+ while (offset >= iov->iov_len) {
+ offset -= iov->iov_len;
+ iov++;
+ niov--;
+ LASSERT (niov > 0);
+ }
+
+ do {
LASSERT (niov > 0);
- nob = MIN (iov->iov_len, len);
- memcpy (dest, iov->iov_base, nob);
+ nob = MIN (iov->iov_len - offset, len);
+ memcpy (dest, iov->iov_base + offset, nob);
len -= nob;
dest += nob;
niov--;
iov++;
- }
+ offset = 0;
+ } while (len > 0);
}
void
-lib_copy_buf2iov (int niov, struct iovec *iov, char *src, ptl_size_t len)
+lib_copy_buf2iov (int niov, struct iovec *iov, ptl_size_t offset,
+ char *src, ptl_size_t len)
{
ptl_size_t nob;
- while (len > 0)
- {
+ if (len == 0)
+ return;
+
+ /* skip complete frags before 'offset' */
+ LASSERT (niov > 0);
+ while (offset >= iov->iov_len) {
+ offset -= iov->iov_len;
+ iov++;
+ niov--;
LASSERT (niov > 0);
- nob = MIN (iov->iov_len, len);
- memcpy (iov->iov_base, src, nob);
+ }
+
+ do {
+ LASSERT (niov > 0);
+ nob = MIN (iov->iov_len - offset, len);
+ memcpy (iov->iov_base + offset, src, nob);
len -= nob;
src += nob;
niov--;
iov++;
- }
+ offset = 0;
+ } while (len > 0);
}
-static int
-lib_extract_iov (struct iovec *dst, lib_md_t *md,
+int
+lib_extract_iov (int dst_niov, struct iovec *dst,
+ int src_niov, struct iovec *src,
ptl_size_t offset, ptl_size_t len)
{
/* Initialise 'dst' to the subset of 'src' starting at 'offset',
* for exactly 'len' bytes, and return the number of entries.
* NB not destructive to 'src' */
- int src_niov = md->md_niov;
- struct iovec *src = md->md_iov.iov;
ptl_size_t frag_len;
- int dst_niov;
+ int niov;
- LASSERT (offset + len <= md->length);
-
if (len == 0) /* no data => */
return (0); /* no frags */
LASSERT (src_niov > 0);
}
- dst_niov = 1;
+ niov = 1;
for (;;) {
LASSERT (src_niov > 0);
- LASSERT (dst_niov <= PTL_MD_MAX_IOV);
+ LASSERT (niov <= dst_niov);
frag_len = src->iov_len - offset;
dst->iov_base = ((char *)src->iov_base) + offset;
if (len <= frag_len) {
dst->iov_len = len;
- return (dst_niov);
+ return (niov);
}
dst->iov_len = frag_len;
len -= frag_len;
dst++;
src++;
- dst_niov++;
+ niov++;
src_niov--;
offset = 0;
}
}
void
-lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len)
+lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov,
+ ptl_size_t offset, ptl_size_t len)
{
LASSERT (0);
}
void
-lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *dest, ptl_size_t len)
+lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset,
+ char *src, ptl_size_t len)
{
LASSERT (0);
}
-static int
-lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md,
+int
+lib_extract_kiov (int dst_niov, ptl_kiov_t *dst,
+ int src_niov, ptl_kiov_t *src,
ptl_size_t offset, ptl_size_t len)
{
LASSERT (0);
}
void
-lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len)
+lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov,
+ ptl_size_t offset, ptl_size_t len)
{
ptl_size_t nob;
char *addr;
+
+ if (len == 0)
+ return;
LASSERT (!in_interrupt ());
- while (len > 0)
- {
+
+ LASSERT (niov > 0);
+ while (offset >= kiov->kiov_len) {
+ offset -= kiov->kiov_len;
+ kiov++;
+ niov--;
+ LASSERT (niov > 0);
+ }
+
+ do {
LASSERT (niov > 0);
- nob = MIN (kiov->kiov_len, len);
+ nob = MIN (kiov->kiov_len - offset, len);
- addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
+ addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset;
memcpy (dest, addr, nob);
kunmap (kiov->kiov_page);
dest += nob;
niov--;
kiov++;
- }
+ offset = 0;
+ } while (len > 0);
}
void
-lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *src, ptl_size_t len)
+lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset,
+ char *src, ptl_size_t len)
{
ptl_size_t nob;
char *addr;
+ if (len == 0)
+ return;
+
LASSERT (!in_interrupt ());
- while (len > 0)
- {
+
+ LASSERT (niov > 0);
+ while (offset >= kiov->kiov_len) {
+ offset -= kiov->kiov_len;
+ kiov++;
+ niov--;
+ LASSERT (niov > 0);
+ }
+
+ do {
LASSERT (niov > 0);
- nob = MIN (kiov->kiov_len, len);
+ nob = MIN (kiov->kiov_len - offset, len);
- addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
+ addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset;
memcpy (addr, src, nob);
kunmap (kiov->kiov_page);
src += nob;
niov--;
kiov++;
- }
+ offset = 0;
+ } while (len > 0);
}
-static int
-lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md,
+int
+lib_extract_kiov (int dst_niov, ptl_kiov_t *dst,
+ int src_niov, ptl_kiov_t *src,
ptl_size_t offset, ptl_size_t len)
{
/* Initialise 'dst' to the subset of 'src' starting at 'offset',
* for exactly 'len' bytes, and return the number of entries.
* NB not destructive to 'src' */
- int src_niov = md->md_niov;
- ptl_kiov_t *src = md->md_iov.kiov;
ptl_size_t frag_len;
- int dst_niov;
+ int niov;
- LASSERT (offset + len <= md->length);
-
if (len == 0) /* no data => */
return (0); /* no frags */
LASSERT (src_niov > 0);
}
- dst_niov = 1;
+ niov = 1;
for (;;) {
LASSERT (src_niov > 0);
- LASSERT (dst_niov <= PTL_MD_MAX_IOV);
+ LASSERT (niov <= dst_niov);
frag_len = src->kiov_len - offset;
dst->kiov_page = src->kiov_page;
if (len <= frag_len) {
dst->kiov_len = len;
LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_SIZE);
- return (dst_niov);
+ return (niov);
}
dst->kiov_len = frag_len;
len -= frag_len;
dst++;
src++;
- dst_niov++;
+ niov++;
src_niov--;
offset = 0;
}
}
#endif
-void
+ptl_err_t
lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen)
{
- int niov;
-
if (mlen == 0)
- nal->cb_recv (nal, private, msg, 0, NULL, 0, rlen);
- else if ((md->options & PTL_MD_KIOV) == 0) {
- niov = lib_extract_iov (msg->msg_iov.iov, md, offset, mlen);
- nal->cb_recv (nal, private, msg,
- niov, msg->msg_iov.iov, mlen, rlen);
- } else {
- niov = lib_extract_kiov (msg->msg_iov.kiov, md, offset, mlen);
- nal->cb_recv_pages (nal, private, msg,
- niov, msg->msg_iov.kiov, mlen, rlen);
- }
+ return (nal->cb_recv(nal, private, msg,
+ 0, NULL,
+ offset, mlen, rlen));
+
+ if ((md->options & PTL_MD_KIOV) == 0)
+ return (nal->cb_recv(nal, private, msg,
+ md->md_niov, md->md_iov.iov,
+ offset, mlen, rlen));
+
+ return (nal->cb_recv_pages(nal, private, msg,
+ md->md_niov, md->md_iov.kiov,
+ offset, mlen, rlen));
}
-int
+ptl_err_t
lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg,
ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
lib_md_t *md, ptl_size_t offset, ptl_size_t len)
{
- int niov;
-
if (len == 0)
- return (nal->cb_send (nal, private, msg,
- hdr, type, nid, pid,
- 0, NULL, 0));
+ return (nal->cb_send(nal, private, msg,
+ hdr, type, nid, pid,
+ 0, NULL,
+ offset, len));
- if ((md->options & PTL_MD_KIOV) == 0) {
- niov = lib_extract_iov (msg->msg_iov.iov, md, offset, len);
- return (nal->cb_send (nal, private, msg,
- hdr, type, nid, pid,
- niov, msg->msg_iov.iov, len));
- }
-
- niov = lib_extract_kiov (msg->msg_iov.kiov, md, offset, len);
- return (nal->cb_send_pages (nal, private, msg,
- hdr, type, nid, pid,
- niov, msg->msg_iov.kiov, len));
+ if ((md->options & PTL_MD_KIOV) == 0)
+ return (nal->cb_send(nal, private, msg,
+ hdr, type, nid, pid,
+ md->md_niov, md->md_iov.iov,
+ offset, len));
+
+ return (nal->cb_send_pages(nal, private, msg,
+ hdr, type, nid, pid,
+ md->md_niov, md->md_iov.kiov,
+ offset, len));
}
-static lib_msg_t *
-get_new_msg (nal_cb_t *nal, lib_md_t *md)
+static void
+lib_commit_md (nal_cb_t *nal, lib_md_t *md, lib_msg_t *msg)
{
/* ALWAYS called holding the state_lock */
lib_counters_t *counters = &nal->ni.counters;
- lib_msg_t *msg = lib_msg_alloc (nal);
-
- if (msg == NULL)
- return (NULL);
-
- memset (msg, 0, sizeof (*msg));
-
- msg->send_ack = 0;
+ /* Here, we commit the MD to a network OP by marking it busy and
+ * decrementing its threshold. Come what may, the network "owns"
+ * the MD until a call to lib_finalize() signals completion. */
msg->md = md;
- do_gettimeofday(&msg->ev.arrival_time);
+
md->pending++;
if (md->threshold != PTL_MD_THRESH_INF) {
LASSERT (md->threshold > 0);
counters->msgs_max = counters->msgs_alloc;
list_add (&msg->msg_list, &nal->ni.ni_active_msgs);
+}
- return (msg);
+static void
+lib_drop_message (nal_cb_t *nal, void *private, ptl_hdr_t *hdr)
+{
+ unsigned long flags;
+
+ /* CAVEAT EMPTOR: this only drops messages that we've not committed
+ * to receive (init_msg() not called) and therefore can't cause an
+ * event. */
+
+ state_lock(nal, &flags);
+ nal->ni.counters.drop_count++;
+ nal->ni.counters.drop_length += hdr->payload_length;
+ state_unlock(nal, &flags);
+
+ /* NULL msg => if NAL calls lib_finalize it will be a noop */
+ (void) lib_recv(nal, private, NULL, NULL, 0, 0, hdr->payload_length);
}
/*
* of long messages.
*
*/
-static int parse_put(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+static ptl_err_t
+parse_put(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg)
{
lib_ni_t *ni = &nal->ni;
ptl_size_t mlength = 0;
ptl_size_t offset = 0;
int unlink = 0;
+ ptl_err_t rc;
lib_me_t *me;
lib_md_t *md;
- lib_msg_t *msg;
unsigned long flags;
-
+
/* Convert put fields to host byte order */
hdr->msg.put.match_bits = NTOH__u64 (hdr->msg.put.match_bits);
hdr->msg.put.ptl_index = NTOH__u32 (hdr->msg.put.ptl_index);
hdr->payload_length, hdr->msg.put.offset,
hdr->msg.put.match_bits,
&mlength, &offset, &unlink);
- if (me == NULL)
- goto drop;
+ if (me == NULL) {
+ state_unlock(nal, &flags);
+ return (PTL_FAIL);
+ }
md = me->md;
CDEBUG(D_NET, "Incoming put index %x from "LPU64"/%u of length %d/%d "
hdr->src_nid, hdr->src_pid, mlength, hdr->payload_length,
md->md_lh.lh_cookie, md->md_niov, offset);
- msg = get_new_msg (nal, md);
- if (msg == NULL) {
- CERROR(LPU64": Dropping PUT from "LPU64": can't allocate msg\n",
- ni->nid, hdr->src_nid);
- goto drop;
- }
+ lib_commit_md(nal, md, msg);
+
+ msg->ev.type = PTL_EVENT_PUT;
+ msg->ev.initiator.nid = hdr->src_nid;
+ msg->ev.initiator.pid = hdr->src_pid;
+ msg->ev.portal = hdr->msg.put.ptl_index;
+ msg->ev.match_bits = hdr->msg.put.match_bits;
+ msg->ev.rlength = hdr->payload_length;
+ msg->ev.mlength = mlength;
+ msg->ev.offset = offset;
+ msg->ev.hdr_data = hdr->msg.put.hdr_data;
+
+ lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
if (!ptl_is_wire_handle_none(&hdr->msg.put.ack_wmd) &&
!(md->options & PTL_MD_ACK_DISABLE)) {
- msg->send_ack = 1;
msg->ack_wmd = hdr->msg.put.ack_wmd;
- msg->nid = hdr->src_nid;
- msg->pid = hdr->src_pid;
- msg->ev.match_bits = hdr->msg.put.match_bits;
- }
-
- if (md->eq) {
- msg->ev.type = PTL_EVENT_PUT;
- msg->ev.initiator.nid = hdr->src_nid;
- msg->ev.initiator.pid = hdr->src_pid;
- msg->ev.portal = hdr->msg.put.ptl_index;
- msg->ev.match_bits = hdr->msg.put.match_bits;
- msg->ev.rlength = hdr->payload_length;
- msg->ev.mlength = mlength;
- msg->ev.offset = offset;
- msg->ev.hdr_data = hdr->msg.put.hdr_data;
-
- /* NB if this match has exhausted the MD, we can't be sure
- * that this event will the the last one associated with
- * this MD in the event queue (another message already
- * matching this ME/MD could end up being last). So we
- * remember the ME handle anyway and check again when we're
- * allocating our slot in the event queue.
- */
- ptl_me2handle (&msg->ev.unlinked_me, me);
-
- lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
}
ni->counters.recv_count++;
ni->counters.recv_length += mlength;
- /* only unlink after MD's pending count has been bumped
- * in get_new_msg() otherwise lib_me_unlink() will nuke it */
- if (unlink) {
- md->md_flags |= PTL_MD_FLAG_AUTO_UNLINKED;
+ /* only unlink after MD's pending count has been bumped in
+ * lib_commit_md() otherwise lib_me_unlink() will nuke it */
+ if (unlink)
lib_me_unlink (nal, me);
- }
state_unlock(nal, &flags);
- lib_recv (nal, private, msg, md, offset, mlength, hdr->payload_length);
- return 0;
+ rc = lib_recv(nal, private, msg, md, offset, mlength,
+ hdr->payload_length);
+ if (rc != PTL_OK)
+ CERROR(LPU64": error on receiving PUT from "LPU64": %d\n",
+ ni->nid, hdr->src_nid, rc);
- drop:
- nal->ni.counters.drop_count++;
- nal->ni.counters.drop_length += hdr->payload_length;
- state_unlock (nal, &flags);
- lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
- return -1;
+ return (rc);
}
-static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+static ptl_err_t
+parse_get(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg)
{
lib_ni_t *ni = &nal->ni;
ptl_size_t mlength = 0;
int unlink = 0;
lib_me_t *me;
lib_md_t *md;
- lib_msg_t *msg;
ptl_hdr_t reply;
unsigned long flags;
int rc;
hdr->msg.get.sink_length, hdr->msg.get.src_offset,
hdr->msg.get.match_bits,
&mlength, &offset, &unlink);
- if (me == NULL)
- goto drop;
+ if (me == NULL) {
+ state_unlock(nal, &flags);
+ return (PTL_FAIL);
+ }
md = me->md;
CDEBUG(D_NET, "Incoming get index %d from "LPU64".%u of length %d/%d "
hdr->src_nid, hdr->src_pid, mlength, hdr->payload_length,
md->md_lh.lh_cookie, md->md_niov, offset);
- msg = get_new_msg (nal, md);
- if (msg == NULL) {
- CERROR(LPU64": Dropping GET from "LPU64": can't allocate msg\n",
- ni->nid, hdr->src_nid);
- goto drop;
- }
+ lib_commit_md(nal, md, msg);
- if (md->eq) {
- msg->ev.type = PTL_EVENT_GET;
- msg->ev.initiator.nid = hdr->src_nid;
- msg->ev.initiator.pid = hdr->src_pid;
- msg->ev.portal = hdr->msg.get.ptl_index;
- msg->ev.match_bits = hdr->msg.get.match_bits;
- msg->ev.rlength = hdr->payload_length;
- msg->ev.mlength = mlength;
- msg->ev.offset = offset;
- msg->ev.hdr_data = 0;
-
- /* NB if this match has exhausted the MD, we can't be sure
- * that this event will the the last one associated with
- * this MD in the event queue (another message already
- * matching this ME/MD could end up being last). So we
- * remember the ME handle anyway and check again when we're
- * allocating our slot in the event queue.
- */
- ptl_me2handle (&msg->ev.unlinked_me, me);
-
- lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
- }
+ msg->ev.type = PTL_EVENT_GET;
+ msg->ev.initiator.nid = hdr->src_nid;
+ msg->ev.initiator.pid = hdr->src_pid;
+ msg->ev.portal = hdr->msg.get.ptl_index;
+ msg->ev.match_bits = hdr->msg.get.match_bits;
+ msg->ev.rlength = hdr->payload_length;
+ msg->ev.mlength = mlength;
+ msg->ev.offset = offset;
+ msg->ev.hdr_data = 0;
+
+ lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
ni->counters.send_count++;
ni->counters.send_length += mlength;
- /* only unlink after MD's refcount has been bumped
- * in get_new_msg() otherwise lib_me_unlink() will nuke it */
- if (unlink) {
- md->md_flags |= PTL_MD_FLAG_AUTO_UNLINKED;
+ /* only unlink after MD's refcount has been bumped in
+ * lib_commit_md() otherwise lib_me_unlink() will nuke it */
+ if (unlink)
lib_me_unlink (nal, me);
- }
state_unlock(nal, &flags);
rc = lib_send (nal, private, msg, &reply, PTL_MSG_REPLY,
hdr->src_nid, hdr->src_pid, md, offset, mlength);
- if (rc != PTL_OK) {
- CERROR(LPU64": Dropping GET from "LPU64": send REPLY failed\n",
- ni->nid, hdr->src_nid);
- /* Hmm, this will create a GET event and make believe
- * the reply completed, which it kind of did, only the
- * source won't get her reply */
- lib_finalize (nal, private, msg);
- state_lock (nal, &flags);
- goto drop;
- }
+ if (rc != PTL_OK)
+ CERROR(LPU64": Unable to send REPLY for GET from "LPU64": %d\n",
+ ni->nid, hdr->src_nid, rc);
+
+ /* Discard any junk after the hdr */
+ (void) lib_recv(nal, private, NULL, NULL, 0, 0, hdr->payload_length);
- /* Complete the incoming message */
- lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
return (rc);
- drop:
- ni->counters.drop_count++;
- ni->counters.drop_length += hdr->msg.get.sink_length;
- state_unlock(nal, &flags);
- lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
- return -1;
}
-static int parse_reply(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+static ptl_err_t
+parse_reply(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg)
{
lib_ni_t *ni = &nal->ni;
lib_md_t *md;
int rlength;
int length;
- lib_msg_t *msg;
unsigned long flags;
+ ptl_err_t rc;
state_lock(nal, &flags);
md == NULL ? "invalid" : "inactive",
hdr->msg.reply.dst_wmd.wh_interface_cookie,
hdr->msg.reply.dst_wmd.wh_object_cookie);
- goto drop;
+
+ state_unlock(nal, &flags);
+ return (PTL_FAIL);
}
LASSERT (md->offset == 0);
ni->nid, hdr->src_nid, length,
hdr->msg.reply.dst_wmd.wh_object_cookie,
md->length);
- goto drop;
+ state_unlock(nal, &flags);
+ return (PTL_FAIL);
}
length = md->length;
}
hdr->src_nid, length, rlength,
hdr->msg.reply.dst_wmd.wh_object_cookie);
- msg = get_new_msg (nal, md);
- if (msg == NULL) {
- CERROR(LPU64": Dropping REPLY from "LPU64": can't "
- "allocate msg\n", ni->nid, hdr->src_nid);
- goto drop;
- }
+ lib_commit_md(nal, md, msg);
- if (md->eq) {
- msg->ev.type = PTL_EVENT_REPLY;
- msg->ev.initiator.nid = hdr->src_nid;
- msg->ev.initiator.pid = hdr->src_pid;
- msg->ev.rlength = rlength;
- msg->ev.mlength = length;
- msg->ev.offset = 0;
+ msg->ev.type = PTL_EVENT_REPLY;
+ msg->ev.initiator.nid = hdr->src_nid;
+ msg->ev.initiator.pid = hdr->src_pid;
+ msg->ev.rlength = rlength;
+ msg->ev.mlength = length;
+ msg->ev.offset = 0;
- lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
- }
+ lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
ni->counters.recv_count++;
ni->counters.recv_length += length;
state_unlock(nal, &flags);
- lib_recv (nal, private, msg, md, 0, length, rlength);
- return 0;
+ rc = lib_recv(nal, private, msg, md, 0, length, rlength);
+ if (rc != PTL_OK)
+ CERROR(LPU64": error on receiving REPLY from "LPU64": %d\n",
+ ni->nid, hdr->src_nid, rc);
- drop:
- nal->ni.counters.drop_count++;
- nal->ni.counters.drop_length += hdr->payload_length;
- state_unlock (nal, &flags);
- lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
- return -1;
+ return (rc);
}
-static int parse_ack(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+static ptl_err_t
+parse_ack(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg)
{
- lib_ni_t *ni = &nal->ni;
- lib_md_t *md;
- lib_msg_t *msg = NULL;
- unsigned long flags;
+ lib_ni_t *ni = &nal->ni;
+ lib_md_t *md;
+ unsigned long flags;
/* Convert ack fields to host byte order */
hdr->msg.ack.match_bits = NTOH__u64 (hdr->msg.ack.match_bits);
(md == NULL) ? "invalid" : "inactive",
hdr->msg.ack.dst_wmd.wh_interface_cookie,
hdr->msg.ack.dst_wmd.wh_object_cookie);
- goto drop;
+
+ state_unlock(nal, &flags);
+ return (PTL_FAIL);
}
CDEBUG(D_NET, LPU64": ACK from "LPU64" into md "LPX64"\n",
ni->nid, hdr->src_nid,
hdr->msg.ack.dst_wmd.wh_object_cookie);
- msg = get_new_msg (nal, md);
- if (msg == NULL) {
- CERROR(LPU64": Dropping ACK from "LPU64": can't allocate msg\n",
- ni->nid, hdr->src_nid);
- goto drop;
- }
+ lib_commit_md(nal, md, msg);
- if (md->eq) {
- msg->ev.type = PTL_EVENT_ACK;
- msg->ev.initiator.nid = hdr->src_nid;
- msg->ev.initiator.pid = hdr->src_pid;
- msg->ev.mlength = hdr->msg.ack.mlength;
- msg->ev.match_bits = hdr->msg.ack.match_bits;
+ msg->ev.type = PTL_EVENT_ACK;
+ msg->ev.initiator.nid = hdr->src_nid;
+ msg->ev.initiator.pid = hdr->src_pid;
+ msg->ev.mlength = hdr->msg.ack.mlength;
+ msg->ev.match_bits = hdr->msg.ack.match_bits;
- lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
- }
+ lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
ni->counters.recv_count++;
- state_unlock(nal, &flags);
- lib_recv (nal, private, msg, NULL, 0, 0, hdr->payload_length);
- return 0;
- drop:
- nal->ni.counters.drop_count++;
- state_unlock (nal, &flags);
- lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
- return -1;
+ state_unlock(nal, &flags);
+
+ /* We have received and matched up the ack OK, create the
+ * completion event now... */
+ lib_finalize(nal, private, msg, PTL_OK);
+
+ /* ...and now discard any junk after the hdr */
+ (void) lib_recv(nal, private, NULL, NULL, 0, 0, hdr->payload_length);
+
+ return (PTL_OK);
}
static char *
} /* end of print_hdr() */
-int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+void
+lib_parse(nal_cb_t *nal, ptl_hdr_t *hdr, void *private)
{
unsigned long flags;
-
+ ptl_err_t rc;
+ lib_msg_t *msg;
+
/* convert common fields to host byte order */
hdr->dest_nid = NTOH__u64 (hdr->dest_nid);
hdr->src_nid = NTOH__u64 (hdr->src_nid);
nal->ni.nid, mv->magic,
mv->version_major, mv->version_minor,
hdr->src_nid);
- lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
- return (-1);
+ lib_drop_message(nal, private, hdr);
+ return;
}
if (hdr->dest_nid != nal->ni.nid) {
CERROR(LPU64": Dropping %s message from "LPU64" to "LPU64
" (not me)\n", nal->ni.nid, hdr_type_string (hdr),
hdr->src_nid, hdr->dest_nid);
-
- state_lock (nal, &flags);
- nal->ni.counters.drop_count++;
- nal->ni.counters.drop_length += hdr->payload_length;
- state_unlock (nal, &flags);
-
- lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
- return (-1);
+ lib_drop_message(nal, private, hdr);
+ return;
}
if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */
": simulated failure\n",
nal->ni.nid, hdr_type_string (hdr),
hdr->src_nid);
- lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
- return (-1);
+ lib_drop_message(nal, private, hdr);
+ return;
}
-
+
+ msg = lib_msg_alloc(nal);
+ if (msg == NULL) {
+ CERROR(LPU64": Dropping incoming %s from "LPU64
+ ": can't allocate a lib_msg_t\n",
+ nal->ni.nid, hdr_type_string (hdr),
+ hdr->src_nid);
+ lib_drop_message(nal, private, hdr);
+ return;
+ }
+
+ do_gettimeofday(&msg->ev.arrival_time);
+
switch (hdr->type) {
case PTL_MSG_ACK:
- return (parse_ack(nal, hdr, private));
+ rc = parse_ack(nal, hdr, private, msg);
+ break;
case PTL_MSG_PUT:
- return (parse_put(nal, hdr, private));
+ rc = parse_put(nal, hdr, private, msg);
break;
case PTL_MSG_GET:
- return (parse_get(nal, hdr, private));
+ rc = parse_get(nal, hdr, private, msg);
break;
case PTL_MSG_REPLY:
- return (parse_reply(nal, hdr, private));
+ rc = parse_reply(nal, hdr, private, msg);
break;
default:
CERROR(LPU64": Dropping <unknown> message from "LPU64
": Bad type=0x%x\n", nal->ni.nid, hdr->src_nid,
hdr->type);
-
- lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
- return (-1);
+ rc = PTL_FAIL;
+ break;
+ }
+
+ if (rc != PTL_OK) {
+ if (msg->md != NULL) {
+ /* committed... */
+ lib_finalize(nal, private, msg, rc);
+ } else {
+ state_lock(nal, &flags);
+ lib_msg_free(nal, msg); /* expects state_lock held */
+ state_unlock(nal, &flags);
+
+ lib_drop_message(nal, private, hdr);
+ }
}
}
-
-int do_PtlPut(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+int
+do_PtlPut(nal_cb_t *nal, void *private, void *v_args, void *v_ret)
{
/*
* Incoming:
* Outgoing:
*/
- PtlPut_in *args = v_args;
- PtlPut_out *ret = v_ret;
- ptl_hdr_t hdr;
-
- lib_ni_t *ni = &nal->ni;
- lib_md_t *md;
- lib_msg_t *msg = NULL;
+ PtlPut_in *args = v_args;
ptl_process_id_t *id = &args->target_in;
- unsigned long flags;
- int rc;
+ PtlPut_out *ret = v_ret;
+ lib_ni_t *ni = &nal->ni;
+ lib_msg_t *msg;
+ ptl_hdr_t hdr;
+ lib_md_t *md;
+ unsigned long flags;
+ int rc;
if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */
fail_peer (nal, id->nid, 1)) /* shall we now? */
nal->ni.nid, id->nid);
return (ret->rc = PTL_INV_PROC);
}
-
- ret->rc = PTL_OK;
+
+ msg = lib_msg_alloc(nal);
+ if (msg == NULL) {
+ CERROR(LPU64": Dropping PUT to "LPU64": ENOMEM on lib_msg_t\n",
+ ni->nid, id->nid);
+ return (ret->rc = PTL_NOSPACE);
+ }
+
state_lock(nal, &flags);
+
md = ptl_handle2md(&args->md_in, nal);
- if (md == NULL || !md->threshold) {
+ if (md == NULL || md->threshold == 0) {
+ lib_msg_free(nal, msg);
state_unlock(nal, &flags);
- return ret->rc = PTL_INV_MD;
+
+ return (ret->rc = PTL_INV_MD);
}
CDEBUG(D_NET, "PtlPut -> %Lu: %lu\n", (unsigned long long)id->nid,
hdr.msg.put.offset = HTON__u32 (args->offset_in);
hdr.msg.put.hdr_data = args->hdr_data_in;
+ lib_commit_md(nal, md, msg);
+
+ msg->ev.type = PTL_EVENT_SENT;
+ msg->ev.initiator.nid = ni->nid;
+ msg->ev.initiator.pid = ni->pid;
+ msg->ev.portal = args->portal_in;
+ msg->ev.match_bits = args->match_bits_in;
+ msg->ev.rlength = md->length;
+ msg->ev.mlength = md->length;
+ msg->ev.offset = args->offset_in;
+ msg->ev.hdr_data = args->hdr_data_in;
+
+ lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+
ni->counters.send_count++;
ni->counters.send_length += md->length;
- msg = get_new_msg (nal, md);
- if (msg == NULL) {
- CERROR("BAD: could not allocate msg!\n");
- state_unlock(nal, &flags);
- return ret->rc = PTL_NOSPACE;
- }
-
- /*
- * If this memory descriptor has an event queue associated with
- * it we need to allocate a message state object and record the
- * information about this operation that will be recorded into
- * event queue once the message has been completed.
- *
- * NB. We're now committed to the GET, since we just marked the MD
- * busy. Callers who observe this (by getting PTL_MD_INUSE from
- * PtlMDUnlink()) expect a completion event to tell them when the
- * MD becomes idle.
- */
- if (md->eq) {
- msg->ev.type = PTL_EVENT_SENT;
- msg->ev.initiator.nid = ni->nid;
- msg->ev.initiator.pid = ni->pid;
- msg->ev.portal = args->portal_in;
- msg->ev.match_bits = args->match_bits_in;
- msg->ev.rlength = md->length;
- msg->ev.mlength = md->length;
- msg->ev.offset = args->offset_in;
- msg->ev.hdr_data = args->hdr_data_in;
-
- lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
- }
-
state_unlock(nal, &flags);
rc = lib_send (nal, private, msg, &hdr, PTL_MSG_PUT,
id->nid, id->pid, md, 0, md->length);
if (rc != PTL_OK) {
- /* get_new_msg() committed us to sending by decrementing
- * md->threshold, so we have to act like we did send, but
- * the network dropped it. */
- lib_finalize (nal, private, msg);
+ CERROR(LPU64": error sending PUT to "LPU64": %d\n",
+ ni->nid, id->nid, rc);
+ lib_finalize (nal, private, msg, rc);
}
+ /* completion will be signalled by an event */
return ret->rc = PTL_OK;
}
-lib_msg_t * lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid,
- lib_md_t *getmd)
+lib_msg_t *
+lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, lib_md_t *getmd)
{
/* The NAL can DMA direct to the GET md (i.e. no REPLY msg). This
* returns a msg the NAL can pass to lib_finalize() so that a REPLY
* lib_finalize() of the original GET. */
lib_ni_t *ni = &nal->ni;
- lib_msg_t *msg;
+ lib_msg_t *msg = lib_msg_alloc(nal);
unsigned long flags;
state_lock(nal, &flags);
LASSERT (getmd->pending > 0);
+ if (msg == NULL) {
+ CERROR ("Dropping REPLY from "LPU64": can't allocate msg\n",
+ peer_nid);
+ goto drop;
+ }
+
if (getmd->threshold == 0) {
CERROR ("Dropping REPLY from "LPU64" for inactive MD %p\n",
peer_nid, getmd);
- goto drop;
+ goto drop_msg;
}
LASSERT (getmd->offset == 0);
CDEBUG(D_NET, "Reply from "LPU64" md %p\n", peer_nid, getmd);
- msg = get_new_msg (nal, getmd);
- if (msg == NULL) {
- CERROR("Dropping REPLY from "LPU64" md %p: can't allocate msg\n",
- peer_nid, getmd);
- goto drop;
- }
+ lib_commit_md (nal, getmd, msg);
- if (getmd->eq) {
- msg->ev.type = PTL_EVENT_REPLY;
- msg->ev.initiator.nid = peer_nid;
- msg->ev.initiator.pid = 0; /* XXX FIXME!!! */
- msg->ev.rlength = msg->ev.mlength = getmd->length;
- msg->ev.offset = 0;
+ msg->ev.type = PTL_EVENT_REPLY;
+ msg->ev.initiator.nid = peer_nid;
+ msg->ev.initiator.pid = 0; /* XXX FIXME!!! */
+ msg->ev.rlength = msg->ev.mlength = getmd->length;
+ msg->ev.offset = 0;
- lib_md_deconstruct(nal, getmd, &msg->ev.mem_desc);
- }
+ lib_md_deconstruct(nal, getmd, &msg->ev.mem_desc);
ni->counters.recv_count++;
ni->counters.recv_length += getmd->length;
state_unlock(nal, &flags);
return msg;
-
+
+ drop_msg:
+ lib_msg_free(nal, msg);
drop:
nal->ni.counters.drop_count++;
nal->ni.counters.drop_length += getmd->length;
return NULL;
}
-int do_PtlGet(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+int
+do_PtlGet(nal_cb_t *nal, void *private, void *v_args, void *v_ret)
{
/*
* Incoming:
* Outgoing:
*/
- PtlGet_in *args = v_args;
- PtlGet_out *ret = v_ret;
- ptl_hdr_t hdr;
- lib_msg_t *msg = NULL;
- lib_ni_t *ni = &nal->ni;
+ PtlGet_in *args = v_args;
ptl_process_id_t *id = &args->target_in;
- lib_md_t *md;
- unsigned long flags;
- int rc;
+ PtlGet_out *ret = v_ret;
+ lib_ni_t *ni = &nal->ni;
+ lib_msg_t *msg;
+ ptl_hdr_t hdr;
+ lib_md_t *md;
+ unsigned long flags;
+ int rc;
if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */
fail_peer (nal, id->nid, 1)) /* shall we now? */
nal->ni.nid, id->nid);
return (ret->rc = PTL_INV_PROC);
}
-
+
+ msg = lib_msg_alloc(nal);
+ if (msg == NULL) {
+ CERROR(LPU64": Dropping GET to "LPU64": ENOMEM on lib_msg_t\n",
+ ni->nid, id->nid);
+ return (ret->rc = PTL_NOSPACE);
+ }
+
state_lock(nal, &flags);
+
md = ptl_handle2md(&args->md_in, nal);
if (md == NULL || !md->threshold) {
+ lib_msg_free(nal, msg);
state_unlock(nal, &flags);
+
return ret->rc = PTL_INV_MD;
}
- LASSERT (md->offset == 0);
-
CDEBUG(D_NET, "PtlGet -> %Lu: %lu\n", (unsigned long long)id->nid,
(unsigned long)id->pid);
hdr.msg.get.src_offset = HTON__u32 (args->offset_in);
hdr.msg.get.sink_length = HTON__u32 (md->length);
- ni->counters.send_count++;
+ lib_commit_md(nal, md, msg);
- msg = get_new_msg (nal, md);
- if (msg == NULL) {
- CERROR("do_PtlGet: BAD - could not allocate cookie!\n");
- state_unlock(nal, &flags);
- return ret->rc = PTL_NOSPACE;
- }
+ msg->ev.type = PTL_EVENT_SENT;
+ msg->ev.initiator.nid = ni->nid;
+ msg->ev.initiator.pid = ni->pid;
+ msg->ev.portal = args->portal_in;
+ msg->ev.match_bits = args->match_bits_in;
+ msg->ev.rlength = md->length;
+ msg->ev.mlength = md->length;
+ msg->ev.offset = args->offset_in;
+ msg->ev.hdr_data = 0;
- /*
- * If this memory descriptor has an event queue associated with
- * it we must allocate a message state object that will record
- * the information to be filled in once the message has been
- * completed. More information is in the do_PtlPut() comments.
- *
- * NB. We're now committed to the GET, since we just marked the MD
- * busy. Callers who observe this (by getting PTL_MD_INUSE from
- * PtlMDUnlink()) expect a completion event to tell them when the
- * MD becomes idle.
- */
- if (md->eq) {
- msg->ev.type = PTL_EVENT_SENT;
- msg->ev.initiator.nid = ni->nid;
- msg->ev.initiator.pid = ni->pid;
- msg->ev.portal = args->portal_in;
- msg->ev.match_bits = args->match_bits_in;
- msg->ev.rlength = md->length;
- msg->ev.mlength = md->length;
- msg->ev.offset = args->offset_in;
- msg->ev.hdr_data = 0;
-
- lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
- }
+ lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+
+ ni->counters.send_count++;
state_unlock(nal, &flags);
rc = lib_send (nal, private, msg, &hdr, PTL_MSG_GET,
id->nid, id->pid, NULL, 0, 0);
if (rc != PTL_OK) {
- /* get_new_msg() committed us to sending by decrementing
- * md->threshold, so we have to act like we did send, but
- * the network dropped it. */
- lib_finalize (nal, private, msg);
+ CERROR(LPU64": error sending GET to "LPU64": %d\n",
+ ni->nid, id->nid, rc);
+ lib_finalize (nal, private, msg, rc);
}
+ /* completion will be signalled by an event */
return ret->rc = PTL_OK;
}
#include <portals/lib-p30.h>
-int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t *msg)
+void
+lib_enq_event_locked (nal_cb_t *nal, void *private,
+ lib_eq_t *eq, ptl_event_t *ev)
{
- lib_md_t *md;
- lib_eq_t *eq;
+ ptl_event_t *eq_slot;
int rc;
+
+ ev->sequence = eq->sequence++; /* Allocate the next queue slot */
+
+ /* size must be a power of 2 to handle a wrapped sequence # */
+ LASSERT (eq->size != 0 &&
+ eq->size == LOWEST_BIT_SET (eq->size));
+ eq_slot = eq->base + (ev->sequence & (eq->size - 1));
+
+ /* Copy the event into the allocated slot, ensuring all the rest of
+ * the event's contents have been copied _before_ the sequence
+ * number gets updated. A processes 'getting' an event waits on
+ * the next queue slot's sequence to be 'new'. When it is, _all_
+ * other event fields had better be consistent. I assert
+ * 'sequence' is the last member, so I only need a 2 stage copy. */
+
+ LASSERT(sizeof (ptl_event_t) ==
+ offsetof(ptl_event_t, sequence) + sizeof(ev->sequence));
+
+ rc = nal->cb_write (nal, private, (user_ptr)eq_slot, ev,
+ offsetof (ptl_event_t, sequence));
+ LASSERT (rc == PTL_OK);
+
+#ifdef __KERNEL__
+ barrier();
+#endif
+ /* Updating the sequence number is what makes the event 'new' NB if
+ * the cb_write below isn't atomic, this could cause a race with
+ * PtlEQGet */
+ rc = nal->cb_write(nal, private, (user_ptr)&eq_slot->sequence,
+ (void *)&ev->sequence,sizeof (ev->sequence));
+ LASSERT (rc == PTL_OK);
+
+#ifdef __KERNEL__
+ barrier();
+#endif
+
+ if (nal->cb_callback != NULL)
+ nal->cb_callback(nal, private, eq, ev);
+ else if (eq->event_callback != NULL)
+ eq->event_callback(ev);
+}
+
+void
+lib_finalize(nal_cb_t *nal, void *private, lib_msg_t *msg, ptl_err_t status)
+{
+ lib_md_t *md;
+ int unlink;
unsigned long flags;
+ int rc;
+ ptl_hdr_t ack;
/* ni went down while processing this message */
- if (nal->ni.up == 0) {
- return -1;
- }
+ if (nal->ni.up == 0)
+ return;
if (msg == NULL)
- return 0;
+ return;
- rc = 0;
- if (msg->send_ack) {
- ptl_hdr_t ack;
+ /* Only send an ACK if the PUT completed successfully */
+ if (status == PTL_OK &&
+ !ptl_is_wire_handle_none(&msg->ack_wmd)) {
- LASSERT (!ptl_is_wire_handle_none (&msg->ack_wmd));
+ LASSERT(msg->ev.type == PTL_EVENT_PUT);
memset (&ack, 0, sizeof (ack));
ack.type = HTON__u32 (PTL_MSG_ACK);
- ack.dest_nid = HTON__u64 (msg->nid);
+ ack.dest_nid = HTON__u64 (msg->ev.initiator.nid);
ack.src_nid = HTON__u64 (nal->ni.nid);
- ack.dest_pid = HTON__u32 (msg->pid);
+ ack.dest_pid = HTON__u32 (msg->ev.initiator.pid);
ack.src_pid = HTON__u32 (nal->ni.pid);
ack.payload_length = 0;
ack.msg.ack.mlength = HTON__u32 (msg->ev.mlength);
rc = lib_send (nal, private, NULL, &ack, PTL_MSG_ACK,
- msg->nid, msg->pid, NULL, 0, 0);
- /* If this send fails, there's nothing else to clean up */
+ msg->ev.initiator.nid, msg->ev.initiator.pid,
+ NULL, 0, 0);
+ if (rc != PTL_OK) {
+ /* send failed: there's nothing else to clean up. */
+ CERROR("Error %d sending ACK to "LPX64"\n",
+ rc, msg->ev.initiator.nid);
+ }
}
md = msg->md;
- LASSERT (md->pending > 0); /* I've not dropped my ref yet */
- eq = md->eq;
state_lock(nal, &flags);
- if (eq != NULL) {
- ptl_event_t *ev = &msg->ev;
- ptl_event_t *eq_slot;
-
- /* I have to hold the lock while I bump the sequence number
- * and copy the event into the queue. If not, and I was
- * interrupted after bumping the sequence number, other
- * events could fill the queue, including the slot I just
- * allocated to this event. On resuming, I would overwrite
- * a more 'recent' event with old event state, and
- * processes taking events off the queue would not detect
- * overflow correctly.
- */
-
- ev->sequence = eq->sequence++;/* Allocate the next queue slot */
-
- /* size must be a power of 2 to handle a wrapped sequence # */
- LASSERT (eq->size != 0 &&
- eq->size == LOWEST_BIT_SET (eq->size));
- eq_slot = eq->base + (ev->sequence & (eq->size - 1));
-
- /* Invalidate unlinked_me unless this is the last
- * event for an auto-unlinked MD. Note that if md was
- * auto-unlinked, md->pending can only decrease
- */
- if ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINKED) == 0 || /* not auto-unlinked */
- md->pending != 1) /* not last ref */
- ev->unlinked_me = PTL_HANDLE_NONE;
-
- /* Copy the event into the allocated slot, ensuring all the
- * rest of the event's contents have been copied _before_
- * the sequence number gets updated. A processes 'getting'
- * an event waits on the next queue slot's sequence to be
- * 'new'. When it is, _all_ other event fields had better
- * be consistent. I assert 'sequence' is the last member,
- * so I only need a 2 stage copy.
- */
- LASSERT(sizeof (ptl_event_t) ==
- offsetof(ptl_event_t, sequence) + sizeof(ev->sequence));
-
- rc = nal->cb_write (nal, private, (user_ptr)eq_slot, ev,
- offsetof (ptl_event_t, sequence));
- LASSERT (rc == 0);
-
-#ifdef __KERNEL__
- barrier();
-#endif
- /* Updating the sequence number is what makes the event 'new' */
-
- /* cb_write is not necessarily atomic, so this could
- cause a race with PtlEQGet */
- rc = nal->cb_write(nal, private, (user_ptr)&eq_slot->sequence,
- (void *)&ev->sequence,sizeof (ev->sequence));
- LASSERT (rc == 0);
+ /* Now it's safe to drop my caller's ref */
+ md->pending--;
+ LASSERT (md->pending >= 0);
-#ifdef __KERNEL__
- barrier();
-#endif
+ /* Should I unlink this MD? */
+ unlink = (md->pending == 0 && /* No other refs */
+ (md->threshold == 0 || /* All ops done */
+ md->md_flags & PTL_MD_FLAG_UNLINK) != 0); /* black spot */
- /* I must also ensure that (a) callbacks are made in the
- * same order as the events land in the queue, and (b) the
- * callback occurs before the event can be removed from the
- * queue, so I can't drop the lock during the callback. */
- if (nal->cb_callback != NULL)
- nal->cb_callback(nal, private, eq, ev);
- else if (eq->event_callback != NULL)
- (void)((eq->event_callback) (ev));
- }
+ msg->ev.status = status;
+ msg->ev.unlinked = unlink;
- LASSERT ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINKED) == 0 ||
- (md->md_flags & PTL_MD_FLAG_UNLINK) != 0);
+ if (md->eq != NULL)
+ lib_enq_event_locked(nal, private, md->eq, &msg->ev);
- md->pending--;
- if (md->pending == 0 && /* no more outstanding operations on this md */
- (md->threshold == 0 || /* done its business */
- (md->md_flags & PTL_MD_FLAG_UNLINK) != 0)) /* marked for death */
+ if (unlink)
lib_md_unlink(nal, md);
list_del (&msg->msg_list);
lib_msg_free(nal, msg);
state_unlock(nal, &flags);
-
- return rc;
}
CPPFLAGS=
INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include -I$(srcdir)
-lib_LIBRARIES = libtcpnal.a
+noinst_LIBRARIES = libtcpnal.a
pkginclude_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h ipmap.h bridge.h procbridge.h
libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h
+
+if LIBLUSTRE
+libtcpnal_a_CFLAGS = -fPIC
+endif
* This file is part of Portals, http://www.sf.net/projects/sandiaportals/
*/
+#ifndef TCPNAL_PROCBRIDGE_H
+#define TCPNAL_PROCBRIDGE_H
+
#include <portals/lib-p30.h>
typedef struct bridge {
typedef int (*nal_initialize)(bridge);
extern nal_initialize nal_table[PTL_IFACE_MAX];
+
+#endif
*/
connection force_tcp_connection(manager m,
unsigned int ip,
- unsigned short port)
+ unsigned short port,
+ procbridge pb)
{
connection conn;
struct sockaddr_in addr;
exit(-1);
conn = allocate_connection(m, ip, port, fd);
+
+ /* let nal thread know this event right away */
+ if (conn)
+ procbridge_wakeup_nal(pb);
}
pthread_mutex_unlock(&m->conn_lock);
*/
#include <table.h>
+#include <procbridge.h>
typedef struct manager {
table connections;
manager m;
} *connection;
-connection force_tcp_connection(manager m, unsigned int ip, unsigned int short);
+connection force_tcp_connection(manager m, unsigned int ip, unsigned int short,
+ procbridge pb);
manager init_connections(unsigned short, int (*f)(void *, void *), void *);
void remove_connection(void *arg);
void shutdown_connections(manager m);
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
+#ifndef __CYGWIN__
+#include <syscall.h>
+#endif
+#include <sys/socket.h>
#include <procbridge.h>
#include <pqtimer.h>
#include <dispatch.h>
#include <errno.h>
+/* XXX CFS workaround, to give a chance to let nal thread wake up
+ * from waiting in select
+ */
+static int procbridge_notifier_handler(void *arg)
+{
+ static char buf[8];
+ procbridge p = (procbridge) arg;
+
+ syscall(SYS_read, p->notifier[1], buf, sizeof(buf));
+ return 1;
+}
+
+void procbridge_wakeup_nal(procbridge p)
+{
+ static char buf[8];
+ syscall(SYS_write, p->notifier[0], buf, sizeof(buf));
+}
+
/* Function: forward
* Arguments: nal_t *nal: pointer to my top-side nal structure
* id: the command to pass to the lower layer
procbridge p=(procbridge)b->local;
p->nal_flags |= NAL_FLAG_STOPPING;
+ procbridge_wakeup_nal(p);
do {
pthread_mutex_lock(&p->mutex);
}
+/* FIXME cfs temporary workaround! FIXME
+ * global time out value
+ */
+int __tcpnal_eqwait_timeout_value = 0;
+int __tcpnal_eqwait_timedout = 0;
+
/* Function: yield
* Arguments: pid:
*
procbridge p=(procbridge)b->local;
pthread_mutex_lock(&p->mutex);
- pthread_cond_wait(&p->cond,&p->mutex);
+ if (!__tcpnal_eqwait_timeout_value) {
+ pthread_cond_wait(&p->cond,&p->mutex);
+ } else {
+ struct timeval now;
+ struct timespec timeout;
+
+ gettimeofday(&now, NULL);
+ timeout.tv_sec = now.tv_sec + __tcpnal_eqwait_timeout_value;
+ timeout.tv_nsec = now.tv_usec * 1000;
+
+ __tcpnal_eqwait_timedout =
+ pthread_cond_timedwait(&p->cond, &p->mutex, &timeout);
+ }
pthread_mutex_unlock(&p->mutex);
}
p->nal_flags = 0;
pthread_mutex_init(&p->nal_cb_lock, 0);
+ /* initialize notifier */
+ if (socketpair(AF_UNIX, SOCK_STREAM, 0, p->notifier)) {
+ perror("socketpair failed");
+ return NULL;
+ }
+
+ if (!register_io_handler(p->notifier[1], READ_HANDLER,
+ procbridge_notifier_handler, p)) {
+ perror("fail to register notifier handler");
+ return NULL;
+ }
+
+ /* create nal thread */
if (pthread_create(&p->t, NULL, nal_thread, &args)) {
perror("nal_init: pthread_create");
return(NULL);
pthread_cond_t cond;
pthread_mutex_t mutex;
+ /* socket pair used to notify nal thread */
+ int notifier[2];
+
int nal_flags;
pthread_mutex_t nal_cb_lock;
ptl_pt_index_t ptl_size,
ptl_ac_index_t acl_size,
ptl_pid_t requested_pid);
+extern void procbridge_wakeup_nal(procbridge p);
#endif
/* the following functions are stubs to satisfy the nal definition
without doing anything particularily useful*/
-static int nal_write(nal_cb_t *nal,
- void *private,
- user_ptr dst_addr,
- void *src_addr,
- size_t len)
+static ptl_err_t nal_write(nal_cb_t *nal,
+ void *private,
+ user_ptr dst_addr,
+ void *src_addr,
+ size_t len)
{
memcpy(dst_addr, src_addr, len);
- return 0;
+ return PTL_OK;
}
-static int nal_read(nal_cb_t * nal,
- void *private,
- void *dst_addr,
- user_ptr src_addr,
- size_t len)
+static ptl_err_t nal_read(nal_cb_t * nal,
+ void *private,
+ void *dst_addr,
+ user_ptr src_addr,
+ size_t len)
{
memcpy(dst_addr, src_addr, len);
- return 0;
+ return PTL_OK;
}
static void *nal_malloc(nal_cb_t *nal,
timeout_pointer=&timeout;
} else timeout_pointer=0;
-
- /* FIXME
- * temporarily add timer for endless waiting problem.
- * FIXME
- */
- timeout.tv_sec = 1;
- timeout.tv_usec = 0;
- timeout_pointer=&timeout;
-
FD_ZERO(&fds[0]);
FD_ZERO(&fds[1]);
FD_ZERO(&fds[2]);
CPPFLAGS=
INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include -I$(srcdir)
-lib_LIBRARIES = libtcpnal.a
+noinst_LIBRARIES = libtcpnal.a
pkginclude_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h ipmap.h bridge.h procbridge.h
libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h
+
+if LIBLUSTRE
+libtcpnal_a_CFLAGS = -fPIC
+endif
* This file is part of Portals, http://www.sf.net/projects/sandiaportals/
*/
+#ifndef TCPNAL_PROCBRIDGE_H
+#define TCPNAL_PROCBRIDGE_H
+
#include <portals/lib-p30.h>
typedef struct bridge {
typedef int (*nal_initialize)(bridge);
extern nal_initialize nal_table[PTL_IFACE_MAX];
+
+#endif
*/
connection force_tcp_connection(manager m,
unsigned int ip,
- unsigned short port)
+ unsigned short port,
+ procbridge pb)
{
connection conn;
struct sockaddr_in addr;
exit(-1);
conn = allocate_connection(m, ip, port, fd);
+
+ /* let nal thread know this event right away */
+ if (conn)
+ procbridge_wakeup_nal(pb);
}
pthread_mutex_unlock(&m->conn_lock);
*/
#include <table.h>
+#include <procbridge.h>
typedef struct manager {
table connections;
manager m;
} *connection;
-connection force_tcp_connection(manager m, unsigned int ip, unsigned int short);
+connection force_tcp_connection(manager m, unsigned int ip, unsigned int short,
+ procbridge pb);
manager init_connections(unsigned short, int (*f)(void *, void *), void *);
void remove_connection(void *arg);
void shutdown_connections(manager m);
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
+#ifndef __CYGWIN__
+#include <syscall.h>
+#endif
+#include <sys/socket.h>
#include <procbridge.h>
#include <pqtimer.h>
#include <dispatch.h>
#include <errno.h>
+/* XXX CFS workaround, to give a chance to let nal thread wake up
+ * from waiting in select
+ */
+static int procbridge_notifier_handler(void *arg)
+{
+ static char buf[8];
+ procbridge p = (procbridge) arg;
+
+ syscall(SYS_read, p->notifier[1], buf, sizeof(buf));
+ return 1;
+}
+
+void procbridge_wakeup_nal(procbridge p)
+{
+ static char buf[8];
+ syscall(SYS_write, p->notifier[0], buf, sizeof(buf));
+}
+
/* Function: forward
* Arguments: nal_t *nal: pointer to my top-side nal structure
* id: the command to pass to the lower layer
procbridge p=(procbridge)b->local;
p->nal_flags |= NAL_FLAG_STOPPING;
+ procbridge_wakeup_nal(p);
do {
pthread_mutex_lock(&p->mutex);
}
+/* FIXME cfs temporary workaround! FIXME
+ * global time out value
+ */
+int __tcpnal_eqwait_timeout_value = 0;
+int __tcpnal_eqwait_timedout = 0;
+
/* Function: yield
* Arguments: pid:
*
procbridge p=(procbridge)b->local;
pthread_mutex_lock(&p->mutex);
- pthread_cond_wait(&p->cond,&p->mutex);
+ if (!__tcpnal_eqwait_timeout_value) {
+ pthread_cond_wait(&p->cond,&p->mutex);
+ } else {
+ struct timeval now;
+ struct timespec timeout;
+
+ gettimeofday(&now, NULL);
+ timeout.tv_sec = now.tv_sec + __tcpnal_eqwait_timeout_value;
+ timeout.tv_nsec = now.tv_usec * 1000;
+
+ __tcpnal_eqwait_timedout =
+ pthread_cond_timedwait(&p->cond, &p->mutex, &timeout);
+ }
pthread_mutex_unlock(&p->mutex);
}
p->nal_flags = 0;
pthread_mutex_init(&p->nal_cb_lock, 0);
+ /* initialize notifier */
+ if (socketpair(AF_UNIX, SOCK_STREAM, 0, p->notifier)) {
+ perror("socketpair failed");
+ return NULL;
+ }
+
+ if (!register_io_handler(p->notifier[1], READ_HANDLER,
+ procbridge_notifier_handler, p)) {
+ perror("fail to register notifier handler");
+ return NULL;
+ }
+
+ /* create nal thread */
if (pthread_create(&p->t, NULL, nal_thread, &args)) {
perror("nal_init: pthread_create");
return(NULL);
pthread_cond_t cond;
pthread_mutex_t mutex;
+ /* socket pair used to notify nal thread */
+ int notifier[2];
+
int nal_flags;
pthread_mutex_t nal_cb_lock;
ptl_pt_index_t ptl_size,
ptl_ac_index_t acl_size,
ptl_pid_t requested_pid);
+extern void procbridge_wakeup_nal(procbridge p);
#endif
/* the following functions are stubs to satisfy the nal definition
without doing anything particularily useful*/
-static int nal_write(nal_cb_t *nal,
- void *private,
- user_ptr dst_addr,
- void *src_addr,
- size_t len)
+static ptl_err_t nal_write(nal_cb_t *nal,
+ void *private,
+ user_ptr dst_addr,
+ void *src_addr,
+ size_t len)
{
memcpy(dst_addr, src_addr, len);
- return 0;
+ return PTL_OK;
}
-static int nal_read(nal_cb_t * nal,
- void *private,
- void *dst_addr,
- user_ptr src_addr,
- size_t len)
+static ptl_err_t nal_read(nal_cb_t * nal,
+ void *private,
+ void *dst_addr,
+ user_ptr src_addr,
+ size_t len)
{
memcpy(dst_addr, src_addr, len);
- return 0;
+ return PTL_OK;
}
static void *nal_malloc(nal_cb_t *nal,
timeout_pointer=&timeout;
} else timeout_pointer=0;
-
- /* FIXME
- * temporarily add timer for endless waiting problem.
- * FIXME
- */
- timeout.tv_sec = 1;
- timeout.tv_usec = 0;
- timeout_pointer=&timeout;
-
FD_ZERO(&fds[0]);
FD_ZERO(&fds[1]);
FD_ZERO(&fds[2]);
*
* sends a packet to the peer, after ensuring that a connection exists
*/
-int tcpnal_send(nal_cb_t *n,
- void *private,
- lib_msg_t *cookie,
- ptl_hdr_t *hdr,
- int type,
- ptl_nid_t nid,
- ptl_pid_t pid,
- unsigned int niov,
- struct iovec *iov,
- size_t len)
+ptl_err_t tcpnal_send(nal_cb_t *n,
+ void *private,
+ lib_msg_t *cookie,
+ ptl_hdr_t *hdr,
+ int type,
+ ptl_nid_t nid,
+ ptl_pid_t pid,
+ unsigned int niov,
+ struct iovec *iov,
+ size_t offset,
+ size_t len)
{
connection c;
bridge b=(bridge)n->nal_data;
struct iovec tiov[257];
static pthread_mutex_t send_lock = PTHREAD_MUTEX_INITIALIZER;
- int rc;
+ ptl_err_t rc = PTL_OK;
+ int sysrc;
int total;
+ int ntiov;
int i;
if (!(c=force_tcp_connection((manager)b->lower,
PNAL_IP(nid,b),
- PNAL_PORT(nid,pid))))
- return(1);
+ PNAL_PORT(nid,pid),
+ b->local)))
+ return(PTL_FAIL);
-#if 0
/* TODO: these results should be checked. furthermore, provision
must be made for the SIGPIPE which is delivered when
writing on a tcp socket which has closed underneath
the application. there is a linux flag in the sendmsg
call which turns off the signally behaviour, but its
nonstandard */
- syscall(SYS_write, c->fd,hdr,sizeof(ptl_hdr_t));
- LASSERT (niov <= 1);
- if (len) syscall(SYS_write, c->fd,iov[0].iov_base,len);
-#else
+
LASSERT (niov <= 256);
tiov[0].iov_base = hdr;
tiov[0].iov_len = sizeof(ptl_hdr_t);
+ ntiov = 1 + lib_extract_iov(256, &tiov[1], niov, iov, offset, len);
- if (niov > 0)
- memcpy(&tiov[1], iov, niov * sizeof(struct iovec));
pthread_mutex_lock(&send_lock);
#if 1
- for (i = total = 0; i <= niov; i++)
+ for (i = total = 0; i < ntiov; i++)
total += tiov[i].iov_len;
- rc = syscall(SYS_writev, c->fd, tiov, niov+1);
- if (rc != total) {
+ sysrc = syscall(SYS_writev, c->fd, tiov, ntiov);
+ if (sysrc != total) {
+ /* NB print the syscall result, not 'rc' (now a ptl_err_t
+ * still holding PTL_OK at this point) */
fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n",
- rc, total, errno);
- abort();
+ sysrc, total, errno);
+ rc = PTL_FAIL;
}
#else
- for (i = total = 0; i <= niov; i++) {
- rc = send(c->fd, tiov[i].iov_base, tiov[i].iov_len, 0);
- if (rc != tiov[i].iov_len) {
+ /* ntiov already counts the header iov: iterate i < ntiov,
+ * not i <= ntiov (off-by-one reads past the valid entries) */
+ for (i = total = 0; i < ntiov; i++) {
+ sysrc = send(c->fd, tiov[i].iov_base, tiov[i].iov_len, 0);
+ if (sysrc != tiov[i].iov_len) {
fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n",
- rc, tiov[i].iov_len, errno);
- abort();
+ sysrc, tiov[i].iov_len, errno);
+ rc = PTL_FAIL;
+ break;
}
- total != rc;
+ total += sysrc;
}
#endif
#if 0
total, niov + 1);
#endif
pthread_mutex_unlock(&send_lock);
-#endif
- lib_finalize(n, private, cookie);
-
- return(0);
+
+ if (rc == PTL_OK) {
+ /* NB the NAL only calls lib_finalize() if it returns PTL_OK
+ * from cb_send() */
+ lib_finalize(n, private, cookie, PTL_OK);
+ }
+
+ return(rc);
}
* blocking read of the requested data. must drain out the
* difference of manipulated and requested lengths from the network
*/
-int tcpnal_recv(nal_cb_t *n,
- void *private,
- lib_msg_t *cookie,
- unsigned int niov,
- struct iovec *iov,
- size_t mlen,
- size_t rlen)
+ptl_err_t tcpnal_recv(nal_cb_t *n,
+ void *private,
+ lib_msg_t *cookie,
+ unsigned int niov,
+ struct iovec *iov,
+ size_t offset,
+ size_t mlen,
+ size_t rlen)
{
+ struct iovec tiov[256];
+ int ntiov;
int i;
if (!niov)
LASSERT(rlen);
LASSERT(rlen >= mlen);
+ ntiov = lib_extract_iov(256, tiov, niov, iov, offset, mlen);
+
/* FIXME
* 1. Is this efficient enough? change to use readv() directly?
* 2. need check return from read_connection()
* - MeiJia
*/
- for (i = 0; i < niov; i++)
- read_connection(private, iov[i].iov_base, iov[i].iov_len);
+ for (i = 0; i < ntiov; i++)
+ read_connection(private, tiov[i].iov_base, tiov[i].iov_len);
finalize:
- lib_finalize(n, private, cookie);
+ /* FIXME; we always assume success here... */
+ lib_finalize(n, private, cookie, PTL_OK);
if (mlen!=rlen){
char *trash=malloc(rlen-mlen);
free(trash);
}
- return(rlen);
+ return(PTL_OK);
}
*
* sends a packet to the peer, after ensuring that a connection exists
*/
-int tcpnal_send(nal_cb_t *n,
- void *private,
- lib_msg_t *cookie,
- ptl_hdr_t *hdr,
- int type,
- ptl_nid_t nid,
- ptl_pid_t pid,
- unsigned int niov,
- struct iovec *iov,
- size_t len)
+ptl_err_t tcpnal_send(nal_cb_t *n,
+ void *private,
+ lib_msg_t *cookie,
+ ptl_hdr_t *hdr,
+ int type,
+ ptl_nid_t nid,
+ ptl_pid_t pid,
+ unsigned int niov,
+ struct iovec *iov,
+ size_t offset,
+ size_t len)
{
connection c;
bridge b=(bridge)n->nal_data;
struct iovec tiov[257];
static pthread_mutex_t send_lock = PTHREAD_MUTEX_INITIALIZER;
- int rc;
+ ptl_err_t rc = PTL_OK;
+ int sysrc;
int total;
+ int ntiov;
int i;
if (!(c=force_tcp_connection((manager)b->lower,
PNAL_IP(nid,b),
- PNAL_PORT(nid,pid))))
- return(1);
+ PNAL_PORT(nid,pid),
+ b->local)))
+ return(PTL_FAIL);
-#if 0
/* TODO: these results should be checked. furthermore, provision
must be made for the SIGPIPE which is delivered when
writing on a tcp socket which has closed underneath
the application. there is a linux flag in the sendmsg
call which turns off the signally behaviour, but its
nonstandard */
- syscall(SYS_write, c->fd,hdr,sizeof(ptl_hdr_t));
- LASSERT (niov <= 1);
- if (len) syscall(SYS_write, c->fd,iov[0].iov_base,len);
-#else
+
LASSERT (niov <= 256);
tiov[0].iov_base = hdr;
tiov[0].iov_len = sizeof(ptl_hdr_t);
+ ntiov = 1 + lib_extract_iov(256, &tiov[1], niov, iov, offset, len);
- if (niov > 0)
- memcpy(&tiov[1], iov, niov * sizeof(struct iovec));
pthread_mutex_lock(&send_lock);
#if 1
- for (i = total = 0; i <= niov; i++)
+ for (i = total = 0; i < ntiov; i++)
total += tiov[i].iov_len;
- rc = syscall(SYS_writev, c->fd, tiov, niov+1);
- if (rc != total) {
+ sysrc = syscall(SYS_writev, c->fd, tiov, ntiov);
+ if (sysrc != total) {
+ /* NB print the syscall result, not 'rc' (now a ptl_err_t
+ * still holding PTL_OK at this point) */
fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n",
- rc, total, errno);
- abort();
+ sysrc, total, errno);
+ rc = PTL_FAIL;
}
#else
- for (i = total = 0; i <= niov; i++) {
- rc = send(c->fd, tiov[i].iov_base, tiov[i].iov_len, 0);
- if (rc != tiov[i].iov_len) {
+ /* ntiov already counts the header iov: iterate i < ntiov,
+ * not i <= ntiov (off-by-one reads past the valid entries) */
+ for (i = total = 0; i < ntiov; i++) {
+ sysrc = send(c->fd, tiov[i].iov_base, tiov[i].iov_len, 0);
+ if (sysrc != tiov[i].iov_len) {
fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n",
- rc, tiov[i].iov_len, errno);
- abort();
+ sysrc, tiov[i].iov_len, errno);
+ rc = PTL_FAIL;
+ break;
}
- total != rc;
+ total += sysrc;
}
#endif
#if 0
total, niov + 1);
#endif
pthread_mutex_unlock(&send_lock);
-#endif
- lib_finalize(n, private, cookie);
-
- return(0);
+
+ if (rc == PTL_OK) {
+ /* NB the NAL only calls lib_finalize() if it returns PTL_OK
+ * from cb_send() */
+ lib_finalize(n, private, cookie, PTL_OK);
+ }
+
+ return(rc);
}
* blocking read of the requested data. must drain out the
* difference of manipulated and requested lengths from the network
*/
-int tcpnal_recv(nal_cb_t *n,
- void *private,
- lib_msg_t *cookie,
- unsigned int niov,
- struct iovec *iov,
- size_t mlen,
- size_t rlen)
+ptl_err_t tcpnal_recv(nal_cb_t *n,
+ void *private,
+ lib_msg_t *cookie,
+ unsigned int niov,
+ struct iovec *iov,
+ size_t offset,
+ size_t mlen,
+ size_t rlen)
{
+ struct iovec tiov[256];
+ int ntiov;
int i;
if (!niov)
LASSERT(rlen);
LASSERT(rlen >= mlen);
+ ntiov = lib_extract_iov(256, tiov, niov, iov, offset, mlen);
+
/* FIXME
* 1. Is this efficient enough? change to use readv() directly?
* 2. need check return from read_connection()
* - MeiJia
*/
- for (i = 0; i < niov; i++)
- read_connection(private, iov[i].iov_base, iov[i].iov_len);
+ for (i = 0; i < ntiov; i++)
+ read_connection(private, tiov[i].iov_base, tiov[i].iov_len);
finalize:
- lib_finalize(n, private, cookie);
+ /* FIXME; we always assume success here... */
+ lib_finalize(n, private, cookie, PTL_OK);
if (mlen!=rlen){
char *trash=malloc(rlen-mlen);
free(trash);
}
- return(rlen);
+ return(PTL_OK);
}
# This code is issued under the GNU General Public License.
# See the file COPYING in this distribution
-
COMPILE = $(CC) -Wall -g -I$(srcdir)/../include
LINK = $(CC) -o $@
if LIBLUSTRE
-tmp=
+
+noinst_LIBRARIES = libuptlctl.a
+libuptlctl_a_SOURCES = portals.c debug.c l_ioctl.c parser.c parser.h
+libuptlctl_a_CFLAGS = -fPIC
+
else
-tmp=gmnalnid
-endif
-sbin_PROGRAMS = acceptor ptlctl debugctl routerstat wirecheck $(tmp)
+sbin_PROGRAMS = acceptor ptlctl debugctl routerstat wirecheck gmnalnid
lib_LIBRARIES = libptlctl.a
acceptor_SOURCES = acceptor.c # -lefence
debugctl_DEPENDENCIES = libptlctl.a
routerstat_SOURCES = routerstat.c
+endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include <syscall.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <portals/api-support.h>
#include <portals/ptlctl.h>
+#ifndef __CYGWIN__
+ #include <syscall.h>
+#else
+ #include <windows.h>
+ #include <windef.h>
+#endif
+
+static ioc_handler_t do_ioctl; /* forward ref */
+static ioc_handler_t *current_ioc_handler = &do_ioctl;
+
struct ioc_dev {
const char * dev_name;
int dev_fd;
int opc;
};
-char * dump_filename;
+char *dump_filename;
+
+void
+set_ioc_handler (ioc_handler_t *handler)
+{
+ if (handler == NULL)
+ current_ioc_handler = do_ioctl;
+ else
+ current_ioc_handler = handler;
+}
static int
open_ioc_dev(int dev_id)
{
FILE *fp;
struct dump_hdr dump_hdr;
- struct portal_ioctl_hdr * ioc_hdr = (struct portal_ioctl_hdr *) buf;
+ struct portal_ioctl_hdr * ioc_hdr = (struct portal_ioctl_hdr *) buf;
int rc;
printf("dumping opc %x to %s\n", opc, dump_filename);
return -EINVAL;
}
- rc = fwrite(&dump_hdr, sizeof(dump_hdr), 1, fp);
- if (rc == 1)
- rc = fwrite(buf, ioc_hdr->ioc_len, 1, fp);
- fclose(fp);
- if (rc != 1) {
- fprintf(stderr, "%s: %s\n", dump_filename,
- strerror(errno));
- return -EINVAL;
- }
-
- return 0;
+ rc = fwrite(&dump_hdr, sizeof(dump_hdr), 1, fp);
+ if (rc == 1)
+ rc = fwrite(buf, ioc_hdr->ioc_len, 1, fp);
+ fclose(fp);
+ if (rc != 1) {
+ fprintf(stderr, "%s: %s\n", dump_filename,
+ strerror(errno));
+ return -EINVAL;
+ }
+
+ return 0;
}
/* register a device to send ioctls to. */
free(dump_filename);
dump_filename = strdup(file);
+ if (dump_filename == NULL)
+ abort();
+
+ set_ioc_handler(&dump);
return 0;
}
int
l_ioctl(int dev_id, int opc, void *buf)
{
- if (dump_filename)
- return dump(dev_id, opc, buf);
- else
- return do_ioctl(dev_id, opc, buf);
+ return current_ioc_handler(dev_id, opc, buf);
}
/* Read an ioctl dump file, and call the ioc_func for each ioctl buffer
int
parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *))
{
- int fd, line =0;
+ int line =0;
struct stat st;
- char *buf, *end;
+ char *start, *buf, *end;
+#ifndef __CYGWIN__
+ int fd;
+#else
+ HANDLE fd, hmap;
+ DWORD size;
+#endif
+#ifndef __CYGWIN__
fd = syscall(SYS_open, dump_file, O_RDONLY);
+ if (fd < 0) {
+ fprintf(stderr, "couldn't open %s: %s\n", dump_file,
+ strerror(errno));
+ exit(1);
+ }
#ifndef SYS_fstat64
-#define __SYS_fstat__ SYS_fstat
+# define __SYS_fstat__ SYS_fstat
#else
-#define __SYS_fstat__ SYS_fstat64
+# define __SYS_fstat__ SYS_fstat64
#endif
if (syscall(__SYS_fstat__, fd, &st)) {
perror("stat fails");
exit(1);
}
- buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE , fd, 0);
- end = buf + st.st_size;
+ start = buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE , fd, 0);
+ end = start + st.st_size;
close(fd);
- while (buf < end) {
- struct dump_hdr *dump_hdr = (struct dump_hdr *) buf;
- struct portal_ioctl_hdr * data;
- char tmp[8096];
- int rc;
-
- line++;
+ if (start == MAP_FAILED) {
+ fprintf(stderr, "can't create file mapping\n");
+ exit(1);
+ }
+#else
+ fd = CreateFile(dump_file, GENERIC_READ, FILE_SHARE_READ, NULL,
+ OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
+ size = GetFileSize(fd, NULL);
+ if (size < 1) {
+ fprintf(stderr, "KML is empty\n");
+ exit(1);
+ }
- data = (struct portal_ioctl_hdr *) (buf + sizeof(*dump_hdr));
- if (buf + data->ioc_len > end ) {
- fprintf(stderr, "dump file overflow, %p + %d > %p\n", buf,
- data->ioc_len, end);
- return -1;
- }
+ hmap = CreateFileMapping(fd, NULL, PAGE_READONLY, 0,0, NULL);
+ start = buf = MapViewOfFile(hmap, FILE_MAP_READ, 0, 0, 0);
+ end = buf + size;
+ CloseHandle(fd);
+ if (start == NULL) {
+ fprintf(stderr, "can't create file mapping\n");
+ exit(1);
+ }
+#endif /* __CYGWIN__ */
+
+ while (buf < end) {
+ struct dump_hdr *dump_hdr = (struct dump_hdr *) buf;
+ struct portal_ioctl_hdr * data;
+ char tmp[8096];
+ int rc;
+
+ line++;
+
+ data = (struct portal_ioctl_hdr *) (buf + sizeof(*dump_hdr));
+ if (buf + data->ioc_len > end ) {
+ fprintf(stderr, "dump file overflow, %p + %d > %p\n", buf,
+ data->ioc_len, end);
+ return -1;
+ }
#if 0
- printf ("dump_hdr: %lx data: %lx\n",
- (unsigned long)dump_hdr - (unsigned long)buf, (unsigned long)data - (unsigned long)buf);
-
- printf("%d: opcode %x len: %d ver: %x ", line, dump_hdr->opc,
- data->ioc_len, data->ioc_version);
+ printf ("dump_hdr: %lx data: %lx\n",
+ (unsigned long)dump_hdr - (unsigned long)buf, (unsigned long)data - (unsigned long)buf);
+
+ printf("%d: opcode %x len: %d ver: %x ", line, dump_hdr->opc,
+ data->ioc_len, data->ioc_version);
#endif
- memcpy(tmp, data, data->ioc_len);
+ memcpy(tmp, data, data->ioc_len);
- rc = ioc_func(dump_hdr->dev_id, dump_hdr->opc, tmp);
- if (rc) {
- printf("failed: %d\n", rc);
- exit(1);
- }
+ rc = ioc_func(dump_hdr->dev_id, dump_hdr->opc, tmp);
+ if (rc) {
+ printf("failed: %d\n", rc);
+ exit(1);
+ }
- buf += data->ioc_len + sizeof(*dump_hdr);
+ buf += data->ioc_len + sizeof(*dump_hdr);
}
+
+#ifndef __CYGWIN__
+ munmap(start, end - start);
+#else
+ UnmapViewOfFile(start);
+ CloseHandle(hmap);
+#endif
+
return 0;
}
#include <stdarg.h>
#include <asm/byteorder.h>
+#ifdef __CYGWIN__
+
+#include <netinet/in.h>
+
+#warning assuming little endian
+
+#define __cpu_to_le64(x) ((__u64)(x))
+#define __le64_to_cpu(x) ((__u64)(x))
+#define __cpu_to_le32(x) ((__u32)(x))
+#define __le32_to_cpu(x) ((__u32)(x))
+#define __cpu_to_le16(x) ((__u16)(x))
+#define __le16_to_cpu(x) ((__u16)(x))
+
+#endif /* __CYGWIN__ */
+
#include <portals/api-support.h>
#include <portals/ptlctl.h>
#include <portals/list.h>
PORTAL_IOC_INIT (data);
data.ioc_pbuf1 = (char*)pcfg;
data.ioc_plen1 = sizeof(*pcfg);
+ /* XXX liblustre hack XXX */
+ data.ioc_nal_cmd = pcfg->pcfg_command;
+ data.ioc_nid = pcfg->pcfg_nid;
rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
}
- ptlrpcd can be blocked, stopping ALL progress (2477)
- recovery for initial connections (2355)
- fixes for mds_cleanup_orphans (1934)
+ - abort_recovery crashes MDS in b_eq (mds_unlink_orphan) (2584)
- block all file creations until orphan recovery completes (1901)
- client remove rq_connection from request struct (2423)
- conf-sanity test_5, proper cleanup in umount log not available (2640)
- recovery timer race (2670)
- mdc_close recovery bug (2532)
+ - ptlrpc cleanup bug (2710)
+ - mds timeout on local locks (2588)
+ - namespace lock held during RPCs (2431)
- don't try to handle a message that hasn't been replied to (2699)
- - don't fail assertion if in recovery during cleanup (2701)
+ - client assert failure during cleanup after abort recovery (2701)
+ - leak mdc device after failed mount (2712)
+ - ptlrpc_check_set allows timedout requests to complete (2714)
+ - wait for inflight reqs when ptlrpcd finishes (2710)
+ - make sure unregistered services are removed from the srv_list
+ - reset bulk XID's when resending them (caught by 1138 test)
+ - unregister_bulk after timeout
- fix lconf error (2694)
* miscellania
- return LL_SUPER_MAGIC from statfs for the filesystem type (1972)
endif
if LIBLUSTRE
-SUBDIRS = portals obdclass lov ptlrpc obdecho osc utils mdc lvfs #liblustre
+SUBDIRS = portals obdclass lov ptlrpc obdecho osc utils mdc lvfs liblustre
else
SUBDIRS = lvfs portals obdclass include $(DIRS24) mds utils obdfilter mdc osc ost
SUBDIRS+= llite obdecho lov cobd tests doc scripts conf ptlrpc
portals/knals/scimacnal/Makefile \
portals/knals/ibnal/Makefile \
portals/utils/Makefile portals/tests/Makefile portals/doc/Makefile \
- obdecho/Makefile ptlrpc/Makefile liblustre/Makefile \
+ obdecho/Makefile ptlrpc/Makefile liblustre/Makefile liblustre/tests/Makefile \
lov/Makefile osc/Makefile mdc/Makefile mds/Makefile ost/Makefile \
cobd/Makefile ptlbd/Makefile conf/Makefile tests/Makefile \
utils/Makefile utils/Lustre/Makefile obdfilter/Makefile \
#define GFP_HIGHUSER 1
#define GFP_ATOMIC 1
#define GFP_NOFS 1
-#define IS_ERR(a) (((a) && abs((int)(a)) < 500) ? 1 : 0)
-#define PTR_ERR(a) ((int)(a))
-#define ERR_PTR(a) ((void*)(a))
+#define IS_ERR(a) (((a) && abs((long)(a)) < 500) ? 1 : 0)
+#define PTR_ERR(a) ((long)(a))
+#define ERR_PTR(a) ((void*)((long)(a)))
#define capable(foo) 1
#define CAP_SYS_ADMIN 1
#define PAGE_CACHE_SHIFT 12
#define PAGE_CACHE_MASK PAGE_MASK
+/* XXX
+ * for this moment, liblusre will not rely OST for non-page-aligned write
+ */
+#define LIBLUSTRE_HANDLE_UNALIGNED_PAGE
+
struct page {
void *addr;
unsigned long index;
/* internally used by liblustre file i/o */
int _offset;
int _count;
+#ifdef LIBLUSTRE_HANDLE_UNALIGNED_PAGE
+ int _managed;
+#endif
};
#define kmap(page) (page)->addr
}
#define __free_page(page) __free_pages((page), 0)
+#define free_page(page) __free_page(page)
static inline struct page* __grab_cache_page(unsigned long index)
{
free(l);
}
+#define time_after(a, b) \
+({ \
+ printf("Error: inapproiate call time_after()\n"); \
+ 1; \
+})
+
typedef struct { volatile int counter; } atomic_t;
#define atomic_read(a) ((a)->counter)
struct ldlm_lock *ldlm_handle2lock_ns(struct ldlm_namespace *,
struct lustre_handle *);
-void *ldlm_put_lock_into_req(struct ptlrpc_request *,
- struct lustre_handle *, int);
-
static inline struct ldlm_lock *ldlm_handle2lock(struct lustre_handle *h)
{
return __ldlm_handle2lock(h, 0);
struct obd_uuid exp_client_uuid;
struct list_head exp_obd_chain;
struct obd_device *exp_obd;
- struct obd_import *exp_imp_reverse; /* to make rpc's backwards */
+ struct obd_import *exp_imp_reverse; /* to make RPCs backwards */
struct ptlrpc_connection *exp_connection;
__u32 exp_conn_cnt;
struct ldlm_export_data exp_ldlm_data;
- struct ptlrpc_request *exp_outstanding_reply;
+ struct list_head exp_outstanding_replies;
time_t exp_last_request_time;
spinlock_t exp_lock; /* protects flags int below */
- int exp_failed:1;
+ /* ^ protects exp_outstanding_replies too */
int exp_flags;
+ int exp_failed:1;
+ int exp_libclient:1; /* liblustre client? */
union {
struct mds_export_data eu_mds_data;
struct filter_export_data eu_filter_data;
- struct ec_export_data eu_ec_data;
+ struct ec_export_data eu_ec_data;
struct osc_export_data eu_osc_data;
} u;
};
#define MSG_CONNECT_RECONNECT 0x2
#define MSG_CONNECT_REPLAYABLE 0x4
//#define MSG_CONNECT_PEER 0x8
+#define MSG_CONNECT_LIBCLIENT 0x10
/*
* OST requests: OBDO & OBD request records
#include <linux/lustre_import.h>
#include <linux/lprocfs_status.h>
+/* Size over which to OBD_VMALLOC() rather than OBD_ALLOC() service request
+ * buffers */
+#define SVC_BUF_VMALLOC_THRESHOLD (2*PAGE_SIZE)
+
/* The following constants determine how much memory is devoted to
* buffering in the lustre services.
*
* total memory = ?_NBUFS * ?_BUFSIZE
*
* ?_MAXREQSIZE # maximum request service will receive
- * larger messages will get dropped.
+ * messages larger than ?_MAXREQSIZE are dropped.
* request buffers are auto-unlinked when less than ?_MAXREQSIZE
* is left in them.
*/
#define LDLM_NUM_THREADS min(smp_num_cpus * smp_num_cpus * 8, 64)
-#define LDLM_NEVENT_MAX 8192UL
-#define LDLM_NEVENTS min_t(unsigned long, num_physpages / 64, \
- LDLM_NEVENT_MAX)
#define LDLM_NBUF_MAX 256UL
-#define LDLM_NBUFS min(LDLM_NEVENTS / 16, LDLM_NBUF_MAX)
#define LDLM_BUFSIZE (8 * 1024)
#define LDLM_MAXREQSIZE (5 * 1024)
+#define LDLM_MAXMEM (num_physpages*(PAGE_SIZE/1024))
+#define LDLM_NBUFS min(LDLM_MAXMEM/LDLM_BUFSIZE, LDLM_NBUF_MAX)
#define MDT_MAX_THREADS 32UL
#define MDT_NUM_THREADS max(min_t(unsigned long, num_physpages / 8192, \
MDT_MAX_THREADS), 2UL)
-#define MDS_NEVENT_MAX 8192UL
-#define MDS_NEVENTS min_t(unsigned long, num_physpages / 64, \
- MDS_NEVENT_MAX)
#define MDS_NBUF_MAX 512UL
-#define MDS_NBUFS min(MDS_NEVENTS / 16, MDS_NBUF_MAX)
#define MDS_BUFSIZE (8 * 1024)
/* Assume file name length = FNAME_MAX = 256 (true for extN).
* path name length = PATH_MAX = 4096
* except in the open case where there are a large number of OSTs in a LOV.
*/
#define MDS_MAXREQSIZE (5 * 1024)
+#define MDS_MAXMEM (num_physpages*(PAGE_SIZE/512))
+#define MDS_NBUFS min(MDS_MAXMEM/MDS_BUFSIZE, MDS_NBUF_MAX)
#define OST_MAX_THREADS 36UL
#define OST_NUM_THREADS max(min_t(unsigned long, num_physpages / 8192, \
OST_MAX_THREADS), 2UL)
-#define OST_NEVENT_MAX 16384UL
-#define OST_NEVENTS min_t(unsigned long, num_physpages / 16, \
- OST_NEVENT_MAX)
#define OST_NBUF_MAX 5000UL
-#define OST_NBUFS min(OST_NEVENTS / 2, OST_NBUF_MAX)
#define OST_BUFSIZE (8 * 1024)
/* OST_MAXREQSIZE ~= 1640 bytes =
* lustre_msg + obdo + 16 * obd_ioobj + 64 * niobuf_remote
* - OST_MAXREQSIZE must be at least 1 page of cookies plus some spillover
*/
#define OST_MAXREQSIZE (5 * 1024)
+#define OST_MAXMEM (num_physpages*(PAGE_SIZE/512))
+#define OST_NBUFS min(OST_MAXMEM/OST_BUFSIZE, OST_NBUF_MAX)
#define PTLBD_NUM_THREADS 4
-#define PTLBD_NEVENTS 1024
#define PTLBD_NBUFS 20
#define PTLBD_BUFSIZE (32 * 1024)
#define PTLBD_MAXREQSIZE 1024
struct ptlrpc_bulk_desc;
+/*
+ * ptlrpc callback & work item stuff
+ */
+struct ptlrpc_cb_id {
+ void (*cbid_fn)(ptl_event_t *ev); /* specific callback fn */
+ void *cbid_arg; /* additional arg */
+};
+
+#define RS_MAX_LOCKS 4
+#define RS_DEBUG 1
+
+struct ptlrpc_reply_state {
+ struct ptlrpc_cb_id rs_cb_id;
+ struct list_head rs_list;
+ struct list_head rs_exp_list;
+ struct list_head rs_obd_list;
+#if RS_DEBUG
+ struct list_head rs_debug_list;
+#endif
+ /* updates to following flag serialised by srv_request_lock */
+ unsigned int rs_difficult:1; /* ACK/commit stuff */
+ unsigned int rs_scheduled:1; /* being handled? */
+ unsigned int rs_scheduled_ever:1; /* any schedule attempts? */
+ unsigned int rs_handled:1; /* been handled yet? */
+ unsigned int rs_on_net:1; /* reply_out_callback pending? */
+
+ int rs_size;
+ __u64 rs_transno;
+ __u64 rs_xid;
+ struct obd_export *rs_export;
+ struct ptlrpc_srv_ni *rs_srv_ni;
+ ptl_handle_md_t rs_md_h;
+
+ /* locks awaiting client reply ACK */
+ int rs_nlocks;
+ struct lustre_handle rs_locks[RS_MAX_LOCKS];
+ ldlm_mode_t rs_modes[RS_MAX_LOCKS];
+ /* last member: variable sized reply message */
+ struct lustre_msg rs_msg;
+};
+
struct ptlrpc_request {
int rq_type; /* one of PTL_RPC_MSG_* */
struct list_head rq_list;
int rq_status;
spinlock_t rq_lock;
- unsigned int rq_intr:1, rq_replied:1, rq_want_ack:1, rq_err:1,
+ /* client-side flags */
+ unsigned int rq_intr:1, rq_replied:1, rq_err:1,
rq_timedout:1, rq_resend:1, rq_restart:1, rq_replay:1,
- rq_no_resend:1, rq_resent:1, rq_waiting:1, rq_receiving_reply:1;
+ rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1;
int rq_phase;
-
+ /* client-side refcount for SENT race */
atomic_t rq_refcount;
int rq_request_portal; /* XXX FIXME bug 249 */
int rq_reply_portal; /* XXX FIXME bug 249 */
+ /* client-side # reply bytes actually received */
+ int rq_nob_received;
+
int rq_reqlen;
struct lustre_msg *rq_reqmsg;
int rq_import_generation;
enum lustre_imp_state rq_send_state;
- wait_queue_head_t rq_reply_waitq; /* XXX also _for_ack */
- /* incoming reply */
- ptl_md_t rq_reply_md;
- ptl_handle_md_t rq_reply_md_h;
-
- /* outgoing req/rep */
- ptl_md_t rq_req_md;
+ /* client+server request */
+ ptl_handle_md_t rq_req_md_h;
+ struct ptlrpc_cb_id rq_req_cbid;
+ /* server-side... */
+ struct timeval rq_arrival_time; /* request arrival time */
+ struct ptlrpc_reply_state *rq_reply_state; /* separated reply state */
+ struct ptlrpc_request_buffer_desc *rq_rqbd; /* incoming request buffer */
+
+ /* client-only incoming reply */
+ ptl_handle_md_t rq_reply_md_h;
+ wait_queue_head_t rq_reply_waitq;
+ struct ptlrpc_cb_id rq_reply_cbid;
+
struct ptlrpc_peer rq_peer; /* XXX see service.c can this be factored away? */
struct obd_export *rq_export;
struct obd_import *rq_import;
- struct ptlrpc_service *rq_svc;
-
+
void (*rq_replay_cb)(struct ptlrpc_request *);
void (*rq_commit_cb)(struct ptlrpc_request *);
void *rq_cb_data;
struct ptlrpc_request_set *rq_set;
void *rq_interpret_reply; /* Async completion handler */
union ptlrpc_async_args rq_async_args; /* Async completion context */
-
- /* Only used on the server side for tracking acks. */
- struct ptlrpc_req_ack_lock {
- struct lustre_handle lock;
- __u32 mode;
- } rq_ack_locks[REQ_MAX_ACK_LOCKS];
};
#define RQ_PHASE_NEW 0xebc0de00
-#define RQ_PHASE_RPC 0xebc0de01
+#define RQ_PHASE_RPC 0xebc0de01
#define RQ_PHASE_BULK 0xebc0de02
#define RQ_PHASE_INTERPRET 0xebc0de03
#define RQ_PHASE_COMPLETE 0xebc0de04
#define PTLRPC_REQUEST_COMPLETE(req) ((req)->rq_phase > RQ_PHASE_RPC)
-#define DEBUG_REQ_FLAGS(req) \
- ((req->rq_phase == RQ_PHASE_NEW) ? "New" : \
- (req->rq_phase == RQ_PHASE_RPC) ? "RPC" : \
- (req->rq_phase == RQ_PHASE_INTERPRET) ? "Interpret" : \
- (req->rq_phase == RQ_PHASE_COMPLETE) ? "Complete" : \
- (req->rq_phase == RQ_PHASE_BULK) ? "Bulk" : "?phase?"), \
- FLAG(req->rq_intr, "I"), FLAG(req->rq_replied, "R"), \
- FLAG(req->rq_want_ack, "A"), FLAG(req->rq_err, "E"), \
- FLAG(req->rq_timedout, "X") /* eXpired */, FLAG(req->rq_resend, "S"), \
- FLAG(req->rq_restart, "T"), FLAG(req->rq_replay, "P"), \
- FLAG(req->rq_no_resend, "N"), FLAG(req->rq_resent, "s"), \
+#define DEBUG_REQ_FLAGS(req) \
+ ((req->rq_phase == RQ_PHASE_NEW) ? "New" : \
+ (req->rq_phase == RQ_PHASE_RPC) ? "Rpc" : \
+ (req->rq_phase == RQ_PHASE_INTERPRET) ? "Interpret" : \
+ (req->rq_phase == RQ_PHASE_COMPLETE) ? "Complete" : "?phase?"), \
+ FLAG(req->rq_intr, "I"), FLAG(req->rq_replied, "R"), \
+ FLAG(req->rq_err, "E"), \
+ FLAG(req->rq_timedout, "X") /* eXpired */, FLAG(req->rq_resend, "S"), \
+ FLAG(req->rq_restart, "T"), FLAG(req->rq_replay, "P"), \
+ FLAG(req->rq_no_resend, "N"), \
FLAG(req->rq_waiting, "W")
-#define REQ_FLAGS_FMT "%s:%s%s%s%s%s%s%s%s%s%s%s"
+#define REQ_FLAGS_FMT "%s:%s%s%s%s%s%s%s%s%s"
#define DEBUG_REQ(level, req, fmt, args...) \
do { \
} while (0)
struct ptlrpc_bulk_page {
- struct ptlrpc_bulk_desc *bp_desc;
struct list_head bp_link;
int bp_buflen;
int bp_pageoffset; /* offset within a page */
struct page *bp_page;
};
-#define BULK_GET_SOURCE 0
+#define BULK_GET_SOURCE 0
#define BULK_PUT_SINK 1
#define BULK_GET_SINK 2
#define BULK_PUT_SOURCE 3
struct ptlrpc_bulk_desc {
- unsigned int bd_complete:1;
+ unsigned int bd_success:1; /* completed successfully */
unsigned int bd_network_rw:1; /* accessible to the network */
unsigned int bd_type:2; /* {put,get}{source,sink} */
unsigned int bd_registered:1; /* client side */
struct obd_import *bd_import;
__u32 bd_portal;
struct ptlrpc_request *bd_req; /* associated request */
- wait_queue_head_t bd_waitq; /* server side only WQ */
- struct list_head bd_page_list;
- __u32 bd_page_count;
- __u32 bd_last_xid;
-
- ptl_md_t bd_md;
- ptl_handle_md_t bd_md_h;
- ptl_handle_me_t bd_me_h;
+ wait_queue_head_t bd_waitq; /* server side only WQ */
+ int bd_page_count; /* # pages (== entries in bd_iov) */
+ int bd_max_pages; /* allocated size of bd_iov */
+ int bd_nob; /* # bytes covered */
+ int bd_nob_transferred; /* # bytes GOT/PUT */
- int bd_callback_count; /* server side callbacks */
+ __u64 bd_last_xid;
+ struct ptlrpc_cb_id bd_cbid; /* network callback info */
+ ptl_handle_md_t bd_md_h; /* associated MD */
+
#ifdef __KERNEL__
ptl_kiov_t bd_iov[PTL_MD_MAX_IOV];
#else
struct ptlrpc_request_buffer_desc {
struct list_head rqbd_list;
struct ptlrpc_srv_ni *rqbd_srv_ni;
- ptl_handle_me_t rqbd_me_h;
- atomic_t rqbd_refcount;
+ ptl_handle_md_t rqbd_md_h;
+ int rqbd_refcount;
+ int rqbd_eventcount;
char *rqbd_buffer;
+ struct ptlrpc_cb_id rqbd_cbid;
+ struct ptlrpc_request rqbd_req;
};
/* event queues are per-ni, because one day we may get a hardware
char *pni_name;
int pni_number;
ptl_handle_ni_t pni_ni_h;
- ptl_handle_eq_t pni_request_out_eq_h;
- ptl_handle_eq_t pni_reply_in_eq_h;
- ptl_handle_eq_t pni_reply_out_eq_h;
- ptl_handle_eq_t pni_bulk_put_source_eq_h;
- ptl_handle_eq_t pni_bulk_put_sink_eq_h;
- ptl_handle_eq_t pni_bulk_get_source_eq_h;
- ptl_handle_eq_t pni_bulk_get_sink_eq_h;
+ ptl_handle_eq_t pni_eq_h;
};
struct ptlrpc_srv_ni {
/* Interface-specific service state */
struct ptlrpc_service *sni_service; /* owning service */
struct ptlrpc_ni *sni_ni; /* network interface */
- ptl_handle_eq_t sni_eq_h; /* event queue handle */
- struct list_head sni_rqbds; /* all the request buffer descriptors */
- __u32 sni_nrqbds; /* # request buffers */
- atomic_t sni_nrqbds_receiving; /* # request buffers posted */
+ struct list_head sni_rqbds; /* all the request buffers */
+ struct list_head sni_active_replies; /* all the active replies */
+ int sni_nrqbd_receiving; /* # posted request buffers */
};
-struct ptlrpc_service {
- time_t srv_time;
- time_t srv_timeout;
-
- struct list_head srv_ni_list; /* list of interfaces */
- __u32 srv_max_req_size; /* biggest request to receive */
- __u32 srv_buf_size; /* # bytes in a request buffer */
+typedef int (*svc_handler_t)(struct ptlrpc_request *req);
+struct ptlrpc_service {
+ struct list_head srv_list; /* chain thru all services */
+ int srv_max_req_size; /* biggest request to receive */
+ int srv_buf_size; /* size of individual buffers */
+ int srv_nbufs; /* total # req buffer descs allocated */
+ int srv_nthreads; /* # running threads */
+ int srv_n_difficult_replies; /* # 'difficult' replies */
+ int srv_n_active_reqs; /* # reqs being served */
+
__u32 srv_req_portal;
__u32 srv_rep_portal;
- __u32 srv_xid;
+ int srv_n_queued_reqs; /* # reqs waiting to be served */
+ struct list_head srv_request_queue; /* reqs waiting for service */
+
+ atomic_t srv_outstanding_replies;
+ struct list_head srv_reply_queue; /* replies waiting for service */
wait_queue_head_t srv_waitq; /* all threads sleep on this */
- spinlock_t srv_lock;
- struct list_head srv_threads;
- int (*srv_handler)(struct ptlrpc_request *req);
+ struct list_head srv_threads;
+ struct obd_device *srv_obddev;
+ svc_handler_t srv_handler;
+
char *srv_name; /* only statically allocated strings here; we don't clean them */
- struct proc_dir_entry *srv_procroot;
- struct lprocfs_stats *srv_stats;
- int srv_interface_rover;
+ spinlock_t srv_lock;
+
+ struct proc_dir_entry *srv_procroot;
+ struct lprocfs_stats *srv_stats;
+
struct ptlrpc_srv_ni srv_interfaces[0];
};
-typedef int (*svc_handler_t)(struct ptlrpc_request *req);
-
/* ptlrpc/events.c */
extern struct ptlrpc_ni ptlrpc_interfaces[];
extern int ptlrpc_ninterfaces;
extern int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, struct ptlrpc_peer *peer);
+extern void request_out_callback (ptl_event_t *ev);
+extern void reply_in_callback(ptl_event_t *ev);
+extern void client_bulk_callback (ptl_event_t *ev);
+extern void request_in_callback(ptl_event_t *ev);
+extern void reply_out_callback(ptl_event_t *ev);
+extern void server_bulk_callback (ptl_event_t *ev);
/* ptlrpc/connection.c */
void ptlrpc_dump_connections(void);
void ptlrpc_cleanup_connection(void);
/* ptlrpc/niobuf.c */
-int ptlrpc_bulk_put(struct ptlrpc_bulk_desc *);
-int ptlrpc_bulk_get(struct ptlrpc_bulk_desc *);
-void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *bulk);
+int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc);
+void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc);
int ptlrpc_register_bulk(struct ptlrpc_request *req);
void ptlrpc_unregister_bulk (struct ptlrpc_request *req);
-static inline int ptlrpc_bulk_complete (struct ptlrpc_bulk_desc *desc)
+static inline int ptlrpc_bulk_active (struct ptlrpc_bulk_desc *desc)
{
unsigned long flags;
int rc;
spin_lock_irqsave (&desc->bd_lock, flags);
- rc = desc->bd_complete;
+ rc = desc->bd_network_rw;
spin_unlock_irqrestore (&desc->bd_lock, flags);
return (rc);
}
+int ptlrpc_send_reply(struct ptlrpc_request *req, int);
int ptlrpc_reply(struct ptlrpc_request *req);
int ptlrpc_error(struct ptlrpc_request *req);
void ptlrpc_resend_req(struct ptlrpc_request *request);
int ptl_send_rpc(struct ptlrpc_request *request);
-void ptlrpc_link_svc_me(struct ptlrpc_request_buffer_desc *rqbd);
+void ptlrpc_register_rqbd (struct ptlrpc_request_buffer_desc *rqbd);
/* ptlrpc/client.c */
void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
void ptlrpc_cleanup_client(struct obd_import *imp);
struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid);
+static inline int
+ptlrpc_client_receiving_reply (struct ptlrpc_request *req)
+{
+ unsigned long flags;
+ int rc;
+
+ spin_lock_irqsave(&req->rq_lock, flags);
+ rc = req->rq_receiving_reply;
+ spin_unlock_irqrestore(&req->rq_lock, flags);
+ return (rc);
+}
+
+static inline int
+ptlrpc_client_replied (struct ptlrpc_request *req)
+{
+ unsigned long flags;
+ int rc;
+
+ spin_lock_irqsave(&req->rq_lock, flags);
+ rc = req->rq_replied;
+ spin_unlock_irqrestore(&req->rq_lock, flags);
+ return (rc);
+}
+
+static inline void
+ptlrpc_wake_client_req (struct ptlrpc_request *req)
+{
+ if (req->rq_set == NULL)
+ wake_up(&req->rq_reply_waitq);
+ else
+ wake_up(&req->rq_set->set_waitq);
+}
+
int ptlrpc_queue_wait(struct ptlrpc_request *req);
int ptlrpc_replay_req(struct ptlrpc_request *req);
void ptlrpc_unregister_reply(struct ptlrpc_request *req);
void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request);
struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req);
struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp (struct ptlrpc_request *req,
- int type, int portal);
+ int npages, int type, int portal);
struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp(struct ptlrpc_request *req,
- int type, int portal);
+ int npages, int type, int portal);
void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk);
-int ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
- struct page *page, int pageoffset, int len);
-void ptlrpc_free_bulk_page(struct ptlrpc_bulk_page *page);
+void ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
+ struct page *page, int pageoffset, int len);
void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
struct obd_import *imp);
__u64 ptlrpc_next_xid(void);
/* ptlrpc/service.c */
-struct ptlrpc_service *
-ptlrpc_init_svc(__u32 nevents, __u32 nbufs, __u32 bufsize, __u32 max_req_size,
- int req_portal, int rep_portal, svc_handler_t, char *name,
- struct proc_dir_entry *proc_entry);
+void ptlrpc_save_lock (struct ptlrpc_request *req,
+ struct lustre_handle *lock, int mode);
+void ptlrpc_commit_replies (struct obd_device *obd);
+void ptlrpc_schedule_difficult_reply (struct ptlrpc_reply_state *rs);
+struct ptlrpc_service *ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size,
+ int req_portal, int rep_portal,
+ svc_handler_t, char *name,
+ struct proc_dir_entry *proc_entry);
void ptlrpc_stop_all_threads(struct ptlrpc_service *svc);
int ptlrpc_start_n_threads(struct obd_device *dev, struct ptlrpc_service *svc,
int cnt, char *base_name);
int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc,
char *name);
int ptlrpc_unregister_service(struct ptlrpc_service *service);
+int liblustre_check_services (void *arg);
struct ptlrpc_svc_data {
char *name;
char **bufs);
int lustre_pack_reply(struct ptlrpc_request *, int count, int *lens,
char **bufs);
+void lustre_free_reply_state(struct ptlrpc_reply_state *rs);
int lustre_msg_size(int count, int *lengths);
int lustre_unpack_msg(struct lustre_msg *m, int len);
void *lustre_msg_buf(struct lustre_msg *m, int n, int minlen);
#endif
/* ptlrpc/llog_server.c */
-struct llog_obd_ctxt;
int llog_origin_handle_create(struct ptlrpc_request *req);
int llog_origin_handle_next_block(struct ptlrpc_request *req);
int llog_origin_handle_read_header(struct ptlrpc_request *req);
int obd_replayed_requests;
int obd_requests_queued_for_recovery;
wait_queue_head_t obd_next_transno_waitq;
- wait_queue_head_t obd_commit_waitq;
+ struct list_head obd_uncommitted_replies;
+ spinlock_t obd_uncommitted_replies_lock;
struct timer_list obd_recovery_timer;
struct list_head obd_recovery_queue;
struct list_head obd_delayed_reply_queue;
obd->obd_name, transno);
if (transno > obd->obd_last_committed) {
obd->obd_last_committed = transno;
- wake_up(&obd->obd_commit_waitq);
+ ptlrpc_commit_replies (obd);
}
}
+++ 25/arch/parisc/lib/checksum.c 2003-10-05 00:33:23.000000000 -0700
@@ -16,8 +16,10 @@
*
- * $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $
+ * $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $
*/
-#include <net/checksum.h>
+#include <linux/module.h>
--- linux-2.6.0-test6/drivers/char/ftape/compressor/zftape-compress.c 2003-06-14 12:18:32.000000000 -0700
+++ 25/drivers/char/ftape/compressor/zftape-compress.c 2003-10-05 00:33:24.000000000 -0700
@@ -31,6 +31,7 @@
- char zftc_rev[] = "$Revision: 1.3 $";
- char zftc_dat[] = "$Date: 2003/12/03 05:12:20 $";
+ char zftc_rev[] = "$Revision: 1.4 $";
+ char zftc_dat[] = "$Date: 2004/02/14 03:14:33 $";
+#include <linux/version.h>
#include <linux/errno.h>
--- linux-2.6.0-test6/drivers/isdn/hardware/eicon/divamnt.c 2003-09-27 18:57:44.000000000 -0700
+++ 25/drivers/isdn/hardware/eicon/divamnt.c 2003-10-05 00:33:24.000000000 -0700
@@ -1,4 +1,4 @@
--/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $
-+/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $
+-/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $
++/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $
*
* Driver for Eicon DIVA Server ISDN cards.
* Maint module
-#include "di_defs.h"
#include "debug_if.h"
--static char *main_revision = "$Revision: 1.3 $";
-+static char *main_revision = "$Revision: 1.3 $";
+-static char *main_revision = "$Revision: 1.4 $";
++static char *main_revision = "$Revision: 1.4 $";
static int major;
--- linux-2.6.0-test6/drivers/isdn/hardware/eicon/divasmain.c 2003-09-27 18:57:44.000000000 -0700
+++ 25/drivers/isdn/hardware/eicon/divasmain.c 2003-10-05 00:33:24.000000000 -0700
@@ -1,4 +1,4 @@
--/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $
-+/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $
+-/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $
++/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $
*
* Low level driver for Eicon DIVA Server ISDN cards.
*
#include "diva_dma.h"
#include "diva_pci.h"
--static char *main_revision = "$Revision: 1.3 $";
-+static char *main_revision = "$Revision: 1.3 $";
+-static char *main_revision = "$Revision: 1.4 $";
++static char *main_revision = "$Revision: 1.4 $";
static int major;
--- linux-2.6.0-test6/drivers/isdn/hardware/eicon/dqueue.c 2003-06-14 12:18:22.000000000 -0700
+++ 25/drivers/isdn/hardware/eicon/dqueue.c 2003-10-05 00:33:24.000000000 -0700
@@ -1,10 +1,10 @@
--/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $
-+/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $
+-/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $
++/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $
*
* Driver for Eicon DIVA Server ISDN cards.
* User Mode IDI Interface
--- linux-2.6.0-test6/drivers/isdn/hardware/eicon/mntfunc.c 2003-09-27 18:57:44.000000000 -0700
+++ 25/drivers/isdn/hardware/eicon/mntfunc.c 2003-10-05 00:33:24.000000000 -0700
@@ -1,4 +1,4 @@
--/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $
-+/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $
+-/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $
++/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $
*
* Driver for Eicon DIVA Server ISDN cards.
* Maint module
--- linux-2.6.0-test6/drivers/isdn/hardware/eicon/os_capi.h 2003-06-14 12:18:25.000000000 -0700
+++ 25/drivers/isdn/hardware/eicon/os_capi.h 2003-10-05 00:33:24.000000000 -0700
@@ -1,10 +1,10 @@
--/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $
-+/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $
+-/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $
++/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $
*
* ISDN interface module for Eicon active cards DIVA.
* CAPI Interface OS include files
--- linux-2.6.0-test6/drivers/isdn/hardware/eicon/platform.h 2003-09-27 18:57:44.000000000 -0700
+++ 25/drivers/isdn/hardware/eicon/platform.h 2003-10-05 00:33:24.000000000 -0700
@@ -1,4 +1,4 @@
--/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $
-+/* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $
+-/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $
++/* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $
*
* platform.h
*
+++ 25/drivers/media/video/planb.c 2003-10-05 00:33:24.000000000 -0700
@@ -27,7 +27,6 @@
- /* $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $ */
+ /* $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $ */
-#include <linux/version.h>
#include <linux/init.h>
--- linux-2.6.0-test6/drivers/mtd/chips/map_rom.c 2003-06-14 12:18:24.000000000 -0700
+++ 25/drivers/mtd/chips/map_rom.c 2003-10-05 00:33:24.000000000 -0700
@@ -4,7 +4,6 @@
- * $Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $
+ * $Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $
*/
-#include <linux/version.h>
#include <linux/hdlc.h>
/* Version */
--static const char version[] = "$Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $ for Linux\n";
-+static const char version[] = "$Id: 2.6.0-test6-mm4.patch,v 1.3 2003/12/03 05:12:20 phil Exp $ for Linux\n";
+-static const char version[] = "$Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $ for Linux\n";
++static const char version[] = "$Id: 2.6.0-test6-mm4.patch,v 1.4 2004/02/14 03:14:33 rread Exp $ for Linux\n";
static int debug;
static int quartz;
-$Id: bproc-patch-2.4.20,v 1.3 2003/12/03 05:12:25 phil Exp $
+$Id: bproc-patch-2.4.20,v 1.4 2004/02/14 03:14:37 rread Exp $
Index: linux/fs/exec.c
===================================================================
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
-+ * $Id: bproc-patch-2.4.20,v 1.3 2003/12/03 05:12:25 phil Exp $
++ * $Id: bproc-patch-2.4.20,v 1.4 2004/02/14 03:14:37 rread Exp $
+ *-----------------------------------------------------------------------*/
+#include <linux/kernel.h>
+#include <linux/sched.h>
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
-+ * $Id: bproc-patch-2.4.20,v 1.3 2003/12/03 05:12:25 phil Exp $
++ * $Id: bproc-patch-2.4.20,v 1.4 2004/02/14 03:14:37 rread Exp $
+ *-----------------------------------------------------------------------*/
+#ifndef _LINUX_BPROC_H
+#define _LINUX_BPROC_H
Index: linux-2.4.20/fs/ext3/xattr.c
===================================================================
---- linux-2.4.20.orig/fs/ext3/xattr.c 2003-11-13 17:14:52.000000000 +0300
-+++ linux-2.4.20/fs/ext3/xattr.c 2003-11-21 16:43:48.000000000 +0300
+--- linux-2.4.20.orig/fs/ext3/xattr.c 2003-11-13 10:59:33.000000000 +0800
++++ linux-2.4.20/fs/ext3/xattr.c 2003-11-25 21:16:51.000000000 +0800
@@ -1293,9 +1293,10 @@
goto cleanup;
memcpy(header, HDR(bh), bh->b_size);
lib_LIBRARIES = libldlm.a
libldlm_a_SOURCES = l_lock.c ldlm_lock.c ldlm_resource.c ldlm_lib.c \
ldlm_plain.c ldlm_extent.c ldlm_request.c ldlm_lockd.c ldlm_internal.h
+libldlm_a_CFLAGS = -fPIC
endif
include $(top_srcdir)/Rules
export = req->rq_export = class_conn2export(&conn);
LASSERT(export != NULL);
+ /* request from liblustre? */
+ if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT)
+ export->exp_libclient = 1;
+
if (export->exp_connection != NULL)
ptlrpc_put_connection(export->exp_connection);
export->exp_connection = ptlrpc_get_connection(&req->rq_peer,
int recovery_done = 0;
int rc2;
+ LASSERT ((rc == 0) == (req->rq_reply_state != NULL));
+
if (rc) {
/* Just like ptlrpc_error, but without the sending. */
rc = lustre_pack_reply(req, 0, NULL, NULL);
req->rq_type = PTL_RPC_MSG_ERR;
}
+ LASSERT (!req->rq_reply_state->rs_difficult);
LASSERT(list_empty(&req->rq_list));
/* XXX a bit like the request-dup code in queue_recovery_request */
OBD_ALLOC(saved_req, sizeof *saved_req);
LBUG();
memcpy(saved_req, req, sizeof *saved_req);
memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen);
+ /* the copied req takes over the reply state */
+ req->rq_reply_state = NULL;
req = saved_req;
req->rq_reqmsg = reqmsg;
class_export_get(req->rq_export);
return 1;
}
-static void ptlrpc_abort_reply (struct ptlrpc_request *req)
-{
- /* On return, we must be sure that the ACK callback has either
- * happened or will not happen. Note that the SENT callback will
- * happen come what may since we successfully posted the PUT. */
- int rc;
- struct l_wait_info lwi;
- unsigned long flags;
-
- again:
- /* serialise with ACK callback */
- spin_lock_irqsave (&req->rq_lock, flags);
- if (!req->rq_want_ack) {
- spin_unlock_irqrestore (&req->rq_lock, flags);
- /* The ACK callback has happened already. Although the
- * SENT callback might still be outstanding (yes really) we
- * don't care; this is just like normal completion. */
- return;
- }
- spin_unlock_irqrestore (&req->rq_lock, flags);
-
- /* Have a bash at unlinking the MD. This will fail until the SENT
- * callback has happened since the MD is busy from the PUT. If the
- * ACK still hasn't arrived after then, a successful unlink will
- * ensure the ACK callback never happens. */
- rc = PtlMDUnlink (req->rq_reply_md_h);
- switch (rc) {
- default:
- LBUG ();
- case PTL_OK:
- /* SENT callback happened; ACK callback preempted */
- LASSERT (req->rq_want_ack);
- spin_lock_irqsave (&req->rq_lock, flags);
- req->rq_want_ack = 0;
- spin_unlock_irqrestore (&req->rq_lock, flags);
- return;
- case PTL_INV_MD:
- return;
- case PTL_MD_INUSE:
- /* Still sending or ACK callback in progress: wait until
- * either callback has completed and try again.
- * Actually we can't wait for the SENT callback because
- * there's no state the SENT callback can touch that will
- * allow it to communicate with us! So we just wait here
- * for a short time, effectively polling for the SENT
- * callback by calling PtlMDUnlink() again, to see if it
- * has finished. Note that if the ACK does arrive, its
- * callback wakes us in short order. --eeb */
- lwi = LWI_TIMEOUT (HZ/4, NULL, NULL);
- rc = l_wait_event(req->rq_reply_waitq, !req->rq_want_ack,
- &lwi);
- CDEBUG (D_HA, "Retrying req %p: %d\n", req, rc);
- /* NB go back and test rq_want_ack with locking, to ensure
- * if ACK callback happened, it has completed stopped
- * referencing this req. */
- goto again;
- }
-}
-
-void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id)
+int
+target_send_reply_msg (struct ptlrpc_request *req, int rc, int fail_id)
{
- int i;
- int netrc;
- unsigned long flags;
- struct ptlrpc_req_ack_lock *ack_lock;
- struct l_wait_info lwi = { 0 };
- wait_queue_t commit_wait;
- struct obd_device *obd =
- req->rq_export ? req->rq_export->exp_obd : NULL;
- struct obd_export *exp = NULL;
-
- if (req->rq_export) {
- for (i = 0; i < REQ_MAX_ACK_LOCKS; i++) {
- if (req->rq_ack_locks[i].mode) {
- exp = req->rq_export;
- break;
+ if (OBD_FAIL_CHECK(fail_id | OBD_FAIL_ONCE)) {
+ obd_fail_loc |= OBD_FAIL_ONCE | OBD_FAILED;
+ DEBUG_REQ(D_ERROR, req, "dropping reply");
+ /* NB this does _not_ send with ACK disabled, to simulate
+ * sending OK, but timing out for the ACK */
+ if (req->rq_reply_state != NULL) {
+ if (!req->rq_reply_state->rs_difficult) {
+ lustre_free_reply_state (req->rq_reply_state);
+ req->rq_reply_state = NULL;
+ } else {
+ struct ptlrpc_service *svc =
+ req->rq_rqbd->rqbd_srv_ni->sni_service;
+ atomic_inc(&svc->srv_outstanding_replies);
}
}
+ return (-ECOMM);
}
- if (exp) {
- exp->exp_outstanding_reply = req;
- spin_lock_irqsave (&req->rq_lock, flags);
- req->rq_want_ack = 1;
- spin_unlock_irqrestore (&req->rq_lock, flags);
- }
-
- if (!OBD_FAIL_CHECK(fail_id | OBD_FAIL_ONCE)) {
- if (rc == 0) {
- DEBUG_REQ(D_NET, req, "sending reply");
- netrc = ptlrpc_reply(req);
- } else if (rc == -ENOTCONN) {
- DEBUG_REQ(D_HA, req, "processing error (%d)", rc);
- netrc = ptlrpc_error(req);
- } else {
- DEBUG_REQ(D_ERROR, req, "processing error (%d)", rc);
- netrc = ptlrpc_error(req);
+ if (rc) {
+ DEBUG_REQ(D_ERROR, req, "processing error (%d)", rc);
+ if (req->rq_reply_state == NULL) {
+ rc = lustre_pack_reply (req, 0, NULL, NULL);
+ if (rc != 0) {
+ CERROR ("can't allocate reply\n");
+ return (rc);
+ }
}
+ req->rq_type = PTL_RPC_MSG_ERR;
} else {
- obd_fail_loc |= OBD_FAIL_ONCE | OBD_FAILED;
- DEBUG_REQ(D_ERROR, req, "dropping reply");
- if (req->rq_repmsg) {
- OBD_FREE(req->rq_repmsg, req->rq_replen);
- req->rq_repmsg = NULL;
- }
- init_waitqueue_head(&req->rq_reply_waitq);
- netrc = 0;
+ DEBUG_REQ(D_NET, req, "sending reply");
}
+
+ return (ptlrpc_send_reply(req, 1));
+}
- /* a failed send simulates the callbacks */
- LASSERT(netrc == 0 || req->rq_want_ack == 0);
- if (exp == NULL) {
- LASSERT(req->rq_want_ack == 0);
+void
+target_send_reply(struct ptlrpc_request *req, int rc, int fail_id)
+{
+ int netrc;
+ unsigned long flags;
+ struct ptlrpc_reply_state *rs;
+ struct obd_device *obd;
+ struct obd_export *exp;
+ struct ptlrpc_srv_ni *sni;
+ struct ptlrpc_service *svc;
+
+ sni = req->rq_rqbd->rqbd_srv_ni;
+ svc = sni->sni_service;
+
+ rs = req->rq_reply_state;
+ if (rs == NULL || !rs->rs_difficult) {
+ /* The easy case; no notifiers and reply_out_callback()
+ * cleans up (i.e. we can't look inside rs after a
+ * successful send) */
+ netrc = target_send_reply_msg (req, rc, fail_id);
+
+ LASSERT (netrc == 0 || req->rq_reply_state == NULL);
return;
}
- LASSERT(obd != NULL);
-
- init_waitqueue_entry(&commit_wait, current);
- add_wait_queue(&obd->obd_commit_waitq, &commit_wait);
- rc = l_wait_event(req->rq_reply_waitq,
- !req->rq_want_ack || req->rq_resent ||
- req->rq_transno <= obd->obd_last_committed, &lwi);
- remove_wait_queue(&obd->obd_commit_waitq, &commit_wait);
-
- spin_lock_irqsave (&req->rq_lock, flags);
- /* If we got here because the ACK callback ran, this acts as a
- * barrier to ensure the callback completed the wakeup. */
- spin_unlock_irqrestore (&req->rq_lock, flags);
-
- /* If we committed the transno already, then we might wake up before
- * the ack arrives. We need to stop waiting for the ack before we can
- * reuse this request structure. We are guaranteed by this point that
- * this cannot abort the sending of the actual reply.*/
- ptlrpc_abort_reply(req);
-
- if (req->rq_resent) {
- DEBUG_REQ(D_HA, req, "resent: not cancelling locks");
- return;
+
+ /* must be an export if locks saved */
+ LASSERT (req->rq_export != NULL);
+ /* req/reply consistent */
+ LASSERT (rs->rs_srv_ni == sni);
+
+ /* "fresh" reply */
+ LASSERT (!rs->rs_scheduled);
+ LASSERT (!rs->rs_scheduled_ever);
+ LASSERT (!rs->rs_handled);
+ LASSERT (!rs->rs_on_net);
+ LASSERT (rs->rs_export == NULL);
+ LASSERT (list_empty(&rs->rs_obd_list));
+ LASSERT (list_empty(&rs->rs_exp_list));
+
+ exp = class_export_get (req->rq_export);
+ obd = exp->exp_obd;
+
+ /* disable reply scheduling onto srv_reply_queue while I'm setting up */
+ rs->rs_scheduled = 1;
+ rs->rs_on_net = 1;
+ rs->rs_xid = req->rq_xid;
+ rs->rs_transno = req->rq_transno;
+ rs->rs_export = exp;
+
+ spin_lock_irqsave (&obd->obd_uncommitted_replies_lock, flags);
+
+ if (rs->rs_transno > obd->obd_last_committed) {
+ /* not committed already */
+ list_add_tail (&rs->rs_obd_list,
+ &obd->obd_uncommitted_replies);
}
- LASSERT(rc == 0);
- DEBUG_REQ(D_HA, req, "cancelling locks for %s",
- req->rq_want_ack ? "commit" : "ack");
+ spin_unlock (&obd->obd_uncommitted_replies_lock);
+ spin_lock (&exp->exp_lock);
- exp->exp_outstanding_reply = NULL;
+ list_add_tail (&rs->rs_exp_list, &exp->exp_outstanding_replies);
- for (ack_lock = req->rq_ack_locks, i = 0;
- i < REQ_MAX_ACK_LOCKS; i++, ack_lock++) {
- if (!ack_lock->mode)
- continue;
- ldlm_lock_decref(&ack_lock->lock, ack_lock->mode);
+ spin_unlock_irqrestore (&exp->exp_lock, flags);
+
+ netrc = target_send_reply_msg (req, rc, fail_id);
+
+ spin_lock_irqsave (&svc->srv_lock, flags);
+
+ svc->srv_n_difficult_replies++;
+
+ if (netrc != 0) /* error sending: reply is off the net */
+ rs->rs_on_net = 0;
+
+ if (!rs->rs_on_net || /* some notifier */
+ list_empty(&rs->rs_exp_list) || /* completed already */
+ list_empty(&rs->rs_obd_list)) {
+ list_add_tail (&rs->rs_list, &svc->srv_reply_queue);
+ wake_up (&svc->srv_waitq);
+ } else {
+ list_add (&rs->rs_list, &sni->sni_active_replies);
+ rs->rs_scheduled = 0; /* allow notifier to schedule */
}
+
+ spin_unlock_irqrestore (&svc->srv_lock, flags);
}
int target_handle_ping(struct ptlrpc_request *req)
{
return lustre_pack_reply(req, 0, NULL, NULL);
}
-
-void *ldlm_put_lock_into_req(struct ptlrpc_request *req,
- struct lustre_handle *lock, int mode)
-{
- int i;
-
- for (i = 0; i < REQ_MAX_ACK_LOCKS; i++) {
- if (req->rq_ack_locks[i].mode)
- continue;
- CDEBUG(D_HA, "saving lock "LPX64" in req %p ack_lock[%d]\n",
- lock->cookie, req, i);
- memcpy(&req->rq_ack_locks[i].lock, lock, sizeof(*lock));
- req->rq_ack_locks[i].mode = mode;
- return &req->rq_ack_locks[i];
- }
- CERROR("no space for lock in struct ptlrpc_request\n");
- LBUG();
- return NULL;
-}
void ldlm_reprocess_all_ns(struct ldlm_namespace *ns)
{
- (void)ldlm_namespace_foreach_res(ns, reprocess_one_queue, NULL);
+ int i, rc;
+
+ l_lock(&ns->ns_lock);
+ for (i = 0; i < RES_HASH_SIZE; i++) {
+ struct list_head *tmp, *next;
+ list_for_each_safe(tmp, next, &(ns->ns_hash[i])) {
+ struct ldlm_resource *res =
+ list_entry(tmp, struct ldlm_resource, lr_hash);
+
+ ldlm_resource_getref(res);
+ l_unlock(&ns->ns_lock);
+ rc = reprocess_one_queue(res, NULL);
+ l_lock(&ns->ns_lock);
+ next = tmp->next;
+ ldlm_resource_putref(res);
+ if (rc == LDLM_ITER_STOP)
+ GOTO(out, rc);
+ }
+ }
+ out:
+ l_unlock(&ns->ns_lock);
+ EXIT;
}
void ldlm_reprocess_all(struct ldlm_resource *res)
req->rq_timeout = 2; /* 2 second timeout for initial AST reply */
rc = ptlrpc_queue_wait(req);
if (rc == -ETIMEDOUT || rc == -EINTR) {
-#ifdef __KERNEL__
- ldlm_del_waiting_lock(lock);
- ldlm_failed_ast(lock, rc, "blocking");
-#else
- /* XXX
- * Here we treat all clients as liblustre. When BLOCKING AST
- * timeout we don't evicting the client and only cancel
- * the lock.
- * restore to orignial implementation later!!!
- * XXX
- */
- CERROR("BLOCKING AST to client (nid "LPU64") timeout, "
- "simply cancel lock 0x%p\n",
- req->rq_peer.peer_nid, lock);
- ldlm_lock_cancel(lock);
- rc = -ERESTART;
-#endif
+ LASSERT(lock->l_export);
+ if (lock->l_export->exp_libclient) {
+ CDEBUG(D_HA, "BLOCKING AST to liblustre client (nid "
+ LPU64") timeout, simply cancel lock 0x%p\n",
+ req->rq_peer.peer_nid, lock);
+ ldlm_lock_cancel(lock);
+ rc = -ERESTART;
+ } else {
+ ldlm_del_waiting_lock(lock);
+ ldlm_failed_ast(lock, rc, "blocking");
+ }
} else if (rc) {
if (rc == -EINVAL)
CDEBUG(D_DLMTRACE, "client (nid "LPU64") returned %d "
#endif
ldlm->ldlm_cb_service =
- ptlrpc_init_svc(LDLM_NEVENTS, LDLM_NBUFS, LDLM_BUFSIZE,
- LDLM_MAXREQSIZE, LDLM_CB_REQUEST_PORTAL,
- LDLM_CB_REPLY_PORTAL,
+ ptlrpc_init_svc(LDLM_NBUFS, LDLM_BUFSIZE, LDLM_MAXREQSIZE,
+ LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL,
ldlm_callback_handler, "ldlm_cbd",
ldlm_svc_proc_dir);
}
ldlm->ldlm_cancel_service =
- ptlrpc_init_svc(LDLM_NEVENTS, LDLM_NBUFS, LDLM_BUFSIZE,
- LDLM_MAXREQSIZE, LDLM_CANCEL_REQUEST_PORTAL,
+ ptlrpc_init_svc(LDLM_NBUFS, LDLM_BUFSIZE, LDLM_MAXREQSIZE,
+ LDLM_CANCEL_REQUEST_PORTAL,
LDLM_CANCEL_REPLY_PORTAL,
ldlm_cancel_handler, "ldlm_canceld",
ldlm_svc_proc_dir);
EXPORT_SYMBOL(target_handle_ping);
EXPORT_SYMBOL(target_handle_disconnect);
EXPORT_SYMBOL(target_queue_final_reply);
-EXPORT_SYMBOL(ldlm_put_lock_into_req);
## Liblustre excecutables & libraries Makefile
DEFS=
+SUBDIRS = . tests
+
CFLAGS := -g -Wall -I$(top_srcdir)/utils -I$(top_srcdir)/portals/include \
- -I$(top_srcdir)/portals/unals -I$(SYSIO)/include \
- -I/opt/lam/include -L/opt/lam/lib
+ -I$(top_srcdir)/portals/unals -I$(SYSIO)/include
-KFLAGS:=
CPPFLAGS = $(HAVE_EFENCE) -D_LARGEFILE64_SOURCE=1
LIBS = $(LIBEFENCE)
-## lustre components libs
-LLIBS := ./libllite.a \
- ../lov/liblov.a \
- ../obdecho/libobdecho.a \
- ../osc/libosc.a \
- ../mdc/libmdc.a \
- ../ldlm/libldlm.a \
- ../ptlrpc/libptlrpc.a \
- ../obdclass/liblustreclass.a \
- ../lvfs/liblvfs.a
-
-## portals components libs
-PTLLIBS := ../portals/utils/libptlctl.a \
- ../portals/unals/libtcpnal.a \
- ../portals/portals/libportals.a
-
-## sysio components libs
-SYSIOLIBS := $(SYSIO)/drivers/native/libsysio_native.a \
+LUSTRE_LIBS = libllite.a \
+ $(top_srcdir)/lov/liblov.a \
+ $(top_srcdir)/obdecho/libobdecho.a \
+ $(top_srcdir)/osc/libosc.a \
+ $(top_srcdir)/mdc/libmdc.a \
+ $(top_srcdir)/ptlrpc/libptlrpc.a \
+ $(top_srcdir)/obdclass/liblustreclass.a \
+ $(top_srcdir)/lvfs/liblvfs.a
+
+PTL_LIBS = $(top_srcdir)/portals/utils/libuptlctl.a \
+ $(top_srcdir)/portals/unals/libtcpnal.a \
+ $(top_srcdir)/portals/portals/libportals.a
+
+SYSIO_LIBS = $(SYSIO)/drivers/native/libsysio_native.a \
$(SYSIO)/drivers/sockets/libsysio_sockets.a \
$(SYSIO)/src/libsysio.a \
$(SYSIO)/dev/stdfd/libsysio_stdfd.a
-LLIB_EXEC= $(PTLLIBS) $(SYSIOLIBS) -lpthread
+#SYSIO_LIBS = $(SYSIO)/lib/libsysio.a
-lib_LIBRARIES =
-noinst_LIBRARIES = libllite.a libtestcommon.a
-libllite_a_SOURCES = llite_lib.c super.c namei.c rw.c file.c
-libtestcommon_a_SOURCES = test_common.c
+lib_LIBRARIES = liblustre.a
+noinst_LIBRARIES = libllite.a
-bin_PROGRAMS = libtest lltest recovery_small replay_single #test_lock_cancel
+libllite_a_SOURCES = llite_lib.c super.c namei.c rw.c file.c dir.c
+libllite_a_CFLAGS = -fPIC
-libtest_SOURCES = libtest.c ../utils/parser.c ../utils/obd.c ../utils/lustre_cfg.c
-libtest_LDADD := $(LLIBS) $(PTLLIBS) \
- $(LIBREADLINE) -lpthread
+# for make rpms -- need cleanup
+liblustre_a_SOURCES = llite_lib.c super.c namei.c rw.c file.c dir.c
+liblustre_a_CFLAGS = -fPIC
-liblustre.a : libllite.a
+liblustre.a : $(LUSTRE_LIBS) $(PTL_LIBS) $(SYSIO_LIBS)
$(shell ./genlib.sh $(SYSIO) $(AR) $(LINK))
-lltest_SOURCES = lltest.c
-lltest_LDADD := ./libtestcommon.a $(LLIBS) $(LLIB_EXEC) $(LIBREADLINE)
-
-recovery_small_SOURCES = recovery_small.c
-recovery_small_LDADD := ./libtestcommon.a $(LLIBS) $(LLIB_EXEC) $(LIBREADLINE)
-
-replay_single_SOURCES = replay_single.c
-replay_single_LDADD := ./libtestcommon.a $(LLIBS) $(LLIB_EXEC) $(LIBREADLINE)
-
-#test_lock_cancel_SOURCES = test_lock_cancel.c
-#test_lock_cancel_LDADD := $(LLIBS) $(LLIB_EXEC) -lmpi -llam
-
include $(top_srcdir)/Rules
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Lustre Light Super operations
+ *
+ * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/fcntl.h>
+#include <sys/queue.h>
+
+#include <sysio.h>
+#include <fs.h>
+#include <mount.h>
+#include <inode.h>
+#include <file.h>
+
+#undef LIST_HEAD
+
+#include <linux/types.h>
+#include <linux/dirent.h>
+#include <linux/unistd.h>
+
+#include "llite_lib.h"
+
+static int llu_dir_do_readpage(struct inode *inode, struct page *page)
+{
+ struct llu_inode_info *lli = llu_i2info(inode);
+ struct llu_sb_info *sbi = llu_i2sbi(inode);
+ struct ll_fid mdc_fid;
+ __u64 offset;
+ int rc = 0;
+ struct ptlrpc_request *request;
+ struct lustre_handle lockh;
+ struct mds_body *body;
+ struct lookup_intent it = { .it_op = IT_READDIR };
+ struct mdc_op_data data;
+ struct obd_device *obddev = class_exp2obd(sbi->ll_mdc_exp);
+ struct ldlm_res_id res_id =
+ { .name = {lli->lli_st_ino, (__u64)lli->lli_st_generation} };
+ ENTRY;
+
+ if ((lli->lli_st_size + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT <= page->index) {
+ /* XXX why do we need this exactly, and why do we think that
+ * an all-zero directory page is useful?
+ */
+ CERROR("memsetting dir page %lu to zero (size %lld)\n",
+ page->index, lli->lli_st_size);
+ memset(page->addr, 0, PAGE_CACHE_SIZE);
+ GOTO(readpage_out, rc);
+ }
+
+ rc = ldlm_lock_match(obddev->obd_namespace, LDLM_FL_BLOCK_GRANTED,
+ &res_id, LDLM_PLAIN, NULL, 0, LCK_PR, &lockh);
+ if (!rc) {
+ llu_prepare_mdc_op_data(&data, inode, NULL, NULL, 0, 0);
+
+ rc = mdc_enqueue(sbi->ll_mdc_exp, LDLM_PLAIN, &it, LCK_PR,
+ &data, &lockh, NULL, 0,
+ ldlm_completion_ast, llu_mdc_blocking_ast,
+ inode);
+ request = (struct ptlrpc_request *)it.d.lustre.it_data;
+ if (request)
+ ptlrpc_req_finished(request);
+ if (rc < 0) {
+ CERROR("lock enqueue: err: %d\n", rc);
+ RETURN(rc);
+ }
+ }
+ ldlm_lock_dump_handle(D_OTHER, &lockh);
+
+ mdc_pack_fid(&mdc_fid, lli->lli_st_ino, lli->lli_st_generation, S_IFDIR);
+
+ offset = page->index << PAGE_SHIFT;
+ rc = mdc_readpage(sbi->ll_mdc_exp, &mdc_fid,
+ offset, page, &request);
+ if (!rc) {
+ body = lustre_msg_buf(request->rq_repmsg, 0, sizeof (*body));
+ LASSERT (body != NULL); /* checked by mdc_readpage() */
+ LASSERT_REPSWABBED (request, 0); /* swabbed by mdc_readpage() */
+
+ lli->lli_st_size = body->size;
+ }
+ ptlrpc_req_finished(request);
+ EXIT;
+
+ readpage_out:
+ ldlm_lock_decref(&lockh, LCK_PR);
+ return rc;
+}
+
+static struct page *llu_dir_read_page(struct inode *ino, int pgidx)
+{
+ struct page *page;
+ int rc;
+ ENTRY;
+
+ page = alloc_page(0);
+ if (!page) {
+ CERROR("alloc page failed\n");
+ RETURN(ERR_PTR(-ENOMEM));
+ }
+ page->index = pgidx;
+
+ rc = llu_dir_do_readpage(ino, page);
+ if (rc) {
+ free_page(page);
+ RETURN(ERR_PTR(rc));
+ }
+
+ return page;
+}
+
+#define NAME_OFFSET(de) ((int) ((de)->d_name - (char *) (de)))
+#define ROUND_UP64(x) (((x)+sizeof(__u64)-1) & ~(sizeof(__u64)-1))
+
+static int filldir(char *buf, int buflen,
+ const char *name, int namelen, loff_t offset,
+ ino_t ino, unsigned int d_type, int *filled)
+{
+ struct dirent64 *dirent = (struct dirent64 *) (buf + *filled);
+ int reclen = ROUND_UP64(NAME_OFFSET(dirent) + namelen + 1);
+
+ /* check overflow */
+ if ((*filled + reclen) > buflen)
+ return 1;
+
+ dirent->d_ino = ino;
+ dirent->d_off = offset,
+ dirent->d_reclen = reclen;
+ dirent->d_type = (unsigned short) d_type;
+ memcpy(dirent->d_name, name, namelen);
+ dirent->d_name[namelen] = 0;
+
+ *filled += reclen;
+
+ return 0;
+}
+
+ssize_t llu_iop_getdirentries(struct inode *ino, char *buf, size_t nbytes,
+ _SYSIO_OFF_T *basep)
+{
+ struct llu_inode_info *lli = llu_i2info(ino);
+ loff_t pos = *basep, offset;
+ int maxpages, pgidx, filled = 0;
+ ENTRY;
+
+ if (pos == -1)
+ pos = lli->lli_dir_pos;
+
+ maxpages = lli->lli_st_size >> PAGE_CACHE_SHIFT;
+ pgidx = pos >> PAGE_CACHE_SHIFT;
+ offset = pos & ~PAGE_CACHE_MASK;
+
+ for ( ; pgidx < maxpages ; pgidx++, offset = 0) {
+ struct page *page;
+ struct ext2_dirent *de;
+ char *addr, *limit;
+
+ page = llu_dir_read_page(ino, pgidx);
+ if (IS_ERR(page))
+ continue;
+
+ /* size might have been updated by mdc_readpage */
+ maxpages = lli->lli_st_size >> PAGE_CACHE_SHIFT;
+
+ /* fill in buffer */
+ addr = page->addr;
+ limit = addr + PAGE_CACHE_SIZE - EXT2_DIR_REC_LEN(1);
+ de = (struct ext2_dirent *) (addr + offset);
+
+ for ( ; (char*) de <= limit; de = ext2_next_entry(de)) {
+ if (de->inode) {
+ int over;
+ unsigned char d_type = 0;
+
+ /* XXX handle type, etc here */
+
+ offset = (char*) de - addr;
+ over = filldir(buf, nbytes, de->name, de->name_len,
+ (pgidx << PAGE_CACHE_SHIFT) | offset,
+ le32_to_cpu(de->inode), d_type, &filled);
+ if (over) {
+ free_page(page);
+ GOTO(done, 0);
+ }
+ }
+ }
+
+ free_page(page);
+ }
+done:
+ lli->lli_dir_pos = pgidx << PAGE_CACHE_SHIFT | offset;
+ *basep = lli->lli_dir_pos;
+ RETURN(filled);
+}
#include <inode.h>
#include <file.h>
+#undef LIST_HEAD
+
#include "llite_lib.h"
void llu_prepare_mdc_op_data(struct mdc_op_data *data,
lli->lli_st_blocks = src->o_blocks;
}
-#if 0
-static int llu_create_obj(struct lustre_handle *conn, struct inode *inode,
- struct lov_stripe_md *lsm)
-{
- struct ptlrpc_request *req = NULL;
- struct llu_inode_info *lli = llu_i2info(inode);
- struct lov_mds_md *lmm = NULL;
- struct obdo *oa;
- struct iattr iattr;
- struct mdc_op_data op_data;
- struct obd_trans_info oti = { 0 };
- int rc, err, lmm_size = 0;;
- ENTRY;
-
- oa = obdo_alloc();
- if (!oa)
- RETURN(-ENOMEM);
-
- LASSERT(S_ISREG(inode->i_mode));
- oa->o_mode = S_IFREG | 0600;
- oa->o_id = lli->lli_st_ino;
- oa->o_generation = lli->lli_st_generation;
- /* Keep these 0 for now, because chown/chgrp does not change the
- * ownership on the OST, and we don't want to allow BA OST NFS
- * users to access these objects by mistake.
- */
- oa->o_uid = 0;
- oa->o_gid = 0;
- oa->o_valid = OBD_MD_FLID | OBD_MD_FLGENER | OBD_MD_FLTYPE |
- OBD_MD_FLMODE | OBD_MD_FLUID | OBD_MD_FLGID;
-
- obdo_from_inode(oa, inode, OBD_MD_FLTYPE|OBD_MD_FLATIME|OBD_MD_FLMTIME|
- OBD_MD_FLCTIME |
- (llu_i2info(inode)->lli_st_size ? OBD_MD_FLSIZE : 0));
-
- rc = obd_create(conn, oa, &lsm, &oti);
- if (rc) {
- CERROR("error creating objects for inode %lu: rc = %d\n",
- lli->lli_st_ino, rc);
- if (rc > 0) {
- CERROR("obd_create returned invalid rc %d\n", rc);
- rc = -EIO;
- }
- GOTO(out_oa, rc);
- }
- obdo_refresh_inode(inode, oa, OBD_MD_FLBLKSZ);
-
- LASSERT(lsm && lsm->lsm_object_id);
- rc = obd_packmd(conn, &lmm, lsm);
- if (rc < 0)
- GOTO(out_destroy, rc);
-
- lmm_size = rc;
-
- /* Save the stripe MD with this file on the MDS */
- memset(&iattr, 0, sizeof(iattr));
- iattr.ia_valid = ATTR_FROM_OPEN;
-
- llu_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0);
-
- rc = mdc_setattr(&llu_i2sbi(inode)->ll_mdc_conn, &op_data,
- &iattr, lmm, lmm_size, oti.oti_logcookies,
- oti.oti_numcookies * sizeof(oti.oti_onecookie), &req);
- ptlrpc_req_finished(req);
-
- obd_free_diskmd(conn, &lmm);
-
- /* If we couldn't complete mdc_open() and store the stripe MD on the
- * MDS, we need to destroy the objects now or they will be leaked.
- */
- if (rc) {
- CERROR("error: storing stripe MD for %lu: rc %d\n",
- lli->lli_st_ino, rc);
- GOTO(out_destroy, rc);
- }
- lli->lli_smd = lsm;
- lli->lli_maxbytes = lsm->lsm_maxbytes;
-
- EXIT;
-out_oa:
- oti_free_cookies(&oti);
- obdo_free(oa);
- return rc;
-
-out_destroy:
- oa->o_id = lsm->lsm_object_id;
- oa->o_valid = OBD_MD_FLID;
- obdo_from_inode(oa, inode, OBD_MD_FLTYPE);
-
- err = obd_destroy(conn, oa, lsm, NULL);
- obd_free_memmd(conn, &lsm);
- if (err) {
- CERROR("error uncreating inode %lu objects: rc %d\n",
- lli->lli_st_ino, err);
- }
- goto out_oa;
-}
-#endif
-
static int llu_local_open(struct llu_inode_info *lli, struct lookup_intent *it)
{
struct ptlrpc_request *req = it->d.lustre.it_data;
* ll_mdc_close, so don't even try right now. */
LASSERT(fd != NULL);
- memset(fd, 0, sizeof(*fd));
-
memcpy(&fd->fd_mds_och.och_fh, &body->handle, sizeof(body->handle));
fd->fd_mds_och.och_magic = OBD_CLIENT_HANDLE_MAGIC;
lli->lli_file_data = fd;
RETURN(0);
}
-#if 0
-static int llu_osc_open(struct lustre_handle *conn, struct inode *inode,
- struct lov_stripe_md *lsm)
-{
- struct ll_file_data *fd = llu_i2info(inode)->lli_file_data;
- struct obdo *oa;
- int rc;
- ENTRY;
-
- oa = obdo_alloc();
- if (!oa)
- RETURN(-ENOMEM);
- oa->o_id = lsm->lsm_object_id;
- oa->o_mode = S_IFREG;
- oa->o_valid = (OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLBLOCKS |
- OBD_MD_FLMTIME | OBD_MD_FLCTIME);
- rc = obd_open(conn, oa, lsm, NULL, &fd->fd_ost_och);
- if (rc)
- GOTO(out, rc);
-
- /* file->f_flags &= ~O_LOV_DELAY_CREATE; */
- obdo_to_inode(inode, oa, OBD_MD_FLBLOCKS | OBD_MD_FLMTIME |
- OBD_MD_FLCTIME);
-
- EXIT;
-out:
- obdo_free(oa);
- return rc;
-}
-#endif
-
-
int llu_iop_open(struct pnode *pnode, int flags, mode_t mode)
{
struct inode *inode = pnode->p_base->pb_ino;
int rc = 0;
ENTRY;
+ /* don't do anything for '/' */
+ if (llu_is_root_inode(inode))
+ RETURN(0);
+
CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu\n", lli->lli_st_ino);
LL_GET_INTENT(inode, it);
if (!it->d.lustre.it_disposition) {
-#if 0
- struct lookup_intent oit = { .it_op = IT_OPEN,
- .it_flags = file->f_flags };
- it = &oit;
- rc = ll_intent_file_open(file, NULL, 0, it);
- if (rc)
- GOTO(out_release, rc);
-#endif
- CERROR("fixme!!\n");
+ LBUG();
}
rc = it_open_error(DISP_OPEN_OPEN, it);
CDEBUG(D_INODE, "object creation was delayed\n");
GOTO(out_release, rc);
}
-#if 0
- if (!lli->lli_smd) {
- rc = llu_create_obj(conn, inode, NULL);
- if (rc)
- GOTO(out_close, rc);
- } else {
- CERROR("warning: stripe already set on ino %lu\n",
- lli->lli_st_ino);
- }
- lsm = lli->lli_smd;
-#endif
}
fd->fd_flags &= ~O_LOV_DELAY_CREATE;
CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%lu\n", lli->lli_st_ino,
lli->lli_st_generation);
- /* FIXME need add this check later. how to find the root pnode? */
-#if 0
- /* don't do anything for / */
- if (inode->i_sb->s_root == file->f_dentry)
- RETURN(0);
-#endif
+ /* XXX don't do anything for '/'. but how to find the root pnode? */
+
/* still opened by others? */
if (--lli->lli_open_count)
RETURN(0);
#!/bin/bash
+#set -xv
#
# This script is to generate lib lustre library as a whole. It will leave
AR=/usr/bin/ar
LD=/usr/bin/ld
+RANLIB=/usr/bin/ranlib
CWD=`pwd`
SYSIO=$1
+#if [ ! -f $SYSIO/lib/libsysio.a ]; then
+# echo "ERROR: $SYSIO/lib/libsysio.a doesn't exist"
+# exit 1
+#fi
+#
+# do cleanup at first
+#rm -f liblustre.so
+
ALL_OBJS=
build_obj_list() {
_objs=`$AR -t $1/$2`
for _lib in $_objs; do
- ALL_OBJS=$ALL_OBJS"$1/$_lib ";
+ ALL_OBJS=$ALL_OBJS"$1/$_lib ";
done;
}
+#
+# special treatment for libsysio
+#
+#sysio_tmp=$CWD/sysio_tmp_`date +%s`
+#build_sysio_obj_list() {
+# _objs=`$AR -t $1`
+# mkdir -p $sysio_tmp
+# $AR -x $1
+# mv $_objs $sysio_tmp
+# for _lib in $_objs; do
+# ALL_OBJS=$ALL_OBJS"$sysio_tmp/$_lib ";
+# done
+#}
+
# lustre components libs
build_obj_list . libllite.a
build_obj_list ../lov liblov.a
build_obj_list ../obdecho libobdecho.a
build_obj_list ../osc libosc.a
build_obj_list ../mdc libmdc.a
-build_obj_list ../ldlm libldlm.a
build_obj_list ../ptlrpc libptlrpc.a
build_obj_list ../obdclass liblustreclass.a
build_obj_list ../lvfs liblvfs.a
# portals components libs
-build_obj_list ../portals/utils libptlctl.a
+build_obj_list ../portals/utils libuptlctl.a
build_obj_list ../portals/unals libtcpnal.a
build_obj_list ../portals/portals libportals.a
+# create static lib lsupport
+rm -f $CWD/liblsupport.a
+$AR -cru $CWD/liblsupport.a $ALL_OBJS
+$RANLIB $CWD/liblsupport.a
+
# libsysio components libs
build_obj_list $SYSIO/drivers/native libsysio_native.a
build_obj_list $SYSIO/drivers/sockets libsysio_sockets.a
build_obj_list $SYSIO/src libsysio.a
build_obj_list $SYSIO/dev/stdfd libsysio_stdfd.a
+#
+#build_sysio_obj_list $SYSIO/lib/libsysio.a
+#
-
-# create static lib
+# create static lib lustre
rm -f $CWD/liblustre.a
-$AR -r $CWD/liblustre.a $ALL_OBJS
+$AR -cru $CWD/liblustre.a $ALL_OBJS
+$RANLIB $CWD/liblustre.a
-# create shared lib
+# create shared lib lustre
rm -f $CWD/liblustre.so
$LD -shared -o $CWD/liblustre.so -init __liblustre_setup_ -fini __liblustre_cleanup_ \
$ALL_OBJS -lpthread
+
+#rm -rf $sysio_tmp
+++ /dev/null
-#include <stdio.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <arpa/inet.h>
-
-#include <portals/api-support.h> /* needed for ptpctl.h */
-#include <portals/ptlctl.h> /* needed for parse_dump */
-
-
-#include <liblustre.h>
-#include <linux/obd.h>
-#include <linux/obd_class.h>
-#include <procbridge.h>
-
-#define LIBLUSTRE_TEST 1
-#include "../utils/lctl.c"
-
-struct ldlm_namespace;
-struct ldlm_res_id;
-struct obd_import;
-
-void *inter_module_get(char *arg)
-{
- if (!strcmp(arg, "tcpnal_ni"))
- return &tcpnal_ni;
- else if (!strcmp(arg, "ldlm_cli_cancel_unused"))
- return ldlm_cli_cancel_unused;
- else if (!strcmp(arg, "ldlm_namespace_cleanup"))
- return ldlm_namespace_cleanup;
- else if (!strcmp(arg, "ldlm_replay_locks"))
- return ldlm_replay_locks;
- else
- return NULL;
-}
-
-/* XXX move to proper place */
-char *portals_nid2str(int nal, ptl_nid_t nid, char *str)
-{
- switch(nal){
- case TCPNAL:
- /* userspace NAL */
- case SOCKNAL:
- sprintf(str, "%u:%d.%d.%d.%d", (__u32)(nid >> 32),
- HIPQUAD(nid));
- break;
- case QSWNAL:
- case GMNAL:
- case IBNAL:
- case SCIMACNAL:
- sprintf(str, "%u:%u", (__u32)(nid >> 32), (__u32)nid);
- break;
- default:
- return NULL;
- }
- return str;
-}
-
-ptl_handle_ni_t tcpnal_ni;
-
-struct pingcli_args {
- ptl_nid_t mynid;
- ptl_nid_t nid;
- ptl_pid_t port;
- int count;
- int size;
-};
-
-struct task_struct *current;
-
-struct obd_class_user_state ocus;
-
-/* portals interfaces */
-ptl_handle_ni_t *
-kportal_get_ni (int nal)
-{
- switch (nal)
- {
- case SOCKNAL:
- return &tcpnal_ni;
- default:
- return NULL;
- }
-}
-
-inline void
-kportal_put_ni (int nal)
-{
- return;
-}
-
-int
-kportal_nal_cmd(struct portals_cfg *pcfg)
-{
-#if 0
- __u32 nal = pcfg->pcfg_nal;
- int rc = -EINVAL;
-
- ENTRY;
-
- down(&nal_cmd_sem);
- if (nal > 0 && nal <= NAL_MAX_NR && nal_cmd[nal].nch_handler) {
- CDEBUG(D_IOCTL, "calling handler nal: %d, cmd: %d\n", nal,
- pcfg->pcfg_command);
- rc = nal_cmd[nal].nch_handler(pcfg, nal_cmd[nal].nch_private);
- }
- up(&nal_cmd_sem);
- RETURN(rc);
-#else
- CERROR("empty function!!!\n");
- return 0;
-#endif
-}
-
-int init_current(int argc, char **argv)
-{
- current = malloc(sizeof(*current));
- strncpy(current->comm, argv[0], sizeof(current->comm));
- current->pid = getpid();
- return 0;
-}
-
-ptl_nid_t tcpnal_mynid;
-
-int init_lib_portals()
-{
- int rc;
-
- PtlInit();
- rc = PtlNIInit(procbridge_interface, 0, 0, 0, &tcpnal_ni);
- if (rc != 0) {
- CERROR("ksocknal: PtlNIInit failed: error %d\n", rc);
- PtlFini();
- RETURN (rc);
- }
- PtlNIDebug(tcpnal_ni, ~0);
- return rc;
-}
-
-extern int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, unsigned long arg);
-
-
-int lib_ioctl_nalcmd(int dev_id, int opc, void * ptr)
-{
- struct portal_ioctl_data *ptldata;
-
- if (opc == IOC_PORTAL_NAL_CMD) {
- ptldata = (struct portal_ioctl_data *) ptr;
-
- if (ptldata->ioc_nal_cmd == NAL_CMD_REGISTER_MYNID) {
- tcpnal_mynid = ptldata->ioc_nid;
- printf("mynid: %u.%u.%u.%u\n",
- (unsigned)(tcpnal_mynid>>24) & 0xFF,
- (unsigned)(tcpnal_mynid>>16) & 0xFF,
- (unsigned)(tcpnal_mynid>>8) & 0xFF,
- (unsigned)(tcpnal_mynid) & 0xFF);
- }
- }
-
- return (0);
-}
-
-int lib_ioctl(int dev_id, int opc, void * ptr)
-{
-
- if (dev_id == OBD_DEV_ID) {
- class_handle_ioctl(&ocus, opc, (unsigned long)ptr);
-
- /* you _may_ need to call obd_ioctl_unpack or some
- other verification function if you want to use ioc
- directly here */
-#if 0
- printf ("processing ioctl cmd: %x buf len: %d\n",
- opc, ioc->ioc_len);
-#endif
- }
- return (0);
-}
-
-int liblustre_ioctl(int dev_id, int opc, void *ptr)
-{
- int rc = -EINVAL;
-
- switch (dev_id) {
- default:
- fprintf(stderr, "Unexpected device id %d\n", dev_id);
- abort();
- break;
-
- case OBD_DEV_ID:
- rc = class_handle_ioctl(&ocus, opc, (unsigned long)ptr);
- break;
- }
-
- return rc;
-}
-
-extern int time_ptlwait1;
-extern int time_ptlwait2;
-extern int time_ptlselect;
-int main(int argc, char **argv)
-{
- char *config_file;
-
- if (argc > 2) {
- printf("Usage: %s [config_file]\n", argv[0]);
- return 1;
- }
-
- if (argc == 2) {
- config_file = argv[1];
- argc--;
- argv++;
- } else
- config_file = "/tmp/DUMP_FILE";
-
- srand(time(NULL));
-
- INIT_LIST_HEAD(&ocus.ocus_conns);
-#if 1
- portal_debug = 0;
- portal_subsystem_debug = 0;
-#endif
- parse_dump(config_file, lib_ioctl_nalcmd);
-
- if (init_current(argc, argv) ||
- init_obdclass() || init_lib_portals() ||
- ptlrpc_init() ||
- ldlm_init() ||
- mdc_init() ||
- lov_init() ||
- osc_init() ||
- echo_client_init()) {
- printf("error\n");
- return 1;
- }
-
- parse_dump(config_file, lib_ioctl);
-
- set_ioc_handler(liblustre_ioctl);
-#if 0
- portal_debug = -1;
- portal_subsystem_debug = -1;
-#endif
- return lctl_main(argc, argv);
-}
-
#include <inode.h>
#include <file.h>
+/* both sys/queue.h (libsysio requires it) and portals/lists.h have definitions
+ * of 'LIST_HEAD'. undef it to suppress warnings
+ */
+#undef LIST_HEAD
+
#include <portals/api-support.h> /* needed for ptpctl.h */
#include <portals/ptlctl.h> /* needed for parse_dump */
#include <procbridge.h>
ptl_handle_ni_t tcpnal_ni;
-struct task_struct *current;
-struct obd_class_user_state ocus;
+struct task_struct *current;
/* portals interfaces */
ptl_handle_ni_t *
PtlInit();
rc = PtlNIInit(procbridge_interface, 0, 0, 0, &tcpnal_ni);
if (rc != 0) {
- CERROR("ksocknal: PtlNIInit failed: error %d\n", rc);
+ CERROR("TCPNAL: PtlNIInit failed: error %d\n", rc);
PtlFini();
RETURN (rc);
}
return 0;
}
-extern int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, unsigned long arg);
+extern int class_handle_ioctl(unsigned int cmd, unsigned long arg);
int lib_ioctl_nalcmd(int dev_id, int opc, void * ptr)
{
ioc->ioc_pbuf1 = ioc->ioc_bulk;
//XXX
- rc = class_handle_ioctl(&ocus, opc, (unsigned long)ptr);
+ rc = class_handle_ioctl(opc, (unsigned long)ptr);
printf ("proccssing ioctl cmd: %x, rc %d\n", opc, rc);
int lllib_init(char *dumpfile)
{
- INIT_LIST_HEAD(&ocus.ocus_conns);
-
if (!g_zconf) {
/* this parse only get my nid from config file
* before initialize portals
} else {
/* XXX need setup mynid before tcpnal initialize */
tcpnal_mynid = ((uint64_t)getpid() << 32) | time(0);
- printf("set tcpnal mynid: %016llx\n", tcpnal_mynid);
+ printf("LibLustre: TCPNAL NID: %016llx\n", tcpnal_mynid);
}
init_current("dummy");
}
#endif
-int liblustre_process_log(struct config_llog_instance *cfg)
+int liblustre_process_log(struct config_llog_instance *cfg, int allow_recov)
{
struct lustre_cfg lcfg;
char *peer = "MDS_PEER_UUID";
if (obd == NULL)
GOTO(out_cleanup, err = -EINVAL);
+ /* Disable initial recovery on this import */
+ err = obd_set_info(obd->obd_self_export,
+ strlen("initial_recov"), "initial_recov",
+ sizeof(allow_recov), &allow_recov);
+
err = obd_connect(&mdc_conn, obd, &mdc_uuid);
if (err) {
CERROR("cannot connect to %s: rc = %d\n",
/* env variables */
#define ENV_LUSTRE_MNTPNT "LIBLUSTRE_MOUNT_POINT"
#define ENV_LUSTRE_MNTTGT "LIBLUSTRE_MOUNT_TARGET"
+#define ENV_LUSTRE_TIMEOUT "LIBLUSTRE_TIMEOUT"
#define ENV_LUSTRE_DUMPFILE "LIBLUSTRE_DUMPFILE"
extern int _sysio_native_init();
+extern unsigned int obd_timeout;
+
/* global variables */
int g_zconf = 0; /* zeroconf or dumpfile */
char *g_zconf_mdsname = NULL; /* mdsname, for zeroconf */
{
char *lustre_path = NULL;
char *target = NULL;
+ char *timeout = NULL;
char *dumpfile = NULL;
char *root_driver = "native";
char *lustre_driver = "llite";
int err;
- srand(time(NULL));
+ /* consider the case of starting multiple liblustre instances
+ * at the same time on a single node.
+ */
+ srand(time(NULL) + getpid());
signal(SIGUSR1, sighandler_USR1);
lustre_path, target);
}
+ timeout = getenv(ENV_LUSTRE_TIMEOUT);
+ if (timeout) {
+ obd_timeout = (unsigned int) atoi(timeout);
+ printf("LibLustre: set obd timeout as %u seconds\n",
+ obd_timeout);
+ }
+
if (_sysio_init() != 0) {
perror("init sysio");
exit(1);
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Lustre Light Super operations
+ *
+ * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
#ifndef __LLU_H_
#define __LLU_H_
struct llu_sb_info
{
struct obd_uuid ll_sb_uuid;
- struct obd_export *ll_mdc_exp;
+ struct obd_export *ll_mdc_exp;
struct obd_export *ll_osc_exp;
obd_id ll_rootino;
int ll_flags;
char *ll_instance;
};
+#define LL_SBI_NOLCK 0x1
+#define LL_SBI_READAHEAD 0x2
+
#define LLI_F_HAVE_OST_SIZE_LOCK 0
#define LLI_F_HAVE_MDS_SIZE_LOCK 1
#define LLI_F_PREFER_EXTENDED_SIZE 2
struct lov_stripe_md *lli_smd;
char *lli_symlink_name;
struct semaphore lli_open_sem;
- __u64 lli_maxbytes;
+ __u64 lli_maxbytes;
unsigned long lli_flags;
/* for libsysio */
struct lookup_intent *lli_it;
- /* XXX workaround for libsysio */
+ /* XXX workaround for libsysio unlink */
int lli_stale_flag;
+ /* XXX workaround for libsysio readdir */
+ loff_t lli_dir_pos;
/* in libsysio we have no chance to store data in file,
* so place it here. since it's possible that an file
struct ll_file_data *lli_file_data;
int lli_open_count;
- /* stat FIXME not 64 bit clean */
- dev_t lli_st_dev;
- ino_t lli_st_ino;
- mode_t lli_st_mode;
- nlink_t lli_st_nlink;
- uid_t lli_st_uid;
- gid_t lli_st_gid;
- dev_t lli_st_rdev;
- loff_t lli_st_size;
- unsigned int lli_st_blksize;
- unsigned int lli_st_blocks;
- time_t lli_st_atime;
- time_t lli_st_mtime;
- time_t lli_st_ctime;
-
- /* not for stat, change it later */
- int lli_st_flags;
- unsigned long lli_st_generation;
+ /* stat FIXME not 64 bit clean */
+ dev_t lli_st_dev;
+ ino_t lli_st_ino;
+ mode_t lli_st_mode;
+ nlink_t lli_st_nlink;
+ uid_t lli_st_uid;
+ gid_t lli_st_gid;
+ dev_t lli_st_rdev;
+ loff_t lli_st_size;
+ unsigned int lli_st_blksize;
+ unsigned int lli_st_blocks;
+ time_t lli_st_atime;
+ time_t lli_st_mtime;
+ time_t lli_st_ctime;
+
+ /* not for stat, change it later */
+ int lli_st_flags;
+ unsigned long lli_st_generation;
};
#define LLU_SYSIO_COOKIE_SIZE(x) \
struct llu_sysio_cookie {
struct obd_sync_io_container *lsc_osic;
- struct inode *lsc_inode;
- int lsc_npages;
+ struct inode *lsc_inode;
+ int lsc_maxpages;
+ int lsc_npages;
struct ll_async_page *lsc_llap;
struct page *lsc_pages;
__u64 lsc_rwcount;
struct llu_sysio_callback_args
{
- int ncookies;
- struct llu_sysio_cookie *cookies[MAX_IOVEC];
+ int ncookies;
+ struct llu_sysio_cookie *cookies[MAX_IOVEC];
};
static inline struct llu_sb_info *llu_fs2sbi(struct filesys *fs)
{
- return (struct llu_sb_info*)(fs->fs_private);
+ return (struct llu_sb_info*)(fs->fs_private);
}
static inline struct llu_inode_info *llu_i2info(struct inode *inode)
{
- return (struct llu_inode_info*)(inode->i_private);
+ return (struct llu_inode_info*)(inode->i_private);
}
static inline struct llu_sb_info *llu_i2sbi(struct inode *inode)
return llu_i2info(inode)->lli_sbi;
}
-#if 0
-static inline struct client_obd *sbi2mdc(struct llu_sb_info *sbi)
-{
- struct obd_device *obd = class_conn2obd(&sbi->ll_mdc_conn);
- if (obd == NULL)
- LBUG();
- return &obd->u.cli;
-}
-#endif
-
static inline struct obd_export *llu_i2obdexp(struct inode *inode)
{
return llu_i2info(inode)->lli_sbi->ll_osc_exp;
return llu_i2info(inode)->lli_sbi->ll_mdc_exp;
}
+static inline int llu_is_root_inode(struct inode *inode)
+{
+ return (llu_i2info(inode)->lli_fid.id ==
+ llu_i2info(inode)->lli_sbi->ll_rootino);
+}
#define LL_SAVE_INTENT(inode, it) \
do { \
- struct lookup_intent *temp; \
+ struct lookup_intent *temp; \
LASSERT(llu_i2info(inode)->lli_it == NULL); \
OBD_ALLOC(temp, sizeof(*temp)); \
memcpy(temp, it, sizeof(*temp)); \
llu_i2info(inode)->lli_it = temp; \
CDEBUG(D_DENTRY, "alloc intent %p to inode %p(ino %lu)\n", \
- temp, inode, llu_i2info(inode)->lli_st_ino); \
+ temp, inode, llu_i2info(inode)->lli_st_ino); \
} while(0)
LASSERT(it); \
llu_i2info(inode)->lli_it = NULL; \
CDEBUG(D_DENTRY, "dettach intent %p from inode %p(ino %lu)\n", \
- it, inode, llu_i2info(inode)->lli_st_ino); \
+ it, inode, llu_i2info(inode)->lli_st_ino); \
} while(0)
/* interpet return codes from intent lookup */
static inline void ll_inode2fid(struct ll_fid *fid, struct inode *inode)
{
- *fid = llu_i2info(inode)->lli_fid;
+ *fid = llu_i2info(inode)->lli_fid;
}
struct it_cb_data {
- struct inode *icbd_parent;
- struct pnode *icbd_child;
- obd_id hash;
+ struct inode *icbd_parent;
+ struct pnode *icbd_child;
+ obd_id hash;
};
static inline void ll_i2uctxt(struct ll_uctxt *ctxt, struct inode *i1,
struct inode *i2)
{
- struct llu_inode_info *lli1 = llu_i2info(i1);
- struct llu_inode_info *lli2;
+ struct llu_inode_info *lli1 = llu_i2info(i1);
+ struct llu_inode_info *lli2;
LASSERT(i1);
LASSERT(ctxt);
ctxt->gid1 = -1;
if (i2) {
- lli2 = llu_i2info(i2);
+ lli2 = llu_i2info(i2);
if (in_group_p(lli2->lli_st_gid))
ctxt->gid2 = lli2->lli_st_gid;
else
/* FIXME */
static inline int ll_permission(struct inode *inode, int flag, void * unused)
{
- return 0;
-}
-
-#if 0
-static inline int it_disposition(struct lookup_intent *it, int flag)
-{
- return it->d.lustre.it_disposition & flag;
+ return 0;
}
-static inline void it_set_disposition(struct lookup_intent *it, int flag)
-{
- it->d.lustre.it_disposition |= flag;
-}
-#endif
-
static inline __u64 ll_file_maxbytes(struct inode *inode)
{
return llu_i2info(inode)->lli_maxbytes;
struct mount_option_s
{
- char *mdc_uuid;
- char *osc_uuid;
+ char *mdc_uuid;
+ char *osc_uuid;
};
/* llite_lib.c */
void generate_random_uuid(unsigned char uuid_out[16]);
-int liblustre_process_log(struct config_llog_instance *cfg);
+int liblustre_process_log(struct config_llog_instance *cfg, int allow_recov);
int ll_parse_mount_target(const char *target, char **mdsnid,
char **mdsname, char **profile);
-extern int g_zconf;
+extern int g_zconf;
extern char *g_zconf_mdsnid;
extern char *g_zconf_mdsname;
extern char *g_zconf_profile;
struct lov_stripe_md *lmm);
void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid);
void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid);
-//struct inode* llu_new_inode(struct filesys *fs, ino_t ino, mode_t mode);
-//int llu_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm, void *ostdata);
int ll_it_open_error(int phase, struct lookup_intent *it);
struct inode *llu_iget(struct filesys *fs, struct lustre_md *md);
int llu_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm);
int llu_iop_iodone(struct ioctx *ioctxp __IS_UNUSED);
struct llu_sysio_callback_args*
llu_file_write(struct inode *inode, const struct iovec *iovec,
- size_t iovlen, loff_t pos);
+ size_t iovlen, loff_t pos);
struct llu_sysio_callback_args*
llu_file_read(struct inode *inode, const struct iovec *iovec,
size_t iovlen, loff_t pos);
const char *path);
void unhook_stale_inode(struct pnode *pno);
struct inode *llu_inode_from_lock(struct ldlm_lock *lock);
+int llu_mdc_blocking_ast(struct ldlm_lock *lock,
+ struct ldlm_lock_desc *desc,
+ void *data, int flag);
+
+/* dir.c */
+ssize_t llu_iop_getdirentries(struct inode *ino, char *buf, size_t nbytes,
+ _SYSIO_OFF_T *basep);
+
+/* ext2 related */
+#define EXT2_NAME_LEN (255)
+
+struct ext2_dirent {
+ __u32 inode;
+ __u16 rec_len;
+ __u8 name_len;
+ __u8 file_type;
+ char name[EXT2_NAME_LEN];
+};
+
+#define EXT2_DIR_PAD 4
+#define EXT2_DIR_ROUND (EXT2_DIR_PAD - 1)
+#define EXT2_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT2_DIR_ROUND) & \
+ ~EXT2_DIR_ROUND)
+
+static inline struct ext2_dirent *ext2_next_entry(struct ext2_dirent *p)
+{
+ return (struct ext2_dirent*)((char*) p + le16_to_cpu(p->rec_len));
+}
#endif
#include <inode.h>
#include <file.h>
+#undef LIST_HEAD
+
#include "llite_lib.h"
-static void ll_intent_release(struct lookup_intent *it)
+static void ll_intent_drop_lock(struct lookup_intent *it)
{
struct lustre_handle *handle;
- ENTRY;
- /* LASSERT(ll_d2d(de) != NULL); */
-
- if (it->d.lustre.it_lock_mode) {
+ if (it->it_op && it->d.lustre.it_lock_mode) {
handle = (struct lustre_handle *)&it->d.lustre.it_lock_handle;
CDEBUG(D_DLMTRACE, "releasing lock with cookie "LPX64
- " from it %p\n",
- handle->cookie, it);
+ " from it %p\n", handle->cookie, it);
ldlm_lock_decref(handle, it->d.lustre.it_lock_mode);
- /* intent_release may be called multiple times, from
- this thread and we don't want to double-decref this
- lock (see bug 494) */
+ /* bug 494: intent_release may be called multiple times, from
+ * this thread and we don't want to double-decref this lock */
it->d.lustre.it_lock_mode = 0;
}
- it->it_magic = 0;
- it->it_op_release = 0;
- EXIT;
}
-#if 0
-static void llu_mdc_lock_set_inode(struct lustre_handle *lockh,
- struct inode *inode)
+static void ll_intent_release(struct lookup_intent *it)
{
- struct ldlm_lock *lock = ldlm_handle2lock(lockh);
ENTRY;
- LASSERT(lock != NULL);
- lock->l_data = inode;
- LDLM_LOCK_PUT(lock);
+ ll_intent_drop_lock(it);
+ it->it_magic = 0;
+ it->it_op_release = 0;
+ it->d.lustre.it_disposition = 0;
+ it->d.lustre.it_data = NULL;
EXIT;
}
-static int pnode_revalidate_finish(struct ptlrpc_request *request,
- struct inode *parent, struct pnode *pnode,
- struct lookup_intent *it, int offset,
- obd_id ino)
-{
- struct llu_sb_info *sbi = llu_i2sbi(parent);
- struct pnode_base *pb = pnode->p_base;
- struct mds_body *body;
- struct lov_stripe_md *lsm = NULL;
- struct lov_mds_md *lmm;
- int lmmsize;
- int rc = 0;
- ENTRY;
-
- /* NB 1 request reference will be taken away by ll_intent_lock()
- * when I return */
-
- if (it_disposition(it, DISP_LOOKUP_NEG))
- RETURN(-ENOENT);
-
- /* We only get called if the mdc_enqueue() called from
- * ll_intent_lock() was successful. Therefore the mds_body is
- * present and correct, and the eadata is present (but still
- * opaque, so only obd_unpackmd() can check the size) */
- body = lustre_msg_buf(request->rq_repmsg, offset, sizeof (*body));
- LASSERT (body != NULL);
- LASSERT_REPSWABBED (request, offset);
-
- if (body->valid & OBD_MD_FLEASIZE) {
- /* Only bother with this if inodes's LSM not set? */
-
- if (body->eadatasize == 0) {
- CERROR ("OBD_MD_FLEASIZE set, but eadatasize 0\n");
- GOTO (out, rc = -EPROTO);
- }
- lmmsize = body->eadatasize;
- lmm = lustre_msg_buf (request->rq_repmsg, offset + 1, lmmsize);
- LASSERT (lmm != NULL);
- LASSERT_REPSWABBED (request, offset + 1);
-
- rc = obd_unpackmd (&sbi->ll_osc_conn,
- &lsm, lmm, lmmsize);
- if (rc < 0) {
- CERROR ("Error %d unpacking eadata\n", rc);
- LBUG();
- /* XXX don't know if I should do this... */
- GOTO (out, rc);
- /* or skip the ll_update_inode but still do
- * mdc_lock_set_inode() */
- }
- LASSERT (rc >= sizeof (*lsm));
- rc = 0;
- }
-
- llu_update_inode(pb->pb_ino, body, lsm);
-
- if (lsm != NULL &&
- llu_i2info(pb->pb_ino)->lli_smd != lsm)
- obd_free_memmd (&sbi->ll_osc_conn, &lsm);
-
- llu_mdc_lock_set_inode((struct lustre_handle *)&it->d.lustre.it_lock_handle,
- pb->pb_ino);
- out:
- RETURN(rc);
-}
-#endif
-
/*
* remove the stale inode from pnode
*/
LASSERT(llu_i2info(inode)->lli_stale_flag);
pno->p_base->pb_ino = NULL;
+ I_RELE(inode);
if (!llu_i2info(inode)->lli_open_count) {
CDEBUG(D_INODE, "unhook inode %p (ino %lu) from pno %p\n",
inode, llu_i2info(inode)->lli_st_ino, pno);
- I_RELE(inode);
if (!inode->i_ref)
_sysio_i_gone(inode);
}
}
-static inline void ll_invalidate_inode_pages(struct inode * inode)
+static inline void llu_invalidate_inode_pages(struct inode * inode)
{
/* do nothing */
}
-static int llu_mdc_blocking_ast(struct ldlm_lock *lock,
- struct ldlm_lock_desc *desc,
- void *data, int flag)
+int llu_mdc_blocking_ast(struct ldlm_lock *lock,
+ struct ldlm_lock_desc *desc,
+ void *data, int flag)
{
int rc;
struct lustre_handle lockh;
CDEBUG(D_INODE, "invalidating inode %lu\n",
lli->lli_st_ino);
- ll_invalidate_inode_pages(inode);
+ llu_invalidate_inode_pages(inode);
}
/*
RETURN(0);
}
+static int pnode_revalidate_finish(struct ptlrpc_request *req,
+ int offset,
+ struct lookup_intent *it,
+ struct pnode *pnode)
+{
+ struct inode *inode = pnode->p_base->pb_ino;
+ struct lustre_md md;
+ int rc = 0;
+ ENTRY;
+
+ LASSERT(inode);
+
+ if (!req)
+ RETURN(0);
+
+ if (it_disposition(it, DISP_LOOKUP_NEG))
+ RETURN(-ENOENT);
+
+ rc = mdc_req2lustre_md(req, offset, llu_i2sbi(inode)->ll_osc_exp, &md);
+ if (rc)
+ RETURN(rc);
+
+ llu_update_inode(inode, md.body, md.lsm);
+
+ RETURN(rc);
+}
+
int llu_pb_revalidate(struct pnode *pnode, int flags, struct lookup_intent *it)
{
struct pnode_base *pb = pnode->p_base;
}
/* This is due to bad interaction with libsysio. remove this when we
- * switched to libbsdio
+ * switched to libbsdio XXX
*/
{
struct llu_inode_info *lli = llu_i2info(pb->pb_ino);
if (req == NULL && rc >= 0)
GOTO(out, rc);
- /* unfortunately ll_intent_lock may cause a callback and revoke our
- dentry */
- /*
- spin_lock(&dcache_lock);
- list_del_init(&de->d_hash);
- spin_unlock(&dcache_lock);
- d_rehash(de);
- */
+ if (rc < 0)
+ GOTO(out, rc = 0);
+
+ rc = pnode_revalidate_finish(req, 1, it, pnode);
+
+ /* Note: ll_intent_lock may cause a callback, check this! */
+
if (it->it_op & (IT_OPEN | IT_GETATTR))
LL_SAVE_INTENT(pb->pb_ino, it);
RETURN(1);
if (rc == 0) {
LASSERT(pb->pb_ino);
if (S_ISDIR(llu_i2info(pb->pb_ino)->lli_st_mode))
- ll_invalidate_inode_pages(pb->pb_ino);
+ llu_invalidate_inode_pages(pb->pb_ino);
llu_i2info(pb->pb_ino)->lli_stale_flag = 1;
unhook_stale_inode(pnode);
} else {
int rc;
/* NB 1 request reference will be taken away by ll_intent_lock()
- * when I return */
- /* XXX libsysio require the inode must be generated here XXX */
+ * when I return
+ * Note: libsysio require the inode must be generated here
+ */
if ((it->it_op & IT_CREAT) || !it_disposition(it, DISP_LOOKUP_NEG)) {
struct lustre_md md;
struct llu_inode_info *lli;
LASSERT(lsm->lsm_object_id != 0);
+ /* bug 2334: drop MDS lock before acquiring OST lock */
+ ll_intent_drop_lock(it);
+
rc = llu_extent_lock(NULL, inode, lsm, LCK_PR, &extent,
&lockh);
if (rc != ELDLM_OK) {
ENTRY;
}
+ /* intent will be further used in cases of open()/getattr() */
if (inode && (it->it_op & (IT_OPEN | IT_GETATTR)))
LL_SAVE_INTENT(inode, it);
-/*
- dentry->d_op = &ll_d_ops;
- ll_set_dd(dentry);
- if (dentry == saved)
- d_add(dentry, inode);
-*/
child->p_base->pb_ino = inode;
RETURN(0);
return inode;
}
-/* XXX */
-#define EXT2_NAME_LEN (255)
-
static int llu_lookup_it(struct inode *parent, struct pnode *pnode,
struct lookup_intent *it, int flags)
{
if (pnode->p_base->pb_name.len > EXT2_NAME_LEN)
RETURN(-ENAMETOOLONG);
-
-/*
- CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p),intent=%s\n",
- dentry->d_name.name, parent->i_ino, parent->i_generation,
- parent, LL_IT2STR(it));
-
- if (d_mountpoint(dentry))
- CERROR("Tell Peter, lookup on mtpt, it %s\n", LL_IT2STR(it));
-
- ll_frob_intent(&it, &lookup_it);
-*/
-
if (!it) {
it = &lookup_it;
it->it_op_release = ll_intent_release;
llu_lookup_finish_locks(it, pnode);
-/*
- if (dentry == save)
- GOTO(out, retval = NULL);
- else
- GOTO(out, retval = dentry);
-*/
out:
if (req)
ptlrpc_req_finished(req);
it->it_flags |= fmode;
}
- /*
- else if (intent->int_opmask & INT_CREAT)
- it->it_op |= IT_LOOKUP;
- */
-
- /* FIXME libsysio has strange code on intent handling,
+ /* XXX libsysio has strange code on intent handling,
* more check later */
if (it->it_flags & O_CREAT) {
it->it_op |= IT_CREAT;
if (intent->int_opmask & INT_GETATTR)
it->it_op |= IT_GETATTR;
- /* XXX */
- if (intent->int_opmask & INT_SETATTR)
- LBUG();
+
+ LASSERT(!(intent->int_opmask & INT_SETATTR));
/* libsysio is different to linux vfs when doing unlink/rmdir,
* INT_UPDPARENT was passed down during name resolution. Here
#include <inode.h>
#include <file.h>
-#include "llite_lib.h"
-
-#if 0
-void llu_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
- struct ldlm_lock *lock)
-{
- clear_bit(LLI_F_HAVE_SIZE_LOCK, &(llu_i2info(inode)->lli_flags));
-#if 0
- struct ldlm_extent *extent = &lock->l_extent;
- unsigned long start, end, count, skip, i, j;
- struct page *page;
- int ret;
- ENTRY;
-
- CDEBUG(D_INODE, "obdo %lu inode %p ["LPU64"->"LPU64"] size: %llu\n",
- inode->i_ino, inode, extent->start, extent->end, inode->i_size);
-
- start = extent->start >> PAGE_CACHE_SHIFT;
- count = ~0;
- skip = 0;
- end = (extent->end >> PAGE_CACHE_SHIFT) + 1;
- if ((end << PAGE_CACHE_SHIFT) < extent->end)
- end = ~0;
- if (lsm->lsm_stripe_count > 1) {
- struct {
- char name[16];
- struct ldlm_lock *lock;
- struct lov_stripe_md *lsm;
- } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm };
- __u32 stripe;
- __u32 vallen = sizeof(stripe);
- int rc;
-
- /* get our offset in the lov */
- rc = obd_get_info(ll_i2obdconn(inode), sizeof(key),
- &key, &vallen, &stripe);
- if (rc != 0) {
- CERROR("obd_get_info: rc = %d\n", rc);
- LBUG();
- }
- LASSERT(stripe < lsm->lsm_stripe_count);
-
- count = lsm->lsm_stripe_size >> PAGE_CACHE_SHIFT;
- skip = (lsm->lsm_stripe_count - 1) * count;
- start += (start/count * skip) + (stripe * count);
- if (end != ~0)
- end += (end/count * skip) + (stripe * count);
- }
-
- i = (inode->i_size + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
- if (end >= i)
- clear_bit(LLI_F_HAVE_SIZE_LOCK, &(ll_i2info(inode)->lli_flags));
- if (i < end)
- end = i;
-
- CDEBUG(D_INODE, "start: %lu j: %lu count: %lu skip: %lu end: %lu\n",
- start, start % count, count, skip, end);
-
- /* start writeback on dirty pages in the extent when its PW */
- for (i = start, j = start % count;
- lock->l_granted_mode == LCK_PW && i < end; j++, i++) {
- if (j == count) {
- i += skip;
- j = 0;
- }
- /* its unlikely, but give us a chance to bail when we're out */
- PGCACHE_WRLOCK(inode->i_mapping);
- if (list_empty(&inode->i_mapping->dirty_pages)) {
- CDEBUG(D_INODE, "dirty list empty\n");
- PGCACHE_WRUNLOCK(inode->i_mapping);
- break;
- }
- PGCACHE_WRUNLOCK(inode->i_mapping);
-
- if (need_resched())
- schedule();
-
- /* always do a getattr for the first person to pop out of lock
- * acquisition.. the DID_GETATTR flag and semaphore serialize
- * this initial race. we used to make a decision based on whether
- * the lock was matched or acquired, but the matcher could win the
- * waking race with the first issuer so that was no good..
- */
- if (test_bit(LLI_F_DID_GETATTR, &lli->lli_flags))
- RETURN(ELDLM_OK);
-
- down(&lli->lli_getattr_sem);
-
- if (!test_bit(LLI_F_DID_GETATTR, &lli->lli_flags)) {
- rc = ll_inode_getattr(inode, lsm);
- if (rc == 0) {
- set_bit(LLI_F_DID_GETATTR, &lli->lli_flags);
- } else {
- unlock_page(page);
- }
- page_cache_release(page);
-
- }
-
- /* our locks are page granular thanks to osc_enqueue, we invalidate the
- * whole page. */
- LASSERT((extent->start & ~PAGE_CACHE_MASK) == 0);
- LASSERT(((extent->end+1) & ~PAGE_CACHE_MASK) == 0);
- for (i = start, j = start % count ; i < end ; j++, i++) {
- if ( j == count ) {
- i += skip;
- j = 0;
- }
- PGCACHE_WRLOCK(inode->i_mapping);
- if (list_empty(&inode->i_mapping->dirty_pages) &&
- list_empty(&inode->i_mapping->clean_pages) &&
- list_empty(&inode->i_mapping->locked_pages)) {
- CDEBUG(D_INODE, "nothing left\n");
- PGCACHE_WRUNLOCK(inode->i_mapping);
- break;
- }
- PGCACHE_WRUNLOCK(inode->i_mapping);
- if (need_resched())
- schedule();
- page = find_get_page(inode->i_mapping, i);
- if (page == NULL)
- continue;
- CDEBUG(D_INODE, "dropping page %p at %lu\n", page, page->index);
- lock_page(page);
- if (page->mapping) /* might have raced */
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
- truncate_complete_page(page);
-#else
- truncate_complete_page(page->mapping, page);
-#endif
- unlock_page(page);
- page_cache_release(page);
- }
- EXIT;
-#endif
-}
-
-int llu_lock_callback(struct ldlm_lock *lock, struct ldlm_lock_desc *new,
- void *data, int flag)
-{
- struct inode *inode = data;
- struct llu_inode_info *lli = llu_i2info(inode);
- struct lustre_handle lockh = {0};
- int rc;
- ENTRY;
-
- if (inode == NULL)
- LBUG();
-
- switch (flag) {
- case LDLM_CB_BLOCKING:
- ldlm_lock2handle(lock, &lockh);
- rc = ldlm_cli_cancel(&lockh);
- if (rc != ELDLM_OK)
- CERROR("ldlm_cli_cancel failed: %d\n", rc);
- break;
- case LDLM_CB_CANCELING: {
- /* FIXME: we could be given 'canceling intents' so that we
- * could know to write-back or simply throw away the pages
- * based on if the cancel comes from a desire to, say,
- * read or truncate.. */
- llu_pgcache_remove_extent(inode, lli->lli_smd, lock);
- break;
- }
- default:
- LBUG();
- }
+#undef LIST_HEAD
- RETURN(0);
-}
-#endif
+#include "llite_lib.h"
static int llu_extent_lock_callback(struct ldlm_lock *lock,
struct ldlm_lock_desc *new, void *data,
LASSERT(lockh->cookie == 0);
-#if 0
/* XXX phil: can we do this? won't it screw the file size up? */
if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
(sbi->ll_flags & LL_SBI_NOLCK))
RETURN(0);
-#endif
CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
lli->lli_st_ino, extent->start, extent->end);
};
static
-struct llu_sysio_cookie* get_sysio_cookie(struct inode *inode, int npages)
+struct llu_sysio_cookie* get_sysio_cookie(struct inode *inode, int maxpages)
{
struct llu_sysio_cookie *cookie;
- OBD_ALLOC(cookie, LLU_SYSIO_COOKIE_SIZE(npages));
+ OBD_ALLOC(cookie, LLU_SYSIO_COOKIE_SIZE(maxpages));
if (cookie) {
I_REF(inode);
cookie->lsc_inode = inode;
- cookie->lsc_npages = npages;
+ cookie->lsc_maxpages = maxpages;
cookie->lsc_llap = (struct ll_async_page *)(cookie + 1);
- cookie->lsc_pages = (struct page *) (cookie->lsc_llap + npages);
+ cookie->lsc_pages = (struct page *) (cookie->lsc_llap + maxpages);
osic_init(&cookie->lsc_osic);
}
struct lov_stripe_md *lsm = llu_i2info(cookie->lsc_inode)->lli_smd;
struct obd_export *exp = llu_i2obdexp(cookie->lsc_inode);
struct ll_async_page *llap = cookie->lsc_llap;
+#ifdef LIBLUSTRE_HANDLE_UNALIGNED_PAGE
+ struct page *pages = cookie->lsc_pages;
+#endif
int i;
- for (i = 0; i< cookie->lsc_npages; i++) {
+ for (i = 0; i< cookie->lsc_maxpages; i++) {
if (llap[i].llap_cookie)
obd_teardown_async_page(exp, lsm, NULL,
llap[i].llap_cookie);
+#ifdef LIBLUSTRE_HANDLE_UNALIGNED_PAGE
+ if (pages[i]._managed) {
+ free(pages[i].addr);
+ pages[i]._managed = 0;
+ }
+#endif
}
I_RELE(cookie->lsc_inode);
osic_release(cookie->lsc_osic);
- OBD_FREE(cookie, LLU_SYSIO_COOKIE_SIZE(cookie->lsc_npages));
+ OBD_FREE(cookie, LLU_SYSIO_COOKIE_SIZE(cookie->lsc_maxpages));
+}
+
+#ifdef LIBLUSTRE_HANDLE_UNALIGNED_PAGE
+/* Note: this code should eventually be removed entirely; it does
+ * not warrant further cleanup.
+ */
+static
+int prepare_unaligned_write(struct llu_sysio_cookie *cookie)
+{
+ struct inode *inode = cookie->lsc_inode;
+ struct llu_inode_info *lli = llu_i2info(inode);
+ struct lov_stripe_md *lsm = lli->lli_smd;
+ struct obdo oa;
+ struct page *pages = cookie->lsc_pages;
+ int i, pgidx[2] = {0, cookie->lsc_npages-1};
+ int rc;
+ ENTRY;
+
+ for (i = 0; i < 2; i++) {
+ struct page *oldpage = &pages[pgidx[i]];
+ struct page newpage;
+ struct brw_page pg;
+ char *newbuf;
+
+ if (i == 0 && pgidx[0] == pgidx[1])
+ continue;
+
+ LASSERT(oldpage->_offset + oldpage->_count <= PAGE_CACHE_SIZE);
+
+ if (oldpage->_count == PAGE_CACHE_SIZE)
+ continue;
+
+ if (oldpage->index << PAGE_CACHE_SHIFT >=
+ lli->lli_st_size)
+ continue;
+
+ newbuf = malloc(PAGE_CACHE_SIZE);
+ if (!newbuf)
+ return -ENOMEM;
+
+ newpage.index = oldpage->index;
+ newpage.addr = newbuf;
+
+ pg.pg = &newpage;
+ pg.off = ((obd_off)newpage.index << PAGE_CACHE_SHIFT);
+ if (pg.off + PAGE_CACHE_SIZE > lli->lli_st_size)
+ pg.count = lli->lli_st_size % PAGE_CACHE_SIZE;
+ else
+ pg.count = PAGE_CACHE_SIZE;
+ pg.flag = 0;
+
+ oa.o_id = lsm->lsm_object_id;
+ oa.o_mode = lli->lli_st_mode;
+ oa.o_valid = OBD_MD_FLID | OBD_MD_FLMODE | OBD_MD_FLTYPE;
+
+ /* issue read */
+ rc = obd_brw(OBD_BRW_READ, llu_i2obdexp(inode), &oa, lsm, 1, &pg, NULL);
+ if (rc) {
+ free(newbuf);
+ RETURN(rc);
+ }
+
+ /* copy page content, and reset page params */
+ memcpy(newbuf + oldpage->_offset,
+ (char*)oldpage->addr + oldpage->_offset,
+ oldpage->_count);
+
+ oldpage->addr = newbuf;
+ if ((((obd_off)oldpage->index << PAGE_CACHE_SHIFT) +
+ oldpage->_offset + oldpage->_count) > lli->lli_st_size)
+ oldpage->_count += oldpage->_offset;
+ else
+ oldpage->_count = PAGE_CACHE_SIZE;
+ oldpage->_offset = 0;
+ oldpage->_managed = 1;
+ }
+
+ RETURN(0);
}
+#endif
static
int llu_prep_async_io(struct llu_sysio_cookie *cookie, int cmd,
char *buf, loff_t pos, size_t count)
{
- struct lov_stripe_md *lsm = llu_i2info(cookie->lsc_inode)->lli_smd;
+ struct llu_inode_info *lli = llu_i2info(cookie->lsc_inode);
+ struct lov_stripe_md *lsm = lli->lli_smd;
struct obd_export *exp = llu_i2obdexp(cookie->lsc_inode);
struct page *pages = cookie->lsc_pages;
struct ll_async_page *llap = cookie->lsc_llap;
if (!exp)
RETURN(-EINVAL);
- cookie->lsc_rwcount = count;
-
/* prepare the pages array */
do {
unsigned long index, offset, bytes;
if (bytes > count)
bytes = count;
+ /* prevent read beyond file range */
+ if ((cmd == OBD_BRW_READ) &&
+ (pos + bytes) >= lli->lli_st_size) {
+ if (pos >= lli->lli_st_size)
+ break;
+ bytes = lli->lli_st_size - pos;
+ }
+
/* prepare page for this index */
pages[npages].index = index;
pages[npages].addr = buf - offset;
count -= bytes;
pos += bytes;
buf += bytes;
+
+ cookie->lsc_rwcount += bytes;
} while (count);
+ cookie->lsc_npages = npages;
+
+#ifdef LIBLUSTRE_HANDLE_UNALIGNED_PAGE
+ if (cmd == OBD_BRW_WRITE) {
+ rc = prepare_unaligned_write(cookie);
+ if (rc)
+ RETURN(rc);
+ }
+#endif
+
for (i = 0; i < npages; i++) {
llap[i].llap_magic = LLAP_MAGIC;
rc = obd_prep_async_page(exp, lsm, NULL, &pages[i],
/* FIXME optimize the following extent locking */
for (iovidx = 0; iovidx < iovlen; iovidx++) {
- char *buf = iovec[iovidx].iov_base;
+ char *buf = (char*)iovec[iovidx].iov_base;
size_t count = iovec[iovidx].iov_len;
if (count == 0)
continue;
- /* FIXME libsysio haven't consider the open flags
- * such as O_APPEND */
-#if 0
- if (!S_ISBLK(lli->lli_st_mode) && file->f_flags & O_APPEND) {
- extent.start = 0;
- extent.end = OBD_OBJECT_EOF;
- } else {
- extent.start = *ppos;
- extent.end = *ppos + count - 1;
- }
-#else
+ /* FIXME libsysio does not handle O_APPEND yet */
extent.start = pos;
extent.end = pos + count - 1;
-#endif
- err = llu_extent_lock(fd, inode, lsm, LCK_PW, &extent, &lockh);
+#ifdef LIBLUSTRE_HANDLE_UNALIGNED_PAGE
+ if ((pos & ~PAGE_CACHE_MASK) == 0 &&
+ (count & ~PAGE_CACHE_MASK) == 0)
+ err = llu_extent_lock_no_validate(fd, inode, lsm,
+ LCK_PW, &extent, &lockh, 0);
+ else
+ err = llu_extent_lock(fd, inode, lsm, LCK_PW,
+ &extent, &lockh);
+#else
+ /* the server will handle partial writes, so we don't
+ * need to care about the file size here */
+ err = llu_extent_lock_no_validate(fd, inode, lsm, LCK_PW,
+ &extent, &lockh, 0);
+#endif
if (err != ELDLM_OK)
GOTO(err_out, err = -ENOLCK);
CDEBUG(D_INFO, "Reading inode %lu, "LPSZ" bytes, offset %Ld\n",
lli->lli_st_ino, count, pos);
+ if (pos >= lli->lli_st_size) {
+ llu_extent_unlock(fd, inode, lsm, LCK_PR, &lockh);
+ break;
+ }
+
cookie = llu_rw(OBD_BRW_READ, inode, buf, count, pos);
if (!IS_ERR(cookie)) {
/* save cookie */
ENTRY;
/* write/read(fd, buf, 0) */
- if (!lsca)
- return 1;
+ if (!lsca) {
+ ioctxp->ioctx_cc = 0;
+ RETURN(1);
+ }
LASSERT(!IS_ERR(lsca));
}
}
- if (rc)
- ioctxp->ioctx_cc = rc;
+ if (rc) {
+ LASSERT(rc < 0);
+ ioctxp->ioctx_cc = -1;
+ ioctxp->ioctx_errno = -rc;
+ }
OBD_FREE(lsca, sizeof(*lsca));
ioctxp->ioctx_private = NULL;
#include <inode.h>
#include <file.h>
+#undef LIST_HEAD
+
#include "llite_lib.h"
static void llu_fsop_gone(struct filesys *fs)
obdo_refresh_inode(inode, &oa, refresh_valid);
-/*
- if (inode->i_blksize < PAGE_CACHE_SIZE)
- inode->i_blksize = PAGE_CACHE_SIZE;
-
- CDEBUG(D_INODE, "objid "LPX64" size %Lu, blocks %lu, blksize %lu\n",
- lsm->lsm_object_id, inode->i_size, inode->i_blocks,
- inode->i_blksize);
-*/
RETURN(0);
}
return inode;
}
-#if 0
-static int ll_intent_to_lock_mode(struct lookup_intent *it)
-{
- /* CREAT needs to be tested before open (both could be set) */
- if (it->it_op & IT_CREAT)
- return LCK_PW;
- else if (it->it_op & (IT_READDIR | IT_GETATTR | IT_OPEN | IT_LOOKUP))
- return LCK_PR;
-
- LBUG();
- RETURN(-EINVAL);
-}
-#endif
-
-#if 0
-int ll_it_open_error(int phase, struct lookup_intent *it)
-{
- if (it_disposition(it, DISP_OPEN_OPEN)) {
- if (phase == DISP_OPEN_OPEN)
- return it->d.lustre.it_status;
- else
- return 0;
- }
-
- if (it_disposition(it, DISP_OPEN_CREATE)) {
- if (phase == DISP_OPEN_CREATE)
- return it->d.lustre.it_status;
- else
- return 0;
- }
-
- if (it_disposition(it, DISP_LOOKUP_EXECD)) {
- if (phase == DISP_LOOKUP_EXECD)
- return it->d.lustre.it_status;
- else
- return 0;
- }
- CERROR("it disp: %X, status: %d\n", it->d.lustre.it_disposition, it->d.lustre.it_status);
- LBUG();
- return 0;
-}
-#endif
-
static int llu_have_md_lock(struct inode *inode)
{
struct llu_sb_info *sbi = llu_i2sbi(inode);
* I don't believe it is possible to get e.g. ATTR_MTIME_SET and ATTR_SIZE
* at the same time.
*/
-#define OST_ATTR (ATTR_MTIME | ATTR_MTIME_SET | ATTR_CTIME | \
- ATTR_ATIME | ATTR_ATIME_SET | ATTR_SIZE)
int llu_setattr_raw(struct inode *inode, struct iattr *attr)
{
struct lov_stripe_md *lsm = llu_i2info(inode)->lli_smd;
/* If only OST attributes being set on objects, don't do MDS RPC.
* In that case, we need to check permissions and update the local
* inode ourselves so we can call obdo_from_inode() always. */
- if (ia_valid & (lsm ? ~(OST_ATTR | ATTR_FROM_OPEN | ATTR_RAW) : ~0)) {
+ if (ia_valid & (lsm ? ~(ATTR_SIZE | ATTR_FROM_OPEN | ATTR_RAW) : ~0)) {
struct lustre_md md;
llu_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0);
RETURN(rc);
}
-/* FIXME here we simply act as a thin layer to glue it with
+/* here we simply act as a thin layer to glue it with
* llu_setattr_raw(), which is copy from kernel
*/
static int llu_iop_setattr(struct pnode *pno,
iattr.ia_valid |= ATTR_GID;
}
if (mask & SETATTR_LEN) {
- iattr.ia_size = stbuf->st_size; /* FIXME signed expansion problem */
+ iattr.ia_size = stbuf->st_size; /* XXX signed expansion problem */
iattr.ia_valid |= ATTR_SIZE;
}
int rc;
ENTRY;
- /* on symlinks lli_open_sem protects lli_symlink_name allocation/data */
-/*
- down(&lli->lli_open_sem);
-*/
rc = llu_readlink_internal(inode, &request, &symname);
if (rc)
GOTO(out, rc);
ptlrpc_req_finished(request);
out:
-/*
- up(&lli->lli_open_sem);
-*/
RETURN(rc);
}
RETURN(err);
}
-#if 0
-static int llu_mdc_unlink(struct inode *dir, struct inode *child, __u32 mode,
- const char *name, int len)
-{
- struct ptlrpc_request *request = NULL;
- struct mds_body *body;
- struct lov_mds_md *eadata;
- struct lov_stripe_md *lsm = NULL;
- struct obd_trans_info oti = { 0 };
- struct mdc_op_data op_data;
- struct obdo *oa;
- int rc;
- ENTRY;
-
- llu_prepare_mdc_op_data(&op_data, dir, child, name, len, mode);
- rc = mdc_unlink(&llu_i2sbi(dir)->ll_mdc_conn, &op_data, &request);
- if (rc)
- GOTO(out, rc);
- /* req is swabbed so this is safe */
- body = lustre_msg_buf(request->rq_repmsg, 0, sizeof(*body));
-
- if (!(body->valid & OBD_MD_FLEASIZE))
- GOTO(out, rc = 0);
-
- if (body->eadatasize == 0) {
- CERROR("OBD_MD_FLEASIZE set but eadatasize zero\n");
- GOTO(out, rc = -EPROTO);
- }
-
- /* The MDS sent back the EA because we unlinked the last reference
- * to this file. Use this EA to unlink the objects on the OST.
- * It's opaque so we don't swab here; we leave it to obd_unpackmd() to
- * check it is complete and sensible. */
- eadata = lustre_swab_repbuf(request, 1, body->eadatasize, NULL);
- LASSERT(eadata != NULL);
- if (eadata == NULL) {
- CERROR("Can't unpack MDS EA data\n");
- GOTO(out, rc = -EPROTO);
- }
-
- rc = obd_unpackmd(llu_i2obdconn(dir), &lsm, eadata, body->eadatasize);
- if (rc < 0) {
- CERROR("obd_unpackmd: %d\n", rc);
- GOTO(out, rc);
- }
- LASSERT(rc >= sizeof(*lsm));
-
- oa = obdo_alloc();
- if (oa == NULL)
- GOTO(out_free_memmd, rc = -ENOMEM);
-
- oa->o_id = lsm->lsm_object_id;
- oa->o_mode = body->mode & S_IFMT;
- oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
-
- if (body->valid & OBD_MD_FLCOOKIE) {
- oa->o_valid |= OBD_MD_FLCOOKIE;
- oti.oti_logcookies = lustre_msg_buf(request->rq_repmsg, 3,
- body->eadatasize);
- }
-
- rc = obd_destroy(llu_i2obdconn(dir), oa, lsm, &oti);
- obdo_free(oa);
- if (rc)
- CERROR("obd destroy objid 0x"LPX64" error %d\n",
- lsm->lsm_object_id, rc);
- out_free_memmd:
- obd_free_memmd(llu_i2obdconn(dir), &lsm);
- out:
- ptlrpc_req_finished(request);
- return rc;
-}
-#endif
-
static int llu_iop_link_raw(struct pnode *old, struct pnode *new)
{
struct inode *src = old->p_base->pb_ino;
RETURN(rc);
}
-#if 0
+#ifdef _HAVE_STATVFS
static int llu_statfs_internal(struct llu_sb_info *sbi,
struct obd_statfs *osfs,
unsigned long max_age)
RETURN(rc);
}
-static int llu_statfs(struct llu_sb_info *sbi, struct kstatfs *sfs)
+static int llu_statfs(struct llu_sb_info *sbi, struct statfs *sfs)
{
struct obd_statfs osfs;
int rc;
RETURN(0);
}
-#endif
+#endif /* _HAVE_STATVFS */
static int llu_iop_mkdir_raw(struct pnode *pno, mode_t mode)
{
GOTO(out_free, err = -EINVAL);
}
- /* XXX */
/* generate a string unique to this super, let's try
the address of the super itself.*/
len = (sizeof(sbi) * 2) + 1;
cfg.cfg_instance = sbi->ll_instance;
cfg.cfg_uuid = sbi->ll_sb_uuid;
- err = liblustre_process_log(&cfg);
+ err = liblustre_process_log(&cfg, 1);
if (err < 0) {
CERROR("Unable to process log: %s\n", g_zconf_profile);
inop_lookup: llu_iop_lookup,
inop_getattr: llu_iop_getattr,
inop_setattr: llu_iop_setattr,
- inop_getdirentries: NULL,
+ inop_getdirentries: llu_iop_getdirentries,
inop_mkdir: llu_iop_mkdir_raw,
inop_rmdir: llu_iop_rmdir_raw,
inop_symlink: llu_iop_symlink_raw,
inop_datasync: llu_iop_datasync,
inop_ioctl: llu_iop_ioctl,
inop_mknod: llu_iop_mknod_raw,
-#if 0
+#ifdef _HAVE_STATVFS
inop_statvfs: llu_iop_statvfs,
#endif
inop_gone: llu_iop_gone,
--- /dev/null
+.deps
+Makefile
+Makefile.in
--- /dev/null
+## Liblustre executables & libraries Makefile
+DEFS=
+
+CFLAGS := -g -Wall -I$(top_srcdir)/utils -I$(top_srcdir)/portals/include \
+ -I$(top_srcdir)/portals/unals -I$(SYSIO)/include \
+ -I/opt/lam/include -L/opt/lam/lib
+
+KFLAGS:=
+CPPFLAGS = $(HAVE_EFENCE) -D_LARGEFILE64_SOURCE=1
+LIBS = $(LIBEFENCE)
+
+
+LLIB_EXEC= ../liblustre.a -lpthread
+
+noinst_LIBRARIES = libtestcommon.a
+libtestcommon_a_SOURCES = test_common.c
+
+bin_PROGRAMS = echo_test sanity recovery_small replay_single test_lock_cancel \
+ replay_ost_single
+
+echo_test_SOURCES = echo_test.c ../../utils/parser.c ../../utils/obd.c ../../utils/lustre_cfg.c
+echo_test_LDADD = ../liblsupport.a $(LIBREADLINE) -lpthread
+echo_test_DEPENDENCIES=$(top_srcdir)/liblustre/liblsupport.a
+
+sanity_SOURCES = sanity.c
+sanity_LDADD := ./libtestcommon.a $(LLIB_EXEC)
+sanity_DEPENDENCIES = $(top_srcdir)/liblustre/liblustre.a ./libtestcommon.a
+
+recovery_small_SOURCES = recovery_small.c
+recovery_small_LDADD := ./libtestcommon.a $(LLIB_EXEC)
+recovery_small_DEPENDENCIES = $(top_srcdir)/liblustre/liblustre.a
+
+replay_single_SOURCES = replay_single.c
+replay_single_LDADD := ./libtestcommon.a $(LLIB_EXEC)
+replay_single_DEPENDENCIES = $(top_srcdir)/liblustre/liblustre.a
+
+test_lock_cancel_SOURCES = test_lock_cancel.c
+test_lock_cancel_LDADD := $(LLIB_EXEC) -lmpi -llam
+
+replay_ost_single_SOURCES = replay_ost_single.c
+replay_ost_single_LDADD := ./libtestcommon.a $(LLIB_EXEC)
+replay_ost_single_DEPENDENCIES = $(top_srcdir)/liblustre/liblustre.a
+
+include $(top_srcdir)/Rules
+
--- /dev/null
+#include <stdio.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
+
+#include <portals/api-support.h> /* needed for ptlctl.h */
+#include <portals/ptlctl.h> /* needed for parse_dump */
+
+
+#include <liblustre.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#include <procbridge.h>
+
+#define LIBLUSTRE_TEST 1
+#include "../utils/lctl.c"
+
+struct ldlm_namespace;
+struct ldlm_res_id;
+struct obd_import;
+
+void *inter_module_get(char *arg)
+{
+ if (!strcmp(arg, "tcpnal_ni"))
+ return &tcpnal_ni;
+ else if (!strcmp(arg, "ldlm_cli_cancel_unused"))
+ return ldlm_cli_cancel_unused;
+ else if (!strcmp(arg, "ldlm_namespace_cleanup"))
+ return ldlm_namespace_cleanup;
+ else if (!strcmp(arg, "ldlm_replay_locks"))
+ return ldlm_replay_locks;
+ else
+ return NULL;
+}
+
+/* XXX move to proper place */
+char *portals_nid2str(int nal, ptl_nid_t nid, char *str)
+{
+ switch(nal){
+ case TCPNAL:
+ /* userspace NAL */
+ case SOCKNAL:
+ sprintf(str, "%u:%d.%d.%d.%d", (__u32)(nid >> 32),
+ HIPQUAD(nid));
+ break;
+ case QSWNAL:
+ case GMNAL:
+ case IBNAL:
+ case SCIMACNAL:
+ sprintf(str, "%u:%u", (__u32)(nid >> 32), (__u32)nid);
+ break;
+ default:
+ return NULL;
+ }
+ return str;
+}
+
+ptl_handle_ni_t tcpnal_ni;
+
+struct pingcli_args {
+ ptl_nid_t mynid;
+ ptl_nid_t nid;
+ ptl_pid_t port;
+ int count;
+ int size;
+};
+
+struct task_struct *current;
+
+/* portals interfaces */
+ptl_handle_ni_t *
+kportal_get_ni (int nal)
+{
+ switch (nal)
+ {
+ case SOCKNAL:
+ return &tcpnal_ni;
+ default:
+ return NULL;
+ }
+}
+
+inline void
+kportal_put_ni (int nal)
+{
+ return;
+}
+
+int
+kportal_nal_cmd(struct portals_cfg *pcfg)
+{
+#if 0
+ __u32 nal = pcfg->pcfg_nal;
+ int rc = -EINVAL;
+
+ ENTRY;
+
+ down(&nal_cmd_sem);
+ if (nal > 0 && nal <= NAL_MAX_NR && nal_cmd[nal].nch_handler) {
+ CDEBUG(D_IOCTL, "calling handler nal: %d, cmd: %d\n", nal,
+ pcfg->pcfg_command);
+ rc = nal_cmd[nal].nch_handler(pcfg, nal_cmd[nal].nch_private);
+ }
+ up(&nal_cmd_sem);
+ RETURN(rc);
+#else
+ CERROR("empty function!!!\n");
+ return 0;
+#endif
+}
+
+int init_current(int argc, char **argv)
+{
+ current = malloc(sizeof(*current));
+ strncpy(current->comm, argv[0], sizeof(current->comm));
+ current->pid = getpid();
+ return 0;
+}
+
+ptl_nid_t tcpnal_mynid;
+
+int init_lib_portals()
+{
+ int rc;
+
+ PtlInit();
+ rc = PtlNIInit(procbridge_interface, 0, 0, 0, &tcpnal_ni);
+ if (rc != 0) {
+ CERROR("ksocknal: PtlNIInit failed: error %d\n", rc);
+ PtlFini();
+ RETURN (rc);
+ }
+ PtlNIDebug(tcpnal_ni, ~0);
+ return rc;
+}
+
+extern int class_handle_ioctl(unsigned int cmd, unsigned long arg);
+
+int liblustre_ioctl(int dev_id, int opc, void *ptr)
+{
+ int rc = -EINVAL;
+
+ switch (dev_id) {
+ default:
+ fprintf(stderr, "Unexpected device id %d\n", dev_id);
+ abort();
+ break;
+
+ case OBD_DEV_ID:
+ rc = class_handle_ioctl(opc, (unsigned long)ptr);
+ break;
+ }
+
+ return rc;
+}
+
+static void generate_random_uuid(unsigned char uuid_out[16])
+{
+ int *arr = (int*)uuid_out;
+ int i;
+
+ for (i = 0; i < sizeof(uuid_out)/sizeof(int); i++)
+ arr[i] = rand();
+}
+
+static char *echo_server_nid = NULL;
+static char *echo_server_ostname = "obd1";
+static char *osc_dev_name = "OSC_DEV_NAME";
+static char *echo_dev_name = "ECHO_CLIENT_DEV_NAME";
+
+static int connect_echo_client(void)
+{
+ struct lustre_cfg lcfg;
+ ptl_nid_t nid;
+ char *peer = "ECHO_PEER_NID";
+ class_uuid_t osc_uuid, echo_uuid;
+ struct obd_uuid osc_uuid_str, echo_uuid_str;
+ int nal, err;
+ ENTRY;
+
+ generate_random_uuid(osc_uuid);
+ class_uuid_unparse(osc_uuid, &osc_uuid_str);
+ generate_random_uuid(echo_uuid);
+ class_uuid_unparse(echo_uuid, &echo_uuid_str);
+
+ if (ptl_parse_nid(&nid, echo_server_nid)) {
+ CERROR("Can't parse NID %s\n", echo_server_nid);
+ RETURN(-EINVAL);
+ }
+ nal = ptl_name2nal("tcp");
+ if (nal <= 0) {
+ CERROR("Can't parse NAL tcp\n");
+ RETURN(-EINVAL);
+ }
+
+ /* add uuid */
+ LCFG_INIT(lcfg, LCFG_ADD_UUID, NULL);
+ lcfg.lcfg_nid = nid;
+ lcfg.lcfg_inllen1 = strlen(peer) + 1;
+ lcfg.lcfg_inlbuf1 = peer;
+ lcfg.lcfg_nal = nal;
+ err = class_process_config(&lcfg);
+ if (err < 0) {
+ CERROR("failed add_uuid\n");
+ RETURN(-EINVAL);
+ }
+
+ /* attach osc */
+ LCFG_INIT(lcfg, LCFG_ATTACH, osc_dev_name);
+ lcfg.lcfg_inlbuf1 = "osc";
+ lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1;
+ lcfg.lcfg_inlbuf2 = osc_uuid_str.uuid;
+ lcfg.lcfg_inllen2 = strlen(lcfg.lcfg_inlbuf2) + 1;
+ err = class_process_config(&lcfg);
+ if (err < 0) {
+ CERROR("failed attach osc\n");
+ RETURN(-EINVAL);
+ }
+
+ /* setup osc */
+ LCFG_INIT(lcfg, LCFG_SETUP, osc_dev_name);
+ lcfg.lcfg_inlbuf1 = echo_server_ostname;
+ lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1;
+ lcfg.lcfg_inlbuf2 = peer;
+ lcfg.lcfg_inllen2 = strlen(lcfg.lcfg_inlbuf2) + 1;
+ err = class_process_config(&lcfg);
+ if (err < 0) {
+ CERROR("failed setup osc\n");
+ RETURN(-EINVAL);
+ }
+
+ /* attach echo_client */
+ LCFG_INIT(lcfg, LCFG_ATTACH, echo_dev_name);
+ lcfg.lcfg_inlbuf1 = "echo_client";
+ lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1;
+ lcfg.lcfg_inlbuf2 = echo_uuid_str.uuid;
+ lcfg.lcfg_inllen2 = strlen(lcfg.lcfg_inlbuf2) + 1;
+ err = class_process_config(&lcfg);
+ if (err < 0) {
+ CERROR("failed attach echo_client\n");
+ RETURN(-EINVAL);
+ }
+
+ /* setup echo_client */
+ LCFG_INIT(lcfg, LCFG_SETUP, echo_dev_name);
+ lcfg.lcfg_inlbuf1 = osc_dev_name;
+ lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1;
+ lcfg.lcfg_inlbuf2 = NULL;
+ lcfg.lcfg_inllen2 = 0;
+ err = class_process_config(&lcfg);
+ if (err < 0) {
+ CERROR("failed setup echo_client\n");
+ RETURN(-EINVAL);
+ }
+
+ RETURN(0);
+}
+
+static int disconnect_echo_client(void)
+{
+ struct lustre_cfg lcfg;
+ int err;
+ ENTRY;
+
+ /* cleanup echo_client */
+ LCFG_INIT(lcfg, LCFG_CLEANUP, echo_dev_name);
+ err = class_process_config(&lcfg);
+ if (err < 0) {
+ CERROR("failed cleanup echo_client\n");
+ RETURN(-EINVAL);
+ }
+
+ /* detach echo_client */
+ LCFG_INIT(lcfg, LCFG_DETACH, echo_dev_name);
+ err = class_process_config(&lcfg);
+ if (err < 0) {
+ CERROR("failed detach echo_client\n");
+ RETURN(-EINVAL);
+ }
+
+ /* cleanup osc */
+ LCFG_INIT(lcfg, LCFG_CLEANUP, osc_dev_name);
+ err = class_process_config(&lcfg);
+ if (err < 0) {
+ CERROR("failed cleanup osc device\n");
+ RETURN(-EINVAL);
+ }
+
+ /* detach osc */
+ LCFG_INIT(lcfg, LCFG_DETACH, osc_dev_name);
+ err = class_process_config(&lcfg);
+ if (err < 0) {
+ CERROR("failed detach osc device\n");
+ RETURN(-EINVAL);
+ }
+
+ RETURN(0);
+}
+
+static void usage(const char *s)
+{
+ printf("Usage: %s -s ost_host_name [-n ost_name]\n", s);
+ printf(" ost_host_name: the host name of echo server\n");
+ printf(" ost_name: ost name, default is \"obd1\"\n");
+}
+
+extern int time_ptlwait1;
+extern int time_ptlwait2;
+extern int time_ptlselect;
+
+int main(int argc, char **argv)
+{
+ int c, rc;
+
+ while ((c = getopt(argc, argv, "s:n:")) != -1) {
+ switch (c) {
+ case 's':
+ echo_server_nid = optarg;
+ break;
+ case 'n':
+ echo_server_ostname = optarg;
+ break;
+ default:
+ usage(argv[0]);
+ return 1;
+ }
+ }
+
+ if (optind != argc)
+ usage(argv[0]);
+
+ if (!echo_server_nid) {
+ usage(argv[0]);
+ return 1;
+ }
+
+ srand(time(NULL));
+
+ tcpnal_mynid = rand();
+#if 1
+ portal_debug = 0;
+ portal_subsystem_debug = 0;
+#endif
+
+ if (init_current(argc, argv) ||
+ init_obdclass() || init_lib_portals() ||
+ ptlrpc_init() ||
+ ldlm_init() ||
+ mdc_init() ||
+ lov_init() ||
+ osc_init() ||
+ echo_client_init()) {
+ printf("error\n");
+ return 1;
+ }
+
+ rc = connect_echo_client();
+ if (rc)
+ return rc;
+
+ set_ioc_handler(liblustre_ioctl);
+
+ rc = lctl_main(1, &argv[0]);
+
+ rc |= disconnect_echo_client();
+
+ return rc;
+}
exit(-1);
}
+ setenv(ENV_LUSTRE_TIMEOUT, "10", 1);
+
__liblustre_setup_();
while (drop_arr[drop_index].name) {
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Lustre Light user test program
+ *
+ * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define _BSD_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/queue.h>
+#include <signal.h>
+
+#include <sysio.h>
+#include <mount.h>
+
+#include "test_common.h"
+
+
+
+static char mds_server[1024] = {0,};
+static char barrier_script[1024] = {0,};
+static char failover_script[1024] = {0,};
+static char barrier_cmd[1024] = {0,};
+static char failover_cmd[1024] = {0,};
+
+static void replay_barrier()
+{
+ int rc;
+
+ if ((rc = system(barrier_cmd))) {
+ printf("excute barrier error: %d\n", rc);
+ exit(rc);
+ }
+}
+
+static void mds_failover()
+{
+ int rc;
+
+ if ((rc = system(failover_cmd))) {
+ printf("excute failover error: %d\n", rc);
+ exit(rc);
+ }
+}
+
+
+#define ENTRY(str) \
+ do { \
+ char buf[100]; \
+ int len; \
+ sprintf(buf, "===== START: %s ", (str)); \
+ len = strlen(buf); \
+ if (len < 79) { \
+ memset(buf+len, '=', 100-len); \
+ buf[79] = '\n'; \
+ buf[80] = 0; \
+ } \
+ printf("%s", buf); \
+ } while (0)
+
+#define LEAVE() \
+ do { \
+ printf("----- END TEST successfully ---"); \
+ printf("-----------------------------"); \
+ printf("-------------------\n"); \
+ } while (0)
+
+void t0()
+{
+ const int bufsize = 4096;
+ char *path = "/mnt/lustre/rp_ost_t0_file";
+ char buf[bufsize];
+ int fd, i, j, rc;
+ ENTRY("open-failover-write-verification (no ping involved)");
+
+ printf("create/open file...\n");
+ t_touch(path);
+ fd = t_open(path);
+ printf("OST failover...\n");
+ replay_barrier();
+ mds_failover();
+
+ printf("write file...\n");
+ for (i = 0; i < 20; i++) {
+ memset(buf, i, bufsize);
+ if ((rc = write(fd, buf, bufsize)) != bufsize) {
+ perror("write error after failover");
+ printf("i = %d, rc = %d\n", i, rc);
+ exit(-1);
+ }
+ }
+
+ /* verify */
+ printf("read & verify...\n");
+ lseek(fd, 0, SEEK_SET);
+ for (i = 0; i < 20; i++) {
+ memset(buf, -1, bufsize);
+ if ((rc = read(fd, buf, bufsize)) != bufsize) {
+ perror("read error rc");
+ printf("i = %d, rc = %d\n", i, rc);
+ exit(-1);
+ }
+ for (j = 0; j < bufsize; j++) {
+ if (buf[j] != i) {
+ printf("verify error!\n");
+ exit(-1);
+ }
+ }
+ }
+ t_close(fd);
+ t_unlink(path);
+ LEAVE();
+}
+
+void t1()
+{
+ const int bufsize = 4096;
+ char *path = "/mnt/lustre/rp_ost_t1_file";
+ char buf[bufsize];
+ int fd, i, j;
+ ENTRY("open-write-close-open-failover-read (no ping involved)");
+
+ printf("create/open file...\n");
+ t_touch(path);
+ fd = t_open(path);
+ printf("write file...\n");
+ for (i = 0; i < 20; i++) {
+ memset(buf, i, bufsize);
+ if (write(fd, buf, bufsize) != bufsize) {
+ perror("write error");
+ exit(-1);
+ }
+ }
+ printf("close/reopen...\n");
+ t_close(fd);
+ fd = t_open(path);
+ lseek(fd, 0, SEEK_SET);
+
+ printf("OST failover...\n");
+ replay_barrier();
+ mds_failover();
+
+ printf("read & verify...\n");
+ for (i = 0; i < 20; i++) {
+ memset(buf, -1, bufsize);
+ if (read(fd, buf, bufsize) != bufsize) {
+ perror("read error after failover");
+ exit(-1);
+ }
+ for (j = 0; j < bufsize; j++) {
+ if (buf[j] != i) {
+ printf("verify error after failover\n");
+ exit(-1);
+ }
+ }
+ }
+
+ t_close(fd);
+ t_unlink(path);
+ LEAVE();
+}
+
+void t2()
+{
+ char *path = "/mnt/lustre/rp_ost_t2_file";
+ char *str = "xxxxjoiwlsdf98lsjdfsjfoajflsjfajfoaidfojaj08eorje;";
+ ENTRY("empty replay");
+
+ replay_barrier();
+ mds_failover();
+
+ t_echo_create(path, str);
+ t_grep(path, str);
+ t_unlink(path);
+}
+
+void t3()
+{
+ char *path = "/mnt/lustre/rp_ost_t3_file";
+ char *str = "xxxxjoiwlsdf98lsjdfsjfoajflsjfajfoaidfojaj08eorje;";
+ ENTRY("touch");
+
+ printf("touch to create a file\n");
+ t_echo_create(path, str);
+ replay_barrier();
+ mds_failover();
+
+ printf("read & verify\n");
+ t_grep(path, str);
+ t_unlink(path);
+ /* XXX problems occur without this sleep; appears to be a server-side issue XXX */
+ sleep(5);
+}
+
+void t4()
+{
+ char *path = "/mnt/lustre/rp_ost_t4_file";
+ char namebuf[1024];
+ char str[1024];
+ int count = 10, i;
+ ENTRY("|X| 10 open(CREAT)s (ping involved)");
+
+ printf("create %d files\n", count);
+ for (i = 0; i < count; i++) {
+ sprintf(namebuf, "%s%02d", path, i);
+ sprintf(str, "%s-%08d-%08x-AAAAA", "content", i, i);
+ t_echo_create(namebuf, str);
+ }
+ replay_barrier();
+ mds_failover();
+
+ printf("read & verify\n");
+ for (i = 0; i < count; i++) {
+ sprintf(namebuf, "%s%02d", path, i);
+ sprintf(str, "%s-%08d-%08x-AAAAA", "content", i, i);
+ t_grep(namebuf, str);
+ t_unlink(namebuf);
+ }
+}
+
+extern int portal_debug;
+extern int portal_subsystem_debug;
+
+extern void __liblustre_setup_(void);
+extern void __liblustre_cleanup_(void);
+
+void usage(const char *cmd)
+{
+ printf("Usage: \t%s --target mdsnid:/mdsname/profile -s ost_hostname "
+ "-b \"barrier cmd\" -f \"failover cmd\"\n", cmd);
+ printf(" \t%s --dumpfile dumpfile -s ost_hostname -b \"barrier cmd\" "
+ "-f \"failover cmd\"\n", cmd);
+ exit(-1);
+}
+
+void test_ssh()
+{
+ char cmd[1024];
+
+ sprintf(cmd, "ssh %s cat /dev/null", mds_server);
+ if (system(cmd)) {
+ printf("ssh can't access server node: %s\n", mds_server);
+ exit(-1);
+ }
+}
+
+int main(int argc, char * const argv[])
+{
+ int opt_index, c;
+ static struct option long_opts[] = {
+ {"target", 1, 0, 0},
+ {"dumpfile", 1, 0, 0},
+ {0, 0, 0, 0}
+ };
+
+ if (argc < 4)
+ usage(argv[0]);
+
+ while ((c = getopt_long(argc, argv, "s:b:f:", long_opts, &opt_index)) != -1) {
+ switch (c) {
+ case 0: {
+ if (!optarg[0])
+ usage(argv[0]);
+
+ if (!strcmp(long_opts[opt_index].name, "target")) {
+ setenv(ENV_LUSTRE_MNTTGT, optarg, 1);
+ } else if (!strcmp(long_opts[opt_index].name, "dumpfile")) {
+ setenv(ENV_LUSTRE_DUMPFILE, optarg, 1);
+ } else
+ usage(argv[0]);
+ break;
+ }
+ case 's':
+ strcpy(mds_server, optarg);
+ break;
+ case 'b':
+ strcpy(barrier_script, optarg);
+ break;
+ case 'f':
+ strcpy(failover_script, optarg);
+ break;
+ default:
+ usage(argv[0]);
+ }
+ }
+
+ if (optind != argc)
+ usage(argv[0]);
+ if (!strlen(mds_server) || !strlen(barrier_script) ||
+ !strlen(failover_script))
+ usage(argv[0]);
+
+ test_ssh();
+
+ /* prepare remote command */
+ sprintf(barrier_cmd, "ssh %s \"%s\"", mds_server, barrier_script);
+ sprintf(failover_cmd, "ssh %s \"%s\"", mds_server, failover_script);
+
+ setenv(ENV_LUSTRE_TIMEOUT, "5", 1);
+
+ __liblustre_setup_();
+
+ t0();
+ t1();
+ t2();
+ t3();
+ t4();
+
+ printf("liblustre is about shutdown\n");
+ __liblustre_cleanup_();
+
+ printf("complete successfully\n");
+ return 0;
+}
sprintf(barrier_cmd, "ssh %s \"%s\"", mds_server, barrier_script);
sprintf(failover_cmd, "ssh %s \"%s\"", mds_server, failover_script);
+ setenv(ENV_LUSTRE_TIMEOUT, "10", 1);
+
__liblustre_setup_();
t0();
void t5()
{
char text[256];
- loff_t off_array[] = {1, 17, 255, 257, 4095, 4097, 8191, 1024*1024*1024};
+ loff_t off_array[] = {1, 4, 17, 255, 258, 4095, 4097, 8191, 1024*1024*1024};
int np = 1, i;
loff_t offset = 0;
LEAVE();
}
-void t100()
+void t11()
{
char *base="/mnt/lustre";
char path[4096], path2[4096];
LEAVE();
}
+void t12()
+{
+ char *dir="/mnt/lustre/test_t12_dir";
+ char buf[1024*128];
+ int fd;
+ ENTRY("empty directory readdir");
+
+ t_mkdir(dir);
+ fd = t_open(dir);
+ t_ls(fd, buf, sizeof(buf));
+ t_close(fd);
+ t_rmdir(dir);
+ LEAVE();
+}
+
+void t13()
+{
+ char *dir="/mnt/lustre/test_t13_dir/";
+ char name[1024];
+ char buf[1024];
+ const int nfiles = 20;
+ char *prefix = "test13_filename_prefix_";
+ int fd, i;
+ ENTRY("multiple entries directory readdir");
+
+ t_mkdir(dir);
+ printf("Creating %d files...\n", nfiles);
+ for (i = 0; i < nfiles; i++) {
+ sprintf(name, "%s%s%05d", dir, prefix, i);
+ t_touch(name);
+ }
+ fd = t_open(dir);
+ t_ls(fd, buf, sizeof(buf));
+ t_close(fd);
+ printf("Cleanup...\n");
+ for (i = 0; i < nfiles; i++) {
+ sprintf(name, "%s%s%05d", dir, prefix, i);
+ t_unlink(name);
+ }
+ t_rmdir(dir);
+ LEAVE();
+}
+
+void t14()
+{
+ char *dir="/mnt/lustre/test_t14_dir/";
+ char name[1024];
+ char buf[1024];
+ const int nfiles = 256;
+ char *prefix = "test14_filename_long_prefix_AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA___";
+ int fd, i;
+ ENTRY(">1 block(4k) directory readdir");
+
+ t_mkdir(dir);
+ printf("Creating %d files...\n", nfiles);
+ for (i = 0; i < nfiles; i++) {
+ sprintf(name, "%s%s%05d", dir, prefix, i);
+ t_touch(name);
+ }
+ fd = t_open(dir);
+ t_ls(fd, buf, sizeof(buf));
+ t_close(fd);
+ printf("Cleanup...\n");
+ for (i = 0; i < nfiles; i++) {
+ sprintf(name, "%s%s%05d", dir, prefix, i);
+ t_unlink(name);
+ }
+ t_rmdir(dir);
+ LEAVE();
+}
+
+void t15()
+{
+ char *file = "/mnt/lustre/test_t15_file";
+ int fd;
+ ENTRY("open-stat-close");
+
+ t_touch(file);
+ fd = t_open(file);
+ t_check_stat(file, NULL);
+ t_close(fd);
+ t_unlink(file);
+ LEAVE();
+}
+
extern void __liblustre_setup_(void);
extern void __liblustre_cleanup_(void);
while ((c = getopt_long(argc, argv, "", long_opts, &opt_index)) != -1) {
switch (c) {
case 0: {
- printf("optindex %d\n", opt_index);
if (!optarg[0])
usage(argv[0]);
t8();
t9();
t10();
-
- t100();
+ t11();
+ t12();
+ t13();
+ t14();
+ t15();
#endif
printf("liblustre is about shutdown\n");
#include <fcntl.h>
#include <string.h>
#include <errno.h>
+#include <dirent.h>
#include "test_common.h"
}
}
-void _t_grep(const char *path, char *str, int should_contain)
+static void _t_grep(const char *path, char *str, int should_contain)
{
char buf[1024];
int fd;
{
_t_grep(path, str, 0);
}
+
+void t_ls(int fd, char *buf, int size)
+{
+ struct dirent64 *ent;
+ int rc, pos;
+ loff_t base = 0;
+
+ printf("dir entries listing...\n");
+ while ((rc = getdirentries64(fd, buf, size, &base)) > 0) {
+ pos = 0;
+ while (pos < rc) {
+ ent = (struct dirent64 *) ((char*) buf + pos);
+ printf("%s\n", ent->d_name);
+ pos += ent->d_reclen;
+ }
+ }
+
+ if (rc < 0) {
+ printf("getdents error %d\n", rc);
+ EXIT(-1);
+ }
+}
#define ENV_LUSTRE_MNTPNT "LIBLUSTRE_MOUNT_POINT"
#define ENV_LUSTRE_MNTTGT "LIBLUSTRE_MOUNT_TARGET"
+#define ENV_LUSTRE_TIMEOUT "LIBLUSTRE_TIMEOUT"
#define ENV_LUSTRE_DUMPFILE "LIBLUSTRE_DUMPFILE"
extern int exit_on_err;
int t_check_stat(const char *name, struct stat *buf);
int t_check_stat_fail(const char *name);
void t_echo_create(const char *path, const char *str);
-//int t_pread_once(const char *path, char *buf, size_t size, off_t offset);
void t_grep(const char *path, char *str);
void t_grep_v(const char *path, char *str);
+void t_ls(int fd, char *buf, int size);
#endif
cfg.cfg_instance = sbi->ll_instance;
cfg.cfg_uuid = sbi->ll_sb_uuid;
cfg.cfg_local_nid = lmd->lmd_local_nid;
- err = lustre_process_log(lmd, lmd->lmd_profile, &cfg, 1);
+ err = lustre_process_log(lmd, lmd->lmd_profile, &cfg, 0);
if (err < 0) {
CERROR("Unable to process log: %s\n", lmd->lmd_profile);
DEFS=
if LIBLUSTRE
-lib_LIBRARIES = liblov.a
+noinst_LIBRARIES = liblov.a
liblov_a_SOURCES = lov_log.c lov_obd.c lov_pack.c lov_internal.h
+liblov_a_CFLAGS = -fPIC
else
MODULE = lov
modulefs_DATA = lov.o
if LIBLUSTRE
-lib_LIBRARIES = liblvfs.a
+noinst_LIBRARIES = liblvfs.a
liblvfs_a_SOURCES = lvfs_userfs.c
+liblvfs_a_CFLAGS = -fPIC
#if MYSQL
#liblvfs_a_SOURCES += lvfs_user_mysql.c
DEFS=
if LIBLUSTRE
-lib_LIBRARIES = libmdc.a
+noinst_LIBRARIES = libmdc.a
libmdc_a_SOURCES = mdc_request.c mdc_reint.c mdc_lib.c mdc_internal.h mdc_locks.c
+libmdc_a_CFLAGS = -fPIC
else
MODULE = mdc
modulefs_DATA = mdc.o
/* XXX FIXME bug 249 */
req->rq_request_portal = MDS_READPAGE_PORTAL;
- desc = ptlrpc_prep_bulk_imp(req, BULK_PUT_SINK, MDS_BULK_PORTAL);
+ desc = ptlrpc_prep_bulk_imp(req, 1, BULK_PUT_SINK, MDS_BULK_PORTAL);
if (desc == NULL)
GOTO(out, rc = -ENOMEM);
/* NB req now owns desc and will free it when it gets freed */
- rc = ptlrpc_prep_bulk_page(desc, page, 0, PAGE_CACHE_SIZE);
- if (rc != 0)
- GOTO(out, rc);
+ ptlrpc_prep_bulk_page(desc, page, 0, PAGE_CACHE_SIZE);
mdc_readdir_pack(req, offset, PAGE_CACHE_SIZE, mdc_fid);
rc = ptlrpc_queue_wait(req);
if (rc == 0) {
- LASSERT(desc->bd_page_count == 1);
- body = lustre_swab_repbuf(req, 0, sizeof(*body),
+ body = lustre_swab_repbuf(req, 0, sizeof (*body),
lustre_swab_mds_body);
if (body == NULL) {
CERROR("Can't unpack mds_body\n");
GOTO(out, rc = -EPROTO);
}
+
+ if (req->rq_bulk->bd_nob_transferred != PAGE_CACHE_SIZE) {
+ CERROR ("Unexpected # bytes transferred: %d"
+ " (%ld expected)\n",
+ req->rq_bulk->bd_nob_transferred,
+ PAGE_CACHE_SIZE);
+ GOTO (out, rc = -EPROTO);
+ }
}
EXIT;
static int mds_postsetup(struct obd_device *obd);
static int mds_cleanup(struct obd_device *obd, int flags);
-static int mds_bulk_timeout(void *data)
-{
- struct ptlrpc_bulk_desc *desc = data;
- struct obd_export *exp = desc->bd_export;
-
- DEBUG_REQ(D_ERROR, desc->bd_req,"bulk send timed out: evicting %s@%s\n",
- exp->exp_client_uuid.uuid,
- exp->exp_connection->c_remote_uuid.uuid);
- ptlrpc_fail_export(exp);
- ptlrpc_abort_bulk (desc);
- RETURN(1);
-}
-
/* Assumes caller has already pushed into the kernel filesystem context */
static int mds_sendpage(struct ptlrpc_request *req, struct file *file,
loff_t offset, int count)
if (!pages)
GOTO(out, rc = -ENOMEM);
- desc = ptlrpc_prep_bulk_exp (req, BULK_PUT_SOURCE, MDS_BULK_PORTAL);
+ desc = ptlrpc_prep_bulk_exp (req, 1, BULK_PUT_SOURCE, MDS_BULK_PORTAL);
if (desc == NULL)
GOTO(out_free, rc = -ENOMEM);
if (pages[i] == NULL)
GOTO(cleanup_buf, rc = -ENOMEM);
- rc = ptlrpc_prep_bulk_page(desc, pages[i], 0, tmpsize);
- if (rc != 0)
- GOTO(cleanup_buf, rc);
+ ptlrpc_prep_bulk_page(desc, pages[i], 0, tmpsize);
}
for (i = 0, tmpcount = count; i < npages; i++, tmpcount -= tmpsize) {
GOTO(cleanup_buf, rc = -EIO);
}
- rc = ptlrpc_bulk_put(desc);
+ LASSERT(desc->bd_nob == count);
+
+ rc = ptlrpc_start_bulk_transfer(desc);
if (rc)
GOTO(cleanup_buf, rc);
if (OBD_FAIL_CHECK(OBD_FAIL_MDS_SENDPAGE)) {
CERROR("obd_fail_loc=%x, fail operation rc=%d\n",
OBD_FAIL_MDS_SENDPAGE, rc);
- ptlrpc_abort_bulk(desc);
- GOTO(cleanup_buf, rc);
+ GOTO(abort_bulk, rc);
}
- lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, mds_bulk_timeout, desc);
- rc = l_wait_event(desc->bd_waitq, ptlrpc_bulk_complete (desc), &lwi);
- if (rc) {
- LASSERT (rc == -ETIMEDOUT);
- GOTO(cleanup_buf, rc);
+ lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, NULL, NULL);
+ rc = l_wait_event(desc->bd_waitq, !ptlrpc_bulk_active(desc), &lwi);
+ LASSERT (rc == 0 || rc == -ETIMEDOUT);
+
+ if (rc == 0) {
+ if (desc->bd_success &&
+ desc->bd_nob_transferred == count)
+ GOTO(cleanup_buf, rc);
+
+ rc = -ETIMEDOUT; /* XXX should this be a different errno? */
}
+
+ DEBUG_REQ(D_ERROR, req, "bulk failed: %s %d(%d), evicting %s@%s\n",
+ (rc == -ETIMEDOUT) ? "timeout" : "network error",
+ desc->bd_nob_transferred, count,
+ req->rq_export->exp_client_uuid.uuid,
+ req->rq_export->exp_connection->c_remote_uuid.uuid);
+
+ ptlrpc_fail_export(req->rq_export);
EXIT;
+ abort_bulk:
+ ptlrpc_abort_bulk (desc);
cleanup_buf:
for (i = 0; i < npages; i++)
if (pages[i])
ldlm_cancel_locks_for_export(export);
+ /* complete all outstanding replies */
+ spin_lock_irqsave (&export->exp_lock, irqflags);
+ while (!list_empty (&export->exp_outstanding_replies)) {
+ struct ptlrpc_reply_state *rs =
+ list_entry (export->exp_outstanding_replies.next,
+ struct ptlrpc_reply_state, rs_exp_list);
+ struct ptlrpc_service *svc = rs->rs_srv_ni->sni_service;
+
+ spin_lock (&svc->srv_lock);
+ list_del_init (&rs->rs_exp_list);
+ ptlrpc_schedule_difficult_reply (rs);
+ spin_unlock (&svc->srv_lock);
+ }
+ spin_unlock_irqrestore (&export->exp_lock, irqflags);
+
spin_lock_irqsave(&export->exp_lock, irqflags);
export->exp_flags = flags;
spin_unlock_irqrestore(&export->exp_lock, irqflags);
OBD_FAIL_RETURN(OBD_FAIL_MDS_READPAGE_NET, 0);
rc = mds_readpage(req);
- OBD_FAIL_RETURN(OBD_FAIL_MDS_SENDPAGE, 0);
+ if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_MDS_SENDPAGE)) {
+ if (req->rq_reply_state) {
+ lustre_free_reply_state (req->rq_reply_state);
+ req->rq_reply_state = NULL;
+ }
+ RETURN(0);
+ }
break;
int rc = 0;
ENTRY;
- mds->mds_service = ptlrpc_init_svc(MDS_NEVENTS, MDS_NBUFS,
- MDS_BUFSIZE, MDS_MAXREQSIZE,
- MDS_REQUEST_PORTAL, MDC_REPLY_PORTAL,
- mds_handle, "mds",
- obddev->obd_proc_entry);
+ mds->mds_service =
+ ptlrpc_init_svc(MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE,
+ MDS_REQUEST_PORTAL, MDC_REPLY_PORTAL,
+ mds_handle, "mds",
+ obddev->obd_proc_entry);
if (!mds->mds_service) {
CERROR("failed to start service\n");
GOTO(err_thread, rc);
mds->mds_setattr_service =
- ptlrpc_init_svc(MDS_NEVENTS, MDS_NBUFS,
- MDS_BUFSIZE, MDS_MAXREQSIZE,
+ ptlrpc_init_svc(MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE,
MDS_SETATTR_PORTAL, MDC_REPLY_PORTAL,
mds_handle, "mds_setattr",
obddev->obd_proc_entry);
GOTO(err_thread2, rc);
mds->mds_readpage_service =
- ptlrpc_init_svc(MDS_NEVENTS, MDS_NBUFS,
- MDS_BUFSIZE, MDS_MAXREQSIZE,
+ ptlrpc_init_svc(MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE,
MDS_READPAGE_PORTAL, MDC_REPLY_PORTAL,
mds_handle, "mds_readpage",
obddev->obd_proc_entry);
/* mds/mds_log.c */
int mds_log_op_unlink(struct obd_device *obd, struct inode *inode,
- struct lustre_msg *repmsg, int offset);
+ struct lov_mds_md *lmm, int lmm_size,
+ struct llog_cookie *logcookies, int cookies_size);
int mds_llog_init(struct obd_device *obd, struct obd_device *tgt, int count,
struct llog_logid *logid);
int mds_llog_finish(struct obd_device *obd, int count);
}
int mds_log_op_unlink(struct obd_device *obd, struct inode *inode,
- struct lustre_msg *repmsg, int offset)
+ struct lov_mds_md *lmm, int lmm_size,
+ struct llog_cookie *logcookies, int cookies_size)
{
struct mds_obd *mds = &obd->u.mds;
struct lov_stripe_md *lsm = NULL;
RETURN(PTR_ERR(mds->mds_osc_obd));
rc = obd_unpackmd(mds->mds_osc_exp, &lsm,
- lustre_msg_buf(repmsg, offset, 0),
- repmsg->buflens[offset]);
+ lmm, lmm_size);
if (rc < 0)
RETURN(rc);
ctxt = llog_get_context(obd, LLOG_UNLINK_ORIG_CTXT);
- rc = llog_add(ctxt, NULL, lsm, lustre_msg_buf(repmsg, offset + 1, 0),
- repmsg->buflens[offset + 1] / sizeof(struct llog_cookie));
+ rc = llog_add(ctxt, NULL, lsm, logcookies,
+ cookies_size / sizeof(struct llog_cookie));
obd_free_memmd(mds->mds_osc_exp, &lsm);
struct obd_device *lov_obd = obd->u.mds.mds_osc_obd;
int rc;
ENTRY;
-
+
rc = llog_setup(obd, LLOG_UNLINK_ORIG_CTXT, tgt, 0, NULL,
&mds_unlink_orig_logops);
if (rc)
if (rc)
RETURN(rc);
- rc = obd_llog_init(lov_obd, tgt, count, logid);
- if (rc)
- CERROR("error lov_llog_init\n");
+ rc = obd_llog_init(lov_obd, tgt, count, logid);
+ if (rc)
+ CERROR("error lov_llog_init\n");
RETURN(rc);
}
struct obd_device *lov_obd = obd->u.mds.mds_osc_obd;
int rc;
ENTRY;
-
+
rc = llog_cleanup(llog_get_context(obd, LLOG_UNLINK_ORIG_CTXT));
if (rc)
RETURN(rc);
if (rc)
RETURN(rc);
- rc = obd_llog_finish(lov_obd, count);
- if (rc)
- CERROR("error lov_llog_finish\n");
+ rc = obd_llog_finish(lov_obd, count);
+ if (rc)
+ CERROR("error lov_llog_finish\n");
RETURN(rc);
}
struct ptlrpc_request *req,
struct lustre_handle *child_lockh)
{
- struct ptlrpc_request *oldreq = req->rq_export->exp_outstanding_reply;
struct mds_export_data *med = &req->rq_export->exp_mds_data;
struct mds_client_data *mcd = med->med_mcd;
struct mds_obd *mds = mds_req2mds(req);
mfd = NULL;
}
- if (oldreq != NULL) {
- /* if we're not recovering, it had better be found */
- LASSERT(mfd != NULL);
- } else if (mfd == NULL) {
+#warning "XXX fixme"
+ /* This code used to LASSERT(mfd != NULL) whenever exp_outstanding_reply
+ * was non-NULL. Now that exp_outstanding_replies is a list, we simply
+ * use mfd == NULL to detect a re-open */
+ if (mfd == NULL) {
mntget(mds->mds_vfsmnt);
CERROR("Re-opened file \n");
mfd = mds_dentry_open(child, mds->mds_vfsmnt,
if (rc)
ldlm_lock_decref(&parent_lockh, parent_mode);
else
- ldlm_put_lock_into_req(req, &parent_lockh, parent_mode);
+ ptlrpc_save_lock (req, &parent_lockh, parent_mode);
}
if (rc == 0)
atomic_inc(&mds->mds_open_count);
if (req != NULL &&
(reply_body->valid & OBD_MD_FLEASIZE) &&
mds_log_op_unlink(obd, pending_child->d_inode,
- req->rq_repmsg, 1) > 0) {
+ lustre_msg_buf(req->rq_repmsg, 1, 0),
+ req->rq_repmsg->buflens[1],
+ lustre_msg_buf(req->rq_repmsg, 2, 0),
+ req->rq_repmsg->buflens[2]) > 0) {
reply_body->valid |= OBD_MD_FLCOOKIE;
}
RETURN(0);
}
-void mds_steal_ack_locks(struct obd_export *exp, struct ptlrpc_request *req)
+void mds_steal_ack_locks(struct ptlrpc_request *req)
{
- unsigned long flags;
- struct ptlrpc_request *oldrep = exp->exp_outstanding_reply;
-
- if (oldrep == NULL)
+ struct obd_export *exp = req->rq_export;
+ struct list_head *tmp;
+ struct ptlrpc_reply_state *oldrep;
+ struct ptlrpc_service *svc;
+ unsigned long flags;
+ int i;
+
+ /* CAVEAT EMPTOR: spinlock order */
+ spin_lock_irqsave (&exp->exp_lock, flags);
+ list_for_each (tmp, &exp->exp_outstanding_replies) {
+ oldrep = list_entry(tmp, struct ptlrpc_reply_state,rs_exp_list);
+
+ if (oldrep->rs_xid != req->rq_xid)
+ continue;
+
+ if (oldrep->rs_msg.opc != req->rq_reqmsg->opc)
+ CERROR ("Resent req xid "LPX64" has mismatched opc: "
+ "new %d old %d\n", req->rq_xid,
+ req->rq_reqmsg->opc, oldrep->rs_msg.opc);
+
+ svc = oldrep->rs_srv_ni->sni_service;
+ spin_lock (&svc->srv_lock);
+
+ list_del_init (&oldrep->rs_exp_list);
+
+ CWARN("Stealing %d locks from rs %p x"LPD64".t"LPD64
+ " o%d NID"LPX64"\n",
+ oldrep->rs_nlocks, oldrep,
+ oldrep->rs_xid, oldrep->rs_transno, oldrep->rs_msg.opc,
+ exp->exp_connection->c_peer.peer_nid);
+
+ for (i = 0; i < oldrep->rs_nlocks; i++)
+ ptlrpc_save_lock(req,
+ &oldrep->rs_locks[i],
+ oldrep->rs_modes[i]);
+ oldrep->rs_nlocks = 0;
+
+ DEBUG_REQ(D_HA, req, "stole locks for");
+ ptlrpc_schedule_difficult_reply (oldrep);
+
+ spin_unlock (&svc->srv_lock);
+ spin_unlock_irqrestore (&exp->exp_lock, flags);
return;
- memcpy(req->rq_ack_locks, oldrep->rq_ack_locks,
- sizeof req->rq_ack_locks);
- spin_lock_irqsave(&req->rq_lock, flags);
- oldrep->rq_resent = 1;
- wake_up(&oldrep->rq_reply_waitq);
- spin_unlock_irqrestore(&req->rq_lock, flags);
- DEBUG_REQ(D_HA, oldrep, "stole locks from");
- DEBUG_REQ(D_HA, req, "stole locks for");
+ }
+ spin_unlock_irqrestore (&exp->exp_lock, flags);
}
void mds_req_from_mcd(struct ptlrpc_request *req, struct mds_client_data *mcd)
req->rq_repmsg->transno = req->rq_transno = mcd->mcd_last_transno;
req->rq_repmsg->status = req->rq_status = mcd->mcd_last_result;
- if (req->rq_export->exp_outstanding_reply)
- mds_steal_ack_locks(req->rq_export, req);
+ mds_steal_ack_locks(req);
}
static void reconstruct_reint_setattr(struct mds_update_record *rec,
if (rc) {
ldlm_lock_decref(&lockh, LCK_PW);
} else {
- ldlm_put_lock_into_req(req, &lockh, LCK_PW);
+ ptlrpc_save_lock (req, &lockh, LCK_PW);
}
}
case 0:
if (rc) {
ldlm_lock_decref(&lockh, LCK_PW);
} else {
- ldlm_put_lock_into_req(req, &lockh, LCK_PW);
+ ptlrpc_save_lock (req, &lockh, LCK_PW);
}
l_dput(dparent);
case 0:
rc = vfs_unlink(dparent->d_inode, dchild);
if (!rc && log_unlink)
- if (mds_log_op_unlink(obd, child_inode, req->rq_repmsg,
- offset + 1) > 0)
+ if (mds_log_op_unlink(obd, child_inode,
+ lustre_msg_buf(req->rq_repmsg, offset + 1, 0),
+ req->rq_repmsg->buflens[offset + 1],
+ lustre_msg_buf(req->rq_repmsg, offset + 2, 0),
+ req->rq_repmsg->buflens[offset + 2]) > 0)
body->valid |= OBD_MD_FLCOOKIE;
break;
}
if (rc)
ldlm_lock_decref(&child_reuse_lockh, LCK_EX);
else
- ldlm_put_lock_into_req(req, &child_reuse_lockh, LCK_EX);
+ ptlrpc_save_lock(req, &child_reuse_lockh, LCK_EX);
case 2: /* child lock */
ldlm_lock_decref(&child_lockh, LCK_EX);
case 1: /* child and parent dentry, parent lock */
if (rc)
ldlm_lock_decref(&parent_lockh, LCK_PW);
else
- ldlm_put_lock_into_req(req, &parent_lockh, LCK_PW);
+ ptlrpc_save_lock(req, &parent_lockh, LCK_PW);
l_dput(dchild);
l_dput(dparent);
case 0:
ldlm_lock_decref(&src_lockh, LCK_EX);
ldlm_lock_decref(&tgt_dir_lockh, LCK_EX);
} else {
- ldlm_put_lock_into_req(req, &src_lockh, LCK_EX);
- ldlm_put_lock_into_req(req, &tgt_dir_lockh, LCK_EX);
+ ptlrpc_save_lock(req, &src_lockh, LCK_EX);
+ ptlrpc_save_lock(req, &tgt_dir_lockh, LCK_EX);
}
case 2: /* target dentry */
l_dput(de_tgt_dir);
ldlm_lock_decref(&(dlm_handles[0]), LCK_PW);
} else {
if (lock_count == 4)
- ldlm_put_lock_into_req(req,
- &(dlm_handles[3]), LCK_EX);
- ldlm_put_lock_into_req(req, &(dlm_handles[2]), LCK_EX);
- ldlm_put_lock_into_req(req, &(dlm_handles[1]), LCK_PW);
- ldlm_put_lock_into_req(req, &(dlm_handles[0]), LCK_PW);
+ ptlrpc_save_lock(req,
+ &(dlm_handles[3]), LCK_EX);
+ ptlrpc_save_lock(req, &(dlm_handles[2]), LCK_EX);
+ ptlrpc_save_lock(req, &(dlm_handles[1]), LCK_PW);
+ ptlrpc_save_lock(req, &(dlm_handles[0]), LCK_PW);
}
l_dput(de_new);
l_dput(de_old);
}
static int mds_osc_destroy_orphan(struct mds_obd *mds,
- struct ptlrpc_request *request)
+ struct inode *inode,
+ struct lov_mds_md *lmm,
+ int lmm_size,
+ struct llog_cookie *logcookies,
+ int log_unlink)
{
- struct mds_body *body;
- struct lov_mds_md *lmm = NULL;
struct lov_stripe_md *lsm = NULL;
struct obd_trans_info oti = { 0 };
struct obdo *oa;
int rc;
ENTRY;
- body = lustre_msg_buf(request->rq_repmsg, 0, sizeof(*body));
- if (!(body->valid & OBD_MD_FLEASIZE))
+ if (lmm_size == 0)
RETURN(0);
- if (body->eadatasize == 0) {
- CERROR("OBD_MD_FLEASIZE set but eadatasize zero\n");
- RETURN(rc = -EPROTO);
- }
- lmm = lustre_msg_buf(request->rq_repmsg, 1, body->eadatasize);
- LASSERT(lmm != NULL);
-
- rc = obd_unpackmd(mds->mds_osc_exp, &lsm, lmm, body->eadatasize);
+ rc = obd_unpackmd(mds->mds_osc_exp, &lsm, lmm, lmm_size);
if (rc < 0) {
CERROR("Error unpack md %p\n", lmm);
RETURN(rc);
if (oa == NULL)
GOTO(out_free_memmd, rc = -ENOMEM);
oa->o_id = lsm->lsm_object_id;
- oa->o_mode = body->mode & S_IFMT;
+ oa->o_mode = inode->i_mode & S_IFMT;
oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
- if (body->valid & OBD_MD_FLCOOKIE) {
+ if (log_unlink && logcookies) {
oa->o_valid |= OBD_MD_FLCOOKIE;
- oti.oti_logcookies =
- lustre_msg_buf(request->rq_repmsg, 2,
- sizeof(struct llog_cookie) *
- lsm->lsm_stripe_count);
- if (oti.oti_logcookies == NULL)
- oa->o_valid &= ~OBD_MD_FLCOOKIE;
- body->valid &= ~OBD_MD_FLCOOKIE;
+ oti.oti_logcookies = logcookies;
}
rc = obd_destroy(mds->mds_osc_exp, oa, lsm, &oti);
struct inode *inode, struct inode *pending_dir)
{
struct mds_obd *mds = &obd->u.mds;
- struct mds_body *body;
+ struct lov_mds_md *lmm = NULL;
+ struct llog_cookie *logcookies = NULL;
+ int lmm_size = 0, log_unlink = 0;
void *handle = NULL;
- struct ptlrpc_request *req;
- int lengths[3] = {sizeof(struct mds_body),
- mds->mds_max_mdsize,
- mds->mds_max_cookiesize};
- int rc;
+ int rc, err;
ENTRY;
LASSERT(mds->mds_osc_obd != NULL);
- OBD_ALLOC(req, sizeof(*req));
- if (!req) {
- CERROR("request allocation out of memory\n");
- GOTO(err_alloc_req, rc = -ENOMEM);
- }
- rc = lustre_pack_reply(req, 3, lengths, NULL);
- if (rc) {
- CERROR("cannot pack request %d\n", rc);
- GOTO(out_free_req, rc);
- }
- body = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*body));
- LASSERT(body != NULL);
- mds_pack_inode2body(body, inode);
- mds_pack_md(obd, req->rq_repmsg, 1, body, inode, 1);
+ OBD_ALLOC(lmm, mds->mds_max_mdsize);
+ if (lmm == NULL)
+ RETURN(-ENOMEM);
+
+ down(&inode->i_sem);
+ rc = fsfilt_get_md(obd, inode, lmm, mds->mds_max_mdsize);
+ up(&inode->i_sem);
+
+ if (rc < 0) {
+ CERROR("Error %d reading eadata for ino %lu\n",
+ rc, inode->i_ino);
+ GOTO(out_free_lmm, rc);
+ } else if (rc > 0) {
+ lmm_size = rc;
+ rc = mds_convert_lov_ea(obd, inode, lmm, lmm_size);
+ if (rc > 0)
+ lmm_size = rc;
+ rc = 0;
+ }
handle = fsfilt_start(obd, pending_dir, FSFILT_OP_UNLINK_LOG, NULL);
if (IS_ERR(handle)) {
rc = PTR_ERR(handle);
CERROR("error fsfilt_start: %d\n", rc);
handle = NULL;
- GOTO(out_free_msg, rc);
+ GOTO(out_free_lmm, rc);
}
- if (S_ISDIR(inode->i_mode)) {
+ down(&inode->i_sem);
+ rc = fsfilt_get_md(obd, inode, lmm, mds->mds_max_mdsize);
+ up(&inode->i_sem);
+
+ if (rc < 0) {
+ CERROR("Error %d reading eadata for ino %lu\n",
+ rc, inode->i_ino);
+ GOTO(out_free_lmm, rc);
+ } else if (rc > 0) {
+ lmm_size = rc;
+ rc = 0;
+ }
+
+ if (S_ISDIR(inode->i_mode))
rc = vfs_rmdir(pending_dir, dchild);
- } else {
+ else
rc = vfs_unlink(pending_dir, dchild);
- }
+
if (rc)
CERROR("error %d unlinking orphan %*s from PENDING directory\n",
rc, dchild->d_name.len, dchild->d_name.name);
- if ((body->valid & OBD_MD_FLEASIZE)) {
- if (mds_log_op_unlink(obd, inode, req->rq_repmsg, 1) > 0)
- body->valid |= OBD_MD_FLCOOKIE;
+ if (!rc && lmm_size) {
+ OBD_ALLOC(logcookies, mds->mds_max_cookiesize);
+ if (logcookies == NULL)
+ rc = -ENOMEM;
+ else if (mds_log_op_unlink(obd, inode, lmm,lmm_size,logcookies,
+ mds->mds_max_cookiesize) > 0)
+ log_unlink = 1;
}
-
- if (handle) {
- int err = fsfilt_commit(obd, pending_dir, handle, 0);
- if (err) {
- CERROR("error committing orphan unlink: %d\n", err);
+ err = fsfilt_commit(obd, pending_dir, handle, 0);
+ if (err) {
+ CERROR("error committing orphan unlink: %d\n", err);
+ if (!rc)
rc = err;
- GOTO(out_free_msg, rc);
- }
}
- rc = mds_osc_destroy_orphan(mds, req);
-out_free_msg:
- OBD_FREE(req->rq_repmsg, req->rq_replen);
- req->rq_repmsg = NULL;
-out_free_req:
- OBD_FREE(req, sizeof(*req));
-err_alloc_req:
+ if (!rc) {
+ rc = mds_osc_destroy_orphan(mds, inode, lmm, lmm_size,
+ logcookies, log_unlink);
+ }
+
+ if (logcookies != NULL)
+ OBD_FREE(logcookies, mds->mds_max_cookiesize);
+out_free_lmm:
+ OBD_FREE(lmm, mds->mds_max_mdsize);
RETURN(rc);
}
#include <linux/obd_class.h>
#include <linux/lustre_net.h>
-#define MGMT_NEVENTS 1024UL
#define MGMT_NBUFS 128UL
#define MGMT_BUFSIZE 8192
#define MGMT_MAXREQSIZE 512
if (mgmt_initialized)
RETURN(-EALREADY);
- mgmt_service = ptlrpc_init_svc(MGMT_NEVENTS, MGMT_NBUFS, MGMT_BUFSIZE,
- MGMT_MAXREQSIZE, MGMT_REQUEST_PORTAL,
- MGMT_REPLY_PORTAL, mgmt_handler,
- "mgmt", obd->obd_proc_entry);
+ mgmt_service =
+ ptlrpc_init_svc(MGMT_NBUFS, MGMT_BUFSIZE, MGMT_MAXREQSIZE,
+ MGMT_REQUEST_PORTAL, MGMT_REPLY_PORTAL,
+ mgmt_handler, "mgmt",
+ obd->obd_proc_entry);
if (!mgmt_service) {
CERROR("Failed to start mgmt service\n");
RETURN(-ENOMEM);
DEFS=
MODULE = obdclass
-class_obd.o: lustre_build_version
-
if LIBLUSTRE
-lib_LIBRARIES = liblustreclass.a
+
+noinst_LIBRARIES = liblustreclass.a
liblustreclass_a_SOURCES = class_obd.c debug.c genops.c statfs_pack.c uuid.c
liblustreclass_a_SOURCES += lustre_handles.c lustre_peer.c lprocfs_status.c
liblustreclass_a_SOURCES += obdo.c obd_config.c llog.c llog_obd.c llog_cat.c
liblustreclass_a_SOURCES += llog_lvfs.c #llog_ioctl.c rbtree.c
+liblustreclass_a_CFLAGS = -fPIC
+
+class_obd.c: lustre_build_version
lustre_build_version:
- echo '#define LUSTRE_VERSION 31' > $(top_builddir)/include/linux/lustre_build_version.h
+ echo '#define LUSTRE_VERSION 32' > $(top_builddir)/include/linux/lustre_build_version.h
echo '#define BUILD_VERSION "1"' >> $(top_builddir)/include/linux/lustre_build_version.h
else
+
+class_obd.o: lustre_build_version
+
modulefs_DATA = lustre_build_version obdclass.o llog_test.o
EXTRA_PROGRAMS = obdclass llog_test
#include <portals/list.h>
#include "llog_internal.h"
+#ifndef __KERNEL__
+/* liblustre workaround */
+atomic_t portal_kmemory = {0};
+#endif
+
struct semaphore obd_conf_sem; /* serialize configuration commands */
struct obd_device obd_dev[MAX_OBD_DEVICES];
struct list_head obd_types;
char *buf;
struct lustre_cfg *lcfg;
- /* FIXME hack to liblustre dump, remove when switch
- to zeroconf */
-#ifndef __KERNEL__
- data->ioc_pbuf1 = data->ioc_inlbuf1;
- data->ioc_plen1 = data->ioc_inllen1;
-#endif
if (!data->ioc_plen1 || !data->ioc_pbuf1) {
CERROR("No config buffer passed!\n");
GOTO(out, err = -EINVAL);
if (exp->exp_connection)
ptlrpc_put_connection_superhack(exp->exp_connection);
+ LASSERT(list_empty(&exp->exp_outstanding_replies));
LASSERT(list_empty(&exp->exp_handle.h_link));
obd_destroy_export(exp);
export->exp_conn_cnt = 0;
atomic_set(&export->exp_refcount, 2);
export->exp_obd = obd;
+ INIT_LIST_HEAD(&export->exp_outstanding_replies);
/* XXX this should be in LDLM init */
INIT_LIST_HEAD(&export->exp_ldlm_data.led_held_locks);
return 0;
}
-static int llog_lvfs_create(struct llog_obd_ctxt *ctxt,struct llog_handle **res,
+static int llog_lvfs_create(struct llog_ctxt *ctxt, struct llog_handle **res,
struct llog_logid *logid, char *name)
{
LBUG();
rm_entry = temp;
temp = temp->parent;
+
+ /* Memory corruption once caused this to fail, and
+ without this LASSERT we would loop here forever. */
+ LASSERTF(strlen(rm_entry->name) == rm_entry->namelen,
+ "0x%p %s/%s len %d\n", rm_entry,
+ temp->name, rm_entry->name, strlen(rm_entry->name));
+
remove_proc_entry(rm_entry->name, rm_entry->parent);
if (temp == parent)
break;
INIT_LIST_HEAD(&obd->obd_recovery_queue);
INIT_LIST_HEAD(&obd->obd_delayed_reply_queue);
- init_waitqueue_head(&obd->obd_commit_waitq);
+ spin_lock_init (&obd->obd_uncommitted_replies_lock);
+ INIT_LIST_HEAD (&obd->obd_uncommitted_replies);
len = strlen(name) + 1;
OBD_ALLOC(obd->obd_name, len);
struct obd_export *exp, *n;
list_for_each_entry_safe(exp, n, &obd->obd_exports, exp_obd_chain) {
- CERROR("%s: %p %s %d %d %p\n",
+ struct ptlrpc_reply_state *rs;
+ struct ptlrpc_reply_state *first_reply = NULL;
+ int nreplies = 0;
+
+ list_for_each_entry (rs, &exp->exp_outstanding_replies,
+ rs_exp_list) {
+ if (nreplies == 0)
+ first_reply = rs;
+ nreplies++;
+ }
+
+ CERROR("%s: %p %s %d %d %d: %p %s\n",
obd->obd_name, exp, exp->exp_client_uuid.uuid,
atomic_read(&exp->exp_refcount),
- exp->exp_failed, exp->exp_outstanding_reply );
+ exp->exp_failed, nreplies, first_reply,
+ nreplies > 3 ? "..." : "");
}
}
DEFS=
if LIBLUSTRE
-lib_LIBRARIES = libobdecho.a
+noinst_LIBRARIES = libobdecho.a
libobdecho_a_SOURCES = echo_client.c
+libobdecho_a_CFLAGS = -fPIC
else
MODULE = obdecho
modulefs_DATA = obdecho.o
struct page *page = r->page;
void *addr;
- if (!page || !(addr = kmap(page)) ||
- !kern_addr_valid((unsigned long)addr)) {
-
- CERROR("bad page objid "LPU64":%p, buf %d/%d\n",
+ if (page == NULL) {
+ CERROR("null page objid "LPU64":%p, buf %d/%d\n",
obj->ioo_id, page, j, obj->ioo_bufcnt);
- kunmap(page);
GOTO(commitrw_cleanup, rc = -EFAULT);
}
+ addr = kmap(page);
+
CDEBUG(D_PAGE, "$$$$ use page %p, addr %p@"LPU64"\n",
r->page, addr, r->offset);
*offp = offset * stripe_size + woffset % stripe_size;
}
+static void echo_page_debug_setup(struct lov_stripe_md *lsm,
+ struct page *page, int rw, obd_id id,
+ obd_off offset, obd_off count)
+{
+ void *addr;
+ obd_off stripe_off;
+ obd_id stripe_id;
+
+ if (id == 0)
+ return;
+
+ addr = kmap(page);
+
+ if (rw == OBD_BRW_WRITE) {
+ stripe_off = offset;
+ stripe_id = id;
+ echo_get_stripe_off_id(lsm, &stripe_off, &stripe_id);
+ } else {
+ stripe_off = 0xdeadbeef00c0ffeeULL;
+ stripe_id = 0xdeadbeef00c0ffeeULL;
+ }
+ page_debug_setup(addr, count, stripe_off, stripe_id);
+
+ kunmap(page);
+}
+
+static int echo_page_debug_check(struct lov_stripe_md *lsm,
+ struct page *page, obd_id id,
+ obd_off offset, obd_off count)
+{
+ obd_off stripe_off = offset;
+ obd_id stripe_id = id;
+ void *addr;
+ int rc;
+
+ if (id == 0)
+ return 0;
+
+ addr = kmap(page);
+ echo_get_stripe_off_id (lsm, &stripe_off, &stripe_id);
+ rc = page_debug_check("test_brw", addr, count, stripe_off, stripe_id);
+ kunmap(page);
+ return rc;
+}
+
static int echo_client_kbrw(struct obd_device *obd, int rw, struct obdo *oa,
struct lov_stripe_md *lsm, obd_off offset,
obd_size count, struct obd_trans_info *oti)
obd_off off;
int i;
int rc;
- int verify;
+ int verify = 0;
int gfp_mask;
/* oa_id == 0 => speed test (no verification) else...
* oa & 1 => use HIGHMEM
*/
- verify = (oa->o_id != 0);
gfp_mask = ((oa->o_id & 1) == 0) ? GFP_KERNEL : GFP_HIGHUSER;
LASSERT(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ);
pgp->off = off;
pgp->flag = 0;
- if (verify) {
- void *addr = kmap(pgp->pg);
- obd_off stripe_off = off;
- obd_id stripe_id = oa->o_id;
-
- if (rw == OBD_BRW_WRITE) {
- echo_get_stripe_off_id(lsm, &stripe_off,
- &stripe_id);
- page_debug_setup(addr, pgp->count,
- stripe_off, stripe_id);
- } else {
- page_debug_setup(addr, pgp->count,
- 0xdeadbeef00c0ffeeULL,
- 0xdeadbeef00c0ffeeULL);
- }
- kunmap(pgp->pg);
- }
+ echo_page_debug_setup(lsm, pgp->pg, rw, oa->o_id, off,
+ pgp->count);
}
rc = obd_brw(rw, ec->ec_exp, oa, lsm, npages, pga, oti);
out:
- if (rc != 0)
- verify = 0;
+ if (rc == 0 && rw == OBD_BRW_READ)
+ verify = 1;
for (i = 0, pgp = pga; i < npages; i++, pgp++) {
if (pgp->pg == NULL)
continue;
if (verify) {
- void *addr = kmap(pgp->pg);
- obd_off stripe_off = pgp->off;
- obd_id stripe_id = oa->o_id;
- int vrc;
-
- echo_get_stripe_off_id (lsm, &stripe_off, &stripe_id);
- vrc = page_debug_check("test_brw", addr, pgp->count,
- stripe_off, stripe_id);
+ int vrc;
+ vrc = echo_page_debug_check(lsm, pgp->pg, oa->o_id,
+ pgp->off, pgp->count);
if (vrc != 0 && rc == 0)
rc = vrc;
-
- kunmap(pgp->pg);
}
__free_pages(pgp->pg, 0);
}
wait_queue_head_t eas_waitq;
struct list_head eas_avail;
struct obdo eas_oa;
+ struct lov_stripe_md *eas_lsm;
};
static int eas_should_wake(struct echo_async_state *eas)
return;
eas = eap->eap_eas;
+ if (cmd == OBD_BRW_READ)
+ echo_page_debug_check(eas->eas_lsm, eap->eap_page,
+ eas->eas_oa.o_id, eap->eap_off,
+ PAGE_SIZE);
+
spin_lock_irqsave(&eas->eas_lock, flags);
if (rc && !eas->eas_rc)
eas->eas_rc = rc;
init_waitqueue_head(&eas.eas_waitq);
eas.eas_in_flight = 0;
eas.eas_rc = 0;
+ eas.eas_lsm = lsm;
INIT_LIST_HEAD(&eas.eas_avail);
/* prepare the group of pages that we're going to be keeping
if (page == NULL)
GOTO(out, rc = -ENOMEM);
+ page->private = 0;
list_add_tail(&page->list, &pages);
OBD_ALLOC(eap, sizeof(*eap));
eap->eap_magic = EAP_MAGIC;
eap->eap_page = page;
eap->eap_eas = &eas;
- eap->eap_cookie = ERR_PTR(-ENOENT);
+ page->private = (unsigned long)eap;
list_add_tail(&eap->eap_item, &eas.eas_avail);
}
spin_unlock_irqrestore(&eas.eas_lock, flags);
/* unbind the eap from its old page offset */
- if (!IS_ERR(eap->eap_cookie)) {
+ if (eap->eap_cookie != NULL) {
obd_teardown_async_page(exp, lsm, NULL,
eap->eap_cookie);
- eap->eap_cookie = ERR_PTR(-ENOENT);
+ eap->eap_cookie = NULL;
}
eas.eas_next_offset += PAGE_SIZE;
break;
}
+ if (rw == OBD_BRW_WRITE)
+ echo_page_debug_setup(lsm, eap->eap_page, rw, oa->o_id,
+ eap->eap_off, PAGE_SIZE);
+
/* always asserts urgent, which isn't quite right */
rc = obd_queue_async_io(exp, lsm, NULL, eap->eap_cookie,
rw, 0, PAGE_SIZE, 0,
struct page *page = list_entry(pos, struct page, list);
list_del(&page->list);
- if (page->private) {
+ if (page->private != 0) {
eap = (struct echo_async_page *)page->private;
- if (!IS_ERR(eap->eap_cookie))
+ if (eap->eap_cookie != NULL)
obd_teardown_async_page(exp, lsm, NULL,
eap->eap_cookie);
OBD_FREE(eap, sizeof(*eap));
for (i = 0; i < npages; i++) {
struct page *page = lnb[i].page;
- void *addr;
/* read past eof? */
if (page == NULL && lnb[i].rc == 0)
continue;
- addr = kmap(lnb[i].page);
-
if (rw == OBD_BRW_WRITE)
- page_debug_setup(addr, PAGE_SIZE,
- rnb[i].offset, oa->o_id);
- else
- err = page_debug_check("prep_commit", addr,
- PAGE_SIZE, rnb[i].offset,
- oa->o_id);
-
- kunmap(lnb[i].page);
+ echo_page_debug_setup(lsm, page, rw, oa->o_id,
+ rnb[i].offset,
+ rnb[i].len);
+ else
+ echo_page_debug_check(lsm, page, oa->o_id,
+ rnb[i].offset,
+ rnb[i].len);
}
ret = obd_commitrw(rw, exp, oa, 1, &ioo, npages, lnb, oti);
DEFS=
if LIBLUSTRE
-lib_LIBRARIES = libosc.a
+noinst_LIBRARIES = libosc.a
libosc_a_SOURCES = osc_request.c osc_lib.c osc_create.c osc_internal.h
+libosc_a_CFLAGS = -fPIC
else
MODULE = osc
modulefs_DATA = osc.o
#ifdef __KERNEL__
int lproc_osc_attach_seqstat(struct obd_device *dev);
#else
-static inline int lproc_osc_attach_seqstat(struct obd_device *dev) {}
+static inline int lproc_osc_attach_seqstat(struct obd_device *dev) {return 0;}
#endif
#endif /* OSC_INTERNAL_H */
{
obd_flag bits = OBD_MD_FLBLOCKS|OBD_MD_FLGRANT;
- LASSERT(!(oa->o_valid & bits));
+ /* XXX obd_brw_internal() might reuse the obdo in its loop and thus
+ * hit the following assert. Is there any actual meaning to this check?
+ * Temporarily disable it.
+ * In kernel mode, the VFS will probably prevent this from happening.
+ */
+ //LASSERT(!(oa->o_valid & bits));
oa->o_valid |= bits;
spin_lock(&cli->cl_loi_list_lock);
}
}
-static int check_write_rcs(struct ptlrpc_request *request, int niocount,
+static int check_write_rcs(struct ptlrpc_request *request,
+ int requested_nob, int niocount,
obd_count page_count, struct brw_page *pga)
{
- int i;
- int *remote_rcs;
+ int *remote_rcs, i;
/* return error if any niobuf was in error */
remote_rcs = lustre_swab_repbuf(request, 1,
sizeof(*remote_rcs) * niocount, NULL);
if (remote_rcs == NULL) {
- CERROR ("Missing/short RC vector on BRW_WRITE reply\n");
- return (-EPROTO);
+ CERROR("Missing/short RC vector on BRW_WRITE reply\n");
+ return(-EPROTO);
}
- if (lustre_msg_swabbed (request->rq_repmsg))
+ if (lustre_msg_swabbed(request->rq_repmsg))
for (i = 0; i < niocount; i++)
- __swab32s (&remote_rcs[i]);
+ __swab32s(&remote_rcs[i]);
for (i = 0; i < niocount; i++) {
if (remote_rcs[i] < 0)
- return (remote_rcs[i]);
+ return(remote_rcs[i]);
if (remote_rcs[i] != 0) {
- CERROR ("rc[%d] invalid (%d) req %p\n",
+ CERROR("rc[%d] invalid (%d) req %p\n",
i, remote_rcs[i], request);
- return (-EPROTO);
+ return(-EPROTO);
}
}
+ if (request->rq_bulk->bd_nob_transferred != requested_nob) {
+ CERROR("Unexpected # bytes transferred: %d (requested %d)\n",
+ requested_nob, request->rq_bulk->bd_nob_transferred);
+ return(-EPROTO);
+ }
+
return (0);
}
return (-ENOMEM);
if (opc == OST_WRITE)
- desc = ptlrpc_prep_bulk_imp(req, BULK_GET_SOURCE,
- OST_BULK_PORTAL);
+ desc = ptlrpc_prep_bulk_imp (req, page_count,
+ BULK_GET_SOURCE, OST_BULK_PORTAL);
else
- desc = ptlrpc_prep_bulk_imp(req, BULK_PUT_SINK,
- OST_BULK_PORTAL);
+ desc = ptlrpc_prep_bulk_imp (req, page_count,
+ BULK_PUT_SINK, OST_BULK_PORTAL);
if (desc == NULL)
GOTO(out, rc = -ENOMEM);
/* NB request now owns desc and will free it when it gets freed */
pg_prev->pg, pg_prev->pg->private, pg_prev->pg->index,
pg_prev->off);
- rc = ptlrpc_prep_bulk_page(desc, pg->pg, pg->off & ~PAGE_MASK,
- pg->count);
- if (rc != 0)
- GOTO(out, rc);
-
+ ptlrpc_prep_bulk_page(desc, pg->pg, pg->off & ~PAGE_MASK,
+ pg->count);
requested_nob += pg->count;
if (i > 0 && can_merge_pages(pg_prev, pg)) {
CERROR ("Unexpected +ve rc %d\n", rc);
RETURN(-EPROTO);
}
+ LASSERT (req->rq_bulk->bd_nob == requested_nob);
- RETURN(check_write_rcs(req, niocount, page_count, pga));
+ RETURN(check_write_rcs(req, requested_nob, niocount,
+ page_count, pga));
}
if (rc > requested_nob) {
RETURN(-EPROTO);
}
+ if (rc != req->rq_bulk->bd_nob_transferred) {
+ CERROR ("Unexpected rc %d (%d transferred)\n",
+ rc, req->rq_bulk->bd_nob_transferred);
+ return (-EPROTO);
+ }
+
if (rc < requested_nob)
handle_short_read(rc, page_count, pga);
oap = list_entry(pos, struct osc_async_page, oap_pending_item);
ops = oap->oap_caller_ops;
+ LASSERT(oap->oap_magic == OAP_MAGIC);
+
/* in llite being 'ready' equates to the page being locked
* until completion unlocks it. commit_write submits a page
* as not ready because its unlock will happen unconditionally
list_splice(&rpc_list, &aa->aa_oaps);
INIT_LIST_HEAD(&rpc_list);
+#ifdef __KERNEL__
if (cmd == OBD_BRW_READ) {
lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count);
lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_brw_in_flight);
lprocfs_oh_tally(&cli->cl_write_rpc_hist,
cli->cl_brw_in_flight);
}
+#endif
spin_lock(&cli->cl_loi_list_lock);
for (ack_lock = oti->oti_ack_locks, i = 0; i < 4; i++, ack_lock++) {
if (!ack_lock->mode)
break;
- ldlm_put_lock_into_req(req, &ack_lock->lock, ack_lock->mode);
+ /* XXX not even calling target_send_reply in some cases... */
+ ptlrpc_save_lock (req, &ack_lock->lock, ack_lock->mode);
}
}
if (local_nb == NULL)
GOTO(out_pp_rnb, rc = -ENOMEM);
- desc = ptlrpc_prep_bulk_exp(req, BULK_PUT_SOURCE, OST_BULK_PORTAL);
+ desc = ptlrpc_prep_bulk_exp (req, npages,
+ BULK_PUT_SOURCE, OST_BULK_PORTAL);
if (desc == NULL)
GOTO(out_local, rc = -ENOMEM);
nob += page_rc;
if (page_rc != 0) { /* some data! */
LASSERT (local_nb[i].page != NULL);
- rc = ptlrpc_prep_bulk_page(desc, local_nb[i].page,
- pp_rnb[i].offset& ~PAGE_MASK,
- page_rc);
- if (rc != 0)
- break;
+ ptlrpc_prep_bulk_page(desc, local_nb[i].page,
+ pp_rnb[i].offset & (PAGE_SIZE - 1),
+ page_rc);
}
if (page_rc != pp_rnb[i].len) { /* short read */
}
if (rc == 0) {
- rc = ptlrpc_bulk_put(desc);
+ rc = ptlrpc_start_bulk_transfer(desc);
if (rc == 0) {
lwi = LWI_TIMEOUT(obd_timeout * HZ / 4,
ost_bulk_timeout, desc);
rc = l_wait_event(desc->bd_waitq,
- ptlrpc_bulk_complete(desc), &lwi);
- if (rc) {
- LASSERT(rc == -ETIMEDOUT);
+ !ptlrpc_bulk_active(desc), &lwi);
+ LASSERT(rc == 0 || rc == -ETIMEDOUT);
+ if (rc == -ETIMEDOUT) {
DEBUG_REQ(D_ERROR, req, "timeout on bulk PUT");
ptlrpc_abort_bulk(desc);
+ } else if (!desc->bd_success ||
+ desc->bd_nob_transferred != desc->bd_nob) {
+ DEBUG_REQ(D_ERROR, req, "%s bulk PUT %d(%d)",
+ desc->bd_success ?
+ "truncated" : "network error on",
+ desc->bd_nob_transferred,
+ desc->bd_nob);
+ /* XXX should this be a different errno? */
+ rc = -ETIMEDOUT;
}
} else {
DEBUG_REQ(D_ERROR, req, "bulk PUT failed: rc %d\n", rc);
req->rq_status = rc;
ptlrpc_error(req);
} else {
- if (req->rq_repmsg != NULL) {
+ if (req->rq_reply_state != NULL) {
/* reply out callback would free */
- OBD_FREE(req->rq_repmsg, req->rq_replen);
+ lustre_free_reply_state (req->rq_reply_state);
}
if (req->rq_reqmsg->conn_cnt == req->rq_export->exp_conn_cnt) {
CERROR("bulk IO comms error: "
int objcount, niocount, npages;
int comms_error = 0;
int rc, rc2, swab, i, j;
- char str[PTL_NALFMT_SIZE];
+ char str[PTL_NALFMT_SIZE];
ENTRY;
if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_WRITE_BULK))
if (local_nb == NULL)
GOTO(out_pp_rnb, rc = -ENOMEM);
- desc = ptlrpc_prep_bulk_exp(req, BULK_GET_SINK, OST_BULK_PORTAL);
+ desc = ptlrpc_prep_bulk_exp (req, npages,
+ BULK_GET_SINK, OST_BULK_PORTAL);
if (desc == NULL)
GOTO(out_local, rc = -ENOMEM);
/* NB Having prepped, we must commit... */
- for (i = 0; i < npages; i++) {
- rc = ptlrpc_prep_bulk_page(desc, local_nb[i].page,
- pp_rnb[i].offset & (PAGE_SIZE - 1),
- pp_rnb[i].len);
- if (rc != 0)
- break;
- }
+ for (i = 0; i < npages; i++)
+ ptlrpc_prep_bulk_page(desc, local_nb[i].page,
+ pp_rnb[i].offset & (PAGE_SIZE - 1),
+ pp_rnb[i].len);
+ rc = ptlrpc_start_bulk_transfer (desc);
if (rc == 0) {
- rc = ptlrpc_bulk_get(desc);
- if (rc == 0) {
- lwi = LWI_TIMEOUT(obd_timeout * HZ / 4,
- ost_bulk_timeout, desc);
- rc = l_wait_event(desc->bd_waitq,
- ptlrpc_bulk_complete(desc), &lwi);
- if (rc) {
- LASSERT(rc == -ETIMEDOUT);
- DEBUG_REQ(D_ERROR, req, "timeout on bulk GET");
- ptlrpc_abort_bulk(desc);
- }
- } else {
- DEBUG_REQ(D_ERROR, req, "bulk GET failed: rc %d\n", rc);
+ lwi = LWI_TIMEOUT(obd_timeout * HZ / 4,
+ ost_bulk_timeout, desc);
+ rc = l_wait_event(desc->bd_waitq, !ptlrpc_bulk_active(desc),
+ &lwi);
+ LASSERT(rc == 0 || rc == -ETIMEDOUT);
+ if (rc == -ETIMEDOUT) {
+ DEBUG_REQ(D_ERROR, req, "timeout on bulk GET");
+ ptlrpc_abort_bulk(desc);
+ } else if (!desc->bd_success ||
+ desc->bd_nob_transferred != desc->bd_nob) {
+ DEBUG_REQ(D_ERROR, req, "%s bulk GET %d(%d)",
+ desc->bd_success ?
+ "truncated" : "network error on",
+ desc->bd_nob_transferred, desc->bd_nob);
+ /* XXX should this be a different errno? */
+ rc = -ETIMEDOUT;
}
- comms_error = rc != 0;
+ } else {
+ DEBUG_REQ(D_ERROR, req, "ptlrpc_bulk_get failed: rc %d\n", rc);
}
+ comms_error = rc != 0;
repbody = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*repbody));
memcpy(&repbody->oa, &body->oa, sizeof(repbody->oa));
req->rq_status = rc;
ptlrpc_error(req);
} else {
- if (req->rq_repmsg != NULL) {
+ if (req->rq_reply_state != NULL) {
/* reply out callback would free */
- OBD_FREE (req->rq_repmsg, req->rq_replen);
+ lustre_free_reply_state (req->rq_reply_state);
}
if (req->rq_reqmsg->conn_cnt == req->rq_export->exp_conn_cnt) {
CERROR("bulk IO comms error: "
free_per_page_niobufs(npages, pp_rnb, remote_nb);
out:
if (rc) {
- OBD_FREE(req->rq_repmsg, req->rq_replen);
- req->rq_repmsg = NULL;
req->rq_status = rc;
ptlrpc_error(req);
} else
if (rc < 0)
RETURN(rc);
- ost->ost_service = ptlrpc_init_svc(OST_NEVENTS, OST_NBUFS,
- OST_BUFSIZE, OST_MAXREQSIZE,
- OST_REQUEST_PORTAL, OSC_REPLY_PORTAL,
- ost_handle, "ost",
- obddev->obd_proc_entry);
+ ost->ost_service =
+ ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE,
+ OST_REQUEST_PORTAL, OSC_REPLY_PORTAL,
+ ost_handle, "ost",
+ obddev->obd_proc_entry);
if (ost->ost_service == NULL) {
CERROR("failed to start service\n");
RETURN(-ENOMEM);
GOTO(out, rc = -EINVAL);
ost->ost_create_service =
- ptlrpc_init_svc(OST_NEVENTS, OST_NBUFS, OST_BUFSIZE,
- OST_MAXREQSIZE, OST_CREATE_PORTAL,
- OSC_REPLY_PORTAL, ost_handle, "ost_create",
+ ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE,
+ OST_CREATE_PORTAL, OSC_REPLY_PORTAL,
+ ost_handle, "ost_create",
obddev->obd_proc_entry);
if (ost->ost_create_service == NULL) {
CERROR("failed to start OST create service\n");
CFLAGS="$KCFLAGS"
CPPFLAGS="$KINCFLAGS $KCPPFLAGS $MFLAGS $enable_zerocopy $enable_affinity $with_quadrics $with_gm $with_scamac $with_ib"
+AM_CONDITIONAL(LIBLUSTRE, test x$host_cpu = xlib)
AC_SUBST(MOD_LINK)
AC_SUBST(LINUX25)
AM_CONDITIONAL(LIBLUSTRE, test x$host_cpu = xlib)
PTL_IOV_TOO_SMALL = 31,
PTL_EQ_INUSE = 32,
- PTL_MD_INUSE = 33,
- PTL_MAX_ERRNO = 33
+ PTL_MAX_ERRNO = 32
} ptl_err_t;
/* If you change these, you must update the string table in api-errno.c */
lib_ni_t ni;
void *nal_data;
/*
- * send: Sends a preformatted header and user data to a
- * specified remote process.
- * Can overwrite iov.
+ * send: Sends a preformatted header and payload data to a
+ * specified remote process. The payload is scattered over 'niov'
+ * fragments described by iov, starting at 'offset' for 'mlen'
+ * bytes.
+ * NB the NAL may NOT overwrite iov.
+ * PTL_OK on success => NAL has committed to send and will call
+ * lib_finalize on completion
*/
- int (*cb_send) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
- ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
- unsigned int niov, struct iovec *iov, size_t mlen);
+ ptl_err_t (*cb_send) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
+ ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+ unsigned int niov, struct iovec *iov,
+ size_t offset, size_t mlen);
/* as send, but with a set of page fragments (NULL if not supported) */
- int (*cb_send_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
- ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
- unsigned int niov, ptl_kiov_t *iov, size_t mlen);
+ ptl_err_t (*cb_send_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
+ ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+ unsigned int niov, ptl_kiov_t *iov,
+ size_t offset, size_t mlen);
/*
- * recv: Receives an incoming message from a remote process
- * Type of iov depends on options. Can overwrite iov.
+ * recv: Receives an incoming message from a remote process. The
+ * payload is to be received into the scattered buffer of 'niov'
+ * fragments described by iov, starting at 'offset' for 'mlen'
+ * bytes. Payload bytes after 'mlen' up to 'rlen' are to be
+ * discarded.
+ * NB the NAL may NOT overwrite iov.
+ * PTL_OK on success => NAL has committed to receive and will call
+ * lib_finalize on completion
*/
- int (*cb_recv) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
- unsigned int niov, struct iovec *iov, size_t mlen,
- size_t rlen);
+ ptl_err_t (*cb_recv) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
+ unsigned int niov, struct iovec *iov,
+ size_t offset, size_t mlen, size_t rlen);
/* as recv, but with a set of page fragments (NULL if not supported) */
- int (*cb_recv_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
- unsigned int niov, ptl_kiov_t *iov, size_t mlen,
- size_t rlen);
+ ptl_err_t (*cb_recv_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
+ unsigned int niov, ptl_kiov_t *iov,
+ size_t offset, size_t mlen, size_t rlen);
/*
* read: Reads a block of data from a specified user address
*/
- int (*cb_read) (nal_cb_t * nal, void *private, void *dst_addr,
- user_ptr src_addr, size_t len);
+ ptl_err_t (*cb_read) (nal_cb_t * nal, void *private, void *dst_addr,
+ user_ptr src_addr, size_t len);
/*
* write: Writes a block of data into a specified user address
*/
- int (*cb_write) (nal_cb_t * nal, void *private, user_ptr dsr_addr,
- void *src_addr, size_t len);
+ ptl_err_t (*cb_write) (nal_cb_t * nal, void *private, user_ptr dsr_addr,
+ void *src_addr, size_t len);
/*
* callback: Calls an event callback
+ * NULL => lib calls eq's callback (if any) directly.
*/
- int (*cb_callback) (nal_cb_t * nal, void *private, lib_eq_t *eq,
- ptl_event_t *ev);
+ void (*cb_callback) (nal_cb_t * nal, void *private, lib_eq_t *eq,
+ ptl_event_t *ev);
/*
* malloc: Acquire a block of memory in a system independent
* type of *iov depends on options.
* Set to NULL if not required.
*/
- int (*cb_map) (nal_cb_t * nal, unsigned int niov, struct iovec *iov,
- void **addrkey);
+ ptl_err_t (*cb_map) (nal_cb_t * nal, unsigned int niov, struct iovec *iov,
+ void **addrkey);
void (*cb_unmap) (nal_cb_t * nal, unsigned int niov, struct iovec *iov,
void **addrkey);
/* as (un)map, but with a set of page fragments */
- int (*cb_map_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov,
- void **addrkey);
+ ptl_err_t (*cb_map_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov,
+ void **addrkey);
void (*cb_unmap_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov,
void **addrkey);
#include <portals/types.h>
#include <linux/kp30.h>
#include <portals/p30.h>
-#include <portals/errno.h>
#include <portals/lib-types.h>
#include <portals/lib-nal.h>
#include <portals/lib-dispatch.h>
nal->cb_sti(nal, flagsp); \
}
-#ifdef PTL_USE_DESC_LISTS
+#ifdef PTL_USE_LIB_FREELIST
#define MAX_MES 2048
#define MAX_MDS 2048
}
static inline lib_md_t *
-lib_md_alloc (nal_cb_t *nal)
+lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd)
{
/* NEVER called with statelock held */
unsigned long flags;
static inline lib_msg_t *
lib_msg_alloc (nal_cb_t *nal)
{
- /* ALWAYS called with statelock held */
- return ((lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs));
+ /* NEVER called with statelock held */
+ unsigned long flags;
+ lib_msg_t *msg;
+
+ state_lock (nal, &flags);
+ msg = (lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs);
+ state_unlock (nal, &flags);
+
+ if (msg != NULL) {
+ /* NULL pointers, clear flags etc */
+ memset (msg, 0, sizeof (*msg));
+ msg->ack_wmd = PTL_WIRE_HANDLE_NONE;
+ }
+ return(msg);
}
static inline void
#else
-extern atomic_t md_in_use_count;
-extern atomic_t msg_in_use_count;
-extern atomic_t me_in_use_count;
-extern atomic_t eq_in_use_count;
-
static inline lib_eq_t *
lib_eq_alloc (nal_cb_t *nal)
{
/* NEVER called with statelock held */
lib_eq_t *eq;
- PORTAL_ALLOC(eq, sizeof(*eq));
-
- if (eq == NULL)
- return (NULL);
- atomic_inc (&eq_in_use_count);
+ PORTAL_ALLOC(eq, sizeof(*eq));
return (eq);
}
lib_eq_free (nal_cb_t *nal, lib_eq_t *eq)
{
/* ALWAYS called with statelock held */
- atomic_dec (&eq_in_use_count);
PORTAL_FREE(eq, sizeof(*eq));
}
static inline lib_md_t *
-lib_md_alloc (nal_cb_t *nal)
+lib_md_alloc (nal_cb_t *nal, ptl_md_t *umd)
{
/* NEVER called with statelock held */
lib_md_t *md;
- PORTAL_ALLOC(md, sizeof(*md));
-
- if (md == NULL)
- return (NULL);
-
- atomic_inc (&md_in_use_count);
+ int size;
+ int niov;
+
+ if ((umd->options & PTL_MD_KIOV) != 0) {
+ niov = umd->niov;
+ size = offsetof(lib_md_t, md_iov.kiov[niov]);
+ } else {
+ niov = ((umd->options & PTL_MD_IOV) != 0) ?
+ umd->niov : 1;
+ size = offsetof(lib_md_t, md_iov.iov[niov]);
+ }
+
+ PORTAL_ALLOC(md, size);
+
+ if (md != NULL) {
+ /* Set here in case of early free */
+ md->options = umd->options;
+ md->md_niov = niov;
+ }
+
return (md);
}
lib_md_free (nal_cb_t *nal, lib_md_t *md)
{
/* ALWAYS called with statelock held */
- atomic_dec (&md_in_use_count);
- PORTAL_FREE(md, sizeof(*md));
+ int size;
+
+ if ((md->options & PTL_MD_KIOV) != 0)
+ size = offsetof(lib_md_t, md_iov.kiov[md->md_niov]);
+ else
+ size = offsetof(lib_md_t, md_iov.iov[md->md_niov]);
+
+ PORTAL_FREE(md, size);
}
static inline lib_me_t *
{
/* NEVER called with statelock held */
lib_me_t *me;
- PORTAL_ALLOC(me, sizeof(*me));
-
- if (me == NULL)
- return (NULL);
- atomic_inc (&me_in_use_count);
+ PORTAL_ALLOC(me, sizeof(*me));
return (me);
}
lib_me_free(nal_cb_t *nal, lib_me_t *me)
{
/* ALWAYS called with statelock held */
- atomic_dec (&me_in_use_count);
PORTAL_FREE(me, sizeof(*me));
}
static inline lib_msg_t *
lib_msg_alloc(nal_cb_t *nal)
{
- /* ALWAYS called with statelock held */
+ /* NEVER called with statelock held */
lib_msg_t *msg;
- PORTAL_ALLOC_ATOMIC(msg, sizeof(*msg));
- if (msg == NULL)
- return (NULL);
-
- atomic_inc (&msg_in_use_count);
+ PORTAL_ALLOC(msg, sizeof(*msg));
+ if (msg != NULL) {
+ /* NULL pointers, clear flags etc */
+ memset (msg, 0, sizeof (*msg));
+ msg->ack_wmd = PTL_WIRE_HANDLE_NONE;
+ }
return (msg);
}
lib_msg_free(nal_cb_t *nal, lib_msg_t *msg)
{
/* ALWAYS called with statelock held */
- atomic_dec (&msg_in_use_count);
PORTAL_FREE(msg, sizeof(*msg));
}
#endif
* Call backs will be made to write events, send acks or
* replies and so on.
*/
-extern int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private);
-extern int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t * msg);
+extern void lib_enq_event_locked (nal_cb_t *nal, void *private,
+ lib_eq_t *eq, ptl_event_t *ev);
+extern void lib_finalize (nal_cb_t *nal, void *private, lib_msg_t *msg,
+ ptl_err_t status);
+extern void lib_parse (nal_cb_t *nal, ptl_hdr_t *hdr, void *private);
extern lib_msg_t *lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid,
lib_md_t *getmd);
-extern void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr);
+extern void print_hdr (nal_cb_t * nal, ptl_hdr_t * hdr);
+
extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov);
-extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len);
-extern void lib_copy_buf2iov (int niov, struct iovec *iov, char *dest, ptl_size_t len);
+extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov,
+ ptl_size_t offset, ptl_size_t len);
+extern void lib_copy_buf2iov (int niov, struct iovec *iov, ptl_size_t offset,
+ char *src, ptl_size_t len);
+extern int lib_extract_iov (int dst_niov, struct iovec *dst,
+ int src_niov, struct iovec *src,
+ ptl_size_t offset, ptl_size_t len);
extern ptl_size_t lib_kiov_nob (int niov, ptl_kiov_t *iov);
-extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *iov, ptl_size_t len);
-extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *iov, char *src, ptl_size_t len);
+extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov,
+ ptl_size_t offset, ptl_size_t len);
+extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset,
+ char *src, ptl_size_t len);
+extern int lib_extract_kiov (int dst_niov, ptl_kiov_t *dst,
+ int src_niov, ptl_kiov_t *src,
+ ptl_size_t offset, ptl_size_t len);
+
extern void lib_assert_wire_constants (void);
-extern void lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
- ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen);
-extern int lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg,
- ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
- lib_md_t *md, ptl_size_t offset, ptl_size_t len);
+extern ptl_err_t lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
+ ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen);
+extern ptl_err_t lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg,
+ ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+ lib_md_t *md, ptl_size_t offset, ptl_size_t len);
extern void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md_in,
ptl_md_t * md_out);
# include <linux/smp_lock.h>
# include <linux/types.h>
#else
-# define PTL_USE_DESC_LISTS
+# define PTL_USE_LIB_FREELIST
# include <sys/types.h>
#endif
struct lib_msg_t {
struct list_head msg_list;
- int send_ack;
lib_md_t *md;
- ptl_nid_t nid;
- ptl_pid_t pid;
- ptl_event_t ev;
ptl_handle_wire_t ack_wmd;
- union {
- struct iovec iov[PTL_MD_MAX_IOV];
- ptl_kiov_t kiov[PTL_MD_MAX_IOV];
- } msg_iov;
+ ptl_event_t ev;
};
struct lib_ptl_t {
};
#define PTL_MD_FLAG_UNLINK (1 << 0)
-#define PTL_MD_FLAG_AUTO_UNLINKED (1 << 1)
-#ifdef PTL_USE_DESC_LISTS
+#ifdef PTL_USE_LIB_FREELIST
typedef struct
{
void *fl_objs; /* single contiguous array of objects */
struct list_head ni_test_peers;
-#ifdef PTL_USE_DESC_LISTS
+#ifdef PTL_USE_LIB_FREELIST
lib_freelist_t ni_free_mes;
lib_freelist_t ni_free_msgs;
lib_freelist_t ni_free_mds;
#include <portals/types.h>
#include <portals/nal.h>
#include <portals/api.h>
-#include <portals/errno.h>
#include <portals/nalids.h>
extern int __p30_initialized; /* for libraries & test codes */
# define do_gettimeofday(tv) gettimeofday(tv, NULL)
#endif
+#include <portals/errno.h>
+
typedef __u64 ptl_nid_t;
typedef __u32 ptl_pid_t;
typedef __u32 ptl_pt_index_t;
PTL_EVENT_PUT,
PTL_EVENT_REPLY,
PTL_EVENT_ACK,
- PTL_EVENT_SENT
+ PTL_EVENT_SENT,
+ PTL_EVENT_UNLINK,
} ptl_event_kind_t;
#define PTL_SEQ_BASETYPE long
#pragma pack(push, 4)
#endif
typedef struct {
- ptl_event_kind_t type;
- ptl_process_id_t initiator;
- ptl_pt_index_t portal;
- ptl_match_bits_t match_bits;
- ptl_size_t rlength, mlength, offset;
- ptl_handle_me_t unlinked_me;
- ptl_md_t mem_desc;
- ptl_hdr_data_t hdr_data;
- struct timeval arrival_time;
+ ptl_event_kind_t type;
+ ptl_err_t status;
+ int unlinked;
+ ptl_process_id_t initiator;
+ ptl_pt_index_t portal;
+ ptl_match_bits_t match_bits;
+ ptl_size_t rlength;
+ ptl_size_t mlength;
+ ptl_size_t offset;
+ ptl_md_t mem_desc;
+ ptl_hdr_data_t hdr_data;
+ struct timeval arrival_time;
+
volatile ptl_seq_t sequence;
} ptl_event_t;
#ifdef __CYGWIN__
int gmnal_cb_write(nal_cb_t *, void *private, user_ptr, void *, size_t);
-int gmnal_cb_callback(nal_cb_t *, void *, lib_eq_t *, ptl_event_t *);
-
void *gmnal_cb_malloc(nal_cb_t *, size_t);
void gmnal_cb_free(nal_cb_t *, void *, size_t);
a->cb_recv_pages = gmnal_cb_recv_pages; \
a->cb_read = gmnal_cb_read; \
a->cb_write = gmnal_cb_write; \
- a->cb_callback = gmnal_cb_callback; \
+ a->cb_callback = NULL; \
a->cb_malloc = gmnal_cb_malloc; \
a->cb_free = gmnal_cb_free; \
a->cb_map = NULL; \
niov, iov, len);
} else {
CDEBUG(D_ERROR, "Large message send it is not supported\n");
- lib_finalize(nal_cb, private, cookie);
return(PTL_FAIL);
gmnal_large_tx(nal_cb, private, cookie, hdr, type, nid, pid,
niov, iov, len);
return(PTL_OK);
}
-int gmnal_cb_callback(nal_cb_t *nal_cb, void *private, lib_eq_t *eq,
- ptl_event_t *ev)
-{
-
- if (eq->event_callback != NULL) {
- CDEBUG(D_INFO, "found callback\n");
- eq->event_callback(ev);
- }
-
- return(PTL_OK);
-}
-
void *gmnal_cb_malloc(nal_cb_t *nal_cb, size_t len)
{
void *ptr = NULL;
if (!private) {
CDEBUG(D_ERROR, "gmnal_small_rx no context\n");
- lib_finalize(nal_cb, private, cookie);
return(PTL_FAIL);
}
* let portals library know receive is complete
*/
CDEBUG(D_PORTALS, "calling lib_finalize\n");
- if (lib_finalize(nal_cb, private, cookie) != PTL_OK) {
- /* TO DO what to do with failed lib_finalise? */
- CDEBUG(D_INFO, "lib_finalize failed\n");
- }
+ lib_finalize(nal_cb, private, cookie, PTL_OK);
+
/*
* return buffer so it can be used again
*/
return;
}
gmnal_return_stxd(nal_data, stxd);
- if (lib_finalize(nal_cb, stxd, cookie) != PTL_OK) {
- CDEBUG(D_INFO, "Call to lib_finalize failed for stxd [%p]\n",
- stxd);
- }
+ lib_finalize(nal_cb, stxd, cookie, PTL_OK);
+
return;
}
if (!srxd) {
CDEBUG(D_ERROR, "gmnal_large_rx no context\n");
- lib_finalize(nal_cb, private, cookie);
return(PTL_FAIL);
}
* Let our client application proceed
*/
CDEBUG(D_ERROR, "final callback context[%p]\n", srxd);
- if (lib_finalize(nal_cb, srxd, srxd->cookie) != PTL_OK) {
- CDEBUG(D_INFO, "Call to lib_finalize failed for srxd [%p]\n",
- srxd);
- }
+ lib_finalize(nal_cb, srxd, srxd->cookie, PTL_OK);
/*
* send an ack to the sender to let him know we got the data
CDEBUG(D_INFO, "gmnal_large_tx_ack_received stxd [%p]\n", stxd);
- if (lib_finalize(nal_cb, stxd, stxd->cookie) != PTL_OK) {
- CDEBUG(D_INFO, "Call to lib_finalize failed for stxd [%p]\n",
- stxd);
- }
+ lib_finalize(nal_cb, stxd, stxd->cookie, PTL_OK);
/*
* extract the iovec from the stxd, deregister the memory.
if(buf_length > MAX_MSG_SIZE) {
CERROR("kibnal_send:request exceeds Transmit data size (%d).\n",
MAX_MSG_SIZE);
- rc = -1;
+ rc = PTL_FAIL;
return rc;
}
else {
PROF_FINISH(kibnal_send); // time stapm of send operation
- rc = 1;
+ rc = PTL_OK;
return rc;
}
ptl_kiov_t *iov,
size_t mlen)
{
- int rc = 1;
+ int rc = PTL_FAIL;
CDEBUG(D_NET, "kibnal_send_pages\n");
//
// do you need this
//
-int kibnal_callback(nal_cb_t * nal,
+void kibnal_callback(nal_cb_t * nal,
void *private,
lib_eq_t *eq,
ptl_event_t *ev)
{
CDEBUG(D_NET, "recv_pages not implemented\n");
- return PTL_OK;
+ return PTL_FAIL;
}
CDEBUG(D_NET,"kibnal_recv: mlen=%d, rlen=%d\n", mlen, rlen);
/* What was actually received must be >= what sender claims to
- * have sent. This is an LASSERT, since lib-move doesn't
- * check cb return code yet. */
- LASSERT (krx->krx_len >= sizeof (ptl_hdr_t) + rlen);
+ * have sent. */
LASSERT (mlen <= rlen);
+ if (krx->krx_len < sizeof (ptl_hdr_t) + rlen)
+ return (PTL_FAIL);
+
PROF_START(kibnal_recv);
if(mlen != 0) {
PROF_START(lib_finalize);
- lib_finalize(nal, private, cookie);
+ lib_finalize(nal, private, cookie, PTL_OK);
PROF_FINISH(lib_finalize);
PROF_FINISH(kibnal_recv);
- return rlen;
+ return PTL_OK;
}
//
* LIB functions follow
*
*/
-static int
+static ptl_err_t
kqswnal_read(nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr,
size_t len)
{
nal->ni.nid, len, src_addr, dst_addr );
memcpy( dst_addr, src_addr, len );
- return (0);
+ return (PTL_OK);
}
-static int
+static ptl_err_t
kqswnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr,
size_t len)
{
nal->ni.nid, len, src_addr, dst_addr );
memcpy( dst_addr, src_addr, len );
- return (0);
+ return (PTL_OK);
}
static void *
elan3_dvma_unload(kqswnal_data.kqn_ep->DmaState,
kqswnal_data.kqn_eptxdmahandle,
ktx->ktx_basepage, ktx->ktx_nmappedpages);
-
#endif
ktx->ktx_nmappedpages = 0;
}
int
-kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int nob, int niov, ptl_kiov_t *kiov)
+kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int offset, int nob, int niov, ptl_kiov_t *kiov)
{
int nfrags = ktx->ktx_nfrag;
int nmapped = ktx->ktx_nmappedpages;
LASSERT (niov > 0);
LASSERT (nob > 0);
+ /* skip complete frags before 'offset' */
+ while (offset >= kiov->kiov_len) {
+ offset -= kiov->kiov_len;
+ kiov++;
+ niov--;
+ LASSERT (niov > 0);
+ }
+
do {
- int fraglen = kiov->kiov_len;
+ int fraglen = kiov->kiov_len - offset;
/* nob exactly spans the iovs */
LASSERT (fraglen <= nob);
/* XXX this is really crap, but we'll have to kmap until
* EKC has a page (rather than vaddr) mapping interface */
- ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
+ ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset;
CDEBUG(D_NET,
"%p[%d] loading %p for %d, page %d, %d total\n",
kiov++;
niov--;
nob -= fraglen;
+ offset = 0;
/* iov must not run out before end of data */
LASSERT (nob == 0 || niov > 0);
}
int
-kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int nob, int niov, struct iovec *iov)
+kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int offset, int nob,
+ int niov, struct iovec *iov)
{
int nfrags = ktx->ktx_nfrag;
int nmapped = ktx->ktx_nmappedpages;
LASSERT (niov > 0);
LASSERT (nob > 0);
+ /* skip complete frags before offset */
+ while (offset >= iov->iov_len) {
+ offset -= iov->iov_len;
+ iov++;
+ niov--;
+ LASSERT (niov > 0);
+ }
+
do {
- int fraglen = iov->iov_len;
+ int fraglen = iov->iov_len - offset;
long npages = kqswnal_pages_spanned (iov->iov_base, fraglen);
/* nob exactly spans the iovs */
CDEBUG(D_NET,
"%p[%d] loading %p for %d, pages %d for %ld, %d total\n",
- ktx, nfrags, iov->iov_base, fraglen, basepage, npages,
- nmapped);
+ ktx, nfrags, iov->iov_base + offset, fraglen,
+ basepage, npages, nmapped);
#if MULTIRAIL_EKC
ep_dvma_load(kqswnal_data.kqn_ep, NULL,
- iov->iov_base, fraglen,
+ iov->iov_base + offset, fraglen,
kqswnal_data.kqn_ep_tx_nmh, basepage,
&railmask, &ktx->ktx_frags[nfrags]);
#else
elan3_dvma_kaddr_load (kqswnal_data.kqn_ep->DmaState,
kqswnal_data.kqn_eptxdmahandle,
- iov->iov_base, fraglen,
+ iov->iov_base + offset, fraglen,
basepage, &ktx->ktx_frags[nfrags].Base);
if (nfrags > 0 && /* previous frag mapped */
iov++;
niov--;
nob -= fraglen;
+ offset = 0;
/* iov must not run out before end of data */
LASSERT (nob == 0 || niov > 0);
kqswnal_tx_done (kqswnal_tx_t *ktx, int error)
{
lib_msg_t *msg;
- lib_msg_t *repmsg;
+ lib_msg_t *repmsg = NULL;
switch (ktx->ktx_state) {
case KTX_FORWARDING: /* router asked me to forward this packet */
case KTX_SENDING: /* packet sourced locally */
lib_finalize (&kqswnal_lib, ktx->ktx_args[0],
- (lib_msg_t *)ktx->ktx_args[1]);
+ (lib_msg_t *)ktx->ktx_args[1],
+ (error == 0) ? PTL_OK :
+ (error == -ENOMEM) ? PTL_NOSPACE : PTL_FAIL);
break;
case KTX_GETTING: /* Peer has DMA-ed direct? */
msg = (lib_msg_t *)ktx->ktx_args[1];
- repmsg = NULL;
- if (error == 0)
+ if (error == 0) {
repmsg = lib_fake_reply_msg (&kqswnal_lib,
ktx->ktx_nid, msg->md);
+ if (repmsg == NULL)
+ error = -ENOMEM;
+ }
- lib_finalize (&kqswnal_lib, ktx->ktx_args[0], msg);
-
- if (repmsg != NULL)
- lib_finalize (&kqswnal_lib, NULL, repmsg);
+ if (error == 0) {
+ lib_finalize (&kqswnal_lib, ktx->ktx_args[0],
+ msg, PTL_OK);
+ lib_finalize (&kqswnal_lib, NULL, repmsg, PTL_OK);
+ } else {
+ lib_finalize (&kqswnal_lib, ktx->ktx_args[0], msg,
+ (error == -ENOMEM) ? PTL_NOSPACE : PTL_FAIL);
+ }
break;
default:
ktx->ktx_nid, status);
kqswnal_notify_peer_down(ktx);
- status = -EIO;
+ status = -EHOSTDOWN;
} else if (ktx->ktx_state == KTX_GETTING) {
/* RPC completed OK; what did our peer put in the status
int
kqswnal_dma_reply (kqswnal_tx_t *ktx, int nfrag,
- struct iovec *iov, ptl_kiov_t *kiov, int nob)
+ struct iovec *iov, ptl_kiov_t *kiov,
+ int offset, int nob)
{
kqswnal_rx_t *krx = (kqswnal_rx_t *)ktx->ktx_args[0];
char *buffer = (char *)page_address(krx->krx_pages[0]);
/* Map the source data... */
ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 0;
if (kiov != NULL)
- rc = kqswnal_map_tx_kiov (ktx, nob, nfrag, kiov);
+ rc = kqswnal_map_tx_kiov (ktx, offset, nob, nfrag, kiov);
else
- rc = kqswnal_map_tx_iov (ktx, nob, nfrag, iov);
+ rc = kqswnal_map_tx_iov (ktx, offset, nob, nfrag, iov);
if (rc != 0) {
CERROR ("Can't map source data: %d\n", rc);
return (-ECONNABORTED);
}
-static int
+static ptl_err_t
kqswnal_sendmsg (nal_cb_t *nal,
void *private,
lib_msg_t *libmsg,
unsigned int payload_niov,
struct iovec *payload_iov,
ptl_kiov_t *payload_kiov,
+ size_t payload_offset,
size_t payload_nob)
{
kqswnal_tx_t *ktx;
#if KQSW_CHECKSUM
int i;
kqsw_csum_t csum;
+ int sumoff;
int sumnob;
#endif
}
/* peer expects RPC completion with GET data */
- rc = kqswnal_dma_reply (ktx,
- payload_niov, payload_iov,
- payload_kiov, payload_nob);
+ rc = kqswnal_dma_reply (ktx, payload_niov,
+ payload_iov, payload_kiov,
+ payload_offset, payload_nob);
if (rc == 0)
return (PTL_OK);
#if KQSW_CHECKSUM
csum = kqsw_csum (0, (char *)hdr, sizeof (*hdr));
memcpy (ktx->ktx_buffer + sizeof (*hdr), &csum, sizeof (csum));
- for (csum = 0, i = 0, sumnob = payload_nob; sumnob > 0; i++) {
+ for (csum = 0, i = 0, sumoff = payload_offset, sumnob = payload_nob; sumnob > 0; i++) {
+ LASSERT(i < payload_niov);
if (payload_kiov != NULL) {
ptl_kiov_t *kiov = &payload_kiov[i];
- char *addr = ((char *)kmap (kiov->kiov_page)) +
- kiov->kiov_offset;
-
- csum = kqsw_csum (csum, addr, MIN (sumnob, kiov->kiov_len));
- sumnob -= kiov->kiov_len;
+
+ if (sumoff >= kiov->kiov_len) {
+ sumoff -= kiov->kiov_len;
+ } else {
+ char *addr = ((char *)kmap (kiov->kiov_page)) +
+ kiov->kiov_offset + sumoff;
+ int fragnob = kiov->kiov_len - sumoff;
+
+ csum = kqsw_csum(csum, addr, MIN(sumnob, fragnob));
+ sumnob -= fragnob;
+ sumoff = 0;
+ kunmap(kiov->kiov_page);
+ }
} else {
struct iovec *iov = &payload_iov[i];
- csum = kqsw_csum (csum, iov->iov_base, MIN (sumnob, kiov->iov_len));
- sumnob -= iov->iov_len;
+ if (sumoff >= iov->iov_len) {
+ sumoff -= iov->iov_len;
+ } else {
+ char *addr = iov->iov_base + sumoff;
+ int fragnob = iov->iov_len - sumoff;
+
+ csum = kqsw_csum(csum, addr, MIN(sumnob, fragnob));
+ sumnob -= fragnob;
+ sumoff = 0;
+ }
}
}
- memcpy(ktx->ktx_buffer +sizeof(*hdr) +sizeof(csum), &csum,sizeof(csum));
+ memcpy(ktx->ktx_buffer + sizeof(*hdr) + sizeof(csum), &csum, sizeof(csum));
#endif
if (kqswnal_data.kqn_optimized_gets &&
ktx->ktx_state = KTX_GETTING;
if ((libmsg->md->options & PTL_MD_KIOV) != 0)
- rc = kqswnal_map_tx_kiov (ktx, md->length,
+ rc = kqswnal_map_tx_kiov (ktx, 0, md->length,
md->md_niov, md->md_iov.kiov);
else
- rc = kqswnal_map_tx_iov (ktx, md->length,
+ rc = kqswnal_map_tx_iov (ktx, 0, md->length,
md->md_niov, md->md_iov.iov);
if (rc < 0) {
if (payload_nob > 0) {
if (payload_kiov != NULL)
lib_copy_kiov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE,
- payload_niov, payload_kiov, payload_nob);
+ payload_niov, payload_kiov,
+ payload_offset, payload_nob);
else
lib_copy_iov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE,
- payload_niov, payload_iov, payload_nob);
+ payload_niov, payload_iov,
+ payload_offset, payload_nob);
}
} else {
ktx->ktx_frags[0].Len = KQSW_HDR_SIZE;
#endif
if (payload_kiov != NULL)
- rc = kqswnal_map_tx_kiov (ktx, payload_nob,
+ rc = kqswnal_map_tx_kiov (ktx, payload_offset, payload_nob,
payload_niov, payload_kiov);
else
- rc = kqswnal_map_tx_iov (ktx, payload_nob,
+ rc = kqswnal_map_tx_iov (ktx, payload_offset, payload_nob,
payload_niov, payload_iov);
if (rc != 0) {
kqswnal_put_idle_tx (ktx);
return (PTL_OK);
}
-static int
+static ptl_err_t
kqswnal_send (nal_cb_t *nal,
void *private,
lib_msg_t *libmsg,
ptl_pid_t pid,
unsigned int payload_niov,
struct iovec *payload_iov,
+ size_t payload_offset,
size_t payload_nob)
{
return (kqswnal_sendmsg (nal, private, libmsg, hdr, type, nid, pid,
- payload_niov, payload_iov, NULL, payload_nob));
+ payload_niov, payload_iov, NULL,
+ payload_offset, payload_nob));
}
-static int
+static ptl_err_t
kqswnal_send_pages (nal_cb_t *nal,
void *private,
lib_msg_t *libmsg,
ptl_pid_t pid,
unsigned int payload_niov,
ptl_kiov_t *payload_kiov,
+ size_t payload_offset,
size_t payload_nob)
{
return (kqswnal_sendmsg (nal, private, libmsg, hdr, type, nid, pid,
- payload_niov, NULL, payload_kiov, payload_nob));
+ payload_niov, NULL, payload_kiov,
+ payload_offset, payload_nob));
}
void
nob <= KQSW_TX_BUFFER_SIZE)
{
/* send from ktx's pre-mapped contiguous buffer? */
- lib_copy_iov2buf (ktx->ktx_buffer, niov, iov, nob);
+ lib_copy_iov2buf (ktx->ktx_buffer, niov, iov, 0, nob);
#if MULTIRAIL_EKC
ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer,
0, nob);
{
/* zero copy */
ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 0;
- rc = kqswnal_map_tx_iov (ktx, nob, niov, iov);
+ rc = kqswnal_map_tx_iov (ktx, 0, nob, niov, iov);
if (rc != 0)
goto failed;
krx->krx_rpc_reply_needed = 0;
kqswnal_rx_done (krx);
- lib_finalize (&kqswnal_lib, NULL, msg);
+ lib_finalize (&kqswnal_lib, NULL, msg,
+ (status == EP_SUCCESS) ? PTL_OK : PTL_FAIL);
kqswnal_put_idle_tx (ktx);
}
}
#endif
-static int
+static ptl_err_t
kqswnal_recvmsg (nal_cb_t *nal,
void *private,
lib_msg_t *libmsg,
unsigned int niov,
struct iovec *iov,
ptl_kiov_t *kiov,
+ size_t offset,
size_t mlen,
size_t rlen)
{
#endif
CDEBUG(D_NET,"kqswnal_recv, mlen="LPSZ", rlen="LPSZ"\n", mlen, rlen);
- /* What was actually received must be >= payload.
- * This is an LASSERT, as lib_finalize() doesn't have a completion status. */
- LASSERT (krx->krx_nob >= KQSW_HDR_SIZE + mlen);
+ /* What was actually received must be >= payload. */
LASSERT (mlen <= rlen);
+ if (krx->krx_nob < KQSW_HDR_SIZE + mlen) {
+ CERROR("Bad message size: have %d, need %d + %d\n",
+ krx->krx_nob, KQSW_HDR_SIZE, (int)mlen);
+ return (PTL_FAIL);
+ }
/* It must be OK to kmap() if required */
LASSERT (kiov == NULL || !in_interrupt ());
page_nob = PAGE_SIZE - KQSW_HDR_SIZE;
LASSERT (niov > 0);
+
if (kiov != NULL) {
- iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
- iov_nob = kiov->kiov_len;
+ /* skip complete frags */
+ while (offset >= kiov->kiov_len) {
+ offset -= kiov->kiov_len;
+ kiov++;
+ niov--;
+ LASSERT (niov > 0);
+ }
+ iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset;
+ iov_nob = kiov->kiov_len - offset;
} else {
- iov_ptr = iov->iov_base;
- iov_nob = iov->iov_len;
+ /* skip complete frags */
+ while (offset >= iov->iov_len) {
+ offset -= iov->iov_len;
+ iov++;
+ niov--;
+ LASSERT (niov > 0);
+ }
+ iov_ptr = iov->iov_base + offset;
+ iov_nob = iov->iov_len - offset;
}
-
+
for (;;)
{
- /* We expect the iov to exactly match mlen */
- LASSERT (iov_nob <= mlen);
-
- frag = MIN (page_nob, iov_nob);
+ frag = mlen;
+ if (frag > page_nob)
+ frag = page_nob;
+ if (frag > iov_nob)
+ frag = iov_nob;
+
memcpy (iov_ptr, page_ptr, frag);
#if KQSW_CHECKSUM
payload_csum = kqsw_csum (payload_csum, iov_ptr, frag);
"csum_nob %d\n",
hdr_csum, payload_csum, csum_frags, csum_nob);
#endif
- lib_finalize(nal, private, libmsg);
+ lib_finalize(nal, private, libmsg, PTL_OK);
- return (rlen);
+ return (PTL_OK);
}
-static int
+static ptl_err_t
kqswnal_recv(nal_cb_t *nal,
void *private,
lib_msg_t *libmsg,
unsigned int niov,
struct iovec *iov,
+ size_t offset,
size_t mlen,
size_t rlen)
{
- return (kqswnal_recvmsg (nal, private, libmsg, niov, iov, NULL, mlen, rlen));
+ return (kqswnal_recvmsg(nal, private, libmsg,
+ niov, iov, NULL,
+ offset, mlen, rlen));
}
-static int
+static ptl_err_t
kqswnal_recv_pages (nal_cb_t *nal,
void *private,
lib_msg_t *libmsg,
unsigned int niov,
ptl_kiov_t *kiov,
+ size_t offset,
size_t mlen,
size_t rlen)
{
- return (kqswnal_recvmsg (nal, private, libmsg, niov, NULL, kiov, mlen, rlen));
+ return (kqswnal_recvmsg(nal, private, libmsg,
+ niov, NULL, kiov,
+ offset, mlen, rlen));
}
int
break;
}
- lib_finalize(ktx->ktx_nal, ktx->ktx_private, ktx->ktx_cookie);
+ lib_finalize(ktx->ktx_nal, ktx->ktx_private, ktx->ktx_cookie,
+ (err == 0) ? PTL_OK : PTL_FAIL);
PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
}
if (buf_len > mac_get_mtusize(ksci->ksci_machandle)) {
CERROR("kscimacnal:request exceeds TX MTU size (%ld).\n",
mac_get_mtusize(ksci->ksci_machandle));
- return -EINVAL;
+ return PTL_FAIL;
}
/* save transaction info for later finalize and cleanup */
PORTAL_ALLOC(ktx, (sizeof(kscimacnal_tx_t)));
if (!ktx) {
- return -ENOMEM;
+ return PTL_NOSPACE;
}
ktx->ktx_nmapped = 0; /* Start with no mapped pages :) */
kscimacnal_txrelease, ktx);
if (!msg) {
PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
- return -ENOMEM;
+ return PTL_NOSPACE;
}
mac_put_mblk(msg, sizeof(ptl_hdr_t));
lastblk=msg;
if(!newblk) {
mac_free_msg(msg);
PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
- return -ENOMEM;
+ return PTL_NOSPACE;
}
mac_put_mblk(newblk, nob);
mac_link_mblk(lastblk, newblk);
CERROR("kscimacnal: mac_send() failed, rc=%d\n", rc);
mac_free_msg(msg);
PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
- return rc;
+ return PTL_FAIL;
}
- return 0;
+ return PTL_OK;
}
krx->msg, mlen, rlen, niov);
/* What was actually received must be >= what sender claims to have
- * sent. This is an LASSERT, since lib-move doesn't check cb return
- * code yet. Also, rlen seems to be negative when mlen==0 so don't
- * assert on that.
- */
- LASSERT (mlen==0 || mac_msg_size(krx->msg) >= sizeof(ptl_hdr_t)+rlen);
- LASSERT (mlen==0 || mlen <= rlen);
+ * sent. */
+ LASSERT (mlen <= rlen); /* something is wrong if this isn't true */
+ if (mac_msg_size(krx->msg) < sizeof(ptl_hdr_t)+mlen) {
+ /* We didn't receive everything lib thinks we did */
+ CERROR("Bad message size: have %d, need %d + %d\n",
+ mac_msg_size(krx->msg), (int)sizeof(ptl_hdr_t), (int)mlen);
+ return (PTL_FAIL);
+ }
+
/* It must be OK to kmap() if required */
LASSERT (kiov == NULL || !in_interrupt ());
/* Either all pages or all vaddrs */
CDEBUG(D_NET, "Calling lib_finalize.\n");
PROF_START(lib_finalize);
- lib_finalize(nal, private, cookie);
+ lib_finalize(nal, private, cookie, PTL_OK);
PROF_FINISH(lib_finalize);
CDEBUG(D_NET, "Done.\n");
- return rlen;
+ return PTL_OK;
}
/* complete current receive if any */
switch (conn->ksnc_rx_state) {
case SOCKNAL_RX_BODY:
-#if 0
- lib_finalize (&ksocknal_lib, NULL, conn->ksnc_cookie);
-#else
- CERROR ("Refusing to complete a partial receive from "
- LPX64", ip %d.%d.%d.%d:%d\n", conn->ksnc_peer->ksnp_nid,
- HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
- CERROR ("This may hang communications and "
- "prevent modules from unloading\n");
-#endif
+ CERROR("Completing partial receive from "LPX64
+ ", ip %d.%d.%d.%d:%d, with error\n",
+ conn->ksnc_peer->ksnp_nid,
+ HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+ lib_finalize (&ksocknal_lib, NULL, conn->ksnc_cookie, PTL_FAIL);
break;
case SOCKNAL_RX_BODY_FWD:
ksocknal_fmb_callback (conn->ksnc_cookie, -ECONNABORTED);
* LIB functions follow
*
*/
-int
+ptl_err_t
ksocknal_read(nal_cb_t *nal, void *private, void *dst_addr,
user_ptr src_addr, size_t len)
{
nal->ni.nid, (long)len, src_addr, dst_addr);
memcpy( dst_addr, src_addr, len );
- return 0;
+ return PTL_OK;
}
-int
+ptl_err_t
ksocknal_write(nal_cb_t *nal, void *private, user_ptr dst_addr,
void *src_addr, size_t len)
{
nal->ni.nid, (long)len, src_addr, dst_addr);
memcpy( dst_addr, src_addr, len );
- return 0;
-}
-
-int
-ksocknal_callback (nal_cb_t * nal, void *private, lib_eq_t *eq,
- ptl_event_t *ev)
-{
- CDEBUG(D_NET, LPX64": callback eq %p ev %p\n",
- nal->ni.nid, eq, ev);
-
- if (eq->event_callback != NULL)
- eq->event_callback(ev);
-
- return 0;
+ return PTL_OK;
}
void *
if (tx->tx_isfwd) { /* was a forwarded packet? */
kpr_fwd_done (&ksocknal_data.ksnd_router,
- KSOCK_TX_2_KPR_FWD_DESC (tx), 0);
+ KSOCK_TX_2_KPR_FWD_DESC (tx),
+ (tx->tx_resid == 0) ? 0 : -ECONNABORTED);
EXIT;
return;
}
/* local send */
ltx = KSOCK_TX_2_KSOCK_LTX (tx);
- lib_finalize (&ksocknal_lib, ltx->ltx_private, ltx->ltx_cookie);
+ lib_finalize (&ksocknal_lib, ltx->ltx_private, ltx->ltx_cookie,
+ (tx->tx_resid == 0) ? PTL_OK : PTL_FAIL);
ksocknal_free_ltx (ltx);
EXIT;
LASSERT (rc < 0);
if (!conn->ksnc_closing)
- CERROR ("[%p] Error %d on write to "LPX64
- " ip %d.%d.%d.%d:%d\n",conn, rc,
- conn->ksnc_peer->ksnp_nid,
- HIPQUAD(conn->ksnc_ipaddr),
- conn->ksnc_port);
+ CERROR("[%p] Error %d on write to "LPX64
+ " ip %d.%d.%d.%d:%d\n", conn, rc,
+ conn->ksnc_peer->ksnp_nid,
+ HIPQUAD(conn->ksnc_ipaddr),
+ conn->ksnc_port);
ksocknal_close_conn_and_siblings (conn, rc);
ksocknal_tx_launched (tx);
-
+
return (rc);
-}
+}
void
ksocknal_launch_autoconnect_locked (ksock_route_t *route)
ptl_nid_t target_nid;
int rc;
ksock_peer_t *peer = ksocknal_find_peer_locked (nid);
-
+
if (peer != NULL)
return (peer);
-
+
if (tx->tx_isfwd) {
CERROR ("Can't send packet to "LPX64
- " %s: routed target is not a peer\n",
+ " %s: routed target is not a peer\n",
nid, portals_nid2str(SOCKNAL, nid, ipbuf));
return (NULL);
}
-
+
rc = kpr_lookup (&ksocknal_data.ksnd_router, nid, tx->tx_nob,
&target_nid);
if (rc != 0) {
- CERROR ("Can't route to "LPX64" %s: router error %d\n",
+ CERROR ("Can't route to "LPX64" %s: router error %d\n",
nid, portals_nid2str(SOCKNAL, nid, ipbuf), rc);
return (NULL);
}
return (-EHOSTUNREACH);
}
-int
+ptl_err_t
ksocknal_sendmsg(nal_cb_t *nal,
void *private,
lib_msg_t *cookie,
unsigned int payload_niov,
struct iovec *payload_iov,
ptl_kiov_t *payload_kiov,
+ size_t payload_offset,
size_t payload_nob)
{
ksock_ltx_t *ltx;
ltx->ltx_tx.tx_kiov = NULL;
ltx->ltx_tx.tx_nkiov = 0;
- ltx->ltx_tx.tx_niov = 1 + payload_niov;
-
- memcpy(ltx->ltx_iov + 1, payload_iov,
- payload_niov * sizeof (*payload_iov));
-
+ ltx->ltx_tx.tx_niov =
+ 1 + lib_extract_iov(payload_niov, <x->ltx_iov[1],
+ payload_niov, payload_iov,
+ payload_offset, payload_nob);
} else {
/* payload is all pages */
- ltx->ltx_tx.tx_kiov = ltx->ltx_kiov;
- ltx->ltx_tx.tx_nkiov = payload_niov;
-
ltx->ltx_tx.tx_niov = 1;
- memcpy(ltx->ltx_kiov, payload_kiov,
- payload_niov * sizeof (*payload_kiov));
+ ltx->ltx_tx.tx_kiov = ltx->ltx_kiov;
+ ltx->ltx_tx.tx_nkiov =
+ lib_extract_kiov(payload_niov, ltx->ltx_kiov,
+ payload_niov, payload_kiov,
+ payload_offset, payload_nob);
}
rc = ksocknal_launch_packet(<x->ltx_tx, nid);
return (PTL_FAIL);
}
-int
+ptl_err_t
ksocknal_send (nal_cb_t *nal, void *private, lib_msg_t *cookie,
ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
unsigned int payload_niov, struct iovec *payload_iov,
- size_t payload_len)
+ size_t payload_offset, size_t payload_len)
{
return (ksocknal_sendmsg(nal, private, cookie,
hdr, type, nid, pid,
payload_niov, payload_iov, NULL,
- payload_len));
+ payload_offset, payload_len));
}
-int
+ptl_err_t
ksocknal_send_pages (nal_cb_t *nal, void *private, lib_msg_t *cookie,
ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
unsigned int payload_niov, ptl_kiov_t *payload_kiov,
- size_t payload_len)
+ size_t payload_offset, size_t payload_len)
{
return (ksocknal_sendmsg(nal, private, cookie,
hdr, type, nid, pid,
payload_niov, NULL, payload_kiov,
- payload_len));
+ payload_offset, payload_len));
}
void
/* drop peer ref taken on init */
ksocknal_put_peer (fmb->fmb_peer);
-
+
spin_lock_irqsave (&fmp->fmp_lock, flags);
list_add (&fmb->fmb_list, &fmp->fmp_idle_fmbs);
case SOCKNAL_RX_BODY:
/* payload all received */
- lib_finalize(&ksocknal_lib, NULL, conn->ksnc_cookie);
+ lib_finalize(&ksocknal_lib, NULL, conn->ksnc_cookie, PTL_OK);
/* Fall through */
case SOCKNAL_RX_SLOP:
return (-EINVAL); /* keep gcc happy */
}
-int
+ptl_err_t
ksocknal_recv (nal_cb_t *nal, void *private, lib_msg_t *msg,
- unsigned int niov, struct iovec *iov, size_t mlen, size_t rlen)
+ unsigned int niov, struct iovec *iov,
+ size_t offset, size_t mlen, size_t rlen)
{
ksock_conn_t *conn = (ksock_conn_t *)private;
conn->ksnc_rx_nkiov = 0;
conn->ksnc_rx_kiov = NULL;
- conn->ksnc_rx_niov = niov;
conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov;
- memcpy (conn->ksnc_rx_iov, iov, niov * sizeof (*iov));
+ conn->ksnc_rx_niov =
+ lib_extract_iov(PTL_MD_MAX_IOV, conn->ksnc_rx_iov,
+ niov, iov, offset, mlen);
LASSERT (mlen ==
lib_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) +
lib_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov));
- return (rlen);
+ return (PTL_OK);
}
-int
+ptl_err_t
ksocknal_recv_pages (nal_cb_t *nal, void *private, lib_msg_t *msg,
- unsigned int niov, ptl_kiov_t *kiov, size_t mlen, size_t rlen)
+ unsigned int niov, ptl_kiov_t *kiov,
+ size_t offset, size_t mlen, size_t rlen)
{
ksock_conn_t *conn = (ksock_conn_t *)private;
conn->ksnc_rx_niov = 0;
conn->ksnc_rx_iov = NULL;
- conn->ksnc_rx_nkiov = niov;
conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov;
- memcpy (conn->ksnc_rx_kiov, kiov, niov * sizeof (*kiov));
+ conn->ksnc_rx_nkiov =
+ lib_extract_kiov(PTL_MD_MAX_IOV, conn->ksnc_rx_kiov,
+ niov, kiov, offset, mlen);
LASSERT (mlen ==
lib_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) +
lib_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov));
- return (rlen);
+ return (PTL_OK);
}
int ksocknal_scheduler (void *arg)
rc, *nid, portals_nid2str(SOCKNAL, *nid, ipbuf));
return (rc);
}
-
+
if (hmv->magic != __le32_to_cpu (PORTALS_PROTO_MAGIC)) {
CERROR ("Bad magic %#08x (%#08x expected) from "LPX64" %s\n",
__cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC, *nid,
} else if (*nid != __le64_to_cpu (hdr.src_nid)) {
CERROR ("Connected to nid "LPX64" %s, but expecting "LPX64" %s\n",
__le64_to_cpu (hdr.src_nid),
- portals_nid2str(SOCKNAL,
+ portals_nid2str(SOCKNAL,
__le64_to_cpu(hdr.src_nid),
ipbuf),
*nid, portals_nid2str(SOCKNAL, *nid, ipbuf));
*type = SOCKNAL_CONN_BULK_IN;
break;
default:
- CERROR ("Unexpected type %d from "LPX64" %s\n",
+ CERROR ("Unexpected type %d from "LPX64" %s\n",
*type, *nid,
portals_nid2str(SOCKNAL, *nid, ipbuf));
return (-EPROTO);
if (rc != 0) {
CERROR ("Error %d connecting to "LPX64" %s\n", rc,
route->ksnr_peer->ksnp_nid,
- portals_nid2str(SOCKNAL,
- route->ksnr_peer->ksnp_nid,
+ portals_nid2str(SOCKNAL,
+ route->ksnr_peer->ksnp_nid,
ipbuf));
goto out;
}
while (!list_empty (&zombies)) {
char ipbuf[PTL_NALFMT_SIZE];
tx = list_entry (zombies.next, ksock_tx_t, tx_list);
-
+
CERROR ("Deleting packet type %d len %d ("LPX64" %s->"LPX64" %s)\n",
NTOH__u32 (tx->tx_hdr->type),
NTOH__u32 (tx->tx_hdr->payload_length),
cb_recv_pages: ksocknal_recv_pages,
cb_read: ksocknal_read,
cb_write: ksocknal_write,
- cb_callback: ksocknal_callback,
cb_malloc: ksocknal_malloc,
cb_free: ksocknal_free,
cb_printf: ksocknal_printf,
EXPORT_SYMBOL(lib_iov_nob);
EXPORT_SYMBOL(lib_copy_iov2buf);
EXPORT_SYMBOL(lib_copy_buf2iov);
+EXPORT_SYMBOL(lib_extract_iov);
EXPORT_SYMBOL(lib_kiov_nob);
EXPORT_SYMBOL(lib_copy_kiov2buf);
EXPORT_SYMBOL(lib_copy_buf2kiov);
+EXPORT_SYMBOL(lib_extract_kiov);
EXPORT_SYMBOL(lib_finalize);
EXPORT_SYMBOL(lib_parse);
EXPORT_SYMBOL(lib_fake_reply_msg);
CPPFLAGS=
INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include
-lib_LIBRARIES= libportals.a
+noinst_LIBRARIES= libportals.a
libportals_a_SOURCES= api-eq.c api-init.c api-me.c api-errno.c api-ni.c api-wrap.c lib-dispatch.c lib-init.c lib-me.c lib-msg.c lib-eq.c lib-md.c lib-move.c lib-ni.c lib-pid.c
+
+if LIBLUSTRE
+libportals_a_CFLAGS= -fPIC
+endif
*ev = *new_event;
- /* Set the unlinked_me interface number if there is one to pass
- * back, since the NAL hasn't a clue what it is and therefore can't
- * set it. */
- if (!PtlHandleEqual (ev->unlinked_me, PTL_HANDLE_NONE))
- ev->unlinked_me.nal_idx = eventq.nal_idx;
-
/* ensure event is delivered correctly despite possible
races with lib_finalize */
if (eq->sequence != new_event->sequence) {
}
#ifndef __KERNEL__
+#if 0
static jmp_buf eq_jumpbuf;
static void eq_timeout(int signal)
return rc;
}
+#else
+#include <errno.h>
-#endif
+/* FIXME
+ * Here the timeout needs a trick with tcpnal; definitely unclean, but OK
+ * for the moment.
+ */
+
+/* global variables defined by tcpnal */
+extern int __tcpnal_eqwait_timeout_value;
+extern int __tcpnal_eqwait_timedout;
+
+int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out,
+ int timeout)
+{
+ int rc;
+ if (!timeout)
+ return PtlEQWait(eventq_in, event_out);
+
+ __tcpnal_eqwait_timeout_value = timeout;
+
+ while ((rc = PtlEQGet(eventq_in, event_out)) == PTL_EQ_EMPTY) {
+ nal_t *nal = ptl_hndl2nal(&eventq_in);
+
+ if (nal->yield)
+ nal->yield(nal);
+
+ if (__tcpnal_eqwait_timedout) {
+ if (__tcpnal_eqwait_timedout != ETIMEDOUT)
+ printf("Warning: yield return error %d\n",
+ __tcpnal_eqwait_timedout);
+ rc = PTL_EQ_EMPTY;
+ break;
+ }
+ }
+
+ __tcpnal_eqwait_timeout_value = 0;
+
+ return rc;
+}
+#endif
+#endif /* __KERNEL__ */
"PTL_IOV_TOO_SMALL",
"PTL_EQ_INUSE",
- "PTL_MD_INUSE"
};
/* If you change these, you must update the number table in portals/errno.h */
if (ptl_interfaces[i] == nal) {
nal->refct++;
handle->nal_idx = (NI_HANDLE_MAGIC & ~NI_HANDLE_MASK) | i;
- fprintf(stderr, "Returning existing NAL (%d)\n", i);
+ CDEBUG(D_OTHER, "Returning existing NAL (%d)\n", i);
ptl_ni_init_mutex_exit ();
return PTL_OK;
}
nal_t *nal;
if (!ptl_init) {
- fprintf(stderr, "PtlGetId: Not initialized\n");
+ CERROR("Not initialized\n");
return PTL_NOINIT;
}
int i;
if (!ptl_init) {
- fprintf(stderr, "PtlMDAttach/Bind/Update: Not initialized\n");
+ CERROR("PtlMDAttach/Bind/Update: Not initialized\n");
return PTL_NOINIT;
}
# include <sys/time.h>
#endif
-#ifndef PTL_USE_DESC_LISTS
-static int ptl_slab_users;
-
-atomic_t md_in_use_count = ATOMIC_INIT(0);
-atomic_t msg_in_use_count = ATOMIC_INIT(0);
-atomic_t me_in_use_count = ATOMIC_INIT(0);
-atomic_t eq_in_use_count = ATOMIC_INIT(0);
+#ifndef PTL_USE_LIB_FREELIST
int
kportal_descriptor_setup (nal_cb_t *nal)
{
- ptl_slab_users++;
- RETURN(PTL_OK);
+ return PTL_OK;
}
void
kportal_descriptor_cleanup (nal_cb_t *nal)
{
- if (--ptl_slab_users != 0)
- return;
-
- LASSERT (atomic_read (&md_in_use_count) == 0);
- LASSERT (atomic_read (&me_in_use_count) == 0);
- LASSERT (atomic_read (&eq_in_use_count) == 0);
- LASSERT (atomic_read (&msg_in_use_count) == 0);
}
#else
int rc;
int i;
- /* NB we are passes an allocated, but uninitialised/active md.
+ /* NB we are passed an allocated, but uninitialised/active md.
* if we return success, caller may lib_md_unlink() it.
* otherwise caller may only lib_md_free() it.
*/
return PTL_INV_EQ;
}
- if ((md->options & PTL_MD_IOV) != 0 && /* discontiguous MD */
- md->niov > PTL_MD_MAX_IOV) /* too many fragments */
- return PTL_IOV_TOO_MANY;
+ /* Must check this _before_ allocation. Also, note that non-iov
+ * MDs must set md_niov to 0. */
+ LASSERT((md->options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0 ||
+ md->niov <= PTL_MD_MAX_IOV);
if ((md->options & max_size_opts) != 0 && /* max size used */
(md->max_size < 0 || md->max_size > md->length)) // illegal max_size
lib_md_t *md;
unsigned long flags;
- md = lib_md_alloc (nal);
+ if ((args->md_in.options & (PTL_MD_KIOV | PTL_MD_IOV)) != 0 &&
+ args->md_in.niov > PTL_MD_MAX_IOV) /* too many fragments */
+ return (ret->rc = PTL_IOV_TOO_MANY);
+
+ md = lib_md_alloc(nal, &args->md_in);
if (md == NULL)
return (ret->rc = PTL_NOSPACE);
lib_md_t *md;
unsigned long flags;
- md = lib_md_alloc (nal);
+ if ((args->md_in.options & (PTL_MD_KIOV | PTL_MD_IOV)) != 0 &&
+ args->md_in.niov > PTL_MD_MAX_IOV) /* too many fragments */
+ return (ret->rc = PTL_IOV_TOO_MANY);
+
+ md = lib_md_alloc(nal, &args->md_in);
if (md == NULL)
return (ret->rc = PTL_NOSPACE);
int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
{
- PtlMDUnlink_in *args = v_args;
+ PtlMDUnlink_in *args = v_args;
PtlMDUnlink_out *ret = v_ret;
-
- lib_md_t *md;
- unsigned long flags;
+ ptl_event_t ev;
+ lib_md_t *md;
+ unsigned long flags;
state_lock(nal, &flags);
md = ptl_handle2md(&args->md_in, nal);
if (md == NULL) {
- ret->rc = PTL_INV_MD;
- } else if (md->pending != 0) { /* being filled/spilled */
- ret->rc = PTL_MD_INUSE;
- } else {
- /* Callers attempting to unlink a busy MD which will get
- * unlinked once the net op completes should see INUSE,
- * before completion and INV_MD thereafter. LASSERT we've
- * got that right... */
- LASSERT ((md->md_flags & PTL_MD_FLAG_UNLINK) == 0);
-
- lib_md_deconstruct(nal, md, &ret->status_out);
- lib_md_unlink(nal, md);
- ret->rc = PTL_OK;
+ state_unlock(nal, &flags);
+ return (ret->rc = PTL_INV_MD);
+ }
+
+ /* If the MD is busy, lib_md_unlink just marks it for deletion, and
+ * when the NAL is done, the completion event flags that the MD was
+ * unlinked. Otherwise, we enqueue an event now... */
+
+ if (md->eq != NULL &&
+ md->pending == 0) {
+ memset(&ev, 0, sizeof(ev));
+
+ ev.type = PTL_EVENT_UNLINK;
+ ev.status = PTL_OK;
+ ev.unlinked = 1;
+ lib_md_deconstruct(nal, md, &ev.mem_desc);
+
+ lib_enq_event_locked(nal, private, md->eq, &ev);
}
+ lib_md_deconstruct(nal, md, &ret->status_out);
+ lib_md_unlink(nal, md);
+ ret->rc = PTL_OK;
+
state_unlock(nal, &flags);
- return (ret->rc);
+ return (PTL_OK);
}
int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *v_args,
goto out;
}
+ /* XXX fttb, the new MD must be the same type wrt fragmentation */
+ if (((new->options ^ md->options) &
+ (PTL_MD_IOV | PTL_MD_KIOV)) != 0) {
+ ret->rc = PTL_INV_MD;
+ goto out;
+ }
+
+ if (new->niov > md->md_niov) {
+ ret->rc = PTL_IOV_TOO_MANY;
+ goto out;
+ }
+
+ if (new->niov < md->md_niov) {
+ ret->rc = PTL_IOV_TOO_SMALL;
+ goto out;
+ }
+
if (!PtlHandleEqual (args->testq_in, PTL_EQ_NONE)) {
test_eq = ptl_handle2eq(&args->testq_in, nal);
if (test_eq == NULL) {
}
void
-lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len)
+lib_copy_iov2buf (char *dest, int niov, struct iovec *iov,
+ ptl_size_t offset, ptl_size_t len)
{
ptl_size_t nob;
- while (len > 0)
- {
+ if (len == 0)
+ return;
+
+ /* skip complete frags before 'offset' */
+ LASSERT (niov > 0);
+ while (offset >= iov->iov_len) {
+ offset -= iov->iov_len;
+ iov++;
+ niov--;
+ LASSERT (niov > 0);
+ }
+
+ do {
LASSERT (niov > 0);
- nob = MIN (iov->iov_len, len);
- memcpy (dest, iov->iov_base, nob);
+ nob = MIN (iov->iov_len - offset, len);
+ memcpy (dest, iov->iov_base + offset, nob);
len -= nob;
dest += nob;
niov--;
iov++;
- }
+ offset = 0;
+ } while (len > 0);
}
void
-lib_copy_buf2iov (int niov, struct iovec *iov, char *src, ptl_size_t len)
+lib_copy_buf2iov (int niov, struct iovec *iov, ptl_size_t offset,
+ char *src, ptl_size_t len)
{
ptl_size_t nob;
- while (len > 0)
- {
+ if (len == 0)
+ return;
+
+ /* skip complete frags before 'offset' */
+ LASSERT (niov > 0);
+ while (offset >= iov->iov_len) {
+ offset -= iov->iov_len;
+ iov++;
+ niov--;
LASSERT (niov > 0);
- nob = MIN (iov->iov_len, len);
- memcpy (iov->iov_base, src, nob);
+ }
+
+ do {
+ LASSERT (niov > 0);
+ nob = MIN (iov->iov_len - offset, len);
+ memcpy (iov->iov_base + offset, src, nob);
len -= nob;
src += nob;
niov--;
iov++;
- }
+ offset = 0;
+ } while (len > 0);
}
-static int
-lib_extract_iov (struct iovec *dst, lib_md_t *md,
+int
+lib_extract_iov (int dst_niov, struct iovec *dst,
+ int src_niov, struct iovec *src,
ptl_size_t offset, ptl_size_t len)
{
/* Initialise 'dst' to the subset of 'src' starting at 'offset',
* for exactly 'len' bytes, and return the number of entries.
* NB not destructive to 'src' */
- int src_niov = md->md_niov;
- struct iovec *src = md->md_iov.iov;
ptl_size_t frag_len;
- int dst_niov;
+ int niov;
- LASSERT (offset + len <= md->length);
-
if (len == 0) /* no data => */
return (0); /* no frags */
LASSERT (src_niov > 0);
}
- dst_niov = 1;
+ niov = 1;
for (;;) {
LASSERT (src_niov > 0);
- LASSERT (dst_niov <= PTL_MD_MAX_IOV);
+ LASSERT (niov <= dst_niov);
frag_len = src->iov_len - offset;
dst->iov_base = ((char *)src->iov_base) + offset;
if (len <= frag_len) {
dst->iov_len = len;
- return (dst_niov);
+ return (niov);
}
dst->iov_len = frag_len;
len -= frag_len;
dst++;
src++;
- dst_niov++;
+ niov++;
src_niov--;
offset = 0;
}
}
void
-lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len)
+lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov,
+ ptl_size_t offset, ptl_size_t len)
{
LASSERT (0);
}
void
-lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *dest, ptl_size_t len)
+lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset,
+ char *src, ptl_size_t len)
{
LASSERT (0);
}
-static int
-lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md,
+int
+lib_extract_kiov (int dst_niov, ptl_kiov_t *dst,
+ int src_niov, ptl_kiov_t *src,
ptl_size_t offset, ptl_size_t len)
{
LASSERT (0);
}
void
-lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len)
+lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov,
+ ptl_size_t offset, ptl_size_t len)
{
ptl_size_t nob;
char *addr;
+
+ if (len == 0)
+ return;
LASSERT (!in_interrupt ());
- while (len > 0)
- {
+
+ LASSERT (niov > 0);
+ while (offset >= kiov->kiov_len) {
+ offset -= kiov->kiov_len;
+ kiov++;
+ niov--;
+ LASSERT (niov > 0);
+ }
+
+ do {
LASSERT (niov > 0);
- nob = MIN (kiov->kiov_len, len);
+ nob = MIN (kiov->kiov_len - offset, len);
- addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
+ addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset;
memcpy (dest, addr, nob);
kunmap (kiov->kiov_page);
dest += nob;
niov--;
kiov++;
- }
+ offset = 0;
+ } while (len > 0);
}
void
-lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *src, ptl_size_t len)
+lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset,
+ char *src, ptl_size_t len)
{
ptl_size_t nob;
char *addr;
+ if (len == 0)
+ return;
+
LASSERT (!in_interrupt ());
- while (len > 0)
- {
+
+ LASSERT (niov > 0);
+ while (offset >= kiov->kiov_len) {
+ offset -= kiov->kiov_len;
+ kiov++;
+ niov--;
+ LASSERT (niov > 0);
+ }
+
+ do {
LASSERT (niov > 0);
- nob = MIN (kiov->kiov_len, len);
+ nob = MIN (kiov->kiov_len - offset, len);
- addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
+ addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset;
memcpy (addr, src, nob);
kunmap (kiov->kiov_page);
src += nob;
niov--;
kiov++;
- }
+ offset = 0;
+ } while (len > 0);
}
-static int
-lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md,
+int
+lib_extract_kiov (int dst_niov, ptl_kiov_t *dst,
+ int src_niov, ptl_kiov_t *src,
ptl_size_t offset, ptl_size_t len)
{
/* Initialise 'dst' to the subset of 'src' starting at 'offset',
* for exactly 'len' bytes, and return the number of entries.
* NB not destructive to 'src' */
- int src_niov = md->md_niov;
- ptl_kiov_t *src = md->md_iov.kiov;
ptl_size_t frag_len;
- int dst_niov;
+ int niov;
- LASSERT (offset + len <= md->length);
-
if (len == 0) /* no data => */
return (0); /* no frags */
LASSERT (src_niov > 0);
}
- dst_niov = 1;
+ niov = 1;
for (;;) {
LASSERT (src_niov > 0);
- LASSERT (dst_niov <= PTL_MD_MAX_IOV);
+ LASSERT (niov <= dst_niov);
frag_len = src->kiov_len - offset;
dst->kiov_page = src->kiov_page;
if (len <= frag_len) {
dst->kiov_len = len;
LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_SIZE);
- return (dst_niov);
+ return (niov);
}
dst->kiov_len = frag_len;
len -= frag_len;
dst++;
src++;
- dst_niov++;
+ niov++;
src_niov--;
offset = 0;
}
}
#endif
-void
+ptl_err_t
lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen)
{
- int niov;
-
if (mlen == 0)
- nal->cb_recv (nal, private, msg, 0, NULL, 0, rlen);
- else if ((md->options & PTL_MD_KIOV) == 0) {
- niov = lib_extract_iov (msg->msg_iov.iov, md, offset, mlen);
- nal->cb_recv (nal, private, msg,
- niov, msg->msg_iov.iov, mlen, rlen);
- } else {
- niov = lib_extract_kiov (msg->msg_iov.kiov, md, offset, mlen);
- nal->cb_recv_pages (nal, private, msg,
- niov, msg->msg_iov.kiov, mlen, rlen);
- }
+ return (nal->cb_recv(nal, private, msg,
+ 0, NULL,
+ offset, mlen, rlen));
+
+ if ((md->options & PTL_MD_KIOV) == 0)
+ return (nal->cb_recv(nal, private, msg,
+ md->md_niov, md->md_iov.iov,
+ offset, mlen, rlen));
+
+ return (nal->cb_recv_pages(nal, private, msg,
+ md->md_niov, md->md_iov.kiov,
+ offset, mlen, rlen));
}
-int
+ptl_err_t
lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg,
ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
lib_md_t *md, ptl_size_t offset, ptl_size_t len)
{
- int niov;
-
if (len == 0)
- return (nal->cb_send (nal, private, msg,
- hdr, type, nid, pid,
- 0, NULL, 0));
+ return (nal->cb_send(nal, private, msg,
+ hdr, type, nid, pid,
+ 0, NULL,
+ offset, len));
- if ((md->options & PTL_MD_KIOV) == 0) {
- niov = lib_extract_iov (msg->msg_iov.iov, md, offset, len);
- return (nal->cb_send (nal, private, msg,
- hdr, type, nid, pid,
- niov, msg->msg_iov.iov, len));
- }
-
- niov = lib_extract_kiov (msg->msg_iov.kiov, md, offset, len);
- return (nal->cb_send_pages (nal, private, msg,
- hdr, type, nid, pid,
- niov, msg->msg_iov.kiov, len));
+ if ((md->options & PTL_MD_KIOV) == 0)
+ return (nal->cb_send(nal, private, msg,
+ hdr, type, nid, pid,
+ md->md_niov, md->md_iov.iov,
+ offset, len));
+
+ return (nal->cb_send_pages(nal, private, msg,
+ hdr, type, nid, pid,
+ md->md_niov, md->md_iov.kiov,
+ offset, len));
}
-static lib_msg_t *
-get_new_msg (nal_cb_t *nal, lib_md_t *md)
+static void
+lib_commit_md (nal_cb_t *nal, lib_md_t *md, lib_msg_t *msg)
{
/* ALWAYS called holding the state_lock */
lib_counters_t *counters = &nal->ni.counters;
- lib_msg_t *msg = lib_msg_alloc (nal);
-
- if (msg == NULL)
- return (NULL);
-
- memset (msg, 0, sizeof (*msg));
-
- msg->send_ack = 0;
+ /* Here, we commit the MD to a network OP by marking it busy and
+ * decrementing its threshold. Come what may, the network "owns"
+ * the MD until a call to lib_finalize() signals completion. */
msg->md = md;
- do_gettimeofday(&msg->ev.arrival_time);
+
md->pending++;
if (md->threshold != PTL_MD_THRESH_INF) {
LASSERT (md->threshold > 0);
counters->msgs_max = counters->msgs_alloc;
list_add (&msg->msg_list, &nal->ni.ni_active_msgs);
+}
- return (msg);
+static void
+lib_drop_message (nal_cb_t *nal, void *private, ptl_hdr_t *hdr)
+{
+ unsigned long flags;
+
+ /* CAVEAT EMPTOR: this only drops messages that we've not committed
+ * to receive (init_msg() not called) and therefore can't cause an
+ * event. */
+
+ state_lock(nal, &flags);
+ nal->ni.counters.drop_count++;
+ nal->ni.counters.drop_length += hdr->payload_length;
+ state_unlock(nal, &flags);
+
+ /* NULL msg => if NAL calls lib_finalize it will be a noop */
+ (void) lib_recv(nal, private, NULL, NULL, 0, 0, hdr->payload_length);
}
/*
* of long messages.
*
*/
-static int parse_put(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+static ptl_err_t
+parse_put(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg)
{
lib_ni_t *ni = &nal->ni;
ptl_size_t mlength = 0;
ptl_size_t offset = 0;
int unlink = 0;
+ ptl_err_t rc;
lib_me_t *me;
lib_md_t *md;
- lib_msg_t *msg;
unsigned long flags;
-
+
/* Convert put fields to host byte order */
hdr->msg.put.match_bits = NTOH__u64 (hdr->msg.put.match_bits);
hdr->msg.put.ptl_index = NTOH__u32 (hdr->msg.put.ptl_index);
hdr->payload_length, hdr->msg.put.offset,
hdr->msg.put.match_bits,
&mlength, &offset, &unlink);
- if (me == NULL)
- goto drop;
+ if (me == NULL) {
+ state_unlock(nal, &flags);
+ return (PTL_FAIL);
+ }
md = me->md;
CDEBUG(D_NET, "Incoming put index %x from "LPU64"/%u of length %d/%d "
hdr->src_nid, hdr->src_pid, mlength, hdr->payload_length,
md->md_lh.lh_cookie, md->md_niov, offset);
- msg = get_new_msg (nal, md);
- if (msg == NULL) {
- CERROR(LPU64": Dropping PUT from "LPU64": can't allocate msg\n",
- ni->nid, hdr->src_nid);
- goto drop;
- }
+ lib_commit_md(nal, md, msg);
+
+ msg->ev.type = PTL_EVENT_PUT;
+ msg->ev.initiator.nid = hdr->src_nid;
+ msg->ev.initiator.pid = hdr->src_pid;
+ msg->ev.portal = hdr->msg.put.ptl_index;
+ msg->ev.match_bits = hdr->msg.put.match_bits;
+ msg->ev.rlength = hdr->payload_length;
+ msg->ev.mlength = mlength;
+ msg->ev.offset = offset;
+ msg->ev.hdr_data = hdr->msg.put.hdr_data;
+
+ lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
if (!ptl_is_wire_handle_none(&hdr->msg.put.ack_wmd) &&
!(md->options & PTL_MD_ACK_DISABLE)) {
- msg->send_ack = 1;
msg->ack_wmd = hdr->msg.put.ack_wmd;
- msg->nid = hdr->src_nid;
- msg->pid = hdr->src_pid;
- msg->ev.match_bits = hdr->msg.put.match_bits;
- }
-
- if (md->eq) {
- msg->ev.type = PTL_EVENT_PUT;
- msg->ev.initiator.nid = hdr->src_nid;
- msg->ev.initiator.pid = hdr->src_pid;
- msg->ev.portal = hdr->msg.put.ptl_index;
- msg->ev.match_bits = hdr->msg.put.match_bits;
- msg->ev.rlength = hdr->payload_length;
- msg->ev.mlength = mlength;
- msg->ev.offset = offset;
- msg->ev.hdr_data = hdr->msg.put.hdr_data;
-
- /* NB if this match has exhausted the MD, we can't be sure
- * that this event will the the last one associated with
- * this MD in the event queue (another message already
- * matching this ME/MD could end up being last). So we
- * remember the ME handle anyway and check again when we're
- * allocating our slot in the event queue.
- */
- ptl_me2handle (&msg->ev.unlinked_me, me);
-
- lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
}
ni->counters.recv_count++;
ni->counters.recv_length += mlength;
- /* only unlink after MD's pending count has been bumped
- * in get_new_msg() otherwise lib_me_unlink() will nuke it */
- if (unlink) {
- md->md_flags |= PTL_MD_FLAG_AUTO_UNLINKED;
+ /* only unlink after MD's pending count has been bumped in
+ * lib_commit_md() otherwise lib_me_unlink() will nuke it */
+ if (unlink)
lib_me_unlink (nal, me);
- }
state_unlock(nal, &flags);
- lib_recv (nal, private, msg, md, offset, mlength, hdr->payload_length);
- return 0;
+ rc = lib_recv(nal, private, msg, md, offset, mlength,
+ hdr->payload_length);
+ if (rc != PTL_OK)
+ CERROR(LPU64": error on receiving PUT from "LPU64": %d\n",
+ ni->nid, hdr->src_nid, rc);
- drop:
- nal->ni.counters.drop_count++;
- nal->ni.counters.drop_length += hdr->payload_length;
- state_unlock (nal, &flags);
- lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
- return -1;
+ return (rc);
}
-static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+static ptl_err_t
+parse_get(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg)
{
lib_ni_t *ni = &nal->ni;
ptl_size_t mlength = 0;
int unlink = 0;
lib_me_t *me;
lib_md_t *md;
- lib_msg_t *msg;
ptl_hdr_t reply;
unsigned long flags;
int rc;
hdr->msg.get.sink_length, hdr->msg.get.src_offset,
hdr->msg.get.match_bits,
&mlength, &offset, &unlink);
- if (me == NULL)
- goto drop;
+ if (me == NULL) {
+ state_unlock(nal, &flags);
+ return (PTL_FAIL);
+ }
md = me->md;
CDEBUG(D_NET, "Incoming get index %d from "LPU64".%u of length %d/%d "
hdr->src_nid, hdr->src_pid, mlength, hdr->payload_length,
md->md_lh.lh_cookie, md->md_niov, offset);
- msg = get_new_msg (nal, md);
- if (msg == NULL) {
- CERROR(LPU64": Dropping GET from "LPU64": can't allocate msg\n",
- ni->nid, hdr->src_nid);
- goto drop;
- }
+ lib_commit_md(nal, md, msg);
- if (md->eq) {
- msg->ev.type = PTL_EVENT_GET;
- msg->ev.initiator.nid = hdr->src_nid;
- msg->ev.initiator.pid = hdr->src_pid;
- msg->ev.portal = hdr->msg.get.ptl_index;
- msg->ev.match_bits = hdr->msg.get.match_bits;
- msg->ev.rlength = hdr->payload_length;
- msg->ev.mlength = mlength;
- msg->ev.offset = offset;
- msg->ev.hdr_data = 0;
-
- /* NB if this match has exhausted the MD, we can't be sure
- * that this event will the the last one associated with
- * this MD in the event queue (another message already
- * matching this ME/MD could end up being last). So we
- * remember the ME handle anyway and check again when we're
- * allocating our slot in the event queue.
- */
- ptl_me2handle (&msg->ev.unlinked_me, me);
-
- lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
- }
+ msg->ev.type = PTL_EVENT_GET;
+ msg->ev.initiator.nid = hdr->src_nid;
+ msg->ev.initiator.pid = hdr->src_pid;
+ msg->ev.portal = hdr->msg.get.ptl_index;
+ msg->ev.match_bits = hdr->msg.get.match_bits;
+ msg->ev.rlength = hdr->payload_length;
+ msg->ev.mlength = mlength;
+ msg->ev.offset = offset;
+ msg->ev.hdr_data = 0;
+
+ lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
ni->counters.send_count++;
ni->counters.send_length += mlength;
- /* only unlink after MD's refcount has been bumped
- * in get_new_msg() otherwise lib_me_unlink() will nuke it */
- if (unlink) {
- md->md_flags |= PTL_MD_FLAG_AUTO_UNLINKED;
+ /* only unlink after MD's refcount has been bumped in
+ * lib_commit_md() otherwise lib_me_unlink() will nuke it */
+ if (unlink)
lib_me_unlink (nal, me);
- }
state_unlock(nal, &flags);
rc = lib_send (nal, private, msg, &reply, PTL_MSG_REPLY,
hdr->src_nid, hdr->src_pid, md, offset, mlength);
- if (rc != PTL_OK) {
- CERROR(LPU64": Dropping GET from "LPU64": send REPLY failed\n",
- ni->nid, hdr->src_nid);
- /* Hmm, this will create a GET event and make believe
- * the reply completed, which it kind of did, only the
- * source won't get her reply */
- lib_finalize (nal, private, msg);
- state_lock (nal, &flags);
- goto drop;
- }
+ if (rc != PTL_OK)
+ CERROR(LPU64": Unable to send REPLY for GET from "LPU64": %d\n",
+ ni->nid, hdr->src_nid, rc);
+
+ /* Discard any junk after the hdr */
+ (void) lib_recv(nal, private, NULL, NULL, 0, 0, hdr->payload_length);
- /* Complete the incoming message */
- lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
return (rc);
- drop:
- ni->counters.drop_count++;
- ni->counters.drop_length += hdr->msg.get.sink_length;
- state_unlock(nal, &flags);
- lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
- return -1;
}
-static int parse_reply(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+static ptl_err_t
+parse_reply(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg)
{
lib_ni_t *ni = &nal->ni;
lib_md_t *md;
int rlength;
int length;
- lib_msg_t *msg;
unsigned long flags;
+ ptl_err_t rc;
state_lock(nal, &flags);
md == NULL ? "invalid" : "inactive",
hdr->msg.reply.dst_wmd.wh_interface_cookie,
hdr->msg.reply.dst_wmd.wh_object_cookie);
- goto drop;
+
+ state_unlock(nal, &flags);
+ return (PTL_FAIL);
}
LASSERT (md->offset == 0);
ni->nid, hdr->src_nid, length,
hdr->msg.reply.dst_wmd.wh_object_cookie,
md->length);
- goto drop;
+ state_unlock(nal, &flags);
+ return (PTL_FAIL);
}
length = md->length;
}
hdr->src_nid, length, rlength,
hdr->msg.reply.dst_wmd.wh_object_cookie);
- msg = get_new_msg (nal, md);
- if (msg == NULL) {
- CERROR(LPU64": Dropping REPLY from "LPU64": can't "
- "allocate msg\n", ni->nid, hdr->src_nid);
- goto drop;
- }
+ lib_commit_md(nal, md, msg);
- if (md->eq) {
- msg->ev.type = PTL_EVENT_REPLY;
- msg->ev.initiator.nid = hdr->src_nid;
- msg->ev.initiator.pid = hdr->src_pid;
- msg->ev.rlength = rlength;
- msg->ev.mlength = length;
- msg->ev.offset = 0;
+ msg->ev.type = PTL_EVENT_REPLY;
+ msg->ev.initiator.nid = hdr->src_nid;
+ msg->ev.initiator.pid = hdr->src_pid;
+ msg->ev.rlength = rlength;
+ msg->ev.mlength = length;
+ msg->ev.offset = 0;
- lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
- }
+ lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
ni->counters.recv_count++;
ni->counters.recv_length += length;
state_unlock(nal, &flags);
- lib_recv (nal, private, msg, md, 0, length, rlength);
- return 0;
+ rc = lib_recv(nal, private, msg, md, 0, length, rlength);
+ if (rc != PTL_OK)
+ CERROR(LPU64": error on receiving REPLY from "LPU64": %d\n",
+ ni->nid, hdr->src_nid, rc);
- drop:
- nal->ni.counters.drop_count++;
- nal->ni.counters.drop_length += hdr->payload_length;
- state_unlock (nal, &flags);
- lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
- return -1;
+ return (rc);
}
-static int parse_ack(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+static ptl_err_t
+parse_ack(nal_cb_t *nal, ptl_hdr_t *hdr, void *private, lib_msg_t *msg)
{
- lib_ni_t *ni = &nal->ni;
- lib_md_t *md;
- lib_msg_t *msg = NULL;
- unsigned long flags;
+ lib_ni_t *ni = &nal->ni;
+ lib_md_t *md;
+ unsigned long flags;
/* Convert ack fields to host byte order */
hdr->msg.ack.match_bits = NTOH__u64 (hdr->msg.ack.match_bits);
(md == NULL) ? "invalid" : "inactive",
hdr->msg.ack.dst_wmd.wh_interface_cookie,
hdr->msg.ack.dst_wmd.wh_object_cookie);
- goto drop;
+
+ state_unlock(nal, &flags);
+ return (PTL_FAIL);
}
CDEBUG(D_NET, LPU64": ACK from "LPU64" into md "LPX64"\n",
ni->nid, hdr->src_nid,
hdr->msg.ack.dst_wmd.wh_object_cookie);
- msg = get_new_msg (nal, md);
- if (msg == NULL) {
- CERROR(LPU64": Dropping ACK from "LPU64": can't allocate msg\n",
- ni->nid, hdr->src_nid);
- goto drop;
- }
+ lib_commit_md(nal, md, msg);
- if (md->eq) {
- msg->ev.type = PTL_EVENT_ACK;
- msg->ev.initiator.nid = hdr->src_nid;
- msg->ev.initiator.pid = hdr->src_pid;
- msg->ev.mlength = hdr->msg.ack.mlength;
- msg->ev.match_bits = hdr->msg.ack.match_bits;
+ msg->ev.type = PTL_EVENT_ACK;
+ msg->ev.initiator.nid = hdr->src_nid;
+ msg->ev.initiator.pid = hdr->src_pid;
+ msg->ev.mlength = hdr->msg.ack.mlength;
+ msg->ev.match_bits = hdr->msg.ack.match_bits;
- lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
- }
+ lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
ni->counters.recv_count++;
- state_unlock(nal, &flags);
- lib_recv (nal, private, msg, NULL, 0, 0, hdr->payload_length);
- return 0;
- drop:
- nal->ni.counters.drop_count++;
- state_unlock (nal, &flags);
- lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
- return -1;
+ state_unlock(nal, &flags);
+
+ /* We have received and matched up the ack OK, create the
+ * completion event now... */
+ lib_finalize(nal, private, msg, PTL_OK);
+
+ /* ...and now discard any junk after the hdr */
+ (void) lib_recv(nal, private, NULL, NULL, 0, 0, hdr->payload_length);
+
+ return (PTL_OK);
}
static char *
} /* end of print_hdr() */
-int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+void
+lib_parse(nal_cb_t *nal, ptl_hdr_t *hdr, void *private)
{
unsigned long flags;
-
+ ptl_err_t rc;
+ lib_msg_t *msg;
+
/* convert common fields to host byte order */
hdr->dest_nid = NTOH__u64 (hdr->dest_nid);
hdr->src_nid = NTOH__u64 (hdr->src_nid);
nal->ni.nid, mv->magic,
mv->version_major, mv->version_minor,
hdr->src_nid);
- lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
- return (-1);
+ lib_drop_message(nal, private, hdr);
+ return;
}
if (hdr->dest_nid != nal->ni.nid) {
CERROR(LPU64": Dropping %s message from "LPU64" to "LPU64
" (not me)\n", nal->ni.nid, hdr_type_string (hdr),
hdr->src_nid, hdr->dest_nid);
-
- state_lock (nal, &flags);
- nal->ni.counters.drop_count++;
- nal->ni.counters.drop_length += hdr->payload_length;
- state_unlock (nal, &flags);
-
- lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
- return (-1);
+ lib_drop_message(nal, private, hdr);
+ return;
}
if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */
": simulated failure\n",
nal->ni.nid, hdr_type_string (hdr),
hdr->src_nid);
- lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
- return (-1);
+ lib_drop_message(nal, private, hdr);
+ return;
}
-
+
+ msg = lib_msg_alloc(nal);
+ if (msg == NULL) {
+ CERROR(LPU64": Dropping incoming %s from "LPU64
+ ": can't allocate a lib_msg_t\n",
+ nal->ni.nid, hdr_type_string (hdr),
+ hdr->src_nid);
+ lib_drop_message(nal, private, hdr);
+ return;
+ }
+
+ do_gettimeofday(&msg->ev.arrival_time);
+
switch (hdr->type) {
case PTL_MSG_ACK:
- return (parse_ack(nal, hdr, private));
+ rc = parse_ack(nal, hdr, private, msg);
+ break;
case PTL_MSG_PUT:
- return (parse_put(nal, hdr, private));
+ rc = parse_put(nal, hdr, private, msg);
break;
case PTL_MSG_GET:
- return (parse_get(nal, hdr, private));
+ rc = parse_get(nal, hdr, private, msg);
break;
case PTL_MSG_REPLY:
- return (parse_reply(nal, hdr, private));
+ rc = parse_reply(nal, hdr, private, msg);
break;
default:
CERROR(LPU64": Dropping <unknown> message from "LPU64
": Bad type=0x%x\n", nal->ni.nid, hdr->src_nid,
hdr->type);
-
- lib_recv (nal, private, NULL, NULL, 0, 0, hdr->payload_length);
- return (-1);
+ rc = PTL_FAIL;
+ break;
+ }
+
+ if (rc != PTL_OK) {
+ if (msg->md != NULL) {
+ /* committed... */
+ lib_finalize(nal, private, msg, rc);
+ } else {
+ state_lock(nal, &flags);
+ lib_msg_free(nal, msg); /* expects state_lock held */
+ state_unlock(nal, &flags);
+
+ lib_drop_message(nal, private, hdr);
+ }
}
}
-
-int do_PtlPut(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+int
+do_PtlPut(nal_cb_t *nal, void *private, void *v_args, void *v_ret)
{
/*
* Incoming:
* Outgoing:
*/
- PtlPut_in *args = v_args;
- PtlPut_out *ret = v_ret;
- ptl_hdr_t hdr;
-
- lib_ni_t *ni = &nal->ni;
- lib_md_t *md;
- lib_msg_t *msg = NULL;
+ PtlPut_in *args = v_args;
ptl_process_id_t *id = &args->target_in;
- unsigned long flags;
- int rc;
+ PtlPut_out *ret = v_ret;
+ lib_ni_t *ni = &nal->ni;
+ lib_msg_t *msg;
+ ptl_hdr_t hdr;
+ lib_md_t *md;
+ unsigned long flags;
+ int rc;
if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */
fail_peer (nal, id->nid, 1)) /* shall we now? */
nal->ni.nid, id->nid);
return (ret->rc = PTL_INV_PROC);
}
-
- ret->rc = PTL_OK;
+
+ msg = lib_msg_alloc(nal);
+ if (msg == NULL) {
+ CERROR(LPU64": Dropping PUT to "LPU64": ENOMEM on lib_msg_t\n",
+ ni->nid, id->nid);
+ return (ret->rc = PTL_NOSPACE);
+ }
+
state_lock(nal, &flags);
+
md = ptl_handle2md(&args->md_in, nal);
- if (md == NULL || !md->threshold) {
+ if (md == NULL || md->threshold == 0) {
+ lib_msg_free(nal, msg);
state_unlock(nal, &flags);
- return ret->rc = PTL_INV_MD;
+
+ return (ret->rc = PTL_INV_MD);
}
CDEBUG(D_NET, "PtlPut -> %Lu: %lu\n", (unsigned long long)id->nid,
hdr.msg.put.offset = HTON__u32 (args->offset_in);
hdr.msg.put.hdr_data = args->hdr_data_in;
+ lib_commit_md(nal, md, msg);
+
+ msg->ev.type = PTL_EVENT_SENT;
+ msg->ev.initiator.nid = ni->nid;
+ msg->ev.initiator.pid = ni->pid;
+ msg->ev.portal = args->portal_in;
+ msg->ev.match_bits = args->match_bits_in;
+ msg->ev.rlength = md->length;
+ msg->ev.mlength = md->length;
+ msg->ev.offset = args->offset_in;
+ msg->ev.hdr_data = args->hdr_data_in;
+
+ lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+
ni->counters.send_count++;
ni->counters.send_length += md->length;
- msg = get_new_msg (nal, md);
- if (msg == NULL) {
- CERROR("BAD: could not allocate msg!\n");
- state_unlock(nal, &flags);
- return ret->rc = PTL_NOSPACE;
- }
-
- /*
- * If this memory descriptor has an event queue associated with
- * it we need to allocate a message state object and record the
- * information about this operation that will be recorded into
- * event queue once the message has been completed.
- *
- * NB. We're now committed to the GET, since we just marked the MD
- * busy. Callers who observe this (by getting PTL_MD_INUSE from
- * PtlMDUnlink()) expect a completion event to tell them when the
- * MD becomes idle.
- */
- if (md->eq) {
- msg->ev.type = PTL_EVENT_SENT;
- msg->ev.initiator.nid = ni->nid;
- msg->ev.initiator.pid = ni->pid;
- msg->ev.portal = args->portal_in;
- msg->ev.match_bits = args->match_bits_in;
- msg->ev.rlength = md->length;
- msg->ev.mlength = md->length;
- msg->ev.offset = args->offset_in;
- msg->ev.hdr_data = args->hdr_data_in;
-
- lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
- }
-
state_unlock(nal, &flags);
rc = lib_send (nal, private, msg, &hdr, PTL_MSG_PUT,
id->nid, id->pid, md, 0, md->length);
if (rc != PTL_OK) {
- /* get_new_msg() committed us to sending by decrementing
- * md->threshold, so we have to act like we did send, but
- * the network dropped it. */
- lib_finalize (nal, private, msg);
+ CERROR(LPU64": error sending PUT to "LPU64": %d\n",
+ ni->nid, id->nid, rc);
+ lib_finalize (nal, private, msg, rc);
}
+ /* completion will be signalled by an event */
return ret->rc = PTL_OK;
}
-lib_msg_t * lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid,
- lib_md_t *getmd)
+lib_msg_t *
+lib_fake_reply_msg (nal_cb_t *nal, ptl_nid_t peer_nid, lib_md_t *getmd)
{
/* The NAL can DMA direct to the GET md (i.e. no REPLY msg). This
* returns a msg the NAL can pass to lib_finalize() so that a REPLY
* lib_finalize() of the original GET. */
lib_ni_t *ni = &nal->ni;
- lib_msg_t *msg;
+ lib_msg_t *msg = lib_msg_alloc(nal);
unsigned long flags;
state_lock(nal, &flags);
LASSERT (getmd->pending > 0);
+ if (msg == NULL) {
+ CERROR ("Dropping REPLY from "LPU64": can't allocate msg\n",
+ peer_nid);
+ goto drop;
+ }
+
if (getmd->threshold == 0) {
CERROR ("Dropping REPLY from "LPU64" for inactive MD %p\n",
peer_nid, getmd);
- goto drop;
+ goto drop_msg;
}
LASSERT (getmd->offset == 0);
CDEBUG(D_NET, "Reply from "LPU64" md %p\n", peer_nid, getmd);
- msg = get_new_msg (nal, getmd);
- if (msg == NULL) {
- CERROR("Dropping REPLY from "LPU64" md %p: can't allocate msg\n",
- peer_nid, getmd);
- goto drop;
- }
+ lib_commit_md (nal, getmd, msg);
- if (getmd->eq) {
- msg->ev.type = PTL_EVENT_REPLY;
- msg->ev.initiator.nid = peer_nid;
- msg->ev.initiator.pid = 0; /* XXX FIXME!!! */
- msg->ev.rlength = msg->ev.mlength = getmd->length;
- msg->ev.offset = 0;
+ msg->ev.type = PTL_EVENT_REPLY;
+ msg->ev.initiator.nid = peer_nid;
+ msg->ev.initiator.pid = 0; /* XXX FIXME!!! */
+ msg->ev.rlength = msg->ev.mlength = getmd->length;
+ msg->ev.offset = 0;
- lib_md_deconstruct(nal, getmd, &msg->ev.mem_desc);
- }
+ lib_md_deconstruct(nal, getmd, &msg->ev.mem_desc);
ni->counters.recv_count++;
ni->counters.recv_length += getmd->length;
state_unlock(nal, &flags);
return msg;
-
+
+ drop_msg:
+ lib_msg_free(nal, msg);
drop:
nal->ni.counters.drop_count++;
 nal->ni.counters.drop_length += getmd->length;
 state_unlock(nal, &flags);
 return NULL;
}
-int do_PtlGet(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+int
+do_PtlGet(nal_cb_t *nal, void *private, void *v_args, void *v_ret)
{
/*
* Incoming:
* Outgoing:
*/
- PtlGet_in *args = v_args;
- PtlGet_out *ret = v_ret;
- ptl_hdr_t hdr;
- lib_msg_t *msg = NULL;
- lib_ni_t *ni = &nal->ni;
+ PtlGet_in *args = v_args;
ptl_process_id_t *id = &args->target_in;
- lib_md_t *md;
- unsigned long flags;
- int rc;
+ PtlGet_out *ret = v_ret;
+ lib_ni_t *ni = &nal->ni;
+ lib_msg_t *msg;
+ ptl_hdr_t hdr;
+ lib_md_t *md;
+ unsigned long flags;
+ int rc;
if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */
fail_peer (nal, id->nid, 1)) /* shall we now? */
nal->ni.nid, id->nid);
return (ret->rc = PTL_INV_PROC);
}
-
+
+ msg = lib_msg_alloc(nal);
+ if (msg == NULL) {
+ CERROR(LPU64": Dropping GET to "LPU64": ENOMEM on lib_msg_t\n",
+ ni->nid, id->nid);
+ return (ret->rc = PTL_NOSPACE);
+ }
+
state_lock(nal, &flags);
+
md = ptl_handle2md(&args->md_in, nal);
if (md == NULL || !md->threshold) {
+ lib_msg_free(nal, msg);
state_unlock(nal, &flags);
+
return ret->rc = PTL_INV_MD;
}
- LASSERT (md->offset == 0);
-
CDEBUG(D_NET, "PtlGet -> %Lu: %lu\n", (unsigned long long)id->nid,
(unsigned long)id->pid);
hdr.msg.get.src_offset = HTON__u32 (args->offset_in);
hdr.msg.get.sink_length = HTON__u32 (md->length);
- ni->counters.send_count++;
+ lib_commit_md(nal, md, msg);
- msg = get_new_msg (nal, md);
- if (msg == NULL) {
- CERROR("do_PtlGet: BAD - could not allocate cookie!\n");
- state_unlock(nal, &flags);
- return ret->rc = PTL_NOSPACE;
- }
+ msg->ev.type = PTL_EVENT_SENT;
+ msg->ev.initiator.nid = ni->nid;
+ msg->ev.initiator.pid = ni->pid;
+ msg->ev.portal = args->portal_in;
+ msg->ev.match_bits = args->match_bits_in;
+ msg->ev.rlength = md->length;
+ msg->ev.mlength = md->length;
+ msg->ev.offset = args->offset_in;
+ msg->ev.hdr_data = 0;
- /*
- * If this memory descriptor has an event queue associated with
- * it we must allocate a message state object that will record
- * the information to be filled in once the message has been
- * completed. More information is in the do_PtlPut() comments.
- *
- * NB. We're now committed to the GET, since we just marked the MD
- * busy. Callers who observe this (by getting PTL_MD_INUSE from
- * PtlMDUnlink()) expect a completion event to tell them when the
- * MD becomes idle.
- */
- if (md->eq) {
- msg->ev.type = PTL_EVENT_SENT;
- msg->ev.initiator.nid = ni->nid;
- msg->ev.initiator.pid = ni->pid;
- msg->ev.portal = args->portal_in;
- msg->ev.match_bits = args->match_bits_in;
- msg->ev.rlength = md->length;
- msg->ev.mlength = md->length;
- msg->ev.offset = args->offset_in;
- msg->ev.hdr_data = 0;
-
- lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
- }
+ lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+
+ ni->counters.send_count++;
state_unlock(nal, &flags);
rc = lib_send (nal, private, msg, &hdr, PTL_MSG_GET,
id->nid, id->pid, NULL, 0, 0);
if (rc != PTL_OK) {
- /* get_new_msg() committed us to sending by decrementing
- * md->threshold, so we have to act like we did send, but
- * the network dropped it. */
- lib_finalize (nal, private, msg);
+ CERROR(LPU64": error sending GET to "LPU64": %d\n",
+ ni->nid, id->nid, rc);
+ lib_finalize (nal, private, msg, rc);
}
+ /* completion will be signalled by an event */
return ret->rc = PTL_OK;
}
#include <portals/lib-p30.h>
-int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t *msg)
+void
+lib_enq_event_locked (nal_cb_t *nal, void *private,
+ lib_eq_t *eq, ptl_event_t *ev)
{
- lib_md_t *md;
- lib_eq_t *eq;
+ ptl_event_t *eq_slot;
int rc;
+
+ ev->sequence = eq->sequence++; /* Allocate the next queue slot */
+
+ /* size must be a power of 2 to handle a wrapped sequence # */
+ LASSERT (eq->size != 0 &&
+ eq->size == LOWEST_BIT_SET (eq->size));
+ eq_slot = eq->base + (ev->sequence & (eq->size - 1));
+
+ /* Copy the event into the allocated slot, ensuring all the rest of
+ * the event's contents have been copied _before_ the sequence
+ * number gets updated. A processes 'getting' an event waits on
+ * the next queue slot's sequence to be 'new'. When it is, _all_
+ * other event fields had better be consistent. I assert
+ * 'sequence' is the last member, so I only need a 2 stage copy. */
+
+ LASSERT(sizeof (ptl_event_t) ==
+ offsetof(ptl_event_t, sequence) + sizeof(ev->sequence));
+
+ rc = nal->cb_write (nal, private, (user_ptr)eq_slot, ev,
+ offsetof (ptl_event_t, sequence));
+ LASSERT (rc == PTL_OK);
+
+#ifdef __KERNEL__
+ barrier();
+#endif
+ /* Updating the sequence number is what makes the event 'new' NB if
+ * the cb_write below isn't atomic, this could cause a race with
+ * PtlEQGet */
+ rc = nal->cb_write(nal, private, (user_ptr)&eq_slot->sequence,
+ (void *)&ev->sequence, sizeof (ev->sequence));
+ LASSERT (rc == PTL_OK);
+
+#ifdef __KERNEL__
+ barrier();
+#endif
+
+ if (nal->cb_callback != NULL)
+ nal->cb_callback(nal, private, eq, ev);
+ else if (eq->event_callback != NULL)
+ eq->event_callback(ev);
+}
+
+void
+lib_finalize(nal_cb_t *nal, void *private, lib_msg_t *msg, ptl_err_t status)
+{
+ lib_md_t *md;
+ int unlink;
unsigned long flags;
+ int rc;
+ ptl_hdr_t ack;
/* ni went down while processing this message */
- if (nal->ni.up == 0) {
- return -1;
- }
+ if (nal->ni.up == 0)
+ return;
if (msg == NULL)
- return 0;
+ return;
- rc = 0;
- if (msg->send_ack) {
- ptl_hdr_t ack;
+ /* Only send an ACK if the PUT completed successfully */
+ if (status == PTL_OK &&
+ !ptl_is_wire_handle_none(&msg->ack_wmd)) {
- LASSERT (!ptl_is_wire_handle_none (&msg->ack_wmd));
+ LASSERT(msg->ev.type == PTL_EVENT_PUT);
memset (&ack, 0, sizeof (ack));
ack.type = HTON__u32 (PTL_MSG_ACK);
- ack.dest_nid = HTON__u64 (msg->nid);
+ ack.dest_nid = HTON__u64 (msg->ev.initiator.nid);
ack.src_nid = HTON__u64 (nal->ni.nid);
- ack.dest_pid = HTON__u32 (msg->pid);
+ ack.dest_pid = HTON__u32 (msg->ev.initiator.pid);
ack.src_pid = HTON__u32 (nal->ni.pid);
ack.payload_length = 0;
ack.msg.ack.mlength = HTON__u32 (msg->ev.mlength);
rc = lib_send (nal, private, NULL, &ack, PTL_MSG_ACK,
- msg->nid, msg->pid, NULL, 0, 0);
- /* If this send fails, there's nothing else to clean up */
+ msg->ev.initiator.nid, msg->ev.initiator.pid,
+ NULL, 0, 0);
+ if (rc != PTL_OK) {
+ /* send failed: there's nothing else to clean up. */
+ CERROR("Error %d sending ACK to "LPX64"\n",
+ rc, msg->ev.initiator.nid);
+ }
}
md = msg->md;
- LASSERT (md->pending > 0); /* I've not dropped my ref yet */
- eq = md->eq;
state_lock(nal, &flags);
- if (eq != NULL) {
- ptl_event_t *ev = &msg->ev;
- ptl_event_t *eq_slot;
-
- /* I have to hold the lock while I bump the sequence number
- * and copy the event into the queue. If not, and I was
- * interrupted after bumping the sequence number, other
- * events could fill the queue, including the slot I just
- * allocated to this event. On resuming, I would overwrite
- * a more 'recent' event with old event state, and
- * processes taking events off the queue would not detect
- * overflow correctly.
- */
-
- ev->sequence = eq->sequence++;/* Allocate the next queue slot */
-
- /* size must be a power of 2 to handle a wrapped sequence # */
- LASSERT (eq->size != 0 &&
- eq->size == LOWEST_BIT_SET (eq->size));
- eq_slot = eq->base + (ev->sequence & (eq->size - 1));
-
- /* Invalidate unlinked_me unless this is the last
- * event for an auto-unlinked MD. Note that if md was
- * auto-unlinked, md->pending can only decrease
- */
- if ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINKED) == 0 || /* not auto-unlinked */
- md->pending != 1) /* not last ref */
- ev->unlinked_me = PTL_HANDLE_NONE;
-
- /* Copy the event into the allocated slot, ensuring all the
- * rest of the event's contents have been copied _before_
- * the sequence number gets updated. A processes 'getting'
- * an event waits on the next queue slot's sequence to be
- * 'new'. When it is, _all_ other event fields had better
- * be consistent. I assert 'sequence' is the last member,
- * so I only need a 2 stage copy.
- */
- LASSERT(sizeof (ptl_event_t) ==
- offsetof(ptl_event_t, sequence) + sizeof(ev->sequence));
-
- rc = nal->cb_write (nal, private, (user_ptr)eq_slot, ev,
- offsetof (ptl_event_t, sequence));
- LASSERT (rc == 0);
-
-#ifdef __KERNEL__
- barrier();
-#endif
- /* Updating the sequence number is what makes the event 'new' */
-
- /* cb_write is not necessarily atomic, so this could
- cause a race with PtlEQGet */
- rc = nal->cb_write(nal, private, (user_ptr)&eq_slot->sequence,
- (void *)&ev->sequence,sizeof (ev->sequence));
- LASSERT (rc == 0);
+ /* Now it's safe to drop my caller's ref */
+ md->pending--;
+ LASSERT (md->pending >= 0);
-#ifdef __KERNEL__
- barrier();
-#endif
+ /* Should I unlink this MD? */
+ unlink = (md->pending == 0 && /* No other refs */
+ (md->threshold == 0 || /* All ops done */
+ (md->md_flags & PTL_MD_FLAG_UNLINK) != 0)); /* black spot */
- /* I must also ensure that (a) callbacks are made in the
- * same order as the events land in the queue, and (b) the
- * callback occurs before the event can be removed from the
- * queue, so I can't drop the lock during the callback. */
- if (nal->cb_callback != NULL)
- nal->cb_callback(nal, private, eq, ev);
- else if (eq->event_callback != NULL)
- (void)((eq->event_callback) (ev));
- }
+ msg->ev.status = status;
+ msg->ev.unlinked = unlink;
- LASSERT ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINKED) == 0 ||
- (md->md_flags & PTL_MD_FLAG_UNLINK) != 0);
+ if (md->eq != NULL)
+ lib_enq_event_locked(nal, private, md->eq, &msg->ev);
- md->pending--;
- if (md->pending == 0 && /* no more outstanding operations on this md */
- (md->threshold == 0 || /* done its business */
- (md->md_flags & PTL_MD_FLAG_UNLINK) != 0)) /* marked for death */
+ if (unlink)
lib_md_unlink(nal, md);
list_del (&msg->msg_list);
lib_msg_free(nal, msg);
state_unlock(nal, &flags);
-
- return rc;
}
CPPFLAGS=
INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include -I$(srcdir)
-lib_LIBRARIES = libtcpnal.a
+noinst_LIBRARIES = libtcpnal.a
pkginclude_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h ipmap.h bridge.h procbridge.h
libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h
+
+if LIBLUSTRE
+libtcpnal_a_CFLAGS = -fPIC
+endif
* This file is part of Portals, http://www.sf.net/projects/sandiaportals/
*/
+#ifndef TCPNAL_PROCBRIDGE_H
+#define TCPNAL_PROCBRIDGE_H
+
#include <portals/lib-p30.h>
typedef struct bridge {
typedef int (*nal_initialize)(bridge);
extern nal_initialize nal_table[PTL_IFACE_MAX];
+
+#endif
*/
connection force_tcp_connection(manager m,
unsigned int ip,
- unsigned short port)
+ unsigned short port,
+ procbridge pb)
{
connection conn;
struct sockaddr_in addr;
exit(-1);
conn = allocate_connection(m, ip, port, fd);
+
+ /* let nal thread know this event right away */
+ if (conn)
+ procbridge_wakeup_nal(pb);
}
pthread_mutex_unlock(&m->conn_lock);
*/
#include <table.h>
+#include <procbridge.h>
typedef struct manager {
table connections;
manager m;
} *connection;
-connection force_tcp_connection(manager m, unsigned int ip, unsigned int short);
+connection force_tcp_connection(manager m, unsigned int ip, unsigned int short,
+ procbridge pb);
manager init_connections(unsigned short, int (*f)(void *, void *), void *);
void remove_connection(void *arg);
void shutdown_connections(manager m);
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
+#ifndef __CYGWIN__
+#include <syscall.h>
+#endif
+#include <sys/socket.h>
#include <procbridge.h>
#include <pqtimer.h>
#include <dispatch.h>
#include <errno.h>
+/* XXX CFS workaround, to give a chance to let nal thread wake up
+ * from waiting in select
+ */
+static int procbridge_notifier_handler(void *arg)
+{
+ static char buf[8];
+ procbridge p = (procbridge) arg;
+
+ syscall(SYS_read, p->notifier[1], buf, sizeof(buf));
+ return 1;
+}
+
+void procbridge_wakeup_nal(procbridge p)
+{
+ static char buf[8];
+ syscall(SYS_write, p->notifier[0], buf, sizeof(buf));
+}
+
/* Function: forward
* Arguments: nal_t *nal: pointer to my top-side nal structure
* id: the command to pass to the lower layer
procbridge p=(procbridge)b->local;
p->nal_flags |= NAL_FLAG_STOPPING;
+ procbridge_wakeup_nal(p);
do {
pthread_mutex_lock(&p->mutex);
}
+/* FIXME cfs temporary workaround! FIXME
+ * global time out value
+ */
+int __tcpnal_eqwait_timeout_value = 0;
+int __tcpnal_eqwait_timedout = 0;
+
/* Function: yield
* Arguments: pid:
*
procbridge p=(procbridge)b->local;
pthread_mutex_lock(&p->mutex);
- pthread_cond_wait(&p->cond,&p->mutex);
+ if (!__tcpnal_eqwait_timeout_value) {
+ pthread_cond_wait(&p->cond,&p->mutex);
+ } else {
+ struct timeval now;
+ struct timespec timeout;
+
+ gettimeofday(&now, NULL);
+ timeout.tv_sec = now.tv_sec + __tcpnal_eqwait_timeout_value;
+ timeout.tv_nsec = now.tv_usec * 1000;
+
+ __tcpnal_eqwait_timedout =
+ pthread_cond_timedwait(&p->cond, &p->mutex, &timeout);
+ }
pthread_mutex_unlock(&p->mutex);
}
p->nal_flags = 0;
pthread_mutex_init(&p->nal_cb_lock, 0);
+ /* initialize notifier */
+ if (socketpair(AF_UNIX, SOCK_STREAM, 0, p->notifier)) {
+ perror("socketpair failed");
+ return NULL;
+ }
+
+ if (!register_io_handler(p->notifier[1], READ_HANDLER,
+ procbridge_notifier_handler, p)) {
+ perror("fail to register notifier handler");
+ return NULL;
+ }
+
+ /* create nal thread */
if (pthread_create(&p->t, NULL, nal_thread, &args)) {
perror("nal_init: pthread_create");
return(NULL);
pthread_cond_t cond;
pthread_mutex_t mutex;
+ /* socket pair used to notify nal thread */
+ int notifier[2];
+
int nal_flags;
pthread_mutex_t nal_cb_lock;
ptl_pt_index_t ptl_size,
ptl_ac_index_t acl_size,
ptl_pid_t requested_pid);
+extern void procbridge_wakeup_nal(procbridge p);
#endif
/* the following functions are stubs to satisfy the nal definition
without doing anything particularily useful*/
-static int nal_write(nal_cb_t *nal,
- void *private,
- user_ptr dst_addr,
- void *src_addr,
- size_t len)
+static ptl_err_t nal_write(nal_cb_t *nal,
+ void *private,
+ user_ptr dst_addr,
+ void *src_addr,
+ size_t len)
{
memcpy(dst_addr, src_addr, len);
- return 0;
+ return PTL_OK;
}
-static int nal_read(nal_cb_t * nal,
- void *private,
- void *dst_addr,
- user_ptr src_addr,
- size_t len)
+static ptl_err_t nal_read(nal_cb_t * nal,
+ void *private,
+ void *dst_addr,
+ user_ptr src_addr,
+ size_t len)
{
memcpy(dst_addr, src_addr, len);
- return 0;
+ return PTL_OK;
}
static void *nal_malloc(nal_cb_t *nal,
timeout_pointer=&timeout;
} else timeout_pointer=0;
-
- /* FIXME
- * temporarily add timer for endless waiting problem.
- * FIXME
- */
- timeout.tv_sec = 1;
- timeout.tv_usec = 0;
- timeout_pointer=&timeout;
-
FD_ZERO(&fds[0]);
FD_ZERO(&fds[1]);
FD_ZERO(&fds[2]);
*
* sends a packet to the peer, after insuring that a connection exists
*/
-int tcpnal_send(nal_cb_t *n,
- void *private,
- lib_msg_t *cookie,
- ptl_hdr_t *hdr,
- int type,
- ptl_nid_t nid,
- ptl_pid_t pid,
- unsigned int niov,
- struct iovec *iov,
- size_t len)
+ptl_err_t tcpnal_send(nal_cb_t *n,
+ void *private,
+ lib_msg_t *cookie,
+ ptl_hdr_t *hdr,
+ int type,
+ ptl_nid_t nid,
+ ptl_pid_t pid,
+ unsigned int niov,
+ struct iovec *iov,
+ size_t offset,
+ size_t len)
{
connection c;
bridge b=(bridge)n->nal_data;
struct iovec tiov[257];
static pthread_mutex_t send_lock = PTHREAD_MUTEX_INITIALIZER;
- int rc;
+ ptl_err_t rc = PTL_OK;
+ int sysrc;
int total;
+ int ntiov;
int i;
if (!(c=force_tcp_connection((manager)b->lower,
PNAL_IP(nid,b),
- PNAL_PORT(nid,pid))))
- return(1);
+ PNAL_PORT(nid,pid),
+ b->local)))
+ return(PTL_FAIL);
-#if 0
/* TODO: these results should be checked. furthermore, provision
must be made for the SIGPIPE which is delivered when
writing on a tcp socket which has closed underneath
the application. there is a linux flag in the sendmsg
call which turns off the signally behaviour, but its
nonstandard */
- syscall(SYS_write, c->fd,hdr,sizeof(ptl_hdr_t));
- LASSERT (niov <= 1);
- if (len) syscall(SYS_write, c->fd,iov[0].iov_base,len);
-#else
+
LASSERT (niov <= 256);
tiov[0].iov_base = hdr;
tiov[0].iov_len = sizeof(ptl_hdr_t);
+ ntiov = 1 + lib_extract_iov(256, &tiov[1], niov, iov, offset, len);
- if (niov > 0)
- memcpy(&tiov[1], iov, niov * sizeof(struct iovec));
pthread_mutex_lock(&send_lock);
#if 1
- for (i = total = 0; i <= niov; i++)
+ for (i = total = 0; i < ntiov; i++)
total += tiov[i].iov_len;
- rc = syscall(SYS_writev, c->fd, tiov, niov+1);
- if (rc != total) {
+ sysrc = syscall(SYS_writev, c->fd, tiov, ntiov);
+ if (sysrc != total) {
fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n",
- rc, total, errno);
+ sysrc, total, errno);
- abort();
+ rc = PTL_FAIL;
}
#else
- for (i = total = 0; i <= niov; i++) {
+ for (i = total = 0; i <= ntiov; i++) {
rc = send(c->fd, tiov[i].iov_base, tiov[i].iov_len, 0);
if (rc != tiov[i].iov_len) {
fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n",
rc, tiov[i].iov_len, errno);
- abort();
+ rc = PTL_FAIL;
+ break;
}
- total != rc;
+ total += rc;
}
#endif
#if 0
total, niov + 1);
#endif
pthread_mutex_unlock(&send_lock);
-#endif
- lib_finalize(n, private, cookie);
-
- return(0);
+
+ if (rc == PTL_OK) {
+ /* NB the NAL only calls lib_finalize() if it returns PTL_OK
+ * from cb_send() */
+ lib_finalize(n, private, cookie, PTL_OK);
+ }
+
+ return(rc);
}
* blocking read of the requested data. must drain out the
* difference of mainpulated and requested lengths from the network
*/
-int tcpnal_recv(nal_cb_t *n,
- void *private,
- lib_msg_t *cookie,
- unsigned int niov,
- struct iovec *iov,
- size_t mlen,
- size_t rlen)
+ptl_err_t tcpnal_recv(nal_cb_t *n,
+ void *private,
+ lib_msg_t *cookie,
+ unsigned int niov,
+ struct iovec *iov,
+ size_t offset,
+ size_t mlen,
+ size_t rlen)
{
+ struct iovec tiov[256];
+ int ntiov;
int i;
if (!niov)
LASSERT(rlen);
LASSERT(rlen >= mlen);
+ ntiov = lib_extract_iov(256, tiov, niov, iov, offset, mlen);
+
/* FIXME
* 1. Is this effecient enough? change to use readv() directly?
* 2. need check return from read_connection()
* - MeiJia
*/
- for (i = 0; i < niov; i++)
- read_connection(private, iov[i].iov_base, iov[i].iov_len);
+ for (i = 0; i < ntiov; i++)
+ read_connection(private, tiov[i].iov_base, tiov[i].iov_len);
finalize:
- lib_finalize(n, private, cookie);
+ /* FIXME; we always assume success here... */
+ lib_finalize(n, private, cookie, PTL_OK);
if (mlen!=rlen){
char *trash=malloc(rlen-mlen);
free(trash);
}
- return(rlen);
+ return(PTL_OK);
}
# This code is issued under the GNU General Public License.
# See the file COPYING in this distribution
-
COMPILE = $(CC) -Wall -g -I$(srcdir)/../include
LINK = $(CC) -o $@
if LIBLUSTRE
-tmp=
+
+noinst_LIBRARIES = libuptlctl.a
+libuptlctl_a_SOURCES = portals.c debug.c l_ioctl.c parser.c parser.h
+libuptlctl_a_CFLAGS = -fPIC
+
else
-tmp=gmnalnid
-endif
-sbin_PROGRAMS = acceptor ptlctl debugctl routerstat wirecheck $(tmp)
+sbin_PROGRAMS = acceptor ptlctl debugctl routerstat wirecheck gmnalnid
lib_LIBRARIES = libptlctl.a
acceptor_SOURCES = acceptor.c # -lefence
debugctl_DEPENDENCIES = libptlctl.a
routerstat_SOURCES = routerstat.c
+endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include <syscall.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <portals/api-support.h>
#include <portals/ptlctl.h>
+#ifndef __CYGWIN__
+ #include <syscall.h>
+#else
+ #include <windows.h>
+ #include <windef.h>
+#endif
+
+static ioc_handler_t do_ioctl; /* forward ref */
+static ioc_handler_t *current_ioc_handler = &do_ioctl;
+
struct ioc_dev {
const char * dev_name;
int dev_fd;
int opc;
};
-char * dump_filename;
+char *dump_filename;
+
+void
+set_ioc_handler (ioc_handler_t *handler)
+{
+ if (handler == NULL)
+ current_ioc_handler = do_ioctl;
+ else
+ current_ioc_handler = handler;
+}
static int
open_ioc_dev(int dev_id)
{
FILE *fp;
struct dump_hdr dump_hdr;
- struct portal_ioctl_hdr * ioc_hdr = (struct portal_ioctl_hdr *) buf;
+ struct portal_ioctl_hdr * ioc_hdr = (struct portal_ioctl_hdr *) buf;
int rc;
printf("dumping opc %x to %s\n", opc, dump_filename);
return -EINVAL;
}
- rc = fwrite(&dump_hdr, sizeof(dump_hdr), 1, fp);
- if (rc == 1)
- rc = fwrite(buf, ioc_hdr->ioc_len, 1, fp);
- fclose(fp);
- if (rc != 1) {
- fprintf(stderr, "%s: %s\n", dump_filename,
- strerror(errno));
- return -EINVAL;
- }
-
- return 0;
+ rc = fwrite(&dump_hdr, sizeof(dump_hdr), 1, fp);
+ if (rc == 1)
+ rc = fwrite(buf, ioc_hdr->ioc_len, 1, fp);
+ fclose(fp);
+ if (rc != 1) {
+ fprintf(stderr, "%s: %s\n", dump_filename,
+ strerror(errno));
+ return -EINVAL;
+ }
+
+ return 0;
}
/* register a device to send ioctls to. */
free(dump_filename);
dump_filename = strdup(file);
+ if (dump_filename == NULL)
+ abort();
+
+ set_ioc_handler(&dump);
return 0;
}
int
l_ioctl(int dev_id, int opc, void *buf)
{
- if (dump_filename)
- return dump(dev_id, opc, buf);
- else
- return do_ioctl(dev_id, opc, buf);
+ return current_ioc_handler(dev_id, opc, buf);
}
/* Read an ioctl dump file, and call the ioc_func for each ioctl buffer
int
parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *))
{
- int fd, line =0;
+ int line =0;
struct stat st;
- char *buf, *end;
+ char *start, *buf, *end;
+#ifndef __CYGWIN__
+ int fd;
+#else
+ HANDLE fd, hmap;
+ DWORD size;
+#endif
+#ifndef __CYGWIN__
fd = syscall(SYS_open, dump_file, O_RDONLY);
+ if (fd < 0) {
+ fprintf(stderr, "couldn't open %s: %s\n", dump_file,
+ strerror(errno));
+ exit(1);
+ }
#ifndef SYS_fstat64
-#define __SYS_fstat__ SYS_fstat
+# define __SYS_fstat__ SYS_fstat
#else
-#define __SYS_fstat__ SYS_fstat64
+# define __SYS_fstat__ SYS_fstat64
#endif
if (syscall(__SYS_fstat__, fd, &st)) {
perror("stat fails");
exit(1);
}
- buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE , fd, 0);
- end = buf + st.st_size;
+ start = buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE , fd, 0);
+ end = start + st.st_size;
close(fd);
- while (buf < end) {
- struct dump_hdr *dump_hdr = (struct dump_hdr *) buf;
- struct portal_ioctl_hdr * data;
- char tmp[8096];
- int rc;
-
- line++;
+ if (start == MAP_FAILED) {
+ fprintf(stderr, "can't create file mapping\n");
+ exit(1);
+ }
+#else
+ fd = CreateFile(dump_file, GENERIC_READ, FILE_SHARE_READ, NULL,
+ OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
+ size = GetFileSize(fd, NULL);
+ if (size < 1) {
+ fprintf(stderr, "KML is empty\n");
+ exit(1);
+ }
- data = (struct portal_ioctl_hdr *) (buf + sizeof(*dump_hdr));
- if (buf + data->ioc_len > end ) {
- fprintf(stderr, "dump file overflow, %p + %d > %p\n", buf,
- data->ioc_len, end);
- return -1;
- }
+ hmap = CreateFileMapping(fd, NULL, PAGE_READONLY, 0,0, NULL);
+ start = buf = MapViewOfFile(hmap, FILE_MAP_READ, 0, 0, 0);
+ end = buf + size;
+ CloseHandle(fd);
+ if (start == NULL) {
+ fprintf(stderr, "can't create file mapping\n");
+ exit(1);
+ }
+#endif /* __CYGWIN__ */
+
+ while (buf < end) {
+ struct dump_hdr *dump_hdr = (struct dump_hdr *) buf;
+ struct portal_ioctl_hdr * data;
+ char tmp[8096];
+ int rc;
+
+ line++;
+
+ data = (struct portal_ioctl_hdr *) (buf + sizeof(*dump_hdr));
+ if (buf + data->ioc_len > end ) {
+ fprintf(stderr, "dump file overflow, %p + %d > %p\n", buf,
+ data->ioc_len, end);
+ return -1;
+ }
#if 0
- printf ("dump_hdr: %lx data: %lx\n",
- (unsigned long)dump_hdr - (unsigned long)buf, (unsigned long)data - (unsigned long)buf);
-
- printf("%d: opcode %x len: %d ver: %x ", line, dump_hdr->opc,
- data->ioc_len, data->ioc_version);
+ printf ("dump_hdr: %lx data: %lx\n",
+ (unsigned long)dump_hdr - (unsigned long)buf, (unsigned long)data - (unsigned long)buf);
+
+ printf("%d: opcode %x len: %d ver: %x ", line, dump_hdr->opc,
+ data->ioc_len, data->ioc_version);
#endif
- memcpy(tmp, data, data->ioc_len);
+ memcpy(tmp, data, data->ioc_len);
- rc = ioc_func(dump_hdr->dev_id, dump_hdr->opc, tmp);
- if (rc) {
- printf("failed: %d\n", rc);
- exit(1);
- }
+ rc = ioc_func(dump_hdr->dev_id, dump_hdr->opc, tmp);
+ if (rc) {
+ printf("failed: %d\n", rc);
+ exit(1);
+ }
- buf += data->ioc_len + sizeof(*dump_hdr);
+ buf += data->ioc_len + sizeof(*dump_hdr);
}
+
+#ifndef __CYGWIN__
+ munmap(start, end - start);
+#else
+ UnmapViewOfFile(start);
+ CloseHandle(hmap);
+#endif
+
return 0;
}
#include <stdarg.h>
#include <asm/byteorder.h>
+#ifdef __CYGWIN__
+
+#include <netinet/in.h>
+
+#warning assuming little endian
+
+#define __cpu_to_le64(x) ((__u64)(x))
+#define __le64_to_cpu(x) ((__u64)(x))
+#define __cpu_to_le32(x) ((__u32)(x))
+#define __le32_to_cpu(x) ((__u32)(x))
+#define __cpu_to_le16(x) ((__u16)(x))
+#define __le16_to_cpu(x) ((__u16)(x))
+
+#endif /* __CYGWIN__ */
+
#include <portals/api-support.h>
#include <portals/ptlctl.h>
#include <portals/list.h>
PORTAL_IOC_INIT (data);
data.ioc_pbuf1 = (char*)pcfg;
data.ioc_plen1 = sizeof(*pcfg);
+ /* XXX liblustre hack XXX */
+ data.ioc_nal_cmd = pcfg->pcfg_command;
+ data.ioc_nid = pcfg->pcfg_nid;
rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
}
op->op_block_cnt = page_count;
if (cmd == PTLBD_READ)
- desc = ptlrpc_prep_bulk_imp (req, BULK_PUT_SINK, PTLBD_BULK_PORTAL);
+ desc = ptlrpc_prep_bulk_imp (req, page_count,
+ BULK_PUT_SINK, PTLBD_BULK_PORTAL);
else
- desc = ptlrpc_prep_bulk_imp (req, BULK_GET_SOURCE, PTLBD_BULK_PORTAL);
+ desc = ptlrpc_prep_bulk_imp (req, page_count,
+ BULK_GET_SOURCE, PTLBD_BULK_PORTAL);
if ( desc == NULL )
GOTO(out, rc = 1); /* need to return error cnt */
/* NB req now owns desc, and frees it when she frees herself */
for ( niob = niobs, bh = first_bh ; bh ; bh = bh->b_reqnext, niob++ ) {
- rc = ptlrpc_prep_bulk_page(desc, bh->b_page,
- bh_offset (bh) & (PAGE_SIZE - 1),
- bh->b_size);
- if (rc != 0)
- GOTO(out, rc = 1); /* need to return error cnt */
+ ptlrpc_prep_bulk_page(desc, bh->b_page,
+ bh_offset (bh) & (PAGE_SIZE - 1),
+ bh->b_size);
niob->n_block_nr = bh->b_blocknr;
niob->n_offset = bh_offset(bh);
if ( rsp == NULL )
GOTO (out, rc = -EFAULT);
+ /* FIXME: assumes each niobuf fits in 1 page */
page_count = req->rq_reqmsg->buflens[1] / sizeof(struct ptlbd_niob);
if (swab) { /* swab remaining niobs */
for (i = 1; i < page_count; i++)
}
if (cmd == PTLBD_READ)
- desc = ptlrpc_prep_bulk_exp (req, BULK_PUT_SOURCE, PTLBD_BULK_PORTAL);
+ desc = ptlrpc_prep_bulk_exp (req, page_count,
+ BULK_PUT_SOURCE, PTLBD_BULK_PORTAL);
else
- desc = ptlrpc_prep_bulk_exp (req, BULK_GET_SINK, PTLBD_BULK_PORTAL);
+ desc = ptlrpc_prep_bulk_exp (req, page_count,
+ BULK_GET_SINK, PTLBD_BULK_PORTAL);
if (desc == NULL) {
error_cnt++;
GOTO(out_reply, rc = -ENOMEM);
}
list_add_tail(&page->list, &tmp_pages);
- rc = ptlrpc_prep_bulk_page(desc, page,
- niob->n_offset & (PAGE_SIZE - 1),
- niob->n_length);
- if (rc != 0) {
- error_cnt++;
- GOTO(out_reply, rc);
- }
+ ptlrpc_prep_bulk_page(desc, page,
+ niob->n_offset & (PAGE_SIZE - 1),
+ niob->n_length);
}
if ( cmd == PTLBD_READ ) {
- if ((rc = ptlbd_do_filp(filp, PTLBD_READ, niobs,
- page_count, &tmp_pages)) < 0) {
+ rc = ptlbd_do_filp(filp, PTLBD_READ, niobs,
+ page_count, &tmp_pages);
+ if (rc < 0) {
error_cnt++;
GOTO(out_reply, rc);
}
- rc = ptlrpc_bulk_put(desc);
- } else {
- rc = ptlrpc_bulk_get(desc);
}
+ rc = ptlrpc_start_bulk_transfer(desc);
if ( rc ) {
error_cnt++;
}
lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, NULL, desc);
- rc = l_wait_event(desc->bd_waitq, ptlrpc_bulk_complete(desc), &lwi);
+ rc = l_wait_event(desc->bd_waitq, !ptlrpc_bulk_active(desc), &lwi);
if (rc != 0) {
LASSERT(rc == -ETIMEDOUT);
ptlrpc_abort_bulk(desc);
error_cnt++;
GOTO(out_reply, rc);
}
+
+ /* XXX do some error handling */
+ LASSERT(desc->bd_success && desc->bd_nob_transferred == desc->bd_nob);
if ( cmd == PTLBD_WRITE ) {
if ((rc = ptlbd_do_filp(filp, PTLBD_WRITE, niobs,
RETURN(PTR_ERR(ptlbd->filp));
ptlbd->ptlbd_service =
- ptlrpc_init_svc(PTLBD_NEVENTS, PTLBD_NBUFS, PTLBD_BUFSIZE,
- PTLBD_MAXREQSIZE, PTLBD_REQUEST_PORTAL,
- PTLBD_REPLY_PORTAL,
- ptlbd_handle, "ptlbd_sv",
+ ptlrpc_init_svc(PTLBD_NBUFS, PTLBD_BUFSIZE, PTLBD_MAXREQSIZE,
+ PTLBD_REQUEST_PORTAL, PTLBD_REPLY_PORTAL,
+ ptlbd_handle, "ptlbd_sv",
obddev->obd_proc_entry);
if (ptlbd->ptlbd_service == NULL)
DEFS=
-LDLMSOURCES= $(top_srcdir)/ldlm/l_lock.c $(top_srcdir)/ldlm/ldlm_lock.c \
- $(top_srcdir)/ldlm/ldlm_resource.c $(top_srcdir)/ldlm/ldlm_lib.c \
- $(top_srcdir)/ldlm/ldlm_plain.c $(top_srcdir)/ldlm/ldlm_extent.c \
- $(top_srcdir)/ldlm/ldlm_flock.c $(top_srcdir)/ldlm/ldlm_request.c \
- $(top_srcdir)/ldlm/ldlm_lockd.c $(top_srcdir)/ldlm/ldlm_internal.h
+LDLM_COMM_SOURCES= $(top_srcdir)/ldlm/l_lock.c $(top_srcdir)/ldlm/ldlm_lock.c \
+ $(top_srcdir)/ldlm/ldlm_resource.c $(top_srcdir)/ldlm/ldlm_lib.c \
+ $(top_srcdir)/ldlm/ldlm_plain.c $(top_srcdir)/ldlm/ldlm_extent.c \
+ $(top_srcdir)/ldlm/ldlm_request.c $(top_srcdir)/ldlm/ldlm_lockd.c \
+ $(top_srcdir)/ldlm/ldlm_internal.h
-COMMON_SOURCES = client.c recover.c connection.c niobuf.c pack_generic.c \
- events.c ptlrpc_module.c service.c pinger.c recov_thread.c llog_net.c \
- llog_client.c import.c ptlrpcd.c $(LDLMSOURCES)
+COMMON_SOURCES = client.c recover.c connection.c niobuf.c pack_generic.c \
+ events.c ptlrpc_module.c service.c pinger.c recov_thread.c llog_net.c \
+ llog_client.c llog_server.c import.c ptlrpcd.c ptlrpc_internal.h \
+ $(LDLM_COMM_SOURCES)
if LIBLUSTRE
-lib_LIBRARIES = libptlrpc.a
+noinst_LIBRARIES = libptlrpc.a
+libptlrpc_a_CFLAGS = -fPIC
libptlrpc_a_SOURCES = $(COMMON_SOURCES)
else
modulefs_DATA = ptlrpc.o
EXTRA_PROGRAMS = ptlrpc
-ptlrpc_SOURCES = $(COMMON_SOURCES) lproc_ptlrpc.c ptlrpc_internal.h \
- llog_server.c
+ptlrpc_SOURCES = $(top_srcdir)/ldlm/ldlm_flock.c $(COMMON_SOURCES) \
+ lproc_ptlrpc.c
+
endif
ptlrpc_DEPENDENCIES=symlinks
return;
}
-static inline struct ptlrpc_bulk_desc *new_bulk(void)
+static inline struct ptlrpc_bulk_desc *new_bulk(int npages, int type, int portal)
{
struct ptlrpc_bulk_desc *desc;
- OBD_ALLOC(desc, sizeof(*desc));
+ OBD_ALLOC(desc, offsetof (struct ptlrpc_bulk_desc, bd_iov[npages]));
if (!desc)
return NULL;
spin_lock_init(&desc->bd_lock);
init_waitqueue_head(&desc->bd_waitq);
- INIT_LIST_HEAD(&desc->bd_page_list);
+ desc->bd_max_pages = npages;
+ desc->bd_page_count = 0;
desc->bd_md_h = PTL_HANDLE_NONE;
- desc->bd_me_h = PTL_HANDLE_NONE;
-
+ desc->bd_portal = portal;
+ desc->bd_type = type;
+
return desc;
}
struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp (struct ptlrpc_request *req,
- int type, int portal)
+ int npages, int type, int portal)
{
struct obd_import *imp = req->rq_import;
struct ptlrpc_bulk_desc *desc;
LASSERT(type == BULK_PUT_SINK || type == BULK_GET_SOURCE);
-
- desc = new_bulk();
+ desc = new_bulk(npages, type, portal);
if (desc == NULL)
RETURN(NULL);
desc->bd_import_generation = req->rq_import_generation;
desc->bd_import = class_import_get(imp);
desc->bd_req = req;
- desc->bd_type = type;
- desc->bd_portal = portal;
+
+ desc->bd_cbid.cbid_fn = client_bulk_callback;
+ desc->bd_cbid.cbid_arg = desc;
/* This makes req own desc, and free it when she frees herself */
req->rq_bulk = desc;
}
struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp (struct ptlrpc_request *req,
- int type, int portal)
+ int npages, int type, int portal)
{
struct obd_export *exp = req->rq_export;
struct ptlrpc_bulk_desc *desc;
LASSERT(type == BULK_PUT_SOURCE || type == BULK_GET_SINK);
- desc = new_bulk();
+ desc = new_bulk(npages, type, portal);
if (desc == NULL)
RETURN(NULL);
desc->bd_export = class_export_get(exp);
desc->bd_req = req;
- desc->bd_type = type;
- desc->bd_portal = portal;
+
+ desc->bd_cbid.cbid_fn = server_bulk_callback;
+ desc->bd_cbid.cbid_arg = desc;
/* NB we don't assign rq_bulk here; server-side requests are
* re-used, and the handler frees the bulk desc explicitly. */
return desc;
}
-int ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
- struct page *page, int pageoffset, int len)
+void ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
+ struct page *page, int pageoffset, int len)
{
- struct ptlrpc_bulk_page *bulk;
-
- OBD_ALLOC(bulk, sizeof(*bulk));
- if (bulk == NULL)
- return -ENOMEM;
-
+#ifdef __KERNEL__
+ ptl_kiov_t *kiov = &desc->bd_iov[desc->bd_page_count];
+#else
+ struct iovec *iov = &desc->bd_iov[desc->bd_page_count];
+#endif
+ LASSERT(desc->bd_page_count < desc->bd_max_pages);
LASSERT(page != NULL);
LASSERT(pageoffset >= 0);
LASSERT(len > 0);
LASSERT(pageoffset + len <= PAGE_SIZE);
- bulk->bp_page = page;
- bulk->bp_pageoffset = pageoffset;
- bulk->bp_buflen = len;
-
- bulk->bp_desc = desc;
- list_add_tail(&bulk->bp_link, &desc->bd_page_list);
+#ifdef __KERNEL__
+ kiov->kiov_page = page;
+ kiov->kiov_offset = pageoffset;
+ kiov->kiov_len = len;
+#else
+ iov->iov_base = page->addr + pageoffset;
+ iov->iov_len = len;
+#endif
desc->bd_page_count++;
- return 0;
+ desc->bd_nob += len;
}
void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc)
{
- struct list_head *tmp, *next;
ENTRY;
LASSERT(desc != NULL);
LASSERT(desc->bd_page_count != 0x5a5a5a5a); /* not freed already */
LASSERT(!desc->bd_network_rw); /* network hands off or */
-
- list_for_each_safe(tmp, next, &desc->bd_page_list) {
- struct ptlrpc_bulk_page *bulk;
- bulk = list_entry(tmp, struct ptlrpc_bulk_page, bp_link);
- ptlrpc_free_bulk_page(bulk);
- }
-
- LASSERT(desc->bd_page_count == 0);
LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL));
-
if (desc->bd_export)
class_export_put(desc->bd_export);
else
class_import_put(desc->bd_import);
- OBD_FREE(desc, sizeof(*desc));
+ OBD_FREE(desc, offsetof(struct ptlrpc_bulk_desc,
+ bd_iov[desc->bd_max_pages]));
EXIT;
}
-void ptlrpc_free_bulk_page(struct ptlrpc_bulk_page *bulk)
-{
- LASSERT(bulk != NULL);
-
- list_del(&bulk->bp_link);
- bulk->bp_desc->bd_page_count--;
- OBD_FREE(bulk, sizeof(*bulk));
-}
-
struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, int opcode,
int count, int *lengths, char **bufs)
{
request->rq_send_state = LUSTRE_IMP_FULL;
request->rq_type = PTL_RPC_MSG_REQUEST;
request->rq_import = class_import_get(imp);
+
+ request->rq_req_cbid.cbid_fn = request_out_callback;
+ request->rq_req_cbid.cbid_arg = request;
+
+ request->rq_reply_cbid.cbid_fn = reply_in_callback;
+ request->rq_reply_cbid.cbid_arg = request;
+
request->rq_phase = RQ_PHASE_NEW;
/* XXX FIXME bug 249 */
ENTRY;
LASSERT(!req->rq_receiving_reply);
- LASSERT(req->rq_replied);
/* NB Until this point, the whole of the incoming message,
* including buflens, status etc is in the sender's byte order. */
/* Clear reply swab mask; this is a new reply in sender's byte order */
req->rq_rep_swab_mask = 0;
#endif
- rc = lustre_unpack_msg(req->rq_repmsg, req->rq_replen);
+ LASSERT (req->rq_nob_received <= req->rq_replen);
+ rc = lustre_unpack_msg(req->rq_repmsg, req->rq_nob_received);
if (rc) {
CERROR("unpack_rep failed: %d\n", rc);
RETURN(-EPROTO);
if (req->rq_phase == RQ_PHASE_RPC) {
if (req->rq_waiting || req->rq_resend) {
int status;
+
+ LASSERT (!ptlrpc_client_receiving_reply(req));
+ LASSERT (req->rq_bulk == NULL ||
+ !ptlrpc_bulk_active(req->rq_bulk));
+
spin_lock_irqsave(&imp->imp_lock, flags);
if (ptlrpc_import_delay_req(imp, req, &status)) {
ptlrpc_unregister_reply(req);
if (req->rq_bulk) {
__u64 old_xid = req->rq_xid;
- ptlrpc_unregister_bulk(req);
+
/* ensure previous bulk fails */
req->rq_xid = ptlrpc_next_xid();
CDEBUG(D_HA, "resend bulk "
force_timer_recalc = 1;
}
- /* Ensure the network callback returned */
- spin_lock_irqsave (&req->rq_lock, flags);
- if (!req->rq_replied) {
- spin_unlock_irqrestore (&req->rq_lock, flags);
+ /* Still waiting for a reply? */
+ if (ptlrpc_client_receiving_reply(req))
+ continue;
+
+ /* Did we actually receive a reply? */
+ if (!ptlrpc_client_replied(req))
continue;
- }
- spin_unlock_irqrestore (&req->rq_lock, flags);
spin_lock_irqsave(&imp->imp_lock, flags);
list_del_init(&req->rq_list);
}
LASSERT(req->rq_phase == RQ_PHASE_BULK);
- if (!ptlrpc_bulk_complete (req->rq_bulk))
+ if (ptlrpc_bulk_active(req->rq_bulk))
continue;
+ if (!req->rq_bulk->bd_success) {
+ /* The RPC reply arrived OK, but the bulk screwed
+ * up! Dead weird since the server told us the RPC
+ * was good after getting the REPLY for her GET or
+ * the ACK for her PUT. */
+ DEBUG_REQ(D_ERROR, req, "bulk transfer failed");
+ LBUG();
+ }
+
req->rq_phase = RQ_PHASE_INTERPRET;
interpret:
ptlrpc_unregister_reply (req);
+ if (req->rq_bulk != NULL)
+ ptlrpc_unregister_bulk (req);
+
if (imp == NULL) {
DEBUG_REQ(D_HA, req, "NULL import: already cleaned up?");
RETURN(1);
LASSERT(!list_empty(&set->set_requests));
list_for_each(tmp, &set->set_requests) {
req = list_entry(tmp, struct ptlrpc_request, rq_set_chain);
- (void)ptlrpc_send_new_req(req);
+ if (req->rq_phase == RQ_PHASE_NEW)
+ (void)ptlrpc_send_new_req(req);
}
do {
}
LASSERT(!request->rq_receiving_reply);
+ LASSERT(request->rq_rqbd == NULL); /* client-side */
/* We must take it off the imp_replay_list first. Otherwise, we'll set
* request->rq_reqmsg to NULL while osc_close is dereferencing it. */
*/
void ptlrpc_unregister_reply (struct ptlrpc_request *request)
{
- unsigned long flags;
- int rc;
- ENTRY;
+ int rc;
+ wait_queue_head_t *wq;
+ struct l_wait_info lwi;
LASSERT(!in_interrupt ()); /* might sleep */
- spin_lock_irqsave (&request->rq_lock, flags);
- if (!request->rq_receiving_reply) { /* not waiting for a reply */
- spin_unlock_irqrestore (&request->rq_lock, flags);
- EXIT;
- /* NB reply buffer not freed here */
+ if (!ptlrpc_client_receiving_reply(request))
return;
- }
-
- LASSERT(!request->rq_replied); /* callback hasn't completed */
- spin_unlock_irqrestore (&request->rq_lock, flags);
rc = PtlMDUnlink (request->rq_reply_md_h);
- switch (rc) {
- default:
- LBUG ();
-
- case PTL_OK: /* unlinked before completion */
- LASSERT(request->rq_receiving_reply);
- LASSERT(!request->rq_replied);
- spin_lock_irqsave (&request->rq_lock, flags);
- request->rq_receiving_reply = 0;
- spin_unlock_irqrestore (&request->rq_lock, flags);
- OBD_FREE(request->rq_repmsg, request->rq_replen);
- request->rq_repmsg = NULL;
- EXIT;
+ if (rc == PTL_INV_MD) {
+ LASSERT (!ptlrpc_client_receiving_reply(request));
return;
+ }
+
+ LASSERT (rc == PTL_OK);
- case PTL_MD_INUSE: /* callback in progress */
- for (;;) {
- /* Network access will complete in finite time but
- * the timeout lets us CERROR for visibility */
- struct l_wait_info lwi = LWI_TIMEOUT(10*HZ, NULL, NULL);
-
- rc = l_wait_event (request->rq_reply_waitq,
- request->rq_replied, &lwi);
- LASSERT(rc == 0 || rc == -ETIMEDOUT);
- if (rc == 0) {
- spin_lock_irqsave (&request->rq_lock, flags);
- /* Ensure the callback has completed scheduling
- * me and taken its hands off the request */
- spin_unlock_irqrestore(&request->rq_lock,flags);
- break;
- }
-
- CERROR ("Unexpectedly long timeout: req %p\n", request);
- }
- /* fall through */
-
- case PTL_INV_MD: /* callback completed */
- LASSERT(!request->rq_receiving_reply);
- LASSERT(request->rq_replied);
- EXIT;
- return;
+ if (request->rq_set != NULL)
+ wq = &request->rq_set->set_waitq;
+ else
+ wq = &request->rq_reply_waitq;
+
+ for (;;) {
+ /* Network access will complete in finite time but the HUGE
+ * timeout lets us CWARN for visibility of sluggish NALs */
+ lwi = LWI_TIMEOUT(300 * HZ, NULL, NULL);
+ rc = l_wait_event (*wq, !ptlrpc_client_receiving_reply(request), &lwi);
+ if (rc == 0)
+ return;
+
+ LASSERT (rc == -ETIMEDOUT);
+ DEBUG_REQ(D_WARNING, request, "Unexpectedly long timeout");
}
- /* Not Reached */
}
/* caller must hold imp->imp_lock */
spin_lock_irqsave (&req->rq_lock, flags);
req->rq_resend = 1;
req->rq_timedout = 0;
- if (req->rq_set != NULL)
- wake_up (&req->rq_set->set_waitq);
- else
- wake_up(&req->rq_reply_waitq);
+ if (req->rq_bulk) {
+ __u64 old_xid = req->rq_xid;
+
+ /* ensure previous bulk fails */
+ req->rq_xid = ptlrpc_next_xid();
+ CDEBUG(D_HA, "resend bulk old x"LPU64" new x"LPU64"\n",
+ old_xid, req->rq_xid);
+ }
+ ptlrpc_wake_client_req(req);
spin_unlock_irqrestore (&req->rq_lock, flags);
+
}
/* XXX: this function and rq_status are currently unused */
spin_lock_irqsave (&req->rq_lock, flags);
req->rq_restart = 1;
req->rq_timedout = 0;
- if (req->rq_set != NULL)
- wake_up (&req->rq_set->set_waitq);
- else
- wake_up(&req->rq_reply_waitq);
+ ptlrpc_wake_client_req(req);
spin_unlock_irqrestore (&req->rq_lock, flags);
}
out:
if (req->rq_bulk != NULL) {
- if (rc >= 0) { /* success so far */
+ if (rc >= 0) {
+ /* success so far. Note that anything going wrong
+ * with bulk now, is EXTREMELY strange, since the
+ * server must have believed that the bulk
+ * transferred OK before she replied with success to
+ * me. */
lwi = LWI_TIMEOUT(timeout, NULL, NULL);
brc = l_wait_event(req->rq_reply_waitq,
- ptlrpc_bulk_complete(req->rq_bulk),
+ !ptlrpc_bulk_active(req->rq_bulk),
&lwi);
+ LASSERT(brc == 0 || brc == -ETIMEDOUT);
if (brc != 0) {
LASSERT(brc == -ETIMEDOUT);
- CERROR ("Timed out waiting for bulk\n");
+ DEBUG_REQ(D_ERROR, req, "bulk timed out");
rc = brc;
+ } else if (!req->rq_bulk->bd_success) {
+ DEBUG_REQ(D_ERROR, req, "bulk transfer failed");
+ rc = -EIO;
}
}
if (rc < 0)
/* Clear reply swab mask; this is a new reply in sender's byte order */
req->rq_rep_swab_mask = 0;
#endif
- rc = lustre_unpack_msg(req->rq_repmsg, req->rq_replen);
+ LASSERT (req->rq_nob_received <= req->rq_replen);
+ rc = lustre_unpack_msg(req->rq_repmsg, req->rq_nob_received);
if (rc) {
CERROR("unpack_rep failed: %d\n", rc);
GOTO(out, rc = -EPROTO);
spin_lock (&req->rq_lock);
if (req->rq_import_generation < imp->imp_generation) {
req->rq_err = 1;
- if (req->rq_set != NULL)
- wake_up(&req->rq_set->set_waitq);
- else
- wake_up(&req->rq_reply_waitq);
+ ptlrpc_wake_client_req(req);
}
spin_unlock (&req->rq_lock);
}
spin_lock (&req->rq_lock);
if (req->rq_import_generation < imp->imp_generation) {
req->rq_err = 1;
- if (req->rq_set != NULL)
- wake_up(&req->rq_set->set_waitq);
- else
- wake_up(&req->rq_reply_waitq);
+ ptlrpc_wake_client_req(req);
}
spin_unlock (&req->rq_lock);
}
struct ptlrpc_ni ptlrpc_interfaces[NAL_MAX_NR];
int ptlrpc_ninterfaces;
-/*
- * Free the packet when it has gone out
+/*
+ * Client's outgoing request callback
*/
-static int request_out_callback(ptl_event_t *ev)
+void request_out_callback(ptl_event_t *ev)
{
- struct ptlrpc_request *req = ev->mem_desc.user_ptr;
+ struct ptlrpc_cb_id *cbid = ev->mem_desc.user_ptr;
+ struct ptlrpc_request *req = cbid->cbid_arg;
+ unsigned long flags;
ENTRY;
- /* requests always contiguous */
- LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0);
-
- if (ev->type != PTL_EVENT_SENT) {
- // XXX make sure we understand all events, including ACK's
- CERROR("Unknown event %d\n", ev->type);
- LBUG();
- }
+ LASSERT (ev->type == PTL_EVENT_SENT ||
+ ev->type == PTL_EVENT_UNLINK);
+ LASSERT (ev->unlinked);
- /* this balances the atomic_inc in ptl_send_rpc() */
- ptlrpc_req_finished(req);
- RETURN(1);
-}
+ DEBUG_REQ((ev->status == PTL_OK) ? D_NET : D_ERROR, req,
+ "type %d, status %d", ev->type, ev->status);
-/*
- * Free the packet when it has gone out
- */
-static int reply_out_callback(ptl_event_t *ev)
-{
- struct ptlrpc_request *req = ev->mem_desc.user_ptr;
- unsigned long flags;
- ENTRY;
+ if (ev->type == PTL_EVENT_UNLINK ||
+ ev->status != PTL_OK) {
- /* replies always contiguous */
- LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0);
+ /* Failed send: make it seem like the reply timed out, just
+ * like failing sends in client.c does currently... */
- if (ev->type == PTL_EVENT_SENT) {
- /* NB don't even know if this is the current reply! In fact
- * we can't touch any state in the request, since the
- * service handler zeros it on each incoming request. */
- OBD_FREE(ev->mem_desc.start, ev->mem_desc.length);
- } else if (ev->type == PTL_EVENT_ACK) {
- LASSERT(req->rq_want_ack);
spin_lock_irqsave(&req->rq_lock, flags);
- req->rq_want_ack = 0;
- wake_up(&req->rq_reply_waitq);
+ req->rq_timeout = 0;
spin_unlock_irqrestore(&req->rq_lock, flags);
- } else {
- // XXX make sure we understand all events
- CERROR("Unknown event %d\n", ev->type);
- LBUG();
+
+ ptlrpc_wake_client_req(req);
}
- RETURN(1);
+ /* this balances the atomic_inc in ptl_send_rpc() */
+ ptlrpc_req_finished(req);
+ EXIT;
}
/*
- * Wake up the thread waiting for the reply once it comes in.
+ * Client's incoming reply callback
*/
-int reply_in_callback(ptl_event_t *ev)
+void reply_in_callback(ptl_event_t *ev)
{
- struct ptlrpc_request *req = ev->mem_desc.user_ptr;
+ struct ptlrpc_cb_id *cbid = ev->mem_desc.user_ptr;
+ struct ptlrpc_request *req = cbid->cbid_arg;
unsigned long flags;
ENTRY;
- /* replies always contiguous */
- LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0);
-
- if (req->rq_xid == 0x5a5a5a5a5a5a5a5aULL) {
- CERROR("Reply received for freed request! Probably a missing "
- "ptlrpc_abort()\n");
- LBUG();
- }
+ LASSERT (ev->type == PTL_EVENT_PUT ||
+ ev->type == PTL_EVENT_UNLINK);
+ LASSERT (ev->unlinked);
+ LASSERT (ev->mem_desc.start == req->rq_repmsg);
+ LASSERT (ev->offset == 0);
+ LASSERT (ev->mlength <= req->rq_replen);
+
+ DEBUG_REQ((ev->status == PTL_OK) ? D_NET : D_ERROR, req,
+ "type %d, status %d", ev->type, ev->status);
- if (req->rq_xid != ev->match_bits) {
- CERROR("Reply packet for wrong request\n");
- LBUG();
- }
+ spin_lock_irqsave (&req->rq_lock, flags);
- if (ev->type == PTL_EVENT_PUT) {
- /* Bug 1190: should handle non-zero offset as a protocol
- * error */
- LASSERT (ev->offset == 0);
+ LASSERT (req->rq_receiving_reply);
+ req->rq_receiving_reply = 0;
- spin_lock_irqsave (&req->rq_lock, flags);
- LASSERT (req->rq_receiving_reply);
- req->rq_receiving_reply = 0;
+ if (ev->type == PTL_EVENT_PUT &&
+ ev->status == PTL_OK) {
req->rq_replied = 1;
- if (req->rq_set != NULL)
- wake_up(&req->rq_set->set_waitq);
- else
- wake_up(&req->rq_reply_waitq);
- spin_unlock_irqrestore (&req->rq_lock, flags);
- } else {
- // XXX make sure we understand all events, including ACKs
- CERROR("Unknown event %d\n", ev->type);
- LBUG();
- }
-
- RETURN(1);
-}
-
-int request_in_callback(ptl_event_t *ev)
-{
- struct ptlrpc_request_buffer_desc *rqbd = ev->mem_desc.user_ptr;
- struct ptlrpc_srv_ni *srv_ni = rqbd->rqbd_srv_ni;
- struct ptlrpc_service *service = srv_ni->sni_service;
-
- /* requests always contiguous */
- LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0);
- /* we only enable puts */
- LASSERT(ev->type == PTL_EVENT_PUT);
- LASSERT(atomic_read(&srv_ni->sni_nrqbds_receiving) > 0);
- LASSERT(atomic_read(&rqbd->rqbd_refcount) > 0);
-
- if (ev->rlength != ev->mlength)
- CERROR("Warning: Possibly truncated rpc (%d/%d)\n",
- ev->mlength, ev->rlength);
-
- if (!PtlHandleEqual (ev->unlinked_me, PTL_HANDLE_NONE)) {
- /* This is the last request to be received into this
- * request buffer. We don't bump the refcount, since the
- * thread servicing this event is effectively taking over
- * portals' reference.
- */
- /* NB ev->unlinked_me.nal_idx is not set properly in a callback */
- LASSERT(ev->unlinked_me.cookie==rqbd->rqbd_me_h.cookie);
-
- /* we're off the air */
- /* we'll probably start dropping packets in portals soon */
- if (atomic_dec_and_test(&srv_ni->sni_nrqbds_receiving))
- CERROR("All request buffers busy\n");
- } else {
- /* +1 ref for service thread */
- atomic_inc(&rqbd->rqbd_refcount);
+ req->rq_nob_received = ev->mlength;
}
- wake_up(&service->srv_waitq);
+ /* NB don't unlock till after wakeup; req can disappear under us
+ * since we don't have our own ref */
+ ptlrpc_wake_client_req(req);
- return 0;
+ spin_unlock_irqrestore (&req->rq_lock, flags);
+ EXIT;
}
-static int bulk_put_source_callback(ptl_event_t *ev)
+/*
+ * Client's bulk has been written/read
+ */
+void client_bulk_callback (ptl_event_t *ev)
{
+ struct ptlrpc_cb_id *cbid = ev->mem_desc.user_ptr;
+ struct ptlrpc_bulk_desc *desc = cbid->cbid_arg;
unsigned long flags;
- struct ptlrpc_bulk_desc *desc = ev->mem_desc.user_ptr;
ENTRY;
- CDEBUG(D_NET, "got %s event %d\n",
- (ev->type == PTL_EVENT_SENT) ? "SENT" :
- (ev->type == PTL_EVENT_ACK) ? "ACK" : "UNEXPECTED", ev->type);
+ LASSERT ((desc->bd_type == BULK_PUT_SINK &&
+ ev->type == PTL_EVENT_PUT) ||
+ (desc->bd_type == BULK_GET_SOURCE &&
+ ev->type == PTL_EVENT_GET) ||
+ ev->type == PTL_EVENT_UNLINK);
+ LASSERT (ev->unlinked);
- LASSERT(ev->type == PTL_EVENT_SENT || ev->type == PTL_EVENT_ACK);
-
- /* 1 fragment for each page always */
- LASSERT(ev->mem_desc.niov == desc->bd_page_count);
+ CDEBUG((ev->status == PTL_OK) ? D_NET : D_ERROR,
+ "event type %d, status %d, desc %p\n",
+ ev->type, ev->status, desc);
spin_lock_irqsave (&desc->bd_lock, flags);
-
- LASSERT(desc->bd_callback_count > 0 &&
- desc->bd_callback_count <= 2);
-
- if (--desc->bd_callback_count == 0) {
- desc->bd_network_rw = 0;
- desc->bd_complete = 1;
- wake_up(&desc->bd_waitq);
+
+ LASSERT(desc->bd_network_rw);
+ desc->bd_network_rw = 0;
+
+ if (ev->type != PTL_EVENT_UNLINK &&
+ ev->status == PTL_OK) {
+ desc->bd_success = 1;
+ desc->bd_nob_transferred = ev->mlength;
}
+ /* NB don't unlock till after wakeup; desc can disappear under us
+ * otherwise */
+ ptlrpc_wake_client_req(desc->bd_req);
+
spin_unlock_irqrestore (&desc->bd_lock, flags);
- RETURN(0);
+ EXIT;
}
-struct ptlrpc_bulk_desc ptlrpc_bad_desc;
-ptl_event_t ptlrpc_bad_event;
-
-static int bulk_put_sink_callback(ptl_event_t *ev)
+/*
+ * Server's incoming request callback
+ */
+void request_in_callback(ptl_event_t *ev)
{
- struct ptlrpc_bulk_desc *desc = ev->mem_desc.user_ptr;
- unsigned long flags;
+ struct ptlrpc_cb_id *cbid = ev->mem_desc.user_ptr;
+ struct ptlrpc_request_buffer_desc *rqbd = cbid->cbid_arg;
+ struct ptlrpc_srv_ni *srv_ni = rqbd->rqbd_srv_ni;
+ struct ptlrpc_service *service = srv_ni->sni_service;
+ struct ptlrpc_request *req;
 + unsigned long flags;
ENTRY;
- LASSERT(ev->type == PTL_EVENT_PUT);
-
- /* used iovs */
- LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) ==
- PTL_MD_KIOV);
- /* Honestly, it's best to find out early. */
- if (desc->bd_page_count == 0x5a5a5a5a ||
- desc->bd_page_count != ev->mem_desc.niov ||
- ev->mem_desc.start != &desc->bd_iov) {
- /* not guaranteed (don't LASSERT) but good for this bug hunt */
- ptlrpc_bad_event = *ev;
- ptlrpc_bad_desc = *desc;
- CERROR ("XXX ev %p type %d portal %d match "LPX64", seq %ld\n",
- ev, ev->type, ev->portal, ev->match_bits, ev->sequence);
- CERROR ("XXX desc %p, export %p import %p gen %d "
- " portal %d\n",
- desc, desc->bd_export,
- desc->bd_import, desc->bd_import_generation,
- desc->bd_portal);
- RETURN (0);
+ LASSERT (ev->type == PTL_EVENT_PUT ||
+ ev->type == PTL_EVENT_UNLINK);
+ LASSERT ((char *)ev->mem_desc.start >= rqbd->rqbd_buffer);
+ LASSERT ((char *)ev->mem_desc.start + ev->offset + ev->mlength <=
+ rqbd->rqbd_buffer + service->srv_buf_size);
+
+ CDEBUG((ev->status == PTL_OK) ? D_NET : D_ERROR,
+ "event type %d, status %d, service %s\n",
+ ev->type, ev->status, service->srv_name);
+
+ if (ev->unlinked) {
+ /* If this is the last request message to fit in the
+ * request buffer we can use the request object embedded in
+ * rqbd. Note that if we failed to allocate a request,
+ * we'd have to re-post the rqbd, which we can't do in this
+ * context. */
+ req = &rqbd->rqbd_req;
+ memset(req, 0, sizeof (*req));
+ } else {
+ LASSERT (ev->type == PTL_EVENT_PUT);
+ if (ev->status != PTL_OK) {
+ /* We moaned above already... */
+ return;
+ }
+ OBD_ALLOC_GFP(req, sizeof(*req), GFP_ATOMIC);
+ if (req == NULL) {
+ CERROR("Can't allocate incoming request descriptor: "
+ "Dropping %s RPC from "LPX64"\n",
+ service->srv_name, ev->initiator.nid);
+ return;
+ }
}
-
- LASSERT(desc->bd_page_count != 0x5a5a5a5a);
- /* 1 fragment for each page always */
- LASSERT(ev->mem_desc.niov == desc->bd_page_count);
- LASSERT(ev->match_bits == desc->bd_req->rq_xid);
-
- /* peer must put with zero offset */
- if (ev->offset != 0) {
- /* Bug 1190: handle this as a protocol failure */
- CERROR ("Bad offset %d\n", ev->offset);
- LBUG ();
+
+ /* NB we ABSOLUTELY RELY on req being zeroed, so pointers are NULL,
+ * flags are reset and scalars are zero. We only set the message
+ * size to non-zero if this was a successful receive. */
+ req->rq_xid = ev->match_bits;
+ req->rq_reqmsg = ev->mem_desc.start + ev->offset;
+ if (ev->type == PTL_EVENT_PUT &&
+ ev->status == PTL_OK)
+ req->rq_reqlen = ev->mlength;
+ req->rq_arrival_time = ev->arrival_time;
+ req->rq_peer.peer_nid = ev->initiator.nid;
+ req->rq_peer.peer_ni = rqbd->rqbd_srv_ni->sni_ni;
+ req->rq_rqbd = rqbd;
+
+ spin_lock_irqsave (&service->srv_lock, flags);
+
+ if (ev->unlinked) {
+ srv_ni->sni_nrqbd_receiving--;
+ if (ev->type != PTL_EVENT_UNLINK &&
+ srv_ni->sni_nrqbd_receiving == 0) {
+ /* This service is off-air on this interface because
+ * all its request buffers are busy. Portals will
+ * start dropping incoming requests until more buffers
+ * get posted. NB don't moan if it's because we're
+ * tearing down the service. */
+ CWARN("All %s %s request buffers busy\n",
+ service->srv_name, srv_ni->sni_ni->pni_name);
+ }
+ /* req takes over the network's ref on rqbd */
+ } else {
+ /* req takes a ref on rqbd */
+ rqbd->rqbd_refcount++;
}
- /* No check for total # bytes; this could be a short read */
+ list_add_tail(&req->rq_list, &service->srv_request_queue);
+ service->srv_n_queued_reqs++;
+ rqbd->rqbd_eventcount++;
- spin_lock_irqsave (&desc->bd_lock, flags);
- desc->bd_network_rw = 0;
- desc->bd_complete = 1;
- if (desc->bd_req->rq_set != NULL)
- wake_up (&desc->bd_req->rq_set->set_waitq);
- else
- wake_up (&desc->bd_req->rq_reply_waitq);
- spin_unlock_irqrestore (&desc->bd_lock, flags);
+ /* NB everything can disappear under us once the request
+ * has been queued and we unlock, so do the wake now... */
+ wake_up(&service->srv_waitq);
- RETURN(1);
+ spin_unlock_irqrestore(&service->srv_lock, flags);
+ EXIT;
}
-static int bulk_get_source_callback(ptl_event_t *ev)
+/*
+ * Server's outgoing reply callback
+ */
+void reply_out_callback(ptl_event_t *ev)
{
- struct ptlrpc_bulk_desc *desc = ev->mem_desc.user_ptr;
- struct ptlrpc_bulk_page *bulk;
- struct list_head *tmp;
- unsigned long flags;
- ptl_size_t total = 0;
+ struct ptlrpc_cb_id *cbid = ev->mem_desc.user_ptr;
+ struct ptlrpc_reply_state *rs = cbid->cbid_arg;
+ struct ptlrpc_srv_ni *sni = rs->rs_srv_ni;
+ struct ptlrpc_service *svc = sni->sni_service;
+ unsigned long flags;
ENTRY;
- LASSERT(ev->type == PTL_EVENT_GET);
-
- /* used iovs */
- LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) ==
- PTL_MD_KIOV);
- /* 1 fragment for each page always */
- LASSERT(ev->mem_desc.niov == desc->bd_page_count);
- LASSERT(ev->match_bits == desc->bd_req->rq_xid);
-
- /* peer must get with zero offset */
- if (ev->offset != 0) {
- /* Bug 1190: handle this as a protocol failure */
- CERROR ("Bad offset %d\n", ev->offset);
- LBUG ();
+ LASSERT (ev->type == PTL_EVENT_SENT ||
+ ev->type == PTL_EVENT_ACK ||
+ ev->type == PTL_EVENT_UNLINK);
+
+ if (!rs->rs_difficult) {
+ /* I'm totally responsible for freeing "easy" replies */
+ LASSERT (ev->unlinked);
+ lustre_free_reply_state (rs);
+ atomic_dec (&svc->srv_outstanding_replies);
+ EXIT;
+ return;
}
-
- list_for_each (tmp, &desc->bd_page_list) {
- bulk = list_entry(tmp, struct ptlrpc_bulk_page, bp_link);
- total += bulk->bp_buflen;
- }
+ LASSERT (rs->rs_on_net);
- /* peer must get everything */
- if (ev->mem_desc.length != total) {
- /* Bug 1190: handle this as a protocol failure */
- CERROR ("Bad length/total %d/%d\n", ev->mem_desc.length, total);
- LBUG ();
+ if (ev->unlinked) {
+ /* Last network callback */
+ spin_lock_irqsave (&svc->srv_lock, flags);
+ rs->rs_on_net = 0;
+ ptlrpc_schedule_difficult_reply (rs);
+ spin_unlock_irqrestore (&svc->srv_lock, flags);
}
- spin_lock_irqsave (&desc->bd_lock, flags);
- desc->bd_network_rw = 0;
- desc->bd_complete = 1;
- if (desc->bd_req->rq_set != NULL)
- wake_up (&desc->bd_req->rq_set->set_waitq);
- else
- wake_up (&desc->bd_req->rq_reply_waitq);
- spin_unlock_irqrestore (&desc->bd_lock, flags);
-
- RETURN(1);
+ EXIT;
}
-static int bulk_get_sink_callback(ptl_event_t *ev)
+/*
+ * Server's bulk completion callback
+ */
+void server_bulk_callback (ptl_event_t *ev)
{
- struct ptlrpc_bulk_desc *desc = ev->mem_desc.user_ptr;
+ struct ptlrpc_cb_id *cbid = ev->mem_desc.user_ptr;
+ struct ptlrpc_bulk_desc *desc = cbid->cbid_arg;
unsigned long flags;
ENTRY;
- CDEBUG(D_NET, "got %s event %d desc %p\n",
- (ev->type == PTL_EVENT_SENT) ? "SENT" :
- (ev->type == PTL_EVENT_REPLY) ? "REPLY" : "UNEXPECTED",
- ev->type, desc);
+ LASSERT (ev->type == PTL_EVENT_SENT ||
+ ev->type == PTL_EVENT_UNLINK ||
+ (desc->bd_type == BULK_PUT_SOURCE &&
+ ev->type == PTL_EVENT_ACK) ||
+ (desc->bd_type == BULK_GET_SINK &&
+ ev->type == PTL_EVENT_REPLY));
- LASSERT(ev->type == PTL_EVENT_SENT || ev->type == PTL_EVENT_REPLY);
-
- /* 1 fragment for each page always */
- LASSERT(ev->mem_desc.niov == desc->bd_page_count);
+ CDEBUG((ev->status == PTL_OK) ? D_NET : D_ERROR,
+ "event type %d, status %d, desc %p\n",
+ ev->type, ev->status, desc);
spin_lock_irqsave (&desc->bd_lock, flags);
- LASSERT(desc->bd_callback_count > 0 &&
- desc->bd_callback_count <= 2);
+
+ if ((ev->type == PTL_EVENT_ACK ||
+ ev->type == PTL_EVENT_REPLY) &&
+ ev->status == PTL_OK) {
+ /* We heard back from the peer, so even if we get this
+ * before the SENT event (oh yes we can), we know we
+ * read/wrote the peer buffer and how much... */
+ desc->bd_success = 1;
+ desc->bd_nob_transferred = ev->mlength;
+ }
- if (--desc->bd_callback_count == 0) {
+ if (ev->unlinked) {
+ /* This is the last callback no matter what... */
desc->bd_network_rw = 0;
- desc->bd_complete = 1;
wake_up(&desc->bd_waitq);
}
+
spin_unlock_irqrestore (&desc->bd_lock, flags);
+ EXIT;
+}
+
+static int ptlrpc_master_callback(ptl_event_t *ev)
+{
+ struct ptlrpc_cb_id *cbid = ev->mem_desc.user_ptr;
+ void (*callback)(ptl_event_t *ev) = cbid->cbid_fn;
- RETURN(0);
+ /* Honestly, it's best to find out early. */
+ LASSERT (cbid->cbid_arg != (void *)0x5a5a5a5a5a5a5a5a);
+ LASSERT (callback == request_out_callback ||
+ callback == reply_in_callback ||
+ callback == client_bulk_callback ||
+ callback == request_in_callback ||
+ callback == reply_out_callback ||
+ callback == server_bulk_callback);
+
+ callback (ev);
+ return (0);
}
int ptlrpc_uuid_to_peer (struct obd_uuid *uuid, struct ptlrpc_peer *peer)
void ptlrpc_ni_fini(struct ptlrpc_ni *pni)
{
- PtlEQFree(pni->pni_request_out_eq_h);
- PtlEQFree(pni->pni_reply_out_eq_h);
- PtlEQFree(pni->pni_reply_in_eq_h);
- PtlEQFree(pni->pni_bulk_put_source_eq_h);
- PtlEQFree(pni->pni_bulk_put_sink_eq_h);
- PtlEQFree(pni->pni_bulk_get_source_eq_h);
- PtlEQFree(pni->pni_bulk_get_sink_eq_h);
-
+ PtlEQFree(pni->pni_eq_h);
kportal_put_ni (pni->pni_number);
}
pni->pni_number = number;
pni->pni_ni_h = *nip;
- pni->pni_request_out_eq_h = PTL_HANDLE_NONE;
- pni->pni_reply_out_eq_h = PTL_HANDLE_NONE;
- pni->pni_reply_in_eq_h = PTL_HANDLE_NONE;
- pni->pni_bulk_put_source_eq_h = PTL_HANDLE_NONE;
- pni->pni_bulk_put_sink_eq_h = PTL_HANDLE_NONE;
- pni->pni_bulk_get_source_eq_h = PTL_HANDLE_NONE;
- pni->pni_bulk_get_sink_eq_h = PTL_HANDLE_NONE;
-
- /* NB We never actually PtlEQGet() out of these events queues since
- * we're only interested in the event callback, so we can just let
- * them wrap. Their sizes aren't a big deal, apart from providing
- * a little history for debugging... */
-
- rc = PtlEQAlloc(pni->pni_ni_h, 1024, request_out_callback,
- &pni->pni_request_out_eq_h);
- if (rc != PTL_OK)
- GOTO (fail, rc = -ENOMEM);
-
- rc = PtlEQAlloc(pni->pni_ni_h, 1024, reply_out_callback,
- &pni->pni_reply_out_eq_h);
- if (rc != PTL_OK)
- GOTO (fail, rc = -ENOMEM);
-
- rc = PtlEQAlloc(pni->pni_ni_h, 1024, reply_in_callback,
- &pni->pni_reply_in_eq_h);
- if (rc != PTL_OK)
- GOTO (fail, rc = -ENOMEM);
-
- rc = PtlEQAlloc(pni->pni_ni_h, 1024, bulk_put_source_callback,
- &pni->pni_bulk_put_source_eq_h);
- if (rc != PTL_OK)
- GOTO (fail, rc = -ENOMEM);
+ pni->pni_eq_h = PTL_HANDLE_NONE;
- rc = PtlEQAlloc(pni->pni_ni_h, 1024, bulk_put_sink_callback,
- &pni->pni_bulk_put_sink_eq_h);
- if (rc != PTL_OK)
- GOTO (fail, rc = -ENOMEM);
-
- rc = PtlEQAlloc(pni->pni_ni_h, 1024, bulk_get_source_callback,
- &pni->pni_bulk_get_source_eq_h);
- if (rc != PTL_OK)
- GOTO (fail, rc = -ENOMEM);
-
- rc = PtlEQAlloc(pni->pni_ni_h, 1024, bulk_get_sink_callback,
- &pni->pni_bulk_get_sink_eq_h);
+#ifdef __KERNEL__
+ /* kernel: portals calls the callback when the event is added to the
+ * queue, so we don't care if we lose events */
+ rc = PtlEQAlloc(pni->pni_ni_h, 1024, ptlrpc_master_callback,
+ &pni->pni_eq_h);
+#else
+ /* liblustre: no asynchronous callback and allocate a nice big event
+ * queue so we don't drop any events... */
+ rc = PtlEQAlloc(pni->pni_ni_h, 10240, NULL, &pni->pni_eq_h);
+#endif
if (rc != PTL_OK)
GOTO (fail, rc = -ENOMEM);
}
#ifndef __KERNEL__
+LIST_HEAD(liblustre_wait_callbacks);
+void *liblustre_services_callback;
+
+void *
+liblustre_register_wait_callback (int (*fn)(void *arg), void *arg)
+{
+ struct liblustre_wait_callback *llwc;
+
+ OBD_ALLOC(llwc, sizeof(*llwc));
+ LASSERT (llwc != NULL);
+
+ llwc->llwc_fn = fn;
+ llwc->llwc_arg = arg;
+ list_add_tail(&llwc->llwc_list, &liblustre_wait_callbacks);
+
+ return (llwc);
+}
+
+void
+liblustre_deregister_wait_callback (void *opaque)
+{
+ struct liblustre_wait_callback *llwc = opaque;
+
+ list_del(&llwc->llwc_list);
+ OBD_FREE(llwc, sizeof(*llwc));
+}
+
int
-liblustre_check_events (int block)
+liblustre_check_events (int timeout)
{
ptl_event_t ev;
int rc;
ENTRY;
- if (block) {
- /* XXX to accelerate recovery tests XXX */
- if (block > 10)
- block = 10;
- rc = PtlEQWait_timeout(ptlrpc_interfaces[0].pni_eq_h, &ev, block);
+ if (timeout) {
+ rc = PtlEQWait_timeout(ptlrpc_interfaces[0].pni_eq_h, &ev, timeout);
} else {
rc = PtlEQGet (ptlrpc_interfaces[0].pni_eq_h, &ev);
}
LASSERT (rc == PTL_EQ_DROPPED || rc == PTL_OK);
-#if PORTALS_DOES_NOT_SUPPORT_CALLBACKS
- if (rc == PTL_EQ_DROPPED)
+#ifndef __KERNEL__
 + /* liblustre: no asynch callback so we can't afford to miss any
+ * events... */
+ if (rc == PTL_EQ_DROPPED) {
CERROR ("Dropped an event!!!\n");
+ abort();
+ }
ptlrpc_master_callback (&ev);
#endif
RETURN(1);
}
-int liblustre_wait_event(struct l_wait_info *lwi)
+int
+liblustre_wait_event (int timeout)
{
- ENTRY;
-
- /* non-blocking checks (actually we might block in a service for
- * bulk but we won't block in a blocked service)
- */
- if (liblustre_check_events(0) ||
- liblustre_check_services()) {
- /* the condition the caller is waiting for may now hold */
- RETURN(0);
+ struct list_head *tmp;
+ struct liblustre_wait_callback *llwc;
+ int found_something = 0;
+
+ /* First check for any new events */
+ if (liblustre_check_events(0))
+ found_something = 1;
+
+ /* Now give all registered callbacks a bite at the cherry */
+ list_for_each(tmp, &liblustre_wait_callbacks) {
+ llwc = list_entry(tmp, struct liblustre_wait_callback,
+ llwc_list);
+
+ if (llwc->llwc_fn(llwc->llwc_arg))
+ found_something = 1;
}
-
- /* block for an event */
- liblustre_check_events(lwi->lwi_timeout);
- /* check it's not for some service */
- liblustre_check_services ();
+ /* return to caller if something happened */
+ if (found_something)
+ return 1;
+
+ /* block for an event, returning immediately on timeout */
+ if (!liblustre_check_events(timeout))
+ return 0;
+
+ /* an event occurred; let all registered callbacks progress... */
+ list_for_each(tmp, &liblustre_wait_callbacks) {
+ llwc = list_entry(tmp, struct liblustre_wait_callback,
+ llwc_list);
+
+ if (llwc->llwc_fn(llwc->llwc_arg))
+ found_something = 1;
+ }
- /* XXX check this */
- RETURN(0);
+ /* ...and tell caller something happened */
+ return 1;
}
#endif
"loaded?\n");
return -EIO;
}
+#ifndef __KERNEL__
+ liblustre_services_callback =
+ liblustre_register_wait_callback(&liblustre_check_services, NULL);
+#endif
return 0;
}
void ptlrpc_exit_portals(void)
{
+#ifndef __KERNEL__
+ liblustre_deregister_wait_callback(liblustre_services_callback);
+#endif
while (ptlrpc_ninterfaces > 0)
ptlrpc_ni_fini (&ptlrpc_interfaces[--ptlrpc_ninterfaces]);
}
if (!request)
GOTO(out, rc = -ENOMEM);
+#ifndef __KERNEL__
+ lustre_msg_add_op_flags(request->rq_reqmsg, MSG_CONNECT_LIBCLIENT);
+#endif
+
request->rq_send_state = LUSTRE_IMP_CONNECTING;
request->rq_replen = lustre_msg_size(0, NULL);
request->rq_interpret_reply = ptlrpc_connect_interpret;
if (aa->pcaa_initial_connect)
imp->imp_replayable = 1;
+
ptlrpcd_add_req(request);
rc = 0;
out:
out:
if (rc != 0) {
IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON);
- if (aa->pcaa_initial_connect && !imp->imp_initial_recov)
+ if (aa->pcaa_initial_connect && !imp->imp_initial_recov) {
+ ptlrpc_set_import_active(imp, 0);
GOTO(norecov, rc);
+ }
CDEBUG(D_ERROR,
"recovery of %s on %s failed (%d); restarting\n",
imp->imp_target_uuid.uuid,
#else /* !__KERNEL__ */
int llog_origin_connect(struct llog_ctxt *ctxt, int count,
- struct llog_logid *logid,
- struct llog_ctxt_gen *gen)
+ struct llog_logid *logid, struct llog_gen *gen)
{
return 0;
}
#define EXPORT_SYMTAB
#endif
+#ifndef __KERNEL__
+#include <liblustre.h>
+#else
#include <linux/fs.h>
+#endif
+
#include <linux/obd_class.h>
#include <linux/lustre_log.h>
#include <linux/lustre_net.h>
#include <portals/list.h>
#include <linux/lustre_fsfilt.h>
+#ifdef __KERNEL__
+
int llog_origin_handle_create(struct ptlrpc_request *req)
{
struct obd_export *exp = req->rq_export;
OBD_FREE(buf, buf_len);
return rc;
}
+
+#else /* !__KERNEL__ */
+int llog_origin_handle_create(struct ptlrpc_request *req)
+{
+ LBUG();
+ return 0;
+}
+int llog_origin_handle_next_block(struct ptlrpc_request *req)
+{
+ LBUG();
+ return 0;
+}
+int llog_origin_handle_read_header(struct ptlrpc_request *req)
+{
+ LBUG();
+ return 0;
+}
+int llog_origin_handle_close(struct ptlrpc_request *req)
+{
+ LBUG();
+ return 0;
+}
+int llog_origin_handle_cancel(struct ptlrpc_request *req)
+{
+ LBUG();
+ return 0;
+}
+#endif
struct proc_dir_entry *svc_procroot;
struct lprocfs_stats *svc_stats;
int i, rc;
- unsigned int svc_counter_config = LPROCFS_CNTR_EXTERNALLOCK |
- LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV;
+ unsigned int svc_counter_config = LPROCFS_CNTR_AVGMINMAX |
+ LPROCFS_CNTR_STDDEV;
LASSERT(*procroot_ret == NULL);
LASSERT(*stats_ret == NULL);
lprocfs_free_stats(svc_stats);
return;
}
- } else
+ } else {
svc_procroot = root;
+ }
lprocfs_counter_init(svc_stats, PTLRPC_REQWAIT_CNTR,
svc_counter_config, "req_waittime", "usec");
- /* Wait for b_eq branch
- lprocfs_counter_init(svc_stats, PTLRPC_SVCEQDEPTH_CNTR,
- svc_counter_config, "svc_eqdepth", "reqs");
- */
- /* no stddev on idletime */
- lprocfs_counter_init(svc_stats, PTLRPC_SVCIDLETIME_CNTR,
- (LPROCFS_CNTR_EXTERNALLOCK|LPROCFS_CNTR_AVGMINMAX),
- "svc_idletime", "usec");
+ lprocfs_counter_init(svc_stats, PTLRPC_REQQDEPTH_CNTR,
+ svc_counter_config, "req_qdepth", "reqs");
+ lprocfs_counter_init(svc_stats, PTLRPC_REQACTIVE_CNTR,
+ svc_counter_config, "req_active", "reqs");
for (i = 0; i < LUSTRE_MAX_OPCODES; i++) {
__u32 opcode = ll_rpc_opcode_table[i].opcode;
lprocfs_counter_init(svc_stats, PTLRPC_LAST_CNTR + i,
struct ptlrpc_service *svc)
{
ptlrpc_lprocfs_register(entry, svc->srv_name,
- "stats", &svc->srv_procroot,
+ "stats", &svc->srv_procroot,
&svc->srv_stats);
}
void ptlrpc_lprocfs_register_obd(struct obd_device *obddev)
{
- ptlrpc_lprocfs_register(obddev->obd_proc_entry, NULL, "stats",
- &obddev->obd_svc_procroot,
+ ptlrpc_lprocfs_register(obddev->obd_proc_entry, NULL, "stats",
+ &obddev->obd_svc_procroot,
&obddev->obd_svc_stats);
}
#include <linux/obd.h>
#include "ptlrpc_internal.h"
-static int ptl_send_buf(struct ptlrpc_request *request,
- struct ptlrpc_connection *conn, int portal)
+static int ptl_send_buf (ptl_handle_md_t *mdh, void *base, int len,
+ ptl_ack_req_t ack, struct ptlrpc_cb_id *cbid,
+ struct ptlrpc_connection *conn, int portal, __u64 xid)
{
- int rc;
- int rc2;
ptl_process_id_t remote_id;
- ptl_handle_md_t md_h;
- ptl_ack_req_t ack_req;
+ int rc;
+ int rc2;
+ ptl_md_t md;
char str[PTL_NALFMT_SIZE];
+ ENTRY;
LASSERT (portal != 0);
LASSERT (conn != NULL);
conn->c_peer.peer_nid, str),
conn->c_peer.peer_ni->pni_name);
- request->rq_req_md.user_ptr = request;
-
- switch (request->rq_type) {
- case PTL_RPC_MSG_REQUEST:
- request->rq_reqmsg->type = request->rq_type;
- request->rq_req_md.start = request->rq_reqmsg;
- request->rq_req_md.length = request->rq_reqlen;
- request->rq_req_md.eventq =
- conn->c_peer.peer_ni->pni_request_out_eq_h;
- LASSERT (!request->rq_want_ack);
- break;
- case PTL_RPC_MSG_ERR:
- case PTL_RPC_MSG_REPLY:
- request->rq_repmsg->type = request->rq_type;
- request->rq_req_md.start = request->rq_repmsg;
- request->rq_req_md.length = request->rq_replen;
- request->rq_req_md.eventq =
- conn->c_peer.peer_ni->pni_reply_out_eq_h;
- break;
- default:
- LBUG();
- return -1; /* notreached */
- }
- if (request->rq_want_ack) {
- request->rq_req_md.threshold = 2; /* SENT and ACK */
- ack_req = PTL_ACK_REQ;
- } else {
- request->rq_req_md.threshold = 1;
- ack_req = PTL_NOACK_REQ;
- }
- request->rq_req_md.options = PTL_MD_OP_PUT;
- request->rq_req_md.user_ptr = request;
 + remote_id.nid = conn->c_peer.peer_nid;
+ remote_id.pid = 0;
- if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_ACK | OBD_FAIL_ONCE)) {
- request->rq_req_md.options |= PTL_MD_ACK_DISABLE;
+ md.start = base;
+ md.length = len;
+ md.threshold = (ack == PTL_ACK_REQ) ? 2 : 1;
+ md.options = 0;
+ md.user_ptr = cbid;
+ md.eventq = conn->c_peer.peer_ni->pni_eq_h;
+
+ if (ack == PTL_ACK_REQ &&
+ OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_ACK | OBD_FAIL_ONCE)) {
+ /* don't ask for the ack to simulate failing client */
+ ack = PTL_NOACK_REQ;
obd_fail_loc |= OBD_FAIL_ONCE | OBD_FAILED;
}
- /* NB if the send fails, we back out of the send and return
- * failure; it's down to the caller to handle missing callbacks */
-
- rc = PtlMDBind(conn->c_peer.peer_ni->pni_ni_h, request->rq_req_md,
- &md_h);
+ rc = PtlMDBind (conn->c_peer.peer_ni->pni_ni_h, md, mdh);
if (rc != PTL_OK) {
- CERROR("PtlMDBind failed: %d\n", rc);
+ CERROR ("PtlMDBind failed: %d\n", rc);
LASSERT (rc == PTL_NOSPACE);
RETURN (-ENOMEM);
}
- if (request->rq_type != PTL_RPC_MSG_REQUEST)
- memcpy(&request->rq_reply_md_h, &md_h, sizeof(md_h));
-
- remote_id.nid = conn->c_peer.peer_nid;
- remote_id.pid = 0;
CDEBUG(D_NET, "Sending %d bytes to portal %d, xid "LPD64"\n",
- request->rq_req_md.length, portal, request->rq_xid);
+ len, portal, xid);
- rc = PtlPut(md_h, ack_req, remote_id, portal, 0, request->rq_xid, 0, 0);
 + rc = PtlPut (*mdh, ack, remote_id, portal, 0, xid, 0, 0);
if (rc != PTL_OK) {
+ /* We're going to get an UNLINK event when I unlink below,
+ * which will complete just like any other failed send, so
+ * I fall through and return success here! */
CERROR("PtlPut("LPU64", %d, "LPD64") failed: %d\n",
- remote_id.nid, portal, request->rq_xid, rc);
- rc2 = PtlMDUnlink(md_h);
+ remote_id.nid, portal, xid, rc);
+ rc2 = PtlMDUnlink(*mdh);
LASSERT (rc2 == PTL_OK);
- RETURN ((rc == PTL_NOSPACE) ? -ENOMEM : -ECOMM);
}
- return 0;
+ RETURN (0);
}
-static inline ptl_kiov_t *
-ptlrpc_get_bulk_iov (struct ptlrpc_bulk_desc *desc)
+int ptlrpc_start_bulk_transfer (struct ptlrpc_bulk_desc *desc)
{
- ptl_kiov_t *iov;
-
- if (desc->bd_page_count <= sizeof (desc->bd_iov)/sizeof (*iov))
- return (desc->bd_iov);
-
- OBD_ALLOC (iov, desc->bd_page_count * sizeof (*iov));
- if (iov == NULL)
- LBUG();
-
- return (iov);
-}
-
-static inline void
-ptlrpc_put_bulk_iov (struct ptlrpc_bulk_desc *desc, ptl_kiov_t *iov)
-{
- if (desc->bd_page_count <= sizeof (desc->bd_iov)/sizeof (*iov))
- return;
-
- OBD_FREE (iov, desc->bd_page_count * sizeof (*iov));
-}
-
-int ptlrpc_bulk_put(struct ptlrpc_bulk_desc *desc)
-{
- int rc;
- int rc2;
+ int rc;
+ int rc2;
struct ptlrpc_peer *peer;
- struct list_head *tmp, *next;
- ptl_process_id_t remote_id;
- ptl_kiov_t *iov;
- __u64 xid;
+ ptl_process_id_t remote_id;
+ ptl_md_t md;
+ __u64 xid;
ENTRY;
/* NB no locking required until desc is on the network */
LASSERT (!desc->bd_network_rw);
- LASSERT (desc->bd_type == BULK_PUT_SOURCE);
- desc->bd_complete = 0;
-
- iov = ptlrpc_get_bulk_iov (desc);
- if (iov == NULL)
- RETURN (-ENOMEM);
-
+ LASSERT (desc->bd_type == BULK_PUT_SOURCE ||
+ desc->bd_type == BULK_GET_SINK);
+ desc->bd_success = 0;
peer = &desc->bd_export->exp_connection->c_peer;
- desc->bd_md.start = iov;
- desc->bd_md.niov = 0;
- desc->bd_md.length = 0;
- desc->bd_md.eventq = peer->peer_ni->pni_bulk_put_source_eq_h;
- desc->bd_md.threshold = 2; /* SENT and ACK */
- desc->bd_md.options = PTL_MD_OP_PUT | PTL_MD_KIOV;
- desc->bd_md.user_ptr = desc;
-
- desc->bd_callback_count = 2;
-
- list_for_each_safe(tmp, next, &desc->bd_page_list) {
- struct ptlrpc_bulk_page *bulk;
- bulk = list_entry(tmp, struct ptlrpc_bulk_page, bp_link);
-
- LASSERT(desc->bd_md.niov < desc->bd_page_count);
-
- iov[desc->bd_md.niov].kiov_page = bulk->bp_page;
- iov[desc->bd_md.niov].kiov_offset = bulk->bp_pageoffset;
- iov[desc->bd_md.niov].kiov_len = bulk->bp_buflen;
-
- LASSERT (iov[desc->bd_md.niov].kiov_offset +
- iov[desc->bd_md.niov].kiov_len <= PAGE_SIZE);
- desc->bd_md.niov++;
- desc->bd_md.length += bulk->bp_buflen;
- }
+ md.start = &desc->bd_iov[0];
+ md.niov = desc->bd_page_count;
+ md.length = desc->bd_nob;
+ md.eventq = peer->peer_ni->pni_eq_h;
+ md.threshold = 2; /* SENT and ACK/REPLY */
+#ifdef __KERNEL__
+ md.options = PTL_MD_KIOV;
+#else
+ md.options = PTL_MD_IOV;
+#endif
+ md.user_ptr = &desc->bd_cbid;
+ LASSERT (desc->bd_cbid.cbid_fn == server_bulk_callback);
+ LASSERT (desc->bd_cbid.cbid_arg == desc);
/* NB total length may be 0 for a read past EOF, so we send a 0
* length bulk, since the client expects a bulk event. */
- LASSERT(desc->bd_md.niov == desc->bd_page_count);
-
- rc = PtlMDBind(peer->peer_ni->pni_ni_h, desc->bd_md,
- &desc->bd_md_h);
-
- ptlrpc_put_bulk_iov (desc, iov); /*move down to reduce latency to send*/
+ rc = PtlMDBind(peer->peer_ni->pni_ni_h, md, &desc->bd_md_h);
if (rc != PTL_OK) {
CERROR("PtlMDBind failed: %d\n", rc);
LASSERT (rc == PTL_NOSPACE);
remote_id.nid = peer->peer_nid;
remote_id.pid = 0;
- CDEBUG(D_NET, "Sending %u pages %u bytes to portal %d on %s "
- "nid "LPX64" pid %d xid "LPX64"\n",
- desc->bd_md.niov, desc->bd_md.length,
- desc->bd_portal, peer->peer_ni->pni_name,
+ CDEBUG(D_NET, "Transferring %u pages %u bytes via portal %d on %s "
+ "nid "LPX64" pid %d xid "LPX64"\n",
+ md.niov, md.length, desc->bd_portal, peer->peer_ni->pni_name,
remote_id.nid, remote_id.pid, xid);
+ /* Network is about to get at the memory */
desc->bd_network_rw = 1;
- rc = PtlPut(desc->bd_md_h, PTL_ACK_REQ, remote_id,
- desc->bd_portal, 0, xid, 0, 0);
- if (rc != PTL_OK) {
- desc->bd_network_rw = 0;
- CERROR("PtlPut("LPU64", %d, "LPX64") failed: %d\n",
- remote_id.nid, desc->bd_portal, xid, rc);
- rc2 = PtlMDUnlink(desc->bd_md_h);
- LASSERT (rc2 == PTL_OK);
- RETURN((rc == PTL_NOSPACE) ? -ENOMEM : -ECOMM);
- }
-
- RETURN(0);
-}
-
-int ptlrpc_bulk_get(struct ptlrpc_bulk_desc *desc)
-{
- int rc;
- int rc2;
- struct ptlrpc_peer *peer;
- struct list_head *tmp, *next;
- ptl_process_id_t remote_id;
- ptl_kiov_t *iov;
- __u64 xid;
- ENTRY;
-
- /* NB no locking required until desc is on the network */
- LASSERT (!desc->bd_network_rw);
- LASSERT (desc->bd_type == BULK_GET_SINK);
- desc->bd_complete = 0;
-
- iov = ptlrpc_get_bulk_iov (desc);
- if (iov == NULL)
- RETURN(-ENOMEM);
-
- peer = &desc->bd_export->exp_connection->c_peer;
-
- desc->bd_md.start = iov;
- desc->bd_md.niov = 0;
- desc->bd_md.length = 0;
- desc->bd_md.eventq = peer->peer_ni->pni_bulk_get_sink_eq_h;
- desc->bd_md.threshold = 2; /* SENT and REPLY */
- desc->bd_md.options = PTL_MD_OP_GET | PTL_MD_KIOV;
- desc->bd_md.user_ptr = desc;
-
- desc->bd_callback_count = 2;
-
- list_for_each_safe(tmp, next, &desc->bd_page_list) {
- struct ptlrpc_bulk_page *bulk;
- bulk = list_entry(tmp, struct ptlrpc_bulk_page, bp_link);
- LASSERT(desc->bd_md.niov < desc->bd_page_count);
-
- iov[desc->bd_md.niov].kiov_page = bulk->bp_page;
- iov[desc->bd_md.niov].kiov_len = bulk->bp_buflen;
- iov[desc->bd_md.niov].kiov_offset = bulk->bp_pageoffset;
-
- LASSERT (iov[desc->bd_md.niov].kiov_offset +
- iov[desc->bd_md.niov].kiov_len <= PAGE_SIZE);
- desc->bd_md.niov++;
- desc->bd_md.length += bulk->bp_buflen;
- }
-
- LASSERT(desc->bd_md.niov == desc->bd_page_count);
- LASSERT(desc->bd_md.niov != 0);
-
- rc = PtlMDBind(peer->peer_ni->pni_ni_h, desc->bd_md, &desc->bd_md_h);
-
- ptlrpc_put_bulk_iov(desc, iov); /*move down to reduce latency to send*/
-
- if (rc != PTL_OK) {
- CERROR("PtlMDBind failed: %d\n", rc);
- LASSERT (rc == PTL_NOSPACE);
- RETURN(-ENOMEM);
- }
-
- /* Client's bulk and reply matchbits are the same */
- xid = desc->bd_req->rq_xid;
- remote_id.nid = desc->bd_export->exp_connection->c_peer.peer_nid;
- remote_id.pid = 0;
-
- CDEBUG(D_NET, "Fetching %u pages %u bytes from portal %d on %s "
- "nid "LPX64" pid %d xid "LPX64"\n",
- desc->bd_md.niov, desc->bd_md.length, desc->bd_portal,
- peer->peer_ni->pni_name, remote_id.nid, remote_id.pid,
- xid);
-
- desc->bd_network_rw = 1;
- rc = PtlGet(desc->bd_md_h, remote_id, desc->bd_portal, 0,
- xid, 0);
+ if (desc->bd_type == BULK_PUT_SOURCE)
+ rc = PtlPut (desc->bd_md_h, PTL_ACK_REQ, remote_id,
+ desc->bd_portal, 0, xid, 0, 0);
+ else
+ rc = PtlGet (desc->bd_md_h, remote_id,
+ desc->bd_portal, 0, xid, 0);
+
if (rc != PTL_OK) {
- desc->bd_network_rw = 0;
- CERROR("PtlGet("LPU64", %d, "LPX64") failed: %d\n",
+ /* Can't send, so we unlink the MD bound above. The UNLINK
+ * event this creates will signal completion with failure,
+ * so we return SUCCESS here! */
+ CERROR("Transfer("LPU64", %d, "LPX64") failed: %d\n",
remote_id.nid, desc->bd_portal, xid, rc);
rc2 = PtlMDUnlink(desc->bd_md_h);
LASSERT (rc2 == PTL_OK);
- RETURN((rc == PTL_NOSPACE) ? -ENOMEM : -ECOMM);
}
RETURN(0);
{
/* Server side bulk abort. Idempotent. Not thread-safe (i.e. only
* serialises with completion callback) */
- unsigned long flags;
struct l_wait_info lwi;
- int callback_count;
int rc;
LASSERT (!in_interrupt ()); /* might sleep */
- /* NB. server-side bulk gets 2 events, so we have to keep trying to
- * unlink the MD until all callbacks have happened, or
- * PtlMDUnlink() returns OK or INVALID */
- again:
- spin_lock_irqsave (&desc->bd_lock, flags);
- if (!desc->bd_network_rw) {
- /* completed or never even registered. NB holding bd_lock
- * guarantees callback has completed if it ran. */
- spin_unlock_irqrestore (&desc->bd_lock, flags);
- return;
- }
-
- /* sample callback count while we have the lock */
- callback_count = desc->bd_callback_count;
- spin_unlock_irqrestore (&desc->bd_lock, flags);
+ if (!ptlrpc_bulk_active(desc)) /* completed or */
+ return; /* never started */
+
+ /* The unlink ensures the callback happens ASAP and is the last
+ * one. If it fails, it must be because completion just
+ * happened. */
rc = PtlMDUnlink (desc->bd_md_h);
- switch (rc) {
- default:
- CERROR("PtlMDUnlink returned %d\n", rc);
- LBUG ();
- case PTL_OK: /* Won the race with the network */
- LASSERT (!desc->bd_complete); /* Not all callbacks ran */
- desc->bd_network_rw = 0;
- return;
-
- case PTL_MD_INUSE: /* MD is being accessed right now */
- for (;;) {
- /* Network access will complete in finite time but the
- * timeout lets us CERROR for visibility */
- lwi = LWI_TIMEOUT (10 * HZ, NULL, NULL);
- rc = l_wait_event(desc->bd_waitq,
- desc->bd_callback_count !=
- callback_count, &lwi);
- if (rc == -ETIMEDOUT) {
- CERROR("Unexpectedly long timeout: desc %p\n",
- desc);
- continue;
- }
- LASSERT (rc == 0);
- break;
- }
- /* go back and try again... */
- goto again;
-
- case PTL_INV_MD: /* Lost the race with completion */
- LASSERT (desc->bd_complete); /* Callbacks all ran */
- LASSERT (!desc->bd_network_rw);
+ if (rc == PTL_INV_MD) {
+ LASSERT(!ptlrpc_bulk_active(desc));
return;
}
+
+ LASSERT (rc == PTL_OK);
+
+ for (;;) {
+ /* Network access will complete in finite time but the HUGE
+ * timeout lets us CWARN for visibility of sluggish NALs */
+ lwi = LWI_TIMEOUT (300 * HZ, NULL, NULL);
+ rc = l_wait_event(desc->bd_waitq,
+ !ptlrpc_bulk_active(desc), &lwi);
+ if (rc == 0)
+ return;
+
+ LASSERT(rc == -ETIMEDOUT);
+ CWARN("Unexpectedly long timeout: desc %p\n", desc);
+ }
}
int ptlrpc_register_bulk (struct ptlrpc_request *req)
{
struct ptlrpc_bulk_desc *desc = req->rq_bulk;
struct ptlrpc_peer *peer;
- struct list_head *tmp, *next;
int rc;
int rc2;
- ptl_kiov_t *iov;
ptl_process_id_t source_id;
+ ptl_handle_me_t me_h;
+ ptl_md_t md;
ENTRY;
/* NB no locking required until desc is on the network */
+ LASSERT (desc->bd_nob > 0);
LASSERT (!desc->bd_network_rw);
LASSERT (desc->bd_page_count <= PTL_MD_MAX_PAGES);
LASSERT (desc->bd_req != NULL);
LASSERT (desc->bd_type == BULK_PUT_SINK ||
desc->bd_type == BULK_GET_SOURCE);
- desc->bd_complete = 0;
-
- iov = ptlrpc_get_bulk_iov (desc);
- if (iov == NULL)
- return (-ENOMEM);
+ desc->bd_success = 0;
peer = &desc->bd_import->imp_connection->c_peer;
- desc->bd_md.start = iov;
- desc->bd_md.niov = 0;
- desc->bd_md.length = 0;
- desc->bd_md.threshold = 1;
- desc->bd_md.user_ptr = desc;
-
- if (desc->bd_type == BULK_GET_SOURCE) {
- desc->bd_md.options = PTL_MD_OP_GET | PTL_MD_KIOV;
- desc->bd_md.eventq = peer->peer_ni->pni_bulk_get_source_eq_h;
- } else {
- desc->bd_md.options = PTL_MD_OP_PUT | PTL_MD_KIOV;
- desc->bd_md.eventq = peer->peer_ni->pni_bulk_put_sink_eq_h;
- }
-
- list_for_each_safe(tmp, next, &desc->bd_page_list) {
- struct ptlrpc_bulk_page *bulk;
- bulk = list_entry(tmp, struct ptlrpc_bulk_page, bp_link);
-
- LASSERT(desc->bd_md.niov < desc->bd_page_count);
-
- iov[desc->bd_md.niov].kiov_page = bulk->bp_page;
- iov[desc->bd_md.niov].kiov_len = bulk->bp_buflen;
- iov[desc->bd_md.niov].kiov_offset = bulk->bp_pageoffset;
-
- LASSERT (bulk->bp_pageoffset + bulk->bp_buflen <= PAGE_SIZE);
- desc->bd_md.niov++;
- desc->bd_md.length += bulk->bp_buflen;
- }
-
- LASSERT(desc->bd_md.niov == desc->bd_page_count);
- LASSERT(desc->bd_md.niov != 0);
+ md.start = &desc->bd_iov[0];
+ md.niov = desc->bd_page_count;
+ md.length = desc->bd_nob;
+ md.eventq = peer->peer_ni->pni_eq_h;
+ md.threshold = 1; /* PUT or GET */
+ md.options = (desc->bd_type == BULK_GET_SOURCE) ?
+ PTL_MD_OP_GET : PTL_MD_OP_PUT;
+#ifdef __KERNEL__
+ md.options |= PTL_MD_KIOV;
+#else
+ md.options |= PTL_MD_IOV;
+#endif
+ md.user_ptr = &desc->bd_cbid;
+ LASSERT (desc->bd_cbid.cbid_fn == client_bulk_callback);
+ LASSERT (desc->bd_cbid.cbid_arg == desc);
/* XXX Registering the same xid on retried bulk makes my head
* explode trying to understand how the original request's bulk
* might interfere with the retried request -eeb */
LASSERT (!desc->bd_registered || req->rq_xid != desc->bd_last_xid);
desc->bd_registered = 1;
- desc->bd_last_xid = desc->bd_last_xid;
+ desc->bd_last_xid = req->rq_xid;
source_id.nid = desc->bd_import->imp_connection->c_peer.peer_nid;
source_id.pid = PTL_PID_ANY;
rc = PtlMEAttach(peer->peer_ni->pni_ni_h,
desc->bd_portal, source_id, req->rq_xid, 0,
- PTL_UNLINK, PTL_INS_AFTER, &desc->bd_me_h);
-
+ PTL_UNLINK, PTL_INS_AFTER, &me_h);
if (rc != PTL_OK) {
CERROR("PtlMEAttach failed: %d\n", rc);
LASSERT (rc == PTL_NOSPACE);
- GOTO(out, rc = -ENOMEM);
+ RETURN (-ENOMEM);
}
/* About to let the network at it... */
desc->bd_network_rw = 1;
- rc = PtlMDAttach(desc->bd_me_h, desc->bd_md, PTL_UNLINK,
- &desc->bd_md_h);
+ rc = PtlMDAttach(me_h, md, PTL_UNLINK, &desc->bd_md_h);
if (rc != PTL_OK) {
CERROR("PtlMDAttach failed: %d\n", rc);
LASSERT (rc == PTL_NOSPACE);
desc->bd_network_rw = 0;
- rc2 = PtlMEUnlink (desc->bd_me_h);
+ rc2 = PtlMEUnlink (me_h);
LASSERT (rc2 == PTL_OK);
- GOTO(out, rc = -ENOMEM);
+ RETURN (-ENOMEM);
}
- rc = 0;
CDEBUG(D_NET, "Setup bulk %s buffers: %u pages %u bytes, xid "LPX64", "
"portal %u on %s\n",
desc->bd_type == BULK_GET_SOURCE ? "get-source" : "put-sink",
- desc->bd_md.niov, desc->bd_md.length,
+ md.niov, md.length,
req->rq_xid, desc->bd_portal, peer->peer_ni->pni_name);
-
- out:
- ptlrpc_put_bulk_iov (desc, iov);
- RETURN(rc);
+ RETURN(0);
}
void ptlrpc_unregister_bulk (struct ptlrpc_request *req)
* thread-safe (i.e. only interlocks with completion callback). */
struct ptlrpc_bulk_desc *desc = req->rq_bulk;
wait_queue_head_t *wq;
- unsigned long flags;
struct l_wait_info lwi;
int rc;
LASSERT (!in_interrupt ()); /* might sleep */
- spin_lock_irqsave (&desc->bd_lock, flags);
- if (!desc->bd_network_rw) { /* completed or never even registered */
- spin_unlock_irqrestore (&desc->bd_lock, flags);
- return;
- }
- spin_unlock_irqrestore (&desc->bd_lock, flags);
+ if (!ptlrpc_bulk_active(desc)) /* completed or */
+ return; /* never registered */
+
+ LASSERT (desc->bd_req == req); /* bd_req NULL until registered */
- LASSERT (desc->bd_req == req); /* NB bd_req NULL until registered */
+ /* the unlink ensures the callback happens ASAP and is the last
+ * one. If it fails, it must be because completion just
+ * happened. */
- /* NB...
- * 1. If the MD unlink is successful, the ME gets unlinked too.
- * 2. Since client-side bulk only gets a single event and a
- * .. threshold of 1. If the MD was inuse at the first link
- * .. attempt, the callback is due any minute, and the MD/ME will
- * .. unlink themselves.
- */
rc = PtlMDUnlink (desc->bd_md_h);
- switch (rc) {
- default:
- CERROR("PtlMDUnlink returned %d\n", rc);
- LBUG ();
- case PTL_OK: /* Won the race with completion */
- LASSERT (!desc->bd_complete); /* Callback hasn't happened */
- desc->bd_network_rw = 0;
- return;
- case PTL_MD_INUSE: /* MD is being accessed right now */
- for (;;) {
- /* Network access will complete in finite time but the
- * timeout lets us CERROR for visibility */
- if (desc->bd_req->rq_set != NULL)
- wq = &req->rq_set->set_waitq;
- else
- wq = &req->rq_reply_waitq;
- lwi = LWI_TIMEOUT (10 * HZ, NULL, NULL);
- rc = l_wait_event(*wq, ptlrpc_bulk_complete(desc), &lwi);
- LASSERT (rc == 0 || rc == -ETIMEDOUT);
- if (rc == 0)
- break;
- CERROR ("Unexpectedly long timeout: desc %p\n", desc);
- LBUG();
- }
- /* Fall through */
- case PTL_INV_MD: /* Lost the race with completion */
- LASSERT (desc->bd_complete);/* Callback has run to completion */
- LASSERT (!desc->bd_network_rw);
+ if (rc == PTL_INV_MD) {
+ LASSERT(!ptlrpc_bulk_active(desc));
return;
}
+
+ LASSERT (rc == PTL_OK);
+
+ if (desc->bd_req->rq_set != NULL)
+ wq = &req->rq_set->set_waitq;
+ else
+ wq = &req->rq_reply_waitq;
+
+ for (;;) {
+ /* Network access will complete in finite time but the HUGE
+ * timeout lets us CWARN for visibility of sluggish NALs */
+ lwi = LWI_TIMEOUT (300 * HZ, NULL, NULL);
+ rc = l_wait_event(*wq, !ptlrpc_bulk_active(desc), &lwi);
+ if (rc == 0)
+ return;
+
+ LASSERT (rc == -ETIMEDOUT);
+ CWARN("Unexpectedly long timeout: desc %p\n", desc);
+ }
}
-int ptlrpc_reply(struct ptlrpc_request *req)
+int ptlrpc_send_reply (struct ptlrpc_request *req, int may_be_difficult)
{
- struct ptlrpc_connection *conn;
- unsigned long flags;
- int rc;
+ struct ptlrpc_service *svc = req->rq_rqbd->rqbd_srv_ni->sni_service;
+ struct ptlrpc_reply_state *rs = req->rq_reply_state;
+ struct ptlrpc_connection *conn;
+ int rc;
/* We must already have a reply buffer (only ptlrpc_error() may be
* called without one). We must also have a request buffer which
* is either the actual (swabbed) incoming request, or a saved copy
* if this is a req saved in target_queue_final_reply(). */
- LASSERT (req->rq_repmsg != NULL);
LASSERT (req->rq_reqmsg != NULL);
+ LASSERT (rs != NULL);
+ LASSERT (req->rq_repmsg != NULL);
+ LASSERT (may_be_difficult || !rs->rs_difficult);
+ LASSERT (req->rq_repmsg == &rs->rs_msg);
+ LASSERT (rs->rs_cb_id.cbid_fn == reply_out_callback);
+ LASSERT (rs->rs_cb_id.cbid_arg == rs);
- /* FIXME: we need to increment the count of handled events */
+ LASSERT (req->rq_repmsg != NULL);
if (req->rq_type != PTL_RPC_MSG_ERR)
req->rq_type = PTL_RPC_MSG_REPLY;
+ req->rq_repmsg->type = req->rq_type;
req->rq_repmsg->status = req->rq_status;
- req->rq_repmsg->opc = req->rq_reqmsg->opc;
+ req->rq_repmsg->opc = req->rq_reqmsg->opc;
if (req->rq_export == NULL)
conn = ptlrpc_get_connection(&req->rq_peer, NULL);
else
conn = ptlrpc_connection_addref(req->rq_export->exp_connection);
- init_waitqueue_head(&req->rq_reply_waitq);
- rc = ptl_send_buf(req, conn,
- req->rq_svc->srv_rep_portal);
- if (rc != 0) {
- /* Do what the callback handler would have done */
- OBD_FREE (req->rq_repmsg, req->rq_replen);
+ atomic_inc (&svc->srv_outstanding_replies);
- spin_lock_irqsave (&req->rq_lock, flags);
- req->rq_want_ack = 0;
- spin_unlock_irqrestore (&req->rq_lock, flags);
+ rc = ptl_send_buf (&rs->rs_md_h, req->rq_repmsg, req->rq_replen,
+ rs->rs_difficult ? PTL_ACK_REQ : PTL_NOACK_REQ,
+ &rs->rs_cb_id, conn,
+ svc->srv_rep_portal, req->rq_xid);
+ if (rc != 0) {
+ atomic_dec (&svc->srv_outstanding_replies);
+
+ if (!rs->rs_difficult) {
+ /* Callers other than target_send_reply() expect me
+ * to clean up on a comms error */
+ lustre_free_reply_state (rs);
+ req->rq_reply_state = NULL;
+ req->rq_repmsg = NULL;
+ }
}
ptlrpc_put_connection(conn);
return rc;
}
+int ptlrpc_reply (struct ptlrpc_request *req)
+{
+ return (ptlrpc_send_reply (req, 0));
+}
+
int ptlrpc_error(struct ptlrpc_request *req)
{
int rc;
RETURN(rc);
}
-
req->rq_type = PTL_RPC_MSG_ERR;
- rc = ptlrpc_reply(req);
+ rc = ptlrpc_send_reply (req, 0);
RETURN(rc);
}
unsigned long flags;
ptl_process_id_t source_id;
ptl_handle_me_t reply_me_h;
+ ptl_md_t reply_md;
ENTRY;
LASSERT (request->rq_type == PTL_RPC_MSG_REQUEST);
}
request->rq_reqmsg->handle = request->rq_import->imp_remote_handle;
+ request->rq_reqmsg->type = PTL_RPC_MSG_REQUEST;
request->rq_reqmsg->conn_cnt = request->rq_import->imp_conn_cnt;
source_id.nid = connection->c_peer.peer_nid;
OBD_ALLOC(request->rq_repmsg, request->rq_replen);
if (request->rq_repmsg == NULL) {
LBUG();
- RETURN(-ENOMEM);
+ GOTO(cleanup_bulk, rc = -ENOMEM);
}
rc = PtlMEAttach(connection->c_peer.peer_ni->pni_ni_h,
CERROR("PtlMEAttach failed: %d\n", rc);
LASSERT (rc == PTL_NOSPACE);
LBUG();
- GOTO(cleanup, rc = -ENOMEM);
+ GOTO(cleanup_repmsg, rc = -ENOMEM);
}
- request->rq_reply_md.start = request->rq_repmsg;
- request->rq_reply_md.length = request->rq_replen;
- request->rq_reply_md.threshold = 1;
- request->rq_reply_md.options = PTL_MD_OP_PUT;
- request->rq_reply_md.user_ptr = request;
- request->rq_reply_md.eventq =
- connection->c_peer.peer_ni->pni_reply_in_eq_h;
+ spin_lock_irqsave (&request->rq_lock, flags);
+ /* If the MD attach succeeds, there _will_ be a reply_in callback */
+ request->rq_receiving_reply = 1;
+ /* Clear any flags that may be present from previous sends. */
+ request->rq_replied = 0;
+ request->rq_err = 0;
+ request->rq_timedout = 0;
+ request->rq_resend = 0;
+ request->rq_restart = 0;
+ spin_unlock_irqrestore (&request->rq_lock, flags);
- rc = PtlMDAttach(reply_me_h, request->rq_reply_md,
- PTL_UNLINK, &request->rq_reply_md_h);
+ reply_md.start = request->rq_repmsg;
+ reply_md.length = request->rq_replen;
+ reply_md.threshold = 1;
+ reply_md.options = PTL_MD_OP_PUT;
+ reply_md.user_ptr = &request->rq_reply_cbid;
+ reply_md.eventq = connection->c_peer.peer_ni->pni_eq_h;
+
+ rc = PtlMDAttach(reply_me_h, reply_md, PTL_UNLINK,
+ &request->rq_reply_md_h);
if (rc != PTL_OK) {
CERROR("PtlMDAttach failed: %d\n", rc);
LASSERT (rc == PTL_NOSPACE);
LBUG();
- GOTO(cleanup2, rc -ENOMEM);
+ GOTO(cleanup_me, rc -ENOMEM);
}
CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid "LPU64
request->rq_reply_portal,
connection->c_peer.peer_ni->pni_name);
- ptlrpc_request_addref(request); /* 1 ref for the SENT callback */
-
- spin_lock_irqsave (&request->rq_lock, flags);
- request->rq_receiving_reply = 1;
- /* Clear any flags that may be present from previous sends. */
- request->rq_replied = 0;
- request->rq_err = 0;
- request->rq_timedout = 0;
- request->rq_resend = 0;
- request->rq_restart = 0;
- spin_unlock_irqrestore (&request->rq_lock, flags);
+ ptlrpc_request_addref(request); /* +1 ref for the SENT callback */
request->rq_sent = LTIME_S(CURRENT_TIME);
ptlrpc_pinger_sending_on_import(request->rq_import);
- rc = ptl_send_buf(request, connection, request->rq_request_portal);
+ rc = ptl_send_buf(&request->rq_req_md_h,
+ request->rq_reqmsg, request->rq_reqlen,
+ PTL_NOACK_REQ, &request->rq_req_cbid,
+ connection,
+ request->rq_request_portal,
+ request->rq_xid);
if (rc == 0) {
ptlrpc_lprocfs_rpc_sent(request);
RETURN(rc);
}
- spin_lock_irqsave (&request->rq_lock, flags);
- request->rq_receiving_reply = 0;
- spin_unlock_irqrestore (&request->rq_lock, flags);
ptlrpc_req_finished (request); /* drop callback ref */
- cleanup2:
+
+ cleanup_me:
/* MEUnlink is safe; the PUT didn't even get off the ground, and
* nobody apart from the PUT's target has the right nid+XID to
* access the reply buffer. */
rc2 = PtlMEUnlink(reply_me_h);
LASSERT (rc2 == PTL_OK);
- cleanup:
+ /* UNLINKED callback called synchronously */
+ LASSERT (!request->rq_receiving_reply);
+
+ cleanup_repmsg:
OBD_FREE(request->rq_repmsg, request->rq_replen);
request->rq_repmsg = NULL;
+
+ cleanup_bulk:
+ if (request->rq_bulk != NULL)
+ ptlrpc_unregister_bulk(request);
+
return rc;
}
-void ptlrpc_link_svc_me(struct ptlrpc_request_buffer_desc *rqbd)
+void ptlrpc_register_rqbd (struct ptlrpc_request_buffer_desc *rqbd)
{
- struct ptlrpc_srv_ni *srv_ni = rqbd->rqbd_srv_ni;
- struct ptlrpc_service *service = srv_ni->sni_service;
- static ptl_process_id_t match_id = {PTL_NID_ANY, PTL_PID_ANY};
- int rc;
- ptl_md_t dummy;
- ptl_handle_md_t md_h;
-
- LASSERT(atomic_read(&rqbd->rqbd_refcount) == 0);
+ struct ptlrpc_srv_ni *srv_ni = rqbd->rqbd_srv_ni;
+ struct ptlrpc_service *service = srv_ni->sni_service;
+ static ptl_process_id_t match_id = {PTL_NID_ANY, PTL_PID_ANY};
+ int rc;
+ ptl_md_t md;
+ ptl_handle_me_t me_h;
+ unsigned long flags;
CDEBUG(D_NET, "PtlMEAttach: portal %d on %s h %lx."LPX64"\n",
service->srv_req_portal, srv_ni->sni_ni->pni_name,
srv_ni->sni_ni->pni_ni_h.nal_idx,
srv_ni->sni_ni->pni_ni_h.cookie);
- /* Attach the leading ME on which we build the ring */
rc = PtlMEAttach(srv_ni->sni_ni->pni_ni_h, service->srv_req_portal,
- match_id, 0, ~0,
- PTL_UNLINK, PTL_INS_AFTER, &rqbd->rqbd_me_h);
+ match_id, 0, ~0, PTL_UNLINK, PTL_INS_AFTER, &me_h);
if (rc != PTL_OK) {
CERROR("PtlMEAttach failed: %d\n", rc);
- /* BUG 1191 */
- LBUG();
+ GOTO (failed, NULL);
}
- dummy.start = rqbd->rqbd_buffer;
- dummy.length = service->srv_buf_size;
- dummy.max_size = service->srv_max_req_size;
- dummy.threshold = PTL_MD_THRESH_INF;
- dummy.options = PTL_MD_OP_PUT | PTL_MD_MAX_SIZE | PTL_MD_AUTO_UNLINK;
- dummy.user_ptr = rqbd;
- dummy.eventq = srv_ni->sni_eq_h;
-
- atomic_inc(&srv_ni->sni_nrqbds_receiving);
- atomic_set(&rqbd->rqbd_refcount, 1); /* 1 ref for portals */
-
- rc = PtlMDAttach(rqbd->rqbd_me_h, dummy, PTL_UNLINK, &md_h);
- if (rc != PTL_OK) {
- CERROR("PtlMDAttach failed: %d\n", rc);
- LASSERT (rc == PTL_NOSPACE);
- LBUG();
- /* BUG 1191 */
- PtlMEUnlink (rqbd->rqbd_me_h);
- atomic_set(&rqbd->rqbd_refcount, 0);
- atomic_dec(&srv_ni->sni_nrqbds_receiving);
+ LASSERT(rqbd->rqbd_refcount == 0);
+ rqbd->rqbd_refcount = 1;
+
+ md.start = rqbd->rqbd_buffer;
+ md.length = service->srv_buf_size;
+ md.max_size = service->srv_max_req_size;
+ md.threshold = PTL_MD_THRESH_INF;
+ md.options = PTL_MD_OP_PUT | PTL_MD_MAX_SIZE | PTL_MD_AUTO_UNLINK;
+ md.user_ptr = &rqbd->rqbd_cbid;
+ md.eventq = srv_ni->sni_ni->pni_eq_h;
+
+ spin_lock_irqsave (&service->srv_lock, flags);
+ srv_ni->sni_nrqbd_receiving++;
+ spin_unlock_irqrestore (&service->srv_lock, flags);
+
+ rc = PtlMDAttach(me_h, md, PTL_UNLINK, &rqbd->rqbd_md_h);
+ if (rc == PTL_OK)
+ return;
+
+ CERROR("PtlMDAttach failed: %d\n", rc);
+ LASSERT (rc == PTL_NOSPACE);
+ rc = PtlMEUnlink (me_h);
+ LASSERT (rc == PTL_OK);
+
+ spin_lock_irqsave (&service->srv_lock, flags);
+ srv_ni->sni_nrqbd_receiving--;
+ if (srv_ni->sni_nrqbd_receiving == 0) {
+ /* This service is off-air on this interface because all
+ * its request buffers are busy. Portals will have started
+ * dropping incoming requests until more buffers get
+ * posted */
+ CERROR("All %s %s request buffers busy\n",
+ service->srv_name, srv_ni->sni_ni->pni_name);
}
+ spin_unlock_irqrestore (&service->srv_lock, flags);
+
+ failed:
+ LBUG(); /* BUG 1191 */
+ /* put req on a retry list? */
}
#endif
#include <linux/obd_support.h>
+#include <linux/obd_class.h>
#include <linux/lustre_net.h>
return (msg->magic == __swab32(PTLRPC_MSG_MAGIC));
}
-static int lustre_pack_msg(int count, int *lens, char **bufs, int *len,
- struct lustre_msg **msg)
+static void
+lustre_init_msg (struct lustre_msg *msg, int count, int *lens, char **bufs)
{
char *ptr;
- struct lustre_msg *m;
- int size = 0, i;
-
- size = HDR_SIZE (count);
+ int i;
+
+ msg->magic = PTLRPC_MSG_MAGIC;
+ msg->version = PTLRPC_MSG_VERSION;
+ msg->bufcount = count;
for (i = 0; i < count; i++)
- size += size_round(lens[i]);
-
- *len = size;
+ msg->buflens[i] = lens[i];
- OBD_ALLOC(*msg, *len);
- if (!*msg)
- RETURN(-ENOMEM);
-
- m = *msg;
- m->magic = PTLRPC_MSG_MAGIC;
- m->version = PTLRPC_MSG_VERSION;
- m->bufcount = count;
- for (i = 0; i < count; i++)
- m->buflens[i] = lens[i];
+ if (bufs == NULL)
+ return;
- ptr = (char *)m + HDR_SIZE(count);
+ ptr = (char *)msg + HDR_SIZE(count);
for (i = 0; i < count; i++) {
- char *tmp = NULL;
- if (bufs)
- tmp = bufs[i];
+ char *tmp = bufs[i];
LOGL(tmp, lens[i], ptr);
-
}
+}
+
+int lustre_pack_request (struct ptlrpc_request *req,
+ int count, int *lens, char **bufs)
+{
+ ENTRY;
+
+ req->rq_reqlen = lustre_msg_size (count, lens);
+ OBD_ALLOC(req->rq_reqmsg, req->rq_reqlen);
+ if (req->rq_reqmsg == NULL)
+ RETURN(-ENOMEM);
- return 0;
+ lustre_init_msg (req->rq_reqmsg, count, lens, bufs);
+ RETURN (0);
}
-int lustre_pack_request(struct ptlrpc_request *req, int count, int *lens,
- char **bufs)
+#if RS_DEBUG
+LIST_HEAD(ptlrpc_rs_debug_lru);
+spinlock_t ptlrpc_rs_debug_lock = SPIN_LOCK_UNLOCKED;
+
+#define PTLRPC_RS_DEBUG_LRU_ADD(rs) \
+do { \
+ unsigned long __flags; \
+ \
+ spin_lock_irqsave(&ptlrpc_rs_debug_lock, __flags); \
+ list_add_tail(&(rs)->rs_debug_list, &ptlrpc_rs_debug_lru); \
+ spin_unlock_irqrestore(&ptlrpc_rs_debug_lock, __flags); \
+} while (0)
+
+#define PTLRPC_RS_DEBUG_LRU_DEL(rs) \
+do { \
+ unsigned long __flags; \
+ \
+ spin_lock_irqsave(&ptlrpc_rs_debug_lock, __flags); \
+ list_del(&(rs)->rs_debug_list); \
+ spin_unlock_irqrestore(&ptlrpc_rs_debug_lock, __flags); \
+} while (0)
+#else
+# define PTLRPC_RS_DEBUG_LRU_ADD(rs) do {} while(0)
+# define PTLRPC_RS_DEBUG_LRU_DEL(rs) do {} while(0)
+#endif
+
+int lustre_pack_reply (struct ptlrpc_request *req,
+ int count, int *lens, char **bufs)
{
- return lustre_pack_msg(count, lens, bufs, &req->rq_reqlen,
- &req->rq_reqmsg);
+ struct ptlrpc_reply_state *rs;
+ int msg_len;
+ int size;
+ ENTRY;
+
+ LASSERT (req->rq_reply_state == NULL);
+
+ msg_len = lustre_msg_size (count, lens);
+ size = offsetof (struct ptlrpc_reply_state, rs_msg) + msg_len;
+ OBD_ALLOC (rs, size);
+ if (rs == NULL)
+ RETURN (-ENOMEM);
+
+ rs->rs_cb_id.cbid_fn = reply_out_callback;
+ rs->rs_cb_id.cbid_arg = rs;
+ rs->rs_srv_ni = req->rq_rqbd->rqbd_srv_ni;
+ rs->rs_size = size;
+ INIT_LIST_HEAD(&rs->rs_exp_list);
+ INIT_LIST_HEAD(&rs->rs_obd_list);
+
+ req->rq_replen = msg_len;
+ req->rq_reply_state = rs;
+ req->rq_repmsg = &rs->rs_msg;
+ lustre_init_msg (&rs->rs_msg, count, lens, bufs);
+
+ PTLRPC_RS_DEBUG_LRU_ADD(rs);
+
+ RETURN (0);
}
-int lustre_pack_reply(struct ptlrpc_request *req, int count, int *lens,
- char **bufs)
+void lustre_free_reply_state (struct ptlrpc_reply_state *rs)
{
- return lustre_pack_msg(count, lens, bufs, &req->rq_replen,
- &req->rq_repmsg);
+ PTLRPC_RS_DEBUG_LRU_DEL(rs);
+
+ LASSERT (!rs->rs_difficult || rs->rs_handled);
+ LASSERT (!rs->rs_on_net);
+ LASSERT (!rs->rs_scheduled);
+ LASSERT (rs->rs_export == NULL);
+ LASSERT (rs->rs_nlocks == 0);
+ LASSERT (list_empty(&rs->rs_exp_list));
+ LASSERT (list_empty(&rs->rs_obd_list));
+
+ OBD_FREE (rs, rs->rs_size);
}
/* This returns the size of the buffer that is required to hold a lustre_msg
#include <linux/obd_class.h>
#include "ptlrpc_internal.h"
-#ifdef __KERNEL__
-
-static struct ptlrpc_thread *pinger_thread = NULL;
static DECLARE_MUTEX(pinger_sem);
static struct list_head pinger_imports = LIST_HEAD_INIT(pinger_imports);
+#ifdef __KERNEL__
+static struct ptlrpc_thread *pinger_thread = NULL;
+
static int ptlrpc_pinger_main(void *arg)
{
struct ptlrpc_svc_data *data = (struct ptlrpc_svc_data *)arg;
RETURN(0);
}
-#else /* !__KERNEL__ */
+#else
+/* XXX
+ * the current implementation of pinger in liblustre is not optimized
+ */
+
+static struct pinger_data {
+ int pd_recursion;
+ unsigned long pd_this_ping;
+ unsigned long pd_next_ping;
+ struct ptlrpc_request_set *pd_set;
+} pinger_args;
+
+static int pinger_check_rpcs(void *arg)
+{
+ unsigned long curtime = time(NULL);
+ struct ptlrpc_request *req;
+ struct ptlrpc_request_set *set;
+ struct list_head *iter;
+ struct pinger_data *pd = &pinger_args;
+ int rc;
+
+ /* prevent recursion */
+ if (pd->pd_recursion++) {
+ CDEBUG(D_HA, "pinger: recursion! quit\n");
+ LASSERT(pd->pd_set);
+ pd->pd_recursion--;
+ return 0;
+ }
+
+ /* have we reached ping point? */
+ if (!pd->pd_set && pd->pd_next_ping > curtime) {
+ pd->pd_recursion--;
+ return 0;
+ }
+
+ /* if we have rpc_set already, continue processing it */
+ if (pd->pd_set) {
+ LASSERT(pd->pd_this_ping);
+ set = pd->pd_set;
+ goto do_check_set;
+ }
+
+ pd->pd_this_ping = curtime;
+ pd->pd_set = ptlrpc_prep_set();
+ set = pd->pd_set;
+
+ /* add rpcs into set */
+ down(&pinger_sem);
+ list_for_each(iter, &pinger_imports) {
+ struct obd_import *imp =
+ list_entry(iter, struct obd_import,
+ imp_pinger_chain);
+ int generation, level;
+ unsigned long flags;
+
+ if (imp->imp_next_ping <= pd->pd_this_ping) {
+ /* Add a ping. */
+ spin_lock_irqsave(&imp->imp_lock, flags);
+ generation = imp->imp_generation;
+ level = imp->imp_state;
+ spin_unlock_irqrestore(&imp->imp_lock, flags);
+
+ if (level != LUSTRE_IMP_FULL) {
+ CDEBUG(D_HA,
+ "not pinging %s (in recovery)\n",
+ imp->imp_target_uuid.uuid);
+ continue;
+ }
+
+ req = ptlrpc_prep_req(imp, OBD_PING, 0, NULL,
+ NULL);
+ if (!req) {
+ CERROR("out of memory\n");
+ break;
+ }
+ req->rq_no_resend = 1;
+ req->rq_replen = lustre_msg_size(0, NULL);
+ req->rq_send_state = LUSTRE_IMP_FULL;
+ req->rq_phase = RQ_PHASE_RPC;
+ req->rq_import_generation = generation;
+ ptlrpc_set_add_req(set, req);
+ } else {
+ CDEBUG(D_HA, "don't need to ping %s (%lu > "
+ "%lu)\n", imp->imp_target_uuid.uuid,
+ imp->imp_next_ping, pd->pd_this_ping);
+ }
+ }
+ pd->pd_this_ping = curtime;
+ up(&pinger_sem);
+
+ /* Might be empty, that's OK. */
+ if (set->set_remaining == 0)
+ CDEBUG(D_HA, "nothing to ping\n");
+
+ list_for_each(iter, &set->set_requests) {
+ struct ptlrpc_request *req =
+ list_entry(iter, struct ptlrpc_request,
+ rq_set_chain);
+ DEBUG_REQ(D_HA, req, "pinging %s->%s",
+ req->rq_import->imp_obd->obd_uuid.uuid,
+ req->rq_import->imp_target_uuid.uuid);
+ (void)ptl_send_rpc(req);
+ }
+
+do_check_set:
+ rc = ptlrpc_check_set(set);
+
+ /* not finished, and we are not expired, simply return */
+ if (!rc && curtime < pd->pd_this_ping + obd_timeout) {
+ CDEBUG(D_HA, "not finished, but also not expired\n");
+ pd->pd_recursion--;
+ return 0;
+ }
+
+ /* Expire all the requests that didn't come back. */
+ down(&pinger_sem);
+ list_for_each(iter, &set->set_requests) {
+ req = list_entry(iter, struct ptlrpc_request,
+ rq_set_chain);
+
+ if (req->rq_replied)
+ continue;
+
+ req->rq_phase = RQ_PHASE_COMPLETE;
+ set->set_remaining--;
+ /* If it was disconnected, don't sweat it. */
+ if (list_empty(&req->rq_import->imp_pinger_chain)) {
+ ptlrpc_unregister_reply(req);
+ continue;
+ }
+
+ CDEBUG(D_HA, "pinger initiate expire_one_request\n");
+ ptlrpc_expire_one_request(req);
+ }
+ up(&pinger_sem);
+
+ ptlrpc_set_destroy(set);
+ pd->pd_set = NULL;
+
+ pd->pd_next_ping = pd->pd_this_ping + obd_timeout;
+ pd->pd_this_ping = 0; /* XXX for debug */
+
+ CDEBUG(D_HA, "finished a round ping\n");
+ pd->pd_recursion--;
+ return 0;
+}
+
+static void *pinger_callback = NULL;
int ptlrpc_start_pinger(void)
{
+ memset(&pinger_args, 0, sizeof(pinger_args));
+#ifdef ENABLE_PINGER
+ pinger_callback =
+ liblustre_register_wait_callback(&pinger_check_rpcs, &pinger_args);
+#endif
+ obd_timeout = 10;
return 0;
}
int ptlrpc_stop_pinger(void)
{
+#ifdef ENABLE_PINGER
+ if (pinger_callback)
+ liblustre_deregister_wait_callback(pinger_callback);
+#endif
return 0;
}
-int ptlrpc_pinger_add_import(struct obd_import *imp)
+void ptlrpc_pinger_sending_on_import(struct obd_import *imp)
{
- return 0;
+ down(&pinger_sem);
+ imp->imp_next_ping = time(NULL) + obd_timeout;
+ if (pinger_args.pd_set == NULL &&
+ pinger_args.pd_next_ping > imp->imp_next_ping) {
+ CDEBUG(D_HA, "set next ping to %ld(cur %ld)\n",
+ imp->imp_next_ping, time(NULL));
+ pinger_args.pd_next_ping = imp->imp_next_ping;
+ }
+ up(&pinger_sem);
}
-int ptlrpc_pinger_del_import(struct obd_import *imp)
+int ptlrpc_pinger_add_import(struct obd_import *imp)
{
- return 0;
+ ENTRY;
+ if (!list_empty(&imp->imp_pinger_chain))
+ RETURN(-EALREADY);
+
+ CDEBUG(D_HA, "adding pingable import %s->%s\n",
+ imp->imp_obd->obd_uuid.uuid, imp->imp_target_uuid.uuid);
+ ptlrpc_pinger_sending_on_import(imp);
+
+ down(&pinger_sem);
+ list_add_tail(&imp->imp_pinger_chain, &pinger_imports);
+ class_import_get(imp);
+ up(&pinger_sem);
+
+ RETURN(0);
}
-void ptlrpc_pinger_sending_on_import(struct obd_import *imp)
+int ptlrpc_pinger_del_import(struct obd_import *imp)
{
+ ENTRY;
+ if (list_empty(&imp->imp_pinger_chain))
+ RETURN(-ENOENT);
+
+ down(&pinger_sem);
+ list_del_init(&imp->imp_pinger_chain);
+ CDEBUG(D_HA, "removing pingable import %s->%s\n",
+ imp->imp_obd->obd_uuid.uuid, imp->imp_target_uuid.uuid);
+ class_import_put(imp);
+ up(&pinger_sem);
+ RETURN(0);
}
-#endif
+#endif /* !__KERNEL__ */
(OBD_LAST_OPC - OBD_FIRST_OPC))
enum {
- PTLRPC_REQWAIT_CNTR = 0,
- PTLRPC_SVCIDLETIME_CNTR = 1,
- //PTLRPC_SVCEQDEPTH_CNTR,
+ PTLRPC_REQWAIT_CNTR = 0,
+ PTLRPC_REQQDEPTH_CNTR,
+ PTLRPC_REQACTIVE_CNTR,
PTLRPC_LAST_CNTR
};
EXPORT_SYMBOL(ptlrpc_cleanup_connection);
/* niobuf.c */
-EXPORT_SYMBOL(ptlrpc_bulk_put);
-EXPORT_SYMBOL(ptlrpc_bulk_get);
+EXPORT_SYMBOL(ptlrpc_start_bulk_transfer);
EXPORT_SYMBOL(ptlrpc_abort_bulk);
EXPORT_SYMBOL(ptlrpc_register_bulk);
EXPORT_SYMBOL(ptlrpc_unregister_bulk);
+EXPORT_SYMBOL(ptlrpc_send_reply);
EXPORT_SYMBOL(ptlrpc_reply);
EXPORT_SYMBOL(ptlrpc_error);
EXPORT_SYMBOL(ptlrpc_resend_req);
EXPORT_SYMBOL(ptl_send_rpc);
-EXPORT_SYMBOL(ptlrpc_link_svc_me);
/* client.c */
EXPORT_SYMBOL(ptlrpc_init_client);
EXPORT_SYMBOL(ptlrpc_prep_bulk_exp);
EXPORT_SYMBOL(ptlrpc_free_bulk);
EXPORT_SYMBOL(ptlrpc_prep_bulk_page);
-EXPORT_SYMBOL(ptlrpc_free_bulk_page);
EXPORT_SYMBOL(ptlrpc_abort_inflight);
EXPORT_SYMBOL(ptlrpc_retain_replayable_request);
EXPORT_SYMBOL(ptlrpc_next_xid);
EXPORT_SYMBOL(ptlrpc_mark_interrupted);
/* service.c */
+EXPORT_SYMBOL(ptlrpc_save_lock);
+EXPORT_SYMBOL(ptlrpc_schedule_difficult_reply);
+EXPORT_SYMBOL(ptlrpc_commit_replies);
EXPORT_SYMBOL(ptlrpc_init_svc);
EXPORT_SYMBOL(ptlrpc_stop_all_threads);
EXPORT_SYMBOL(ptlrpc_start_n_threads);
EXPORT_SYMBOL(lustre_msg_swabbed);
EXPORT_SYMBOL(lustre_pack_request);
EXPORT_SYMBOL(lustre_pack_reply);
+EXPORT_SYMBOL(lustre_free_reply_state);
EXPORT_SYMBOL(lustre_msg_size);
EXPORT_SYMBOL(lustre_unpack_msg);
EXPORT_SYMBOL(lustre_msg_buf);
if (test_bit(LIOD_STOP, &pc->pc_flags))
break;
}
- /* XXX should be making sure we don't have anything in flight */
+ /* wait for inflight requests to drain */
+ if (!list_empty(&pc->pc_set->set_requests))
+ ptlrpc_set_wait(pc->pc_set);
complete(&pc->pc_finishing);
return 0;
}
argv[0], argv[1], argv[2], argv[3], argv[4]);
}
#else
+ if (imp->imp_state == LUSTRE_IMP_CLOSED) {
+ EXIT;
+ return;
+ }
ptlrpc_recover_import(imp, NULL);
#endif
}
list_for_each_safe(tmp, pos, &imp->imp_delayed_list) {
req = list_entry(tmp, struct ptlrpc_request, rq_list);
- if (req->rq_set) {
- DEBUG_REQ(D_HA, req, "waking (set %p):", req->rq_set);
- wake_up(&req->rq_set->set_waitq);
- } else {
- DEBUG_REQ(D_HA, req, "waking:");
- wake_up(&req->rq_reply_waitq);
- }
+ DEBUG_REQ(D_HA, req, "waking (set %p):", req->rq_set);
+ ptlrpc_wake_client_req(req);
}
spin_unlock_irqrestore(&imp->imp_lock, flags);
}
#include <portals/types.h>
#include "ptlrpc_internal.h"
-extern int request_in_callback(ptl_event_t *ev);
+static LIST_HEAD (ptlrpc_all_services);
+static spinlock_t ptlrpc_all_services_lock = SPIN_LOCK_UNLOCKED;
-static int ptlrpc_check_event(struct ptlrpc_service *svc,
- struct ptlrpc_thread *thread, ptl_event_t *event)
+static void
+ptlrpc_free_server_req (struct ptlrpc_request *req)
{
- struct ptlrpc_srv_ni *srv_ni;
- int i, idx, rc;
- ENTRY;
+ /* The last request to be received into a request buffer uses space
+ * in the request buffer descriptor, otherwise requests are
+ * allocated dynamically in the incoming request event handler */
+ if (req == &req->rq_rqbd->rqbd_req)
+ return;
- spin_lock(&svc->srv_lock);
+ OBD_FREE(req, sizeof(*req));
+}
+
+static char *
+ptlrpc_alloc_request_buffer (int size)
+{
+ char *ptr;
+
+ if (size > SVC_BUF_VMALLOC_THRESHOLD)
+ OBD_VMALLOC(ptr, size);
+ else
+ OBD_ALLOC(ptr, size);
+
+ return (ptr);
+}
- if (thread->t_flags & SVC_STOPPING)
- GOTO(out, rc = 1);
+static void
+ptlrpc_free_request_buffer (char *ptr, int size)
+{
+ if (size > SVC_BUF_VMALLOC_THRESHOLD)
+ OBD_VFREE(ptr, size);
+ else
+ OBD_FREE(ptr, size);
+}
- LASSERT ((thread->t_flags & SVC_EVENT) == 0);
- LASSERT (ptlrpc_ninterfaces > 0);
+struct ptlrpc_request_buffer_desc *
+ptlrpc_alloc_rqbd (struct ptlrpc_srv_ni *srv_ni)
+{
+ struct ptlrpc_service *svc = srv_ni->sni_service;
+ unsigned long flags;
+ struct ptlrpc_request_buffer_desc *rqbd;
+
+ OBD_ALLOC(rqbd, sizeof (*rqbd));
+ if (rqbd == NULL)
+ return (NULL);
+
+ rqbd->rqbd_srv_ni = srv_ni;
+ rqbd->rqbd_refcount = 0;
+ rqbd->rqbd_cbid.cbid_fn = request_in_callback;
+ rqbd->rqbd_cbid.cbid_arg = rqbd;
+ rqbd->rqbd_buffer = ptlrpc_alloc_request_buffer(svc->srv_buf_size);
+
+ if (rqbd->rqbd_buffer == NULL) {
+ OBD_FREE(rqbd, sizeof (*rqbd));
+ return (NULL);
+ }
- for (i = 0; i < ptlrpc_ninterfaces; i++) {
- idx = (svc->srv_interface_rover + i) % ptlrpc_ninterfaces;
- srv_ni = &svc->srv_interfaces[idx];
+ spin_lock_irqsave (&svc->srv_lock, flags);
+ list_add(&rqbd->rqbd_list, &srv_ni->sni_rqbds);
+ svc->srv_nbufs++;
+ spin_unlock_irqrestore (&svc->srv_lock, flags);
- LASSERT (!PtlHandleEqual (srv_ni->sni_eq_h, PTL_HANDLE_NONE));
+ return (rqbd);
+}
- rc = PtlEQGet(srv_ni->sni_eq_h, event);
- switch (rc) {
- case PTL_OK:
- /* next time start with the next interface */
- svc->srv_interface_rover = (idx+1) % ptlrpc_ninterfaces;
- thread->t_flags |= SVC_EVENT;
- GOTO(out, rc = 1);
+void
+ptlrpc_free_rqbd (struct ptlrpc_request_buffer_desc *rqbd)
+{
+ struct ptlrpc_srv_ni *sni = rqbd->rqbd_srv_ni;
+ struct ptlrpc_service *svc = sni->sni_service;
+ unsigned long flags;
+
+ LASSERT (rqbd->rqbd_refcount == 0);
+
+ spin_lock_irqsave(&svc->srv_lock, flags);
+ list_del(&rqbd->rqbd_list);
+ svc->srv_nbufs--;
+ spin_unlock_irqrestore(&svc->srv_lock, flags);
+
+ ptlrpc_free_request_buffer (rqbd->rqbd_buffer, svc->srv_buf_size);
+ OBD_FREE (rqbd, sizeof (*rqbd));
+}
- case PTL_EQ_EMPTY:
- continue;
+void
+ptlrpc_save_lock (struct ptlrpc_request *req,
+ struct lustre_handle *lock, int mode)
+{
+ struct ptlrpc_reply_state *rs = req->rq_reply_state;
+ int idx;
- case PTL_EQ_DROPPED:
- CWARN("Event queue overflow (bug 2125): timeouts will "
- "follow.\n");
- continue;
+ LASSERT (rs != NULL);
+ LASSERT (rs->rs_nlocks < RS_MAX_LOCKS);
+
+ idx = rs->rs_nlocks++;
+ rs->rs_locks[idx] = *lock;
+ rs->rs_modes[idx] = mode;
+ rs->rs_difficult = 1;
+}
+
+void
+ptlrpc_schedule_difficult_reply (struct ptlrpc_reply_state *rs)
+{
+ struct ptlrpc_service *svc = rs->rs_srv_ni->sni_service;
+
+#ifdef CONFIG_SMP
+ LASSERT (spin_is_locked (&svc->srv_lock));
+#endif
+ LASSERT (rs->rs_difficult);
+ rs->rs_scheduled_ever = 1; /* flag any notification attempt */
+
+ if (rs->rs_scheduled) /* being set up or already notified */
+ return;
+
+ rs->rs_scheduled = 1;
+ list_del (&rs->rs_list);
+ list_add (&rs->rs_list, &svc->srv_reply_queue);
+ wake_up (&svc->srv_waitq);
+}
+
+void
+ptlrpc_commit_replies (struct obd_device *obd)
+{
+ struct list_head *tmp;
+ struct list_head *nxt;
+ unsigned long flags;
+
+ /* Find any replies that have been committed and have their
+ * service complete them. */
+
+ /* CAVEAT EMPTOR: spinlock ordering!!! */
+ spin_lock_irqsave (&obd->obd_uncommitted_replies_lock, flags);
+
+ list_for_each_safe (tmp, nxt, &obd->obd_uncommitted_replies) {
+ struct ptlrpc_reply_state *rs =
+ list_entry (tmp, struct ptlrpc_reply_state, rs_obd_list);
+
+ LASSERT (rs->rs_difficult);
- default:
- CERROR("BUG: PtlEQGet returned %d\n", rc);
- LBUG();
+ if (rs->rs_transno <= obd->obd_last_committed) {
+ struct ptlrpc_service *svc = rs->rs_srv_ni->sni_service;
+
+ spin_lock (&svc->srv_lock);
+ list_del_init (&rs->rs_obd_list);
+ ptlrpc_schedule_difficult_reply (rs);
+ spin_unlock (&svc->srv_lock);
}
}
- rc = 0;
- EXIT;
- out:
- spin_unlock(&svc->srv_lock);
- return rc;
+
+ spin_unlock_irqrestore (&obd->obd_uncommitted_replies_lock, flags);
+}
+
+static long
+timeval_sub(struct timeval *large, struct timeval *small)
+{
+ return (large->tv_sec - small->tv_sec) * 1000000 +
+ (large->tv_usec - small->tv_usec);
}
-struct ptlrpc_service * ptlrpc_init_svc(__u32 nevents, __u32 nbufs,
- __u32 bufsize, __u32 max_req_size,
- int req_portal, int rep_portal,
- svc_handler_t handler, char *name,
- struct proc_dir_entry *proc_entry)
+struct ptlrpc_service *
+ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size,
+ int req_portal, int rep_portal,
+ svc_handler_t handler, char *name,
+ struct proc_dir_entry *proc_entry)
{
- int i, j, ssize, rc;
- struct ptlrpc_service *service;
- struct ptlrpc_srv_ni *srv_ni;
+ int i;
+ int j;
+ int ssize;
+ struct ptlrpc_service *service;
+ struct ptlrpc_srv_ni *srv_ni;
+ struct ptlrpc_request_buffer_desc *rqbd;
ENTRY;
LASSERT (ptlrpc_ninterfaces > 0);
-
+ LASSERT (nbufs > 0);
+ LASSERT (bufsize >= max_req_size);
+
ssize = offsetof (struct ptlrpc_service,
srv_interfaces[ptlrpc_ninterfaces]);
OBD_ALLOC(service, ssize);
service->srv_max_req_size = max_req_size;
service->srv_buf_size = bufsize;
-
service->srv_rep_portal = rep_portal;
service->srv_req_portal = req_portal;
service->srv_handler = handler;
- service->srv_interface_rover = 0;
+
+ INIT_LIST_HEAD(&service->srv_request_queue);
+ INIT_LIST_HEAD(&service->srv_reply_queue);
/* First initialise enough for early teardown */
for (i = 0; i < ptlrpc_ninterfaces; i++) {
srv_ni->sni_service = service;
srv_ni->sni_ni = &ptlrpc_interfaces[i];
- srv_ni->sni_eq_h = PTL_HANDLE_NONE;
INIT_LIST_HEAD(&srv_ni->sni_rqbds);
- srv_ni->sni_nrqbds = 0;
- atomic_set(&srv_ni->sni_nrqbds_receiving, 0);
+ INIT_LIST_HEAD(&srv_ni->sni_active_replies);
}
- /* Now allocate the event queue and request buffers, assuming all
- * interfaces require the same level of buffering. */
+ spin_lock (&ptlrpc_all_services_lock);
+ list_add (&service->srv_list, &ptlrpc_all_services);
+ spin_unlock (&ptlrpc_all_services_lock);
+
+ /* Now allocate the request buffers, assuming all interfaces require
+ * the same number. */
for (i = 0; i < ptlrpc_ninterfaces; i++) {
srv_ni = &service->srv_interfaces[i];
CDEBUG (D_NET, "%s: initialising interface %s\n", name,
srv_ni->sni_ni->pni_name);
- rc = PtlEQAlloc(srv_ni->sni_ni->pni_ni_h, nevents,
- request_in_callback, &(srv_ni->sni_eq_h));
- if (rc != PTL_OK) {
- CERROR("%s.%d: PtlEQAlloc on %s failed: %d\n",
- name, i, srv_ni->sni_ni->pni_name, rc);
- GOTO (failed, NULL);
- }
-
for (j = 0; j < nbufs; j++) {
- struct ptlrpc_request_buffer_desc *rqbd;
-
- OBD_ALLOC_WAIT(rqbd, sizeof(*rqbd));
+ rqbd = ptlrpc_alloc_rqbd (srv_ni);
+
if (rqbd == NULL) {
- CERROR ("%s.%d: Can't allocate request "
- "descriptor %d on %s\n",
- name, i, srv_ni->sni_nrqbds,
+ CERROR ("%s.%d: Can't allocate request %d "
+ "on %s\n", name, i, j,
srv_ni->sni_ni->pni_name);
GOTO(failed, NULL);
}
-
- rqbd->rqbd_srv_ni = srv_ni;
- rqbd->rqbd_me_h = PTL_HANDLE_NONE;
- atomic_set(&rqbd->rqbd_refcount, 0);
-
- OBD_ALLOC_WAIT(rqbd->rqbd_buffer, service->srv_buf_size);
- if (rqbd->rqbd_buffer == NULL) {
- CERROR ("%s.%d: Can't allocate request "
- "buffer %d on %s\n",
- name, i, srv_ni->sni_nrqbds,
- srv_ni->sni_ni->pni_name);
- OBD_FREE(rqbd, sizeof(*rqbd));
- GOTO(failed, NULL);
- }
- list_add(&rqbd->rqbd_list, &srv_ni->sni_rqbds);
- srv_ni->sni_nrqbds++;
-
- ptlrpc_link_svc_me(rqbd);
+ ptlrpc_register_rqbd (rqbd);
}
}
return NULL;
}
-static int handle_incoming_request(struct obd_device *obddev,
- struct ptlrpc_service *svc,
- ptl_event_t *event,
- struct ptlrpc_request *request)
+static int
+ptlrpc_server_handle_request (struct ptlrpc_service *svc)
{
- struct ptlrpc_request_buffer_desc *rqbd = event->mem_desc.user_ptr;
- int rc;
-
- /* FIXME: If we move to an event-driven model, we should put the request
- * on the stack of mds_handle instead. */
+ struct ptlrpc_request *request;
+ unsigned long flags;
+ struct timeval work_start;
+ struct timeval work_end;
+ long timediff;
+ int refcount;
+ int rc;
+ ENTRY;
- LASSERT (atomic_read (&rqbd->rqbd_refcount) > 0);
- LASSERT ((event->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0);
- LASSERT (rqbd->rqbd_srv_ni->sni_service == svc);
- LASSERT (rqbd->rqbd_buffer == event->mem_desc.start);
- LASSERT (event->offset + event->mlength <= svc->srv_buf_size);
+ spin_lock_irqsave (&svc->srv_lock, flags);
+ if (list_empty (&svc->srv_request_queue) ||
+ (svc->srv_n_difficult_replies != 0 &&
+ svc->srv_n_active_reqs >= (svc->srv_nthreads - 1))) {
+ /* If all the other threads are handling requests, I must
+ * remain free to handle any 'difficult' reply that might
+ * block them */
+ spin_unlock_irqrestore (&svc->srv_lock, flags);
+ RETURN(0);
+ }
- memset(request, 0, sizeof(*request));
- spin_lock_init (&request->rq_lock);
- INIT_LIST_HEAD(&request->rq_list);
- request->rq_svc = svc;
- request->rq_xid = event->match_bits;
- request->rq_reqmsg = event->mem_desc.start + event->offset;
- request->rq_reqlen = event->mlength;
+ request = list_entry (svc->srv_request_queue.next,
+ struct ptlrpc_request, rq_list);
+ list_del_init (&request->rq_list);
+ svc->srv_n_queued_reqs--;
+ svc->srv_n_active_reqs++;
+
+ spin_unlock_irqrestore (&svc->srv_lock, flags);
+
+ do_gettimeofday(&work_start);
+ timediff = timeval_sub(&work_start, &request->rq_arrival_time);
+ if (svc->srv_stats != NULL) {
+ lprocfs_counter_add(svc->srv_stats, PTLRPC_REQWAIT_CNTR,
+ timediff);
+ lprocfs_counter_add(svc->srv_stats, PTLRPC_REQQDEPTH_CNTR,
+ svc->srv_n_queued_reqs);
+ lprocfs_counter_add(svc->srv_stats, PTLRPC_REQACTIVE_CNTR,
+ svc->srv_n_active_reqs);
+ }
#if SWAB_PARANOIA
/* Clear request swab mask; this is a new request */
if (rc != 0) {
CERROR ("error unpacking request: ptl %d from "LPX64
" xid "LPU64"\n", svc->srv_req_portal,
- event->initiator.nid, request->rq_xid);
+ request->rq_peer.peer_nid, request->rq_xid);
goto out;
}
+
rc = -EINVAL;
if (request->rq_reqmsg->type != PTL_RPC_MSG_REQUEST) {
- CERROR("wrong packet type received (type=%u)\n",
- request->rq_reqmsg->type);
+ CERROR("wrong packet type received (type=%u) from "
+ LPX64"\n", request->rq_reqmsg->type,
+ request->rq_peer.peer_nid);
goto out;
}
- CDEBUG(D_NET, "got req "LPD64" (md: %p + %d)\n", request->rq_xid,
- event->mem_desc.start, event->offset);
+ CDEBUG(D_NET, "got req "LPD64"\n", request->rq_xid);
- request->rq_peer.peer_nid = event->initiator.nid;
- request->rq_peer.peer_ni = rqbd->rqbd_srv_ni->sni_ni;
+ /* Discard requests queued for longer than my timeout. If the
+ * client's timeout is similar to mine, she'll be timing out this
+ * REQ anyway (bug 1502) */
+ if (timediff / 1000000 > (long)obd_timeout) {
+ CERROR("Dropping timed-out request from "LPX64
+ ": %ld seconds old\n",
+ request->rq_peer.peer_nid, timediff / 1000000);
+ goto out;
+ }
request->rq_export = class_conn2export(&request->rq_reqmsg->handle);
if (request->rq_export) {
- if (request->rq_reqmsg->conn_cnt <
+ if (request->rq_reqmsg->conn_cnt <
request->rq_export->exp_conn_cnt) {
DEBUG_REQ(D_ERROR, request,
"DROPPING req from old connection %d < %d",
request->rq_export->exp_last_request_time =
LTIME_S(CURRENT_TIME);
- }
+ }
CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:ni:nid:opc "
"%s:%s+%d:%d:"LPU64":%s:"LPX64":%d\n", current->comm,
(request->rq_export ?
atomic_read(&request->rq_export->exp_refcount) : -99),
request->rq_reqmsg->status, request->rq_xid,
- rqbd->rqbd_srv_ni->sni_ni->pni_name, event->initiator.nid,
+ request->rq_peer.peer_ni->pni_name,
+ request->rq_peer.peer_nid,
request->rq_reqmsg->opc);
rc = svc->srv_handler(request);
(request->rq_export ?
atomic_read(&request->rq_export->exp_refcount) : -99),
request->rq_reqmsg->status, request->rq_xid,
- rqbd->rqbd_srv_ni->sni_ni->pni_name, event->initiator.nid,
+ request->rq_peer.peer_ni->pni_name,
+ request->rq_peer.peer_nid,
request->rq_reqmsg->opc);
put_conn:
class_export_put(request->rq_export);
out:
- if (atomic_dec_and_test (&rqbd->rqbd_refcount)) /* last reference? */
- ptlrpc_link_svc_me (rqbd);
+ do_gettimeofday(&work_end);
+
+ timediff = timeval_sub(&work_end, &work_start);
+
+ CDEBUG((timediff / 1000000 > (long)obd_timeout) ? D_ERROR : D_HA,
+ "request "LPU64" opc %u from NID "LPX64" processed in %ldus "
+ "(%ldus total)\n", request->rq_xid, request->rq_reqmsg->opc,
+ request->rq_peer.peer_nid,
+ timediff, timeval_sub(&work_end, &request->rq_arrival_time));
+
+ if (svc->srv_stats != NULL) {
+ int opc = opcode_offset(request->rq_reqmsg->opc);
+ if (opc > 0) {
+ LASSERT(opc < LUSTRE_MAX_OPCODES);
+ lprocfs_counter_add(svc->srv_stats,
+ opc + PTLRPC_LAST_CNTR,
+ timediff);
+ }
+ }
+
+ spin_lock_irqsave(&svc->srv_lock, flags);
+ svc->srv_n_active_reqs--;
+ refcount = --(request->rq_rqbd->rqbd_refcount);
+ spin_unlock_irqrestore(&svc->srv_lock, flags);
+
+ if (refcount == 0) {
+ /* rqbd now idle: repost */
+ ptlrpc_register_rqbd(request->rq_rqbd);
+ }
+
+ ptlrpc_free_server_req(request);
+
+ RETURN(1);
+}
+
+static int
+ptlrpc_server_handle_reply (struct ptlrpc_service *svc)
+{
+ struct ptlrpc_reply_state *rs;
+ unsigned long flags;
+ struct obd_export *exp;
+ struct obd_device *obd;
+ int nlocks;
+ int been_handled;
+ ENTRY;
+
+ spin_lock_irqsave (&svc->srv_lock, flags);
+ if (list_empty (&svc->srv_reply_queue)) {
+ spin_unlock_irqrestore (&svc->srv_lock, flags);
+ RETURN(0);
+ }
+
+ rs = list_entry (svc->srv_reply_queue.next,
+ struct ptlrpc_reply_state, rs_list);
+
+ exp = rs->rs_export;
+ obd = exp->exp_obd;
+
+ LASSERT (rs->rs_difficult);
+ LASSERT (rs->rs_scheduled);
+
+ list_del_init (&rs->rs_list);
+
+ /* Disengage from notifiers carefully (lock ordering!) */
+ spin_unlock(&svc->srv_lock);
+
+ spin_lock (&obd->obd_uncommitted_replies_lock);
+ /* Noop if removed already */
+ list_del_init (&rs->rs_obd_list);
+ spin_unlock (&obd->obd_uncommitted_replies_lock);
+
+ spin_lock (&exp->exp_lock);
+ /* Noop if removed already */
+ list_del_init (&rs->rs_exp_list);
+ spin_unlock (&exp->exp_lock);
+
+ spin_lock(&svc->srv_lock);
+
+ been_handled = rs->rs_handled;
+ rs->rs_handled = 1;
+
+ nlocks = rs->rs_nlocks; /* atomic "steal", but */
+ rs->rs_nlocks = 0; /* locks still on rs_locks! */
+
+ if (nlocks == 0 && !been_handled) {
+ /* If we see this, we should already have seen the warning
+ * in mds_steal_ack_locks() */
+ CWARN("All locks stolen from rs %p x"LPD64".t"LPD64
+ " o%d NID"LPX64"\n",
+ rs,
+ rs->rs_xid, rs->rs_transno,
+ rs->rs_msg.opc, exp->exp_connection->c_peer.peer_nid);
+ }
+
+ if ((!been_handled && rs->rs_on_net) ||
+ nlocks > 0) {
+ spin_unlock_irqrestore(&svc->srv_lock, flags);
+
+ if (!been_handled && rs->rs_on_net) {
+ PtlMDUnlink(rs->rs_md_h);
+ /* Ignore return code; we're racing with
+ * completion... */
+ }
+
+ while (nlocks-- > 0)
+ ldlm_lock_decref(&rs->rs_locks[nlocks],
+ rs->rs_modes[nlocks]);
+
+ spin_lock_irqsave(&svc->srv_lock, flags);
+ }
+
+ rs->rs_scheduled = 0;
+
+ if (!rs->rs_on_net) {
+ /* Off the net */
+ svc->srv_n_difficult_replies--;
+ spin_unlock_irqrestore(&svc->srv_lock, flags);
+
+ class_export_put (exp);
+ rs->rs_export = NULL;
+ lustre_free_reply_state (rs);
+ atomic_dec (&svc->srv_outstanding_replies);
+ RETURN(1);
+ }
+
+ /* still on the net; callback will schedule */
+ spin_unlock_irqrestore (&svc->srv_lock, flags);
+ RETURN(1);
+}
+
+#ifndef __KERNEL__
+/* FIXME make use of timeout later */
+int
+liblustre_check_services (void *arg)
+{
+ int did_something = 0;
+ struct list_head *tmp, *nxt;
+ ENTRY;
+
+ /* I'm relying on being single threaded, not to have to lock
+ * ptlrpc_all_services etc */
+ list_for_each_safe (tmp, nxt, &ptlrpc_all_services) {
+ struct ptlrpc_service *svc =
+ list_entry (tmp, struct ptlrpc_service, srv_list);
+
+ if (svc->srv_nthreads != 0) /* I've recursed */
+ continue;
+
+ /* service threads can block for bulk, so this limits us
+ * (arbitrarily) to recursing 1 stack frame per service.
+ * Note that the problem with recursion is that we have to
+ * unwind completely before our caller can resume. */
+
+ svc->srv_nthreads++;
+
+ while (ptlrpc_server_handle_reply (svc))
+ did_something++;
+
+ while (ptlrpc_server_handle_request (svc))
+ did_something++;
+
+ svc->srv_nthreads--;
+ }
- return rc;
+ RETURN(did_something);
}
+#else /* __KERNEL__ */
+
/* Don't use daemonize, it removes fs struct from new thread (bug 418) */
void ptlrpc_daemonize(void)
{
reparent_to_init();
}
-static long timeval_sub(struct timeval *large, struct timeval *small)
-{
- return (large->tv_sec - small->tv_sec) * 1000000 +
- (large->tv_usec - small->tv_usec);
-}
-
static int ptlrpc_main(void *arg)
{
- struct ptlrpc_svc_data *data = arg;
- struct obd_device *obddev = data->dev;
- struct ptlrpc_service *svc = data->svc;
- struct ptlrpc_thread *thread = data->thread;
- struct ptlrpc_request *request;
- ptl_event_t *event;
- unsigned long flags;
- struct timeval start_time, finish_time;
- long total;
- int rc = 0;
+ struct ptlrpc_svc_data *data = (struct ptlrpc_svc_data *)arg;
+ struct ptlrpc_service *svc = data->svc;
+ struct ptlrpc_thread *thread = data->thread;
+ unsigned long flags;
ENTRY;
lock_kernel();
THREAD_NAME(current->comm, "%s", data->name);
unlock_kernel();
- OBD_ALLOC(event, sizeof(*event));
- if (event == NULL)
- GOTO(out, rc = -ENOMEM);
- OBD_ALLOC(request, sizeof(*request));
- if (request == NULL)
- GOTO(out_event, rc = -ENOMEM);
-
/* Record that the thread is running */
thread->t_flags = SVC_RUNNING;
wake_up(&thread->t_ctl_waitq);
+ spin_lock_irqsave(&svc->srv_lock, flags);
+ svc->srv_nthreads++;
+ spin_unlock_irqrestore(&svc->srv_lock, flags);
+
/* XXX maintain a list of all managed devices: insert here */
- do_gettimeofday(&finish_time);
- /* And now, loop forever on requests */
- while (1) {
+ while ((thread->t_flags & SVC_STOPPING) == 0 ||
+ svc->srv_n_difficult_replies != 0) {
+ /* Don't exit while there are replies to be handled */
struct l_wait_info lwi = { 0 };
- l_wait_event_exclusive(svc->srv_waitq,
- ptlrpc_check_event(svc, thread, event),
- &lwi);
-
- spin_lock(&svc->srv_lock);
- if (thread->t_flags & SVC_STOPPING) {
- thread->t_flags &= ~SVC_STOPPING;
- spin_unlock(&svc->srv_lock);
-
- EXIT;
- break;
- }
-
- if (!(thread->t_flags & SVC_EVENT)) {
- CERROR("unknown flag in service");
- spin_unlock(&svc->srv_lock);
- LBUG();
- EXIT;
- break;
- }
-
- thread->t_flags &= ~SVC_EVENT;
- spin_unlock(&svc->srv_lock);
-
- do_gettimeofday(&start_time);
- total = timeval_sub(&start_time, &event->arrival_time);
- if (svc->srv_stats != NULL) {
- lprocfs_counter_add(svc->srv_stats, PTLRPC_REQWAIT_CNTR,
- total);
- lprocfs_counter_add(svc->srv_stats,
- PTLRPC_SVCIDLETIME_CNTR,
- timeval_sub(&start_time,
- &finish_time));
-#if 0 /* Wait for b_eq branch */
- lprocfs_counter_add(svc->srv_stats,
- PTLRPC_SVCEQDEPTH_CNTR, 0);
-#endif
- }
- if (total / 1000000 > (long)obd_timeout) {
- CERROR("Dropping request from NID "LPX64" because it's "
- "%ld seconds old.\n", event->initiator.nid,
- total / 1000000); /* bug 1502 */
- } else {
- CDEBUG(D_HA, "request from NID "LPX64" noticed after "
- "%ldus\n", event->initiator.nid, total);
- rc = handle_incoming_request(obddev, svc, event,
- request);
- }
- do_gettimeofday(&finish_time);
- total = timeval_sub(&finish_time, &start_time);
-
- CDEBUG((total / 1000000 > (long)obd_timeout) ? D_ERROR : D_HA,
- "request "LPU64" from NID "LPX64" processed in %ldus "
- "(%ldus total)\n", request->rq_xid, event->initiator.nid,
- total, timeval_sub(&finish_time, &event->arrival_time));
-
- if (svc->srv_stats != NULL) {
- int opc = opcode_offset(request->rq_reqmsg->opc);
- if (opc > 0) {
- LASSERT(opc < LUSTRE_MAX_OPCODES);
- lprocfs_counter_add(svc->srv_stats,
- opc + PTLRPC_LAST_CNTR,
- total);
- }
- }
+ l_wait_event_exclusive (svc->srv_waitq,
+ (thread->t_flags & SVC_STOPPING) != 0 ||
+ !list_empty (&svc->srv_reply_queue) ||
+ (!list_empty (&svc->srv_request_queue) &&
+ (svc->srv_n_difficult_replies == 0 ||
+ svc->srv_n_active_reqs <
+ (svc->srv_nthreads - 1))),
+ &lwi);
+
+ if (!list_empty (&svc->srv_reply_queue))
+ ptlrpc_server_handle_reply (svc);
+
+ /* only handle requests if there are no difficult replies
+ * outstanding, or I'm not the last thread handling
+ * requests */
+ if (!list_empty (&svc->srv_request_queue) &&
+ (svc->srv_n_difficult_replies == 0 ||
+ svc->srv_n_active_reqs < (svc->srv_nthreads - 1)))
+ ptlrpc_server_handle_request (svc);
}
- /* NB should wait for all SENT callbacks to complete before exiting
- * here. Unfortunately at this time there is no way to track this
- * state. */
- OBD_FREE(request, sizeof(*request));
-out_event:
- OBD_FREE(event, sizeof(*event));
-out:
+ spin_lock_irqsave(&svc->srv_lock, flags);
+
+ svc->srv_nthreads--; /* must know immediately */
thread->t_flags = SVC_STOPPED;
wake_up(&thread->t_ctl_waitq);
- CDEBUG(D_NET, "service thread exiting, process %d: rc = %d\n",
- current->pid, rc);
- return rc;
+ spin_unlock_irqrestore(&svc->srv_lock, flags);
+
+ CDEBUG(D_NET, "service thread exiting, process %d\n", current->pid);
+ return 0;
}
static void ptlrpc_stop_thread(struct ptlrpc_service *svc,
struct ptlrpc_thread *thread)
{
struct l_wait_info lwi = { 0 };
+ unsigned long flags;
- spin_lock(&svc->srv_lock);
+ spin_lock_irqsave(&svc->srv_lock, flags);
thread->t_flags = SVC_STOPPING;
- spin_unlock(&svc->srv_lock);
+ spin_unlock_irqrestore(&svc->srv_lock, flags);
wake_up_all(&svc->srv_waitq);
l_wait_event(thread->t_ctl_waitq, (thread->t_flags & SVC_STOPPED),
&lwi);
+
+ spin_lock_irqsave(&svc->srv_lock, flags);
+ list_del(&thread->t_link);
+ spin_unlock_irqrestore(&svc->srv_lock, flags);
+
+ OBD_FREE(thread, sizeof(*thread));
}
void ptlrpc_stop_all_threads(struct ptlrpc_service *svc)
{
- spin_lock(&svc->srv_lock);
+ unsigned long flags;
+ struct ptlrpc_thread *thread;
+
+ spin_lock_irqsave(&svc->srv_lock, flags);
while (!list_empty(&svc->srv_threads)) {
- struct ptlrpc_thread *thread;
- thread = list_entry(svc->srv_threads.next, struct ptlrpc_thread,
- t_link);
- spin_unlock(&svc->srv_lock);
+ thread = list_entry(svc->srv_threads.next,
+ struct ptlrpc_thread, t_link);
+
+ spin_unlock_irqrestore(&svc->srv_lock, flags);
ptlrpc_stop_thread(svc, thread);
- spin_lock(&svc->srv_lock);
- list_del(&thread->t_link);
- OBD_FREE(thread, sizeof(*thread));
+ spin_lock_irqsave(&svc->srv_lock, flags);
}
- spin_unlock(&svc->srv_lock);
+
+ spin_unlock_irqrestore(&svc->srv_lock, flags);
}
int ptlrpc_start_n_threads(struct obd_device *dev, struct ptlrpc_service *svc,
struct l_wait_info lwi = { 0 };
struct ptlrpc_svc_data d;
struct ptlrpc_thread *thread;
+ unsigned long flags;
int rc;
ENTRY;
if (thread == NULL)
RETURN(-ENOMEM);
init_waitqueue_head(&thread->t_ctl_waitq);
-
+
d.dev = dev;
d.svc = svc;
d.name = name;
d.thread = thread;
- spin_lock(&svc->srv_lock);
+ spin_lock_irqsave(&svc->srv_lock, flags);
list_add(&thread->t_link, &svc->srv_threads);
- spin_unlock(&svc->srv_lock);
+ spin_unlock_irqrestore(&svc->srv_lock, flags);
/* CLONE_VM and CLONE_FILES just avoid a needless copy, because we
* just drop the VM and FILES in ptlrpc_daemonize() right away.
RETURN(0);
}
+#endif
int ptlrpc_unregister_service(struct ptlrpc_service *service)
{
- int i, rc;
+ int i;
+ int rc;
+ unsigned long flags;
struct ptlrpc_srv_ni *srv_ni;
+ struct l_wait_info lwi;
+ struct list_head *tmp;
- LASSERT (list_empty (&service->srv_threads));
+ LASSERT(list_empty(&service->srv_threads));
- /* XXX We could reply (with failure) to all buffered requests
- * _after_ unlinking _all_ the request buffers, but _before_
- * freeing them.
- */
+ spin_lock (&ptlrpc_all_services_lock);
+ list_del_init (&service->srv_list);
+ spin_unlock (&ptlrpc_all_services_lock);
+
+ for (i = 0; i < ptlrpc_ninterfaces; i++) {
+ srv_ni = &service->srv_interfaces[i];
+ CDEBUG(D_NET, "%s: tearing down interface %s\n",
+ service->srv_name, srv_ni->sni_ni->pni_name);
+
+ /* Unlink all the request buffers. This forces a 'final'
+ * event with its 'unlink' flag set for each rqbd */
+ list_for_each(tmp, &srv_ni->sni_rqbds) {
+ struct ptlrpc_request_buffer_desc *rqbd =
+ list_entry(tmp, struct ptlrpc_request_buffer_desc,
+ rqbd_list);
+
+ rc = PtlMDUnlink(rqbd->rqbd_md_h);
+ LASSERT (rc == PTL_OK || rc == PTL_INV_MD);
+ }
+
+ /* Wait for the network to release any buffers it's
+ * currently filling */
+ for (;;) {
+ spin_lock_irqsave(&service->srv_lock, flags);
+ rc = srv_ni->sni_nrqbd_receiving;
+ spin_unlock_irqrestore(&service->srv_lock, flags);
+
+ if (rc == 0)
+ break;
+
+ /* Network access will complete in finite time but
+ * the HUGE timeout lets us CWARN for visibility of
+ * sluggish NALs */
+ lwi = LWI_TIMEOUT(300 * HZ, NULL, NULL);
+ rc = l_wait_event(service->srv_waitq,
+ srv_ni->sni_nrqbd_receiving == 0,
+ &lwi);
+ if (rc == -ETIMEDOUT)
+ CWARN("Waiting for request buffers on "
+ "service %s on interface %s ",
+ service->srv_name, srv_ni->sni_ni->pni_name);
+ }
+
+ /* schedule all outstanding replies to terminate them */
+ spin_lock_irqsave(&service->srv_lock, flags);
+ while (!list_empty(&srv_ni->sni_active_replies)) {
+ struct ptlrpc_reply_state *rs =
+ list_entry(srv_ni->sni_active_replies.next,
+ struct ptlrpc_reply_state,
+ rs_list);
+ ptlrpc_schedule_difficult_reply(rs);
+ }
+ spin_unlock_irqrestore(&service->srv_lock, flags);
+ }
+
+ /* purge the request queue. NB No new replies (rqbds all unlinked)
+ * and no service threads, so I'm the only thread noodling the
+ * request queue now */
+ while (!list_empty(&service->srv_request_queue)) {
+ struct ptlrpc_request *req =
+ list_entry(service->srv_request_queue.next,
+ struct ptlrpc_request,
+ rq_list);
+
+ list_del(&req->rq_list);
+ service->srv_n_queued_reqs--;
+ req->rq_rqbd->rqbd_refcount--;
+
+ ptlrpc_free_server_req(req);
+ }
+ LASSERT(service->srv_n_queued_reqs == 0);
+ /* Now free all the request buffers since nothing references them
+ * any more... */
for (i = 0; i < ptlrpc_ninterfaces; i++) {
srv_ni = &service->srv_interfaces[i];
- CDEBUG (D_NET, "%s: tearing down interface %s\n",
- service->srv_name, srv_ni->sni_ni->pni_name);
- while (!list_empty (&srv_ni->sni_rqbds)) {
+ while (!list_empty(&srv_ni->sni_rqbds)) {
struct ptlrpc_request_buffer_desc *rqbd =
- list_entry (srv_ni->sni_rqbds.next,
- struct ptlrpc_request_buffer_desc,
- rqbd_list);
-
- list_del (&rqbd->rqbd_list);
-
- LASSERT (atomic_read (&rqbd->rqbd_refcount) > 0);
- /* refcount could be anything; it's possible for
- * the buffers to continued to get filled after all
- * the server threads exited. But we know they
- * _have_ exited.
- */
-
- (void) PtlMEUnlink(rqbd->rqbd_me_h);
- /* The callback handler could have unlinked this ME
- * already (we're racing with her) but it's safe to
- * ensure it _has_ been unlinked.
- */
-
- OBD_FREE (rqbd->rqbd_buffer, service->srv_buf_size);
- OBD_FREE (rqbd, sizeof (*rqbd));
- srv_ni->sni_nrqbds--;
+ list_entry(srv_ni->sni_rqbds.next,
+ struct ptlrpc_request_buffer_desc,
+ rqbd_list);
+
+ ptlrpc_free_rqbd(rqbd);
}
+ }
- LASSERT (srv_ni->sni_nrqbds == 0);
+ /* wait for all outstanding replies to complete (they were
+ * scheduled having been flagged to abort above) */
+ while (atomic_read(&service->srv_outstanding_replies) != 0) {
+ struct l_wait_info lwi = LWI_TIMEOUT(10 * HZ, NULL, NULL);
- if (!PtlHandleEqual (srv_ni->sni_eq_h, PTL_HANDLE_NONE)) {
- rc = PtlEQFree(srv_ni->sni_eq_h);
- if (rc)
- CERROR("%s.%d: PtlEQFree failed on %s: %d\n",
- service->srv_name, i,
- srv_ni->sni_ni->pni_name, rc);
+ rc = l_wait_event(service->srv_waitq,
+ !list_empty(&service->srv_reply_queue), &lwi);
+ LASSERT(rc == 0 || rc == -ETIMEDOUT);
+
+ if (rc == 0) {
+ ptlrpc_server_handle_reply(service);
+ continue;
}
+ CWARN("Unexpectedly long timeout %p\n", service);
}
ptlrpc_lprocfs_unregister_service(service);
OBD_FREE(service,
- offsetof (struct ptlrpc_service,
- srv_interfaces[ptlrpc_ninterfaces]));
+ offsetof(struct ptlrpc_service,
+ srv_interfaces[ptlrpc_ninterfaces]));
return 0;
}
# lustre.spec
-%define version HEAD
+%define version b_eq
%define kversion @LINUXRELEASE@
%define linuxdir @LINUX@
%define enable_doc @ENABLE_DOC@
%attr(-, root, root) /usr/lib/lustre/examples/llechocleanup.sh
%attr(-, root, root) /etc/init.d/lustre
-%attr(-, root, root) /lib/libportals.a
%attr(-, root, root) /lib/libptlctl.a
-%attr(-, root, root) /lib/libtcpnal.a
%attr(-, root, root) /lib/liblustreapi.a
%attr(-, root, root) /usr/include/lustre/*.h
# if all the modules have unloaded.
umount $MOUNT &
UMOUNT_PID=$!
- sleep $TIMEOUT
+ sleep 2
echo "killing umount"
kill -TERM $UMOUNT_PID
+ echo "waiting for umount to finish"
wait $UMOUNT_PID
# cleanup client modules
}
run_test 5 "force cleanup mds, then cleanup"
+test_5b() {
+    start_ost
+    start_mds
+    stop_mds
+
+    [ -d $MOUNT ] || mkdir -p $MOUNT
+    $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null
+    # The mount must fail: the MDS is down.  Use "return", not "exit",
+    # so an unexpected success fails only this test, not the whole run.
+    llmount $mds_HOST://mds_svc/client_facet $MOUNT && return 1
+
+    # cleanup client modules
+    $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null
+
+    # stop_mds is a no-op here, and should not fail
+    stop_mds || return 2
+    stop_ost || return 3
+
+    # distinct code: a lingering portals module is a different failure
+    # than stop_ost failing above
+    lsmod | grep -q portals && return 4
+    return 0
+}
+run_test 5b "mds down, cleanup after failed mount (bug 2712)"
+
+test_5c() {
+    start_ost
+    start_mds
+
+    [ -d $MOUNT ] || mkdir -p $MOUNT
+    $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null
+    # The mount must fail: the MDS service name is wrong.  Use "return",
+    # not "exit", so an unexpected success fails only this test.
+    llmount $mds_HOST://wrong_mds_svc/client_facet $MOUNT && return 1
+
+    # cleanup client modules
+    $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null
+
+    stop_mds || return 2
+    stop_ost || return 3
+
+    # distinct code: a lingering portals module is a different failure
+    # than stop_ost failing above
+    lsmod | grep -q portals && return 4
+    return 0
+}
+run_test 5c "cleanup after failed mount (bug 2712)"
+
test_6() {
setup
manual_umount_client
done
fail ost
for i in `seq 10`; do
- grep -q "tag-$i" $DIR/$tfile-$i || error "f1c-$i"
+ grep -q "tag-$i" $DIR/$tfile-$i || error "f2-$i"
done
}
run_test 2 "|x| 10 open(O_CREAT)s"
SUBDIRS = Lustre
CFLAGS:=-g -O2 -I$(top_srcdir)/utils -I$(top_srcdir)/portals/include -I$(srcdir)/../include -Wall -L../portals/utils
-KFLAGS:=
CPPFLAGS = $(HAVE_LIBREADLINE)
+
+if LIBLUSTRE
+
+bin_SCRIPTS = lrun
+
+EXTRA_DIST = $(sbin_SCRIPTS)
+
+include $(top_srcdir)/Rules
+
+else
+
+KFLAGS:=
lctl_LDADD := $(LIBREADLINE) -lptlctl
lfs_LDADD := $(LIBREADLINE) parser.o liblustreapi.a -lptlctl obd.o
lload_LDADD := -lptlctl
mount.lustre$(EXEEXT): llmount
cp llmount mount.lustre
+
+endif
#!/bin/sh
LIBLUSTRE_MOUNT_POINT=${LIBLUSTRE_MOUNT_POINT:-"/mnt/lustre"}
-LIBLUSTRE_MOUNT_TARGET=${LIBLUSTRE_MOUNT_TARGET:-""}
+LIBLUSTRE_MOUNT_TARGET=${LIBLUSTRE_MOUNT_TARGET:-"TARGET_NOT_SET"}
LIBLUSTRE_DUMPFILE=${LIBLUSTRE_DUMPFILE:-"/tmp/DUMP_FILE"}
LD_PRELOAD=${LD_PRELOAD:-"/usr/lib/liblustre.so"}