From: adilger Date: Wed, 10 Mar 2004 01:19:30 +0000 (+0000) Subject: Update b1_2 from HEAD (20040309_1747) X-Git-Tag: v1_8_0_110~486^6~95 X-Git-Url: https://git.whamcloud.com/gitweb?a=commitdiff_plain;h=4f0bb165b968f3e1feb25d16f06d886a7279fa44;p=fs%2Flustre-release.git Update b1_2 from HEAD (20040309_1747) b=2818, b=2901, b=2663, b=2908, b=2530, b=2464, b=2306 (socknal zc part) --- diff --git a/lnet/archdep.m4 b/lnet/archdep.m4 index 3bdaf32..7801957 100644 --- a/lnet/archdep.m4 +++ b/lnet/archdep.m4 @@ -133,8 +133,8 @@ case ${host_cpu} in powerpc ) AC_MSG_RESULT($host_cpu) - KCFLAGS='-O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -D__powerpc__ -fsigned-char -msoft-float -pipe -ffixed-r2 -Wno-uninitialized -mmultiple -mstring' - KCPPFLAGS='-D__KERNEL__' + KCFLAGS='-O2 -g -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -D__powerpc__ -fsigned-char -msoft-float -pipe -ffixed-r2 -Wno-uninitialized -mmultiple -mstring' + KCPPFLAGS='-D__KERNEL__ -DMODULE' MOD_LINK=elf32ppclinux ;; @@ -338,6 +338,18 @@ AC_SUBST(MOD_LINK) AC_SUBST(LINUX25) AM_CONDITIONAL(LIBLUSTRE, test x$host_cpu = xlib) +# ---------- Red Hat 2.4.18 has iobuf->dovary -------------- +# But other kernels don't + +AC_MSG_CHECKING([if struct kiobuf has a dovary field]) +AC_TRY_COMPILE([#define __KERNEL__ + #include ], + [struct kiobuf iobuf; + iobuf.dovary = 1;], + [AC_MSG_RESULT([yes]) + CPPFLAGS="$CPPFLAGS -DHAVE_KIOBUF_DOVARY"], + [AC_MSG_RESULT([no])]) + # ---------- Red Hat 2.4.20 backports some 2.5 bits -------- # This needs to run after we've defined the KCPPFLAGS diff --git a/lnet/include/linux/kp30.h b/lnet/include/linux/kp30.h index d56a120..c080a57 100644 --- a/lnet/include/linux/kp30.h +++ b/lnet/include/linux/kp30.h @@ -368,13 +368,14 @@ typedef struct { struct list_head kprfd_list; /* stash in queues (routing target can use) */ ptl_nid_t kprfd_target_nid; /* final destination NID */ ptl_nid_t 
kprfd_gateway_nid; /* gateway NID */ - int kprfd_nob; /* # message bytes (including header) */ - int kprfd_niov; /* # message frags (including header) */ - struct iovec *kprfd_iov; /* message fragments */ - void *kprfd_router_arg; // originating NAL's router arg + ptl_hdr_t *kprfd_hdr; /* header in wire byte order */ + int kprfd_nob; /* # payload bytes */ + int kprfd_niov; /* # payload frags */ + ptl_kiov_t *kprfd_kiov; /* payload fragments */ + void *kprfd_router_arg; /* originating NAL's router arg */ kpr_fwd_callback_t kprfd_callback; /* completion callback */ void *kprfd_callback_arg; /* completion callback arg */ - kprfd_scratch_t kprfd_scratch; // scratchpad for routing targets + kprfd_scratch_t kprfd_scratch; /* scratchpad for routing targets */ } kpr_fwd_desc_t; typedef void (*kpr_fwd_t)(void *arg, kpr_fwd_desc_t *fwd); @@ -477,15 +478,16 @@ kpr_lookup (kpr_router_t *router, ptl_nid_t nid, int nob, ptl_nid_t *gateway_nid } static inline void -kpr_fwd_init (kpr_fwd_desc_t *fwd, ptl_nid_t nid, - int nob, int niov, struct iovec *iov, +kpr_fwd_init (kpr_fwd_desc_t *fwd, ptl_nid_t nid, ptl_hdr_t *hdr, + int nob, int niov, ptl_kiov_t *kiov, kpr_fwd_callback_t callback, void *callback_arg) { fwd->kprfd_target_nid = nid; fwd->kprfd_gateway_nid = nid; + fwd->kprfd_hdr = hdr; fwd->kprfd_nob = nob; fwd->kprfd_niov = niov; - fwd->kprfd_iov = iov; + fwd->kprfd_kiov = kiov; fwd->kprfd_callback = callback; fwd->kprfd_callback_arg = callback_arg; } diff --git a/lnet/include/lnet/types.h b/lnet/include/lnet/types.h index 7ffe797..d4ca453 100644 --- a/lnet/include/lnet/types.h +++ b/lnet/include/lnet/types.h @@ -3,7 +3,13 @@ #ifdef __linux__ # include -# include +# if defined(__powerpc__) && !defined(__KERNEL__) +# define __KERNEL__ +# include +# undef __KERNEL__ +# else +# include +# endif #else # include typedef u_int32_t __u32; @@ -14,7 +20,7 @@ typedef u_int64_t __u64; # include #else # include -# define do_gettimeofday(tv) gettimeofday(tv, NULL) +# define 
do_gettimeofday(tv) gettimeofday(tv, NULL); #endif #include @@ -129,7 +135,7 @@ typedef struct { struct timeval arrival_time; volatile ptl_seq_t sequence; -} ptl_event_t; +} __attribute__((packed)) ptl_event_t; #ifdef __CYGWIN__ #pragma pop #endif diff --git a/lnet/klnds/gmlnd/gmlnd.h b/lnet/klnds/gmlnd/gmlnd.h index cdde5b7..ad46b90 100644 --- a/lnet/klnds/gmlnd/gmlnd.h +++ b/lnet/klnds/gmlnd/gmlnd.h @@ -45,6 +45,7 @@ #include "linux/init.h" #include "linux/sem.h" #include "linux/vmalloc.h" +#include "linux/sysctl.h" #define DEBUG_SUBSYSTEM S_GMNAL @@ -80,9 +81,14 @@ extern int gmnal_small_msg_size; extern int num_rx_threads; extern int num_stxds; +extern int gm_port; #define GMNAL_SMALL_MSG_SIZE(a) a->small_msg_size #define GMNAL_IS_SMALL_MESSAGE(n,a,b,c) gmnal_is_small_msg(n, a, b, c) #define GMNAL_MAGIC 0x1234abcd +/* + * The gm_port to use for gmnal + */ +#define GMNAL_GM_PORT gm_port /* @@ -218,6 +224,7 @@ typedef struct _gmnal_data_t { gmnal_rxtwe_t *rxtwe_tail; spinlock_t rxtwe_lock; struct semaphore rxtwe_wait; + struct ctl_table_header *sysctl; } gmnal_data_t; /* @@ -234,11 +241,6 @@ typedef struct _gmnal_data_t { extern gmnal_data_t *global_nal_data; /* - * The gm_port to use for gmnal - */ -#define GMNAL_GM_PORT 4 - -/* * for ioctl get pid */ #define GMNAL_IOC_GET_GNID 1 @@ -353,6 +355,8 @@ int gmnal_cb_read(nal_cb_t *, void *private, void *, user_ptr, size_t); int gmnal_cb_write(nal_cb_t *, void *private, user_ptr, void *, size_t); +int gmnal_cb_callback(nal_cb_t *, void *, lib_eq_t *, ptl_event_t *); + void *gmnal_cb_malloc(nal_cb_t *, size_t); void gmnal_cb_free(nal_cb_t *, void *, size_t); @@ -382,7 +386,7 @@ void gmnal_fini(void); a->cb_recv_pages = gmnal_cb_recv_pages; \ a->cb_read = gmnal_cb_read; \ a->cb_write = gmnal_cb_write; \ - a->cb_callback = NULL; \ + a->cb_callback = gmnal_cb_callback; \ a->cb_malloc = gmnal_cb_malloc; \ a->cb_free = gmnal_cb_free; \ a->cb_map = NULL; \ @@ -418,6 +422,7 @@ void gmnal_stop_rxthread(gmnal_data_t *); void 
gmnal_stop_ctthread(gmnal_data_t *); void gmnal_small_tx_callback(gm_port_t *, void *, gm_status_t); void gmnal_drop_sends_callback(gm_port_t *, void *, gm_status_t); +void gmnal_resume_sending_callback(gm_port_t *, void *, gm_status_t); char *gmnal_gm_error(gm_status_t); char *gmnal_rxevent(gm_recv_event_t*); int gmnal_is_small_msg(gmnal_data_t*, int, struct iovec*, int); diff --git a/lnet/klnds/gmlnd/gmlnd_api.c b/lnet/klnds/gmlnd/gmlnd_api.c index 1cb1317..1442aa7 100644 --- a/lnet/klnds/gmlnd/gmlnd_api.c +++ b/lnet/klnds/gmlnd/gmlnd_api.c @@ -25,7 +25,36 @@ #include "gmnal.h" + + gmnal_data_t *global_nal_data = NULL; +#define GLOBAL_NID_STR_LEN 16 +char global_nid_str[GLOBAL_NID_STR_LEN] = {0}; + +/* + * Write the global nid /proc/sys/gmnal/globalnid + */ +#define GMNAL_SYSCTL 201 +#define GMNAL_SYSCTL_GLOBALNID 1 + +static ctl_table gmnal_sysctl_table[] = { + {GMNAL_SYSCTL_GLOBALNID, "globalnid", + global_nid_str, GLOBAL_NID_STR_LEN, + 0444, NULL, &proc_dostring}, + { 0 } +}; + + +static ctl_table gmnalnal_top_sysctl_table[] = { + {GMNAL_SYSCTL, "gmnal", NULL, 0, 0555, gmnal_sysctl_table}, + { 0 } +}; + + + + + + /* * gmnal_api_forward * This function takes a pack block of arguments from the NAL API @@ -193,8 +222,8 @@ gmnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size, ptl_pid_t portals_pid = 0; - CDEBUG(D_TRACE, "gmnal_init : interface [%d], ptl_size [%d], - ac_size[%d]\n", interface, ptl_size, ac_size); + CDEBUG(D_TRACE, "gmnal_init : interface [%d], ptl_size [%d], " + "ac_size[%d]\n", interface, ptl_size, ac_size); PORTAL_ALLOC(nal_data, sizeof(gmnal_data_t)); @@ -255,8 +284,8 @@ gmnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size, } - CDEBUG(D_NET, "Calling gm_open with interface [%d], port [%d], - name [%s], version [%d]\n", interface, GMNAL_GM_PORT, + CDEBUG(D_NET, "Calling gm_open with interface [%d], port [%d], " + "name [%s], version [%d]\n", interface, GMNAL_GM_PORT, "gmnal", GM_API_VERSION); 
GMNAL_GM_LOCK(nal_data); @@ -280,15 +309,15 @@ gmnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size, CDEBUG(D_ERROR, "gm_open Failure. No such device\n"); break; case(GM_INCOMPATIBLE_LIB_AND_DRIVER): - CDEBUG(D_ERROR, "gm_open Failure. Incompatile lib - and driver\n"); + CDEBUG(D_ERROR, "gm_open Failure. Incompatile lib " + "and driver\n"); break; case(GM_OUT_OF_MEMORY): CDEBUG(D_ERROR, "gm_open Failure. Out of Memory\n"); break; default: - CDEBUG(D_ERROR, "gm_open Failure. Unknow error - code [%d]\n", gm_status); + CDEBUG(D_ERROR, "gm_open Failure. Unknow error " + "code [%d]\n", gm_status); break; } GMNAL_GM_LOCK(nal_data); @@ -403,6 +432,7 @@ gmnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size, } CDEBUG(D_INFO, "Global node id is [%u]\n", global_nid); nal_data->gm_global_nid = global_nid; + snprintf(global_nid_str, GLOBAL_NID_STR_LEN, "%u", global_nid); /* pid = gm_getpid(); @@ -429,6 +459,9 @@ gmnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size, return(NULL); } + nal_data->sysctl = NULL; + nal_data->sysctl = register_sysctl_table (gmnalnal_top_sysctl_table, 0); + CDEBUG(D_INFO, "gmnal_init finished\n"); global_nal_data = nal->nal_data; @@ -459,6 +492,8 @@ void gmnal_fini() gm_close(nal_data->gm_port); gm_finalize(); GMNAL_GM_UNLOCK(nal_data); + if (nal_data->sysctl) + unregister_sysctl_table (nal_data->sysctl); PORTAL_FREE(nal, sizeof(nal_t)); PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); PORTAL_FREE(nal_cb, sizeof(nal_cb_t)); diff --git a/lnet/klnds/gmlnd/gmlnd_cb.c b/lnet/klnds/gmlnd/gmlnd_cb.c index e055242..1f287468 100644 --- a/lnet/klnds/gmlnd/gmlnd_cb.c +++ b/lnet/klnds/gmlnd/gmlnd_cb.c @@ -35,8 +35,8 @@ int gmnal_cb_recv(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, int status = PTL_OK; - CDEBUG(D_TRACE, "gmnal_cb_recv nal_cb [%p], private[%p], cookie[%p], - niov[%d], iov [%p], mlen["LPSZ"], rlen["LPSZ"]\n", + CDEBUG(D_TRACE, "gmnal_cb_recv nal_cb [%p], private[%p], cookie[%p], " + 
"niov[%d], iov [%p], mlen["LPSZ"], rlen["LPSZ"]\n", nal_cb, private, cookie, niov, iov, mlen, rlen); switch(srxd->type) { @@ -64,10 +64,11 @@ int gmnal_cb_recv_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, int status = PTL_OK; struct iovec *iovec = NULL, *iovec_dup = NULL; int i = 0; + ptl_kiov_t *kiov_dup = kiov;; - CDEBUG(D_TRACE, "gmnal_cb_recv_pages nal_cb [%p],private[%p], - cookie[%p], kniov[%d], kiov [%p], mlen["LPSZ"], rlen["LPSZ"]\n", + CDEBUG(D_TRACE, "gmnal_cb_recv_pages nal_cb [%p],private[%p], " + "cookie[%p], kniov[%d], kiov [%p], mlen["LPSZ"], rlen["LPSZ"]\n", nal_cb, private, cookie, kniov, kiov, mlen, rlen); if (srxd->type == GMNAL_SMALL_MESSAGE) { @@ -99,6 +100,10 @@ int gmnal_cb_recv_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, CDEBUG(D_INFO, "calling gmnal_small_rx\n"); status = gmnal_small_rx(nal_cb, private, cookie, kniov, iovec_dup, mlen, rlen); + for (i=0; i<kniov; i++) { + kunmap(kiov_dup->kiov_page); + kiov_dup++; + } PORTAL_FREE(iovec_dup, sizeof(struct iovec)*kniov); } @@ -126,6 +131,7 @@ int gmnal_cb_send(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, niov, iov, len); } else { CDEBUG(D_ERROR, "Large message send it is not supported\n"); + lib_finalize(nal_cb, private, cookie, PTL_FAIL); return(PTL_FAIL); gmnal_large_tx(nal_cb, private, cookie, hdr, type, nid, pid, niov, iov, len); @@ -140,6 +146,7 @@ int gmnal_cb_send_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, int i = 0; gmnal_data_t *nal_data; struct iovec *iovec = NULL, *iovec_dup = NULL; + ptl_kiov_t *kiov_dup = kiov; CDEBUG(D_TRACE, "gmnal_cb_send_pages nid ["LPU64"] niov[%d] len["LPSZ"]\n", nid, kniov, len); nal_data = nal_cb->nal_data; @@ -181,6 +188,10 @@ int gmnal_cb_send_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, gmnal_large_tx(nal_cb, private, cookie, hdr, type, nid, pid, kniov, iovec, len); } + for (i=0; i<kniov; i++) { + kunmap(kiov_dup->kiov_page); + kiov_dup++; + } PORTAL_FREE(iovec_dup, kniov*sizeof(struct iovec)); return(PTL_OK); } @@ -199,6 +210,18 @@ int 
gmnal_cb_write(nal_cb_t *nal_cb, void *private, user_ptr dst, return(PTL_OK); } +int gmnal_cb_callback(nal_cb_t *nal_cb, void *private, lib_eq_t *eq, + ptl_event_t *ev) +{ + + if (eq->event_callback != NULL) { + CDEBUG(D_INFO, "found callback\n"); + eq->event_callback(ev); + } + + return(PTL_OK); +} + void *gmnal_cb_malloc(nal_cb_t *nal_cb, size_t len) { void *ptr = NULL; diff --git a/lnet/klnds/gmlnd/gmlnd_comm.c b/lnet/klnds/gmlnd/gmlnd_comm.c index a0d3530..1bcd9bd 100644 --- a/lnet/klnds/gmlnd/gmlnd_comm.c +++ b/lnet/klnds/gmlnd/gmlnd_comm.c @@ -203,14 +203,14 @@ gmnal_pre_receive(gmnal_data_t *nal_data, gmnal_rxtwe_t *we, int gmnal_type) gmnal_msghdr = (gmnal_msghdr_t*)buffer; portals_hdr = (ptl_hdr_t*)(buffer+GMNAL_MSGHDR_SIZE); - CDEBUG(D_INFO, "rx_event:: Sender node [%d], Sender Port [%d], - type [%d], length [%d], buffer [%p]\n", + CDEBUG(D_INFO, "rx_event:: Sender node [%d], Sender Port [%d], " + "type [%d], length [%d], buffer [%p]\n", snode, sport, type, length, buffer); - CDEBUG(D_INFO, "gmnal_msghdr:: Sender node [%u], magic [%d], - gmnal_type [%d]\n", gmnal_msghdr->sender_node_id, + CDEBUG(D_INFO, "gmnal_msghdr:: Sender node [%u], magic [%d], " + "gmnal_type [%d]\n", gmnal_msghdr->sender_node_id, gmnal_msghdr->magic, gmnal_msghdr->type); - CDEBUG(D_INFO, "portals_hdr:: Sender node ["LPD64"], - dest_node ["LPD64"]\n", portals_hdr->src_nid, + CDEBUG(D_INFO, "portals_hdr:: Sender node ["LPD64"], " + "dest_node ["LPD64"]\n", portals_hdr->src_nid, portals_hdr->dest_nid); @@ -321,6 +321,7 @@ gmnal_small_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, if (!private) { CDEBUG(D_ERROR, "gmnal_small_rx no context\n"); + lib_finalize(nal_cb, private, cookie, PTL_FAIL); return(PTL_FAIL); } @@ -343,7 +344,6 @@ gmnal_small_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, */ CDEBUG(D_PORTALS, "calling lib_finalize\n"); lib_finalize(nal_cb, private, cookie, PTL_OK); - /* * return buffer so it can be used again */ @@ -377,9 +377,9 @@ 
gmnal_small_tx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, unsigned int local_nid; gm_status_t gm_status = GM_SUCCESS; - CDEBUG(D_TRACE, "gmnal_small_tx nal_cb [%p] private [%p] cookie [%p] - hdr [%p] type [%d] global_nid ["LPU64"] pid [%d] niov [%d] - iov [%p] size [%d]\n", nal_cb, private, cookie, hdr, type, + CDEBUG(D_TRACE, "gmnal_small_tx nal_cb [%p] private [%p] cookie [%p] " + "hdr [%p] type [%d] global_nid ["LPU64"] pid [%d] niov [%d] " + "iov [%p] size [%d]\n", nal_cb, private, cookie, hdr, type, global_nid, pid, niov, iov, size); CDEBUG(D_INFO, "portals_hdr:: dest_nid ["LPU64"], src_nid ["LPU64"]\n", @@ -440,9 +440,9 @@ gmnal_small_tx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, stxd->msg_size = tot_size; - CDEBUG(D_NET, "Calling gm_send_to_peer port [%p] buffer [%p] - gmsize [%lu] msize [%d] global_nid ["LPU64"] local_nid[%d] - stxd [%p]\n", nal_data->gm_port, stxd->buffer, stxd->gm_size, + CDEBUG(D_NET, "Calling gm_send_to_peer port [%p] buffer [%p] " + "gmsize [%lu] msize [%d] global_nid ["LPU64"] local_nid[%d] " + "stxd [%p]\n", nal_data->gm_port, stxd->buffer, stxd->gm_size, stxd->msg_size, global_nid, local_nid, stxd); GMNAL_GM_LOCK(nal_data); @@ -493,8 +493,8 @@ gmnal_small_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status) /* * do a resend on the dropped ones */ - CDEBUG(D_ERROR, "send stxd [%p] was dropped - resending\n", context); + CDEBUG(D_ERROR, "send stxd [%p] was dropped " + "resending\n", context); GMNAL_GM_LOCK(nal_data); gm_send_to_peer_with_callback(nal_data->gm_port, stxd->buffer, @@ -569,6 +569,11 @@ gmnal_small_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status) case(GM_YP_NO_MATCH): default: CDEBUG(D_ERROR, "Unknown send error\n"); + gm_resume_sending(nal_data->gm_port, stxd->gm_priority, + stxd->gm_target_node, GMNAL_GM_PORT, + gmnal_resume_sending_callback, context); + return; + } /* @@ -588,10 +593,22 @@ gmnal_small_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status) } 
gmnal_return_stxd(nal_data, stxd); lib_finalize(nal_cb, stxd, cookie, PTL_OK); - return; } +/* + * After an error on the port + * call this to allow future sends to complete + */ +void gmnal_resume_sending_callback(struct gm_port *gm_port, void *context, + gm_status_t status) +{ + gmnal_data_t *nal_data; + gmnal_stxd_t *stxd = (gmnal_stxd_t*)context; + CDEBUG(D_TRACE, "status is [%d] context is [%p]\n", status, context); + gmnal_return_stxd(stxd->nal_data, stxd); + return; +} void gmnal_drop_sends_callback(struct gm_port *gm_port, void *context, @@ -611,8 +628,8 @@ void gmnal_drop_sends_callback(struct gm_port *gm_port, void *context, context); GMNAL_GM_LOCK(nal_data); } else { - CDEBUG(D_ERROR, "send_to_peer status for stxd [%p] is - [%d][%s]\n", stxd, status, gmnal_gm_error(status)); + CDEBUG(D_ERROR, "send_to_peer status for stxd [%p] is " + "[%d][%s]\n", stxd, status, gmnal_gm_error(status)); } @@ -644,9 +661,9 @@ gmnal_large_tx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, int niov_dup; - CDEBUG(D_TRACE, "gmnal_large_tx nal_cb [%p] private [%p], cookie [%p] - hdr [%p], type [%d] global_nid ["LPU64"], pid [%d], niov [%d], - iov [%p], size [%d]\n", nal_cb, private, cookie, hdr, type, + CDEBUG(D_TRACE, "gmnal_large_tx nal_cb [%p] private [%p], cookie [%p] " + "hdr [%p], type [%d] global_nid ["LPU64"], pid [%d], niov [%d], " + "iov [%p], size [%d]\n", nal_cb, private, cookie, hdr, type, global_nid, pid, niov, iov, size); if (nal_cb) @@ -729,8 +746,8 @@ gmnal_large_tx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, iov->iov_base, iov->iov_len); if (gm_status != GM_SUCCESS) { GMNAL_GM_UNLOCK(nal_data); - CDEBUG(D_ERROR, "gm_register_memory returns [%d][%s] - for memory [%p] len ["LPSZ"]\n", + CDEBUG(D_ERROR, "gm_register_memory returns [%d][%s] " + "for memory [%p] len ["LPSZ"]\n", gm_status, gmnal_gm_error(gm_status), iov->iov_base, iov->iov_len); GMNAL_GM_LOCK(nal_data); @@ -806,12 +823,13 @@ gmnal_large_rx(nal_cb_t *nal_cb, void *private, lib_msg_t 
*cookie, gmnal_msghdr_t *msghdr = NULL; gm_status_t gm_status; - CDEBUG(D_TRACE, "gmnal_large_rx :: nal_cb[%p], private[%p], - cookie[%p], niov[%d], iov[%p], mlen["LPSZ"], rlen["LPSZ"]\n", + CDEBUG(D_TRACE, "gmnal_large_rx :: nal_cb[%p], private[%p], " + "cookie[%p], niov[%d], iov[%p], mlen["LPSZ"], rlen["LPSZ"]\n", nal_cb, private, cookie, nriov, riov, mlen, rlen); if (!srxd) { CDEBUG(D_ERROR, "gmnal_large_rx no context\n"); + lib_finalize(nal_cb, private, cookie, PTL_FAIL); return(PTL_FAIL); } @@ -846,8 +864,8 @@ gmnal_large_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, riov->iov_base, riov->iov_len); if (gm_status != GM_SUCCESS) { GMNAL_GM_UNLOCK(nal_data); - CDEBUG(D_ERROR, "gm_register_memory returns [%d][%s] - for memory [%p] len ["LPSZ"]\n", + CDEBUG(D_ERROR, "gm_register_memory returns [%d][%s] " + "for memory [%p] len ["LPSZ"]\n", gm_status, gmnal_gm_error(gm_status), riov->iov_base, riov->iov_len); GMNAL_GM_LOCK(nal_data); @@ -902,8 +920,8 @@ gmnal_remote_get(gmnal_srxd_t *srxd, int nsiov, struct iovec *siov, int ncalls = 0; - CDEBUG(D_TRACE, "gmnal_remote_get srxd[%p], nriov[%d], riov[%p], - nsiov[%d], siov[%p]\n", srxd, nriov, riov, nsiov, siov); + CDEBUG(D_TRACE, "gmnal_remote_get srxd[%p], nriov[%d], riov[%p], " + "nsiov[%d], siov[%p]\n", srxd, nriov, riov, nsiov, siov); ncalls = gmnal_copyiov(0, srxd, nsiov, siov, nriov, riov); @@ -958,8 +976,8 @@ gmnal_copyiov(int do_copy, gmnal_srxd_t *srxd, int nsiov, srxd->gm_source_node, &source_node) != GM_SUCCESS) { - CDEBUG(D_ERROR, "cannot resolve global_id [%u] - to local node_id\n", srxd->gm_source_node); + CDEBUG(D_ERROR, "cannot resolve global_id [%u] " + "to local node_id\n", srxd->gm_source_node); GMNAL_GM_UNLOCK(nal_data); return(GMNAL_STATUS_FAIL); } @@ -1201,9 +1219,9 @@ gmnal_large_tx_ack(gmnal_data_t *nal_data, gmnal_srxd_t *srxd) stxd->msg_size= sizeof(gmnal_msghdr_t); - CDEBUG(D_NET, "Calling gm_send_to_peer port [%p] buffer [%p] - gmsize [%lu] msize [%d] global_nid [%u] local_nid[%d] - 
stxd [%p]\n", nal_data->gm_port, stxd->buffer, stxd->gm_size, + CDEBUG(D_NET, "Calling gm_send_to_peer port [%p] buffer [%p] " + "gmsize [%lu] msize [%d] global_nid [%u] local_nid[%d] " + "stxd [%p]\n", nal_data->gm_port, stxd->buffer, stxd->gm_size, stxd->msg_size, srxd->gm_source_node, local_nid, stxd); GMNAL_GM_LOCK(nal_data); stxd->gm_priority = GM_LOW_PRIORITY; diff --git a/lnet/klnds/gmlnd/gmlnd_module.c b/lnet/klnds/gmlnd/gmlnd_module.c index 1260629..31f6819 100644 --- a/lnet/klnds/gmlnd/gmlnd_module.c +++ b/lnet/klnds/gmlnd/gmlnd_module.c @@ -30,6 +30,7 @@ int gmnal_small_msg_size = 525312; */ int num_rx_threads = -1; int num_stxds = 5; +int gm_port = 4; ptl_handle_ni_t kgmnal_ni; @@ -139,6 +140,7 @@ EXPORT_SYMBOL(kgmnal_ni); MODULE_PARM(gmnal_small_msg_size, "i"); MODULE_PARM(num_rx_threads, "i"); MODULE_PARM(num_stxds, "i"); +MODULE_PARM(gm_port, "i"); MODULE_AUTHOR("Morgan Doyle"); diff --git a/lnet/klnds/gmlnd/gmlnd_utils.c b/lnet/klnds/gmlnd/gmlnd_utils.c index 55606f3..6a52319 100644 --- a/lnet/klnds/gmlnd/gmlnd_utils.c +++ b/lnet/klnds/gmlnd/gmlnd_utils.c @@ -117,8 +117,8 @@ gmnal_alloc_txd(gmnal_data_t *nal_data) GMNAL_SMALL_MSG_SIZE(nal_data)); GMNAL_GM_UNLOCK(nal_data); if (!txbuffer) { - CDEBUG(D_ERROR, "Failed to gm_dma_malloc txbuffer [%d], - size [%d]\n", i, + CDEBUG(D_ERROR, "Failed to gm_dma_malloc txbuffer [%d]," + " size [%d]\n", i, GMNAL_SMALL_MSG_SIZE(nal_data)); PORTAL_FREE(txd, sizeof(gmnal_stxd_t)); return(GMNAL_STATUS_FAIL); @@ -131,8 +131,8 @@ gmnal_alloc_txd(gmnal_data_t *nal_data) txd->next = nal_data->stxd; nal_data->stxd = txd; - CDEBUG(D_INFO, "Registered txd [%p] with buffer [%p], - size [%d]\n", txd, txd->buffer, txd->buffer_size); + CDEBUG(D_INFO, "Registered txd [%p] with buffer [%p], " + "size [%d]\n", txd, txd->buffer, txd->buffer_size); } for (i=0; i<=nrxt_stx; i++) { @@ -146,8 +146,8 @@ gmnal_alloc_txd(gmnal_data_t *nal_data) GMNAL_SMALL_MSG_SIZE(nal_data)); GMNAL_GM_UNLOCK(nal_data); if (!txbuffer) { - CDEBUG(D_ERROR, 
"Failed to gm_dma_malloc txbuffer [%d], - size [%d]\n", i, + CDEBUG(D_ERROR, "Failed to gm_dma_malloc txbuffer [%d]," + " size [%d]\n", i, GMNAL_SMALL_MSG_SIZE(nal_data)); PORTAL_FREE(txd, sizeof(gmnal_stxd_t)); return(GMNAL_STATUS_FAIL); @@ -160,8 +160,8 @@ gmnal_alloc_txd(gmnal_data_t *nal_data) txd->next = nal_data->rxt_stxd; nal_data->rxt_stxd = txd; - CDEBUG(D_INFO, "Registered txd [%p] with buffer [%p], - size [%d]\n", txd, txd->buffer, txd->buffer_size); + CDEBUG(D_INFO, "Registered txd [%p] with buffer [%p], " + "size [%d]\n", txd, txd->buffer, txd->buffer_size); } /* @@ -187,8 +187,8 @@ gmnal_free_txd(gmnal_data_t *nal_data) CDEBUG(D_TRACE, "gmnal_free_small tx\n"); while(txd) { - CDEBUG(D_INFO, "Freeing txd [%p] with buffer [%p], - size [%d]\n", txd, txd->buffer, txd->buffer_size); + CDEBUG(D_INFO, "Freeing txd [%p] with buffer [%p], " + "size [%d]\n", txd, txd->buffer, txd->buffer_size); _txd = txd; txd = txd->next; GMNAL_GM_LOCK(nal_data); @@ -198,8 +198,8 @@ gmnal_free_txd(gmnal_data_t *nal_data) } txd = nal_data->rxt_stxd; while(txd) { - CDEBUG(D_INFO, "Freeing txd [%p] with buffer [%p], - size [%d]\n", txd, txd->buffer, txd->buffer_size); + CDEBUG(D_INFO, "Freeing txd [%p] with buffer [%p], " + "size [%d]\n", txd, txd->buffer, txd->buffer_size); _txd = txd; txd = txd->next; GMNAL_GM_LOCK(nal_data); @@ -392,22 +392,22 @@ gmnal_alloc_srxd(gmnal_data_t *nal_data) #if 0 PORTAL_ALLOC(rxbuffer, GMNAL_SMALL_MSG_SIZE(nal_data)); if (!rxbuffer) { - CDEBUG(D_ERROR, "Failed to malloc rxbuffer [%d], - size [%d]\n", i, + CDEBUG(D_ERROR, "Failed to malloc rxbuffer [%d], " + "size [%d]\n", i, GMNAL_SMALL_MSG_SIZE(nal_data)); PORTAL_FREE(rxd, sizeof(gmnal_srxd_t)); return(GMNAL_STATUS_FAIL); } - CDEBUG(D_NET, "Calling gm_register_memory with port [%p] - rxbuffer [%p], size [%d]\n", nal_data->gm_port, + CDEBUG(D_NET, "Calling gm_register_memory with port [%p] " + "rxbuffer [%p], size [%d]\n", nal_data->gm_port, rxbuffer, GMNAL_SMALL_MSG_SIZE(nal_data)); 
GMNAL_GM_LOCK(nal_data); gm_status = gm_register_memory(nal_data->gm_port, rxbuffer, GMNAL_SMALL_MSG_SIZE(nal_data)); GMNAL_GM_UNLOCK(nal_data); if (gm_status != GM_SUCCESS) { - CDEBUG(D_ERROR, "gm_register_memory failed buffer [%p], - index [%d]\n", rxbuffer, i); + CDEBUG(D_ERROR, "gm_register_memory failed buffer [%p]," + " index [%d]\n", rxbuffer, i); switch(gm_status) { case(GM_FAILURE): CDEBUG(D_ERROR, "GM_FAILURE\n"); @@ -432,8 +432,8 @@ gmnal_alloc_srxd(gmnal_data_t *nal_data) GMNAL_SMALL_MSG_SIZE(nal_data)); GMNAL_GM_UNLOCK(nal_data); if (!rxbuffer) { - CDEBUG(D_ERROR, "Failed to gm_dma_malloc rxbuffer [%d], - size [%d]\n", i, + CDEBUG(D_ERROR, "Failed to gm_dma_malloc rxbuffer [%d]," + " size [%d]\n", i, GMNAL_SMALL_MSG_SIZE(nal_data)); PORTAL_FREE(rxd, sizeof(gmnal_srxd_t)); return(GMNAL_STATUS_FAIL); @@ -447,15 +447,15 @@ gmnal_alloc_srxd(gmnal_data_t *nal_data) if (gm_hash_insert(nal_data->srxd_hash, (void*)rxbuffer, (void*)rxd)) { - CDEBUG(D_ERROR, "failed to create hash entry rxd[%p] - for rxbuffer[%p]\n", rxd, rxbuffer); + CDEBUG(D_ERROR, "failed to create hash entry rxd[%p] " + "for rxbuffer[%p]\n", rxd, rxbuffer); return(GMNAL_STATUS_FAIL); } rxd->next = nal_data->srxd; nal_data->srxd = rxd; - CDEBUG(D_INFO, "Registered rxd [%p] with buffer [%p], - size [%d]\n", rxd, rxd->buffer, rxd->size); + CDEBUG(D_INFO, "Registered rxd [%p] with buffer [%p], " + "size [%d]\n", rxd, rxd->buffer, rxd->size); } return(GMNAL_STATUS_OK); @@ -623,6 +623,8 @@ gmnal_stop_ctthread(gmnal_data_t *nal_data) char * gmnal_gm_error(gm_status_t status) { + return(gm_strerror(status)); + switch(status) { case(GM_SUCCESS): return("SUCCESS"); @@ -972,7 +974,7 @@ gmnal_get_rxtwe(gmnal_data_t *nal_data) } spin_lock(&nal_data->rxtwe_lock); if (nal_data->rxtwe_head) { - CDEBUG(D_WARNING, "Got a work entry\n"); + CDEBUG(D_INFO, "Got a work entry\n"); we = nal_data->rxtwe_head; nal_data->rxtwe_head = we->next; if (!nal_data->rxtwe_head) @@ -983,7 +985,7 @@ gmnal_get_rxtwe(gmnal_data_t 
*nal_data) spin_unlock(&nal_data->rxtwe_lock); } while (!we); - CDEBUG(D_WARNING, "Returning we[%p]\n", we); + CDEBUG(D_INFO, "Returning we[%p]\n", we); return(we); } diff --git a/lnet/klnds/qswlnd/qswlnd.c b/lnet/klnds/qswlnd/qswlnd.c index 90c9a95..3b3b5d4 100644 --- a/lnet/klnds/qswlnd/qswlnd.c +++ b/lnet/klnds/qswlnd/qswlnd.c @@ -348,10 +348,10 @@ kqswnal_finalise (void) for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++) { kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i]; - /* If krx_pages[0] got allocated, it got mapped. + /* If krx_kiov[0].kiov_page got allocated, it got mapped. * NB subsequent pages get merged */ - if (krx->krx_pages[0] != NULL) + if (krx->krx_kiov[0].kiov_page != NULL) ep_dvma_unload(kqswnal_data.kqn_ep, kqswnal_data.kqn_ep_rx_nmh, &krx->krx_elanbuffer); @@ -416,8 +416,8 @@ kqswnal_finalise (void) kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i]; for (j = 0; j < krx->krx_npages; j++) - if (krx->krx_pages[j] != NULL) - __free_page (krx->krx_pages[j]); + if (krx->krx_kiov[j].kiov_page != NULL) + __free_page (krx->krx_kiov[j].kiov_page); } PORTAL_FREE(kqswnal_data.kqn_rxds, @@ -709,18 +709,19 @@ kqswnal_initialise (void) LASSERT (krx->krx_npages > 0); for (j = 0; j < krx->krx_npages; j++) { - krx->krx_pages[j] = alloc_page(GFP_KERNEL); - if (krx->krx_pages[j] == NULL) - { + struct page *page = alloc_page(GFP_KERNEL); + + if (page == NULL) { kqswnal_finalise (); return (-ENOMEM); } - LASSERT(page_address(krx->krx_pages[j]) != NULL); + krx->krx_kiov[j].kiov_page = page; + LASSERT(page_address(page) != NULL); #if MULTIRAIL_EKC ep_dvma_load(kqswnal_data.kqn_ep, NULL, - page_address(krx->krx_pages[j]), + page_address(page), PAGE_SIZE, kqswnal_data.kqn_ep_rx_nmh, elan_page_idx, &all_rails, &elanbuffer); @@ -736,7 +737,7 @@ kqswnal_initialise (void) #else elan3_dvma_kaddr_load(kqswnal_data.kqn_ep->DmaState, kqswnal_data.kqn_eprxdmahandle, - page_address(krx->krx_pages[j]), + page_address(page), PAGE_SIZE, elan_page_idx, &elanbuffer); if (j == 0) 
diff --git a/lnet/klnds/qswlnd/qswlnd.h b/lnet/klnds/qswlnd/qswlnd.h index b1b9a45..5ebf30a 100644 --- a/lnet/klnds/qswlnd/qswlnd.h +++ b/lnet/klnds/qswlnd/qswlnd.h @@ -153,8 +153,7 @@ typedef struct int krx_rpc_reply_sent; /* rpc reply sent */ atomic_t krx_refcount; /* how to tell when rpc is done */ kpr_fwd_desc_t krx_fwd; /* embedded forwarding descriptor */ - struct page *krx_pages[KQSW_NRXMSGPAGES_LARGE]; /* pages allocated */ - struct iovec krx_iov[KQSW_NRXMSGPAGES_LARGE]; /* iovec for forwarding */ + ptl_kiov_t krx_kiov[KQSW_NRXMSGPAGES_LARGE]; /* buffer frags */ } kqswnal_rx_t; typedef struct diff --git a/lnet/klnds/qswlnd/qswlnd_cb.c b/lnet/klnds/qswlnd/qswlnd_cb.c index 478c25f..157dc70 100644 --- a/lnet/klnds/qswlnd/qswlnd_cb.c +++ b/lnet/klnds/qswlnd/qswlnd_cb.c @@ -775,7 +775,7 @@ kqswnal_dma_reply (kqswnal_tx_t *ktx, int nfrag, int offset, int nob) { kqswnal_rx_t *krx = (kqswnal_rx_t *)ktx->ktx_args[0]; - char *buffer = (char *)page_address(krx->krx_pages[0]); + char *buffer = (char *)page_address(krx->krx_kiov[0].kiov_page); kqswnal_remotemd_t *rmd = (kqswnal_remotemd_t *)(buffer + KQSW_HDR_SIZE); int rc; #if MULTIRAIL_EKC @@ -1008,7 +1008,7 @@ kqswnal_sendmsg (nal_cb_t *nal, } memcpy(ktx->ktx_buffer + sizeof(*hdr) + sizeof(csum), &csum, sizeof(csum)); #endif - + if (kqswnal_data.kqn_optimized_gets && type == PTL_MSG_GET && /* doing a GET */ nid == targetnid) { /* not forwarding */ @@ -1167,7 +1167,7 @@ kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) { int rc; kqswnal_tx_t *ktx; - struct iovec *iov = fwd->kprfd_iov; + ptl_kiov_t *kiov = fwd->kprfd_kiov; int niov = fwd->kprfd_niov; int nob = fwd->kprfd_nob; ptl_nid_t nid = fwd->kprfd_gateway_nid; @@ -1177,11 +1177,9 @@ kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) LBUG (); #endif /* The router wants this NAL to forward a packet */ - CDEBUG (D_NET, "forwarding [%p] to "LPX64", %d frags %d bytes\n", + CDEBUG (D_NET, "forwarding [%p] to "LPX64", payload: %d frags %d bytes\n", fwd, nid, niov, 
nob); - LASSERT (niov > 0); - ktx = kqswnal_get_idle_tx (fwd, 0); if (ktx == NULL) /* can't get txd right now */ return; /* fwd will be scheduled when tx desc freed */ @@ -1195,44 +1193,44 @@ kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) goto failed; } - if (nob > KQSW_NRXMSGBYTES_LARGE) { - CERROR ("Can't forward [%p] to "LPX64 - ": size %d bigger than max packet size %ld\n", - fwd, nid, nob, (long)KQSW_NRXMSGBYTES_LARGE); - rc = -EMSGSIZE; - goto failed; - } + /* copy hdr into pre-mapped buffer */ + memcpy(ktx->ktx_buffer, fwd->kprfd_hdr, sizeof(ptl_hdr_t)); + ktx->ktx_wire_hdr = (ptl_hdr_t *)ktx->ktx_buffer; - ktx->ktx_port = (nob <= (KQSW_HDR_SIZE + KQSW_SMALLPAYLOAD)) ? + ktx->ktx_port = (nob <= KQSW_SMALLPAYLOAD) ? EP_MSG_SVC_PORTALS_SMALL : EP_MSG_SVC_PORTALS_LARGE; ktx->ktx_nid = nid; ktx->ktx_state = KTX_FORWARDING; ktx->ktx_args[0] = fwd; + ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 1; - if ((kqswnal_data.kqn_copy_small_fwd || niov > 1) && - nob <= KQSW_TX_BUFFER_SIZE) + if (nob <= KQSW_TX_MAXCONTIG) { - /* send from ktx's pre-mapped contiguous buffer? 
*/ - lib_copy_iov2buf (ktx->ktx_buffer, niov, iov, 0, nob); + /* send payload from ktx's pre-mapped contiguous buffer */ #if MULTIRAIL_EKC ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, - 0, nob); + 0, KQSW_HDR_SIZE + nob); #else ktx->ktx_frags[0].Base = ktx->ktx_ebuffer; - ktx->ktx_frags[0].Len = nob; + ktx->ktx_frags[0].Len = KQSW_HDR_SIZE + nob; #endif - ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 1; - ktx->ktx_wire_hdr = (ptl_hdr_t *)ktx->ktx_buffer; + if (nob > 0) + lib_copy_kiov2buf(ktx->ktx_buffer + KQSW_HDR_SIZE, + niov, kiov, 0, nob); } else { - /* zero copy */ - ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 0; - rc = kqswnal_map_tx_iov (ktx, 0, nob, niov, iov); + /* zero copy payload */ +#if MULTIRAIL_EKC + ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, + 0, KQSW_HDR_SIZE); +#else + ktx->ktx_frags[0].Base = ktx->ktx_ebuffer; + ktx->ktx_frags[0].Len = KQSW_HDR_SIZE; +#endif + rc = kqswnal_map_tx_kiov (ktx, 0, nob, niov, kiov); if (rc != 0) goto failed; - - ktx->ktx_wire_hdr = (ptl_hdr_t *)iov[0].iov_base; } rc = kqswnal_launch (ktx); @@ -1257,7 +1255,7 @@ kqswnal_fwd_callback (void *arg, int error) if (error != 0) { - ptl_hdr_t *hdr = (ptl_hdr_t *)page_address (krx->krx_pages[0]); + ptl_hdr_t *hdr = (ptl_hdr_t *)page_address (krx->krx_kiov[0].kiov_page); CERROR("Failed to route packet from "LPX64" to "LPX64": %d\n", NTOH__u64(hdr->src_nid), NTOH__u64(hdr->dest_nid),error); @@ -1371,8 +1369,9 @@ kqswnal_requeue_rx (kqswnal_rx_t *krx) void kqswnal_rx (kqswnal_rx_t *krx) { - ptl_hdr_t *hdr = (ptl_hdr_t *) page_address (krx->krx_pages[0]); + ptl_hdr_t *hdr = (ptl_hdr_t *) page_address(krx->krx_kiov[0].kiov_page); ptl_nid_t dest_nid = NTOH__u64 (hdr->dest_nid); + int payload_nob; int nob; int niov; @@ -1398,16 +1397,26 @@ kqswnal_rx (kqswnal_rx_t *krx) return; } - /* NB forwarding may destroy iov; rebuild every time */ - for (nob = krx->krx_nob, niov = 0; nob > 0; nob -= PAGE_SIZE, niov++) - { - LASSERT (niov < krx->krx_npages); - 
krx->krx_iov[niov].iov_base= page_address(krx->krx_pages[niov]); - krx->krx_iov[niov].iov_len = MIN(PAGE_SIZE, nob); + nob = payload_nob = krx->krx_nob - KQSW_HDR_SIZE; + niov = 0; + if (nob > 0) { + krx->krx_kiov[0].kiov_offset = KQSW_HDR_SIZE; + krx->krx_kiov[0].kiov_len = MIN(PAGE_SIZE - KQSW_HDR_SIZE, nob); + niov = 1; + nob -= PAGE_SIZE - KQSW_HDR_SIZE; + + while (nob > 0) { + LASSERT (niov < krx->krx_npages); + + krx->krx_kiov[niov].kiov_offset = 0; + krx->krx_kiov[niov].kiov_len = MIN(PAGE_SIZE, nob); + niov++; + nob -= PAGE_SIZE; + } } - kpr_fwd_init (&krx->krx_fwd, dest_nid, - krx->krx_nob, niov, krx->krx_iov, + kpr_fwd_init (&krx->krx_fwd, dest_nid, + hdr, payload_nob, niov, krx->krx_kiov, kqswnal_fwd_callback, krx); kpr_fwd_start (&kqswnal_data.kqn_router, &krx->krx_fwd); @@ -1471,7 +1480,7 @@ kqswnal_rxhandler(EP_RXD *rxd) void kqswnal_csum_error (kqswnal_rx_t *krx, int ishdr) { - ptl_hdr_t *hdr = (ptl_hdr_t *)page_address (krx->krx_pages[0]); + ptl_hdr_t *hdr = (ptl_hdr_t *)page_address (krx->krx_kiov[0].kiov_page); CERROR ("%s checksum mismatch %p: dnid "LPX64", snid "LPX64 ", dpid %d, spid %d, type %d\n", @@ -1526,6 +1535,7 @@ kqswnal_recvmsg (nal_cb_t *nal, size_t rlen) { kqswnal_rx_t *krx = (kqswnal_rx_t *)private; + char *buffer = page_address(krx->krx_kiov[0].kiov_page); int page; char *page_ptr; int page_nob; @@ -1535,8 +1545,7 @@ kqswnal_recvmsg (nal_cb_t *nal, #if KQSW_CHECKSUM kqsw_csum_t senders_csum; kqsw_csum_t payload_csum = 0; - kqsw_csum_t hdr_csum = kqsw_csum(0, page_address(krx->krx_pages[0]), - sizeof(ptl_hdr_t)); + kqsw_csum_t hdr_csum = kqsw_csum(0, buffer, sizeof(ptl_hdr_t)); size_t csum_len = mlen; int csum_frags = 0; int csum_nob = 0; @@ -1545,8 +1554,7 @@ kqswnal_recvmsg (nal_cb_t *nal, atomic_inc (&csum_counter); - memcpy (&senders_csum, ((char *)page_address (krx->krx_pages[0])) + - sizeof (ptl_hdr_t), sizeof (kqsw_csum_t)); + memcpy (&senders_csum, buffer + sizeof (ptl_hdr_t), sizeof (kqsw_csum_t)); if (senders_csum != 
hdr_csum) kqswnal_csum_error (krx, 1); #endif @@ -1567,8 +1575,7 @@ kqswnal_recvmsg (nal_cb_t *nal, if (mlen != 0) { page = 0; - page_ptr = ((char *) page_address(krx->krx_pages[0])) + - KQSW_HDR_SIZE; + page_ptr = buffer + KQSW_HDR_SIZE; page_nob = PAGE_SIZE - KQSW_HDR_SIZE; LASSERT (niov > 0); @@ -1621,7 +1628,7 @@ kqswnal_recvmsg (nal_cb_t *nal, { page++; LASSERT (page < krx->krx_npages); - page_ptr = page_address(krx->krx_pages[page]); + page_ptr = page_address(krx->krx_kiov[page].kiov_page); page_nob = PAGE_SIZE; } @@ -1649,8 +1656,8 @@ kqswnal_recvmsg (nal_cb_t *nal, } #if KQSW_CHECKSUM - memcpy (&senders_csum, ((char *)page_address (krx->krx_pages[0])) + - sizeof(ptl_hdr_t) + sizeof(kqsw_csum_t), sizeof(kqsw_csum_t)); + memcpy (&senders_csum, buffer + sizeof(ptl_hdr_t) + sizeof(kqsw_csum_t), + sizeof(kqsw_csum_t)); if (csum_len != rlen) CERROR("Unable to checksum data in user's buffer\n"); diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index c47dcb4..2c44b43 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -1388,6 +1388,7 @@ ksocknal_cmd(struct portals_cfg *pcfg, void * private) void ksocknal_free_fmbs (ksock_fmb_pool_t *p) { + int npages = p->fmp_buff_pages; ksock_fmb_t *fmb; int i; @@ -1399,12 +1400,12 @@ ksocknal_free_fmbs (ksock_fmb_pool_t *p) fmb = list_entry(p->fmp_idle_fmbs.next, ksock_fmb_t, fmb_list); - for (i = 0; i < fmb->fmb_npages; i++) - if (fmb->fmb_pages[i] != NULL) - __free_page(fmb->fmb_pages[i]); - + for (i = 0; i < npages; i++) + if (fmb->fmb_kiov[i].kiov_page != NULL) + __free_page(fmb->fmb_kiov[i].kiov_page); + list_del(&fmb->fmb_list); - PORTAL_FREE(fmb, sizeof(*fmb)); + PORTAL_FREE(fmb, offsetof(ksock_fmb_t, fmb_kiov[npages])); } } @@ -1603,10 +1604,12 @@ ksocknal_module_init (void) spin_lock_init(&ksocknal_data.ksnd_small_fmp.fmp_lock); INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_idle_fmbs); INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns); + 
ksocknal_data.ksnd_small_fmp.fmp_buff_pages = SOCKNAL_SMALL_FWD_PAGES; spin_lock_init(&ksocknal_data.ksnd_large_fmp.fmp_lock); INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_idle_fmbs); INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns); + ksocknal_data.ksnd_large_fmp.fmp_buff_pages = SOCKNAL_LARGE_FWD_PAGES; spin_lock_init (&ksocknal_data.ksnd_reaper_lock); INIT_LIST_HEAD (&ksocknal_data.ksnd_enomem_conns); @@ -1690,34 +1693,36 @@ ksocknal_module_init (void) for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS); i++) { - ksock_fmb_t *fmb; + ksock_fmb_t *fmb; + ksock_fmb_pool_t *pool; + + + if (i < SOCKNAL_SMALL_FWD_NMSGS) + pool = &ksocknal_data.ksnd_small_fmp; + else + pool = &ksocknal_data.ksnd_large_fmp; - PORTAL_ALLOC(fmb, sizeof(*fmb)); + PORTAL_ALLOC(fmb, offsetof(ksock_fmb_t, + fmb_kiov[pool->fmp_buff_pages])); if (fmb == NULL) { ksocknal_module_fini(); return (-ENOMEM); } - if (i < SOCKNAL_SMALL_FWD_NMSGS) { - fmb->fmb_npages = SOCKNAL_SMALL_FWD_PAGES; - fmb->fmb_pool = &ksocknal_data.ksnd_small_fmp; - } else { - fmb->fmb_npages = SOCKNAL_LARGE_FWD_PAGES; - fmb->fmb_pool = &ksocknal_data.ksnd_large_fmp; - } - - for (j = 0; j < fmb->fmb_npages; j++) { - fmb->fmb_pages[j] = alloc_page(GFP_KERNEL); + fmb->fmb_pool = pool; + + for (j = 0; j < pool->fmp_buff_pages; j++) { + fmb->fmb_kiov[j].kiov_page = alloc_page(GFP_KERNEL); - if (fmb->fmb_pages[j] == NULL) { + if (fmb->fmb_kiov[j].kiov_page == NULL) { ksocknal_module_fini (); return (-ENOMEM); } - LASSERT(page_address(fmb->fmb_pages[j]) != NULL); + LASSERT(page_address(fmb->fmb_kiov[j].kiov_page) != NULL); } - list_add(&fmb->fmb_list, &fmb->fmb_pool->fmp_idle_fmbs); + list_add(&fmb->fmb_list, &pool->fmp_idle_fmbs); } } diff --git a/lnet/klnds/socklnd/socklnd.h b/lnet/klnds/socklnd/socklnd.h index 0f0b9bd..db8c842 100644 --- a/lnet/klnds/socklnd/socklnd.h +++ b/lnet/klnds/socklnd/socklnd.h @@ -44,6 +44,7 @@ #include #include +#include #include #include @@ -88,7 +89,7 @@ #define 
SOCKNAL_SMALL_FWD_PAGES 1 /* # pages in a small message fwd buffer */ -#define SOCKNAL_LARGE_FWD_PAGES (PAGE_ALIGN (sizeof (ptl_hdr_t) + PTL_MTU) >> PAGE_SHIFT) +#define SOCKNAL_LARGE_FWD_PAGES (PAGE_ALIGN(PTL_MTU) >> PAGE_SHIFT) /* # pages in a large message fwd buffer */ #define SOCKNAL_RESCHED 100 /* # scheduler loops before reschedule */ @@ -115,6 +116,7 @@ typedef struct /* pool of forwarding buffers */ struct list_head fmp_idle_fmbs; /* free buffers */ struct list_head fmp_blocked_conns; /* connections waiting for a buffer */ int fmp_nactive_fmbs; /* # buffers in use */ + int fmp_buff_pages; /* # pages per buffer */ } ksock_fmb_pool_t; @@ -193,18 +195,13 @@ typedef struct { #define SOCKNAL_INIT_ALL 3 /* A packet just assembled for transmission is represented by 1 or more - * struct iovec fragments and 0 or more ptl_kiov_t fragments. Forwarded - * messages, or messages from an MD with PTL_MD_KIOV _not_ set have 0 - * ptl_kiov_t fragments. Messages from an MD with PTL_MD_KIOV set, have 1 - * struct iovec fragment (the header) and up to PTL_MD_MAX_IOV ptl_kiov_t - * fragments. + * struct iovec fragments (the first frag contains the portals header), + * followed by 0 or more ptl_kiov_t fragments. * * On the receive side, initially 1 struct iovec fragment is posted for - * receive (the header). Once the header has been received, if the message - * requires forwarding or will be received into mapped memory, up to - * PTL_MD_MAX_IOV struct iovec fragments describe the target memory. - * Otherwise up to PTL_MD_MAX_IOV ptl_kiov_t fragments are used. - */ + * receive (the header). Once the header has been received, the payload is + * received into either struct iovec or ptl_kiov_t fragments, depending on + * what the header matched or whether the message needs forwarding. 
*/ struct ksock_conn; /* forward ref */ struct ksock_peer; /* forward ref */ @@ -227,6 +224,12 @@ typedef struct /* transmit packet */ #endif } ksock_tx_t; +typedef struct /* forwarded packet */ +{ + ksock_tx_t ftx_tx; /* send info */ + struct iovec ftx_iov; /* hdr iovec */ +} ksock_ftx_t; + #define KSOCK_ZCCD_2_TX(ptr) list_entry (ptr, ksock_tx_t, tx_zccd) /* network zero copy callback descriptor embedded in ksock_tx_t */ @@ -254,15 +257,14 @@ typedef struct /* Kernel portals Socket Forward { /* (socknal->router) */ struct list_head fmb_list; /* queue idle */ kpr_fwd_desc_t fmb_fwd; /* router's descriptor */ - int fmb_npages; /* # pages allocated */ ksock_fmb_pool_t *fmb_pool; /* owning pool */ struct ksock_peer *fmb_peer; /* peer received from */ - struct page *fmb_pages[SOCKNAL_LARGE_FWD_PAGES]; - struct iovec fmb_iov[SOCKNAL_LARGE_FWD_PAGES]; + ptl_hdr_t fmb_hdr; /* message header */ + ptl_kiov_t fmb_kiov[0]; /* payload frags */ } ksock_fmb_t; /* space for the rx frag descriptors; we either read a single contiguous - * header, or PTL_MD_MAX_IOV frags of payload of either type. */ + * header, or up to PTL_MD_MAX_IOV frags of payload of either type. 
*/ typedef union { struct iovec iov[PTL_MD_MAX_IOV]; ptl_kiov_t kiov[PTL_MD_MAX_IOV]; diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index c6cdaba..c89e20e 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -123,7 +123,7 @@ ksocknal_free_ltx (ksock_ltx_t *ltx) PORTAL_FREE(ltx, ltx->ltx_desc_size); } -#if SOCKNAL_ZC +#if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC) struct page * ksocknal_kvaddr_to_page (unsigned long vaddr) { @@ -159,7 +159,7 @@ ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) int more = (tx->tx_niov > 1) || (tx->tx_nkiov > 0) || (!list_empty (&conn->ksnc_tx_queue)); -#if SOCKNAL_ZC +#if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC) int offset = vaddr & (PAGE_SIZE - 1); int zcsize = MIN (fragsize, PAGE_SIZE - offset); struct page *page; @@ -171,7 +171,7 @@ ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) LASSERT (fragsize <= tx->tx_resid); LASSERT (tx->tx_niov > 0); -#if SOCKNAL_ZC +#if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC) if (zcsize >= ksocknal_data.ksnd_zc_min_frag && (sock->sk->route_caps & NETIF_F_SG) && (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) && @@ -1133,7 +1133,7 @@ void ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) { ptl_nid_t nid = fwd->kprfd_gateway_nid; - ksock_tx_t *tx = (ksock_tx_t *)&fwd->kprfd_scratch; + ksock_ftx_t *ftx = (ksock_ftx_t *)&fwd->kprfd_scratch; int rc; CDEBUG (D_NET, "Forwarding [%p] -> "LPX64" ("LPX64"))\n", fwd, @@ -1143,14 +1143,18 @@ ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) if (nid == ksocknal_lib.ni.nid) nid = fwd->kprfd_target_nid; - tx->tx_isfwd = 1; /* This is a forwarding packet */ - tx->tx_nob = fwd->kprfd_nob; - tx->tx_niov = fwd->kprfd_niov; - tx->tx_iov = fwd->kprfd_iov; - tx->tx_nkiov = 0; - tx->tx_kiov = NULL; + /* setup iov for hdr */ + ftx->ftx_iov.iov_base = fwd->kprfd_hdr; + ftx->ftx_iov.iov_len = sizeof(ptl_hdr_t); + + ftx->ftx_tx.tx_isfwd = 1; /* This is a forwarding packet */ + 
ftx->ftx_tx.tx_nob = sizeof(ptl_hdr_t) + fwd->kprfd_nob; + ftx->ftx_tx.tx_niov = 1; + ftx->ftx_tx.tx_iov = &ftx->ftx_iov; + ftx->ftx_tx.tx_nkiov = fwd->kprfd_niov; + ftx->ftx_tx.tx_kiov = fwd->kprfd_kiov; - rc = ksocknal_launch_packet (tx, nid); + rc = ksocknal_launch_packet (&ftx->ftx_tx, nid); if (rc != 0) kpr_fwd_done (&ksocknal_data.ksnd_router, fwd, rc); } @@ -1178,7 +1182,7 @@ ksocknal_fmb_callback (void *arg, int error) { ksock_fmb_t *fmb = (ksock_fmb_t *)arg; ksock_fmb_pool_t *fmp = fmb->fmb_pool; - ptl_hdr_t *hdr = (ptl_hdr_t *) page_address(fmb->fmb_pages[0]); + ptl_hdr_t *hdr = (ptl_hdr_t *)page_address(fmb->fmb_kiov[0].kiov_page); ksock_conn_t *conn = NULL; ksock_sched_t *sched; unsigned long flags; @@ -1236,7 +1240,6 @@ ksock_fmb_t * ksocknal_get_idle_fmb (ksock_conn_t *conn) { int payload_nob = conn->ksnc_rx_nob_left; - int packet_nob = sizeof (ptl_hdr_t) + payload_nob; unsigned long flags; ksock_fmb_pool_t *pool; ksock_fmb_t *fmb; @@ -1244,7 +1247,7 @@ ksocknal_get_idle_fmb (ksock_conn_t *conn) LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB); LASSERT (kpr_routing(&ksocknal_data.ksnd_router)); - if (packet_nob <= SOCKNAL_SMALL_FWD_PAGES * PAGE_SIZE) + if (payload_nob <= SOCKNAL_SMALL_FWD_PAGES * PAGE_SIZE) pool = &ksocknal_data.ksnd_small_fmp; else pool = &ksocknal_data.ksnd_large_fmp; @@ -1275,98 +1278,64 @@ ksocknal_get_idle_fmb (ksock_conn_t *conn) int ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) { - int payload_nob = conn->ksnc_rx_nob_left; - int packet_nob = sizeof (ptl_hdr_t) + payload_nob; + int payload_nob = conn->ksnc_rx_nob_left; ptl_nid_t dest_nid = NTOH__u64 (conn->ksnc_hdr.dest_nid); - int niov; /* at least the header */ - int nob; + int niov = 0; + int nob = payload_nob; LASSERT (conn->ksnc_rx_scheduled); LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB); LASSERT (conn->ksnc_rx_nob_wanted == conn->ksnc_rx_nob_left); LASSERT (payload_nob >= 0); - LASSERT (packet_nob <= fmb->fmb_npages * PAGE_SIZE); + LASSERT (payload_nob 
<= fmb->fmb_pool->fmp_buff_pages * PAGE_SIZE); LASSERT (sizeof (ptl_hdr_t) < PAGE_SIZE); - - /* Got a forwarding buffer; copy the header we just read into the - * forwarding buffer. If there's payload, start reading reading it - * into the buffer, otherwise the forwarding buffer can be kicked - * off immediately. - * - * NB fmb->fmb_iov spans the WHOLE packet. - * conn->ksnc_rx_iov spans just the payload. - */ - fmb->fmb_iov[0].iov_base = page_address (fmb->fmb_pages[0]); - - /* copy header */ - memcpy (fmb->fmb_iov[0].iov_base, &conn->ksnc_hdr, sizeof (ptl_hdr_t)); + LASSERT (fmb->fmb_kiov[0].kiov_offset == 0); /* Take a ref on the conn's peer to prevent module unload before - * forwarding completes. NB we ref peer and not conn since because - * all refs on conn after it has been closed must remove themselves - * in finite time */ + * forwarding completes. */ fmb->fmb_peer = conn->ksnc_peer; atomic_inc (&conn->ksnc_peer->ksnp_refcount); - if (payload_nob == 0) { /* got complete packet already */ - CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (immediate)\n", - conn, NTOH__u64 (conn->ksnc_hdr.src_nid), - dest_nid, packet_nob); + /* Copy the header we just read into the forwarding buffer. If + * there's payload, start reading reading it into the buffer, + * otherwise the forwarding buffer can be kicked off + * immediately. 
*/ + fmb->fmb_hdr = conn->ksnc_hdr; - fmb->fmb_iov[0].iov_len = sizeof (ptl_hdr_t); + while (nob > 0) { + LASSERT (niov < fmb->fmb_pool->fmp_buff_pages); + LASSERT (fmb->fmb_kiov[niov].kiov_offset == 0); + fmb->fmb_kiov[niov].kiov_len = MIN (PAGE_SIZE, nob); + nob -= PAGE_SIZE; + niov++; + } + + kpr_fwd_init(&fmb->fmb_fwd, dest_nid, &fmb->fmb_hdr, + payload_nob, niov, fmb->fmb_kiov, + ksocknal_fmb_callback, fmb); - kpr_fwd_init (&fmb->fmb_fwd, dest_nid, - packet_nob, 1, fmb->fmb_iov, - ksocknal_fmb_callback, fmb); + if (payload_nob == 0) { /* got complete packet already */ + CDEBUG (D_NET, "%p "LPX64"->"LPX64" fwd_start (immediate)\n", + conn, NTOH__u64 (conn->ksnc_hdr.src_nid), dest_nid); - /* forward it now */ kpr_fwd_start (&ksocknal_data.ksnd_router, &fmb->fmb_fwd); ksocknal_new_packet (conn, 0); /* on to next packet */ return (1); } - niov = 1; - if (packet_nob <= PAGE_SIZE) { /* whole packet fits in first page */ - fmb->fmb_iov[0].iov_len = packet_nob; - } else { - fmb->fmb_iov[0].iov_len = PAGE_SIZE; - nob = packet_nob - PAGE_SIZE; - - do { - LASSERT (niov < fmb->fmb_npages); - fmb->fmb_iov[niov].iov_base = - page_address (fmb->fmb_pages[niov]); - fmb->fmb_iov[niov].iov_len = MIN (PAGE_SIZE, nob); - nob -= PAGE_SIZE; - niov++; - } while (nob > 0); - } - - kpr_fwd_init (&fmb->fmb_fwd, dest_nid, - packet_nob, niov, fmb->fmb_iov, - ksocknal_fmb_callback, fmb); - conn->ksnc_cookie = fmb; /* stash fmb for later */ conn->ksnc_rx_state = SOCKNAL_RX_BODY_FWD; /* read in the payload */ - /* payload is desc's iov-ed buffer, but skipping the hdr */ - LASSERT (niov <= sizeof (conn->ksnc_rx_iov_space) / - sizeof (struct iovec)); - - conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space; - conn->ksnc_rx_iov[0].iov_base = - (void *)(((unsigned long)fmb->fmb_iov[0].iov_base) + - sizeof (ptl_hdr_t)); - conn->ksnc_rx_iov[0].iov_len = - fmb->fmb_iov[0].iov_len - sizeof (ptl_hdr_t); - - if (niov > 1) - memcpy(&conn->ksnc_rx_iov[1], &fmb->fmb_iov[1], - (niov - 1) * sizeof 
(struct iovec)); - - conn->ksnc_rx_niov = niov; + /* Set up conn->ksnc_rx_kiov to read the payload into fmb's kiov-ed + * buffer */ + LASSERT (niov <= sizeof(conn->ksnc_rx_iov_space)/sizeof(ptl_kiov_t)); + conn->ksnc_rx_niov = 0; + conn->ksnc_rx_nkiov = niov; + conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov; + memcpy(conn->ksnc_rx_kiov, fmb->fmb_kiov, niov * sizeof(ptl_kiov_t)); + CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d reading body\n", conn, NTOH__u64 (conn->ksnc_hdr.src_nid), dest_nid, payload_nob); return (0); diff --git a/lnet/router/router.c b/lnet/router/router.c index e29f628..d0dbf0a 100644 --- a/lnet/router/router.c +++ b/lnet/router/router.c @@ -456,14 +456,13 @@ kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd) CDEBUG (D_NET, "forward [%p] "LPX64" from NAL %d\n", fwd, target_nid, src_ne->kpne_interface.kprni_nalid); - LASSERT (nob >= sizeof (ptl_hdr_t)); /* at least got a packet header */ - LASSERT (nob == lib_iov_nob (fwd->kprfd_niov, fwd->kprfd_iov)); + LASSERT (nob == lib_kiov_nob (fwd->kprfd_niov, fwd->kprfd_kiov)); atomic_inc (&kpr_queue_depth); atomic_inc (&src_ne->kpne_refcount); /* source nal is busy until fwd completes */ kpr_fwd_packets++; /* (loose) stats accounting */ - kpr_fwd_bytes += nob; + kpr_fwd_bytes += nob + sizeof(ptl_hdr_t); if (src_ne->kpne_shutdown) /* caller is shutting down */ goto out; diff --git a/lnet/utils/Makefile.am b/lnet/utils/Makefile.am index 6c31b3d..925406f 100644 --- a/lnet/utils/Makefile.am +++ b/lnet/utils/Makefile.am @@ -26,11 +26,11 @@ libptlctl_a_SOURCES = portals.c debug.c l_ioctl.c parser.c parser.h gmnalnid_SOURCES = gmnalnid.c ptlctl_SOURCES = ptlctl.c -ptlctl_LDADD = -L. -lptlctl -lncurses # -lefence +ptlctl_LDADD = -L. -lptlctl $(LIBREADLINE) $(LIBEFENCE) ptlctl_DEPENDENCIES = libptlctl.a debugctl_SOURCES = debugctl.c -debugctl_LDADD = -L. -lptlctl -lncurses # -lefence +debugctl_LDADD = -L. 
-lptlctl $(LIBREADLINE) $(LIBEFENCE) debugctl_DEPENDENCIES = libptlctl.a routerstat_SOURCES = routerstat.c diff --git a/lnet/utils/gmlndnid.c b/lnet/utils/gmlndnid.c index 84ac97f..ff6631c 100644 --- a/lnet/utils/gmlndnid.c +++ b/lnet/utils/gmlndnid.c @@ -115,5 +115,5 @@ int main(int argc, char **argv) free(pcfg.pcfg_pbuf1); close(pfd); printf("%u\n", nid); - exit(nid); + exit(0); } diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 274a565..a3d29e6 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -1,3 +1,13 @@ +tbd Cluster File Systems, Inc. + * version 1.2.1 + * bug fixes + - fixes for glimpse AST timeouts / incorrectly 0-sized files (2818) + - don't overwrite extent policy data in reply if lock was blocked (2901) + - drop filter export grants atomically with removal from device (2663) + - del obd_self_export from work_list in class_disconnect_exports (2908) + - don't LBUG if MDS recovery times out during orphan cleanup (2530) + - swab reply message in mdc_close, other PPC fixes (2464) + 2004-03-04 Cluster File Systems, Inc. 
* version 1.2.0 * bug fixes diff --git a/lustre/include/linux/lustre_cfg.h b/lustre/include/linux/lustre_cfg.h index d8c84be..4f230d2 100644 --- a/lustre/include/linux/lustre_cfg.h +++ b/lustre/include/linux/lustre_cfg.h @@ -119,8 +119,6 @@ static inline int lustre_cfg_pack(struct lustre_cfg *data, char **pbuf, LOGL(data->lcfg_inlbuf3, data->lcfg_inllen3, ptr); if (data->lcfg_inlbuf4) LOGL(data->lcfg_inlbuf4, data->lcfg_inllen4, ptr); -// if (lustre_cfg_is_invalid(overlay)) -// return 1; *plen = len; @@ -200,14 +198,11 @@ static inline int lustre_cfg_getdata(char **buf, int len, void *arg, int kernel) lcfg = (struct lustre_cfg *)*buf; if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) { - CERROR("Version mismatch kernel vs application\n"); + CERROR("Version mismatch kernel: %#x application: %#x\n", + LUSTRE_CFG_VERSION, lcfg->lcfg_version); return -EINVAL; } -// if (lustre_cfg_is_invalid(data)) { -// CERROR("ioctl not correctly formatted\n"); -// return -EINVAL; -// } if (lcfg->lcfg_dev_name) { lcfg->lcfg_dev_name = &lcfg->lcfg_bulk[0]; diff --git a/lustre/include/linux/lustre_dlm.h b/lustre/include/linux/lustre_dlm.h index b8515a3..d85d7a1 100644 --- a/lustre/include/linux/lustre_dlm.h +++ b/lustre/include/linux/lustre_dlm.h @@ -29,6 +29,7 @@ typedef enum { ELDLM_LOCK_CHANGED = 300, ELDLM_LOCK_ABORTED = 301, ELDLM_LOCK_REPLACED = 302, + ELDLM_NO_LOCK_DATA = 303, ELDLM_NAMESPACE_EXISTS = 400, ELDLM_BAD_NAMESPACE = 401 @@ -144,7 +145,7 @@ typedef int (*ldlm_res_policy)(struct ldlm_namespace *, struct ldlm_lock **, struct ldlm_valblock_ops { int (*lvbo_init)(struct ldlm_resource *res); int (*lvbo_update)(struct ldlm_resource *res, struct lustre_msg *m, - int buf_idx); + int buf_idx, int increase); }; struct ldlm_namespace { diff --git a/lustre/include/linux/obd.h b/lustre/include/linux/obd.h index 24ee1c2..242498e 100644 --- a/lustre/include/linux/obd.h +++ b/lustre/include/linux/obd.h @@ -55,6 +55,7 @@ struct lov_oinfo { /* per-stripe data structure */ struct list_head 
loi_cli_item; struct list_head loi_write_item; + int loi_kms_valid:1; __u64 loi_kms; /* known minimum size */ __u64 loi_rss; /* recently seen size */ __u64 loi_mtime; /* recently seen mtime */ diff --git a/lustre/kernel_patches/series/vanilla-2.4.20 b/lustre/kernel_patches/series/vanilla-2.4.20 index abb6bbe..d11bec0 100644 --- a/lustre/kernel_patches/series/vanilla-2.4.20 +++ b/lustre/kernel_patches/series/vanilla-2.4.20 @@ -50,5 +50,5 @@ kernel_text_address-2.4.20-vanilla.patch ext3-xattr-ptr-arith-fix.patch gfp_memalloc-2.4.22.patch procfs-ndynamic-2.4.patch -linux-2.4.20-tmpfs-xattr.patch +linux-2.4.20-filemap.patch ext3-truncate-buffer-head.patch diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c index 0e7f0b0..b5b0e33 100644 --- a/lustre/ldlm/ldlm_lock.c +++ b/lustre/ldlm/ldlm_lock.c @@ -752,11 +752,13 @@ struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns, lock->l_completion_ast = completion; lock->l_glimpse_ast = glimpse; - lock->l_lvb_len = lvb_len; - OBD_ALLOC(lock->l_lvb_data, lvb_len); - if (lock->l_lvb_data == NULL) { - OBD_SLAB_FREE(lock, ldlm_lock_slab, sizeof(*lock)); - RETURN(NULL); + if (lvb_len) { + lock->l_lvb_len = lvb_len; + OBD_ALLOC(lock->l_lvb_data, lvb_len); + if (lock->l_lvb_data == NULL) { + OBD_SLAB_FREE(lock, ldlm_lock_slab, sizeof(*lock)); + RETURN(NULL); + } } RETURN(lock); diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index 5765d8c..6602713 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -37,6 +37,7 @@ #include #include +#include #include "ldlm_internal.h" extern kmem_cache_t *ldlm_resource_slab; @@ -547,12 +548,15 @@ int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data) } else if (rc == -EINVAL) { LDLM_DEBUG(lock, "lost the race -- client no longer has this " "lock"); + } else if (rc == -ELDLM_NO_LOCK_DATA) { + LDLM_DEBUG(lock, "lost a race -- client has a lock, but no " + "inode"); } else if (rc) { LDLM_ERROR(lock, "client sent rc %d rq_status %d from " 
"glimpse AST", rc, req->rq_status); } else { - rc = res->lr_namespace->ns_lvbo->lvbo_update(res, - req->rq_repmsg, 0); + rc = res->lr_namespace->ns_lvbo->lvbo_update + (res, req->rq_repmsg, 0, 1); } ptlrpc_req_finished(req); RETURN(rc); @@ -767,7 +771,7 @@ int ldlm_handle_cancel(struct ptlrpc_request *req) if (res && res->lr_namespace->ns_lvbo && res->lr_namespace->ns_lvbo->lvbo_update) { (void)res->lr_namespace->ns_lvbo->lvbo_update - (res, NULL, 0); + (res, NULL, 0, 0); //(res, req->rq_reqmsg, 1); } @@ -840,9 +844,12 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req, lock->l_req_mode = dlm_req->lock_desc.l_granted_mode; LDLM_DEBUG(lock, "completion AST, new lock mode"); } - if (lock->l_resource->lr_type != LDLM_PLAIN) + + if (lock->l_resource->lr_type != LDLM_PLAIN) { memcpy(&lock->l_policy_data, &dlm_req->lock_desc.l_policy_data, sizeof(lock->l_policy_data)); + LDLM_DEBUG(lock, "completion AST, new policy data"); + } ldlm_resource_unlink_lock(lock); if (memcmp(&dlm_req->lock_desc.l_resource.lr_name, @@ -889,6 +896,7 @@ static void ldlm_handle_gl_callback(struct ptlrpc_request *req, struct ldlm_request *dlm_req, struct ldlm_lock *lock) { + int rc = -ENOSYS; ENTRY; l_lock(&ns->ns_lock); @@ -897,10 +905,17 @@ static void ldlm_handle_gl_callback(struct ptlrpc_request *req, if (lock->l_glimpse_ast != NULL) { l_unlock(&ns->ns_lock); l_check_no_ns_lock(ns); - lock->l_glimpse_ast(lock, req); + rc = lock->l_glimpse_ast(lock, req); l_lock(&ns->ns_lock); } + if (req->rq_repmsg != NULL) { + ptlrpc_reply(req); + } else { + req->rq_status = rc; + ptlrpc_error(req); + } + if (lock->l_granted_mode == LCK_PW && !lock->l_readers && !lock->l_writers && time_after(jiffies, lock->l_last_used + 10 * HZ)) { diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index 01e4562..a996da6 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -333,17 +333,6 @@ int ldlm_cli_enqueue(struct obd_export *exp, CDEBUG(D_INFO, "local: %p, remote 
cookie: "LPX64", flags: 0x%x\n", lock, reply->lock_handle.cookie, *flags); - if (type == LDLM_EXTENT) { - CDEBUG(D_INFO, "requested extent: "LPU64" -> "LPU64", got " - "extent "LPU64" -> "LPU64"\n", - body->lock_desc.l_policy_data.l_extent.start, - body->lock_desc.l_policy_data.l_extent.end, - reply->lock_desc.l_policy_data.l_extent.start, - reply->lock_desc.l_policy_data.l_extent.end); - } - if (policy != NULL) - memcpy(&lock->l_policy_data, &reply->lock_desc.l_policy_data, - sizeof(reply->lock_desc.l_policy_data)); /* If enqueue returned a blocked lock but the completion handler has * already run, then it fixed up the resource and we don't need to do it @@ -372,7 +361,14 @@ int ldlm_cli_enqueue(struct obd_export *exp, } LDLM_DEBUG(lock, "client-side enqueue, new resource"); } + if (policy != NULL) + memcpy(&lock->l_policy_data, + &reply->lock_desc.l_policy_data, + sizeof(reply->lock_desc.l_policy_data)); + if (type != LDLM_PLAIN) + LDLM_DEBUG(lock,"client-side enqueue, new policy data"); } + if ((*flags) & LDLM_FL_AST_SENT) { l_lock(&ns->ns_lock); lock->l_flags |= LDLM_FL_CBPENDING; diff --git a/lustre/llite/file.c b/lustre/llite/file.c index 2cbc22e..9e487d5 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -549,64 +549,50 @@ int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data) } #endif -/* This function is a disaster. I hate the LOV. 
*/ static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp) { struct ptlrpc_request *req = reqp; struct inode *inode = ll_inode_from_lock(lock); - struct obd_export *exp; struct ll_inode_info *lli; struct ost_lvb *lvb; - struct { - int stripe_number; - __u64 size; - struct lov_stripe_md *lsm; - } data; - __u32 vallen = sizeof(data); - int rc, size = sizeof(*lvb); + int rc, size = sizeof(*lvb), stripe = 0; ENTRY; if (inode == NULL) - RETURN(0); + GOTO(out, rc = -ELDLM_NO_LOCK_DATA); lli = ll_i2info(inode); if (lli == NULL) - goto iput; + GOTO(iput, rc = -ELDLM_NO_LOCK_DATA); if (lli->lli_smd == NULL) - goto iput; - exp = ll_i2obdexp(inode); + GOTO(iput, rc = -ELDLM_NO_LOCK_DATA); /* First, find out which stripe index this lock corresponds to. */ if (lli->lli_smd->lsm_stripe_count > 1) - data.stripe_number = ll_lock_to_stripe_offset(inode, lock); - else - data.stripe_number = 0; - - data.size = inode->i_size; - data.lsm = lli->lli_smd; - - rc = obd_get_info(exp, strlen("size_to_stripe"), "size_to_stripe", - &vallen, &data); - if (rc != 0) { - CERROR("obd_get_info: rc = %d\n", rc); - LBUG(); - } - - LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> size "LPU64, - inode->i_size, data.stripe_number, data.size); + stripe = ll_lock_to_stripe_offset(inode, lock); rc = lustre_pack_reply(req, 1, &size, NULL); if (rc) { CERROR("lustre_pack_reply: %d\n", rc); - goto iput; + GOTO(iput, rc); } lvb = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*lvb)); - lvb->lvb_size = data.size; - ptlrpc_reply(req); + lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe].loi_kms; + LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64, + inode->i_size, stripe, lvb->lvb_size); + GOTO(iput, 0); iput: iput(inode); - RETURN(0); + + out: + /* These errors are normal races, so we don't want to fill the console + * with messages by calling ptlrpc_error() */ + if (rc == -ELDLM_NO_LOCK_DATA) + lustre_pack_reply(req, 0, NULL, NULL); + + req->rq_status = rc; + return rc; } __u64 
lov_merge_size(struct lov_stripe_md *lsm, int kms); diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index c17ad63..c9cf119 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -462,7 +462,14 @@ int lustre_process_log(struct lustre_mount_data *lmd, char * profile, exp = class_conn2export(&mdc_conn); ctxt = llog_get_context(exp->exp_obd, LLOG_CONFIG_REPL_CTXT); +#if 1 rc = class_config_parse_llog(ctxt, profile, cfg); +#else + /* + * For debugging, it's useful to just dump the log + */ + rc = class_config_dump_llog(ctxt, profile, cfg); +#endif if (rc) { CERROR("class_config_parse_llog failed: rc = %d\n", rc); } diff --git a/lustre/llite/namei.c b/lustre/llite/namei.c index 4c59d71..5784eb8 100644 --- a/lustre/llite/namei.c +++ b/lustre/llite/namei.c @@ -104,6 +104,7 @@ int ll_set_inode(struct inode *inode, void *opaque) ll_read_inode2(inode, opaque); return 0; } + struct inode *ll_iget(struct super_block *sb, ino_t hash, struct lustre_md *md) { diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index 92d862f..b0be68f 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -566,17 +566,16 @@ static int lov_clear_orphans(struct obd_export *export, struct obdo *src_oa, if (ost_uuid && !obd_uuid_equals(ost_uuid, &lov->tgts[i].uuid)) continue; - - memcpy(tmp_oa, src_oa, sizeof(*tmp_oa)); + memcpy(tmp_oa, src_oa, sizeof(*tmp_oa)); + /* XXX: LOV STACKING: use real "obj_mdp" sub-data */ err = obd_create(lov->tgts[i].ltd_exp, tmp_oa, &obj_mdp, oti); - if (err) { + if (err) + /* This export will be disabled until it is recovered, + and then orphan recovery will be completed. 
*/ CERROR("error in orphan recovery on OST idx %d/%d: " "rc = %d\n", i, lov->desc.ld_tgt_count, err); - if (!rc) - rc = err; - } if (ost_uuid) break; @@ -603,7 +602,7 @@ static int lov_create(struct obd_export *exp, struct obdo *src_oa, LASSERT(ea != NULL); - if ((src_oa->o_valid & OBD_MD_FLFLAGS) && + if ((src_oa->o_valid & OBD_MD_FLFLAGS) && src_oa->o_flags == OBD_FL_DELORPHAN) { rc = lov_clear_orphans(exp, src_oa, ea, oti); RETURN(rc); @@ -617,7 +616,7 @@ static int lov_create(struct obd_export *exp, struct obdo *src_oa, if (!lov->desc.ld_active_tgt_count) RETURN(-EIO); - /* Recreate a specific object id at the given OST index */ + /* Recreate a specific object id at the given OST index */ if (src_oa->o_valid & OBD_MD_FLFLAGS && src_oa->o_flags & OBD_FL_RECREATE_OBJS) { struct lov_stripe_md obj_md; @@ -639,7 +638,8 @@ static int lov_create(struct obd_export *exp, struct obdo *src_oa, if (i == lsm->lsm_stripe_count) RETURN(-EINVAL); - rc = obd_create(lov->tgts[ost_idx].ltd_exp, src_oa, &obj_mdp, oti); + rc = obd_create(lov->tgts[ost_idx].ltd_exp, src_oa, + &obj_mdp, oti); RETURN(rc); } @@ -690,14 +690,14 @@ static int lov_create(struct obd_export *exp, struct obdo *src_oa, } if (*ea == NULL || lsm->lsm_oinfo[0].loi_ost_idx >= ost_count) { - if (ost_start_count <= 0) { + if (--ost_start_count <= 0) { ost_start_idx = ll_insecure_random_int(); ost_start_count = LOV_CREATE_RESEED_INTERVAL; - } else { - --ost_start_count; - ost_start_idx += lsm->lsm_stripe_count; - if (lsm->lsm_stripe_count == ost_count) - ++ost_start_idx; + } else if (lsm->lsm_stripe_count >= + lov->desc.ld_active_tgt_count) { + /* If we allocate from all of the stripes, make the + * next file start on the next OST. 
*/ + ++ost_start_idx; } ost_idx = ost_start_idx % ost_count; } else { @@ -721,6 +721,7 @@ static int lov_create(struct obd_export *exp, struct obdo *src_oa, struct lov_stripe_md *obj_mdp = &obj_md; int err; + ++ost_start_idx; if (lov->tgts[ost_idx].active == 0) { CDEBUG(D_HA, "lov idx %d inactive\n", ost_idx); continue; @@ -2055,6 +2056,7 @@ static int lov_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm, /* XXX LOV STACKING: submd should be from the subobj */ submd->lsm_object_id = loi->loi_id; submd->lsm_stripe_count = 0; + submd->lsm_oinfo->loi_kms_valid = loi->loi_kms_valid; submd->lsm_oinfo->loi_rss = loi->loi_rss; submd->lsm_oinfo->loi_kms = loi->loi_kms; loi->loi_mtime = submd->lsm_oinfo->loi_mtime; @@ -2077,14 +2079,16 @@ static int lov_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm, LASSERT(lock != NULL); loi->loi_rss = tmp; - // Extend KMS up to the end of this lock, and no further + /* Extend KMS up to the end of this lock and no further + * A lock on [x,y] means a KMS of up to y + 1 bytes! 
*/ if (tmp > lock->l_policy_data.l_extent.end) tmp = lock->l_policy_data.l_extent.end + 1; - if (tmp > loi->loi_kms) { + if (tmp >= loi->loi_kms) { CDEBUG(D_INODE, "lock acquired, setting rss=" LPU64", kms="LPU64"\n", loi->loi_rss, tmp); loi->loi_kms = tmp; + loi->loi_kms_valid = 1; } else { CDEBUG(D_INODE, "lock acquired, setting rss=" LPU64"; leaving kms="LPU64", end="LPU64 @@ -2619,8 +2623,7 @@ static int lov_set_info(struct obd_export *exp, obd_count keylen, for (i = 0; i < lov->desc.ld_tgt_count; i++) { int er; - if (!lov->tgts[i].active) - continue; + /* initialize all OSCs, even inactive ones */ er = obd_set_info(lov->tgts[i].ltd_exp, keylen, key, sizeof(obd_id), ((obd_id*)val) + i); diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c index c692def..e1c3fed 100644 --- a/lustre/mdc/mdc_request.c +++ b/lustre/mdc/mdc_request.c @@ -507,6 +507,11 @@ int mdc_close(struct obd_export *exp, struct obdo *obdo, CERROR("Unexpected: can't find mdc_open_data, but the " "close succeeded. Please tell CFS.\n"); } + if (!lustre_swab_repbuf(req, 0, sizeof(struct mds_body), + lustre_swab_mds_body)) { + CERROR("Error unpacking mds_body\n"); + rc = -EPROTO; + } } if (req->rq_async_args.pointer_arg[0] != NULL) { CERROR("returned without dropping rpc_lock: rc %d\n", rc); diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c index 0e9d2f0..3520849 100644 --- a/lustre/mds/mds_lov.c +++ b/lustre/mds/mds_lov.c @@ -159,20 +159,8 @@ int mds_lov_set_nextid(struct obd_device *obd) GOTO(out, rc); rc = mds_lov_clearorphans(mds, NULL /* all OSTs */); - if (rc < 0) - GOTO(out, rc); out: - if (rc && mds->mds_lov_objids) { - /* Might as well crash here, until we figure out what to do. - * If we OBD_FREE, we'll just LASSERT the next time through this - * function. 
*/ - LBUG(); - OBD_FREE(mds->mds_lov_objids, - mds->mds_lov_desc.ld_tgt_count * sizeof(obd_id)); - mds->mds_lov_objids = NULL; - } - RETURN(rc); } diff --git a/lustre/mds/mds_open.c b/lustre/mds/mds_open.c index 8d49420..e959402 100644 --- a/lustre/mds/mds_open.c +++ b/lustre/mds/mds_open.c @@ -28,6 +28,7 @@ #endif #define DEBUG_SUBSYSTEM S_MDS +#include #include #include #include diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index 5088abb..98ae3b5 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -630,6 +630,9 @@ void class_disconnect_exports(struct obd_device *obd, int flags) CDEBUG(D_HA, "exp %p export uuid == obd uuid, don't discon\n", exp); + /* Need to delete this now so we don't end up pointing + * to work_list later when this export is cleaned up. */ + list_del_init(&exp->exp_obd_chain); class_export_put(exp); continue; } diff --git a/lustre/obdclass/obd_config.c b/lustre/obdclass/obd_config.c index 9156dc8..41f2258 100644 --- a/lustre/obdclass/obd_config.c +++ b/lustre/obdclass/obd_config.c @@ -545,7 +545,7 @@ static int class_config_llog_handler(struct llog_handle * handle, int cfg_len = rec->lrh_len; char *cfg_buf = (char*) (rec + 1); int rc = 0; - + ENTRY; if (rec->lrh_type == OBD_CFG_REC) { char *buf; struct lustre_cfg *lcfg; @@ -641,7 +641,7 @@ static int class_config_dump_handler(struct llog_handle * handle, int cfg_len = rec->lrh_len; char *cfg_buf = (char*) (rec + 1); int rc = 0; - + ENTRY; if (rec->lrh_type == OBD_CFG_REC) { char *buf; struct lustre_cfg *lcfg; @@ -702,6 +702,9 @@ static int class_config_dump_handler(struct llog_handle * handle, if (pcfg->pcfg_flags) CDEBUG(D_INFO, " flags: %x\n", pcfg->pcfg_flags); + } else { + CERROR("unhandled lrh_type: %#x\n", rec->lrh_type); + rc = -EINVAL; } out: RETURN(rc); diff --git a/lustre/obdecho/echo.c b/lustre/obdecho/echo.c index 093f3ac..9248472 100644 --- a/lustre/obdecho/echo.c +++ b/lustre/obdecho/echo.c @@ -392,7 +392,8 @@ int echo_commitrw(int cmd, 
struct obd_export *export, struct obdo *oa, LASSERT(oti == NULL || oti->oti_handle == (void *)DESC_PRIV); for (i = 0; i < objcount; i++, obj++) { - int verify = obj->ioo_id != ECHO_PERSISTENT_OBJID; + int verify = (rc == 0 && + obj->ioo_id != ECHO_PERSISTENT_OBJID); int j; for (j = 0 ; j < obj->ioo_bufcnt ; j++, r++) { diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index 0e8e458..d2f6369 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -53,6 +53,7 @@ #include #include #include +#include #include "filter_internal.h" @@ -1136,12 +1137,12 @@ static int filter_intent_policy(struct ldlm_namespace *ns, LASSERT(l->l_glimpse_ast != NULL); rc = l->l_glimpse_ast(l, NULL); /* this will update the LVB */ + if (rc != 0 && res->lr_namespace->ns_lvbo && + res->lr_namespace->ns_lvbo->lvbo_update) { + res->lr_namespace->ns_lvbo->lvbo_update(res, NULL, 0, 1); + } down(&res->lr_lvb_sem); -#if 0 - if (res_lvb->lvb_size == reply_lvb->lvb_size) - LDLM_ERROR(l, "we lost the glimpse race!"); -#endif reply_lvb->lvb_size = res_lvb->lvb_size; up(&res->lr_lvb_sem); @@ -1449,23 +1450,29 @@ static void filter_grant_sanity_check(struct obd_device *obd, char *func) spin_unlock(&obd->obd_osfs_lock); /* Do these assertions outside the spinlocks so we don't kill system */ - LASSERTF(tot_granted == fo_tot_granted, "%s "LPU64" != "LPU64"\n", - func, tot_granted, fo_tot_granted); - LASSERTF(tot_pending == fo_tot_pending, "%s "LPU64" != "LPU64"\n", - func, tot_pending, fo_tot_pending); - LASSERTF(tot_dirty == fo_tot_dirty, "%s "LPU64" != "LPU64"\n", - func, tot_dirty, fo_tot_dirty); - LASSERTF(tot_pending <= tot_granted, "%s "LPU64" > "LPU64"\n", - func, tot_pending, tot_granted); - LASSERTF(tot_granted <= maxsize, "%s "LPU64" > "LPU64"\n", - func, tot_granted, maxsize); - LASSERTF(tot_dirty <= maxsize, "%s "LPU64" > "LPU64"\n", - func, tot_dirty, maxsize); + if (tot_granted != fo_tot_granted) + CERROR("%s: tot_granted "LPU64" != fo_tot_granted "LPU64"\n", + 
func, tot_granted, fo_tot_granted); + if (tot_pending != fo_tot_pending) + CERROR("%s: tot_pending "LPU64" != fo_tot_pending "LPU64"\n", + func, tot_pending, fo_tot_pending); + if (tot_dirty != fo_tot_dirty) + CERROR("%s: tot_dirty "LPU64" != fo_tot_dirty "LPU64"\n", + func, tot_dirty, fo_tot_dirty); + if (tot_pending > tot_granted) + CERROR("%s: tot_pending "LPU64" > tot_granted "LPU64"\n", + func, tot_pending, tot_granted); + if (tot_granted > maxsize) + CERROR("%s: tot_granted "LPU64" > maxsize "LPU64"\n", + func, tot_granted, maxsize); + if (tot_dirty > maxsize) + CERROR("%s: tot_dirty "LPU64" > maxsize "LPU64"\n", + func, tot_dirty, maxsize); } -/* Remove this client from the grant accounting totals. This is done at - * disconnect time and also at export destroy time in case there was a race - * between removing the export and an incoming BRW updating the client grant. +/* Remove this client from the grant accounting totals. We also remove + * the export from the obd device under the osfs and dev locks to ensure + * that the filter_grant_sanity_check() calculations are always valid. * The client should do something similar when it invalidates its import. 
*/ static void filter_grant_discard(struct obd_export *exp) { @@ -1474,6 +1481,10 @@ static void filter_grant_discard(struct obd_export *exp) struct filter_export_data *fed = &exp->exp_filter_data; spin_lock(&obd->obd_osfs_lock); + spin_lock(&exp->exp_obd->obd_dev_lock); + list_del_init(&exp->exp_obd_chain); + spin_unlock(&exp->exp_obd->obd_dev_lock); + CDEBUG(D_CACHE, "%s: cli %s/%p dirty %lu pend %lu grant %lu\n", obd->obd_name, exp->exp_client_uuid.uuid, exp, fed->fed_dirty, fed->fed_pending, fed->fed_grant); @@ -1512,7 +1523,9 @@ static int filter_destroy_export(struct obd_export *exp) if (exp->exp_obd->obd_replayable) filter_client_free(exp, exp->exp_flags); - filter_grant_sanity_check(exp->exp_obd, __FUNCTION__); + filter_grant_discard(exp); + if (!(exp->exp_flags & OBD_OPT_FORCE)) + filter_grant_sanity_check(exp->exp_obd, __FUNCTION__); RETURN(0); } @@ -1533,15 +1546,13 @@ static int filter_disconnect(struct obd_export *exp, int flags) exp->exp_flags = flags; spin_unlock_irqrestore(&exp->exp_lock, irqflags); + if (!(flags & OBD_OPT_FORCE)) + filter_grant_sanity_check(obd, __FUNCTION__); filter_grant_discard(exp); /* Disconnect early so that clients can't keep using export */ rc = class_disconnect(exp, flags); - /* Do this twice in case a BRW arrived between the first call and - * the class_export_unlink() call (bug 2663) */ - filter_grant_discard(exp); - ldlm_cancel_locks_for_export(exp); fsfilt_sync(obd, obd->u.filter.fo_sb); @@ -1665,7 +1676,7 @@ static int filter_setattr(struct obd_export *exp, struct obdo *oa, if (res->lr_namespace->ns_lvbo && res->lr_namespace->ns_lvbo->lvbo_update) { rc = res->lr_namespace->ns_lvbo->lvbo_update - (res, NULL, 0); + (res, NULL, 0, 0); } ldlm_resource_putref(res); } diff --git a/lustre/obdfilter/filter_io_24.c b/lustre/obdfilter/filter_io_24.c index 6ab28db..15ac8e9 100644 --- a/lustre/obdfilter/filter_io_24.c +++ b/lustre/obdfilter/filter_io_24.c @@ -251,7 +251,7 @@ int filter_commitrw_write(struct obd_export *exp, 
struct obdo *oa, int objcount, GOTO(cleanup, rc); cleanup_phase = 1; -#if (LINUX_VERSION_CODE == KERNEL_VERSION(2,4,18)) +#ifdef HAVE_KIOBUF_DOVARY iobuf->dovary = 0; /* this prevents corruption, not present in 2.4.20 */ #endif rc = expand_kiobuf(iobuf, obj->ioo_bufcnt); diff --git a/lustre/obdfilter/filter_lvb.c b/lustre/obdfilter/filter_lvb.c index 852aeaf..acfba4c 100644 --- a/lustre/obdfilter/filter_lvb.c +++ b/lustre/obdfilter/filter_lvb.c @@ -102,9 +102,11 @@ static int filter_lvbo_init(struct ldlm_resource *res) * * m != NULL : called by the DLM itself after a glimpse callback * m == NULL : called by the filter after a disk write + * + * If 'increase' is true, don't allow values to move backwards. */ static int filter_lvbo_update(struct ldlm_resource *res, struct lustre_msg *m, - int buf_idx) + int buf_idx, int increase) { int rc = 0; struct ost_lvb *lvb = res->lr_lvb_data; @@ -137,13 +139,13 @@ static int filter_lvbo_update(struct ldlm_resource *res, struct lustre_msg *m, //GOTO(out, rc = -EPROTO); GOTO(out, rc = 0); } - if (new->lvb_size > lvb->lvb_size) { + if (new->lvb_size > lvb->lvb_size || !increase) { CDEBUG(D_DLMTRACE, "res: "LPU64" updating lvb size: " LPU64" -> "LPU64"\n", res->lr_name.name[0], lvb->lvb_size, new->lvb_size); lvb->lvb_size = new->lvb_size; } - if (new->lvb_mtime > lvb->lvb_mtime) { + if (new->lvb_mtime > lvb->lvb_mtime || !increase) { CDEBUG(D_DLMTRACE, "res: "LPU64" updating lvb mtime: " LPU64" -> "LPU64"\n", res->lr_name.name[0], lvb->lvb_mtime, new->lvb_mtime); @@ -170,10 +172,18 @@ static int filter_lvbo_update(struct ldlm_resource *res, struct lustre_msg *m, oa->o_valid = OBD_MD_FLID; obdo_from_inode(oa, dentry->d_inode, FILTER_VALID_FLAGS); - lvb->lvb_size = dentry->d_inode->i_size; - lvb->lvb_mtime = LTIME_S(dentry->d_inode->i_mtime); - CDEBUG(D_DLMTRACE, "res: "LPU64" disk lvb size: "LPU64", mtime: " - LPU64"\n", res->lr_name.name[0], lvb->lvb_size, lvb->lvb_mtime); + if (dentry->d_inode->i_size > lvb->lvb_size || 
!increase) { + CDEBUG(D_DLMTRACE, "res: "LPU64" updating lvb size from disk: " + LPU64" -> "LPU64"\n", res->lr_name.name[0], + lvb->lvb_size, dentry->d_inode->i_size); + lvb->lvb_size = dentry->d_inode->i_size; + } + if (dentry->d_inode->i_mtime > lvb->lvb_mtime || !increase) { + CDEBUG(D_DLMTRACE, "res: "LPU64" updating lvb mtime from disk: " + LPU64" -> "LPU64"\n", res->lr_name.name[0], + lvb->lvb_mtime,(__u64)LTIME_S(dentry->d_inode->i_mtime)); + lvb->lvb_mtime = LTIME_S(dentry->d_inode->i_mtime); + } f_dput(dentry); out: diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 6858fe1..a6a3992 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -2384,6 +2384,9 @@ static int osc_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm, policy->l_extent.start -= policy->l_extent.start & ~PAGE_MASK; policy->l_extent.end |= ~PAGE_MASK; + if (lsm->lsm_oinfo->loi_kms_valid == 0) + goto no_match; + /* Next, search for already existing extent locks that will cover us */ rc = ldlm_lock_match(obd->obd_namespace, 0, &res_id, type, policy, mode, lockh); @@ -2424,6 +2427,7 @@ static int osc_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm, } } + no_match: rc = ldlm_cli_enqueue(exp, NULL, obd->obd_namespace, res_id, type, policy, mode, flags, bl_cb, cp_cb, gl_cb, data, &lvb, sizeof(lvb), lustre_swab_ost_lvb, lockh); diff --git a/lustre/portals/archdep.m4 b/lustre/portals/archdep.m4 index 3bdaf32..7801957 100644 --- a/lustre/portals/archdep.m4 +++ b/lustre/portals/archdep.m4 @@ -133,8 +133,8 @@ case ${host_cpu} in powerpc ) AC_MSG_RESULT($host_cpu) - KCFLAGS='-O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -D__powerpc__ -fsigned-char -msoft-float -pipe -ffixed-r2 -Wno-uninitialized -mmultiple -mstring' - KCPPFLAGS='-D__KERNEL__' + KCFLAGS='-O2 -g -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -D__powerpc__ -fsigned-char -msoft-float 
-pipe -ffixed-r2 -Wno-uninitialized -mmultiple -mstring' + KCPPFLAGS='-D__KERNEL__ -DMODULE' MOD_LINK=elf32ppclinux ;; @@ -338,6 +338,18 @@ AC_SUBST(MOD_LINK) AC_SUBST(LINUX25) AM_CONDITIONAL(LIBLUSTRE, test x$host_cpu = xlib) +# ---------- Red Hat 2.4.18 has iobuf->dovary -------------- +# But other kernels don't + +AC_MSG_CHECKING([if struct kiobuf has a dovary field]) +AC_TRY_COMPILE([#define __KERNEL__ + #include <linux/iobuf.h>], + [struct kiobuf iobuf; + iobuf.dovary = 1;], + [AC_MSG_RESULT([yes]) + CPPFLAGS="$CPPFLAGS -DHAVE_KIOBUF_DOVARY"], + [AC_MSG_RESULT([no])]) + # ---------- Red Hat 2.4.20 backports some 2.5 bits -------- # This needs to run after we've defined the KCPPFLAGS diff --git a/lustre/portals/include/linux/kp30.h b/lustre/portals/include/linux/kp30.h index d56a120..c080a57 100644 --- a/lustre/portals/include/linux/kp30.h +++ b/lustre/portals/include/linux/kp30.h @@ -368,13 +368,14 @@ typedef struct { struct list_head kprfd_list; /* stash in queues (routing target can use) */ ptl_nid_t kprfd_target_nid; /* final destination NID */ ptl_nid_t kprfd_gateway_nid; /* gateway NID */ - int kprfd_nob; /* # message bytes (including header) */ - int kprfd_niov; /* # message frags (including header) */ - struct iovec *kprfd_iov; /* message fragments */ - void *kprfd_router_arg; // originating NAL's router arg + ptl_hdr_t *kprfd_hdr; /* header in wire byte order */ + int kprfd_nob; /* # payload bytes */ + int kprfd_niov; /* # payload frags */ + ptl_kiov_t *kprfd_kiov; /* payload fragments */ + void *kprfd_router_arg; /* originating NAL's router arg */ kpr_fwd_callback_t kprfd_callback; /* completion callback */ void *kprfd_callback_arg; /* completion callback arg */ - kprfd_scratch_t kprfd_scratch; // scratchpad for routing targets + kprfd_scratch_t kprfd_scratch; /* scratchpad for routing targets */ } kpr_fwd_desc_t; typedef void (*kpr_fwd_t)(void *arg, kpr_fwd_desc_t *fwd); @@ -477,15 +478,16 @@ kpr_lookup (kpr_router_t *router, ptl_nid_t nid, int nob, ptl_nid_t
*gateway_nid } static inline void -kpr_fwd_init (kpr_fwd_desc_t *fwd, ptl_nid_t nid, - int nob, int niov, struct iovec *iov, +kpr_fwd_init (kpr_fwd_desc_t *fwd, ptl_nid_t nid, ptl_hdr_t *hdr, + int nob, int niov, ptl_kiov_t *kiov, kpr_fwd_callback_t callback, void *callback_arg) { fwd->kprfd_target_nid = nid; fwd->kprfd_gateway_nid = nid; + fwd->kprfd_hdr = hdr; fwd->kprfd_nob = nob; fwd->kprfd_niov = niov; - fwd->kprfd_iov = iov; + fwd->kprfd_kiov = kiov; fwd->kprfd_callback = callback; fwd->kprfd_callback_arg = callback_arg; } diff --git a/lustre/portals/include/portals/types.h b/lustre/portals/include/portals/types.h index 7ffe797..d4ca453 100644 --- a/lustre/portals/include/portals/types.h +++ b/lustre/portals/include/portals/types.h @@ -3,7 +3,13 @@ #ifdef __linux__ # include -# include +# if defined(__powerpc__) && !defined(__KERNEL__) +# define __KERNEL__ +# include +# undef __KERNEL__ +# else +# include +# endif #else # include typedef u_int32_t __u32; @@ -14,7 +20,7 @@ typedef u_int64_t __u64; # include #else # include -# define do_gettimeofday(tv) gettimeofday(tv, NULL) +# define do_gettimeofday(tv) gettimeofday(tv, NULL); #endif #include @@ -129,7 +135,7 @@ typedef struct { struct timeval arrival_time; volatile ptl_seq_t sequence; -} ptl_event_t; +} __attribute__((packed)) ptl_event_t; #ifdef __CYGWIN__ #pragma pop #endif diff --git a/lustre/portals/knals/gmnal/gmnal.h b/lustre/portals/knals/gmnal/gmnal.h index cdde5b7..ad46b90 100644 --- a/lustre/portals/knals/gmnal/gmnal.h +++ b/lustre/portals/knals/gmnal/gmnal.h @@ -45,6 +45,7 @@ #include "linux/init.h" #include "linux/sem.h" #include "linux/vmalloc.h" +#include "linux/sysctl.h" #define DEBUG_SUBSYSTEM S_GMNAL @@ -80,9 +81,14 @@ extern int gmnal_small_msg_size; extern int num_rx_threads; extern int num_stxds; +extern int gm_port; #define GMNAL_SMALL_MSG_SIZE(a) a->small_msg_size #define GMNAL_IS_SMALL_MESSAGE(n,a,b,c) gmnal_is_small_msg(n, a, b, c) #define GMNAL_MAGIC 0x1234abcd +/* + * The gm_port to 
use for gmnal + */ +#define GMNAL_GM_PORT gm_port /* @@ -218,6 +224,7 @@ typedef struct _gmnal_data_t { gmnal_rxtwe_t *rxtwe_tail; spinlock_t rxtwe_lock; struct semaphore rxtwe_wait; + struct ctl_table_header *sysctl; } gmnal_data_t; /* @@ -234,11 +241,6 @@ typedef struct _gmnal_data_t { extern gmnal_data_t *global_nal_data; /* - * The gm_port to use for gmnal - */ -#define GMNAL_GM_PORT 4 - -/* * for ioctl get pid */ #define GMNAL_IOC_GET_GNID 1 @@ -353,6 +355,8 @@ int gmnal_cb_read(nal_cb_t *, void *private, void *, user_ptr, size_t); int gmnal_cb_write(nal_cb_t *, void *private, user_ptr, void *, size_t); +int gmnal_cb_callback(nal_cb_t *, void *, lib_eq_t *, ptl_event_t *); + void *gmnal_cb_malloc(nal_cb_t *, size_t); void gmnal_cb_free(nal_cb_t *, void *, size_t); @@ -382,7 +386,7 @@ void gmnal_fini(void); a->cb_recv_pages = gmnal_cb_recv_pages; \ a->cb_read = gmnal_cb_read; \ a->cb_write = gmnal_cb_write; \ - a->cb_callback = NULL; \ + a->cb_callback = gmnal_cb_callback; \ a->cb_malloc = gmnal_cb_malloc; \ a->cb_free = gmnal_cb_free; \ a->cb_map = NULL; \ @@ -418,6 +422,7 @@ void gmnal_stop_rxthread(gmnal_data_t *); void gmnal_stop_ctthread(gmnal_data_t *); void gmnal_small_tx_callback(gm_port_t *, void *, gm_status_t); void gmnal_drop_sends_callback(gm_port_t *, void *, gm_status_t); +void gmnal_resume_sending_callback(gm_port_t *, void *, gm_status_t); char *gmnal_gm_error(gm_status_t); char *gmnal_rxevent(gm_recv_event_t*); int gmnal_is_small_msg(gmnal_data_t*, int, struct iovec*, int); diff --git a/lustre/portals/knals/gmnal/gmnal_api.c b/lustre/portals/knals/gmnal/gmnal_api.c index 1cb1317..1442aa7 100644 --- a/lustre/portals/knals/gmnal/gmnal_api.c +++ b/lustre/portals/knals/gmnal/gmnal_api.c @@ -25,7 +25,36 @@ #include "gmnal.h" + + gmnal_data_t *global_nal_data = NULL; +#define GLOBAL_NID_STR_LEN 16 +char global_nid_str[GLOBAL_NID_STR_LEN] = {0}; + +/* + * Write the global nid /proc/sys/gmnal/globalnid + */ +#define GMNAL_SYSCTL 201 +#define 
GMNAL_SYSCTL_GLOBALNID 1 + +static ctl_table gmnal_sysctl_table[] = { + {GMNAL_SYSCTL_GLOBALNID, "globalnid", + global_nid_str, GLOBAL_NID_STR_LEN, + 0444, NULL, &proc_dostring}, + { 0 } +}; + + +static ctl_table gmnalnal_top_sysctl_table[] = { + {GMNAL_SYSCTL, "gmnal", NULL, 0, 0555, gmnal_sysctl_table}, + { 0 } +}; + + + + + + /* * gmnal_api_forward * This function takes a pack block of arguments from the NAL API @@ -193,8 +222,8 @@ gmnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size, ptl_pid_t portals_pid = 0; - CDEBUG(D_TRACE, "gmnal_init : interface [%d], ptl_size [%d], - ac_size[%d]\n", interface, ptl_size, ac_size); + CDEBUG(D_TRACE, "gmnal_init : interface [%d], ptl_size [%d], " + "ac_size[%d]\n", interface, ptl_size, ac_size); PORTAL_ALLOC(nal_data, sizeof(gmnal_data_t)); @@ -255,8 +284,8 @@ gmnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size, } - CDEBUG(D_NET, "Calling gm_open with interface [%d], port [%d], - name [%s], version [%d]\n", interface, GMNAL_GM_PORT, + CDEBUG(D_NET, "Calling gm_open with interface [%d], port [%d], " + "name [%s], version [%d]\n", interface, GMNAL_GM_PORT, "gmnal", GM_API_VERSION); GMNAL_GM_LOCK(nal_data); @@ -280,15 +309,15 @@ gmnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size, CDEBUG(D_ERROR, "gm_open Failure. No such device\n"); break; case(GM_INCOMPATIBLE_LIB_AND_DRIVER): - CDEBUG(D_ERROR, "gm_open Failure. Incompatile lib - and driver\n"); + CDEBUG(D_ERROR, "gm_open Failure. Incompatile lib " + "and driver\n"); break; case(GM_OUT_OF_MEMORY): CDEBUG(D_ERROR, "gm_open Failure. Out of Memory\n"); break; default: - CDEBUG(D_ERROR, "gm_open Failure. Unknow error - code [%d]\n", gm_status); + CDEBUG(D_ERROR, "gm_open Failure. 
Unknow error " + "code [%d]\n", gm_status); break; } GMNAL_GM_LOCK(nal_data); @@ -403,6 +432,7 @@ gmnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size, } CDEBUG(D_INFO, "Global node id is [%u]\n", global_nid); nal_data->gm_global_nid = global_nid; + snprintf(global_nid_str, GLOBAL_NID_STR_LEN, "%u", global_nid); /* pid = gm_getpid(); @@ -429,6 +459,9 @@ gmnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size, return(NULL); } + nal_data->sysctl = NULL; + nal_data->sysctl = register_sysctl_table (gmnalnal_top_sysctl_table, 0); + CDEBUG(D_INFO, "gmnal_init finished\n"); global_nal_data = nal->nal_data; @@ -459,6 +492,8 @@ void gmnal_fini() gm_close(nal_data->gm_port); gm_finalize(); GMNAL_GM_UNLOCK(nal_data); + if (nal_data->sysctl) + unregister_sysctl_table (nal_data->sysctl); PORTAL_FREE(nal, sizeof(nal_t)); PORTAL_FREE(nal_data, sizeof(gmnal_data_t)); PORTAL_FREE(nal_cb, sizeof(nal_cb_t)); diff --git a/lustre/portals/knals/gmnal/gmnal_cb.c b/lustre/portals/knals/gmnal/gmnal_cb.c index e055242..1f287468 100644 --- a/lustre/portals/knals/gmnal/gmnal_cb.c +++ b/lustre/portals/knals/gmnal/gmnal_cb.c @@ -35,8 +35,8 @@ int gmnal_cb_recv(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, int status = PTL_OK; - CDEBUG(D_TRACE, "gmnal_cb_recv nal_cb [%p], private[%p], cookie[%p], - niov[%d], iov [%p], mlen["LPSZ"], rlen["LPSZ"]\n", + CDEBUG(D_TRACE, "gmnal_cb_recv nal_cb [%p], private[%p], cookie[%p], " + "niov[%d], iov [%p], mlen["LPSZ"], rlen["LPSZ"]\n", nal_cb, private, cookie, niov, iov, mlen, rlen); switch(srxd->type) { @@ -64,10 +64,11 @@ int gmnal_cb_recv_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, int status = PTL_OK; struct iovec *iovec = NULL, *iovec_dup = NULL; int i = 0; + ptl_kiov_t *kiov_dup = kiov;; - CDEBUG(D_TRACE, "gmnal_cb_recv_pages nal_cb [%p],private[%p], - cookie[%p], kniov[%d], kiov [%p], mlen["LPSZ"], rlen["LPSZ"]\n", + CDEBUG(D_TRACE, "gmnal_cb_recv_pages nal_cb [%p],private[%p], " +
"cookie[%p], kniov[%d], kiov [%p], mlen["LPSZ"], rlen["LPSZ"]\n", nal_cb, private, cookie, kniov, kiov, mlen, rlen); if (srxd->type == GMNAL_SMALL_MESSAGE) { @@ -99,6 +100,10 @@ int gmnal_cb_recv_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, CDEBUG(D_INFO, "calling gmnal_small_rx\n"); status = gmnal_small_rx(nal_cb, private, cookie, kniov, iovec_dup, mlen, rlen); + for (i=0; i<kniov; i++) { + kunmap(kiov_dup->kiov_page); + kiov_dup++; + } PORTAL_FREE(iovec_dup, sizeof(struct iovec)*kniov); } @@ -126,6 +131,7 @@ int gmnal_cb_send(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, niov, iov, len); } else { CDEBUG(D_ERROR, "Large message send it is not supported\n"); + lib_finalize(nal_cb, private, cookie, PTL_FAIL); return(PTL_FAIL); gmnal_large_tx(nal_cb, private, cookie, hdr, type, nid, pid, niov, iov, len); @@ -140,6 +146,7 @@ int gmnal_cb_send_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, int i = 0; gmnal_data_t *nal_data; struct iovec *iovec = NULL, *iovec_dup = NULL; + ptl_kiov_t *kiov_dup = kiov; CDEBUG(D_TRACE, "gmnal_cb_send_pages nid ["LPU64"] niov[%d] len["LPSZ"]\n", nid, kniov, len); nal_data = nal_cb->nal_data; @@ -181,6 +188,10 @@ int gmnal_cb_send_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, gmnal_large_tx(nal_cb, private, cookie, hdr, type, nid, pid, kniov, iovec, len); } + for (i=0; i<kniov; i++) { + kunmap(kiov_dup->kiov_page); + kiov_dup++; + } PORTAL_FREE(iovec_dup, kniov*sizeof(struct iovec)); return(PTL_OK); } @@ -199,6 +210,18 @@ int gmnal_cb_write(nal_cb_t *nal_cb, void *private, user_ptr dst, return(PTL_OK); } +int gmnal_cb_callback(nal_cb_t *nal_cb, void *private, lib_eq_t *eq, + ptl_event_t *ev) +{ + + if (eq->event_callback != NULL) { + CDEBUG(D_INFO, "found callback\n"); + eq->event_callback(ev); + } + + return(PTL_OK); +} + void *gmnal_cb_malloc(nal_cb_t *nal_cb, size_t len) { void *ptr = NULL; diff --git a/lustre/portals/knals/gmnal/gmnal_comm.c b/lustre/portals/knals/gmnal/gmnal_comm.c index a0d3530..1bcd9bd 100644 --- a/lustre/portals/knals/gmnal/gmnal_comm.c +++
b/lustre/portals/knals/gmnal/gmnal_comm.c @@ -203,14 +203,14 @@ gmnal_pre_receive(gmnal_data_t *nal_data, gmnal_rxtwe_t *we, int gmnal_type) gmnal_msghdr = (gmnal_msghdr_t*)buffer; portals_hdr = (ptl_hdr_t*)(buffer+GMNAL_MSGHDR_SIZE); - CDEBUG(D_INFO, "rx_event:: Sender node [%d], Sender Port [%d], - type [%d], length [%d], buffer [%p]\n", + CDEBUG(D_INFO, "rx_event:: Sender node [%d], Sender Port [%d], " + "type [%d], length [%d], buffer [%p]\n", snode, sport, type, length, buffer); - CDEBUG(D_INFO, "gmnal_msghdr:: Sender node [%u], magic [%d], - gmnal_type [%d]\n", gmnal_msghdr->sender_node_id, + CDEBUG(D_INFO, "gmnal_msghdr:: Sender node [%u], magic [%d], " + "gmnal_type [%d]\n", gmnal_msghdr->sender_node_id, gmnal_msghdr->magic, gmnal_msghdr->type); - CDEBUG(D_INFO, "portals_hdr:: Sender node ["LPD64"], - dest_node ["LPD64"]\n", portals_hdr->src_nid, + CDEBUG(D_INFO, "portals_hdr:: Sender node ["LPD64"], " + "dest_node ["LPD64"]\n", portals_hdr->src_nid, portals_hdr->dest_nid); @@ -321,6 +321,7 @@ gmnal_small_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, if (!private) { CDEBUG(D_ERROR, "gmnal_small_rx no context\n"); + lib_finalize(nal_cb, private, cookie, PTL_FAIL); return(PTL_FAIL); } @@ -343,7 +344,6 @@ gmnal_small_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, */ CDEBUG(D_PORTALS, "calling lib_finalize\n"); lib_finalize(nal_cb, private, cookie, PTL_OK); - /* * return buffer so it can be used again */ @@ -377,9 +377,9 @@ gmnal_small_tx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, unsigned int local_nid; gm_status_t gm_status = GM_SUCCESS; - CDEBUG(D_TRACE, "gmnal_small_tx nal_cb [%p] private [%p] cookie [%p] - hdr [%p] type [%d] global_nid ["LPU64"] pid [%d] niov [%d] - iov [%p] size [%d]\n", nal_cb, private, cookie, hdr, type, + CDEBUG(D_TRACE, "gmnal_small_tx nal_cb [%p] private [%p] cookie [%p] " + "hdr [%p] type [%d] global_nid ["LPU64"] pid [%d] niov [%d] " + "iov [%p] size [%d]\n", nal_cb, private, cookie, hdr, type, global_nid, 
pid, niov, iov, size); CDEBUG(D_INFO, "portals_hdr:: dest_nid ["LPU64"], src_nid ["LPU64"]\n", @@ -440,9 +440,9 @@ gmnal_small_tx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, stxd->msg_size = tot_size; - CDEBUG(D_NET, "Calling gm_send_to_peer port [%p] buffer [%p] - gmsize [%lu] msize [%d] global_nid ["LPU64"] local_nid[%d] - stxd [%p]\n", nal_data->gm_port, stxd->buffer, stxd->gm_size, + CDEBUG(D_NET, "Calling gm_send_to_peer port [%p] buffer [%p] " + "gmsize [%lu] msize [%d] global_nid ["LPU64"] local_nid[%d] " + "stxd [%p]\n", nal_data->gm_port, stxd->buffer, stxd->gm_size, stxd->msg_size, global_nid, local_nid, stxd); GMNAL_GM_LOCK(nal_data); @@ -493,8 +493,8 @@ gmnal_small_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status) /* * do a resend on the dropped ones */ - CDEBUG(D_ERROR, "send stxd [%p] was dropped - resending\n", context); + CDEBUG(D_ERROR, "send stxd [%p] was dropped " + "resending\n", context); GMNAL_GM_LOCK(nal_data); gm_send_to_peer_with_callback(nal_data->gm_port, stxd->buffer, @@ -569,6 +569,11 @@ gmnal_small_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status) case(GM_YP_NO_MATCH): default: CDEBUG(D_ERROR, "Unknown send error\n"); + gm_resume_sending(nal_data->gm_port, stxd->gm_priority, + stxd->gm_target_node, GMNAL_GM_PORT, + gmnal_resume_sending_callback, context); + return; + } /* @@ -588,10 +593,22 @@ gmnal_small_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status) } gmnal_return_stxd(nal_data, stxd); lib_finalize(nal_cb, stxd, cookie, PTL_OK); - return; } +/* + * After an error on the port + * call this to allow future sends to complete + */ +void gmnal_resume_sending_callback(struct gm_port *gm_port, void *context, + gm_status_t status) +{ + gmnal_data_t *nal_data; + gmnal_stxd_t *stxd = (gmnal_stxd_t*)context; + CDEBUG(D_TRACE, "status is [%d] context is [%p]\n", status, context); + gmnal_return_stxd(stxd->nal_data, stxd); + return; +} void gmnal_drop_sends_callback(struct gm_port 
*gm_port, void *context, @@ -611,8 +628,8 @@ void gmnal_drop_sends_callback(struct gm_port *gm_port, void *context, context); GMNAL_GM_LOCK(nal_data); } else { - CDEBUG(D_ERROR, "send_to_peer status for stxd [%p] is - [%d][%s]\n", stxd, status, gmnal_gm_error(status)); + CDEBUG(D_ERROR, "send_to_peer status for stxd [%p] is " + "[%d][%s]\n", stxd, status, gmnal_gm_error(status)); } @@ -644,9 +661,9 @@ gmnal_large_tx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, int niov_dup; - CDEBUG(D_TRACE, "gmnal_large_tx nal_cb [%p] private [%p], cookie [%p] - hdr [%p], type [%d] global_nid ["LPU64"], pid [%d], niov [%d], - iov [%p], size [%d]\n", nal_cb, private, cookie, hdr, type, + CDEBUG(D_TRACE, "gmnal_large_tx nal_cb [%p] private [%p], cookie [%p] " + "hdr [%p], type [%d] global_nid ["LPU64"], pid [%d], niov [%d], " + "iov [%p], size [%d]\n", nal_cb, private, cookie, hdr, type, global_nid, pid, niov, iov, size); if (nal_cb) @@ -729,8 +746,8 @@ gmnal_large_tx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, iov->iov_base, iov->iov_len); if (gm_status != GM_SUCCESS) { GMNAL_GM_UNLOCK(nal_data); - CDEBUG(D_ERROR, "gm_register_memory returns [%d][%s] - for memory [%p] len ["LPSZ"]\n", + CDEBUG(D_ERROR, "gm_register_memory returns [%d][%s] " + "for memory [%p] len ["LPSZ"]\n", gm_status, gmnal_gm_error(gm_status), iov->iov_base, iov->iov_len); GMNAL_GM_LOCK(nal_data); @@ -806,12 +823,13 @@ gmnal_large_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, gmnal_msghdr_t *msghdr = NULL; gm_status_t gm_status; - CDEBUG(D_TRACE, "gmnal_large_rx :: nal_cb[%p], private[%p], - cookie[%p], niov[%d], iov[%p], mlen["LPSZ"], rlen["LPSZ"]\n", + CDEBUG(D_TRACE, "gmnal_large_rx :: nal_cb[%p], private[%p], " + "cookie[%p], niov[%d], iov[%p], mlen["LPSZ"], rlen["LPSZ"]\n", nal_cb, private, cookie, nriov, riov, mlen, rlen); if (!srxd) { CDEBUG(D_ERROR, "gmnal_large_rx no context\n"); + lib_finalize(nal_cb, private, cookie, PTL_FAIL); return(PTL_FAIL); } @@ -846,8 +864,8 @@ 
gmnal_large_rx(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, riov->iov_base, riov->iov_len); if (gm_status != GM_SUCCESS) { GMNAL_GM_UNLOCK(nal_data); - CDEBUG(D_ERROR, "gm_register_memory returns [%d][%s] - for memory [%p] len ["LPSZ"]\n", + CDEBUG(D_ERROR, "gm_register_memory returns [%d][%s] " + "for memory [%p] len ["LPSZ"]\n", gm_status, gmnal_gm_error(gm_status), riov->iov_base, riov->iov_len); GMNAL_GM_LOCK(nal_data); @@ -902,8 +920,8 @@ gmnal_remote_get(gmnal_srxd_t *srxd, int nsiov, struct iovec *siov, int ncalls = 0; - CDEBUG(D_TRACE, "gmnal_remote_get srxd[%p], nriov[%d], riov[%p], - nsiov[%d], siov[%p]\n", srxd, nriov, riov, nsiov, siov); + CDEBUG(D_TRACE, "gmnal_remote_get srxd[%p], nriov[%d], riov[%p], " + "nsiov[%d], siov[%p]\n", srxd, nriov, riov, nsiov, siov); ncalls = gmnal_copyiov(0, srxd, nsiov, siov, nriov, riov); @@ -958,8 +976,8 @@ gmnal_copyiov(int do_copy, gmnal_srxd_t *srxd, int nsiov, srxd->gm_source_node, &source_node) != GM_SUCCESS) { - CDEBUG(D_ERROR, "cannot resolve global_id [%u] - to local node_id\n", srxd->gm_source_node); + CDEBUG(D_ERROR, "cannot resolve global_id [%u] " + "to local node_id\n", srxd->gm_source_node); GMNAL_GM_UNLOCK(nal_data); return(GMNAL_STATUS_FAIL); } @@ -1201,9 +1219,9 @@ gmnal_large_tx_ack(gmnal_data_t *nal_data, gmnal_srxd_t *srxd) stxd->msg_size= sizeof(gmnal_msghdr_t); - CDEBUG(D_NET, "Calling gm_send_to_peer port [%p] buffer [%p] - gmsize [%lu] msize [%d] global_nid [%u] local_nid[%d] - stxd [%p]\n", nal_data->gm_port, stxd->buffer, stxd->gm_size, + CDEBUG(D_NET, "Calling gm_send_to_peer port [%p] buffer [%p] " + "gmsize [%lu] msize [%d] global_nid [%u] local_nid[%d] " + "stxd [%p]\n", nal_data->gm_port, stxd->buffer, stxd->gm_size, stxd->msg_size, srxd->gm_source_node, local_nid, stxd); GMNAL_GM_LOCK(nal_data); stxd->gm_priority = GM_LOW_PRIORITY; diff --git a/lustre/portals/knals/gmnal/gmnal_module.c b/lustre/portals/knals/gmnal/gmnal_module.c index 1260629..31f6819 100644 --- 
a/lustre/portals/knals/gmnal/gmnal_module.c +++ b/lustre/portals/knals/gmnal/gmnal_module.c @@ -30,6 +30,7 @@ int gmnal_small_msg_size = 525312; */ int num_rx_threads = -1; int num_stxds = 5; +int gm_port = 4; ptl_handle_ni_t kgmnal_ni; @@ -139,6 +140,7 @@ EXPORT_SYMBOL(kgmnal_ni); MODULE_PARM(gmnal_small_msg_size, "i"); MODULE_PARM(num_rx_threads, "i"); MODULE_PARM(num_stxds, "i"); +MODULE_PARM(gm_port, "i"); MODULE_AUTHOR("Morgan Doyle"); diff --git a/lustre/portals/knals/gmnal/gmnal_utils.c b/lustre/portals/knals/gmnal/gmnal_utils.c index 55606f3..6a52319 100644 --- a/lustre/portals/knals/gmnal/gmnal_utils.c +++ b/lustre/portals/knals/gmnal/gmnal_utils.c @@ -117,8 +117,8 @@ gmnal_alloc_txd(gmnal_data_t *nal_data) GMNAL_SMALL_MSG_SIZE(nal_data)); GMNAL_GM_UNLOCK(nal_data); if (!txbuffer) { - CDEBUG(D_ERROR, "Failed to gm_dma_malloc txbuffer [%d], - size [%d]\n", i, + CDEBUG(D_ERROR, "Failed to gm_dma_malloc txbuffer [%d]," + " size [%d]\n", i, GMNAL_SMALL_MSG_SIZE(nal_data)); PORTAL_FREE(txd, sizeof(gmnal_stxd_t)); return(GMNAL_STATUS_FAIL); @@ -131,8 +131,8 @@ gmnal_alloc_txd(gmnal_data_t *nal_data) txd->next = nal_data->stxd; nal_data->stxd = txd; - CDEBUG(D_INFO, "Registered txd [%p] with buffer [%p], - size [%d]\n", txd, txd->buffer, txd->buffer_size); + CDEBUG(D_INFO, "Registered txd [%p] with buffer [%p], " + "size [%d]\n", txd, txd->buffer, txd->buffer_size); } for (i=0; i<=nrxt_stx; i++) { @@ -146,8 +146,8 @@ gmnal_alloc_txd(gmnal_data_t *nal_data) GMNAL_SMALL_MSG_SIZE(nal_data)); GMNAL_GM_UNLOCK(nal_data); if (!txbuffer) { - CDEBUG(D_ERROR, "Failed to gm_dma_malloc txbuffer [%d], - size [%d]\n", i, + CDEBUG(D_ERROR, "Failed to gm_dma_malloc txbuffer [%d]," + " size [%d]\n", i, GMNAL_SMALL_MSG_SIZE(nal_data)); PORTAL_FREE(txd, sizeof(gmnal_stxd_t)); return(GMNAL_STATUS_FAIL); @@ -160,8 +160,8 @@ gmnal_alloc_txd(gmnal_data_t *nal_data) txd->next = nal_data->rxt_stxd; nal_data->rxt_stxd = txd; - CDEBUG(D_INFO, "Registered txd [%p] with buffer [%p], - size 
[%d]\n", txd, txd->buffer, txd->buffer_size); + CDEBUG(D_INFO, "Registered txd [%p] with buffer [%p], " + "size [%d]\n", txd, txd->buffer, txd->buffer_size); } /* @@ -187,8 +187,8 @@ gmnal_free_txd(gmnal_data_t *nal_data) CDEBUG(D_TRACE, "gmnal_free_small tx\n"); while(txd) { - CDEBUG(D_INFO, "Freeing txd [%p] with buffer [%p], - size [%d]\n", txd, txd->buffer, txd->buffer_size); + CDEBUG(D_INFO, "Freeing txd [%p] with buffer [%p], " + "size [%d]\n", txd, txd->buffer, txd->buffer_size); _txd = txd; txd = txd->next; GMNAL_GM_LOCK(nal_data); @@ -198,8 +198,8 @@ gmnal_free_txd(gmnal_data_t *nal_data) } txd = nal_data->rxt_stxd; while(txd) { - CDEBUG(D_INFO, "Freeing txd [%p] with buffer [%p], - size [%d]\n", txd, txd->buffer, txd->buffer_size); + CDEBUG(D_INFO, "Freeing txd [%p] with buffer [%p], " + "size [%d]\n", txd, txd->buffer, txd->buffer_size); _txd = txd; txd = txd->next; GMNAL_GM_LOCK(nal_data); @@ -392,22 +392,22 @@ gmnal_alloc_srxd(gmnal_data_t *nal_data) #if 0 PORTAL_ALLOC(rxbuffer, GMNAL_SMALL_MSG_SIZE(nal_data)); if (!rxbuffer) { - CDEBUG(D_ERROR, "Failed to malloc rxbuffer [%d], - size [%d]\n", i, + CDEBUG(D_ERROR, "Failed to malloc rxbuffer [%d], " + "size [%d]\n", i, GMNAL_SMALL_MSG_SIZE(nal_data)); PORTAL_FREE(rxd, sizeof(gmnal_srxd_t)); return(GMNAL_STATUS_FAIL); } - CDEBUG(D_NET, "Calling gm_register_memory with port [%p] - rxbuffer [%p], size [%d]\n", nal_data->gm_port, + CDEBUG(D_NET, "Calling gm_register_memory with port [%p] " + "rxbuffer [%p], size [%d]\n", nal_data->gm_port, rxbuffer, GMNAL_SMALL_MSG_SIZE(nal_data)); GMNAL_GM_LOCK(nal_data); gm_status = gm_register_memory(nal_data->gm_port, rxbuffer, GMNAL_SMALL_MSG_SIZE(nal_data)); GMNAL_GM_UNLOCK(nal_data); if (gm_status != GM_SUCCESS) { - CDEBUG(D_ERROR, "gm_register_memory failed buffer [%p], - index [%d]\n", rxbuffer, i); + CDEBUG(D_ERROR, "gm_register_memory failed buffer [%p]," + " index [%d]\n", rxbuffer, i); switch(gm_status) { case(GM_FAILURE): CDEBUG(D_ERROR, "GM_FAILURE\n"); @@ 
-432,8 +432,8 @@ gmnal_alloc_srxd(gmnal_data_t *nal_data) GMNAL_SMALL_MSG_SIZE(nal_data)); GMNAL_GM_UNLOCK(nal_data); if (!rxbuffer) { - CDEBUG(D_ERROR, "Failed to gm_dma_malloc rxbuffer [%d], - size [%d]\n", i, + CDEBUG(D_ERROR, "Failed to gm_dma_malloc rxbuffer [%d]," + " size [%d]\n", i, GMNAL_SMALL_MSG_SIZE(nal_data)); PORTAL_FREE(rxd, sizeof(gmnal_srxd_t)); return(GMNAL_STATUS_FAIL); @@ -447,15 +447,15 @@ gmnal_alloc_srxd(gmnal_data_t *nal_data) if (gm_hash_insert(nal_data->srxd_hash, (void*)rxbuffer, (void*)rxd)) { - CDEBUG(D_ERROR, "failed to create hash entry rxd[%p] - for rxbuffer[%p]\n", rxd, rxbuffer); + CDEBUG(D_ERROR, "failed to create hash entry rxd[%p] " + "for rxbuffer[%p]\n", rxd, rxbuffer); return(GMNAL_STATUS_FAIL); } rxd->next = nal_data->srxd; nal_data->srxd = rxd; - CDEBUG(D_INFO, "Registered rxd [%p] with buffer [%p], - size [%d]\n", rxd, rxd->buffer, rxd->size); + CDEBUG(D_INFO, "Registered rxd [%p] with buffer [%p], " + "size [%d]\n", rxd, rxd->buffer, rxd->size); } return(GMNAL_STATUS_OK); @@ -623,6 +623,8 @@ gmnal_stop_ctthread(gmnal_data_t *nal_data) char * gmnal_gm_error(gm_status_t status) { + return(gm_strerror(status)); + switch(status) { case(GM_SUCCESS): return("SUCCESS"); @@ -972,7 +974,7 @@ gmnal_get_rxtwe(gmnal_data_t *nal_data) } spin_lock(&nal_data->rxtwe_lock); if (nal_data->rxtwe_head) { - CDEBUG(D_WARNING, "Got a work entry\n"); + CDEBUG(D_INFO, "Got a work entry\n"); we = nal_data->rxtwe_head; nal_data->rxtwe_head = we->next; if (!nal_data->rxtwe_head) @@ -983,7 +985,7 @@ gmnal_get_rxtwe(gmnal_data_t *nal_data) spin_unlock(&nal_data->rxtwe_lock); } while (!we); - CDEBUG(D_WARNING, "Returning we[%p]\n", we); + CDEBUG(D_INFO, "Returning we[%p]\n", we); return(we); } diff --git a/lustre/portals/knals/qswnal/qswnal.c b/lustre/portals/knals/qswnal/qswnal.c index 90c9a95..3b3b5d4 100644 --- a/lustre/portals/knals/qswnal/qswnal.c +++ b/lustre/portals/knals/qswnal/qswnal.c @@ -348,10 +348,10 @@ kqswnal_finalise (void) for (i = 0; 
i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++) { kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i]; - /* If krx_pages[0] got allocated, it got mapped. + /* If krx_kiov[0].kiov_page got allocated, it got mapped. * NB subsequent pages get merged */ - if (krx->krx_pages[0] != NULL) + if (krx->krx_kiov[0].kiov_page != NULL) ep_dvma_unload(kqswnal_data.kqn_ep, kqswnal_data.kqn_ep_rx_nmh, &krx->krx_elanbuffer); @@ -416,8 +416,8 @@ kqswnal_finalise (void) kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i]; for (j = 0; j < krx->krx_npages; j++) - if (krx->krx_pages[j] != NULL) - __free_page (krx->krx_pages[j]); + if (krx->krx_kiov[j].kiov_page != NULL) + __free_page (krx->krx_kiov[j].kiov_page); } PORTAL_FREE(kqswnal_data.kqn_rxds, @@ -709,18 +709,19 @@ kqswnal_initialise (void) LASSERT (krx->krx_npages > 0); for (j = 0; j < krx->krx_npages; j++) { - krx->krx_pages[j] = alloc_page(GFP_KERNEL); - if (krx->krx_pages[j] == NULL) - { + struct page *page = alloc_page(GFP_KERNEL); + + if (page == NULL) { kqswnal_finalise (); return (-ENOMEM); } - LASSERT(page_address(krx->krx_pages[j]) != NULL); + krx->krx_kiov[j].kiov_page = page; + LASSERT(page_address(page) != NULL); #if MULTIRAIL_EKC ep_dvma_load(kqswnal_data.kqn_ep, NULL, - page_address(krx->krx_pages[j]), + page_address(page), PAGE_SIZE, kqswnal_data.kqn_ep_rx_nmh, elan_page_idx, &all_rails, &elanbuffer); @@ -736,7 +737,7 @@ kqswnal_initialise (void) #else elan3_dvma_kaddr_load(kqswnal_data.kqn_ep->DmaState, kqswnal_data.kqn_eprxdmahandle, - page_address(krx->krx_pages[j]), + page_address(page), PAGE_SIZE, elan_page_idx, &elanbuffer); if (j == 0) diff --git a/lustre/portals/knals/qswnal/qswnal.h b/lustre/portals/knals/qswnal/qswnal.h index b1b9a45..5ebf30a 100644 --- a/lustre/portals/knals/qswnal/qswnal.h +++ b/lustre/portals/knals/qswnal/qswnal.h @@ -153,8 +153,7 @@ typedef struct int krx_rpc_reply_sent; /* rpc reply sent */ atomic_t krx_refcount; /* how to tell when rpc is done */ kpr_fwd_desc_t krx_fwd; /* embedded forwarding 
descriptor */ - struct page *krx_pages[KQSW_NRXMSGPAGES_LARGE]; /* pages allocated */ - struct iovec krx_iov[KQSW_NRXMSGPAGES_LARGE]; /* iovec for forwarding */ + ptl_kiov_t krx_kiov[KQSW_NRXMSGPAGES_LARGE]; /* buffer frags */ } kqswnal_rx_t; typedef struct diff --git a/lustre/portals/knals/qswnal/qswnal_cb.c b/lustre/portals/knals/qswnal/qswnal_cb.c index 478c25f..157dc70 100644 --- a/lustre/portals/knals/qswnal/qswnal_cb.c +++ b/lustre/portals/knals/qswnal/qswnal_cb.c @@ -775,7 +775,7 @@ kqswnal_dma_reply (kqswnal_tx_t *ktx, int nfrag, int offset, int nob) { kqswnal_rx_t *krx = (kqswnal_rx_t *)ktx->ktx_args[0]; - char *buffer = (char *)page_address(krx->krx_pages[0]); + char *buffer = (char *)page_address(krx->krx_kiov[0].kiov_page); kqswnal_remotemd_t *rmd = (kqswnal_remotemd_t *)(buffer + KQSW_HDR_SIZE); int rc; #if MULTIRAIL_EKC @@ -1008,7 +1008,7 @@ kqswnal_sendmsg (nal_cb_t *nal, } memcpy(ktx->ktx_buffer + sizeof(*hdr) + sizeof(csum), &csum, sizeof(csum)); #endif - + if (kqswnal_data.kqn_optimized_gets && type == PTL_MSG_GET && /* doing a GET */ nid == targetnid) { /* not forwarding */ @@ -1167,7 +1167,7 @@ kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) { int rc; kqswnal_tx_t *ktx; - struct iovec *iov = fwd->kprfd_iov; + ptl_kiov_t *kiov = fwd->kprfd_kiov; int niov = fwd->kprfd_niov; int nob = fwd->kprfd_nob; ptl_nid_t nid = fwd->kprfd_gateway_nid; @@ -1177,11 +1177,9 @@ kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) LBUG (); #endif /* The router wants this NAL to forward a packet */ - CDEBUG (D_NET, "forwarding [%p] to "LPX64", %d frags %d bytes\n", + CDEBUG (D_NET, "forwarding [%p] to "LPX64", payload: %d frags %d bytes\n", fwd, nid, niov, nob); - LASSERT (niov > 0); - ktx = kqswnal_get_idle_tx (fwd, 0); if (ktx == NULL) /* can't get txd right now */ return; /* fwd will be scheduled when tx desc freed */ @@ -1195,44 +1193,44 @@ kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) goto failed; } - if (nob > KQSW_NRXMSGBYTES_LARGE) { - CERROR 
("Can't forward [%p] to "LPX64 - ": size %d bigger than max packet size %ld\n", - fwd, nid, nob, (long)KQSW_NRXMSGBYTES_LARGE); - rc = -EMSGSIZE; - goto failed; - } + /* copy hdr into pre-mapped buffer */ + memcpy(ktx->ktx_buffer, fwd->kprfd_hdr, sizeof(ptl_hdr_t)); + ktx->ktx_wire_hdr = (ptl_hdr_t *)ktx->ktx_buffer; - ktx->ktx_port = (nob <= (KQSW_HDR_SIZE + KQSW_SMALLPAYLOAD)) ? + ktx->ktx_port = (nob <= KQSW_SMALLPAYLOAD) ? EP_MSG_SVC_PORTALS_SMALL : EP_MSG_SVC_PORTALS_LARGE; ktx->ktx_nid = nid; ktx->ktx_state = KTX_FORWARDING; ktx->ktx_args[0] = fwd; + ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 1; - if ((kqswnal_data.kqn_copy_small_fwd || niov > 1) && - nob <= KQSW_TX_BUFFER_SIZE) + if (nob <= KQSW_TX_MAXCONTIG) { - /* send from ktx's pre-mapped contiguous buffer? */ - lib_copy_iov2buf (ktx->ktx_buffer, niov, iov, 0, nob); + /* send payload from ktx's pre-mapped contiguous buffer */ #if MULTIRAIL_EKC ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, - 0, nob); + 0, KQSW_HDR_SIZE + nob); #else ktx->ktx_frags[0].Base = ktx->ktx_ebuffer; - ktx->ktx_frags[0].Len = nob; + ktx->ktx_frags[0].Len = KQSW_HDR_SIZE + nob; #endif - ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 1; - ktx->ktx_wire_hdr = (ptl_hdr_t *)ktx->ktx_buffer; + if (nob > 0) + lib_copy_kiov2buf(ktx->ktx_buffer + KQSW_HDR_SIZE, + niov, kiov, 0, nob); } else { - /* zero copy */ - ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 0; - rc = kqswnal_map_tx_iov (ktx, 0, nob, niov, iov); + /* zero copy payload */ +#if MULTIRAIL_EKC + ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, + 0, KQSW_HDR_SIZE); +#else + ktx->ktx_frags[0].Base = ktx->ktx_ebuffer; + ktx->ktx_frags[0].Len = KQSW_HDR_SIZE; +#endif + rc = kqswnal_map_tx_kiov (ktx, 0, nob, niov, kiov); if (rc != 0) goto failed; - - ktx->ktx_wire_hdr = (ptl_hdr_t *)iov[0].iov_base; } rc = kqswnal_launch (ktx); @@ -1257,7 +1255,7 @@ kqswnal_fwd_callback (void *arg, int error) if (error != 0) { - ptl_hdr_t *hdr = (ptl_hdr_t *)page_address (krx->krx_pages[0]); + 
ptl_hdr_t *hdr = (ptl_hdr_t *)page_address (krx->krx_kiov[0].kiov_page); CERROR("Failed to route packet from "LPX64" to "LPX64": %d\n", NTOH__u64(hdr->src_nid), NTOH__u64(hdr->dest_nid),error); @@ -1371,8 +1369,9 @@ kqswnal_requeue_rx (kqswnal_rx_t *krx) void kqswnal_rx (kqswnal_rx_t *krx) { - ptl_hdr_t *hdr = (ptl_hdr_t *) page_address (krx->krx_pages[0]); + ptl_hdr_t *hdr = (ptl_hdr_t *) page_address(krx->krx_kiov[0].kiov_page); ptl_nid_t dest_nid = NTOH__u64 (hdr->dest_nid); + int payload_nob; int nob; int niov; @@ -1398,16 +1397,26 @@ kqswnal_rx (kqswnal_rx_t *krx) return; } - /* NB forwarding may destroy iov; rebuild every time */ - for (nob = krx->krx_nob, niov = 0; nob > 0; nob -= PAGE_SIZE, niov++) - { - LASSERT (niov < krx->krx_npages); - krx->krx_iov[niov].iov_base= page_address(krx->krx_pages[niov]); - krx->krx_iov[niov].iov_len = MIN(PAGE_SIZE, nob); + nob = payload_nob = krx->krx_nob - KQSW_HDR_SIZE; + niov = 0; + if (nob > 0) { + krx->krx_kiov[0].kiov_offset = KQSW_HDR_SIZE; + krx->krx_kiov[0].kiov_len = MIN(PAGE_SIZE - KQSW_HDR_SIZE, nob); + niov = 1; + nob -= PAGE_SIZE - KQSW_HDR_SIZE; + + while (nob > 0) { + LASSERT (niov < krx->krx_npages); + + krx->krx_kiov[niov].kiov_offset = 0; + krx->krx_kiov[niov].kiov_len = MIN(PAGE_SIZE, nob); + niov++; + nob -= PAGE_SIZE; + } } - kpr_fwd_init (&krx->krx_fwd, dest_nid, - krx->krx_nob, niov, krx->krx_iov, + kpr_fwd_init (&krx->krx_fwd, dest_nid, + hdr, payload_nob, niov, krx->krx_kiov, kqswnal_fwd_callback, krx); kpr_fwd_start (&kqswnal_data.kqn_router, &krx->krx_fwd); @@ -1471,7 +1480,7 @@ kqswnal_rxhandler(EP_RXD *rxd) void kqswnal_csum_error (kqswnal_rx_t *krx, int ishdr) { - ptl_hdr_t *hdr = (ptl_hdr_t *)page_address (krx->krx_pages[0]); + ptl_hdr_t *hdr = (ptl_hdr_t *)page_address (krx->krx_kiov[0].kiov_page); CERROR ("%s checksum mismatch %p: dnid "LPX64", snid "LPX64 ", dpid %d, spid %d, type %d\n", @@ -1526,6 +1535,7 @@ kqswnal_recvmsg (nal_cb_t *nal, size_t rlen) { kqswnal_rx_t *krx = (kqswnal_rx_t 
*)private; + char *buffer = page_address(krx->krx_kiov[0].kiov_page); int page; char *page_ptr; int page_nob; @@ -1535,8 +1545,7 @@ kqswnal_recvmsg (nal_cb_t *nal, #if KQSW_CHECKSUM kqsw_csum_t senders_csum; kqsw_csum_t payload_csum = 0; - kqsw_csum_t hdr_csum = kqsw_csum(0, page_address(krx->krx_pages[0]), - sizeof(ptl_hdr_t)); + kqsw_csum_t hdr_csum = kqsw_csum(0, buffer, sizeof(ptl_hdr_t)); size_t csum_len = mlen; int csum_frags = 0; int csum_nob = 0; @@ -1545,8 +1554,7 @@ kqswnal_recvmsg (nal_cb_t *nal, atomic_inc (&csum_counter); - memcpy (&senders_csum, ((char *)page_address (krx->krx_pages[0])) + - sizeof (ptl_hdr_t), sizeof (kqsw_csum_t)); + memcpy (&senders_csum, buffer + sizeof (ptl_hdr_t), sizeof (kqsw_csum_t)); if (senders_csum != hdr_csum) kqswnal_csum_error (krx, 1); #endif @@ -1567,8 +1575,7 @@ kqswnal_recvmsg (nal_cb_t *nal, if (mlen != 0) { page = 0; - page_ptr = ((char *) page_address(krx->krx_pages[0])) + - KQSW_HDR_SIZE; + page_ptr = buffer + KQSW_HDR_SIZE; page_nob = PAGE_SIZE - KQSW_HDR_SIZE; LASSERT (niov > 0); @@ -1621,7 +1628,7 @@ kqswnal_recvmsg (nal_cb_t *nal, { page++; LASSERT (page < krx->krx_npages); - page_ptr = page_address(krx->krx_pages[page]); + page_ptr = page_address(krx->krx_kiov[page].kiov_page); page_nob = PAGE_SIZE; } @@ -1649,8 +1656,8 @@ kqswnal_recvmsg (nal_cb_t *nal, } #if KQSW_CHECKSUM - memcpy (&senders_csum, ((char *)page_address (krx->krx_pages[0])) + - sizeof(ptl_hdr_t) + sizeof(kqsw_csum_t), sizeof(kqsw_csum_t)); + memcpy (&senders_csum, buffer + sizeof(ptl_hdr_t) + sizeof(kqsw_csum_t), + sizeof(kqsw_csum_t)); if (csum_len != rlen) CERROR("Unable to checksum data in user's buffer\n"); diff --git a/lustre/portals/knals/socknal/socknal.c b/lustre/portals/knals/socknal/socknal.c index c47dcb4..2c44b43 100644 --- a/lustre/portals/knals/socknal/socknal.c +++ b/lustre/portals/knals/socknal/socknal.c @@ -1388,6 +1388,7 @@ ksocknal_cmd(struct portals_cfg *pcfg, void * private) void ksocknal_free_fmbs (ksock_fmb_pool_t *p) 
{ + int npages = p->fmp_buff_pages; ksock_fmb_t *fmb; int i; @@ -1399,12 +1400,12 @@ ksocknal_free_fmbs (ksock_fmb_pool_t *p) fmb = list_entry(p->fmp_idle_fmbs.next, ksock_fmb_t, fmb_list); - for (i = 0; i < fmb->fmb_npages; i++) - if (fmb->fmb_pages[i] != NULL) - __free_page(fmb->fmb_pages[i]); - + for (i = 0; i < npages; i++) + if (fmb->fmb_kiov[i].kiov_page != NULL) + __free_page(fmb->fmb_kiov[i].kiov_page); + list_del(&fmb->fmb_list); - PORTAL_FREE(fmb, sizeof(*fmb)); + PORTAL_FREE(fmb, offsetof(ksock_fmb_t, fmb_kiov[npages])); } } @@ -1603,10 +1604,12 @@ ksocknal_module_init (void) spin_lock_init(&ksocknal_data.ksnd_small_fmp.fmp_lock); INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_idle_fmbs); INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns); + ksocknal_data.ksnd_small_fmp.fmp_buff_pages = SOCKNAL_SMALL_FWD_PAGES; spin_lock_init(&ksocknal_data.ksnd_large_fmp.fmp_lock); INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_idle_fmbs); INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns); + ksocknal_data.ksnd_large_fmp.fmp_buff_pages = SOCKNAL_LARGE_FWD_PAGES; spin_lock_init (&ksocknal_data.ksnd_reaper_lock); INIT_LIST_HEAD (&ksocknal_data.ksnd_enomem_conns); @@ -1690,34 +1693,36 @@ ksocknal_module_init (void) for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS); i++) { - ksock_fmb_t *fmb; + ksock_fmb_t *fmb; + ksock_fmb_pool_t *pool; + + + if (i < SOCKNAL_SMALL_FWD_NMSGS) + pool = &ksocknal_data.ksnd_small_fmp; + else + pool = &ksocknal_data.ksnd_large_fmp; - PORTAL_ALLOC(fmb, sizeof(*fmb)); + PORTAL_ALLOC(fmb, offsetof(ksock_fmb_t, + fmb_kiov[pool->fmp_buff_pages])); if (fmb == NULL) { ksocknal_module_fini(); return (-ENOMEM); } - if (i < SOCKNAL_SMALL_FWD_NMSGS) { - fmb->fmb_npages = SOCKNAL_SMALL_FWD_PAGES; - fmb->fmb_pool = &ksocknal_data.ksnd_small_fmp; - } else { - fmb->fmb_npages = SOCKNAL_LARGE_FWD_PAGES; - fmb->fmb_pool = &ksocknal_data.ksnd_large_fmp; - } - - for (j = 0; j < fmb->fmb_npages; j++) { - 
fmb->fmb_pages[j] = alloc_page(GFP_KERNEL); + fmb->fmb_pool = pool; + + for (j = 0; j < pool->fmp_buff_pages; j++) { + fmb->fmb_kiov[j].kiov_page = alloc_page(GFP_KERNEL); - if (fmb->fmb_pages[j] == NULL) { + if (fmb->fmb_kiov[j].kiov_page == NULL) { ksocknal_module_fini (); return (-ENOMEM); } - LASSERT(page_address(fmb->fmb_pages[j]) != NULL); + LASSERT(page_address(fmb->fmb_kiov[j].kiov_page) != NULL); } - list_add(&fmb->fmb_list, &fmb->fmb_pool->fmp_idle_fmbs); + list_add(&fmb->fmb_list, &pool->fmp_idle_fmbs); } } diff --git a/lustre/portals/knals/socknal/socknal.h b/lustre/portals/knals/socknal/socknal.h index 0f0b9bd..db8c842 100644 --- a/lustre/portals/knals/socknal/socknal.h +++ b/lustre/portals/knals/socknal/socknal.h @@ -44,6 +44,7 @@ #include #include +#include #include #include @@ -88,7 +89,7 @@ #define SOCKNAL_SMALL_FWD_PAGES 1 /* # pages in a small message fwd buffer */ -#define SOCKNAL_LARGE_FWD_PAGES (PAGE_ALIGN (sizeof (ptl_hdr_t) + PTL_MTU) >> PAGE_SHIFT) +#define SOCKNAL_LARGE_FWD_PAGES (PAGE_ALIGN(PTL_MTU) >> PAGE_SHIFT) /* # pages in a large message fwd buffer */ #define SOCKNAL_RESCHED 100 /* # scheduler loops before reschedule */ @@ -115,6 +116,7 @@ typedef struct /* pool of forwarding buffers */ struct list_head fmp_idle_fmbs; /* free buffers */ struct list_head fmp_blocked_conns; /* connections waiting for a buffer */ int fmp_nactive_fmbs; /* # buffers in use */ + int fmp_buff_pages; /* # pages per buffer */ } ksock_fmb_pool_t; @@ -193,18 +195,13 @@ typedef struct { #define SOCKNAL_INIT_ALL 3 /* A packet just assembled for transmission is represented by 1 or more - * struct iovec fragments and 0 or more ptl_kiov_t fragments. Forwarded - * messages, or messages from an MD with PTL_MD_KIOV _not_ set have 0 - * ptl_kiov_t fragments. Messages from an MD with PTL_MD_KIOV set, have 1 - * struct iovec fragment (the header) and up to PTL_MD_MAX_IOV ptl_kiov_t - * fragments. 
+ * struct iovec fragments (the first frag contains the portals header), + * followed by 0 or more ptl_kiov_t fragments. * * On the receive side, initially 1 struct iovec fragment is posted for - * receive (the header). Once the header has been received, if the message - * requires forwarding or will be received into mapped memory, up to - * PTL_MD_MAX_IOV struct iovec fragments describe the target memory. - * Otherwise up to PTL_MD_MAX_IOV ptl_kiov_t fragments are used. - */ + * receive (the header). Once the header has been received, the payload is + * received into either struct iovec or ptl_kiov_t fragments, depending on + * what the header matched or whether the message needs forwarding. */ struct ksock_conn; /* forward ref */ struct ksock_peer; /* forward ref */ @@ -227,6 +224,12 @@ typedef struct /* transmit packet */ #endif } ksock_tx_t; +typedef struct /* forwarded packet */ +{ + ksock_tx_t ftx_tx; /* send info */ + struct iovec ftx_iov; /* hdr iovec */ +} ksock_ftx_t; + #define KSOCK_ZCCD_2_TX(ptr) list_entry (ptr, ksock_tx_t, tx_zccd) /* network zero copy callback descriptor embedded in ksock_tx_t */ @@ -254,15 +257,14 @@ typedef struct /* Kernel portals Socket Forward { /* (socknal->router) */ struct list_head fmb_list; /* queue idle */ kpr_fwd_desc_t fmb_fwd; /* router's descriptor */ - int fmb_npages; /* # pages allocated */ ksock_fmb_pool_t *fmb_pool; /* owning pool */ struct ksock_peer *fmb_peer; /* peer received from */ - struct page *fmb_pages[SOCKNAL_LARGE_FWD_PAGES]; - struct iovec fmb_iov[SOCKNAL_LARGE_FWD_PAGES]; + ptl_hdr_t fmb_hdr; /* message header */ + ptl_kiov_t fmb_kiov[0]; /* payload frags */ } ksock_fmb_t; /* space for the rx frag descriptors; we either read a single contiguous - * header, or PTL_MD_MAX_IOV frags of payload of either type. */ + * header, or up to PTL_MD_MAX_IOV frags of payload of either type. 
*/ typedef union { struct iovec iov[PTL_MD_MAX_IOV]; ptl_kiov_t kiov[PTL_MD_MAX_IOV]; diff --git a/lustre/portals/knals/socknal/socknal_cb.c b/lustre/portals/knals/socknal/socknal_cb.c index c6cdaba..c89e20e 100644 --- a/lustre/portals/knals/socknal/socknal_cb.c +++ b/lustre/portals/knals/socknal/socknal_cb.c @@ -123,7 +123,7 @@ ksocknal_free_ltx (ksock_ltx_t *ltx) PORTAL_FREE(ltx, ltx->ltx_desc_size); } -#if SOCKNAL_ZC +#if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC) struct page * ksocknal_kvaddr_to_page (unsigned long vaddr) { @@ -159,7 +159,7 @@ ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) int more = (tx->tx_niov > 1) || (tx->tx_nkiov > 0) || (!list_empty (&conn->ksnc_tx_queue)); -#if SOCKNAL_ZC +#if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC) int offset = vaddr & (PAGE_SIZE - 1); int zcsize = MIN (fragsize, PAGE_SIZE - offset); struct page *page; @@ -171,7 +171,7 @@ ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) LASSERT (fragsize <= tx->tx_resid); LASSERT (tx->tx_niov > 0); -#if SOCKNAL_ZC +#if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC) if (zcsize >= ksocknal_data.ksnd_zc_min_frag && (sock->sk->route_caps & NETIF_F_SG) && (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) && @@ -1133,7 +1133,7 @@ void ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) { ptl_nid_t nid = fwd->kprfd_gateway_nid; - ksock_tx_t *tx = (ksock_tx_t *)&fwd->kprfd_scratch; + ksock_ftx_t *ftx = (ksock_ftx_t *)&fwd->kprfd_scratch; int rc; CDEBUG (D_NET, "Forwarding [%p] -> "LPX64" ("LPX64"))\n", fwd, @@ -1143,14 +1143,18 @@ ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) if (nid == ksocknal_lib.ni.nid) nid = fwd->kprfd_target_nid; - tx->tx_isfwd = 1; /* This is a forwarding packet */ - tx->tx_nob = fwd->kprfd_nob; - tx->tx_niov = fwd->kprfd_niov; - tx->tx_iov = fwd->kprfd_iov; - tx->tx_nkiov = 0; - tx->tx_kiov = NULL; + /* setup iov for hdr */ + ftx->ftx_iov.iov_base = fwd->kprfd_hdr; + ftx->ftx_iov.iov_len = sizeof(ptl_hdr_t); + + ftx->ftx_tx.tx_isfwd = 1; /* This is a 
forwarding packet */ + ftx->ftx_tx.tx_nob = sizeof(ptl_hdr_t) + fwd->kprfd_nob; + ftx->ftx_tx.tx_niov = 1; + ftx->ftx_tx.tx_iov = &ftx->ftx_iov; + ftx->ftx_tx.tx_nkiov = fwd->kprfd_niov; + ftx->ftx_tx.tx_kiov = fwd->kprfd_kiov; - rc = ksocknal_launch_packet (tx, nid); + rc = ksocknal_launch_packet (&ftx->ftx_tx, nid); if (rc != 0) kpr_fwd_done (&ksocknal_data.ksnd_router, fwd, rc); } @@ -1178,7 +1182,7 @@ ksocknal_fmb_callback (void *arg, int error) { ksock_fmb_t *fmb = (ksock_fmb_t *)arg; ksock_fmb_pool_t *fmp = fmb->fmb_pool; - ptl_hdr_t *hdr = (ptl_hdr_t *) page_address(fmb->fmb_pages[0]); + ptl_hdr_t *hdr = (ptl_hdr_t *)page_address(fmb->fmb_kiov[0].kiov_page); ksock_conn_t *conn = NULL; ksock_sched_t *sched; unsigned long flags; @@ -1236,7 +1240,6 @@ ksock_fmb_t * ksocknal_get_idle_fmb (ksock_conn_t *conn) { int payload_nob = conn->ksnc_rx_nob_left; - int packet_nob = sizeof (ptl_hdr_t) + payload_nob; unsigned long flags; ksock_fmb_pool_t *pool; ksock_fmb_t *fmb; @@ -1244,7 +1247,7 @@ ksocknal_get_idle_fmb (ksock_conn_t *conn) LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB); LASSERT (kpr_routing(&ksocknal_data.ksnd_router)); - if (packet_nob <= SOCKNAL_SMALL_FWD_PAGES * PAGE_SIZE) + if (payload_nob <= SOCKNAL_SMALL_FWD_PAGES * PAGE_SIZE) pool = &ksocknal_data.ksnd_small_fmp; else pool = &ksocknal_data.ksnd_large_fmp; @@ -1275,98 +1278,64 @@ ksocknal_get_idle_fmb (ksock_conn_t *conn) int ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) { - int payload_nob = conn->ksnc_rx_nob_left; - int packet_nob = sizeof (ptl_hdr_t) + payload_nob; + int payload_nob = conn->ksnc_rx_nob_left; ptl_nid_t dest_nid = NTOH__u64 (conn->ksnc_hdr.dest_nid); - int niov; /* at least the header */ - int nob; + int niov = 0; + int nob = payload_nob; LASSERT (conn->ksnc_rx_scheduled); LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB); LASSERT (conn->ksnc_rx_nob_wanted == conn->ksnc_rx_nob_left); LASSERT (payload_nob >= 0); - LASSERT (packet_nob <= fmb->fmb_npages * PAGE_SIZE); 
+ LASSERT (payload_nob <= fmb->fmb_pool->fmp_buff_pages * PAGE_SIZE); LASSERT (sizeof (ptl_hdr_t) < PAGE_SIZE); - - /* Got a forwarding buffer; copy the header we just read into the - * forwarding buffer. If there's payload, start reading reading it - * into the buffer, otherwise the forwarding buffer can be kicked - * off immediately. - * - * NB fmb->fmb_iov spans the WHOLE packet. - * conn->ksnc_rx_iov spans just the payload. - */ - fmb->fmb_iov[0].iov_base = page_address (fmb->fmb_pages[0]); - - /* copy header */ - memcpy (fmb->fmb_iov[0].iov_base, &conn->ksnc_hdr, sizeof (ptl_hdr_t)); + LASSERT (fmb->fmb_kiov[0].kiov_offset == 0); /* Take a ref on the conn's peer to prevent module unload before - * forwarding completes. NB we ref peer and not conn since because - * all refs on conn after it has been closed must remove themselves - * in finite time */ + * forwarding completes. */ fmb->fmb_peer = conn->ksnc_peer; atomic_inc (&conn->ksnc_peer->ksnp_refcount); - if (payload_nob == 0) { /* got complete packet already */ - CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (immediate)\n", - conn, NTOH__u64 (conn->ksnc_hdr.src_nid), - dest_nid, packet_nob); + /* Copy the header we just read into the forwarding buffer. If + * there's payload, start reading reading it into the buffer, + * otherwise the forwarding buffer can be kicked off + * immediately. 
*/ + fmb->fmb_hdr = conn->ksnc_hdr; - fmb->fmb_iov[0].iov_len = sizeof (ptl_hdr_t); + while (nob > 0) { + LASSERT (niov < fmb->fmb_pool->fmp_buff_pages); + LASSERT (fmb->fmb_kiov[niov].kiov_offset == 0); + fmb->fmb_kiov[niov].kiov_len = MIN (PAGE_SIZE, nob); + nob -= PAGE_SIZE; + niov++; + } + + kpr_fwd_init(&fmb->fmb_fwd, dest_nid, &fmb->fmb_hdr, + payload_nob, niov, fmb->fmb_kiov, + ksocknal_fmb_callback, fmb); - kpr_fwd_init (&fmb->fmb_fwd, dest_nid, - packet_nob, 1, fmb->fmb_iov, - ksocknal_fmb_callback, fmb); + if (payload_nob == 0) { /* got complete packet already */ + CDEBUG (D_NET, "%p "LPX64"->"LPX64" fwd_start (immediate)\n", + conn, NTOH__u64 (conn->ksnc_hdr.src_nid), dest_nid); - /* forward it now */ kpr_fwd_start (&ksocknal_data.ksnd_router, &fmb->fmb_fwd); ksocknal_new_packet (conn, 0); /* on to next packet */ return (1); } - niov = 1; - if (packet_nob <= PAGE_SIZE) { /* whole packet fits in first page */ - fmb->fmb_iov[0].iov_len = packet_nob; - } else { - fmb->fmb_iov[0].iov_len = PAGE_SIZE; - nob = packet_nob - PAGE_SIZE; - - do { - LASSERT (niov < fmb->fmb_npages); - fmb->fmb_iov[niov].iov_base = - page_address (fmb->fmb_pages[niov]); - fmb->fmb_iov[niov].iov_len = MIN (PAGE_SIZE, nob); - nob -= PAGE_SIZE; - niov++; - } while (nob > 0); - } - - kpr_fwd_init (&fmb->fmb_fwd, dest_nid, - packet_nob, niov, fmb->fmb_iov, - ksocknal_fmb_callback, fmb); - conn->ksnc_cookie = fmb; /* stash fmb for later */ conn->ksnc_rx_state = SOCKNAL_RX_BODY_FWD; /* read in the payload */ - /* payload is desc's iov-ed buffer, but skipping the hdr */ - LASSERT (niov <= sizeof (conn->ksnc_rx_iov_space) / - sizeof (struct iovec)); - - conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space; - conn->ksnc_rx_iov[0].iov_base = - (void *)(((unsigned long)fmb->fmb_iov[0].iov_base) + - sizeof (ptl_hdr_t)); - conn->ksnc_rx_iov[0].iov_len = - fmb->fmb_iov[0].iov_len - sizeof (ptl_hdr_t); - - if (niov > 1) - memcpy(&conn->ksnc_rx_iov[1], &fmb->fmb_iov[1], - (niov - 1) * sizeof 
(struct iovec)); - - conn->ksnc_rx_niov = niov; + /* Set up conn->ksnc_rx_kiov to read the payload into fmb's kiov-ed + * buffer */ + LASSERT (niov <= sizeof(conn->ksnc_rx_iov_space)/sizeof(ptl_kiov_t)); + conn->ksnc_rx_niov = 0; + conn->ksnc_rx_nkiov = niov; + conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov; + memcpy(conn->ksnc_rx_kiov, fmb->fmb_kiov, niov * sizeof(ptl_kiov_t)); + CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d reading body\n", conn, NTOH__u64 (conn->ksnc_hdr.src_nid), dest_nid, payload_nob); return (0); diff --git a/lustre/portals/router/router.c b/lustre/portals/router/router.c index e29f628..d0dbf0a 100644 --- a/lustre/portals/router/router.c +++ b/lustre/portals/router/router.c @@ -456,14 +456,13 @@ kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd) CDEBUG (D_NET, "forward [%p] "LPX64" from NAL %d\n", fwd, target_nid, src_ne->kpne_interface.kprni_nalid); - LASSERT (nob >= sizeof (ptl_hdr_t)); /* at least got a packet header */ - LASSERT (nob == lib_iov_nob (fwd->kprfd_niov, fwd->kprfd_iov)); + LASSERT (nob == lib_kiov_nob (fwd->kprfd_niov, fwd->kprfd_kiov)); atomic_inc (&kpr_queue_depth); atomic_inc (&src_ne->kpne_refcount); /* source nal is busy until fwd completes */ kpr_fwd_packets++; /* (loose) stats accounting */ - kpr_fwd_bytes += nob; + kpr_fwd_bytes += nob + sizeof(ptl_hdr_t); if (src_ne->kpne_shutdown) /* caller is shutting down */ goto out; diff --git a/lustre/portals/utils/Makefile.am b/lustre/portals/utils/Makefile.am index 6c31b3d..925406f 100644 --- a/lustre/portals/utils/Makefile.am +++ b/lustre/portals/utils/Makefile.am @@ -26,11 +26,11 @@ libptlctl_a_SOURCES = portals.c debug.c l_ioctl.c parser.c parser.h gmnalnid_SOURCES = gmnalnid.c ptlctl_SOURCES = ptlctl.c -ptlctl_LDADD = -L. -lptlctl -lncurses # -lefence +ptlctl_LDADD = -L. -lptlctl $(LIBREADLINE) $(LIBEFENCE) ptlctl_DEPENDENCIES = libptlctl.a debugctl_SOURCES = debugctl.c -debugctl_LDADD = -L. -lptlctl -lncurses # -lefence +debugctl_LDADD = -L. 
-lptlctl $(LIBREADLINE) $(LIBEFENCE) debugctl_DEPENDENCIES = libptlctl.a routerstat_SOURCES = routerstat.c diff --git a/lustre/portals/utils/gmnalnid.c b/lustre/portals/utils/gmnalnid.c index 84ac97f..ff6631c 100644 --- a/lustre/portals/utils/gmnalnid.c +++ b/lustre/portals/utils/gmnalnid.c @@ -115,5 +115,5 @@ int main(int argc, char **argv) free(pcfg.pcfg_pbuf1); close(pfd); printf("%u\n", nid); - exit(nid); + exit(0); } diff --git a/lustre/scripts/lustre.spec.in b/lustre/scripts/lustre.spec.in index 156479d..a49417e 100644 --- a/lustre/scripts/lustre.spec.in +++ b/lustre/scripts/lustre.spec.in @@ -1,5 +1,5 @@ # lustre.spec -%define version v1_2_0 +%define version 1.2.0.3 %define kversion @LINUXRELEASE@ %define linuxdir @LINUX@ %define enable_doc @ENABLE_DOC@ diff --git a/lustre/tests/.RC_CURRENT.tag b/lustre/tests/.RC_CURRENT.tag index efebb89..0989b52 100644 --- a/lustre/tests/.RC_CURRENT.tag +++ b/lustre/tests/.RC_CURRENT.tag @@ -1 +1 @@ -RC_1_1_9_6 +RC_1_3_0_1 diff --git a/lustre/tests/acceptance-small.sh b/lustre/tests/acceptance-small.sh index 6a32076..abe38c8 100755 --- a/lustre/tests/acceptance-small.sh +++ b/lustre/tests/acceptance-small.sh @@ -157,10 +157,10 @@ if [ "$CONF_SANITY" != "no" ]; then sh conf-sanity.sh fi -if [ "$REPLAY_OST_SINGLE" != "no" ]; then - sh replay-ost-single.sh -fi - if [ "$RECOVERY_SMALL" != "no" ]; then sh recovery-small.sh fi + +if [ "$REPLAY_OST_SINGLE" != "no" ]; then + sh replay-ost-single.sh +fi diff --git a/lustre/tests/munlink.c b/lustre/tests/munlink.c index 0f42434..62c2765 100755 --- a/lustre/tests/munlink.c +++ b/lustre/tests/munlink.c @@ -13,16 +13,23 @@ int main(int argc, char ** argv) { int rc, i; - if (argc < 2) { - printf("Usage %s filename\n", argv[0]); + if (argc < 2) { + printf("Usage %s filename {filename ...}\n", argv[0]); return 1; } for (i = 1; i < argc; i++) { - rc = unlink(argv[i]); - if (rc) - printf("unlink(%s) error: %s\n", argv[i], - strerror(errno)); + rc = unlink(argv[i]); + if (rc) { + 
printf("unlink(%s): %s ", argv[i], strerror(errno)); + rc = access(argv[i], F_OK); + if (rc && errno == ENOENT) + printf("(unlinked anyways)\n"); + else if (rc == 0) + printf("(still exists)\n"); + else + printf("(%s looking up)\n", strerror(errno)); + } } return rc; -} +} diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 67595fc..ef241b2 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -838,6 +838,22 @@ test_42() { } run_test 42 "recoery after ost failure" +# b=2530 +# directory orphans can't be unlinked from PENDING directory +test_43() { + replay_barrier mds + + # OBD_FAIL_OST_CREATE_NET 0x204 + do_facet ost "sysctl -w lustre.fail_loc=0x80000204" + facet_failover mds + df $MOUNT || return 1 + sleep 10 + do_facet ost "sysctl -w lustre.fail_loc=0" + + return 0 +} +run_test 43 "mds osc import failure during recovery; don't LBUG" + equals_msg test complete, cleaning up $CLEANUP diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 84b645a..538d819 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -648,9 +648,7 @@ test_25a() { run_test 25a "create file in symlinked directory ===============" test_25b() { - if [ ! -d $DIR/d25 ]; then - run_one 25a - fi + [ ! -d $DIR/d25 ] && test_25a $CHECKSTAT -t file $DIR/s25/foo || error } run_test 25b "lookup file in symlinked directory ===============" @@ -687,9 +685,7 @@ test_26d() { run_test 26d "create multiple component recursive symlink ======" test_26e() { - if [ ! -h $DIR/d26-3 ]; then - run_one 26d - fi + [ ! 
-h $DIR/d26-3 ] && test_26d rm $DIR/d26-3 } run_test 26e "unlink multiple component recursive symlink ======" @@ -1098,92 +1094,91 @@ run_test 33a "test open file(mode=0444) with O_RDWR (should return error)" TEST_34_SIZE=${TEST_34_SIZE:-2000000000000} test_34a() { - rm -f $DIR/test_34_file - $MCREATE $DIR/test_34_file || error - $LFIND $DIR/test_34_file 2>&1 | grep -q "no stripe info" || error - $TRUNCATE $DIR/test_34_file $TEST_34_SIZE || error - $LFIND $DIR/test_34_file 2>&1 | grep -q "no stripe info" || error - $CHECKSTAT -s $TEST_34_SIZE $DIR/test_34_file || error + rm -f $DIR/f34 + $MCREATE $DIR/f34 || error + $LFIND $DIR/f34 2>&1 | grep -q "no stripe info" || error + $TRUNCATE $DIR/f34 $TEST_34_SIZE || error + $LFIND $DIR/f34 2>&1 | grep -q "no stripe info" || error + $CHECKSTAT -s $TEST_34_SIZE $DIR/f34 || error } run_test 34a "truncate file that has not been opened ===========" test_34b() { - [ ! -f $DIR/test_34_file ] && run_one 34a - $CHECKSTAT -s $TEST_34_SIZE $DIR/test_34_file || error - $OPENFILE -f O_RDONLY $DIR/test_34_file - $LFIND $DIR/test_34_file 2>&1 | grep -q "no stripe info" || error - $CHECKSTAT -s $TEST_34_SIZE $DIR/test_34_file || error + [ ! -f $DIR/f34 ] && test_34a + $CHECKSTAT -s $TEST_34_SIZE $DIR/f34 || error + $OPENFILE -f O_RDONLY $DIR/f34 + $LFIND $DIR/f34 2>&1 | grep -q "no stripe info" || error + $CHECKSTAT -s $TEST_34_SIZE $DIR/f34 || error } run_test 34b "O_RDONLY opening file doesn't create objects =====" test_34c() { - [ ! -f $DIR/test_34_file ] && run_one 34a - $CHECKSTAT -s $TEST_34_SIZE $DIR/test_34_file || error - $OPENFILE -f O_RDWR $DIR/test_34_file - $LFIND $DIR/test_34_file 2>&1 | grep -q "no stripe info" && error - $CHECKSTAT -s $TEST_34_SIZE $DIR/test_34_file || error + [ ! 
-f $DIR/f34 ] && test_34a + $CHECKSTAT -s $TEST_34_SIZE $DIR/f34 || error + $OPENFILE -f O_RDWR $DIR/f34 + $LFIND $DIR/f34 2>&1 | grep -q "no stripe info" && error + $CHECKSTAT -s $TEST_34_SIZE $DIR/f34 || error } run_test 34c "O_RDWR opening file-with-size works ==============" test_34d() { - dd if=/dev/zero of=$DIR/test_34_file conv=notrunc bs=4k count=1 || error - $CHECKSTAT -s $TEST_34_SIZE $DIR/test_34_file || error - rm $DIR/test_34_file + dd if=/dev/zero of=$DIR/f34 conv=notrunc bs=4k count=1 || error + $CHECKSTAT -s $TEST_34_SIZE $DIR/f34 || error + rm $DIR/f34 } run_test 34d "write to sparse file =============================" test_34e() { - rm -f $DIR/test_34_file - $MCREATE $DIR/test_34_file || error - $TRUNCATE $DIR/test_34_file 1000 || error - $CHECKSTAT -s 1000 $DIR/test_34_file || error - $OPENFILE -f O_RDWR $DIR/test_34_file - $CHECKSTAT -s 1000 $DIR/test_34_file || error + rm -f $DIR/f34e + $MCREATE $DIR/f34e || error + $TRUNCATE $DIR/f34e 1000 || error + $CHECKSTAT -s 1000 $DIR/f34e || error + $OPENFILE -f O_RDWR $DIR/f34e + $CHECKSTAT -s 1000 $DIR/f34e || error } run_test 34e "create objects, some with size and some without ==" test_35a() { - cp /bin/sh $DIR/test_35a_file - chmod 444 $DIR/test_35a_file - chown $RUNAS_ID $DIR/test_35a_file - $RUNAS $DIR/test_35a_file && error || true - rm $DIR/test_35a_file + cp /bin/sh $DIR/f35a + chmod 444 $DIR/f35a + chown $RUNAS_ID $DIR/f35a + $RUNAS $DIR/f35a && error || true + rm $DIR/f35a } run_test 35a "exec file with mode 444 (should return and not leak) =====" - test_36a() { - rm -f $DIR/test_36_file - utime $DIR/test_36_file || error + rm -f $DIR/f36 + utime $DIR/f36 || error } run_test 36a "MDS utime check (mknod, utime) ===================" test_36b() { - echo "" > $DIR/test_36_file - utime $DIR/test_36_file || error + echo "" > $DIR/f36 + utime $DIR/f36 || error } run_test 36b "OST utime check (open, utime) ====================" test_36c() { - rm -f $DIR/d36/test_36_file + rm -f $DIR/d36/f36 mkdir 
$DIR/d36 chown $RUNAS_ID $DIR/d36 - $RUNAS utime $DIR/d36/test_36_file || error + $RUNAS utime $DIR/d36/f36 || error } run_test 36c "non-root MDS utime check (mknod, utime) ==========" test_36d() { - [ ! -d $DIR/d36 ] && run_one 36c - echo "" > $DIR/d36/test_36_file - $RUNAS utime $DIR/d36/test_36_file || error + [ ! -d $DIR/d36 ] && test_36c + echo "" > $DIR/d36/f36 + $RUNAS utime $DIR/d36/f36 || error } run_test 36d "non-root OST utime check (open, utime) ===========" test_36e() { [ $RUNAS_ID -eq $UID ] && return [ ! -d $DIR/d36 ] && mkdir $DIR/d36 - touch $DIR/d36/test_36_file2 - $RUNAS utime $DIR/d36/test_36_file2 && error || true + touch $DIR/d36/f36e + $RUNAS utime $DIR/d36/f36e && error "utime worked, want failure" || true } run_test 36e "utime on non-owned file (should return error) ====" @@ -1244,16 +1239,23 @@ stop_kupdated() { trap start_kupdated EXIT } +# ensure that all stripes have some grant before we test client-side cache +for i in `seq -f $DIR/f42-%g 1 $STRIPECOUNT`; do + dd if=/dev/zero of=$i bs=4k count=1 + rm $i +done + # Tests 42* verify that our behaviour is correct WRT caching, file closure, # file truncation, and file removal. test_42a() { cancel_lru_locks OSC stop_kupdated - sync # just to be safe - BEFOREWRITES=`count_ost_writes` - dd if=/dev/zero of=$DIR/f42a bs=1024 count=100 - AFTERWRITES=`count_ost_writes` - [ $BEFOREWRITES -eq $AFTERWRITES ] || \ + sync; sleep 1; sync # just to be safe + BEFOREWRITES=`count_ost_writes` + grep [0-9] /proc/fs/lustre/osc/OSC*MNT*/cur_grant_bytes + dd if=/dev/zero of=$DIR/f42a bs=1024 count=100 + AFTERWRITES=`count_ost_writes` + [ $BEFOREWRITES -eq $AFTERWRITES ] || \ error "$BEFOREWRITES < $AFTERWRITES" start_kupdated } @@ -1776,14 +1778,14 @@ test_99a() { run_test 99a "cvs init =========================================" test_99b() { - [ ! -d $DIR/d99cvsroot ] && run_one 99a + [ ! 
-d $DIR/d99cvsroot ] && test_99a cd /etc/init.d $RUNAS cvs -d $DIR/d99cvsroot import -m "nomesg" d99reposname vtag rtag } run_test 99b "cvs import =======================================" test_99c() { - [ ! -d $DIR/d99cvsroot ] && run_one 99b + [ ! -d $DIR/d99cvsroot ] && test_99b cd $DIR mkdir -p $DIR/d99reposname chown $RUNAS_ID $DIR/d99reposname @@ -1792,7 +1794,7 @@ test_99c() { run_test 99c "cvs checkout =====================================" test_99d() { - [ ! -d $DIR/d99cvsroot ] && run_one 99c + [ ! -d $DIR/d99cvsroot ] && test_99c cd $DIR/d99reposname $RUNAS touch foo99 $RUNAS cvs add -m 'addmsg' foo99 @@ -1800,14 +1802,14 @@ test_99d() { run_test 99d "cvs add ==========================================" test_99e() { - [ ! -d $DIR/d99cvsroot ] && run_one 99c + [ ! -d $DIR/d99cvsroot ] && test_99c cd $DIR/d99reposname $RUNAS cvs update } run_test 99e "cvs update =======================================" test_99f() { - [ ! -d $DIR/d99cvsroot ] && run_one 99d + [ ! -d $DIR/d99cvsroot ] && test_99d cd $DIR/d99reposname $RUNAS cvs commit -m 'nomsg' foo99 } diff --git a/lustre/utils/llmount.c b/lustre/utils/llmount.c index ce28e09..1270f91 100644 --- a/lustre/utils/llmount.c +++ b/lustre/utils/llmount.c @@ -36,6 +36,7 @@ int debug = 0; int verbose = 0; int nomtab = 0; +static char *progname = NULL; static void update_mtab_entry(char *spec, char *node, char *type, char *opts, @@ -54,12 +55,12 @@ update_mtab_entry(char *spec, char *node, char *type, char *opts, if (!nomtab) { fp = setmntent(MOUNTED, "a+"); if (fp == NULL) { - fprintf(stderr, "setmntent(%s): %s:", MOUNTED, - strerror (errno)); + fprintf(stderr, "%s: setmntent(%s): %s:", + progname, MOUNTED, strerror (errno)); } else { if ((addmntent (fp, &mnt)) == 1) { - fprintf(stderr, "addmntent: %s:", - strerror (errno)); + fprintf(stderr, "%s: addmntent: %s:", + progname, strerror (errno)); } endmntent(fp); } @@ -109,17 +110,18 @@ parse_options(char * options, struct lustre_mount_data *lmd) lmd->lmd_nal = 
ptl_name2nal(opteq+1); } else if(!strcmp(opt, "local_nid")) { if (ptl_parse_nid(&nid, opteq+1) != 0) { - fprintf (stderr, "mount: " + fprintf (stderr, "%s: " "can't parse NID %s\n", + progname, opteq+1); return (-1); } lmd->lmd_local_nid = nid; } else if(!strcmp(opt, "server_nid")) { if (ptl_parse_nid(&nid, opteq+1) != 0) { - fprintf (stderr, "mount: " + fprintf (stderr, "%s: " "can't parse NID %s\n", - opteq+1); + progname, opteq+1); return (-1); } lmd->lmd_server_nid = nid; @@ -174,8 +176,8 @@ set_local(struct lustre_mount_data *lmd) if (lmd->lmd_nal == SOCKNAL || lmd->lmd_nal == TCPNAL) { rc = gethostname(buf, sizeof(buf) - 1); if (rc) { - fprintf (stderr, "mount: can't get local buf:" - "%d\n", rc); + fprintf (stderr, "%s: can't get local buf: %d\n", + progname, rc); return rc; } } else if (lmd->lmd_nal == QSWNAL) { @@ -190,14 +192,15 @@ set_local(struct lustre_mount_data *lmd) } while (rc != 0 && pfiles[++i] != NULL); if (rc != 0) { - fprintf(stderr, - "mount: can't read Elan ID from /proc\n"); + fprintf(stderr, "%s: can't read Elan ID from /proc\n", + progname); + return -1; } } if (ptl_parse_nid (&nid, buf) != 0) { - fprintf (stderr, "mount: can't parse NID %s\n", buf); + fprintf (stderr, "%s: can't parse NID %s\n", progname, buf); return (-1); } @@ -214,29 +217,29 @@ set_peer(char *hostname, struct lustre_mount_data *lmd) if (lmd->lmd_nal == SOCKNAL || lmd->lmd_nal == TCPNAL) { if (lmd->lmd_server_nid == PTL_NID_ANY) { if (ptl_parse_nid (&nid, hostname) != 0) { - fprintf (stderr, "mount: can't parse NID %s\n", - hostname); + fprintf (stderr, "%s: can't parse NID %s\n", + progname, hostname); return (-1); } lmd->lmd_server_nid = nid; } if (ptl_parse_ipaddr(&lmd->lmd_server_ipaddr, hostname) != 0) { - fprintf (stderr, "mount: can't parse host %s\n", - hostname); + fprintf (stderr, "%s: can't parse host %s\n", + progname, hostname); return (-1); } } else if (lmd->lmd_nal == QSWNAL) { char buf[64]; rc = sscanf(hostname, "%*[^0-9]%63[0-9]", buf); if (rc != 1) { 
- fprintf (stderr, "mount: can't get elan id from host %s\n", - hostname); + fprintf (stderr, "%s: can't get elan id from host %s\n", + progname, hostname); return -1; } if (ptl_parse_nid (&nid, buf) != 0) { - fprintf (stderr, "mount: can't parse NID %s\n", - hostname); + fprintf (stderr, "%s: can't parse NID %s\n", + progname, hostname); return (-1); } lmd->lmd_server_nid = nid; @@ -260,8 +263,9 @@ build_data(char *source, char *options, struct lustre_mount_data *lmd) return -EINVAL; if (strlen(source) > sizeof(target) + 1) { - fprintf(stderr, "mount: " - "exessively long host:/mds/profile argument\n"); + fprintf(stderr, "%s: " + "exessively long host:/mds/profile argument\n", + progname); return -EINVAL; } strcpy(target, source); @@ -276,14 +280,16 @@ build_data(char *source, char *options, struct lustre_mount_data *lmd) *s = '\0'; profile = s + 1; } else { - fprintf(stderr, "mount: " + fprintf(stderr, "%s: " "directory to mount not in " - "host:/mds/profile format\n"); + "host:/mds/profile format\n", + progname); return(-1); } } else { - fprintf(stderr, "mount: " - "directory to mount not in host:/mds/profile format\n"); + fprintf(stderr, "%s: " + "directory to mount not in host:/mds/profile format\n", + progname); return(-1); } if (verbose) @@ -302,13 +308,13 @@ build_data(char *source, char *options, struct lustre_mount_data *lmd) if (rc) return rc; if (strlen(mds) > sizeof(lmd->lmd_mds) + 1) { - fprintf(stderr, "mount: mds name too long\n"); + fprintf(stderr, "%s: mds name too long\n", progname); return(-1); } strcpy(lmd->lmd_mds, mds); if (strlen(profile) > sizeof(lmd->lmd_profile) + 1) { - fprintf(stderr, "mount: profile name too long\n"); + fprintf(stderr, "%s: profile name too long\n", progname); return(-1); } strcpy(lmd->lmd_profile, profile); @@ -325,29 +331,44 @@ main(int argc, char * const argv[]) char * target = argv[2]; char * options = ""; int opt; - int i; + int i = 3; struct lustre_mount_data lmd; int rc; + progname = strrchr(argv[0], '/'); + 
progname = progname ? progname + 1 : argv[0]; + while ((opt = getopt(argc, argv, "vno:")) != EOF) { switch (opt) { case 'v': verbose = 1; printf("verbose: %d\n", verbose); + i++; break; case 'n': nomtab = 1; printf("nomtab: %d\n", nomtab); + i++; break; case 'o': options = optarg; + i++; break; default: + i++; break; } } + if (argc < i) { + fprintf(stderr, + "%s: too few arguments\n" + "Usage: %s [-v] [-n] [-o ...]\n", + progname, progname); + exit(1); + } + if (verbose) for (i = 0; i < argc; i++) { printf("arg[%d] = %s\n", i, argv[i]); @@ -360,7 +381,7 @@ main(int argc, char * const argv[]) } if (debug) { - printf("mount: debug mode, not mounting\n"); + printf("%s: debug mode, not mounting\n", progname); exit(0); }