X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lnet%2Fklnds%2Fralnd%2Fralnd.h;h=498acff58e0778113265649f04de23c9c9973d35;hp=08430580f9e7fa6824afc385186d5213ebfcbbd4;hb=6869932b552ac705f411de3362f01bd50c1f6f7d;hpb=00f255b8c00dff66481a6ab22391869217b5d8af diff --git a/lnet/klnds/ralnd/ralnd.h b/lnet/klnds/ralnd/ralnd.h index 0843058..498acff 100644 --- a/lnet/klnds/ralnd/ralnd.h +++ b/lnet/klnds/ralnd/ralnd.h @@ -1,31 +1,49 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2004 Cluster File Systems, Inc. - * Author: Eric Barton + * GPL HEADER START * - * This file is part of Lustre, http://www.lustre.org. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/klnds/ralnd/ralnd.h + * + * Author: Eric Barton */ #ifndef EXPORT_SYMTAB # define EXPORT_SYMTAB #endif - +#ifndef AUTOCONF_INCLUDED #include +#endif #include #include #include @@ -51,55 +69,44 @@ #include #include -#define DEBUG_SUBSYSTEM S_NAL +#define DEBUG_SUBSYSTEM S_LND -#include -#include -#include -#include +#include +#include +#include #include -#define RANAL_MAXDEVS 2 /* max # devices RapidArray supports */ +/* tunables determined at compile time */ +#define RANAL_RESCHED 100 /* # scheduler loops before reschedule */ -#define RANAL_N_CONND 4 /* # connection daemons */ +#define RANAL_PEER_HASH_SIZE 101 /* # peer lists */ +#define RANAL_CONN_HASH_SIZE 101 /* # conn lists */ -#define RANAL_MIN_RECONNECT_INTERVAL 1 /* first failed connection retry (seconds)... */ -#define RANAL_MAX_RECONNECT_INTERVAL 60 /* ...exponentially increasing to this */ +#define RANAL_MIN_TIMEOUT 5 /* minimum timeout interval (seconds) */ +#define RANAL_TIMEOUT2KEEPALIVE(t) (((t)+1)/2) /* timeout -> keepalive interval */ -#define RANAL_FMA_MAX_PREFIX 232 /* max size of FMA "Prefix" */ +/* fixed constants */ +#define RANAL_MAXDEVS 2 /* max # devices RapidArray supports */ +#define RANAL_FMA_MAX_PREFIX 232 /* max bytes in FMA "Prefix" we can use */ #define RANAL_FMA_MAX_DATA ((7<<10)-256) /* Max FMA MSG is 7K including prefix */ -#define RANAL_PEER_HASH_SIZE 101 /* # peer lists */ -#define RANAL_CONN_HASH_SIZE 101 /* # conn lists */ - -#define RANAL_NTX 64 /* # tx descs */ -#define RANAL_NTX_NBLK 256 /* # reserved tx descs */ - -#define RANAL_FMA_CQ_SIZE 8192 /* # entries in receive CQ - * (overflow is a performance hit) */ - -#define RANAL_RESCHED 100 /* # scheduler loops before reschedule */ - -#define RANAL_MIN_TIMEOUT 5 /* minimum timeout interval (seconds) */ -#define RANAL_TIMEOUT2KEEPALIVE(t) (((t)+1)/2) /* timeout -> keepalive interval */ - -/* default vals for runtime tunables */ -#define RANAL_TIMEOUT 30 /* comms timeout (seconds) */ -#define RANAL_LISTENER_TIMEOUT 5 /* listener timeout (seconds) */ -#define RANAL_BACKLOG 127 /* listener's backlog */ -#define RANAL_PORT 988 /* listener's port */ -#define RANAL_MAX_IMMEDIATE (2<<10) /* immediate payload breakpoint */ typedef struct { - int kra_timeout; /* comms timeout (seconds) */ - int kra_listener_timeout; /* max time the listener can block */ - int kra_backlog; /* listener's backlog */ - int kra_port; /* listener's TCP/IP port */ - int kra_max_immediate; /* immediate payload breakpoint */ - - struct ctl_table_header *kra_sysctl; /* sysctl interface */ + int *kra_n_connd; /* # connection daemons */ + int *kra_min_reconnect_interval; /* first failed connection retry... */ + int *kra_max_reconnect_interval; /* ...exponentially increasing to this */ + int *kra_ntx; /* # tx descs */ + int *kra_credits; /* # concurrent sends */ + int *kra_peercredits; /* # concurrent sends to 1 peer */ + int *kra_fma_cq_size; /* # entries in receive CQ */ + int *kra_timeout; /* comms timeout (seconds) */ + int *kra_max_immediate; /* immediate payload breakpoint */ + +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM + cfs_sysctl_table_header_t *kra_sysctl; /* sysctl interface */ +#endif } kra_tunables_t; typedef struct @@ -110,11 +117,15 @@ typedef struct int rad_id; /* device id */ int rad_idx; /* index in kra_devices */ int rad_ready; /* set by device callback */ - struct list_head rad_connq; /* connections requiring attention */ - struct list_head rad_zombies; /* connections to free */ + struct list_head rad_ready_conns;/* connections ready to tx/rx */ + struct list_head rad_new_conns; /* new connections to complete */ wait_queue_head_t rad_waitq; /* scheduler waits here */ spinlock_t rad_lock; /* serialise */ void *rad_scheduler; /* scheduling thread */ + unsigned int rad_nphysmap; /* # phys mappings */ + unsigned int rad_nppphysmap; /* # phys pages mapped */ + unsigned int rad_nvirtmap; /* # virt mappings */ + unsigned long rad_nobvirtmap; /* # virt bytes mapped */ } kra_device_t; typedef struct @@ -122,12 +133,8 @@ typedef struct int kra_init; /* initialisation state */ int kra_shutdown; /* shut down? */ atomic_t kra_nthreads; /* # live threads */ - - struct semaphore kra_nid_mutex; /* serialise NID/listener ops */ - struct semaphore kra_listener_signal; /* block for listener startup/shutdown */ - struct socket *kra_listener_sock; /* listener's socket */ - int kra_listener_shutdown; /* ask listener to close */ - + lnet_ni_t *kra_ni; /* _the_ nal instance */ + kra_device_t kra_devices[RANAL_MAXDEVS]; /* device/ptag/cq etc */ int kra_ndevs; /* # devices */ @@ -136,6 +143,7 @@ typedef struct struct list_head *kra_peers; /* hash table of all my known peers */ int kra_peer_hash_size; /* size of kra_peers */ atomic_t kra_npeers; /* # peers extant */ + int kra_nonewpeers; /* prevent new peers */ struct list_head *kra_conns; /* conns hashed by cqid */ int kra_conn_hash_size; /* size of kra_conns */ @@ -154,16 +162,13 @@ typedef struct spinlock_t kra_connd_lock; /* serialise */ struct list_head kra_idle_txs; /* idle tx descriptors */ - struct list_head kra_idle_nblk_txs; /* idle reserved tx descriptors */ __u64 kra_next_tx_cookie; /* RDMA completion cookie */ - wait_queue_head_t kra_idle_tx_waitq; /* block here for tx descriptor */ spinlock_t kra_tx_lock; /* serialise */ } kra_data_t; #define RANAL_INIT_NOTHING 0 #define RANAL_INIT_DATA 1 -#define RANAL_INIT_LIB 2 -#define RANAL_INIT_ALL 3 +#define RANAL_INIT_ALL 2 typedef struct kra_acceptsock /* accepted socket queued for connd */ { @@ -198,13 +203,13 @@ typedef struct typedef struct { - ptl_hdr_t raim_hdr; /* portals header */ + lnet_hdr_t raim_hdr; /* portals header */ /* Portals payload is in FMA "Message Data" */ } kra_immediate_msg_t; typedef struct { - ptl_hdr_t raprm_hdr; /* portals header */ + lnet_hdr_t raprm_hdr; /* portals header */ __u64 raprm_cookie; /* opaque completion cookie */ } kra_putreq_msg_t; @@ -217,7 +222,7 @@ typedef struct typedef struct { - ptl_hdr_t ragm_hdr; /* portals header */ + lnet_hdr_t ragm_hdr; /* portals header */ __u64 ragm_cookie; /* opaque completion cookie */ kra_rdma_desc_t ragm_desc; /* sender's sink buffer */ } kra_get_msg_t; @@ -244,7 +249,7 @@ typedef struct /* NB must fit in FMA "Prefix" * __u32 ram_seq; /* incrementing sequence number */ } kra_msg_t; -#define RANAL_MSG_MAGIC 0x0be91b92 /* unique magic */ +#define RANAL_MSG_MAGIC LNET_PROTO_RA_MAGIC /* unique magic */ #define RANAL_MSG_VERSION 1 /* current protocol version */ #define RANAL_MSG_FENCE 0x80 /* fence RDMA */ @@ -267,9 +272,8 @@ typedef struct kra_tx /* message descriptor */ { struct list_head tx_list; /* queue on idle_txs/rac_sendq/rac_waitq */ struct kra_conn *tx_conn; /* owning conn */ - lib_msg_t *tx_libmsg[2]; /* lib msgs to finalize on completion */ + lnet_msg_t *tx_lntmsg[2]; /* ptl msgs to finalize on completion */ unsigned long tx_qtime; /* when tx started to wait for something (jiffies) */ - int tx_isnblk; /* I'm reserved for non-blocking sends */ int tx_nob; /* # bytes of payload */ int tx_buftype; /* payload buffer type */ void *tx_buffer; /* source/sink buffer */ @@ -289,27 +293,12 @@ typedef struct kra_tx /* message descriptor */ #define RANAL_BUF_VIRT_UNMAPPED 4 /* virtual: not mapped yet */ #define RANAL_BUF_VIRT_MAPPED 5 /* virtual: mapped already */ -#define RANAL_TX_IDLE 0x00 /* on freelist */ -#define RANAL_TX_SIMPLE 0x10 /* about to send a simple message */ -#define RANAL_TX_PUTI_REQ 0x20 /* PUT initiator about to send PUT_REQ */ -#define RANAL_TX_PUTI_WAIT_ACK 0x21 /* PUT initiator waiting for PUT_ACK */ -#define RANAL_TX_PUTI_RDMA 0x22 /* PUT initiator waiting for RDMA to complete */ -#define RANAL_TX_PUTI_DONE 0x23 /* PUT initiator about to send PUT_DONE */ -#define RANAL_TX_PUTT_NAK 0x30 /* PUT target about to send PUT_NAK */ -#define RANAL_TX_PUTT_ACK 0x30 /* PUT target about to send PUT_ACK */ -#define RANAL_TX_PUTT_WAIT_DONE 0x31 /* PUT target waiting for PUT_DONE */ -#define RANAL_TX_GETI_REQ 0x40 /* GET initiator about to send GET_REQ */ -#define RANAL_TX_GETI_WAIT_DONE 0x41 /* GET initiator waiting for GET_DONE */ -#define RANAL_TX_GETT_NAK 0x50 /* GET target about to send PUT_NAK */ -#define RANAL_TX_GETT_RDMA 0x51 /* GET target waiting for RDMA to complete */ -#define RANAL_TX_GETT_DONE 0x52 /* GET target about to send GET_DONE */ - typedef struct kra_conn { struct kra_peer *rac_peer; /* owning peer */ struct list_head rac_list; /* stash on peer's conn list */ struct list_head rac_hashlist; /* stash in connection hash table */ - struct list_head rac_schedlist; /* schedule (on rad_connq) for attention */ + struct list_head rac_schedlist; /* schedule (on rad_???_conns) for attention */ struct list_head rac_fmaq; /* txs queued for FMA */ struct list_head rac_rdmaq; /* txs awaiting RDMA completion */ struct list_head rac_replyq; /* txs awaiting replies */ @@ -345,7 +334,7 @@ typedef struct kra_peer struct list_head rap_connd_list; /* schedule on kra_connd_peers */ struct list_head rap_conns; /* all active connections */ struct list_head rap_tx_queue; /* msgs waiting for a conn */ - ptl_nid_t rap_nid; /* who's on the other end(s) */ + lnet_nid_t rap_nid; /* who's on the other end(s) */ __u32 rap_ip; /* IP address of peer */ int rap_port; /* port on which peer listens */ atomic_t rap_refcount; /* # users */ @@ -355,20 +344,6 @@ typedef struct kra_peer unsigned long rap_reconnect_interval; /* exponential backoff */ } kra_peer_t; -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) -# define sk_allocation allocation -# define sk_data_ready data_ready -# define sk_write_space write_space -# define sk_user_data user_data -# define sk_prot prot -# define sk_sndbuf sndbuf -# define sk_socket socket -# define sk_wmem_queued wmem_queued -# define sk_err err -# define sk_sleep sleep -#endif - -extern lib_nal_t kranal_lib; extern kra_data_t kranal_data; extern kra_tunables_t kranal_tunables; @@ -378,7 +353,7 @@ extern void kranal_destroy_conn(kra_conn_t *conn); static inline void kranal_peer_addref(kra_peer_t *peer) { - CDEBUG(D_NET, "%p->"LPX64"\n", peer, peer->rap_nid); + CDEBUG(D_NET, "%p->%s\n", peer, libcfs_nid2str(peer->rap_nid)); LASSERT(atomic_read(&peer->rap_refcount) > 0); atomic_inc(&peer->rap_refcount); } @@ -386,14 +361,14 @@ kranal_peer_addref(kra_peer_t *peer) static inline void kranal_peer_decref(kra_peer_t *peer) { - CDEBUG(D_NET, "%p->"LPX64"\n", peer, peer->rap_nid); + CDEBUG(D_NET, "%p->%s\n", peer, libcfs_nid2str(peer->rap_nid)); LASSERT(atomic_read(&peer->rap_refcount) > 0); if (atomic_dec_and_test(&peer->rap_refcount)) kranal_destroy_peer(peer); } static inline struct list_head * -kranal_nid2peerlist (ptl_nid_t nid) +kranal_nid2peerlist (lnet_nid_t nid) { unsigned int hash = ((unsigned int)nid) % kranal_data.kra_peer_hash_size; @@ -410,7 +385,8 @@ kranal_peer_active(kra_peer_t *peer) static inline void kranal_conn_addref(kra_conn_t *conn) { - CDEBUG(D_NET, "%p->"LPX64"\n", conn, conn->rac_peer->rap_nid); + CDEBUG(D_NET, "%p->%s\n", conn, + libcfs_nid2str(conn->rac_peer->rap_nid)); LASSERT(atomic_read(&conn->rac_refcount) > 0); atomic_inc(&conn->rac_refcount); } @@ -418,7 +394,8 @@ kranal_conn_addref(kra_conn_t *conn) static inline void kranal_conn_decref(kra_conn_t *conn) { - CDEBUG(D_NET, "%p->"LPX64"\n", conn, conn->rac_peer->rap_nid); + CDEBUG(D_NET, "%p->%s\n", conn, + libcfs_nid2str(conn->rac_peer->rap_nid)); LASSERT(atomic_read(&conn->rac_refcount) > 0); if (atomic_dec_and_test(&conn->rac_refcount)) kranal_destroy_conn(conn); @@ -456,31 +433,41 @@ kranal_tx_mapped (kra_tx_t *tx) tx->tx_buftype == RANAL_BUF_PHYS_MAPPED); } -static inline __u64 -kranal_page2phys (struct page *p) -{ - return page_to_phys(p); -} +int kranal_startup (lnet_ni_t *ni); +void kranal_shutdown (lnet_ni_t *ni); +int kranal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg); +int kranal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); +int kranal_eager_recv(lnet_ni_t *ni, void *private, + lnet_msg_t *lntmsg, void **new_private); +int kranal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, + int delayed, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); +int kranal_accept(lnet_ni_t *ni, struct socket *sock); extern void kranal_free_acceptsock (kra_acceptsock_t *ras); -extern int kranal_listener_procint (ctl_table *table, +extern int kranal_listener_procint (cfs_sysctl_table_t *table, int write, struct file *filp, void *buffer, size_t *lenp); extern void kranal_update_reaper_timeout (long timeout); extern void kranal_tx_done (kra_tx_t *tx, int completion); extern void kranal_unlink_peer_locked (kra_peer_t *peer); extern void kranal_schedule_conn (kra_conn_t *conn); -extern kra_peer_t *kranal_create_peer (ptl_nid_t nid); -extern kra_peer_t *kranal_find_peer_locked (ptl_nid_t nid); +extern int kranal_create_peer (kra_peer_t **peerp, lnet_nid_t nid); +extern int kranal_add_persistent_peer (lnet_nid_t nid, __u32 ip, int port); +extern kra_peer_t *kranal_find_peer_locked (lnet_nid_t nid); extern void kranal_post_fma (kra_conn_t *conn, kra_tx_t *tx); -extern int kranal_del_peer (ptl_nid_t nid, int single_share); +extern int kranal_del_peer (lnet_nid_t nid); extern void kranal_device_callback (RAP_INT32 devid, RAP_PVOID arg); extern int kranal_thread_start (int(*fn)(void *arg), void *arg); extern int kranal_connd (void *arg); extern int kranal_reaper (void *arg); extern int kranal_scheduler (void *arg); extern void kranal_close_conn_locked (kra_conn_t *conn, int error); +extern void kranal_close_conn (kra_conn_t *conn, int error); extern void kranal_terminate_conn_locked (kra_conn_t *conn); extern void kranal_connect (kra_peer_t *peer); extern int kranal_conn_handshake (struct socket *sock, kra_peer_t *peer); -extern void kranal_pause(int ticks); +extern int kranal_tunables_init(void); +extern void kranal_tunables_fini(void); +extern void kranal_init_msg(kra_msg_t *msg, int type);