Whamcloud - gitweb
LU-1346 gnilnd: remove libcfs abstractions
[fs/lustre-release.git] / lnet / klnds / o2iblnd / o2iblnd.h
index aa16854..661756c 100644 (file)
@@ -1,6 +1,4 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
+/*
  * GPL HEADER START
  *
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
@@ -28,6 +26,8 @@
 /*
  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2013, Intel Corporation.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
  * Author: Eric Barton <eric@bartonsoftware.com>
  */
 
-#ifndef EXPORT_SYMTAB
-# define EXPORT_SYMTAB
-#endif
-#ifndef AUTOCONF_INCLUDED
-#include <linux/config.h>
-#endif
+#include <linux/version.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/string.h>
 #include <linux/stat.h>
 #include <linux/errno.h>
-#include <linux/smp_lock.h>
 #include <linux/unistd.h>
 #include <linux/uio.h>
 
-#include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
@@ -66,6 +59,9 @@
 #include <linux/kmod.h>
 #include <linux/sysctl.h>
 #include <linux/pci.h>
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,32)
+#include <linux/pci-dma.h>
+#endif
 
 #include <net/sock.h>
 #include <linux/in.h>
 #include <lnet/lib-lnet.h>
 #include <lnet/lnet-sysctl.h>
 
-#if !HAVE_GFP_T
-typedef int gfp_t;
+#ifdef HAVE_COMPAT_RDMA
+#include <linux/compat-2.6.h>
 #endif
-
 #include <rdma/rdma_cm.h>
 #include <rdma/ib_cm.h>
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_fmr_pool.h>
 
-/* tunables fixed at compile time */
-#ifdef CONFIG_SMP
-# define IBLND_N_SCHED      cfs_num_online_cpus()   /* # schedulers */
-#else
-# define IBLND_N_SCHED      1                   /* # schedulers */
-#endif
+#define IBLND_PEER_HASH_SIZE           101     /* # peer lists */
+/* # scheduler loops before reschedule */
+#define IBLND_RESCHED                  100
 
-#define IBLND_PEER_HASH_SIZE         101        /* # peer lists */
-#define IBLND_RESCHED                100        /* # scheduler loops before reschedule */
+#define IBLND_N_SCHED                  2
+#define IBLND_N_SCHED_HIGH             4
 
 typedef struct
 {
-        int              *kib_dev_failover;     /* HCA failover */
-        unsigned int     *kib_service;          /* IB service number */
-        int              *kib_min_reconnect_interval; /* first failed connection retry... */
-        int              *kib_max_reconnect_interval; /* ...exponentially increasing to this */
-        int              *kib_cksum;            /* checksum kib_msg_t? */
-        int              *kib_timeout;          /* comms timeout (seconds) */
-        int              *kib_keepalive;        /* keepalive timeout (seconds) */
-        int              *kib_ntx;              /* # tx descs */
-        int              *kib_credits;          /* # concurrent sends */
-        int              *kib_peertxcredits;    /* # concurrent sends to 1 peer */
-        int              *kib_peerrtrcredits;   /* # per-peer router buffer credits */
-        int              *kib_peercredits_hiw;  /* # when eagerly to return credits */
-        int              *kib_peertimeout;      /* seconds to consider peer dead */
-        char            **kib_default_ipif;     /* default IPoIB interface */
-        int              *kib_retry_count;
-        int              *kib_rnr_retry_count;
-        int              *kib_concurrent_sends; /* send work queue sizing */
-        int             *kib_ib_mtu;           /* IB MTU */
-        int              *kib_map_on_demand;    /* map-on-demand if RD has more fragments
-                                                 * than this value, 0 disable map-on-demand */
-        int              *kib_pmr_pool_size;    /* # physical MR in pool */
-        int              *kib_fmr_pool_size;    /* # FMRs in pool */
-        int              *kib_fmr_flush_trigger; /* When to trigger FMR flush */
-        int              *kib_fmr_cache;        /* enable FMR pool cache? */
+       int              *kib_dev_failover;     /* HCA failover */
+       unsigned int     *kib_service;          /* IB service number */
+       int              *kib_min_reconnect_interval; /* first failed connection retry... */
+       int              *kib_max_reconnect_interval; /* ...exponentially increasing to this */
+       int              *kib_cksum;            /* checksum kib_msg_t? */
+       int              *kib_timeout;          /* comms timeout (seconds) */
+       int              *kib_keepalive;        /* keepalive timeout (seconds) */
+       int              *kib_ntx;              /* # tx descs */
+       int              *kib_credits;          /* # concurrent sends */
+       int              *kib_peertxcredits;    /* # concurrent sends to 1 peer */
+       int              *kib_peerrtrcredits;   /* # per-peer router buffer credits */
+       int              *kib_peercredits_hiw;  /* # when eagerly to return credits */
+       int              *kib_peertimeout;      /* seconds to consider peer dead */
+       char            **kib_default_ipif;     /* default IPoIB interface */
+       int              *kib_retry_count;
+       int              *kib_rnr_retry_count;
+       int              *kib_concurrent_sends; /* send work queue sizing */
+       int              *kib_ib_mtu;           /* IB MTU */
+       int              *kib_map_on_demand;    /* map-on-demand if RD has more fragments
+                                                * than this value; 0 disables map-on-demand */
+       int              *kib_pmr_pool_size;    /* # physical MR in pool */
+       int              *kib_fmr_pool_size;    /* # FMRs in pool */
+       int              *kib_fmr_flush_trigger; /* When to trigger FMR flush */
+       int              *kib_fmr_cache;        /* enable FMR pool cache? */
 #if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
-        cfs_sysctl_table_header_t *kib_sysctl;  /* sysctl interface */
+       struct ctl_table_header *kib_sysctl;  /* sysctl interface */
 #endif
-        int              *kib_require_priv_port;/* accept only privileged ports */
-        int              *kib_use_priv_port;    /* use privileged port for active connect */
+       int              *kib_require_priv_port;/* accept only privileged ports */
+       int              *kib_use_priv_port;    /* use privileged port for active connect */
+       /* # threads on each CPT */
+       int              *kib_nscheds;
 } kib_tunables_t;
 
 extern kib_tunables_t  kiblnd_tunables;
@@ -179,6 +173,12 @@ kiblnd_concurrent_sends_v1(void)
 
 /************************/
 /* derived constants... */
+/* Pools (shared by connections on each CPT) */
+/* These pools can grow at runtime, so they need not start out very large */
+#define IBLND_TX_POOL                  256
+#define IBLND_PMR_POOL                 256
+#define IBLND_FMR_POOL                 256
+#define IBLND_FMR_POOL_FLUSH           192
 
 /* TX messages (shared by all connections) */
 #define IBLND_TX_MSGS()            (*kiblnd_tunables.kib_ntx)
@@ -195,12 +195,20 @@ kiblnd_concurrent_sends_v1(void)
 
 struct kib_hca_dev;
 
+/* o2iblnd can run over an aliased interface */
+#ifdef IFALIASZ
+#define KIB_IFNAME_SIZE              IFALIASZ
+#else
+#define KIB_IFNAME_SIZE              256
+#endif
+
 typedef struct
 {
         cfs_list_t           ibd_list;          /* chain on kib_devs */
         cfs_list_t           ibd_fail_list;     /* chain on kib_failed_devs */
         __u32                ibd_ifip;          /* IPoIB interface IP */
-        char                 ibd_ifname[32];    /* IPoIB interface name */
+        /** IPoIB interface name */
+        char                 ibd_ifname[KIB_IFNAME_SIZE];
         int                  ibd_nnets;         /* # nets extant */
 
         cfs_time_t           ibd_next_failover;
@@ -252,12 +260,11 @@ typedef struct {
 struct kib_pool;
 struct kib_poolset;
 
-typedef int  (*kib_ps_pool_create_t)(struct kib_poolset *ps, int inc, struct kib_pool **pp_po);
+typedef int  (*kib_ps_pool_create_t)(struct kib_poolset *ps,
+                                    int inc, struct kib_pool **pp_po);
 typedef void (*kib_ps_pool_destroy_t)(struct kib_pool *po);
-typedef void (*kib_ps_node_init_t)(struct kib_pool *po,
-                                   cfs_list_t *node);
-typedef void (*kib_ps_node_fini_t)(struct kib_pool *po,
-                                   cfs_list_t *node);
+typedef void (*kib_ps_node_init_t)(struct kib_pool *po, cfs_list_t *node);
+typedef void (*kib_ps_node_fini_t)(struct kib_pool *po, cfs_list_t *node);
 
 struct kib_net;
 
@@ -265,7 +272,7 @@ struct kib_net;
 
 typedef struct kib_poolset
 {
-        cfs_spinlock_t          ps_lock;                /* serialize */
+       spinlock_t              ps_lock;                /* serialize */
         struct kib_net         *ps_net;                 /* network it belongs to */
         char                    ps_name[IBLND_POOL_NAME_LEN]; /* pool set name */
         cfs_list_t              ps_pool_list;           /* list of pools */
@@ -273,6 +280,7 @@ typedef struct kib_poolset
         cfs_time_t              ps_next_retry;          /* time stamp for retry if failed to allocate */
         int                     ps_increasing;          /* is allocating new pool */
         int                     ps_pool_size;           /* new pool size */
+       int                     ps_cpt;                 /* CPT id */
 
         kib_ps_pool_create_t    ps_pool_create;         /* create a new pool */
         kib_ps_pool_destroy_t   ps_pool_destroy;        /* destroy a pool */
@@ -314,13 +322,18 @@ typedef struct kib_pmr_pool {
 
 typedef struct
 {
-        cfs_spinlock_t          fps_lock;               /* serialize */
+       spinlock_t              fps_lock;               /* serialize */
         struct kib_net         *fps_net;                /* IB network */
         cfs_list_t              fps_pool_list;          /* FMR pool list */
         cfs_list_t              fps_failed_pool_list;   /* FMR pool list */
         __u64                   fps_version;            /* validity stamp */
-        int                     fps_increasing;         /* is allocating new pool */
-        cfs_time_t              fps_next_retry;         /* time stamp for retry if failed to allocate */
+       int                     fps_cpt;                /* CPT id */
+       int                     fps_pool_size;
+       int                     fps_flush_trigger;
+       /* is allocating new pool */
+       int                     fps_increasing;
+       /* time stamp for retry if failed to allocate */
+       cfs_time_t              fps_next_retry;
 } kib_fmr_poolset_t;
 
 typedef struct
@@ -345,43 +358,64 @@ typedef struct kib_net
         __u64                ibn_incarnation;   /* my epoch */
         int                  ibn_init;          /* initialisation state */
         int                  ibn_shutdown;      /* shutting down? */
-        unsigned int         ibn_with_fmr:1;    /* FMR? */
-        unsigned int         ibn_with_pmr:1;    /* PMR? */
 
-        cfs_atomic_t         ibn_npeers;        /* # peers extant */
-        cfs_atomic_t         ibn_nconns;        /* # connections extant */
+       cfs_atomic_t            ibn_npeers;     /* # peers extant */
+       cfs_atomic_t            ibn_nconns;     /* # connections extant */
 
-        kib_tx_poolset_t     ibn_tx_ps;         /* tx pool-set */
-        kib_fmr_poolset_t    ibn_fmr_ps;        /* fmr pool-set */
-        kib_pmr_poolset_t    ibn_pmr_ps;        /* pmr pool-set */
+       kib_tx_poolset_t        **ibn_tx_ps;    /* tx pool-set */
+       kib_fmr_poolset_t       **ibn_fmr_ps;   /* fmr pool-set */
+       kib_pmr_poolset_t       **ibn_pmr_ps;   /* pmr pool-set */
 
-        kib_dev_t           *ibn_dev;           /* underlying IB device */
+       kib_dev_t               *ibn_dev;       /* underlying IB device */
 } kib_net_t;
 
+#define KIB_THREAD_SHIFT               16
+#define KIB_THREAD_ID(cpt, tid)                ((cpt) << KIB_THREAD_SHIFT | (tid))
+#define KIB_THREAD_CPT(id)             ((id) >> KIB_THREAD_SHIFT)
+#define KIB_THREAD_TID(id)             ((id) & ((1UL << KIB_THREAD_SHIFT) - 1))
+
+struct kib_sched_info {
+       /* serialise */
+       spinlock_t              ibs_lock;
+       /* schedulers sleep here */
+       wait_queue_head_t               ibs_waitq;
+       /* conns to check for rx completions */
+       cfs_list_t              ibs_conns;
+       /* number of scheduler threads */
+       int                     ibs_nthreads;
+       /* max allowed scheduler threads */
+       int                     ibs_nthreads_max;
+       int                     ibs_cpt;        /* CPT id */
+};
+
 typedef struct
 {
-        int               kib_init;        /* initialisation state */
-        int               kib_shutdown;    /* shut down? */
-        cfs_list_t        kib_devs;        /* IB devices extant */
-        cfs_list_t           kib_failed_devs;   /* list head of failed devices */
-        cfs_atomic_t      kib_nthreads;    /* # live threads */
-        cfs_rwlock_t      kib_global_lock; /* stabilize net/dev/peer/conn ops */
-
-        cfs_list_t       *kib_peers;  /* hash table of all my known peers */
-        int               kib_peer_hash_size;/* size of kib_peers */
-
-        void             *kib_connd;       /* the connd task (serialisation assertions) */
-        cfs_list_t        kib_connd_conns; /* connections to setup/teardown */
-        cfs_list_t        kib_connd_zombies;/* connections with zero refcount */
-        cfs_waitq_t       kib_connd_waitq; /* connection daemon sleeps here */
-        cfs_spinlock_t    kib_connd_lock;  /* serialise */
-
-        cfs_waitq_t       kib_sched_waitq; /* schedulers sleep here */
-        cfs_list_t        kib_sched_conns; /* conns to check for rx completions */
-        cfs_spinlock_t    kib_sched_lock;  /* serialise */
-        cfs_waitq_t          kib_failover_waitq; /* schedulers sleep here */
-
-        struct ib_qp_attr kib_error_qpa;   /* QP->ERROR */
+       int                     kib_init;       /* initialisation state */
+       int                     kib_shutdown;   /* shut down? */
+       cfs_list_t              kib_devs;       /* IB devices extant */
+       /* list head of failed devices */
+       cfs_list_t              kib_failed_devs;
+       /* failover thread sleeps here */
+       wait_queue_head_t               kib_failover_waitq;
+       cfs_atomic_t            kib_nthreads;   /* # live threads */
+       /* stabilize net/dev/peer/conn ops */
+       rwlock_t                kib_global_lock;
+       /* hash table of all my known peers */
+       cfs_list_t              *kib_peers;
+       /* size of kib_peers */
+       int                     kib_peer_hash_size;
+       /* the connd task (serialisation assertions) */
+       void                    *kib_connd;
+       /* connections to setup/teardown */
+       cfs_list_t              kib_connd_conns;
+       /* connections with zero refcount */
+       cfs_list_t              kib_connd_zombies;
+       /* connection daemon sleeps here */
+       wait_queue_head_t               kib_connd_waitq;
+       spinlock_t              kib_connd_lock; /* serialise */
+       struct ib_qp_attr       kib_error_qpa;  /* QP->ERROR */
+       /* percpt data for schedulers */
+       struct kib_sched_info   **kib_scheds;
 } kib_data_t;
 
 #define IBLND_INIT_NOTHING         0
@@ -564,8 +598,9 @@ typedef struct kib_connvars
 
 typedef struct kib_conn
 {
+       struct kib_sched_info *ibc_sched;       /* scheduler information */
         struct kib_peer     *ibc_peer;          /* owning peer */
-        kib_hca_dev_t      *ibc_hdev;           /* HCA bound on */
+        kib_hca_dev_t       *ibc_hdev;          /* HCA bound on */
         cfs_list_t           ibc_list;          /* stash on peer's conn list */
         cfs_list_t           ibc_sched_list;    /* schedule for attention */
         __u16                ibc_version;       /* version of connection */
@@ -578,17 +613,22 @@ typedef struct kib_conn
         int                  ibc_outstanding_credits; /* # credits to return */
         int                  ibc_reserved_credits;/* # ACK/DONE msg credits */
         int                  ibc_comms_error;   /* set on comms error */
-        int                  ibc_nrx:16;        /* receive buffers owned */
-        int                  ibc_scheduled:1;   /* scheduled for attention */
-        int                  ibc_ready:1;       /* CQ callback fired */
-        unsigned long        ibc_last_send;     /* time of last send */
-        cfs_list_t           ibc_early_rxs;     /* rxs completed before ESTABLISHED */
-        cfs_list_t          ibc_tx_noops;       /* IBLND_MSG_NOOPs for IBLND_MSG_VERSION_1 */
+       unsigned int         ibc_nrx:16;        /* receive buffers owned */
+       unsigned int         ibc_scheduled:1;   /* scheduled for attention */
+       unsigned int         ibc_ready:1;       /* CQ callback fired */
+        /* time of last send */
+        unsigned long        ibc_last_send;
+        /** link chain for kiblnd_check_conns only */
+        cfs_list_t           ibc_connd_list;
+        /** rxs completed before ESTABLISHED */
+        cfs_list_t           ibc_early_rxs;
+        /** IBLND_MSG_NOOPs for IBLND_MSG_VERSION_1 */
+        cfs_list_t           ibc_tx_noops;
         cfs_list_t           ibc_tx_queue;       /* sends that need a credit */
         cfs_list_t           ibc_tx_queue_nocred;/* sends that don't need a credit */
         cfs_list_t           ibc_tx_queue_rsrvd; /* sends that need to reserve an ACK/DONE msg */
         cfs_list_t           ibc_active_txs;     /* active tx awaiting completion */
-        cfs_spinlock_t       ibc_lock;           /* serialise */
+       spinlock_t           ibc_lock;           /* serialise */
         kib_rx_t            *ibc_rxs;            /* the rx descs */
         kib_pages_t         *ibc_rx_pages;       /* premapped rx msg pages */
 
@@ -598,7 +638,7 @@ typedef struct kib_conn
         kib_connvars_t      *ibc_connvars;       /* in-progress connection state */
 } kib_conn_t;
 
-#define IBLND_CONN_INIT               0         /* being intialised */
+#define IBLND_CONN_INIT               0         /* being initialised */
 #define IBLND_CONN_ACTIVE_CONNECT     1         /* active sending req */
 #define IBLND_CONN_PASSIVE_WAIT       2         /* passive waiting for rtu */
 #define IBLND_CONN_ESTABLISHED        3         /* connection established */
@@ -662,20 +702,20 @@ do {                                                            \
         cfs_atomic_inc(&(conn)->ibc_refcount);                  \
 } while (0)
 
-#define kiblnd_conn_decref(conn)                                               \
-do {                                                                           \
-        unsigned long   flags;                                                 \
-                                                                               \
-        CDEBUG(D_NET, "conn[%p] (%d)--\n",                                     \
-               (conn), cfs_atomic_read(&(conn)->ibc_refcount));                \
-        LASSERT_ATOMIC_POS(&(conn)->ibc_refcount);                             \
-        if (cfs_atomic_dec_and_test(&(conn)->ibc_refcount)) {                  \
-                cfs_spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);     \
-                cfs_list_add_tail(&(conn)->ibc_list,                           \
-                                  &kiblnd_data.kib_connd_zombies);             \
-                cfs_waitq_signal(&kiblnd_data.kib_connd_waitq);                \
-                cfs_spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);\
-        }                                                                      \
+#define kiblnd_conn_decref(conn)                                       \
+do {                                                                   \
+       unsigned long flags;                                            \
+                                                                       \
+       CDEBUG(D_NET, "conn[%p] (%d)--\n",                              \
+              (conn), cfs_atomic_read(&(conn)->ibc_refcount));         \
+       LASSERT_ATOMIC_POS(&(conn)->ibc_refcount);                      \
+       if (cfs_atomic_dec_and_test(&(conn)->ibc_refcount)) {           \
+               spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);  \
+               cfs_list_add_tail(&(conn)->ibc_list,                    \
+                                 &kiblnd_data.kib_connd_zombies);      \
+               wake_up(&kiblnd_data.kib_connd_waitq);          \
+               spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);\
+       }                                                               \
 } while (0)
 
 #define kiblnd_peer_addref(peer)                                \
@@ -724,13 +764,13 @@ kiblnd_get_conn_locked (kib_peer_t *peer)
 static inline int
 kiblnd_send_keepalive(kib_conn_t *conn)
 {
-        return (*kiblnd_tunables.kib_keepalive > 0) &&
-                cfs_time_after(jiffies, conn->ibc_last_send +
-                               *kiblnd_tunables.kib_keepalive*CFS_HZ);
+       return (*kiblnd_tunables.kib_keepalive > 0) &&
+               cfs_time_after(jiffies, conn->ibc_last_send +
+                              *kiblnd_tunables.kib_keepalive*HZ);
 }
 
 static inline int
-kiblnd_send_noop(kib_conn_t *conn)
+kiblnd_need_noop(kib_conn_t *conn)
 {
         LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED);
 
@@ -744,11 +784,12 @@ kiblnd_send_noop(kib_conn_t *conn)
                         return 0; /* NOOP can be piggybacked */
 
                 /* No tx to piggyback NOOP onto or no credit to send a tx */
-                return (cfs_list_empty(&conn->ibc_tx_queue) || conn->ibc_credits == 0);
+                return (cfs_list_empty(&conn->ibc_tx_queue) ||
+                        conn->ibc_credits == 0);
         }
 
-        if (!cfs_list_empty(&conn->ibc_tx_noops) ||       /* NOOP already queued */
-            !cfs_list_empty(&conn->ibc_tx_queue_nocred) || /* can be piggybacked */
+        if (!cfs_list_empty(&conn->ibc_tx_noops) || /* NOOP already queued */
+            !cfs_list_empty(&conn->ibc_tx_queue_nocred) || /* piggyback NOOP */
             conn->ibc_credits == 0)                    /* no credit */
                 return 0;
 
@@ -1030,10 +1071,10 @@ void kiblnd_tunables_fini(void);
 
 int  kiblnd_connd (void *arg);
 int  kiblnd_scheduler(void *arg);
-int  kiblnd_thread_start (int (*fn)(void *arg), void *arg);
+int  kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name);
 int  kiblnd_failover_thread (void *arg);
 
-int  kiblnd_alloc_pages (kib_pages_t **pp, int npages);
+int  kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages);
 void kiblnd_free_pages (kib_pages_t *p);
 
 int  kiblnd_cm_callback(struct rdma_cm_id *cmid,