X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lnet%2Fklnds%2Fo2iblnd%2Fo2iblnd.h;h=c7a4f8c7e9ce2c5b13332b5a8b45c770f91e4418;hp=9c0409b429c4d617d4374784bc25a54f505e13ec;hb=b43a6b1800265608cfa18159d4d0d006a1c23015;hpb=c4118a072e98909fb95199158ed1b1d66cf421ee diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h index 9c0409b..c7a4f8c 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.h +++ b/lnet/klnds/o2iblnd/o2iblnd.h @@ -1,6 +1,4 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * +/* * GPL HEADER START * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. @@ -28,6 +26,8 @@ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. + * + * Copyright (c) 2011, Whamcloud, Inc. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -38,9 +38,6 @@ * Author: Eric Barton */ -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif #ifndef AUTOCONF_INCLUDED #include #endif @@ -77,24 +74,17 @@ #include #include -#if !HAVE_GFP_T -typedef int gfp_t; -#endif - #include #include #include #include -/* tunables fixed at compile time */ -#ifdef CONFIG_SMP -# define IBLND_N_SCHED cfs_num_online_cpus() /* # schedulers */ -#else -# define IBLND_N_SCHED 1 /* # schedulers */ -#endif +#define IBLND_PEER_HASH_SIZE 101 /* # peer lists */ +/* # scheduler loops before reschedule */ +#define IBLND_RESCHED 100 -#define IBLND_PEER_HASH_SIZE 101 /* # peer lists */ -#define IBLND_RESCHED 100 /* # scheduler loops before reschedule */ +#define IBLND_N_SCHED 2 +#define IBLND_N_SCHED_HIGH 4 typedef struct { @@ -127,6 +117,8 @@ typedef struct #endif int *kib_require_priv_port;/* accept only privileged ports */ int *kib_use_priv_port; /* use privileged port for active connect */ + /* # threads on each CPT */ + int *kib_nscheds; } kib_tunables_t; extern kib_tunables_t kiblnd_tunables; @@ -179,6 +171,12 @@ kiblnd_concurrent_sends_v1(void) /************************/ /* derived constants... */ +/* Pools (shared by connections on each CPT) */ +/* These pools can grow at runtime, so don't need give a very large value */ +#define IBLND_TX_POOL 256 +#define IBLND_PMR_POOL 256 +#define IBLND_FMR_POOL 256 +#define IBLND_FMR_POOL_FLUSH 192 /* TX messages (shared by all connections) */ #define IBLND_TX_MSGS() (*kiblnd_tunables.kib_ntx) @@ -195,12 +193,20 @@ kiblnd_concurrent_sends_v1(void) struct kib_hca_dev; +/* o2iblnd can run over aliased interface */ +#ifdef IFALIASZ +#define KIB_IFNAME_SIZE IFALIASZ +#else +#define KIB_IFNAME_SIZE 256 +#endif + typedef struct { cfs_list_t ibd_list; /* chain on kib_devs */ cfs_list_t ibd_fail_list; /* chain on kib_failed_devs */ __u32 ibd_ifip; /* IPoIB interface IP */ - char ibd_ifname[32]; /* IPoIB interface name */ + /** IPoIB interface name */ + char ibd_ifname[KIB_IFNAME_SIZE]; int ibd_nnets; /* # nets extant */ cfs_time_t ibd_next_failover; @@ -227,7 +233,10 @@ typedef struct kib_hca_dev cfs_atomic_t ibh_ref; /* refcount */ } kib_hca_dev_t; -#define IBLND_POOL_DEADLINE 300 /* # of seconds to keep pool alive */ +/** # of seconds to keep pool alive */ +#define IBLND_POOL_DEADLINE 300 +/** # of seconds to retry if allocation failed */ +#define IBLND_POOL_RETRY 1 typedef struct { @@ -249,12 +258,11 @@ typedef struct { struct kib_pool; struct kib_poolset; -typedef int (*kib_ps_pool_create_t)(struct kib_poolset *ps, int inc, struct kib_pool **pp_po); +typedef int (*kib_ps_pool_create_t)(struct kib_poolset *ps, + int inc, struct kib_pool **pp_po); typedef void (*kib_ps_pool_destroy_t)(struct kib_pool *po); -typedef void (*kib_ps_node_init_t)(struct kib_pool *po, - cfs_list_t *node); -typedef void (*kib_ps_node_fini_t)(struct kib_pool *po, - cfs_list_t *node); +typedef void (*kib_ps_node_init_t)(struct kib_pool *po, cfs_list_t *node); +typedef void (*kib_ps_node_fini_t)(struct kib_pool *po, cfs_list_t *node); struct kib_net; @@ -270,6 +278,7 @@ typedef struct kib_poolset cfs_time_t ps_next_retry; /* time stamp for retry if failed to allocate */ int ps_increasing; /* is allocating new pool */ int ps_pool_size; /* new pool size */ + int ps_cpt; /* CPT id */ kib_ps_pool_create_t ps_pool_create; /* create a new pool */ kib_ps_pool_destroy_t ps_pool_destroy; /* destroy a pool */ @@ -316,8 +325,13 @@ typedef struct cfs_list_t fps_pool_list; /* FMR pool list */ cfs_list_t fps_failed_pool_list; /* FMR pool list */ __u64 fps_version; /* validity stamp */ - int fps_increasing; /* is allocating new pool */ - cfs_time_t fps_next_retry; /* time stamp for retry if failed to allocate */ + int fps_cpt; /* CPT id */ + int fps_pool_size; + int fps_flush_trigger; + /* is allocating new pool */ + int fps_increasing; + /* time stamp for retry if failed to allocate */ + cfs_time_t fps_next_retry; } kib_fmr_poolset_t; typedef struct @@ -342,43 +356,64 @@ typedef struct kib_net __u64 ibn_incarnation; /* my epoch */ int ibn_init; /* initialisation state */ int ibn_shutdown; /* shutting down? */ - unsigned int ibn_with_fmr:1; /* FMR? */ - unsigned int ibn_with_pmr:1; /* PMR? */ - cfs_atomic_t ibn_npeers; /* # peers extant */ - cfs_atomic_t ibn_nconns; /* # connections extant */ + cfs_atomic_t ibn_npeers; /* # peers extant */ + cfs_atomic_t ibn_nconns; /* # connections extant */ - kib_tx_poolset_t ibn_tx_ps; /* tx pool-set */ - kib_fmr_poolset_t ibn_fmr_ps; /* fmr pool-set */ - kib_pmr_poolset_t ibn_pmr_ps; /* pmr pool-set */ + kib_tx_poolset_t **ibn_tx_ps; /* tx pool-set */ + kib_fmr_poolset_t **ibn_fmr_ps; /* fmr pool-set */ + kib_pmr_poolset_t **ibn_pmr_ps; /* pmr pool-set */ - kib_dev_t *ibn_dev; /* underlying IB device */ + kib_dev_t *ibn_dev; /* underlying IB device */ } kib_net_t; +#define KIB_THREAD_SHIFT 16 +#define KIB_THREAD_ID(cpt, tid) ((cpt) << KIB_THREAD_SHIFT | (tid)) +#define KIB_THREAD_CPT(id) ((id) >> KIB_THREAD_SHIFT) +#define KIB_THREAD_TID(id) ((id) & ((1UL << KIB_THREAD_SHIFT) - 1)) + +struct kib_sched_info { + /* serialise */ + cfs_spinlock_t ibs_lock; + /* schedulers sleep here */ + cfs_waitq_t ibs_waitq; + /* conns to check for rx completions */ + cfs_list_t ibs_conns; + /* number of scheduler threads */ + int ibs_nthreads; + /* max allowed scheduler threads */ + int ibs_nthreads_max; + int ibs_cpt; /* CPT id */ +}; + typedef struct { - int kib_init; /* initialisation state */ - int kib_shutdown; /* shut down? */ - cfs_list_t kib_devs; /* IB devices extant */ - cfs_list_t kib_failed_devs; /* list head of failed devices */ - cfs_atomic_t kib_nthreads; /* # live threads */ - cfs_rwlock_t kib_global_lock; /* stabilize net/dev/peer/conn ops */ - - cfs_list_t *kib_peers; /* hash table of all my known peers */ - int kib_peer_hash_size;/* size of kib_peers */ - - void *kib_connd; /* the connd task (serialisation assertions) */ - cfs_list_t kib_connd_conns; /* connections to setup/teardown */ - cfs_list_t kib_connd_zombies;/* connections with zero refcount */ - cfs_waitq_t kib_connd_waitq; /* connection daemon sleeps here */ - cfs_spinlock_t kib_connd_lock; /* serialise */ - - cfs_waitq_t kib_sched_waitq; /* schedulers sleep here */ - cfs_list_t kib_sched_conns; /* conns to check for rx completions */ - cfs_spinlock_t kib_sched_lock; /* serialise */ - cfs_waitq_t kib_failover_waitq; /* schedulers sleep here */ - - struct ib_qp_attr kib_error_qpa; /* QP->ERROR */ + int kib_init; /* initialisation state */ + int kib_shutdown; /* shut down? */ + cfs_list_t kib_devs; /* IB devices extant */ + /* list head of failed devices */ + cfs_list_t kib_failed_devs; + /* schedulers sleep here */ + cfs_waitq_t kib_failover_waitq; + cfs_atomic_t kib_nthreads; /* # live threads */ + /* stabilize net/dev/peer/conn ops */ + cfs_rwlock_t kib_global_lock; + /* hash table of all my known peers */ + cfs_list_t *kib_peers; + /* size of kib_peers */ + int kib_peer_hash_size; + /* the connd task (serialisation assertions) */ + void *kib_connd; + /* connections to setup/teardown */ + cfs_list_t kib_connd_conns; + /* connections with zero refcount */ + cfs_list_t kib_connd_zombies; + /* connection daemon sleeps here */ + cfs_waitq_t kib_connd_waitq; + cfs_spinlock_t kib_connd_lock; /* serialise */ + struct ib_qp_attr kib_error_qpa; /* QP->ERROR */ + /* percpt data for schedulers */ + struct kib_sched_info **kib_scheds; } kib_data_t; #define IBLND_INIT_NOTHING 0 @@ -561,8 +596,9 @@ typedef struct kib_connvars typedef struct kib_conn { + struct kib_sched_info *ibc_sched; /* scheduler information */ struct kib_peer *ibc_peer; /* owning peer */ - kib_hca_dev_t *ibc_hdev; /* HCA bound on */ + kib_hca_dev_t *ibc_hdev; /* HCA bound on */ cfs_list_t ibc_list; /* stash on peer's conn list */ cfs_list_t ibc_sched_list; /* schedule for attention */ __u16 ibc_version; /* version of connection */ @@ -578,9 +614,14 @@ typedef struct kib_conn int ibc_nrx:16; /* receive buffers owned */ int ibc_scheduled:1; /* scheduled for attention */ int ibc_ready:1; /* CQ callback fired */ - unsigned long ibc_last_send; /* time of last send */ - cfs_list_t ibc_early_rxs; /* rxs completed before ESTABLISHED */ - cfs_list_t ibc_tx_noops; /* IBLND_MSG_NOOPs for IBLND_MSG_VERSION_1 */ + /* time of last send */ + unsigned long ibc_last_send; + /** link chain for kiblnd_check_conns only */ + cfs_list_t ibc_connd_list; + /** rxs completed before ESTABLISHED */ + cfs_list_t ibc_early_rxs; + /** IBLND_MSG_NOOPs for IBLND_MSG_VERSION_1 */ + cfs_list_t ibc_tx_noops; cfs_list_t ibc_tx_queue; /* sends that need a credit */ cfs_list_t ibc_tx_queue_nocred;/* sends that don't need a credit */ cfs_list_t ibc_tx_queue_rsrvd; /* sends that need to reserve an ACK/DONE msg */ @@ -595,7 +636,7 @@ typedef struct kib_conn kib_connvars_t *ibc_connvars; /* in-progress connection state */ } kib_conn_t; -#define IBLND_CONN_INIT 0 /* being intialised */ +#define IBLND_CONN_INIT 0 /* being initialised */ #define IBLND_CONN_ACTIVE_CONNECT 1 /* active sending req */ #define IBLND_CONN_PASSIVE_WAIT 2 /* passive waiting for rtu */ #define IBLND_CONN_ESTABLISHED 3 /* connection established */ @@ -727,7 +768,7 @@ kiblnd_send_keepalive(kib_conn_t *conn) } static inline int -kiblnd_send_noop(kib_conn_t *conn) +kiblnd_need_noop(kib_conn_t *conn) { LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED); @@ -741,11 +782,12 @@ kiblnd_send_noop(kib_conn_t *conn) return 0; /* NOOP can be piggybacked */ /* No tx to piggyback NOOP onto or no credit to send a tx */ - return (cfs_list_empty(&conn->ibc_tx_queue) || conn->ibc_credits == 0); + return (cfs_list_empty(&conn->ibc_tx_queue) || + conn->ibc_credits == 0); } - if (!cfs_list_empty(&conn->ibc_tx_noops) || /* NOOP already queued */ - !cfs_list_empty(&conn->ibc_tx_queue_nocred) || /* can be piggybacked */ + if (!cfs_list_empty(&conn->ibc_tx_noops) || /* NOOP already queued */ + !cfs_list_empty(&conn->ibc_tx_queue_nocred) || /* piggyback NOOP */ conn->ibc_credits == 0) /* no credit */ return 0; @@ -1030,7 +1072,7 @@ int kiblnd_scheduler(void *arg); int kiblnd_thread_start (int (*fn)(void *arg), void *arg); int kiblnd_failover_thread (void *arg); -int kiblnd_alloc_pages (kib_pages_t **pp, int npages); +int kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages); void kiblnd_free_pages (kib_pages_t *p); int kiblnd_cm_callback(struct rdma_cm_id *cmid,