LUDOC-270 protocol: Update the outline and add support files

[doc/protocol.git] / connection.txt
diff --git a/connection.txt b/connection.txt

new file mode 100644 (file)

index 0000000..0a7d1c2
--- /dev/null
+++ b/connection.txt
@@ -0,0 +1,838 @@
+Connections Between Lustre Entities
+-----------------------------------
+[[connection]]
+
+The Lustre protocol is connection-based in that each two entities
+maintain shared, coordinated state information. The most common
+example of two such entities is a client and a target on some
+server. The target is identified by name to the client through an
+interaction with the management server. The client then 'connects' to
+the given target on the indicated server by sending the appropriate
+version of the *_CONNECT message (MGS_CONNECT, MDS_CONNECT, or
+OST_CONNECT - collectively *_CONNECT) and receiving back the
+corresponding *_CONNECT reply. The server creates an 'export' for the
+connection between the target and the client, and the export holds the
+server state information for that connection. When the client gets the
+reply it creates an 'import', and the import holds the client state
+information for that connection. Note that if a server has N targets
+and M clients have connected to them, the server will have N x M
+exports and each client will have N imports.
+
+There are also connections between the servers: Each MDS and OSS has a
+connection to the MGS, where the MDS (respectively the OSS) plays the
+role of the client in the above discussion. That is, the MDS initiates
+the connection and has an import for the MGS, while the MGS has an
+export for each MDS. Each MDS connects to each OST, with an import on
+the MDS and an export on the OSS. This connection supports requests
+from the MDS to the OST for 'statfs' information such as size and
+access time values. Each OSS also connects to the first MDS to get
+access to auxiliary services, with an import on the OSS and an export
+on the first MDS. The auxiliary services are: the File ID Location
+Database (FLDB), the quota master service, and the sequence
+controller.
+
+Finally, for some communications the roles of message initiation and
+message reply are reversed. This is the case, for instance, with
+call-back operations. In that case the entity which would normally
+have an import has, instead, a 'reverse-export' and the
+other end of the connection maintains a 'reverse-import'. The
+reverse-import uses the same structure as a regular import, and the
+reverse-export uses the same structure as a regular export.
+
+Connection Structures
+~~~~~~~~~~~~~~~~~~~~~
+
+Connect Data
+^^^^^^^^^^^^
+
+An 'obd_connect_data' structure accompanies every connect operation in
+both the request message and in the reply message.
+
+----
+struct obd_connect_data {
+    __u64 ocd_connect_flags;
+    __u32 ocd_version;      /* OBD_CONNECT_VERSION */
+    __u32 ocd_grant;        /* OBD_CONNECT_GRANT */
+    __u32 ocd_index;        /* OBD_CONNECT_INDEX */
+    __u32 ocd_brw_size;     /* OBD_CONNECT_BRW_SIZE */
+    __u64 ocd_ibits_known;  /* OBD_CONNECT_IBITS */
+    __u8  ocd_blocksize;    /* OBD_CONNECT_GRANT_PARAM */
+    __u8  ocd_inodespace;   /* OBD_CONNECT_GRANT_PARAM */
+    __u16 ocd_grant_extent; /* OBD_CONNECT_GRANT_PARAM */
+    __u32 ocd_unused;
+    __u64 ocd_transno;      /* OBD_CONNECT_TRANSNO */
+    __u32 ocd_group;        /* OBD_CONNECT_MDS */
+    __u32 ocd_cksum_types;  /* OBD_CONNECT_CKSUM */
+    __u32 ocd_max_easize;   /* OBD_CONNECT_MAX_EASIZE */
+    __u32 ocd_instance;
+    __u64 ocd_maxbytes;     /* OBD_CONNECT_MAXBYTES */
+    __u64 padding1;
+    __u64 padding2;
+    __u64 padding3;
+    __u64 padding4;
+    __u64 padding5;
+    __u64 padding6;
+    __u64 padding7;
+    __u64 padding8;
+    __u64 padding9;
+    __u64 paddingA;
+    __u64 paddingB;
+    __u64 paddingC;
+    __u64 paddingD;
+    __u64 paddingE;
+    __u64 paddingF;
+};
+----
+
+The 'ocd_connect_flags' field encodes the connect flags giving the
+capabilities of a connection between client and target. Several of
+those flags (noted in comments above and the discussion below)
+actually control whether the remaining fields of 'obd_connect_data'
+get used. The [[connect-flags]] flags are:
+
+----
+#define OBD_CONNECT_RDONLY                0x1ULL /*client has read-only access*/
+#define OBD_CONNECT_INDEX                 0x2ULL /*connect specific LOV idx */
+#define OBD_CONNECT_MDS                   0x4ULL /*connect from MDT to OST */
+#define OBD_CONNECT_GRANT                 0x8ULL /*OSC gets grant at connect */
+#define OBD_CONNECT_SRVLOCK              0x10ULL /*server takes locks for cli */
+#define OBD_CONNECT_VERSION              0x20ULL /*Lustre versions in ocd */
+#define OBD_CONNECT_REQPORTAL            0x40ULL /*Separate non-IO req portal */
+#define OBD_CONNECT_ACL                  0x80ULL /*access control lists */
+#define OBD_CONNECT_XATTR               0x100ULL /*client use extended attr */
+#define OBD_CONNECT_CROW                0x200ULL /*MDS+OST create obj on write*/
+#define OBD_CONNECT_TRUNCLOCK           0x400ULL /*locks on server for punch */
+#define OBD_CONNECT_TRANSNO             0x800ULL /*replay sends init transno */
+#define OBD_CONNECT_IBITS              0x1000ULL /*support for inodebits locks*/
+#define OBD_CONNECT_JOIN               0x2000ULL /*files can be concatenated.
+                                                  *We do not support JOIN FILE
+                                                  *anymore, reserve this flags
+                                                  *just for preventing such bit
+                                                  *to be reused.*/
+#define OBD_CONNECT_ATTRFID            0x4000ULL /*Server can GetAttr By Fid*/
+#define OBD_CONNECT_NODEVOH            0x8000ULL /*No open hndl on specl nodes*/
+#define OBD_CONNECT_RMT_CLIENT        0x10000ULL /*Remote client */
+#define OBD_CONNECT_RMT_CLIENT_FORCE  0x20000ULL /*Remote client by force */
+#define OBD_CONNECT_BRW_SIZE          0x40000ULL /*Max bytes per rpc */
+#define OBD_CONNECT_QUOTA64           0x80000ULL /*Not used since 2.4 */
+#define OBD_CONNECT_MDS_CAPA         0x100000ULL /*MDS capability */
+#define OBD_CONNECT_OSS_CAPA         0x200000ULL /*OSS capability */
+#define OBD_CONNECT_CANCELSET        0x400000ULL /*Early batched cancels. */
+#define OBD_CONNECT_SOM              0x800000ULL /*Size on MDS */
+#define OBD_CONNECT_AT              0x1000000ULL /*client uses AT */
+#define OBD_CONNECT_LRU_RESIZE      0x2000000ULL /*LRU resize feature. */
+#define OBD_CONNECT_MDS_MDS         0x4000000ULL /*MDS-MDS connection */
+#define OBD_CONNECT_REAL            0x8000000ULL /*real connection */
+#define OBD_CONNECT_CHANGE_QS      0x10000000ULL /*Not used since 2.4 */
+#define OBD_CONNECT_CKSUM          0x20000000ULL /*support several cksum algos*/
+#define OBD_CONNECT_FID            0x40000000ULL /*FID is supported by server */
+#define OBD_CONNECT_VBR            0x80000000ULL /*version based recovery */
+#define OBD_CONNECT_LOV_V3        0x100000000ULL /*client supports LOV v3 EA */
+#define OBD_CONNECT_GRANT_SHRINK  0x200000000ULL /* support grant shrink */
+#define OBD_CONNECT_SKIP_ORPHAN   0x400000000ULL /* don't reuse orphan objids */
+#define OBD_CONNECT_MAX_EASIZE    0x800000000ULL /* preserved for large EA */
+#define OBD_CONNECT_FULL20       0x1000000000ULL /* it is 2.0 client */
+#define OBD_CONNECT_LAYOUTLOCK   0x2000000000ULL /* client uses layout lock */
+#define OBD_CONNECT_64BITHASH    0x4000000000ULL /* client supports 64-bits
+                                                  * directory hash */
+#define OBD_CONNECT_MAXBYTES     0x8000000000ULL /* max stripe size */
+#define OBD_CONNECT_IMP_RECOV   0x10000000000ULL /* imp recovery support */
+#define OBD_CONNECT_JOBSTATS    0x20000000000ULL /* jobid in ptlrpc_body */
+#define OBD_CONNECT_UMASK       0x40000000000ULL /* create uses client umask */
+#define OBD_CONNECT_EINPROGRESS 0x80000000000ULL /* client handles -EINPROGRESS
+                                                  * RPC error properly */
+#define OBD_CONNECT_GRANT_PARAM 0x100000000000ULL/* extra grant params used for
+                                                  * finer space reservation */
+#define OBD_CONNECT_FLOCK_OWNER 0x200000000000ULL /* for the fixed 1.8
+                           * policy and 2.x server */
+#define OBD_CONNECT_LVB_TYPE    0x400000000000ULL /* variable type of LVB */
+#define OBD_CONNECT_NANOSEC_TIME 0x800000000000ULL /* nanosecond timestamps */
+#define OBD_CONNECT_LIGHTWEIGHT 0x1000000000000ULL/* lightweight connection */
+#define OBD_CONNECT_SHORTIO     0x2000000000000ULL/* short io */
+#define OBD_CONNECT_PINGLESS    0x4000000000000ULL/* pings not required */
+#define OBD_CONNECT_FLOCK_DEAD    0x8000000000000ULL/* deadlock detection */
+#define OBD_CONNECT_DISP_STRIPE 0x10000000000000ULL/* create stripe disposition*/
+#define OBD_CONNECT_OPEN_BY_FID    0x20000000000000ULL /* open by fid won't pack
+                               name in request */
+----
+
+Each flag corresponds to a particular capability that the client and
+target together will honor. A client will send a message including
+some subset of these capabilities during a connection request to a
+specific target. It tells the server what capabilities it has. The
+server then replies with the subset of those capabilities it agrees to
+honor (for the given target).
+
+If the OBD_CONNECT_VERSION flag is set then the 'ocd_version' field is
+honored. The 'ocd_version' gives an encoding of the Lustre
+version, with one byte each for the major, minor, patch, and fix
+numbers. For example, Version 2.7.32 would be hexadecimal number
+0x02072000.
+
+If the OBD_CONNECT_GRANT flag is set then the 'ocd_grant' field is
+honored. The 'ocd_grant' value in a reply (to a connection request)
+sets the client's grant.
+
+If the OBD_CONNECT_INDEX flag is set then the 'ocd_index' field is
+honored. The 'ocd_index' value is set in a reply to a connection
+request. It holds the LOV index of the target.
+
+If the OBD_CONNECT_BRW_SIZE flag is set then the 'ocd_brw_size' field
+is honored. The 'ocd_brw_size' value sets the size of the maximum
+supported RPC. The client proposes a value in its connection request,
+and the server's reply will either agree or further limit the size.
+
+If the OBD_CONNECT_IBITS flag is set then the 'ocd_ibits_known' field
+is honored. The 'ocd_ibits_known' value determines the handling of
+locks on inodes. See the discussion of inodes and extended attributes.
+
+If the OBD_CONNECT_GRANT_PARAM flag is set then the 'ocd_blocksize',
+'ocd_inodespace', and 'ocd_grant_extent' fields are honored. A server
+reply uses the 'ocd_blocksize' value to inform the client of the log
+base two of the size in bytes of the backend file system's blocks.
+
+A server reply uses the 'ocd_inodespace' value to inform the client of
+the log base two of the size of an inode.
+
+Under some circumstances (for example when ZFS is the back end file
+system) there may be additional overhead in handling writes for each
+extent. The server uses the 'ocd_grant_extent' value to inform the
+client of the size in bytes consumed from its grant on the server when
+creating a new file. The client uses this value in calculating how
+much dirty write cache it has and whether it has reached the limit
+established by the target's grant.
+
+If the OBD_CONNECT_TRANSNO flag is set then the 'ocd_transno' field is
+honored. A server uses the 'ocd_transno' value during recovery to
+inform the client of the transaction number at which it should begin
+replay.
+
+If the OBD_CONNECT_MDS flag is set then the 'ocd_group' field is
+honored. When an MDT connects to an OST the 'ocd_group' field informs
+the OSS of the MDT's index. Objects on that OST for that MDT will be
+in a common namespace served by that MDT.
+
+If the OBD_CONNECT_CKSUM flag is set then the 'ocd_cksum_types' field
+is honored. The client uses the 'ocd_cksum_types' field to propose
+to the server the client's available (presumably hardware assisted)
+checksum mechanisms. The server replies with the checksum types it has
+available. Finally, the client will employ the fastest of the agreed
+mechanisms.
+
+If the OBD_CONNECT_MAX_EASIZE flag is set then the 'ocd_max_easize'
+field is honored. The server uses 'ocd_max_easize' to inform the
+client about the amount of space that can be allocated in each inode
+for extended attributes. The 'ocd_max_easize' specifically refers to
+the space used for striping information. This allows the client to
+determine the maximum layout size (and hence stripe count) that can be
+stored on the MDT.
+
+The 'ocd_instance' field (alone) is not governed by an OBD_CONNECT_*
+flag. The MGS uses the 'ocd_instance' value in its reply to a
+connection request to inform the server and target of the "era" of its
+connection. The MGS initializes the era value for each server to zero
+and increments that value every time the target connects. This
+supports imperative recovery.
+
+If the OBD_CONNECT_MAXBYTES flag is set then the 'ocd_maxbytes' field
+is honored. An OSS uses the 'ocd_maxbytes' value to inform the client
+of the maximum OST object size for this target.  A stripe on any OST
+for a multi-striped file cannot be larger than the minimum maxbytes
+value.
+
+The additional space in the 'obd_connect_data' structure is unused and
+reserved for future use.
+
+fixme: Discuss the meaning of the rest of the OBD_CONNECT_* flags.
+
+Import
+^^^^^^
+
+----
+#define IMP_STATE_HIST_LEN 16
+struct import_state_hist {
+        enum lustre_imp_state ish_state;
+        time_t                ish_time;
+};
+struct obd_import {
+    struct portals_handle     imp_handle;
+    atomic_t                  imp_refcount;
+    struct lustre_handle      imp_dlm_handle;
+    struct ptlrpc_connection *imp_connection;
+    struct ptlrpc_client     *imp_client;
+    cfs_list_t        imp_pinger_chain;
+    cfs_list_t        imp_zombie_chain;
+    cfs_list_t        imp_replay_list;
+    cfs_list_t        imp_sending_list;
+    cfs_list_t        imp_delayed_list;
+    cfs_list_t      imp_committed_list;
+    cfs_list_t     *imp_replay_cursor;
+    struct obd_device    *imp_obd;
+    struct ptlrpc_sec    *imp_sec;
+    struct mutex      imp_sec_mutex;
+    cfs_time_t        imp_sec_expire;
+    wait_queue_head_t     imp_recovery_waitq;
+    atomic_t          imp_inflight;
+    atomic_t          imp_unregistering;
+    atomic_t          imp_replay_inflight;
+    atomic_t          imp_inval_count;
+    atomic_t          imp_timeouts;
+    enum lustre_imp_state     imp_state;
+    struct import_state_hist  imp_state_hist[IMP_STATE_HIST_LEN];
+    int               imp_state_hist_idx;
+    int               imp_generation;
+    __u32             imp_conn_cnt;
+    int               imp_last_generation_checked;
+    __u64             imp_last_replay_transno;
+    __u64             imp_peer_committed_transno;
+    __u64             imp_last_transno_checked;
+    struct lustre_handle      imp_remote_handle;
+    cfs_time_t        imp_next_ping;
+    __u64             imp_last_success_conn;
+    cfs_list_t        imp_conn_list;
+    struct obd_import_conn   *imp_conn_current;
+    spinlock_t      imp_lock;
+    /* flags */
+    unsigned long
+      imp_no_timeout:1,
+      imp_invalid:1,
+      imp_deactive:1,
+      imp_replayable:1,
+      imp_dlm_fake:1,
+      imp_server_timeout:1,
+      imp_delayed_recovery:1,
+      imp_no_lock_replay:1,
+      imp_vbr_failed:1,
+      imp_force_verify:1,
+      imp_force_next_verify:1,
+      imp_pingable:1,
+      imp_resend_replay:1,
+      imp_no_pinger_recover:1,
+      imp_need_mne_swab:1,
+      imp_force_reconnect:1,
+      imp_connect_tried:1;
+    __u32             imp_connect_op;
+    struct obd_connect_data   imp_connect_data;
+    __u64             imp_connect_flags_orig;
+    int               imp_connect_error;
+    __u32             imp_msg_magic;
+    __u32             imp_msghdr_flags;       /* adjusted based on server capability */
+    struct ptlrpc_request_pool *imp_rq_pool;      /* emergency request pool */
+    struct imp_at         imp_at;         /* adaptive timeout data */
+    time_t            imp_last_reply_time;    /* for health check */
+};
+----
+
+The 'imp_handle' value is the unique id for the import, and is used as
+a hash key to gain access to it. It is not used in any of the Lustre
+protocol messages, but rather is just for internal reference.
+
+The 'imp_refcount' is also for internal use. The value is incremented
+with each RPC created, and decremented as the request is freed. When
+the reference count is zero the import can be freed, as when the
+target is being disconnected.
+
+The 'imp_dlm_handle' is a reference to the LDLM export for this
+client.
+
+There can be multiple paths through the network to a given
+target, in which case there would be multiple 'obd_import_conn' items
+on the 'imp_conn_list'. Each 'obd_import_conn' includes a
+'ptlrpc_connection', so 'imp_connection' points to the one that is
+actually in use.
+
+The 'imp_client' identifies the (local) portals for sending and
+receiving messages as well as the client's name. The information is
+specific to either an MDC or an OSC.
+
+The 'imp_pinger_chain' places the import on a linked list of imports
+that need periodic pings.
+
+The 'imp_zombie_chain' places the import on a list ready for being
+freed. Unused imports ('imp_refcount' is zero) are deleted
+asynchronously by a garbage collecting process.
+
+In order to support recovery the client must keep requests that are in
+the process of being handled by the target.  The target replies to a
+request as soon as the target has made its local update to
+memory. When the client receives that reply the request is put on the
+'imp_replay_list'. In the event of a failure (target crash, lost
+message) this list is then replayed for the target during the recovery
+process. When a request has been sent but has not yet received a reply
+it is placed on the 'imp_sending_list'. In the event of a failure
+those will simply be replayed after any recovery has been
+completed. Finally, there may be requests that the client is delaying
+before it sends them. This can happen if the client is in a degraded
+mode, as when it is in recovery after a failure. These requests are
+put on the 'imp_delayed_list' and not processed until recovery is
+complete and the 'imp_sending_list' has been replayed.
+
+In order to support recovery 'open' requests must be preserved even
+after they have completed. Those requests are placed on the
+'imp_committed_list' and the 'imp_replay_cursor' allows for
+accelerated access to those items.
+
+The 'imp_obd' is a reference to the details about the target device
+that is the subject of this import. There is a lot of state info in
+there along with many implementation details that are not relevant to
+the actual Lustre protocol. fixme: I'll want to go through all of the
+fields in that structure to see which, if any need more
+documentation.
+
+The security policy and settings are kept in 'imp_sec', and
+'imp_sec_mutex' helps manage access to that info. The 'imp_sec_expire'
+setting is in support of security policies that have an expiration
+strategy.
+
+Some processes may need the import to be in a fully connected state in
+order to proceed. The 'imp_recovery_waitq' is where those threads will
+wait during recovery.
+
+The 'imp_inflight' field counts the number of in-flight requests. It
+is incremented with each request sent and decremented with each reply
+received.
+
+The client reserves buffers for the processing of requests and
+replies, and then informs LNet about those buffers. Buffers may get
+reused during subsequent processing, but then a point may come when
+the buffer is no longer going to be used. The client increments the
+'imp_unregistering' counter and informs LNet the buffer is no longer
+needed. When LNet has freed the buffer it will notify the client and
+then the 'imp_unregistering' can be decremented again.
+
+During recovery the 'imp_replay_inflight' counts the number of requests
+from the replay list that have been sent and have not been replied to.
+
+The 'imp_inval_count' field counts how many threads are in the process
+of cleaning up this connection or waiting for cleanup to complete. The
+cleanup itself may be needed in the case there is an eviction or other
+problem (fixme what other problem?). The cleanup may involve freeing
+allocated resources, updating internal state, running replay lists,
+and invalidating cache. Since it could take a while there may end up
+multiple threads waiting on this process to complete.
+
+The 'imp_timeouts' field is a counter that is incremented every time
+there is a timeout in communication with the target.
+
+The 'imp_state' tracks the state of the import. It draws from the
+enumerated set of values:
+
+.enum_lustre_imp_state
+[options="header"]
+|=====
+| state        name              | value
+| LUSTRE_IMP_CLOSED       | 1
+| LUSTRE_IMP_NEW         | 2
+| LUSTRE_IMP_DISCON       | 3
+| LUSTRE_IMP_CONNECTING   | 4
+| LUSTRE_IMP_REPLAY       | 5
+| LUSTRE_IMP_REPLAY_LOCKS | 6
+| LUSTRE_IMP_REPLAY_WAIT  | 7
+| LUSTRE_IMP_RECOVER      | 8
+| LUSTRE_IMP_FULL         | 9
+| LUSTRE_IMP_EVICTED      | 10
+|=====
+fixme: what are the transitions between these states? The
+'imp_state_hist' array maintains a list of the last 16
+(IMP_STATE_HIST_LEN) states the import was in, along with the time it
+entered each (fixme: or is it when it left that  state?). The list is
+maintained in a circular manner, so the 'imp_state_hist_idx' points to
+the entry in the list for the most recently visited state.
+
+The 'imp_generation' and 'imp_conn_cnt' fields are monotonically
+increasing counters. Every time a connection request is sent to the
+target the 'imp_conn_cnt' counter is incremented, and every time a
+reply is received for the connection request the 'imp_generation'
+counter is incremented.
+
+The 'imp_last_generation_checked' implements an optimization. When a
+replay process has successfully traversed the replay list the
+'imp_generation' value is noted here. If the generation has not
+incremented then the replay list does not need to be traversed again.
+
+During replay the 'imp_last_replay_transno' is set to the transaction
+number of the last request being replayed, and
+'imp_peer_committed_transno' is set to the 'pb_last_committed' value
+(of the 'ptlrpc_body') from replies if that value is higher than the
+previous 'imp_peer_committed_transno'.  The 'imp_last_transno_checked'
+field implements an optimization. It is set to the
+'imp_last_replay_transno' as its replay is initiated. If
+'imp_last_transno_checked' is still 'imp_last_replay_transno' and
+'imp_generation' is still 'imp_last_generation_checked' then  there
+are no additional requests ready to be removed from the replay
+list. Furthermore, 'imp_last_transno_checked' may no longer be needed,
+since the committed transactions are now maintained on a separate list.
+
+The 'imp_remote_handle' is the handle sent by the target in a
+connection reply message to uniquely identify the export for this
+target and client that is maintained on the server. This is the handle
+used in all subsequent messages to the target.
+
+There are two separate ping intervals (fixme: what are the
+values?). If there are no uncommitted messages for the target then the
+default ping interval is used to set the 'imp_next_ping' to the time
+the next ping needs to be sent. If there are uncommitted requests then
+a "short interval" is used to set the time for the next ping.
+
+The 'imp_last_success_conn' value is set to the time of the last
+successful connection. fixme: The source says it is in 64 bit
+jiffies, but does not further indicate how that value is calculated.
+
+Since there can actually be multiple connection paths for a target
+(due to failover or multihomed configurations) the import maintains a
+list of all the possible connection paths in the list pointed to by
+the 'imp_conn_list' field. The 'imp_conn_current' points to the one
+currently in use. Compare with the 'imp_connection' field. They point
+to different structures, but each is reachable from the other.
+
+Most of the flag, state, and list information in the import needs to
+be accessed atomically. The 'imp_lock' is used to maintain the
+consistency of the import while it is manipulated by multiple threads.
+
+The various flags are documented in the source code and are largely
+obvious from those short comments, reproduced here:
+
+.import flags
+[options="header"]
+|=====
+| flag                    | explanation
+| imp_no_timeout          | timeouts are disabled
+| imp_invalid             | client has been evicted
+| imp_deactive            | client administratively disabled
+| imp_replayable          | try to recover the import
+| imp_dlm_fake            | don't run recovery (timeout instead)
+| imp_server_timeout      | use 1/2 timeout on MDSs and OSCs
+| imp_delayed_recovery    | VBR: imp in delayed recovery
+| imp_no_lock_replay      | VBR: if gap was found then no lock replays
+| imp_vbr_failed          | recovery by versions was failed
+| imp_force_verify        | force an immediate ping
+| imp_force_next_verify   | force a scheduled ping
+| imp_pingable            | target is pingable
+| imp_resend_replay       | resend for replay
+| imp_no_pinger_recover   | disable normal recovery, for test only.
+| imp_need_mne_swab       | need IR MNE swab
+| imp_force_reconnect     | import must be reconnected, not new connection
+| imp_connect_tried       | import has tried to connect with server
+|=====
+A few additional notes are in order. The 'imp_dlm_fake' flag signifies
+that this is not a "real" import, but rather it is a "reverse" import
+in support of the LDLM. When the LDLM invokes callback operations the
+messages are initiated at the other end, so there needs to be a fake
+import to receive the replies from the operation. Prior to the
+introduction of adaptive timeouts the servers were given fixed timeout
+values that were half those used for the clients. The
+'imp_server_timeout' flag indicated that the import should use the
+half-sized timeouts, but with the introduction of adaptive timeouts
+this facility is no longer used. "VBR" is "version based recovery",
+and it introduces a new possibility for handling requests. Previously,
+if there were a gap in the transaction number sequence the requests
+associated with the missing transaction numbers would be
+discarded. With VBR those transactions only need to be discarded if
+there is an actual dependency between the ones that were skipped and
+the currently latest committed transaction number. fixme: What are the
+circumstances that would lead to setting the 'imp_force_next_verify'
+or 'imp_pingable' flags? During recovery, the client sets the
+'imp_no_pinger_recover' flag, which tells the process to proceed from
+the current value of 'imp_last_replay_transno'. The
+'imp_need_mne_swab' flag indicates a version dependent circumstance
+where swabbing was inadvertently left out of one processing step.
+
+
+Export
+^^^^^^
+
+An 'obd_export' structure for a given target is created on a server
+for each client that connects to that target. The exports for all the
+clients for a given target are managed together. The export represents
+the connection state between the client and target as well as the
+current state of any ongoing activity. Thus each pending request will
+have a reference to the export. The export is discarded if the
+connection goes away, but only after all the references to it have
+been cleaned up. The state information for each export is also
+maintained on disk. In the event of a server failure, that or another
+server can read the export data from disk to enable recovery.
+
+----
+struct obd_export {
+    struct portals_handle     exp_handle;
+    atomic_t   exp_refcount;
+    atomic_t   exp_rpc_count;
+    atomic_t   exp_cb_count;
+    atomic_t   exp_replay_count;
+    atomic_t   exp_locks_count;
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+    cfs_list_t exp_locks_list;
+    spinlock_t      exp_locks_list_guard;
+#endif
+    struct obd_uuid       exp_client_uuid;
+    cfs_list_t exp_obd_chain;
+    cfs_hlist_node_t      exp_uuid_hash;
+    cfs_hlist_node_t      exp_nid_hash;
+    cfs_list_t            exp_obd_chain_timed;
+    struct obd_device    *exp_obd;
+    struct obd_import    *exp_imp_reverse;
+    struct nid_stat      *exp_nid_stats;
+    struct ptlrpc_connection *exp_connection;
+    __u32       exp_conn_cnt;
+    cfs_hash_t *exp_lock_hash;
+    cfs_hash_t *exp_flock_hash;
+    cfs_list_t  exp_outstanding_replies;
+    cfs_list_t  exp_uncommitted_replies;
+    spinlock_t  exp_uncommitted_replies_lock;
+    __u64       exp_last_committed;
+    cfs_time_t  exp_last_request_time;
+    cfs_list_t  exp_req_replay_queue;
+    spinlock_t  exp_lock;
+    struct obd_connect_data   exp_connect_data;
+    enum obd_option       exp_flags;
+    unsigned long
+      exp_failed:1,
+      exp_in_recovery:1,
+      exp_disconnected:1,
+      exp_connecting:1,
+      exp_delayed:1,
+      exp_vbr_failed:1,
+      exp_req_replay_needed:1,
+      exp_lock_replay_needed:1,
+      exp_need_sync:1,
+      exp_flvr_changed:1,
+      exp_flvr_adapt:1,
+      exp_libclient:1,
+      exp_need_mne_swab:1;
+    enum lustre_sec_part      exp_sp_peer;
+    struct sptlrpc_flavor     exp_flvr;
+    struct sptlrpc_flavor     exp_flvr_old[2];
+    cfs_time_t exp_flvr_expire[2];
+    spinlock_t exp_rpc_lock;
+    cfs_list_t exp_hp_rpcs;
+    cfs_list_t exp_reg_rpcs;
+    cfs_list_t exp_bl_list;
+    spinlock_t exp_bl_list_lock;
+    union {
+        struct tg_export_data     eu_target_data;
+        struct mdt_export_data    eu_mdt_data;
+        struct filter_export_data eu_filter_data;
+        struct ec_export_data     eu_ec_data;
+        struct mgs_export_data    eu_mgs_data;
+    } u;
+    struct nodemap      *exp_nodemap;
+};
+----
+
+The 'exp_handle' is a little extra information as compared with a
+'struct lustre_handle', which is just the cookie. The cookie that the
+server generates to uniquely identify this connection gets put into
+this structure along with other information about the device in
+question. This is the cookie the *_CONNECT reply sends back to the
+client and is then stored in the client's import.
+
+The 'exp_refcount' gets incremented whenever some aspect of the export
+is "in use". The arrival of an otherwise unprocessed message for this
+target will increment the refcount. A reference by an LDLM lock that
+gets taken will increment the refcount. Callback invocations and
+replay also lead to incrementing the 'exp_refcount'. The next four
+fields - 'exp_rpc_count', 'exp_cb_count', 'exp_replay_count', and
+'exp_locks_count' - all subcategorize the 'exp_refcount' for debug
+purposes. Similarly, the 'exp_locks_list' and 'exp_locks_list_guard'
+are further debug info that lists the actual locks accounted in
+'exp_locks_count'.
+
+The 'exp_client_uuid' gives the UUID of the client connected to this
+export. Fixme: when and how does the UUID get generated?
+
+The server maintains all the exports for a given target on a circular
+list. Each export's place on that list is maintained in the
+'exp_obd_chain'. A common activity is to look up the export based on
+the UUID or the nid of the client, and the 'exp_uuid_hash' and
+'exp_nid_hash' fields maintain this export's place in hashes
+constructed for that purpose.
+
+Exports are also maintained on a list sorted by the last time the
+corresponding client was heard from. The 'exp_obd_chain_timed' field
+maintains the export's place on that list. When a message arrives from
+the client the time is "now" so the export gets put at the end of the
+list. Since it is circular, the next export is then the oldest. If it
+has not been heard of within its timeout interval that export is
+marked for later eviction.
+
+The 'exp_obd' points to the 'obd_device' structure for the device that
+is the target of this export.
+
+In the event of a call-back the export needs to have the ability to
+initiate messages back to the client. The 'exp_imp_reverse' provides a
+"reverse" import that manages this capability.
+
+The '/proc' stats for the export (and the target) get updated via the
+'exp_nid_stats'.
+
+The 'exp_connection' points to the connection information for this
+export. This is the information about the actual networking pathway(s)
+that get used for communication.
+
+
+The 'exp_conn_cnt' notes the connection count value from the client at
+the time of the connection. In the event that more than one connection
+request is issued before the connection is established then the
+'exp_conn_cnt' will list the highest value. If a previous connection
+attempt (with a lower value) arrives later it may be safely
+discarded. Every request lists its connection count, so non-connection
+requests with lower connection count values can also be discarded.
+Note that this does not count how many times the client has connected
+to the target. If a client is evicted the export is deleted once it
+has been cleaned up and its 'exp_refcount' reduced to zero. A new
+connection from the client will get a new export.
+
+The 'exp_lock_hash' provides access to the locks granted to the
+corresponding client for this target. If a lock cannot be granted it
+is discarded. A file system lock ("flock") is also implemented through
+the LDLM lock system, but not all LDLM locks are flocks. The ones that
+are flocks are gathered in a hash 'exp_flock_hash'. This supports
+deadlock detection.
+
+For those requests that initiate file system modifying transactions
+the request and its attendant locks need to be preserved until either
+a) the client acknowledges receiving the reply, or b) the transaction
+has been committed locally. This ensures a request can be replayed in
+the event of a failure. The reply is kept on the
+'exp_outstanding_replies' list until the LNet layer notifies the
+server that the reply has been acknowledged. A reply is kept on the
+'exp_uncommitted_replies' list until the transaction (if any) has been
+committed.
+
+The 'exp_last_committed' value keeps the transaction number of the
+last committed transaction. Every reply to a client includes this
+value as a means of early-as-possible notification of transactions that
+have been committed.
+
+The 'exp_last_request_time' is self explanatory.
+
+During replay a request that is waiting for replay is maintained on the
+list 'exp_req_replay_queue'.
+
+The 'exp_lock' spin-lock is used for access control to the export's
+flags, as well as the 'exp_outstanding_replies' list and the reverse
+import, if any.
+
+The 'exp_connect_data' refers to an 'obd_connect_data' structure for
+the connection established between this target and the client this
+export refers to. See also the corresponding entry in the import and
+in the connect messages passed between the hosts.
+
+The 'exp_flags' field encodes three directives as follows:
+----
+enum obd_option {
+        OBD_OPT_FORCE =         0x0001,
+        OBD_OPT_FAILOVER =      0x0002,
+        OBD_OPT_ABORT_RECOV =   0x0004,
+};
+----
+fixme: Are these set for some exports and a condition of their
+existence? or do they reflect a transient state the export is passing
+through?
+
+The 'exp_failed' flag gets set whenever the target has failed for any
+reason or the export is otherwise due to be cleaned up. Once set it
+will not be unset in this export. Any subsequent connection between
+the client and the target would be governed by a new export.
+
+After a failure export data is retrieved from disk and the exports
+recreated. Exports created in this way will have their
+'exp_in_recovery' flag set. Once any outstanding requests and locks
+have been recovered for the client, then the export is recovered and
+'exp_in_recovery' can be cleared. When all the client exports for a
+given target have been recovered then the target is considered
+recovered, and when all targets have been recovered the server is
+considered recovered.
+
+A *_DISCONNECT message from the client will set the 'exp_disconnected'
+flag, as will any sort of failure of the target. Once set the export
+will be cleaned up and deleted.
+
+When a *_CONNECT message arrives the 'exp_connecting' flag is set. If
+for some reason a second *_CONNECT request arrives from the client it can
+be discarded when this flag is set.
+
+The 'exp_delayed' flag is no longer used. In older code it indicated
+that recovery had not completed in a timely fashion, but that a tardy
+recovery would still be possible, since there were no dependencies on
+the export.
+
+The 'exp_vbr_failed' flag indicates a failure during the recovery
+process. See <<recovery>> for a more detailed discussion of recovery
+and transaction replay. For a file system modifying request, the
+server composes its reply including the 'pb_pre_versions' entries in
+'ptlrpc_body', which indicate the most recent updates to the
+object. The client updates the request with the 'pb_transno' and
+'pb_pre_versions' from the reply, and keeps that request until the
+target signals that the transaction has been committed to disk. If the
+client times-out without that confirmation then it will 'replay' the
+request, which now includes the 'pb_pre_versions' information. During
+a replay the target checks that the object has not been further
+modified beyond those 'pb_pre_versions'. If this check fails then the
+request is out of date, and the recovery process fails for the
+connection between this client and this target. At that point the
+'exp_vbr_failed' flag is set to indicate version based recovery
+failed. This will lead to the client being evicted and this export
+being cleaned up and deleted.
+
+At the start of recovery both the 'exp_req_replay_needed' and
+'exp_lock_replay_needed' flags are set. As request replay is completed
+the 'exp_req_replay_needed' flag is cleared. As lock replay is
+completed the 'exp_lock_replay_needed' flag is cleared. Once both are
+cleared the 'exp_in_recovery' flag can be cleared.
+
+The 'exp_need_sync' supports an optimization. At mount time it is
+likely that every client (potentially thousands) will create an export
+and that export will need to be saved to disk synchronously. This can
+lead to an unusually high and poorly performing interaction with the
+disk. When the export is created the 'exp_need_sync' flag is set and
+the actual writing to disk is delayed. As transactions arrive from
+clients (in a much less coordinated fashion) the 'exp_need_sync' flag
+indicates a need to save the export as well as the transaction. At
+that point the flag is cleared (except see below).
+
+In DNE (Phase I) the export for an MDT managing the connection from
+another MDT will want to always keep the 'exp_need_sync' flag set. For
+that special case such an export sets the 'exp_keep_sync', which then
+prevents the 'exp_need_sync' flag from ever being cleared. This will
+no longer be needed in DNE Phase II.
+
+The 'exp_flvr_changed' and 'exp_flvr_adapt' flags along with
+'exp_sp_peer', 'exp_flvr', 'exp_flvr_old', and 'exp_flvr_expire'
+fields are all used to manage the security settings for the
+connection. Security is discussed in the <<security>> section. (fixme:
+or will be.)
+
+The 'exp_libclient' flag indicates that the export is for a client
+based on "liblustre". This allows for simplified handling on the
+server. (fixme: how is processing simplified? It sounds like I may
+need a whole special section on liblustre.)
+
+The 'exp_need_mne_swab' flag indicates the presence of an old bug that
+affected one special case of failed swabbing. It is not part of
+current processing.
+
+As RPCs arrive they are first subjected to triage. Each request is
+placed on the 'exp_hp_rpcs' list and examined to see if it is high
+priority (fixme: what constitutes high priority? PING, truncate, bulk
+I/O, ... others?). If it is not high priority then it is moved to the
+'exp_reg_rpcs' list. The 'exp_rpc_lock' protects both lists from
+concurrent access.
+
+All arriving LDLM requests get put on the 'exp_bl_list' and access to
+that list is controlled via the 'exp_bl_list_lock'.
+
+The union provides for target specific data. The 'eu_target_data' is
+for a common core of fields for a generic target. The others are
+specific to particular target types: 'eu_mdt_data' for MDTs,
+'eu_filter_data' for OSTs, 'eu_ec_data' for an "echo client" (fixme:
+describe what an echo client is somewhere), and 'eu_mgs_data' is for
+an MGS.
+
+The 'exp_bl_lock_at' field supports adaptive timeouts which will be
+discussed separately. (fixme: so discuss it somewhere.)
+
+Connection Count
+^^^^^^^^^^^^^^^^
+
+Each export maintains a connection count. Or is it just the management
+server?