+Connections Between Lustre Entities
+-----------------------------------
+[[connection]]
+
+The Lustre protocol is connection-based in that each two entities
+maintain shared, coordinated state information. The most common
+example of two such entities are a client and a target on some
+server. The target is identified by name to the client through an
+interaction with the management server. The client then 'connects' to
+the given target on the indicated server by sending the appropriate
+version of the *_CONNECT message (MGS_CONNECT, MDS_CONNECT, or
+OST_CONNECT - collectively *_CONNECT) and receiving back the
+corresponding *_CONNECT reply. The server creates an 'export' for the
+connection between the target and the client, and the export holds the
+server state information for that connection. When the client gets the
+reply it creates an 'import', and the import holds the client state
+information for that connection. Note that if a server has N targets
+and M clients have connected to them, the server will have N x M
+exports and each client will have N imports.
+
+There are also connections between the servers: Each MDS and OSS has a
+connection to the MGS, where the MDS (respectively the OSS) plays the
+role of the client in the above discussion. That is, the MDS initiates
+the connection and has an import for the MGS, while the MGS has an
+export for each MDS. Each MDS connects to each OST, with an import on
+the MDS and an export on the OSS. This connection supports requests
+from the MDS to the OST for 'statfs' information such as size and
+access time values. Each OSS also connects to the first MDS to get
+access to auxiliary services, with an import on the OSS and an export
+on the first MDS. The auxiliary services are: the FID Location
+Database (FLDB), the quota master service, and the sequence
+controller.
+
+Finally, for some communications the roles of message initiation and
+message reply are reversed. This is the case, for instance, with
+call-back operations. In that case the entity which would normally
+have an import has, instead, a 'reverse-export' and the
+other end of the connection maintains a 'reverse-import'. The
+reverse-import uses the same structure as a regular import, and the
+reverse-export uses the same structure as a regular export.
+
+Connection Structures
+~~~~~~~~~~~~~~~~~~~~~
+
+Connect Data
+^^^^^^^^^^^^
+
+An 'obd_connect_data' structure accompanies every connect operation in
+both the request message and in the reply message.
+
+----
+struct obd_connect_data {
+ __u64 ocd_connect_flags;
+ __u32 ocd_version; /* OBD_CONNECT_VERSION */
+ __u32 ocd_grant; /* OBD_CONNECT_GRANT */
+ __u32 ocd_index; /* OBD_CONNECT_INDEX */
+ __u32 ocd_brw_size; /* OBD_CONNECT_BRW_SIZE */
+ __u64 ocd_ibits_known; /* OBD_CONNECT_IBITS */
+ __u8 ocd_blocksize; /* OBD_CONNECT_GRANT_PARAM */
+ __u8 ocd_inodespace; /* OBD_CONNECT_GRANT_PARAM */
+ __u16 ocd_grant_extent; /* OBD_CONNECT_GRANT_PARAM */
+ __u32 ocd_unused;
+ __u64 ocd_transno; /* OBD_CONNECT_TRANSNO */
+ __u32 ocd_group; /* OBD_CONNECT_MDS */
+ __u32 ocd_cksum_types; /* OBD_CONNECT_CKSUM */
+ __u32 ocd_max_easize; /* OBD_CONNECT_MAX_EASIZE */
+ __u32 ocd_instance;
+ __u64 ocd_maxbytes; /* OBD_CONNECT_MAXBYTES */
+ __u64 padding1;
+ __u64 padding2;
+ __u64 padding3;
+ __u64 padding4;
+ __u64 padding5;
+ __u64 padding6;
+ __u64 padding7;
+ __u64 padding8;
+ __u64 padding9;
+ __u64 paddingA;
+ __u64 paddingB;
+ __u64 paddingC;
+ __u64 paddingD;
+ __u64 paddingE;
+ __u64 paddingF;
+};
+----
+
+The 'ocd_connect_flags' field encodes the connect flags giving the
+capabilities of a connection between client and target. Several of
+those flags (noted in comments above and the discussion below)
+actually control whether the remaining fields of 'obd_connect_data'
+get used. The [[connect-flags]] flags are:
+
+----
+#define OBD_CONNECT_RDONLY 0x1ULL /*client has read-only access*/
+#define OBD_CONNECT_INDEX 0x2ULL /*connect specific LOV idx */
+#define OBD_CONNECT_MDS 0x4ULL /*connect from MDT to OST */
+#define OBD_CONNECT_GRANT 0x8ULL /*OSC gets grant at connect */
+#define OBD_CONNECT_SRVLOCK 0x10ULL /*server takes locks for cli */
+#define OBD_CONNECT_VERSION 0x20ULL /*Lustre versions in ocd */
+#define OBD_CONNECT_REQPORTAL 0x40ULL /*Separate non-IO req portal */
+#define OBD_CONNECT_ACL 0x80ULL /*access control lists */
+#define OBD_CONNECT_XATTR 0x100ULL /*client use extended attr */
+#define OBD_CONNECT_CROW 0x200ULL /*MDS+OST create obj on write*/
+#define OBD_CONNECT_TRUNCLOCK 0x400ULL /*locks on server for punch */
+#define OBD_CONNECT_TRANSNO 0x800ULL /*replay sends init transno */
+#define OBD_CONNECT_IBITS 0x1000ULL /*support for inodebits locks*/
+#define OBD_CONNECT_JOIN 0x2000ULL /*files can be concatenated.
+ *We do not support JOIN FILE
+ *anymore, reserve this flags
+ *just for preventing such bit
+ *to be reused.*/
+#define OBD_CONNECT_ATTRFID 0x4000ULL /*Server can GetAttr By Fid*/
+#define OBD_CONNECT_NODEVOH 0x8000ULL /*No open hndl on specl nodes*/
+#define OBD_CONNECT_RMT_CLIENT 0x10000ULL /*Remote client */
+#define OBD_CONNECT_RMT_CLIENT_FORCE 0x20000ULL /*Remote client by force */
+#define OBD_CONNECT_BRW_SIZE 0x40000ULL /*Max bytes per rpc */
+#define OBD_CONNECT_QUOTA64 0x80000ULL /*Not used since 2.4 */
+#define OBD_CONNECT_MDS_CAPA 0x100000ULL /*MDS capability */
+#define OBD_CONNECT_OSS_CAPA 0x200000ULL /*OSS capability */
+#define OBD_CONNECT_CANCELSET 0x400000ULL /*Early batched cancels. */
+#define OBD_CONNECT_SOM 0x800000ULL /*Size on MDS */
+#define OBD_CONNECT_AT 0x1000000ULL /*client uses AT */
+#define OBD_CONNECT_LRU_RESIZE 0x2000000ULL /*LRU resize feature. */
+#define OBD_CONNECT_MDS_MDS 0x4000000ULL /*MDS-MDS connection */
+#define OBD_CONNECT_REAL 0x8000000ULL /*real connection */
+#define OBD_CONNECT_CHANGE_QS 0x10000000ULL /*Not used since 2.4 */
+#define OBD_CONNECT_CKSUM 0x20000000ULL /*support several cksum algos*/
+#define OBD_CONNECT_FID 0x40000000ULL /*FID is supported by server */
+#define OBD_CONNECT_VBR 0x80000000ULL /*version based recovery */
+#define OBD_CONNECT_LOV_V3 0x100000000ULL /*client supports LOV v3 EA */
+#define OBD_CONNECT_GRANT_SHRINK 0x200000000ULL /* support grant shrink */
+#define OBD_CONNECT_SKIP_ORPHAN 0x400000000ULL /* don't reuse orphan objids */
+#define OBD_CONNECT_MAX_EASIZE 0x800000000ULL /* preserved for large EA */
+#define OBD_CONNECT_FULL20 0x1000000000ULL /* it is 2.0 client */
+#define OBD_CONNECT_LAYOUTLOCK 0x2000000000ULL /* client uses layout lock */
+#define OBD_CONNECT_64BITHASH 0x4000000000ULL /* client supports 64-bits
+ * directory hash */
+#define OBD_CONNECT_MAXBYTES 0x8000000000ULL /* max stripe size */
+#define OBD_CONNECT_IMP_RECOV 0x10000000000ULL /* imp recovery support */
+#define OBD_CONNECT_JOBSTATS 0x20000000000ULL /* jobid in ptlrpc_body */
+#define OBD_CONNECT_UMASK 0x40000000000ULL /* create uses client umask */
+#define OBD_CONNECT_EINPROGRESS 0x80000000000ULL /* client handles -EINPROGRESS
+ * RPC error properly */
+#define OBD_CONNECT_GRANT_PARAM 0x100000000000ULL/* extra grant params used for
+ * finer space reservation */
+#define OBD_CONNECT_FLOCK_OWNER 0x200000000000ULL /* for the fixed 1.8
+ * policy and 2.x server */
+#define OBD_CONNECT_LVB_TYPE 0x400000000000ULL /* variable type of LVB */
+#define OBD_CONNECT_NANOSEC_TIME 0x800000000000ULL /* nanosecond timestamps */
+#define OBD_CONNECT_LIGHTWEIGHT 0x1000000000000ULL/* lightweight connection */
+#define OBD_CONNECT_SHORTIO 0x2000000000000ULL/* short io */
+#define OBD_CONNECT_PINGLESS 0x4000000000000ULL/* pings not required */
+#define OBD_CONNECT_FLOCK_DEAD 0x8000000000000ULL/* deadlock detection */
+#define OBD_CONNECT_DISP_STRIPE 0x10000000000000ULL/* create stripe disposition*/
+#define OBD_CONNECT_OPEN_BY_FID 0x20000000000000ULL /* open by fid won't pack
+ name in request */
+----
+
+Each flag corresponds to a particular capability that the client and
+target together will honor. A client will send a message including
+some subset of these capabilities during a connection request to a
+specific target. It tells the server what capabilities it has. The
+server then replies with the subset of those capabilities it agrees to
+honor (for the given target).
+
+If the OBD_CONNECT_VERSION flag is set then the 'ocd_version' field is
+honored. The 'ocd_version' gives an encoding of the Lustre
+version. For example, Version 2.7.32 would be hexadecimal number
+0x02073200.
+
+If the OBD_CONNECT_GRANT flag is set then the 'ocd_grant' field is
+honored. The 'ocd_grant' value in a reply (to a connection request)
+sets the client's grant.
+
+If the OBD_CONNECT_INDEX flag is set then the 'ocd_index' field is
+honored. The 'ocd_index' value is set in a reply to a connection
+request. It holds the LOV index of the target.
+
+If the OBD_CONNECT_BRW_SIZE flag is set then the 'ocd_brw_size' field
+is honored. The 'ocd_brw_size' value sets the size of the maximum
+supported RPC. The client proposes a value in its connection request,
+and the server's reply will either agree or further limit the size.
+
+If the OBD_CONNECT_IBITS flag is set then the 'ocd_ibits_known' field
+is honored. The 'ocd_ibits_known' value determines the handling of
+locks on inodes. See the discussion of inodes and extended attributes.
+
+If the OBD_CONNECT_GRANT_PARAM flag is set then the 'ocd_blocksize',
+'ocd_inodespace', and 'ocd_grant_extent' fields are honored. A server
+reply uses the 'ocd_blocksize' value to inform the client of the log
+base two of the size in bytes of the backend file system's blocks.
+
+A server reply uses the 'ocd_inodespace' value to inform the client of
+the log base two of the size of an inode.
+
+Under some circumstances (for example when ZFS is the back end file
+system) there may be additional overhead in handling writes for each
+extent. The server uses the 'ocd_grant_extent' value to inform the
+client of the size in bytes consumed from its grant on the server when
+creating a new file. The client uses this value in calculating how
+much dirty write cache it has and whether it has reached the limit
+established by the target's grant.
+
+If the OBD_CONNECT_TRANSNO flag is set then the 'ocd_transno' field is
+honored. A server uses the 'ocd_transno' value during recovery to
+inform the client of the transaction number at which it should begin
+replay.
+
+If the OBD_CONNECT_MDS flag is set then the 'ocd_group' field is
+honored. When an MDT connects to an OST the 'ocd_group' field informs
+the OSS of the MDT's index. Objects on that OST for that MDT will be
+in a common namespace served by that MDT.
+
+If the OBD_CONNECT_CKSUM flag is set then the 'ocd_cksum_types' field
+is honored. The client uses the 'ocd_cksum_types' field to propose
+to the server the client's available (presumably hardware assisted)
+checksum mechanisms. The server replies with the checksum types it has
+available. Finally, the client will employ the fastest of the agreed
+mechanisms.
+
+If the OBD_CONNECT_MAX_EASIZE flag is set then the 'ocd_max_easize'
+field is honored. The server uses 'ocd_max_easize' to inform the
+client about the amount of space that can be allocated in each inode
+for extended attributes. The 'ocd_max_easize' specifically refers to
+the space used for striping information. This allows the client to
+determine the maximum layout size (and hence stripe count) that can be
+stored on the MDT.
+
+The 'ocd_instance' field (alone) is not governed by an OBD_CONNECT_*
+flag. The MGS uses the 'ocd_instance' value in its reply to a
+connection request to inform the server and target of the "era" of its
+connection. The MGS initializes the era value for each server to zero
+and increments that value every time the target connects. This
+supports imperative recovery.
+
+If the OBD_CONNECT_MAXBYTES flag is set then the 'ocd_maxbytes' field
+is honored. An OSS uses the 'ocd_maxbytes' value to inform the client
+of the maximum OST object size for this target. A stripe on any OST
+for a multi-striped file cannot be larger than the minimum maxbytes
+value.
+
+The additional space in the 'obd_connect_data' structure is unused and
+reserved for future use.
+
+fixme: Discuss the meaning of the rest of the OBD_CONNECT_* flags.
+
+Import
+^^^^^^
+
+----
+#define IMP_STATE_HIST_LEN 16
+struct import_state_hist {
+ enum lustre_imp_state ish_state;
+ time_t ish_time;
+};
+struct obd_import {
+ struct portals_handle imp_handle;
+ atomic_t imp_refcount;
+ struct lustre_handle imp_dlm_handle;
+ struct ptlrpc_connection *imp_connection;
+ struct ptlrpc_client *imp_client;
+ cfs_list_t imp_pinger_chain;
+ cfs_list_t imp_zombie_chain;
+ cfs_list_t imp_replay_list;
+ cfs_list_t imp_sending_list;
+ cfs_list_t imp_delayed_list;
+ cfs_list_t imp_committed_list;
+ cfs_list_t *imp_replay_cursor;
+ struct obd_device *imp_obd;
+ struct ptlrpc_sec *imp_sec;
+ struct mutex imp_sec_mutex;
+ cfs_time_t imp_sec_expire;
+ wait_queue_head_t imp_recovery_waitq;
+ atomic_t imp_inflight;
+ atomic_t imp_unregistering;
+ atomic_t imp_replay_inflight;
+ atomic_t imp_inval_count;
+ atomic_t imp_timeouts;
+ enum lustre_imp_state imp_state;
+ struct import_state_hist imp_state_hist[IMP_STATE_HIST_LEN];
+ int imp_state_hist_idx;
+ int imp_generation;
+ __u32 imp_conn_cnt;
+ int imp_last_generation_checked;
+ __u64 imp_last_replay_transno;
+ __u64 imp_peer_committed_transno;
+ __u64 imp_last_transno_checked;
+ struct lustre_handle imp_remote_handle;
+ cfs_time_t imp_next_ping;
+ __u64 imp_last_success_conn;
+ cfs_list_t imp_conn_list;
+ struct obd_import_conn *imp_conn_current;
+ spinlock_t imp_lock;
+ /* flags */
+ unsigned long
+ imp_no_timeout:1,
+ imp_invalid:1,
+ imp_deactive:1,
+ imp_replayable:1,
+ imp_dlm_fake:1,
+ imp_server_timeout:1,
+ imp_delayed_recovery:1,
+ imp_no_lock_replay:1,
+ imp_vbr_failed:1,
+ imp_force_verify:1,
+ imp_force_next_verify:1,
+ imp_pingable:1,
+ imp_resend_replay:1,
+ imp_no_pinger_recover:1,
+ imp_need_mne_swab:1,
+ imp_force_reconnect:1,
+ imp_connect_tried:1;
+ __u32 imp_connect_op;
+ struct obd_connect_data imp_connect_data;
+ __u64 imp_connect_flags_orig;
+ int imp_connect_error;
+ __u32 imp_msg_magic;
+ __u32 imp_msghdr_flags; /* adjusted based on server capability */
+ struct ptlrpc_request_pool *imp_rq_pool; /* emergency request pool */
+ struct imp_at imp_at; /* adaptive timeout data */
+ time_t imp_last_reply_time; /* for health check */
+};
+----
+
+The 'imp_handle' value is the unique id for the import, and is used as
+a hash key to gain access to it. It is not used in any of the Lustre
+protocol messages, but rather is just for internal reference.
+
+The 'imp_refcount' is also for internal use. The value is incremented
+with each RPC created, and decremented as the request is freed. When
+the reference count is zero the import can be freed, as when the
+target is being disconnected.
+
+The 'imp_dlm_handle' is a reference to the LDLM export for this
+client.
+
+There can be multiple paths through the network to a given
+target, in which case there would be multiple 'obd_import_conn' items
+on the 'imp_conn_list'. Each 'obd_import_conn' includes a
+'ptlrpc_connection', so 'imp_connection' points to the one that is
+actually in use.
+
+The 'imp_client' identifies the (local) portals for sending and
+receiving messages as well as the client's name. The information is
+specific to either an MDC or an OSC.
+
+The 'imp_pinger_chain' places the import on a linked list of imports
+that need periodic pings.
+
+The 'imp_zombie_chain' places the import on a list ready for being
+freed. Unused imports ('imp_refcount' is zero) are deleted
+asynchronously by a garbage collecting process.
+
+In order to support recovery the client must keep requests that are in
+the process of being handled by the target. The target replies to a
+request as soon as the target has made its local update to
+memory. When the client receives that reply the request is put on the
+'imp_replay_list'. In the event of a failure (target crash, lost
+message) this list is then replayed for the target during the recovery
+process. When a request has been sent but has not yet received a reply
+it is placed on the 'imp_sending_list'. In the event of a failure
+those will simply be replayed after any recovery has been
+completed. Finally, there may be requests that the client is delaying
+before it sends them. This can happen if the client is in a degraded
+mode, as when it is in recovery after a failure. These requests are
+put on the 'imp_delayed_list' and not processed until recovery is
+complete and the 'imp_sending_list' has been replayed.
+
+In order to support recovery 'open' requests must be preserved even
+after they have completed. Those requests are placed on the
+'imp_committed_list' and the 'imp_replay_cursor' allows for
+accelerated access to those items.
+
+The 'imp_obd' is a reference to the details about the target device
+that is the subject of this import. There is a lot of state info in
+there along with many implementation details that are not relevant to
+the actual Lustre protocol. fixme: I'll want to go through all of the
+fields in that structure to see which, if any need more
+documentation.
+
+The security policy and settings are kept in 'imp_sec', and
+'imp_sec_mutex' helps manage access to that info. The 'imp_sec_expire'
+setting is in support of security policies that have an expiration
+strategy.
+
+Some processes may need the import to be in a fully connected state in
+order to proceed. The 'imp_recovery_waitq' is where those threads will
+wait during recovery.
+
+The 'imp_inflight' field counts the number of in-flight requests. It
+is incremented with each request sent and decremented with each reply
+received.
+
+The client reserves buffers for the processing of requests and
+replies, and then informs LNet about those buffers. Buffers may get
+reused during subsequent processing, but then a point may come when
+the buffer is no longer going to be used. The client increments the
+'imp_unregistering' counter and informs LNet the buffer is no longer
+needed. When LNet has freed the buffer it will notify the client and
+then the 'imp_unregistering' can be decremented again.
+
+During recovery the 'imp_replay_inflight' counts the number of requests
+from the reply list that have been sent and have not been replied to.
+
+The 'imp_inval_count' field counts how many threads are in the process
+of cleaning up this connection or waiting for cleanup to complete. The
+cleanup itself may be needed in the case there is an eviction or other
+problem (fixme what other problem?). The cleanup may involve freeing
+allocated resources, updating internal state, running replay lists,
+and invalidating cache. Since it could take a while there may end up
+multiple threads waiting on this process to complete.
+
+The 'imp_timeouts' field is a counter that is incremented every time
+there is a timeout in communication with the target.
+
+The 'imp_state' tracks the state of the import. It draws from the
+enumerated set of values:
+
+.enum_lustre_imp_state
+[options="header"]
+|=====
+| state name | value
+| LUSTRE_IMP_CLOSED | 1
+| LUSTRE_IMP_NEW | 2
+| LUSTRE_IMP_DISCON | 3
+| LUSTRE_IMP_CONNECTING | 4
+| LUSTRE_IMP_REPLAY | 5
+| LUSTRE_IMP_REPLAY_LOCKS | 6
+| LUSTRE_IMP_REPLAY_WAIT | 7
+| LUSTRE_IMP_RECOVER | 8
+| LUSTRE_IMP_FULL | 9
+| LUSTRE_IMP_EVICTED | 10
+|=====
+fixme: what are the transitions between these states? The
+'imp_state_hist' array maintains a list of the last 16
+(IMP_STATE_HIST_LEN) states the import was in, along with the time it
+entered each (fixme: or is it when it left that state?). The list is
+maintained in a circular manner, so the 'imp_state_hist_idx' points to
+the entry in the list for the most recently visited state.
+
+The 'imp_generation' and 'imp_conn_cnt' fields are monotonically
+increasing counters. Every time a connection request is sent to the
+target the 'imp_conn_cnt' counter is incremented, and every time a
+reply is received for the connection request the 'imp_generation'
+counter is incremented.
+
+The 'imp_last_generation_checked' implements an optimization. When a
+replay process has successfully traversed the reply list the
+'imp_generation' value is noted here. If the generation has not
+incremented then the replay list does not need to be traversed again.
+
+During replay the 'imp_last_replay_transno' is set to the transaction
+number of the last request being replayed, and
+'imp_peer_committed_transno' is set to the 'pb_last_committed' value
+(of the 'ptlrpc_body') from replies if that value is higher than the
+previous 'imp_peer_committed_transno'. The 'imp_last_transno_checked'
+field implements an optimization. It is set to the
+'imp_last_replay_transno' as its replay is initiated. If
+'imp_last_transno_checked' is still 'imp_last_replay_transno' and
+'imp_generation' is still 'imp_last_generation_checked' then there
+are no additional requests ready to be removed from the replay
+list. Furthermore, 'imp_last_transno_checked' may no longer be needed,
+since the committed transactions are now maintained on a separate list.
+
+The 'imp_remote_handle' is the handle sent by the target in a
+connection reply message to uniquely identify the export for this
+target and client that is maintained on the server. This is the handle
+used in all subsequent messages to the target.
+
+There are two separate ping intervals (fixme: what are the
+values?). If there are no uncommitted messages for the target then the
+default ping interval is used to set the 'imp_next_ping' to the time
+the next ping needs to be sent. If there are uncommitted requests then
+a "short interval" is used to set the time for the next ping.
+
+The 'imp_last_success_conn' value is set to the time of the last
+successful connection. fixme: The source says it is in 64 bit
+jiffies, but does not further indicate how that value is calculated.
+
+Since there can actually be multiple connection paths for a target
+(due to failover or multihomed configurations) the import maintains a
+list of all the possible connection paths in the list pointed to by
+the 'imp_conn_list' field. The 'imp_conn_current' points to the one
+currently in use. Compare with the 'imp_connection' fields. They point
+to different structures, but each is reachable from the other.
+
+Most of the flag, state, and list information in the import needs to
+be accessed atomically. The 'imp_lock' is used to maintain the
+consistency of the import while it is manipulated by multiple threads.
+
+The various flags are documented in the source code and are largely
+obvious from those short comments, reproduced here:
+
+.import flags
+[options="header"]
+|=====
+| flag | explanation
+| imp_no_timeout | timeouts are disabled
+| imp_invalid | client has been evicted
+| imp_deactive | client administratively disabled
+| imp_replayable | try to recover the import
+| imp_dlm_fake | don't run recovery (timeout instead)
+| imp_server_timeout | use 1/2 timeout on MDSs and OSCs
+| imp_delayed_recovery | VBR: imp in delayed recovery
+| imp_no_lock_replay | VBR: if gap was found then no lock replays
+| imp_vbr_failed | recovery by versions was failed
+| imp_force_verify | force an immediate ping
+| imp_force_next_verify | force a scheduled ping
+| imp_pingable | target is pingable
+| imp_resend_replay | resend for replay
+| imp_no_pinger_recover | disable normal recovery, for test only.
+| imp_need_mne_swab | need IR MNE swab
+| imp_force_reconnect | import must be reconnected, not new connection
+| imp_connect_tried | import has tried to connect with server
+|=====
+A few additional notes are in order. The 'imp_dlm_fake' flag signifies
+that this is not a "real" import, but rather it is a "reverse" import
+in support of the LDLM. When the LDLM invokes callback operations the
+messages are initiated at the other end, so there needs to be a fake
+import to receive the replies from the operation. Prior to the
+introduction of adaptive timeouts the servers were given fixed timeout
+value that were half those used for the clients. The
+'imp_server_timeout' flag indicated that the import should use the
+half-sized timeouts, but with the introduction of adaptive timeouts
+this facility is no longer used. "VBR" is "version based recovery",
+and it introduces a new possibility for handling requests. Previously,
+if there was a gap in the transaction number sequence then the requests
+associated with the missing transaction numbers would be
+discarded. With VBR those transactions only need to be discarded if
+there is an actual dependency between the ones that were skipped and
+the currently latest committed transaction number. fixme: What are the
+circumstances that would lead to setting the 'imp_force_next_verify'
+or 'imp_pingable' flags? During recovery, the client sets the
+'imp_no_pinger_recover' flag, which tells the process to proceed from
+the current value of 'imp_last_replay_transno'. The
+'imp_need_mne_swab' flag indicates a version dependent circumstance
+where swabbing was inadvertently left out of one processing step.
+
+
+Export
+^^^^^^
+
+An 'obd_export' structure for a given target is created on a server
+for each client that connects to that target. The exports for all the
+clients for a given target are managed together. The export represents
+the connection state between the client and target as well as the
+current state of any ongoing activity. Thus each pending request will
+have a reference to the export. The export is discarded if the
+connection goes away, but only after all the references to it have
+been cleaned up. The state information for each export is also
+maintained on disk. In the event of a server failure, that or another
+server can read the export data from disk to enable recovery.
+
+----
+struct obd_export {
+ struct portals_handle exp_handle;
+ atomic_t exp_refcount;
+ atomic_t exp_rpc_count;
+ atomic_t exp_cb_count;
+ atomic_t exp_replay_count;
+ atomic_t exp_locks_count;
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+ cfs_list_t exp_locks_list;
+ spinlock_t exp_locks_list_guard;
+#endif
+ struct obd_uuid exp_client_uuid;
+ cfs_list_t exp_obd_chain;
+ cfs_hlist_node_t exp_uuid_hash;
+ cfs_hlist_node_t exp_nid_hash;
+ cfs_list_t exp_obd_chain_timed;
+ struct obd_device *exp_obd;
+ struct obd_import *exp_imp_reverse;
+ struct nid_stat *exp_nid_stats;
+ struct ptlrpc_connection *exp_connection;
+ __u32 exp_conn_cnt;
+ cfs_hash_t *exp_lock_hash;
+ cfs_hash_t *exp_flock_hash;
+ cfs_list_t exp_outstanding_replies;
+ cfs_list_t exp_uncommitted_replies;
+ spinlock_t exp_uncommitted_replies_lock;
+ __u64 exp_last_committed;
+ cfs_time_t exp_last_request_time;
+ cfs_list_t exp_req_replay_queue;
+ spinlock_t exp_lock;
+ struct obd_connect_data exp_connect_data;
+ enum obd_option exp_flags;
+ unsigned long
+ exp_failed:1,
+ exp_in_recovery:1,
+ exp_disconnected:1,
+ exp_connecting:1,
+ exp_delayed:1,
+ exp_vbr_failed:1,
+ exp_req_replay_needed:1,
+ exp_lock_replay_needed:1,
+ exp_need_sync:1,
+ exp_flvr_changed:1,
+ exp_flvr_adapt:1,
+ exp_libclient:1,
+ exp_need_mne_swab:1;
+ enum lustre_sec_part exp_sp_peer;
+ struct sptlrpc_flavor exp_flvr;
+ struct sptlrpc_flavor exp_flvr_old[2];
+ cfs_time_t exp_flvr_expire[2];
+ spinlock_t exp_rpc_lock;
+ cfs_list_t exp_hp_rpcs;
+ cfs_list_t exp_reg_rpcs;
+ cfs_list_t exp_bl_list;
+ spinlock_t exp_bl_list_lock;
+ union {
+ struct tg_export_data eu_target_data;
+ struct mdt_export_data eu_mdt_data;
+ struct filter_export_data eu_filter_data;
+ struct ec_export_data eu_ec_data;
+ struct mgs_export_data eu_mgs_data;
+ } u;
+ struct nodemap *exp_nodemap;
+};
+----
+
+The 'exp_handle' is a little extra information as compared with a
+'struct lustre_handle', which is just the cookie. The cookie that the
+server generates to uniquely identify this connection gets put into
+this structure along with their information about the device in
+question. This is the cookie the *_CONNECT reply sends back to the
+client and is then stored in the client's import.
+
+The 'exp_refcount' gets incremented whenever some aspect of the export
+is "in use". The arrival of an otherwise unprocessed message for this
+target will increment the refcount. A reference by an LDLM lock that
+gets taken will increment the refcount. Callback invocations and
+replay also lead to incrementing the refcount. The next four fields -
+'exp_rpc_count', 'exp_cb_count', 'exp_replay_count', and
+'exp_locks_count' - all subcategorize the 'exp_refcount' for debug
+purposes. Similarly, the 'exp_locks_list' and 'exp_locks_list_guard'
+are further debug info that lists the actual locks accounted in
+'exp_locks_count'.
+
+The 'exp_client_uuid' gives the UUID of the client connected to this
+export. Fixme: when and how does the UUID get generated?
+
+The server maintains all the exports for a given target on a circular
+list. Each export's place on that list is maintained in the
+'exp_obd_chain'. A common activity is to look up the export based on
+the UUID or the nid of the client, and the 'exp_uuid_hash' and
+'exp_nid_hash' fields maintain this export's place in hashes
+constructed for that purpose.
+
+Exports are also maintained on a list sorted by the last time the
+corresponding client was heard from. The 'exp_obd_chain_timed' field
+maintains the export's place on that list. When a message arrives from
+the client the time is "now" so the export gets put at the end of the
+list. Since it is circular, the next export is then the oldest. If it
+has not been heard of within its timeout interval that export is
+marked for later eviction.
+
+The 'exp_obd' points to the 'obd_device' structure for the device that
+is the target of this export.
+
+In the event of a call-back the export needs to have the ability to
+initiate messages back to the client. The 'exp_imp_reverse' provides a
+"reverse" import that manages this capability.
+
+The '/proc' stats for the export (and the target) get updated via the
+'exp_nid_stats'.
+
+The 'exp_connection' points to the connection information for this
+export. This is the information about the actual networking pathway(s)
+that get used for communication.
+
+
+The 'exp_conn_cnt' notes the connection count value from the client at
+the time of the connection. In the event that more than one connection
+request is issued before the connection is established then the
+'exp_conn_cnt' will list the highest value. If a previous connection
+attempt (with a lower value) arrives later it may be safely
+discarded. Every request lists its connection count, so non-connection
+requests with lower connection count values can also be discarded.
+Note that this does not count how many times the client has connected
+to the target. If a client is evicted the export is deleted once it
+has been cleaned up and its 'exp_refcount' reduced to zero. A new
+connection from the client will get a new export.
+
+The 'exp_lock_hash' provides access to the locks granted to the
+corresponding client for this target. If a lock cannot be granted it
+is discarded. A file system lock ("flock") is also implemented through
+the LDLM lock system, but not all LDLM locks are flocks. The ones that
+are flocks are gathered in a hash 'exp_flock_hash'. This supports
+deadlock detection.
+
+For those requests that initiate file system modifying transactions
+the request and its attendant locks need to be preserved until either
+a) the client acknowledges receiving the reply, or b) the transaction
+has been committed locally. This ensures a request can be replayed in
+the event of a failure. The reply is kept on the
+'exp_outstanding_replies' list until the LNet layer notifies the
+server that the reply has been acknowledged. A reply is kept on the
+'exp_uncommitted_replies' list until the transaction (if any) has been
+committed.
+
+The 'exp_last_committed' value keeps the transaction number of the
+last committed transaction. Every reply to a client includes this
+value as a means of early-as-possible notification of transactions that
+have been committed.
+
+The 'exp_last_request_time' is self explanatory.
+
+During replay, a request that is waiting to be replayed is maintained
+on the 'exp_req_replay_queue' list.
+
+The 'exp_lock' spin-lock is used for access control to the export's
+flags, as well as the 'exp_outstanding_replies' list and the reverse
+import, if any.
+
+The 'exp_connect_data' refers to an 'obd_connect_data' structure for
+the connection established between this target and the client this
+export refers to. See also the corresponding entry in the import and
+in the connect messages passed between the hosts.
+
+The 'exp_flags' field encodes three directives as follows:
+----
+enum obd_option {
+ OBD_OPT_FORCE = 0x0001,
+ OBD_OPT_FAILOVER = 0x0002,
+ OBD_OPT_ABORT_RECOV = 0x0004,
+};
+----
+fixme: Are these set for some exports as a condition of their
+existence? Or do they reflect a transient state the export is passing
+through?
+
+The 'exp_failed' flag gets set whenever the target has failed for any
+reason or the export is otherwise due to be cleaned up. Once set it
+will not be unset in this export. Any subsequent connection between
+the client and the target would be governed by a new export.
+
+After a failure export data is retrieved from disk and the exports
+recreated. Exports created in this way will have their
+'exp_in_recovery' flag set. Once any outstanding requests and locks
+have been recovered for the client, then the export is recovered and
+'exp_in_recovery' can be cleared. When all the client exports for a
+given target have been recovered then the target is considered
+recovered, and when all targets have been recovered the server is
+considered recovered.
+
+A *_DISCONNECT message from the client will set the 'exp_disconnected'
+flag, as will any sort of failure of the target. Once set the export
+will be cleaned up and deleted.
+
+When a *_CONNECT message arrives the 'exp_connecting' flag is set. If
+for some reason a second *_CONNECT request arrives from the client it can
+be discarded when this flag is set.
+
+The 'exp_delayed' flag is no longer used. In older code it indicated
+that recovery had not completed in a timely fashion, but that a tardy
+recovery would still be possible, since there were no dependencies on
+the export.
+
+The 'exp_vbr_failed' flag indicates a failure during the recovery
+process. See <<recovery>> for a more detailed discussion of recovery
+and transaction replay. For a file system modifying request, the
+server composes its reply including the 'pb_pre_versions' entries in
+'ptlrpc_body', which indicate the most recent updates to the
+object. The client updates the request with the 'pb_transno' and
+'pb_pre_versions' from the reply, and keeps that request until the
+target signals that the transaction has been committed to disk. If the
+client times-out without that confirmation then it will 'replay' the
+request, which now includes the 'pb_pre_versions' information. During
+a replay the target checks that the object has not been further
+modified beyond those 'pb_pre_versions'. If this check fails then the
+request is out of date, and the recovery process fails for the
+connection between this client and this target. At that point the
+'exp_vbr_failed' flag is set to indicate version based recovery
+failed. This will lead to the client being evicted and this export
+being cleaned up and deleted.
+
+At the start of recovery both the 'exp_req_replay_needed' and
+'exp_lock_replay_needed' flags are set. As request replay is completed
+the 'exp_req_replay_needed' flag is cleared. As lock replay is
+completed the 'exp_lock_replay_needed' flag is cleared. Once both are
+cleared the 'exp_in_recovery' flag can be cleared.
+
+The 'exp_need_sync' supports an optimization. At mount time it is
+likely that every client (potentially thousands) will create an export
+and that export will need to be saved to disk synchronously. This can
+lead to an unusually high and poorly performing interaction with the
+disk. When the export is created the 'exp_need_sync' flag is set and
+the actual writing to disk is delayed. As transactions arrive from
+clients (in a much less coordinated fashion) the 'exp_need_sync' flag
+indicates a need to save the export as well as the transaction. At
+that point the flag is cleared (except see below).
+
+In DNE (phase I) the export for an MDT managing the connection from
+another MDT will want to always keep the 'exp_need_sync' flag set. For
+that special case such an export sets the 'exp_keep_sync', which then
+prevents the 'exp_need_sync' flag from ever being cleared. This will
+no longer be needed in DNE Phase II.
+
+The 'exp_flvr_changed' and 'exp_flvr_adapt' flags along with
+'exp_sp_peer', 'exp_flvr', 'exp_flvr_old', and 'exp_flvr_expire'
+fields are all used to manage the security settings for the
+connection. Security is discussed in the <<security>> section. (fixme:
+or will be.)
+
+The 'exp_libclient' flag indicates that the export is for a client
+based on "liblustre". This allows for simplified handling on the
+server. (fixme: how is processing simplified? It sounds like I may
+need a whole special section on liblustre.)
+
+The 'exp_need_mne_swab' flag indicates the presence of an old bug that
+affected one special case of failed swabbing. It is not part of
+current processing.
+
+As RPCs arrive they are first subjected to triage. Each request is
+placed on the 'exp_hp_rpcs' list and examined to see if it is high
+priority (fixme: what constitutes high priority? PING, truncate, bulk
+I/O, ... others?). If it is not high priority then it is moved to the
+'exp_reg_rpcs' list. The 'exp_rpc_lock' protects both lists from
+concurrent access.
+
+All arriving LDLM requests get put on the 'exp_bl_list' and access to
+that list is controlled via the 'exp_bl_list_lock'.
+
+The union provides for target specific data. The 'eu_target_data' is
+for a common core of fields for a generic target. The others are
+specific to particular target types: 'eu_mdt_data' for MDTs,
+'eu_filter_data' for OSTs, 'eu_ec_data' for an "echo client" (fixme:
+describe what an echo client is somewhere), and 'eu_mgs_data' is for
+an MGS.
+
+The 'exp_bl_lock_at' field supports adaptive timeouts which will be
+discussed separately. (fixme: so discuss it somewhere.)
+
+Connection Count
+^^^^^^^^^^^^^^^^
+
+Each export maintains a connection count. Or is it just the management
+server?