--- /dev/null
+Import
+^^^^^^
+
+----
+#define IMP_STATE_HIST_LEN 16
+struct import_state_hist {
+ enum lustre_imp_state ish_state;
+ time_t ish_time;
+};
+struct obd_import {
+ struct portals_handle imp_handle;
+ atomic_t imp_refcount;
+ struct lustre_handle imp_dlm_handle;
+ struct ptlrpc_connection *imp_connection;
+ struct ptlrpc_client *imp_client;
+ cfs_list_t imp_pinger_chain;
+ cfs_list_t imp_zombie_chain;
+ cfs_list_t imp_replay_list;
+ cfs_list_t imp_sending_list;
+ cfs_list_t imp_delayed_list;
+ cfs_list_t imp_committed_list;
+ cfs_list_t *imp_replay_cursor;
+ struct obd_device *imp_obd;
+ struct ptlrpc_sec *imp_sec;
+ struct mutex imp_sec_mutex;
+ cfs_time_t imp_sec_expire;
+ wait_queue_head_t imp_recovery_waitq;
+ atomic_t imp_inflight;
+ atomic_t imp_unregistering;
+ atomic_t imp_replay_inflight;
+ atomic_t imp_inval_count;
+ atomic_t imp_timeouts;
+ enum lustre_imp_state imp_state;
+ struct import_state_hist imp_state_hist[IMP_STATE_HIST_LEN];
+ int imp_state_hist_idx;
+ int imp_generation;
+ __u32 imp_conn_cnt;
+ int imp_last_generation_checked;
+ __u64 imp_last_replay_transno;
+ __u64 imp_peer_committed_transno;
+ __u64 imp_last_transno_checked;
+ struct lustre_handle imp_remote_handle;
+ cfs_time_t imp_next_ping;
+ __u64 imp_last_success_conn;
+ cfs_list_t imp_conn_list;
+ struct obd_import_conn *imp_conn_current;
+ spinlock_t imp_lock;
+ /* flags */
+ unsigned long
+ imp_no_timeout:1,
+ imp_invalid:1,
+ imp_deactive:1,
+ imp_replayable:1,
+ imp_dlm_fake:1,
+ imp_server_timeout:1,
+ imp_delayed_recovery:1,
+ imp_no_lock_replay:1,
+ imp_vbr_failed:1,
+ imp_force_verify:1,
+ imp_force_next_verify:1,
+ imp_pingable:1,
+ imp_resend_replay:1,
+ imp_no_pinger_recover:1,
+ imp_need_mne_swab:1,
+ imp_force_reconnect:1,
+ imp_connect_tried:1;
+ __u32 imp_connect_op;
+ struct obd_connect_data imp_connect_data;
+ __u64 imp_connect_flags_orig;
+ int imp_connect_error;
+ __u32 imp_msg_magic;
+ __u32 imp_msghdr_flags; /* adjusted based on server capability */
+ struct ptlrpc_request_pool *imp_rq_pool; /* emergency request pool */
+ struct imp_at imp_at; /* adaptive timeout data */
+ time_t imp_last_reply_time; /* for health check */
+};
+----
+
+The 'imp_handle' value is the unique id for the import, and is used as
+a hash key to gain access to it. It is not used in any of the Lustre
+protocol messages, but rather is just for internal reference.
+
+The 'imp_refcount' is also for internal use. The value is incremented
+with each RPC created, and decremented as the request is freed. When
+the reference count is zero the import can be freed, as when the
+target is being disconnected.
+
+The 'imp_dlm_handle' is a reference to the LDLM export for this
+client.
+
+There can be multiple paths through the network to a given
+target, in which case there would be multiple 'obd_import_conn' items
+on the 'imp_conn_list'. Each 'obd_import_conn' includes a
+'ptlrpc_connection', so 'imp_connection' points to the one that is
+actually in use.
+
+The 'imp_client' identifies the (local) portals for sending and
+receiving messages as well as the client's name. The information is
+specific to either an MDC or an OSC.
+
+The 'imp_pinger_chain' places the import on a linked list of imports
+that need periodic pings.
+
+The 'imp_zombie_chain' places the import on a list ready for being
+freed. Unused imports ('imp_refcount' is zero) are deleted
+asynchronously by a garbage collecting process.
+
+In order to support recovery the client must keep requests that are in
+the process of being handled by the target. The target replies to a
+request as soon as the target has made its local update to
+memory. When the client receives that reply the request is put on the
+'imp_replay_list'. In the event of a failure (target crash, lost
+message) this list is then replayed for the target during the recovery
+process. When a request has been sent but has not yet received a reply
+it is placed on the 'imp_sending_list'. In the event of a failure
+those will simply be replayed after any recovery has been
+completed. Finally, there may be requests that the client is delaying
+before it sends them. This can happen if the client is in a degraded
+mode, as when it is in recovery after a failure. These requests are
+put on the 'imp_delayed_list' and not processed until recovery is
+complete and the 'imp_sending_list' has been replayed.
+
+In order to support recovery 'open' requests must be preserved even
+after they have completed. Those requests are placed on the
+'imp_committed_list' and the 'imp_replay_cursor' allows for
+accelerated access to those items.
+
+The 'imp_obd' is a reference to the details about the target device
+that is the subject of this import. There is a lot of state info in
+there along with many implementation details that are not relevant to
+the actual Lustre protocol. fixme: I'll want to go through all of the
+fields in that structure to see which, if any need more
+documentation.
+
+The security policy and settings are kept in 'imp_sec', and
+'imp_sec_mutex' helps manage access to that info. The 'imp_sec_expire'
+setting is in support of security policies that have an expiration
+strategy.
+
+Some processes may need the import to be in a fully connected state in
+order to proceed. The 'imp_recovery_waitq' is where those threads will
+wait during recovery.
+
+The 'imp_inflight' field counts the number of in-flight requests. It
+is incremented with each request sent and decremented with each reply
+received.
+
+The client reserves buffers for the processing of requests and
+replies, and then informs LNet about those buffers. Buffers may get
+reused during subsequent processing, but then a point may come when
+the buffer is no longer going to be used. The client increments the
+'imp_unregistering' counter and informs LNet the buffer is no longer
+needed. When LNet has freed the buffer it will notify the client and
+then the 'imp_unregistering' can be decremented again.
+
+During recovery the 'imp_replay_inflight' counts the number of requests
+from the replay list that have been sent and have not been replied to.
+
+The 'imp_inval_count' field counts how many threads are in the process
+of cleaning up this connection or waiting for cleanup to complete. The
+cleanup itself may be needed in the case there is an eviction or other
+problem (fixme what other problem?). The cleanup may involve freeing
+allocated resources, updating internal state, running replay lists,
+and invalidating cache. Since it could take a while there may end up
+multiple threads waiting on this process to complete.
+
+The 'imp_timeouts' field is a counter that is incremented every time
+there is a timeout in communication with the target.
+
+The 'imp_state' tracks the state of the import. It draws from the
+enumerated set of values:
+
+.enum_lustre_imp_state
+[options="header"]
+|=====
+| state name | value
+| LUSTRE_IMP_CLOSED | 1
+| LUSTRE_IMP_NEW | 2
+| LUSTRE_IMP_DISCON | 3
+| LUSTRE_IMP_CONNECTING | 4
+| LUSTRE_IMP_REPLAY | 5
+| LUSTRE_IMP_REPLAY_LOCKS | 6
+| LUSTRE_IMP_REPLAY_WAIT | 7
+| LUSTRE_IMP_RECOVER | 8
+| LUSTRE_IMP_FULL | 9
+| LUSTRE_IMP_EVICTED | 10
+|=====
+fixme: what are the transitions between these states? The
+'imp_state_hist' array maintains a list of the last 16
+(IMP_STATE_HIST_LEN) states the import was in, along with the time it
+entered each (fixme: or is it when it left that state?). The list is
+maintained in a circular manner, so the 'imp_state_hist_idx' points to
+the entry in the list for the most recently visited state.
+
+The 'imp_generation' and 'imp_conn_cnt' fields are monotonically
+increasing counters. Every time a connection request is sent to the
+target the 'imp_conn_cnt' counter is incremented, and every time a
+reply is received for the connection request the 'imp_generation'
+counter is incremented.
+
+The 'imp_last_generation_checked' implements an optimization. When a
+replay process has successfully traversed the replay list the
+'imp_generation' value is noted here. If the generation has not
+incremented then the replay list does not need to be traversed again.
+
+During replay the 'imp_last_replay_transno' is set to the transaction
+number of the last request being replayed, and
+'imp_peer_committed_transno' is set to the 'pb_last_committed' value
+(of the 'ptlrpc_body') from replies if that value is higher than the
+previous 'imp_peer_committed_transno'. The 'imp_last_transno_checked'
+field implements an optimization. It is set to the
+'imp_last_replay_transno' as its replay is initiated. If
+'imp_last_transno_checked' is still 'imp_last_replay_transno' and
+'imp_generation' is still 'imp_last_generation_checked' then there
+are no additional requests ready to be removed from the replay
+list. Furthermore, 'imp_last_transno_checked' may no longer be needed,
+since the committed transactions are now maintained on a separate list.
+
+The 'imp_remote_handle' is the handle sent by the target in a
+connection reply message to uniquely identify the export for this
+target and client that is maintained on the server. This is the handle
+used in all subsequent messages to the target.
+
+There are two separate ping intervals (fixme: what are the
+values?). If there are no uncommitted messages for the target then the
+default ping interval is used to set the 'imp_next_ping' to the time
+the next ping needs to be sent. If there are uncommitted requests then
+a "short interval" is used to set the time for the next ping.
+
+The 'imp_last_success_conn' value is set to the time of the last
+successful connection. fixme: The source says it is in 64 bit
+jiffies, but does not further indicate how that value is calculated.
+
+Since there can actually be multiple connection paths for a target
+(due to failover or multihomed configurations) the import maintains a
+list of all the possible connection paths in the list pointed to by
+the 'imp_conn_list' field. The 'imp_conn_current' points to the one
+currently in use. Compare with the 'imp_connection' fields. They point
+to different structures, but each is reachable from the other.
+
+Most of the flag, state, and list information in the import needs to
+be accessed atomically. The 'imp_lock' is used to maintain the
+consistency of the import while it is manipulated by multiple threads.
+
+The various flags are documented in the source code and are largely
+obvious from those short comments, reproduced here:
+
+.import flags
+[options="header"]
+|=====
+| flag | explanation
+| imp_no_timeout | timeouts are disabled
+| imp_invalid | client has been evicted
+| imp_deactive | client administratively disabled
+| imp_replayable | try to recover the import
+| imp_dlm_fake | don't run recovery (timeout instead)
+| imp_server_timeout | use 1/2 timeout on MDSs and OSCs
+| imp_delayed_recovery | VBR: imp in delayed recovery
+| imp_no_lock_replay | VBR: if gap was found then no lock replays
+| imp_vbr_failed | recovery by versions was failed
+| imp_force_verify | force an immediate ping
+| imp_force_next_verify | force a scheduled ping
+| imp_pingable | target is pingable
+| imp_resend_replay | resend for replay
+| imp_no_pinger_recover | disable normal recovery, for test only.
+| imp_need_mne_swab | need IR MNE swab
+| imp_force_reconnect | import must be reconnected, not new connection
+| imp_connect_tried | import has tried to connect with server
+|=====
+A few additional notes are in order. The 'imp_dlm_fake' flag signifies
+that this is not a "real" import, but rather it is a "reverse" import
+in support of the LDLM. When the LDLM invokes callback operations the
+messages are initiated at the other end, so there needs to be a fake
+import to receive the replies from the operation. Prior to the
+introduction of adaptive timeouts the servers were given fixed timeout
+values that were half those used for the clients. The
+'imp_server_timeout' flag indicated that the import should use the
+half-sized timeouts, but with the introduction of adaptive timeouts
+this facility is no longer used. "VBR" is "version based recovery",
+and it introduces a new possibility for handling requests. Previously,
+if there were a gap in the transaction number sequence the requests
+associated with the missing transaction numbers would be
+discarded. With VBR those transactions only need to be discarded if
+there is an actual dependency between the ones that were skipped and
+the currently latest committed transaction number. fixme: What are the
+circumstances that would lead to setting the 'imp_force_next_verify'
+or 'imp_pingable' flags? During recovery, the client sets the
+'imp_no_pinger_recover' flag, which tells the process to proceed from
+the current value of 'imp_last_replay_transno'. The
+'imp_need_mne_swab' flag indicates a version dependent circumstance
+where swabbing was inadvertently left out of one processing step.