5 #define IMP_STATE_HIST_LEN 16
6 struct import_state_hist {
7 enum lustre_imp_state ish_state;
11 struct portals_handle imp_handle;
12 atomic_t imp_refcount;
13 struct lustre_handle imp_dlm_handle;
14 struct ptlrpc_connection *imp_connection;
15 struct ptlrpc_client *imp_client;
16 cfs_list_t imp_pinger_chain;
17 cfs_list_t imp_zombie_chain;
18 cfs_list_t imp_replay_list;
19 cfs_list_t imp_sending_list;
20 cfs_list_t imp_delayed_list;
21 cfs_list_t imp_committed_list;
22 cfs_list_t *imp_replay_cursor;
23 struct obd_device *imp_obd;
24 struct ptlrpc_sec *imp_sec;
25 struct mutex imp_sec_mutex;
26 cfs_time_t imp_sec_expire;
27 wait_queue_head_t imp_recovery_waitq;
28 atomic_t imp_inflight;
29 atomic_t imp_unregistering;
30 atomic_t imp_replay_inflight;
31 atomic_t imp_inval_count;
32 atomic_t imp_timeouts;
33 enum lustre_imp_state imp_state;
34 struct import_state_hist imp_state_hist[IMP_STATE_HIST_LEN];
35 int imp_state_hist_idx;
38 int imp_last_generation_checked;
39 __u64 imp_last_replay_transno;
40 __u64 imp_peer_committed_transno;
41 __u64 imp_last_transno_checked;
42 struct lustre_handle imp_remote_handle;
43 cfs_time_t imp_next_ping;
44 __u64 imp_last_success_conn;
45 cfs_list_t imp_conn_list;
46 struct obd_import_conn *imp_conn_current;
56 imp_delayed_recovery:1,
60 imp_force_next_verify:1,
63 imp_no_pinger_recover:1,
65 imp_force_reconnect:1,
68 struct obd_connect_data imp_connect_data;
69 __u64 imp_connect_flags_orig;
70 int imp_connect_error;
72 __u32 imp_msghdr_flags; /* adjusted based on server capability */
73 struct ptlrpc_request_pool *imp_rq_pool; /* emergency request pool */
74 struct imp_at imp_at; /* adaptive timeout data */
75 time_t imp_last_reply_time; /* for health check */
79 The 'imp_handle' value is the unique id for the import, and is used as
80 a hash key to gain access to it. It is not used in any of the Lustre
81 protocol messages, but rather is just for internal reference.
83 The 'imp_refcount' is also for internal use. The value is incremented
84 with each RPC created, and decremented as the request is freed. When
85 the reference count is zero the import can be freed, as when the
86 target is being disconnected.
88 The 'imp_dlm_handle' is a reference to the LDLM export for this
91 There can be multiple paths through the network to a given
92 target, in which case there would be multiple 'obd_import_conn' items
93 on the 'imp_conn_list'. Each 'obd_import_conn' includes a
94 'ptlrpc_connection', so 'imp_connection' points to the one that is
97 The 'imp_client' identifies the (local) portals for sending and
98 receiving messages as well as the client's name. The information is
99 specific to either an MDC or an OSC.
101 The 'imp_pinger_chain' places the import on a linked list of imports
102 that need periodic pings.
104 The 'imp_zombie_chain' places the import on a list ready for being
105 freed. Unused imports ('imp_refcount' is zero) are deleted
106 asynchronously by a garbage collecting process.
108 In order to support recovery the client must keep requests that are in
109 the process of being handled by the target. The target replies to a
110 request as soon as the target has made its local update to
111 memory. When the client receives that reply the request is put on the
112 'imp_replay_list'. In the event of a failure (target crash, lost
113 message) this list is then replayed for the target during the recovery
114 process. When a request has been sent but has not yet received a reply
115 it is placed on the 'imp_sending_list'. In the event of a failure
116 those will simply be replayed after any recovery has been
117 completed. Finally, there may be requests that the client is delaying
118 before it sends them. This can happen if the client is in a degraded
119 mode, as when it is in recovery after a failure. These requests are
120 put on the 'imp_delayed_list' and not processed until recovery is
121 complete and the 'imp_sending_list' has been replayed.
123 In order to support recovery 'open' requests must be preserved even
124 after they have completed. Those requests are placed on the
125 'imp_committed_list' and the 'imp_replay_cursor' allows for
126 accelerated access to those items.
128 The 'imp_obd' is a reference to the details about the target device
129 that is the subject of this import. There is a lot of state info in
130 there along with many implementation details that are not relevant to
131 the actual Lustre protocol. fixme: I'll want to go through all of the
132 fields in that structure to see which, if any need more
135 The security policy and settings are kept in 'imp_sec', and
136 'imp_sec_mutex' helps manage access to that info. The 'imp_sec_expire'
137 setting is in support of security policies that have an expiration
140 Some processes may need the import to be in a fully connected state in
141 order to proceed. The 'imp_recovery_waitq' is where those threads will
142 wait during recovery.
144 The 'imp_inflight' field counts the number of in-flight requests. It
145 is incremented with each request sent and decremented with each reply
148 The client reserves buffers for the processing of requests and
149 replies, and then informs LNet about those buffers. Buffers may get
150 reused during subsequent processing, but then a point may come when
151 the buffer is no longer going to be used. The client increments the
152 'imp_unregistering' counter and informs LNet the buffer is no longer
153 needed. When LNet has freed the buffer it will notify the client and
154 then the 'imp_unregistering' can be decremented again.
156 During recovery the 'imp_replay_inflight' counts the number of requests
157 from the replay list that have been sent and have not been replied to.
159 The 'imp_inval_count' field counts how many threads are in the process
160 of cleaning up this connection or waiting for cleanup to complete. The
161 cleanup itself may be needed in the case there is an eviction or other
162 problem (fixme what other problem?). The cleanup may involve freeing
163 allocated resources, updating internal state, running replay lists,
164 and invalidating cache. Since it could take a while there may end up
165 multiple threads waiting on this process to complete.
167 The 'imp_timeouts' field is a counter that is incremented every time
168 there is a timeout in communication with the target.
170 The 'imp_state' tracks the state of the import. It draws from the
171 enumerated set of values:
173 .enum_lustre_imp_state
177 | LUSTRE_IMP_CLOSED | 1
179 | LUSTRE_IMP_DISCON | 3
180 | LUSTRE_IMP_CONNECTING | 4
181 | LUSTRE_IMP_REPLAY | 5
182 | LUSTRE_IMP_REPLAY_LOCKS | 6
183 | LUSTRE_IMP_REPLAY_WAIT | 7
184 | LUSTRE_IMP_RECOVER | 8
185 | LUSTRE_IMP_FULL | 9
186 | LUSTRE_IMP_EVICTED | 10
188 fixme: what are the transitions between these states? The
189 'imp_state_hist' array maintains a list of the last 16
190 (IMP_STATE_HIST_LEN) states the import was in, along with the time it
191 entered each (fixme: or is it when it left that state?). The list is
192 maintained in a circular manner, so the 'imp_state_hist_idx' points to
193 the entry in the list for the most recently visited state.
195 The 'imp_generation' and 'imp_conn_cnt' fields are monotonically
196 increasing counters. Every time a connection request is sent to the
197 target the 'imp_conn_cnt' counter is incremented, and every time a
198 reply is received for the connection request the 'imp_generation'
199 counter is incremented.
201 The 'imp_last_generation_checked' implements an optimization. When a
202 replay process has successfully traversed the replay list the
203 'imp_generation' value is noted here. If the generation has not
204 incremented then the replay list does not need to be traversed again.
206 During replay the 'imp_last_replay_transno' is set to the transaction
207 number of the last request being replayed, and
208 'imp_peer_committed_transno' is set to the 'pb_last_committed' value
209 (of the 'ptlrpc_body') from replies if that value is higher than the
210 previous 'imp_peer_committed_transno'. The 'imp_last_transno_checked'
211 field implements an optimization. It is set to the
212 'imp_last_replay_transno' as its replay is initiated. If
213 'imp_last_transno_checked' is still 'imp_last_replay_transno' and
214 'imp_generation' is still 'imp_last_generation_checked' then there
215 are no additional requests ready to be removed from the replay
216 list. Furthermore, 'imp_last_transno_checked' may no longer be needed,
217 since the committed transactions are now maintained on a separate list.
219 The 'imp_remote_handle' is the handle sent by the target in a
220 connection reply message to uniquely identify the export for this
221 target and client that is maintained on the server. This is the handle
222 used in all subsequent messages to the target.
224 There are two separate ping intervals (fixme: what are the
225 values?). If there are no uncommitted messages for the target then the
226 default ping interval is used to set the 'imp_next_ping' to the time
227 the next ping needs to be sent. If there are uncommitted requests then
228 a "short interval" is used to set the time for the next ping.
230 The 'imp_last_success_conn' value is set to the time of the last
231 successful connection. fixme: The source says it is in 64 bit
232 jiffies, but does not further indicate how that value is calculated.
234 Since there can actually be multiple connection paths for a target
235 (due to failover or multihomed configurations) the import maintains a
236 list of all the possible connection paths in the list pointed to by
237 the 'imp_conn_list' field. The 'imp_conn_current' points to the one
238 currently in use. Compare with the 'imp_connection' field. They point
239 to different structures, but each is reachable from the other.
241 Most of the flag, state, and list information in the import needs to
242 be accessed atomically. The 'imp_lock' is used to maintain the
243 consistency of the import while it is manipulated by multiple threads.
245 The various flags are documented in the source code and are largely
246 obvious from those short comments, reproduced here:
252 | imp_no_timeout | timeouts are disabled
253 | imp_invalid | client has been evicted
254 | imp_deactive | client administratively disabled
255 | imp_replayable | try to recover the import
256 | imp_dlm_fake | don't run recovery (timeout instead)
257 | imp_server_timeout | use 1/2 timeout on MDSs and OSCs
258 | imp_delayed_recovery | VBR: imp in delayed recovery
259 | imp_no_lock_replay | VBR: if gap was found then no lock replays
260 | imp_vbr_failed | recovery by versions was failed
261 | imp_force_verify | force an immediate ping
262 | imp_force_next_verify | force a scheduled ping
263 | imp_pingable | target is pingable
264 | imp_resend_replay | resend for replay
265 | imp_no_pinger_recover | disable normal recovery, for test only.
266 | imp_need_mne_swab | need IR MNE swab
267 | imp_force_reconnect | import must be reconnected, not new connection
268 | imp_connect_tried | import has tried to connect with server
270 A few additional notes are in order. The 'imp_dlm_fake' flag signifies
271 that this is not a "real" import, but rather it is a "reverse" import
272 in support of the LDLM. When the LDLM invokes callback operations the
273 messages are initiated at the other end, so there needs to be a fake
274 import to receive the replies from the operation. Prior to the
275 introduction of adaptive timeouts the servers were given fixed timeout
276 values that were half those used for the clients. The
277 'imp_server_timeout' flag indicated that the import should use the
278 half-sized timeouts, but with the introduction of adaptive timeouts
279 this facility is no longer used. "VBR" is "version based recovery",
280 and it introduces a new possibility for handling requests. Previously,
281 if there were a gap in the transaction number sequence the requests
282 associated with the missing transaction numbers would be
283 discarded. With VBR those transactions only need to be discarded if
284 there is an actual dependency between the ones that were skipped and
285 the currently latest committed transaction number. fixme: What are the
286 circumstances that would lead to setting the 'imp_force_next_verify'
287 or 'imp_pingable' flags? During recovery, the client sets the
288 'imp_no_pinger_recover' flag, which tells the process to proceed from
289 the current value of 'imp_last_replay_transno'. The
290 'imp_need_mne_swab' flag indicates a version dependent circumstance
291 where swabbing was inadvertently left out of one processing step.