--- /dev/null
+Data Structures and Defines
+---------------------------
+[[data-structs]]
+
+The following data types are used in the Lustre protocol description.
+
+Basic Data Types
+~~~~~~~~~~~~~~~~
+
+.Basic Data Types
+[options="header"]
+|=====
+| data types | size
+| __u8 | an 8-bit unsigned integer
+| __u16 | a 16-bit unsigned integer
+| __u32 | a 32-bit unsigned integer
+| __u64 | a 64-bit unsigned integer
+| __s64 | a 64-bit signed integer
+| obd_time | an __s64
+|=====
+
+
+Other Data Types
+~~~~~~~~~~~~~~~~
+
+The following topics introduce the various kinds of data that are
+represented and manipulated in Lustre messages and representations of
+the shared state on clients and servers.
+
+Grant
+^^^^^
+[[grant]]
+
+A grant value is part of a client's state for a given target. It
+provides an upper bound to the amount of dirty cache data the client
+will allow that is destined for the target. The value is established
+by agreement between the server and the client and represents a
+guarantee by the server that the target storage has space for the
+dirty data. The client can ask for additional grant, which the server
+may provide depending on how full the target is.
+
+LOV Index
+^^^^^^^^^
+[[lov-index]]
+
+Each target is assigned an LOV index (by the 'mkfs' command line) as
+the target is added to the file system. This value is stored in the
+MGS in order to identify its role in the file system.
+
+Transaction Number
+^^^^^^^^^^^^^^^^^^
+[[transno]]
+
+For each target there is a sequence of values (a strictly increasing
+series of numbers) where each operation that can modify the file
+system is assigned the next number in the series. This is the
+transaction number, and it imposes a strict serial ordering to all of
+the file system modifying operations. For file system modifying
+requests the server generates the next value in the sequence and
+informs the client of the value in the 'pb_transno' field of the
+'ptlrpc_body' of its reply to the client's request. For replies to
+requests that do not modify the file system the 'pb_transno' field in
+the 'ptlrpc_body' is just set to 0.
+
+Structured Data Types
+~~~~~~~~~~~~~~~~~~~~~
+
+Extended Attributes
+^^^^^^^^^^^^^^^^^^^
+
+Fixme: It is not yet documented how the so-called 'eadata' buffers are
+handled. Reportedly this is not just for extended attributes, but is a
+generic structure.
+
+Lustre Capabilities
+^^^^^^^^^^^^^^^^^^^
+
+A 'lustre_capa' structure conveys details about the capabilities
+supported (or requested) between a client and a given target.
+
+----
+#define CAPA_HMAC_MAX_LEN 64 /* size in bytes of the lc_hmac array */
+struct lustre_capa { /* capabilities supported (or requested) between a client and a target */
+ struct lu_fid lc_fid;
+ __u64 lc_opc;
+ __u64 lc_uid;
+ __u64 lc_gid;
+ __u32 lc_flags;
+ __u32 lc_keyid;
+ __u32 lc_timeout;
+ __u32 lc_expiry;
+ __u8 lc_hmac[CAPA_HMAC_MAX_LEN];
+};
+----
+
+MDT Data
+^^^^^^^^
+
+An 'mdt_body' structure holds details about a given MDT.
+
+----
+struct mdt_body { /* details about a given MDT; field order is the wire layout */
+ struct lu_fid fid1;
+ struct lu_fid fid2;
+ struct lustre_handle handle;
+ __u64 valid;
+ __u64 size;
+ obd_time mtime;
+ obd_time atime;
+ obd_time ctime;
+ __u64 blocks;
+ __u64 ioepoch;
+ __u64 t_state;
+ __u32 fsuid;
+ __u32 fsgid;
+ __u32 capability;
+ __u32 mode;
+ __u32 uid;
+ __u32 gid;
+ __u32 flags;
+ __u32 rdev;
+ __u32 nlink;
+ __u32 unused2;
+ __u32 suppgid;
+ __u32 eadatasize;
+ __u32 aclsize;
+ __u32 max_mdsize;
+ __u32 max_cookiesize;
+ __u32 uid_h;
+ __u32 gid_h;
+ __u32 padding_5;
+ __u64 padding_6;
+ __u64 padding_7;
+ __u64 padding_8;
+ __u64 padding_9;
+ __u64 padding_10;
+}; /* 216 bytes in total, given an 8-byte lustre_handle and assuming a 16-byte lu_fid -- NOTE(review): lu_fid is not defined in this section; confirm */
+----
+
+MGS Configuration Reference
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+----
+#define MTI_NAME_MAXLEN 64 /* size in bytes of the mcb_name array */
+struct mgs_config_body { /* identifies to the MGS which file system the client is asking about */
+ char mcb_name[MTI_NAME_MAXLEN]; /* logname */
+ __u64 mcb_offset; /* next index of config log to request */
+ __u16 mcb_type; /* type of log: CONFIG_T_[CONFIG|RECOVER] */
+ __u8 mcb_reserved; /* NOTE(review): presumably unused/reserved -- confirm */
+ __u8 mcb_bits; /* bits unit size of config log */
+ __u32 mcb_units; /* # of units for bulk transfer */
+};
+----
+
+The 'mgs_config_body' structure has information identifying to the MGS
+which Lustre file system the client is asking about.
+
+MGS Configuration Data
+^^^^^^^^^^^^^^^^^^^^^^
+
+----
+struct mgs_config_res { /* information returned about the Lustre file system */
+ __u64 mcr_offset; /* index of last config log */
+ __u64 mcr_size; /* size of the log */
+};
+----
+
+The 'mgs_config_res' structure returns information about the Lustre
+file system.
+
+Lustre Handle
+^^^^^^^^^^^^^
+
+----
+struct lustre_handle { /* reference to an import (client side) or an export (server side) */
+ __u64 cookie; /* NOTE(review): presumably the opaque value naming the import/export -- confirm */
+};
+----
+
+A Lustre handle is a reference to an import or an export. Those
+objects maintain state about the connection between a given client
+and a given target. The import is on the client and the corresponding
+export is on the server.
+
+Lustre Message Header
+^^^^^^^^^^^^^^^^^^^^^
+[[lustre-message-header]]
+
+Every message has an initial header that informs the receiver about
+the size of the rest of the message to follow along with a few other
+details.
+
+----
+#define LUSTRE_MSG_MAGIC_V2 0x0BD00BD3 /* expected value of lm_magic */
+#define MSGHDR_AT_SUPPORT 0x1 /* lm_flags value enabling adaptive timeouts support */
+struct lustre_msg_v2 { /* initial header present in every message */
+ __u32 lm_bufcount; /* number of buffers that follow the header */
+ __u32 lm_secflvr; /* zero if no crypto, otherwise a code for the crypto "flavor" */
+ __u32 lm_magic; /* LUSTRE_MSG_MAGIC_V2; checked to identify a Lustre message */
+ __u32 lm_repsize; /* in a request: maximum space set aside for any reply */
+ __u32 lm_cksum; /* related to the security settings; may not be in current use */
+ __u32 lm_flags; /* MSGHDR_AT_SUPPORT enables adaptive timeouts support */
+ __u32 lm_padding_2; /* reserved for future use */
+ __u32 lm_padding_3; /* reserved for future use */
+ __u32 lm_buflens[0]; /* lm_bufcount entries, each the length of one following buffer */
+};
+#define lustre_msg lustre_msg_v2 /* unversioned name for the v2 header */
+----
+
+The 'lm_bufcount' field gives the number of buffers that will follow
+the header. The header and sequence of buffers constitute one
+message. Each of the buffers is a sequence of bytes whose contents
+correspond to one of the structures described in this section. There
+will always be at least one, and no message has more than eight.
+
+The 'lm_secflvr' field gives an indication of whether any sort of
+cryptographic encoding of the subsequent buffers will be in force. The
+value is zero if there is no "crypto" and gives a code identifying the
+"flavor" of crypto if it is employed. Further, if crypto is employed
+there will only be one buffer following (i.e. 'lm_bufcount' = 1), and that
+buffer is an encoding of what would otherwise have been the sequence
+of buffers normally following the header. This document will defer all
+discussion of cryptography. A chapter is planned that will address it
+separately.
+
+The 'lm_magic' field is a "magic" value (LUSTRE_MSG_MAGIC_V2) that is
+checked in order to positively identify that the message is intended
+for the use to which it is being put. That is, we are indeed dealing
+with a Lustre message, and not, for example, corrupted memory or a bad
+pointer.
+
+The 'lm_repsize' field is an indication from the sender of an action
+request of the maximum available space that has been set aside for
+any reply to the request. A reply that attempts to use more than that
+much space will be discarded.
+
+The 'lm_cksum' has to do with the <<security>> settings for the
+cluster. Fixme: This may not be in current use. We need to verify.
+
+The 'lm_flags' field can be set to enable adaptive timeouts support
+with the value MSGHDR_AT_SUPPORT.
+
+The 'lm_padding*' fields are reserved for future use.
+
+The array of 'lm_buflens' values has 'lm_bufcount' entries. Each
+entry corresponds to, and gives the length of, one of the buffers that
+will follow.
+
+The entire header is required to be a multiple of eight bytes
+long. Thus there may need to be an extra four bytes of padding after the
+'lm_buflens' array if that array has an odd number of entries.
+
+OBD statfs
+^^^^^^^^^^
+
+The 'obd_statfs' structure defines fields that are used for returning
+server common 'statfs' data items to a client. It augments that data
+with some Lustre-specific information, and also has space allocated
+for future use by Lustre.
+
+----
+struct obd_statfs { /* server 'statfs' data returned to a client, plus Lustre-specific fields */
+ __u64 os_type;
+ __u64 os_blocks;
+ __u64 os_bfree;
+ __u64 os_bavail;
+ __u64 os_files;
+ __u64 os_ffree;
+ __u8 os_fsid[40];
+ __u32 os_bsize;
+ __u32 os_namelen;
+ __u64 os_maxbytes;
+ __u32 os_state; /**< obd_statfs_state OS_STATE_* flag */
+ __u32 os_fprecreated; /* objs available now to the caller */
+ /* used in QoS code to find preferred
+ * OSTs */
+ __u32 os_spare2; /* os_spare2..os_spare9: presumably the space kept for future use -- confirm */
+ __u32 os_spare3;
+ __u32 os_spare4;
+ __u32 os_spare5;
+ __u32 os_spare6;
+ __u32 os_spare7;
+ __u32 os_spare8;
+ __u32 os_spare9;
+};
+----
+
+Lustre Message Preamble
+^^^^^^^^^^^^^^^^^^^^^^^
+[[lustre-message-preamble]]
+
+Every Lustre message starts with both the above header and an
+additional set of fields (in its first "buffer") given by the 'struct
+ptlrpc_body_v3' structure. This preamble has information
+relevant to every message type. In particular, the Lustre message type
+is itself encoded in the 'pb_opc' Lustre operation number. The value
+of that op code determines what else will be in the message following
+the preamble.
+----
+#define PTLRPC_NUM_VERSIONS 4 /* number of pb_pre_versions entries */
+#define JOBSTATS_JOBID_SIZE 32 /* size in bytes of the pb_jobid array */
+struct ptlrpc_body_v3 { /* preamble carried in the first buffer of every message */
+ struct lustre_handle pb_handle; /* 0 in a connect request; set by the server in the connect reply */
+ __u32 pb_type; /* PTL_RPC_MSG_REQUEST, PTL_RPC_MSG_REPLY or PTL_RPC_MSG_ERR */
+ __u32 pb_version; /* lower two bytes: PtlRPC version; upper two bytes: service role */
+ __u32 pb_opc; /* operation code (a cmd_t value) */
+ __u32 pb_status; /* request: pid of the requester; reply: 0 or an error code */
+ __u64 pb_last_xid; /* not used */
+ __u64 pb_last_seen; /* not used */
+ __u64 pb_last_committed; /* request: 0; reply: highest committed transaction number */
+ __u64 pb_transno; /* 0 in a new request; server-assigned in replies to modifying operations */
+ __u32 pb_flags; /* MSG_* flags; only the bottom two bytes are used */
+ __u32 pb_op_flags; /* MSG_CONNECT_* connection status flags */
+ __u32 pb_conn_cnt; /* request: the client's connection "era" */
+ __u32 pb_timeout; /* seconds; request: how long the requester plans to wait */
+ __u32 pb_service_time; /* request: 0; reply: time from arrival in the request queue to reply */
+ __u32 pb_limit; /* request: 0; reply: maximum number of locks available to the client */
+ __u64 pb_slv; /* request: 0; reply: client's allowed share of the server lock volume */
+ __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS]; /* 0 in new requests and in replies to non-modifying operations */
+ __u64 pb_padding[4]; /* reserved for future use */
+ char pb_jobid[JOBSTATS_JOBID_SIZE]; /* job scheduler identifier of the originating process */
+};
+#define ptlrpc_body ptlrpc_body_v3 /* unversioned name for the v3 preamble */
+----
+In a connection request, sent by a client to server and regarding a
+specific target, the 'pb_handle' is 0. In the reply to a connection
+request, sent by the server, the handle is a value uniquely
+identifying the target. Subsequent messages between this client and
+this server regarding this target will use this handle to gain
+access to their shared state. The handle is persistent across
+reconnects.
+
+The 'pb_type' is PTL_RPC_MSG_REQUEST in messages when they are
+initiated, it is PTL_RPC_MSG_REPLY in a reply, and it is
+PTL_RPC_MSG_ERR to convey that a message was received that could not
+be interpreted, that is, if it was corrupt or incomplete. The encoding
+of those type values is given by:
+----
+#define PTL_RPC_MSG_REQUEST 4711 /* pb_type of a message that initiates a request */
+#define PTL_RPC_MSG_ERR 4712 /* pb_type used when a received message could not be interpreted */
+#define PTL_RPC_MSG_REPLY 4713 /* pb_type of a reply */
+----
+The error message type is only for responding to a message that failed
+to be interpreted as an actual message. Note that other errors, such
+as those that emerge from processing the actual message content, do
+not use the PTL_RPC_MSG_ERR type.
+
+The 'pb_version' identifies the version of the Lustre protocol and is
+derived from the following constants. The lower two bytes give the
+version of PtlRPC being employed in the message, and the upper two
+bytes encode the role of the host for the service being
+requested. That role is one of OBD, MDS, OST, DLM, LOG, or MGS.
+----
+#define PTLRPC_MSG_VERSION 0x00000003 /* PtlRPC version, carried in the lower two bytes of pb_version */
+#define LUSTRE_VERSION_MASK 0xffff0000 /* selects the upper two bytes of pb_version (the role) */
+#define LUSTRE_OBD_VERSION 0x00010000 /* role: OBD */
+#define LUSTRE_MDS_VERSION 0x00020000 /* role: MDS */
+#define LUSTRE_OST_VERSION 0x00030000 /* role: OST */
+#define LUSTRE_DLM_VERSION 0x00040000 /* role: DLM */
+#define LUSTRE_LOG_VERSION 0x00050000 /* role: LOG */
+#define LUSTRE_MGS_VERSION 0x00060000 /* role: MGS */
+----
+
+The 'pb_opc' value (operation code) gives the actual Lustre operation
+that is the subject of this message. For example, MDS_CONNECT is a
+Lustre operation (number 38). The following list gives the name used
+and the value for each operation.
+----
+typedef enum { /* operation codes carried in the pb_opc field */
+ OST_REPLY = 0,
+ OST_GETATTR = 1,
+ OST_SETATTR = 2,
+ OST_READ = 3,
+ OST_WRITE = 4,
+ OST_CREATE = 5,
+ OST_DESTROY = 6,
+ OST_GET_INFO = 7,
+ OST_CONNECT = 8,
+ OST_DISCONNECT = 9,
+ OST_PUNCH = 10,
+ OST_OPEN = 11,
+ OST_CLOSE = 12,
+ OST_STATFS = 13,
+ OST_SYNC = 16, /* values 14 and 15 are not assigned in this list */
+ OST_SET_INFO = 17,
+ OST_QUOTACHECK = 18,
+ OST_QUOTACTL = 19,
+ OST_QUOTA_ADJUST_QUNIT = 20,
+ MDS_GETATTR = 33,
+ MDS_GETATTR_NAME = 34,
+ MDS_CLOSE = 35,
+ MDS_REINT = 36,
+ MDS_READPAGE = 37,
+ MDS_CONNECT = 38,
+ MDS_DISCONNECT = 39,
+ MDS_GETSTATUS = 40,
+ MDS_STATFS = 41,
+ MDS_PIN = 42,
+ MDS_UNPIN = 43,
+ MDS_SYNC = 44,
+ MDS_DONE_WRITING = 45,
+ MDS_SET_INFO = 46,
+ MDS_QUOTACHECK = 47,
+ MDS_QUOTACTL = 48,
+ MDS_GETXATTR = 49,
+ MDS_SETXATTR = 50,
+ MDS_WRITEPAGE = 51,
+ MDS_IS_SUBDIR = 52,
+ MDS_GET_INFO = 53,
+ MDS_HSM_STATE_GET = 54,
+ MDS_HSM_STATE_SET = 55,
+ MDS_HSM_ACTION = 56,
+ MDS_HSM_PROGRESS = 57,
+ MDS_HSM_REQUEST = 58,
+ MDS_HSM_CT_REGISTER = 59,
+ MDS_HSM_CT_UNREGISTER = 60,
+ MDS_SWAP_LAYOUTS = 61,
+ LDLM_ENQUEUE = 101,
+ LDLM_CONVERT = 102,
+ LDLM_CANCEL = 103,
+ LDLM_BL_CALLBACK = 104,
+ LDLM_CP_CALLBACK = 105,
+ LDLM_GL_CALLBACK = 106,
+ LDLM_SET_INFO = 107,
+ MGS_CONNECT = 250,
+ MGS_DISCONNECT = 251,
+ MGS_EXCEPTION = 252,
+ MGS_TARGET_REG = 253,
+ MGS_TARGET_DEL = 254,
+ MGS_SET_INFO = 255,
+ MGS_CONFIG_READ = 256,
+ OBD_PING = 400,
+ OBD_LOG_CANCEL = 401,
+ OBD_QC_CALLBACK = 402,
+ OBD_IDX_READ = 403,
+ LLOG_ORIGIN_HANDLE_CREATE = 501,
+ LLOG_ORIGIN_HANDLE_NEXT_BLOCK = 502,
+ LLOG_ORIGIN_HANDLE_READ_HEADER = 503,
+ LLOG_ORIGIN_HANDLE_WRITE_REC = 504,
+ LLOG_ORIGIN_HANDLE_CLOSE = 505,
+ LLOG_ORIGIN_CONNECT = 506,
+ LLOG_ORIGIN_HANDLE_PREV_BLOCK = 508, /* value 507 is not assigned in this list */
+ LLOG_ORIGIN_HANDLE_DESTROY = 509,
+ QUOTA_DQACQ = 601,
+ QUOTA_DQREL = 602,
+ SEQ_QUERY = 700,
+ SEC_CTX_INIT = 801,
+ SEC_CTX_INIT_CONT = 802,
+ SEC_CTX_FINI = 803,
+ FLD_QUERY = 900,
+ FLD_READ = 901,
+ UPDATE_OBJ = 1000,
+ LAST_OPC /* no explicit value: one more than UPDATE_OBJ, i.e. 1001 */
+} cmd_t;
+----
+The symbols and values above identify the operations Lustre uses in
+its protocol. They are examined in detail in the
+<<lustre-operations,Lustre Operations>> section. Lustre carries out
+each of these operations via the exchange of a pair of messages: a
+request and a reply. The details of each message are specific to each
+operation. The <<lustre-messages,Lustre Messages>> chapter discusses
+each message and its contents.
+
+The 'pb_status' value in a request message is set to the 'pid' of the
+process making the request. In a reply message, a zero indicates that
+the service successfully initiated the requested operation. If for
+some reason the operation could not be initiated (e.g. "permission
+denied") the status will encode the standard Linux kernel (POSIX)
+error code (e.g. EPERM).
+
+'pb_last_xid' and 'pb_last_seen' are not used.
+
+The 'pb_last_committed' value is always zero in a request. In a reply
+it is the highest transaction number that has been committed to
+storage. The transaction numbers are maintained on a per-target basis
+and each series of transaction numbers is a strictly increasing
+sequence. This field is set in any kind of reply message including
+pings and non-modifying transactions.
+
+The 'pb_transno' value is always zero in a new request. It is also zero
+for replies to operations that do not modify the file system. For
+replies to operations that do modify the file system it is the
+server-assigned value from the sequence of values associated with the
+given client and target. That transaction number is copied into the
+'pb_transno' field of the 'ptlrpc_body' of the original request. If the
+request has to be replayed it will include the transaction number.
+
+The 'pb_flags' value governs the client state machine. Fixme: document
+what the states and transitions are of this state machine. Currently,
+only the bottom two bytes are used, and they encode state according to
+the following values:
+----
+#define MSG_GEN_FLAG_MASK 0x0000ffff /* the bottom two bytes of pb_flags, the only ones currently used */
+#define MSG_LAST_REPLAY 0x0001 /* the values below are individual pb_flags bits */
+#define MSG_RESENT 0x0002
+#define MSG_REPLAY 0x0004
+#define MSG_DELAY_REPLAY 0x0010
+#define MSG_VERSION_REPLAY 0x0020
+#define MSG_REQ_REPLAY_DONE 0x0040
+#define MSG_LOCK_REPLAY_DONE 0x0080
+----
+
+The 'pb_op_flags' value governs the client connection status state
+machine. Fixme: document what the states and transitions are of this
+state machine.
+----
+#define MSG_CONNECT_RECOVERING 0x00000001 /* the values in this list are individual pb_op_flags bits */
+#define MSG_CONNECT_RECONNECT 0x00000002
+#define MSG_CONNECT_REPLAYABLE 0x00000004
+#define MSG_CONNECT_LIBCLIENT 0x00000010
+#define MSG_CONNECT_INITIAL 0x00000020 /* set in an initial connect request in normal operation */
+#define MSG_CONNECT_ASYNC 0x00000040
+#define MSG_CONNECT_NEXT_VER 0x00000080 /* set in an initial connect request in normal operation */
+#define MSG_CONNECT_TRANSNO 0x00000100
+----
+In normal operation an initial request to connect will set
+'pb_op_flags' to MSG_CONNECT_INITIAL and MSG_CONNECT_NEXT_VER. The
+reply to that connection request (and all other, non-connect, requests
+and replies) will set 'pb_op_flags' to 0.
+
+The 'pb_conn_cnt' (connection count) value in a request message
+reports the client's "era", which is part of the client and server's
+shared state. The value of the era is initialized to one when it is
+first connected to the MDT. Each subsequent connection (after an
+eviction) increments the era for the client. Since the 'pb_conn_cnt'
+reflects the client's era at the time the message was composed the
+server can use this value to discard late-arriving messages requesting
+operations on out-of-date shared state.
+
+The 'pb_timeout' value in a request indicates how long (in seconds)
+the requester plans to wait before timing out the operation. That is,
+the corresponding reply for this message should arrive within this
+time frame. The service may extend this time frame via an "early
+reply", which is a reply to this message that notifies the requester
+that it should extend its timeout interval by the value of the
+'pb_timeout' field in the reply. The "early reply" does not indicate
+the operation has actually been initiated. Clients maintain multiple
+request queues, called "portals", and each type of operation is
+assigned to one of these queues. There is a timeout value associated
+with each queue, and the timeout update affects all the messages
+associated with the given queue, not just the specific message that
+initiated the request. Finally, in a reply message (one that does
+indicate the operation has been initiated) the timeout value updates
+the timeout interval for the queue. Is this last point different from
+the "early reply" update?
+
+The 'pb_service_time' value is zero in a request. In a reply it
+indicates how long this particular operation actually took from the
+time it first arrived in the request queue (at the service) to the
+time the server replied. Note that the client can use this value and
+the local elapsed time for the operation to calculate network latency.
+
+The 'pb_limit' value is zero in a request. In a reply it is a value
+sent from a lock service to a client to set the maximum number of
+locks available to the client. When dynamic lock LRU's are enabled
+this allows for managing the size of the LRU.
+
+The 'pb_slv' value is zero in a request. On a DLM service, the "server
+lock volume" is a value that characterizes (estimates) the amount of
+traffic, or load, on that lock service. It is calculated as the
+product of the number of locks and their age. In a reply, the 'pb_slv'
+value indicates to the client the available share of the total lock
+load on the server that the client is allowed to consume. The client
+is then responsible for reducing its number (or age) of locks to
+stay within this limit.
+
+The array of 'pb_pre_versions' values has four entries. They are
+always zero in a new request message. They are also zero in replies to
+operations that do not modify the file system. For an operation that
+does modify the file system, the reply encodes the most recent
+transaction numbers for the objects modified by this operation, and
+the 'pb_pre_versions' values are copied into the original request when
+the reply arrives. If the request needs to be replayed then the
+updated 'pb_pre_versions' values accompany the replayed request.
+
+'pb_padding' is reserved for future use.
+
+The 'pb_jobid' (string) value gives a unique identifier associated
+with the process on behalf of which this message was generated. The
+identifier is assigned to the user process by a job scheduler, if any.
+
+Object Based Disk UUID
+^^^^^^^^^^^^^^^^^^^^^^
+
+----
+#define UUID_MAX 40 /* size in bytes of the uuid array */
+struct obd_uuid {
+ char uuid[UUID_MAX]; /* NOTE(review): presumably a NUL-terminated string -- confirm */
+};
+----
+
+OST ID
+^^^^^^
+
+----
+struct ost_id {
+ union { /* oi and oi_fid occupy the same storage */
+ struct ostid {
+ __u64 oi_id;
+ __u64 oi_seq;
+ } oi;
+ struct lu_fid oi_fid;
+ } LUSTRE_ANONYMOUS_UNION_NAME; /* NOTE(review): macro defined outside this section; presumably names the union only where anonymous unions are unsupported -- confirm */
+};
+----
+