1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see
20 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
30 * Use is subject to license terms.
32 * Copyright (c) 2011 Whamcloud, Inc.
36 * This file is part of Lustre, http://www.lustre.org/
37 * Lustre is a trademark of Sun Microsystems, Inc.
39 * lustre/include/lustre_disk.h
41 * Lustre disk format definitions.
43 * Author: Nathan Rutman <nathan@clusterfs.com>
46 #ifndef _LUSTRE_DISK_H
47 #define _LUSTRE_DISK_H
49 /** \defgroup disk disk
54 #include <libcfs/libcfs.h>
55 #include <lnet/types.h>
57 /****************** on-disk files *********************/
59 #define MDT_LOGS_DIR "LOGS" /* COMPAT_146 */
60 #define MOUNT_CONFIGS_DIR "CONFIGS"
61 #define CONFIGS_FILE "mountdata"
62 /** Persistent mount data are stored on the disk in this file.
 *  Adjacent string literals are concatenated by the compiler, so this
 *  expands to "CONFIGS/mountdata". */
63 #define MOUNT_DATA_FILE MOUNT_CONFIGS_DIR"/"CONFIGS_FILE
64 #define LAST_RCVD "last_rcvd"
65 #define LOV_OBJID "lov_objid"
66 #define HEALTH_CHECK "health_check"
67 #define CAPA_KEYS "capa_keys"
68 #define CHANGELOG_USERS "changelog_users"
69 #define MGS_NIDTBL_DIR "NIDTBL_VERSIONS"
72 /****************** persistent mount data *********************/
74 #define LDD_F_SV_TYPE_MDT 0x0001
75 #define LDD_F_SV_TYPE_OST 0x0002
76 #define LDD_F_SV_TYPE_MGS 0x0004
77 #define LDD_F_SV_TYPE_MASK (LDD_F_SV_TYPE_MDT | \
80 #define LDD_F_SV_ALL 0x0008
81 /** need an index assignment */
82 #define LDD_F_NEED_INDEX 0x0010
83 /** never registered */
84 #define LDD_F_VIRGIN 0x0020
85 /** update the config logs for this server */
86 #define LDD_F_UPDATE 0x0040
87 /** rewrite the LDD */
88 #define LDD_F_REWRITE_LDD 0x0080
89 /** regenerate config logs for this fs or server */
90 #define LDD_F_WRITECONF 0x0100
92 #define LDD_F_UPGRADE14 0x0200
93 /** process as lctl conf_param */
94 #define LDD_F_PARAM 0x0400
95 /** backend fs make use of IAM directory format. */
96 #define LDD_F_IAM_DIR 0x0800
97 /** all nodes are specified as service nodes */
98 #define LDD_F_NO_PRIMNODE 0x1000
100 #define LDD_F_IR_CAPABLE 0x2000
101 /** the MGS refused to register the target. */
102 #define LDD_F_ERROR 0x4000
104 /* opc for target register; the three opcodes below all live in the top
 * four bits, which is exactly what LDD_F_OPC_MASK selects */
105 #define LDD_F_OPC_REG 0x10000000
106 #define LDD_F_OPC_UNREG 0x20000000
107 #define LDD_F_OPC_READY 0x40000000
108 #define LDD_F_OPC_MASK 0xf0000000
/** Server-type bits plus LDD_F_IAM_DIR.
 *  NOTE(review): presumably the only flags preserved in the on-disk copy of
 *  ldd_flags - confirm against the code that writes the LDD. */
110 #define LDD_F_ONDISK_MASK (LDD_F_SV_TYPE_MASK | LDD_F_IAM_DIR)
112 enum ldd_mount_type {
/**
 * Return the name string for mount type \a mt, looked up by using \a mt
 * directly as an index into a function-local static table of strings.
 * NOTE(review): no range check on \a mt is visible here - confirm callers
 * only pass valid enum ldd_mount_type values.
 */
121 static inline char *mt_str(enum ldd_mount_type mt)
123 static char *mount_type_string[] = {
130 return mount_type_string[mt];
/* NOTE(review): both *_SUPP values are 0; presumably these are the masks of
 * ldd_feature_incompat / ldd_feature_rocompat bits this code understands -
 * confirm against the code that reads the LDD. */
133 #define LDD_INCOMPAT_SUPP 0
134 #define LDD_ROCOMPAT_SUPP 0
/* NOTE(review): presumably the value that marks a valid struct
 * lustre_disk_data - confirm. */
136 #define LDD_MAGIC 0x1dd00001
138 /* On-disk configuration file. In host-endian order.
 * The bracketed numbers in front of the trailing members are byte offsets
 * from the start of the structure: ldd_userdata runs from 200 to 1024,
 * ldd_padding from 1024 to 4096, ldd_mount_opts from 4096 to 8192 and
 * ldd_params from 8192 to 12288. */
139 struct lustre_disk_data {
141 __u32 ldd_feature_compat; /* compatible feature flags */
142 __u32 ldd_feature_rocompat;/* read-only compatible feature flags */
143 __u32 ldd_feature_incompat;/* incompatible feature flags */
145 __u32 ldd_config_ver; /* config rewrite count - not used */
146 __u32 ldd_flags; /* LDD_F_* flags, e.g. LDD_F_SV_TYPE_* */
147 __u32 ldd_svindex; /* server index (0001), must match
149 __u32 ldd_mount_type; /* target fs type LDD_MT_* */
150 char ldd_fsname[64]; /* filesystem this server is part of,
152 char ldd_svname[64]; /* this server's name (lustre-mdt0001)*/
153 __u8 ldd_uuid[40]; /* server UUID (COMPAT_146) */
155 /*200*/ char ldd_userdata[1024 - 200]; /* arbitrary user string */
156 /*1024*/__u8 ldd_padding[4096 - 1024];
157 /*4096*/char ldd_mount_opts[4096]; /* target fs mount opts */
158 /*8192*/char ldd_params[4096]; /* key=value pairs */
/* Server-type tests on a pointer to struct lustre_disk_data. Each expands to
 * the masked ldd_flags value, i.e. it is non-zero (not necessarily 1) when
 * the corresponding LDD_F_SV_TYPE_* bit is set. MT_STR yields mt_str() of the
 * record's ldd_mount_type. */
161 #define IS_MDT(data) ((data)->ldd_flags & LDD_F_SV_TYPE_MDT)
162 #define IS_OST(data) ((data)->ldd_flags & LDD_F_SV_TYPE_OST)
163 #define IS_MGS(data) ((data)->ldd_flags & LDD_F_SV_TYPE_MGS)
164 #define MT_STR(data) mt_str((data)->ldd_mount_type)
166 /* Make the mdt/ost server obd name based on the filesystem name.
 * Behaviour visible here:
 *  - MDT or OST type bit set and LDD_F_SV_ALL clear: writes into name at
 *    most 8 characters of fs, a '-', "MDT" or "OST" (MDT wins if both bits
 *    are set) and a zero-padded hex number of at least 4 digits;
 *  - otherwise, MGS type bit set: writes "MGS";
 *  - an unknown server type is reported through CERROR.
 * NOTE(review): sprintf() is unbounded, so the caller's buffer must be large
 * enough. The number printed is presumably index, and the return values are
 * not visible in this view - confirm. */
167 static inline int server_make_name(__u32 flags, __u16 index, char *fs,
170 if (flags & (LDD_F_SV_TYPE_MDT | LDD_F_SV_TYPE_OST)) {
171 if (!(flags & LDD_F_SV_ALL))
172 sprintf(name, "%.8s-%s%04x", fs,
173 (flags & LDD_F_SV_TYPE_MDT) ? "MDT" : "OST",
175 } else if (flags & LDD_F_SV_TYPE_MGS) {
176 sprintf(name, "MGS");
178 CERROR("unknown server type %#x\n", flags);
184 /* Get the index from the obd name.
 * NOTE(review): implemented elsewhere; presumably stores the parsed index in
 * *idx and the position following it in *endptr - confirm against the
 * implementation, including the meaning of the return value. */
185 int server_name2index(char *svname, __u32 *idx, char **endptr);
188 /****************** mount command *********************/
190 /* The lmd is only used internally by Lustre; mount simply passes
191 everything as string options */
193 #define LMD_MAGIC 0xbdacbd03
195 /* gleaned from the mount command - no persistent info here */
196 struct lustre_mount_data {
198 __u32 lmd_flags; /* lustre mount flags (LMD_FLG_*) */
199 int lmd_mgs_failnodes; /* mgs failover node count */
200 int lmd_exclude_count; /* presumably entries in lmd_exclude - confirm */
201 int lmd_recovery_time_soft;
202 int lmd_recovery_time_hard;
203 char *lmd_dev; /* device name */
204 char *lmd_profile; /* client only */
205 char *lmd_mgssec; /* sptlrpc flavor to mgs */
206 char *lmd_opts; /* lustre mount options (as opposed to
207 _device_ mount options) */
208 __u32 *lmd_exclude; /* array of OSTs to ignore */
211 #define LMD_FLG_SERVER 0x0001 /* Mounting a server */
212 #define LMD_FLG_CLIENT 0x0002 /* Mounting a client */
213 #define LMD_FLG_ABORT_RECOV 0x0008 /* Abort recovery */
214 #define LMD_FLG_NOSVC 0x0010 /* Only start MGS/MGC for servers,
216 #define LMD_FLG_NOMGS 0x0020 /* Only start target for servers, reusing
217 existing MGS services */
218 #define LMD_FLG_WRITECONF 0x0040 /* Rewrite config log */
219 #define LMD_FLG_NOIR 0x0080 /* NO imperative recovery */
/* Non-zero (the masked bit itself, not 1) when x->lmd_flags has
 * LMD_FLG_CLIENT set. */
221 #define lmd_is_client(x) ((x)->lmd_flags & LMD_FLG_CLIENT)
224 /****************** last_rcvd file *********************/
226 /** version recovery epoch */
227 #define LR_EPOCH_BITS 32
/** epoch part of \a a: everything above its low LR_EPOCH_BITS bits */
228 #define lr_epoch(a) ((a) >> LR_EPOCH_BITS)
229 #define LR_EXPIRE_INTERVALS 16 /**< number of intervals to track transno */
230 #define ENOENT_VERSION 1 /** 'virtual' version of non-existent object */
232 #define LR_SERVER_SIZE 512 /* bytes of struct lr_server_data incl. lsd_padding */
233 #define LR_CLIENT_START 8192 /* presumably offset of the first client record - confirm */
234 #define LR_CLIENT_SIZE 128 /* bytes of struct lsd_client_data incl. lcd_padding */
235 #if LR_CLIENT_START < LR_SERVER_SIZE
236 #error "Can't have LR_CLIENT_START < LR_SERVER_SIZE"
240 * This limit is arbitrary (131072 clients on x86), but it is convenient to use
241 * 2^n * CFS_PAGE_SIZE * 8 for the number of bits that fit an order-n allocation.
242 * If we need more than 131072 clients (order-2 allocation on x86) then this
243 * should become an array of single-page pointers that are allocated on demand.
245 #if (128 * 1024UL) > (CFS_PAGE_SIZE * 8)
246 #define LR_MAX_CLIENTS (128 * 1024UL)
248 #define LR_MAX_CLIENTS (CFS_PAGE_SIZE * 8)
251 /** COMPAT_146: this is an OST (temporary) */
252 #define OBD_COMPAT_OST 0x00000002
253 /** COMPAT_146: this is an MDT (temporary) */
254 #define OBD_COMPAT_MDT 0x00000004
255 /** 2.0 server, interop flag to show server version is changed */
256 #define OBD_COMPAT_20 0x00000008
258 /** MDS handles LOV_OBJID file */
259 #define OBD_ROCOMPAT_LOVOBJID 0x00000001
261 /** OST handles group subdirs */
262 #define OBD_INCOMPAT_GROUPS 0x00000001
263 /** this is an OST */
264 #define OBD_INCOMPAT_OST 0x00000002
265 /** this is an MDT */
266 #define OBD_INCOMPAT_MDT 0x00000004
267 /** common last_rcvd format */
268 #define OBD_INCOMPAT_COMMON_LR 0x00000008
269 /** FID is enabled */
270 #define OBD_INCOMPAT_FID 0x00000010
271 /** Size-on-MDS is enabled */
272 #define OBD_INCOMPAT_SOM 0x00000020
273 /** filesystem using iam format to store directory entries */
274 #define OBD_INCOMPAT_IAM_DIR 0x00000040
275 /** LMA attribute contains per-inode incompatible flags */
276 #define OBD_INCOMPAT_LMA 0x00000080
279 /* Data stored per server at the head of the last_rcvd file. Multi-byte
280 fields are little-endian on disk; lsd_le_to_cpu()/lsd_cpu_to_le() convert
 each field with the helper matching its width (16, 32 or 64 bits).
 This should be common to filter_internal.h, lustre_mds.h.
 The named fields add up to 288 bytes; lsd_padding extends the record to
 LR_SERVER_SIZE bytes. */
281 struct lr_server_data {
282 __u8 lsd_uuid[40]; /* server UUID */
283 __u64 lsd_last_transno; /* last completed transaction ID */
284 __u64 lsd_compat14; /* reserved - compat with old last_rcvd */
285 __u64 lsd_mount_count; /* incarnation number */
286 __u32 lsd_feature_compat; /* compatible feature flags */
287 __u32 lsd_feature_rocompat;/* read-only compatible feature flags */
288 __u32 lsd_feature_incompat;/* incompatible feature flags */
289 __u32 lsd_server_size; /* size of server data area */
290 __u32 lsd_client_start; /* start of per-client data area */
291 __u16 lsd_client_size; /* size of per-client data area */
292 __u16 lsd_subdir_count; /* number of subdirectories for objects */
293 __u64 lsd_catalog_oid; /* recovery catalog object id */
294 __u32 lsd_catalog_ogen; /* recovery catalog inode generation */
295 __u8 lsd_peeruuid[40]; /* UUID of MDS associated with this OST */
296 __u32 lsd_ost_index; /* index number of OST in LOV */
297 __u32 lsd_mdt_index; /* index number of MDT in LMV */
298 __u32 lsd_start_epoch; /* VBR: start epoch from last boot */
299 /** transaction values since lsd_trans_table_time */
300 __u64 lsd_trans_table[LR_EXPIRE_INTERVALS];
301 /** start point of transno table below */
302 __u32 lsd_trans_table_time; /* time of first slot in table above */
303 __u32 lsd_expire_intervals; /* LR_EXPIRE_INTERVALS */
304 __u8 lsd_padding[LR_SERVER_SIZE - 288];
307 /* Data stored per client in the last_rcvd file. Multi-byte fields are
 * little-endian on disk; lcd_le_to_cpu()/lcd_cpu_to_le() convert each field
 * with the helper matching its width. The named fields add up to 128 bytes,
 * so with LR_CLIENT_SIZE == 128 lcd_padding is a zero-length array. */
308 struct lsd_client_data {
309 __u8 lcd_uuid[40]; /* client UUID */
310 __u64 lcd_last_transno; /* last completed transaction ID */
311 __u64 lcd_last_xid; /* xid for the last transaction */
312 __u32 lcd_last_result; /* result from last RPC */
313 __u32 lcd_last_data; /* per-op data (disposition for open &c.) */
314 /* for MDS_CLOSE requests */
315 __u64 lcd_last_close_transno; /* last completed transaction ID */
316 __u64 lcd_last_close_xid; /* xid for the last transaction */
317 __u32 lcd_last_close_result; /* result from last RPC */
318 __u32 lcd_last_close_data; /* per-op data */
319 /* VBR: last versions */
320 __u64 lcd_pre_versions[4];
321 __u32 lcd_last_epoch;
322 /** orphans handling for delayed export rely on that */
323 __u32 lcd_first_epoch;
324 __u8 lcd_padding[LR_CLIENT_SIZE - 128];
327 /* bug20354: the lcd_uuid for export of clients may be wrong.
 * If lcd_uuid contains no NUL anywhere in its buffer, force-terminate it at
 * the last byte and report the (now truncated) UUID, the obd name and the
 * record index on the console. Records with a terminated UUID are left
 * untouched. The two message literals are concatenated by the compiler, so
 * the first one must end in a space to keep "exports stored" apart. */
328 static inline void check_lcd(char *obd_name, int index,
329 struct lsd_client_data *lcd)
331 int length = sizeof(lcd->lcd_uuid);
332 if (strnlen((char*)lcd->lcd_uuid, length) == length) {
333 lcd->lcd_uuid[length - 1] = '\0';
335 LCONSOLE_ERROR("the client UUID (%s) on %s for exports "
336 "stored in last_rcvd(index = %d) is bad!\n",
337 lcd->lcd_uuid, obd_name, index);
341 /* last_rcvd handling */
/* Convert a server record from its on-disk little-endian form (buf) into
 * CPU byte order (lsd). The two UUID arrays are copied as raw bytes; every
 * integer field goes through the le16/le32/le64_to_cpu helper matching its
 * width, and the transaction table is converted slot by slot. No visible
 * line touches lsd_padding. */
342 static inline void lsd_le_to_cpu(struct lr_server_data *buf,
343 struct lr_server_data *lsd)
346 memcpy(lsd->lsd_uuid, buf->lsd_uuid, sizeof (lsd->lsd_uuid));
347 lsd->lsd_last_transno = le64_to_cpu(buf->lsd_last_transno);
348 lsd->lsd_compat14 = le64_to_cpu(buf->lsd_compat14);
349 lsd->lsd_mount_count = le64_to_cpu(buf->lsd_mount_count);
350 lsd->lsd_feature_compat = le32_to_cpu(buf->lsd_feature_compat);
351 lsd->lsd_feature_rocompat = le32_to_cpu(buf->lsd_feature_rocompat);
352 lsd->lsd_feature_incompat = le32_to_cpu(buf->lsd_feature_incompat);
353 lsd->lsd_server_size = le32_to_cpu(buf->lsd_server_size);
354 lsd->lsd_client_start = le32_to_cpu(buf->lsd_client_start);
355 lsd->lsd_client_size = le16_to_cpu(buf->lsd_client_size);
356 lsd->lsd_subdir_count = le16_to_cpu(buf->lsd_subdir_count);
357 lsd->lsd_catalog_oid = le64_to_cpu(buf->lsd_catalog_oid);
358 lsd->lsd_catalog_ogen = le32_to_cpu(buf->lsd_catalog_ogen);
359 memcpy(lsd->lsd_peeruuid, buf->lsd_peeruuid, sizeof(lsd->lsd_peeruuid));
360 lsd->lsd_ost_index = le32_to_cpu(buf->lsd_ost_index);
361 lsd->lsd_mdt_index = le32_to_cpu(buf->lsd_mdt_index);
362 lsd->lsd_start_epoch = le32_to_cpu(buf->lsd_start_epoch);
363 for (i = 0; i < LR_EXPIRE_INTERVALS; i++)
364 lsd->lsd_trans_table[i] = le64_to_cpu(buf->lsd_trans_table[i]);
365 lsd->lsd_trans_table_time = le32_to_cpu(buf->lsd_trans_table_time);
366 lsd->lsd_expire_intervals = le32_to_cpu(buf->lsd_expire_intervals);
/* Inverse of lsd_le_to_cpu(): convert a server record from CPU byte order
 * (lsd) into its on-disk little-endian form (buf). Note the argument order:
 * the source comes first here, the destination second. UUID arrays are
 * copied as raw bytes; no visible line touches lsd_padding. */
369 static inline void lsd_cpu_to_le(struct lr_server_data *lsd,
370 struct lr_server_data *buf)
373 memcpy(buf->lsd_uuid, lsd->lsd_uuid, sizeof (buf->lsd_uuid));
374 buf->lsd_last_transno = cpu_to_le64(lsd->lsd_last_transno);
375 buf->lsd_compat14 = cpu_to_le64(lsd->lsd_compat14);
376 buf->lsd_mount_count = cpu_to_le64(lsd->lsd_mount_count);
377 buf->lsd_feature_compat = cpu_to_le32(lsd->lsd_feature_compat);
378 buf->lsd_feature_rocompat = cpu_to_le32(lsd->lsd_feature_rocompat);
379 buf->lsd_feature_incompat = cpu_to_le32(lsd->lsd_feature_incompat);
380 buf->lsd_server_size = cpu_to_le32(lsd->lsd_server_size);
381 buf->lsd_client_start = cpu_to_le32(lsd->lsd_client_start);
382 buf->lsd_client_size = cpu_to_le16(lsd->lsd_client_size);
383 buf->lsd_subdir_count = cpu_to_le16(lsd->lsd_subdir_count);
384 buf->lsd_catalog_oid = cpu_to_le64(lsd->lsd_catalog_oid);
385 buf->lsd_catalog_ogen = cpu_to_le32(lsd->lsd_catalog_ogen);
386 memcpy(buf->lsd_peeruuid, lsd->lsd_peeruuid, sizeof(buf->lsd_peeruuid));
387 buf->lsd_ost_index = cpu_to_le32(lsd->lsd_ost_index);
388 buf->lsd_mdt_index = cpu_to_le32(lsd->lsd_mdt_index);
389 buf->lsd_start_epoch = cpu_to_le32(lsd->lsd_start_epoch);
390 for (i = 0; i < LR_EXPIRE_INTERVALS; i++)
391 buf->lsd_trans_table[i] = cpu_to_le64(lsd->lsd_trans_table[i]);
392 buf->lsd_trans_table_time = cpu_to_le32(lsd->lsd_trans_table_time);
393 buf->lsd_expire_intervals = cpu_to_le32(lsd->lsd_expire_intervals);
/* Convert a per-client record from its on-disk little-endian form (buf) into
 * CPU byte order (lcd). lcd_uuid is copied as raw bytes; each integer field,
 * including the four lcd_pre_versions slots, is converted with the helper
 * matching its width. */
396 static inline void lcd_le_to_cpu(struct lsd_client_data *buf,
397 struct lsd_client_data *lcd)
399 memcpy(lcd->lcd_uuid, buf->lcd_uuid, sizeof (lcd->lcd_uuid));
400 lcd->lcd_last_transno = le64_to_cpu(buf->lcd_last_transno);
401 lcd->lcd_last_xid = le64_to_cpu(buf->lcd_last_xid);
402 lcd->lcd_last_result = le32_to_cpu(buf->lcd_last_result);
403 lcd->lcd_last_data = le32_to_cpu(buf->lcd_last_data);
404 lcd->lcd_last_close_transno = le64_to_cpu(buf->lcd_last_close_transno);
405 lcd->lcd_last_close_xid = le64_to_cpu(buf->lcd_last_close_xid);
406 lcd->lcd_last_close_result = le32_to_cpu(buf->lcd_last_close_result);
407 lcd->lcd_last_close_data = le32_to_cpu(buf->lcd_last_close_data);
408 lcd->lcd_pre_versions[0] = le64_to_cpu(buf->lcd_pre_versions[0]);
409 lcd->lcd_pre_versions[1] = le64_to_cpu(buf->lcd_pre_versions[1]);
410 lcd->lcd_pre_versions[2] = le64_to_cpu(buf->lcd_pre_versions[2]);
411 lcd->lcd_pre_versions[3] = le64_to_cpu(buf->lcd_pre_versions[3]);
412 lcd->lcd_last_epoch = le32_to_cpu(buf->lcd_last_epoch);
413 lcd->lcd_first_epoch = le32_to_cpu(buf->lcd_first_epoch);
/* Inverse of lcd_le_to_cpu(): convert a per-client record from CPU byte
 * order (lcd) into its on-disk little-endian form (buf). Note the argument
 * order: the source comes first here, the destination second. lcd_uuid is
 * copied as raw bytes. */
416 static inline void lcd_cpu_to_le(struct lsd_client_data *lcd,
417 struct lsd_client_data *buf)
419 memcpy(buf->lcd_uuid, lcd->lcd_uuid, sizeof (lcd->lcd_uuid));
420 buf->lcd_last_transno = cpu_to_le64(lcd->lcd_last_transno);
421 buf->lcd_last_xid = cpu_to_le64(lcd->lcd_last_xid);
422 buf->lcd_last_result = cpu_to_le32(lcd->lcd_last_result);
423 buf->lcd_last_data = cpu_to_le32(lcd->lcd_last_data);
424 buf->lcd_last_close_transno = cpu_to_le64(lcd->lcd_last_close_transno);
425 buf->lcd_last_close_xid = cpu_to_le64(lcd->lcd_last_close_xid);
426 buf->lcd_last_close_result = cpu_to_le32(lcd->lcd_last_close_result);
427 buf->lcd_last_close_data = cpu_to_le32(lcd->lcd_last_close_data);
428 buf->lcd_pre_versions[0] = cpu_to_le64(lcd->lcd_pre_versions[0]);
429 buf->lcd_pre_versions[1] = cpu_to_le64(lcd->lcd_pre_versions[1]);
430 buf->lcd_pre_versions[2] = cpu_to_le64(lcd->lcd_pre_versions[2]);
431 buf->lcd_pre_versions[3] = cpu_to_le64(lcd->lcd_pre_versions[3]);
432 buf->lcd_last_epoch = cpu_to_le32(lcd->lcd_last_epoch);
433 buf->lcd_first_epoch = cpu_to_le32(lcd->lcd_first_epoch);
/* Return the larger of lcd_last_transno and lcd_last_close_transno, i.e. the
 * highest transaction number recorded for this client on either the regular
 * or the MDS_CLOSE slot. */
436 static inline __u64 lcd_last_transno(struct lsd_client_data *lcd)
438 return (lcd->lcd_last_transno > lcd->lcd_last_close_transno ?
439 lcd->lcd_last_transno : lcd->lcd_last_close_transno);
/* Return the larger of lcd_last_xid and lcd_last_close_xid, i.e. the highest
 * xid recorded for this client on either the regular or the MDS_CLOSE slot. */
442 static inline __u64 lcd_last_xid(struct lsd_client_data *lcd)
444 return (lcd->lcd_last_xid > lcd->lcd_last_close_xid ?
445 lcd->lcd_last_xid : lcd->lcd_last_close_xid);
448 /****************** superblock additional info *********************/
/* Per-superblock Lustre state. s2lsi() obtains it from a super_block by
 * casting sb->s_fs_info. */
453 struct lustre_sb_info {
455 struct obd_device *lsi_mgc; /* mgc obd */
456 struct lustre_mount_data *lsi_lmd; /* mount command info */
457 struct lustre_disk_data *lsi_ldd; /* mount info on-disk */
458 struct ll_sb_info *lsi_llsbi; /* add'l client sbi info */
459 struct vfsmount *lsi_srv_mnt; /* the one server mount */
460 cfs_atomic_t lsi_mounts; /* references to the srv_mnt */
461 struct backing_dev_info lsi_bdi; /* each client mountpoint needs
462 own backing_dev_info */
/* NOTE(review): presumably bit values for a flags member of struct
 * lustre_sb_info that is not visible in this view - confirm. */
465 #define LSI_SERVER 0x00000001
466 #define LSI_UMOUNT_FORCE 0x00000010
467 #define LSI_UMOUNT_FAILOVER 0x00000020
468 #define LSI_BDI_INITIALIZED 0x00000040
469 #define LSI_IR_CAPABLE 0x00000080
/* s2lsi() yields sb->s_fs_info cast to struct lustre_sb_info *;
 * s2lsi_nocast() yields the same member with its declared type, without the
 * cast. */
471 #define s2lsi(sb) ((struct lustre_sb_info *)((sb)->s_fs_info))
472 #define s2lsi_nocast(sb) ((sb)->s_fs_info)
/* Profile name from the mount data attached to sb. Dereferences lsi_lmd
 * without any NULL check. */
474 #define get_profile_name(sb) (s2lsi(sb)->lsi_lmd->lmd_profile)
476 #endif /* __KERNEL__ */
478 /****************** mount lookup info *********************/
/* Associates a super_block and its vfsmount; this is the type returned by
 * the server_get_mount*() and server_find_mount_locked() lookups declared
 * in this header.
 * NOTE(review): lmi_list_chain presumably links all such records into one
 * list, and a name member may exist on a line not visible here - confirm. */
480 struct lustre_mount_info {
482 struct super_block *lmi_sb;
483 struct vfsmount *lmi_mnt;
484 cfs_list_t lmi_list_chain;
487 /****************** prototypes *********************/
/* Registration hooks: each takes a callback operating on a super_block.
 * NOTE(review): all functions below are implemented elsewhere; their exact
 * semantics, locking rules and return conventions cannot be determined from
 * this header - confirm against the implementation before relying on them. */
492 void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb,
493 struct vfsmount *mnt));
494 void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb));
497 int lustre_common_put_super(struct super_block *sb);
498 struct lustre_mount_info *server_find_mount_locked(const char *name);
499 struct lustre_mount_info *server_get_mount(const char *name);
500 struct lustre_mount_info *server_get_mount_2(const char *name);
501 int server_put_mount(const char *name, struct vfsmount *mnt);
502 int server_put_mount_2(const char *name, struct vfsmount *mnt);
503 int server_register_target(struct super_block *sb);
/* forward declaration so the prototype below can take a pointer to it */
504 struct mgs_target_info;
505 int server_mti_print(char *title, struct mgs_target_info *mti);
506 void server_calc_timeout(struct lustre_sb_info *lsi, struct obd_device *obd);
509 int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, int type);
515 #endif // _LUSTRE_DISK_H