-.TH LFS-SETSTRIPIE 1 2015-11-06 "Lustre" "Lustre Utilities"
+.TH LFS-SETSTRIPE 1 2017-08-23 "Lustre" "Lustre Utilities"
.SH NAME
-lfs setstripe \- set striping pattern of a file.
+lfs setstripe \- set striping pattern of a file or directory default
.SH SYNOPSIS
-.B lfs setstripe [\fISTRIPE_OPTIONS\fR] <directory|filename>
+.B lfs setstripe \fR[\fISTRIPE_OPTIONS\fR] <\fIdirectory\fR|\fIfilename\fR>
.br
-.B lfs setstripe -d <directory>
+.B lfs setstripe \fR{\fB--component-end\fR|\fB-E \fIend1\fR} [\fISTRIPE_OPTIONS\fR]
+[{\fB--component-end\fR|\fB-E \fIend2\fR} [\fISTRIPE_OPTIONS\fR] ...] <\fIfilename\fR>
.br
-.B lfs setstripe <--component-end|-E end1> [\fISTRIPE_OPTIONS\fR] \
-[<--component-end|-E end2> [\fISTRIPE_OPTIONS\fR] ...] <filename>
+.B lfs setstripe --component-add \fR{\fB--component-end\fR|\fB-E \fIend1\fR}
+[\fISTRIPE_OPTIONS\fR] [{\fB--component-end\fR|\fB-E \fIend2\fR} [\fISTRIPE_OPTIONS\fR]
+\&...] <\fIfilename\fR>
.br
-.B lfs setstripe --component-add <--component-end|-E end1> [\fISTRIPE_OPTIONS\fR] \
-[<--component-end|-E end2> [\fISTRIPE_OPTIONS\fR] ...] <filename>
+.B lfs setstripe --component-del \fR{\fB--component-id\fR|\fB-I \fIcomp_id\fR|
+.B --component-flags=\fIcomp_flags\fR} <\fIfilename\fR>
.br
-.B lfs setstripe --component-del <--component-id|-I comp_id | \
---component-flags comp_flags> <filename>
+.B lfs setstripe -d \fR<\fIdirectory\fR>
.br
.SH DESCRIPTION
.TP
-.B lfs setstripe [\fISTRIPE_OPTIONS\fR] <directory|filename>
-Create a file with specified striping pattern, or set default stripping pattern
-to a directory.
-.br
-.TP
-.B lfs setstripe -d <directory>
-.br
-Delete the default striping on the specified directory.
-.TP
-.B lfs setstripe <--component-end|-E end1> [\fISTRIPE_OPTIONS\fR] \
-[<--component-end|-E end2> [\fISTRIPE_OPTIONS\fR] ...] <filename>
+.B lfs setstripe \fR[\fISTRIPE_OPTIONS\fR] <\fIdirectory\fR|\fIfilename\fR>
+Create a file with specified layout, or set or replace the default file
+layout on an existing directory. If the default file layout is set on
+the filesystem root directory, it will be used as the filesystem-wide
+default layout for all files that do not explicitly specify a layout and
+do not have a default layout on the parent directory. The default layout
+set on a directory will be copied to any new subdirectories created within
+that directory at the time they are created.
+.TP
+.B lfs setstripe \fR{\fB--component-end\fR|\fB-E \fIend1\fR} [\fISTRIPE_OPTIONS\fR] \
+[{\fB--component-end\fR|\fB-E \fIend2\fR} [\fISTRIPE_OPTIONS\fR] ...] <\fIfilename\fR>
.br
Create a file with the specified composite layout. Each component defines the
-stripe pattern of the file in the range of [start, end). The first component
-must start from offset 0, and all components must be adjacent with each other,
-no holes are allowed, so each extent will start at the end of previous extent.
-The
-.I -E
+stripe pattern of the file in the range of
+.RI [ start ", " end ].
+The first component implicitly starts at offset 0, and all later components
+start at the end of previous extent. The
+.B -E
option is used to specify the end offset of each component, and it also
-indicates the following \fISTRIPE_OPTIONS\fR are for this component. A -1 end
-offset indicates the EOF.
-.TP
-.B lfs setstripe --component-add <--component-end|-E end1> [\fISTRIPE_OPTIONS\fR] \
-[<--component-end|-E end2> [\fISTRIPE_OPTIONS\fR] ...] <filename>
+indicates the following \fISTRIPE_OPTIONS\fR are for this component. The end
+offset of
+.B -1
+or
+.B eof
+indicates the component extends to the end of file.
+.TP
+.B lfs setstripe --component-add \fR{\fB--component-end\fR|\fB-E \fIend1\fR} [\fISTRIPE_OPTIONS\fR] \
+[{\fB--component-end\fR|\fB-E \fIend2\fR} [\fISTRIPE_OPTIONS\fR] ...] <\fIfilename\fR>
.br
Add components to an existing composite file. The extent start of the first
component to be added is equal to the extent end of last component in existing
-file, and all components to be added must be adjacent with each other.
-.TP
-.B lfs setstripe --component-del <--component-id|-I comp_id | \
---component-flags comp_flags> <filename>
+file, and all components to be added must be adjacent with each other. It is
+not possible to add components incrementally to the default directory layout,
+since the entire default layout can be replaced with one
+.B lfs setstripe
+call.
+.TP
+.B lfs setstripe --component-del \fR{\fB--component-id\fR|\fB-I \fIcomp_id\fR | \
+\fB--component-flags \fIcomp_flags\fR} <\fIfilename\fR>
.br
Remove the component(s) specified by component ID or flags from an existing
-file. The ID specified by
-.I -I
+file. The ID specified by the
+.B -I
option is the numerical unique ID of the component, it can be obtained using
the
.B lfs getstripe
-command.
-.I --component-flags
-option is used to specify certain type of components, such as all instantiated
-ones.
+command. It is not possible to delete components from a default directory
+layout, since the entire default layout can be replaced with one
+.B lfs setstripe
+call.
+The \fB--component-flags\fR option is used to specify certain type of
+components, such as all instantiated ones.
+.TP
+.B lfs setstripe -d \fR<\fIdirectory\fR>
+.br
+Delete the default layout on the specified directory. It is not necessary
+to delete the default layout on a directory before replacing it. This is
+only needed if the directory should revert from a directory-specific layout
+to using the global filesystem default layout stored on the root directory.
.SH STRIPE_OPTIONS
The various stripe related options are listed and explained below:
.TP
-.B -c, --stripe-count <\fIstripe_count\fR>
-The number of OSTs to stripe a file over. 0 means to use the filesystem-wide
-default stripe count (default 1), and -1 means to stripe over all available
-OSTs.
+.B -c\fR, \fB--stripe-count \fR<\fIstripe_count\fR>
+The number of OSTs to stripe a file over. \fB0 \fRmeans to use the
+filesystem-wide default stripe count (default 1), and \fB-1 \fRmeans to stripe
+over all available OSTs.
.TP
-.B -S, --stripe-size <\fIstripe_size\fR>
-The number of bytes to store on each OST before moving to the next OST. 0 means
-to use the filesystem-wide default stripe_size (default 1MB).
+.B -S\fR, \fB--stripe-size \fR<\fIstripe_size\fR>
+The number of bytes to store on each OST before moving to the next OST. \fB0\fR
+means to use the filesystem-wide default stripe_size (default 1MB).
.TP
-.B -i, --stripe-index <\fIstart_ost_index\fR>
-The OST index (starting at 0) on which to start striping for this file. -1
+.B -i\fR, \fB--stripe-index \fR<\fIstart_ost_index\fR>
+The OST index (starting at 0) on which to start striping for this file. \fB-1\fR
allows the MDS to choose the starting index and it is strongly recommended, as
this allows space and load balancing to be done by the MDS as needed.
.TP
-.B -o, --ost-list <\fIost_indices\fR>
+.B -o\fR, \fB--ost-list \fR<\fIost_indices\fR>
Used to specify the exact stripe layout on the file system. \fIost_indices\fR
is a list of OSTs referenced by their indices, which are specified in decimal
or hex form and can be obtained using the
striping the file. Otherwise the striping will occur in the order specified in
.IR ost_indices .
.TP
-.B -p, --pool <\fIpool_name\fR>
+.B -p\fR, \fB--pool \fR<\fIpool_name\fR>
The name of a predefined pool of OSTs (see
.BR lctl (8))
that will be used for striping. The
.I start_ost_index
must be part of the pool or an error will be returned.
.TP
+ .B -L, --layout <\fIlayout type\fR>
+ The type of stripe layout, can be
+ .BR raid0 ", " released ", or " mdt .
+ It is
+ .BR raid0
+ by default. The
+ .BR mdt
+ type allows placing the first component of the file on the MDT where the
+ inode is located. This is used with composite file layouts and can be
+ defined as the first component only. The
+ .IR stripe_size
+ of the MDT part is always equal to the component size. There is also a
+ per-MDT parameter
+ .IR lod.dom_stripesize
+ to limit the maximum size of a DoM stripe, which can be changed with the
+ .BR lctl\ set_param
+ command, e.g.
+ .IR lctl\ set_param\ lod.*.dom_stripesize=0
+ (see
+ .BR lctl (8)).
+ .TP
There are two options available only for \fBlfs migrate\fR:
.TP
-.B -b, --block
+.BR -b , --block
Block file access during data migration (default).
.TP
-.B -n, --non-block
+.BR -n , --non-block
Abort migrations if concurrent access is detected.
.SH COMPONENT_OPTIONS
The various component related options are listed and explained below:
.TP
-.B -E, --component-end <\fIend\fR>
.B -E\fR, \fB--component-end \fR<\fIend\fR>
The end offset of the component,
.I end
is specified in bytes, or using a suffix (kMGTP),
-such as 256M. -1 means the end of file.
+such as 256M. \fB-1\fR means the end of file.
.TP
-.B -I, --component-id <\fIcomp_id\fR>
+.B -I\fR, \fB--component-id \fR<\fIcomp_id\fR>
The numerical unique component id.
.TP
-.B --component-flags <\fIflags\fR>
-Component flags. Available flags: \fBinit\fR: instantiated component.
-\fB^init\fR: uninstantiated component.
+.B --component-flags \fR<\fIflags\fR>
+Component flags. Available \fIflags\fR:
+.RS
+.RS
+.B init\fR: instantiated component.
+.RE
+.RS
+.B ^init\fR: uninstantiated component.
+.RE
+.RE
.TP
.B --component-add
Add specified components to an existing composite file.
This creates a file striped on two OSTs with 128kB on each stripe.
.TP
.B $ lfs setstripe -d /mnt/lustre/dir
-This deletes a default stripe pattern on dir. New files will use the default \
-striping pattern created therein.
+This deletes a default stripe pattern on dir. New files created in that
+directory will use the filesystem global default instead.
.TP
.B $ lfs setstripe -E 4M -c 1 -E 64M -c 4 -E -1 -c -1 /mnt/lustre/file1
This creates a file with composite layout, the component has 1 stripe and \
.TP
.B $ lfs setstripe --component-del -I 1 /mnt/lustre/file1
This deletes the component with ID equals 1 from an existing file.
+ .TP
+ .B $ lfs setstripe -E 1M -L mdt -E -1 /mnt/lustre/file1
+ This creates a file with a Data-on-MDT layout. The first 1M is placed on \
+ the MDT and the rest of the file is placed on OSTs with default striping.
.SH SEE ALSO
.BR lfs (1),
.BR lfs-migrate (1),
struct lu_buf cl_buf;
/** size of layout in lov_mds_md format. */
size_t cl_size;
+ /** size of DoM component if exists or zero otherwise */
+ u64 cl_dom_comp_size;
/** Layout generation. */
u32 cl_layout_gen;
/** whether layout is a composite one */
/** Transient page, the transient cl_page is used to bind a cl_page
* to vmpage which is not belonging to the same object of cl_page.
- * it is used in DirectIO, lockless IO and liblustre. */
+ * it is used in DirectIO and lockless IO. */
CPT_TRANSIENT,
};
* of ldlm_[res_]lvbo_[init,update,fill]() functions.
*/
struct ldlm_valblock_ops {
- int (*lvbo_init)(struct ldlm_resource *res);
- int (*lvbo_update)(struct ldlm_resource *res,
- struct ptlrpc_request *r,
- int increase);
- int (*lvbo_free)(struct ldlm_resource *res);
+ int (*lvbo_init)(struct ldlm_resource *res);
+ int (*lvbo_update)(struct ldlm_resource *res, struct ldlm_lock *lock,
+ struct ptlrpc_request *r, int increase);
+ int (*lvbo_free)(struct ldlm_resource *res);
/* Return size of lvb data appropriate RPC size can be reserved */
int (*lvbo_size)(struct ldlm_lock *lock);
/* Called to fill in lvb data to RPC buffer @buf */
* This allows the client to start caching negative dentries
* for a directory and may save an RPC for a later stat.
*/
- unsigned int ns_ctime_age_limit;
+ time64_t ns_ctime_age_limit;
/**
* Used to rate-limit ldlm_namespace_dump calls.
* \see ldlm_namespace_dump. Increased by 10 seconds every time
* it is called.
*/
- cfs_time_t ns_next_dump;
+ time64_t ns_next_dump;
/** "policy" function that does actual lock conflict determination */
ldlm_res_policy ns_policy;
* The resources in this namespace remember contended state during
* \a ns_contention_time, in seconds.
*/
- unsigned ns_contention_time;
+ time64_t ns_contention_time;
/**
* Limit size of contended extent locks, in bytes.
/** Private storage for lock user. Opaque to LDLM. */
void *l_ast_data;
-
+ /* separate ost_lvb used mostly by Data-on-MDT for now.
+ * It is kept separate to avoid mixing it with layout lock data. */
+ struct ost_lvb l_ost_lvb;
/*
* Server-side-only members.
*/
* under this lock.
* \see ost_rw_prolong_locks
*/
- cfs_time_t l_callback_timeout;
+ time64_t l_callback_timeout;
/** Local PID of process which created this lock. */
__u32 l_pid;
union {
/**
* When the resource was considered as contended,
- * used only on server side. */
- cfs_time_t lr_contention_time;
+ * used only on server side.
+ */
+ time64_t lr_contention_time;
/**
* Associated inode, used only on client side.
*/
lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_LAYOUT;
}
+ /* Return true iff @lock is an inodebits (LDLM_IBITS) lock whose bitmask
+  * includes MDS_INODELOCK_DOM, i.e. it covers Data-on-MDT file data. */
+ static inline bool ldlm_has_dom(struct ldlm_lock *lock)
+ {
+ return lock->l_resource->lr_type == LDLM_IBITS &&
+ lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_DOM;
+ }
+
static inline char *
ldlm_ns_name(struct ldlm_namespace *ns)
{
struct ldlm_res_id lpa_resid;
struct ldlm_extent lpa_extent;
enum ldlm_mode lpa_mode;
- int lpa_timeout;
+ time64_t lpa_timeout;
int lpa_locks_cnt;
int lpa_blocks_cnt;
};
/** @} ldlm_handlers */
void ldlm_revoke_export_locks(struct obd_export *exp);
-unsigned int ldlm_bl_timeout(struct ldlm_lock *lock);
+time64_t ldlm_bl_timeout(struct ldlm_lock *lock);
#endif
int ldlm_del_waiting_lock(struct ldlm_lock *lock);
-int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout);
+int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, time64_t timeout);
int ldlm_get_ref(void);
void ldlm_put_ref(void);
int ldlm_init_export(struct obd_export *exp);
* Update Lock Value Block Operations (LVBO) on a resource taking into account
* data from request \a r
*/
- static inline int ldlm_res_lvbo_update(struct ldlm_resource *res,
- struct ptlrpc_request *req, int increase)
+ static inline int ldlm_lvbo_update(struct ldlm_resource *res,
+ struct ldlm_lock *lock,
+ struct ptlrpc_request *req, int increase)
{
+ struct ldlm_namespace *ns = ldlm_res_to_ns(res);
int rc;
/* delayed lvb init may be required */
return rc;
}
- if (ldlm_res_to_ns(res)->ns_lvbo &&
- ldlm_res_to_ns(res)->ns_lvbo->lvbo_update) {
- return ldlm_res_to_ns(res)->ns_lvbo->lvbo_update(res, req,
- increase);
- }
+ if (ns->ns_lvbo && ns->ns_lvbo->lvbo_update)
+ return ns->ns_lvbo->lvbo_update(res, lock, req, increase);
+
return 0;
}
+ /* Backward-compatible wrapper around ldlm_lvbo_update() for callers that
+  * have no specific lock in hand: passes a NULL lock. */
+ static inline int ldlm_res_lvbo_update(struct ldlm_resource *res,
+ struct ptlrpc_request *req, int increase)
+ {
+ return ldlm_lvbo_update(res, NULL, req, increase);
+ }
+
int ldlm_error2errno(enum ldlm_error error);
enum ldlm_error ldlm_errno2error(int err_no); /* don't call it `errno': this
* confuses user-space. */
int ldlm_proc_setup(void);
#ifdef CONFIG_PROC_FS
void ldlm_proc_cleanup(void);
+
+ /* Map the resource lock type of the incoming enqueue request @dlm_req to
+  * the corresponding per-service enqueue counter and increment it in
+  * @srv_stats. Unrecognized lock types are silently ignored (op stays 0). */
+ static inline void ldlm_svc_get_eopc(const struct ldlm_request *dlm_req,
+ struct lprocfs_stats *srv_stats)
+ {
+ int lock_type = 0, op = 0;
+
+ lock_type = dlm_req->lock_desc.l_resource.lr_type;
+
+ switch (lock_type) {
+ case LDLM_PLAIN:
+ op = PTLRPC_LAST_CNTR + LDLM_PLAIN_ENQUEUE;
+ break;
+ case LDLM_EXTENT:
+ op = PTLRPC_LAST_CNTR + LDLM_EXTENT_ENQUEUE;
+ break;
+ case LDLM_FLOCK:
+ op = PTLRPC_LAST_CNTR + LDLM_FLOCK_ENQUEUE;
+ break;
+ case LDLM_IBITS:
+ op = PTLRPC_LAST_CNTR + LDLM_IBITS_ENQUEUE;
+ break;
+ default:
+ op = 0;
+ break;
+ }
+
+ if (op != 0)
+ lprocfs_counter_incr(srv_stats, op);
+
+ return;
+ }
#else
static inline void ldlm_proc_cleanup(void) {}
+ static inline void ldlm_svc_get_eopc(const struct ldlm_request *dlm_req,
+ struct lprocfs_stats *srv_stats) {}
#endif
/* resource.c - internal */
return ex1->start <= ex2->start && ex1->end >= ex2->end;
}
+ int ldlm_inodebits_drop(struct ldlm_lock *lock, __u64 to_drop);
+
#endif
/** @} LDLM */
struct lu_buf oti_ladvise_buf;
};
+ /* Translate client-layer enqueue flags (CEF_*) into the equivalent set of
+  * LDLM_FL_* bits to be sent with a lock request. Asserts that @enqflags
+  * contains no bits outside CEF_MASK; unmapped CEF_* bits contribute
+  * nothing to the result. */
+ static inline __u64 osc_enq2ldlm_flags(__u32 enqflags)
+ {
+ __u64 result = 0;
+
+ CDEBUG(D_DLMTRACE, "flags: %x\n", enqflags);
+
+ LASSERT((enqflags & ~CEF_MASK) == 0);
+
+ if (enqflags & CEF_NONBLOCK)
+ result |= LDLM_FL_BLOCK_NOWAIT;
+ if (enqflags & CEF_GLIMPSE)
+ result |= LDLM_FL_HAS_INTENT;
+ if (enqflags & CEF_DISCARD_DATA)
+ result |= LDLM_FL_AST_DISCARD_DATA;
+ if (enqflags & CEF_PEEK)
+ result |= LDLM_FL_TEST_LOCK;
+ if (enqflags & CEF_LOCK_MATCH)
+ result |= LDLM_FL_MATCH_LOCK;
+ if (enqflags & CEF_LOCK_NO_EXPAND)
+ result |= LDLM_FL_NO_EXPANSION;
+ if (enqflags & CEF_SPECULATIVE)
+ result |= LDLM_FL_SPECULATIVE;
+ return result;
+ }
+
+ /* Completion callback for an asynchronous lock enqueue; @rc is the enqueue
+  * result, @lockh the resulting lock handle, @cookie caller context. */
+ typedef int (*osc_enqueue_upcall_f)(void *cookie, struct lustre_handle *lockh,
+ int rc);
+
+ /* Context preserved across an asynchronous enqueue and handed to the
+  * upcall on completion. */
+ struct osc_enqueue_args {
+ struct obd_export *oa_exp; /* export the enqueue was issued on */
+ enum ldlm_type oa_type; /* LDLM lock type */
+ enum ldlm_mode oa_mode; /* requested lock mode */
+ __u64 *oa_flags; /* LDLM_FL_* flags for the request */
+ osc_enqueue_upcall_f oa_upcall; /* completion callback */
+ void *oa_cookie; /* opaque argument for oa_upcall */
+ struct ost_lvb *oa_lvb; /* lock value block -- TODO confirm in/out */
+ struct lustre_handle oa_lockh; /* handle of the granted lock */
+ bool oa_speculative; /* speculative (no I/O waiting) enqueue -- confirm */
+ };
+
+ /**
+ * Bit flags for osc_dlm_lock_at_pageoff().
+ */
+ enum osc_dap_flags {
+ /**
+ * Just check if the desired lock exists, it won't hold reference
+ * count on lock.
+ */
+ OSC_DAP_FL_TEST_LOCK = 1 << 0,
+ /**
+ * Return the lock even if it is being canceled.
+ */
+ OSC_DAP_FL_CANCELING = 1 << 1
+ };
+
+ /*
+ * The set of operations which are different for MDC and OSC objects
+ */
+ struct osc_object_operations {
+ void (*oto_build_res_name)(struct osc_object *osc,
+ struct ldlm_res_id *resname);
+ struct ldlm_lock* (*oto_dlmlock_at_pgoff)(const struct lu_env *env,
+ struct osc_object *obj,
+ pgoff_t index,
+ enum osc_dap_flags dap_flags);
+ };
+
struct osc_object {
struct cl_object oo_cl;
struct lov_oinfo *oo_oinfo;
atomic_t oo_nr_ios;
wait_queue_head_t oo_io_waitq;
+ const struct osc_object_operations *oo_obj_ops;
bool oo_initialized;
};
+ /* Fill in @resname with the LDLM resource name for @osc, delegating to the
+  * object-type-specific hook (OSC and MDC objects build names differently,
+  * see struct osc_object_operations). */
+ static inline void osc_build_res_name(struct osc_object *osc,
+ struct ldlm_res_id *resname)
+ {
+ return osc->oo_obj_ops->oto_build_res_name(osc, resname);
+ }
+
+ /* Return the DLM lock covering page @index of @obj, via the per-type hook.
+  * Lookup behavior (test-only, include-canceling) is modified by @flags;
+  * see enum osc_dap_flags for reference-count semantics. */
+ static inline struct ldlm_lock *osc_dlmlock_at_pgoff(const struct lu_env *env,
+ struct osc_object *obj,
+ pgoff_t index,
+ enum osc_dap_flags flags)
+ {
+ return obj->oo_obj_ops->oto_dlmlock_at_pgoff(env, obj, index, flags);
+ }
+
static inline void osc_object_lock(struct osc_object *obj)
{
spin_lock(&obj->oo_lock);
#endif
}
+ /* Mark @obj contended and record when contention was observed. The memory
+  * barrier between the two stores is deliberately commented out --
+  * NOTE(review): confirm that readers tolerate seeing oo_contended set
+  * before oo_contention_time is visible. */
+ static inline void osc_object_set_contended(struct osc_object *obj)
+ {
+ obj->oo_contention_time = cfs_time_current();
+ /* mb(); */
+ obj->oo_contended = 1;
+ }
+
+ /* Clear the contention flag on @obj; oo_contention_time is left as-is. */
+ static inline void osc_object_clear_contended(struct osc_object *obj)
+ {
+ obj->oo_contended = 0;
+ }
+
/*
* Lock "micro-states" for osc layer.
*/
enum osc_lock_state ols_state;
/** lock value block */
struct ost_lvb ols_lvb;
-
+ /** Lockless operations to be used by lockless lock */
+ const struct cl_lock_operations *ols_lockless_ops;
/**
* true, if ldlm_lock_addref() was called against
* osc_lock::ols_lock. This is used for sanity checking.
ols_speculative:1;
};
+ /* A lock is "lockless" when its cl_lock slice uses the lockless operations
+  * vector stashed in ols_lockless_ops; compare by ops pointer identity. */
+ static inline int osc_lock_is_lockless(const struct osc_lock *ols)
+ {
+ return (ols->ols_cl.cls_ops == ols->ols_lockless_ops);
+ }
/**
* Page state private for osc layer.
cfs_time_t ops_submit_time;
};
+/* State carried across an asynchronous BRW (bulk read/write) RPC and
+ * consumed by its completion/resend handling. */
+struct osc_brw_async_args {
+ struct obdo *aa_oa; /* object attributes for this RPC */
+ int aa_requested_nob; /* bytes requested -- TODO confirm vs transferred */
+ int aa_nio_count; /* niobuf count -- presumably; confirm */
+ u32 aa_page_count; /* number of entries in aa_ppga */
+ int aa_resends; /* resend attempts -- TODO confirm */
+ struct brw_page **aa_ppga; /* pages making up the bulk transfer */
+ struct client_obd *aa_cli; /* owning client */
+ struct list_head aa_oaps; /* async pages of this RPC -- confirm linkage */
+ struct list_head aa_exts; /* extents of this RPC -- confirm linkage */
+};
+
extern struct kmem_cache *osc_lock_kmem;
extern struct kmem_cache *osc_object_kmem;
extern struct kmem_cache *osc_thread_kmem;
#define OSC_FLAGS (ASYNC_URGENT|ASYNC_READY)
+ /* osc_page.c */
int osc_page_init(const struct lu_env *env, struct cl_object *obj,
struct cl_page *page, pgoff_t ind);
void osc_index2policy(union ldlm_policy_data *policy, const struct cl_object *obj,
pgoff_t start, pgoff_t end);
void osc_lru_add_batch(struct client_obd *cli, struct list_head *list);
void osc_page_submit(const struct lu_env *env, struct osc_page *opg,
enum cl_req_type crt, int brw_flags);
+ int lru_queue_work(const struct lu_env *env, void *data);
+ long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli,
+ long target, bool force);
+
+ /* osc_cache.c */
int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops);
int osc_set_async_flags(struct osc_object *obj, struct osc_page *opg,
u32 async_flags);
pgoff_t start, pgoff_t end, int hp, int discard);
int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj,
pgoff_t start, pgoff_t end);
- void osc_io_unplug(const struct lu_env *env, struct client_obd *cli,
- struct osc_object *osc);
- int lru_queue_work(const struct lu_env *env, void *data);
+ int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli,
+ struct osc_object *osc, int async);
+ void osc_wake_cache_waiters(struct client_obd *cli);
- void osc_object_set_contended(struct osc_object *obj);
- void osc_object_clear_contended(struct osc_object *obj);
+ /* Asynchronous variant: delegates to osc_io_unplug0() with async=1 and
+  * returns its result. */
+ static inline int osc_io_unplug_async(const struct lu_env *env,
+ struct client_obd *cli,
+ struct osc_object *osc)
+ {
+ return osc_io_unplug0(env, cli, osc, 1);
+ }
+
+ /* Synchronous variant: calls osc_io_unplug0() with async=0 and
+  * intentionally discards the return value. */
+ static inline void osc_io_unplug(const struct lu_env *env,
+ struct client_obd *cli,
+ struct osc_object *osc)
+ {
+ (void)osc_io_unplug0(env, cli, osc, 0);
+ }
+
+ typedef int (*osc_page_gang_cbt)(const struct lu_env *, struct cl_io *,
+ struct osc_page *, void *);
+ int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io,
+ struct osc_object *osc, pgoff_t start, pgoff_t end,
+ osc_page_gang_cbt cb, void *cbdata);
+ int osc_discard_cb(const struct lu_env *env, struct cl_io *io,
+ struct osc_page *ops, void *cbdata);
+
+ /* osc_dev.c */
+ int osc_device_init(const struct lu_env *env, struct lu_device *d,
+ const char *name, struct lu_device *next);
+ struct lu_device *osc_device_fini(const struct lu_env *env,
+ struct lu_device *d);
+ struct lu_device *osc_device_free(const struct lu_env *env,
+ struct lu_device *d);
+
+ /* osc_object.c */
+ int osc_object_init(const struct lu_env *env, struct lu_object *obj,
+ const struct lu_object_conf *conf);
+ void osc_object_free(const struct lu_env *env, struct lu_object *obj);
+ int osc_lvb_print(const struct lu_env *env, void *cookie,
+ lu_printer_t p, const struct ost_lvb *lvb);
+ int osc_object_print(const struct lu_env *env, void *cookie,
+ lu_printer_t p, const struct lu_object *obj);
+ int osc_attr_get(const struct lu_env *env, struct cl_object *obj,
+ struct cl_attr *attr);
+ int osc_attr_update(const struct lu_env *env, struct cl_object *obj,
+ const struct cl_attr *attr, unsigned valid);
+ int osc_object_glimpse(const struct lu_env *env, const struct cl_object *obj,
+ struct ost_lvb *lvb);
+ int osc_object_invalidate(const struct lu_env *env, struct osc_object *osc);
int osc_object_is_contended(struct osc_object *obj);
- int osc_lock_is_lockless(const struct osc_lock *olck);
+ int osc_object_find_cbdata(const struct lu_env *env, struct cl_object *obj,
+ ldlm_iterator_t iter, void *data);
+ int osc_object_prune(const struct lu_env *env, struct cl_object *obj);
+
+ /* osc_request.c */
+ void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd);
+ int osc_setup_common(struct obd_device *obd, struct lustre_cfg *lcfg);
+ int osc_precleanup_common(struct obd_device *obd);
+ int osc_cleanup_common(struct obd_device *obd);
+ int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
+ u32 keylen, void *key, u32 vallen, void *val,
+ struct ptlrpc_request_set *set);
+ int osc_ldlm_resource_invalidate(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ struct hlist_node *hnode, void *arg);
+ int osc_reconnect(const struct lu_env *env, struct obd_export *exp,
+ struct obd_device *obd, struct obd_uuid *cluuid,
+ struct obd_connect_data *data, void *localdata);
+ int osc_disconnect(struct obd_export *exp);
+ int osc_punch_send(struct obd_export *exp, struct obdo *oa,
+ obd_enqueue_update_f upcall, void *cookie);
+
+ /* osc_io.c */
+ int osc_io_submit(const struct lu_env *env, const struct cl_io_slice *ios,
+ enum cl_req_type crt, struct cl_2queue *queue);
+ int osc_io_commit_async(const struct lu_env *env,
+ const struct cl_io_slice *ios,
+ struct cl_page_list *qin, int from, int to,
+ cl_commit_cbt cb);
+ int osc_io_iter_init(const struct lu_env *env, const struct cl_io_slice *ios);
+ void osc_io_iter_fini(const struct lu_env *env,
+ const struct cl_io_slice *ios);
+ int osc_io_write_iter_init(const struct lu_env *env,
+ const struct cl_io_slice *ios);
+ void osc_io_write_iter_fini(const struct lu_env *env,
+ const struct cl_io_slice *ios);
+ int osc_io_fault_start(const struct lu_env *env, const struct cl_io_slice *ios);
+ void osc_io_setattr_end(const struct lu_env *env,
+ const struct cl_io_slice *slice);
+ int osc_io_read_start(const struct lu_env *env,
+ const struct cl_io_slice *slice);
+ int osc_io_write_start(const struct lu_env *env,
+ const struct cl_io_slice *slice);
+ void osc_io_end(const struct lu_env *env, const struct cl_io_slice *slice);
+ int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj,
+ struct cl_fsync_io *fio);
+ void osc_io_fsync_end(const struct lu_env *env,
+ const struct cl_io_slice *slice);
+ void osc_read_ahead_release(const struct lu_env *env, void *cbdata);
+
+ /* osc_lock.c */
+ void osc_lock_to_lockless(const struct lu_env *env, struct osc_lock *ols,
+ int force);
+ void osc_lock_wake_waiters(const struct lu_env *env, struct osc_object *osc,
+ struct osc_lock *oscl);
+ int osc_lock_enqueue_wait(const struct lu_env *env, struct osc_object *obj,
+ struct osc_lock *oscl);
+ void osc_lock_set_writer(const struct lu_env *env, const struct cl_io *io,
+ struct cl_object *obj, struct osc_lock *oscl);
+ int osc_lock_print(const struct lu_env *env, void *cookie,
+ lu_printer_t p, const struct cl_lock_slice *slice);
+ void osc_lock_cancel(const struct lu_env *env,
+ const struct cl_lock_slice *slice);
+ void osc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice);
+ int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data);
/*****************************************************************************
*
unsigned int oe_mppr;
};
- int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext,
- int sent, int rc);
- int osc_extent_release(const struct lu_env *env, struct osc_extent *ext);
-
- int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc,
- pgoff_t start, pgoff_t end, bool discard_pages);
-
- typedef int (*osc_page_gang_cbt)(const struct lu_env *, struct cl_io *,
- struct osc_page *, void *);
- int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io,
- struct osc_object *osc, pgoff_t start, pgoff_t end,
- osc_page_gang_cbt cb, void *cbdata);
/** @} osc */
#endif /* LUSTRE_OSC_H */
#ifndef __OBD_H
#define __OBD_H
+#include <linux/kobject.h>
#include <linux/spinlock.h>
+#include <linux/sysfs.h>
#include <uapi/linux/lustre/lustre_idl.h>
#include <lustre_lib.h>
int typ_refcnt;
struct lu_device_type *typ_lu;
spinlock_t obd_type_lock;
- struct kobject *typ_kobj;
+ struct kobject typ_kobj;
+ struct completion typ_kobj_unregister;
};
struct brw_page {
struct timeout_item {
enum timeout_event ti_event;
- cfs_time_t ti_timeout;
+ time64_t ti_timeout;
timeout_cb_t ti_cb;
void *ti_cb_data;
struct list_head ti_obd_list;
* See osc_{reserve|unreserve}_grant for details. */
long cl_reserved_grant;
struct list_head cl_cache_waiters; /* waiting for cache/grant */
- cfs_time_t cl_next_shrink_grant; /* jiffies */
+ time64_t cl_next_shrink_grant; /* seconds */
struct list_head cl_grant_shrink_list; /* Timeout event list */
- int cl_grant_shrink_interval; /* seconds */
+ time64_t cl_grant_shrink_interval; /* seconds */
/* A chunk is an optimal size used by osc_extent to determine
* the extent size. A chunk is max(PAGE_SIZE, OST block size) */
atomic_t cl_pending_r_pages;
__u32 cl_max_pages_per_rpc;
__u32 cl_max_rpcs_in_flight;
+ __u32 cl_short_io_bytes;
struct obd_histogram cl_read_rpc_hist;
struct obd_histogram cl_write_rpc_hist;
struct obd_histogram cl_read_page_hist;
struct mutex cl_mgc_mutex;
struct local_oid_storage *cl_mgc_los;
struct dt_object *cl_mgc_configs_dir;
- atomic_t cl_mgc_refcount;
struct obd_export *cl_mgc_mgsexp;
+ atomic_t cl_mgc_refcount;
+ /* in-flight control list and total RPCs counter */
+ struct list_head cl_flight_waiters;
+ __u32 cl_rpcs_in_flight;
/* checksumming for data sent over the network */
unsigned int cl_checksum:1, /* 0 = disabled, 1 = enabled */
ltd_reap:1; /* should this target be deleted */
};
+ struct lov_md_tgt_desc {
+ struct obd_device *lmtd_mdc;
+ __u32 lmtd_index;
+ };
+
struct lov_obd {
struct lov_desc desc;
struct lov_tgt_desc **lov_tgts; /* sparse array */
struct cl_client_cache *lov_cache;
struct rw_semaphore lov_notify_lock;
+ /* Data-on-MDT: MDC array */
+ struct lov_md_tgt_desc *lov_mdc_tgts;
};
struct lmv_tgt_desc {
struct obd_uuid ltd_uuid;
+ struct obd_device *ltd_obd;
struct obd_export *ltd_exp;
__u32 ltd_idx;
struct mutex ltd_fid_mutex;
/*
* Data structure used to pass obd_notify()-event to non-obd listeners (llite
- * and liblustre being main examples).
+ * being main example).
*/
struct obd_notify_upcall {
int (*onu_upcall)(struct obd_device *host, struct obd_device *watched,
struct obd_export *obd_lwp_export;
/* list of exports in LRU order, for ping evictor, with obd_dev_lock */
struct list_head obd_exports_timed;
- time_t obd_eviction_timer; /* for ping evictor */
+ time64_t obd_eviction_timer; /* for ping evictor */
int obd_max_recoverable_clients;
atomic_t obd_connected_clients;
struct proc_dir_entry *obd_proc_exports_entry;
struct proc_dir_entry *obd_svc_procroot;
struct lprocfs_stats *obd_svc_stats;
- struct attribute_group *obd_attrs;
+ struct attribute_group obd_attrs_group;
+ struct attribute **obd_attrs;
struct lprocfs_vars *obd_vars;
atomic_t obd_evict_inprogress;
wait_queue_head_t obd_evict_inprogress_waitq;
* List of outstanding class_incref()'s fo this OBD. For debugging. */
struct lu_ref obd_reference;
- struct kobject obd_kobj; /* sysfs object */
- struct completion obd_kobj_unregister;
+ struct kset obd_kset; /* sysfs object collection */
+ struct kobj_type obd_ktype;
+ struct completion obd_kobj_unregister;
};
/* get/set_info keys */
#define MDC_REPLY_PORTAL 10
//#define MDC_BULK_PORTAL 11
#define MDS_REQUEST_PORTAL 12
- //#define MDS_REPLY_PORTAL 13
+ #define MDS_IO_PORTAL 13
#define MDS_BULK_PORTAL 14
#define LDLM_CB_REQUEST_PORTAL 15
#define LDLM_CB_REPLY_PORTAL 16
#define MSG_CONNECT_RECOVERING 0x00000001
#define MSG_CONNECT_RECONNECT 0x00000002
#define MSG_CONNECT_REPLAYABLE 0x00000004
-//#define MSG_CONNECT_PEER 0x8
+/* #define MSG_CONNECT_PEER 0x00000008 removed 1.5 */
#define MSG_CONNECT_LIBCLIENT 0x00000010
#define MSG_CONNECT_INITIAL 0x00000020
#define MSG_CONNECT_ASYNC 0x00000040
OBD_CONNECT_FLOCK_DEAD | \
OBD_CONNECT_DISP_STRIPE | OBD_CONNECT_LFSCK | \
OBD_CONNECT_OPEN_BY_FID | \
- OBD_CONNECT_DIR_STRIPE | \
- OBD_CONNECT_BULK_MBITS | \
+ OBD_CONNECT_DIR_STRIPE | OBD_CONNECT_GRANT | \
+ OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_SRVLOCK | \
+ OBD_CONNECT_BULK_MBITS | OBD_CONNECT_CKSUM | \
OBD_CONNECT_MULTIMODRPCS | \
OBD_CONNECT_SUBTREE | OBD_CONNECT_LARGE_ACL | \
+ OBD_CONNECT_GRANT_PARAM | \
OBD_CONNECT_FLAGS2)
#define MDT_CONNECT_SUPPORTED2 OBD_CONNECT2_FILE_SECCTX
* those *_DEF magics are only used on server side internally, they
* won't be put on wire or disk.
*/
-#define LOV_MAGIC_DEF 0x10000000
-#define LOV_MAGIC_V1_DEF (LOV_MAGIC_DEF | LOV_MAGIC_V1)
-#define LOV_MAGIC_V3_DEF (LOV_MAGIC_DEF | LOV_MAGIC_V3)
-#define LOV_MAGIC_COMP_V1_DEF (LOV_MAGIC_DEF | LOV_MAGIC_COMP_V1)
+#define LOV_MAGIC_DEFINED 0x10000000
+#define LOV_MAGIC_V1_DEFINED (LOV_MAGIC_DEFINED | LOV_MAGIC_V1)
+#define LOV_MAGIC_V3_DEFINED (LOV_MAGIC_DEFINED | LOV_MAGIC_V3)
+#define LOV_MAGIC_COMP_V1_DEFINED (LOV_MAGIC_DEFINED | LOV_MAGIC_COMP_V1)
#define lov_pattern(pattern) (pattern & ~LOV_PATTERN_F_MASK)
#define lov_pattern_flags(pattern) (pattern & LOV_PATTERN_F_MASK)
#define OBD_MD_FLUID (0x00000200ULL) /* user ID */
#define OBD_MD_FLGID (0x00000400ULL) /* group ID */
#define OBD_MD_FLFLAGS (0x00000800ULL) /* flags word */
+ #define OBD_MD_DOM_SIZE (0x00001000ULL) /* Data-on-MDT component size */
#define OBD_MD_FLNLINK (0x00002000ULL) /* link count */
#define OBD_MD_FLGENER (0x00004000ULL) /* generation number */
/*#define OBD_MD_FLINLINE (0x00008000ULL) inline data. used until 1.6.5 */
OBD_BRW_OVER_GRPQUOTA | \
OBD_BRW_OVER_PRJQUOTA)
+#define OBD_BRW_LOCAL1 0x80000000UL /*
+ * osd-ldiskfs internal,
+ * page mapped to real block
+ */
+
+#define OBD_BRW_LOCALS (OBD_BRW_LOCAL1)
+
#define OBD_OBJECT_EOF LUSTRE_EOF
#define OST_MIN_PRECREATE 32
struct lu_fid qid_fid; /* FID for per-directory quota */
__u64 qid_uid; /* user identifier */
__u64 qid_gid; /* group identifier */
+ __u64 qid_projid; /* project identifier */
};
/* quotactl management */
* Do not exceed 63
*/
-typedef enum {
+enum mds_reint_op {
REINT_SETATTR = 1,
REINT_CREATE = 2,
REINT_LINK = 3,
REINT_RMENTRY = 8,
REINT_MIGRATE = 9,
REINT_MAX
-} mds_reint_t, mdt_reint_t;
+};
/* the disposition of the intent outlines what was executed */
#define DISP_IT_EXECD 0x00000001
* will grant LOOKUP_LOCK. */
#define MDS_INODELOCK_PERM 0x000010
#define MDS_INODELOCK_XATTR 0x000020 /* extended attributes */
+ #define MDS_INODELOCK_DOM 0x000040 /* Data for data-on-mdt files */
- #define MDS_INODELOCK_MAXSHIFT 5
+ #define MDS_INODELOCK_MAXSHIFT 6
/* This FULL lock is useful to take on unlink sort of operations */
#define MDS_INODELOCK_FULL ((1<<(MDS_INODELOCK_MAXSHIFT+1))-1)
+ /* DOM lock shouldn't be canceled early, use this macro for ELC */
+ #define MDS_INODELOCK_ELC (MDS_INODELOCK_FULL & ~MDS_INODELOCK_DOM)
/* NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2],
* but was moved into name[1] along with the OID to avoid consuming the
__u32 mbo_uid_h; /* high 32-bits of uid, for FUID */
__u32 mbo_gid_h; /* high 32-bits of gid, for FUID */
__u32 mbo_projid;
- __u64 mbo_padding_6; /* also fix lustre_swab_mdt_body */
- __u64 mbo_padding_7;
- __u64 mbo_padding_8;
+ __u64 mbo_dom_size; /* size of DOM component */
+ __u64 mbo_dom_blocks; /* blocks consumed by DOM component */
+ __u64 mbo_padding_8; /* also fix lustre_swab_mdt_body */
__u64 mbo_padding_9;
__u64 mbo_padding_10;
}; /* 216 */
/* lmv structures */
struct lmv_desc {
- __u32 ld_tgt_count; /* how many MDS's */
- __u32 ld_active_tgt_count; /* how many active */
- __u32 ld_default_stripe_count; /* how many objects are used */
- __u32 ld_pattern; /* default hash pattern */
- __u64 ld_default_hash_size;
- __u64 ld_padding_1; /* also fix lustre_swab_lmv_desc */
- __u32 ld_padding_2; /* also fix lustre_swab_lmv_desc */
- __u32 ld_qos_maxage; /* in second */
- __u32 ld_padding_3; /* also fix lustre_swab_lmv_desc */
- __u32 ld_padding_4; /* also fix lustre_swab_lmv_desc */
- struct obd_uuid ld_uuid;
+ __u32 ld_tgt_count; /* how many MDS's */
+ __u32 ld_active_tgt_count; /* how many active */
+ __u32 ld_default_stripe_count; /* how many objects are used */
+ __u32 ld_pattern; /* default hash pattern */
+ __u64 ld_default_hash_size;
+ __u64 ld_padding_1; /* also fix lustre_swab_lmv_desc */
+ __u32 ld_padding_2; /* also fix lustre_swab_lmv_desc */
+ __u32 ld_qos_maxage; /* in seconds */
+ __u32 ld_padding_3; /* also fix lustre_swab_lmv_desc */
+ __u32 ld_padding_4; /* also fix lustre_swab_lmv_desc */
+ struct obd_uuid ld_uuid;
};
/* LMV layout EA, and it will be stored both in master and slave object */
#define LMV_HASH_FLAG_MIGRATION 0x80000000
-#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 10, 56, 0)
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 11, 56, 0)
/* Since lustre 2.8, this flag will not be needed, instead this DEAD
* and orphan flags will be stored in LMA (see LMAI_ORPHAN)
* Keep this flag just for LFSCK, because it still might meet such
/* LOV settings descriptor (should only contain static info) */
struct lov_desc {
- __u32 ld_tgt_count; /* how many OBD's */
- __u32 ld_active_tgt_count; /* how many active */
- __u32 ld_default_stripe_count; /* how many objects are used */
- __u32 ld_pattern; /* default PATTERN_RAID0 */
- __u64 ld_default_stripe_size; /* in bytes */
- __u64 ld_default_stripe_offset; /* in bytes */
- __u32 ld_padding_0; /* unused */
- __u32 ld_qos_maxage; /* in second */
- __u32 ld_padding_1; /* also fix lustre_swab_lov_desc */
- __u32 ld_padding_2; /* also fix lustre_swab_lov_desc */
- struct obd_uuid ld_uuid;
+ __u32 ld_tgt_count; /* how many OBD's */
+ __u32 ld_active_tgt_count; /* how many active */
+ __s32 ld_default_stripe_count; /* how many objects are used */
+ __u32 ld_pattern; /* default PATTERN_RAID0 */
+ __u64 ld_default_stripe_size; /* in bytes */
+ __s64 ld_default_stripe_offset; /* starting OST index */
+ __u32 ld_padding_0; /* unused */
+ __u32 ld_qos_maxage; /* in seconds */
+ __u32 ld_padding_1; /* also fix lustre_swab_lov_desc */
+ __u32 ld_padding_2; /* also fix lustre_swab_lov_desc */
+ struct obd_uuid ld_uuid;
};
#define ld_magic ld_active_tgt_count /* for swabbing from llogs */
IT_QUOTA_DQACQ = 0x00000800,
IT_QUOTA_CONN = 0x00001000,
IT_SETXATTR = 0x00002000,
+ IT_GLIMPSE = 0x00004000,
+ IT_BRW = 0x00008000,
};
struct ldlm_intent {
atomic_long_set(&cli->cl_unstable_count, 0);
INIT_LIST_HEAD(&cli->cl_shrink_list);
+ INIT_LIST_HEAD(&cli->cl_flight_waiters);
+ cli->cl_rpcs_in_flight = 0;
+
init_waitqueue_head(&cli->cl_destroy_waitq);
atomic_set(&cli->cl_destroy_in_flight, 0);
#ifdef ENABLE_CHECKSUM
* from OFD after connecting. */
cli->cl_max_pages_per_rpc = PTLRPC_MAX_BRW_PAGES;
+ cli->cl_short_io_bytes = OBD_MAX_SHORT_IO_BYTES;
+
/* set cl_chunkbits default value to PAGE_SHIFT,
* it will be updated at OSC connection time. */
cli->cl_chunkbits = PAGE_SHIFT;
cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_MAX;
else
cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_DEFAULT;
- }
+ }
spin_lock_init(&cli->cl_mod_rpcs_lock);
spin_lock_init(&cli->cl_mod_rpcs_hist.oh_lock);
{
struct obd_device *target;
struct lustre_handle *hdl;
- cfs_time_t now;
- cfs_time_t deadline;
- int timeout;
+ time64_t deadline;
+ time64_t timeout;
+ time64_t now;
int rc = 0;
- ENTRY;
+ ENTRY;
hdl = &exp->exp_imp_reverse->imp_remote_handle;
if (!exp->exp_connection || !lustre_handle_is_used(hdl)) {
conn->cookie = exp->exp_handle.h_cookie;
GOTO(out_already, rc);
}
- now = cfs_time_current();
- deadline = target->obd_recovery_timer.expires;
- if (cfs_time_before(now, deadline)) {
- struct target_distribute_txn_data *tdtd =
- class_exp2tgt(exp)->lut_tdtd;
+ now = ktime_get_seconds();
+ deadline = cfs_duration_sec(target->obd_recovery_timer.expires);
+ if (now < deadline) {
+ struct target_distribute_txn_data *tdtd;
int size = 0;
int count = 0;
char *buf = NULL;
- timeout = cfs_duration_sec(cfs_time_sub(deadline, now));
+ timeout = deadline - now;
+ tdtd = class_exp2tgt(exp)->lut_tdtd;
if (tdtd && tdtd->tdtd_show_update_logs_retrievers)
buf = tdtd->tdtd_show_update_logs_retrievers(
tdtd->tdtd_show_retrievers_cbdata,
if (count > 0)
LCONSOLE_WARN("%s: Recovery already passed deadline "
- "%d:%.02d. It is due to DNE recovery "
+ "%lld:%.02lld. It is due to DNE recovery "
"failed/stuck on the %d MDT(s):%s. "
"Please wait until all MDTs recovered "
"or abort the recovery by force.\n",
buf ? buf : "unknown (not enough RAM)");
else
LCONSOLE_WARN("%s: Recovery already passed deadline "
- "%d:%.02d. If you do not want to wait "
+ "%lld:%.02lld. If you do not want to wait "
"more, please abort the recovery by "
"force.\n", target->obd_name,
timeout / 60, timeout % 60);
if (buf != NULL)
OBD_FREE(buf, size);
} else {
- timeout = cfs_duration_sec(cfs_time_sub(now, deadline));
+ timeout = now - deadline;
LCONSOLE_WARN("%s: Recovery already passed deadline"
- " %d:%.02d, It is most likely due to DNE"
+ " %lld:%.02lld, It is most likely due to DNE"
" recovery is failed or stuck, please wait a"
" few more minutes or abort the recovery.\n",
target->obd_name, timeout / 60, timeout % 60);
* reconnect case */
struct lustre_handle conn;
struct lustre_handle *tmp;
- struct obd_uuid tgtuuid;
struct obd_uuid cluuid;
char *str;
int rc = 0;
bool mds_conn = false, lw_client = false, initial_conn = false;
bool mds_mds_conn = false;
bool new_mds_mds_conn = false;
- bool target_referenced = false;
struct obd_connect_data *data, *tmpdata;
int size, tmpsize;
lnet_nid_t *client_nid = NULL;
GOTO(out, rc = -EINVAL);
}
- obd_str2uuid(&tgtuuid, str);
- target = class_uuid2obd(&tgtuuid);
- if (!target)
- target = class_name2obd(str);
-
+ target = class_dev_by_str(str);
if (!target) {
deuuidify(str, NULL, &target_start, &target_len);
LCONSOLE_ERROR_MSG(0x137, "%s: not available for connect "
}
spin_lock(&target->obd_dev_lock);
+
+ target->obd_conn_inprogress++;
+
if (target->obd_stopping || !target->obd_set_up) {
spin_unlock(&target->obd_dev_lock);
GOTO(out, rc = -EAGAIN);
}
- /* Make sure the target isn't cleaned up while we're here. Yes,
- * there's still a race between the above check and our incref here.
- * Really, class_uuid2obd should take the ref. */
- class_incref(target, __func__, current);
- target_referenced = true;
-
- target->obd_conn_inprogress++;
spin_unlock(&target->obd_dev_lock);
str = req_capsule_client_get(&req->rq_pill, &RMF_CLUUID);
*/
if (!(data->ocd_connect_flags & OBD_CONNECT_FULL20))
GOTO(out, rc = -EPROTO);
-#endif
+ /* Don't allow liblustre clients to connect.
+ * - testing was disabled in v2_2_50_0-61-g6a75d65
+ * - building was disabled in v2_5_58_0-28-g7277179
+ * - client code was deleted in v2_6_50_0-101-gcdfbc72,
+ * - clients were refused connect for version difference > 0.0.1.32 */
if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) {
- if (data->ocd_version < LUSTRE_VERSION_CODE -
- LUSTRE_VERSION_ALLOWED_OFFSET ||
- data->ocd_version > LUSTRE_VERSION_CODE +
- LUSTRE_VERSION_ALLOWED_OFFSET) {
- DEBUG_REQ(D_WARNING, req, "Refusing %s (%d.%d.%d.%d) "
- "libclient connection attempt",
- data->ocd_version < LUSTRE_VERSION_CODE ?
- "old" : "new",
- OBD_OCD_VERSION_MAJOR(data->ocd_version),
- OBD_OCD_VERSION_MINOR(data->ocd_version),
- OBD_OCD_VERSION_PATCH(data->ocd_version),
- OBD_OCD_VERSION_FIX(data->ocd_version));
- data = req_capsule_server_sized_get(&req->rq_pill,
- &RMF_CONNECT_DATA,
- offsetof(typeof(*data), ocd_version) +
- sizeof(data->ocd_version));
- if (data) {
- data->ocd_connect_flags = OBD_CONNECT_VERSION;
- data->ocd_version = LUSTRE_VERSION_CODE;
- }
- GOTO(out, rc = -EPROTO);
- }
+ DEBUG_REQ(D_WARNING, req, "Refusing libclient connection");
+ GOTO(out, rc = -EPROTO);
}
+#endif
/* Note: lw_client is needed in MDS-MDS failover during update log
* processing, so we needs to allow lw_client to be connected at
GOTO(out, rc);
}
- CDEBUG(D_HA, "%s: connection from %s@%s %st%llu exp %p cur %ld last %ld\n",
- target->obd_name, cluuid.uuid, libcfs_nid2str(req->rq_peer.nid),
- target->obd_recovering ? "recovering/" : "", data->ocd_transno,
- export, (long)cfs_time_current_sec(),
- export ? (long)export->exp_last_request_time : 0);
+ CDEBUG(D_HA, "%s: connection from %s@%s %st%llu exp %p cur %lld last %lld\n",
+ target->obd_name, cluuid.uuid, libcfs_nid2str(req->rq_peer.nid),
+ target->obd_recovering ? "recovering/" : "", data->ocd_transno,
+ export, ktime_get_real_seconds(),
+ export ? export->exp_last_request_time : 0);
/* If this is the first time a client connects, reset the recovery
* timer. Discard lightweight connections which might be local. */
/* allow "new" MDT to be connected during recovery, since we
* need retrieve recovery update records from it */
if (target->obd_recovering && !lw_client && !mds_mds_conn) {
- cfs_time_t t;
- int c; /* connected */
- int i; /* in progress */
- int k; /* known */
- int s; /* stale/evicted */
+ time64_t t;
+ int c; /* connected */
+ int i; /* in progress */
+ int k; /* known */
+ int s; /* stale/evicted */
c = atomic_read(&target->obd_connected_clients);
i = atomic_read(&target->obd_lock_replay_clients);
k = target->obd_max_recoverable_clients;
s = target->obd_stale_clients;
t = target->obd_recovery_timer.expires;
- t = cfs_time_sub(t, cfs_time_current());
- t = cfs_duration_sec(t);
+ t = cfs_duration_sec(target->obd_recovery_timer.expires);
+ t -= ktime_get_seconds();
LCONSOLE_WARN("%s: Denying connection for new client %s"
"(at %s), waiting for %d known clients "
"(%d recovered, %d in progress, and %d "
- "evicted) to recover in %d:%.02d\n",
+ "evicted) to recover in %lld:%.02lld\n",
target->obd_name, cluuid.uuid,
libcfs_nid2str(req->rq_peer.nid), k,
- c - i, i, s, (int)t / 60,
- (int)t % 60);
+ c - i, i, s, t / 60, t % 60);
rc = -EBUSY;
} else {
dont_check_exports:
spin_unlock(&export->exp_lock);
CDEBUG(D_RPCTRACE, "%s: %s already connected at greater "
"or equal conn_cnt: %d >= %d\n",
- cluuid.uuid, libcfs_nid2str(req->rq_peer.nid),
- export->exp_conn_cnt,
- lustre_msg_get_conn_cnt(req->rq_reqmsg));
+ cluuid.uuid, libcfs_nid2str(req->rq_peer.nid),
+ export->exp_conn_cnt,
+ lustre_msg_get_conn_cnt(req->rq_reqmsg));
- GOTO(out, rc = -EALREADY);
- }
- LASSERT(lustre_msg_get_conn_cnt(req->rq_reqmsg) > 0);
- export->exp_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg);
-
- /* Don't evict liblustre clients for not pinging. */
- if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) {
- export->exp_libclient = 1;
- spin_unlock(&export->exp_lock);
-
- spin_lock(&target->obd_dev_lock);
- list_del_init(&export->exp_obd_chain_timed);
- spin_unlock(&target->obd_dev_lock);
- } else {
- spin_unlock(&export->exp_lock);
+ GOTO(out, rc = -EALREADY);
}
+ LASSERT(lustre_msg_get_conn_cnt(req->rq_reqmsg) > 0);
+ export->exp_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg);
+ spin_unlock(&export->exp_lock);
- if (export->exp_connection != NULL) {
+ if (export->exp_connection != NULL) {
/* Check to see if connection came from another NID. */
- if ((export->exp_connection->c_peer.nid != req->rq_peer.nid) &&
+ if ((export->exp_connection->c_peer.nid != req->rq_peer.nid) &&
!hlist_unhashed(&export->exp_nid_hash))
- cfs_hash_del(export->exp_obd->obd_nid_hash,
- &export->exp_connection->c_peer.nid,
- &export->exp_nid_hash);
+ cfs_hash_del(export->exp_obd->obd_nid_hash,
+ &export->exp_connection->c_peer.nid,
+ &export->exp_nid_hash);
- ptlrpc_connection_put(export->exp_connection);
- }
+ ptlrpc_connection_put(export->exp_connection);
+ }
export->exp_connection = ptlrpc_connection_get(req->rq_peer,
req->rq_self,
class_export_put(export);
}
- if (target_referenced == true && target != NULL) {
+ if (target != NULL) {
spin_lock(&target->obd_dev_lock);
target->obd_conn_inprogress--;
spin_unlock(&target->obd_dev_lock);
-
- class_decref(target, __func__, current);
+ class_decref(target, "find", current);
}
req->rq_status = rc;
RETURN(rc);
obd->obd_recovery_end = ktime_get_real_seconds();
/* When recovery finished, cleanup orphans on MDS and OST. */
- if (OBT(obd) && OBP(obd, postrecov)) {
- int rc = OBP(obd, postrecov)(obd);
- if (rc < 0)
- LCONSOLE_WARN("%s: Post recovery failed, rc %d\n",
- obd->obd_name, rc);
- }
+ if (obd->obd_type && OBP(obd, postrecov)) {
+ int rc = OBP(obd, postrecov)(obd);
+
+ if (rc < 0)
+ LCONSOLE_WARN("%s: Post recovery failed, rc %d\n",
+ obd->obd_name, rc);
+ }
EXIT;
}
}
mod_timer(&obd->obd_recovery_timer,
- cfs_time_shift(obd->obd_recovery_timeout));
+ jiffies + cfs_time_seconds(obd->obd_recovery_timeout));
obd->obd_recovery_start = ktime_get_real_seconds();
spin_unlock(&obd->obd_dev_lock);
* if @extend is true, extend recovery window to have @drt remaining at least;
* otherwise, make sure the recovery timeout value is not less than @drt.
*/
-static void extend_recovery_timer(struct obd_device *obd, int drt,
+static void extend_recovery_timer(struct obd_device *obd, time64_t drt,
bool extend)
{
time64_t now;
obd->obd_recovery_timeout = to;
end = obd->obd_recovery_start + to;
mod_timer(&obd->obd_recovery_timer,
- cfs_time_shift(end - now));
+ jiffies + cfs_time_seconds(end - now));
}
spin_unlock(&obd->obd_dev_lock);
struct ptlrpc_request *req,
int new_client)
{
- int service_time = lustre_msg_get_service_time(req->rq_reqmsg);
+ time64_t service_time = lustre_msg_get_service_time(req->rq_reqmsg);
struct obd_device_target *obt = &obd->u.obt;
if (!new_client && service_time)
target_start_recovery_timer(obd);
/* Convert the service time to RPC timeout,
- * and reuse service_time to limit stack usage. */
+ * and reuse service_time to limit stack usage.
+ */
service_time = at_est2timeout(service_time);
if (OBD_FAIL_CHECK(OBD_FAIL_TGT_SLUGGISH_NET) &&
/* don't reset timer for final stage */
if (!exp_finished(req->rq_export)) {
- int to = obd_timeout;
+ time64_t to = obd_timeout;
/**
* Add request timeout to the recovery time so next request from
int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc,
struct l_wait_info *lwi)
{
- struct ptlrpc_request *req = desc->bd_req;
- time_t start = cfs_time_current_sec();
- time_t deadline;
- int rc = 0;
+ struct ptlrpc_request *req = desc->bd_req;
+ time64_t start = ktime_get_real_seconds();
+ time64_t deadline;
+ int rc = 0;
ENTRY;
deadline = req->rq_deadline;
do {
- long timeoutl = deadline - cfs_time_current_sec();
- cfs_duration_t timeout = timeoutl <= 0 ?
- CFS_TICK : cfs_time_seconds(timeoutl);
- time_t rq_deadline;
+ time64_t timeoutl = deadline - ktime_get_real_seconds();
+ long timeout_jiffies = timeoutl <= 0 ?
+ 1 : cfs_time_seconds(timeoutl);
+ time64_t rq_deadline;
- *lwi = LWI_TIMEOUT_INTERVAL(timeout, cfs_time_seconds(1),
+ *lwi = LWI_TIMEOUT_INTERVAL(timeout_jiffies,
+ cfs_time_seconds(1),
target_bulk_timeout, desc);
rc = l_wait_event(desc->bd_waitq,
!ptlrpc_server_bulk_active(desc) ||
deadline = start + bulk_timeout;
if (deadline > rq_deadline)
deadline = rq_deadline;
- } while ((rc == -ETIMEDOUT) &&
- (deadline > cfs_time_current_sec()));
+ } while (rc == -ETIMEDOUT &&
+ deadline > ktime_get_real_seconds());
if (rc == -ETIMEDOUT) {
- DEBUG_REQ(D_ERROR, req, "timeout on bulk %s after %ld%+lds",
+ DEBUG_REQ(D_ERROR, req, "timeout on bulk %s after %lld%+llds",
bulk2type(req), deadline - start,
- cfs_time_current_sec() - deadline);
+ ktime_get_real_seconds() - deadline);
ptlrpc_abort_bulk(desc);
} else if (exp->exp_failed) {
DEBUG_REQ(D_ERROR, req, "Eviction on bulk %s",
* Add a lock to granted list on a resource maintaining skiplist
* correctness.
*/
- static void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock)
+ void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock)
{
- struct sl_insert_point prev;
- ENTRY;
+ struct sl_insert_point prev;
- LASSERT(lock->l_req_mode == lock->l_granted_mode);
+ LASSERT(lock->l_req_mode == lock->l_granted_mode);
- search_granted_lock(&lock->l_resource->lr_granted, lock, &prev);
- ldlm_granted_list_add_lock(lock, &prev);
- EXIT;
+ search_granted_lock(&lock->l_resource->lr_granted, lock, &prev);
+ ldlm_granted_list_add_lock(lock, &prev);
}
/**
res = ldlm_resource_getref(lock->l_resource);
- ldlm_res_lvbo_update(res, NULL, 1);
+ ldlm_lvbo_update(res, lock, NULL, 1);
ldlm_lock_cancel(lock);
if (!exp->exp_obd->obd_stopping)
ldlm_reprocess_all(res);
libcfs_debug_vmsg2(msgdata, fmt, args,
" ns: \?\? lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s "
"res: \?\? rrc=\?\? type: \?\?\? flags: %#llx nid: %s "
- "remote: %#llx expref: %d pid: %u timeout: %lu "
+ "remote: %#llx expref: %d pid: %u timeout: %lld "
"lvb_type: %d\n",
lock,
lock->l_handle.h_cookie, atomic_read(&lock->l_refc),
" ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s "
"res: "DLDLMRES" rrc: %d type: %s [%llu->%llu] "
"(req %llu->%llu) flags: %#llx nid: %s remote: "
- "%#llx expref: %d pid: %u timeout: %lu lvb_type: %d\n",
+ "%#llx expref: %d pid: %u timeout: %lld lvb_type: %d\n",
ldlm_lock_to_ns_name(lock), lock,
lock->l_handle.h_cookie, atomic_read(&lock->l_refc),
lock->l_readers, lock->l_writers,
" ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s "
"res: "DLDLMRES" rrc: %d type: %s pid: %d "
"[%llu->%llu] flags: %#llx nid: %s "
- "remote: %#llx expref: %d pid: %u timeout: %lu\n",
+ "remote: %#llx expref: %d pid: %u timeout: %lld\n",
ldlm_lock_to_ns_name(lock), lock,
lock->l_handle.h_cookie, atomic_read(&lock->l_refc),
lock->l_readers, lock->l_writers,
" ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s "
"res: "DLDLMRES" bits %#llx/%#llx rrc: %d type: %s "
"flags: %#llx nid: %s remote: %#llx expref: %d "
- "pid: %u timeout: %lu lvb_type: %d\n",
+ "pid: %u timeout: %lld lvb_type: %d\n",
ldlm_lock_to_ns_name(lock),
lock, lock->l_handle.h_cookie,
atomic_read(&lock->l_refc),
" ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s "
"res: "DLDLMRES" rrc: %d type: %s flags: %#llx "
"nid: %s remote: %#llx expref: %d pid: %u "
- "timeout: %lu lvb_type: %d\n",
+ "timeout: %lld lvb_type: %d\n",
ldlm_lock_to_ns_name(lock),
lock, lock->l_handle.h_cookie,
atomic_read(&lock->l_refc),
static struct ldlm_state *ldlm_state;
-static inline cfs_time_t round_timeout(cfs_time_t timeout)
-{
- return cfs_time_seconds((int)cfs_duration_sec(cfs_time_sub(timeout, 0)) + 1);
-}
-
-/* timeout for initial callback (AST) reply (bz10399) */
-static inline unsigned int ldlm_get_rq_timeout(void)
+/* timeout for initial callback (AST) reply (bz10399)
+ * Because a 32-bit time value must be sent over the
+ * wire, return it as time_t instead of time64_t
+ */
+static inline time_t ldlm_get_rq_timeout(void)
{
- /* Non-AT value */
- unsigned int timeout = min(ldlm_timeout, obd_timeout / 3);
+ /* Non-AT value */
+ time_t timeout = min(ldlm_timeout, obd_timeout / 3);
- return timeout < 1 ? 1 : timeout;
+ return timeout < 1 ? 1 : timeout;
}
struct ldlm_bl_pool {
}
static int ldlm_add_waiting_lock(struct ldlm_lock *lock);
-static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, int seconds);
+static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, time64_t seconds);
/**
* Check if there is a request in the export request list
spin_lock_bh(&waiting_locks_spinlock);
while (!list_empty(&waiting_locks_list)) {
lock = list_entry(waiting_locks_list.next, struct ldlm_lock,
- l_pending_chain);
- if (cfs_time_after(lock->l_callback_timeout,
- cfs_time_current()) ||
- (lock->l_req_mode == LCK_GROUP))
- break;
+ l_pending_chain);
+ if (lock->l_callback_timeout > ktime_get_seconds() ||
+ lock->l_req_mode == LCK_GROUP)
+ break;
/* Check if we need to prolong timeout */
if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT) &&
wake_up(&expired_lock_wait_queue);
}
- /*
- * Make sure the timer will fire again if we have any locks
- * left.
- */
+ /*
+ * Make sure the timer will fire again if we have any locks
+ * left.
+ */
if (!list_empty(&waiting_locks_list)) {
- cfs_time_t timeout_rounded;
+ unsigned long timeout_jiffies;
+
lock = list_entry(waiting_locks_list.next, struct ldlm_lock,
- l_pending_chain);
- timeout_rounded = (cfs_time_t)round_timeout(lock->l_callback_timeout);
- mod_timer(&waiting_locks_timer, timeout_rounded);
- }
+ l_pending_chain);
+ timeout_jiffies = cfs_time_seconds(lock->l_callback_timeout);
+ mod_timer(&waiting_locks_timer, timeout_jiffies);
+ }
spin_unlock_bh(&waiting_locks_spinlock);
}
*
* Called with the namespace lock held.
*/
-static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, int seconds)
+static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, time64_t seconds)
{
- cfs_time_t timeout;
- cfs_time_t timeout_rounded;
+ unsigned long timeout_jiffies;
+ time64_t timeout;
if (!list_empty(&lock->l_pending_chain))
return 0;
OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT))
seconds = 1;
- timeout = cfs_time_shift(seconds);
- if (likely(cfs_time_after(timeout, lock->l_callback_timeout)))
+ timeout = ktime_get_seconds() + seconds;
+ if (likely(timeout > lock->l_callback_timeout))
lock->l_callback_timeout = timeout;
- timeout_rounded = round_timeout(lock->l_callback_timeout);
+ timeout_jiffies = cfs_time_seconds(lock->l_callback_timeout);
- if (cfs_time_before(timeout_rounded, waiting_locks_timer.expires) ||
- !timer_pending(&waiting_locks_timer)) {
- mod_timer(&waiting_locks_timer, timeout_rounded);
- }
- /* if the new lock has a shorter timeout than something earlier on
- the list, we'll wait the longer amount of time; no big deal. */
- /* FIFO */
+ if (time_before(timeout_jiffies, waiting_locks_timer.expires) ||
+ !timer_pending(&waiting_locks_timer))
+ mod_timer(&waiting_locks_timer, timeout_jiffies);
+
+ /* if the new lock has a shorter timeout than something earlier on
+ * the list, we'll wait the longer amount of time; no big deal.
+ */
+ /* FIFO */
list_add_tail(&lock->l_pending_chain, &waiting_locks_list);
- return 1;
+ return 1;
}
static void ldlm_add_blocked_lock(struct ldlm_lock *lock)
static int ldlm_add_waiting_lock(struct ldlm_lock *lock)
{
+ time64_t timeout = ldlm_bl_timeout(lock);
int ret;
- int timeout = ldlm_bl_timeout(lock);
/* NB: must be called with hold of lock_res_and_lock() */
LASSERT(ldlm_is_res_locked(lock));
}
if (ldlm_is_destroyed(lock)) {
- static cfs_time_t next;
+ static time64_t next;
spin_unlock_bh(&waiting_locks_spinlock);
LDLM_ERROR(lock, "not waiting on destroyed lock (bug 5653)");
- if (cfs_time_after(cfs_time_current(), next)) {
- next = cfs_time_shift(14400);
+ if (ktime_get_seconds() > next) {
+ next = ktime_get_seconds() + 14400;
libcfs_debug_dumpstack(NULL);
}
return 0;
if (ret)
ldlm_add_blocked_lock(lock);
- LDLM_DEBUG(lock, "%sadding to wait list(timeout: %d, AT: %s)",
+ LDLM_DEBUG(lock, "%sadding to wait list(timeout: %lld, AT: %s)",
ret == 0 ? "not re-" : "", timeout,
AT_OFF ? "off" : "on");
return ret;
del_timer(&waiting_locks_timer);
} else {
struct ldlm_lock *next;
+
next = list_entry(list_next, struct ldlm_lock,
- l_pending_chain);
+ l_pending_chain);
mod_timer(&waiting_locks_timer,
- round_timeout(next->l_callback_timeout));
+ cfs_time_seconds(next->l_callback_timeout));
}
}
list_del_init(&lock->l_pending_chain);
*
* Called with namespace lock held.
*/
-int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout)
+int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, time64_t timeout)
{
if (lock->l_export == NULL) {
/* We don't have a "waiting locks list" on clients. */
RETURN(0);
}
-int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout)
+int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, time64_t timeout)
{
RETURN(0);
}
*
* \retval timeout in seconds to wait for the client reply
*/
-unsigned int ldlm_bl_timeout(struct ldlm_lock *lock)
+time64_t ldlm_bl_timeout(struct ldlm_lock *lock)
{
- unsigned int timeout;
+ time64_t timeout;
if (AT_OFF)
return obd_timeout / 2;
* It would be nice to have some kind of "early reply" mechanism for
* lock callbacks too... */
timeout = at_get(&lock->l_export->exp_bl_lock_at);
- return max(timeout + (timeout >> 1), ldlm_enqueue_min);
+ return max(timeout + (timeout >> 1), (time64_t)ldlm_enqueue_min);
}
EXPORT_SYMBOL(ldlm_bl_timeout);
struct lnet_process_id peer = req->rq_import->imp_connection->c_peer;
if (!req->rq_replied || (rc && rc != -EINVAL)) {
- if (lock->l_export && lock->l_export->exp_libclient) {
- LDLM_DEBUG(lock,
- "%s AST (req@%p x%llu) to liblustre client (nid %s) timeout, just cancelling lock",
- ast_type, req, req->rq_xid,
- libcfs_nid2str(peer.nid));
- ldlm_lock_cancel(lock);
- rc = -ERESTART;
- } else if (ldlm_is_cancel(lock)) {
+ if (ldlm_is_cancel(lock)) {
LDLM_DEBUG(lock,
"%s AST (req@%p x%llu) timeout from nid %s, but cancel was received (AST reply lost?)",
ast_type, req, req->rq_xid,
/* update lvbo to return proper attributes.
* see bug 23174 */
ldlm_resource_getref(res);
- ldlm_res_lvbo_update(res, NULL, 1);
+ ldlm_lvbo_update(res, lock, NULL, 1);
ldlm_resource_putref(res);
}
ldlm_lock_cancel(lock);
} else if (rc == -ELDLM_NO_LOCK_DATA) {
LDLM_DEBUG(lock, "lost race - client has a lock but no "
"inode");
- ldlm_res_lvbo_update(lock->l_resource, NULL, 1);
+ ldlm_lvbo_update(lock->l_resource, lock, NULL, 1);
} else if (rc != 0) {
rc = ldlm_handle_ast_error(lock, req, rc, "glimpse");
} else {
- rc = ldlm_res_lvbo_update(lock->l_resource, req, 1);
+ rc = ldlm_lvbo_update(lock->l_resource, lock, req, 1);
}
break;
case LDLM_BL_CALLBACK:
static void ldlm_update_resend(struct ptlrpc_request *req, void *data)
{
- struct ldlm_cb_async_args *ca = data;
- struct ldlm_lock *lock = ca->ca_lock;
+ struct ldlm_cb_async_args *ca = data;
+ struct ldlm_lock *lock = ca->ca_lock;
ldlm_refresh_waiting_lock(lock, ldlm_bl_timeout(lock));
}
ldlm_lock_reorder_req(lock);
- req = ptlrpc_request_alloc_pack(lock->l_export->exp_imp_reverse,
- &RQF_LDLM_BL_CALLBACK,
- LUSTRE_DLM_VERSION, LDLM_BL_CALLBACK);
- if (req == NULL)
- RETURN(-ENOMEM);
+ req = ptlrpc_request_alloc_pack(lock->l_export->exp_imp_reverse,
+ &RQF_LDLM_BL_CALLBACK,
+ LUSTRE_DLM_VERSION, LDLM_BL_CALLBACK);
+ if (req == NULL)
+ RETURN(-ENOMEM);
- CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args));
- ca = ptlrpc_req_async_args(req);
- ca->ca_set_arg = arg;
- ca->ca_lock = lock;
+ CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args));
+ ca = ptlrpc_req_async_args(req);
+ ca->ca_set_arg = arg;
+ ca->ca_lock = lock;
- req->rq_interpret_reply = ldlm_cb_interpret;
+ req->rq_interpret_reply = ldlm_cb_interpret;
lock_res_and_lock(lock);
if (ldlm_is_destroyed(lock)) {
lvb_len = 0;
req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_CLIENT, lvb_len);
- rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CP_CALLBACK);
- if (rc) {
- ptlrpc_request_free(req);
- RETURN(rc);
- }
+ rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CP_CALLBACK);
+ if (rc) {
+ ptlrpc_request_free(req);
+ RETURN(rc);
+ }
- CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args));
- ca = ptlrpc_req_async_args(req);
- ca->ca_set_arg = arg;
- ca->ca_lock = lock;
+ CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args));
+ ca = ptlrpc_req_async_args(req);
+ ca->ca_set_arg = arg;
+ ca->ca_lock = lock;
- req->rq_interpret_reply = ldlm_cb_interpret;
- body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+ req->rq_interpret_reply = ldlm_cb_interpret;
+ body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
- body->lock_handle[0] = lock->l_remote_handle;
+ body->lock_handle[0] = lock->l_remote_handle;
body->lock_flags = ldlm_flags_to_wire(flags);
ldlm_lock2desc(lock, &body->lock_desc);
if (lvb_len > 0) {
*desc = *arg->gl_desc;
}
- body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
- body->lock_handle[0] = lock->l_remote_handle;
- ldlm_lock2desc(lock, &body->lock_desc);
+ body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+ body->lock_handle[0] = lock->l_remote_handle;
+ ldlm_lock2desc(lock, &body->lock_desc);
CLASSERT(sizeof(*ca) <= sizeof(req->rq_async_args));
ca = ptlrpc_req_async_args(req);
RETURN(rc);
}
+ EXPORT_SYMBOL(ldlm_server_glimpse_ast);
int ldlm_glimpse_locks(struct ldlm_resource *res,
struct list_head *gl_work_list)
}
EXPORT_SYMBOL(ldlm_request_lock);
- static void ldlm_svc_get_eopc(const struct ldlm_request *dlm_req,
- struct lprocfs_stats *srv_stats)
- {
- int lock_type = 0, op = 0;
-
- lock_type = dlm_req->lock_desc.l_resource.lr_type;
-
- switch (lock_type) {
- case LDLM_PLAIN:
- op = PTLRPC_LAST_CNTR + LDLM_PLAIN_ENQUEUE;
- break;
- case LDLM_EXTENT:
- if (dlm_req->lock_flags & LDLM_FL_HAS_INTENT)
- op = PTLRPC_LAST_CNTR + LDLM_GLIMPSE_ENQUEUE;
- else
- op = PTLRPC_LAST_CNTR + LDLM_EXTENT_ENQUEUE;
- break;
- case LDLM_FLOCK:
- op = PTLRPC_LAST_CNTR + LDLM_FLOCK_ENQUEUE;
- break;
- case LDLM_IBITS:
- op = PTLRPC_LAST_CNTR + LDLM_IBITS_ENQUEUE;
- break;
- default:
- op = 0;
- break;
- }
-
- if (op)
- lprocfs_counter_incr(srv_stats, op);
-
- return;
- }
-
/**
* Main server-side entry point into LDLM for enqueue. This is called by ptlrpc
* service threads to carry out client lock enqueueing requests.
LASSERT(req->rq_export);
- if (ptlrpc_req2svc(req)->srv_stats != NULL)
+ /* for intent enqueue the stat will be updated inside intent policy */
+ if (ptlrpc_req2svc(req)->srv_stats != NULL &&
+ !(dlm_req->lock_flags & LDLM_FL_HAS_INTENT))
ldlm_svc_get_eopc(dlm_req, ptlrpc_req2svc(req)->srv_stats);
if (req->rq_export && req->rq_export->exp_nid_stats &&
lock->l_req_extent = lock->l_policy_data.l_extent;
existing_lock:
-
if (flags & LDLM_FL_HAS_INTENT) {
/* In this case, the reply buffer is allocated deep in
* local_lock_enqueue by the policy function. */
ldlm_add_waiting_lock(lock);
}
}
- /* Make sure we never ever grant usual metadata locks to liblustre
- clients */
- if ((dlm_req->lock_desc.l_resource.lr_type == LDLM_PLAIN ||
- dlm_req->lock_desc.l_resource.lr_type == LDLM_IBITS) &&
- req->rq_export->exp_libclient) {
- if (unlikely(!ldlm_is_cancel_on_block(lock) ||
- !(dlm_rep->lock_flags & LDLM_FL_CANCEL_ON_BLOCK))){
- CERROR("Granting sync lock to libclient. "
- "req fl %d, rep fl %d, lock fl %#llx\n",
- dlm_req->lock_flags, dlm_rep->lock_flags,
- lock->l_flags);
- LDLM_ERROR(lock, "sync lock");
- if (dlm_req->lock_flags & LDLM_FL_HAS_INTENT) {
- struct ldlm_intent *it;
-
- it = req_capsule_client_get(&req->rq_pill,
- &RMF_LDLM_INTENT);
- if (it != NULL) {
- CERROR("This is intent %s (%llu)\n",
- ldlm_it2str(it->opc), it->opc);
- }
- }
- }
- }
+ unlock_res_and_lock(lock);
- unlock_res_and_lock(lock);
-
- EXIT;
+ EXIT;
out:
req->rq_status = rc ?: err; /* return either error - bug 11190 */
if (!req->rq_packed_final) {
if (res != NULL) {
ldlm_resource_getref(res);
LDLM_RESOURCE_ADDREF(res);
- ldlm_res_lvbo_update(res, NULL, 1);
+
+ if (!ldlm_is_discard_data(lock))
+ ldlm_lvbo_update(res, lock, NULL, 1);
}
pres = res;
}
INIT_LIST_HEAD(&ast_list);
if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) {
- int to = cfs_time_seconds(1);
+ long to = cfs_time_seconds(1);
+
while (to > 0) {
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(to);
if (ldlm_state == NULL)
RETURN(-ENOMEM);
- ldlm_kobj = kobject_create_and_add("ldlm", lustre_kobj);
+ ldlm_kobj = kobject_create_and_add("ldlm", &lustre_kset->kobj);
if (!ldlm_kobj)
GOTO(out, -ENOMEM);
kset_unregister(ldlm_ns_kset);
if (ldlm_svc_kset)
kset_unregister(ldlm_svc_kset);
- if (ldlm_kobj)
+ if (ldlm_kobj) {
+ sysfs_remove_group(ldlm_kobj, &ldlm_attr_group);
kobject_put(ldlm_kobj);
+ }
ldlm_proc_cleanup();
if (ldlm_interval_tree_slab == NULL)
goto out_interval;
+ #ifdef HAVE_SERVER_SUPPORT
+ ldlm_glimpse_work_kmem = kmem_cache_create("ldlm_glimpse_work_kmem",
+ sizeof(struct ldlm_glimpse_work),
+ 0, 0, NULL);
+ if (ldlm_glimpse_work_kmem == NULL)
+ goto out_interval_tree;
+ #endif
+
#if LUSTRE_TRACKS_LOCK_EXP_REFS
class_export_dump_hook = ldlm_dump_export_locks;
#endif
return 0;
-
+ #ifdef HAVE_SERVER_SUPPORT
+ out_interval_tree:
+ kmem_cache_destroy(ldlm_interval_tree_slab);
+ #endif
out_interval:
kmem_cache_destroy(ldlm_interval_slab);
out_lock:
kmem_cache_destroy(ldlm_lock_slab);
kmem_cache_destroy(ldlm_interval_slab);
kmem_cache_destroy(ldlm_interval_tree_slab);
+ #ifdef HAVE_SERVER_SUPPORT
+ kmem_cache_destroy(ldlm_glimpse_work_kmem);
+ #endif
}
ENTRY;
if (lock->l_conn_export == NULL) {
- static cfs_time_t next_dump = 0, last_dump = 0;
+ static time64_t next_dump, last_dump;
LDLM_ERROR(lock, "lock timed out (enqueued at %lld, %llds ago); "
"not entering recovery in server code, just going back to sleep",
(s64)lock->l_last_activity,
(s64)(ktime_get_real_seconds() -
lock->l_last_activity));
- if (cfs_time_after(cfs_time_current(), next_dump)) {
+ if (ktime_get_seconds() > next_dump) {
last_dump = next_dump;
- next_dump = cfs_time_shift(300);
+ next_dump = ktime_get_seconds() + 300;
ldlm_namespace_dump(D_DLMTRACE,
ldlm_lock_to_ns(lock));
if (last_dump == 0)
/* We use the same basis for both server side and client side functions
from a single node. */
-static unsigned int ldlm_cp_timeout(struct ldlm_lock *lock)
+static time64_t ldlm_cp_timeout(struct ldlm_lock *lock)
{
- unsigned int timeout;
+ time64_t timeout;
if (AT_OFF)
return obd_timeout;
* lock from another client. Server will evict the other client if it
* doesn't respond reasonably, and then give us the lock. */
timeout = at_get(ldlm_lock_to_ns_at(lock));
- return max(3 * timeout, ldlm_enqueue_min);
+ return max(3 * timeout, (time64_t) ldlm_enqueue_min);
}
/**
struct obd_device *obd;
struct obd_import *imp = NULL;
struct l_wait_info lwi;
- __u32 timeout;
+ time64_t timeout;
int rc = 0;
ENTRY;
timeout = ldlm_cp_timeout(lock);
lwd.lwd_lock = lock;
- lock->l_last_activity = cfs_time_current_sec();
+ lock->l_last_activity = ktime_get_real_seconds();
if (ldlm_is_no_timeout(lock)) {
LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT");
lock->l_export = NULL;
lock->l_blocking_ast = einfo->ei_cb_bl;
lock->l_flags |= (*flags & (LDLM_FL_NO_LRU | LDLM_FL_EXCL));
- lock->l_last_activity = cfs_time_current_sec();
+ lock->l_last_activity = ktime_get_real_seconds();
/* lock not sent to server yet */
if (reqp == NULL || *reqp == NULL) {
body->lock_flags = ldlm_flags_to_wire(*flags);
body->lock_handle[0] = *lockh;
+ /* extended LDLM opcodes in client stats */
+ if (exp->exp_obd->obd_svc_stats != NULL) {
+ bool glimpse = *flags & LDLM_FL_HAS_INTENT;
+
+ /* OST glimpse has no intent buffer */
+ if (req_capsule_has_field(&req->rq_pill, &RMF_LDLM_INTENT,
+ RCL_CLIENT)) {
+ struct ldlm_intent *it;
+
+ it = req_capsule_client_get(&req->rq_pill,
+ &RMF_LDLM_INTENT);
+ glimpse = (it && (it->opc == IT_GLIMPSE));
+ }
+
+ if (!glimpse)
+ ldlm_svc_get_eopc(body, exp->exp_obd->obd_svc_stats);
+ else
+ lprocfs_counter_incr(exp->exp_obd->obd_svc_stats,
+ PTLRPC_LAST_CNTR +
+ LDLM_GLIMPSE_ENQUEUE);
+ }
+
if (async) {
LASSERT(reqp != NULL);
RETURN(0);
lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING;
if ((lru_flags & LDLM_LRU_FLAG_CLEANUP) &&
- lock->l_resource->lr_type == LDLM_EXTENT &&
- lock->l_granted_mode == LCK_PR)
+ (lock->l_resource->lr_type == LDLM_EXTENT ||
+ ldlm_has_dom(lock)) && lock->l_granted_mode == LCK_PR)
ldlm_set_discard_data(lock);
/* We can't re-add to l_lru as it confuses the
sbi->ll_ra_info.ra_max_pages_per_file = min(pages / 32,
SBI_DEFAULT_READAHEAD_MAX);
sbi->ll_ra_info.ra_max_pages = sbi->ll_ra_info.ra_max_pages_per_file;
- sbi->ll_ra_info.ra_max_read_ahead_whole_pages =
- SBI_DEFAULT_READAHEAD_WHOLE_MAX;
+ sbi->ll_ra_info.ra_max_read_ahead_whole_pages = -1;
ll_generate_random_uuid(uuid);
class_uuid_unparse(uuid, &sbi->ll_sb_uuid);
RETURN(-ENOMEM);
}
+ /* pass client page size via ocd_grant_blkbits, the server should report
+ * back its backend blocksize for grant calculation purpose */
+ data->ocd_grant_blkbits = PAGE_SHIFT;
+
/* indicate MDT features supported by this client */
- data->ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_NODEVOH |
- OBD_CONNECT_ATTRFID |
- OBD_CONNECT_VERSION | OBD_CONNECT_BRW_SIZE |
- OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA |
- OBD_CONNECT_CANCELSET | OBD_CONNECT_FID |
- OBD_CONNECT_AT | OBD_CONNECT_LOV_V3 |
+ data->ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_NODEVOH |
+ OBD_CONNECT_ATTRFID | OBD_CONNECT_GRANT |
+ OBD_CONNECT_VERSION | OBD_CONNECT_BRW_SIZE |
+ OBD_CONNECT_SRVLOCK | OBD_CONNECT_TRUNCLOCK|
+ OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA |
+ OBD_CONNECT_CANCELSET | OBD_CONNECT_FID |
+ OBD_CONNECT_AT | OBD_CONNECT_LOV_V3 |
OBD_CONNECT_VBR | OBD_CONNECT_FULL20 |
OBD_CONNECT_64BITHASH |
OBD_CONNECT_EINPROGRESS |
OBD_CONNECT_DISP_STRIPE | OBD_CONNECT_LFSCK |
OBD_CONNECT_OPEN_BY_FID |
OBD_CONNECT_DIR_STRIPE |
- OBD_CONNECT_BULK_MBITS |
+ OBD_CONNECT_BULK_MBITS | OBD_CONNECT_CKSUM |
OBD_CONNECT_SUBTREE |
- OBD_CONNECT_FLAGS2 | OBD_CONNECT_MULTIMODRPCS;
+ OBD_CONNECT_MULTIMODRPCS |
+ OBD_CONNECT_GRANT_PARAM | OBD_CONNECT_FLAGS2;
data->ocd_connect_flags2 = 0;
OBD_CONNECT_LARGE_ACL;
#endif
+ data->ocd_cksum_types = cksum_types_supported_client();
+
if (OBD_FAIL_CHECK(OBD_FAIL_MDC_LIGHTWEIGHT))
/* flag mdc connection as lightweight, only used for test
* purpose, use with care */
if (sbi->ll_flags & LL_SBI_ALWAYS_PING)
data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS;
-#ifdef HAVE_SECURITY_DENTRY_INIT_SECURITY
+#if defined(HAVE_SECURITY_DENTRY_INIT_SECURITY) && defined(CONFIG_SECURITY)
data->ocd_connect_flags2 |= OBD_CONNECT2_FILE_SECCTX;
#endif /* HAVE_SECURITY_DENTRY_INIT_SECURITY */
OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE |
OBD_CONNECT_LAYOUTLOCK |
OBD_CONNECT_PINGLESS | OBD_CONNECT_LFSCK |
- OBD_CONNECT_BULK_MBITS |
+ OBD_CONNECT_BULK_MBITS | OBD_CONNECT_SHORTIO |
OBD_CONNECT_FLAGS2;
/* The client currently advertises support for OBD_CONNECT_LOCKAHEAD_OLD so it
sbi->ll_dt_exp->exp_connect_data = *data;
+ /* Don't change value if it was specified in the config log */
+ if (sbi->ll_ra_info.ra_max_read_ahead_whole_pages == -1)
+ sbi->ll_ra_info.ra_max_read_ahead_whole_pages =
+ max_t(unsigned long, SBI_DEFAULT_READAHEAD_WHOLE_MAX,
+ (data->ocd_brw_size >> PAGE_SHIFT));
+
err = obd_fid_init(sbi->ll_dt_exp->exp_obd, sbi->ll_dt_exp,
LUSTRE_SEQ_METADATA);
if (err) {
memset(lli->lli_jobid, 0, LUSTRE_JOBID_SIZE);
}
-static inline int ll_bdi_register(struct backing_dev_info *bdi)
+#ifndef HAVE_SUPER_SETUP_BDI_NAME
+
+#define LSI_BDI_INITIALIZED 0x00400000
+
+#ifndef HAVE_BDI_CAP_MAP_COPY
+# define BDI_CAP_MAP_COPY 0
+#endif
+
+#define MAX_STRING_SIZE 128
+
+static int super_setup_bdi_name(struct super_block *sb, char *fmt, ...)
{
- static atomic_t ll_bdi_num = ATOMIC_INIT(0);
+ struct lustre_sb_info *lsi = s2lsi(sb);
+ char buf[MAX_STRING_SIZE];
+ va_list args;
+ int err;
+
+ err = bdi_init(&lsi->lsi_bdi);
+ if (err)
+ return err;
+
+ lsi->lsi_flags |= LSI_BDI_INITIALIZED;
+ lsi->lsi_bdi.capabilities = BDI_CAP_MAP_COPY;
+ lsi->lsi_bdi.name = "lustre";
+ va_start(args, fmt);
+ vsnprintf(buf, MAX_STRING_SIZE, fmt, args);
+ va_end(args);
+ err = bdi_register(&lsi->lsi_bdi, NULL, "%s", buf);
+ va_end(args);
+ if (!err)
+ sb->s_bdi = &lsi->lsi_bdi;
- bdi->name = "lustre";
- return bdi_register(bdi, NULL, "lustre-%d",
- atomic_inc_return(&ll_bdi_num));
+ return err;
}
+#endif /* !HAVE_SUPER_SETUP_BDI_NAME */
int ll_fill_super(struct super_block *sb, struct vfsmount *mnt)
{
if (err)
GOTO(out_free, err);
- err = bdi_init(&lsi->lsi_bdi);
- if (err)
- GOTO(out_free, err);
- lsi->lsi_flags |= LSI_BDI_INITIALIZED;
-#ifdef HAVE_BDI_CAP_MAP_COPY
- lsi->lsi_bdi.capabilities = BDI_CAP_MAP_COPY;
-#else
- lsi->lsi_bdi.capabilities = 0;
-#endif
- err = ll_bdi_register(&lsi->lsi_bdi);
+ err = super_setup_bdi_name(sb, "lustre-%p", sb);
if (err)
GOTO(out_free, err);
- sb->s_bdi = &lsi->lsi_bdi;
#ifndef HAVE_DCACHE_LOCK
/* kernel >= 2.6.38 store dentry operations in sb->s_d_op. */
sb->s_d_op = &ll_d_ops;
if (profilenm)
class_del_profile(profilenm);
+#ifndef HAVE_SUPER_SETUP_BDI_NAME
if (lsi->lsi_flags & LSI_BDI_INITIALIZED) {
bdi_destroy(&lsi->lsi_bdi);
lsi->lsi_flags &= ~LSI_BDI_INITIALIZED;
}
+#endif
ll_free_sbi(sb);
lsi->lsi_llsbi = NULL;
return lu_fid_eq(&ll_i2info(inode)->lli_fid, opaque);
}
+ int ll_dom_lock_cancel(struct inode *inode, struct ldlm_lock *lock)
+ {
+ struct lu_env *env;
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct cl_layout clt = { .cl_layout_gen = 0, };
+ int rc;
+ __u16 refcheck;
+
+
+ ENTRY;
+
+ if (!lli->lli_clob)
+ RETURN(0);
+
+ env = cl_env_get(&refcheck);
+ if (IS_ERR(env))
+ RETURN(PTR_ERR(env));
+
+ rc = cl_object_layout_get(env, lli->lli_clob, &clt);
+ if (rc) {
+ CDEBUG(D_INODE, "Cannot get layout for "DFID"\n",
+ PFID(ll_inode2fid(inode)));
+ rc = -ENODATA;
+ } else if (clt.cl_size == 0 || clt.cl_dom_comp_size == 0) {
+ CDEBUG(D_INODE, "DOM lock without DOM layout for "DFID"\n",
+ PFID(ll_inode2fid(inode)));
+ } else {
+ enum cl_fsync_mode mode;
+ loff_t end = clt.cl_dom_comp_size - 1;
+
+ mode = ldlm_is_discard_data(lock) ?
+ CL_FSYNC_DISCARD : CL_FSYNC_LOCAL;
+ rc = cl_sync_file_range(inode, 0, end, mode, 1);
+ truncate_inode_pages_range(inode->i_mapping, 0, end);
+ }
+ cl_env_put(env, &refcheck);
+ RETURN(rc);
+ }
+
int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
void *data, int flag)
{
struct inode *inode = ll_inode_from_resource_lock(lock);
__u64 bits = lock->l_policy_data.l_inodebits.bits;
- /* Inode is set to lock->l_resource->lr_lvb_inode
- * for mdc - bug 24555 */
- LASSERT(lock->l_ast_data == NULL);
-
if (inode == NULL)
break;
}
if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE |
- MDS_INODELOCK_LAYOUT | MDS_INODELOCK_PERM))
+ MDS_INODELOCK_LAYOUT | MDS_INODELOCK_PERM |
+ MDS_INODELOCK_DOM))
ll_have_md_lock(inode, &bits, LCK_MINMODE);
+ if (bits & MDS_INODELOCK_DOM) {
+ rc = ll_dom_lock_cancel(inode, lock);
+ if (rc < 0)
+ CDEBUG(D_INODE, "cannot flush DoM data "
+ DFID": rc = %d\n",
+ PFID(ll_inode2fid(inode)), rc);
+ lock_res_and_lock(lock);
+ ldlm_set_kms_ignore(lock);
+ unlock_res_and_lock(lock);
+ bits &= ~MDS_INODELOCK_DOM;
+ }
+
if (bits & MDS_INODELOCK_LAYOUT) {
struct cl_object_conf conf = {
.coc_opc = OBJECT_CONF_INVALIDATE,
static int ll_unlink(struct inode *dir, struct dentry *dchild)
{
struct qstr *name = &dchild->d_name;
- struct ptlrpc_request *request = NULL;
- struct md_op_data *op_data;
- int rc;
- ENTRY;
+ struct ptlrpc_request *request = NULL;
+ struct md_op_data *op_data;
+ struct mdt_body *body;
+ int rc;
+ ENTRY;
CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, dir="DFID"(%p)\n",
name->len, name->name, PFID(ll_inode2fid(dir)), dir);
- /*
- * XXX: unlink bind mountpoint maybe call to here,
- * just check it as vfs_unlink does.
- */
+ /*
+ * XXX: unlink bind mountpoint maybe call to here,
+ * just check it as vfs_unlink does.
+ */
if (unlikely(d_mountpoint(dchild)))
RETURN(-EBUSY);
if (IS_ERR(op_data))
RETURN(PTR_ERR(op_data));
- if (dchild->d_inode != NULL)
- op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
+ op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
op_data->op_fid2 = op_data->op_fid3;
rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request);
if (rc)
GOTO(out, rc);
- ll_update_times(request, dir);
- ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_UNLINK, 1);
+ /*
+ * The server puts attributes in on the last unlink, use them to update
+ * the link count so the inode can be freed immediately.
+ */
+ body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
+ if (body->mbo_valid & OBD_MD_FLNLINK)
+ set_nlink(dchild->d_inode, body->mbo_nlink);
- out:
- ptlrpc_req_finished(request);
- RETURN(rc);
+ ll_update_times(request, dir);
+ ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_UNLINK, 1);
+
+out:
+ ptlrpc_req_finished(request);
+ RETURN(rc);
}
static int ll_rename(struct inode *src, struct dentry *src_dchild,
mutex_lock(&lmv->lmv_init_mutex);
if ((index < lmv->tgts_size) && (lmv->tgts[index] != NULL)) {
tgt = lmv->tgts[index];
- CERROR("%s: UUID %s already assigned at LOV target index %d:"
+ CERROR("%s: UUID %s already assigned at LMV target index %d:"
" rc = %d\n", obd->obd_name,
obd_uuid2str(&tgt->ltd_uuid), index, -EEXIST);
mutex_unlock(&lmv->lmv_init_mutex);
{
struct file *filp;
__u32 i, j;
- int err, rc;
+ int err;
bool any_set = false;
- struct kkuc_ct_data kcd = { 0 };
+ struct kkuc_ct_data kcd = {
+ .kcd_magic = KKUC_CT_DATA_MAGIC,
+ .kcd_uuid = lmv->cluuid,
+ .kcd_archive = lk->lk_data
+ };
+ int rc = 0;
ENTRY;
+ filp = fget(lk->lk_wfd);
+ if (!filp)
+ RETURN(-EBADF);
+
+ rc = libcfs_kkuc_group_add(filp, lk->lk_uid, lk->lk_group,
+ &kcd, sizeof(kcd));
+ if (rc)
+ GOTO(err_fput, rc);
+
/* All or nothing: try to register to all MDS.
* In case of failure, unregister from previous MDS,
* except if it because of inactive target. */
if (tgt == NULL || tgt->ltd_exp == NULL)
continue;
+
err = obd_iocontrol(cmd, tgt->ltd_exp, len, lk, uarg);
if (err) {
if (tgt->ltd_active) {
obd_iocontrol(cmd, tgt->ltd_exp, len,
lk, uarg);
}
- RETURN(rc);
+ GOTO(err_kkuc_rem, rc);
}
/* else: transient error.
* kuc will register to the missing MDT
if (!any_set)
/* no registration done: return error */
- RETURN(-ENOTCONN);
+ GOTO(err_kkuc_rem, rc = -ENOTCONN);
- /* at least one registration done, with no failure */
- filp = fget(lk->lk_wfd);
- if (filp == NULL)
- RETURN(-EBADF);
+ RETURN(0);
- kcd.kcd_magic = KKUC_CT_DATA_MAGIC;
- kcd.kcd_uuid = lmv->cluuid;
- kcd.kcd_archive = lk->lk_data;
+err_kkuc_rem:
+ libcfs_kkuc_group_rem(lk->lk_uid, lk->lk_group);
- rc = libcfs_kkuc_group_add(filp, lk->lk_uid, lk->lk_group,
- &kcd, sizeof(kcd));
- if (rc != 0)
- fput(filp);
-
- RETURN(rc);
+err_fput:
+ fput(filp);
+ return rc;
}
RETURN(rc);
rc = lmv_early_cancel(exp, NULL, op_data, src_tgt->ltd_idx,
- LCK_EX, MDS_INODELOCK_FULL,
+ LCK_EX, MDS_INODELOCK_ELC,
MF_MDC_CANCEL_FID3);
if (rc != 0)
RETURN(rc);
struct lmv_tgt_desc *tgt;
rc = lmv_early_cancel(exp, NULL, op_data, src_tgt->ltd_idx,
- LCK_EX, MDS_INODELOCK_FULL,
+ LCK_EX, MDS_INODELOCK_ELC,
MF_MDC_CANCEL_FID4);
if (rc != 0)
RETURN(rc);
}
rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_idx, LCK_EX,
- MDS_INODELOCK_FULL, MF_MDC_CANCEL_FID3);
+ MDS_INODELOCK_ELC, MF_MDC_CANCEL_FID3);
if (rc != 0)
RETURN(rc);
}
pattern = le32_to_cpu(lmm->lmm_pattern);
- if (lov_pattern(pattern) != LOV_PATTERN_RAID0)
+ if (lov_pattern(pattern) != LOV_PATTERN_RAID0 &&
+ lov_pattern(pattern) != LOV_PATTERN_MDT)
GOTO(out, rc = -EINVAL);
lod_comp->llc_pattern = pattern;
if (!lod_comp_inited(lod_comp))
continue;
- if (!(lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED)) {
+ if (!(lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED) &&
+ !(lod_comp->llc_pattern & LOV_PATTERN_MDT)) {
rc = lod_initialize_objects(env, lo, objs, i);
if (rc)
GOTO(out, rc);
GOTO(out, rc = -EINVAL);
}
- magic = le32_to_cpu(lum->lmm_magic) & ~LOV_MAGIC_DEF;
+ magic = le32_to_cpu(lum->lmm_magic) & ~LOV_MAGIC_DEFINED;
if (magic != LOV_USER_MAGIC_V1 &&
magic != LOV_USER_MAGIC_V3 &&
magic != LOV_USER_MAGIC_SPECIFIC) {
}
stripe_offset = le16_to_cpu(lum->lmm_stripe_offset);
- if (!is_from_disk && stripe_offset != LOV_OFFSET_DEFAULT) {
+ if (!is_from_disk && stripe_offset != LOV_OFFSET_DEFAULT &&
+ lov_pattern(le32_to_cpu(lum->lmm_pattern)) != LOV_PATTERN_MDT) {
/* if offset is not within valid range [0, osts_size) */
if (stripe_offset >= d->lod_osts_size) {
CDEBUG(D_LAYOUT, "stripe offset %u >= bitmap size %u\n",
RETURN(-EINVAL);
}
- magic = le32_to_cpu(lum->lmm_magic) & ~LOV_MAGIC_DEF;
+ magic = le32_to_cpu(lum->lmm_magic) & ~LOV_MAGIC_DEFINED;
if (magic != LOV_USER_MAGIC_V1 &&
magic != LOV_USER_MAGIC_V3 &&
magic != LOV_USER_MAGIC_SPECIFIC &&
tmp.lb_buf = (char *)comp_v1 +
le32_to_cpu(ent->lcme_offset);
tmp.lb_len = le32_to_cpu(ent->lcme_size);
+
+ /* Checks for DoM entry in composite layout. */
+ lum = tmp.lb_buf;
+ if (lov_pattern(le32_to_cpu(lum->lmm_pattern)) ==
+ LOV_PATTERN_MDT) {
+ /* A DoM component can only be the first entry */
+ if (i > 0) {
+ CDEBUG(D_LAYOUT, "invalid DoM layout "
+ "entry found at %i index\n", i);
+ RETURN(-EINVAL);
+ }
+ stripe_size = le32_to_cpu(lum->lmm_stripe_size);
+ /* There is just one stripe on the MDT and it must
+ * cover the whole component size. */
+ if (stripe_size != prev_end) {
+ CDEBUG(D_LAYOUT, "invalid DoM layout "
+ "stripe size %u != %llu "
+ "(component size)\n",
+ stripe_size, prev_end);
+ RETURN(-EINVAL);
+ }
+ /* Check stripe size against the per-MDT limit */
+ if (stripe_size > d->lod_dom_max_stripesize) {
+ CDEBUG(D_LAYOUT, "DoM component size "
+ "%u is bigger than MDT limit "
+ "%u, check dom_max_stripesize"
+ " parameter\n",
+ stripe_size,
+ d->lod_dom_max_stripesize);
+ RETURN(-EINVAL);
+ }
+ }
rc = lod_verify_v1v3(d, &tmp, is_from_disk);
if (rc)
break;
void lod_fix_desc_pattern(__u32 *val)
{
/* from lov_setstripe */
- if ((*val != 0) && (*val != LOV_PATTERN_RAID0)) {
+ if ((*val != 0) && (*val != LOV_PATTERN_RAID0) &&
+ (*val != LOV_PATTERN_MDT)) {
LCONSOLE_WARN("Unknown stripe pattern: %#x\n", *val);
*val = 0;
}
}
if (v1->lmm_pattern != LOV_PATTERN_RAID0 &&
+ v1->lmm_pattern != LOV_PATTERN_MDT &&
v1->lmm_pattern != 0) {
lod_free_def_comp_entries(lds);
RETURN(-EINVAL);
lod_comp->llc_stripe_count = v1->lmm_stripe_count;
lod_comp->llc_stripe_size = v1->lmm_stripe_size;
lod_comp->llc_stripe_offset = v1->lmm_stripe_offset;
+ lod_comp->llc_pattern = v1->lmm_pattern;
pool = NULL;
if (v1->lmm_magic == LOV_USER_MAGIC_V3) {
&lds->lds_def_comp_entries[i];
CDEBUG(D_LAYOUT, "Inherite from default: size:%hu "
- "nr:%u offset:%u %s\n",
+ "nr:%u offset:%u pattern %#x %s\n",
def_comp->llc_stripe_size,
def_comp->llc_stripe_count,
def_comp->llc_stripe_offset,
+ def_comp->llc_pattern,
def_comp->llc_pool ?: "");
*obj_comp = *def_comp;
if (!lo->ldo_is_composite)
continue;
- if (obj_comp->llc_stripe_count <= 0)
+ if (obj_comp->llc_stripe_count <= 0 &&
+ obj_comp->llc_pattern != LOV_PATTERN_MDT)
obj_comp->llc_stripe_count =
desc->ld_default_stripe_count;
if (obj_comp->llc_stripe_size <= 0)
if (lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED)
lod_comp_set_init(lod_comp);
+ if (lov_pattern(lod_comp->llc_pattern) == LOV_PATTERN_MDT)
+ lod_comp_set_init(lod_comp);
+
if (lod_comp->llc_stripe == NULL)
continue;
if (buf && buf->lb_len) {
struct lov_user_md_v1 *v1 = buf->lb_buf;
- if (v1->lmm_magic != (LOV_MAGIC_DEF | LOV_MAGIC_COMP_V1) &&
- v1->lmm_magic !=
- __swab32(LOV_MAGIC_DEF | LOV_MAGIC_COMP_V1)) {
+ if (v1->lmm_magic != (LOV_MAGIC_DEFINED | LOV_MAGIC_COMP_V1) &&
+ v1->lmm_magic != __swab32(LOV_MAGIC_DEFINED |
+ LOV_MAGIC_COMP_V1)) {
CERROR("%s: the replay buffer of layout extend "
"(magic %#x) does not contain expected "
"composite layout.\n",
int rc = 0, i;
ENTRY;
- magic = le32_to_cpu(v1->lmm_magic) & ~LOV_MAGIC_DEF;
+ magic = le32_to_cpu(v1->lmm_magic) & ~LOV_MAGIC_DEFINED;
if (magic != LOV_MAGIC_V1 && magic != LOV_MAGIC_V3 &&
magic != LOV_MAGIC_COMP_V1)
lod_obj_set_pool(mo, i, pool_name);
if ((!mo->ldo_is_composite || lod_comp_inited(lod_comp)) &&
- !(lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED)) {
+ !(lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED) &&
+ !(lod_comp->llc_pattern & LOV_PATTERN_MDT)) {
rc = lod_initialize_objects(env, mo, objs, i);
if (rc)
GOTO(out, rc);
comp_v1 = buf->lb_buf;
magic = v1->lmm_magic;
- if (unlikely(le32_to_cpu(magic) & LOV_MAGIC_DEF)) {
+ if (unlikely(le32_to_cpu(magic) & LOV_MAGIC_DEFINED)) {
/* try to use as fully defined striping */
rc = lod_use_defined_striping(env, lo, buf);
RETURN(rc);
if (v1->lmm_pattern == 0)
v1->lmm_pattern = LOV_PATTERN_RAID0;
- if (lov_pattern(v1->lmm_pattern) != LOV_PATTERN_RAID0) {
+ if (lov_pattern(v1->lmm_pattern) != LOV_PATTERN_RAID0 &&
+ lov_pattern(v1->lmm_pattern) != LOV_PATTERN_MDT) {
CDEBUG(D_LAYOUT, "%s: invalid pattern: %x\n",
lod2obd(d)->obd_name, v1->lmm_pattern);
GOTO(free_comp, rc = -EINVAL);
}
lod_comp->llc_pattern = v1->lmm_pattern;
-
lod_comp->llc_stripe_size = desc->ld_default_stripe_size;
if (v1->lmm_stripe_size)
lod_comp->llc_stripe_size = v1->lmm_stripe_size;
lod_comp->llc_stripe_count = desc->ld_default_stripe_count;
- if (v1->lmm_stripe_count)
+ if (v1->lmm_stripe_count ||
+ lov_pattern(v1->lmm_pattern) == LOV_PATTERN_MDT)
lod_comp->llc_stripe_count = v1->lmm_stripe_count;
lod_comp->llc_stripe_offset = v1->lmm_stripe_offset;
lod_obj_set_pool(lo, i, pool_name);
+ LASSERT(ergo(lov_pattern(lod_comp->llc_pattern) ==
+ LOV_PATTERN_MDT, lod_comp->llc_stripe_count == 0));
+
if (pool_name == NULL)
continue;
if (lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED)
RETURN(0);
+ /* A Data-on-MDT component is being created */
+ if (lov_pattern(lod_comp->llc_pattern) == LOV_PATTERN_MDT)
+ RETURN(0);
+
if (likely(lod_comp->llc_stripe == NULL)) {
/*
* no striping has been created so far
* \retval 0 on success
* \retval negative error code if failed
*/
+ static int lod_dom_stripesize_seq_show(struct seq_file *m, void *v)
+ {
+ struct obd_device *dev = m->private;
+ struct lod_device *lod;
+
+ LASSERT(dev != NULL);
+ lod = lu2lod_dev(dev->obd_lu_dev);
+ seq_printf(m, "%u\n", lod->lod_dom_max_stripesize);
+ return 0;
+ }
+
+ /**
+ * Set default stripe size.
+ *
+ * \param[in] file proc file
+ * \param[in] buffer string containing the maximum number of bytes stored in
+ * each object before moving to the next object in the
+ * layout (if any)
+ * \param[in] count @buffer length
+ * \param[in] off unused for single entry
+ *
+ * \retval @count on success
+ * \retval negative error code if failed
+ */
+ static ssize_t
+ lod_dom_stripesize_seq_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *off)
+ {
+ struct seq_file *m = file->private_data;
+ struct obd_device *dev = m->private;
+ struct lod_device *lod;
+ __s64 val;
+ int rc;
+
+ LASSERT(dev != NULL);
+ lod = lu2lod_dev(dev->obd_lu_dev);
+ rc = lprocfs_str_with_units_to_s64(buffer, count, &val, '1');
+ if (rc)
+ return rc;
+ if (val < 0)
+ return -ERANGE;
+
+ /* DoM stripe size is limited to 1GB (see the 1ULL << 30 check below) */
+ if (val > (1ULL << 30))
+ return -ERANGE;
+ else if (val > 0)
+ lod_fix_desc_stripe_size(&val);
+
+ lod->lod_dom_max_stripesize = val;
+
+ return count;
+ }
+ LPROC_SEQ_FOPS(lod_dom_stripesize);
+
+ /**
+ * Show default stripe size.
+ *
+ * \param[in] m seq file
+ * \param[in] v unused for single entry
+ *
+ * \retval 0 on success
+ * \retval negative error code if failed
+ */
static int lod_stripesize_seq_show(struct seq_file *m, void *v)
{
struct obd_device *dev = m->private;
LASSERT(dev != NULL);
lod = lu2lod_dev(dev->obd_lu_dev);
- seq_printf(m, "%llu\n",
- lod->lod_desc.ld_default_stripe_offset);
+ seq_printf(m, "%lld\n", lod->lod_desc.ld_default_stripe_offset);
return 0;
}
LASSERT(dev != NULL);
lod = lu2lod_dev(dev->obd_lu_dev);
- rc = lprocfs_str_with_units_to_s64(buffer, count, &val, '1');
+ rc = lprocfs_str_to_s64(buffer, count, &val);
if (rc)
return rc;
if (val < -1)
.fops = &lod_qos_maxage_fops },
{ .name = "lmv_failout",
.fops = &lod_lmv_failout_fops },
+ {
+ .name = "dom_stripesize",
+ .fops = &lod_dom_stripesize_fops
+ },
{ NULL }
};
struct ldlm_request *lockreq;
struct ldlm_reply *lockrep;
struct ldlm_lock *lock;
+ struct mdt_body *body = NULL;
void *lvb_data = NULL;
__u32 lvb_len = 0;
+
ENTRY;
LASSERT(rc >= 0);
/* We know what to expect, so we do any byte flipping required here */
if (it_has_reply_body(it)) {
- struct mdt_body *body;
-
body = req_capsule_server_get(pill, &RMF_MDT_BODY);
if (body == NULL) {
CERROR ("Can't swab mdt_body\n");
* client still does this checking in case it's talking with an old
* server. - Jinshan */
lock = ldlm_handle2lock(lockh);
- if (lock != NULL && ldlm_has_layout(lock) && lvb_data != NULL &&
+ if (lock == NULL)
+ RETURN(rc);
+
+ if (ldlm_has_layout(lock) && lvb_data != NULL &&
!(lockrep->lock_flags & LDLM_FL_BLOCKED_MASK)) {
void *lmm;
ldlm_it2str(it->it_op), lvb_len);
OBD_ALLOC_LARGE(lmm, lvb_len);
- if (lmm == NULL) {
- LDLM_LOCK_PUT(lock);
- RETURN(-ENOMEM);
- }
+ if (lmm == NULL)
+ GOTO(out_lock, rc = -ENOMEM);
+
memcpy(lmm, lvb_data, lvb_len);
/* install lvb_data */
if (lmm != NULL)
OBD_FREE_LARGE(lmm, lvb_len);
}
- if (lock != NULL)
- LDLM_LOCK_PUT(lock);
+
+ if (ldlm_has_dom(lock)) {
+ LASSERT(lock->l_glimpse_ast == mdc_ldlm_glimpse_ast);
+
+ body = req_capsule_server_get(pill, &RMF_MDT_BODY);
+ if (!(body->mbo_valid & OBD_MD_DOM_SIZE)) {
+ LDLM_ERROR(lock, "%s: DoM lock without size.\n",
+ exp->exp_obd->obd_name);
+ GOTO(out_lock, rc = -EPROTO);
+ }
+
+ LDLM_DEBUG(lock, "DoM lock is returned by: %s, size: %llu",
+ ldlm_it2str(it->it_op), body->mbo_dom_size);
+
+ rc = mdc_fill_lvb(req, &lock->l_ost_lvb);
+ }
+ out_lock:
+ LDLM_LOCK_PUT(lock);
RETURN(rc);
}
rc = obd_get_request_slot(&obddev->u.cli);
if (rc != 0) {
mdc_put_mod_rpc_slot(req, it);
- mdc_clear_replay_flag(req, 0);
- ptlrpc_req_finished(req);
- RETURN(rc);
- }
- }
+ mdc_clear_replay_flag(req, 0);
+ ptlrpc_req_finished(req);
+ RETURN(rc);
+ }
+ }
- rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, policy, &flags, NULL,
+ /* With Data-on-MDT the glimpse callback is needed too.
+ * It is set here in advance but not in mdc_finish_enqueue()
+ * to avoid possible races. It is safe to have glimpse handler
+ * for non-DOM locks and costs nothing.*/
+ if (einfo->ei_cb_gl == NULL)
+ einfo->ei_cb_gl = mdc_ldlm_glimpse_ast;
+
+ rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, policy, &flags, NULL,
0, lvb_type, lockh, 0);
- if (!it) {
- /* For flock requests we immediatelly return without further
- delay and let caller deal with the rest, since rest of
- this function metadata processing makes no sense for flock
+ if (!it) {
+ /* For flock requests we immediatelly return without further
+ delay and let caller deal with the rest, since rest of
+ this function metadata processing makes no sense for flock
requests anyway. But in case of problem during comms with
Server (ETIMEDOUT) or any signal/kill attempt (EINTR), we
can not rely on caller and this mainly for F_UNLCKs
mdc_put_mod_rpc_slot(req, it);
if (rc < 0) {
- CDEBUG(D_INFO, "%s: ldlm_cli_enqueue failed: rc = %d\n",
- obddev->obd_name, rc);
+ CDEBUG(D_INFO,
+ "%s: ldlm_cli_enqueue "DFID":"DFID"=%s failed: rc = %d\n",
+ obddev->obd_name, PFID(&op_data->op_fid1),
+ PFID(&op_data->op_fid2), op_data->op_name ?: "", rc);
mdc_clear_replay_flag(req, rc);
ptlrpc_req_finished(req);
.ei_mode = it_to_lock_mode(it),
.ei_cb_bl = cb_blocking,
.ei_cb_cp = ldlm_completion_ast,
+ .ei_cb_gl = mdc_ldlm_glimpse_ast,
};
struct lustre_handle lockh;
int rc = 0;
RETURN(rc);
}
+ /* With Data-on-MDT the glimpse callback is needed too.
+ * It is set here in advance but not in mdc_finish_enqueue()
+ * to avoid possible races. It is safe to have glimpse handler
+ * for non-DOM locks and costs nothing.*/
+ if (minfo->mi_einfo.ei_cb_gl == NULL)
+ minfo->mi_einfo.ei_cb_gl = mdc_ldlm_glimpse_ast;
+
rc = ldlm_cli_enqueue(exp, &req, &minfo->mi_einfo, &res_id, &policy,
&flags, NULL, 0, LVB_T_NONE, &minfo->mi_lockh, 1);
if (rc < 0) {
#include <uapi/linux/lustre/lustre_param.h>
#include <lustre_swab.h>
#include <obd_class.h>
+ #include <lustre_osc.h>
#include "mdc_internal.h"
}
}
- if (opcode == MDS_REINT) {
- struct mdt_rec_setxattr *rec;
+ if (opcode == MDS_REINT) {
+ struct mdt_rec_setxattr *rec;
- CLASSERT(sizeof(struct mdt_rec_setxattr) ==
- sizeof(struct mdt_rec_reint));
+ CLASSERT(sizeof(struct mdt_rec_setxattr) ==
+ sizeof(struct mdt_rec_reint));
rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
rec->sx_opcode = REINT_SETXATTR;
rec->sx_fsuid = from_kuid(&init_user_ns, current_fsuid());
keylen, key, vallen, val, set);
RETURN(rc);
}
- if (KEY_IS(KEY_SPTLRPC_CONF)) {
- sptlrpc_conf_client_adapt(exp->exp_obd);
- RETURN(0);
- }
- if (KEY_IS(KEY_FLUSH_CTX)) {
- sptlrpc_import_flush_my_ctx(imp);
- RETURN(0);
- }
if (KEY_IS(KEY_CHANGELOG_CLEAR)) {
rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION,
keylen, key, vallen, val, set);
RETURN(0);
}
- CERROR("Unknown key %s\n", (char *)key);
- RETURN(-EINVAL);
+ rc = osc_set_info_async(env, exp, keylen, key, vallen, val, set);
+ RETURN(rc);
}
static int mdc_get_info(const struct lu_env *env, struct obd_export *exp,
static int mdc_import_event(struct obd_device *obd, struct obd_import *imp,
enum obd_import_event event)
{
+ struct client_obd *cli = &obd->u.cli;
int rc = 0;
LASSERT(imp->imp_obd == obd);
switch (event) {
-
- case IMP_EVENT_INACTIVE: {
- struct client_obd *cli = &obd->u.cli;
+ case IMP_EVENT_DISCON:
+ spin_lock(&cli->cl_loi_list_lock);
+ cli->cl_avail_grant = 0;
+ cli->cl_lost_grant = 0;
+ spin_unlock(&cli->cl_loi_list_lock);
+ break;
+ case IMP_EVENT_INACTIVE:
/*
* Flush current sequence to make client obtain new one
* from server in case of disconnect/reconnect.
rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE);
break;
- }
case IMP_EVENT_INVALIDATE: {
struct ldlm_namespace *ns = obd->obd_namespace;
+ struct lu_env *env;
+ __u16 refcheck;
ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
+ env = cl_env_get(&refcheck);
+ if (!IS_ERR(env)) {
+ /* Reset grants. All pages go to failing rpcs due to
+ * the invalid import.
+ */
+ osc_io_unplug(env, cli, NULL);
+
+ cfs_hash_for_each_nolock(ns->ns_rs_hash,
+ osc_ldlm_resource_invalidate,
+ env, 0);
+ cl_env_put(env, &refcheck);
+ ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
+ } else {
+ rc = PTR_ERR(env);
+ }
break;
}
case IMP_EVENT_ACTIVE:
if (rc == 0)
rc = mdc_kuc_reregister(imp);
break;
- case IMP_EVENT_OCD:
+ case IMP_EVENT_OCD: {
+ struct obd_connect_data *ocd = &imp->imp_connect_data;
+
+ if (OCD_HAS_FLAG(ocd, GRANT))
+ osc_init_grant(cli, ocd);
+
rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD);
break;
- case IMP_EVENT_DISCON:
+ }
case IMP_EVENT_DEACTIVATE:
case IMP_EVENT_ACTIVATE:
break;
EXIT;
}
- static int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg)
+ int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg)
{
- int rc;
+ int rc;
+
ENTRY;
- rc = ptlrpcd_addref();
+ rc = osc_setup_common(obd, cfg);
if (rc < 0)
RETURN(rc);
- rc = client_obd_setup(obd, cfg);
- if (rc)
- GOTO(err_ptlrpcd_decref, rc);
#ifdef CONFIG_PROC_FS
obd->obd_vars = lprocfs_mdc_obd_vars;
lprocfs_obd_setup(obd, false);
lprocfs_alloc_md_stats(obd, 0);
#endif
+
sptlrpc_lprocfs_cliobd_attach(obd);
ptlrpc_lprocfs_register_obd(obd);
if (rc) {
CERROR("%s: failed to setup llogging subsystems: rc = %d\n",
obd->obd_name, rc);
- GOTO(err_mdc_cleanup, rc);
+ GOTO(err_llog_cleanup, rc);
}
rc = mdc_changelog_cdev_init(obd);
if (rc) {
CERROR("%s: failed to setup changelog char device: rc = %d\n",
obd->obd_name, rc);
- GOTO(err_mdc_cleanup, rc);
+ GOTO(err_changelog_cleanup, rc);
}
- EXIT;
- err_mdc_cleanup:
- if (rc)
- client_obd_cleanup(obd);
+ RETURN(rc);
- err_ptlrpcd_decref:
- if (rc)
- ptlrpcd_decref();
+ err_changelog_cleanup:
+ mdc_llog_finish(obd);
+ err_llog_cleanup:
+ ptlrpc_lprocfs_unregister_obd(obd);
+ lprocfs_obd_cleanup(obd);
+ lprocfs_free_md_stats(obd);
- return rc;
+ osc_cleanup_common(obd);
+ return rc;
}
/* Initialize the default and maximum LOV EA sizes. This allows
{
ENTRY;
+ osc_precleanup_common(obd);
+
/* Failsafe, ok if racy */
if (obd->obd_type->typ_refcnt <= 1)
libcfs_kkuc_group_rem(0, KUC_GRP_HSM);
static int mdc_cleanup(struct obd_device *obd)
{
- ptlrpcd_decref();
-
- return client_obd_cleanup(obd);
+ return osc_cleanup_common(obd);
}
- static int mdc_process_config(struct obd_device *obd, size_t len, void *buf)
+ int mdc_process_config(struct obd_device *obd, size_t len, void *buf)
{
- struct lustre_cfg *lcfg = buf;
- int rc = class_process_proc_param(PARAM_MDC, obd->obd_vars, lcfg, obd);
+ struct lustre_cfg *lcfg = buf;
+ int rc;
+
+ rc = class_process_proc_param(PARAM_MDC, obd->obd_vars, lcfg, obd);
return (rc > 0 ? 0: rc);
}
.o_add_conn = client_import_add_conn,
.o_del_conn = client_import_del_conn,
.o_connect = client_connect_import,
- .o_disconnect = client_disconnect_export,
+ .o_reconnect = osc_reconnect,
+ .o_disconnect = osc_disconnect,
.o_iocontrol = mdc_iocontrol,
.o_set_info_async = mdc_set_info_async,
.o_statfs = mdc_statfs,
static int __init mdc_init(void)
{
return class_register_type(&mdc_obd_ops, &mdc_md_ops, true, NULL,
- LUSTRE_MDC_NAME, NULL);
+ LUSTRE_MDC_NAME, &mdc_device_type);
}
static void __exit mdc_exit(void)
#include <obd.h>
#include <obd_support.h>
#include <lustre_barrier.h>
-
+ #include <obd_cksum.h>
#include <llog_swab.h>
#include "mdt_internal.h"
{
struct ptlrpc_request *req = tgt_ses_req(tsi);
struct mdt_thread_info *info = tsi2mdt_info(tsi);
- struct md_device *next = info->mti_mdt->mdt_child;
+ struct mdt_device *mdt = info->mti_mdt;
+ struct tg_grants_data *tgd = &mdt->mdt_lut.lut_tgd;
struct ptlrpc_service_part *svcpt;
struct obd_statfs *osfs;
int rc;
if (!osfs)
GOTO(out, rc = -EPROTO);
- /** statfs information are cached in the mdt_device */
- if (cfs_time_before_64(info->mti_mdt->mdt_osfs_age,
- cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS))) {
- /** statfs data is too old, get up-to-date one */
- rc = next->md_ops->mdo_statfs(info->mti_env, next, osfs);
- if (rc)
- GOTO(out, rc);
- spin_lock(&info->mti_mdt->mdt_lock);
- info->mti_mdt->mdt_osfs = *osfs;
- info->mti_mdt->mdt_osfs_age = cfs_time_current_64();
- spin_unlock(&info->mti_mdt->mdt_lock);
- } else {
- /** use cached statfs data */
- spin_lock(&info->mti_mdt->mdt_lock);
- *osfs = info->mti_mdt->mdt_osfs;
- spin_unlock(&info->mti_mdt->mdt_lock);
- }
+ rc = tgt_statfs_internal(tsi->tsi_env, &mdt->mdt_lut, osfs,
+ cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+ NULL);
+ if (unlikely(rc))
+ GOTO(out, rc);
+ /* at least try to account for cached pages. It's still racy and
+ * might be under-reporting if clients haven't announced their
+ * caches with brw recently */
+ CDEBUG(D_SUPER | D_CACHE, "blocks cached %llu granted %llu"
+ " pending %llu free %llu avail %llu\n",
+ tgd->tgd_tot_dirty, tgd->tgd_tot_granted,
+ tgd->tgd_tot_pending,
+ osfs->os_bfree << tgd->tgd_blockbits,
+ osfs->os_bavail << tgd->tgd_blockbits);
+
+ osfs->os_bavail -= min_t(u64, osfs->os_bavail,
+ ((tgd->tgd_tot_dirty + tgd->tgd_tot_pending +
+ osfs->os_bsize - 1) >> tgd->tgd_blockbits));
+
+ tgt_grant_sanity_check(mdt->mdt_lu_dev.ld_obd, __func__);
+ CDEBUG(D_CACHE, "%llu blocks: %llu free, %llu avail; "
+ "%llu objects: %llu free; state %x\n",
+ osfs->os_blocks, osfs->os_bfree, osfs->os_bavail,
+ osfs->os_files, osfs->os_ffree, osfs->os_state);
+
+ if (!exp_grant_param_supp(tsi->tsi_exp) &&
+ tgd->tgd_blockbits > COMPAT_BSIZE_SHIFT) {
+ /* clients which don't support OBD_CONNECT_GRANT_PARAM
+ * should not see a block size > page size, otherwise
+ * cl_lost_grant goes mad. Therefore, we emulate a 4KB (=2^12)
+ * block size which is the biggest block size known to work
+ * with all clients' page sizes. */
+ osfs->os_blocks <<= tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT;
+ osfs->os_bfree <<= tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT;
+ osfs->os_bavail <<= tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT;
+ osfs->os_bsize = 1 << COMPAT_BSIZE_SHIFT;
+ }
if (rc == 0)
mdt_counter_incr(req, LPROC_MDT_STATFS);
out:
RETURN(rc);
}
+ /**
+ * Pack size attributes into the reply.
+ */
+ int mdt_pack_size2body(struct mdt_thread_info *info,
+ const struct lu_fid *fid, bool dom_lock)
+ {
+ struct mdt_body *b;
+ struct md_attr *ma = &info->mti_attr;
+ int dom_stripe;
+
+ ENTRY;
+
+ LASSERT(ma->ma_attr.la_valid & LA_MODE);
+
+ if (!S_ISREG(ma->ma_attr.la_mode) ||
+ !(ma->ma_valid & MA_LOV && ma->ma_lmm != NULL))
+ RETURN(-ENODATA);
+
+ dom_stripe = mdt_lmm_dom_entry(ma->ma_lmm);
+ /* no DoM stripe, no size in reply */
+ if (dom_stripe == LMM_NO_DOM)
+ RETURN(-ENOENT);
+
+ /* no DoM lock, no size in reply */
+ if (!dom_lock)
+ RETURN(0);
+
+ /* Either DoM lock exists or LMM has only DoM stripe then
+ * return size on body. */
+ b = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
+
+ mdt_dom_object_size(info->mti_env, info->mti_mdt, fid, b, dom_lock);
+ RETURN(0);
+ }
+
#ifdef CONFIG_FS_POSIX_ACL
/*
* Pack ACL data into the reply. UIDs/GIDs are mapped and filtered by nodemap.
/* if no object is allocated on osts, the size on mds is valid.
* b=22272 */
b->mbo_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
- } else if ((ma->ma_valid & MA_LOV) && ma->ma_lmm != NULL &&
- mdt_hsm_is_released(ma->ma_lmm)) {
- /* A released file stores its size on MDS. */
- /* But return 1 block for released file, unless tools like tar
- * will consider it fully sparse. (LU-3864)
- */
- if (unlikely(b->mbo_size == 0))
- b->mbo_blocks = 0;
- else
- b->mbo_blocks = 1;
- b->mbo_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
+ } else if ((ma->ma_valid & MA_LOV) && ma->ma_lmm != NULL) {
+ if (mdt_hsm_is_released(ma->ma_lmm)) {
+ /* A released file stores its size on MDS. */
+ /* But return 1 block for released file, unless tools
+ * like tar will consider it fully sparse. (LU-3864)
+ */
+ if (unlikely(b->mbo_size == 0))
+ b->mbo_blocks = 0;
+ else
+ b->mbo_blocks = 1;
+ b->mbo_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
+ }
}
if (fid != NULL && (b->mbo_valid & OBD_MD_FLSIZE))
/* layout lock must be granted in a best-effort way
* for IT operations */
LASSERT(!(child_bits & MDS_INODELOCK_LAYOUT));
- if (!OBD_FAIL_CHECK(OBD_FAIL_MDS_NO_LL_GETATTR) &&
- exp_connect_layout(info->mti_exp) &&
- S_ISREG(lu_object_attr(&child->mot_obj)) &&
+ if (S_ISREG(lu_object_attr(&child->mot_obj)) &&
!mdt_object_remote(child) && ldlm_rep != NULL) {
- /* try to grant layout lock for regular file. */
- try_bits = MDS_INODELOCK_LAYOUT;
+ if (!OBD_FAIL_CHECK(OBD_FAIL_MDS_NO_LL_GETATTR) &&
+ exp_connect_layout(info->mti_exp)) {
+ /* try to grant layout lock for regular file. */
+ try_bits = MDS_INODELOCK_LAYOUT;
+ }
+ /* Acquire DOM lock in advance for data-on-mdt file */
+ if (child != parent)
+ try_bits |= MDS_INODELOCK_DOM;
}
if (try_bits != 0) {
"Lock res_id: "DLDLMRES", fid: "DFID"\n",
PLDLMRES(lock->l_resource),
PFID(mdt_object_fid(child)));
+
+ if (S_ISREG(lu_object_attr(&child->mot_obj)) &&
+ mdt_object_exists(child) && !mdt_object_remote(child) &&
+ child != parent) {
+ LDLM_LOCK_PUT(lock);
+ mdt_object_put(info->mti_env, child);
+ /* NB: call the mdt_pack_size2body always after
+ * mdt_object_put(), that is why this special
+ * exit path is used. */
+ rc = mdt_pack_size2body(info, child_fid,
+ child_bits & MDS_INODELOCK_DOM);
+ if (rc != 0 && child_bits & MDS_INODELOCK_DOM) {
+ /* DOM lock was taken in advance but this is
+ * not DoM file. Drop the lock. */
+ lock_res_and_lock(lock);
+ ldlm_inodebits_drop(lock, MDS_INODELOCK_DOM);
+ unlock_res_and_lock(lock);
+ }
+
+ GOTO(out_parent, rc = 0);
+ }
}
if (lock)
LDLM_LOCK_PUT(lock);
}
/* this should sync this object */
- static int mdt_object_sync(struct mdt_thread_info *info)
+ static int mdt_object_sync(const struct lu_env *env, struct obd_export *exp,
+ struct mdt_object *mo)
{
- struct md_object *next;
int rc;
+
ENTRY;
- if (!mdt_object_exists(info->mti_object)) {
+ if (!mdt_object_exists(mo)) {
CWARN("%s: non existing object "DFID": rc = %d\n",
- mdt_obd_name(info->mti_mdt),
- PFID(mdt_object_fid(info->mti_object)), -ESTALE);
+ exp->exp_obd->obd_name, PFID(mdt_object_fid(mo)),
+ -ESTALE);
RETURN(-ESTALE);
}
- next = mdt_object_child(info->mti_object);
- rc = mo_object_sync(info->mti_env, next);
+
+ rc = mo_object_sync(env, mdt_object_child(mo));
RETURN(rc);
}
struct mdt_thread_info *info = tsi2mdt_info(tsi);
/* sync an object */
- rc = mdt_object_sync(info);
+ rc = mdt_object_sync(tsi->tsi_env, tsi->tsi_exp,
+ info->mti_object);
if (rc == 0) {
const struct lu_fid *fid;
struct lu_attr *la = &info->mti_attr.ma_attr;
RETURN(rc);
}
+ static int mdt_data_sync(struct tgt_session_info *tsi)
+ {
+ struct mdt_thread_info *info;
+ struct mdt_device *mdt = mdt_exp2dev(tsi->tsi_exp);
+ struct ost_body *body = tsi->tsi_ost_body;
+ struct ost_body *repbody;
+ struct mdt_object *mo = NULL;
+ struct md_attr *ma;
+ int rc = 0;
+
+ ENTRY;
+
+ repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY);
+
+ /* if no fid is specified then do nothing,
+ * device sync is done via MDS_SYNC */
+ if (fid_is_zero(&tsi->tsi_fid))
+ RETURN(0);
+
+ mo = mdt_object_find(tsi->tsi_env, mdt, &tsi->tsi_fid);
+ if (IS_ERR(mo))
+ RETURN(PTR_ERR(mo));
+
+ rc = mdt_object_sync(tsi->tsi_env, tsi->tsi_exp, mo);
+ if (rc)
+ GOTO(put, rc);
+
+ repbody->oa.o_oi = body->oa.o_oi;
+ repbody->oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
+
+ info = tsi2mdt_info(tsi);
+ ma = &info->mti_attr;
+ ma->ma_need = MA_INODE;
+ ma->ma_valid = 0;
+ rc = mdt_attr_get_complex(info, mo, ma);
+ if (rc == 0)
+ obdo_from_la(&repbody->oa, &ma->ma_attr, VALID_FLAGS);
+ else
+ rc = 0;
+ mdt_thread_info_fini(info);
+
+ EXIT;
+ put:
+ if (mo != NULL)
+ mdt_object_put(tsi->tsi_env, mo);
+ return rc;
+ }
+
/*
* Handle quota control requests to consult current usage/limit, but also
* to configure quota enforcement
* \param mode lock mode
* \param decref force immediate lock releasing
*/
- static void mdt_save_lock(struct mdt_thread_info *info, struct lustre_handle *h,
- enum ldlm_mode mode, int decref)
+ void mdt_save_lock(struct mdt_thread_info *info, struct lustre_handle *h,
+ enum ldlm_mode mode, int decref)
{
ENTRY;
MDT_IT_GETXATTR,
MDT_IT_LAYOUT,
MDT_IT_QUOTA,
- MDT_IT_NR
+ MDT_IT_GLIMPSE,
+ MDT_IT_BRW,
+ MDT_IT_NR
};
static int mdt_intent_getattr(enum mdt_it_code opcode,
- struct mdt_thread_info *info,
- struct ldlm_lock **,
- __u64);
+ struct mdt_thread_info *info,
+ struct ldlm_lock **, __u64);
static int mdt_intent_getxattr(enum mdt_it_code opcode,
struct mdt_thread_info *info,
struct mdt_thread_info *info,
struct ldlm_lock **,
__u64);
+ static int mdt_intent_glimpse(enum mdt_it_code opcode,
+ struct mdt_thread_info *info,
+ struct ldlm_lock **lockp, __u64 flags)
+ {
+ return mdt_glimpse_enqueue(info, info->mti_mdt->mdt_namespace,
+ lockp, flags);
+ }
+ static int mdt_intent_brw(enum mdt_it_code opcode,
+ struct mdt_thread_info *info,
+ struct ldlm_lock **lockp, __u64 flags)
+ {
+ return mdt_brw_enqueue(info, info->mti_mdt->mdt_namespace,
+ lockp, flags);
+ }
static struct mdt_it_flavor {
const struct req_format *it_fmt;
.it_fmt = &RQF_LDLM_INTENT_LAYOUT,
.it_flags = 0,
.it_act = mdt_intent_layout
- }
+ },
+ [MDT_IT_GLIMPSE] = {
+ .it_fmt = &RQF_LDLM_INTENT,
+ .it_flags = 0,
+ .it_act = mdt_intent_glimpse,
+ },
+ [MDT_IT_BRW] = {
+ .it_fmt = &RQF_LDLM_INTENT,
+ .it_flags = 0,
+ .it_act = mdt_intent_brw,
+ },
+
};
- static int
- mdt_intent_lock_replace(struct mdt_thread_info *info,
- struct ldlm_lock **lockp,
- struct mdt_lock_handle *lh,
- __u64 flags, int result)
+ int mdt_intent_lock_replace(struct mdt_thread_info *info,
+ struct ldlm_lock **lockp,
+ struct mdt_lock_handle *lh,
+ __u64 flags, int result)
{
struct ptlrpc_request *req = mdt_info_req(info);
struct ldlm_lock *lock = *lockp;
new_lock->l_export = class_export_lock_get(req->rq_export, new_lock);
new_lock->l_blocking_ast = lock->l_blocking_ast;
new_lock->l_completion_ast = lock->l_completion_ast;
+ if (ldlm_has_dom(new_lock))
+ new_lock->l_glimpse_ast = ldlm_server_glimpse_ast;
new_lock->l_remote_handle = lock->l_remote_handle;
new_lock->l_flags &= ~LDLM_FL_LOCAL;
RETURN(ELDLM_LOCK_REPLACED);
}
- static void mdt_intent_fixup_resent(struct mdt_thread_info *info,
- struct ldlm_lock *new_lock,
- struct mdt_lock_handle *lh,
- __u64 flags)
+ void mdt_intent_fixup_resent(struct mdt_thread_info *info,
+ struct ldlm_lock *new_lock,
+ struct mdt_lock_handle *lh, __u64 flags)
{
struct ptlrpc_request *req = mdt_info_req(info);
struct ldlm_request *dlmreq;
case IT_QUOTA_CONN:
rc = MDT_IT_QUOTA;
break;
+ case IT_GLIMPSE:
+ rc = MDT_IT_GLIMPSE;
+ break;
+ case IT_BRW:
+ rc = MDT_IT_BRW;
+ break;
default:
CERROR("Unknown intent opcode: 0x%08x\n", itcode);
rc = -EINVAL;
RETURN(rc);
}
+ static void mdt_ptlrpc_stats_update(struct ptlrpc_request *req,
+ enum ldlm_intent_flags it_opc)
+ {
+ struct lprocfs_stats *srv_stats = ptlrpc_req2svc(req)->srv_stats;
+
+ /* update stats when IT code is known */
+ if (srv_stats != NULL)
+ lprocfs_counter_incr(srv_stats,
+ PTLRPC_LAST_CNTR + (it_opc == IT_GLIMPSE ?
+ LDLM_GLIMPSE_ENQUEUE : LDLM_IBITS_ENQUEUE));
+ }
+
static int mdt_intent_policy(struct ldlm_namespace *ns,
struct ldlm_lock **lockp, void *req_cookie,
enum ldlm_mode mode, __u64 flags, void *data)
struct ptlrpc_request *req = req_cookie;
struct ldlm_intent *it;
struct req_capsule *pill;
+ const struct ldlm_lock_desc *ldesc;
int rc;
ENTRY;
tsi = tgt_ses_info(req->rq_svc_thread->t_env);
info = tsi2mdt_info(tsi);
- LASSERT(info != NULL);
- pill = info->mti_pill;
- LASSERT(pill->rc_req == req);
+ LASSERT(info != NULL);
+ pill = info->mti_pill;
+ LASSERT(pill->rc_req == req);
+ ldesc = &info->mti_dlm_req->lock_desc;
- if (req->rq_reqmsg->lm_bufcount > DLM_INTENT_IT_OFF) {
+ if (req->rq_reqmsg->lm_bufcount > DLM_INTENT_IT_OFF) {
req_capsule_extend(pill, &RQF_LDLM_INTENT_BASIC);
- it = req_capsule_client_get(pill, &RMF_LDLM_INTENT);
- if (it != NULL) {
- rc = mdt_intent_opc(it->opc, info, lockp, flags);
- if (rc == 0)
- rc = ELDLM_OK;
-
- /* Lock without inodebits makes no sense and will oops
- * later in ldlm. Let's check it now to see if we have
- * ibits corrupted somewhere in mdt_intent_opc().
- * The case for client miss to set ibits has been
- * processed by others. */
- LASSERT(ergo(info->mti_dlm_req->lock_desc.l_resource.\
- lr_type == LDLM_IBITS,
- info->mti_dlm_req->lock_desc.\
- l_policy_data.l_inodebits.bits != 0));
- } else
- rc = err_serious(-EFAULT);
- } else {
- /* No intent was provided */
- LASSERT(pill->rc_fmt == &RQF_LDLM_ENQUEUE);
+ it = req_capsule_client_get(pill, &RMF_LDLM_INTENT);
+ if (it != NULL) {
+ mdt_ptlrpc_stats_update(req, it->opc);
+ rc = mdt_intent_opc(it->opc, info, lockp, flags);
+ if (rc == 0)
+ rc = ELDLM_OK;
+
+ /* Lock without inodebits makes no sense and will oops
+ * later in ldlm. Let's check it now to see if we have
+ * ibits corrupted somewhere in mdt_intent_opc().
+ * The case for client miss to set ibits has been
+ * processed by others. */
+ LASSERT(ergo(ldesc->l_resource.lr_type == LDLM_IBITS,
+ ldesc->l_policy_data.l_inodebits.bits != 0));
+ } else {
+ rc = err_serious(-EFAULT);
+ }
+ } else {
+ /* No intent was provided */
req_capsule_set_size(pill, &RMF_DLM_LVB, RCL_SERVER, 0);
- rc = req_capsule_server_pack(pill);
- if (rc)
- rc = err_serious(rc);
- }
+ rc = req_capsule_server_pack(pill);
+ if (rc)
+ rc = err_serious(rc);
+ }
mdt_thread_info_fini(info);
RETURN(rc);
}
return rc;
}
+ #define OBD_FAIL_OST_READ_NET OBD_FAIL_OST_BRW_NET
+ #define OBD_FAIL_OST_WRITE_NET OBD_FAIL_OST_BRW_NET
+ #define OST_BRW_READ OST_READ
+ #define OST_BRW_WRITE OST_WRITE
+
static struct tgt_handler mdt_tgt_handlers[] = {
TGT_RPC_HANDLER(MDS_FIRST_OPC,
0, MDS_CONNECT, mdt_tgt_connect,
mdt_swap_layouts),
};
+ static struct tgt_handler mdt_io_ops[] = {
+ TGT_OST_HDL(HABEO_CORPUS | HABEO_REFERO, OST_BRW_READ, tgt_brw_read),
+ TGT_OST_HDL(HABEO_CORPUS | MUTABOR, OST_BRW_WRITE, tgt_brw_write),
+ TGT_OST_HDL(HABEO_CORPUS | HABEO_REFERO | MUTABOR,
+ OST_PUNCH, mdt_punch_hdl),
+ TGT_OST_HDL(HABEO_CORPUS | HABEO_REFERO, OST_SYNC, mdt_data_sync),
+ };
+
static struct tgt_handler mdt_sec_ctx_ops[] = {
TGT_SEC_HDL_VAR(0, SEC_CTX_INIT, mdt_sec_ctx_handle),
TGT_SEC_HDL_VAR(0, SEC_CTX_INIT_CONT,mdt_sec_ctx_handle),
.tos_opc_end = LFSCK_LAST_OPC,
.tos_hs = tgt_lfsck_handlers
},
-
+ {
+ .tos_opc_start = OST_FIRST_OPC,
+ .tos_opc_end = OST_LAST_OPC,
+ .tos_hs = mdt_io_ops
+ },
{
.tos_hs = NULL
}
static int mdt_postrecov(const struct lu_env *, struct mdt_device *);
static int mdt_init0(const struct lu_env *env, struct mdt_device *m,
- struct lu_device_type *ldt, struct lustre_cfg *cfg)
+ struct lu_device_type *ldt, struct lustre_cfg *cfg)
{
- struct mdt_thread_info *info;
- struct obd_device *obd;
+ const struct dt_device_param *dt_conf;
+ struct mdt_thread_info *info;
+ struct obd_device *obd;
+ const char *dev = lustre_cfg_string(cfg, 0);
+ const char *num = lustre_cfg_string(cfg, 2);
+ struct tg_grants_data *tgd = &m->mdt_lut.lut_tgd;
- const char *dev = lustre_cfg_string(cfg, 0);
- const char *num = lustre_cfg_string(cfg, 2);
- struct lustre_mount_info *lmi = NULL;
- struct lustre_sb_info *lsi;
- struct lu_site *s;
- struct seq_server_site *ss_site;
- const char *identity_upcall = "NONE";
- struct md_device *next;
- int rc;
- long node_id;
- mntopt_t mntopts;
- ENTRY;
+ struct lustre_mount_info *lmi = NULL;
+ struct lustre_sb_info *lsi;
+ struct lu_site *s;
+ struct seq_server_site *ss_site;
+ const char *identity_upcall = "NONE";
+ struct md_device *next;
+ int rc;
+ long node_id;
+ mntopt_t mntopts;
+ ENTRY;
lu_device_init(&m->mdt_lu_dev, ldt);
- /*
- * Environment (env) might be missing mdt_thread_key values at that
- * point, if device is allocated when mdt_thread_key is in QUIESCENT
- * mode.
- *
- * Usually device allocation path doesn't use module key values, but
- * mdt has to do a lot of work here, so allocate key value.
- */
- rc = lu_env_refill((struct lu_env *)env);
- if (rc != 0)
- RETURN(rc);
+ /*
+ * Environment (env) might be missing mdt_thread_key values at that
+ * point, if device is allocated when mdt_thread_key is in QUIESCENT
+ * mode.
+ *
+ * Usually device allocation path doesn't use module key values, but
+ * mdt has to do a lot of work here, so allocate key value.
+ */
+ rc = lu_env_refill((struct lu_env *)env);
+ if (rc != 0)
+ RETURN(rc);
- info = lu_context_key_get(&env->le_ctx, &mdt_thread_key);
- LASSERT(info != NULL);
+ info = lu_context_key_get(&env->le_ctx, &mdt_thread_key);
+ LASSERT(info != NULL);
- obd = class_name2obd(dev);
- LASSERT(obd != NULL);
+ obd = class_name2obd(dev);
+ LASSERT(obd != NULL);
- m->mdt_max_mdsize = MAX_MD_SIZE; /* 4 stripes */
+ m->mdt_max_mdsize = MAX_MD_SIZE; /* 4 stripes */
m->mdt_opts.mo_evict_tgt_nids = 1;
- m->mdt_opts.mo_cos = MDT_COS_DEFAULT;
+ m->mdt_opts.mo_cos = MDT_COS_DEFAULT;
lmi = server_get_mount(dev);
- if (lmi == NULL) {
- CERROR("Cannot get mount info for %s!\n", dev);
- RETURN(-EFAULT);
- } else {
- lsi = s2lsi(lmi->lmi_sb);
- /* CMD is supported only in IAM mode */
- LASSERT(num);
- node_id = simple_strtol(num, NULL, 10);
+ if (lmi == NULL) {
+ CERROR("Cannot get mount info for %s!\n", dev);
+ RETURN(-EFAULT);
+ } else {
+ lsi = s2lsi(lmi->lmi_sb);
+ /* CMD is supported only in IAM mode */
+ LASSERT(num);
+ node_id = simple_strtol(num, NULL, 10);
obd->u.obt.obt_magic = OBT_MAGIC;
if (lsi->lsi_lmd != NULL &&
lsi->lsi_lmd->lmd_flags & LMD_FLG_SKIP_LFSCK)
m->mdt_skip_lfsck = 1;
}
+ /* DoM files get IO lock at open by default */
+ m->mdt_opts.mo_dom_lock = 1;
+
m->mdt_squash.rsi_uid = 0;
m->mdt_squash.rsi_gid = 0;
INIT_LIST_HEAD(&m->mdt_squash.rsi_nosquash_nids);
init_rwsem(&m->mdt_squash.rsi_sem);
spin_lock_init(&m->mdt_lock);
- m->mdt_osfs_age = cfs_time_shift_64(-1000);
m->mdt_enable_remote_dir = 0;
m->mdt_enable_remote_dir_gid = 0;
s->ld_seq_site = ss_site;
ss_site->ss_lu = s;
- /* set server index */
+ /* set server index */
ss_site->ss_node_id = node_id;
/* failover is the default
* FIXME: we do not failout mds0/mgs, which may cause some problems.
* assumed whose ss_node_id == 0 XXX
* */
- obd->obd_replayable = 1;
- /* No connection accepted until configurations will finish */
- obd->obd_no_conn = 1;
+ obd->obd_replayable = 1;
+ /* No connection accepted until configurations will finish */
+ obd->obd_no_conn = 1;
if (cfg->lcfg_bufcount > 4 && LUSTRE_CFG_BUFLEN(cfg, 4) > 0) {
char *str = lustre_cfg_string(cfg, 4);
snprintf(info->mti_u.ns_name, sizeof(info->mti_u.ns_name), "%s-%s",
LUSTRE_MDT_NAME, obd->obd_uuid.uuid);
- m->mdt_namespace = ldlm_namespace_new(obd, info->mti_u.ns_name,
- LDLM_NAMESPACE_SERVER,
- LDLM_NAMESPACE_GREEDY,
- LDLM_NS_TYPE_MDT);
- if (m->mdt_namespace == NULL)
- GOTO(err_fini_seq, rc = -ENOMEM);
+ m->mdt_namespace = ldlm_namespace_new(obd, info->mti_u.ns_name,
+ LDLM_NAMESPACE_SERVER,
+ LDLM_NAMESPACE_GREEDY,
+ LDLM_NS_TYPE_MDT);
+ if (m->mdt_namespace == NULL)
+ GOTO(err_fini_seq, rc = -ENOMEM);
m->mdt_namespace->ns_lvbp = m;
m->mdt_namespace->ns_lvbo = &mdt_lvbo;
- ldlm_register_intent(m->mdt_namespace, mdt_intent_policy);
- /* set obd_namespace for compatibility with old code */
- obd->obd_namespace = m->mdt_namespace;
+ ldlm_register_intent(m->mdt_namespace, mdt_intent_policy);
+ /* set obd_namespace for compatibility with old code */
+ obd->obd_namespace = m->mdt_namespace;
rc = mdt_hsm_cdt_init(m);
if (rc != 0) {
CERROR("%s: error initializing coordinator, rc %d\n",
mdt_obd_name(m), rc);
- GOTO(err_free_ns, rc);
+ GOTO(err_free_ns, rc);
}
rc = tgt_init(env, &m->mdt_lut, obd, m->mdt_bottom, mdt_common_slice,
if (rc)
GOTO(err_free_hsm, rc);
+ /* Amount of available space excluded from granting and reserved
+ * for metadata. It is a percentage and 50% is the default value. */
+ tgd->tgd_reserved_pcnt = 50;
+
+ if (ONE_MB_BRW_SIZE < (1U << tgd->tgd_blockbits))
+ m->mdt_brw_size = 1U << tgd->tgd_blockbits;
+ else
+ m->mdt_brw_size = ONE_MB_BRW_SIZE;
+
rc = mdt_fs_setup(env, m, obd, lsi);
if (rc)
GOTO(err_tgt, rc);
tgt_adapt_sptlrpc_conf(&m->mdt_lut);
- next = m->mdt_child;
- rc = next->md_ops->mdo_iocontrol(env, next, OBD_IOC_GET_MNTOPT, 0,
- &mntopts);
- if (rc)
- GOTO(err_fs_cleanup, rc);
+ next = m->mdt_child;
+ dt_conf = next->md_ops->mdo_dtconf_get(env, next);
- if (mntopts & MNTOPT_USERXATTR)
- m->mdt_opts.mo_user_xattr = 1;
- else
- m->mdt_opts.mo_user_xattr = 0;
+ mntopts = dt_conf->ddp_mntopts;
- rc = next->md_ops->mdo_maxeasize_get(env, next, &m->mdt_max_ea_size);
- if (rc)
- GOTO(err_fs_cleanup, rc);
+ if (mntopts & MNTOPT_USERXATTR)
+ m->mdt_opts.mo_user_xattr = 1;
+ else
+ m->mdt_opts.mo_user_xattr = 0;
- if (mntopts & MNTOPT_ACL)
- m->mdt_opts.mo_acl = 1;
- else
- m->mdt_opts.mo_acl = 0;
+ m->mdt_max_ea_size = dt_conf->ddp_max_ea_size;
+
+ if (mntopts & MNTOPT_ACL)
+ m->mdt_opts.mo_acl = 1;
+ else
+ m->mdt_opts.mo_acl = 0;
/* XXX: to support suppgid for ACL, we enable identity_upcall
* by default, otherwise, maybe got unexpected -EACCESS. */
GOTO(err_fs_cleanup, rc);
}
- rc = mdt_procfs_init(m, dev);
- if (rc) {
- CERROR("Can't init MDT lprocfs, rc %d\n", rc);
- GOTO(err_recovery, rc);
- }
+ rc = mdt_procfs_init(m, dev);
+ if (rc) {
+ CERROR("Can't init MDT lprocfs, rc %d\n", rc);
+ GOTO(err_recovery, rc);
+ }
rc = mdt_quota_init(env, m, cfg);
if (rc)
* when the whole stack is complete and ready
* to serve the requests */
- /* Reduce the initial timeout on an MDS because it doesn't need such
- * a long timeout as an OST does. Adaptive timeouts will adjust this
- * value appropriately. */
- if (ldlm_timeout == LDLM_TIMEOUT_DEFAULT)
- ldlm_timeout = MDS_LDLM_TIMEOUT_DEFAULT;
+ /* Reduce the initial timeout on an MDS because it doesn't need such
+ * a long timeout as an OST does. Adaptive timeouts will adjust this
+ * value appropriately. */
+ if (ldlm_timeout == LDLM_TIMEOUT_DEFAULT)
+ ldlm_timeout = MDS_LDLM_TIMEOUT_DEFAULT;
- RETURN(0);
+ RETURN(0);
err_procfs:
mdt_procfs_fini(m);
err_recovery:
o->lo_ops = &mdt_obj_ops;
spin_lock_init(&mo->mot_write_lock);
mutex_init(&mo->mot_lov_mutex);
+ init_rwsem(&mo->mot_dom_sem);
init_rwsem(&mo->mot_open_sem);
atomic_set(&mo->mot_open_count, 0);
RETURN(o);
* \retval -EPROTO \a data unexpectedly has zero obd_connect_data::ocd_brw_size
* \retval -EBADE client and server feature requirements are incompatible
*/
- static int mdt_connect_internal(struct obd_export *exp,
+ static int mdt_connect_internal(const struct lu_env *env,
+ struct obd_export *exp,
struct mdt_device *mdt,
- struct obd_connect_data *data)
+ struct obd_connect_data *data, bool reconnect)
{
LASSERT(data != NULL);
data->ocd_connect_flags &= ~OBD_CONNECT_XATTR;
if (OCD_HAS_FLAG(data, BRW_SIZE)) {
- data->ocd_brw_size = min(data->ocd_brw_size, MD_MAX_BRW_SIZE);
+ data->ocd_brw_size = min(data->ocd_brw_size,
+ mdt->mdt_brw_size);
if (data->ocd_brw_size == 0) {
CERROR("%s: cli %s/%p ocd_connect_flags: %#llx "
"ocd_version: %x ocd_grant: %d ocd_index: %u "
}
}
+ if (OCD_HAS_FLAG(data, GRANT_PARAM)) {
+ struct dt_device_param *ddp = &mdt->mdt_lut.lut_dt_conf;
+
+ /* client is reporting its page size, for future use */
+ exp->exp_target_data.ted_pagebits = data->ocd_grant_blkbits;
+ data->ocd_grant_blkbits = mdt->mdt_lut.lut_tgd.tgd_blockbits;
+ /* ddp_inodespace may not be a power-of-two value, e.g. for ldiskfs
+ * it's LDISKFS_DIR_REC_LEN(20) = 28. */
+ data->ocd_grant_inobits = fls(ddp->ddp_inodespace - 1);
+ /* ocd_grant_tax_kb is in 1K byte blocks */
+ data->ocd_grant_tax_kb = ddp->ddp_extent_tax >> 10;
+ data->ocd_grant_max_blks = ddp->ddp_max_extent_blks;
+ }
+
+ if (OCD_HAS_FLAG(data, GRANT)) {
+ /* Save connect_data we have so far because tgt_grant_connect()
+ * uses it to calculate grant. */
+ exp->exp_connect_data = *data;
+ tgt_grant_connect(env, exp, data, !reconnect);
+ }
+
+ if (OCD_HAS_FLAG(data, MAXBYTES))
+ data->ocd_maxbytes = mdt->mdt_lut.lut_dt_conf.ddp_maxbytes;
+
/* NB: Disregard the rule against updating
* exp_connect_data.ocd_connect_flags in this case, since
* tgt_client_new() needs to know if this is a lightweight
spin_unlock(&exp->exp_lock);
}
+ if (OCD_HAS_FLAG(data, CKSUM)) {
+ __u32 cksum_types = data->ocd_cksum_types;
+
+ /* The client set in ocd_cksum_types the checksum types it
+ * supports. We have to mask off the algorithms that we don't
+ * support */
+ data->ocd_cksum_types &= cksum_types_supported_server();
+
+ if (unlikely(data->ocd_cksum_types == 0)) {
+ CERROR("%s: Connect with checksum support but no "
+ "ocd_cksum_types is set\n",
+ exp->exp_obd->obd_name);
+ RETURN(-EPROTO);
+ }
+
+ CDEBUG(D_RPCTRACE, "%s: cli %s supports cksum type %x, return "
+ "%x\n", exp->exp_obd->obd_name, obd_export_nid2str(exp),
+ cksum_types, data->ocd_cksum_types);
+ } else {
+ /* This client does not support OBD_CONNECT_CKSUM
+ * fall back to CRC32 */
+ CDEBUG(D_RPCTRACE, "%s: cli %s does not support "
+ "OBD_CONNECT_CKSUM, CRC32 will be used\n",
+ exp->exp_obd->obd_name, obd_export_nid2str(exp));
+ }
+
return 0;
}
static int mdt_obd_disconnect(struct obd_export *exp)
{
- int rc;
- ENTRY;
+ int rc;
- LASSERT(exp);
- class_export_get(exp);
+ ENTRY;
+
+ LASSERT(exp);
+ class_export_get(exp);
+
+ if (!(exp->exp_flags & OBD_OPT_FORCE))
+ tgt_grant_sanity_check(exp->exp_obd, __func__);
if ((exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS) &&
!(exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT)) {
if (rc != 0)
CDEBUG(D_IOCTL, "server disconnect error: rc = %d\n", rc);
+ tgt_grant_discard(exp);
+
rc = mdt_export_cleanup(exp);
nodemap_del_member(exp);
class_export_put(exp);
if (rc != 0 && rc != -EEXIST)
GOTO(out, rc);
- rc = mdt_connect_internal(lexp, mdt, data);
+ rc = mdt_connect_internal(env, lexp, mdt, data, false);
if (rc == 0) {
struct lsd_client_data *lcd = lexp->exp_target_data.ted_lcd;
if (rc != 0 && rc != -EEXIST)
RETURN(rc);
- rc = mdt_connect_internal(exp, mdt_dev(obd->obd_lu_dev), data);
+ rc = mdt_connect_internal(env, exp, mdt_dev(obd->obd_lu_dev), data,
+ true);
if (rc == 0)
mdt_export_stats_init(obd, exp, localdata);
else
LASSERT(list_empty(&exp->exp_outstanding_replies));
LASSERT(list_empty(&exp->exp_mdt_data.med_open_head));
+ /*
+ * discard grants once we're sure no more
+ * interaction with the client is possible
+ */
+ tgt_grant_discard(exp);
+ if (exp_connect_flags(exp) & OBD_CONNECT_GRANT)
+ exp->exp_obd->u.obt.obt_lut->lut_tgd.tgd_tot_granted_clients--;
+
+ if (!(exp->exp_flags & OBD_OPT_FORCE))
+ tgt_grant_sanity_check(exp->exp_obd, __func__);
+
RETURN(0);
}
.o_destroy_export = mdt_destroy_export,
.o_iocontrol = mdt_iocontrol,
.o_postrecov = mdt_obd_postrecov,
+ /* Data-on-MDT IO methods */
+ .o_preprw = mdt_obd_preprw,
+ .o_commitrw = mdt_obd_commitrw,
};
static struct lu_device* mdt_device_fini(const struct lu_env *env,
/* Remove archive on last unlink policy */
bool cdt_remove_archive_on_last_unlink;
+
+ bool cdt_wakeup_coordinator;
};
/* mdt state flag bits */
unsigned int mo_user_xattr:1,
mo_acl:1,
mo_cos:1,
- mo_evict_tgt_nids:1;
+ mo_evict_tgt_nids:1,
+ mo_dom_lock:1;
} mdt_opts;
/* mdt state flags */
unsigned long mdt_state;
int mdt_max_ea_size;
+ /* preferred BRW size, decided by storage type and capability */
+ __u32 mdt_brw_size;
+
struct upcall_cache *mdt_identity_cache;
unsigned int mdt_capa_conf:1,
/* lock for osfs and md_root */
spinlock_t mdt_lock;
- /* statfs optimization: we cache a bit */
- struct obd_statfs mdt_osfs;
- __u64 mdt_osfs_age;
-
/* root squash */
struct root_squash_info mdt_squash;
spinlock_t mot_write_lock;
/* Lock to protect create_data */
struct mutex mot_lov_mutex;
+ /* lock to protect read/write stages for Data-on-MDT files */
+ struct rw_semaphore mot_dom_sem;
/* Lock to protect lease open.
* Lease open acquires write lock; normal open acquires read lock */
struct rw_semaphore mot_open_sem;
#define MDT_EREMOTE_OPEN (EREMOTE + 1024)
struct mdt_reint_record {
- mdt_reint_t rr_opcode;
+ enum mds_reint_op rr_opcode;
const struct lustre_handle *rr_handle;
const struct lu_fid *rr_fid1;
const struct lu_fid *rr_fid2;
return exp_connect_flags(exp) & OBD_CONNECT_DIR_STRIPE;
}
+ /*
+  * Result of scanning a layout for a Data-on-MDT (DoM) component:
+  *   LMM_NO_DOM   - layout has no DoM component
+  *   LMM_DOM_ONLY - DoM component only, no instantiated OST objects
+  *   LMM_DOM_OST  - DoM component plus at least one instantiated
+  *                  OST object
+  */
+ enum {
+ LMM_NO_DOM,
+ LMM_DOM_ONLY,
+ LMM_DOM_OST
+ };
+
+ /* XXX Look into layout in MDT layer. This must be done in LOD. */
+ /*
+  * Classify the DoM state of layout @lmm, returning one of the
+  * LMM_* values above. Only composite layouts (LOV_MAGIC_COMP_V1)
+  * can carry a DoM entry; any other magic yields LMM_NO_DOM.
+  */
+ static inline int mdt_lmm_dom_entry(struct lov_mds_md *lmm)
+ {
+ struct lov_comp_md_v1 *comp_v1;
+ struct lov_mds_md *v1;
+ int i;
+
+ if (lmm->lmm_magic == LOV_MAGIC_COMP_V1) {
+ comp_v1 = (struct lov_comp_md_v1 *)lmm;
+ v1 = (struct lov_mds_md *)((char *)comp_v1 +
+ comp_v1->lcm_entries[0].lcme_offset);
+ /* DoM entry is the first entry always */
+ if (lov_pattern(v1->lmm_pattern) != LOV_PATTERN_MDT)
+ return LMM_NO_DOM;
+
+ /* Scan the remaining entries for instantiated OST objects. */
+ for (i = 1; i < comp_v1->lcm_entry_count; i++) {
+ int j;
+
+ v1 = (struct lov_mds_md *)((char *)comp_v1 +
+ comp_v1->lcm_entries[i].lcme_offset);
+ for (j = 0; j < v1->lmm_stripe_count; j++) {
+ /* if there is any object on OST
+  * (l_ost_idx == (__u32)-1 presumably marks an
+  * uninstantiated object -- confirm) */
+ if (v1->lmm_objects[j].l_ost_idx !=
+ (__u32)-1UL)
+ return LMM_DOM_OST;
+ }
+ }
+ return LMM_DOM_ONLY;
+ }
+ return LMM_NO_DOM;
+ }
+
__u64 mdt_get_disposition(struct ldlm_reply *rep, __u64 op_flag);
void mdt_set_disposition(struct mdt_thread_info *info,
struct ldlm_reply *rep, __u64 op_flag);
void mdt_object_unlock(struct mdt_thread_info *info, struct mdt_object *mo,
struct mdt_lock_handle *lh, int decref);
+ void mdt_save_lock(struct mdt_thread_info *info, struct lustre_handle *h,
+ enum ldlm_mode mode, int decref);
struct mdt_object *mdt_object_new(const struct lu_env *env,
struct mdt_device *,
struct mdt_object *o, struct lu_nodemap *nodemap);
#endif
void mdt_pack_attr2body(struct mdt_thread_info *info, struct mdt_body *b,
- const struct lu_attr *attr, const struct lu_fid *fid);
-
+ const struct lu_attr *attr, const struct lu_fid *fid);
+ int mdt_pack_size2body(struct mdt_thread_info *info,
+ const struct lu_fid *fid, bool dom_lock);
int mdt_getxattr(struct mdt_thread_info *info);
int mdt_reint_setxattr(struct mdt_thread_info *info,
struct mdt_lock_handle *lh);
struct mdt_thread_info *mti);
void mdt_thread_info_fini(struct mdt_thread_info *mti);
struct mdt_thread_info *tsi2mdt_info(struct tgt_session_info *tsi);
+ void mdt_intent_fixup_resent(struct mdt_thread_info *info,
+ struct ldlm_lock *new_lock,
+ struct mdt_lock_handle *lh, __u64 flags);
+ int mdt_intent_lock_replace(struct mdt_thread_info *info,
+ struct ldlm_lock **lockp,
+ struct mdt_lock_handle *lh,
+ __u64 flags, int result);
int mdt_hsm_attr_set(struct mdt_thread_info *info, struct mdt_object *obj,
const struct md_hsm *mh);
int mdt_blocking_ast(struct ldlm_lock*, struct ldlm_lock_desc*, void*, int);
+ /*
+  * Glimpse AST for Data-on-MDT IBITS locks taken locally by the
+  * server (see mdt_fid_lock()). Always answers -ELDLM_NO_LOCK_DATA,
+  * i.e. this lock holder has no size data to report.
+  * NOTE(review): the glimpse issuer is then presumably expected to
+  * fall back to the server-side LVB -- confirm against ldlm glimpse
+  * handling.
+  */
+ static int mdt_dom_glimpse_ast(struct ldlm_lock *lock, void *reqp)
+ {
+ return -ELDLM_NO_LOCK_DATA;
+ }
+
/* Issues dlm lock on passed @ns, @f stores it lock handle into @lh. */
static inline int mdt_fid_lock(struct ldlm_namespace *ns,
struct lustre_handle *lh, enum ldlm_mode mode,
__u64 flags, const __u64 *client_cookie)
{
int rc;
+ bool glimpse = policy->l_inodebits.bits & MDS_INODELOCK_DOM;
LASSERT(ns != NULL);
LASSERT(lh != NULL);
rc = ldlm_cli_enqueue_local(ns, res_id, LDLM_IBITS, policy,
mode, &flags, mdt_blocking_ast,
- ldlm_completion_ast, NULL, NULL, 0,
- LVB_T_NONE, client_cookie, lh);
+ ldlm_completion_ast,
+ glimpse ? mdt_dom_glimpse_ast : NULL,
+ NULL, 0, LVB_T_NONE, client_cookie, lh);
return rc == ELDLM_OK ? 0 : -EIO;
}
/* mdt_lvb.c */
extern struct ldlm_valblock_ops mdt_lvbo;
+ int mdt_dom_lvb_is_valid(struct ldlm_resource *res);
+ int mdt_dom_lvbo_update(struct ldlm_resource *res, struct ldlm_lock *lock,
+ struct ptlrpc_request *req, bool increase_only);
void mdt_enable_cos(struct mdt_device *, int);
int mdt_cos_is_enabled(struct mdt_device *);
LPROC_MDT_SETXATTR,
LPROC_MDT_STATFS,
LPROC_MDT_SYNC,
- LPROC_MDT_SAMEDIR_RENAME,
- LPROC_MDT_CROSSDIR_RENAME,
- LPROC_MDT_LAST,
+ LPROC_MDT_SAMEDIR_RENAME,
+ LPROC_MDT_CROSSDIR_RENAME,
+ LPROC_MDT_IO_READ,
+ LPROC_MDT_IO_WRITE,
+ LPROC_MDT_IO_PUNCH,
+ LPROC_MDT_LAST,
};
void mdt_counter_incr(struct ptlrpc_request *req, int opcode);
void mdt_stats_counter_init(struct lprocfs_stats *stats);
return jobid;
}
+ /* MDT IO */
+
+ #define VALID_FLAGS (LA_TYPE | LA_MODE | LA_SIZE | LA_BLOCKS | \
+ LA_BLKSIZE | LA_ATIME | LA_MTIME | LA_CTIME)
+
+ int mdt_obd_preprw(const struct lu_env *env, int cmd, struct obd_export *exp,
+ struct obdo *oa, int objcount, struct obd_ioobj *obj,
+ struct niobuf_remote *rnb, int *nr_local,
+ struct niobuf_local *lnb);
+
+ int mdt_obd_commitrw(const struct lu_env *env, int cmd, struct obd_export *exp,
+ struct obdo *oa, int objcount, struct obd_ioobj *obj,
+ struct niobuf_remote *rnb, int npages,
+ struct niobuf_local *lnb, int old_rc);
+ int mdt_punch_hdl(struct tgt_session_info *tsi);
+ int mdt_glimpse_enqueue(struct mdt_thread_info *mti, struct ldlm_namespace *ns,
+ struct ldlm_lock **lockp, __u64 flags);
+ int mdt_brw_enqueue(struct mdt_thread_info *info, struct ldlm_namespace *ns,
+ struct ldlm_lock **lockp, __u64 flags);
+ void mdt_dom_discard_data(struct mdt_thread_info *info,
+ const struct lu_fid *fid);
+ int mdt_dom_disk_lvbo_update(const struct lu_env *env, struct mdt_object *mo,
+ struct ldlm_resource *res, bool increase_only);
+ void mdt_dom_obj_lvb_update(const struct lu_env *env, struct mdt_object *mo,
+ bool increase_only);
+ int mdt_dom_lvb_alloc(struct ldlm_resource *res);
+
+ /*
+  * If object @mo is being finally destroyed (lu object is dying) and
+  * is a regular file, discard Data-on-MDT data for it via
+  * mdt_dom_discard_data(). Called from close/unlink/rename paths.
+  */
+ static inline void mdt_dom_check_and_discard(struct mdt_thread_info *mti,
+ struct mdt_object *mo)
+ {
+ if (lu_object_is_dying(&mo->mot_header) &&
+ S_ISREG(lu_object_attr(&mo->mot_obj)))
+ mdt_dom_discard_data(mti, mdt_object_fid(mo));
+ }
+
+ int mdt_dom_object_size(const struct lu_env *env, struct mdt_device *mdt,
+ const struct lu_fid *fid, struct mdt_body *mb,
+ bool dom_lock);
+ bool mdt_dom_client_has_lock(struct mdt_thread_info *info,
+ const struct lu_fid *fid);
+ /* grants */
+ long mdt_grant_connect(const struct lu_env *env, struct obd_export *exp,
+ u64 want, bool conservative);
+ extern struct kmem_cache *ldlm_glimpse_work_kmem;
+
#endif /* _MDT_INTERNAL_H */
struct ptlrpc_service *mds_mdsc_service;
struct ptlrpc_service *mds_mdss_service;
struct ptlrpc_service *mds_fld_service;
+ struct ptlrpc_service *mds_io_service;
struct mutex mds_health_mutex;
- struct kset *mds_kset;
};
/*
module_param(mds_num_threads, ulong, 0444);
MODULE_PARM_DESC(mds_num_threads, "number of MDS service threads to start");
+ int mds_max_io_threads = 512;
+ module_param(mds_max_io_threads, int, 0444);
+ MODULE_PARM_DESC(mds_max_io_threads, "maximum number of MDS IO service threads");
+
static char *mds_num_cpts;
module_param(mds_num_cpts, charp, 0444);
MODULE_PARM_DESC(mds_num_cpts, "CPU partitions MDS threads should run on");
ptlrpc_unregister_service(m->mds_fld_service);
m->mds_fld_service = NULL;
}
+ if (m->mds_io_service != NULL) {
+ ptlrpc_unregister_service(m->mds_io_service);
+ m->mds_io_service = NULL;
+ }
mutex_unlock(&m->mds_health_mutex);
EXIT;
.so_hpreq_handler = ptlrpc_hpreq_handler,
},
};
- m->mds_regular_service = ptlrpc_register_service(&conf, m->mds_kset,
+ m->mds_regular_service = ptlrpc_register_service(&conf, &obd->obd_kset,
procfs_entry);
if (IS_ERR(m->mds_regular_service)) {
rc = PTR_ERR(m->mds_regular_service);
.so_req_printer = target_print_req,
},
};
- m->mds_readpage_service = ptlrpc_register_service(&conf, m->mds_kset,
+ m->mds_readpage_service = ptlrpc_register_service(&conf, &obd->obd_kset,
procfs_entry);
if (IS_ERR(m->mds_readpage_service)) {
rc = PTR_ERR(m->mds_readpage_service);
.so_hpreq_handler = NULL,
},
};
- m->mds_setattr_service = ptlrpc_register_service(&conf, m->mds_kset,
+ m->mds_setattr_service = ptlrpc_register_service(&conf, &obd->obd_kset,
procfs_entry);
if (IS_ERR(m->mds_setattr_service)) {
rc = PTR_ERR(m->mds_setattr_service);
.so_hpreq_handler = NULL,
},
};
- m->mds_out_service = ptlrpc_register_service(&conf, m->mds_kset,
+ m->mds_out_service = ptlrpc_register_service(&conf, &obd->obd_kset,
procfs_entry);
if (IS_ERR(m->mds_out_service)) {
rc = PTR_ERR(m->mds_out_service);
.so_hpreq_handler = NULL,
},
};
- m->mds_mdsc_service = ptlrpc_register_service(&conf, m->mds_kset,
+ m->mds_mdsc_service = ptlrpc_register_service(&conf, &obd->obd_kset,
procfs_entry);
if (IS_ERR(m->mds_mdsc_service)) {
rc = PTR_ERR(m->mds_mdsc_service);
.so_hpreq_handler = NULL,
},
};
- m->mds_mdss_service = ptlrpc_register_service(&conf, m->mds_kset,
+ m->mds_mdss_service = ptlrpc_register_service(&conf, &obd->obd_kset,
procfs_entry);
if (IS_ERR(m->mds_mdss_service)) {
rc = PTR_ERR(m->mds_mdss_service);
.so_hpreq_handler = NULL,
},
};
- m->mds_fld_service = ptlrpc_register_service(&conf, m->mds_kset,
+ m->mds_fld_service = ptlrpc_register_service(&conf, &obd->obd_kset,
procfs_entry);
if (IS_ERR(m->mds_fld_service)) {
rc = PTR_ERR(m->mds_fld_service);
GOTO(err_mds_svc, rc);
}
- m->mds_io_service = ptlrpc_register_service(&conf, m->mds_kset,
+ memset(&conf, 0, sizeof(conf));
+ conf = (typeof(conf)) {
+ .psc_name = LUSTRE_MDT_NAME "_io",
+ .psc_watchdog_factor = MDT_SERVICE_WATCHDOG_FACTOR,
+ .psc_buf = {
+ .bc_nbufs = OST_NBUFS,
+ .bc_buf_size = OST_IO_BUFSIZE,
+ .bc_req_max_size = OST_IO_MAXREQSIZE,
+ .bc_rep_max_size = OST_IO_MAXREPSIZE,
+ .bc_req_portal = MDS_IO_PORTAL,
+ .bc_rep_portal = MDC_REPLY_PORTAL,
+ },
+ .psc_thr = {
+ .tc_thr_name = "ll_mdt_io",
+ .tc_thr_factor = OSS_THR_FACTOR,
+ .tc_nthrs_init = OSS_NTHRS_INIT,
+ .tc_nthrs_base = OSS_NTHRS_BASE,
+ .tc_nthrs_max = mds_max_io_threads,
+ .tc_cpu_affinity = 1,
+ .tc_ctx_tags = LCT_DT_THREAD | LCT_MD_THREAD,
+ },
+ .psc_ops = {
+ .so_thr_init = tgt_io_thread_init,
+ .so_thr_done = tgt_io_thread_done,
+ .so_req_handler = tgt_request_handle,
+ .so_req_printer = target_print_req,
+ },
+ };
+ m->mds_io_service = ptlrpc_register_service(&conf, &obd->obd_kset,
+ procfs_entry);
+ if (IS_ERR(m->mds_io_service)) {
+ rc = PTR_ERR(m->mds_io_service);
+ CERROR("failed to start MDT I/O service: %d\n", rc);
+ m->mds_io_service = NULL;
+ GOTO(err_mds_svc, rc);
+ }
+
EXIT;
err_mds_svc:
if (rc)
ENTRY;
mds_stop_ptlrpc_service(m);
- lprocfs_kset_unregister(obd, m->mds_kset);
+ lprocfs_obd_cleanup(obd);
RETURN(NULL);
}
/* set this lu_device to obd, because error handling need it */
obd->obd_lu_dev = l;
- rc = lprocfs_kset_register(obd, &m->mds_kset);
+ rc = lprocfs_obd_setup(obd, true);
if (rc != 0) {
mds_device_free(env, l);
l = ERR_PTR(rc);
rc = mds_start_ptlrpc_service(m);
if (rc != 0) {
- lprocfs_kset_unregister(obd, m->mds_kset);
+ lprocfs_obd_cleanup(obd);
mds_device_free(env, l);
l = ERR_PTR(rc);
return l;
rc |= ptlrpc_service_health_check(mds->mds_mdsc_service);
rc |= ptlrpc_service_health_check(mds->mds_mdss_service);
rc |= ptlrpc_service_health_check(mds->mds_fld_service);
+ rc |= ptlrpc_service_health_check(mds->mds_io_service);
mutex_unlock(&mds->mds_health_mutex);
return rc != 0 ? 1 : 0;
{
struct md_attr *ma = &info->mti_attr;
__u64 open_flags = info->mti_spec.sp_cr_flags;
+ __u64 trybits = 0;
enum ldlm_mode lm = LCK_CR;
bool acq_lease = !!(open_flags & MDS_OPEN_LEASE);
bool try_layout = false;
bool create_layout = false;
int rc = 0;
+ int dom_stripes = LMM_NO_DOM;
+ bool dom_lock = false;
+
ENTRY;
*ibits = 0;
if (exp_connect_layout(info->mti_exp) && !create_layout &&
ma->ma_need & MA_LOV)
try_layout = true;
+
+ /* DoM files can have just MDT stripe or combined MDT + OST
+ * stripes.
+ * - In the first case the open for read/write will do IO to
+ * the MDT stripe and it makes sense to take IO lock in
+ * advance along with OPEN even if it is blocking lock.
+ * - In the second case it is just size of MDT stripe and it
+ * is quite unlikely that client will write into it, though
+ * it may read it. So IO lock will be taken optionally if it
+ * is non-blocking one.
+ */
+ if (ma->ma_valid & MA_LOV && ma->ma_lmm != NULL)
+ dom_stripes = mdt_lmm_dom_entry(ma->ma_lmm);
+
+ if (dom_stripes == LMM_DOM_ONLY &&
+ info->mti_mdt->mdt_opts.mo_dom_lock != 0 &&
+ !mdt_dom_client_has_lock(info, mdt_object_fid(obj)))
+ dom_lock = true;
}
if (acq_lease) {
try_layout = false;
lhc = &info->mti_lh[MDT_LH_LOCAL];
+ } else if (dom_lock) {
+ lm = (open_flags & FMODE_WRITE) ? LCK_PW : LCK_PR;
+ *ibits = MDS_INODELOCK_DOM;
+ try_layout = false;
}
+
CDEBUG(D_INODE, "normal open:"DFID" lease count: %d, lm: %d\n",
PFID(mdt_object_fid(obj)),
atomic_read(&obj->mot_open_count), lm);
* lock for each open.
* However this is a double-edged sword because changing
* permission will revoke huge # of LOOKUP locks. */
- rc = mdt_object_lock_try(info, obj, lhc, ibits,
- MDS_INODELOCK_LAYOUT |
- MDS_INODELOCK_LOOKUP, false);
- } else if (*ibits != 0) {
- rc = mdt_object_lock(info, obj, lhc, *ibits);
+ trybits |= MDS_INODELOCK_LAYOUT | MDS_INODELOCK_LOOKUP;
}
- CDEBUG(D_INODE, "%s: Requested bits lock:"DFID ", ibits = %#llx"
+ if (trybits != 0)
+ rc = mdt_object_lock_try(info, obj, lhc, ibits, trybits, false);
+ else if (*ibits != 0)
+ rc = mdt_object_lock(info, obj, lhc, *ibits);
+
+ CDEBUG(D_INODE, "%s: Requested bits lock:"DFID ", ibits = %#llx/%#llx"
", open_flags = %#llo, try_layout = %d : rc = %d\n",
mdt_obd_name(info->mti_mdt), PFID(mdt_object_fid(obj)),
- *ibits, open_flags, try_layout, rc);
+ *ibits, trybits, open_flags, try_layout, rc);
/* will change layout, revoke layout locks by enqueuing EX lock. */
if (rc == 0 && create_layout) {
if (ibits == 0 || rc == -MDT_EREMOTE_OPEN)
RETURN_EXIT;
- if (!(open_flags & MDS_OPEN_LOCK) && !(ibits & MDS_INODELOCK_LAYOUT)) {
+ if (!(open_flags & MDS_OPEN_LOCK) && !(ibits & MDS_INODELOCK_LAYOUT) &&
+ !(ibits & MDS_INODELOCK_DOM)) {
/* for the open request, the lock will only return to client
* if open or layout lock is granted. */
rc = 1;
mdt_object_open_unlock(info, o, lhc, ibits, rc);
out:
mdt_object_put(env, o);
+ if (rc == 0) {
+ rc = mdt_pack_size2body(info, rr->rr_fid2,
+ ibits & MDS_INODELOCK_DOM);
+ LASSERT(ergo(ibits & MDS_INODELOCK_DOM, !rc));
+ rc = 0;
+ }
out_parent_put:
if (parent != NULL)
mdt_object_put(env, parent);
result = mdt_cross_open(info, rr->rr_fid2, rr->rr_fid1,
ldlm_rep, create_flags);
GOTO(out, result);
- } else if (req_is_replay(req) ||
- (req->rq_export->exp_libclient && create_flags & MDS_OPEN_HAS_EA)) {
- /* This is a replay request or from liblustre with ea. */
+ } else if (req_is_replay(req)) {
result = mdt_open_by_fid(info, ldlm_rep);
- if (result != -ENOENT) {
- if (req->rq_export->exp_libclient &&
- create_flags & MDS_OPEN_HAS_EA)
- GOTO(out, result = 0);
+ if (result != -ENOENT)
GOTO(out, result);
- }
+
/* We didn't find the correct object, so we need to re-create it
* via a regular replay. */
if (!(create_flags & MDS_OPEN_CREAT)) {
mdt_object_open_unlock(info, child, lhc, ibits, result);
out_child:
mdt_object_put(info->mti_env, child);
+ if (result == 0) {
+ rc = mdt_pack_size2body(info, child_fid,
+ ibits & MDS_INODELOCK_DOM);
+ LASSERT(ergo(ibits & MDS_INODELOCK_DOM, !rc));
+ rc = 0;
+ }
out_parent:
mdt_object_unlock_put(info, parent, lh, result || !created);
out:
__u32 off;
int i;
- if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_COMP_V1_DEF)) {
+ if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_COMP_V1_DEFINED)) {
comp_v1 = (struct lov_comp_md_v1 *)lmm;
if (comp_v1->lcm_entry_count == 0)
if (!(ma->ma_valid & MA_LOV)) {
/* Even empty file are released */
memset(ma->ma_lmm, 0, sizeof(*ma->ma_lmm));
- ma->ma_lmm->lmm_magic = cpu_to_le32(LOV_MAGIC_V1_DEF);
+ ma->ma_lmm->lmm_magic = cpu_to_le32(LOV_MAGIC_V1_DEFINED);
ma->ma_lmm->lmm_pattern = cpu_to_le32(LOV_PATTERN_RAID0);
ma->ma_lmm->lmm_stripe_size = cpu_to_le32(LOV_MIN_STRIPE_SIZE);
ma->ma_lmm_size = sizeof(*ma->ma_lmm);
} else {
- /* Magic must be LOV_MAGIC_*_DEF otherwise LOD will interpret
+ /* Magic must be LOV_MAGIC_*_DEFINED or LOD will interpret
* ma_lmm as lov_user_md, then it will be confused by union of
* layout_gen and stripe_offset. */
if ((le32_to_cpu(ma->ma_lmm->lmm_magic) & LOV_MAGIC_MASK) ==
LOV_MAGIC_MAGIC)
- ma->ma_lmm->lmm_magic |= cpu_to_le32(LOV_MAGIC_DEF);
+ ma->ma_lmm->lmm_magic |= cpu_to_le32(LOV_MAGIC_DEFINED);
else
GOTO(out_unlock, rc = -EINVAL);
}
atomic_dec(&o->mot_open_count);
mdt_handle_last_unlink(info, o, ma);
- if (!MFD_CLOSED(mode))
- rc = mo_close(info->mti_env, next, ma, mode);
+ if (!MFD_CLOSED(mode)) {
+ rc = mo_close(info->mti_env, next, ma, mode);
+ mdt_dom_check_and_discard(info, o);
+ }
/* adjust open and lease count */
if (mode & MDS_OPEN_LEASE) {
if (rc != 0)
GOTO(out_unlock, rc);
-
+ mdt_dom_obj_lvb_update(info->mti_env, mo, false);
EXIT;
out_unlock:
mdt_unlock_slaves(info, mo, lockpart, s0_lh, s0_obj, einfo, rc);
mdt_pack_attr2body(info, repbody, &ma->ma_attr, mdt_object_fid(mo));
- EXIT;
+ EXIT;
out_put:
- mdt_object_put(info->mti_env, mo);
+ mdt_object_put(info->mti_env, mo);
out:
- if (rc == 0)
+ if (rc == 0)
mdt_counter_incr(req, LPROC_MDT_SETATTR);
mdt_client_compatibility(info);
bool cos_incompat = false;
int no_name = 0;
int rc;
+
ENTRY;
DEBUG_REQ(D_INODE, req, "unlink "DFID"/"DNAME"", PFID(rr->rr_fid1),
mdt_object_child(mc), &rr->rr_name, ma, no_name);
mutex_unlock(&mc->mot_lov_mutex);
+ if (rc != 0)
+ GOTO(unlock_child, rc);
- if (rc == 0 && !lu_object_is_dying(&mc->mot_header))
+ if (!lu_object_is_dying(&mc->mot_header)) {
rc = mdt_attr_get_complex(info, mc, ma);
- if (rc == 0)
- mdt_handle_last_unlink(info, mc, ma);
+ if (rc)
+ GOTO(out_stat, rc);
+ } else {
+ mdt_dom_check_and_discard(info, mc);
+ }
+ mdt_handle_last_unlink(info, mc, ma);
- if (ma->ma_valid & MA_INODE) {
- switch (ma->ma_attr.la_mode & S_IFMT) {
- case S_IFDIR:
+ out_stat:
+ if (ma->ma_valid & MA_INODE) {
+ switch (ma->ma_attr.la_mode & S_IFMT) {
+ case S_IFDIR:
mdt_counter_incr(req, LPROC_MDT_RMDIR);
- break;
- case S_IFREG:
- case S_IFLNK:
- case S_IFCHR:
- case S_IFBLK:
- case S_IFIFO:
- case S_IFSOCK:
+ break;
+ case S_IFREG:
+ case S_IFLNK:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFIFO:
+ case S_IFSOCK:
mdt_counter_incr(req, LPROC_MDT_UNLINK);
- break;
- default:
- LASSERTF(0, "bad file type %o unlinking\n",
- ma->ma_attr.la_mode);
- }
- }
+ break;
+ default:
+ LASSERTF(0, "bad file type %o unlinking\n",
+ ma->ma_attr.la_mode);
+ }
+ }
- EXIT;
+ EXIT;
unlock_child:
mdt_unlock_slaves(info, mc, MDS_INODELOCK_UPDATE, s0_lh, s0_obj, einfo,
GOTO(out, rc = -EBUSY);
}
+ mdt_lock_pdo_init(&mll->mll_lh, LCK_PW, &name);
rc = mdt_object_lock(info, mdt_pobj, &mll->mll_lh,
MDS_INODELOCK_UPDATE);
if (rc != 0) {
/* handle last link of tgt object */
if (rc == 0) {
mdt_counter_incr(req, LPROC_MDT_RENAME);
- if (mnew)
+ if (mnew) {
mdt_handle_last_unlink(info, mnew, ma);
+ mdt_dom_check_and_discard(info, mnew);
+ }
mdt_rename_counter_tally(info, info->mti_mdt, req,
msrcdir, mtgtdir);
#include <linux/pid_namespace.h>
#include <linux/kthread.h>
#include <obd_class.h>
+#include <lustre_log.h>
#include <lprocfs_status.h>
#include <lustre_disk.h>
#include <lustre_kernelcomm.h>
spin_unlock(&type->obd_type_lock);
}
+/*
+ * kobject ->release callback for an obd_type's typ_kobj: signals
+ * typ_kobj_unregister so the thread tearing the type down can finish
+ * its wait_for_completion() once sysfs drops the last reference.
+ */
+static void class_sysfs_release(struct kobject *kobj)
+{
+ struct obd_type *type = container_of(kobj, struct obd_type,
+ typ_kobj);
+
+ complete(&type->typ_kobj_unregister);
+}
+
+/* kobj_type for obd_type kobjects initialized in class_register_type() */
+static struct kobj_type class_ktype = {
+ .sysfs_ops = &lustre_sysfs_ops,
+ .release = class_sysfs_release,
+};
+
#define CLASS_MAX_NAME 1024
int class_register_type(struct obd_ops *dt_ops, struct md_ops *md_ops,
}
}
#endif
- type->typ_kobj = kobject_create_and_add(type->typ_name, lustre_kobj);
- if (!type->typ_kobj) {
- rc = -ENOMEM;
+ type->typ_kobj.kset = lustre_kset;
+ init_completion(&type->typ_kobj_unregister);
+ rc = kobject_init_and_add(&type->typ_kobj, &class_ktype,
+ &lustre_kset->kobj, "%s", type->typ_name);
+ if (rc)
GOTO(failed, rc);
- }
- if (ldt != NULL) {
- type->typ_lu = ldt;
- rc = lu_device_type_init(ldt);
- if (rc != 0)
- GOTO (failed, rc);
- }
+ if (ldt) {
+ type->typ_lu = ldt;
+ rc = lu_device_type_init(ldt);
+ if (rc) {
+ kobject_put(&type->typ_kobj);
+ GOTO(failed, rc);
+ }
+ }
spin_lock(&obd_types_lock);
list_add(&type->typ_chain, &obd_types);
spin_unlock(&obd_types_lock);
- RETURN (0);
+ RETURN(0);
failed:
- if (type->typ_kobj)
- kobject_put(type->typ_kobj);
if (type->typ_name != NULL) {
#ifdef CONFIG_PROC_FS
if (type->typ_procroot != NULL)
RETURN(-EBUSY);
}
- if (type->typ_kobj)
- kobject_put(type->typ_kobj);
+ kobject_put(&type->typ_kobj);
+ wait_for_completion(&type->typ_kobj_unregister);
/* we do not use type->typ_procroot as for compatibility purposes
* other modules can share names (i.e. lod can use lov entry). so
/**
* Create a new obd device.
*
- * Find an empty slot in ::obd_devs[], create a new obd device in it.
+ * Allocate the new obd_device and initialize it.
*
* \param[in] type_name obd device type string.
* \param[in] name obd device name.
+ * \param[in] uuid obd device UUID
*
- * \retval NULL if create fails, otherwise return the obd device
- * pointer created.
+ * \retval newdev pointer to created obd_device
+ * \retval ERR_PTR(errno) on error
*/
-struct obd_device *class_newdev(const char *type_name, const char *name)
+struct obd_device *class_newdev(const char *type_name, const char *name,
+ const char *uuid)
{
- struct obd_device *result = NULL;
struct obd_device *newdev;
struct obd_type *type = NULL;
- int i;
- int new_obd_minor = 0;
ENTRY;
if (strlen(name) >= MAX_OBD_NAME) {
}
newdev = obd_device_alloc();
- if (newdev == NULL)
- GOTO(out_type, result = ERR_PTR(-ENOMEM));
-
+ if (newdev == NULL) {
+ class_put_type(type);
+ RETURN(ERR_PTR(-ENOMEM));
+ }
LASSERT(newdev->obd_magic == OBD_DEVICE_MAGIC);
+ strncpy(newdev->obd_name, name, sizeof(newdev->obd_name) - 1);
+ newdev->obd_type = type;
+ newdev->obd_minor = -1;
+
+ rwlock_init(&newdev->obd_pool_lock);
+ newdev->obd_pool_limit = 0;
+ newdev->obd_pool_slv = 0;
+
+ INIT_LIST_HEAD(&newdev->obd_exports);
+ INIT_LIST_HEAD(&newdev->obd_unlinked_exports);
+ INIT_LIST_HEAD(&newdev->obd_delayed_exports);
+ INIT_LIST_HEAD(&newdev->obd_exports_timed);
+ INIT_LIST_HEAD(&newdev->obd_nid_stats);
+ spin_lock_init(&newdev->obd_nid_lock);
+ spin_lock_init(&newdev->obd_dev_lock);
+ mutex_init(&newdev->obd_dev_mutex);
+ spin_lock_init(&newdev->obd_osfs_lock);
+ /* newdev->obd_osfs_age must be set to a value in the distant
+ * past to guarantee a fresh statfs is fetched on mount. */
+ newdev->obd_osfs_age = cfs_time_shift_64(-1000);
+
+ /* XXX belongs in setup not attach */
+ init_rwsem(&newdev->obd_observer_link_sem);
+ /* recovery data */
+ init_timer(&newdev->obd_recovery_timer);
+ spin_lock_init(&newdev->obd_recovery_task_lock);
+ init_waitqueue_head(&newdev->obd_next_transno_waitq);
+ init_waitqueue_head(&newdev->obd_evict_inprogress_waitq);
+ INIT_LIST_HEAD(&newdev->obd_req_replay_queue);
+ INIT_LIST_HEAD(&newdev->obd_lock_replay_queue);
+ INIT_LIST_HEAD(&newdev->obd_final_req_queue);
+ INIT_LIST_HEAD(&newdev->obd_evict_list);
+ INIT_LIST_HEAD(&newdev->obd_lwp_list);
+
+ llog_group_init(&newdev->obd_olg);
+ /* Detach drops this */
+ atomic_set(&newdev->obd_refcount, 1);
+ lu_ref_init(&newdev->obd_reference);
+ lu_ref_add(&newdev->obd_reference, "newdev", newdev);
+
+ newdev->obd_conn_inprogress = 0;
+
+ strncpy(newdev->obd_uuid.uuid, uuid, strlen(uuid));
+
+ CDEBUG(D_IOCTL, "Allocate new device %s (%p)\n",
+ newdev->obd_name, newdev);
+
+ return newdev;
+}
- write_lock(&obd_dev_lock);
- for (i = 0; i < class_devno_max(); i++) {
- struct obd_device *obd = class_num2obd(i);
-
- if (obd && (strcmp(name, obd->obd_name) == 0)) {
- CERROR("Device %s already exists at %d, won't add\n",
- name, i);
- if (result) {
- LASSERTF(result->obd_magic == OBD_DEVICE_MAGIC,
- "%p obd_magic %08x != %08x\n", result,
- result->obd_magic, OBD_DEVICE_MAGIC);
- LASSERTF(result->obd_minor == new_obd_minor,
- "%p obd_minor %d != %d\n", result,
- result->obd_minor, new_obd_minor);
-
- obd_devs[result->obd_minor] = NULL;
- result->obd_name[0]='\0';
- }
- result = ERR_PTR(-EEXIST);
- break;
- }
- if (!result && !obd) {
- result = newdev;
- result->obd_minor = i;
- new_obd_minor = i;
- result->obd_type = type;
- strncpy(result->obd_name, name,
- sizeof(result->obd_name) - 1);
- obd_devs[i] = result;
- }
- }
- write_unlock(&obd_dev_lock);
-
- if (result == NULL && i >= class_devno_max()) {
- CERROR("all %u OBD devices used, increase MAX_OBD_DEVICES\n",
- class_devno_max());
- GOTO(out, result = ERR_PTR(-EOVERFLOW));
- }
-
- if (IS_ERR(result))
- GOTO(out, result);
+/**
+ * Free obd device.
+ *
+ * \param[in] obd obd_device to be freed
+ *
+ * \retval none
+ */
+void class_free_dev(struct obd_device *obd)
+{
+ struct obd_type *obd_type = obd->obd_type;
+
+ LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "%p obd_magic %08x "
+ "!= %08x\n", obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+ LASSERTF(obd->obd_minor == -1 || obd_devs[obd->obd_minor] == obd,
+ "obd %p != obd_devs[%d] %p\n",
+ obd, obd->obd_minor, obd_devs[obd->obd_minor]);
+ LASSERTF(atomic_read(&obd->obd_refcount) == 0,
+ "obd_refcount should be 0, not %d\n",
+ atomic_read(&obd->obd_refcount));
+ LASSERT(obd_type != NULL);
+
+ CDEBUG(D_INFO, "Release obd device %s obd_type name = %s\n",
+ obd->obd_name, obd->obd_type->typ_name);
+
+ CDEBUG(D_CONFIG, "finishing cleanup of obd %s (%s)\n",
+ obd->obd_name, obd->obd_uuid.uuid);
+ if (obd->obd_stopping) {
+ int err;
+
+ /* If we're not stopping, we were never set up */
+ err = obd_cleanup(obd);
+ if (err)
+ CERROR("Cleanup %s returned %d\n",
+ obd->obd_name, err);
+ }
- CDEBUG(D_IOCTL, "Adding new device %s (%p)\n",
- result->obd_name, result);
+ obd_device_free(obd);
- RETURN(result);
-out:
- obd_device_free(newdev);
-out_type:
- class_put_type(type);
- return result;
+ class_put_type(obd_type);
}
-void class_release_dev(struct obd_device *obd)
+/**
+ * Unregister obd device.
+ *
+ * Free the slot in obd_devs[] used by \a obd and mark the device as
+ * having no minor (-1). Safe to call for a device that was never
+ * registered (obd_minor < 0 is skipped).
+ *
+ * \param[in] obd obd_device to be unregistered
+ *
+ * \retval none
+ */
+void class_unregister_device(struct obd_device *obd)
{
- struct obd_type *obd_type = obd->obd_type;
-
- LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "%p obd_magic %08x != %08x\n",
- obd, obd->obd_magic, OBD_DEVICE_MAGIC);
- LASSERTF(obd == obd_devs[obd->obd_minor], "obd %p != obd_devs[%d] %p\n",
- obd, obd->obd_minor, obd_devs[obd->obd_minor]);
- LASSERT(obd_type != NULL);
+ write_lock(&obd_dev_lock);
+ if (obd->obd_minor >= 0) {
+ LASSERT(obd_devs[obd->obd_minor] == obd);
+ obd_devs[obd->obd_minor] = NULL;
+ obd->obd_minor = -1;
+ }
+ write_unlock(&obd_dev_lock);
+}
- CDEBUG(D_INFO, "Release obd device %s at %d obd_type name =%s\n",
- obd->obd_name, obd->obd_minor, obd->obd_type->typ_name);
+/**
+ * Register obd device.
+ *
+ * Find free slot in obd_devs[], fills it with \a new_obd.
+ *
+ * \param[in] new_obd obd_device to be registered
+ *
+ * \retval 0 success
+ * \retval -EEXIST device with this name is registered
+ * \retval -EOVERFLOW obd_devs[] is full
+ */
+int class_register_device(struct obd_device *new_obd)
+{
+ int ret = 0;
+ int i;
+ int new_obd_minor = 0;
+ bool minor_assign = false;
write_lock(&obd_dev_lock);
- obd_devs[obd->obd_minor] = NULL;
+ for (i = 0; i < class_devno_max(); i++) {
+ struct obd_device *obd = class_num2obd(i);
+
+ if (obd != NULL &&
+ (strcmp(new_obd->obd_name, obd->obd_name) == 0)) {
+ CERROR("%s: already exists, won't add\n",
+ obd->obd_name);
+ /* in case we found a free slot before duplicate */
+ minor_assign = false;
+ ret = -EEXIST;
+ break;
+ }
+ if (!minor_assign && obd == NULL) {
+ new_obd_minor = i;
+ minor_assign = true;
+ }
+ }
+
+ if (minor_assign) {
+ new_obd->obd_minor = new_obd_minor;
+ LASSERTF(obd_devs[new_obd_minor] == NULL, "obd_devs[%d] "
+ "%p\n", new_obd_minor, obd_devs[new_obd_minor]);
+ obd_devs[new_obd_minor] = new_obd;
+ } else {
+ if (ret == 0) {
+ ret = -EOVERFLOW;
+ CERROR("%s: all %u/%u devices used, increase "
+ "MAX_OBD_DEVICES: rc = %d\n", new_obd->obd_name,
+ i, class_devno_max(), ret);
+ }
+ }
write_unlock(&obd_dev_lock);
- obd_device_free(obd);
- class_put_type(obd_type);
+ RETURN(ret);
}
-int class_name2dev(const char *name)
+static int class_name2dev_nolock(const char *name)
{
int i;
if (!name)
return -1;
- read_lock(&obd_dev_lock);
for (i = 0; i < class_devno_max(); i++) {
struct obd_device *obd = class_num2obd(i);
out any references */
LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
if (obd->obd_attached) {
- read_unlock(&obd_dev_lock);
return i;
}
break;
}
}
- read_unlock(&obd_dev_lock);
return -1;
}
+/*
+ * Return the obd_devs[] index of the attached device named @name,
+ * or -1 when @name is NULL or no such device exists. Locked wrapper
+ * around class_name2dev_nolock().
+ */
+int class_name2dev(const char *name)
+{
+ int i;
+
+ if (!name)
+ return -1;
+
+ read_lock(&obd_dev_lock);
+ i = class_name2dev_nolock(name);
+ read_unlock(&obd_dev_lock);
+
+ return i;
+}
+EXPORT_SYMBOL(class_name2dev);
+
struct obd_device *class_name2obd(const char *name)
{
int dev = class_name2dev(name);
}
EXPORT_SYMBOL(class_name2obd);
-int class_uuid2dev(struct obd_uuid *uuid)
+/*
+ * Return the obd_devs[] index of the device whose UUID equals @uuid,
+ * or -1 when not found. Lockless: the caller must hold obd_dev_lock
+ * (class_uuid2dev() is the locking wrapper).
+ */
+int class_uuid2dev_nolock(struct obd_uuid *uuid)
{
int i;
- read_lock(&obd_dev_lock);
for (i = 0; i < class_devno_max(); i++) {
struct obd_device *obd = class_num2obd(i);
if (obd && obd_uuid_equals(uuid, &obd->obd_uuid)) {
LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
return i;
}
}
- read_unlock(&obd_dev_lock);
return -1;
}
+/*
+ * Locked wrapper of class_uuid2dev_nolock(): return the obd_devs[]
+ * index of the device with UUID @uuid, or -1 if none.
+ */
+int class_uuid2dev(struct obd_uuid *uuid)
+{
+ int i;
+
+ read_lock(&obd_dev_lock);
+ i = class_uuid2dev_nolock(uuid);
+ read_unlock(&obd_dev_lock);
+
+ return i;
+}
+EXPORT_SYMBOL(class_uuid2dev);
+
struct obd_device *class_uuid2obd(struct obd_uuid *uuid)
{
int dev = class_uuid2dev(uuid);
}
/**
+ * Find obd in obd_dev[] by name or uuid.
+ *
+ * Increment obd's refcount if found.
+ *
+ * \param[in] str obd name or uuid
+ *
+ * \retval NULL if not found
+ * \retval target pointer to found obd_device
+ */
+struct obd_device *class_dev_by_str(const char *str)
+{
+ struct obd_device *target = NULL;
+ struct obd_uuid tgtuuid;
+ int rc;
+
+ obd_str2uuid(&tgtuuid, str);
+
+ read_lock(&obd_dev_lock);
+ /* a UUID match is tried first, then a name match */
+ rc = class_uuid2dev_nolock(&tgtuuid);
+ if (rc < 0)
+ rc = class_name2dev_nolock(str);
+
+ if (rc >= 0)
+ target = class_num2obd(rc);
+
+ /* take the reference before dropping obd_dev_lock so the device
+  * cannot go away between lookup and return */
+ if (target != NULL)
+ class_incref(target, "find", current);
+ read_unlock(&obd_dev_lock);
+
+ RETURN(target);
+}
+EXPORT_SYMBOL(class_dev_by_str);
+
+/**
* Get obd devices count. Device in any
* state are counted
* \retval obd device count
LASSERT(list_empty(&exp->exp_req_replay_queue));
LASSERT(list_empty(&exp->exp_hp_rpcs));
obd_destroy_export(exp);
- class_decref(obd, "export", exp);
+ /* self export doesn't hold a reference to an obd, although it
+ * exists until freeing of the obd */
+ if (exp != obd->obd_self_export)
+ class_decref(obd, "export", exp);
OBD_FREE_RCU(exp, sizeof(*exp), &exp->exp_handle);
EXIT;
atomic_read(&exp->exp_refcount) - 1);
if (atomic_dec_and_test(&exp->exp_refcount)) {
- LASSERT(!list_empty(&exp->exp_obd_chain));
- LASSERT(list_empty(&exp->exp_stale_list));
+ struct obd_device *obd = exp->exp_obd;
+
CDEBUG(D_IOCTL, "final put %p/%s\n",
exp, exp->exp_client_uuid.uuid);
/* release nid stat refererence */
lprocfs_exp_cleanup(exp);
- obd_zombie_export_add(exp);
+ if (exp == obd->obd_self_export) {
+ /* self export should be destroyed without
+ * zombie thread as it doesn't hold a
+ * reference to obd and doesn't hold any
+ * resources */
+ class_export_destroy(exp);
+ /* self export is destroyed, no class
+ * references exist and it is safe to free
+ * obd */
+ class_free_dev(obd);
+ } else {
+ LASSERT(!list_empty(&exp->exp_obd_chain));
+ obd_zombie_export_add(exp);
+ }
+
}
}
EXPORT_SYMBOL(class_export_put);
/* Creates a new export, adds it to the hash table, and returns a
* pointer to it. The refcount is 2: one for the hash reference, and
* one for the pointer returned by this function. */
-struct obd_export *class_new_export(struct obd_device *obd,
- struct obd_uuid *cluuid)
+struct obd_export *__class_new_export(struct obd_device *obd,
+ struct obd_uuid *cluuid, bool is_self)
{
struct obd_export *export;
struct cfs_hash *hash = NULL;
export->exp_conn_cnt = 0;
export->exp_lock_hash = NULL;
export->exp_flock_hash = NULL;
+ /* 2 = class_handle_hash + last */
atomic_set(&export->exp_refcount, 2);
atomic_set(&export->exp_rpc_count, 0);
atomic_set(&export->exp_cb_count, 0);
INIT_LIST_HEAD(&export->exp_hp_rpcs);
INIT_LIST_HEAD(&export->exp_reg_rpcs);
class_handle_hash(&export->exp_handle, &export_handle_ops);
- export->exp_last_request_time = cfs_time_current_sec();
+ export->exp_last_request_time = ktime_get_real_seconds();
spin_lock_init(&export->exp_lock);
spin_lock_init(&export->exp_rpc_lock);
INIT_HLIST_NODE(&export->exp_uuid_hash);
export->exp_client_uuid = *cluuid;
obd_init_export(export);
- spin_lock(&obd->obd_dev_lock);
- /* shouldn't happen, but might race */
- if (obd->obd_stopping)
- GOTO(exit_unlock, rc = -ENODEV);
+ if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) {
+ spin_lock(&obd->obd_dev_lock);
+ /* shouldn't happen, but might race */
+ if (obd->obd_stopping)
+ GOTO(exit_unlock, rc = -ENODEV);
- hash = cfs_hash_getref(obd->obd_uuid_hash);
- if (hash == NULL)
- GOTO(exit_unlock, rc = -ENODEV);
- spin_unlock(&obd->obd_dev_lock);
+ hash = cfs_hash_getref(obd->obd_uuid_hash);
+ if (hash == NULL)
+ GOTO(exit_unlock, rc = -ENODEV);
+ spin_unlock(&obd->obd_dev_lock);
- if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) {
rc = cfs_hash_add_unique(hash, cluuid, &export->exp_uuid_hash);
if (rc != 0) {
LCONSOLE_WARN("%s: denying duplicate export for %s, %d\n",
at_init(&export->exp_bl_lock_at, obd_timeout, 0);
spin_lock(&obd->obd_dev_lock);
if (obd->obd_stopping) {
- cfs_hash_del(hash, cluuid, &export->exp_uuid_hash);
- GOTO(exit_unlock, rc = -ENODEV);
+ if (hash)
+ cfs_hash_del(hash, cluuid, &export->exp_uuid_hash);
+ GOTO(exit_unlock, rc = -ESHUTDOWN);
}
- class_incref(obd, "export", export);
- list_add(&export->exp_obd_chain, &export->exp_obd->obd_exports);
- list_add_tail(&export->exp_obd_chain_timed,
- &export->exp_obd->obd_exports_timed);
- export->exp_obd->obd_num_exports++;
+ if (!is_self) {
+ class_incref(obd, "export", export);
+ list_add_tail(&export->exp_obd_chain_timed,
+ &obd->obd_exports_timed);
+ list_add(&export->exp_obd_chain, &obd->obd_exports);
+ obd->obd_num_exports++;
+ } else {
+ INIT_LIST_HEAD(&export->exp_obd_chain_timed);
+ INIT_LIST_HEAD(&export->exp_obd_chain);
+ }
spin_unlock(&obd->obd_dev_lock);
- cfs_hash_putref(hash);
+ if (hash)
+ cfs_hash_putref(hash);
RETURN(export);
exit_unlock:
OBD_FREE_PTR(export);
return ERR_PTR(rc);
}
+
+/* Allocate a regular (non-self) export for client @uuid. The export is
+ * linked onto the obd's export lists and takes a reference on the obd
+ * (see the !is_self branch in __class_new_export). */
+struct obd_export *class_new_export(struct obd_device *obd,
+ struct obd_uuid *uuid)
+{
+ return __class_new_export(obd, uuid, false);
+}
EXPORT_SYMBOL(class_new_export);
+/* Allocate the obd's self export. Unlike a regular export it is NOT
+ * linked onto the obd export lists and does not take a reference on
+ * the obd (the is_self branch in __class_new_export only initializes
+ * its chain list heads), so it can outlive normal export teardown. */
+struct obd_export *class_new_export_self(struct obd_device *obd,
+ struct obd_uuid *uuid)
+{
+ return __class_new_export(obd, uuid, true);
+}
+
void class_unlink_export(struct obd_export *exp)
{
class_handle_unhash(&exp->exp_handle);
+ if (exp->exp_obd->obd_self_export == exp) {
+ class_export_put(exp);
+ return;
+ }
+
spin_lock(&exp->exp_obd->obd_dev_lock);
/* delete an uuid-export hashitem from hashtables */
if (!hlist_unhashed(&exp->exp_uuid_hash))
class_export_get(exp);
CDEBUG(D_HA, "%s: disconnecting export at %s (%p), "
- "last request at %ld\n",
+ "last request at %lld\n",
exp->exp_obd->obd_name, obd_export_nid2str(exp),
exp, exp->exp_last_request_time);
/* release one export reference anyway */
int rc;
spin_lock(&cli->cl_loi_list_lock);
- if (cli->cl_r_in_flight < cli->cl_max_rpcs_in_flight) {
- cli->cl_r_in_flight++;
+ if (cli->cl_rpcs_in_flight < cli->cl_max_rpcs_in_flight) {
+ cli->cl_rpcs_in_flight++;
spin_unlock(&cli->cl_loi_list_lock);
return 0;
}
init_waitqueue_head(&orsw.orsw_waitq);
- list_add_tail(&orsw.orsw_entry, &cli->cl_loi_read_list);
+ list_add_tail(&orsw.orsw_entry, &cli->cl_flight_waiters);
orsw.orsw_signaled = false;
spin_unlock(&cli->cl_loi_list_lock);
if (rc != 0) {
if (!orsw.orsw_signaled) {
if (list_empty(&orsw.orsw_entry))
- cli->cl_r_in_flight--;
+ cli->cl_rpcs_in_flight--;
else
list_del(&orsw.orsw_entry);
}
struct obd_request_slot_waiter *orsw;
spin_lock(&cli->cl_loi_list_lock);
- cli->cl_r_in_flight--;
+ cli->cl_rpcs_in_flight--;
/* If there is free slot, wakeup the first waiter. */
- if (!list_empty(&cli->cl_loi_read_list) &&
- likely(cli->cl_r_in_flight < cli->cl_max_rpcs_in_flight)) {
- orsw = list_entry(cli->cl_loi_read_list.next,
+ if (!list_empty(&cli->cl_flight_waiters) &&
+ likely(cli->cl_rpcs_in_flight < cli->cl_max_rpcs_in_flight)) {
+ orsw = list_entry(cli->cl_flight_waiters.next,
struct obd_request_slot_waiter, orsw_entry);
list_del_init(&orsw->orsw_entry);
- cli->cl_r_in_flight++;
+ cli->cl_rpcs_in_flight++;
wake_up(&orsw->orsw_waitq);
}
spin_unlock(&cli->cl_loi_list_lock);
spin_lock(&cli->cl_loi_list_lock);
old = cli->cl_max_rpcs_in_flight;
cli->cl_max_rpcs_in_flight = max;
+ client_adjust_max_dirty(cli);
+
diff = max - old;
/* We increase the max_rpcs_in_flight, then wakeup some waiters. */
for (i = 0; i < diff; i++) {
- if (list_empty(&cli->cl_loi_read_list))
+ if (list_empty(&cli->cl_flight_waiters))
break;
- orsw = list_entry(cli->cl_loi_read_list.next,
+ orsw = list_entry(cli->cl_flight_waiters.next,
struct obd_request_slot_waiter, orsw_entry);
list_del_init(&orsw->orsw_entry);
- cli->cl_r_in_flight++;
+ cli->cl_rpcs_in_flight++;
wake_up(&orsw->orsw_waitq);
}
spin_unlock(&cli->cl_loi_list_lock);
*/
int class_attach(struct lustre_cfg *lcfg)
{
+ struct obd_export *exp;
struct obd_device *obd = NULL;
char *typename, *name, *uuid;
int rc, len;
RETURN(-EINVAL);
}
name = lustre_cfg_string(lcfg, 0);
-
if (!LUSTRE_CFG_BUFLEN(lcfg, 2)) {
CERROR("No UUID passed!\n");
RETURN(-EINVAL);
}
- uuid = lustre_cfg_string(lcfg, 2);
- CDEBUG(D_IOCTL, "attach type %s name: %s uuid: %s\n",
- MKSTR(typename), MKSTR(name), MKSTR(uuid));
+ uuid = lustre_cfg_string(lcfg, 2);
+ len = strlen(uuid);
+ if (len >= sizeof(obd->obd_uuid)) {
+ CERROR("%s: uuid must be < %d bytes long\n",
+ name, (int)sizeof(obd->obd_uuid));
+ RETURN(-EINVAL);
+ }
- obd = class_newdev(typename, name);
+ obd = class_newdev(typename, name, uuid);
if (IS_ERR(obd)) {
/* Already exists or out of obds */
rc = PTR_ERR(obd);
- obd = NULL;
CERROR("Cannot create device %s of type %s : %d\n",
name, typename, rc);
- GOTO(out, rc);
+ RETURN(rc);
}
LASSERTF(obd != NULL, "Cannot get obd device %s of type %s\n",
name, typename);
LASSERTF(strncmp(obd->obd_name, name, strlen(name)) == 0,
"%p obd_name %s != %s\n", obd, obd->obd_name, name);
- rwlock_init(&obd->obd_pool_lock);
- obd->obd_pool_limit = 0;
- obd->obd_pool_slv = 0;
-
- INIT_LIST_HEAD(&obd->obd_exports);
- INIT_LIST_HEAD(&obd->obd_unlinked_exports);
- INIT_LIST_HEAD(&obd->obd_delayed_exports);
- INIT_LIST_HEAD(&obd->obd_exports_timed);
- INIT_LIST_HEAD(&obd->obd_nid_stats);
- spin_lock_init(&obd->obd_nid_lock);
- spin_lock_init(&obd->obd_dev_lock);
- mutex_init(&obd->obd_dev_mutex);
- spin_lock_init(&obd->obd_osfs_lock);
- /* obd->obd_osfs_age must be set to a value in the distant
- * past to guarantee a fresh statfs is fetched on mount. */
- obd->obd_osfs_age = cfs_time_shift_64(-1000);
-
- /* XXX belongs in setup not attach */
- init_rwsem(&obd->obd_observer_link_sem);
- /* recovery data */
- init_timer(&obd->obd_recovery_timer);
- spin_lock_init(&obd->obd_recovery_task_lock);
- init_waitqueue_head(&obd->obd_next_transno_waitq);
- init_waitqueue_head(&obd->obd_evict_inprogress_waitq);
- INIT_LIST_HEAD(&obd->obd_req_replay_queue);
- INIT_LIST_HEAD(&obd->obd_lock_replay_queue);
- INIT_LIST_HEAD(&obd->obd_final_req_queue);
- INIT_LIST_HEAD(&obd->obd_evict_list);
- INIT_LIST_HEAD(&obd->obd_lwp_list);
-
- llog_group_init(&obd->obd_olg);
-
- obd->obd_conn_inprogress = 0;
-
- len = strlen(uuid);
- if (len >= sizeof(obd->obd_uuid)) {
- CERROR("uuid must be < %d bytes long\n",
- (int)sizeof(obd->obd_uuid));
- GOTO(out, rc = -EINVAL);
- }
- memcpy(obd->obd_uuid.uuid, uuid, len);
+ exp = class_new_export_self(obd, &obd->obd_uuid);
+ if (IS_ERR(exp)) {
+ /* force free */
+ GOTO(out, rc = PTR_ERR(exp));
+ RETURN(PTR_ERR(exp));
+ }
- /* Detach drops this */
- spin_lock(&obd->obd_dev_lock);
- atomic_set(&obd->obd_refcount, 1);
- spin_unlock(&obd->obd_dev_lock);
- lu_ref_init(&obd->obd_reference);
- lu_ref_add(&obd->obd_reference, "attach", obd);
+ obd->obd_self_export = exp;
+ list_del_init(&exp->exp_obd_chain_timed);
+ class_export_put(exp);
+
+ rc = class_register_device(obd);
+ if (rc != 0)
+ GOTO(out, rc);
- obd->obd_attached = 1;
- CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n",
+ obd->obd_attached = 1;
+ CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n",
obd->obd_minor, typename, atomic_read(&obd->obd_refcount));
- RETURN(0);
- out:
- if (obd != NULL) {
- class_release_dev(obd);
- }
- return rc;
+ RETURN(0);
+out:
+ class_decref(obd, "newdev", obd);
+ class_free_dev(obd);
+
+ RETURN(rc);
}
EXPORT_SYMBOL(class_attach);
int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
{
int err = 0;
- struct obd_export *exp;
ENTRY;
LASSERT(obd != NULL);
CFS_HASH_MAX_THETA,
&uuid_hash_ops, CFS_HASH_DEFAULT);
if (!obd->obd_uuid_hash)
- GOTO(err_hash, err = -ENOMEM);
+ GOTO(err_exit, err = -ENOMEM);
/* create a nid-export lustre hash */
obd->obd_nid_hash = cfs_hash_create("NID_HASH",
CFS_HASH_MAX_THETA,
&nid_hash_ops, CFS_HASH_DEFAULT);
if (!obd->obd_nid_hash)
- GOTO(err_hash, err = -ENOMEM);
+ GOTO(err_exit, err = -ENOMEM);
/* create a nid-stats lustre hash */
obd->obd_nid_stats_hash = cfs_hash_create("NID_STATS",
CFS_HASH_MIN_THETA,
CFS_HASH_MAX_THETA,
&nid_stat_hash_ops, CFS_HASH_DEFAULT);
- if (!obd->obd_nid_stats_hash)
- GOTO(err_hash, err = -ENOMEM);
+ if (!obd->obd_nid_stats_hash)
+ GOTO(err_exit, err = -ENOMEM);
/* create a client_generation-export lustre hash */
obd->obd_gen_hash = cfs_hash_create("UUID_HASH",
CFS_HASH_MAX_THETA,
&gen_hash_ops, CFS_HASH_DEFAULT);
if (!obd->obd_gen_hash)
- GOTO(err_hash, err = -ENOMEM);
-
- exp = class_new_export(obd, &obd->obd_uuid);
- if (IS_ERR(exp))
- GOTO(err_hash, err = PTR_ERR(exp));
+ GOTO(err_exit, err = -ENOMEM);
- obd->obd_self_export = exp;
- list_del_init(&exp->exp_obd_chain_timed);
- class_export_put(exp);
-
- err = obd_setup(obd, lcfg);
- if (err)
- GOTO(err_exp, err);
+ err = obd_setup(obd, lcfg);
+ if (err)
+ GOTO(err_exit, err);
- obd->obd_set_up = 1;
+ obd->obd_set_up = 1;
spin_lock(&obd->obd_dev_lock);
/* cleanup drops this */
obd->obd_name, obd->obd_uuid.uuid);
RETURN(0);
-err_exp:
- if (obd->obd_self_export) {
- class_unlink_export(obd->obd_self_export);
- obd->obd_self_export = NULL;
- }
-err_hash:
+err_exit:
if (obd->obd_uuid_hash) {
cfs_hash_putref(obd->obd_uuid_hash);
obd->obd_uuid_hash = NULL;
obd->obd_attached = 0;
spin_unlock(&obd->obd_dev_lock);
+ /* cleanup is in progress; prevent this device from being found from now on */
+ class_unregister_device(obd);
+
CDEBUG(D_IOCTL, "detach on obd %s (uuid %s)\n",
obd->obd_name, obd->obd_uuid.uuid);
- class_decref(obd, "attach", obd);
+ class_decref(obd, "newdev", obd);
+
RETURN(0);
}
EXPORT_SYMBOL(class_detach);
}
/* Leave this on forever */
obd->obd_stopping = 1;
+ /* the function cannot return an error after this point, so clear the
+ * setup flag as early as possible to avoid lookups via obd_devs / hash */
+ obd->obd_set_up = 0;
spin_unlock(&obd->obd_dev_lock);
/* wait for already-arrived-connections to finish. */
LASSERT(obd->obd_self_export);
- /* The three references that should be remaining are the
- * obd_self_export and the attach and setup references. */
- if (atomic_read(&obd->obd_refcount) > 3) {
- /* refcounf - 3 might be the number of real exports
- (excluding self export). But class_incref is called
- by other things as well, so don't count on it. */
- CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d\n",
- obd->obd_name, atomic_read(&obd->obd_refcount) - 3);
- dump_exports(obd, 0, D_HA);
- class_disconnect_exports(obd);
- }
+ CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d/%d\n",
+ obd->obd_name, obd->obd_num_exports,
+ atomic_read(&obd->obd_refcount) - 2);
+ dump_exports(obd, 0, D_HA);
+ class_disconnect_exports(obd);
/* Precleanup, we must make sure all exports get destroyed. */
err = obd_precleanup(obd);
void class_decref(struct obd_device *obd, const char *scope, const void *source)
{
- int err;
- int refs;
+ int last;
- spin_lock(&obd->obd_dev_lock);
- atomic_dec(&obd->obd_refcount);
- refs = atomic_read(&obd->obd_refcount);
- spin_unlock(&obd->obd_dev_lock);
+ CDEBUG(D_INFO, "Decref %s (%p) now %d - %s\n", obd->obd_name, obd,
+ atomic_read(&obd->obd_refcount), scope);
+
+ LASSERT(obd->obd_num_exports >= 0);
+ last = atomic_dec_and_test(&obd->obd_refcount);
lu_ref_del(&obd->obd_reference, scope, source);
- CDEBUG(D_INFO, "Decref %s (%p) now %d\n", obd->obd_name, obd, refs);
+ if (last) {
+ struct obd_export *exp;
- if ((refs == 1) && obd->obd_stopping) {
+ LASSERT(!obd->obd_attached);
/* All exports have been destroyed; there should
- be no more in-progress ops by this point.*/
-
- spin_lock(&obd->obd_self_export->exp_lock);
- obd->obd_self_export->exp_flags |= exp_flags_from_obd(obd);
- spin_unlock(&obd->obd_self_export->exp_lock);
-
- /* note that we'll recurse into class_decref again */
- class_unlink_export(obd->obd_self_export);
- return;
- }
+ * be no more in-progress ops by this point.*/
+ exp = obd->obd_self_export;
- if (refs == 0) {
- CDEBUG(D_CONFIG, "finishing cleanup of obd %s (%s)\n",
- obd->obd_name, obd->obd_uuid.uuid);
- LASSERT(!obd->obd_attached);
- if (obd->obd_stopping) {
- /* If we're not stopping, we were never set up */
- err = obd_cleanup(obd);
- if (err)
- CERROR("Cleanup %s returned %d\n",
- obd->obd_name, err);
+ if (exp) {
+ exp->exp_flags |= exp_flags_from_obd(obd);
+ /*
+ * note that we'll recurse into class_decref again,
+ * but it's not a problem because we were the last user
+ */
+ class_unlink_export(exp);
}
-
- class_release_dev(obd);
}
}
EXPORT_SYMBOL(class_decref);
GOTO(out, err = -EINVAL);
}
-
switch(lcfg->lcfg_command) {
case LCFG_SETUP: {
err = class_setup(obd, lcfg);
err = obd_pool_del(obd, lustre_cfg_string(lcfg, 2));
GOTO(out, err = 0);
}
- default: {
- err = obd_process_config(obd, sizeof(*lcfg), lcfg);
- GOTO(out, err);
+ /* Process config log ADD_MDC record twice to add MDC also to LOV
+ * for Data-on-MDT:
+ *
+ * add 0:lustre-clilmv 1:lustre-MDT0000_UUID 2:0 3:1
+ * 4:lustre-MDT0000-mdc_UUID
+ */
+ case LCFG_ADD_MDC: {
+ struct obd_device *lov_obd;
+ char *clilmv;
+
+ err = obd_process_config(obd, sizeof(*lcfg), lcfg);
+ if (err)
+ GOTO(out, err);
+
+ /* make sure this is client LMV log entry */
+ clilmv = strstr(lustre_cfg_string(lcfg, 0), "clilmv");
+ if (!clilmv)
+ GOTO(out, err);
+
+ /* replace 'lmv' with 'lov' name to address LOV device and
+ * process llog record to add MDC there. */
+ clilmv[4] = 'o';
+ lov_obd = class_name2obd(lustre_cfg_string(lcfg, 0));
+ if (lov_obd == NULL) {
+ err = -ENOENT;
+ CERROR("%s: Cannot find LOV by %s name, rc = %d\n",
+ obd->obd_name, lustre_cfg_string(lcfg, 0), err);
+ } else {
+ err = obd_process_config(lov_obd, sizeof(*lcfg), lcfg);
+ }
+ /* restore 'lmv' name */
+ clilmv[4] = 'm';
+ GOTO(out, err);
+ }
+ default: {
+ err = obd_process_config(obd, sizeof(*lcfg), lcfg);
+ GOTO(out, err);
}
}
+ EXIT;
out:
if ((err < 0) && !(lcfg->lcfg_command & LCFG_REQUIRED)) {
CWARN("Ignoring error %d on optional command %#x\n", err,
/* rc = -EINVAL; continue parsing other params */
skip++;
} else if (rc < 0) {
- CERROR("%s: error writing proc '%s'='%s': rc = %d\n",
- lustre_cfg_string(lcfg, 0), key, sval, rc);
+ CERROR("%s: error writing parameter '%s': rc = %d\n",
+ lustre_cfg_string(lcfg, 0), key, rc);
rc = 0;
} else {
- CDEBUG(D_CONFIG, "%s: Set parameter '%s'='%s'\n",
- lustre_cfg_string(lcfg, 0), key, sval);
+ CDEBUG(D_CONFIG, "%s: set parameter '%s'\n",
+ lustre_cfg_string(lcfg, 0), key);
}
}
LPROC_SEQ_FOPS_RO(ofd_seqs);
/**
- * Show estimate of total amount of dirty data on clients.
- *
- * \param[in] m seq_file handle
- * \param[in] data unused for single entry
- *
- * \retval 0 on success
- * \retval negative value on error
- */
- static int ofd_tot_dirty_seq_show(struct seq_file *m, void *data)
- {
- struct obd_device *obd = m->private;
- struct tg_grants_data *tgd;
-
- LASSERT(obd != NULL);
- tgd = &obd->u.obt.obt_lut->lut_tgd;
- seq_printf(m, "%llu\n", tgd->tgd_tot_dirty);
- return 0;
- }
- LPROC_SEQ_FOPS_RO(ofd_tot_dirty);
-
- /**
- * Show total amount of space granted to clients.
- *
- * \param[in] m seq_file handle
- * \param[in] data unused for single entry
- *
- * \retval 0 on success
- * \retval negative value on error
- */
- static int ofd_tot_granted_seq_show(struct seq_file *m, void *data)
- {
- struct obd_device *obd = m->private;
- struct tg_grants_data *tgd;
-
- LASSERT(obd != NULL);
- tgd = &obd->u.obt.obt_lut->lut_tgd;
- seq_printf(m, "%llu\n", tgd->tgd_tot_granted);
- return 0;
- }
- LPROC_SEQ_FOPS_RO(ofd_tot_granted);
-
- /**
- * Show total amount of space used by IO in progress.
- *
- * \param[in] m seq_file handle
- * \param[in] data unused for single entry
- *
- * \retval 0 on success
- * \retval negative value on error
- */
- static int ofd_tot_pending_seq_show(struct seq_file *m, void *data)
- {
- struct obd_device *obd = m->private;
- struct tg_grants_data *tgd;
-
- LASSERT(obd != NULL);
- tgd = &obd->u.obt.obt_lut->lut_tgd;
- seq_printf(m, "%llu\n", tgd->tgd_tot_pending);
- return 0;
- }
- LPROC_SEQ_FOPS_RO(ofd_tot_pending);
-
- /**
* Show total number of grants for precreate.
*
* \param[in] m seq_file handle
/**
* Show the maximum age of FMD data in seconds.
*
- * Though it is shown in seconds, it is stored internally in units
- * of jiffies for efficiency.
- *
* \param[in] m seq_file handle
* \param[in] data unused for single entry
*
struct obd_device *obd = m->private;
struct ofd_device *ofd = ofd_dev(obd->obd_lu_dev);
- seq_printf(m, "%ld\n", jiffies_to_msecs(ofd->ofd_fmd_max_age) /
- MSEC_PER_SEC);
+ seq_printf(m, "%lld\n", ofd->ofd_fmd_max_age);
return 0;
}
* Set the maximum age of FMD data in seconds.
*
* This defines how long FMD data stays in the FMD list.
- * It is stored internally in units of jiffies for efficiency.
*
* \param[in] file proc file
* \param[in] buffer string which represents maximum number
if (val > 65536 || val < 1)
return -EINVAL;
- ofd->ofd_fmd_max_age = msecs_to_jiffies(val * MSEC_PER_SEC);
+ ofd->ofd_fmd_max_age = val;
return count;
}
LPROC_SEQ_FOPS(ofd_fmd_max_age);
LPROC_SEQ_FOPS(ofd_sync_lock_cancel);
/**
- * Show if grants compatibility mode is disabled.
- *
- * When tgd_grant_compat_disable is set, we don't grant any space to clients
- * not supporting OBD_CONNECT_GRANT_PARAM. Otherwise, space granted to such
- * a client is inflated since it consumes PAGE_SIZE of grant space per
- * block, (i.e. typically 4kB units), but underlaying file system might have
- * block size bigger than page size, e.g. ZFS. See LU-2049 for details.
- *
- * \param[in] m seq_file handle
- * \param[in] data unused for single entry
- *
- * \retval 0 on success
- * \retval negative value on error
- */
- static int ofd_grant_compat_disable_seq_show(struct seq_file *m, void *data)
- {
- struct obd_device *obd = m->private;
- struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd;
-
- seq_printf(m, "%u\n", tgd->tgd_grant_compat_disable);
- return 0;
- }
-
- /**
- * Change grant compatibility mode.
- *
- * Setting tgd_grant_compat_disable prohibit any space granting to clients
- * not supporting OBD_CONNECT_GRANT_PARAM. See details above.
- *
- * \param[in] file proc file
- * \param[in] buffer string which represents mode
- * 1: disable compatibility mode
- * 0: enable compatibility mode
- * \param[in] count \a buffer length
- * \param[in] off unused for single entry
- *
- * \retval \a count on success
- * \retval negative number on error
- */
- static ssize_t
- ofd_grant_compat_disable_seq_write(struct file *file,
- const char __user *buffer,
- size_t count, loff_t *off)
- {
- struct seq_file *m = file->private_data;
- struct obd_device *obd = m->private;
- struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd;
- __s64 val;
- int rc;
-
- rc = lprocfs_str_to_s64(buffer, count, &val);
- if (rc)
- return rc;
-
- if (val < 0)
- return -EINVAL;
-
- tgd->tgd_grant_compat_disable = !!val;
-
- return count;
- }
- LPROC_SEQ_FOPS(ofd_grant_compat_disable);
-
- /**
* Show the limit of soft sync RPCs.
*
* This value defines how many IO RPCs with OBD_BRW_SOFT_SYNC flag
LPROC_SEQ_FOPS_RW_TYPE(ofd, checksum_dump);
LPROC_SEQ_FOPS_RW_TYPE(ofd, job_interval);
+ LPROC_SEQ_FOPS_RO(tgt_tot_dirty);
+ LPROC_SEQ_FOPS_RO(tgt_tot_granted);
+ LPROC_SEQ_FOPS_RO(tgt_tot_pending);
+ LPROC_SEQ_FOPS(tgt_grant_compat_disable);
+
struct lprocfs_vars lprocfs_ofd_obd_vars[] = {
{ .name = "seqs_allocated",
.fops = &ofd_seqs_fops },
{ .name = "last_id",
.fops = &ofd_last_id_fops },
{ .name = "tot_dirty",
- .fops = &ofd_tot_dirty_fops },
+ .fops = &tgt_tot_dirty_fops },
{ .name = "tot_pending",
- .fops = &ofd_tot_pending_fops },
+ .fops = &tgt_tot_pending_fops },
{ .name = "tot_granted",
- .fops = &ofd_tot_granted_fops },
+ .fops = &tgt_tot_granted_fops },
{ .name = "grant_precreate",
.fops = &ofd_grant_precreate_fops },
{ .name = "precreate_batch",
{ .name = "checksum_dump",
.fops = &ofd_checksum_dump_fops },
{ .name = "grant_compat_disable",
- .fops = &ofd_grant_compat_disable_fops },
+ .fops = &tgt_grant_compat_disable_fops },
{ .name = "client_cache_count",
.fops = &ofd_fmd_max_num_fops },
{ .name = "client_cache_seconds",
}
}
if (diff > 0) {
- cfs_time_t enough_time = cfs_time_shift(DISK_TIMEOUT);
- u64 next_id;
- int created = 0;
- int count;
+ time64_t enough_time = ktime_get_seconds() + DISK_TIMEOUT;
+ u64 next_id;
+ int created = 0;
+ int count;
if (!(oa->o_valid & OBD_MD_FLFLAGS) ||
!(oa->o_flags & OBD_FL_DELORPHAN)) {
count, seq, next_id);
if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)
- && cfs_time_after(jiffies, enough_time)) {
+ && ktime_get_seconds() > enough_time) {
CDEBUG(D_HA, "%s: Slow creates, %d/%lld objects"
" created at a rate of %d/s\n",
ofd_name(ofd), created, diff + created,
*
* \retval amount of time to extend the timeout with
*/
-static inline int prolong_timeout(struct ptlrpc_request *req)
+static inline time64_t prolong_timeout(struct ptlrpc_request *req)
{
struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
- time_t req_timeout;
+ time64_t req_timeout;
if (AT_OFF)
return obd_timeout / 2;
req_timeout = req->rq_deadline - req->rq_arrival_time.tv_sec;
- return max_t(time_t, at_est2timeout(at_get(&svcpt->scp_at_estimate)),
+ return max_t(time64_t, at_est2timeout(at_get(&svcpt->scp_at_estimate)),
req_timeout);
}
struct ofd_thread_info *info = NULL;
struct obd_device *obd;
struct tg_grants_data *tgd = &m->ofd_lut.lut_tgd;
- struct obd_statfs *osfs;
struct lu_fid fid;
struct nm_config_file *nodemap_config;
struct obd_device_target *obt;
m->ofd_raid_degraded = 0;
m->ofd_syncjournal = 0;
ofd_slc_set(m);
- tgd->tgd_grant_compat_disable = 0;
m->ofd_soft_sync_limit = OFD_SOFT_SYNC_LIMIT_DEFAULT;
- /* statfs data */
- spin_lock_init(&tgd->tgd_osfs_lock);
- tgd->tgd_osfs_age = cfs_time_shift_64(-1000);
- tgd->tgd_osfs_unstable = 0;
- tgd->tgd_statfs_inflight = 0;
- tgd->tgd_osfs_inflight = 0;
-
- /* grant data */
- spin_lock_init(&tgd->tgd_grant_lock);
- tgd->tgd_tot_dirty = 0;
- tgd->tgd_tot_granted = 0;
- tgd->tgd_tot_pending = 0;
-
m->ofd_seq_count = 0;
init_waitqueue_head(&m->ofd_inconsistency_thread.t_ctl_waitq);
INIT_LIST_HEAD(&m->ofd_inconsistency_list);
ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL,
"filter_ldlm_cb_client", &obd->obd_ldlm_client);
- dt_conf_get(env, m->ofd_osd, &m->ofd_lut.lut_dt_conf);
-
rc = tgt_init(env, &m->ofd_lut, obd, m->ofd_osd, ofd_common_slice,
OBD_FAIL_OST_ALL_REQUEST_NET,
OBD_FAIL_OST_ALL_REPLY_NET);
if (rc)
GOTO(err_free_ns, rc);
- /* populate cached statfs data */
- osfs = &ofd_info(env)->fti_u.osfs;
- rc = tgt_statfs_internal(env, &m->ofd_lut, osfs, 0, NULL);
- if (rc != 0) {
- CERROR("%s: can't get statfs data, rc %d\n", obd->obd_name, rc);
- GOTO(err_fini_lut, rc);
- }
- if (!is_power_of_2(osfs->os_bsize)) {
- CERROR("%s: blocksize (%d) is not a power of 2\n",
- obd->obd_name, osfs->os_bsize);
- GOTO(err_fini_lut, rc = -EPROTO);
- }
- tgd->tgd_blockbits = fls(osfs->os_bsize) - 1;
+ tgd->tgd_reserved_pcnt = 0;
if (DT_DEF_BRW_SIZE < (1U << tgd->tgd_blockbits))
m->ofd_brw_size = 1U << tgd->tgd_blockbits;
m->ofd_cksum_types_supported = cksum_types_supported_server();
m->ofd_precreate_batch = OFD_PRECREATE_BATCH_DEFAULT;
- if (osfs->os_bsize * osfs->os_blocks < OFD_PRECREATE_SMALL_FS)
+ if (tgd->tgd_osfs.os_bsize * tgd->tgd_osfs.os_blocks <
+ OFD_PRECREATE_SMALL_FS)
m->ofd_precreate_batch = OFD_PRECREATE_BATCH_SMALL;
rc = ofd_fs_setup(env, m, obd);
return(rc);
}
- rc = ofd_dlm_init();
- if (rc) {
- lu_kmem_fini(ofd_caches);
- ofd_fmd_exit();
- return rc;
- }
-
rc = class_register_type(&ofd_obd_ops, NULL, true, NULL,
LUSTRE_OST_NAME, &ofd_device_type);
return rc;
static void __exit ofd_exit(void)
{
ofd_fmd_exit();
- ofd_dlm_exit();
lu_kmem_fini(ofd_caches);
class_unregister_type(LUSTRE_OST_NAME);
}
int error;
};
- int ofd_dlm_init(void)
- {
- ldlm_glimpse_work_kmem = kmem_cache_create("ldlm_glimpse_work_kmem",
- sizeof(struct ldlm_glimpse_work),
- 0, 0, NULL);
- if (ldlm_glimpse_work_kmem == NULL)
- return -ENOMEM;
- else
- return 0;
- }
-
- void ofd_dlm_exit(void)
- {
- if (ldlm_glimpse_work_kmem) {
- kmem_cache_destroy(ldlm_glimpse_work_kmem);
- ldlm_glimpse_work_kmem = NULL;
- }
- }
-
/**
* OFD interval callback.
*
/* Find the 'victim' lock from this interval */
list_for_each_entry(lck, &node->li_group, l_sl_policy) {
-
victim_lock = LDLM_LOCK_GET(lck);
/* the same policy group - every lock has the
struct ldlm_glimpse_work *pos, *tmp;
ENTRY;
+ /* update stats for intent in intent policy */
+ if (ptlrpc_req2svc(req)->srv_stats != NULL)
+ lprocfs_counter_incr(ptlrpc_req2svc(req)->srv_stats,
+ PTLRPC_LAST_CNTR + LDLM_GLIMPSE_ENQUEUE);
+
INIT_LIST_HEAD(&arg.gl_list);
arg.no_glimpse_ast = false;
arg.error = 0;
struct list_head fmd_list; /* linked to fed_mod_list */
struct lu_fid fmd_fid; /* FID being written to */
__u64 fmd_mactime_xid; /* xid highest {m,a,c}time setattr */
- cfs_time_t fmd_expire; /* time when the fmd should expire */
+ time64_t fmd_expire; /* time when the fmd should expire */
int fmd_refcount; /* reference counter - list holds 1 */
};
#define OFD_FMD_MAX_NUM_DEFAULT 128
-#define OFD_FMD_MAX_AGE_DEFAULT msecs_to_jiffies((obd_timeout+10)*MSEC_PER_SEC)
+#define OFD_FMD_MAX_AGE_DEFAULT (obd_timeout + 10)
#define OFD_SOFT_SYNC_LIMIT_DEFAULT 16
/* ofd mod data: ofd_device wide values */
int ofd_fmd_max_num; /* per ofd ofd_mod_data */
- cfs_duration_t ofd_fmd_max_age; /* time to fmd expiry */
+ time64_t ofd_fmd_max_age; /* time to fmd expiry */
spinlock_t ofd_flags_lock;
unsigned long ofd_raid_degraded:1,
/* ofd_dlm.c */
extern struct kmem_cache *ldlm_glimpse_work_kmem;
- int ofd_dlm_init(void);
- void ofd_dlm_exit(void);
+
int ofd_intent_policy(struct ldlm_namespace *ns, struct ldlm_lock **lockp,
void *req_cookie, enum ldlm_mode mode, __u64 flags,
void *data);
return result;
}
+ EXPORT_SYMBOL(osc_page_init);
/**
* Helper function called by osc_io_submit() for every page in an immediate
}
RETURN(count > 0 ? count : rc);
}
+ EXPORT_SYMBOL(osc_lru_shrink);
/**
* Reclaim LRU pages by an IO thread. The caller wants to reclaim at least
struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
struct osc_io *oio = osc_env_io(env);
int rc = 0;
+
ENTRY;
if (cli->cl_cache == NULL) /* shall not be in LRU */
* are likely from the same page zone.
*/
static inline void unstable_page_accounting(struct ptlrpc_bulk_desc *desc,
+ struct osc_brw_async_args *aa,
int factor)
{
- int page_count = desc->bd_iov_count;
+ int page_count;
void *zone = NULL;
int count = 0;
int i;
- LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type));
+ if (desc != NULL) {
+ LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type));
+ page_count = desc->bd_iov_count;
+ } else {
+ page_count = aa->aa_page_count;
+ }
for (i = 0; i < page_count; i++) {
- void *pz = page_zone(BD_GET_KIOV(desc, i).kiov_page);
+ void *pz;
+ if (desc)
+ pz = page_zone(BD_GET_KIOV(desc, i).kiov_page);
+ else
+ pz = page_zone(aa->aa_ppga[i]->pg);
if (likely(pz == zone)) {
++count;
mod_zone_page_state(zone, NR_UNSTABLE_NFS, factor * count);
}
-static inline void add_unstable_page_accounting(struct ptlrpc_bulk_desc *desc)
+static inline void add_unstable_page_accounting(struct ptlrpc_bulk_desc *desc,
+ struct osc_brw_async_args *aa)
{
- unstable_page_accounting(desc, 1);
+ unstable_page_accounting(desc, aa, 1);
}
-static inline void dec_unstable_page_accounting(struct ptlrpc_bulk_desc *desc)
+static inline void dec_unstable_page_accounting(struct ptlrpc_bulk_desc *desc,
+ struct osc_brw_async_args *aa)
{
- unstable_page_accounting(desc, -1);
+ unstable_page_accounting(desc, aa, -1);
}
/**
void osc_dec_unstable_pages(struct ptlrpc_request *req)
{
struct ptlrpc_bulk_desc *desc = req->rq_bulk;
+ struct osc_brw_async_args *aa = (void *)&req->rq_async_args;
struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
- int page_count = desc->bd_iov_count;
+ int page_count;
long unstable_count;
+ if (desc)
+ page_count = desc->bd_iov_count;
+ else
+ page_count = aa->aa_page_count;
+
LASSERT(page_count >= 0);
- dec_unstable_page_accounting(desc);
+
+ dec_unstable_page_accounting(desc, aa);
unstable_count = atomic_long_sub_return(page_count,
&cli->cl_unstable_count);
void osc_inc_unstable_pages(struct ptlrpc_request *req)
{
struct ptlrpc_bulk_desc *desc = req->rq_bulk;
+ struct osc_brw_async_args *aa = (void *)&req->rq_async_args;
struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
- long page_count = desc->bd_iov_count;
+ long page_count;
/* No unstable page tracking */
if (cli->cl_cache == NULL || !cli->cl_cache->ccc_unstable_check)
return;
- add_unstable_page_accounting(desc);
+ if (desc)
+ page_count = desc->bd_iov_count;
+ else
+ page_count = aa->aa_page_count;
+
+ add_unstable_page_accounting(desc, aa);
atomic_long_add(page_count, &cli->cl_unstable_count);
atomic_long_add(page_count, &cli->cl_cache->ccc_unstable_nr);
static unsigned int osc_reqpool_mem_max = 5;
module_param(osc_reqpool_mem_max, uint, 0444);
-struct osc_brw_async_args {
- struct obdo *aa_oa;
- int aa_requested_nob;
- int aa_nio_count;
- u32 aa_page_count;
- int aa_resends;
- struct brw_page **aa_ppga;
- struct client_obd *aa_cli;
- struct list_head aa_oaps;
- struct list_head aa_exts;
-};
-
#define osc_grant_args osc_brw_async_args
struct osc_setattr_args {
void *la_cookie;
};
- struct osc_enqueue_args {
- struct obd_export *oa_exp;
- enum ldlm_type oa_type;
- enum ldlm_mode oa_mode;
- __u64 *oa_flags;
- osc_enqueue_upcall_f oa_upcall;
- void *oa_cookie;
- struct ost_lvb *oa_lvb;
- struct lustre_handle oa_lockh;
- bool oa_speculative;
- };
-
static void osc_release_ppga(struct brw_page **ppga, size_t count);
static int brw_interpret(const struct lu_env *env, struct ptlrpc_request *req,
void *data, int rc);
RETURN(rc);
}
- int osc_punch_base(struct obd_export *exp, struct obdo *oa,
- obd_enqueue_update_f upcall, void *cookie,
- struct ptlrpc_request_set *rqset)
+ int osc_punch_send(struct obd_export *exp, struct obdo *oa,
+ obd_enqueue_update_f upcall, void *cookie)
{
- struct ptlrpc_request *req;
- struct osc_setattr_args *sa;
- struct ost_body *body;
- int rc;
- ENTRY;
+ struct ptlrpc_request *req;
+ struct osc_setattr_args *sa;
+ struct obd_import *imp = class_exp2cliimp(exp);
+ struct ost_body *body;
+ int rc;
- req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_PUNCH);
- if (req == NULL)
- RETURN(-ENOMEM);
+ ENTRY;
- rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH);
- if (rc) {
- ptlrpc_request_free(req);
- RETURN(rc);
- }
- req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
- ptlrpc_at_set_req_timeout(req);
+ req = ptlrpc_request_alloc(imp, &RQF_OST_PUNCH);
+ if (req == NULL)
+ RETURN(-ENOMEM);
+
+ rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH);
+ if (rc < 0) {
+ ptlrpc_request_free(req);
+ RETURN(rc);
+ }
+
+ osc_set_io_portal(req);
+
+ ptlrpc_at_set_req_timeout(req);
body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
- LASSERT(body);
- lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
+
+ lustre_set_wire_obdo(&imp->imp_connect_data, &body->oa, oa);
ptlrpc_request_set_replen(req);
sa->sa_oa = oa;
sa->sa_upcall = upcall;
sa->sa_cookie = cookie;
- if (rqset == PTLRPCD_SET)
- ptlrpcd_add_req(req);
- else
- ptlrpc_set_add_req(rqset, req);
+
+ ptlrpcd_add_req(req);
RETURN(0);
}
+ EXPORT_SYMBOL(osc_punch_send);
static int osc_sync_interpret(const struct lu_env *env,
struct ptlrpc_request *req,
void osc_update_next_shrink(struct client_obd *cli)
{
- cli->cl_next_shrink_grant =
- cfs_time_shift(cli->cl_grant_shrink_interval);
- CDEBUG(D_CACHE, "next time %ld to shrink grant \n",
- cli->cl_next_shrink_grant);
+ cli->cl_next_shrink_grant = ktime_get_seconds() +
+ cli->cl_grant_shrink_interval;
+
+ CDEBUG(D_CACHE, "next time %lld to shrink grant\n",
+ cli->cl_next_shrink_grant);
}
static void __osc_update_grant(struct client_obd *cli, u64 grant)
}
}
- static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
- u32 keylen, void *key,
- u32 vallen, void *val,
- struct ptlrpc_request_set *set);
-
static int osc_shrink_grant_interpret(const struct lu_env *env,
struct ptlrpc_request *req,
void *aa, int rc)
static int osc_should_shrink_grant(struct client_obd *client)
{
- cfs_time_t time = cfs_time_current();
- cfs_time_t next_shrink = client->cl_next_shrink_grant;
+ time64_t next_shrink = client->cl_next_shrink_grant;
if ((client->cl_import->imp_connect_data.ocd_connect_flags &
OBD_CONNECT_GRANT_SHRINK) == 0)
return 0;
- if (cfs_time_aftereq(time, next_shrink - 5 * CFS_TICK)) {
+ if (ktime_get_seconds() >= next_shrink - 5) {
/* Get the current RPC size directly, instead of going via:
* cli_brw_size(obd->u.cli.cl_import->imp_obd->obd_self_export)
* Keep comment here so that it can be found by searching. */
TIMEOUT_GRANT);
}
- static void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd)
+ void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd)
{
/*
* ocd_grant is the total grant amount we're expect to hold: if we've
list_empty(&cli->cl_grant_shrink_list))
osc_add_shrink_grant(cli);
}
+ EXPORT_SYMBOL(osc_init_grant);
/* We assume that the reason this OSC got a short read is because it read
* beyond the end of a stripe file; i.e. lustre is reading a sparse file
return(-EPROTO);
}
}
-
- if (req->rq_bulk->bd_nob_transferred != requested_nob) {
+ if (req->rq_bulk != NULL &&
+ req->rq_bulk->bd_nob_transferred != requested_nob) {
CERROR("Unexpected # bytes transferred: %d (requested %d)\n",
req->rq_bulk->bd_nob_transferred, requested_nob);
return(-EPROTO);
struct ost_body *body;
struct obd_ioobj *ioobj;
struct niobuf_remote *niobuf;
- int niocount, i, requested_nob, opc, rc;
+ int niocount, i, requested_nob, opc, rc, short_io_size;
struct osc_brw_async_args *aa;
struct req_capsule *pill;
struct brw_page *pg_prev;
+ void *short_io_buf;
ENTRY;
if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ))
req_capsule_set_size(pill, &RMF_NIOBUF_REMOTE, RCL_CLIENT,
niocount * sizeof(*niobuf));
+ for (i = 0; i < page_count; i++)
+ short_io_size += pga[i]->count;
+
+ /* Check if we can do a short io. */
+ if (!(short_io_size <= cli->cl_short_io_bytes && niocount == 1 &&
+ imp_connect_shortio(cli->cl_import)))
+ short_io_size = 0;
+
+ req_capsule_set_size(pill, &RMF_SHORT_IO, RCL_CLIENT,
+ opc == OST_READ ? 0 : short_io_size);
+ if (opc == OST_READ)
+ req_capsule_set_size(pill, &RMF_SHORT_IO, RCL_SERVER,
+ short_io_size);
+
rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, opc);
if (rc) {
ptlrpc_request_free(req);
RETURN(rc);
}
- req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
- ptlrpc_at_set_req_timeout(req);
+ osc_set_io_portal(req);
+ ptlrpc_at_set_req_timeout(req);
/* ask ptlrpc not to resend on EINPROGRESS since BRWs have their own
* retry logic */
req->rq_no_retry_einprogress = 1;
+ if (short_io_size != 0) {
+ desc = NULL;
+ short_io_buf = NULL;
+ goto no_bulk;
+ }
+
desc = ptlrpc_prep_bulk_imp(req, page_count,
cli->cl_import->imp_connect_data.ocd_brw_size >> LNET_MTU_BITS,
(opc == OST_WRITE ? PTLRPC_BULK_GET_SOURCE :
if (desc == NULL)
GOTO(out, rc = -ENOMEM);
/* NB request now owns desc and will free it when it gets freed */
-
+no_bulk:
body = req_capsule_client_get(pill, &RMF_OST_BODY);
ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ);
niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE);
* when the RPC is finally sent in ptlrpc_register_bulk(). It sends
* "max - 1" for old client compatibility sending "0", and also so the
* the actual maximum is a power-of-two number, not one less. LU-1431 */
- ioobj_max_brw_set(ioobj, desc->bd_md_max_brw);
+ if (desc != NULL)
+ ioobj_max_brw_set(ioobj, desc->bd_md_max_brw);
+ else /* short io */
+ ioobj_max_brw_set(ioobj, 0);
+
+ if (short_io_size != 0) {
+ if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) {
+ body->oa.o_valid |= OBD_MD_FLFLAGS;
+ body->oa.o_flags = 0;
+ }
+ body->oa.o_flags |= OBD_FL_SHORT_IO;
+ CDEBUG(D_CACHE, "Using short io for data transfer, size = %d\n",
+ short_io_size);
+ if (opc == OST_WRITE) {
+ short_io_buf = req_capsule_client_get(pill,
+ &RMF_SHORT_IO);
+ LASSERT(short_io_buf != NULL);
+ }
+ }
+
LASSERT(page_count > 0);
pg_prev = pga[0];
for (requested_nob = i = 0; i < page_count; i++, niobuf++) {
pg_prev->pg->index, pg_prev->off);
LASSERT((pga[0]->flag & OBD_BRW_SRVLOCK) ==
(pg->flag & OBD_BRW_SRVLOCK));
-
- desc->bd_frag_ops->add_kiov_frag(desc, pg->pg, poff, pg->count);
- requested_nob += pg->count;
+ if (short_io_size != 0 && opc == OST_WRITE) {
+ unsigned char *ptr = ll_kmap_atomic(pg->pg, KM_USER0);
+
+ LASSERT(short_io_size >= requested_nob + pg->count);
+ memcpy(short_io_buf + requested_nob,
+ ptr + poff,
+ pg->count);
+ ll_kunmap_atomic(ptr, KM_USER0);
+ } else if (short_io_size == 0) {
+ desc->bd_frag_ops->add_kiov_frag(desc, pg->pg, poff,
+ pg->count);
+ }
+ requested_nob += pg->count;
if (i > 0 && can_merge_pages(pg_prev, pg)) {
niobuf--;
* resent due to cksum error, this will allow Server to
* check+dump pages on its side */
}
- ptlrpc_request_set_replen(req);
+ ptlrpc_request_set_replen(req);
- CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
- aa = ptlrpc_req_async_args(req);
- aa->aa_oa = oa;
- aa->aa_requested_nob = requested_nob;
- aa->aa_nio_count = niocount;
- aa->aa_page_count = page_count;
- aa->aa_resends = 0;
- aa->aa_ppga = pga;
- aa->aa_cli = cli;
+ CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+ aa = ptlrpc_req_async_args(req);
+ aa->aa_oa = oa;
+ aa->aa_requested_nob = requested_nob;
+ aa->aa_nio_count = niocount;
+ aa->aa_page_count = page_count;
+ aa->aa_resends = 0;
+ aa->aa_ppga = pga;
+ aa->aa_cli = cli;
INIT_LIST_HEAD(&aa->aa_oaps);
*reqp = req;
CERROR("Unexpected +ve rc %d\n", rc);
RETURN(-EPROTO);
}
- LASSERT(req->rq_bulk->bd_nob == aa->aa_requested_nob);
- if (sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk))
+ if (req->rq_bulk != NULL &&
+ sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk))
RETURN(-EAGAIN);
if ((aa->aa_oa->o_valid & OBD_MD_FLCKSUM) && client_cksum &&
/* The rest of this function executes only for OST_READs */
- /* if unwrap_bulk failed, return -EAGAIN to retry */
- rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc);
+ if (req->rq_bulk == NULL) {
+ rc = req_capsule_get_size(&req->rq_pill, &RMF_SHORT_IO,
+ RCL_SERVER);
+ LASSERT(rc == req->rq_status);
+ } else {
+ /* if unwrap_bulk failed, return -EAGAIN to retry */
+ rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc);
+ }
if (rc < 0)
GOTO(out, rc = -EAGAIN);
RETURN(-EPROTO);
}
- if (rc != req->rq_bulk->bd_nob_transferred) {
+ if (req->rq_bulk != NULL && rc != req->rq_bulk->bd_nob_transferred) {
CERROR ("Unexpected rc %d (%d transferred)\n",
rc, req->rq_bulk->bd_nob_transferred);
return (-EPROTO);
}
+ if (req->rq_bulk == NULL) {
+ /* short io */
+ int nob, pg_count, i = 0;
+ unsigned char *buf;
+
+ CDEBUG(D_CACHE, "Using short io read, size %d\n", rc);
+ pg_count = aa->aa_page_count;
+ buf = req_capsule_server_sized_get(&req->rq_pill, &RMF_SHORT_IO,
+ rc);
+ nob = rc;
+ while (nob > 0 && pg_count > 0) {
+ unsigned char *ptr;
+ int count = aa->aa_ppga[i]->count > nob ?
+ nob : aa->aa_ppga[i]->count;
+
+ CDEBUG(D_CACHE, "page %p count %d\n",
+ aa->aa_ppga[i]->pg, count);
+ ptr = ll_kmap_atomic(aa->aa_ppga[i]->pg, KM_USER0);
+ memcpy(ptr + (aa->aa_ppga[i]->off & ~PAGE_MASK), buf,
+ count);
+ ll_kunmap_atomic((void *) ptr, KM_USER0);
+
+ buf += count;
+ nob -= count;
+ i++;
+ pg_count--;
+ }
+ }
+
if (rc < aa->aa_requested_nob)
handle_short_read(rc, aa->aa_page_count, aa->aa_ppga);
aa->aa_ppga, OST_READ,
cksum_type);
- if (peer->nid != req->rq_bulk->bd_sender) {
+ if (req->rq_bulk != NULL &&
+ peer->nid != req->rq_bulk->bd_sender) {
via = " via ";
router = libcfs_nid2str(req->rq_bulk->bd_sender);
}
struct osc_extent *ext;
struct osc_extent *tmp;
struct client_obd *cli = aa->aa_cli;
+ unsigned long transferred = 0;
ENTRY;
rc = osc_brw_fini_request(req, rc);
LASSERT(list_empty(&aa->aa_exts));
LASSERT(list_empty(&aa->aa_oaps));
+ transferred = (req->rq_bulk == NULL ? /* short io */
+ aa->aa_requested_nob :
+ req->rq_bulk->bd_nob_transferred);
+
osc_release_ppga(aa->aa_ppga, aa->aa_page_count);
- ptlrpc_lprocfs_brw(req, req->rq_bulk->bd_nob_transferred);
+ ptlrpc_lprocfs_brw(req, transferred);
spin_lock(&cli->cl_loi_list_lock);
/* We need to decrement before osc_ap_completion->osc_wake_cache_waiters
return set;
}
- static int osc_enqueue_fini(struct ptlrpc_request *req,
- osc_enqueue_upcall_f upcall, void *cookie,
- struct lustre_handle *lockh, enum ldlm_mode mode,
- __u64 *flags, bool speculative, int errcode)
+ int osc_enqueue_fini(struct ptlrpc_request *req, osc_enqueue_upcall_f upcall,
+ void *cookie, struct lustre_handle *lockh,
+ enum ldlm_mode mode, __u64 *flags, bool speculative,
+ int errcode)
{
bool intent = *flags & LDLM_FL_HAS_INTENT;
int rc;
if (errcode == ELDLM_OK && lustre_handle_is_used(lockh))
ldlm_lock_decref(lockh, mode);
- RETURN(rc);
+ RETURN(rc);
}
- static int osc_enqueue_interpret(const struct lu_env *env,
- struct ptlrpc_request *req,
- struct osc_enqueue_args *aa, int rc)
+ int osc_enqueue_interpret(const struct lu_env *env, struct ptlrpc_request *req,
+ struct osc_enqueue_args *aa, int rc)
{
struct ldlm_lock *lock;
struct lustre_handle *lockh = &aa->oa_lockh;
rc = osc_enqueue_fini(req, aa->oa_upcall, aa->oa_cookie, lockh, mode,
aa->oa_flags, aa->oa_speculative, rc);
- OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10);
+ OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10);
ldlm_lock_decref(lockh, mode);
LDLM_LOCK_PUT(lock);
req->rq_no_delay = 1;
}
- req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_statfs_interpret;
- CLASSERT (sizeof(*aa) <= sizeof(req->rq_async_args));
- aa = ptlrpc_req_async_args(req);
- aa->aa_oi = oinfo;
+ req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_statfs_interpret;
+ CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+ aa = ptlrpc_req_async_args(req);
+ aa->aa_oi = oinfo;
- ptlrpc_set_add_req(rqset, req);
- RETURN(0);
+ ptlrpc_set_add_req(rqset, req);
+ RETURN(0);
}
static int osc_statfs(const struct lu_env *env, struct obd_export *exp,
return err;
}
- static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
- u32 keylen, void *key,
- u32 vallen, void *val,
- struct ptlrpc_request_set *set)
+ int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
+ u32 keylen, void *key, u32 vallen, void *val,
+ struct ptlrpc_request_set *set)
{
struct ptlrpc_request *req;
struct obd_device *obd = exp->exp_obd;
tmp = req_capsule_client_get(&req->rq_pill, KEY_IS(KEY_GRANT_SHRINK) ?
&RMF_OST_BODY :
&RMF_SETINFO_VAL);
- memcpy(tmp, val, vallen);
+ memcpy(tmp, val, vallen);
if (KEY_IS(KEY_GRANT_SHRINK)) {
- struct osc_grant_args *aa;
- struct obdo *oa;
-
- CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
- aa = ptlrpc_req_async_args(req);
- OBDO_ALLOC(oa);
- if (!oa) {
- ptlrpc_req_finished(req);
- RETURN(-ENOMEM);
- }
- *oa = ((struct ost_body *)val)->oa;
- aa->aa_oa = oa;
- req->rq_interpret_reply = osc_shrink_grant_interpret;
- }
+ struct osc_grant_args *aa;
+ struct obdo *oa;
+
+ CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+ aa = ptlrpc_req_async_args(req);
+ OBDO_ALLOC(oa);
+ if (!oa) {
+ ptlrpc_req_finished(req);
+ RETURN(-ENOMEM);
+ }
+ *oa = ((struct ost_body *)val)->oa;
+ aa->aa_oa = oa;
+ req->rq_interpret_reply = osc_shrink_grant_interpret;
+ }
ptlrpc_request_set_replen(req);
if (!KEY_IS(KEY_GRANT_SHRINK)) {
RETURN(0);
}
+ EXPORT_SYMBOL(osc_set_info_async);
- static int osc_reconnect(const struct lu_env *env,
- struct obd_export *exp, struct obd_device *obd,
- struct obd_uuid *cluuid,
- struct obd_connect_data *data,
- void *localdata)
+ int osc_reconnect(const struct lu_env *env, struct obd_export *exp,
+ struct obd_device *obd, struct obd_uuid *cluuid,
+ struct obd_connect_data *data, void *localdata)
{
- struct client_obd *cli = &obd->u.cli;
+ struct client_obd *cli = &obd->u.cli;
- if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) {
- long lost_grant;
+ if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) {
+ long lost_grant;
long grant;
spin_lock(&cli->cl_loi_list_lock);
RETURN(0);
}
+ EXPORT_SYMBOL(osc_reconnect);
- static int osc_disconnect(struct obd_export *exp)
+ int osc_disconnect(struct obd_export *exp)
{
struct obd_device *obd = class_exp2obd(exp);
int rc;
osc_del_shrink_grant(&obd->u.cli);
return rc;
}
+ EXPORT_SYMBOL(osc_disconnect);
- static int osc_ldlm_resource_invalidate(struct cfs_hash *hs,
- struct cfs_hash_bd *bd, struct hlist_node *hnode, void *arg)
+ int osc_ldlm_resource_invalidate(struct cfs_hash *hs, struct cfs_hash_bd *bd,
+ struct hlist_node *hnode, void *arg)
{
struct lu_env *env = arg;
struct ldlm_resource *res = cfs_hash_object(hs, hnode);
RETURN(0);
}
+ EXPORT_SYMBOL(osc_ldlm_resource_invalidate);
static int osc_import_event(struct obd_device *obd,
struct obd_import *imp,
RETURN(0);
}
- int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+ int osc_setup_common(struct obd_device *obd, struct lustre_cfg *lcfg)
{
struct client_obd *cli = &obd->u.cli;
- struct obd_type *type;
- void *handler;
- int rc;
- int adding;
- int added;
- int req_count;
+ void *handler;
+ int rc;
+
ENTRY;
rc = ptlrpcd_addref();
if (rc)
GOTO(out_ptlrpcd, rc);
+
handler = ptlrpcd_alloc_work(cli->cl_import, brw_queue_work, cli);
if (IS_ERR(handler))
- GOTO(out_client_setup, rc = PTR_ERR(handler));
+ GOTO(out_ptlrpcd_work, rc = PTR_ERR(handler));
cli->cl_writeback_work = handler;
handler = ptlrpcd_alloc_work(cli->cl_import, lru_queue_work, cli);
cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL;
+ INIT_LIST_HEAD(&cli->cl_grant_shrink_list);
+ RETURN(rc);
+
+ out_ptlrpcd_work:
+ if (cli->cl_writeback_work != NULL) {
+ ptlrpcd_destroy_work(cli->cl_writeback_work);
+ cli->cl_writeback_work = NULL;
+ }
+ if (cli->cl_lru_work != NULL) {
+ ptlrpcd_destroy_work(cli->cl_lru_work);
+ cli->cl_lru_work = NULL;
+ }
+ client_obd_cleanup(obd);
+ out_ptlrpcd:
+ ptlrpcd_decref();
+ RETURN(rc);
+ }
+ EXPORT_SYMBOL(osc_setup_common);
+
+ int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+ {
+ struct client_obd *cli = &obd->u.cli;
+ struct obd_type *type;
+ int adding;
+ int added;
+ int req_count;
+ int rc;
+
+ ENTRY;
+
+ rc = osc_setup_common(obd, lcfg);
+ if (rc < 0)
+ RETURN(rc);
+
#ifdef CONFIG_PROC_FS
obd->obd_vars = lprocfs_osc_obd_vars;
#endif
spin_unlock(&osc_shrink_lock);
RETURN(0);
-
- out_ptlrpcd_work:
- if (cli->cl_writeback_work != NULL) {
- ptlrpcd_destroy_work(cli->cl_writeback_work);
- cli->cl_writeback_work = NULL;
- }
- if (cli->cl_lru_work != NULL) {
- ptlrpcd_destroy_work(cli->cl_lru_work);
- cli->cl_lru_work = NULL;
- }
- out_client_setup:
- client_obd_cleanup(obd);
- out_ptlrpcd:
- ptlrpcd_decref();
- RETURN(rc);
}
- static int osc_precleanup(struct obd_device *obd)
+ int osc_precleanup_common(struct obd_device *obd)
{
struct client_obd *cli = &obd->u.cli;
ENTRY;
}
obd_cleanup_client_import(obd);
+ RETURN(0);
+ }
+ EXPORT_SYMBOL(osc_precleanup_common);
+
+ static int osc_precleanup(struct obd_device *obd)
+ {
+ ENTRY;
+
+ osc_precleanup_common(obd);
+
ptlrpc_lprocfs_unregister_obd(obd);
lprocfs_obd_cleanup(obd);
RETURN(0);
}
- int osc_cleanup(struct obd_device *obd)
+ int osc_cleanup_common(struct obd_device *obd)
{
struct client_obd *cli = &obd->u.cli;
int rc;
ptlrpcd_decref();
RETURN(rc);
}
+ EXPORT_SYMBOL(osc_cleanup_common);
int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg)
{
.o_owner = THIS_MODULE,
.o_setup = osc_setup,
.o_precleanup = osc_precleanup,
- .o_cleanup = osc_cleanup,
+ .o_cleanup = osc_cleanup_common,
.o_add_conn = client_import_add_conn,
.o_del_conn = client_import_del_conn,
.o_connect = client_connect_import,
if (rc)
GOTO(out_sa, rc);
+#ifdef ZFS_PROJINHERIT
+ if (o->od_projectused_dn && osa->flags & ZFS_PROJID) {
+ rc = -sa_lookup(obj->oo_sa_hdl, SA_ZPL_PROJID(o),
+ &osa->projid, 8);
+ if (rc)
+ GOTO(out_sa, rc);
+
+ la->la_projid = osa->projid;
+ la->la_valid |= LA_PROJID;
+ obj->oo_with_projid = 1;
+ } else {
+ la->la_projid = ZFS_DEFAULT_PROJID;
+ la->la_valid &= ~LA_PROJID;
+ }
+#else
+ la->la_projid = 0;
+ la->la_valid &= ~LA_PROJID;
+#endif
+
la->la_atime = osa->atime[0];
la->la_mtime = osa->mtime[0];
la->la_ctime = osa->ctime[0];
case ACCT_GROUP_OID:
dn = osd->od_groupused_dn;
break;
+#ifdef ZFS_PROJINHERIT
+ case ACCT_PROJECT_OID:
+ dn = osd->od_projectused_dn;
+ break;
+#endif
default:
break;
}
/* one less inode */
rc = osd_declare_quota(env, osd, obj->oo_attr.la_uid,
- obj->oo_attr.la_gid, -1, oh, false, NULL, false);
+ obj->oo_attr.la_gid, obj->oo_attr.la_projid,
+ -1, oh, NULL, OSD_QID_INODE);
if (rc)
RETURN(rc);
/* data to be truncated */
rc = osd_declare_quota(env, osd, obj->oo_attr.la_uid,
- obj->oo_attr.la_gid, 0, oh, true, NULL, false);
+ obj->oo_attr.la_gid, obj->oo_attr.la_projid,
+ 0, oh, NULL, OSD_QID_BLK);
if (rc)
RETURN(rc);
* anything else */
}
- if (attr && (attr->la_valid & (LA_UID | LA_GID))) {
+ if (attr && (attr->la_valid & (LA_UID | LA_GID | LA_PROJID))) {
sa_object_size(obj->oo_sa_hdl, &blksize, &bspace);
bspace = toqb(bspace * blksize);
}
GOTO(out, rc);
}
}
-
+#ifdef ZFS_PROJINHERIT
+ if (attr && attr->la_valid & LA_PROJID) {
+ if (!osd->od_projectused_dn)
+ GOTO(out, rc = -EOPNOTSUPP);
+
+ /* Usually, if project quota is upgradable for the device,
+ * then the upgrade will be done before or when mounting the
+ * device. So when we come here, this project should have
+ * project ID attribute already (that is zero by default).
+ * Otherwise, there was something wrong during the former
+ * upgrade, let's return failure to report that.
+ *
+ * Please note that, different from other attributes, you
+ * can NOT simply set the project ID attribute under such
+ * case, because adding (NOT change) project ID attribute
+ * needs to change the object's attribute layout to match
+ * zfs backend quota accounting requirement. */
+ if (unlikely(!obj->oo_with_projid))
+ GOTO(out, rc = -ENXIO);
+
+ /* quota enforcement for project */
+ if (attr->la_projid != obj->oo_attr.la_projid) {
+ rc = qsd_transfer(env, osd->od_quota_slave,
+ &oh->ot_quota_trans, PRJQUOTA,
+ obj->oo_attr.la_projid,
+ attr->la_projid, bspace,
+ &info->oti_qi);
+ if (rc)
+ GOTO(out, rc);
+ }
+ }
+#endif
out:
up_read(&obj->oo_guard);
RETURN(rc);
if (rc < 0) {
CWARN("%s: failed to set LMA flags: rc = %d\n",
osd->od_svname, rc);
- RETURN(rc);
+ GOTO(out, rc);
}
}
}
write_lock(&obj->oo_attr_lock);
cnt = 0;
+
+ if (valid & LA_PROJID) {
+#ifdef ZFS_PROJINHERIT
+ /* osd_declare_attr_set() must be called first.
+ * If osd::od_projectused_dn is not set, then we
+ * can not arrive at here. */
+ LASSERT(osd->od_projectused_dn);
+ LASSERT(obj->oo_with_projid);
+
+ osa->projid = obj->oo_attr.la_projid = la->la_projid;
+ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_PROJID(osd), NULL,
+ &osa->projid, 8);
+#else
+ valid &= ~LA_PROJID;
+#endif
+ }
+
if (valid & LA_ATIME) {
osa->atime[0] = obj->oo_attr.la_atime = la->la_atime;
SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(osd), NULL,
/* many flags are not supported by zfs, so ensure a good cached
* copy */
obj->oo_attr.la_flags = attrs_zfs2fs(osa->flags);
+#ifdef ZFS_PROJINHERIT
+ if (obj->oo_with_projid)
+ osa->flags |= ZFS_PROJID;
+#endif
SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(osd), NULL,
&osa->flags, 8);
}
/* will help to find FID->ino mapping at dt_insert() */
osd_idc_find_and_init(env, osd, obj);
- rc = osd_declare_quota(env, osd, attr->la_uid, attr->la_gid, 1, oh,
- false, NULL, false);
+ rc = osd_declare_quota(env, osd, attr->la_uid, attr->la_gid,
+ attr->la_projid, 1, oh, NULL, OSD_QID_INODE);
RETURN(rc);
}
int __osd_attr_init(const struct lu_env *env, struct osd_device *osd,
- sa_handle_t *sa_hdl, dmu_tx_t *tx,
+ struct osd_object *obj, sa_handle_t *sa_hdl, dmu_tx_t *tx,
struct lu_attr *la, uint64_t parent,
nvlist_t *xattr)
{
osa->gid = la->la_gid;
osa->rdev = la->la_rdev;
osa->nlink = la->la_nlink;
- osa->flags = attrs_fs2zfs(la->la_flags);
+ if (la->la_valid & LA_FLAGS)
+ osa->flags = attrs_fs2zfs(la->la_flags);
+ else
+ osa->flags = 0;
osa->size = la->la_size;
+#ifdef ZFS_PROJINHERIT
+ if (osd->od_projectused_dn) {
+ if (la->la_valid & LA_PROJID)
+ osa->projid = la->la_projid;
+ else
+ osa->projid = ZFS_DEFAULT_PROJID;
+ osa->flags |= ZFS_PROJID;
+ if (obj)
+ obj->oo_with_projid = 1;
+ } else {
+ osa->flags &= ~ZFS_PROJID;
+ }
+#endif
/*
* we need to create all SA below upon object create.
*
* XXX The attribute order matters since the accounting callback relies
* on static offsets (i.e. SA_*_OFFSET, see zfs_space_delta_cb()) to
- * look up the UID/GID attributes. Moreover, the callback does not seem
- * to support the spill block.
+ * look up the UID/GID/PROJID attributes. Moreover, the callback does
+ * not seem to support the spill block.
* We define attributes in the same order as SA_*_OFFSET in order to
* work around the problem. See ORI-610.
*/
SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(osd), NULL, osa->ctime, 16);
SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CRTIME(osd), NULL, crtime, 16);
SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_LINKS(osd), NULL, &osa->nlink, 8);
+#ifdef ZFS_PROJINHERIT
+ if (osd->od_projectused_dn)
+ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_PROJID(osd), NULL,
+ &osa->projid, 8);
+#endif
SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_RDEV(osd), NULL, &osa->rdev, 8);
LASSERT(cnt <= ARRAY_SIZE(osd_oti_get(env)->oti_attr_bulk));
return rc;
}
+#ifdef HAVE_DMU_OBJECT_ALLOC_DNSIZE
+static int osd_find_dnsize(struct osd_object *obj)
+{
+ struct osd_device *osd = osd_obj2dev(obj);
+ int dnsize;
+
+ if (osd->od_dnsize == ZFS_DNSIZE_AUTO) {
+ dnsize = DNODE_MIN_SIZE;
+ do {
+ if (DN_BONUS_SIZE(dnsize) >= obj->oo_ea_in_bonus + 32)
+ break;
+ dnsize <<= 1;
+ } while (dnsize < DNODE_MAX_SIZE);
+ if (dnsize > DNODE_MAX_SIZE)
+ dnsize = DNODE_MAX_SIZE;
+ } else if (osd->od_dnsize == ZFS_DNSIZE_1K) {
+ dnsize = 1024;
+ } else if (osd->od_dnsize == ZFS_DNSIZE_2K) {
+ dnsize = 2048;
+ } else if (osd->od_dnsize == ZFS_DNSIZE_4K) {
+ dnsize = 4096;
+ } else if (osd->od_dnsize == ZFS_DNSIZE_8K) {
+ dnsize = 8192;
+ } else if (osd->od_dnsize == ZFS_DNSIZE_16K) {
+ dnsize = 16384;
+ } else {
+ dnsize = DNODE_MIN_SIZE;
+ }
+ return dnsize;
+}
+#else
+static int inline osd_find_dnsize(struct osd_object *obj)
+{
+ return DN_MAX_BONUSLEN;
+}
+#endif
+
/*
* The transaction passed to this routine must have
* dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT) called and then assigned
type = DMU_OTN_UINT8_METADATA;
/* Create a new DMU object using the default dnode size. */
- oid = osd_dmu_object_alloc(osd->od_os, type, 0, 0, tx);
+ oid = osd_dmu_object_alloc(osd->od_os, type, 0,
+ osd_find_dnsize(obj), tx);
LASSERT(la->la_valid & LA_MODE);
la->la_size = 0;
* a conversion from the different internal ZAP hash formats being used. */
int __osd_zap_create(const struct lu_env *env, struct osd_device *osd,
dnode_t **dnp, dmu_tx_t *tx, struct lu_attr *la,
- zap_flags_t flags)
+ unsigned dnsize, zap_flags_t flags)
{
uint64_t oid;
DMU_OT_DIRECTORY_CONTENTS,
14, /* == ZFS fzap_default_blockshift */
DN_MAX_INDBLKSHIFT, /* indirect blockshift */
- 0, tx);
+ dnsize, tx);
la->la_size = 2;
la->la_nlink = 1;
* binary keys */
LASSERT(S_ISREG(la->la_mode));
rc = __osd_zap_create(env, osd_obj2dev(obj), &dn, oh->ot_tx, la,
- ZAP_FLAG_UINT64_KEY);
+ osd_find_dnsize(obj), ZAP_FLAG_UINT64_KEY);
if (rc)
return ERR_PTR(rc);
return dn;
int rc;
LASSERT(S_ISDIR(la->la_mode));
- rc = __osd_zap_create(env, osd_obj2dev(obj), &dn, oh->ot_tx, la, 0);
+ rc = __osd_zap_create(env, osd_obj2dev(obj), &dn, oh->ot_tx, la,
+ osd_find_dnsize(obj), 0);
if (rc)
return ERR_PTR(rc);
return dn;
if (rc)
return ERR_PTR(rc);
- if ((fid_is_idif(fid) || fid_is_norm(fid) || fid_is_echo(fid)) &&
- osd->od_is_ost) {
+ if ((fid_is_idif(fid) || fid_is_norm(fid) || fid_is_echo(fid))) {
/* The minimum block size must be at least page size otherwise
* it will break the assumption in tgt_thread_big_cache where
* the array size is PTLRPC_MAX_BRW_PAGES. It will also affect
obj->oo_attr = *attr;
obj->oo_attr.la_valid |= LA_SIZE | LA_NLINK | LA_TYPE;
+#ifdef ZFS_PROJINHERIT
+ if (osd->od_projectused_dn) {
+ if (!(obj->oo_attr.la_valid & LA_PROJID))
+ obj->oo_attr.la_projid = ZFS_DEFAULT_PROJID;
+ obj->oo_with_projid = 1;
+ }
+#endif
+
dn = osd_create_type_f(dof->dof_type)(env, obj, &obj->oo_attr, oh);
if (IS_ERR(dn)) {
rc = PTR_ERR(dn);
__swab32s(&b->mbo_uid_h);
__swab32s(&b->mbo_gid_h);
__swab32s(&b->mbo_projid);
- CLASSERT(offsetof(typeof(*b), mbo_padding_6) != 0);
- CLASSERT(offsetof(typeof(*b), mbo_padding_7) != 0);
+ __swab64s(&b->mbo_dom_size);
+ __swab64s(&b->mbo_dom_blocks);
CLASSERT(offsetof(typeof(*b), mbo_padding_8) != 0);
CLASSERT(offsetof(typeof(*b), mbo_padding_9) != 0);
CLASSERT(offsetof(typeof(*b), mbo_padding_10) != 0);
void lustre_swab_mgs_target_info(struct mgs_target_info *mti)
{
- int i;
- __swab32s(&mti->mti_lustre_ver);
- __swab32s(&mti->mti_stripe_index);
- __swab32s(&mti->mti_config_ver);
- __swab32s(&mti->mti_flags);
- __swab32s(&mti->mti_instance);
- __swab32s(&mti->mti_nid_count);
- CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64));
- for (i = 0; i < MTI_NIDS_MAX; i++)
- __swab64s(&mti->mti_nids[i]);
+ int i;
+
+ __swab32s(&mti->mti_lustre_ver);
+ __swab32s(&mti->mti_stripe_index);
+ __swab32s(&mti->mti_config_ver);
+ __swab32s(&mti->mti_flags);
+ __swab32s(&mti->mti_instance);
+ __swab32s(&mti->mti_nid_count);
+ CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64));
+ for (i = 0; i < MTI_NIDS_MAX; i++)
+ __swab64s(&mti->mti_nids[i]);
}
void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *entry)
{
__u8 i;
- __swab64s(&entry->mne_version);
- __swab32s(&entry->mne_instance);
- __swab32s(&entry->mne_index);
- __swab32s(&entry->mne_length);
-
- /* mne_nid_(count|type) must be one byte size because we're gonna
- * access it w/o swapping. */
- CLASSERT(sizeof(entry->mne_nid_count) == sizeof(__u8));
- CLASSERT(sizeof(entry->mne_nid_type) == sizeof(__u8));
-
- /* remove this assertion if ipv6 is supported. */
- LASSERT(entry->mne_nid_type == 0);
- for (i = 0; i < entry->mne_nid_count; i++) {
- CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64));
- __swab64s(&entry->u.nids[i]);
- }
+ __swab64s(&entry->mne_version);
+ __swab32s(&entry->mne_instance);
+ __swab32s(&entry->mne_index);
+ __swab32s(&entry->mne_length);
+
+ /* mne_nid_(count|type) must be one byte size because we're gonna
+ * access it w/o swapping. */
+ CLASSERT(sizeof(entry->mne_nid_count) == sizeof(__u8));
+ CLASSERT(sizeof(entry->mne_nid_type) == sizeof(__u8));
+
+ /* remove this assertion if ipv6 is supported. */
+ LASSERT(entry->mne_nid_type == 0);
+ for (i = 0; i < entry->mne_nid_count; i++) {
+ CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64));
+ __swab64s(&entry->u.nids[i]);
+ }
}
EXPORT_SYMBOL(lustre_swab_mgs_nidtbl_entry);
struct tg_export_data *ted = &exp->exp_target_data;
int level = D_CACHE;
- if (exp->exp_obd->obd_self_export == exp)
- CDEBUG(D_CACHE, "%s: processing self export: %ld %ld "
- "%ld\n", exp->exp_obd->obd_name, ted->ted_grant,
- ted->ted_pending, ted->ted_dirty);
-
if (ted->ted_grant < 0 || ted->ted_pending < 0 || ted->ted_dirty < 0)
level = D_ERROR;
CDEBUG_LIMIT(level, "%s: cli %s/%p dirty %ld pend %ld grant %ld\n",
struct lu_target *lut = obd->u.obt.obt_lut;
struct tg_grants_data *tgd = &lut->lut_tgd;
struct obd_export *exp;
+ struct tg_export_data *ted;
u64 maxsize;
u64 tot_dirty = 0;
u64 tot_pending = 0;
spin_lock(&obd->obd_dev_lock);
spin_lock(&tgd->tgd_grant_lock);
+ exp = obd->obd_self_export;
+ ted = &exp->exp_target_data;
+ CDEBUG(D_CACHE, "%s: processing self export: %ld %ld "
+ "%ld\n", obd->obd_name, ted->ted_grant,
+ ted->ted_pending, ted->ted_dirty);
+ tot_granted += ted->ted_grant + ted->ted_pending;
+ tot_pending += ted->ted_pending;
+ tot_dirty += ted->ted_dirty;
+
list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) {
error = tgt_check_export_grants(exp, &tot_dirty, &tot_pending,
&tot_granted, maxsize);
if (unlikely(rc))
GOTO(out, rc);
+ osfs->os_namelen = min_t(__u32, osfs->os_namelen, NAME_MAX);
+
spin_lock(&tgd->tgd_grant_lock);
spin_lock(&tgd->tgd_osfs_lock);
/* calculate how much space was written while we released the
u64 left;
u64 avail;
u64 unstable;
+ u64 reserved;
ENTRY;
assert_spin_locked(&tgd->tgd_grant_lock);
unstable = tgd->tgd_osfs_unstable; /* those might be accounted twice */
spin_unlock(&tgd->tgd_osfs_lock);
- tot_granted = tgd->tgd_tot_granted;
+ reserved = left * tgd->tgd_reserved_pcnt / 100;
+ tot_granted = tgd->tgd_tot_granted + reserved;
if (left < tot_granted) {
int mask = (left + unstable <
RETURN(rc);
}
EXPORT_SYMBOL(tgt_grant_commit_cb_add);
+
+ /**
+ * Show estimate of total amount of dirty data on clients.
+ *
+ * \param[in] m seq_file handle
+ * \param[in] data unused for single entry
+ *
+ * \retval 0 on success
+ * \retval negative value on error
+ */
+ int tgt_tot_dirty_seq_show(struct seq_file *m, void *data)
+ {
+ struct obd_device *obd = m->private;
+ struct tg_grants_data *tgd;
+
+ LASSERT(obd != NULL);
+ tgd = &obd->u.obt.obt_lut->lut_tgd;
+ seq_printf(m, "%llu\n", tgd->tgd_tot_dirty);
+ return 0;
+ }
+ EXPORT_SYMBOL(tgt_tot_dirty_seq_show);
+
+ /**
+ * Show total amount of space granted to clients.
+ *
+ * \param[in] m seq_file handle
+ * \param[in] data unused for single entry
+ *
+ * \retval 0 on success
+ * \retval negative value on error
+ */
+ int tgt_tot_granted_seq_show(struct seq_file *m, void *data)
+ {
+ struct obd_device *obd = m->private;
+ struct tg_grants_data *tgd;
+
+ LASSERT(obd != NULL);
+ tgd = &obd->u.obt.obt_lut->lut_tgd;
+ seq_printf(m, "%llu\n", tgd->tgd_tot_granted);
+ return 0;
+ }
+ EXPORT_SYMBOL(tgt_tot_granted_seq_show);
+
+ /**
+ * Show total amount of space used by IO in progress.
+ *
+ * \param[in] m seq_file handle
+ * \param[in] data unused for single entry
+ *
+ * \retval 0 on success
+ * \retval negative value on error
+ */
+ int tgt_tot_pending_seq_show(struct seq_file *m, void *data)
+ {
+ struct obd_device *obd = m->private;
+ struct tg_grants_data *tgd;
+
+ LASSERT(obd != NULL);
+ tgd = &obd->u.obt.obt_lut->lut_tgd;
+ seq_printf(m, "%llu\n", tgd->tgd_tot_pending);
+ return 0;
+ }
+ EXPORT_SYMBOL(tgt_tot_pending_seq_show);
+
+ /**
+ * Show if grants compatibility mode is disabled.
+ *
+ * When tgd_grant_compat_disable is set, we don't grant any space to clients
+ * not supporting OBD_CONNECT_GRANT_PARAM. Otherwise, space granted to such
+ * a client is inflated since it consumes PAGE_SIZE of grant space per
+ * block, (i.e. typically 4kB units), but underlying file system might have
+ * block size bigger than page size, e.g. ZFS. See LU-2049 for details.
+ *
+ * \param[in] m seq_file handle
+ * \param[in] data unused for single entry
+ *
+ * \retval 0 on success
+ * \retval negative value on error
+ */
+ int tgt_grant_compat_disable_seq_show(struct seq_file *m, void *data)
+ {
+ struct obd_device *obd = m->private;
+ struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd;
+
+ seq_printf(m, "%u\n", tgd->tgd_grant_compat_disable);
+ return 0;
+ }
+ EXPORT_SYMBOL(tgt_grant_compat_disable_seq_show);
+
+ /**
+ * Change grant compatibility mode.
+ *
+ * Setting tgd_grant_compat_disable prohibits any space granting to clients
+ * not supporting OBD_CONNECT_GRANT_PARAM. See details above.
+ *
+ * \param[in] file proc file
+ * \param[in] buffer string which represents mode
+ * 1: disable compatibility mode
+ * 0: enable compatibility mode
+ * \param[in] count \a buffer length
+ * \param[in] off unused for single entry
+ *
+ * \retval \a count on success
+ * \retval negative number on error
+ */
+ ssize_t tgt_grant_compat_disable_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+ {
+ struct seq_file *m = file->private_data;
+ struct obd_device *obd = m->private;
+ struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd;
+ __s64 val;
+ int rc;
+
+ rc = lprocfs_str_to_s64(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ if (val < 0)
+ return -EINVAL;
+
+ tgd->tgd_grant_compat_disable = !!val;
+
+ return count;
+ }
+ EXPORT_SYMBOL(tgt_grant_compat_disable_seq_write);
&RMF_ACL, RCL_SERVER,
LUSTRE_POSIX_ACL_MAX_SIZE_OLD);
+ if (req_capsule_has_field(tsi->tsi_pill, &RMF_SHORT_IO,
+ RCL_SERVER)) {
+ struct niobuf_remote *remote_nb =
+ req_capsule_client_get(tsi->tsi_pill,
+ &RMF_NIOBUF_REMOTE);
+ struct ost_body *body = tsi->tsi_ost_body;
+
+ req_capsule_set_size(tsi->tsi_pill, &RMF_SHORT_IO,
+ RCL_SERVER,
+ (body->oa.o_flags & OBD_FL_SHORT_IO) ?
+ remote_nb[0].rnb_len : 0);
+ }
+
rc = req_capsule_server_pack(tsi->tsi_pill);
}
EXIT;
}
EXPORT_SYMBOL(tgt_io_thread_done);
+
+ /**
+ * Helper function for getting Data-on-MDT file server DLM lock
+ * if asked by client.
+ */
+ int tgt_mdt_data_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id,
+ struct lustre_handle *lh, int mode, __u64 *flags)
+ {
+ union ldlm_policy_data policy;
+ int rc;
+
+ ENTRY;
+
+ LASSERT(lh != NULL);
+ LASSERT(ns != NULL);
+ LASSERT(!lustre_handle_is_used(lh));
+
+ policy.l_inodebits.bits = MDS_INODELOCK_DOM | MDS_INODELOCK_UPDATE;
+ policy.l_inodebits.try_bits = 0;
+
+ rc = ldlm_cli_enqueue_local(ns, res_id, LDLM_IBITS, &policy, mode,
+ flags, ldlm_blocking_ast,
+ ldlm_completion_ast, ldlm_glimpse_ast,
+ NULL, 0, LVB_T_NONE, NULL, lh);
+
+ RETURN(rc == ELDLM_OK ? 0 : -EIO);
+ }
+ EXPORT_SYMBOL(tgt_mdt_data_lock);
+
/**
* Helper function for getting server side [start, start+count] DLM lock
* if asked by client.
}
EXPORT_SYMBOL(tgt_extent_unlock);
- int tgt_brw_lock(struct ldlm_namespace *ns, struct ldlm_res_id *res_id,
- struct obd_ioobj *obj, struct niobuf_remote *nb,
- struct lustre_handle *lh, enum ldlm_mode mode)
+ static int tgt_brw_lock(struct obd_export *exp, struct ldlm_res_id *res_id,
+ struct obd_ioobj *obj, struct niobuf_remote *nb,
+ struct lustre_handle *lh, enum ldlm_mode mode)
{
+ struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
__u64 flags = 0;
int nrbufs = obj->ioo_bufcnt;
int i;
+ int rc;
ENTRY;
if (!(nb[i].rnb_flags & OBD_BRW_SRVLOCK))
RETURN(-EFAULT);
- RETURN(tgt_extent_lock(ns, res_id, nb[0].rnb_offset,
- nb[nrbufs - 1].rnb_offset +
- nb[nrbufs - 1].rnb_len - 1,
- lh, mode, &flags));
+ /* MDT IO for data-on-mdt */
+ if (exp->exp_connect_data.ocd_connect_flags & OBD_CONNECT_IBITS)
+ rc = tgt_mdt_data_lock(ns, res_id, lh, mode, &flags);
+ else
+ rc = tgt_extent_lock(ns, res_id, nb[0].rnb_offset,
+ nb[nrbufs - 1].rnb_offset +
+ nb[nrbufs - 1].rnb_len - 1,
+ lh, mode, &flags);
+ RETURN(rc);
}
- void tgt_brw_unlock(struct obd_ioobj *obj, struct niobuf_remote *niob,
- struct lustre_handle *lh, enum ldlm_mode mode)
+ static void tgt_brw_unlock(struct obd_ioobj *obj, struct niobuf_remote *niob,
+ struct lustre_handle *lh, enum ldlm_mode mode)
{
ENTRY;
tgt_extent_unlock(lh, mode);
EXIT;
}
-
-static __u32 tgt_checksum_bulk(struct lu_target *tgt,
- struct ptlrpc_bulk_desc *desc, int opc,
- enum cksum_types cksum_type)
+static __u32 tgt_checksum_niobuf(struct lu_target *tgt,
+ struct niobuf_local *local_nb, int npages,
+ int opc, enum cksum_types cksum_type)
{
struct cfs_crypto_hash_desc *hdesc;
unsigned int bufsize;
unsigned char cfs_alg = cksum_obd2cfs(cksum_type);
__u32 cksum;
- LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type));
-
hdesc = cfs_crypto_hash_init(cfs_alg, NULL, 0);
if (IS_ERR(hdesc)) {
CERROR("%s: unable to initialize checksum hash %s\n",
}
CDEBUG(D_INFO, "Checksum for algo %s\n", cfs_crypto_hash_name(cfs_alg));
- for (i = 0; i < desc->bd_iov_count; i++) {
+ for (i = 0; i < npages; i++) {
/* corrupt the data before we compute the checksum, to
* simulate a client->OST data error */
if (i == 0 && opc == OST_WRITE &&
OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_RECEIVE)) {
- int off = BD_GET_KIOV(desc, i).kiov_offset &
- ~PAGE_MASK;
- int len = BD_GET_KIOV(desc, i).kiov_len;
+ int off = local_nb[i].lnb_page_offset & ~PAGE_MASK;
+ int len = local_nb[i].lnb_len;
struct page *np = tgt_page_to_corrupt;
- char *ptr = kmap(BD_GET_KIOV(desc, i).kiov_page) + off;
if (np) {
- char *ptr2 = kmap(np) + off;
+ char *ptr = ll_kmap_atomic(local_nb[i].lnb_page,
+ KM_USER0);
+ char *ptr2 = page_address(np);
- memcpy(ptr2, ptr, len);
- memcpy(ptr2, "bad3", min(4, len));
- kunmap(np);
+ memcpy(ptr2 + off, ptr + off, len);
+ memcpy(ptr2 + off, "bad3", min(4, len));
+ ll_kunmap_atomic(ptr, KM_USER0);
/* LU-8376 to preserve original index for
* display in dump_all_bulk_pages() */
- np->index = BD_GET_KIOV(desc,
- i).kiov_page->index;
+ np->index = i;
- BD_GET_KIOV(desc, i).kiov_page = np;
+ cfs_crypto_hash_update_page(hdesc, np, off,
+ len);
+ continue;
} else {
CERROR("%s: can't alloc page for corruption\n",
tgt_name(tgt));
}
}
- cfs_crypto_hash_update_page(hdesc,
- BD_GET_KIOV(desc, i).kiov_page,
- BD_GET_KIOV(desc, i).kiov_offset &
- ~PAGE_MASK,
- BD_GET_KIOV(desc, i).kiov_len);
+ cfs_crypto_hash_update_page(hdesc, local_nb[i].lnb_page,
+ local_nb[i].lnb_page_offset & ~PAGE_MASK,
+ local_nb[i].lnb_len);
/* corrupt the data after we compute the checksum, to
* simulate an OST->client data error */
if (i == 0 && opc == OST_READ &&
OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_SEND)) {
- int off = BD_GET_KIOV(desc, i).kiov_offset
- & ~PAGE_MASK;
- int len = BD_GET_KIOV(desc, i).kiov_len;
+ int off = local_nb[i].lnb_page_offset & ~PAGE_MASK;
+ int len = local_nb[i].lnb_len;
struct page *np = tgt_page_to_corrupt;
- char *ptr =
- kmap(BD_GET_KIOV(desc, i).kiov_page) + off;
if (np) {
- char *ptr2 = kmap(np) + off;
+ char *ptr = ll_kmap_atomic(local_nb[i].lnb_page,
+ KM_USER0);
+ char *ptr2 = page_address(np);
- memcpy(ptr2, ptr, len);
- memcpy(ptr2, "bad4", min(4, len));
- kunmap(np);
+ memcpy(ptr2 + off, ptr + off, len);
+ memcpy(ptr2 + off, "bad4", min(4, len));
+ ll_kunmap_atomic(ptr, KM_USER0);
/* LU-8376 to preserve original index for
* display in dump_all_bulk_pages() */
- np->index = BD_GET_KIOV(desc,
- i).kiov_page->index;
+ np->index = i;
- BD_GET_KIOV(desc, i).kiov_page = np;
+ cfs_crypto_hash_update_page(hdesc, np, off,
+ len);
+ continue;
} else {
CERROR("%s: can't alloc page for corruption\n",
tgt_name(tgt));
char dbgcksum_file_name[PATH_MAX];
static void dump_all_bulk_pages(struct obdo *oa, int count,
- lnet_kiov_t *iov, __u32 server_cksum,
- __u32 client_cksum)
+ struct niobuf_local *local_nb,
+ __u32 server_cksum, __u32 client_cksum)
{
struct file *filp;
int rc, i;
oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0,
oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0,
oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0,
- (__u64)iov[0].kiov_page->index << PAGE_SHIFT,
- ((__u64)iov[count - 1].kiov_page->index << PAGE_SHIFT) +
- iov[count - 1].kiov_len - 1, client_cksum, server_cksum);
+ local_nb[0].lnb_file_offset,
+ local_nb[count-1].lnb_file_offset +
+ local_nb[count-1].lnb_len - 1, client_cksum, server_cksum);
filp = filp_open(dbgcksum_file_name,
O_CREAT | O_EXCL | O_WRONLY | O_LARGEFILE, 0600);
if (IS_ERR(filp)) {
oldfs = get_fs();
set_fs(KERNEL_DS);
for (i = 0; i < count; i++) {
- len = iov[i].kiov_len;
- buf = kmap(iov[i].kiov_page);
+ len = local_nb[i].lnb_len;
+ buf = kmap(local_nb[i].lnb_page);
while (len != 0) {
rc = vfs_write(filp, (__force const char __user *)buf,
len, &filp->f_pos);
CDEBUG(D_INFO, "%s: wrote %d bytes\n",
dbgcksum_file_name, rc);
}
- kunmap(iov[i].kiov_page);
+ kunmap(local_nb[i].lnb_page);
}
set_fs(oldfs);
return;
}
-static int check_read_checksum(struct ptlrpc_bulk_desc *desc, struct obdo *oa,
+static int check_read_checksum(struct niobuf_local *local_nb, int npages,
+ struct obd_export *exp, struct obdo *oa,
const lnet_process_id_t *peer,
__u32 client_cksum, __u32 server_cksum,
enum cksum_types server_cksum_type)
{
char *msg;
enum cksum_types cksum_type;
+ loff_t start, end;
/* unlikely to happen and only if resend does not occur due to cksum
* control failure on Client */
return 0;
}
- if (desc->bd_export->exp_obd->obd_checksum_dump)
- dump_all_bulk_pages(oa, desc->bd_iov_count,
- &BD_GET_KIOV(desc, 0), server_cksum,
+ if (exp->exp_obd->obd_checksum_dump)
+ dump_all_bulk_pages(oa, npages, local_nb, server_cksum,
client_cksum);
cksum_type = cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ?
else
msg = "should have changed on the client or in transit";
+ start = local_nb[0].lnb_file_offset;
+ end = local_nb[npages-1].lnb_file_offset +
+ local_nb[npages-1].lnb_len - 1;
+
LCONSOLE_ERROR_MSG(0x132, "%s: BAD READ CHECKSUM: %s: from %s inode "
DFID " object "DOSTID" extent [%llu-%llu], client returned csum"
" %x (type %x), server csum %x (type %x)\n",
- desc->bd_export->exp_obd->obd_name,
+ exp->exp_obd->obd_name,
msg, libcfs_nid2str(peer->nid),
oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : 0ULL,
oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0,
oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0,
POSTID(&oa->o_oi),
- (__u64)BD_GET_KIOV(desc, 0).kiov_page->index << PAGE_SHIFT,
- ((__u64)BD_GET_KIOV(desc,
- desc->bd_iov_count - 1).kiov_page->index
- << PAGE_SHIFT) +
- BD_GET_KIOV(desc, desc->bd_iov_count - 1).kiov_len - 1,
- client_cksum, cksum_type, server_cksum, server_cksum_type);
+ start, end, client_cksum, cksum_type, server_cksum,
+ server_cksum_type);
+
return 1;
}
+static int tgt_pages2shortio(struct niobuf_local *local, int npages,
+ unsigned char *buf, int size)
+{
+ int i, off, len, copied = size;
+ char *ptr;
+
+ for (i = 0; i < npages; i++) {
+ off = local[i].lnb_page_offset & ~PAGE_MASK;
+ len = local[i].lnb_len;
+
+ CDEBUG(D_PAGE, "index %d offset = %d len = %d left = %d\n",
+ i, off, len, size);
+ if (len > size)
+ return -EINVAL;
+
+ ptr = ll_kmap_atomic(local[i].lnb_page, KM_USER0);
+ memcpy(buf + off, ptr, len);
+ ll_kunmap_atomic(ptr, KM_USER0);
+ buf += len;
+ size -= len;
+ }
+ return copied - size;
+}
+
int tgt_brw_read(struct tgt_session_info *tsi)
{
struct ptlrpc_request *req = tgt_ses_req(tsi);
struct ost_body *body, *repbody;
struct l_wait_info lwi;
struct lustre_handle lockh = { 0 };
- int npages, nob = 0, rc, i, no_reply = 0;
+ int npages, nob = 0, rc, i, no_reply = 0,
+ npages_read;
struct tgt_thread_big_cache *tbc = req->rq_svc_thread->t_data;
ENTRY;
- if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL) {
+ if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL &&
+ ptlrpc_req2svc(req)->srv_req_portal != MDS_IO_PORTAL) {
CERROR("%s: deny read request from %s to portal %u\n",
tgt_name(tsi->tsi_tgt),
obd_export_nid2str(req->rq_export),
local_nb = tbc->local;
- rc = tgt_brw_lock(exp->exp_obd->obd_namespace, &tsi->tsi_resid, ioo,
- remote_nb, &lockh, LCK_PR);
+ rc = tgt_brw_lock(exp, &tsi->tsi_resid, ioo, remote_nb, &lockh,
+ LCK_PR);
if (rc != 0)
RETURN(rc);
if (rc != 0)
GOTO(out_lock, rc);
- desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo),
- PTLRPC_BULK_PUT_SOURCE |
- PTLRPC_BULK_BUF_KIOV,
- OST_BULK_PORTAL,
- &ptlrpc_bulk_kiov_nopin_ops);
- if (desc == NULL)
- GOTO(out_commitrw, rc = -ENOMEM);
+ if (body->oa.o_flags & OBD_FL_SHORT_IO) {
+ desc = NULL;
+ } else {
+ desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo),
+ PTLRPC_BULK_PUT_SOURCE |
+ PTLRPC_BULK_BUF_KIOV,
+ OST_BULK_PORTAL,
+ &ptlrpc_bulk_kiov_nopin_ops);
+ if (desc == NULL)
+ GOTO(out_commitrw, rc = -ENOMEM);
+ }
nob = 0;
+ npages_read = npages;
for (i = 0; i < npages; i++) {
int page_rc = local_nb[i].lnb_rc;
if (page_rc < 0) {
rc = page_rc;
+ npages_read = i;
break;
}
nob += page_rc;
- if (page_rc != 0) { /* some data! */
+ if (page_rc != 0 && desc != NULL) { /* some data! */
LASSERT(local_nb[i].lnb_page != NULL);
desc->bd_frag_ops->add_kiov_frag
(desc, local_nb[i].lnb_page,
- local_nb[i].lnb_page_offset,
+ local_nb[i].lnb_page_offset & ~PAGE_MASK,
page_rc);
}
if (page_rc != local_nb[i].lnb_len) { /* short read */
+ local_nb[i].lnb_len = page_rc;
+ npages_read = i + (page_rc != 0 ? 1 : 0);
/* All subsequent pages should be 0 */
while (++i < npages)
LASSERT(local_nb[i].lnb_rc == 0);
repbody->oa.o_flags = cksum_type_pack(cksum_type);
repbody->oa.o_valid = OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
- repbody->oa.o_cksum = tgt_checksum_bulk(tsi->tsi_tgt, desc,
- OST_READ, cksum_type);
+ repbody->oa.o_cksum = tgt_checksum_niobuf(tsi->tsi_tgt,
+ local_nb, npages_read,
+ OST_READ, cksum_type);
CDEBUG(D_PAGE, "checksum at read origin: %x\n",
repbody->oa.o_cksum);
* zero-cksum case) */
if ((body->oa.o_valid & OBD_MD_FLFLAGS) &&
(body->oa.o_flags & OBD_FL_RECOV_RESEND))
- check_read_checksum(desc, &body->oa, &req->rq_peer,
+ check_read_checksum(local_nb, npages_read, exp,
+ &body->oa, &req->rq_peer,
body->oa.o_cksum,
repbody->oa.o_cksum, cksum_type);
} else {
/* Check if client was evicted while we were doing i/o before touching
* network */
- if (likely(rc == 0 &&
- !CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2) &&
- !CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_BULK))) {
- rc = target_bulk_io(exp, desc, &lwi);
+ if (rc == 0) {
+ if (body->oa.o_flags & OBD_FL_SHORT_IO) {
+ unsigned char *short_io_buf;
+ int short_io_size;
+
+ short_io_buf = req_capsule_server_get(&req->rq_pill,
+ &RMF_SHORT_IO);
+ short_io_size = req_capsule_get_size(&req->rq_pill,
+ &RMF_SHORT_IO,
+ RCL_SERVER);
+ rc = tgt_pages2shortio(local_nb, npages_read,
+ short_io_buf, short_io_size);
+ if (rc >= 0)
+ req_capsule_shrink(&req->rq_pill,
+ &RMF_SHORT_IO, rc,
+ RCL_SERVER);
+ rc = rc > 0 ? 0 : rc;
+ } else if (!CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)) {
+ rc = target_bulk_io(exp, desc, &lwi);
+ }
no_reply = rc != 0;
+ } else {
+ if (body->oa.o_flags & OBD_FL_SHORT_IO)
+ req_capsule_shrink(&req->rq_pill, &RMF_SHORT_IO, 0,
+ RCL_SERVER);
}
out_commitrw:
obd_export_nid2str(exp), rc);
}
/* send a bulk after reply to simulate a network delay or reordering
- * by a router */
- if (unlikely(CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2))) {
+ * by a router - Note that !desc implies short io, so there is no bulk
+ * to reorder. */
+ if (unlikely(CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)) &&
+ desc) {
wait_queue_head_t waitq;
struct l_wait_info lwi1;
}
EXPORT_SYMBOL(tgt_brw_read);
+static int tgt_shortio2pages(struct niobuf_local *local, int npages,
+ unsigned char *buf, int size)
+{
+ int i, off, len;
+ char *ptr;
+
+ for (i = 0; i < npages; i++) {
+ off = local[i].lnb_page_offset & ~PAGE_MASK;
+ len = local[i].lnb_len;
+
+ if (len == 0)
+ continue;
+
+ CDEBUG(D_PAGE, "index %d offset = %d len = %d left = %d\n",
+ i, off, len, size);
+ ptr = ll_kmap_atomic(local[i].lnb_page, KM_USER0);
+ if (ptr == NULL)
+ return -EINVAL;
+ memcpy(ptr + off, buf, len < size ? len : size);
+ ll_kunmap_atomic(ptr, KM_USER0);
+ buf += len;
+ size -= len;
+ }
+ return 0;
+}
+
static void tgt_warn_on_cksum(struct ptlrpc_request *req,
struct ptlrpc_bulk_desc *desc,
struct niobuf_local *local_nb, int npages,
body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
LASSERT(body != NULL);
- if (req->rq_peer.nid != desc->bd_sender) {
+ if (desc && req->rq_peer.nid != desc->bd_sender) {
via = " via ";
router = libcfs_nid2str(desc->bd_sender);
}
if (exp->exp_obd->obd_checksum_dump)
- dump_all_bulk_pages(&body->oa, desc->bd_iov_count,
- &BD_GET_KIOV(desc, 0), server_cksum,
+ dump_all_bulk_pages(&body->oa, npages, local_nb, server_cksum,
client_cksum);
if (mmap) {
ENTRY;
- if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL) {
+ if (ptlrpc_req2svc(req)->srv_req_portal != OST_IO_PORTAL &&
+ ptlrpc_req2svc(req)->srv_req_portal != MDS_IO_PORTAL) {
CERROR("%s: deny write request from %s to portal %u\n",
tgt_name(tsi->tsi_tgt),
obd_export_nid2str(req->rq_export),
local_nb = tbc->local;
- rc = tgt_brw_lock(exp->exp_obd->obd_namespace, &tsi->tsi_resid, ioo,
- remote_nb, &lockh, LCK_PW);
+ rc = tgt_brw_lock(exp, &tsi->tsi_resid, ioo, remote_nb, &lockh,
+ LCK_PW);
if (rc != 0)
GOTO(out, rc);
objcount, ioo, remote_nb, &npages, local_nb);
if (rc < 0)
GOTO(out_lock, rc);
+ if (body->oa.o_flags & OBD_FL_SHORT_IO) {
+ int short_io_size;
+ unsigned char *short_io_buf;
+
+ short_io_size = req_capsule_get_size(&req->rq_pill,
+ &RMF_SHORT_IO,
+ RCL_CLIENT);
+ short_io_buf = req_capsule_client_get(&req->rq_pill,
+ &RMF_SHORT_IO);
+ CDEBUG(D_INFO, "Client use short io for data transfer,"
+ " size = %d\n", short_io_size);
+
+ /* Copy short io buf to pages */
+ rc = tgt_shortio2pages(local_nb, npages, short_io_buf,
+ short_io_size);
+ desc = NULL;
+ } else {
+ desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo),
+ PTLRPC_BULK_GET_SINK |
+ PTLRPC_BULK_BUF_KIOV,
+ OST_BULK_PORTAL,
+ &ptlrpc_bulk_kiov_nopin_ops);
+ if (desc == NULL)
+ GOTO(skip_transfer, rc = -ENOMEM);
+
+ /* NB Having prepped, we must commit... */
+ for (i = 0; i < npages; i++)
+ desc->bd_frag_ops->add_kiov_frag(desc,
+ local_nb[i].lnb_page,
+ local_nb[i].lnb_page_offset & ~PAGE_MASK,
+ local_nb[i].lnb_len);
+
+ rc = sptlrpc_svc_prep_bulk(req, desc);
+ if (rc != 0)
+ GOTO(skip_transfer, rc);
- desc = ptlrpc_prep_bulk_exp(req, npages, ioobj_max_brw_get(ioo),
- PTLRPC_BULK_GET_SINK | PTLRPC_BULK_BUF_KIOV,
- OST_BULK_PORTAL,
- &ptlrpc_bulk_kiov_nopin_ops);
- if (desc == NULL)
- GOTO(skip_transfer, rc = -ENOMEM);
-
- /* NB Having prepped, we must commit... */
- for (i = 0; i < npages; i++)
- desc->bd_frag_ops->add_kiov_frag(desc,
- local_nb[i].lnb_page,
- local_nb[i].lnb_page_offset,
- local_nb[i].lnb_len);
-
- rc = sptlrpc_svc_prep_bulk(req, desc);
- if (rc != 0)
- GOTO(skip_transfer, rc);
+ rc = target_bulk_io(exp, desc, &lwi);
+ }
- rc = target_bulk_io(exp, desc, &lwi);
no_reply = rc != 0;
skip_transfer:
repbody->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
repbody->oa.o_flags &= ~OBD_FL_CKSUM_ALL;
repbody->oa.o_flags |= cksum_type_pack(cksum_type);
- repbody->oa.o_cksum = tgt_checksum_bulk(tsi->tsi_tgt, desc,
- OST_WRITE, cksum_type);
+ repbody->oa.o_cksum = tgt_checksum_niobuf(tsi->tsi_tgt,
+ local_nb, npages,
+ OST_WRITE,
+ cksum_type);
cksum_counter++;
if (unlikely(body->oa.o_cksum != repbody->oa.o_cksum)) {
struct lu_attr attr;
struct lu_fid fid;
struct dt_object *o;
+ struct tg_grants_data *tgd = &lut->lut_tgd;
+ struct obd_statfs *osfs;
int i, rc = 0;
ENTRY;
if (!obd->obd_replayable)
RETURN(0);
+ /* initialize grant and statfs data in target */
+ dt_conf_get(env, lut->lut_bottom, &lut->lut_dt_conf);
+
+ /* statfs data */
+ spin_lock_init(&tgd->tgd_osfs_lock);
+ tgd->tgd_osfs_age = cfs_time_shift_64(-1000);
+ tgd->tgd_osfs_unstable = 0;
+ tgd->tgd_statfs_inflight = 0;
+ tgd->tgd_osfs_inflight = 0;
+
+ /* grant data */
+ spin_lock_init(&tgd->tgd_grant_lock);
+ tgd->tgd_tot_dirty = 0;
+ tgd->tgd_tot_granted = 0;
+ tgd->tgd_tot_pending = 0;
+ tgd->tgd_grant_compat_disable = 0;
+
+ /* populate cached statfs data */
+ osfs = &tgt_th_info(env)->tti_u.osfs;
+ rc = tgt_statfs_internal(env, lut, osfs, 0, NULL);
+ if (rc != 0) {
+ CERROR("%s: can't get statfs data, rc %d\n", tgt_name(lut),
+ rc);
+ GOTO(out, rc);
+ }
+ if (!is_power_of_2(osfs->os_bsize)) {
+ CERROR("%s: blocksize (%d) is not a power of 2\n",
+ tgt_name(lut), osfs->os_bsize);
+ GOTO(out, rc = -EPROTO);
+ }
+ tgd->tgd_blockbits = fls(osfs->os_bsize) - 1;
+
spin_lock_init(&lut->lut_translock);
spin_lock_init(&lut->lut_client_bitmap_lock);
}
EXPORT_SYMBOL(tgt_fini);
+static struct kmem_cache *tgt_thread_kmem;
+static struct kmem_cache *tgt_session_kmem;
+static struct lu_kmem_descr tgt_caches[] = {
+ {
+ .ckd_cache = &tgt_thread_kmem,
+ .ckd_name = "tgt_thread_kmem",
+ .ckd_size = sizeof(struct tgt_thread_info),
+ },
+ {
+ .ckd_cache = &tgt_session_kmem,
+ .ckd_name = "tgt_session_kmem",
+ .ckd_size = sizeof(struct tgt_session_info)
+ },
+ {
+ .ckd_cache = NULL
+ }
+};
+
+
/* context key constructor/destructor: tg_key_init, tg_key_fini */
-LU_KEY_INIT(tgt, struct tgt_thread_info);
+static void *tgt_key_init(const struct lu_context *ctx,
+ struct lu_context_key *key)
+{
+ struct tgt_thread_info *thread;
+
+ OBD_SLAB_ALLOC_PTR_GFP(thread, tgt_thread_kmem, GFP_NOFS);
+ if (thread == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ return thread;
+}
static void tgt_key_fini(const struct lu_context *ctx,
struct lu_context_key *key, void *data)
if (args->ta_args != NULL)
OBD_FREE(args->ta_args, sizeof(args->ta_args[0]) *
args->ta_alloc_args);
- OBD_FREE_PTR(info);
+ OBD_SLAB_FREE_PTR(info, tgt_thread_kmem);
}
static void tgt_key_exit(const struct lu_context *ctx,
LU_KEY_INIT_GENERIC(tgt);
-/* context key constructor/destructor: tgt_ses_key_init, tgt_ses_key_fini */
-LU_KEY_INIT_FINI(tgt_ses, struct tgt_session_info);
+static void *tgt_ses_key_init(const struct lu_context *ctx,
+ struct lu_context_key *key)
+{
+ struct tgt_session_info *session;
+
+ OBD_SLAB_ALLOC_PTR_GFP(session, tgt_session_kmem, GFP_NOFS);
+ if (session == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ return session;
+}
+
+static void tgt_ses_key_fini(const struct lu_context *ctx,
+ struct lu_context_key *key, void *data)
+{
+ struct tgt_session_info *session = data;
+
+ OBD_SLAB_FREE_PTR(session, tgt_session_kmem);
+}
/* context key: tgt_session_key */
struct lu_context_key tgt_session_key = {
int tgt_mod_init(void)
{
+ int result;
ENTRY;
+ result = lu_kmem_init(tgt_caches);
+ if (result != 0)
+ RETURN(result);
+
tgt_page_to_corrupt = alloc_page(GFP_KERNEL);
tgt_key_init_generic(&tgt_thread_key, NULL);
lu_context_key_degister(&tgt_thread_key);
lu_context_key_degister(&tgt_session_key);
update_info_fini();
+
+ lu_kmem_fini(tgt_caches);
}
OSTDEV2_2=$fs3ost_DEV
if ! combined_mgs_mds; then
- # bug number for skipped test: LU-9860 LU-9860 LU-9860 LU-9860
- ALWAYS_EXCEPT="$ALWAYS_EXCEPT 33a 43b 53b 54b"
+ # bug number for skipped test: LU-9860 LU-9860 LU-9860
+ ALWAYS_EXCEPT="$ALWAYS_EXCEPT 43b 53b 54b"
# bug number for skipped test: LU-9875 LU-9879 LU-9879 LU-9879 LU-9879
ALWAYS_EXCEPT="$ALWAYS_EXCEPT 70e 80 84 87 100"
# bug number for skipped test: LU-8110 LU-9400 LU-9879 LU-9879 LU-9879
}
cleanup() {
- umount_client $MOUNT || return 200
+ local force=""
+ [ "x$1" != "x" ] && force='-f'
+ umount_client $MOUNT $force|| return 200
cleanup_nocli || return $?
}
local tarball=$1
local writeconf=$2
local dne_upgrade=${dne_upgrade:-"no"}
+ local dom_upgrade=${dom_upgrade:-"no"}
local ff_convert=${ff_convert:-"no"}
local shall_cleanup_mdt=false
local shall_cleanup_mdt1=false
shall_cleanup_lustre=true
$r $LCTL set_param debug="$PTLDEBUG"
- t32_verify_quota $node $fsname $tmp/mnt/lustre || {
- error_noexit "verify quota failed"
- return 1
- }
-
if $r test -f $tmp/list; then
#
# There is not a Test Framework API to copy files to or
echo "list verification skipped"
fi
+ if [ "$dom_upgrade" != "no" ]; then
+ echo "Check DoM file can be created"
+ $LFS setstripe -E 1M -L mdt -E EOF $tmp/mnt/lustre/dom || {
+ error_noexit "Verify DoM creation"
+ return 1
+ }
+ [ $($LFS getstripe -L $tmp/mnt/lustre/dom) == 100 ] || {
+ error_noexit "Verify a DoM file"
+ return 1
+ }
+ dd if=/dev/urandom of=$tmp/mnt/lustre/dom bs=4096 \
+ count=1 conv=fsync || {
+ error_noexit "Cannot write to DoM file"
+ return 1
+ }
+ [ $(stat -c%s $tmp/mnt/lustre/dom) == 4096 ] || {
+ error_noexit "DoM: bad size after write"
+ return 1
+ }
+ rm $tmp/mnt/lustre/dom
+
+ $r $LCTL get_param -n lod.*MDT0000*.dom_stripesize || {
+ error_noexit "Getting \"dom_stripesize\""
+ return 1
+ }
+ $r $LCTL conf_param \
+ $fsname-MDT0000.lod.dom_stripesize=0 || {
+ error_noexit "Changing \"dom_stripesize\""
+ return 1
+ }
+ wait_update $(facet_host mds) "$LCTL get_param \
+ -n lod.*MDT0000*.dom_stripesize" 0 || {
+ error_noexit "Verifying \"dom_stripesize\""
+ return 1
+ }
+ fi
+
if [ "$dne_upgrade" != "no" ]; then
$LFS mkdir -i 1 -c2 $tmp/mnt/lustre/striped_dir || {
error_noexit "set striped dir failed"
}
run_test 32d "convert ff test"
+ test_32e() {
+ local tarballs
+ local tarball
+ local rc=0
+
+ t32_check
+ for tarball in $tarballs; do
+ echo $tarball | grep "2_9" || continue
+ #load_modules
+ dom_upgrade=yes t32_test $tarball writeconf || let "rc += $?"
+ done
+ return $rc
+ }
+ run_test 32e "dom upgrade test"
+
test_33a() { # bug 12333, was test_33
local FSNAME2=test-123
local MDSDEV=$(mdsdevname ${SINGLEMDS//mds/})
mkfsoptions="--mkfsoptions=\\\"-J size=8\\\"" # See bug 17931.
fi
- add fs2mds $(mkfs_opts mds1 ${fs2mdsdev}) --mgs --fsname=${FSNAME2} \
- --reformat $mkfsoptions $fs2mdsdev $fs2mdsvdev || exit 10
+ if combined_mgs_mds; then
+ local mgs_flag="--mgs"
+ fi
+
+ add fs2mds $(mkfs_opts mds1 ${fs2mdsdev}) --fsname=${FSNAME2} \
+ --reformat $mgs_flag $mkfsoptions $fs2mdsdev $fs2mdsvdev ||
+ exit 10
add fs2ost $(mkfs_opts ost1 ${fs2ostdev}) --mgsnode=$MGSNID \
--fsname=${FSNAME2} --index=8191 --reformat $fs2ostdev \
$fs2ostvdev || exit 10
start fs2mds $fs2mdsdev $MDS_MOUNT_OPTS && trap cleanup_fs2 EXIT INT
start fs2ost $fs2ostdev $OST_MOUNT_OPTS
- do_facet $SINGLEMDS "$LCTL conf_param $FSNAME2.sys.timeout=200" ||
+ do_facet mgs "$LCTL conf_param $FSNAME2.sys.timeout=200" ||
error "$LCTL conf_param $FSNAME2.sys.timeout=200 failed"
mkdir -p $MOUNT2 || error "mkdir $MOUNT2 failed"
$MOUNT_CMD $MGSNID:/${FSNAME2} $MOUNT2 || error "$MOUNT_CMD failed"
echo "blah blah" > $MOUNT/$tfile
cat $MOUNT/$tfile || error "cat $MOUNT/$tfile failed"
- umount_client $MOUNT || error "umount_client $MOUNT failed"
+ umount_client $MOUNT -f || error "umount_client $MOUNT failed"
stop_ost || error "Unable to stop OST1"
stop_mds || error "Unable to stop MDS"
stop_mds || error "Unable to stop MDS on second try"
soc=$(do_facet mds1 "$LCTL get_param -n \
mdt.*MDT0000.sync_lock_cancel")
[ $soc == "never" ] || error "SoC enabled on single MDS"
+ umount_client $MOUNT -f > /dev/null
cleanup || error "cleanup failed with $?"
}
do_facet ost1 $DEBUGFS -c -R stats `ostdevname 1` | grep "meta_bg" ||
error "meta_bg is not set"
- return 0
+ reformat
}
run_test 99 "Adding meta_bg option"
}
test_105() {
- cleanup
+ cleanup -f
reformat
setup
mkdir -p $TMP/$tdir
ALWAYS_EXCEPT=" 42a 42b 42c 45 68b $SANITY_EXCEPT"
# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
-# skipped tests: LU-2036 LU-8411 LU-9096 LU-9054
-ALWAYS_EXCEPT=" 76 407 253 312 $ALWAYS_EXCEPT"
+# skipped tests: LU-8411 LU-9096 LU-9054 LU-10199
+ALWAYS_EXCEPT=" 407 253 312 56xb $ALWAYS_EXCEPT"
# Check Grants after these tests
GRANT_CHECK_LIST="$GRANT_CHECK_LIST 42a 42b 42c 42d 42e 63a 63b 64a 64b 64c"
export PATH=$PATH:/sbin
TMP=${TMP:-/tmp}
+ OSC=${OSC:-"osc"}
CC=${CC:-cc}
CHECKSTAT=${CHECKSTAT:-"checkstat -v"}
$LFS setstripe -S 65536 $DIR/$tdir/f0 || error "setstripe failed"
[ $($LFS getstripe -S $DIR/$tdir/f0) -ne 65536 ] &&
error "stripe size $size != 65536" || true
- [ $($LFS getstripe -d $DIR/$tdir | grep -c "stripe_count") -ne 1 ] &&
- error "$LFS getstripe -d $DIR/$tdir failed" || true
+ [ $($LFS getstripe -d $DIR/$tdir | grep -c "stripe_count") -eq 0 ] &&
+ error "$LFS getstripe -d $DIR/$tdir no 'stripe_count'" || true
}
-run_test 27w "check $LFS setstripe -S option"
+run_test 27w "check $LFS setstripe -S and getstrip -d options"
test_27wa() {
[[ $OSTCOUNT -lt 2 ]] &&
run_test 41 "test small file write + fstat ====================="
count_ost_writes() {
- lctl get_param -n osc.*.stats |
+ lctl get_param -n ${OSC}.*.stats |
awk -vwrites=0 '/ost_write/ { writes += $2 } \
END { printf("%0.0f", writes) }'
}
test_42a() {
[ $PARALLEL == "yes" ] && skip "skip parallel run" && return
setup_test42
- cancel_lru_locks osc
+ cancel_lru_locks $OSC
stop_writeback
sync; sleep 1; sync # just to be safe
BEFOREWRITES=`count_ost_writes`
test_42b() {
[ $PARALLEL == "yes" ] && skip "skip parallel run" && return
setup_test42
- cancel_lru_locks osc
+ cancel_lru_locks $OSC
stop_writeback
sync
dd if=/dev/zero of=$DIR/f42b bs=1024 count=100
# start the file with a full-file pw lock to match against
# until the truncate.
trunc_test() {
- test=$1
- file=$DIR/$test
- offset=$2
- cancel_lru_locks osc
+ test=$1
+ file=$DIR/$test
+ offset=$2
+ cancel_lru_locks $OSC
stop_writeback
# prime the file with 0,EOF PW to match
touch $file
$TRUNCATE $file 0
sync; sync
# now the real test..
- dd if=/dev/zero of=$file bs=1024 count=100
- BEFOREWRITES=`count_ost_writes`
- $TRUNCATE $file $offset
- cancel_lru_locks osc
- AFTERWRITES=`count_ost_writes`
+ dd if=/dev/zero of=$file bs=1024 count=100
+ BEFOREWRITES=`count_ost_writes`
+ $TRUNCATE $file $offset
+ cancel_lru_locks $OSC
+ AFTERWRITES=`count_ost_writes`
start_writeback
}
dirty_osc_total() {
tot=0
- for d in `lctl get_param -n osc.*.cur_dirty_bytes`; do
+ for d in `lctl get_param -n ${OSC}.*.cur_dirty_bytes`; do
tot=$(($tot + $d))
done
echo $tot
[[ $numfree -lt $nrdirs ]] && skip "not enough blocks ($numfree)" &&
return
- trap cleanup_print_lfsdf EXIT
+ trap cleanup_print_lfs_df EXIT
# create files
- createmany -d $dir/d $nrdirs ||
+ createmany -d $dir/d $nrdirs || {
+ unlinkmany $dir/d $nrdirs
error "failed to create $nrdirs subdirs in MDT$mdtidx:$dir"
+ }
# really created :
nrdirs=$(ls -U $dir | wc -l)
echo "left ulimit at $ulimit_old"
fi
- createmany -o -k -t 120 $DIR/$tdir/f $numfree ||
+ createmany -o -k -t 120 $DIR/$tdir/f $numfree || {
+ unlinkmany $DIR/$tdir/f $numfree
error "create+open $numfree files in $DIR/$tdir failed"
+ }
ulimit -n $ulimit_old
# if createmany exits at 120s there will be fewer than $numfree files
check_swap_layouts_support && return 0
[[ $OSTCOUNT -lt 2 ]] && skip_env "needs >= 2 OSTs" && return
- local dir0=$DIR/$tdir/$testnum
- test_mkdir -p $dir0
-
+ local dir0=$DIR/$tdir
local ref1=/etc/passwd
local file1=$dir0/file1
- $SETSTRIPE -c 2 $file1
+ test_mkdir $dir0 || error "creating dir $dir0"
+ $LFS setstripe -c 2 $file1
cp $ref1 $file1
$LFS migrate -c 1 $file1 || error "migrate failed rc = $?"
- stripe=$($GETSTRIPE -c $file1)
+ stripe=$($LFS getstripe -c $file1)
[[ $stripe == 1 ]] || error "stripe of $file1 is $stripe != 1"
cmp $file1 $ref1 || error "content mismatch $file1 differs from $ref1"
local ref1=/etc/passwd
local file1=$dir0/file1
- $SETSTRIPE -c 2 $file1
+ $LFS setstripe -c 2 $file1
cp $ref1 $file1
$LFS migrate --block -c 1 $file1 || error "migrate failed rc = $?"
- local stripe=$($GETSTRIPE -c $file1)
+ local stripe=$($LFS getstripe -c $file1)
[[ $stripe == 1 ]] || error "stripe of $file1 is $stripe != 1"
cmp $file1 $ref1 || error "content mismatch $file1 differs from $ref1"
}
run_test 56xa "lfs migration --block support"
+check_migrate_links() {
+ local dir="$1"
+ local file1="$dir/file1"
+ local begin="$2"
+ local count="$3"
+ local total_count=$(($begin + $count - 1))
+ local symlink_count=10
+ local uniq_count=10
+
+ if [ ! -f "$file1" ]; then
+ echo -n "creating initial file..."
+ $LFS setstripe -c 1 -S "512k" "$file1" ||
+ error "cannot setstripe initial file"
+ echo "done"
+
+ echo -n "creating symlinks..."
+ for s in $(seq 1 $symlink_count); do
+ ln -s "$file1" "$dir/slink$s" ||
+ error "cannot create symlinks"
+ done
+ echo "done"
+
+ echo -n "creating nonlinked files..."
+ createmany -o "$dir/uniq" 1 10 &> /dev/null ||
+ error "cannot create nonlinked files"
+ echo "done"
+ fi
+
+ # create hard links
+ if [ ! -f "$dir/file$total_count" ]; then
+ echo -n "creating hard links $begin:$total_count..."
+ createmany -l"$file1" "$dir/file" "$begin" "$count" &> \
+ /dev/null || error "cannot create hard links"
+ echo "done"
+ fi
+
+ echo -n "checking number of hard links listed in xattrs..."
+ local fid=$($LFS getstripe -F "$file1")
+ local paths=($($LFS fid2path "$MOUNT" "$fid" 2> /dev/null))
+
+ echo "${#paths[*]}"
+ if [ ${#paths[*]} -lt $total_count -a "$begin" -eq 2 ]; then
+ echo "hard link list has unexpected size, skipping test"
+ return 0
+ fi
+ if [ ${#paths[*]} -ge $total_count -a "$begin" -ne 2 ]; then
+ error "link names should exceed xattrs size"
+ fi
+
+ echo -n "migrating files..."
+ local migrate_out=$($LFS_MIGRATE -y -S '1m' $dir)
+ local rc=$?
+ [ $rc -eq 0 ] || error "migrate failed rc = $rc"
+ echo "done"
+
+ # make sure all links have been properly migrated
+ echo -n "verifying files..."
+ fid=$($LFS getstripe -F "$file1") ||
+ error "cannot get fid for file $file1"
+ for i in $(seq 2 $total_count); do
+ local fid2=$($LFS getstripe -F $dir/file$i)
+ [ "$fid2" == "$fid" ] ||
+ error "migrated hard link has mismatched FID"
+ done
+
+ # make sure hard links were properly detected, and migration was
+ # performed only once for the entire link set; nonlinked files should
+ # also be migrated
+ local actual=$(grep -c 'done migrate' <<< "$migrate_out")
+ local expected=$(($uniq_count + 1))
+ [ "$actual" -eq "$expected" ] ||
+ error "hard links individually migrated ($actual != $expected)"
+
+ # make sure the correct number of hard links are present
+ local hardlinks=$(stat -c '%h' "$file1")
+ [ $hardlinks -eq $total_count ] ||
+ error "num hard links $hardlinks != $total_count"
+ echo "done"
+
+ return 0
+}
+
+test_56xb() {
+ local dir0="$DIR/$tdir"
+
+ test_mkdir "$dir0" || error "cannot create dir $dir0"
+
+ echo "testing lfs migrate mode when all links fit within xattrs"
+ LFS_MIGRATE_RSYNC=false check_migrate_links "$dir0" 2 99
+
+ echo "testing rsync mode when all links fit within xattrs"
+ LFS_MIGRATE_RSYNC=true check_migrate_links "$dir0" 2 99
+
+ echo "testing lfs migrate mode when all links do not fit within xattrs"
+ LFS_MIGRATE_RSYNC=false check_migrate_links "$dir0" 101 100
+
+ echo "testing rsync mode when all links do not fit within xattrs"
+ LFS_MIGRATE_RSYNC=true check_migrate_links "$dir0" 101 100
+
+ # clean up
+ rm -rf $dir0
+}
+run_test 56xb "lfs migration hard link support"
+
test_56y() {
[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.4.53) ] &&
skip "No HSM $(lustre_build_version $SINGLEMDS) MDS < 2.4.53" &&
awk '/lustre_inode_cache/ {print $2; exit}' /proc/slabinfo
}
-get_inode_slab_tunables() {
- awk '/lustre_inode_cache/ {print $9," ",$10," ",$11; exit}' /proc/slabinfo
-}
-
-set_inode_slab_tunables() {
- echo "lustre_inode_cache $1" > /proc/slabinfo
-}
-
test_76() { # Now for bug 20433, added originally in bug 1443
[ $PARALLEL == "yes" ] && skip "skip parallel run" && return
- local SLAB_SETTINGS=$(get_inode_slab_tunables)
local CPUS=$(getconf _NPROCESSORS_ONLN 2>/dev/null)
- # we cannot set limit below 1 which means 1 inode in each
- # per-cpu cache is still allowed
- set_inode_slab_tunables "1 1 0"
cancel_lru_locks osc
BEFORE_INODES=$(num_inodes)
echo "before inodes: $BEFORE_INODES"
error "inode slab grew from $BEFORE_INODES to $AFTER_INODES"
fi
done
- set_inode_slab_tunables "$SLAB_SETTINGS"
}
run_test 76 "confirm clients recycle inodes properly ===="
done
echo "Cancel LRU locks on lustre client to flush the client cache"
- cancel_lru_locks osc
+ cancel_lru_locks $OSC
echo "Reset readahead stats"
$LCTL set_param -n llite.*.read_ahead_stats 0
done
cd $DIR
- $1 $TAR cf $TMP/f102.tar $tdir --xattrs
+ $1 tar cf $TMP/f102.tar $tdir --xattrs
}
cleanup_test102() {
return 0
}
-find_lustre_tar() {
- [ -n "$(which tar 2>/dev/null)" ] &&
- strings $(which tar) | grep -q "lustre" && echo tar
+# Echo a tar --xattrs-include option restricted to Lustre xattrs if the
+# installed tar supports that option; echo nothing otherwise.
+have_xattrs_include() {
+	tar --help | grep -q xattrs-include &&
+		echo --xattrs-include="lustre.*"
+}
test_102d() {
[ $PARALLEL == "yes" ] && skip "skip parallel run" && return
- # b10930: tar test for trusted.lov xattr
- TAR=$(find_lustre_tar)
- [ -z "$TAR" ] && skip_env "lustre-aware tar is not installed" && return
[[ $OSTCOUNT -lt 2 ]] && skip_env "needs >= 2 OSTs" && return
+ XINC=$(have_xattrs_include)
setup_test102
- test_mkdir $DIR/$tdir
- $TAR xf $TMP/$tfile.tar -C $DIR/$tdir --xattrs
+ tar xf $TMP/f102.tar -C $DIR/$tdir --xattrs $XINC
cd $DIR/$tdir/$tdir
compare_stripe_info1
}
test_102f() {
[ $PARALLEL == "yes" ] && skip "skip parallel run" && return
- # b10930: tar test for trusted.lov xattr
- TAR=$(find_lustre_tar)
- [ -z "$TAR" ] && skip_env "lustre-aware tar is not installed" && return
[[ $OSTCOUNT -lt 2 ]] && skip_env "needs >= 2 OSTs" && return
+ XINC=$(have_xattrs_include)
setup_test102
test_mkdir $DIR/$tdir.restore
cd $DIR
- $TAR cf - --xattrs $tdir | $TAR xf - --xattrs -C $DIR/$tdir.restore
+ tar cf - --xattrs $tdir | tar xf - \
+ -C $DIR/$tdir.restore --xattrs $XINC
cd $DIR/$tdir.restore/$tdir
compare_stripe_info1
}
test_102j() {
[ $PARALLEL == "yes" ] && skip "skip parallel run" && return
- TAR=$(find_lustre_tar)
- [ -z "$TAR" ] && skip_env "lustre-aware tar is not installed" && return
[[ $OSTCOUNT -lt 2 ]] && skip_env "needs >= 2 OSTs" && return
+ XINC=$(have_xattrs_include)
setup_test102 "$RUNAS"
- test_mkdir $DIR/$tdir
chown $RUNAS_ID $DIR/$tdir
- $RUNAS $TAR xf $TMP/f102.tar -C $DIR/$tdir --xattrs
+ $RUNAS tar xf $TMP/f102.tar -C $DIR/$tdir --xattrs $XINC
cd $DIR/$tdir/$tdir
compare_stripe_info1 "$RUNAS"
}
[ $PARALLEL == "yes" ] && skip "skip parallel run" && return
local TF="$TMP/$tfile"
- dd if=/dev/urandom of=$TF bs=6096 count=1 || error "dd failed"
- cp $TF $DIR/$tfile
- cancel_lru_locks osc
- cmp $TF $DIR/$tfile || error "$TMP/$tfile $DIR/$tfile differ"
- remount_client $MOUNT
- df -P $MOUNT
- cmp $TF $DIR/$tfile || error "$TF $DIR/$tfile differ (remount)"
+ dd if=/dev/urandom of=$TF bs=6096 count=1 || error "dd failed"
+ cp $TF $DIR/$tfile
+ cancel_lru_locks $OSC
+ cmp $TF $DIR/$tfile || error "$TMP/$tfile $DIR/$tfile differ"
+ remount_client $MOUNT
+ df -P $MOUNT
+ cmp $TF $DIR/$tfile || error "$TF $DIR/$tfile differ (remount)"
- $TRUNCATE $TF 6000
- $TRUNCATE $DIR/$tfile 6000
- cancel_lru_locks osc
- cmp $TF $DIR/$tfile || error "$TF $DIR/$tfile differ (truncate1)"
+ $TRUNCATE $TF 6000
+ $TRUNCATE $DIR/$tfile 6000
+ cancel_lru_locks $OSC
+ cmp $TF $DIR/$tfile || error "$TF $DIR/$tfile differ (truncate1)"
- echo "12345" >>$TF
- echo "12345" >>$DIR/$tfile
- cancel_lru_locks osc
- cmp $TF $DIR/$tfile || error "$TF $DIR/$tfile differ (append1)"
+ echo "12345" >>$TF
+ echo "12345" >>$DIR/$tfile
+ cancel_lru_locks $OSC
+ cmp $TF $DIR/$tfile || error "$TF $DIR/$tfile differ (append1)"
- echo "12345" >>$TF
- echo "12345" >>$DIR/$tfile
- cancel_lru_locks osc
- cmp $TF $DIR/$tfile || error "$TF $DIR/$tfile differ (append2)"
+ echo "12345" >>$TF
+ echo "12345" >>$DIR/$tfile
+ cancel_lru_locks $OSC
+ cmp $TF $DIR/$tfile || error "$TF $DIR/$tfile differ (append2)"
- rm -f $TF
- true
+ rm -f $TF
+ true
}
run_test 150 "truncate/append tests"
dd if=/dev/urandom of=$temp bs=6096 count=1 || \
error "dd of=$temp bs=6096 count=1 failed"
cp $temp $file
- cancel_lru_locks osc
+ cancel_lru_locks $OSC
cmp $temp $file || error "$temp $file differ"
$TRUNCATE $temp 6000
# having "abc" as 1st arg, creates $TMP/lnet_abc.out and $TMP/lnet_abc.sys
create_lnet_proc_files() {
- lctl get_param -n $1 >$TMP/lnet_$1.out || error "cannot read lnet.$1"
- sysctl lnet.$1 >$TMP/lnet_$1.sys_tmp || error "cannot read lnet.$1"
-
- sed "s/^lnet.$1\ =\ //g" "$TMP/lnet_$1.sys_tmp" >$TMP/lnet_$1.sys
- rm -f "$TMP/lnet_$1.sys_tmp"
+ lctl get_param -n $1 >$TMP/lnet_$1.sys || error "cannot read lnet.$1"
}
# counterpart of create_lnet_proc_files
remove_lnet_proc_files() {
- rm -f $TMP/lnet_$1.out $TMP/lnet_$1.sys
+ rm -f $TMP/lnet_$1.sys
}
# uses 1st arg as trailing part of filename, 2nd arg as description for reports,
# can we successfully write to lnet.stats?
lctl set_param -n stats=0 || error "cannot write to lnet.stats"
- sysctl -w lnet.stats=0 || error "cannot write to lnet.stats"
}
run_test 215 "lnet exists and has proper content - bugs 18102, 21079, 21517"
mkdir -p $DIR/$tdir
createmany -o $DIR/$tdir/f- 5000
unlinkmany $DIR/$tdir/f- 5000
- do_nodes $list "lctl set_param -n osp*.*.sync_changes 1"
- changes=$(do_nodes $list "lctl get_param -n osc.*MDT*.sync_changes \
- osc.*MDT*.sync_in_flight" | calc_sum)
+ [ $(lustre_version_code $SINGLEMDS) -gt $(version_code 2.10.53) ] &&
+ do_nodes $list "lctl set_param -n osp.*.force_sync=1"
+ changes=$(do_nodes $list "lctl get_param -n osp.*MDT*.sync_changes \
+ osp.*MDT*.sync_in_flight" | calc_sum)
[ "$changes" -eq 0 ] || error "$changes not synced"
}
run_test 239 "osp_sync test"
test_241_bio() {
for LOOP in $(seq $1); do
dd if=$DIR/$tfile of=/dev/null bs=40960 count=1 2>/dev/null
- cancel_lru_locks osc || true
+ cancel_lru_locks $OSC || true
done
}
test_241a() { # was test_241
dd if=/dev/zero of=$DIR/$tfile count=1 bs=40960
ls -la $DIR/$tfile
- cancel_lru_locks osc
+ cancel_lru_locks $OSC
test_241_bio 1000 &
PID=$!
test_241_dio 1000
#after mount new plainllog is used
touch $DIR/$tdir/{11..19}
- local TEMP256FILE=$(mktemp TEMP256XXXXXX)
+ do_facet mds1 sync
+ local TEMP256FILE=$(mktemp -u TEMP256XXXXXX)
cat_sl=$(do_facet mds1 \
"$DEBUGFS -R \\\"dump changelog_catalog $TEMP256FILE\\\" $mdt_dev; \
llog_reader $TEMP256FILE | grep \\\"type=1064553b\\\" | wc -l")
- rm $TEMP256FILE
+ do_facet mds1 rm $TEMP256FILE
if (( cat_sl != 2 )); then
do_facet mds1 $LCTL --device $MDT0 changelog_deregister $cl_user
$LFS changelog_clear $MDT0 $cl_user 0
- TEMP256FILE=$(mktemp TEMP256XXXXXX)
+ do_facet mds1 sync
+ TEMP256FILE=$(mktemp -u TEMP256XXXXXX)
cat_sl=$(do_facet mds1 \
"$DEBUGFS -R \\\"dump changelog_catalog $TEMP256FILE\\\" $mdt_dev; \
llog_reader $TEMP256FILE | grep \\\"type=1064553b\\\" | wc -l")
- rm $TEMP256FILE
+ do_facet mds1 rm $TEMP256FILE
do_facet mds1 $LCTL --device $MDT0 changelog_deregister $cl_user
}
run_test 260 "Check mdc_close fail"
+ ### Data-on-MDT sanity tests ###
+ # DoM basic functionality: layout creation rules, write/truncate/append/
+ # delete with MDT free-space accounting, and combined DoM + OST striping.
+ test_270a() {
+ # create DoM file
+ local dom=$DIR/$tdir/dom_file
+ local tmp=$DIR/$tdir/tmp_file
+
+ mkdir -p $DIR/$tdir
+
+ # basic checks for DoM component creation
+ # NOTE(review): both extents end at 1024K here, so the second component
+ # is zero-length; if the intent is to test an MDT layout in a non-first
+ # entry, the second end should likely be larger (e.g. -E 2048K) - confirm
+ $LFS setstripe -E 1024K -E 1024K -L mdt $dom 2>/dev/null &&
+ error "Can set MDT layout to non-first entry"
+
+ $LFS setstripe -E 1024K -L mdt -E 1024K -L mdt $dom 2>/dev/null &&
+ error "Can define multiple entries as MDT layout"
+
+ $LFS setstripe -E 1M -L mdt $dom ||
+ error "Can't create DoM layout"
+
+ # pattern value 100 presumably identifies the DoM (mdt) layout - verify
+ [ $($LFS getstripe -L $dom) == 100 ] || error "bad pattern"
+ [ $($LFS getstripe -c $dom) == 0 ] || error "bad stripe count"
+ [ $($LFS getstripe -S $dom) == 1048576 ] || error "bad stripe size"
+
+ local mdtidx=$($GETSTRIPE -M $dom)
+ local mdtname=MDT$(printf %04x $mdtidx)
+ local facet=mds$((mdtidx + 1))
+ local space_check=1
+
+ # Skip free space checks with ZFS
+ if [ "$(facet_fstype $facet)" == "zfs" ]; then
+ space_check=0
+ fi
+
+ # write
+ sync
+ local mdtfree1=$(do_facet $facet \
+ lctl get_param -n osd*.*$mdtname.kbytesfree)
+ dd if=/dev/urandom of=$tmp bs=1024 count=100
+ # check also direct IO along write
+ dd if=$tmp of=$dom bs=102400 count=1 oflag=direct
+ sync
+ cmp $tmp $dom || error "file data is different"
+ [ $(stat -c%s $dom) == 102400 ] || error "bad size after write"
+ if [ $space_check == 1 ]; then
+ local mdtfree2=$(do_facet $facet \
+ lctl get_param -n osd*.*$mdtname.kbytesfree)
+ # the ~100KB written should be accounted on the MDT
+ [ $(($mdtfree1 - $mdtfree2)) -ge 102 ] ||
+ error "MDT free space is wrong after write"
+ fi
+
+ # truncate
+ $TRUNCATE $dom 10000
+ [ $(stat -c%s $dom) == 10000 ] || error "bad size after truncate"
+ if [ $space_check == 1 ]; then
+ mdtfree1=$(do_facet $facet \
+ lctl get_param -n osd*.*$mdtname.kbytesfree)
+ # truncate should return ~90KB to the MDT free space
+ [ $(($mdtfree1 - $mdtfree2)) -ge 92 ] ||
+ error "MDT free space is wrong after truncate"
+ fi
+
+ # append
+ cat $tmp >> $dom
+ sync
+ [ $(stat -c%s $dom) == 112400 ] || error "bad size after append"
+ if [ $space_check == 1 ]; then
+ mdtfree2=$(do_facet $facet \
+ lctl get_param -n osd*.*$mdtname.kbytesfree)
+ [ $(($mdtfree1 - $mdtfree2)) -ge 102 ] ||
+ error "MDT free space is wrong after append"
+ fi
+
+ # delete
+ rm $dom
+ if [ $space_check == 1 ]; then
+ mdtfree1=$(do_facet $facet \
+ lctl get_param -n osd*.*$mdtname.kbytesfree)
+ [ $(($mdtfree1 - $mdtfree2)) -ge 112 ] ||
+ error "MDT free space is wrong after removal"
+ fi
+
+ # combined striping
+ $LFS setstripe -E 1024K -L mdt -E EOF $dom ||
+ error "Can't create DoM + OST striping"
+
+ dd if=/dev/urandom of=$tmp bs=1024 count=2000
+ # check also direct IO along write
+ dd if=$tmp of=$dom bs=102400 count=20 oflag=direct
+ sync
+ cmp $tmp $dom || error "file data is different"
+ [ $(stat -c%s $dom) == 2048000 ] || error "bad size after write"
+ rm $dom
+ rm $tmp
+
+ return 0
+ }
+ run_test 270a "DoM: basic functionality tests"
+
+ # A DoM-only file must not be allowed to grow past its component end
+ # (max_size) by truncate, positioned write, or append.
+ test_270b() {
+ local dom=$DIR/$tdir/dom_file
+ local max_size=1048576
+
+ mkdir -p $DIR/$tdir
+ $LFS setstripe -E $max_size -L mdt $dom
+
+ # truncate over the limit
+ $TRUNCATE $dom $(($max_size + 1)) &&
+ error "successful truncate over the maximum size"
+ # write over the limit
+ dd if=/dev/zero of=$dom bs=$max_size seek=1 count=1 &&
+ error "successful write over the maximum size"
+ # append over the limit
+ dd if=/dev/zero of=$dom bs=$(($max_size - 3)) count=1
+ echo "12345" >> $dom && error "successful append over the maximum size"
+ rm $dom
+
+ return 0
+ }
+ run_test 270b "DoM: maximum size overflow checks for DoM-only file"
+
+ # A directory default DoM layout must be inherited by new files and by
+ # subdirectories (which in turn apply it to their own new files).
+ test_270c() {
+ mkdir -p $DIR/$tdir
+ $LFS setstripe -E 1024K -L mdt $DIR/$tdir
+
+ # check files inherit DoM EA
+ # pattern value 100 presumably identifies the DoM (mdt) layout - verify
+ touch $DIR/$tdir/first
+ [ $($GETSTRIPE -L $DIR/$tdir/first) == 100 ] ||
+ error "bad pattern"
+ [ $($LFS getstripe -c $DIR/$tdir/first) == 0 ] ||
+ error "bad stripe count"
+ [ $($LFS getstripe -S $DIR/$tdir/first) == 1048576 ] ||
+ error "bad stripe size"
+
+ # check directory inherits DoM EA and uses it as default
+ mkdir $DIR/$tdir/subdir
+ touch $DIR/$tdir/subdir/second
+ [ $($LFS getstripe -L $DIR/$tdir/subdir/second) == 100 ] ||
+ error "bad pattern in sub-directory"
+ [ $($LFS getstripe -c $DIR/$tdir/subdir/second) == 0 ] ||
+ error "bad stripe count in sub-directory"
+ [ $($LFS getstripe -S $DIR/$tdir/subdir/second) == 1048576 ] ||
+ error "bad stripe size in sub-directory"
+ return 0
+ }
+ run_test 270c "DoM: DoM EA inheritance tests"
+
+ # Overriding an inherited default DoM layout with a plain striped
+ # (RAID0) default must take effect for files created afterwards.
+ test_270d() {
+ mkdir -p $DIR/$tdir
+ $LFS setstripe -E 1024K -L mdt $DIR/$tdir
+
+ # inherit default DoM striping
+ mkdir $DIR/$tdir/subdir
+ touch $DIR/$tdir/subdir/f1
+
+ # change default directory striping
+ $LFS setstripe -c 1 $DIR/$tdir/subdir
+ touch $DIR/$tdir/subdir/f2
+ [ $($LFS getstripe -c $DIR/$tdir/subdir/f2) == 1 ] ||
+ error "wrong default striping in file 2"
+ # pattern 1 presumably denotes plain RAID0 striping - verify
+ [ $($LFS getstripe -L $DIR/$tdir/subdir/f2) == 1 ] ||
+ error "bad pattern in file 2"
+ return 0
+ }
+ run_test 270d "DoM: change striping from DoM to RAID0"
+
+ # DoM + lfs find: verify -L (layout), -S (stripe size) and -i (stripe
+ # index) match DoM files/dirs and exclude them as appropriate, using a
+ # mix of DoM and normally-striped files.
+ test_270e() {
+ mkdir -p $DIR/$tdir/dom
+ mkdir -p $DIR/$tdir/norm
+ DOMFILES=20
+ NORMFILES=10
+ $LFS setstripe -E 1M -L mdt $DIR/$tdir/dom
+ $LFS setstripe -i 0 -S 2M $DIR/$tdir/norm
+
+ createmany -o $DIR/$tdir/dom/dom- $DOMFILES
+ createmany -o $DIR/$tdir/norm/norm- $NORMFILES
+
+ # find DoM files by layout
+ NUM=$($LFIND -L mdt -type f $DIR/$tdir 2>/dev/null | wc -l)
+ [ $NUM -eq $DOMFILES ] ||
+ error "lfs find -L: found $NUM, expected $DOMFILES"
+ echo "Test 1: lfs find 20 DOM files by layout: OK"
+
+ # there should be 1 dir with default DOM striping
+ NUM=$($LFIND -L mdt -type d $DIR/$tdir 2>/dev/null | wc -l)
+ [ $NUM -eq 1 ] ||
+ error "lfs find -L: found $NUM, expected 1 dir"
+ echo "Test 2: lfs find 1 DOM dir by layout: OK"
+
+ # find DoM files by stripe size (DoM stripe of 1M is below 1200K)
+ NUM=$($LFIND -S -1200K -type f $DIR/$tdir 2>/dev/null | wc -l)
+ [ $NUM -eq $DOMFILES ] ||
+ error "lfs find -S: found $NUM, expected $DOMFILES"
+ echo "Test 3: lfs find 20 DOM files by stripe size: OK"
+
+ # find files by stripe offset except DoM files
+ NUM=$($LFIND -i 0 -type f $DIR/$tdir 2>/dev/null | wc -l)
+ [ $NUM -eq $NORMFILES ] ||
+ error "lfs find -i: found $NUM, expected $NORMFILES"
+ echo "Test 4: lfs find no DOM files by stripe index: OK"
+ return 0
+ }
+ run_test 270e "DoM: lfs find with DoM files test"
+
+ # Per-MDT DoM stripe size limit (lod.*.dom_stripesize): verify the limit
+ # can be changed, rejects unaligned/out-of-range values, is enforced on
+ # component creation, and that existing directory defaults still work
+ # after the limit is lowered. The saved limit is restored at the end.
+ test_270f() {
+ local mdtname=${FSNAME}-MDT0000-mdtlov
+ local dom=$DIR/$tdir/dom_file
+ local dom_limit_saved=$(do_facet mds1 $LCTL get_param -n \
+ lod.$mdtname.dom_stripesize)
+ local dom_limit=131072
+
+ do_facet mds1 $LCTL set_param -n lod.$mdtname.dom_stripesize=$dom_limit
+ local dom_current=$(do_facet mds1 $LCTL get_param -n \
+ lod.$mdtname.dom_stripesize)
+ [ ${dom_limit} -eq ${dom_current} ] ||
+ error "Cannot change per-MDT DoM stripe limit to $dom_limit"
+
+ $LFS mkdir -i 0 -c 1 $DIR/$tdir
+ $LFS setstripe -d $DIR/$tdir
+ $LFS setstripe -E $dom_limit -L mdt $DIR/$tdir ||
+ error "Can't set directory default striping"
+
+ # exceed maximum stripe size
+ $LFS setstripe -E $(($dom_limit * 2)) -L mdt $dom &&
+ error "Able to create DoM component size more than LOD limit"
+
+ # zero means "no DoM stripe limit" and must be accepted
+ do_facet mds1 $LCTL set_param -n lod.$mdtname.dom_stripesize=0
+ dom_current=$(do_facet mds1 $LCTL get_param -n \
+ lod.$mdtname.dom_stripesize)
+ [ 0 -eq ${dom_current} ] ||
+ error "Can't set zero DoM stripe limit"
+
+ # too low values to be aligned with smallest stripe size 64K
+ do_facet mds1 $LCTL set_param -n lod.$mdtname.dom_stripesize=30000
+ dom_current=$(do_facet mds1 $LCTL get_param -n \
+ lod.$mdtname.dom_stripesize)
+ [ 30000 -eq ${dom_current} ] &&
+ error "Can set too small DoM stripe limit"
+
+ # 2GB is out of range and must be rejected
+ do_facet mds1 $LCTL set_param -n lod.$mdtname.dom_stripesize=2147483648
+ dom_current=$(do_facet mds1 $LCTL get_param -n \
+ lod.$mdtname.dom_stripesize)
+ echo $dom_current
+ [ 2147483648 -eq ${dom_current} ] &&
+ error "Can set too large DoM stripe limit"
+
+ do_facet mds1 $LCTL set_param -n \
+ lod.$mdtname.dom_stripesize=$((dom_limit * 2))
+ $LFS setstripe -E $((dom_limit * 2)) -L mdt $dom ||
+ error "Can't create DoM component size after limit change"
+ do_facet mds1 $LCTL set_param -n \
+ lod.$mdtname.dom_stripesize=$((dom_limit / 2))
+ $LFS setstripe -E $dom_limit -L mdt ${dom}_big &&
+ error "Can create big DoM component after limit decrease"
+ # directory default (set while the limit allowed it) must still apply
+ touch ${dom}_def ||
+ error "Can't create file with old default layout"
+
+ # restore the saved limit on all MDTs (lod.* wildcard)
+ do_facet mds1 $LCTL set_param -n lod.*.dom_stripesize=$dom_limit_saved
+ return 0
+ }
+ run_test 270f "DoM: maximum DoM stripe size checks"
+
+ # After writing a DoM file, reading it back must be served from the
+ # client cache: mdc stats must show no ost_read RPCs.
+ test_271a() {
+ local dom=$DIR/$tdir/dom
+
+ mkdir -p $DIR/$tdir
+
+ $LFS setstripe -E 1024K -L mdt $dom
+
+ lctl set_param -n mdc.*.stats=clear
+ dd if=/dev/zero of=$dom bs=4096 count=1 || return 1
+ cat $dom > /dev/null
+ local reads=$(lctl get_param -n mdc.*.stats |
+ awk '/ost_read/ {print $2}')
+ # empty awk output means the ost_read counter never appeared
+ [ -z $reads ] || error "Unexpected $reads READ RPCs"
+ ls $dom
+ rm -f $dom
+ }
+ run_test 271a "DoM: data is cached for read after write"
+
+ # For a DoM-only sized file, stat must use the size cached under the DoM
+ # lock: no ldlm_glimpse RPCs should be issued for either stat call.
+ test_271b() {
+ local dom=$DIR/$tdir/dom
+
+ mkdir -p $DIR/$tdir
+
+ $LFS setstripe -E 1024K -L mdt -E EOF $dom
+
+ lctl set_param -n mdc.*.stats=clear
+ dd if=/dev/zero of=$dom bs=4096 count=1 || return 1
+ cancel_lru_locks mdc
+ $CHECKSTAT -t file -s 4096 $dom || error "stat #1 fails"
+ # second stat to check size is cached on client
+ $CHECKSTAT -t file -s 4096 $dom || error "stat #2 fails"
+ local gls=$(lctl get_param -n mdc.*.stats |
+ awk '/ldlm_glimpse/ {print $2}')
+ # empty awk output means the glimpse counter never appeared
+ [ -z $gls ] || error "Unexpected $gls glimpse RPCs"
+ rm -f $dom
+ }
+ run_test 271b "DoM: no glimpse RPC for stat (DoM only file)"
+
+ # Same as 271b but the file extends past the DoM component onto OSTs:
+ # stat must trigger no glimpse RPCs on either the MDC or the OSC.
+ test_271ba() {
+ local dom=$DIR/$tdir/dom
+
+ mkdir -p $DIR/$tdir
+
+ $LFS setstripe -E 1024K -L mdt -E EOF $dom
+
+ lctl set_param -n mdc.*.stats=clear
+ lctl set_param -n osc.*.stats=clear
+ dd if=/dev/zero of=$dom bs=2048K count=1 || return 1
+ cancel_lru_locks mdc
+ $CHECKSTAT -t file -s 2097152 $dom || error "stat"
+ # second stat to check size is cached on client
+ $CHECKSTAT -t file -s 2097152 $dom || error "stat"
+ local gls=$(lctl get_param -n mdc.*.stats |
+ awk '/ldlm_glimpse/ {print $2}')
+ [ -z $gls ] || error "Unexpected $gls glimpse RPCs"
+ # note: re-declares gls for the OSC-side counter
+ local gls=$(lctl get_param -n osc.*.stats |
+ awk '/ldlm_glimpse/ {print $2}')
+ [ -z $gls ] || error "Unexpected $gls OSC glimpse RPCs"
+ rm -f $dom
+ }
+ run_test 271ba "DoM: no glimpse RPC for stat (combined file)"
+
+ # With dom_lock enabled, the IO lock granted at open should save one
+ # enqueue RPC per file compared with dom_lock disabled.
+ test_271c() {
+ # test to be enabled with lock_convert
+ skip "skipped until lock convert will be implemented" && return
+
+ local dom=$DIR/$tdir/dom
+
+ mkdir -p $DIR/$tdir
+
+ $LFS setstripe -E 1024K -L mdt $DIR/$tdir
+
+ local mdtidx=$($LFS getstripe -M $DIR/$tdir)
+ local facet=mds$((mdtidx + 1))
+
+ # baseline: DoM IO lock on open disabled
+ cancel_lru_locks mdc
+ do_facet $facet lctl set_param -n mdt.*.dom_lock=0
+ createmany -o $dom 1000
+ lctl set_param -n mdc.*.stats=clear
+ smalliomany -w $dom 1000 200
+ lctl get_param -n mdc.*.stats
+ local enq=$(lctl get_param -n mdc.*.stats |
+ awk '/ldlm_ibits_enqueue/ {print $2}')
+ # Each file has 1 open + 1 IO enqueue (2000 total); an extra getxattr
+ # for security.capability may add up to 1000 more, so only the lower
+ # bound is asserted here
+ [ $enq -ge 2000 ] || error "Too few enqueues $enq, expected > 2000"
+ unlinkmany $dom 1000
+
+ # with dom_lock: open and IO share a single enqueue per file
+ cancel_lru_locks mdc
+ do_facet $facet lctl set_param -n mdt.*.dom_lock=1
+ createmany -o $dom 1000
+ lctl set_param -n mdc.*.stats=clear
+ smalliomany -w $dom 1000 200
+ lctl get_param -n mdc.*.stats
+ local enq_2=$(lctl get_param -n mdc.*.stats |
+ awk '/ldlm_ibits_enqueue/ {print $2}')
+ # Expect to see reduced amount of RPCs by 1000 due to single enqueue
+ # for OPEN and IO lock.
+ [ $((enq - enq_2)) -ge 1000 ] ||
+ error "Too many enqueues $enq_2, expected about $((enq - 1000))"
+ unlinkmany $dom 1000
+ return 0
+ }
+ run_test 271c "DoM: IO lock at open saves enqueue RPCs"
+
cleanup_test_300() {
trap 0
umask $SAVE_UMASK
$LFS setdirstripe -D -i1 $DIR/$tdir/striped_dir ||
error "create striped_dir failed"
+ $LFS setdirstripe -i0 $DIR/$tdir/striped_dir/dir0 ||
+ error "create dir0 fails"
+ stripe_index=$($LFS getdirstripe -i $DIR/$tdir/striped_dir/dir0)
+ [ $stripe_index -eq 0 ] ||
+ error "dir0 expect index 0 got $stripe_index"
+
mkdir $DIR/$tdir/striped_dir/dir1 ||
error "create dir1 fails"
stripe_index=$($LFS getdirstripe -i $DIR/$tdir/striped_dir/dir1)
[ $stripe_index -eq 1 ] ||
- error "dir1 expect 1 got $stripe_index"
-
- $LFS setdirstripe -i2 $DIR/$tdir/striped_dir/dir2 ||
- error "create dir2 fails"
- stripe_index=$($LFS getdirstripe -i $DIR/$tdir/striped_dir/dir2)
- [ $stripe_index -eq 2 ] ||
- error "dir2 expect 2 got $stripe_index"
+ error "dir1 expect index 1 got $stripe_index"
#check default stripe count/stripe index
test_300_check_default_striped_dir normal_dir $MDSCOUNT 1
check_and_setup_lustre
+ OSC=${OSC:-"osc"}
+
assert_DIR
rm -rf $DIR1/[df][0-9]* $DIR1/lnk $DIR/[df].${TESTSUITE}*
test_19() { # bug3811
local node=$(facet_active_host ost1)
+ [ "x$DOM" = "xyes" ] && node=$(facet_active_host $SINGLEMDS)
+
# check whether obdfilter is cache capable at all
if ! get_osd_param $node '' read_cache_enable >/dev/null; then
echo "not cache-capable obdfilter"
cp $TMP/$tfile $DIR1/$tfile
for i in `seq 1 20`; do
[ $((i % 5)) -eq 0 ] && log "$testname loop $i"
- cancel_lru_locks osc > /dev/null
+ cancel_lru_locks $OSC > /dev/null
cksum $DIR1/$tfile | cut -d" " -f 1,2 > $TMP/sum1 & \
cksum $DIR2/$tfile | cut -d" " -f 1,2 > $TMP/sum2
wait
test_20() {
test_mkdir $DIR1/d20
- cancel_lru_locks osc
+ cancel_lru_locks $OSC
CNT=$((`lctl get_param -n llite.*.dump_page_cache | wc -l`))
$MULTIOP $DIR1/f20 Ow8190c
$MULTIOP $DIR2/f20 Oz8194w8190c
$MULTIOP $DIR1/f20 Oz0r8190c
- cancel_lru_locks osc
+ cancel_lru_locks $OSC
CNTD=$((`lctl get_param -n llite.*.dump_page_cache | wc -l` - $CNT))
[ $CNTD -gt 0 ] && \
error $CNTD" page left in cache after lock cancel" || true
echo "atime should be updated while another read" > $DIR1/$tfile
# clear the lock(mode: LCK_PW) gotten from creating operation
- cancel_lru_locks osc
+ cancel_lru_locks $OSC
time1=$(date +%s)
echo "now is $time1"
sleep $((at_diff + 1))
OSC=`lctl dl | awk '/-osc-|OSC.*MNT/ {print $4}' | head -n 1`
# OSC=`lctl dl | awk '/-osc-/ {print $4}' | head -n 1`
- lctl --device %$OSC deactivate
+ lctl --device %osc deactivate
lfs df -i || error "lfs df -i with deactivated OSC failed"
- lctl --device %$OSC activate
+ lctl --device %osc activate
lfs df || error "lfs df with reactivated OSC failed"
}
run_test 24a "lfs df [-ih] [path] test ========================="
run_test 26b "sync mtime between ost and mds"
test_27() {
- cancel_lru_locks osc
+ cancel_lru_locks $OSC
lctl clear
dd if=/dev/zero of=$DIR2/$tfile bs=$((4096+4))k conv=notrunc count=4 seek=3 &
DD2_PID=$!
}
run_test 28 "read/write/truncate file with lost stripes"
-test_29() { # bug 10999
- touch $DIR1/$tfile
- #define OBD_FAIL_LDLM_GLIMPSE 0x30f
- lctl set_param fail_loc=0x8000030f
- ls -l $DIR2/$tfile &
- usleep 500
- dd if=/dev/zero of=$DIR1/$tfile bs=4k count=1
- wait
-}
-#bug 11549 - permanently turn test off in b1_5
-run_test 29 "lock put race between glimpse and enqueue ========="
-
-test_30() { #bug #11110, LU-2523
+test_30() { #b=11110, LU-2523
test_mkdir $DIR1/$tdir
cp -f /bin/bash $DIR1/$tdir/bash
/bin/sh -c 'sleep 1; rm -f $DIR2/$tdir/bash; cp /bin/bash $DIR2/$tdir' &
wait
true
}
-
run_test 30 "recreate file race"
test_31a() {
# enable/disable lockless truncate feature, depending on the arg 0/1
enable_lockless_truncate() {
- lctl set_param -n osc.*.lockless_truncate $1
+ lctl set_param -n $OSC.*.lockless_truncate $1
}
test_32a() { # bug 11270
local p="$TMP/$TESTSUITE-$TESTNAME.parameters"
- save_lustre_params client "osc.*.lockless_truncate" > $p
- cancel_lru_locks osc
+
+ save_lustre_params client "$OSC.*.lockless_truncate" > $p
+ cancel_lru_locks $OSC
enable_lockless_truncate 1
rm -f $DIR1/$tfile
lfs setstripe -c -1 $DIR1/$tfile
dd if=/dev/zero of=$DIR1/$tfile count=$OSTCOUNT bs=$STRIPE_BYTES > \
/dev/null 2>&1
- clear_stats osc.*.osc_stats
+ clear_stats $OSC.*.${OSC}_stats
log "checking cached lockless truncate"
$TRUNCATE $DIR1/$tfile 8000000
$CHECKSTAT -s 8000000 $DIR2/$tfile || error "wrong file size"
- [ $(calc_stats osc.*.osc_stats lockless_truncate) -ne 0 ] ||
+ [ $(calc_stats $OSC.*.${OSC}_stats lockless_truncate) -ne 0 ] ||
error "cached truncate isn't lockless"
log "checking not cached lockless truncate"
$TRUNCATE $DIR2/$tfile 5000000
$CHECKSTAT -s 5000000 $DIR1/$tfile || error "wrong file size"
- [ $(calc_stats osc.*.osc_stats lockless_truncate) -ne 0 ] ||
+ [ $(calc_stats $OSC.*.${OSC}_stats lockless_truncate) -ne 0 ] ||
error "not cached truncate isn't lockless"
log "disabled lockless truncate"
enable_lockless_truncate 0
- clear_stats osc.*.osc_stats
+ clear_stats $OSC.*.${OSC}_stats
$TRUNCATE $DIR2/$tfile 3000000
$CHECKSTAT -s 3000000 $DIR1/$tfile || error "wrong file size"
- [ $(calc_stats osc.*.osc_stats lockless_truncate) -eq 0 ] ||
+ [ $(calc_stats $OSC.*.${OSC}_stats lockless_truncate) -eq 0 ] ||
error "lockless truncate disabling failed"
rm $DIR1/$tfile
# restore lockless_truncate default values
"ldlm.namespaces.filter-*.contended_locks" >> $p
save_lustre_params $facets \
"ldlm.namespaces.filter-*.contention_seconds" >> $p
- clear_stats osc.*.osc_stats
+ clear_stats $OSC.*.${OSC}_stats
	# aggressive lockless i/o settings
do_nodes $(comma_list $(osts_nodes)) \
"lctl set_param -n ldlm.namespaces.*.max_nolock_bytes=2000000 \
ldlm.namespaces.filter-*.contended_locks=0 \
ldlm.namespaces.filter-*.contention_seconds=60"
- lctl set_param -n osc.*.contention_seconds=60
+ lctl set_param -n $OSC.*.contention_seconds=60
for i in {1..5}; do
dd if=/dev/zero of=$DIR1/$tfile bs=4k count=1 conv=notrunc > \
/dev/null 2>&1
dd if=/dev/zero of=$DIR2/$tfile bs=4k count=1 conv=notrunc > \
/dev/null 2>&1
done
- [ $(calc_stats osc.*.osc_stats lockless_write_bytes) -ne 0 ] ||
+ [ $(calc_stats $OSC.*.${OSC}_stats lockless_write_bytes) -ne 0 ] ||
error "lockless i/o was not triggered"
# disable lockless i/o (it is disabled by default)
do_nodes $(comma_list $(osts_nodes)) \
ldlm.namespaces.filter-*.contention_seconds=0"
# set contention_seconds to 0 at client too, otherwise Lustre still
# remembers lock contention
- lctl set_param -n osc.*.contention_seconds=0
- clear_stats osc.*.osc_stats
+ lctl set_param -n $OSC.*.contention_seconds=0
+ clear_stats $OSC.*.${OSC}_stats
for i in {1..1}; do
dd if=/dev/zero of=$DIR1/$tfile bs=4k count=1 conv=notrunc > \
/dev/null 2>&1
dd if=/dev/zero of=$DIR2/$tfile bs=4k count=1 conv=notrunc > \
/dev/null 2>&1
done
- [ $(calc_stats osc.*.osc_stats lockless_write_bytes) -eq 0 ] ||
+ [ $(calc_stats $OSC.*.${OSC}_stats lockless_write_bytes) -eq 0 ] ||
error "lockless i/o works when disabled"
rm -f $DIR1/$tfile
restore_lustre_params <$p
$LCTL set_param fail_loc=0
- cancel_lru_locks osc
+ cancel_lru_locks $OSC
local mtime2=$(stat -c %Y $DIR2/$tfile)
[ "$mtime2" -ge "$d1" ] && [ "$mtime2" -le "$d2" ] ||
local client1=${CLIENT1:-$(hostname)}
local myRUNAS="$3"
+ local np=$(check_cpt_number ost1)
+ [ $np -gt 0 ] || error "CPU partitions should not be $np."
+ echo "cpu_npartitions on ost1 is $np"
+
mkdir $dir || error "mkdir $dir failed"
- $LFS setstripe -c 1 $dir || error "setstripe to $dir failed"
+ $LFS setstripe -c 1 -i 0 $dir || error "setstripe to $dir failed"
chmod 777 $dir
trap cleanup_tbf_verify EXIT
echo "Limited write rate: $1, read rate: $2"
echo "Verify the write rate is under TBF control"
- local runtime=$(do_node $client1 $myRUNAS dd if=/dev/zero of=$dir/tbf \
- bs=1M count=100 oflag=direct 2>&1 | awk '/bytes/ {print $6}')
+ local start=$SECONDS
+ do_node $client1 $myRUNAS dd if=/dev/zero of=$dir/tbf \
+ bs=1M count=100 oflag=direct 2>&1
+ local runtime=$((SECONDS - start + 1))
local rate=$(bc <<< "scale=6; 100 / $runtime")
echo "Write runtime is $runtime s, speed is $rate IOPS"
- # verify the write rate does not exceed 110% of TBF limited rate
- [ $(bc <<< "$rate < 1.1 * $1") -eq 1 ] ||
- error "The write rate ($rate) exceeds 110% of preset rate ($1)"
+ # verify the write rate does not exceed TBF rate limit
+ [ $(bc <<< "$rate < 1.1 * $np * $1") -eq 1 ] ||
+ error "The write rate ($rate) exceeds 110% of rate limit ($1 * $np)"
cancel_lru_locks osc
echo "Verify the read rate is under TBF control"
- runtime=$(do_node $client1 $myRUNAS dd if=$dir/tbf of=/dev/null \
- bs=1M count=100 iflag=direct 2>&1 | awk '/bytes/ {print $6}')
+ start=$SECONDS
+ do_node $client1 $myRUNAS dd if=$dir/tbf of=/dev/null \
+ bs=1M count=100 iflag=direct 2>&1
+ runtime=$((SECONDS - start + 1))
rate=$(bc <<< "scale=6; 100 / $runtime")
echo "Read runtime is $runtime s, speed is $rate IOPS"
- # verify the read rate does not exceed 110% of TBF limited rate
- [ $(bc <<< "$rate < 1.1 * $2") -eq 1 ] ||
- error "The read rate ($rate) exceeds 110% of preset rate ($2)"
+ # verify the read rate does not exceed TBF rate limit
+ [ $(bc <<< "$rate < 1.1 * $np * $2") -eq 1 ] ||
+ error "The read rate ($rate) exceeds 110% of rate limit ($2 * $np)"
cancel_lru_locks osc
cleanup_tbf_verify || error "rm -rf $dir failed"
}
run_test 93 "alloc_rr should not allocate on same ost"
+ # Data-on-MDT tests
+ # Two-mount DoM test (reserved for glimpse-ahead): stat of a DoM file
+ # written via the second mount should need exactly one glimpse RPC when
+ # the client holds no IO lock.
+ test_100a() {
+ skip "Reserved for glimpse-ahead" && return
+ mkdir -p $DIR/$tdir
+
+ $LFS setstripe -E 1024K -L mdt -E EOF $DIR/$tdir/dom
+
+ lctl set_param -n mdc.*.stats=clear
+ dd if=/dev/zero of=$DIR2/$tdir/dom bs=4096 count=1 || return 1
+
+ $CHECKSTAT -t file -s 4096 $DIR/$tdir/dom || error "stat #1"
+ # first stat from server should return size data and save glimpse
+ local gls=$(lctl get_param -n mdc.*.stats | \
+ awk '/ldlm_glimpse/ {print $2}')
+ [ -z $gls ] || error "Unexpected $gls glimpse RPCs"
+ # second stat to check size is NOT cached on client without IO lock
+ $CHECKSTAT -t file -s 4096 $DIR/$tdir/dom || error "stat #2"
+
+ local gls=$(lctl get_param -n mdc.*.stats | grep ldlm_glimpse | wc -l)
+ [ "1" == "$gls" ] || error "Expect 1 glimpse RPCs but got $gls"
+ # fix: $dom was never set in this test, so cleanup was a no-op
+ rm -f $DIR/$tdir/dom
+ }
+ run_test 100a "DoM: glimpse RPCs for stat without IO lock (DoM only file)"
+
+ # Two-mount DoM test: after cancelling mdc locks, the first stat fetches
+ # the size from the server and the second is served from the client
+ # cache; neither should issue a glimpse RPC.
+ test_100b() {
+ mkdir -p $DIR/$tdir
+
+ $LFS setstripe -E 1024K -L mdt -E EOF $DIR/$tdir/dom
+
+ lctl set_param -n mdc.*.stats=clear
+ dd if=/dev/zero of=$DIR2/$tdir/dom bs=4096 count=1 || return 1
+ cancel_lru_locks mdc
+ # first stat data from server should have size
+ $CHECKSTAT -t file -s 4096 $DIR/$tdir/dom || error "stat #1"
+ # second stat to check size is cached on client
+ $CHECKSTAT -t file -s 4096 $DIR/$tdir/dom || error "stat #2"
+
+ local gls=$(lctl get_param -n mdc.*.stats |
+ awk '/ldlm_glimpse/ {print $2}')
+ # both stats should cause no glimpse requests
+ [ -z $gls ] || error "Unexpected $gls glimpse RPCs"
+ # fix: $dom was never set in this test, so cleanup was a no-op
+ rm -f $DIR/$tdir/dom
+ }
+ run_test 100b "DoM: no glimpse RPC for stat with IO lock (DoM only file)"
+
+ # Two-mount DoM test: a write extending past the DoM component must make
+ # stat merge the size from MDT and OST, requiring OST glimpse RPCs.
+ test_100c() {
+ mkdir -p $DIR/$tdir
+
+ $LFS setstripe -E 1024K -L mdt -E EOF $DIR/$tdir/dom
+
+ lctl set_param -n mdc.*.stats=clear
+ lctl set_param -n osc.*.stats=clear
+ dd if=/dev/zero of=$DIR2/$tdir/dom bs=2048K count=1 || return 1
+
+ # check that size is merged from MDT and OST correctly
+ $CHECKSTAT -t file -s 2097152 $DIR/$tdir/dom ||
+ error "Wrong size from stat #1"
+
+ local gls=$(lctl get_param -n osc.*.stats | grep ldlm_glimpse | wc -l)
+ [ $gls -eq 0 ] && error "Expect OST glimpse RPCs but got none"
+
+ # fix: $dom was never set in this test, so cleanup was a no-op
+ rm -f $DIR/$tdir/dom
+ }
+ run_test 100c "DoM: write vs stat without IO lock (combined file)"
+
+ # Two-mount DoM test: after growing the file onto OSTs and truncating it
+ # back within the DoM stripe, stat must still report the correct size,
+ # which requires an OST glimpse.
+ test_100d() {
+ mkdir -p $DIR/$tdir
+
+ $LFS setstripe -E 1024K -L mdt -E EOF $DIR/$tdir/dom
+
+ dd if=/dev/zero of=$DIR2/$tdir/dom bs=2048K count=1 || return 1
+ lctl set_param -n mdc.*.stats=clear
+ $TRUNCATE $DIR2/$tdir/dom 4096
+
+ # check that reported size is valid after file grows to OST and
+ # is truncated back to MDT stripe size
+ $CHECKSTAT -t file -s 4096 $DIR/$tdir/dom ||
+ error "Wrong size from stat #1"
+
+ local gls=$(lctl get_param -n osc.*.stats | grep ldlm_glimpse | wc -l)
+ [ $gls -eq 0 ] && error "Expect OST glimpse but got none"
+
+ # fix: $dom was never set in this test, so cleanup was a no-op
+ rm -f $DIR/$tdir/dom
+ }
+ run_test 100d "DoM: write+truncate vs stat without IO lock (combined file)"
+
+
+ # Unlinking a DoM file through the second mount must discard the first
+ # client's cached pages rather than flush them: no ost_write RPCs in
+ # mdc stats afterwards.
+ test_101a() {
+ $LFS setstripe -E 1024K -L mdt -E EOF $DIR1/$tfile
+ lctl set_param -n mdc.*.stats=clear
+ # to get layout
+ $CHECKSTAT -t file $DIR1/$tfile
+ # open + IO lock
+ dd if=/dev/zero of=$DIR1/$tfile bs=4096 count=1 || error "Write fails"
+ # must discard pages
+ rm $DIR2/$tfile || error "Unlink fails"
+ local writes=$(lctl get_param -n mdc.*.stats | grep ost_write | wc -l)
+ [ $writes -eq 0 ] || error "Found WRITE RPC but expect none"
+ }
+ run_test 101a "Discard DoM data on unlink"
+
+ # Renaming over a DoM file through the second mount must discard the
+ # first client's cached pages rather than flush them: no ost_write RPCs.
+ test_101b() {
+ $LFS setstripe -E 1024K -L mdt -E EOF $DIR1/$tfile
+ touch $DIR1/${tfile}_2
+ lctl set_param -n mdc.*.stats=clear
+ # to get layout
+ $CHECKSTAT -t file $DIR1/$tfile
+ # open + IO lock
+ dd if=/dev/zero of=$DIR1/$tfile bs=4096 count=1 || error "Write fails"
+ # must discard pages
+ mv $DIR2/${tfile}_2 $DIR2/$tfile || error "Rename fails"
+ local writes=$(lctl get_param -n mdc.*.stats | grep ost_write | wc -l)
+ [ $writes -eq 0 ] || error "Found WRITE RPC but expect none"
+ }
+ run_test 101b "Discard DoM data on rename"
+
+ # Unlink while the file is still held open (multiop O_c) must discard
+ + # DoM pages on final close instead of flushing them: no ost_write RPCs.
+ test_101c() {
+ $LFS setstripe -E 1024K -L mdt -E EOF $DIR1/$tfile
+ lctl set_param -n mdc.*.stats=clear
+ # to get layout
+ $CHECKSTAT -t file $DIR1/$tfile
+ # open + IO lock
+ dd if=/dev/zero of=$DIR1/$tfile bs=4096 count=1 || error "Write fails"
+
+ # keep the file open while it is unlinked from the other mount
+ $MULTIOP $DIR1/$tfile O_c &
+ MULTIOP_PID=$!
+ sleep 2
+ rm $DIR2/$tfile > /dev/null || error "Unlink fails"
+ kill -USR1 $MULTIOP_PID || return 2
+ wait $MULTIOP_PID || return 3
+ local writes=$(lctl get_param -n mdc.*.stats | grep ost_write | wc -l)
+ [ $writes -eq 0 ] || error "Found WRITE RPC but expect none"
+ }
+ run_test 101c "Discard DoM data on close-unlink"
+
log "cleanup: ======================================================"
# kill and wait in each test only guarantee script finish, but command in script
LNETLND="socklnd/ksocklnd"
esac
fi
- load_module ../lnet/klnds/$LNETLND
- load_module obdclass/obdclass
- load_module ptlrpc/ptlrpc
- load_module ptlrpc/gss/ptlrpc_gss
- load_module fld/fld
- load_module fid/fid
- load_module lmv/lmv
- load_module mdc/mdc
- load_module osc/osc
- load_module lov/lov
- load_module mgc/mgc
- load_module obdecho/obdecho
+ load_module ../lnet/klnds/$LNETLND
+ load_module obdclass/obdclass
+ load_module ptlrpc/ptlrpc
+ load_module ptlrpc/gss/ptlrpc_gss
+ load_module fld/fld
+ load_module fid/fid
+ load_module lmv/lmv
+ load_module osc/osc
+ load_module mdc/mdc
+ load_module lov/lov
+ load_module mgc/mgc
+ load_module obdecho/obdecho
if ! client_only; then
SYMLIST=/proc/kallsyms
grep -q crc16 $SYMLIST ||
sync_all_data() {
do_nodes $(comma_list $(mdts_nodes)) \
- "lctl set_param -n osd*.*MDT*.force_sync=1"
+ "lctl set_param -n os[cd]*.*MDT*.force_sync=1"
do_nodes $(comma_list $(osts_nodes)) \
"lctl set_param -n osd*.*OS*.force_sync=1" 2>&1 |
grep -v 'Found no match'
mds2sync=$(comma_list $mds2sync)
# sync MDS transactions
- do_nodes $mds2sync "$LCTL set_param -n osd*.*MD*.force_sync 1"
+ do_nodes $mds2sync "$LCTL set_param -n os[cd]*.*MD*.force_sync 1"
	# wait till all changes are sent and committed by OSTs
# for ldiskfs space is released upon execution, but DMU
}
formatall() {
- stopall
+ stopall -f
# Set hostid for ZFS/SPL zpool import protection
# (Assumes MDS version is also OSS version)
if [ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.8.54) ];
/* Setstripe and migrate share mostly the same parameters */
#define SSM_CMD_COMMON(cmd) \
- "usage: "cmd" [--stripe-count|-c <stripe_count>]\n" \
+ "usage: "cmd" [--component-end|-E <comp_end>]\n" \
+ " [--stripe-count|-c <stripe_count>]\n" \
" [--stripe-index|-i <start_ost_idx>]\n" \
" [--stripe-size|-S <stripe_size>]\n" \
+ " [--layout|-L <pattern>]\n" \
" [--pool|-p <pool_name>]\n" \
- " [--ost|-o <ost_indices>]\n" \
- " [--component-end|-E <comp_end>]\n"
+ " [--ost|-o <ost_indices>]\n"
#define SSM_HELP_COMMON \
- "\tstripe_size: Number of bytes on each OST (0 filesystem default)\n" \
- "\t Can be specified with k, m or g (in KB, MB and GB\n" \
+ "\tstripe_count: Number of OSTs to stripe over (0=fs default, -1 all)\n" \
+ "\tstart_ost_idx: OST index of first stripe (-1=default round robin)\n"\
+ "\tstripe_size: Number of bytes on each OST (0=fs default)\n" \
+ "\t Can be specified with K, M or G (for KB, MB, GB\n" \
"\t respectively)\n" \
- "\tstart_ost_idx: OST index of first stripe (-1 default)\n" \
- "\tstripe_count: Number of OSTs to stripe over (0 default, -1 all)\n" \
"\tpool_name: Name of OST pool to use (default none)\n" \
+ "\tlayout: stripe pattern type: raid0, mdt (default raid0)\n"\
"\tost_indices: List of OST indices, can be repeated multiple times\n"\
"\t Indices be specified in a format of:\n" \
"\t -o <ost_1>,<ost_i>-<ost_j>,<ost_n>\n" \
"\t Or:\n" \
"\t -o <ost_1> -o <ost_i>-<ost_j> -o <ost_n>\n" \
"\t If --pool is set with --ost, then the OSTs\n" \
- "\t must be the members of the pool." \
- "\tcomp_end: Extent end of the component\n" \
- "\t Can be specified with k, m or g (in KB, MB and GB\n" \
- "\t respectively, -1 for EOF), it must be aligned with\n"\
- "\t the stripe_size\n"
+ "\t must be the members of the pool.\n" \
+ "\tcomp_end: Extent end of component, start after previous end.\n"\
+ "\t Can be specified with K, M or G (for KB, MB, GB\n" \
+ "\t respectively, -1 for EOF). Must be a multiple of\n"\
+ "\t stripe_size.\n"
-#define SETSTRIPE_USAGE \
- SSM_CMD_COMMON("setstripe") \
- " <directory|filename>\n" \
- SSM_HELP_COMMON \
#define MIGRATE_USAGE \
SSM_CMD_COMMON("migrate ") \
/* all available commands */
command_t cmdlist[] = {
{"setstripe", lfs_setstripe, 0,
- "Create a new file with a specific striping pattern or\n"
- "set the default striping pattern on an existing directory or\n"
- "delete the default striping pattern from an existing directory or\n"
- "add layout component(s) to an existing composite file or\n"
- "delete specified component(s) from an existing composite file\n\n"
- "To delete default striping from an existing directory:\n"
+ "To create a file with specified striping/composite layout, or\n"
+ "create/replace the default layout on an existing directory:\n"
+ SSM_CMD_COMMON("setstripe")
+ " <directory|filename>\n"
+ " or\n"
+ "To add component(s) to an existing composite file:\n"
+ SSM_CMD_COMMON("setstripe --component-add")
+ SSM_HELP_COMMON
+ "To totally delete the default striping from an existing directory:\n"
"usage: setstripe -d <directory>\n"
" or\n"
- "To delete component(s) from an existing composite file:\n"
+ "To delete the last component(s) from an existing composite file\n"
+ "(note that this will also delete any data in those components):\n"
"usage: setstripe --component-del [--component-id|-I <comp_id>]\n"
" [--component-flags|-F <comp_flags>]\n"
" <filename>\n"
- "\tcomp_id: Unique component ID\n"
+ "\tcomp_id: Unique component ID to delete\n"
"\tcomp_flags: 'init' indicating all instantiated components\n"
- "\t '^init' indicating all uninstantiated components\n"
- "\t-I and -F can't be specified at the same time\n"
- " or\n"
- "To add component(s) to an existing composite file:\n"
- SSM_CMD_COMMON("setstripe --component-add")
- " or\n"
- "To create a file with specified striping/composite layout:\n"
- SETSTRIPE_USAGE},
+ "\t '^init' indicating all uninstantiated components\n"
+ "\t-I and -F cannot be specified at the same time\n"},
{"getstripe", lfs_getstripe, 0,
"To list the striping info for a given file or files in a\n"
"directory or recursively for all files in a directory tree.\n"
" [[!] --gid|-g|--group|-G <gid>|<gname>]\n"
" [[!] --uid|-u|--user|-U <uid>|<uname>] [[!] --pool <pool>]\n"
" [[!] --projid <projid>]\n"
- " [[!] --layout|-L released,raid0]\n"
+ " [[!] --layout|-L released,raid0,mdt]\n"
" [[!] --component-count [+-]<comp_cnt>]\n"
" [[!] --component-start [+-]N[kMGTPE]]\n"
" [[!] --component-end|-E [+-]N[kMGTPE]]\n"
fd = llapi_layout_file_open(fname, open_flags, open_mode, layout);
if (fd < 0)
- fprintf(stderr, "%s %s failed. %s\n",
+ fprintf(stderr, "%s: cannot %s '%s': %s\n", progname,
S_ISDIR(st.st_mode) ?
- "Set default composite layout to " :
- "Create composite file",
+ "set default composite layout for" :
+ "create composite file",
fname, strerror(errno));
return fd;
}
fd = open(name, O_RDWR | O_DIRECT);
if (fd == -1) {
rc = -errno;
- fprintf(stderr, "%s: %s: cannot open: %s\n", progname, name,
+ fprintf(stderr, "%s: cannot open '%s': %s\n", progname, name,
strerror(-rc));
goto free;
}
int lsa_stripe_off;
__u32 lsa_comp_flags;
int lsa_nr_osts;
+ int lsa_pattern;
__u32 *lsa_osts;
char *lsa_pool_name;
};
{
return (lsa->lsa_stripe_size != 0 || lsa->lsa_stripe_count != 0 ||
lsa->lsa_stripe_off != -1 || lsa->lsa_pool_name != NULL ||
- lsa->lsa_comp_end != 0);
+ lsa->lsa_comp_end != 0 || lsa->lsa_pattern != 0);
}
static int comp_args_to_layout(struct llapi_layout **composite,
return rc;
}
+	/* Data-on-MDT component setting */
+	if (lsa->lsa_pattern == LLAPI_LAYOUT_MDT) {
+		/* In case of Data-on-MDT patterns the only extra option
+		 * applicable is stripe size option. */
+		if (lsa->lsa_stripe_count) {
+			fprintf(stderr, "Option 'stripe-count' can't be "
+				"specified with Data-on-MDT component: %i\n",
+				lsa->lsa_stripe_count);
+			return -EINVAL;
+		}
+		if (lsa->lsa_stripe_size) {
+			fprintf(stderr, "Option 'stripe-size' can't be "
+				"specified with Data-on-MDT component: %llu\n",
+				lsa->lsa_stripe_size);
+			return -EINVAL;
+		}
+		if (lsa->lsa_nr_osts != 0) {
+			fprintf(stderr, "Option 'ost-list' can't be specified "
+				"with Data-on-MDT component: '%i'\n",
+				lsa->lsa_nr_osts);
+			return -EINVAL;
+		}
+		if (lsa->lsa_stripe_off != -1) {
+			fprintf(stderr, "Option 'stripe-offset' can't be "
+				"specified with Data-on-MDT component: %i\n",
+				lsa->lsa_stripe_off);
+			return -EINVAL;
+		}
+		/* compare against NULL, not 0: lsa_pool_name is a pointer */
+		if (lsa->lsa_pool_name != NULL) {
+			fprintf(stderr, "Option 'pool' can't be specified "
+				"with Data-on-MDT component: '%s'\n",
+				lsa->lsa_pool_name);
+			return -EINVAL;
+		}
+
+		rc = llapi_layout_pattern_set(layout, lsa->lsa_pattern);
+		if (rc) {
+			fprintf(stderr, "Set stripe pattern %#x failed. %s\n",
+				lsa->lsa_pattern, strerror(errno));
+			return rc;
+		}
+		/* Data-on-MDT component always has a single stripe up to end */
+		lsa->lsa_stripe_size = lsa->lsa_comp_end;
+	}
+
+
if (lsa->lsa_stripe_size != 0) {
rc = llapi_layout_stripe_size_set(layout,
lsa->lsa_stripe_size);
}
}
if (!found) {
- llapi_printf(LLAPI_MSG_ERROR, "Component flag "
- "'%s' is not supported.\n", name);
+ llapi_printf(LLAPI_MSG_ERROR,
+ "%s: component flag '%s' not supported\n",
+ progname, name);
return -EINVAL;
}
}
{ .val = LFS_COMP_SET_OPT,
.name = "component-set",
.has_arg = no_argument},
-#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 9, 59, 0)
- /* This formerly implied "stripe-count", but was explicitly
- * made "stripe-count" for consistency with other options,
- * and to separate it from "mdt-count" when DNE arrives. */
- { .val = 'c', .name = "count", .has_arg = required_argument },
-#endif
{ .val = 'c', .name = "stripe-count", .has_arg = required_argument},
{ .val = 'c', .name = "stripe_count", .has_arg = required_argument},
{ .val = 'd', .name = "delete", .has_arg = no_argument},
{ .val = 'E', .name = "component-end",
.has_arg = required_argument},
/* dirstripe {"mdt-hash", required_argument, 0, 'H'}, */
-#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 9, 59, 0)
- /* This formerly implied "stripe-index", but was explicitly
- * made "stripe-index" for consistency with other options,
- * and to separate it from "mdt-index" when DNE arrives. */
- { .val = 'i', .name = "index", .has_arg = required_argument },
-#endif
{ .val = 'i', .name = "stripe-index", .has_arg = required_argument},
{ .val = 'i', .name = "stripe_index", .has_arg = required_argument},
{ .val = 'I', .name = "comp-id", .has_arg = required_argument},
{ .val = 'I', .name = "component-id", .has_arg = required_argument},
+ { .val = 'L', .name = "layout", .has_arg = required_argument },
{ .val = 'm', .name = "mdt", .has_arg = required_argument},
{ .val = 'm', .name = "mdt-index", .has_arg = required_argument},
{ .val = 'm', .name = "mdt_index", .has_arg = required_argument},
{ .val = 'o', .name = "ost_list", .has_arg = required_argument },
#endif
{ .val = 'p', .name = "pool", .has_arg = required_argument },
-#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 9, 59, 0)
- /* This formerly implied "--stripe-size", but was confusing
- * with "lfs find --size|-s", which means "file size", so use
- * the consistent "--stripe-size|-S" for all commands. */
- { .val = 's', .name = "size", .has_arg = required_argument },
-#endif
{ .val = 'S', .name = "stripe-size", .has_arg = required_argument },
{ .val = 'S', .name = "stripe_size", .has_arg = required_argument },
/* dirstripe {"mdt-count", required_argument, 0, 'T'}, */
if (strcmp(argv[0], "migrate") == 0)
migrate_mode = true;
- while ((c = getopt_long(argc, argv, "bc:dE:i:I:m:no:p:s:S:v",
+ while ((c = getopt_long(argc, argv, "bc:dE:i:I:m:no:p:L:s:S:v",
long_opts, NULL)) >= 0) {
switch (c) {
case 0:
break;
case LFS_COMP_FLAGS_OPT:
result = comp_str2flags(&lsa.lsa_comp_flags, optarg);
- if (result != 0) {
- fprintf(stderr, "error: %s: bad comp flags "
- "'%s'\n", argv[0], optarg);
- goto error;
- }
+ if (result != 0)
+ goto usage_error;
break;
case LFS_COMP_SET_OPT:
comp_set = 1;
break;
case 'b':
if (!migrate_mode) {
- fprintf(stderr, "--block is valid only for"
- " migrate mode\n");
- goto error;
+ fprintf(stderr,
+ "%s %s: -b|--block valid only for migrate command\n",
+ progname, argv[0]);
+ goto usage_error;
}
migration_block = true;
break;
case 'c':
-#if LUSTRE_VERSION_CODE >= OBD_OCD_VERSION(2, 6, 53, 0)
- if (strcmp(argv[optind - 1], "--count") == 0)
- fprintf(stderr, "warning: '--count' deprecated"
- ", use '--stripe-count' instead\n");
-#endif
lsa.lsa_stripe_count = strtoul(optarg, &end, 0);
if (*end != '\0') {
- fprintf(stderr, "error: %s: bad stripe count "
- "'%s'\n", argv[0], optarg);
- goto error;
+ fprintf(stderr,
+ "%s %s: invalid stripe count '%s'\n",
+ progname, argv[0], optarg);
+ goto usage_error;
}
break;
case 'd':
case 'E':
if (lsa.lsa_comp_end != 0) {
result = comp_args_to_layout(&layout, &lsa);
- if (result)
- goto error;
+ if (result) {
+ fprintf(stderr,
+ "%s %s: invalid layout\n",
+ progname, argv[0]);
+ goto usage_error;
+ }
setstripe_args_init(&lsa);
}
&lsa.lsa_comp_end,
&size_units, 0);
if (result) {
- fprintf(stderr, "error: %s: "
- "bad component end '%s'\n",
- argv[0], optarg);
- goto error;
+ fprintf(stderr,
+ "%s %s: invalid component end '%s'\n",
+ progname, argv[0], optarg);
+ goto usage_error;
}
}
break;
case 'i':
- if (strcmp(argv[optind - 1], "--index") == 0)
- fprintf(stderr, "warning: '--index' deprecated"
- ", use '--stripe-index' instead\n");
lsa.lsa_stripe_off = strtol(optarg, &end, 0);
if (*end != '\0') {
- fprintf(stderr, "error: %s: bad stripe offset "
- "'%s'\n", argv[0], optarg);
- goto error;
+ fprintf(stderr,
+ "%s %s: invalid stripe offset '%s'\n",
+ progname, argv[0], optarg);
+ goto usage_error;
}
break;
case 'I':
comp_id = strtoul(optarg, &end, 0);
if (*end != '\0' || comp_id == 0 ||
comp_id > LCME_ID_MAX) {
- fprintf(stderr, "error: %s: bad comp ID "
- "'%s'\n", argv[0], optarg);
- goto error;
+ fprintf(stderr,
+ "%s %s: invalid component ID '%s'\n",
+ progname, argv[0], optarg);
+ goto usage_error;
}
break;
+		case 'L':
+			/* Use optarg, not argv[optind - 1]: with the long
+			 * form "--layout=mdt" argv[optind - 1] is the whole
+			 * "--layout=mdt" token and the strcmp would wrongly
+			 * fail; argv[optind] is the *next* argument (possibly
+			 * NULL) and must not be printed as the bad value. */
+			if (strcmp(optarg, "mdt") == 0) {
+				/* Can be only the first component */
+				if (layout != NULL) {
+					result = -EINVAL;
+					fprintf(stderr, "error: 'mdt' layout "
+						"can be only the first one\n");
+					goto error;
+				}
+				if (lsa.lsa_comp_end > (1ULL << 30)) { /* 1GB */
+					result = -EFBIG;
+					fprintf(stderr, "error: 'mdt' layout "
+						"size is too big\n");
+					goto error;
+				}
+				lsa.lsa_pattern = LLAPI_LAYOUT_MDT;
+			} else if (strcmp(optarg, "raid0") != 0) {
+				result = -EINVAL;
+				fprintf(stderr, "error: layout '%s' is "
+					"unknown, supported layouts are: "
+					"'mdt', 'raid0'\n", optarg);
+				goto error;
+			}
+			break;
case 'm':
if (!migrate_mode) {
- fprintf(stderr, "--mdt-index is valid only for"
- " migrate mode\n");
- goto error;
+ fprintf(stderr,
+ "%s %s: -m|--mdt-index valid only for migrate command\n",
+ progname, argv[0]);
+ goto usage_error;
}
mdt_idx_arg = optarg;
break;
case 'n':
if (!migrate_mode) {
- fprintf(stderr, "--non-block is valid only for"
- " migrate mode\n");
- goto error;
+ fprintf(stderr,
+ "%s %s: -n|--non-block valid only for migrate command\n",
+ progname, argv[0]);
+ goto usage_error;
}
migration_flags |= MIGRATION_NONBLOCK;
break;
lsa.lsa_nr_osts, optarg);
if (lsa.lsa_nr_osts < 0) {
fprintf(stderr,
- "error: %s: bad OST indices '%s'\n",
- argv[0], optarg);
- goto error;
+ "%s %s: invalid OST target(s) '%s'\n",
+ progname, argv[0], optarg);
+ goto usage_error;
}
lsa.lsa_osts = osts;
break;
case 'p':
if (optarg == NULL)
- goto error;
+ goto usage_error;
lsa.lsa_pool_name = optarg;
break;
-#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 9, 59, 0)
- case 's':
-#if LUSTRE_VERSION_CODE >= OBD_OCD_VERSION(2, 6, 53, 0)
- fprintf(stderr, "warning: '--size|-s' deprecated, "
- "use '--stripe-size|-S' instead\n");
-#endif
-#endif /* LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 9, 59, 0) */
case 'S':
result = llapi_parse_size(optarg, &lsa.lsa_stripe_size,
&size_units, 0);
if (result) {
- fprintf(stderr, "error: %s: bad stripe size "
- "'%s'\n", argv[0], optarg);
- goto error;
+ fprintf(stderr,
+ "%s %s: invalid stripe size '%s'\n",
+ progname, argv[0], optarg);
+ goto usage_error;
}
break;
case 'v':
if (!migrate_mode) {
- fprintf(stderr, "--verbose is valid only for"
- " migrate mode\n");
- goto error;
+ fprintf(stderr,
+ "%s %s: -v|--verbose valid only for migrate command\n",
+ progname, argv[0]);
+ goto usage_error;
}
migrate_mdt_param.fp_verbose = VERBOSE_DETAIL;
break;
default:
- goto error;
+ fprintf(stderr, "%s %s: unrecognized option '%s'\n",
+ progname, argv[0], argv[optind - 1]);
+ goto usage_error;
}
}
if (lsa.lsa_comp_end != 0) {
result = comp_args_to_layout(&layout, &lsa);
- if (result)
- goto error;
+ if (result) {
+ fprintf(stderr, "%s %s: invalid component layout\n",
+ progname, argv[0]);
+ goto usage_error;
+ }
}
if (optind == argc) {
- fprintf(stderr, "error: %s: missing filename|dirname\n",
- argv[0]);
- goto error;
+ fprintf(stderr, "%s %s: FILE must be specified\n",
+ progname, argv[0]);
+ goto usage_error;
}
/* Only LCME_FL_INIT flags is used in PFL, and it shouldn't be
* altered by user space tool, so we don't need to support the
* --component-set for this moment. */
if (comp_set != 0) {
- fprintf(stderr, "error: %s: --component-set isn't supported.\n",
- argv[0]);
- goto error;
+ fprintf(stderr, "%s %s: --component-set not supported\n",
+ progname, argv[0]);
+ goto usage_error;
}
if ((delete + comp_set + comp_del + comp_add) > 1) {
- fprintf(stderr, "error: %s: can't specify --component-set, "
- "--component-del, --component-add or -d together\n",
- argv[0]);
- goto error;
+ fprintf(stderr,
+ "%s %s: options --component-set, --component-del, --component-add and -d are mutually exclusive\n",
+ progname, argv[0]);
+ goto usage_error;
}
if (delete && (setstripe_args_specified(&lsa) || comp_id != 0 ||
lsa.lsa_comp_flags != 0 || layout != NULL)) {
- fprintf(stderr, "error: %s: can't specify -d with "
- "-s, -c, -o, -p, -I, -F or -E options\n",
- argv[0]);
- goto error;
+ fprintf(stderr,
+ "%s %s: option -d is mutually exclusive with -s, -c, -o, -p, -I, -F and -E options\n",
+ progname, argv[0]);
+ goto usage_error;
}
if ((comp_set || comp_del) &&
(setstripe_args_specified(&lsa) || layout != NULL)) {
- fprintf(stderr, "error: %s: can't specify --component-del or "
- "--component-set with -s, -c, -o, -p or -E options.\n",
- argv[0]);
- goto error;
+ fprintf(stderr,
+ "%s %s: options --component-del and --component-set are mutually exclusive when used with -c, -E, -o, -p, or -s\n",
+ progname, argv[0]);
+ goto usage_error;
}
if (comp_del && comp_id != 0 && lsa.lsa_comp_flags != 0) {
- fprintf(stderr, "error: %s: can't specify both -I and -F for "
- "--component-del option.\n", argv[0]);
- goto error;
+ fprintf(stderr,
+ "%s %s: options -I and -F are mutually exclusive when used with --component-del\n",
+ progname, argv[0]);
+ goto usage_error;
}
if (comp_add || comp_del) {
result = lstat(fname, &st);
if (result == 0 && S_ISDIR(st.st_mode)) {
- fprintf(stderr, "error: %s: can't use --component-add "
- "or --component-del for directory.\n",
- argv[0]);
- goto error;
+ fprintf(stderr,
+ "%s setstripe: cannot use --component-add or --component-del for directory\n",
+ progname);
+ goto usage_error;
}
}
if (comp_add) {
if (layout == NULL) {
- fprintf(stderr, "error: %s: -E option must be present"
- "in --component-add mode.\n", argv[0]);
- goto error;
+ fprintf(stderr,
+ "%s %s: option -E must be specified with --component-add\n",
+ progname, argv[0]);
+ goto usage_error;
}
result = adjust_first_extent(fname, layout);
if (result == -ENODATA)
}
if (mdt_idx_arg != NULL && optind > 3) {
- fprintf(stderr, "error: %s: cannot specify -m with other "
- "options\n", argv[0]);
- goto error;
+ fprintf(stderr,
+ "%s %s: option -m cannot be used with other options\n",
+ progname, argv[0]);
+ goto usage_error;
}
if ((migration_flags & MIGRATION_NONBLOCK) && migration_block) {
fprintf(stderr,
- "error: %s: cannot specify --non-block and --block\n",
- argv[0]);
- goto error;
+ "%s %s: options --non-block and --block are mutually exclusive\n",
+ progname, argv[0]);
+ goto usage_error;
}
if (!comp_del && !comp_set && comp_id != 0) {
- fprintf(stderr, "error: %s: -I can only be used with "
- "--component-del.\n", argv[0]);
- goto error;
+ fprintf(stderr,
+ "%s %s: option -I can only be used with --component-del\n",
+ progname, argv[0]);
+ goto usage_error;
}
if (mdt_idx_arg != NULL) {
/* initialize migrate mdt parameters */
migrate_mdt_param.fp_mdt_index = strtoul(mdt_idx_arg, &end, 0);
if (*end != '\0') {
- fprintf(stderr, "error: %s: bad MDT index '%s'\n",
- argv[0], mdt_idx_arg);
- goto error;
+ fprintf(stderr, "%s %s: invalid MDT index '%s'\n",
+ progname, argv[0], mdt_idx_arg);
+ goto usage_error;
}
migrate_mdt_param.fp_migrate = 1;
} else if (layout == NULL) {
param = calloc(1, offsetof(typeof(*param),
lsp_osts[lsa.lsa_nr_osts]));
if (param == NULL) {
- fprintf(stderr, "error: %s: %s\n", argv[0],
- strerror(ENOMEM));
+ fprintf(stderr,
+ "%s %s: cannot allocate memory for parameters: %s\n",
+ progname, argv[0], strerror(ENOMEM));
+ result = -ENOMEM;
goto error;
}
param->lsp_stripe_size = lsa.lsa_stripe_size;
param->lsp_stripe_offset = lsa.lsa_stripe_off;
param->lsp_stripe_count = lsa.lsa_stripe_count;
- param->lsp_stripe_pattern = 0;
param->lsp_pool = lsa.lsa_pool_name;
param->lsp_is_specific = false;
if (lsa.lsa_nr_osts > 0) {
if (lsa.lsa_stripe_count > 0 &&
lsa.lsa_nr_osts != lsa.lsa_stripe_count) {
- fprintf(stderr, "error: %s: stripe count '%d' "
- "doesn't match the number of OSTs: %d\n"
- , argv[0], lsa.lsa_stripe_count,
+ fprintf(stderr,
+ "%s %s: stripe count '%d' does not match number of OSTs: %d\n",
+ progname, argv[0], lsa.lsa_stripe_count,
lsa.lsa_nr_osts);
free(param);
- goto error;
+ goto usage_error;
}
param->lsp_is_specific = true;
}
for (fname = argv[optind]; fname != NULL; fname = argv[++optind]) {
- char *op;
if (mdt_idx_arg != NULL) {
result = llapi_migrate_mdt(fname, &migrate_mdt_param);
} else if (migrate_mode) {
result = lfs_migrate(fname, migration_flags, param,
layout);
} else if (comp_set != 0) {
result = lfs_component_set(fname, comp_id,
lsa.lsa_comp_flags);
} else if (comp_del != 0) {
result = lfs_component_del(fname, comp_id,
lsa.lsa_comp_flags);
- op = "delete component of";
} else if (comp_add != 0) {
result = lfs_component_add(fname, layout);
- op = "add component to";
} else if (layout != NULL) {
result = lfs_component_create(fname, O_CREAT | O_WRONLY,
0644, layout);
close(result);
result = 0;
}
- op = "create composite";
} else {
result = llapi_file_open_param(fname,
O_CREAT | O_WRONLY,
close(result);
result = 0;
}
- op = "create striped";
}
if (result) {
/* Save the first error encountered. */
if (result2 == 0)
result2 = result;
- fprintf(stderr, "error: %s: %s file '%s' failed: %s\n",
- argv[0], op, fname,
- lsa.lsa_pool_name != NULL && result == EINVAL ?
- "OST not in pool?" : strerror(errno));
continue;
}
}
free(param);
llapi_layout_free(layout);
return result2;
+usage_error:
+ result = CMD_HELP;
error:
llapi_layout_free(layout);
- return CMD_HELP;
+ return result;
}
static int lfs_poollist(int argc, char **argv)
static int name2layout(__u32 *layout, char *name)
{
- char *ptr, *lyt;
+ char *ptr, *layout_name;
*layout = 0;
for (ptr = name; ; ptr = NULL) {
- lyt = strtok(ptr, ",");
- if (lyt == NULL)
+ layout_name = strtok(ptr, ",");
+ if (layout_name == NULL)
break;
- if (strcmp(lyt, "released") == 0)
+ if (strcmp(layout_name, "released") == 0)
*layout |= LOV_PATTERN_F_RELEASED;
- else if (strcmp(lyt, "raid0") == 0)
+ else if (strcmp(layout_name, "raid0") == 0)
*layout |= LOV_PATTERN_RAID0;
+ else if (strcmp(layout_name, "mdt") == 0)
+ *layout |= LOV_PATTERN_MDT;
else
return -1;
}
break;
default:
rc = -ENOTSUP;
- break;
+ pass++;
+ goto out;
}
if (rc)
name = "<unknown>";
int main(int argc, char **argv)
{
- int rc;
+ int rc;
/* Ensure that liblustreapi constructor has run */
if (!liblustreapi_initialized)
fprintf(stderr, "liblustreapi was not properly initialized\n");
- setlinebuf(stdout);
+ setlinebuf(stdout);
+ opterr = 0;
Parser_init("lfs > ", cmdlist);
progname = argv[0]; /* Used in error messages */
- if (argc > 1) {
- rc = Parser_execarg(argc - 1, argv + 1, cmdlist);
- } else {
- rc = Parser_commands();
- }
+ if (argc > 1)
+ rc = Parser_execarg(argc - 1, argv + 1, cmdlist);
+ else
+ rc = Parser_commands();
- return rc < 0 ? -rc : rc;
+ return rc < 0 ? -rc : rc;
}
#ifdef _LUSTRE_IDL_H_