Whamcloud - gitweb
libext2fs: don't use O_DIRECT for files on tmpfs
[tools/e2fsprogs.git] / lib / ext2fs / unix_io.c
1 /*
2  * unix_io.c --- This is the Unix (well, really POSIX) implementation
3  *      of the I/O manager.
4  *
5  * Implements a one-block write-through cache.
6  *
7  * Includes support for Windows NT under Cygwin.
8  *
9  * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
10  *      2002 by Theodore Ts'o.
11  *
12  * %Begin-Header%
13  * This file may be redistributed under the terms of the GNU Library
14  * General Public License, version 2.
15  * %End-Header%
16  */
17
18 #if !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
19 #define _XOPEN_SOURCE 600
20 #define _DARWIN_C_SOURCE
21 #define _FILE_OFFSET_BITS 64
22 #ifndef _LARGEFILE_SOURCE
23 #define _LARGEFILE_SOURCE
24 #endif
25 #ifndef _LARGEFILE64_SOURCE
26 #define _LARGEFILE64_SOURCE
27 #endif
28 #ifndef _GNU_SOURCE
29 #define _GNU_SOURCE
30 #endif
31 #endif
32
33 #include "config.h"
34 #include <stdio.h>
35 #include <string.h>
36 #if HAVE_UNISTD_H
37 #include <unistd.h>
38 #endif
39 #if HAVE_ERRNO_H
40 #include <errno.h>
41 #endif
42 #include <fcntl.h>
43 #include <time.h>
44 #ifdef __linux__
45 #include <sys/utsname.h>
46 #endif
47 #if HAVE_SYS_TYPES_H
48 #include <sys/types.h>
49 #endif
50 #ifdef HAVE_SYS_IOCTL_H
51 #include <sys/ioctl.h>
52 #endif
53 #ifdef HAVE_SYS_MOUNT_H
54 #include <sys/mount.h>
55 #endif
56 #ifdef HAVE_SYS_PRCTL_H
57 #include <sys/prctl.h>
58 #else
59 #define PR_GET_DUMPABLE 3
60 #endif
61 #if HAVE_SYS_STAT_H
62 #include <sys/stat.h>
63 #endif
64 #if HAVE_SYS_RESOURCE_H
65 #include <sys/resource.h>
66 #endif
67 #if HAVE_LINUX_FALLOC_H
68 #include <linux/falloc.h>
69 #endif
70
71 #if defined(__linux__) && defined(_IO) && !defined(BLKROGET)
72 #define BLKROGET   _IO(0x12, 94) /* Get read-only status (0 = read_write).  */
73 #endif
74
75 #undef ALIGN_DEBUG
76
77 #include "ext2_fs.h"
78 #include "ext2fs.h"
79 #include "ext2fsP.h"
80
/*
 * For checking structure magic numbers...
 *
 * Expands to a bare `if` statement that makes the *enclosing function*
 * return `code` when (struct)->magic does not equal `code`.  Because the
 * expansion is an unbraced `if` (not wrapped in do { } while (0)), use it
 * only as a stand-alone statement, never directly before an `else`.
 */

#define EXT2_CHECK_MAGIC(struct, code) \
	  if ((struct)->magic != (code)) return (code)
87
/* One slot of the per-channel block cache. */
struct unix_cache {
	char			*buf;		/* block buffer, (re)allocated by alloc_cache() */
	unsigned long long	block;		/* block number held in buf */
	int			access_time;	/* channel access counter value at last use */
	unsigned		dirty:1;	/* buf has changes not yet written out */
	unsigned		in_use:1;	/* slot currently holds a valid block */
};
95
/* Number of slots in the per-channel block cache. */
#define CACHE_SIZE 8
/* Writes of more than this many blocks bypass the cache. */
#define WRITE_DIRECT_SIZE 4	/* Must be smaller than CACHE_SIZE */
/* Also sizes the reuse[] scratch array in unix_read_blk64(). */
#define READ_DIRECT_SIZE 4	/* Should be smaller than CACHE_SIZE */
99
/* Per-channel state hung off io_channel->private_data by this manager. */
struct unix_private_data {
	int	magic;		/* EXT2_ET_MAGIC_UNIX_IO_CHANNEL */
	int	dev;		/* file descriptor all I/O is issued on */
	int	flags;		/* IO_FLAG_* bits the channel was opened with */
	int	align;		/* NOTE(review): not referenced in the code visible
				 * here, which uses channel->align -- confirm use */
	int	access_time;	/* counter bumped on each cache hit/reuse (LRU clock) */
	ext2_loff_t offset;	/* byte offset added to every block location */
	struct unix_cache cache[CACHE_SIZE];
	void	*bounce;	/* one-block buffer for unaligned / forced bounce I/O */
	struct struct_io_stats io_stats;	/* bytes_read / bytes_written counters */
};
111
/*
 * True when n (a pointer or an integer) has none of the low bits covered by
 * (align - 1) set.  The mask only tests alignment correctly when align is a
 * power of two; align == 0 must be filtered out by the caller.
 */
#define IS_ALIGNED(n, align) ((((uintptr_t) n) & \
			       ((uintptr_t) ((align)-1))) == 0)
114
115 static errcode_t unix_get_stats(io_channel channel, io_stats *stats)
116 {
117         errcode_t       retval = 0;
118
119         struct unix_private_data *data;
120
121         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
122         data = (struct unix_private_data *) channel->private_data;
123         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
124
125         if (stats)
126                 *stats = &data->io_stats;
127
128         return retval;
129 }
130
/*
 * getenv() wrapper that refuses to consult the environment when the
 * process looks privileged.
 *
 * Returns NULL if the real and effective uid or gid differ, or (where
 * prctl is available) if the process is not dumpable.  Otherwise the
 * lookup is delegated to secure_getenv()/__secure_getenv() when the
 * build provides one, falling back to plain getenv().
 */
static char *safe_getenv(const char *arg)
{
	/* Running set-uid? */
	if (getuid() != geteuid())
		return NULL;
	/* Running set-gid? */
	if (getgid() != getegid())
		return NULL;

#ifdef HAVE_PRCTL
	if (prctl(PR_GET_DUMPABLE, 0, 0, 0, 0) == 0)
		return NULL;
#else
#if (defined(linux) && defined(SYS_prctl))
	if (syscall(SYS_prctl, PR_GET_DUMPABLE, 0, 0, 0, 0) == 0)
		return NULL;
#endif
#endif

#if defined(HAVE_SECURE_GETENV)
	return secure_getenv(arg);
#elif defined(HAVE___SECURE_GETENV)
	return __secure_getenv(arg);
#else
	return getenv(arg);
#endif
}
153
154 /*
155  * Here are the raw I/O functions
156  */
157 static errcode_t raw_read_blk(io_channel channel,
158                               struct unix_private_data *data,
159                               unsigned long long block,
160                               int count, void *bufv)
161 {
162         errcode_t       retval;
163         ssize_t         size;
164         ext2_loff_t     location;
165         int             actual = 0;
166         unsigned char   *buf = bufv;
167         ssize_t         really_read = 0;
168
169         size = (count < 0) ? -count : (ext2_loff_t) count * channel->block_size;
170         data->io_stats.bytes_read += size;
171         location = ((ext2_loff_t) block * channel->block_size) + data->offset;
172
173         if (data->flags & IO_FLAG_FORCE_BOUNCE) {
174                 if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
175                         retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
176                         goto error_out;
177                 }
178                 goto bounce_read;
179         }
180
181 #ifdef HAVE_PREAD64
182         /* Try an aligned pread */
183         if ((channel->align == 0) ||
184             (IS_ALIGNED(buf, channel->align) &&
185              IS_ALIGNED(size, channel->align))) {
186                 actual = pread64(data->dev, buf, size, location);
187                 if (actual == size)
188                         return 0;
189                 actual = 0;
190         }
191 #elif HAVE_PREAD
192         /* Try an aligned pread */
193         if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
194             ((channel->align == 0) ||
195              (IS_ALIGNED(buf, channel->align) &&
196               IS_ALIGNED(size, channel->align)))) {
197                 actual = pread(data->dev, buf, size, location);
198                 if (actual == size)
199                         return 0;
200                 actual = 0;
201         }
202 #endif /* HAVE_PREAD */
203
204         if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
205                 retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
206                 goto error_out;
207         }
208         if ((channel->align == 0) ||
209             (IS_ALIGNED(buf, channel->align) &&
210              IS_ALIGNED(size, channel->align))) {
211                 actual = read(data->dev, buf, size);
212                 if (actual != size) {
213                 short_read:
214                         if (actual < 0) {
215                                 retval = errno;
216                                 actual = 0;
217                         } else
218                                 retval = EXT2_ET_SHORT_READ;
219                         goto error_out;
220                 }
221                 return 0;
222         }
223
224 #ifdef ALIGN_DEBUG
225         printf("raw_read_blk: O_DIRECT fallback: %p %lu\n", buf,
226                (unsigned long) size);
227 #endif
228
229         /*
230          * The buffer or size which we're trying to read isn't aligned
231          * to the O_DIRECT rules, so we need to do this the hard way...
232          */
233 bounce_read:
234         while (size > 0) {
235                 actual = read(data->dev, data->bounce, channel->block_size);
236                 if (actual != channel->block_size) {
237                         actual = really_read;
238                         buf -= really_read;
239                         size += really_read;
240                         goto short_read;
241                 }
242                 actual = size;
243                 if (size > channel->block_size)
244                         actual = channel->block_size;
245                 memcpy(buf, data->bounce, actual);
246                 really_read += actual;
247                 size -= actual;
248                 buf += actual;
249         }
250         return 0;
251
252 error_out:
253         if (actual >= 0 && actual < size)
254                 memset((char *) buf+actual, 0, size-actual);
255         if (channel->read_error)
256                 retval = (channel->read_error)(channel, block, count, buf,
257                                                size, actual, retval);
258         return retval;
259 }
260
261 static errcode_t raw_write_blk(io_channel channel,
262                                struct unix_private_data *data,
263                                unsigned long long block,
264                                int count, const void *bufv)
265 {
266         ssize_t         size;
267         ext2_loff_t     location;
268         int             actual = 0;
269         errcode_t       retval;
270         const unsigned char *buf = bufv;
271
272         if (count == 1)
273                 size = channel->block_size;
274         else {
275                 if (count < 0)
276                         size = -count;
277                 else
278                         size = (ext2_loff_t) count * channel->block_size;
279         }
280         data->io_stats.bytes_written += size;
281
282         location = ((ext2_loff_t) block * channel->block_size) + data->offset;
283
284         if (data->flags & IO_FLAG_FORCE_BOUNCE) {
285                 if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
286                         retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
287                         goto error_out;
288                 }
289                 goto bounce_write;
290         }
291
292 #ifdef HAVE_PWRITE64
293         /* Try an aligned pwrite */
294         if ((channel->align == 0) ||
295             (IS_ALIGNED(buf, channel->align) &&
296              IS_ALIGNED(size, channel->align))) {
297                 actual = pwrite64(data->dev, buf, size, location);
298                 if (actual == size)
299                         return 0;
300         }
301 #elif HAVE_PWRITE
302         /* Try an aligned pwrite */
303         if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
304             ((channel->align == 0) ||
305              (IS_ALIGNED(buf, channel->align) &&
306               IS_ALIGNED(size, channel->align)))) {
307                 actual = pwrite(data->dev, buf, size, location);
308                 if (actual == size)
309                         return 0;
310         }
311 #endif /* HAVE_PWRITE */
312
313         if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
314                 retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
315                 goto error_out;
316         }
317
318         if ((channel->align == 0) ||
319             (IS_ALIGNED(buf, channel->align) &&
320              IS_ALIGNED(size, channel->align))) {
321                 actual = write(data->dev, buf, size);
322                 if (actual < 0) {
323                         retval = errno;
324                         goto error_out;
325                 }
326                 if (actual != size) {
327                 short_write:
328                         retval = EXT2_ET_SHORT_WRITE;
329                         goto error_out;
330                 }
331                 return 0;
332         }
333
334 #ifdef ALIGN_DEBUG
335         printf("raw_write_blk: O_DIRECT fallback: %p %lu\n", buf,
336                (unsigned long) size);
337 #endif
338         /*
339          * The buffer or size which we're trying to write isn't aligned
340          * to the O_DIRECT rules, so we need to do this the hard way...
341          */
342 bounce_write:
343         while (size > 0) {
344                 if (size < channel->block_size) {
345                         actual = read(data->dev, data->bounce,
346                                       channel->block_size);
347                         if (actual != channel->block_size) {
348                                 if (actual < 0) {
349                                         retval = errno;
350                                         goto error_out;
351                                 }
352                                 memset((char *) data->bounce + actual, 0,
353                                        channel->block_size - actual);
354                         }
355                 }
356                 actual = size;
357                 if (size > channel->block_size)
358                         actual = channel->block_size;
359                 memcpy(data->bounce, buf, actual);
360                 if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
361                         retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
362                         goto error_out;
363                 }
364                 actual = write(data->dev, data->bounce, channel->block_size);
365                 if (actual < 0) {
366                         retval = errno;
367                         goto error_out;
368                 }
369                 if (actual != channel->block_size)
370                         goto short_write;
371                 size -= actual;
372                 buf += actual;
373                 location += actual;
374         }
375         return 0;
376
377 error_out:
378         if (channel->write_error)
379                 retval = (channel->write_error)(channel, block, count, buf,
380                                                 size, actual, retval);
381         return retval;
382 }
383
384
385 /*
386  * Here we implement the cache functions
387  */
388
389 /* Allocate the cache buffers */
390 static errcode_t alloc_cache(io_channel channel,
391                              struct unix_private_data *data)
392 {
393         errcode_t               retval;
394         struct unix_cache       *cache;
395         int                     i;
396
397         data->access_time = 0;
398         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
399                 cache->block = 0;
400                 cache->access_time = 0;
401                 cache->dirty = 0;
402                 cache->in_use = 0;
403                 if (cache->buf)
404                         ext2fs_free_mem(&cache->buf);
405                 retval = io_channel_alloc_buf(channel, 0, &cache->buf);
406                 if (retval)
407                         return retval;
408         }
409         if (channel->align || data->flags & IO_FLAG_FORCE_BOUNCE) {
410                 if (data->bounce)
411                         ext2fs_free_mem(&data->bounce);
412                 retval = io_channel_alloc_buf(channel, 0, &data->bounce);
413         }
414         return retval;
415 }
416
417 /* Free the cache buffers */
418 static void free_cache(struct unix_private_data *data)
419 {
420         struct unix_cache       *cache;
421         int                     i;
422
423         data->access_time = 0;
424         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
425                 cache->block = 0;
426                 cache->access_time = 0;
427                 cache->dirty = 0;
428                 cache->in_use = 0;
429                 if (cache->buf)
430                         ext2fs_free_mem(&cache->buf);
431         }
432         if (data->bounce)
433                 ext2fs_free_mem(&data->bounce);
434 }
435
436 #ifndef NO_IO_CACHE
437 /*
438  * Try to find a block in the cache.  If the block is not found, and
439  * eldest is a non-zero pointer, then fill in eldest with the cache
440  * entry to that should be reused.
441  */
442 static struct unix_cache *find_cached_block(struct unix_private_data *data,
443                                             unsigned long long block,
444                                             struct unix_cache **eldest)
445 {
446         struct unix_cache       *cache, *unused_cache, *oldest_cache;
447         int                     i;
448
449         unused_cache = oldest_cache = 0;
450         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
451                 if (!cache->in_use) {
452                         if (!unused_cache)
453                                 unused_cache = cache;
454                         continue;
455                 }
456                 if (cache->block == block) {
457                         cache->access_time = ++data->access_time;
458                         return cache;
459                 }
460                 if (!oldest_cache ||
461                     (cache->access_time < oldest_cache->access_time))
462                         oldest_cache = cache;
463         }
464         if (eldest)
465                 *eldest = (unused_cache) ? unused_cache : oldest_cache;
466         return 0;
467 }
468
469 /*
470  * Reuse a particular cache entry for another block.
471  */
472 static void reuse_cache(io_channel channel, struct unix_private_data *data,
473                  struct unix_cache *cache, unsigned long long block)
474 {
475         if (cache->dirty && cache->in_use)
476                 raw_write_blk(channel, data, cache->block, 1, cache->buf);
477
478         cache->in_use = 1;
479         cache->dirty = 0;
480         cache->block = block;
481         cache->access_time = ++data->access_time;
482 }
483
484 /*
485  * Flush all of the blocks in the cache
486  */
487 static errcode_t flush_cached_blocks(io_channel channel,
488                                      struct unix_private_data *data,
489                                      int invalidate)
490
491 {
492         struct unix_cache       *cache;
493         errcode_t               retval, retval2;
494         int                     i;
495
496         retval2 = 0;
497         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
498                 if (!cache->in_use)
499                         continue;
500
501                 if (invalidate)
502                         cache->in_use = 0;
503
504                 if (!cache->dirty)
505                         continue;
506
507                 retval = raw_write_blk(channel, data,
508                                        cache->block, 1, cache->buf);
509                 if (retval)
510                         retval2 = retval;
511                 else
512                         cache->dirty = 0;
513         }
514         return retval2;
515 }
516 #endif /* NO_IO_CACHE */
517
518 #ifdef __linux__
519 #ifndef BLKDISCARDZEROES
520 #define BLKDISCARDZEROES _IO(0x12,124)
521 #endif
522 #endif
523
/*
 * open() wrapper that uses open64() where the build provides it.
 *
 * The mode argument is forwarded to the underlying call only when it is
 * non-zero.  Returns whatever open()/open64() returns: a file descriptor,
 * or -1 with errno set.
 */
int ext2fs_open_file(const char *pathname, int flags, mode_t mode)
{
#if defined(HAVE_OPEN64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
	return mode ? open64(pathname, flags, mode)
		    : open64(pathname, flags);
#else
	return mode ? open(pathname, flags, mode)
		    : open(pathname, flags);
#endif
}
537
538 int ext2fs_stat(const char *path, ext2fs_struct_stat *buf)
539 {
540 #if defined(HAVE_FSTAT64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
541         return stat64(path, buf);
542 #else
543         return stat(path, buf);
544 #endif
545 }
546
547 int ext2fs_fstat(int fd, ext2fs_struct_stat *buf)
548 {
549 #if defined(HAVE_FSTAT64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
550         return fstat64(fd, buf);
551 #else
552         return fstat(fd, buf);
553 #endif
554 }
555
556
557 static errcode_t unix_open_channel(const char *name, int fd,
558                                    int flags, io_channel *channel,
559                                    io_manager io_mgr)
560 {
561         io_channel      io = NULL;
562         struct unix_private_data *data = NULL;
563         errcode_t       retval;
564         ext2fs_struct_stat st;
565 #ifdef __linux__
566         struct          utsname ut;
567 #endif
568
569         if (safe_getenv("UNIX_IO_FORCE_BOUNCE"))
570                 flags |= IO_FLAG_FORCE_BOUNCE;
571
572 #ifdef __linux__
573         /*
574          * We need to make sure any previous errors in the block
575          * device are thrown away, sigh.
576          */
577         (void) fsync(fd);
578 #endif
579
580         retval = ext2fs_get_mem(sizeof(struct struct_io_channel), &io);
581         if (retval)
582                 goto cleanup;
583         memset(io, 0, sizeof(struct struct_io_channel));
584         io->magic = EXT2_ET_MAGIC_IO_CHANNEL;
585         retval = ext2fs_get_mem(sizeof(struct unix_private_data), &data);
586         if (retval)
587                 goto cleanup;
588
589         io->manager = io_mgr;
590         retval = ext2fs_get_mem(strlen(name)+1, &io->name);
591         if (retval)
592                 goto cleanup;
593
594         strcpy(io->name, name);
595         io->private_data = data;
596         io->block_size = 1024;
597         io->read_error = 0;
598         io->write_error = 0;
599         io->refcount = 1;
600
601         memset(data, 0, sizeof(struct unix_private_data));
602         data->magic = EXT2_ET_MAGIC_UNIX_IO_CHANNEL;
603         data->io_stats.num_fields = 2;
604         data->flags = flags;
605         data->dev = fd;
606
607 #if defined(O_DIRECT)
608         if (flags & IO_FLAG_DIRECT_IO)
609                 io->align = ext2fs_get_dio_alignment(data->dev);
610 #elif defined(F_NOCACHE)
611         if (flags & IO_FLAG_DIRECT_IO)
612                 io->align = 4096;
613 #endif
614
615         /*
616          * If the device is really a block device, then set the
617          * appropriate flag, otherwise we can set DISCARD_ZEROES flag
618          * because we are going to use punch hole instead of discard
619          * and if it succeed, subsequent read from sparse area returns
620          * zero.
621          */
622         if (ext2fs_fstat(data->dev, &st) == 0) {
623                 if (ext2fsP_is_disk_device(st.st_mode))
624                         io->flags |= CHANNEL_FLAGS_BLOCK_DEVICE;
625                 else
626                         io->flags |= CHANNEL_FLAGS_DISCARD_ZEROES;
627         }
628
629 #ifdef BLKDISCARDZEROES
630         {
631                 int zeroes = 0;
632                 if (ioctl(data->dev, BLKDISCARDZEROES, &zeroes) == 0 &&
633                     zeroes)
634                         io->flags |= CHANNEL_FLAGS_DISCARD_ZEROES;
635         }
636 #endif
637
638 #if defined(__CYGWIN__)
639         /*
640          * Some operating systems require that the buffers be aligned,
641          * regardless of O_DIRECT
642          */
643         if (!io->align)
644                 io->align = 512;
645 #endif
646
647 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
648         if (io->flags & CHANNEL_FLAGS_BLOCK_DEVICE) {
649                 int dio_align = ext2fs_get_dio_alignment(fd);
650
651                 if (io->align < dio_align)
652                         io->align = dio_align;
653         }
654 #endif
655
656         if ((retval = alloc_cache(io, data)))
657                 goto cleanup;
658
659 #ifdef BLKROGET
660         if (flags & IO_FLAG_RW) {
661                 int error;
662                 int readonly = 0;
663
664                 /* Is the block device actually writable? */
665                 error = ioctl(data->dev, BLKROGET, &readonly);
666                 if (!error && readonly) {
667                         retval = EPERM;
668                         goto cleanup;
669                 }
670         }
671 #endif
672
673 #ifdef __linux__
674 #undef RLIM_INFINITY
675 #if (defined(__alpha__) || ((defined(__sparc__) || defined(__mips__)) && (SIZEOF_LONG == 4)))
676 #define RLIM_INFINITY   ((unsigned long)(~0UL>>1))
677 #else
678 #define RLIM_INFINITY  (~0UL)
679 #endif
680         /*
681          * Work around a bug in 2.4.10-2.4.18 kernels where writes to
682          * block devices are wrongly getting hit by the filesize
683          * limit.  This workaround isn't perfect, since it won't work
684          * if glibc wasn't built against 2.2 header files.  (Sigh.)
685          *
686          */
687         if ((flags & IO_FLAG_RW) &&
688             (uname(&ut) == 0) &&
689             ((ut.release[0] == '2') && (ut.release[1] == '.') &&
690              (ut.release[2] == '4') && (ut.release[3] == '.') &&
691              (ut.release[4] == '1') && (ut.release[5] >= '0') &&
692              (ut.release[5] < '8')) &&
693             (ext2fs_fstat(data->dev, &st) == 0) &&
694             (ext2fsP_is_disk_device(st.st_mode))) {
695                 struct rlimit   rlim;
696
697                 rlim.rlim_cur = rlim.rlim_max = (unsigned long) RLIM_INFINITY;
698                 setrlimit(RLIMIT_FSIZE, &rlim);
699                 getrlimit(RLIMIT_FSIZE, &rlim);
700                 if (((unsigned long) rlim.rlim_cur) <
701                     ((unsigned long) rlim.rlim_max)) {
702                         rlim.rlim_cur = rlim.rlim_max;
703                         setrlimit(RLIMIT_FSIZE, &rlim);
704                 }
705         }
706 #endif
707         *channel = io;
708         return 0;
709
710 cleanup:
711         if (data) {
712                 if (data->dev >= 0)
713                         close(data->dev);
714                 free_cache(data);
715                 ext2fs_free_mem(&data);
716         }
717         if (io) {
718                 if (io->name) {
719                         ext2fs_free_mem(&io->name);
720                 }
721                 ext2fs_free_mem(&io);
722         }
723         return retval;
724 }
725
726 static errcode_t unixfd_open(const char *str_fd, int flags,
727                              io_channel *channel)
728 {
729         int fd;
730         int fd_flags;
731
732         fd = atoi(str_fd);
733 #if defined(HAVE_FCNTL)
734         fd_flags = fcntl(fd, F_GETFD);
735         if (fd_flags == -1)
736                 return -EBADF;
737
738         flags = 0;
739         if (fd_flags & O_RDWR)
740                 flags |= IO_FLAG_RW;
741         if (fd_flags & O_EXCL)
742                 flags |= IO_FLAG_EXCLUSIVE;
743 #if defined(O_DIRECT)
744         if (fd_flags & O_DIRECT)
745                 flags |= IO_FLAG_DIRECT_IO;
746 #endif
747 #endif  /* HAVE_FCNTL */
748
749         return unix_open_channel(str_fd, fd, flags, channel, unixfd_io_manager);
750 }
751
752 static errcode_t unix_open(const char *name, int flags,
753                            io_channel *channel)
754 {
755         int fd = -1;
756         int open_flags;
757
758         if (name == 0)
759                 return EXT2_ET_BAD_DEVICE_NAME;
760
761         open_flags = (flags & IO_FLAG_RW) ? O_RDWR : O_RDONLY;
762         if (flags & IO_FLAG_EXCLUSIVE)
763                 open_flags |= O_EXCL;
764 #if defined(O_DIRECT)
765         if (flags & IO_FLAG_DIRECT_IO)
766                 open_flags |= O_DIRECT;
767 #endif
768         fd = ext2fs_open_file(name, open_flags, 0);
769         if (fd < 0)
770                 return errno;
771 #if defined(F_NOCACHE) && !defined(IO_DIRECT)
772         if (flags & IO_FLAG_DIRECT_IO) {
773                 if (fcntl(fd, F_NOCACHE, 1) < 0)
774                         return errno;
775         }
776 #endif
777         return unix_open_channel(name, fd, flags, channel, unix_io_manager);
778 }
779
780 static errcode_t unix_close(io_channel channel)
781 {
782         struct unix_private_data *data;
783         errcode_t       retval = 0;
784
785         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
786         data = (struct unix_private_data *) channel->private_data;
787         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
788
789         if (--channel->refcount > 0)
790                 return 0;
791
792 #ifndef NO_IO_CACHE
793         retval = flush_cached_blocks(channel, data, 0);
794 #endif
795
796         if (close(data->dev) < 0)
797                 retval = errno;
798         free_cache(data);
799
800         ext2fs_free_mem(&channel->private_data);
801         if (channel->name)
802                 ext2fs_free_mem(&channel->name);
803         ext2fs_free_mem(&channel);
804         return retval;
805 }
806
807 static errcode_t unix_set_blksize(io_channel channel, int blksize)
808 {
809         struct unix_private_data *data;
810         errcode_t               retval;
811
812         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
813         data = (struct unix_private_data *) channel->private_data;
814         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
815
816         if (channel->block_size != blksize) {
817 #ifndef NO_IO_CACHE
818                 if ((retval = flush_cached_blocks(channel, data, 0)))
819                         return retval;
820 #endif
821
822                 channel->block_size = blksize;
823                 free_cache(data);
824                 if ((retval = alloc_cache(channel, data)))
825                         return retval;
826         }
827         return 0;
828 }
829
830 static errcode_t unix_read_blk64(io_channel channel, unsigned long long block,
831                                int count, void *buf)
832 {
833         struct unix_private_data *data;
834         struct unix_cache *cache, *reuse[READ_DIRECT_SIZE];
835         errcode_t       retval;
836         char            *cp;
837         int             i, j;
838
839         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
840         data = (struct unix_private_data *) channel->private_data;
841         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
842
843 #ifdef NO_IO_CACHE
844         return raw_read_blk(channel, data, block, count, buf);
845 #else
846         /*
847          * If we're doing an odd-sized read or a very large read,
848          * flush out the cache and then do a direct read.
849          */
850         if (count < 0 || count > WRITE_DIRECT_SIZE) {
851                 if ((retval = flush_cached_blocks(channel, data, 0)))
852                         return retval;
853                 return raw_read_blk(channel, data, block, count, buf);
854         }
855
856         cp = buf;
857         while (count > 0) {
858                 /* If it's in the cache, use it! */
859                 if ((cache = find_cached_block(data, block, &reuse[0]))) {
860 #ifdef DEBUG
861                         printf("Using cached block %lu\n", block);
862 #endif
863                         memcpy(cp, cache->buf, channel->block_size);
864                         count--;
865                         block++;
866                         cp += channel->block_size;
867                         continue;
868                 }
869                 if (count == 1) {
870                         /*
871                          * Special case where we read directly into the
872                          * cache buffer; important in the O_DIRECT case
873                          */
874                         cache = reuse[0];
875                         reuse_cache(channel, data, cache, block);
876                         if ((retval = raw_read_blk(channel, data, block, 1,
877                                                    cache->buf))) {
878                                 cache->in_use = 0;
879                                 return retval;
880                         }
881                         memcpy(cp, cache->buf, channel->block_size);
882                         return 0;
883                 }
884
885                 /*
886                  * Find the number of uncached blocks so we can do a
887                  * single read request
888                  */
889                 for (i=1; i < count; i++)
890                         if (find_cached_block(data, block+i, &reuse[i]))
891                                 break;
892 #ifdef DEBUG
893                 printf("Reading %d blocks starting at %lu\n", i, block);
894 #endif
895                 if ((retval = raw_read_blk(channel, data, block, i, cp)))
896                         return retval;
897
898                 /* Save the results in the cache */
899                 for (j=0; j < i; j++) {
900                         count--;
901                         cache = reuse[j];
902                         reuse_cache(channel, data, cache, block++);
903                         memcpy(cache->buf, cp, channel->block_size);
904                         cp += channel->block_size;
905                 }
906         }
907         return 0;
908 #endif /* NO_IO_CACHE */
909 }
910
911 static errcode_t unix_read_blk(io_channel channel, unsigned long block,
912                                int count, void *buf)
913 {
914         return unix_read_blk64(channel, block, count, buf);
915 }
916
917 static errcode_t unix_write_blk64(io_channel channel, unsigned long long block,
918                                 int count, const void *buf)
919 {
920         struct unix_private_data *data;
921         struct unix_cache *cache, *reuse;
922         errcode_t       retval = 0;
923         const char      *cp;
924         int             writethrough;
925
926         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
927         data = (struct unix_private_data *) channel->private_data;
928         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
929
930 #ifdef NO_IO_CACHE
931         return raw_write_blk(channel, data, block, count, buf);
932 #else
933         /*
934          * If we're doing an odd-sized write or a very large write,
935          * flush out the cache completely and then do a direct write.
936          */
937         if (count < 0 || count > WRITE_DIRECT_SIZE) {
938                 if ((retval = flush_cached_blocks(channel, data, 1)))
939                         return retval;
940                 return raw_write_blk(channel, data, block, count, buf);
941         }
942
943         /*
944          * For a moderate-sized multi-block write, first force a write
945          * if we're in write-through cache mode, and then fill the
946          * cache with the blocks.
947          */
948         writethrough = channel->flags & CHANNEL_FLAGS_WRITETHROUGH;
949         if (writethrough)
950                 retval = raw_write_blk(channel, data, block, count, buf);
951
952         cp = buf;
953         while (count > 0) {
954                 cache = find_cached_block(data, block, &reuse);
955                 if (!cache) {
956                         cache = reuse;
957                         reuse_cache(channel, data, cache, block);
958                 }
959                 if (cache->buf != cp)
960                         memcpy(cache->buf, cp, channel->block_size);
961                 cache->dirty = !writethrough;
962                 count--;
963                 block++;
964                 cp += channel->block_size;
965         }
966         return retval;
967 #endif /* NO_IO_CACHE */
968 }
969
970 static errcode_t unix_cache_readahead(io_channel channel,
971                                       unsigned long long block,
972                                       unsigned long long count)
973 {
974 #ifdef POSIX_FADV_WILLNEED
975         struct unix_private_data *data;
976
977         data = (struct unix_private_data *)channel->private_data;
978         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
979         return posix_fadvise(data->dev,
980                              (ext2_loff_t)block * channel->block_size + data->offset,
981                              (ext2_loff_t)count * channel->block_size,
982                              POSIX_FADV_WILLNEED);
983 #else
984         return EXT2_ET_OP_NOT_SUPPORTED;
985 #endif
986 }
987
988 static errcode_t unix_write_blk(io_channel channel, unsigned long block,
989                                 int count, const void *buf)
990 {
991         return unix_write_blk64(channel, block, count, buf);
992 }
993
994 static errcode_t unix_write_byte(io_channel channel, unsigned long offset,
995                                  int size, const void *buf)
996 {
997         struct unix_private_data *data;
998         errcode_t       retval = 0;
999         ssize_t         actual;
1000
1001         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
1002         data = (struct unix_private_data *) channel->private_data;
1003         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1004
1005         if (channel->align != 0) {
1006 #ifdef ALIGN_DEBUG
1007                 printf("unix_write_byte: O_DIRECT fallback\n");
1008 #endif
1009                 return EXT2_ET_UNIMPLEMENTED;
1010         }
1011
1012 #ifndef NO_IO_CACHE
1013         /*
1014          * Flush out the cache completely
1015          */
1016         if ((retval = flush_cached_blocks(channel, data, 1)))
1017                 return retval;
1018 #endif
1019
1020         if (lseek(data->dev, offset + data->offset, SEEK_SET) < 0)
1021                 return errno;
1022
1023         actual = write(data->dev, buf, size);
1024         if (actual < 0)
1025                 return errno;
1026         if (actual != size)
1027                 return EXT2_ET_SHORT_WRITE;
1028
1029         return 0;
1030 }
1031
1032 /*
1033  * Flush data buffers to disk.
1034  */
1035 static errcode_t unix_flush(io_channel channel)
1036 {
1037         struct unix_private_data *data;
1038         errcode_t retval = 0;
1039
1040         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
1041         data = (struct unix_private_data *) channel->private_data;
1042         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1043
1044 #ifndef NO_IO_CACHE
1045         retval = flush_cached_blocks(channel, data, 0);
1046 #endif
1047 #ifdef HAVE_FSYNC
1048         if (!retval && fsync(data->dev) != 0)
1049                 return errno;
1050 #endif
1051         return retval;
1052 }
1053
1054 static errcode_t unix_set_option(io_channel channel, const char *option,
1055                                  const char *arg)
1056 {
1057         struct unix_private_data *data;
1058         unsigned long long tmp;
1059         char *end;
1060
1061         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
1062         data = (struct unix_private_data *) channel->private_data;
1063         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1064
1065         if (!strcmp(option, "offset")) {
1066                 if (!arg)
1067                         return EXT2_ET_INVALID_ARGUMENT;
1068
1069                 tmp = strtoull(arg, &end, 0);
1070                 if (*end)
1071                         return EXT2_ET_INVALID_ARGUMENT;
1072                 data->offset = tmp;
1073                 if (data->offset < 0)
1074                         return EXT2_ET_INVALID_ARGUMENT;
1075                 return 0;
1076         }
1077         return EXT2_ET_INVALID_ARGUMENT;
1078 }
1079
1080 #if defined(__linux__) && !defined(BLKDISCARD)
1081 #define BLKDISCARD              _IO(0x12,119)
1082 #endif
1083
1084 static errcode_t unix_discard(io_channel channel, unsigned long long block,
1085                               unsigned long long count)
1086 {
1087         struct unix_private_data *data;
1088         int             ret;
1089
1090         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
1091         data = (struct unix_private_data *) channel->private_data;
1092         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1093
1094         if (channel->flags & CHANNEL_FLAGS_BLOCK_DEVICE) {
1095 #ifdef BLKDISCARD
1096                 __u64 range[2];
1097
1098                 range[0] = (__u64)(block) * channel->block_size + data->offset;
1099                 range[1] = (__u64)(count) * channel->block_size;
1100
1101                 ret = ioctl(data->dev, BLKDISCARD, &range);
1102 #else
1103                 goto unimplemented;
1104 #endif
1105         } else {
1106 #if defined(HAVE_FALLOCATE) && defined(FALLOC_FL_PUNCH_HOLE)
1107                 /*
1108                  * If we are not on block device, try to use punch hole
1109                  * to reclaim free space.
1110                  */
1111                 ret = fallocate(data->dev,
1112                                 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1113                                 (off_t)(block) * channel->block_size + data->offset,
1114                                 (off_t)(count) * channel->block_size);
1115 #else
1116                 goto unimplemented;
1117 #endif
1118         }
1119         if (ret < 0) {
1120                 if (errno == EOPNOTSUPP)
1121                         goto unimplemented;
1122                 return errno;
1123         }
1124         return 0;
1125 unimplemented:
1126         return EXT2_ET_UNIMPLEMENTED;
1127 }
1128
/*
 * Zero `len` bytes of `fd` starting at `offset` using fallocate().
 *
 * If we know about ZERO_RANGE, try that before we try PUNCH_HOLE because
 * ZERO_RANGE doesn't unmap preallocated blocks.  We prefer fallocate because
 * it always invalidates page cache, and libext2fs requires that reads after
 * ZERO_RANGE return zeroes.
 *
 * Returns 0 as soon as one method succeeds.  Otherwise errno is set to
 * EOPNOTSUPP, whatever the underlying failure was, and the last result
 * (-1 when no method was compiled in) is returned.
 */
static int __unix_zeroout(int fd, off_t offset, off_t len)
{
        int rc = -1;

#if defined(HAVE_FALLOCATE)
#if defined(FALLOC_FL_ZERO_RANGE)
        rc = fallocate(fd, FALLOC_FL_ZERO_RANGE, offset, len);
        if (!rc)
                return 0;
#endif
#if defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE)
        rc = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                       offset, len);
        if (!rc)
                return 0;
#endif
#endif /* HAVE_FALLOCATE */

        errno = EOPNOTSUPP;
        return rc;
}
1153
1154 /* parameters might not be used if OS doesn't support zeroout */
1155 #if __GNUC_PREREQ (4, 6)
1156 #pragma GCC diagnostic push
1157 #pragma GCC diagnostic ignored "-Wunused-parameter"
1158 #endif
1159 static errcode_t unix_zeroout(io_channel channel, unsigned long long block,
1160                               unsigned long long count)
1161 {
1162         struct unix_private_data *data;
1163         int             ret;
1164
1165         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
1166         data = (struct unix_private_data *) channel->private_data;
1167         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1168
1169         if (safe_getenv("UNIX_IO_NOZEROOUT"))
1170                 goto unimplemented;
1171
1172         if (!(channel->flags & CHANNEL_FLAGS_BLOCK_DEVICE)) {
1173                 /* Regular file, try to use truncate/punch/zero. */
1174                 struct stat statbuf;
1175
1176                 if (count == 0)
1177                         return 0;
1178                 /*
1179                  * If we're trying to zero a range past the end of the file,
1180                  * extend the file size, then truncate everything.
1181                  */
1182                 ret = fstat(data->dev, &statbuf);
1183                 if (ret)
1184                         goto err;
1185                 if ((unsigned long long) statbuf.st_size <
1186                         (block + count) * channel->block_size + data->offset) {
1187                         ret = ftruncate(data->dev,
1188                                         (block + count) * channel->block_size + data->offset);
1189                         if (ret)
1190                                 goto err;
1191                 }
1192         }
1193
1194         ret = __unix_zeroout(data->dev,
1195                         (off_t)(block) * channel->block_size + data->offset,
1196                         (off_t)(count) * channel->block_size);
1197 err:
1198         if (ret < 0) {
1199                 if (errno == EOPNOTSUPP)
1200                         goto unimplemented;
1201                 return errno;
1202         }
1203         return 0;
1204 unimplemented:
1205         return EXT2_ET_UNIMPLEMENTED;
1206 }
1207 #if __GNUC_PREREQ (4, 6)
1208 #pragma GCC diagnostic pop
1209 #endif
1210
1211 static struct struct_io_manager struct_unix_manager = {
1212         .magic          = EXT2_ET_MAGIC_IO_MANAGER,
1213         .name           = "Unix I/O Manager",
1214         .open           = unix_open,
1215         .close          = unix_close,
1216         .set_blksize    = unix_set_blksize,
1217         .read_blk       = unix_read_blk,
1218         .write_blk      = unix_write_blk,
1219         .flush          = unix_flush,
1220         .write_byte     = unix_write_byte,
1221         .set_option     = unix_set_option,
1222         .get_stats      = unix_get_stats,
1223         .read_blk64     = unix_read_blk64,
1224         .write_blk64    = unix_write_blk64,
1225         .discard        = unix_discard,
1226         .cache_readahead        = unix_cache_readahead,
1227         .zeroout        = unix_zeroout,
1228 };
1229
1230 io_manager unix_io_manager = &struct_unix_manager;
1231
1232 static struct struct_io_manager struct_unixfd_manager = {
1233         .magic          = EXT2_ET_MAGIC_IO_MANAGER,
1234         .name           = "Unix fd I/O Manager",
1235         .open           = unixfd_open,
1236         .close          = unix_close,
1237         .set_blksize    = unix_set_blksize,
1238         .read_blk       = unix_read_blk,
1239         .write_blk      = unix_write_blk,
1240         .flush          = unix_flush,
1241         .write_byte     = unix_write_byte,
1242         .set_option     = unix_set_option,
1243         .get_stats      = unix_get_stats,
1244         .read_blk64     = unix_read_blk64,
1245         .write_blk64    = unix_write_blk64,
1246         .discard        = unix_discard,
1247         .cache_readahead        = unix_cache_readahead,
1248         .zeroout        = unix_zeroout,
1249 };
1250
1251 io_manager unixfd_io_manager = &struct_unixfd_manager;