Whamcloud - gitweb
AOSP: android: libext2fs and com_err for windows
[tools/e2fsprogs.git] / lib / ext2fs / unix_io.c
1 /*
2  * unix_io.c --- This is the Unix (well, really POSIX) implementation
3  *      of the I/O manager.
4  *
5  * Implements a one-block write-through cache.
6  *
 * Includes support for Windows NT under Cygwin.
8  *
9  * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
10  *      2002 by Theodore Ts'o.
11  *
12  * %Begin-Header%
13  * This file may be redistributed under the terms of the GNU Library
14  * General Public License, version 2.
15  * %End-Header%
16  */
17
18 #if !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
19 #define _XOPEN_SOURCE 600
20 #define _DARWIN_C_SOURCE
21 #define _FILE_OFFSET_BITS 64
22 #ifndef _LARGEFILE_SOURCE
23 #define _LARGEFILE_SOURCE
24 #endif
25 #ifndef _LARGEFILE64_SOURCE
26 #define _LARGEFILE64_SOURCE
27 #endif
28 #ifndef _GNU_SOURCE
29 #define _GNU_SOURCE
30 #endif
31 #endif
32
33 #include "config.h"
34 #include <stdio.h>
35 #include <string.h>
36 #if HAVE_UNISTD_H
37 #include <unistd.h>
38 #endif
39 #if HAVE_ERRNO_H
40 #include <errno.h>
41 #endif
42 #include <fcntl.h>
43 #include <time.h>
44 #ifdef __linux__
45 #include <sys/utsname.h>
46 #endif
47 #if HAVE_SYS_TYPES_H
48 #include <sys/types.h>
49 #endif
50 #ifdef HAVE_SYS_IOCTL_H
51 #include <sys/ioctl.h>
52 #endif
53 #ifdef HAVE_SYS_MOUNT_H
54 #include <sys/mount.h>
55 #endif
56 #ifdef HAVE_SYS_PRCTL_H
57 #include <sys/prctl.h>
58 #else
59 #define PR_GET_DUMPABLE 3
60 #endif
61 #if HAVE_SYS_STAT_H
62 #include <sys/stat.h>
63 #endif
64 #if HAVE_SYS_RESOURCE_H
65 #include <sys/resource.h>
66 #endif
67 #if HAVE_LINUX_FALLOC_H
68 #include <linux/falloc.h>
69 #endif
70
71 #if defined(__linux__) && defined(_IO) && !defined(BLKROGET)
72 #define BLKROGET   _IO(0x12, 94) /* Get read-only status (0 = read_write).  */
73 #endif
74
75 #undef ALIGN_DEBUG
76
77 #include "ext2_fs.h"
78 #include "ext2fs.h"
79
80 /*
81  * For checking structure magic numbers...
82  */
83
/*
 * Return `code` from the enclosing function when the object's magic
 * field does not equal `code`.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement: a bare `if` would silently capture an `else`
 * that follows the macro invocation.
 */
#define EXT2_CHECK_MAGIC(struct, code) \
	do { \
		if ((struct)->magic != (code)) \
			return (code); \
	} while (0)
86
/* One slot of the block cache. */
struct unix_cache {
	/* Buffer holding the cached block's contents */
	char			*buf;
	/* Block number currently stored in buf */
	unsigned long long	block;
	/* Stamp taken from the channel-wide counter on last use */
	int			access_time;
	/* Set when buf holds data not yet written to the device */
	unsigned int		dirty : 1;
	/* Set when this slot holds a valid block */
	unsigned int		in_use : 1;
};
94
/* Number of slots in the per-channel block cache */
#define CACHE_SIZE		8
/*
 * Largest write (in blocks) that still goes through the cache.
 * Must be smaller than CACHE_SIZE
 */
#define WRITE_DIRECT_SIZE	4
/*
 * Largest read (in blocks) that still goes through the cache.
 * Should be smaller than CACHE_SIZE
 */
#define READ_DIRECT_SIZE	4
98
99 struct unix_private_data {
100         int     magic;
101         int     dev;
102         int     flags;
103         int     align;
104         int     access_time;
105         ext2_loff_t offset;
106         struct unix_cache cache[CACHE_SIZE];
107         void    *bounce;
108         struct struct_io_stats io_stats;
109 };
110
/*
 * Non-zero when the pointer or integer `n` is a multiple of `align`.
 * The test is a bit mask, so it is only meaningful when `align` is a
 * power of two.
 *
 * `n` is parenthesized before the cast so that an expression argument
 * such as `ptr + 1` is evaluated in its own type first; without the
 * parentheses the cast would bind to `ptr` alone.
 */
#define IS_ALIGNED(n, align) ((((uintptr_t) (n)) & \
			       ((uintptr_t) ((align)-1))) == 0)
113
114 static errcode_t unix_get_stats(io_channel channel, io_stats *stats)
115 {
116         errcode_t       retval = 0;
117
118         struct unix_private_data *data;
119
120         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
121         data = (struct unix_private_data *) channel->private_data;
122         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
123
124         if (stats)
125                 *stats = &data->io_stats;
126
127         return retval;
128 }
129
/*
 * getenv() wrapper that refuses to consult the environment when the
 * process looks privileged.
 *
 * Returns NULL when the real and effective user or group IDs differ,
 * or when prctl(PR_GET_DUMPABLE) reports 0.  Otherwise the lookup is
 * delegated to secure_getenv()/__secure_getenv() when available, and
 * to plain getenv() when not.
 */
static char *safe_getenv(const char *arg)
{
	/* Running set-uid? */
	if (getuid() != geteuid())
		return NULL;
	/* Running set-gid? */
	if (getgid() != getegid())
		return NULL;

#ifdef HAVE_PRCTL
	if (prctl(PR_GET_DUMPABLE, 0, 0, 0, 0) == 0)
		return NULL;
#else
#if (defined(linux) && defined(SYS_prctl))
	/* No libc wrapper: issue the system call directly */
	if (syscall(SYS_prctl, PR_GET_DUMPABLE, 0, 0, 0, 0) == 0)
		return NULL;
#endif
#endif

#if defined(HAVE_SECURE_GETENV)
	return secure_getenv(arg);
#elif defined(HAVE___SECURE_GETENV)
	return __secure_getenv(arg);
#else
	return getenv(arg);
#endif
}
152
153 /*
154  * Here are the raw I/O functions
155  */
156 static errcode_t raw_read_blk(io_channel channel,
157                               struct unix_private_data *data,
158                               unsigned long long block,
159                               int count, void *bufv)
160 {
161         errcode_t       retval;
162         ssize_t         size;
163         ext2_loff_t     location;
164         int             actual = 0;
165         unsigned char   *buf = bufv;
166         ssize_t         really_read = 0;
167
168         size = (count < 0) ? -count : count * channel->block_size;
169         data->io_stats.bytes_read += size;
170         location = ((ext2_loff_t) block * channel->block_size) + data->offset;
171
172         if (data->flags & IO_FLAG_FORCE_BOUNCE) {
173                 if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
174                         retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
175                         goto error_out;
176                 }
177                 goto bounce_read;
178         }
179
180 #ifdef HAVE_PREAD64
181         /* Try an aligned pread */
182         if ((channel->align == 0) ||
183             (IS_ALIGNED(buf, channel->align) &&
184              IS_ALIGNED(size, channel->align))) {
185                 actual = pread64(data->dev, buf, size, location);
186                 if (actual == size)
187                         return 0;
188         }
189 #elif HAVE_PREAD
190         /* Try an aligned pread */
191         if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
192             ((channel->align == 0) ||
193              (IS_ALIGNED(buf, channel->align) &&
194               IS_ALIGNED(size, channel->align)))) {
195                 actual = pread(data->dev, buf, size, location);
196                 if (actual == size)
197                         return 0;
198         }
199 #endif /* HAVE_PREAD */
200
201         if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
202                 retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
203                 goto error_out;
204         }
205         if ((channel->align == 0) ||
206             (IS_ALIGNED(buf, channel->align) &&
207              IS_ALIGNED(size, channel->align))) {
208                 actual = read(data->dev, buf, size);
209                 if (actual != size) {
210                 short_read:
211                         if (actual < 0) {
212                                 retval = errno;
213                                 actual = 0;
214                         } else
215                                 retval = EXT2_ET_SHORT_READ;
216                         goto error_out;
217                 }
218                 return 0;
219         }
220
221 #ifdef ALIGN_DEBUG
222         printf("raw_read_blk: O_DIRECT fallback: %p %lu\n", buf,
223                (unsigned long) size);
224 #endif
225
226         /*
227          * The buffer or size which we're trying to read isn't aligned
228          * to the O_DIRECT rules, so we need to do this the hard way...
229          */
230 bounce_read:
231         while (size > 0) {
232                 actual = read(data->dev, data->bounce, channel->block_size);
233                 if (actual != channel->block_size) {
234                         actual = really_read;
235                         buf -= really_read;
236                         size += really_read;
237                         goto short_read;
238                 }
239                 actual = size;
240                 if (size > channel->block_size)
241                         actual = channel->block_size;
242                 memcpy(buf, data->bounce, actual);
243                 really_read += actual;
244                 size -= actual;
245                 buf += actual;
246         }
247         return 0;
248
249 error_out:
250         memset((char *) buf+actual, 0, size-actual);
251         if (channel->read_error)
252                 retval = (channel->read_error)(channel, block, count, buf,
253                                                size, actual, retval);
254         return retval;
255 }
256
257 static errcode_t raw_write_blk(io_channel channel,
258                                struct unix_private_data *data,
259                                unsigned long long block,
260                                int count, const void *bufv)
261 {
262         ssize_t         size;
263         ext2_loff_t     location;
264         int             actual = 0;
265         errcode_t       retval;
266         const unsigned char *buf = bufv;
267
268         if (count == 1)
269                 size = channel->block_size;
270         else {
271                 if (count < 0)
272                         size = -count;
273                 else
274                         size = count * channel->block_size;
275         }
276         data->io_stats.bytes_written += size;
277
278         location = ((ext2_loff_t) block * channel->block_size) + data->offset;
279
280         if (data->flags & IO_FLAG_FORCE_BOUNCE) {
281                 if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
282                         retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
283                         goto error_out;
284                 }
285                 goto bounce_write;
286         }
287
288 #ifdef HAVE_PWRITE64
289         /* Try an aligned pwrite */
290         if ((channel->align == 0) ||
291             (IS_ALIGNED(buf, channel->align) &&
292              IS_ALIGNED(size, channel->align))) {
293                 actual = pwrite64(data->dev, buf, size, location);
294                 if (actual == size)
295                         return 0;
296         }
297 #elif HAVE_PWRITE
298         /* Try an aligned pwrite */
299         if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
300             ((channel->align == 0) ||
301              (IS_ALIGNED(buf, channel->align) &&
302               IS_ALIGNED(size, channel->align)))) {
303                 actual = pwrite(data->dev, buf, size, location);
304                 if (actual == size)
305                         return 0;
306         }
307 #endif /* HAVE_PWRITE */
308
309         if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
310                 retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
311                 goto error_out;
312         }
313
314         if ((channel->align == 0) ||
315             (IS_ALIGNED(buf, channel->align) &&
316              IS_ALIGNED(size, channel->align))) {
317                 actual = write(data->dev, buf, size);
318                 if (actual < 0) {
319                         retval = errno;
320                         goto error_out;
321                 }
322                 if (actual != size) {
323                 short_write:
324                         retval = EXT2_ET_SHORT_WRITE;
325                         goto error_out;
326                 }
327                 return 0;
328         }
329
330 #ifdef ALIGN_DEBUG
331         printf("raw_write_blk: O_DIRECT fallback: %p %lu\n", buf,
332                (unsigned long) size);
333 #endif
334         /*
335          * The buffer or size which we're trying to write isn't aligned
336          * to the O_DIRECT rules, so we need to do this the hard way...
337          */
338 bounce_write:
339         while (size > 0) {
340                 if (size < channel->block_size) {
341                         actual = read(data->dev, data->bounce,
342                                       channel->block_size);
343                         if (actual != channel->block_size) {
344                                 if (actual < 0) {
345                                         retval = errno;
346                                         goto error_out;
347                                 }
348                                 memset(data->bounce + actual, 0,
349                                        channel->block_size - actual);
350                         }
351                 }
352                 actual = size;
353                 if (size > channel->block_size)
354                         actual = channel->block_size;
355                 memcpy(data->bounce, buf, actual);
356                 if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
357                         retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
358                         goto error_out;
359                 }
360                 actual = write(data->dev, data->bounce, channel->block_size);
361                 if (actual < 0) {
362                         retval = errno;
363                         goto error_out;
364                 }
365                 if (actual != channel->block_size)
366                         goto short_write;
367                 size -= actual;
368                 buf += actual;
369                 location += actual;
370         }
371         return 0;
372
373 error_out:
374         if (channel->write_error)
375                 retval = (channel->write_error)(channel, block, count, buf,
376                                                 size, actual, retval);
377         return retval;
378 }
379
380
381 /*
382  * Here we implement the cache functions
383  */
384
385 /* Allocate the cache buffers */
386 static errcode_t alloc_cache(io_channel channel,
387                              struct unix_private_data *data)
388 {
389         errcode_t               retval;
390         struct unix_cache       *cache;
391         int                     i;
392
393         data->access_time = 0;
394         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
395                 cache->block = 0;
396                 cache->access_time = 0;
397                 cache->dirty = 0;
398                 cache->in_use = 0;
399                 if (cache->buf)
400                         ext2fs_free_mem(&cache->buf);
401                 retval = io_channel_alloc_buf(channel, 0, &cache->buf);
402                 if (retval)
403                         return retval;
404         }
405         if (channel->align || data->flags & IO_FLAG_FORCE_BOUNCE) {
406                 if (data->bounce)
407                         ext2fs_free_mem(&data->bounce);
408                 retval = io_channel_alloc_buf(channel, 0, &data->bounce);
409         }
410         return retval;
411 }
412
413 /* Free the cache buffers */
414 static void free_cache(struct unix_private_data *data)
415 {
416         struct unix_cache       *cache;
417         int                     i;
418
419         data->access_time = 0;
420         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
421                 cache->block = 0;
422                 cache->access_time = 0;
423                 cache->dirty = 0;
424                 cache->in_use = 0;
425                 if (cache->buf)
426                         ext2fs_free_mem(&cache->buf);
427         }
428         if (data->bounce)
429                 ext2fs_free_mem(&data->bounce);
430 }
431
432 #ifndef NO_IO_CACHE
433 /*
434  * Try to find a block in the cache.  If the block is not found, and
435  * eldest is a non-zero pointer, then fill in eldest with the cache
436  * entry to that should be reused.
437  */
438 static struct unix_cache *find_cached_block(struct unix_private_data *data,
439                                             unsigned long long block,
440                                             struct unix_cache **eldest)
441 {
442         struct unix_cache       *cache, *unused_cache, *oldest_cache;
443         int                     i;
444
445         unused_cache = oldest_cache = 0;
446         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
447                 if (!cache->in_use) {
448                         if (!unused_cache)
449                                 unused_cache = cache;
450                         continue;
451                 }
452                 if (cache->block == block) {
453                         cache->access_time = ++data->access_time;
454                         return cache;
455                 }
456                 if (!oldest_cache ||
457                     (cache->access_time < oldest_cache->access_time))
458                         oldest_cache = cache;
459         }
460         if (eldest)
461                 *eldest = (unused_cache) ? unused_cache : oldest_cache;
462         return 0;
463 }
464
465 /*
466  * Reuse a particular cache entry for another block.
467  */
468 static void reuse_cache(io_channel channel, struct unix_private_data *data,
469                  struct unix_cache *cache, unsigned long long block)
470 {
471         if (cache->dirty && cache->in_use)
472                 raw_write_blk(channel, data, cache->block, 1, cache->buf);
473
474         cache->in_use = 1;
475         cache->dirty = 0;
476         cache->block = block;
477         cache->access_time = ++data->access_time;
478 }
479
480 /*
481  * Flush all of the blocks in the cache
482  */
483 static errcode_t flush_cached_blocks(io_channel channel,
484                                      struct unix_private_data *data,
485                                      int invalidate)
486
487 {
488         struct unix_cache       *cache;
489         errcode_t               retval, retval2;
490         int                     i;
491
492         retval2 = 0;
493         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
494                 if (!cache->in_use)
495                         continue;
496
497                 if (invalidate)
498                         cache->in_use = 0;
499
500                 if (!cache->dirty)
501                         continue;
502
503                 retval = raw_write_blk(channel, data,
504                                        cache->block, 1, cache->buf);
505                 if (retval)
506                         retval2 = retval;
507                 else
508                         cache->dirty = 0;
509         }
510         return retval2;
511 }
512 #endif /* NO_IO_CACHE */
513
514 #ifdef __linux__
515 #ifndef BLKDISCARDZEROES
516 #define BLKDISCARDZEROES _IO(0x12,124)
517 #endif
518 #endif
519
/*
 * Open a file, using open64() where it is available.
 *
 * `mode` is forwarded to the open call only when it is non-zero.
 * Returns whatever the underlying open call returns.
 */
int ext2fs_open_file(const char *pathname, int flags, mode_t mode)
{
#if defined(HAVE_OPEN64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
	if (!mode)
		return open64(pathname, flags);
	return open64(pathname, flags, mode);
#else
	if (!mode)
		return open(pathname, flags);
	return open(pathname, flags, mode);
#endif
}
533
534 int ext2fs_stat(const char *path, ext2fs_struct_stat *buf)
535 {
536 #if defined(HAVE_FSTAT64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
537         return stat64(path, buf);
538 #else
539         return stat(path, buf);
540 #endif
541 }
542
543 int ext2fs_fstat(int fd, ext2fs_struct_stat *buf)
544 {
545 #if defined(HAVE_FSTAT64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
546         return fstat64(fd, buf);
547 #else
548         return fstat(fd, buf);
549 #endif
550 }
551
552
553 static errcode_t unix_open_channel(const char *name, int fd,
554                                    int flags, io_channel *channel,
555                                    io_manager io_mgr)
556 {
557         io_channel      io = NULL;
558         struct unix_private_data *data = NULL;
559         errcode_t       retval;
560         ext2fs_struct_stat st;
561 #ifdef __linux__
562         struct          utsname ut;
563 #endif
564
565         if (safe_getenv("UNIX_IO_FORCE_BOUNCE"))
566                 flags |= IO_FLAG_FORCE_BOUNCE;
567
568         retval = ext2fs_get_mem(sizeof(struct struct_io_channel), &io);
569         if (retval)
570                 goto cleanup;
571         memset(io, 0, sizeof(struct struct_io_channel));
572         io->magic = EXT2_ET_MAGIC_IO_CHANNEL;
573         retval = ext2fs_get_mem(sizeof(struct unix_private_data), &data);
574         if (retval)
575                 goto cleanup;
576
577         io->manager = io_mgr;
578         retval = ext2fs_get_mem(strlen(name)+1, &io->name);
579         if (retval)
580                 goto cleanup;
581
582         strcpy(io->name, name);
583         io->private_data = data;
584         io->block_size = 1024;
585         io->read_error = 0;
586         io->write_error = 0;
587         io->refcount = 1;
588
589         memset(data, 0, sizeof(struct unix_private_data));
590         data->magic = EXT2_ET_MAGIC_UNIX_IO_CHANNEL;
591         data->io_stats.num_fields = 2;
592         data->flags = flags;
593         data->dev = fd;
594
595 #if defined(O_DIRECT)
596         if (flags & IO_FLAG_DIRECT_IO)
597                 io->align = ext2fs_get_dio_alignment(data->dev);
598 #elif defined(F_NOCACHE)
599         if (flags & IO_FLAG_DIRECT_IO)
600                 io->align = 4096;
601 #endif
602
603         /*
604          * If the device is really a block device, then set the
605          * appropriate flag, otherwise we can set DISCARD_ZEROES flag
606          * because we are going to use punch hole instead of discard
607          * and if it succeed, subsequent read from sparse area returns
608          * zero.
609          */
610         if (ext2fs_fstat(data->dev, &st) == 0) {
611                 if (S_ISBLK(st.st_mode))
612                         io->flags |= CHANNEL_FLAGS_BLOCK_DEVICE;
613                 else
614                         io->flags |= CHANNEL_FLAGS_DISCARD_ZEROES;
615         }
616
617 #ifdef BLKDISCARDZEROES
618         {
619                 int zeroes = 0;
620                 if (ioctl(data->dev, BLKDISCARDZEROES, &zeroes) == 0 &&
621                     zeroes)
622                         io->flags |= CHANNEL_FLAGS_DISCARD_ZEROES;
623         }
624 #endif
625
626 #if defined(__CYGWIN__)
627         /*
628          * Some operating systems require that the buffers be aligned,
629          * regardless of O_DIRECT
630          */
631         if (!io->align)
632                 io->align = 512;
633 #endif
634
635 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
636         if (io->flags & CHANNEL_FLAGS_BLOCK_DEVICE) {
637                 int dio_align = ext2fs_get_dio_alignment(fd);
638
639                 if (io->align < dio_align)
640                         io->align = dio_align;
641         }
642 #endif
643
644         if ((retval = alloc_cache(io, data)))
645                 goto cleanup;
646
647 #ifdef BLKROGET
648         if (flags & IO_FLAG_RW) {
649                 int error;
650                 int readonly = 0;
651
652                 /* Is the block device actually writable? */
653                 error = ioctl(data->dev, BLKROGET, &readonly);
654                 if (!error && readonly) {
655                         retval = EPERM;
656                         goto cleanup;
657                 }
658         }
659 #endif
660
661 #ifdef __linux__
662 #undef RLIM_INFINITY
663 #if (defined(__alpha__) || ((defined(__sparc__) || defined(__mips__)) && (SIZEOF_LONG == 4)))
664 #define RLIM_INFINITY   ((unsigned long)(~0UL>>1))
665 #else
666 #define RLIM_INFINITY  (~0UL)
667 #endif
668         /*
669          * Work around a bug in 2.4.10-2.4.18 kernels where writes to
670          * block devices are wrongly getting hit by the filesize
671          * limit.  This workaround isn't perfect, since it won't work
672          * if glibc wasn't built against 2.2 header files.  (Sigh.)
673          *
674          */
675         if ((flags & IO_FLAG_RW) &&
676             (uname(&ut) == 0) &&
677             ((ut.release[0] == '2') && (ut.release[1] == '.') &&
678              (ut.release[2] == '4') && (ut.release[3] == '.') &&
679              (ut.release[4] == '1') && (ut.release[5] >= '0') &&
680              (ut.release[5] < '8')) &&
681             (ext2fs_fstat(data->dev, &st) == 0) &&
682             (S_ISBLK(st.st_mode))) {
683                 struct rlimit   rlim;
684
685                 rlim.rlim_cur = rlim.rlim_max = (unsigned long) RLIM_INFINITY;
686                 setrlimit(RLIMIT_FSIZE, &rlim);
687                 getrlimit(RLIMIT_FSIZE, &rlim);
688                 if (((unsigned long) rlim.rlim_cur) <
689                     ((unsigned long) rlim.rlim_max)) {
690                         rlim.rlim_cur = rlim.rlim_max;
691                         setrlimit(RLIMIT_FSIZE, &rlim);
692                 }
693         }
694 #endif
695         *channel = io;
696         return 0;
697
698 cleanup:
699         if (data) {
700                 if (data->dev >= 0)
701                         close(data->dev);
702                 free_cache(data);
703                 ext2fs_free_mem(&data);
704         }
705         if (io) {
706                 if (io->name) {
707                         ext2fs_free_mem(&io->name);
708                 }
709                 ext2fs_free_mem(&io);
710         }
711         return retval;
712 }
713
714 static errcode_t unixfd_open(const char *str_fd, int flags,
715                              io_channel *channel)
716 {
717         int fd;
718         int fd_flags;
719
720         fd = atoi(str_fd);
721 #if defined(HAVE_FCNTL)
722         fd_flags = fcntl(fd, F_GETFD);
723         if (fd_flags == -1)
724                 return -EBADF;
725
726         flags = 0;
727         if (fd_flags & O_RDWR)
728                 flags |= IO_FLAG_RW;
729         if (fd_flags & O_EXCL)
730                 flags |= IO_FLAG_EXCLUSIVE;
731 #if defined(O_DIRECT)
732         if (fd_flags & O_DIRECT)
733                 flags |= IO_FLAG_DIRECT_IO;
734 #endif
735 #endif  /* HAVE_FCNTL */
736
737         return unix_open_channel(str_fd, fd, flags, channel, unixfd_io_manager);
738 }
739
740 static errcode_t unix_open(const char *name, int flags,
741                            io_channel *channel)
742 {
743         int fd = -1;
744         int open_flags;
745
746         if (name == 0)
747                 return EXT2_ET_BAD_DEVICE_NAME;
748
749         open_flags = (flags & IO_FLAG_RW) ? O_RDWR : O_RDONLY;
750         if (flags & IO_FLAG_EXCLUSIVE)
751                 open_flags |= O_EXCL;
752 #if defined(O_DIRECT)
753         if (flags & IO_FLAG_DIRECT_IO)
754                 open_flags |= O_DIRECT;
755 #endif
756         fd = ext2fs_open_file(name, open_flags, 0);
757         if (fd < 0)
758                 return errno;
759 #if defined(F_NOCACHE) && !defined(IO_DIRECT)
760         if (flags & IO_FLAG_DIRECT_IO) {
761                 if (fcntl(fd, F_NOCACHE, 1) < 0)
762                         return errno;
763         }
764 #endif
765         return unix_open_channel(name, fd, flags, channel, unix_io_manager);
766 }
767
768 static errcode_t unix_close(io_channel channel)
769 {
770         struct unix_private_data *data;
771         errcode_t       retval = 0;
772
773         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
774         data = (struct unix_private_data *) channel->private_data;
775         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
776
777         if (--channel->refcount > 0)
778                 return 0;
779
780 #ifndef NO_IO_CACHE
781         retval = flush_cached_blocks(channel, data, 0);
782 #endif
783
784         if (close(data->dev) < 0)
785                 retval = errno;
786         free_cache(data);
787
788         ext2fs_free_mem(&channel->private_data);
789         if (channel->name)
790                 ext2fs_free_mem(&channel->name);
791         ext2fs_free_mem(&channel);
792         return retval;
793 }
794
795 static errcode_t unix_set_blksize(io_channel channel, int blksize)
796 {
797         struct unix_private_data *data;
798         errcode_t               retval;
799
800         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
801         data = (struct unix_private_data *) channel->private_data;
802         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
803
804         if (channel->block_size != blksize) {
805 #ifndef NO_IO_CACHE
806                 if ((retval = flush_cached_blocks(channel, data, 0)))
807                         return retval;
808 #endif
809
810                 channel->block_size = blksize;
811                 free_cache(data);
812                 if ((retval = alloc_cache(channel, data)))
813                         return retval;
814         }
815         return 0;
816 }
817
818 static errcode_t unix_read_blk64(io_channel channel, unsigned long long block,
819                                int count, void *buf)
820 {
821         struct unix_private_data *data;
822         struct unix_cache *cache, *reuse[READ_DIRECT_SIZE];
823         errcode_t       retval;
824         char            *cp;
825         int             i, j;
826
827         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
828         data = (struct unix_private_data *) channel->private_data;
829         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
830
831 #ifdef NO_IO_CACHE
832         return raw_read_blk(channel, data, block, count, buf);
833 #else
834         /*
835          * If we're doing an odd-sized read or a very large read,
836          * flush out the cache and then do a direct read.
837          */
838         if (count < 0 || count > WRITE_DIRECT_SIZE) {
839                 if ((retval = flush_cached_blocks(channel, data, 0)))
840                         return retval;
841                 return raw_read_blk(channel, data, block, count, buf);
842         }
843
844         cp = buf;
845         while (count > 0) {
846                 /* If it's in the cache, use it! */
847                 if ((cache = find_cached_block(data, block, &reuse[0]))) {
848 #ifdef DEBUG
849                         printf("Using cached block %lu\n", block);
850 #endif
851                         memcpy(cp, cache->buf, channel->block_size);
852                         count--;
853                         block++;
854                         cp += channel->block_size;
855                         continue;
856                 }
857                 if (count == 1) {
858                         /*
859                          * Special case where we read directly into the
860                          * cache buffer; important in the O_DIRECT case
861                          */
862                         cache = reuse[0];
863                         reuse_cache(channel, data, cache, block);
864                         if ((retval = raw_read_blk(channel, data, block, 1,
865                                                    cache->buf))) {
866                                 cache->in_use = 0;
867                                 return retval;
868                         }
869                         memcpy(cp, cache->buf, channel->block_size);
870                         return 0;
871                 }
872
873                 /*
874                  * Find the number of uncached blocks so we can do a
875                  * single read request
876                  */
877                 for (i=1; i < count; i++)
878                         if (find_cached_block(data, block+i, &reuse[i]))
879                                 break;
880 #ifdef DEBUG
881                 printf("Reading %d blocks starting at %lu\n", i, block);
882 #endif
883                 if ((retval = raw_read_blk(channel, data, block, i, cp)))
884                         return retval;
885
886                 /* Save the results in the cache */
887                 for (j=0; j < i; j++) {
888                         count--;
889                         cache = reuse[j];
890                         reuse_cache(channel, data, cache, block++);
891                         memcpy(cache->buf, cp, channel->block_size);
892                         cp += channel->block_size;
893                 }
894         }
895         return 0;
896 #endif /* NO_IO_CACHE */
897 }
898
899 static errcode_t unix_read_blk(io_channel channel, unsigned long block,
900                                int count, void *buf)
901 {
902         return unix_read_blk64(channel, block, count, buf);
903 }
904
905 static errcode_t unix_write_blk64(io_channel channel, unsigned long long block,
906                                 int count, const void *buf)
907 {
908         struct unix_private_data *data;
909         struct unix_cache *cache, *reuse;
910         errcode_t       retval = 0;
911         const char      *cp;
912         int             writethrough;
913
914         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
915         data = (struct unix_private_data *) channel->private_data;
916         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
917
918 #ifdef NO_IO_CACHE
919         return raw_write_blk(channel, data, block, count, buf);
920 #else
921         /*
922          * If we're doing an odd-sized write or a very large write,
923          * flush out the cache completely and then do a direct write.
924          */
925         if (count < 0 || count > WRITE_DIRECT_SIZE) {
926                 if ((retval = flush_cached_blocks(channel, data, 1)))
927                         return retval;
928                 return raw_write_blk(channel, data, block, count, buf);
929         }
930
931         /*
932          * For a moderate-sized multi-block write, first force a write
933          * if we're in write-through cache mode, and then fill the
934          * cache with the blocks.
935          */
936         writethrough = channel->flags & CHANNEL_FLAGS_WRITETHROUGH;
937         if (writethrough)
938                 retval = raw_write_blk(channel, data, block, count, buf);
939
940         cp = buf;
941         while (count > 0) {
942                 cache = find_cached_block(data, block, &reuse);
943                 if (!cache) {
944                         cache = reuse;
945                         reuse_cache(channel, data, cache, block);
946                 }
947                 if (cache->buf != cp)
948                         memcpy(cache->buf, cp, channel->block_size);
949                 cache->dirty = !writethrough;
950                 count--;
951                 block++;
952                 cp += channel->block_size;
953         }
954         return retval;
955 #endif /* NO_IO_CACHE */
956 }
957
958 static errcode_t unix_cache_readahead(io_channel channel,
959                                       unsigned long long block,
960                                       unsigned long long count)
961 {
962 #ifdef POSIX_FADV_WILLNEED
963         struct unix_private_data *data;
964
965         data = (struct unix_private_data *)channel->private_data;
966         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
967         return posix_fadvise(data->dev,
968                              (ext2_loff_t)block * channel->block_size + data->offset,
969                              (ext2_loff_t)count * channel->block_size,
970                              POSIX_FADV_WILLNEED);
971 #else
972         return EXT2_ET_OP_NOT_SUPPORTED;
973 #endif
974 }
975
976 static errcode_t unix_write_blk(io_channel channel, unsigned long block,
977                                 int count, const void *buf)
978 {
979         return unix_write_blk64(channel, block, count, buf);
980 }
981
982 static errcode_t unix_write_byte(io_channel channel, unsigned long offset,
983                                  int size, const void *buf)
984 {
985         struct unix_private_data *data;
986         errcode_t       retval = 0;
987         ssize_t         actual;
988
989         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
990         data = (struct unix_private_data *) channel->private_data;
991         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
992
993         if (channel->align != 0) {
994 #ifdef ALIGN_DEBUG
995                 printf("unix_write_byte: O_DIRECT fallback\n");
996 #endif
997                 return EXT2_ET_UNIMPLEMENTED;
998         }
999
1000 #ifndef NO_IO_CACHE
1001         /*
1002          * Flush out the cache completely
1003          */
1004         if ((retval = flush_cached_blocks(channel, data, 1)))
1005                 return retval;
1006 #endif
1007
1008         if (lseek(data->dev, offset + data->offset, SEEK_SET) < 0)
1009                 return errno;
1010
1011         actual = write(data->dev, buf, size);
1012         if (actual < 0)
1013                 return errno;
1014         if (actual != size)
1015                 return EXT2_ET_SHORT_WRITE;
1016
1017         return 0;
1018 }
1019
1020 /*
1021  * Flush data buffers to disk.
1022  */
1023 static errcode_t unix_flush(io_channel channel)
1024 {
1025         struct unix_private_data *data;
1026         errcode_t retval = 0;
1027
1028         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
1029         data = (struct unix_private_data *) channel->private_data;
1030         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1031
1032 #ifndef NO_IO_CACHE
1033         retval = flush_cached_blocks(channel, data, 0);
1034 #endif
1035 #ifdef HAVE_FSYNC
1036         if (!retval && fsync(data->dev) != 0)
1037                 return errno;
1038 #endif
1039         return retval;
1040 }
1041
1042 static errcode_t unix_set_option(io_channel channel, const char *option,
1043                                  const char *arg)
1044 {
1045         struct unix_private_data *data;
1046         unsigned long long tmp;
1047         char *end;
1048
1049         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
1050         data = (struct unix_private_data *) channel->private_data;
1051         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1052
1053         if (!strcmp(option, "offset")) {
1054                 if (!arg)
1055                         return EXT2_ET_INVALID_ARGUMENT;
1056
1057                 tmp = strtoull(arg, &end, 0);
1058                 if (*end)
1059                         return EXT2_ET_INVALID_ARGUMENT;
1060                 data->offset = tmp;
1061                 if (data->offset < 0)
1062                         return EXT2_ET_INVALID_ARGUMENT;
1063                 return 0;
1064         }
1065         return EXT2_ET_INVALID_ARGUMENT;
1066 }
1067
1068 #if defined(__linux__) && !defined(BLKDISCARD)
1069 #define BLKDISCARD              _IO(0x12,119)
1070 #endif
1071
1072 static errcode_t unix_discard(io_channel channel, unsigned long long block,
1073                               unsigned long long count)
1074 {
1075         struct unix_private_data *data;
1076         int             ret;
1077
1078         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
1079         data = (struct unix_private_data *) channel->private_data;
1080         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1081
1082         if (channel->flags & CHANNEL_FLAGS_BLOCK_DEVICE) {
1083 #ifdef BLKDISCARD
1084                 __u64 range[2];
1085
1086                 range[0] = (__u64)(block) * channel->block_size + data->offset;
1087                 range[1] = (__u64)(count) * channel->block_size;
1088
1089                 ret = ioctl(data->dev, BLKDISCARD, &range);
1090 #else
1091                 goto unimplemented;
1092 #endif
1093         } else {
1094 #if defined(HAVE_FALLOCATE) && defined(FALLOC_FL_PUNCH_HOLE)
1095                 /*
1096                  * If we are not on block device, try to use punch hole
1097                  * to reclaim free space.
1098                  */
1099                 ret = fallocate(data->dev,
1100                                 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1101                                 (off_t)(block) * channel->block_size + data->offset,
1102                                 (off_t)(count) * channel->block_size);
1103 #else
1104                 goto unimplemented;
1105 #endif
1106         }
1107         if (ret < 0) {
1108                 if (errno == EOPNOTSUPP)
1109                         goto unimplemented;
1110                 return errno;
1111         }
1112         return 0;
1113 unimplemented:
1114         return EXT2_ET_UNIMPLEMENTED;
1115 }
1116
1117 /* parameters might not be used if OS doesn't support zeroout */
1118 #pragma GCC diagnostic push
1119 #pragma GCC diagnostic ignored "-Wunused-parameter"
1120 static errcode_t unix_zeroout(io_channel channel, unsigned long long block,
1121                               unsigned long long count)
1122 {
1123         struct unix_private_data *data;
1124         int             ret;
1125
1126         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
1127         data = (struct unix_private_data *) channel->private_data;
1128         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1129
1130         if (safe_getenv("UNIX_IO_NOZEROOUT"))
1131                 goto unimplemented;
1132
1133         if (channel->flags & CHANNEL_FLAGS_BLOCK_DEVICE) {
1134                 /* Not implemented until the BLKZEROOUT mess is fixed */
1135                 goto unimplemented;
1136         } else {
1137                 /* Regular file, try to use truncate/punch/zero. */
1138                 struct stat statbuf;
1139
1140                 if (count == 0)
1141                         return 0;
1142                 /*
1143                  * If we're trying to zero a range past the end of the file,
1144                  * extend the file size, then truncate everything.
1145                  */
1146                 ret = fstat(data->dev, &statbuf);
1147                 if (ret)
1148                         goto err;
1149                 if ((unsigned long long) statbuf.st_size <
1150                         (block + count) * channel->block_size + data->offset) {
1151                         ret = ftruncate(data->dev,
1152                                         (block + count) * channel->block_size + data->offset);
1153                         if (ret)
1154                                 goto err;
1155                 }
1156 #if defined(HAVE_FALLOCATE) && (defined(FALLOC_FL_ZERO_RANGE) || \
1157         (defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE)))
1158 #if defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE)
1159                 ret = fallocate(data->dev,
1160                                 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1161                                 (off_t)(block) * channel->block_size + data->offset,
1162                                 (off_t)(count) * channel->block_size);
1163                 if (ret == 0)
1164                         goto err;
1165 #endif
1166 #ifdef FALLOC_FL_ZERO_RANGE
1167                 ret = fallocate(data->dev,
1168                                 FALLOC_FL_ZERO_RANGE,
1169                                 (off_t)(block) * channel->block_size + data->offset,
1170                                 (off_t)(count) * channel->block_size);
1171 #endif
1172 #else
1173                 goto unimplemented;
1174 #endif /* HAVE_FALLOCATE && (ZERO_RANGE || (PUNCH_HOLE && KEEP_SIZE)) */
1175         }
1176 err:
1177         if (ret < 0) {
1178                 if (errno == EOPNOTSUPP)
1179                         goto unimplemented;
1180                 return errno;
1181         }
1182         return 0;
1183 unimplemented:
1184         return EXT2_ET_UNIMPLEMENTED;
1185 }
1186 #pragma GCC diagnostic pop
1187
1188 static struct struct_io_manager struct_unix_manager = {
1189         .magic          = EXT2_ET_MAGIC_IO_MANAGER,
1190         .name           = "Unix I/O Manager",
1191         .open           = unix_open,
1192         .close          = unix_close,
1193         .set_blksize    = unix_set_blksize,
1194         .read_blk       = unix_read_blk,
1195         .write_blk      = unix_write_blk,
1196         .flush          = unix_flush,
1197         .write_byte     = unix_write_byte,
1198         .set_option     = unix_set_option,
1199         .get_stats      = unix_get_stats,
1200         .read_blk64     = unix_read_blk64,
1201         .write_blk64    = unix_write_blk64,
1202         .discard        = unix_discard,
1203         .cache_readahead        = unix_cache_readahead,
1204         .zeroout        = unix_zeroout,
1205 };
1206
1207 io_manager unix_io_manager = &struct_unix_manager;
1208
1209 static struct struct_io_manager struct_unixfd_manager = {
1210         .magic          = EXT2_ET_MAGIC_IO_MANAGER,
1211         .name           = "Unix fd I/O Manager",
1212         .open           = unixfd_open,
1213         .close          = unix_close,
1214         .set_blksize    = unix_set_blksize,
1215         .read_blk       = unix_read_blk,
1216         .write_blk      = unix_write_blk,
1217         .flush          = unix_flush,
1218         .write_byte     = unix_write_byte,
1219         .set_option     = unix_set_option,
1220         .get_stats      = unix_get_stats,
1221         .read_blk64     = unix_read_blk64,
1222         .write_blk64    = unix_write_blk64,
1223         .discard        = unix_discard,
1224         .cache_readahead        = unix_cache_readahead,
1225         .zeroout        = unix_zeroout,
1226 };
1227
1228 io_manager unixfd_io_manager = &struct_unixfd_manager;