Whamcloud - gitweb
libext2fs: unix_io: reflect error from read/write calls to caller
[tools/e2fsprogs.git] / lib / ext2fs / unix_io.c
1 /*
2  * unix_io.c --- This is the Unix (well, really POSIX) implementation
3  *      of the I/O manager.
4  *
5  * Implements a small block cache (CACHE_SIZE entries) with optional write-through.
6  *
7  * Includes support for Windows NT under Cygwin.
8  *
9  * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
10  *      2002 by Theodore Ts'o.
11  *
12  * %Begin-Header%
13  * This file may be redistributed under the terms of the GNU Library
14  * General Public License, version 2.
15  * %End-Header%
16  */
17
18 #if !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
19 #define _XOPEN_SOURCE 600
20 #define _DARWIN_C_SOURCE
21 #define _FILE_OFFSET_BITS 64
22 #ifndef _LARGEFILE_SOURCE
23 #define _LARGEFILE_SOURCE
24 #endif
25 #ifndef _LARGEFILE64_SOURCE
26 #define _LARGEFILE64_SOURCE
27 #endif
28 #ifndef _GNU_SOURCE
29 #define _GNU_SOURCE
30 #endif
31 #endif
32
33 #include "config.h"
34 #include <stdio.h>
35 #include <string.h>
36 #if HAVE_UNISTD_H
37 #include <unistd.h>
38 #endif
39 #if HAVE_ERRNO_H
40 #include <errno.h>
41 #endif
42 #include <fcntl.h>
43 #include <time.h>
44 #ifdef __linux__
45 #include <sys/utsname.h>
46 #endif
47 #if HAVE_SYS_TYPES_H
48 #include <sys/types.h>
49 #endif
50 #ifdef HAVE_SYS_IOCTL_H
51 #include <sys/ioctl.h>
52 #endif
53 #ifdef HAVE_SYS_MOUNT_H
54 #include <sys/mount.h>
55 #endif
56 #ifdef HAVE_SYS_PRCTL_H
57 #include <sys/prctl.h>
58 #else
59 #define PR_GET_DUMPABLE 3
60 #endif
61 #if HAVE_SYS_STAT_H
62 #include <sys/stat.h>
63 #endif
64 #if HAVE_SYS_RESOURCE_H
65 #include <sys/resource.h>
66 #endif
67 #if HAVE_LINUX_FALLOC_H
68 #include <linux/falloc.h>
69 #endif
70
71 #if defined(__linux__) && defined(_IO) && !defined(BLKROGET)
72 #define BLKROGET   _IO(0x12, 94) /* Get read-only status (0 = read_write).  */
73 #endif
74
75 #undef ALIGN_DEBUG
76
77 #include "ext2_fs.h"
78 #include "ext2fs.h"
79
/*
 * Verify a structure's magic number.
 *
 * Expands to a statement that makes the *enclosing function* return
 * `code` when `(obj)->magic` differs from `code`; otherwise execution
 * simply continues.  `code` is evaluated twice.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as one
 * statement: used as the body of an `if`, it can no longer capture a
 * following `else`.
 */
#define EXT2_CHECK_MAGIC(obj, code) \
	do { \
		if ((obj)->magic != (code)) \
			return (code); \
	} while (0)
86
/*
 * One slot of the block cache: a data buffer plus the bookkeeping used
 * to look the block up again and to pick a slot for replacement.
 */
struct unix_cache {
	/* Buffer holding the cached block's contents. */
	char			*buf;
	/* Block number currently stored in buf. */
	unsigned long long	block;
	/* Value of the owner's access counter when this slot was last used. */
	int			access_time;
	/* Set while buf holds data that still has to be written out. */
	unsigned int		dirty : 1;
	/* Set while the slot holds a valid block. */
	unsigned int		in_use : 1;
};
94
95 #define CACHE_SIZE 8
96 #define WRITE_DIRECT_SIZE 4     /* Must be smaller than CACHE_SIZE */
97 #define READ_DIRECT_SIZE 4      /* Should be smaller than CACHE_SIZE */
98
99 struct unix_private_data {
100         int     magic;
101         int     dev;
102         int     flags;
103         int     align;
104         int     access_time;
105         ext2_loff_t offset;
106         struct unix_cache cache[CACHE_SIZE];
107         void    *bounce;
108         struct struct_io_stats io_stats;
109 };
110
/*
 * Non-zero when `n` (an integer or a pointer) is a multiple of `align`.
 * `align` must be a power of two.  Both arguments are fully
 * parenthesised so that expressions such as `ptr + 1` are evaluated
 * before the conversion to uintptr_t rather than after it.
 */
#define IS_ALIGNED(n, align) \
	((((uintptr_t) (n)) & ((uintptr_t) ((align) - 1))) == 0)
113
114 static errcode_t unix_get_stats(io_channel channel, io_stats *stats)
115 {
116         errcode_t       retval = 0;
117
118         struct unix_private_data *data;
119
120         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
121         data = (struct unix_private_data *) channel->private_data;
122         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
123
124         if (stats)
125                 *stats = &data->io_stats;
126
127         return retval;
128 }
129
/*
 * getenv() wrapper that refuses to consult the environment when the
 * process runs with elevated privileges.
 *
 * Returns NULL when the real and effective user or group IDs differ or,
 * where prctl is available, when the process is not dumpable.
 * Otherwise returns what secure_getenv()/__secure_getenv()/getenv()
 * (whichever the build provides) returns for `arg`.
 */
static char *safe_getenv(const char *arg)
{
	if (getuid() != geteuid())
		return NULL;
	if (getgid() != getegid())
		return NULL;

#if defined(HAVE_PRCTL)
	if (!prctl(PR_GET_DUMPABLE, 0, 0, 0, 0))
		return NULL;
#elif (defined(linux) && defined(SYS_prctl))
	if (!syscall(SYS_prctl, PR_GET_DUMPABLE, 0, 0, 0, 0))
		return NULL;
#endif

#if defined(HAVE_SECURE_GETENV)
	return secure_getenv(arg);
#elif defined(HAVE___SECURE_GETENV)
	return __secure_getenv(arg);
#else
	return getenv(arg);
#endif
}
152
153 /*
154  * Here are the raw I/O functions
155  */
156 static errcode_t raw_read_blk(io_channel channel,
157                               struct unix_private_data *data,
158                               unsigned long long block,
159                               int count, void *bufv)
160 {
161         errcode_t       retval;
162         ssize_t         size;
163         ext2_loff_t     location;
164         int             actual = 0;
165         unsigned char   *buf = bufv;
166         ssize_t         really_read = 0;
167
168         size = (count < 0) ? -count : count * channel->block_size;
169         data->io_stats.bytes_read += size;
170         location = ((ext2_loff_t) block * channel->block_size) + data->offset;
171
172         if (data->flags & IO_FLAG_FORCE_BOUNCE) {
173                 if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
174                         retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
175                         goto error_out;
176                 }
177                 goto bounce_read;
178         }
179
180 #ifdef HAVE_PREAD64
181         /* Try an aligned pread */
182         if ((channel->align == 0) ||
183             (IS_ALIGNED(buf, channel->align) &&
184              IS_ALIGNED(size, channel->align))) {
185                 actual = pread64(data->dev, buf, size, location);
186                 if (actual == size)
187                         return 0;
188         }
189 #elif HAVE_PREAD
190         /* Try an aligned pread */
191         if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
192             ((channel->align == 0) ||
193              (IS_ALIGNED(buf, channel->align) &&
194               IS_ALIGNED(size, channel->align)))) {
195                 actual = pread(data->dev, buf, size, location);
196                 if (actual == size)
197                         return 0;
198         }
199 #endif /* HAVE_PREAD */
200
201         if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
202                 retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
203                 goto error_out;
204         }
205         if ((channel->align == 0) ||
206             (IS_ALIGNED(buf, channel->align) &&
207              IS_ALIGNED(size, channel->align))) {
208                 actual = read(data->dev, buf, size);
209                 if (actual != size) {
210                 short_read:
211                         if (actual < 0) {
212                                 retval = errno;
213                                 actual = 0;
214                         } else
215                                 retval = EXT2_ET_SHORT_READ;
216                         goto error_out;
217                 }
218                 return 0;
219         }
220
221 #ifdef ALIGN_DEBUG
222         printf("raw_read_blk: O_DIRECT fallback: %p %lu\n", buf,
223                (unsigned long) size);
224 #endif
225
226         /*
227          * The buffer or size which we're trying to read isn't aligned
228          * to the O_DIRECT rules, so we need to do this the hard way...
229          */
230 bounce_read:
231         while (size > 0) {
232                 actual = read(data->dev, data->bounce, channel->block_size);
233                 if (actual != channel->block_size) {
234                         actual = really_read;
235                         buf -= really_read;
236                         size += really_read;
237                         goto short_read;
238                 }
239                 actual = size;
240                 if (size > channel->block_size)
241                         actual = channel->block_size;
242                 memcpy(buf, data->bounce, actual);
243                 really_read += actual;
244                 size -= actual;
245                 buf += actual;
246         }
247         return 0;
248
249 error_out:
250         memset((char *) buf+actual, 0, size-actual);
251         if (channel->read_error)
252                 retval = (channel->read_error)(channel, block, count, buf,
253                                                size, actual, retval);
254         return retval;
255 }
256
257 static errcode_t raw_write_blk(io_channel channel,
258                                struct unix_private_data *data,
259                                unsigned long long block,
260                                int count, const void *bufv)
261 {
262         ssize_t         size;
263         ext2_loff_t     location;
264         int             actual = 0;
265         errcode_t       retval;
266         const unsigned char *buf = bufv;
267
268         if (count == 1)
269                 size = channel->block_size;
270         else {
271                 if (count < 0)
272                         size = -count;
273                 else
274                         size = count * channel->block_size;
275         }
276         data->io_stats.bytes_written += size;
277
278         location = ((ext2_loff_t) block * channel->block_size) + data->offset;
279
280         if (data->flags & IO_FLAG_FORCE_BOUNCE) {
281                 if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
282                         retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
283                         goto error_out;
284                 }
285                 goto bounce_write;
286         }
287
288 #ifdef HAVE_PWRITE64
289         /* Try an aligned pwrite */
290         if ((channel->align == 0) ||
291             (IS_ALIGNED(buf, channel->align) &&
292              IS_ALIGNED(size, channel->align))) {
293                 actual = pwrite64(data->dev, buf, size, location);
294                 if (actual == size)
295                         return 0;
296         }
297 #elif HAVE_PWRITE
298         /* Try an aligned pwrite */
299         if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
300             ((channel->align == 0) ||
301              (IS_ALIGNED(buf, channel->align) &&
302               IS_ALIGNED(size, channel->align)))) {
303                 actual = pwrite(data->dev, buf, size, location);
304                 if (actual == size)
305                         return 0;
306         }
307 #endif /* HAVE_PWRITE */
308
309         if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
310                 retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
311                 goto error_out;
312         }
313
314         if ((channel->align == 0) ||
315             (IS_ALIGNED(buf, channel->align) &&
316              IS_ALIGNED(size, channel->align))) {
317                 actual = write(data->dev, buf, size);
318                 if (actual < 0) {
319                         retval = errno;
320                         goto error_out;
321                 }
322                 if (actual != size) {
323                 short_write:
324                         retval = EXT2_ET_SHORT_WRITE;
325                         goto error_out;
326                 }
327                 return 0;
328         }
329
330 #ifdef ALIGN_DEBUG
331         printf("raw_write_blk: O_DIRECT fallback: %p %lu\n", buf,
332                (unsigned long) size);
333 #endif
334         /*
335          * The buffer or size which we're trying to write isn't aligned
336          * to the O_DIRECT rules, so we need to do this the hard way...
337          */
338 bounce_write:
339         while (size > 0) {
340                 if (size < channel->block_size) {
341                         actual = read(data->dev, data->bounce,
342                                       channel->block_size);
343                         if (actual != channel->block_size) {
344                                 if (actual < 0) {
345                                         retval = errno;
346                                         goto error_out;
347                                 }
348                                 memset(data->bounce + actual, 0,
349                                        channel->block_size - actual);
350                         }
351                 }
352                 actual = size;
353                 if (size > channel->block_size)
354                         actual = channel->block_size;
355                 memcpy(data->bounce, buf, actual);
356                 if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
357                         retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
358                         goto error_out;
359                 }
360                 actual = write(data->dev, data->bounce, channel->block_size);
361                 if (actual < 0) {
362                         retval = errno;
363                         goto error_out;
364                 }
365                 if (actual != channel->block_size)
366                         goto short_write;
367                 size -= actual;
368                 buf += actual;
369                 location += actual;
370         }
371         return 0;
372
373 error_out:
374         if (channel->write_error)
375                 retval = (channel->write_error)(channel, block, count, buf,
376                                                 size, actual, retval);
377         return retval;
378 }
379
380
381 /*
382  * Here we implement the cache functions
383  */
384
385 /* Allocate the cache buffers */
386 static errcode_t alloc_cache(io_channel channel,
387                              struct unix_private_data *data)
388 {
389         errcode_t               retval;
390         struct unix_cache       *cache;
391         int                     i;
392
393         data->access_time = 0;
394         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
395                 cache->block = 0;
396                 cache->access_time = 0;
397                 cache->dirty = 0;
398                 cache->in_use = 0;
399                 if (cache->buf)
400                         ext2fs_free_mem(&cache->buf);
401                 retval = io_channel_alloc_buf(channel, 0, &cache->buf);
402                 if (retval)
403                         return retval;
404         }
405         if (channel->align || data->flags & IO_FLAG_FORCE_BOUNCE) {
406                 if (data->bounce)
407                         ext2fs_free_mem(&data->bounce);
408                 retval = io_channel_alloc_buf(channel, 0, &data->bounce);
409         }
410         return retval;
411 }
412
413 /* Free the cache buffers */
414 static void free_cache(struct unix_private_data *data)
415 {
416         struct unix_cache       *cache;
417         int                     i;
418
419         data->access_time = 0;
420         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
421                 cache->block = 0;
422                 cache->access_time = 0;
423                 cache->dirty = 0;
424                 cache->in_use = 0;
425                 if (cache->buf)
426                         ext2fs_free_mem(&cache->buf);
427         }
428         if (data->bounce)
429                 ext2fs_free_mem(&data->bounce);
430 }
431
432 #ifndef NO_IO_CACHE
433 /*
434  * Try to find a block in the cache.  If the block is not found, and
435  * eldest is a non-zero pointer, then fill in eldest with the cache
436  * entry to that should be reused.
437  */
438 static struct unix_cache *find_cached_block(struct unix_private_data *data,
439                                             unsigned long long block,
440                                             struct unix_cache **eldest)
441 {
442         struct unix_cache       *cache, *unused_cache, *oldest_cache;
443         int                     i;
444
445         unused_cache = oldest_cache = 0;
446         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
447                 if (!cache->in_use) {
448                         if (!unused_cache)
449                                 unused_cache = cache;
450                         continue;
451                 }
452                 if (cache->block == block) {
453                         cache->access_time = ++data->access_time;
454                         return cache;
455                 }
456                 if (!oldest_cache ||
457                     (cache->access_time < oldest_cache->access_time))
458                         oldest_cache = cache;
459         }
460         if (eldest)
461                 *eldest = (unused_cache) ? unused_cache : oldest_cache;
462         return 0;
463 }
464
465 /*
466  * Reuse a particular cache entry for another block.
467  */
468 static void reuse_cache(io_channel channel, struct unix_private_data *data,
469                  struct unix_cache *cache, unsigned long long block)
470 {
471         if (cache->dirty && cache->in_use)
472                 raw_write_blk(channel, data, cache->block, 1, cache->buf);
473
474         cache->in_use = 1;
475         cache->dirty = 0;
476         cache->block = block;
477         cache->access_time = ++data->access_time;
478 }
479
480 /*
481  * Flush all of the blocks in the cache
482  */
483 static errcode_t flush_cached_blocks(io_channel channel,
484                                      struct unix_private_data *data,
485                                      int invalidate)
486
487 {
488         struct unix_cache       *cache;
489         errcode_t               retval, retval2;
490         int                     i;
491
492         retval2 = 0;
493         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
494                 if (!cache->in_use)
495                         continue;
496
497                 if (invalidate)
498                         cache->in_use = 0;
499
500                 if (!cache->dirty)
501                         continue;
502
503                 retval = raw_write_blk(channel, data,
504                                        cache->block, 1, cache->buf);
505                 if (retval)
506                         retval2 = retval;
507                 else
508                         cache->dirty = 0;
509         }
510         return retval2;
511 }
512 #endif /* NO_IO_CACHE */
513
514 #ifdef __linux__
515 #ifndef BLKDISCARDZEROES
516 #define BLKDISCARDZEROES _IO(0x12,124)
517 #endif
518 #endif
519
/*
 * open() wrapper that uses open64() where the build provides it.
 *
 * The mode argument is forwarded only when it is non-zero; with a zero
 * mode the two-argument form of open is used.
 *
 * Returns the new file descriptor, or -1 with errno set.
 */
int ext2fs_open_file(const char *pathname, int flags, mode_t mode)
{
#if defined(HAVE_OPEN64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
	return mode ? open64(pathname, flags, mode)
		    : open64(pathname, flags);
#else
	return mode ? open(pathname, flags, mode)
		    : open(pathname, flags);
#endif
}
533
534 int ext2fs_stat(const char *path, ext2fs_struct_stat *buf)
535 {
536 #if defined(HAVE_FSTAT64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
537         return stat64(path, buf);
538 #else
539         return stat(path, buf);
540 #endif
541 }
542
543 int ext2fs_fstat(int fd, ext2fs_struct_stat *buf)
544 {
545 #if defined(HAVE_FSTAT64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
546         return fstat64(fd, buf);
547 #else
548         return fstat(fd, buf);
549 #endif
550 }
551
552
553 static errcode_t unix_open_channel(const char *name, int fd,
554                                    int flags, io_channel *channel,
555                                    io_manager io_mgr)
556 {
557         io_channel      io = NULL;
558         struct unix_private_data *data = NULL;
559         errcode_t       retval;
560         ext2fs_struct_stat st;
561 #ifdef __linux__
562         struct          utsname ut;
563 #endif
564
565         if (safe_getenv("UNIX_IO_FORCE_BOUNCE"))
566                 flags |= IO_FLAG_FORCE_BOUNCE;
567
568         retval = ext2fs_get_mem(sizeof(struct struct_io_channel), &io);
569         if (retval)
570                 goto cleanup;
571         memset(io, 0, sizeof(struct struct_io_channel));
572         io->magic = EXT2_ET_MAGIC_IO_CHANNEL;
573         retval = ext2fs_get_mem(sizeof(struct unix_private_data), &data);
574         if (retval)
575                 goto cleanup;
576
577         io->manager = io_mgr;
578         retval = ext2fs_get_mem(strlen(name)+1, &io->name);
579         if (retval)
580                 goto cleanup;
581
582         strcpy(io->name, name);
583         io->private_data = data;
584         io->block_size = 1024;
585         io->read_error = 0;
586         io->write_error = 0;
587         io->refcount = 1;
588
589         memset(data, 0, sizeof(struct unix_private_data));
590         data->magic = EXT2_ET_MAGIC_UNIX_IO_CHANNEL;
591         data->io_stats.num_fields = 2;
592         data->flags = flags;
593         data->dev = fd;
594
595 #if defined(O_DIRECT)
596         if (flags & IO_FLAG_DIRECT_IO)
597                 io->align = ext2fs_get_dio_alignment(data->dev);
598 #elif defined(F_NOCACHE)
599         if (flags & IO_FLAG_DIRECT_IO)
600                 io->align = 4096;
601 #endif
602
603         /*
604          * If the device is really a block device, then set the
605          * appropriate flag, otherwise we can set DISCARD_ZEROES flag
606          * because we are going to use punch hole instead of discard
607          * and if it succeed, subsequent read from sparse area returns
608          * zero.
609          */
610         if (ext2fs_fstat(data->dev, &st) == 0) {
611                 if (S_ISBLK(st.st_mode))
612                         io->flags |= CHANNEL_FLAGS_BLOCK_DEVICE;
613                 else
614                         io->flags |= CHANNEL_FLAGS_DISCARD_ZEROES;
615         }
616
617 #ifdef BLKDISCARDZEROES
618         {
619                 int zeroes = 0;
620                 if (ioctl(data->dev, BLKDISCARDZEROES, &zeroes) == 0 &&
621                     zeroes)
622                         io->flags |= CHANNEL_FLAGS_DISCARD_ZEROES;
623         }
624 #endif
625
626 #if defined(__CYGWIN__)
627         /*
628          * Some operating systems require that the buffers be aligned,
629          * regardless of O_DIRECT
630          */
631         if (!io->align)
632                 io->align = 512;
633 #endif
634
635 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
636         if (io->flags & CHANNEL_FLAGS_BLOCK_DEVICE) {
637                 int dio_align = ext2fs_get_dio_alignment(fd);
638
639                 if (io->align < dio_align)
640                         io->align = dio_align;
641         }
642 #endif
643
644         if ((retval = alloc_cache(io, data)))
645                 goto cleanup;
646
647 #ifdef BLKROGET
648         if (flags & IO_FLAG_RW) {
649                 int error;
650                 int readonly = 0;
651
652                 /* Is the block device actually writable? */
653                 error = ioctl(data->dev, BLKROGET, &readonly);
654                 if (!error && readonly) {
655                         retval = EPERM;
656                         goto cleanup;
657                 }
658         }
659 #endif
660
661 #ifdef __linux__
662 #undef RLIM_INFINITY
663 #if (defined(__alpha__) || ((defined(__sparc__) || defined(__mips__)) && (SIZEOF_LONG == 4)))
664 #define RLIM_INFINITY   ((unsigned long)(~0UL>>1))
665 #else
666 #define RLIM_INFINITY  (~0UL)
667 #endif
668         /*
669          * Work around a bug in 2.4.10-2.4.18 kernels where writes to
670          * block devices are wrongly getting hit by the filesize
671          * limit.  This workaround isn't perfect, since it won't work
672          * if glibc wasn't built against 2.2 header files.  (Sigh.)
673          *
674          */
675         if ((flags & IO_FLAG_RW) &&
676             (uname(&ut) == 0) &&
677             ((ut.release[0] == '2') && (ut.release[1] == '.') &&
678              (ut.release[2] == '4') && (ut.release[3] == '.') &&
679              (ut.release[4] == '1') && (ut.release[5] >= '0') &&
680              (ut.release[5] < '8')) &&
681             (ext2fs_fstat(data->dev, &st) == 0) &&
682             (S_ISBLK(st.st_mode))) {
683                 struct rlimit   rlim;
684
685                 rlim.rlim_cur = rlim.rlim_max = (unsigned long) RLIM_INFINITY;
686                 setrlimit(RLIMIT_FSIZE, &rlim);
687                 getrlimit(RLIMIT_FSIZE, &rlim);
688                 if (((unsigned long) rlim.rlim_cur) <
689                     ((unsigned long) rlim.rlim_max)) {
690                         rlim.rlim_cur = rlim.rlim_max;
691                         setrlimit(RLIMIT_FSIZE, &rlim);
692                 }
693         }
694 #endif
695         *channel = io;
696         return 0;
697
698 cleanup:
699         if (data) {
700                 if (data->dev >= 0)
701                         close(data->dev);
702                 free_cache(data);
703                 ext2fs_free_mem(&data);
704         }
705         if (io) {
706                 if (io->name) {
707                         ext2fs_free_mem(&io->name);
708                 }
709                 ext2fs_free_mem(&io);
710         }
711         return retval;
712 }
713
714 static errcode_t unixfd_open(const char *str_fd, int flags,
715                              io_channel *channel)
716 {
717         int fd;
718         int fd_flags;
719
720         fd = atoi(str_fd);
721         fd_flags = fcntl(fd, F_GETFD);
722         if (fd_flags == -1)
723                 return -EBADF;
724
725         flags = 0;
726         if (fd_flags & O_RDWR)
727                 flags |= IO_FLAG_RW;
728         if (fd_flags & O_EXCL)
729                 flags |= IO_FLAG_EXCLUSIVE;
730 #if defined(O_DIRECT)
731         if (fd_flags & O_DIRECT)
732                 flags |= IO_FLAG_DIRECT_IO;
733 #endif
734
735         return unix_open_channel(str_fd, fd, flags, channel, unixfd_io_manager);
736 }
737
738 static errcode_t unix_open(const char *name, int flags,
739                            io_channel *channel)
740 {
741         int fd = -1;
742         int open_flags;
743
744         if (name == 0)
745                 return EXT2_ET_BAD_DEVICE_NAME;
746
747         open_flags = (flags & IO_FLAG_RW) ? O_RDWR : O_RDONLY;
748         if (flags & IO_FLAG_EXCLUSIVE)
749                 open_flags |= O_EXCL;
750 #if defined(O_DIRECT)
751         if (flags & IO_FLAG_DIRECT_IO)
752                 open_flags |= O_DIRECT;
753 #endif
754         fd = ext2fs_open_file(name, open_flags, 0);
755         if (fd < 0)
756                 return errno;
757 #if defined(F_NOCACHE) && !defined(IO_DIRECT)
758         if (flags & IO_FLAG_DIRECT_IO) {
759                 if (fcntl(fd, F_NOCACHE, 1) < 0)
760                         return errno;
761         }
762 #endif
763         return unix_open_channel(name, fd, flags, channel, unix_io_manager);
764 }
765
766 static errcode_t unix_close(io_channel channel)
767 {
768         struct unix_private_data *data;
769         errcode_t       retval = 0;
770
771         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
772         data = (struct unix_private_data *) channel->private_data;
773         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
774
775         if (--channel->refcount > 0)
776                 return 0;
777
778 #ifndef NO_IO_CACHE
779         retval = flush_cached_blocks(channel, data, 0);
780 #endif
781
782         if (close(data->dev) < 0)
783                 retval = errno;
784         free_cache(data);
785
786         ext2fs_free_mem(&channel->private_data);
787         if (channel->name)
788                 ext2fs_free_mem(&channel->name);
789         ext2fs_free_mem(&channel);
790         return retval;
791 }
792
793 static errcode_t unix_set_blksize(io_channel channel, int blksize)
794 {
795         struct unix_private_data *data;
796         errcode_t               retval;
797
798         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
799         data = (struct unix_private_data *) channel->private_data;
800         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
801
802         if (channel->block_size != blksize) {
803 #ifndef NO_IO_CACHE
804                 if ((retval = flush_cached_blocks(channel, data, 0)))
805                         return retval;
806 #endif
807
808                 channel->block_size = blksize;
809                 free_cache(data);
810                 if ((retval = alloc_cache(channel, data)))
811                         return retval;
812         }
813         return 0;
814 }
815
816 static errcode_t unix_read_blk64(io_channel channel, unsigned long long block,
817                                int count, void *buf)
818 {
819         struct unix_private_data *data;
820         struct unix_cache *cache, *reuse[READ_DIRECT_SIZE];
821         errcode_t       retval;
822         char            *cp;
823         int             i, j;
824
825         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
826         data = (struct unix_private_data *) channel->private_data;
827         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
828
829 #ifdef NO_IO_CACHE
830         return raw_read_blk(channel, data, block, count, buf);
831 #else
832         /*
833          * If we're doing an odd-sized read or a very large read,
834          * flush out the cache and then do a direct read.
835          */
836         if (count < 0 || count > WRITE_DIRECT_SIZE) {
837                 if ((retval = flush_cached_blocks(channel, data, 0)))
838                         return retval;
839                 return raw_read_blk(channel, data, block, count, buf);
840         }
841
842         cp = buf;
843         while (count > 0) {
844                 /* If it's in the cache, use it! */
845                 if ((cache = find_cached_block(data, block, &reuse[0]))) {
846 #ifdef DEBUG
847                         printf("Using cached block %lu\n", block);
848 #endif
849                         memcpy(cp, cache->buf, channel->block_size);
850                         count--;
851                         block++;
852                         cp += channel->block_size;
853                         continue;
854                 }
855                 if (count == 1) {
856                         /*
857                          * Special case where we read directly into the
858                          * cache buffer; important in the O_DIRECT case
859                          */
860                         cache = reuse[0];
861                         reuse_cache(channel, data, cache, block);
862                         if ((retval = raw_read_blk(channel, data, block, 1,
863                                                    cache->buf))) {
864                                 cache->in_use = 0;
865                                 return retval;
866                         }
867                         memcpy(cp, cache->buf, channel->block_size);
868                         return 0;
869                 }
870
871                 /*
872                  * Find the number of uncached blocks so we can do a
873                  * single read request
874                  */
875                 for (i=1; i < count; i++)
876                         if (find_cached_block(data, block+i, &reuse[i]))
877                                 break;
878 #ifdef DEBUG
879                 printf("Reading %d blocks starting at %lu\n", i, block);
880 #endif
881                 if ((retval = raw_read_blk(channel, data, block, i, cp)))
882                         return retval;
883
884                 /* Save the results in the cache */
885                 for (j=0; j < i; j++) {
886                         count--;
887                         cache = reuse[j];
888                         reuse_cache(channel, data, cache, block++);
889                         memcpy(cache->buf, cp, channel->block_size);
890                         cp += channel->block_size;
891                 }
892         }
893         return 0;
894 #endif /* NO_IO_CACHE */
895 }
896
897 static errcode_t unix_read_blk(io_channel channel, unsigned long block,
898                                int count, void *buf)
899 {
900         return unix_read_blk64(channel, block, count, buf);
901 }
902
903 static errcode_t unix_write_blk64(io_channel channel, unsigned long long block,
904                                 int count, const void *buf)
905 {
906         struct unix_private_data *data;
907         struct unix_cache *cache, *reuse;
908         errcode_t       retval = 0;
909         const char      *cp;
910         int             writethrough;
911
912         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
913         data = (struct unix_private_data *) channel->private_data;
914         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
915
916 #ifdef NO_IO_CACHE
917         return raw_write_blk(channel, data, block, count, buf);
918 #else
919         /*
920          * If we're doing an odd-sized write or a very large write,
921          * flush out the cache completely and then do a direct write.
922          */
923         if (count < 0 || count > WRITE_DIRECT_SIZE) {
924                 if ((retval = flush_cached_blocks(channel, data, 1)))
925                         return retval;
926                 return raw_write_blk(channel, data, block, count, buf);
927         }
928
929         /*
930          * For a moderate-sized multi-block write, first force a write
931          * if we're in write-through cache mode, and then fill the
932          * cache with the blocks.
933          */
934         writethrough = channel->flags & CHANNEL_FLAGS_WRITETHROUGH;
935         if (writethrough)
936                 retval = raw_write_blk(channel, data, block, count, buf);
937
938         cp = buf;
939         while (count > 0) {
940                 cache = find_cached_block(data, block, &reuse);
941                 if (!cache) {
942                         cache = reuse;
943                         reuse_cache(channel, data, cache, block);
944                 }
945                 if (cache->buf != cp)
946                         memcpy(cache->buf, cp, channel->block_size);
947                 cache->dirty = !writethrough;
948                 count--;
949                 block++;
950                 cp += channel->block_size;
951         }
952         return retval;
953 #endif /* NO_IO_CACHE */
954 }
955
956 static errcode_t unix_cache_readahead(io_channel channel,
957                                       unsigned long long block,
958                                       unsigned long long count)
959 {
960 #ifdef POSIX_FADV_WILLNEED
961         struct unix_private_data *data;
962
963         data = (struct unix_private_data *)channel->private_data;
964         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
965         return posix_fadvise(data->dev,
966                              (ext2_loff_t)block * channel->block_size + data->offset,
967                              (ext2_loff_t)count * channel->block_size,
968                              POSIX_FADV_WILLNEED);
969 #else
970         return EXT2_ET_OP_NOT_SUPPORTED;
971 #endif
972 }
973
974 static errcode_t unix_write_blk(io_channel channel, unsigned long block,
975                                 int count, const void *buf)
976 {
977         return unix_write_blk64(channel, block, count, buf);
978 }
979
980 static errcode_t unix_write_byte(io_channel channel, unsigned long offset,
981                                  int size, const void *buf)
982 {
983         struct unix_private_data *data;
984         errcode_t       retval = 0;
985         ssize_t         actual;
986
987         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
988         data = (struct unix_private_data *) channel->private_data;
989         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
990
991         if (channel->align != 0) {
992 #ifdef ALIGN_DEBUG
993                 printf("unix_write_byte: O_DIRECT fallback\n");
994 #endif
995                 return EXT2_ET_UNIMPLEMENTED;
996         }
997
998 #ifndef NO_IO_CACHE
999         /*
1000          * Flush out the cache completely
1001          */
1002         if ((retval = flush_cached_blocks(channel, data, 1)))
1003                 return retval;
1004 #endif
1005
1006         if (lseek(data->dev, offset + data->offset, SEEK_SET) < 0)
1007                 return errno;
1008
1009         actual = write(data->dev, buf, size);
1010         if (actual < 0)
1011                 return errno;
1012         if (actual != size)
1013                 return EXT2_ET_SHORT_WRITE;
1014
1015         return 0;
1016 }
1017
1018 /*
1019  * Flush data buffers to disk.
1020  */
1021 static errcode_t unix_flush(io_channel channel)
1022 {
1023         struct unix_private_data *data;
1024         errcode_t retval = 0;
1025
1026         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
1027         data = (struct unix_private_data *) channel->private_data;
1028         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1029
1030 #ifndef NO_IO_CACHE
1031         retval = flush_cached_blocks(channel, data, 0);
1032 #endif
1033         fsync(data->dev);
1034         return retval;
1035 }
1036
1037 static errcode_t unix_set_option(io_channel channel, const char *option,
1038                                  const char *arg)
1039 {
1040         struct unix_private_data *data;
1041         unsigned long long tmp;
1042         char *end;
1043
1044         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
1045         data = (struct unix_private_data *) channel->private_data;
1046         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1047
1048         if (!strcmp(option, "offset")) {
1049                 if (!arg)
1050                         return EXT2_ET_INVALID_ARGUMENT;
1051
1052                 tmp = strtoull(arg, &end, 0);
1053                 if (*end)
1054                         return EXT2_ET_INVALID_ARGUMENT;
1055                 data->offset = tmp;
1056                 if (data->offset < 0)
1057                         return EXT2_ET_INVALID_ARGUMENT;
1058                 return 0;
1059         }
1060         return EXT2_ET_INVALID_ARGUMENT;
1061 }
1062
1063 #if defined(__linux__) && !defined(BLKDISCARD)
1064 #define BLKDISCARD              _IO(0x12,119)
1065 #endif
1066
1067 static errcode_t unix_discard(io_channel channel, unsigned long long block,
1068                               unsigned long long count)
1069 {
1070         struct unix_private_data *data;
1071         int             ret;
1072
1073         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
1074         data = (struct unix_private_data *) channel->private_data;
1075         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1076
1077         if (channel->flags & CHANNEL_FLAGS_BLOCK_DEVICE) {
1078 #ifdef BLKDISCARD
1079                 __u64 range[2];
1080
1081                 range[0] = (__u64)(block) * channel->block_size + data->offset;
1082                 range[1] = (__u64)(count) * channel->block_size;
1083
1084                 ret = ioctl(data->dev, BLKDISCARD, &range);
1085 #else
1086                 goto unimplemented;
1087 #endif
1088         } else {
1089 #if defined(HAVE_FALLOCATE) && defined(FALLOC_FL_PUNCH_HOLE)
1090                 /*
1091                  * If we are not on block device, try to use punch hole
1092                  * to reclaim free space.
1093                  */
1094                 ret = fallocate(data->dev,
1095                                 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1096                                 (off_t)(block) * channel->block_size + data->offset,
1097                                 (off_t)(count) * channel->block_size);
1098 #else
1099                 goto unimplemented;
1100 #endif
1101         }
1102         if (ret < 0) {
1103                 if (errno == EOPNOTSUPP)
1104                         goto unimplemented;
1105                 return errno;
1106         }
1107         return 0;
1108 unimplemented:
1109         return EXT2_ET_UNIMPLEMENTED;
1110 }
1111
1112 /* parameters might not be used if OS doesn't support zeroout */
1113 #pragma GCC diagnostic push
1114 #pragma GCC diagnostic ignored "-Wunused-parameter"
1115 static errcode_t unix_zeroout(io_channel channel, unsigned long long block,
1116                               unsigned long long count)
1117 {
1118         struct unix_private_data *data;
1119         int             ret;
1120
1121         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
1122         data = (struct unix_private_data *) channel->private_data;
1123         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1124
1125         if (safe_getenv("UNIX_IO_NOZEROOUT"))
1126                 goto unimplemented;
1127
1128         if (channel->flags & CHANNEL_FLAGS_BLOCK_DEVICE) {
1129                 /* Not implemented until the BLKZEROOUT mess is fixed */
1130                 goto unimplemented;
1131         } else {
1132                 /* Regular file, try to use truncate/punch/zero. */
1133                 struct stat statbuf;
1134
1135                 if (count == 0)
1136                         return 0;
1137                 /*
1138                  * If we're trying to zero a range past the end of the file,
1139                  * extend the file size, then truncate everything.
1140                  */
1141                 ret = fstat(data->dev, &statbuf);
1142                 if (ret)
1143                         goto err;
1144                 if ((unsigned long long) statbuf.st_size <
1145                         (block + count) * channel->block_size + data->offset) {
1146                         ret = ftruncate(data->dev,
1147                                         (block + count) * channel->block_size + data->offset);
1148                         if (ret)
1149                                 goto err;
1150                 }
1151 #if defined(HAVE_FALLOCATE) && (defined(FALLOC_FL_ZERO_RANGE) || \
1152         (defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE)))
1153 #if defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE)
1154                 ret = fallocate(data->dev,
1155                                 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1156                                 (off_t)(block) * channel->block_size + data->offset,
1157                                 (off_t)(count) * channel->block_size);
1158                 if (ret == 0)
1159                         goto err;
1160 #endif
1161 #ifdef FALLOC_FL_ZERO_RANGE
1162                 ret = fallocate(data->dev,
1163                                 FALLOC_FL_ZERO_RANGE,
1164                                 (off_t)(block) * channel->block_size + data->offset,
1165                                 (off_t)(count) * channel->block_size);
1166 #endif
1167 #else
1168                 goto unimplemented;
1169 #endif /* HAVE_FALLOCATE && (ZERO_RANGE || (PUNCH_HOLE && KEEP_SIZE)) */
1170         }
1171 err:
1172         if (ret < 0) {
1173                 if (errno == EOPNOTSUPP)
1174                         goto unimplemented;
1175                 return errno;
1176         }
1177         return 0;
1178 unimplemented:
1179         return EXT2_ET_UNIMPLEMENTED;
1180 }
1181 #pragma GCC diagnostic pop
1182
1183 static struct struct_io_manager struct_unix_manager = {
1184         .magic          = EXT2_ET_MAGIC_IO_MANAGER,
1185         .name           = "Unix I/O Manager",
1186         .open           = unix_open,
1187         .close          = unix_close,
1188         .set_blksize    = unix_set_blksize,
1189         .read_blk       = unix_read_blk,
1190         .write_blk      = unix_write_blk,
1191         .flush          = unix_flush,
1192         .write_byte     = unix_write_byte,
1193         .set_option     = unix_set_option,
1194         .get_stats      = unix_get_stats,
1195         .read_blk64     = unix_read_blk64,
1196         .write_blk64    = unix_write_blk64,
1197         .discard        = unix_discard,
1198         .cache_readahead        = unix_cache_readahead,
1199         .zeroout        = unix_zeroout,
1200 };
1201
1202 io_manager unix_io_manager = &struct_unix_manager;
1203
1204 static struct struct_io_manager struct_unixfd_manager = {
1205         .magic          = EXT2_ET_MAGIC_IO_MANAGER,
1206         .name           = "Unix fd I/O Manager",
1207         .open           = unixfd_open,
1208         .close          = unix_close,
1209         .set_blksize    = unix_set_blksize,
1210         .read_blk       = unix_read_blk,
1211         .write_blk      = unix_write_blk,
1212         .flush          = unix_flush,
1213         .write_byte     = unix_write_byte,
1214         .set_option     = unix_set_option,
1215         .get_stats      = unix_get_stats,
1216         .read_blk64     = unix_read_blk64,
1217         .write_blk64    = unix_write_blk64,
1218         .discard        = unix_discard,
1219         .cache_readahead        = unix_cache_readahead,
1220         .zeroout        = unix_zeroout,
1221 };
1222
1223 io_manager unixfd_io_manager = &struct_unixfd_manager;