diff --git a/README.md b/README.md index 12a4c5ad..7469f639 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ % mergerfs(1) mergerfs user manual % Antonio SJ Musumeci -% 2020-01-24 +% 2020-02-09 # NAME @@ -93,6 +93,7 @@ mergerfs does **not** support the copy-on-write (CoW) behavior found in **aufs** * **cache.entry=INT**: File name lookup cache timeout in seconds. (default: 1) * **cache.negative_entry=INT**: Negative file name lookup cache timeout in seconds. (default: 0) * **cache.files=libfuse|off|partial|full|auto-full**: File page caching mode (default: libfuse) +* **cache.writeback=BOOL**: Enable kernel writeback caching (default: false) * **cache.symlinks=BOOL**: Cache symlinks (if supported by kernel) (default: false) * **cache.readdir=BOOL**: Cache readdir (if supported by kernel) (default: false) * **direct_io**: deprecated - Bypass page cache. Use `cache.files=off` instead. (default: false) @@ -530,6 +531,15 @@ kernel documentation: https://www.kernel.org/doc/Documentation/filesystems/fuse- Given the relatively high cost of FUSE due to the kernel <-> userspace round trips there are kernel side caches for file entries and attributes. The entry cache limits the `lookup` calls to mergerfs which ask if a file exists. The attribute cache limits the need to make `getattr` calls to mergerfs which provide file attributes (mode, size, type, etc.). As with the page cache these should not be used if the underlying filesystems are being manipulated at the same time as it could lead to odd behavior or data corruption. The options for setting these are `cache.entry` and `cache.negative_entry` for the entry cache and `cache.attr` for the attributes cache. `cache.negative_entry` refers to the timeout for negative responses to lookups (non-existent files). +#### writeback caching + +When `cache.files` is enabled the default is for it to perform writethrough caching. 
This behavior won't help improve performance as each write still goes one for one through the filesystem. By enabling the FUSE writeback cache small writes may be aggregated by the kernel and then sent to mergerfs as one larger request. This can greatly improve the throughput for apps which write to files inefficiently. The amount the kernel can aggregate is limited by the size of a FUSE message. Read the `fuse_msg_size` section for more details. + +There is a small side effect as a result of enabling writeback caching. Underlying files won't ever be opened with O_APPEND or O_WRONLY. The former because the kernel then manages append mode and the latter because the kernel may request file data from mergerfs to populate the write cache. The O_APPEND change means that if a file is changed outside of mergerfs it could lead to corruption as the kernel won't know the end of the file has changed. That said any time you use caching you should keep from using the same file outside of mergerfs at the same time. + +Note that if an application is properly sizing writes then writeback caching will have little or no effect. It will only help with writes of sizes below the FUSE message size (128K on older kernels, 1M on newer). + + #### policy caching Policies are run every time a function (with a policy as mentioned above) is called. These policies can be expensive depending on mergerfs' setup and client usage patterns. Generally we wouldn't want to cache policy results because it may result in stale responses if the underlying drives are used directly. @@ -556,11 +566,6 @@ As of version 4.20 Linux supports symlink caching. Significant performance incre As of version 4.20 Linux supports readdir caching. This can have a significant impact on directory traversal. Especially when combined with entry (`cache.entry`) and attribute (`cache.attr`) caching. Setting `cache.readdir=true` will result in requesting readdir caching from the kernel on each `opendir`. 
If the kernel doesn't support readdir caching setting the option to `true` has no effect. This option is configurable at runtime via xattr `user.mergerfs.cache.readdir`. -#### writeback caching - -writeback caching is a technique for improving write speeds by batching writes at a faster device and then bulk writing to the slower device. With FUSE the kernel will wait for a number of writes to be made and then send it to the filesystem as one request. mergerfs currently uses a modified and vendor ed libfuse 2.9.7 which does not support writeback caching. Adding said feature should not be difficult but benchmarking needs to be done to see if what effect it will have. - - #### tiered caching Some storage technologies support what some call "tiered" caching. The placing of usually smaller, faster storage as a transparent cache to larger, slower storage. NVMe, SSD, Optane in front of traditional HDDs for instance. diff --git a/libfuse/include/fuse_common.h b/libfuse/include/fuse_common.h index b53e8b50..42041981 100644 --- a/libfuse/include/fuse_common.h +++ b/libfuse/include/fuse_common.h @@ -121,6 +121,7 @@ fuse_file_info #define FUSE_CAP_FLOCK_LOCKS (1 << 10) #define FUSE_CAP_IOCTL_DIR (1 << 11) #define FUSE_CAP_ASYNC_DIO (1 << 15) +#define FUSE_CAP_WRITEBACK_CACHE (1 << 16) #define FUSE_CAP_PARALLEL_DIROPS (1 << 18) #define FUSE_CAP_POSIX_ACL (1 << 19) #define FUSE_CAP_CACHE_SYMLINKS (1 << 20) diff --git a/libfuse/lib/fuse_lowlevel.c b/libfuse/lib/fuse_lowlevel.c index 01f35b45..95462fde 100644 --- a/libfuse/lib/fuse_lowlevel.c +++ b/libfuse/lib/fuse_lowlevel.c @@ -1794,6 +1794,8 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) f->conn.capable |= FUSE_CAP_PARALLEL_DIROPS; if (arg->flags & FUSE_MAX_PAGES) f->conn.capable |= FUSE_CAP_MAX_PAGES; + if (arg->flags & FUSE_WRITEBACK_CACHE) + f->conn.capable |= FUSE_CAP_WRITEBACK_CACHE; } else { f->conn.want &= ~FUSE_CAP_ASYNC_READ; f->conn.max_readahead = 0; @@ -1870,6 +1872,8 @@ static void 
do_init(fuse_req_t req, fuse_ino_t nodeid, const void *inarg) outarg.flags |= FUSE_ASYNC_DIO; if (f->conn.want & FUSE_CAP_PARALLEL_DIROPS) outarg.flags |= FUSE_PARALLEL_DIROPS; + if (f->conn.want & FUSE_CAP_WRITEBACK_CACHE) + outarg.flags |= FUSE_WRITEBACK_CACHE; outarg.max_readahead = f->conn.max_readahead; outarg.max_write = f->conn.max_write; if (f->conn.proto_minor >= 13) { diff --git a/man/mergerfs.1 b/man/mergerfs.1 index 915540c6..ebbe9d22 100644 --- a/man/mergerfs.1 +++ b/man/mergerfs.1 @@ -1,7 +1,7 @@ .\"t .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "mergerfs" "1" "2020\-01\-24" "mergerfs user manual" "" +.TH "mergerfs" "1" "2020\-02\-09" "mergerfs user manual" "" .hy .SH NAME .PP @@ -232,6 +232,9 @@ timeout in seconds. \f[B]cache.files=libfuse|off|partial|full|auto\-full\f[]: File page caching mode (default: libfuse) .IP \[bu] 2 +\f[B]cache.writeback=BOOL\f[]: Enable kernel writeback caching (default: +false) +.IP \[bu] 2 \f[B]cache.symlinks=BOOL\f[]: Cache symlinks (if supported by kernel) (default: false) .IP \[bu] 2 @@ -1167,6 +1170,35 @@ The options for setting these are \f[C]cache.entry\f[] and \f[C]cache.attr\f[] for the attributes cache. \f[C]cache.negative_entry\f[] refers to the timeout for negative responses to lookups (non\-existent files). +.SS writeback caching +.PP +When \f[C]cache.files\f[] is enabled the default is for it to perform +writethrough caching. +This behavior won\[aq]t help improve performance as each write still +goes one for one through the filesystem. +By enabling the FUSE writeback cache small writes may be aggregated by +the kernel and then sent to mergerfs as one larger request. +This can greatly improve the throughput for apps which write to files +inefficiently. +The amount the kernel can aggregate is limited by the size of a FUSE +message. +Read the \f[C]fuse_msg_size\f[] section for more details. +.PP +There is a small side effect as a result of enabling writeback caching. 
+Underlying files won\[aq]t ever be opened with O_APPEND or O_WRONLY. +The former because the kernel then manages append mode and the latter +because the kernel may request file data from mergerfs to populate the +write cache. +The O_APPEND change means that if a file is changed outside of mergerfs +it could lead to corruption as the kernel won\[aq]t know the end of the +file has changed. +That said any time you use caching you should keep from using the same +file outside of mergerfs at the same time. +.PP +Note that if an application is properly sizing writes then writeback +caching will have little or no effect. +It will only help with writes of sizes below the FUSE message size (128K +on older kernels, 1M on newer). .SS policy caching .PP Policies are run every time a function (with a policy as mentioned @@ -1224,16 +1256,6 @@ If the kernel doesn\[aq]t support readdir caching setting the option to \f[C]true\f[] has no effect. This option is configurable at runtime via xattr \f[C]user.mergerfs.cache.readdir\f[]. -.SS writeback caching -.PP -writeback caching is a technique for improving write speeds by batching -writes at a faster device and then bulk writing to the slower device. -With FUSE the kernel will wait for a number of writes to be made and -then send it to the filesystem as one request. -mergerfs currently uses a modified and vendor ed libfuse 2.9.7 which -does not support writeback caching. -Adding said feature should not be difficult but benchmarking needs to be -done to see if what effect it will have. .SS tiered caching .PP Some storage technologies support what some call "tiered" caching. 
diff --git a/src/config.cpp b/src/config.cpp index 22f7c845..f9d54709 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -52,6 +52,7 @@ Config::Config() cache_symlinks(false), cache_readdir(false), async_read(true), + writeback_cache(false), cache_files(CacheFiles::LIBFUSE), fuse_msg_size(FUSE_MAX_MAX_PAGES), POLICYINIT(access), diff --git a/src/config.hpp b/src/config.hpp index a9506173..f8ca93e1 100644 --- a/src/config.hpp +++ b/src/config.hpp @@ -112,6 +112,7 @@ public: bool cache_symlinks; bool cache_readdir; bool async_read; + bool writeback_cache; CacheFiles cache_files; uint16_t fuse_msg_size; diff --git a/src/fuse_create.cpp b/src/fuse_create.cpp index 70204750..80026775 100644 --- a/src/fuse_create.cpp +++ b/src/fuse_create.cpp @@ -35,6 +35,30 @@ typedef Config::CacheFiles CacheFiles; namespace l { + /* + The kernel expects being able to issue read requests when running + with writeback caching enabled so we must change O_WRONLY to + O_RDWR. + + With writeback caching enabled the kernel handles O_APPEND. Could + be an issue if the underlying file changes out of band but that is + true of any caching. 
+ */ + static + int + tweak_flags_writeback_cache(const int flags_) + { + int flags; + + flags = flags_; + if((flags & O_ACCMODE) == O_WRONLY) + flags = ((flags & ~O_ACCMODE) | O_RDWR); + if(flags & O_APPEND) + flags &= ~O_APPEND; + + return flags; + } + static int create_core(const string &fullpath_, @@ -153,6 +177,9 @@ namespace FUSE break; } + if(config.writeback_cache) + ffi_->flags = l::tweak_flags_writeback_cache(ffi_->flags); + return l::create(config.getattr, config.create, config.branches, diff --git a/src/fuse_init.cpp b/src/fuse_init.cpp index 5bfb91c3..a5375fda 100644 --- a/src/fuse_init.cpp +++ b/src/fuse_init.cpp @@ -96,6 +96,7 @@ namespace FUSE l::want_if_capable(conn_,FUSE_CAP_IOCTL_DIR); l::want_if_capable(conn_,FUSE_CAP_PARALLEL_DIROPS); l::want_if_capable(conn_,FUSE_CAP_POSIX_ACL,&c.posix_acl); + l::want_if_capable(conn_,FUSE_CAP_WRITEBACK_CACHE,&c.writeback_cache); l::want_if_capable_max_pages(conn_,c); return &c; diff --git a/src/fuse_open.cpp b/src/fuse_open.cpp index 3f87916a..4ff7b027 100644 --- a/src/fuse_open.cpp +++ b/src/fuse_open.cpp @@ -35,6 +35,21 @@ typedef Config::CacheFiles CacheFiles; namespace l { + static + int + tweak_flags_writeback_cache(const int flags_) + { + int flags; + + flags = flags_; + if((flags & O_ACCMODE) == O_WRONLY) + flags = ((flags & ~O_ACCMODE) | O_RDWR); + if(flags & O_APPEND) + flags &= ~O_APPEND; + + return flags; + } + static int open_core(const string &basepath_, @@ -122,6 +137,9 @@ namespace FUSE break; } + if(config.writeback_cache) + ffi_->flags = l::tweak_flags_writeback_cache(ffi_->flags); + return l::open(config.open, config.open_cache, config.branches, diff --git a/src/option_parser.cpp b/src/option_parser.cpp index 2206e921..a1ce5f23 100644 --- a/src/option_parser.cpp +++ b/src/option_parser.cpp @@ -281,6 +281,8 @@ parse_and_process_cache(Config &config_, return parse_and_process(value_,config_.cache_readdir); else if(func_ == "files") return parse_and_process(value_,config_.cache_files); + else 
if(func_ == "writeback") + return parse_and_process(value_,config_.writeback_cache); return 1; } @@ -461,6 +463,10 @@ usage(void) " * full: Keep cache on file open\n" " * auto-full: Keep cache if mtime & size not changed\n" " default = libfuse\n" + " -o cache.writeback=\n" + " Enable kernel writeback caching (if supported)\n" + " cache.files must be enabled as well.\n" + " default = false\n" " -o cache.symlinks=\n" " Enable kernel caching of symlinks (if supported)\n" " default = false\n"