libfsdriver: support mmap on FSes with no device

This patch adds (very limited) support for memory-mapping pages on
file systems that are mounted on the special "none" device and that
do not implement PEEK support by themselves.  This includes hgfs,
vbfs, and procfs.

The solution is implemented in libfsdriver, and consists of allocating
pages, filling them with content by calling the file system's READ
functionality, passing the pages to VM, and freeing them again.  A new
VM flag is used to indicate that these pages should be mapped in only
once, and thus not cached beyond their single use.  This prevents
stale data from getting mapped in without the involvement of the file
system, which would be problematic on file systems where file contents
may become outdated at any time.  No VM caching means no sharing and
poor performance, but mmap no longer fails on these file systems.

Compared to a libc-based approach, this patch retains the on-demand
nature of mmap.  Especially tail(1) is known to map in a large file
area only to use a small portion of it.

All file systems now need to be given permission for the SETCACHEPAGE
and CLEARCACHE calls to VM.

A very basic regression test is added to test74.

Change-Id: I17afc4cb97315b515cad1542521b98f293b6b559
This commit is contained in:
David van Moolenbroek 2014-11-15 10:14:00 +00:00
parent 289b04677a
commit e321f65582
13 changed files with 200 additions and 21 deletions

View file

@ -385,6 +385,8 @@ service procfs
; ;
vm vm
INFO INFO
SETCACHEPAGE
CLEARCACHE
; ;
uid 0; uid 0;
}; };
@ -402,6 +404,10 @@ service hgfs
ipc ipc
SYSTEM pm vfs rs vm SYSTEM pm vfs rs vm
; ;
vm
SETCACHEPAGE
CLEARCACHE
;
}; };
service vbfs service vbfs
@ -409,6 +415,10 @@ service vbfs
ipc ipc
SYSTEM pm vfs rs ds vm vbox SYSTEM pm vfs rs ds vm vbox
; ;
vm
SETCACHEPAGE
CLEARCACHE
;
}; };
service printer service printer
@ -554,6 +564,10 @@ service hello
service devman service devman
{ {
uid 0; uid 0;
vm
SETCACHEPAGE
CLEARCACHE
;
}; };
service mmc service mmc
@ -589,6 +603,10 @@ service gpio
IRQCTL # 19 IRQCTL # 19
PADCONF # 57 PADCONF # 57
; ;
vm
SETCACHEPAGE
CLEARCACHE
;
irq irq
29 # GPIO module 1 (dm37xx) 29 # GPIO module 1 (dm37xx)
30 # GPIO module 2 (dm37xx) 30 # GPIO module 2 (dm37xx)

View file

@ -71,7 +71,8 @@ int vm_procctl_clear(endpoint_t ep);
int vm_procctl_handlemem(endpoint_t ep, vir_bytes m1, vir_bytes m2, int wr); int vm_procctl_handlemem(endpoint_t ep, vir_bytes m1, vir_bytes m2, int wr);
int vm_set_cacheblock(void *block, dev_t dev, off_t dev_offset, int vm_set_cacheblock(void *block, dev_t dev, off_t dev_offset,
ino_t ino, off_t ino_offset, u32_t *flags, int blocksize); ino_t ino, off_t ino_offset, u32_t *flags, int blocksize,
int setflags);
void *vm_map_cacheblock(dev_t dev, off_t dev_offset, void *vm_map_cacheblock(dev_t dev, off_t dev_offset,
ino_t ino, off_t ino_offset, u32_t *flags, int blocksize); ino_t ino, off_t ino_offset, u32_t *flags, int blocksize);
@ -87,5 +88,8 @@ int vm_clear_cache(dev_t dev);
/* special inode number for vm cache functions */ /* special inode number for vm cache functions */
#define VMC_NO_INODE 0 /* to reference a disk block, no associated file */ #define VMC_NO_INODE 0 /* to reference a disk block, no associated file */
/* setflags for vm_set_cacheblock, also used internally in VM */
#define VMSF_ONCE 0x01 /* discard block after one-time use */
#endif /* _MINIX_VM_H */ #endif /* _MINIX_VM_H */

View file

@ -1,6 +1,7 @@
#include "fsdriver.h" #include "fsdriver.h"
#include <minix/ds.h> #include <minix/ds.h>
#include <sys/mman.h>
/* /*
* Process a READSUPER request from VFS. * Process a READSUPER request from VFS.
@ -43,7 +44,8 @@ fsdriver_readsuper(const struct fsdriver * __restrict fdp,
if (r == OK) { if (r == OK) {
/* This one we can set on the file system's behalf. */ /* This one we can set on the file system's behalf. */
if (fdp->fdr_peek != NULL && fdp->fdr_bpeek != NULL) if ((fdp->fdr_peek != NULL && fdp->fdr_bpeek != NULL) ||
major(dev) == NONE_MAJOR)
res_flags |= RES_HASPEEK; res_flags |= RES_HASPEEK;
m_out->m_fs_vfs_readsuper.inode = root_node.fn_ino_nr; m_out->m_fs_vfs_readsuper.inode = root_node.fn_ino_nr;
@ -74,6 +76,10 @@ fsdriver_unmount(const struct fsdriver * __restrict fdp,
if (fdp->fdr_unmount != NULL) if (fdp->fdr_unmount != NULL)
fdp->fdr_unmount(); fdp->fdr_unmount();
/* If we used mmap emulation, clear any cached blocks from VM. */
if (fdp->fdr_peek == NULL && major(fsdriver_device) == NONE_MAJOR)
vm_clear_cache(fsdriver_device);
/* Update library-local state. */ /* Update library-local state. */
fsdriver_mounted = FALSE; fsdriver_mounted = FALSE;
@ -206,6 +212,61 @@ fsdriver_write(const struct fsdriver * __restrict fdp,
return read_write(fdp, m_in, m_out, FSC_WRITE); return read_write(fdp, m_in, m_out, FSC_WRITE);
} }
/*
 * A read-based peek implementation.  This allows file systems that do not have
 * a buffer cache and do not implement peek, to support a limited form of mmap.
 * We map in a block, fill it by calling the file system's read function, tell
 * VM about the page, and then unmap the block again.  We tell VM not to cache
 * the block beyond its immediate use for the mmap request (VMSF_ONCE), so as
 * to prevent potentially stale data from being cached--at the cost of
 * performance.
 *
 * Returns the number of bytes made available to VM on success, or an error
 * code on failure.
 */
static ssize_t
builtin_peek(const struct fsdriver * __restrict fdp, ino_t ino_nr,
	size_t nbytes, off_t pos)
{
	static u32_t flags = 0;	/* storage for the VMMC_ flags of all blocks */
	static off_t dev_off = 0;	/* fake device offset, see below */
	struct fsdriver_data data;
	char *buf;
	ssize_t r;

	/* Allocate a temporary, page-aligned buffer for the file contents. */
	if ((buf = mmap(NULL, nbytes, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0)) == MAP_FAILED)
		return ENOMEM;

	/*
	 * Have the file system's read implementation copy into our own
	 * address space: with endpt set to SELF, the grant field carries a
	 * local buffer address rather than an actual grant ID.
	 */
	data.endpt = SELF;
	data.grant = (cp_grant_id_t)buf;
	data.size = nbytes;

	r = fdp->fdr_read(ino_nr, &data, nbytes, pos, FSC_READ);

	if (r >= 0) {
		/* Zero out the part of the buffer beyond end of file. */
		if ((size_t)r < nbytes)
			memset(&buf[r], 0, nbytes - r);

		/*
		 * VM uses serialized communication to VFS.  Since the page is
		 * to be used only once, VM will use and then discard it before
		 * sending a new peek request.  Thus, it should be safe to
		 * reuse the same device offset all the time.  However, relying
		 * on assumptions in protocols elsewhere is a bit dangerous, so
		 * we use an ever-increasing device offset just to be safe.
		 */
		r = vm_set_cacheblock(buf, fsdriver_device, dev_off, ino_nr,
		    pos, &flags, nbytes, VMSF_ONCE);

		if (r == OK) {
			dev_off += nbytes;

			r = nbytes;
		}
	}

	munmap(buf, nbytes);

	return r;
}
/* /*
* Process a PEEK request from VFS. * Process a PEEK request from VFS.
*/ */
@ -222,13 +283,22 @@ fsdriver_peek(const struct fsdriver * __restrict fdp,
pos = m_in->m_vfs_fs_readwrite.seek_pos; pos = m_in->m_vfs_fs_readwrite.seek_pos;
nbytes = m_in->m_vfs_fs_readwrite.nbytes; nbytes = m_in->m_vfs_fs_readwrite.nbytes;
if (fdp->fdr_peek == NULL)
return ENOSYS;
if (pos < 0 || nbytes > SSIZE_MAX) if (pos < 0 || nbytes > SSIZE_MAX)
return EINVAL; return EINVAL;
r = fdp->fdr_peek(ino_nr, NULL /*data*/, nbytes, pos, FSC_PEEK); if (fdp->fdr_peek == NULL) {
if (major(fsdriver_device) != NONE_MAJOR)
return ENOSYS;
/*
* For file systems that have no backing device, emulate peek
* support by reading into temporary buffers and passing these
* to VM.
*/
r = builtin_peek(fdp, ino_nr, nbytes, pos);
} else
r = fdp->fdr_peek(ino_nr, NULL /*data*/, nbytes, pos,
FSC_PEEK);
/* Do not return a new position. */ /* Do not return a new position. */
if (r >= 0) { if (r >= 0) {

View file

@ -469,7 +469,7 @@ void lmfs_put_block(
if(vmcache && bp->lmfs_needsetcache && dev != NO_DEV) { if(vmcache && bp->lmfs_needsetcache && dev != NO_DEV) {
if((r=vm_set_cacheblock(bp->data, dev, dev_off, if((r=vm_set_cacheblock(bp->data, dev, dev_off,
bp->lmfs_inode, bp->lmfs_inode_offset, bp->lmfs_inode, bp->lmfs_inode_offset,
&bp->lmfs_flags, fs_block_size)) != OK) { &bp->lmfs_flags, fs_block_size, 0)) != OK) {
if(r == ENOSYS) { if(r == ENOSYS) {
printf("libminixfs: ENOSYS, disabling VM calls\n"); printf("libminixfs: ENOSYS, disabling VM calls\n");
vmcache = 0; vmcache = 0;

View file

@ -14,7 +14,7 @@
static int vm_cachecall(message *m, int call, void *addr, dev_t dev, static int vm_cachecall(message *m, int call, void *addr, dev_t dev,
off_t dev_offset, ino_t ino, off_t ino_offset, u32_t *flags, off_t dev_offset, ino_t ino, off_t ino_offset, u32_t *flags,
int blocksize) int blocksize, int setflags)
{ {
if(blocksize % PAGE_SIZE) if(blocksize % PAGE_SIZE)
panic("blocksize %d should be a multiple of pagesize %d\n", panic("blocksize %d should be a multiple of pagesize %d\n",
@ -39,7 +39,7 @@ static int vm_cachecall(message *m, int call, void *addr, dev_t dev,
m->m_vmmcp.flags_ptr = flags; m->m_vmmcp.flags_ptr = flags;
m->m_vmmcp.dev = dev; m->m_vmmcp.dev = dev;
m->m_vmmcp.pages = blocksize / PAGE_SIZE; m->m_vmmcp.pages = blocksize / PAGE_SIZE;
m->m_vmmcp.flags = 0; m->m_vmmcp.flags = setflags;
return _taskcall(VM_PROC_NR, call, m); return _taskcall(VM_PROC_NR, call, m);
} }
@ -50,19 +50,19 @@ void *vm_map_cacheblock(dev_t dev, off_t dev_offset,
message m; message m;
if(vm_cachecall(&m, VM_MAPCACHEPAGE, NULL, dev, dev_offset, if(vm_cachecall(&m, VM_MAPCACHEPAGE, NULL, dev, dev_offset,
ino, ino_offset, flags, blocksize) != OK) ino, ino_offset, flags, blocksize, 0) != OK)
return MAP_FAILED; return MAP_FAILED;
return m.m_vmmcp_reply.addr; return m.m_vmmcp_reply.addr;
} }
int vm_set_cacheblock(void *block, dev_t dev, off_t dev_offset, int vm_set_cacheblock(void *block, dev_t dev, off_t dev_offset,
ino_t ino, off_t ino_offset, u32_t *flags, int blocksize) ino_t ino, off_t ino_offset, u32_t *flags, int blocksize, int setflags)
{ {
message m; message m;
return vm_cachecall(&m, VM_SETCACHEPAGE, block, dev, dev_offset, return vm_cachecall(&m, VM_SETCACHEPAGE, block, dev, dev_offset,
ino, ino_offset, flags, blocksize); ino, ino_offset, flags, blocksize, setflags);
} }
int int

View file

@ -213,7 +213,8 @@ struct cached_page *find_cached_page_byino(dev_t dev, ino_t ino, u64_t ino_off,
return NULL; return NULL;
} }
int addcache(dev_t dev, u64_t dev_off, ino_t ino, u64_t ino_off, struct phys_block *pb) int addcache(dev_t dev, u64_t dev_off, ino_t ino, u64_t ino_off, int flags,
struct phys_block *pb)
{ {
int hv_dev; int hv_dev;
struct cached_page *hb; struct cached_page *hb;
@ -237,6 +238,7 @@ int addcache(dev_t dev, u64_t dev_off, ino_t ino, u64_t ino_off, struct phys_blo
hb->dev_offset = dev_off; hb->dev_offset = dev_off;
hb->ino = ino; hb->ino = ino;
hb->ino_offset = ino_off; hb->ino_offset = ino_off;
hb->flags = flags & VMSF_ONCE;
hb->page = pb; hb->page = pb;
hb->page->refcount++; /* block also referenced by cache now */ hb->page->refcount++; /* block also referenced by cache now */
hb->page->flags |= PBF_INCACHE; hb->page->flags |= PBF_INCACHE;

View file

@ -12,6 +12,7 @@ struct cached_page {
ino_t ino; /* which ino is it about */ ino_t ino; /* which ino is it about */
u64_t ino_offset; /* offset within ino */ u64_t ino_offset; /* offset within ino */
int flags; /* currently only VMSF_ONCE or 0 */
struct phys_block *page; /* page ptr */ struct phys_block *page; /* page ptr */
struct cached_page *older; /* older in lru chain */ struct cached_page *older; /* older in lru chain */
struct cached_page *newer; /* newer in lru chain */ struct cached_page *newer; /* newer in lru chain */

View file

@ -175,6 +175,7 @@ do_setcache(message *msg)
dev_t dev = msg->m_vmmcp.dev; dev_t dev = msg->m_vmmcp.dev;
off_t dev_off = msg->m_vmmcp.dev_offset; off_t dev_off = msg->m_vmmcp.dev_offset;
off_t ino_off = msg->m_vmmcp.ino_offset; off_t ino_off = msg->m_vmmcp.ino_offset;
int flags = msg->m_vmmcp.flags;
int n; int n;
struct vmproc *caller; struct vmproc *caller;
phys_bytes offset; phys_bytes offset;
@ -209,7 +210,8 @@ do_setcache(message *msg)
if((hb=find_cached_page_bydev(dev, dev_off + offset, if((hb=find_cached_page_bydev(dev, dev_off + offset,
msg->m_vmmcp.ino, ino_off + offset, 1))) { msg->m_vmmcp.ino, ino_off + offset, 1))) {
/* block inode info updated */ /* block inode info updated */
if(hb->page != phys_region->ph) { if(hb->page != phys_region->ph ||
(hb->flags & VMSF_ONCE)) {
/* previous cache entry has become /* previous cache entry has become
* obsolete; make a new one. rmcache * obsolete; make a new one. rmcache
* removes it from the cache and frees * removes it from the cache and frees
@ -236,8 +238,8 @@ do_setcache(message *msg)
phys_region->memtype = &mem_type_cache; phys_region->memtype = &mem_type_cache;
if((r=addcache(dev, dev_off + offset, if((r=addcache(dev, dev_off + offset, msg->m_vmmcp.ino,
msg->m_vmmcp.ino, ino_off + offset, phys_region->ph)) != OK) { ino_off + offset, flags, phys_region->ph)) != OK) {
printf("VM: addcache failed\n"); printf("VM: addcache failed\n");
return r; return r;
} }

View file

@ -107,7 +107,17 @@ static int mappedfile_pagefault(struct vmproc *vmp, struct vir_region *region,
cp = find_cached_page_byino(region->param.file.fdref->dev, cp = find_cached_page_byino(region->param.file.fdref->dev,
region->param.file.fdref->ino, referenced_offset, 1); region->param.file.fdref->ino, referenced_offset, 1);
} }
if(cp) { /*
* Normally, a cache hit saves a round-trip to the file system
* to load the page. However, if the page in the VM cache is
* marked for one-time use, then force a round-trip through the
file system anyway, so that the FS can update the page by
re-adding it to the cache. Thus, for one-time use pages,
* no caching is performed. This approach is correct even in
* the light of concurrent requests and disappearing processes
* but relies on VM requests to VFS being fully serialized.
*/
if(cp && (!cb || !(cp->flags & VMSF_ONCE))) {
int result = OK; int result = OK;
pb_unreferenced(region, ph, 0); pb_unreferenced(region, ph, 0);
pb_link(ph, cp->page, ph->offset, region); pb_link(ph, cp->page, ph->offset, region);
@ -120,6 +130,10 @@ static int mappedfile_pagefault(struct vmproc *vmp, struct vir_region *region,
result = cow_block(vmp, region, ph, 0); result = cow_block(vmp, region, ph, 0);
} }
/* Discard one-use pages after mapping them in. */
if (result == OK && (cp->flags & VMSF_ONCE))
rmcache(cp);
return result; return result;
} }
@ -210,7 +224,14 @@ int mappedfile_setfile(struct vmproc *owner,
cp = find_cached_page_byino(dev, ino, cp = find_cached_page_byino(dev, ino,
referenced_offset, 1); referenced_offset, 1);
} }
if(!cp) continue; /*
* If we get a hit for a page that is to be used only once,
* then either we found a stale page (due to a process dying
* before a requested once-page could be mapped in) or this is
* a rare case of concurrent requests for the same page. In
* both cases, force the page to be obtained from its FS later.
*/
if(!cp || (cp->flags & VMSF_ONCE)) continue;
if(!(pr = pb_reference(cp->page, vaddr, region, if(!(pr = pb_reference(cp->page, vaddr, region,
&mem_type_mappedfile))) { &mem_type_mappedfile))) {
printf("mappedfile_setfile: pb_reference failed\n"); printf("mappedfile_setfile: pb_reference failed\n");

View file

@ -227,7 +227,8 @@ int do_clearcache(message *m);
struct cached_page *find_cached_page_bydev(dev_t dev, u64_t dev_off, struct cached_page *find_cached_page_bydev(dev_t dev, u64_t dev_off,
ino_t ino, u64_t ino_off, int touchlru); ino_t ino, u64_t ino_off, int touchlru);
struct cached_page *find_cached_page_byino(dev_t dev, ino_t ino, u64_t ino_off, int touchlru); struct cached_page *find_cached_page_byino(dev_t dev, ino_t ino, u64_t ino_off, int touchlru);
int addcache(dev_t dev, u64_t def_off, ino_t ino, u64_t ino_off, struct phys_block *pb); int addcache(dev_t dev, u64_t def_off, ino_t ino, u64_t ino_off, int flags,
struct phys_block *pb);
void cache_sanitycheck_internal(void); void cache_sanitycheck_internal(void);
int cache_freepages(int pages); int cache_freepages(int pages);
void get_stats_info(struct vm_stats_info *vsi); void get_stats_info(struct vm_stats_info *vsi);

View file

@ -230,7 +230,7 @@ u32_t sqrt_approx(u32_t v)
} }
int vm_set_cacheblock(void *block, dev_t dev, off_t dev_offset, int vm_set_cacheblock(void *block, dev_t dev, off_t dev_offset,
ino_t ino, off_t ino_offset, u32_t *flags, int blocksize) ino_t ino, off_t ino_offset, u32_t *flags, int blocksize, int setflags)
{ {
return ENOSYS; return ENOSYS;
} }

View file

@ -36,6 +36,7 @@
#include <sys/ioctl.h> #include <sys/ioctl.h>
#include <sys/ioc_memory.h> #include <sys/ioc_memory.h>
#include <sys/param.h> #include <sys/param.h>
#include <minix/paths.h>
#include <stdio.h> #include <stdio.h>
#include <assert.h> #include <assert.h>
#include <string.h> #include <string.h>
@ -461,6 +462,63 @@ static void basic_regression(void)
} }
/*
 * Regression test for mmap on none-dev file systems - file systems that do
 * not have a buffer cache and therefore have to fake mmap support.  We use
 * procfs as the test target.  The idea is that while mapping in /proc/uptime
 * must succeed, every new mapping of the page must also produce a fresh
 * uptime value -- VM must not cache it.
 */
static void
nonedev_regression(void)
{
	unsigned long up1, up2, up3;
	char *ptr;
	int fd;

	subtest++;

	fd = open(_PATH_PROC "uptime", O_RDONLY);
	if (fd < 0) e(1);

	/* First mapping: private and read-only. */
	ptr = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE | MAP_FILE, fd, 0);
	if (ptr == MAP_FAILED) e(2);
	if (ptr[4095] != 0) e(3);
	up1 = atoi(ptr);
	if (up1 == 0) e(4);
	if (munmap(ptr, 4096) != 0) e(5);

	sleep(2);

	/* Second mapping: also writable; must see a changed uptime. */
	ptr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FILE,
	    fd, 0);
	if (ptr == MAP_FAILED) e(6);
	if (ptr[4095] != 0) e(7);
	up2 = atoi(ptr);
	if (up2 == 0) e(8);
	if (up1 == up2) e(9);
	if (munmap(ptr, 4096) != 0) e(10);

	sleep(2);

	/* Third mapping: shared; again a fresh value must appear. */
	ptr = mmap(NULL, 4096, PROT_READ, MAP_SHARED | MAP_FILE, fd, 0);
	if (ptr == MAP_FAILED) e(11);
	if (ptr[4095] != 0) e(12);
	up3 = atoi(ptr);
	if (up3 == 0) e(13);
	if (up1 == up3) e(14);
	if (up2 == up3) e(15);
	if (munmap(ptr, 4096) != 0) e(16);

	close(fd);
}
int int
main(int argc, char *argv[]) main(int argc, char *argv[])
{ {
@ -470,6 +528,8 @@ main(int argc, char *argv[])
basic_regression(); basic_regression();
nonedev_regression();
test_memory_types_vs_operations(); test_memory_types_vs_operations();
makefiles(MAXFILES); makefiles(MAXFILES);

View file

@ -44,7 +44,7 @@ int dowriteblock(int b, int blocksize, u32_t seed, char *block)
memcpy(bdata, block, blocksize); memcpy(bdata, block, blocksize);
if(mustset && (r=vm_set_cacheblock(bdata, MYDEV, dev_off, if(mustset && (r=vm_set_cacheblock(bdata, MYDEV, dev_off,
VMC_NO_INODE, 0, NULL, blocksize)) != OK) { VMC_NO_INODE, 0, NULL, blocksize, 0)) != OK) {
printf("dowriteblock: vm_set_cacheblock failed %d\n", r); printf("dowriteblock: vm_set_cacheblock failed %d\n", r);
exit(1); exit(1);
} }