2008-11-19 13:26:10 +01:00
|
|
|
/* Prototypes and definitions for VM interface. */
|
|
|
|
|
|
|
|
#ifndef _MINIX_VM_H
|
|
|
|
#define _MINIX_VM_H
|
|
|
|
|
2012-11-30 19:44:40 +01:00
|
|
|
#include <sys/types.h>
|
2008-11-19 13:26:10 +01:00
|
|
|
#include <minix/endpoint.h>
|
|
|
|
|
2012-03-24 16:16:34 +01:00
|
|
|
int vm_exit(endpoint_t ep);
|
|
|
|
int vm_fork(endpoint_t ep, int slotno, endpoint_t *child_ep);
|
2015-09-27 19:32:10 +02:00
|
|
|
int vm_getrusage(endpoint_t endpt, void *addr, int children);
|
2012-03-24 16:16:34 +01:00
|
|
|
int vm_willexit(endpoint_t ep);
|
|
|
|
int vm_adddma(endpoint_t proc_e, phys_bytes start, phys_bytes size);
|
|
|
|
int vm_deldma(endpoint_t proc_e, phys_bytes start, phys_bytes size);
|
|
|
|
int vm_getdma(endpoint_t *procp, phys_bytes *basep, phys_bytes *sizep);
|
|
|
|
void *vm_map_phys(endpoint_t who, void *physaddr, size_t len);
|
|
|
|
int vm_unmap_phys(endpoint_t who, void *vaddr, size_t len);
|
2008-11-19 13:26:10 +01:00
|
|
|
|
2012-03-24 16:16:34 +01:00
|
|
|
int vm_notify_sig(endpoint_t ep, endpoint_t ipc_ep);
|
2013-08-07 22:03:47 +02:00
|
|
|
int vm_set_priv(endpoint_t ep, void *buf, int sys_proc);
|
2014-03-02 01:15:08 +01:00
|
|
|
int vm_update(endpoint_t src_e, endpoint_t dst_e, int flags);
|
2014-03-11 23:22:55 +01:00
|
|
|
int vm_memctl(endpoint_t ep, int req, void** addr, size_t *len);
|
2015-07-14 07:42:48 +02:00
|
|
|
int vm_prepare(endpoint_t src_e, endpoint_t dst_e, int flags);
|
2012-03-24 16:16:34 +01:00
|
|
|
int vm_query_exit(endpoint_t *endpt);
|
|
|
|
int vm_watch_exit(endpoint_t ep);
|
2013-11-28 17:51:21 +01:00
|
|
|
int minix_vfs_mmap(endpoint_t who, off_t offset, size_t len,
|
|
|
|
dev_t dev, ino_t ino, int fd, u32_t vaddr, u16_t clearend, u16_t
|
2013-05-07 14:41:07 +02:00
|
|
|
flags);
|
|
|
|
|
2013-12-06 12:04:52 +01:00
|
|
|
void *minix_mmap_for(endpoint_t forwhom,
|
|
|
|
void *addr, size_t len, int prot, int flags, int fd, off_t offset);
|
|
|
|
/* NOTE(review): this repeats the minix_vfs_mmap() prototype already declared
 * earlier in this header, with the same parameter list. The redeclaration is
 * redundant but compatible.
 */
int minix_vfs_mmap(endpoint_t who, off_t offset, size_t len,
|
|
|
|
dev_t dev, ino_t ino, int fd, u32_t vaddr, u16_t clearend,
|
|
|
|
u16_t flags);
|
|
|
|
|
2013-05-07 14:41:07 +02:00
|
|
|
/* minix vfs mmap flags */
|
|
|
|
#define MVM_WRITABLE 0x8000
|
2010-05-05 13:35:04 +02:00
|
|
|
|
New RS and new signal handling for system processes.
UPDATING INFO:
20100317:
/usr/src/etc/system.conf updated to ignore default kernel calls: copy
it (or merge it) to /etc/system.conf.
The hello driver (/dev/hello) added to the distribution:
# cd /usr/src/commands/scripts && make clean install
# cd /dev && MAKEDEV hello
KERNEL CHANGES:
- Generic signal handling support. The kernel no longer assumes PM as a signal
manager for every process. The signal manager of a given process can now be
specified in its privilege slot. When a signal has to be delivered, the kernel
performs the lookup and forwards the signal to the appropriate signal manager.
PM is the default signal manager for user processes, RS is the default signal
manager for system processes. To enable ptrace()ing for system processes, it
is sufficient to change the default signal manager to PM. This will temporarily
disable crash recovery, though.
- sys_exit() is now split into sys_exit() (i.e. exit() for system processes,
which generates a self-termination signal), and sys_clear() (i.e. used by PM
to ask the kernel to clear a process slot when a process exits).
- Added a new kernel call (i.e. sys_update()) to swap two process slots and
implement live update.
PM CHANGES:
- Posix signal handling is no longer allowed for system processes. System
signals are split into two fixed categories: termination and non-termination
signals. When a non-termination signal is processed, PM transforms the signal
into an IPC message and delivers the message to the system process. When a
termination signal is processed, PM terminates the process.
- PM no longer assumes itself as the signal manager for system processes. It now
makes sure that every system signal goes through the kernel before being
actually processed. The kernel will then dispatch the signal to the appropriate
signal manager which may or may not be PM.
SYSLIB CHANGES:
- Simplified SEF init and LU callbacks.
- Added additional predefined SEF callbacks to debug crash recovery and
live update.
- Fixed a temporary ack in the SEF init protocol. SEF init reply is now
completely synchronous.
- Added SEF signal event type to provide a uniform interface for system
processes to deal with signals. A sef_cb_signal_handler() callback is
available for system processes to handle every received signal. A
sef_cb_signal_manager() callback is used by signal managers to process
system signals on behalf of the kernel.
- Fixed a few bugs with memory mapping and DS.
VM CHANGES:
- Page faults and memory requests coming from the kernel are now implemented
using signals.
- Added a new VM call to swap two process slots and implement live update.
- The call is used by RS at update time and in turn invokes the kernel call
sys_update().
RS CHANGES:
- RS has been reworked with a better functional decomposition.
- Better kernel call masks. com.h now defines the set of very basic kernel calls
every system service is allowed to use. This makes system.conf simpler and
easier to maintain. In addition, this guarantees a higher level of isolation
for system libraries that use one or more kernel calls internally (e.g. printf).
- RS is the default signal manager for system processes. By default, RS
intercepts every signal delivered to every system process. This makes crash
recovery possible before bringing PM and friends in the loop.
- RS now supports fast rollback when something goes wrong while initializing
the new version during a live update.
- Live update is now implemented by keeping the two versions side-by-side and
swapping the process slots when the old version is ready to update.
- Crash recovery is now implemented by keeping the two versions side-by-side
and cleaning up the old version only when the recovery process is complete.
DS CHANGES:
- Fixed a bug when the process doing ds_publish() or ds_delete() is not known
by DS.
- Fixed the completely broken support for strings. String publishing is now
implemented in the system library and simply wraps publishing of memory ranges.
Ideally, we should adopt a similar approach for other data types as well.
- Test suite fixed.
DRIVER CHANGES:
- The hello driver has been added to the Minix distribution to demonstrate basic
live update and crash recovery functionalities.
- Other drivers have been adapted to conform to the new SEF interface.
2010-03-17 02:15:29 +01:00
|
|
|
/* VM kernel request types. */
|
|
|
|
#define VMPTYPE_NONE 0
|
|
|
|
#define VMPTYPE_CHECK 1
|
|
|
|
|
2010-01-19 22:00:20 +01:00
|
|
|
/* Memory statistics record: the page size plus page counts (total, free,
 * largest run of consecutive free pages, and pages cached for file systems).
 * vm_info_stats() takes a pointer to one of these.
 */
struct vm_stats_info {
|
2010-09-14 23:22:56 +02:00
|
|
|
unsigned int vsi_pagesize; /* page size */
|
|
|
|
unsigned long vsi_total; /* total number of memory pages */
|
|
|
|
unsigned long vsi_free; /* number of free pages */
|
|
|
|
unsigned long vsi_largest; /* largest number of consecutive free pages */
|
|
|
|
unsigned long vsi_cached; /* number of pages cached for file systems */
|
2010-01-19 22:00:20 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
/* Memory usage record for a process: total process memory, the part mapped
 * in more than once, and the shared (non-COW) portion of that common part.
 * vm_info_usage() takes an endpoint and a pointer to one of these.
 * NOTE(review): values are vir_bytes, presumably byte counts -- confirm.
 */
struct vm_usage_info {
|
|
|
|
vir_bytes vui_total; /* total amount of process memory */
|
|
|
|
vir_bytes vui_common; /* part of memory mapped in more than once */
|
|
|
|
vir_bytes vui_shared; /* shared (non-COW) part of common memory */
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Description of a single memory region: base address, length, protection
 * flags (PROT_) and memory flags (a subset of MAP_). vm_info_region() takes
 * a pointer to these records together with a count; MAX_VRI_COUNT is the
 * maximum number of regions provided at once.
 */
struct vm_region_info {
|
|
|
|
vir_bytes vri_addr; /* base address of region */
|
|
|
|
vir_bytes vri_length; /* length of region */
|
|
|
|
int vri_prot; /* protection flags (PROT_) */
|
|
|
|
int vri_flags; /* memory flags (subset of MAP_) */
|
|
|
|
};
|
|
|
|
|
|
|
|
#define MAX_VRI_COUNT 64 /* max. number of regions provided at once */
|
|
|
|
|
2012-03-24 16:16:34 +01:00
|
|
|
int vm_info_stats(struct vm_stats_info *vfi);
|
|
|
|
int vm_info_usage(endpoint_t who, struct vm_usage_info *vui);
|
|
|
|
int vm_info_region(endpoint_t who, struct vm_region_info *vri, int
|
|
|
|
count, vir_bytes *next);
|
make vfs & filesystems use failable copying
Change the kernel to add features to vircopy and safecopies so that
transparent copy fixing won't happen to avoid deadlocks, and such copies
fail with EFAULT.
Transparently making copying work from filesystems (as normally done by
the kernel & VM when copying fails because of missing/readonly memory)
is problematic, as it can happen, for file-mapped ranges, that the
same filesystem that is blocked on the copy request is needed to satisfy
the memory range, leading to deadlock. Ditto for VFS itself, if done with
a blocking call.
This change makes the copying done from a filesystem fail in such cases
with EFAULT by VFS adding the CPF_TRY flag to the grants. If a FS call
fails with EFAULT, VFS will then request the range to be made available
to VM after the FS is unblocked, allowing it to be used to satisfy the
range if need be in another VFS thread.
Similarly, for datacopies that VFS itself does, it uses the failable
vircopy variant and callers use a wrapper that talks to VM if necessary
to get the copy to work.
. kernel: add CPF_TRY flag to safecopies
. kernel: only request writable ranges to VM for the
target buffer when copying fails
. do copying in VFS TRY-first
. some fixes in VM to build SANITYCHECK mode
. add regression test for the cases where
- a FS system call needs memory mapped in a process that the
FS itself must map.
- such a range covers more than one file-mapped region.
. add 'try' mode to vircopy, physcopy
. add flags field to copy kernel call messages
. if CP_FLAG_TRY is set, do not transparently try
to fix memory ranges
. for use by VFS when accessing user buffers to avoid
deadlock
. remove some obsolete backwards compatibility assignments
. VFS: let thread scheduling work for VM requests too
Allows VFS to make calls to VM while suspending and resuming
the currently running thread. Currently does not work for the
main thread.
. VM: add fix memory range call for use by VFS
Change-Id: I295794269cea51a3163519a9cfe5901301d90b32
2014-01-16 14:22:13 +01:00
|
|
|
int vm_procctl_clear(endpoint_t ep);
|
|
|
|
int vm_procctl_handlemem(endpoint_t ep, vir_bytes m1, vir_bytes m2, int wr);
|
2010-01-19 22:00:20 +01:00
|
|
|
|
2013-11-28 17:51:21 +01:00
|
|
|
int vm_set_cacheblock(void *block, dev_t dev, off_t dev_offset,
|
libfsdriver: support mmap on FSes with no device
This patch adds (very limited) support for memory-mapping pages on
file systems that are mounted on the special "none" device and that
do not implement PEEK support by themselves. This includes hgfs,
vbfs, and procfs.
The solution is implemented in libvtreefs, and consists of allocating
pages, filling them with content by calling the file system's READ
functionality, passing the pages to VM, and freeing them again. A new
VM flag is used to indicate that these pages should be mapped in only
once, and thus not cached beyond their single use. This prevents
stale data from getting mapped in without the involvement of the file
system, which would be problematic on file systems where file contents
may become outdated at any time. No VM caching means no sharing and
poor performance, but mmap no longer fails on these file systems.
Compared to a libc-based approach, this patch retains the on-demand
nature of mmap. Especially tail(1) is known to map in a large file
area only to use a small portion of it.
All file systems now need to be given permission for the SETCACHEPAGE
and CLEARCACHE calls to VM.
A very basic regression test is added to test74.
Change-Id: I17afc4cb97315b515cad1542521b98f293b6b559
2014-11-15 11:14:00 +01:00
|
|
|
ino_t ino, off_t ino_offset, u32_t *flags, int blocksize,
|
|
|
|
int setflags);
|
2013-11-28 17:51:21 +01:00
|
|
|
void *vm_map_cacheblock(dev_t dev, off_t dev_offset,
|
|
|
|
ino_t ino, off_t ino_offset, u32_t *flags, int blocksize);
|
libminixfs/VM: fix memory-mapped file corruption
This patch employs one solution to resolve two independent but related
issues. Both issues are the result of one fundamental aspect of the
way VM's memory mapping works: VM uses its cache to map in blocks for
memory-mapped file regions, and for blocks already in the VM cache, VM
does not go to the file system before mapping them in. To preserve
consistency between the FS and VM caches, VM relies on being informed
about all updates to file contents through the block cache. The two
issues are both the result of VM not being properly informed about
such updates:
1. Once a file system provides libminixfs with an inode association
(inode number + inode offset) for a disk block, this association
is not broken until a new inode association is provided for it.
If a block is freed and reallocated as a metadata (non-inode)
block, its old association is maintained, and may be supplied to
VM's secondary cache. Due to reuse of inodes, it is possible
that the same inode association becomes valid for an actual file
block again. In that case, when that new file is memory-mapped,
under certain circumstances, VM may end up using the metadata
block to satisfy a page fault on the file, due to the stale inode
association. The result is a corrupted memory mapping, with the
application seeing data other than the current file contents
mapped in at the file block.
2. When a hole is created in a file, the underlying block is freed
from the device, but VM is not informed of this update, and thus,
if VM's cache contains the block with its previous inode
association, this block will remain there. As a result, if an
application subsequently memory-maps the file, VM will map in the
old block at the position of the hole, rather than an all-zeroes
block. Thus, again, the result is a corrupted memory mapping.
This patch resolves both issues by making the file system inform the
minixfs library about blocks being freed, so that libminixfs can
break the inode association for that block, both in its own cache and
in the VM cache. Since libminixfs does not know whether VM has the
block in its cache or not, it makes a call to VM for each block being
freed. Thus, this change introduces more calls to VM, but it solves
the correctness issues at hand; optimizations may be introduced
later. On the upside, all freed blocks are now marked as clean,
which should result in fewer blocks being written back to the device,
and the blocks are removed from the caches entirely, which should
result in slightly better cache usage.
This patch is necessary but not sufficient to resolve the situation
with respect to memory mapping of file holes in general. Therefore,
this patch extends test 74 with a (rather particular but effective)
test for the first issue, but not yet with a test for the second one.
This fixes #90.
Change-Id: Iad8b134d2f88a884f15d3fc303e463280749c467
2015-08-13 13:29:33 +02:00
|
|
|
int vm_forget_cacheblock(dev_t dev, off_t dev_offset, int blocksize);
|
2013-11-15 19:01:25 +01:00
|
|
|
int vm_clear_cache(dev_t dev);
|
2013-09-15 18:55:42 +02:00
|
|
|
|
2013-03-20 20:18:52 +01:00
|
|
|
/* flags for vm cache functions */
|
|
|
|
#define VMMC_FLAGS_LOCKED 0x01 /* someone is updating the flags; don't read/write */
|
|
|
|
#define VMMC_DIRTY 0x02 /* dirty buffer and it may not be evicted */
|
|
|
|
#define VMMC_EVICTED 0x04 /* VM has evicted the buffer and it's invalid */
|
|
|
|
#define VMMC_BLOCK_LOCKED 0x08 /* client is using it and it may not be evicted */
|
|
|
|
|
|
|
|
/* special inode number for vm cache functions */
|
|
|
|
#define VMC_NO_INODE 0 /* to reference a disk block, no associated file */
|
|
|
|
|
libfsdriver: support mmap on FSes with no device
This patch adds (very limited) support for memory-mapping pages on
file systems that are mounted on the special "none" device and that
do not implement PEEK support by themselves. This includes hgfs,
vbfs, and procfs.
The solution is implemented in libvtreefs, and consists of allocating
pages, filling them with content by calling the file system's READ
functionality, passing the pages to VM, and freeing them again. A new
VM flag is used to indicate that these pages should be mapped in only
once, and thus not cached beyond their single use. This prevents
stale data from getting mapped in without the involvement of the file
system, which would be problematic on file systems where file contents
may become outdated at any time. No VM caching means no sharing and
poor performance, but mmap no longer fails on these file systems.
Compared to a libc-based approach, this patch retains the on-demand
nature of mmap. Especially tail(1) is known to map in a large file
area only to use a small portion of it.
All file systems now need to be given permission for the SETCACHEPAGE
and CLEARCACHE calls to VM.
A very basic regression test is added to test74.
Change-Id: I17afc4cb97315b515cad1542521b98f293b6b559
2014-11-15 11:14:00 +01:00
|
|
|
/* setflags for vm_set_cacheblock, also used internally in VM */
|
|
|
|
#define VMSF_ONCE 0x01 /* discard block after one-time use */
|
|
|
|
|
2008-11-19 13:26:10 +01:00
|
|
|
#endif /* _MINIX_VM_H */
|
|
|
|
|