RS/VM: proper preparation for multi-VM live update

Due to changed VM internals, more elaborate preparation is required
before a live update with multiple components including VM can take
place.  This patch adds the essential preparation infrastructure to
VM and adapts RS to make use of it.  As a side effect, it is no
longer necessary to supply RS as the last component (if at all)
during the set-up of a multicomponent live update operation.

Change-Id: If069fd3f93f96f9d5433998e4615f861465ef448
commit abf8a7e7b3
parent 5a4672e300
Author: David van Moolenbroek
Date:   2015-07-14 07:42:48 +02:00

13 changed files with 245 additions and 54 deletions
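In outline, the new flow is as follows: once every service in the update chain except VM (and possibly RS) has finished preparing, RS asks the still-running old VM instance, through the new VM_RS_PREPARE call, to preallocate and pin memory and to map over memory-mapped regions for each remaining service. A condensed sketch distilled from the RS changes below (not a verbatim excerpt; error handling omitted):

	/* Just before asking old VM to prepare itself, prepare everyone else. */
	if (RUPDATE_IS_UPD_VM_MULTI() && rpupd == rupdate.vm_rpupd) {
		RUPDATE_ITER(rupdate.first_rpupd, prev_rpupd, walk_rpupd,
			if (UPD_IS_PREPARING_ONLY(walk_rpupd) ||
			    walk_rpupd == rupdate.vm_rpupd)
				continue;	/* skip prepare-only services and VM */
			/* New call: have VM set up the new instance (dst)
			 * based on the old one (src).
			 */
			vm_prepare(walk_rpupd->rp->r_pub->new_endpoint,
			    walk_rpupd->rp->r_new_rp->r_pub->endpoint,
			    walk_rpupd->rp->r_pub->sys_flags);
		);
	}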

@@ -11,7 +11,8 @@ service rs
 		RS_SET_PRIV	# 37
 		RS_UPDATE	# 41
 		RS_MEMCTL	# 42
-		PROCCTL
+		PROCCTL		# 45
+		RS_PREPARE	# 48
 	;
 	io NONE;	# No I/O range allowed
 	irq NONE;	# No IRQ allowed

@@ -754,6 +754,7 @@ struct
 	{ "CLEARCACHE",	VM_CLEARCACHE },
 	{ "VFS_MMAP",	VM_VFS_MMAP },
 	{ "VFS_REPLY",	VM_VFS_REPLY },
+	{ "RS_PREPARE",	VM_RS_PREPARE },
 	{ NULL,		0 },
 };

@@ -751,8 +751,10 @@
 #define VM_GETRUSAGE (VM_RQ_BASE+47)
+#define VM_RS_PREPARE (VM_RQ_BASE+48)
 
 /* Total. */
-#define NR_VM_CALLS 48
+#define NR_VM_CALLS 49
 
 #define VM_CALL_MASK_SIZE BITMAP_CHUNKS(NR_VM_CALLS)
 
 /* not handled as a normal VM call, thus at the end of the reserved range */

@@ -19,6 +19,7 @@ int vm_notify_sig(endpoint_t ep, endpoint_t ipc_ep);
 int vm_set_priv(endpoint_t ep, void *buf, int sys_proc);
 int vm_update(endpoint_t src_e, endpoint_t dst_e, int flags);
 int vm_memctl(endpoint_t ep, int req, void** addr, size_t *len);
+int vm_prepare(endpoint_t src_e, endpoint_t dst_e, int flags);
 int vm_query_exit(endpoint_t *endpt);
 int vm_watch_exit(endpoint_t ep);
 int minix_vfs_mmap(endpoint_t who, off_t offset, size_t len,

@@ -100,6 +100,7 @@ SRCS+= \
 	vm_map_phys.c \
 	vm_memctl.c \
 	vm_notify_sig.c \
+	vm_prepare.c \
 	vm_procctl.c \
 	vm_query_exit.c \
 	vm_set_priv.c \

@@ -0,0 +1,17 @@
+#include "syslib.h"
+
+#include <unistd.h>
+#include <string.h>
+
+int
+vm_prepare(endpoint_t src_e, endpoint_t dst_e, int flags)
+{
+	message m;
+
+	memset(&m, 0, sizeof(m));
+	m.m_lsys_vm_update.src = src_e;
+	m.m_lsys_vm_update.dst = dst_e;
+	m.m_lsys_vm_update.flags = flags;
+
+	return _taskcall(VM_PROC_NR, VM_RS_PREPARE, &m);
+}
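For reference, a caller-side usage sketch of the new wrapper ('old_ep', 'new_ep' and 'sys_flags' are placeholders; the real call site is in RS's start_update_prepare_next() below):

	/* Ask VM to prepare the new instance 'new_ep' based on the old
	 * instance 'old_ep', passing along the service's sys flags (e.g.
	 * SF_VM_NOMMAP).
	 */
	int r;

	if ((r = vm_prepare(old_ep, new_ep, sys_flags)) != OK)
		printf("RS: vm_prepare of %d from %d failed: %d\n",
		    new_ep, old_ep, r);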

@@ -87,21 +87,18 @@
 #define RUPDATE_INIT() memset(&rupdate, 0, sizeof(rupdate))
 #define RUPDATE_CLEAR() RUPDATE_INIT()
 
+/* Note that we have 'B' last in order to allow 'continue' statements */
 #define RUPDATE_ITER(HEAD, RPUPD_PREV, RPUPD, B) do { \
-	RPUPD = HEAD; \
-	RPUPD_PREV = NULL; \
-	while(RPUPD) { \
+	for(RPUPD = HEAD, RPUPD_PREV = NULL; RPUPD != NULL; \
+	    RPUPD_PREV = RPUPD, RPUPD = RPUPD->next_rpupd) { \
 		B \
-		RPUPD_PREV = RPUPD; \
-		RPUPD = RPUPD->next_rpupd; \
 	} \
 } while(0)
 
 #define RUPDATE_REV_ITER(TAIL, RPUPD_PREV, RPUPD, B) do { \
-	RPUPD = TAIL; \
-	while(RPUPD) { \
+	for(RPUPD = TAIL; RPUPD != NULL; RPUPD = RPUPD->prev_rpupd) { \
 		RPUPD_PREV = RPUPD->prev_rpupd; \
 		B \
-		RPUPD = RPUPD->prev_rpupd; \
 	} \
 } while(0)
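The switch from while-loops to for-loops is what makes a 'continue' inside the body B safe: the advancing expressions now sit in the for header, so 'continue' still moves the iteration along, whereas the old while form would have skipped the advancing statements and spun forever. A self-contained stand-in demonstration (toy types, not RS code):

	#include <stdio.h>

	struct node { int skip; int val; struct node *next; };

	/* Same shape as RUPDATE_ITER: body B last, advance in the for header. */
	#define ITER(HEAD, N, B) do { \
		for ((N) = (HEAD); (N) != NULL; (N) = (N)->next) { \
			B \
		} \
	} while (0)

	int main(void)
	{
		struct node c = { 0, 3, NULL }, b = { 1, 2, &c }, a = { 0, 1, &b };
		struct node *n;

		ITER(&a, n,
			if (n->skip)
				continue;	/* still advances: increment is in the for header */
			printf("%d\n", n->val);	/* prints 1, then 3 */
		);
		return 0;
	}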

@@ -667,10 +667,6 @@ int do_update(message *m_ptr)
           printf("RS: the specified process is already part of the currently scheduled update\n");
           return EINVAL;
       }
-      if(rupdate.last_rpupd->rp->r_pub->endpoint == RS_PROC_NR) {
-          printf("RS: RS should always be the last service to update in a multi-component update\n");
-          return EINVAL;
-      }
   }
 
   /* Prepare-only update for VM, PM, and VFS is only supported with an unreachable state. */

@@ -23,35 +23,62 @@ void rupdate_clear_upds()
 void rupdate_add_upd(struct rprocupd* rpupd)
 {
 /* Add an update descriptor to the update chain. */
-  struct rprocupd* prev_rpupd;
+  struct rprocupd *prev_rpupd, *walk_rpupd;
+  endpoint_t ep;
   int lu_flags;
 
-  rpupd->prev_rpupd = rupdate.last_rpupd;
-  if(rupdate.num_rpupds == 0) {
-      rupdate.first_rpupd = rpupd;
-      rupdate.curr_rpupd = rpupd;
-  }
-  else {
-      rupdate.last_rpupd->next_rpupd = rpupd;
-  }
+  /* In order to allow multicomponent-with-VM live updates to be processed
+   * correctly, we perform partial sorting on the chain: RS is to be last (if
+   * present), VM is to be right before it (if present), and all the other
+   * processes are to be at the start of the chain.
+   */
+
+  ep = rpupd->rp->r_pub->endpoint;
+
+  assert(rpupd->next_rpupd == NULL);
+  assert(rpupd->prev_rpupd == NULL);
+
+  /* Determine what element to insert after, if not at the head. */
+  prev_rpupd = rupdate.last_rpupd;
+  if (prev_rpupd != NULL && ep != RS_PROC_NR &&
+    prev_rpupd->rp->r_pub->endpoint == RS_PROC_NR)
+      prev_rpupd = prev_rpupd->prev_rpupd;
+  if (prev_rpupd != NULL && ep != RS_PROC_NR && ep != VM_PROC_NR &&
+    prev_rpupd->rp->r_pub->endpoint == VM_PROC_NR)
+      prev_rpupd = prev_rpupd->prev_rpupd;
+
+  /* Perform the insertion. */
+  if (prev_rpupd == NULL) {
+      rpupd->next_rpupd = rupdate.first_rpupd;
+      rupdate.first_rpupd = rupdate.curr_rpupd = rpupd;
+  } else {
+      rpupd->next_rpupd = prev_rpupd->next_rpupd;
+      rpupd->prev_rpupd = prev_rpupd;
+      prev_rpupd->next_rpupd = rpupd;
+  }
+
+  if (rpupd->next_rpupd != NULL)
+      rpupd->next_rpupd->prev_rpupd = rpupd;
+  else
+      rupdate.last_rpupd = rpupd;
 
   rupdate.num_rpupds++;
 
+  /* Propagate relevant flags from the new descriptor. */
   lu_flags = rpupd->lu_flags & (SEF_LU_INCLUDES_VM|SEF_LU_INCLUDES_RS|SEF_LU_UNSAFE|SEF_LU_MULTI);
   if(lu_flags) {
-      RUPDATE_ITER(rupdate.first_rpupd, prev_rpupd, rpupd,
-          rpupd->lu_flags |= lu_flags;
-          rpupd->init_flags |= lu_flags;
+      RUPDATE_ITER(rupdate.first_rpupd, prev_rpupd, walk_rpupd,
+          walk_rpupd->lu_flags |= lu_flags;
+          walk_rpupd->init_flags |= lu_flags;
       );
   }
 
+  /* Set VM/RS update descriptor pointers. */
   if(!rupdate.vm_rpupd && (lu_flags & SEF_LU_INCLUDES_VM)) {
-      rupdate.vm_rpupd = rupdate.last_rpupd;
+      rupdate.vm_rpupd = rpupd;
   }
   else if(!rupdate.rs_rpupd && (lu_flags & SEF_LU_INCLUDES_RS)) {
-      rupdate.rs_rpupd = rupdate.last_rpupd;
+      rupdate.rs_rpupd = rpupd;
  }
 }
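To illustrate the invariant that this insertion logic maintains, consider a hypothetical trace (service names for illustration only):

	add RS   -> chain: RS
	add VM   -> chain: VM, RS          (VM is slotted in before RS)
	add VFS  -> chain: VFS, VM, RS     (regular services go up front)
	add PM   -> chain: VFS, PM, VM, RS

Whatever order the caller supplies the components in, RS ends up last and VM immediately before it, which is why do_update() above no longer needs to reject requests that fail to list RS last.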
@@ -419,14 +446,6 @@ int start_update_prepare(int allow_retries)
           if(rpupd->lu_flags & SEF_LU_NOMMAP) {
               rp->r_pub->sys_flags |= SF_VM_NOMMAP;
           }
-          if(!(rpupd->lu_flags & SEF_LU_UNSAFE)) {
-              if(rs_verbose)
-                  printf("RS: %s pinning memory\n", srv_to_string(rp));
-              vm_memctl(rp->r_pub->new_endpoint, VM_RS_MEM_PIN, 0, 0);
-              if(rs_verbose)
-                  printf("RS: %s pinning memory\n", srv_to_string(new_rp));
-              vm_memctl(new_rp->r_pub->endpoint, VM_RS_MEM_PIN, 0, 0);
-          }
       }
   }
   );
@@ -448,7 +467,9 @@
 struct rprocupd* start_update_prepare_next()
 {
 /* Request the next service in the update chain to prepare for the update. */
-  struct rprocupd *rpupd = NULL;
+  struct rprocupd *rpupd, *prev_rpupd, *walk_rpupd;
+  struct rproc *rp, *new_rp;
+
   if(!RUPDATE_IS_UPDATING()) {
       rpupd = rupdate.first_rpupd;
   }
@@ -458,6 +479,34 @@ struct rprocupd* start_update_prepare_next()
   if(!rpupd) {
       return NULL;
   }
 
+  if (RUPDATE_IS_UPD_VM_MULTI() && rpupd == rupdate.vm_rpupd) {
+      /* We are doing a multicomponent live update that includes VM, and all
+       * services are now ready (and thereby stopped) except VM and possibly
+       * RS. This is the last point in time, and therefore also the best, that
+       * we can ask the (old) VM instance to do stuff for us, before we ask it
+       * to get ready as well: preallocate and pin memory, and copy over
+       * memory-mapped regions. Do this now, for all services except VM
+       * itself. In particular, also do it for RS, as we know that RS (yes,
+       * this service) is not going to create problems from here on.
+       */
+      RUPDATE_ITER(rupdate.first_rpupd, prev_rpupd, walk_rpupd,
+          if (UPD_IS_PREPARING_ONLY(walk_rpupd))
+              continue; /* skip prepare-only processes */
+          if (walk_rpupd == rupdate.vm_rpupd)
+              continue; /* skip VM */
+          rp = walk_rpupd->rp;
+          new_rp = rp->r_new_rp;
+          assert(rp && new_rp);
+          if (rs_verbose)
+              printf("RS: preparing VM for %s -> %s\n", srv_to_string(rp),
+                  srv_to_string(new_rp));
+          /* Ask VM to prepare the new instance based on the old instance. */
+          vm_prepare(rp->r_pub->new_endpoint, new_rp->r_pub->endpoint,
+              rp->r_pub->sys_flags);
+      );
+  }
+
   rupdate.flags |= RS_UPDATING;
 
   while(1) {

@@ -554,6 +554,7 @@ void init_vm(void)
 	/* Calls from RS */
 	CALLMAP(VM_RS_SET_PRIV, do_rs_set_priv);
+	CALLMAP(VM_RS_PREPARE, do_rs_prepare);
 	CALLMAP(VM_RS_UPDATE, do_rs_update);
 	CALLMAP(VM_RS_MEMCTL, do_rs_memctl);

@@ -49,6 +49,7 @@ int do_info(message *);
 int swap_proc_slot(struct vmproc *src_vmp, struct vmproc *dst_vmp);
 int swap_proc_dyn_data(struct vmproc *src_vmp, struct vmproc *dst_vmp,
 	int sys_upd_flags);
+int map_proc_dyn_data(struct vmproc *src_vmp, struct vmproc *dst_vmp);
 void adjust_proc_refs(void);
 int do_getrusage(message *m);
 
@@ -192,6 +193,7 @@ void map_sanitycheck(const char *file, int line);
 /* rs.c */
 int do_rs_set_priv(message *m);
+int do_rs_prepare(message *m);
 int do_rs_update(message *m);
 int do_rs_memctl(message *m);

@@ -66,6 +66,85 @@ int do_rs_set_priv(message *m)
 	return OK;
 }
 
+/*===========================================================================*
+ *				do_rs_prepare				     *
+ *===========================================================================*/
+int do_rs_prepare(message *m_ptr)
+{
+	/* Prepare a new instance of a service for an upcoming live-update
+	 * switch, based on the old instance of this service. This call is
+	 * used only by RS and only for a multicomponent live update which
+	 * includes VM. In this case, all processes need to be prepared such
+	 * that they don't require the new VM instance to perform actions
+	 * during live update that cannot be undone in the case of a rollback.
+	 */
+	endpoint_t src_e, dst_e;
+	int src_p, dst_p;
+	struct vmproc *src_vmp, *dst_vmp;
+	struct vir_region *src_data_vr, *dst_data_vr;
+	vir_bytes src_addr, dst_addr;
+	int sys_upd_flags;
+
+	src_e = m_ptr->m_lsys_vm_update.src;
+	dst_e = m_ptr->m_lsys_vm_update.dst;
+	sys_upd_flags = m_ptr->m_lsys_vm_update.flags;
+
+	/* Lookup slots for source and destination process. */
+	if(vm_isokendpt(src_e, &src_p) != OK) {
+		printf("VM: do_rs_prepare: bad src endpoint %d\n", src_e);
+		return EINVAL;
+	}
+	src_vmp = &vmproc[src_p];
+	if(vm_isokendpt(dst_e, &dst_p) != OK) {
+		printf("VM: do_rs_prepare: bad dst endpoint %d\n", dst_e);
+		return EINVAL;
+	}
+	dst_vmp = &vmproc[dst_p];
+
+	/* Pin memory for the source process. */
+	map_pin_memory(src_vmp);
+
+	/* See if the source process has a larger heap than the destination
+	 * process. If so, extend the heap of the destination process to
+	 * match the source's. While this may end up wasting quite some
+	 * memory, it is absolutely essential that the destination process
+	 * does not run out of heap memory during the live update window,
	 * and since most processes will be doing an identity transfer, they
+	 * are likely to require as much heap as their previous instances.
+	 * Better safe than sorry. TODO: prevent wasting memory somehow;
+	 * this seems particularly relevant for RS.
+	 */
+	src_data_vr = region_search(&src_vmp->vm_regions_avl, VM_MMAPBASE,
+	    AVL_LESS);
+	assert(src_data_vr);
+	dst_data_vr = region_search(&dst_vmp->vm_regions_avl, VM_MMAPBASE,
+	    AVL_LESS);
+	assert(dst_data_vr);
+	src_addr = src_data_vr->vaddr + src_data_vr->length;
+	dst_addr = dst_data_vr->vaddr + dst_data_vr->length;
+	if (src_addr > dst_addr)
+		real_brk(dst_vmp, src_addr);
+
+	/* Now also pin memory for the destination process. */
+	map_pin_memory(dst_vmp);
+
+	/* Finally, map the source process's memory-mapped regions into the
+	 * destination process. This needs to happen now, because VM may not
+	 * allocate any objects during the live update window, since this
+	 * would prevent successful rollback of VM afterwards. The
+	 * destination may not actually touch these regions during the live
+	 * update window either, because they are mapped copy-on-write and a
+	 * pagefault would also cause object allocation. Objects are pages,
+	 * slab objects, anything in the new VM instance to which changes are
+	 * visible in the old VM basically.
+	 */
+	if (!(sys_upd_flags & SF_VM_NOMMAP))
+		map_proc_dyn_data(src_vmp, dst_vmp);
+
+	return OK;
+}
+
 /*===========================================================================*
  *				do_rs_update				     *
  *===========================================================================*/
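The heap-matching step above reduces to a small address computation: each process's data region ends at its vaddr plus length, and the destination is grown whenever the source's end is higher. A self-contained sketch with made-up numbers (toy types and values, for illustration only):

	#include <assert.h>
	#include <stdio.h>

	typedef unsigned long vir_bytes;

	/* Toy stand-in for the two vir_region fields used by do_rs_prepare(). */
	struct data_region { vir_bytes vaddr, length; };

	/* Break address the destination must grow to so its heap is at least
	 * as large as the source's, mirroring the src_addr/dst_addr
	 * comparison above (sketch only).
	 */
	static vir_bytes match_heap(struct data_region src, struct data_region dst)
	{
		vir_bytes src_addr = src.vaddr + src.length;
		vir_bytes dst_addr = dst.vaddr + dst.length;

		return (src_addr > dst_addr) ? src_addr : dst_addr;
	}

	int main(void)
	{
		struct data_region src = { 0x1000000UL, 0x1000000UL };	/* ends at 0x2000000 */
		struct data_region dst = { 0x1000000UL, 0x0800000UL };	/* ends at 0x1800000 */

		/* do_rs_prepare() would real_brk() the destination up to 0x2000000. */
		assert(match_heap(src, dst) == 0x2000000UL);
		printf("grow destination break to 0x%lx\n", match_heap(src, dst));
		return 0;
	}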

@@ -223,12 +223,13 @@ int swap_proc_slot(struct vmproc *src_vmp, struct vmproc *dst_vmp)
  * Transfer memory mapped regions, using CoW sharing, from 'src_vmp' to
  * 'dst_vmp', for the source process's address range of 'start_addr'
  * (inclusive) to 'end_addr' (exclusive). Return OK or an error code.
+ * If the regions seem to have been transferred already, do nothing.
  */
 static int
-transfer_mmap_regions(struct vmproc *dst_vmp, struct vmproc *src_vmp,
+transfer_mmap_regions(struct vmproc *src_vmp, struct vmproc *dst_vmp,
 	vir_bytes start_addr, vir_bytes end_addr)
 {
-	struct vir_region *start_vr, *end_vr;
+	struct vir_region *start_vr, *check_vr, *end_vr;
 
 	start_vr = region_search(&src_vmp->vm_regions_avl, start_addr,
 	    AVL_GREATER_EQUAL);
@@ -236,6 +237,31 @@ transfer_mmap_regions(struct vmproc *dst_vmp, struct vmproc *src_vmp,
 	if (start_vr == NULL || start_vr->vaddr >= end_addr)
 		return OK;	/* nothing to do */
 
+	/* In the case of multicomponent live update that includes VM, this
+	 * function may be called for the same process more than once, for the
+	 * sake of keeping code paths as little divergent as possible while at
+	 * the same time ensuring that the regions are copied early enough.
+	 *
+	 * To compensate for these multiple calls, we perform a very simple
+	 * check here to see if the region to transfer is already present in
+	 * the target process. If so, we can safely skip copying the regions
+	 * again, because there is no other possible explanation for the
+	 * region being present already. Things would go horribly wrong if we
+	 * tried copying anyway, but this check is not good enough to detect
+	 * all such problems, since we do a check on the base address only.
+	 */
+	check_vr = region_search(&dst_vmp->vm_regions_avl, start_vr->vaddr,
+	    AVL_EQUAL);
+	if (check_vr != NULL) {
+#if LU_DEBUG
+		printf("VM: transfer_mmap_regions: skipping transfer from "
+		    "%d to %d (0x%lx already present)\n",
+		    src_vmp->vm_endpoint, dst_vmp->vm_endpoint,
+		    start_vr->vaddr);
+#endif
+		return OK;
+	}
+
 	end_vr = region_search(&src_vmp->vm_regions_avl, end_addr, AVL_LESS);
 	assert(end_vr != NULL);
 	assert(start_vr->vaddr <= end_vr->vaddr);
@@ -249,6 +275,38 @@ transfer_mmap_regions(struct vmproc *dst_vmp, struct vmproc *src_vmp,
 	return map_proc_copy_range(dst_vmp, src_vmp, start_vr, end_vr);
 }
 
+/*
+ * Create copy-on-write mappings in process 'dst_vmp' for all memory-mapped
+ * regions present in 'src_vmp'. Return OK on success, or an error otherwise.
+ * In the case of failure, successfully created mappings are not undone.
+ */
+int
+map_proc_dyn_data(struct vmproc *src_vmp, struct vmproc *dst_vmp)
+{
+	int r;
+
+#if LU_DEBUG
+	printf("VM: mapping dynamic data from %d to %d\n",
+	    src_vmp->vm_endpoint, dst_vmp->vm_endpoint);
+#endif
+
+	/* Transfer memory mapped regions now. To sandbox the new instance and
+	 * prevent state corruption on rollback, we share all the regions
+	 * between the two instances as COW.
+	 */
+	r = transfer_mmap_regions(src_vmp, dst_vmp, VM_MMAPBASE, VM_MMAPTOP);
+
+	/* If the stack is not mapped at the VM_DATATOP, there might be some
+	 * more regions hiding above the stack. We also have to transfer
+	 * those.
+	 */
+	if (r == OK && VM_STACKTOP < VM_DATATOP)
+		r = transfer_mmap_regions(src_vmp, dst_vmp, VM_STACKTOP,
+		    VM_DATATOP);
+
+	return r;
+}
+
 /*===========================================================================*
  *				swap_proc_dyn_data			     *
  *===========================================================================*/
@@ -297,22 +355,8 @@ int swap_proc_dyn_data(struct vmproc *src_vmp, struct vmproc *dst_vmp,
 	/* Make sure regions are consistent. */
 	assert(region_search_root(&src_vmp->vm_regions_avl) && region_search_root(&dst_vmp->vm_regions_avl));
 
-	/* Transfer memory mapped regions now. To sandbox the new instance and
-	 * prevent state corruption on rollback, we share all the regions
-	 * between the two instances as COW. Source and destination are
-	 * intentionally swapped in these calls!
-	 */
-	r = transfer_mmap_regions(src_vmp, dst_vmp, VM_MMAPBASE, VM_MMAPTOP);
-
-	/* If the stack is not mapped at the VM_DATATOP, there might be some
-	 * more regions hiding above the stack. We also have to transfer
-	 * those.
-	 */
-	if (r == OK && VM_STACKTOP < VM_DATATOP)
-		r = transfer_mmap_regions(src_vmp, dst_vmp, VM_STACKTOP,
-		    VM_DATATOP);
-
-	return r;
+	/* Source and destination are intentionally swapped here! */
+	return map_proc_dyn_data(dst_vmp, src_vmp);
 }
 
 void *mmap(void *addr, size_t len, int f, int f2, int f3, off_t o)