From 7f79fb8810a7bdcd4c6f0647b9a2c47a30a73bad Mon Sep 17 00:00:00 2001 From: David van Moolenbroek Date: Sun, 6 Sep 2015 16:54:42 +0200 Subject: [PATCH] Improve asynsend support for process swapping This resolves various system stalls while running testrelpol. Change-Id: Ie70fc2dbcdb0a8c9e3800cc0df564be747e111ec --- minix/kernel/system/do_privctl.c | 2 ++ minix/kernel/system/do_update.c | 1 + minix/lib/libsys/sef_liveupdate.c | 31 +++++++++++++++++++++++++++++++ 3 files changed, 34 insertions(+) diff --git a/minix/kernel/system/do_privctl.c b/minix/kernel/system/do_privctl.c index 1dd19423e..fc332a82e 100644 --- a/minix/kernel/system/do_privctl.c +++ b/minix/kernel/system/do_privctl.c @@ -120,6 +120,8 @@ int do_privctl(struct proc * caller, message * m_ptr) priv(rp)->s_proc_nr = proc_nr; /* reassociate process nr */ for (i=0; i< NR_SYS_CHUNKS; i++) /* remove pending: */ + priv(rp)->s_asyn_pending.chunk[i] = 0; /* - incoming asyn */ + for (i=0; i< NR_SYS_CHUNKS; i++) /* messages */ priv(rp)->s_notify_pending.chunk[i] = 0; /* - notifications */ priv(rp)->s_int_pending = 0; /* - interrupts */ (void) sigemptyset(&priv(rp)->s_sig_pending); /* - signals */ diff --git a/minix/kernel/system/do_update.c b/minix/kernel/system/do_update.c index ba6a8ab70..34c35f624 100644 --- a/minix/kernel/system/do_update.c +++ b/minix/kernel/system/do_update.c @@ -286,6 +286,7 @@ static void adjust_priv_slot(struct priv *privp, struct priv *from_privp) { /* Preserve privilege ids and non-privilege stuff in the priv structure. 
*/ privp->s_id = from_privp->s_id; + privp->s_asyn_pending = from_privp->s_asyn_pending; privp->s_notify_pending = from_privp->s_notify_pending; privp->s_int_pending = from_privp->s_int_pending; privp->s_sig_pending = from_privp->s_sig_pending; diff --git a/minix/lib/libsys/sef_liveupdate.c b/minix/lib/libsys/sef_liveupdate.c index 5becf0c2a..e326d28da 100644 --- a/minix/lib/libsys/sef_liveupdate.c +++ b/minix/lib/libsys/sef_liveupdate.c @@ -196,6 +196,37 @@ static void sef_lu_ready(int result) * Restore things back to normal and continue executing. */ sef_lu_state_change(SEF_LU_STATE_NULL, 0); + + /* Transfer of asynsend tables during live update is messy at best. The + * general idea is that the asynsend table is preserved during live update, + * so that messages never get lost. That means that 1) the new instance + * takes over the table from the old instance upon live update, and 2) the + * old instance takes over the table on rollback. Case 1 is not atomic: + * the new instance starts with no asynsend table, and after swapping slots, + * the old instance's table will no longer be looked at by the kernel. The + * new instance copies over the table from the old instance, and then calls + * senda_reload() to tell the kernel about the new location of the otherwise + * preserved table. Case 2 is different: the old instance cannot copy the + * table from the new instance, and so the kernel does that part, based on + * the table provided through the new instance's senda_reload(). However, if + * the new instance never got to the senda_reload() call, then the kernel + * also would not have been able to deliver any messages, and so the old + * instance's table can still be used as is. Now the problem. Because case 1 + * is not atomic, there is a small window during which other processes may + * attempt to receive a message, based on the fact that their s_asyn_pending + * mask in the kernel has a bit set for the process being updated. 
Failing + * to find a matching message in the yet-missing table of the new process, + * the kernel will unset the s_asyn_pending bit. Now, normally the bit would + * be set again through the new instance's senda_reload() call. However, if + * the new instance rolls back instead, the old instance will have a message + * for the other process, but its s_asyn_pending bit will not be set. Thus, + * the message will never be delivered unless we call senda_reload() here. + * XXX TODO: the story is even more complicated, because, based on the above + * story, copying back the table should never be necessary and should never happen. + * My logs show it does happen for at least RS, which may indicate RS sends + * asynchronous messages in its initialization code. -dcvmoole + */ + senda_reload(); } /*===========================================================================*