New rescue functionality.

This commit is contained in:
Jorrit Herder 2005-10-21 13:28:26 +00:00
parent dd49f3586f
commit 9333141704
6 changed files with 84 additions and 37 deletions

View file

@ -10,6 +10,8 @@ service \- Start or stop an operating system server or device driver.
.PP
\fBservice refresh\fR \fI<pid>\fR
.PP
\fBservice rescue\fR \fI<dir>\fR
.PP
\fBservice shutdown\fR
.br
.de FL
@ -41,6 +43,8 @@ server interprets this as an unexpected exit and will automatically restart
a fresh copy of the process. The clean way to restart a process is using the
'refresh' option of the service utility.
.PP
The rescue call can be used to set the current working directory of the reincarnation server. By using a trusted rescue driver to shadow certain binaries in RAM, the reincarnation server can check the rescue directory for binaries if normal execution of the absolute path fails. This allows recovery from failures of the root file system driver.
.PP
If the system is to be shut down, the reincarnation server should know about this event to prevent it from restarting services that are killed during the shutdown
sequence.
.SH EXAMPLES

View file

@ -60,20 +60,14 @@ PUBLIC int main(void)
switch (call_nr) {
case SYN_ALARM:
do_period(&m); /* check drivers status */
continue; /* no reply is expected */
continue;
case SYS_SIG:
sigset = (sigset_t) m.NOTIFY_ARG;
if (sigismember(&sigset, SIGCHLD)) {
do_exit(&m);
}
if (sigismember(&sigset, SIGTERM) ||
sigismember(&sigset, SIGKSTOP)) {
/* Prevent restarting services. */
do_shutdown(NULL);
}
continue; /* no reply is expected */
sigset = (sigset_t) m.NOTIFY_ARG; /* check signals passed */
if (sigismember(&sigset, SIGCHLD)) do_exit(&m);
if (sigismember(&sigset, SIGTERM)) do_shutdown(NULL);
if (sigismember(&sigset, SIGKSTOP)) do_shutdown(NULL);
continue;
default: /* heartbeat notification */
printf("Got heartbeat from %d\n", who);
if (rproc_ptr[who] != NULL) /* mark heartbeat time */
rproc_ptr[who]->r_alive_tm = m.NOTIFY_TIMESTAMP;
}
@ -84,22 +78,12 @@ PUBLIC int main(void)
*/
else {
switch(call_nr) {
case RS_UP:
result = do_up(&m);
break;
case RS_DOWN:
result = do_down(&m);
break;
case RS_REFRESH:
result = do_refresh(&m);
break;
case RS_SHUTDOWN:
result = do_shutdown(&m);
break;
case GETSYSINFO:
printf("RS got GETSYSINFO request from %d\n", m.m_source);
result = do_getsysinfo(&m);
break;
case RS_UP: result = do_up(&m); break;
case RS_DOWN: result = do_down(&m); break;
case RS_REFRESH: result = do_refresh(&m); break;
case RS_RESCUE: result = do_rescue(&m); break;
case RS_SHUTDOWN: result = do_shutdown(&m); break;
case GETSYSINFO: result = do_getsysinfo(&m); break;
default:
printf("Warning, RS got unexpected request %d from %d\n",
m.m_type, m.m_source);
@ -142,10 +126,9 @@ PRIVATE void init_server(void)
if ((s = getsysinfo(FS_PROC_NR, SI_DMAP_TAB, dmap)) < 0)
panic("RS","warning: couldn't get copy of dmap table", errno);
/* Change working directory to /sbin, where the binaries for the programs
* in the system image are.
/* Now initialize the table with the processes in the system image.
* Prepend /sbin/ to the binaries so that we can actually find them.
*/
chdir("/sbin/");
for (s=0; s< NR_BOOT_PROCS; s++) {
ip = &image[s];
if (ip->proc_nr >= 0) {
@ -156,7 +139,8 @@ PRIVATE void init_server(void)
for(t=0; t< NR_DEVICES; t++)
if (dmap[t].dmap_driver == ip->proc_nr)
rproc[s].r_dev_nr = t;
strcpy(rproc[s].r_cmd, ip->proc_name);
strcpy(rproc[s].r_cmd, "/sbin/");
strcpy(rproc[s].r_cmd+6, ip->proc_name);
rproc[s].r_argc = 1;
rproc[s].r_argv[0] = rproc[s].r_cmd;
rproc[s].r_argv[1] = NULL;

View file

@ -135,6 +135,30 @@ PUBLIC int do_refresh(message *m_ptr)
return(ESRCH);
}
/*===========================================================================*
* do_rescue *
*===========================================================================*/
PUBLIC int do_rescue(message *m_ptr)
{
/* Handle a rescue request: copy the rescue directory name from the caller
 * and make it the current working directory of this server.
 *
 * Returns OK on success, EINVAL if the path is empty (or its length is not
 * positive) or is not absolute, E2BIG if the path is longer than
 * MAX_RESCUE_DIR_LEN, the error code from sys_datacopy() if the copy fails,
 * or errno if chdir() fails.
 */
  char rescue_dir[MAX_RESCUE_DIR_LEN + 1];	/* +1 for the terminating '\0' */
  int s;

  /* The length comes straight from the request message, so bound it on both
   * sides before it is used as a copy size and as an index into the buffer.
   * A path of exactly MAX_RESCUE_DIR_LEN characters is still accepted; the
   * extra byte in the buffer holds its terminator.
   */
  if (m_ptr->RS_CMD_LEN <= 0) return(EINVAL);
  if (m_ptr->RS_CMD_LEN > MAX_RESCUE_DIR_LEN) return(E2BIG);

  /* Copy rescue directory from user. */
  s = sys_datacopy(m_ptr->m_source, (vir_bytes) m_ptr->RS_CMD_ADDR,
	SELF, (vir_bytes) rescue_dir, m_ptr->RS_CMD_LEN);
  if (s != OK) return(s);
  rescue_dir[m_ptr->RS_CMD_LEN] = '\0';		/* ensure it is terminated */
  if (rescue_dir[0] != '/') return(EINVAL);	/* insist on absolute path */

  /* Change RS' directory to the rescue directory. Provided that the needed
   * binaries are in the rescue dir, this makes recovery possible even if the
   * (root) file system is no longer available, because no directory lookups
   * are required. Thus if an absolute path fails, we can try to strip the
   * path and see if the command is in the rescue dir.
   */
  if (chdir(rescue_dir) != 0) return(errno);
  return(OK);
}
/*===========================================================================*
* do_shutdown *
@ -303,6 +327,7 @@ struct rproc *rp;
*/
int child_proc_nr; /* child process slot */
pid_t child_pid; /* child's process id */
char *file_only;
int s;
message m;
@ -314,9 +339,16 @@ struct rproc *rp;
return(errno); /* return error */
case 0: /* child process */
/* Try to execute the binary that has an absolute path. If this fails,
* e.g., because the root file system cannot be read, try to strip off
* the path, and see if the command is in RS' current working dir.
*/
execve(rp->r_argv[0], rp->r_argv, NULL); /* POSIX execute */
printf("RS: exec failed for %s\n", rp->r_argv[0]);
report("RS", "warning, exec() failed", errno); /* shouldn't happen */
file_only = strrchr(rp->r_argv[0], '/') + 1;
printf("Absolute exec failed (%d), trying file only: %s\n",
errno, file_only);
execve(file_only, rp->r_argv, NULL); /* POSIX execute */
printf("RS: exec failed for %s: %d\n", rp->r_argv[0], errno);
exit(EXEC_FAILED); /* terminate child */
default: /* parent process */

View file

@ -6,6 +6,7 @@
/* Space reserved for program and arguments. */
#define MAX_COMMAND_LEN 512 /* maximum argument string length */
#define MAX_NR_ARGS 4 /* maximum number of arguments */
#define MAX_RESCUE_DIR_LEN 64 /* maximum rescue dir length */
/* Definition of the system process table. This table only has entries for
* the servers and drivers, and thus is not directly indexed by slot number.

View file

@ -7,6 +7,7 @@ _PROTOTYPE( int main, (void));
_PROTOTYPE( int do_up, (message *m));
_PROTOTYPE( int do_down, (message *m));
_PROTOTYPE( int do_refresh, (message *m));
_PROTOTYPE( int do_rescue, (message *m));
_PROTOTYPE( int do_shutdown, (message *m));
_PROTOTYPE( void do_period, (message *m));
_PROTOTYPE( void do_exit, (message *m));

View file

@ -24,6 +24,7 @@ PRIVATE char *known_requests[] = {
"up",
"down",
"refresh",
"rescue",
"shutdown",
"catch for illegal requests"
};
@ -39,7 +40,7 @@ extern int errno;
*/
#define ARG_NAME 0 /* own application name */
#define ARG_REQUEST 1 /* request to perform */
#define ARG_PATH 2 /* binary of system service */
#define ARG_PATH 2 /* rescue dir or system service */
#define ARG_PID 2 /* pid of system service */
#define MIN_ARG_COUNT 2 /* require an action */
@ -74,6 +75,7 @@ PRIVATE void print_usage(char *app_name, char *problem)
app_name, ARG_ARGS, ARG_DEV, ARG_PERIOD);
printf(" %s down <pid>\n", app_name);
printf(" %s refresh <pid>\n", app_name);
printf(" %s rescue <dir>\n", app_name);
printf(" %s shutdown\n", app_name);
printf("\n");
}
@ -159,7 +161,7 @@ PRIVATE int parse_arguments(int argc, char **argv)
exit(errno);
}
if ( ! (stat_buf.st_mode & (S_IFBLK | S_IFCHR))) {
print_usage(argv[ARG_NAME], "special file is not a device node");
print_usage(argv[ARG_NAME], "special file is not a device");
exit(EINVAL);
}
req_major = (stat_buf.st_rdev >> MAJOR) & BYTE;
@ -185,8 +187,25 @@ PRIVATE int parse_arguments(int argc, char **argv)
exit(EINVAL);
}
}
else if (req_nr == RS_RESCUE) {
/* Verify argument count. */
if (argc - 1 < ARG_PATH) {
print_usage(argv[ARG_NAME], "action requires rescue directory");
exit(EINVAL);
}
req_path = argv[ARG_PATH];
if (stat(argv[ARG_PATH], &stat_buf) == -1) {
print_usage(argv[ARG_NAME], "couldn't get status of directory");
exit(errno);
}
if ( ! (stat_buf.st_mode & S_IFDIR)) {
print_usage(argv[ARG_NAME], "file is not a directory");
exit(EINVAL);
}
}
else if (req_nr == RS_SHUTDOWN) {
/* no extra arguments required */
/* no extra arguments required */
}
/* Return the request number if no errors were found. */
@ -235,6 +254,12 @@ PUBLIC int main(int argc, char **argv)
if (OK != (s=_taskcall(RS_PROC_NR, request, &m)))
failure(s);
break;
case RS_RESCUE:
m.RS_CMD_ADDR = req_path;
m.RS_CMD_LEN = strlen(req_path);
if (OK != (s=_taskcall(RS_PROC_NR, request, &m)))
failure(s);
break;
case RS_SHUTDOWN:
if (OK != (s=_taskcall(RS_PROC_NR, request, &m)))
failure(s);