2 arch/i386/Kconfig | 17 +
3 arch/i386/defconfig | 1
4 arch/i386/kernel/Makefile | 1
5 arch/i386/kernel/apic.c | 54 +++
6 arch/i386/kernel/dmi_scan.c | 27 -
7 arch/i386/kernel/entry.S | 1
8 arch/i386/kernel/i8259.c | 12
9 arch/i386/kernel/io_apic.c | 2
10 arch/i386/kernel/machine_kexec.c | 116 ++++++
11 arch/i386/kernel/reboot.c | 44 --
12 arch/i386/kernel/relocate_kernel.S | 107 ++++++
13 arch/i386/kernel/smp.c | 24 +
14 include/asm-i386/apic.h | 3
15 include/asm-i386/apicdef.h | 1
16 include/asm-i386/kexec.h | 23 +
17 include/asm-i386/unistd.h | 5
18 include/linux/kexec.h | 54 +++
19 include/linux/reboot.h | 2
21 kernel/kexec.c | 629 +++++++++++++++++++++++++++++++++++++
23 22 files changed, 1089 insertions(+), 69 deletions(-)
25 --- linux-2.6.0-test1/MAINTAINERS~kexec-2.6.0-full 2003-07-22 00:46:07.000000000 -0600
26 +++ linux-2.6.0-test1-braam/MAINTAINERS 2003-07-22 00:54:04.000000000 -0600
27 @@ -1095,6 +1095,17 @@ W: http://nfs.sourceforge.net/
28 W: http://www.cse.unsw.edu.au/~neilb/patches/linux-devel/
33 +M: ebiederm@xmission.com
34 +M: ebiederman@lnxi.com
35 +W: http://www.xmission.com/~ebiederm/files/kexec/
38 +W: http://www.osdl.org/archive/andyp/bloom/Code/Linux/Kexec/
39 +L: linux-kernel@vger.kernel.org
42 LANMEDIA WAN CARD DRIVER
43 P: Andrew Stanley-Jones
45 --- linux-2.6.0-test1/arch/i386/Kconfig~kexec-2.6.0-full 2003-07-22 00:52:14.000000000 -0600
46 +++ linux-2.6.0-test1-braam/arch/i386/Kconfig 2003-07-22 00:54:04.000000000 -0600
47 @@ -804,6 +804,23 @@ config BOOT_IOREMAP
48 depends on ((X86_SUMMIT || X86_GENERICARCH) && NUMA)
52 + bool "kexec system call (EXPERIMENTAL)"
53 + depends on EXPERIMENTAL
55 + kexec is a system call that implements the ability to shut down your
56 + current kernel, and to start another kernel. It is like a reboot
57 + but it is independent of the system firmware. And like a reboot
58 + you can start any kernel with it, not just Linux.
60 + The name comes from the similarity to the exec system call.
62 + It is an ongoing process to be certain the hardware in a machine
63 + is properly shut down, so do not be surprised if this code does not
64 + initially work for you. It may help to enable device hotplugging
65 + support. As of this writing the exact hardware interface is
66 + strongly in flux, so no good recommendation can be made.
71 --- linux-2.6.0-test1/arch/i386/defconfig~kexec-2.6.0-full 2003-07-13 21:35:57.000000000 -0600
72 +++ linux-2.6.0-test1-braam/arch/i386/defconfig 2003-07-22 00:54:04.000000000 -0600
73 @@ -72,6 +72,7 @@ CONFIG_SMP=y
74 CONFIG_X86_LOCAL_APIC=y
79 # CONFIG_X86_MCE_NONFATAL is not set
80 CONFIG_X86_MCE_P4THERMAL=y
81 --- linux-2.6.0-test1/arch/i386/kernel/Makefile~kexec-2.6.0-full 2003-07-22 00:46:03.000000000 -0600
82 +++ linux-2.6.0-test1-braam/arch/i386/kernel/Makefile 2003-07-22 00:54:04.000000000 -0600
83 @@ -25,6 +25,7 @@ obj-$(CONFIG_X86_TRAMPOLINE) += trampoli
84 obj-$(CONFIG_X86_MPPARSE) += mpparse.o
85 obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
86 obj-$(CONFIG_X86_IO_APIC) += io_apic.o
87 +obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o
88 obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o suspend_asm.o
89 obj-$(CONFIG_X86_NUMAQ) += numaq.o
90 obj-$(CONFIG_X86_SUMMIT) += summit.o
91 --- linux-2.6.0-test1/arch/i386/kernel/apic.c~kexec-2.6.0-full 2003-07-22 00:46:03.000000000 -0600
92 +++ linux-2.6.0-test1-braam/arch/i386/kernel/apic.c 2003-07-22 00:54:04.000000000 -0600
94 #include <linux/mc146818rtc.h>
95 #include <linux/kernel_stat.h>
96 #include <linux/sysdev.h>
97 +#include <linux/reboot.h>
99 #include <asm/atomic.h>
101 @@ -175,6 +176,39 @@ void disconnect_bsp_APIC(void)
107 + /* Go back to Virtual Wire compatibility mode */
108 + unsigned long value;
110 + /* For the spurious interrupt use vector F, and enable it */
111 + value = apic_read(APIC_SPIV);
112 + value &= ~APIC_VECTOR_MASK;
113 + value |= APIC_SPIV_APIC_ENABLED;
115 + apic_write_around(APIC_SPIV, value);
117 + /* For LVT0 make it edge triggered, active high, external and enabled */
118 + value = apic_read(APIC_LVT0);
119 + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
120 + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
121 + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
122 + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
123 + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXINT);
124 + apic_write_around(APIC_LVT0, value);
126 + /* For LVT1 make it edge triggered, active high, nmi and enabled */
127 + value = apic_read(APIC_LVT1);
129 + APIC_MODE_MASK | APIC_SEND_PENDING |
130 + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
131 + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
132 + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
133 + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
134 + apic_write_around(APIC_LVT1, value);
136 +#endif /* CONFIG_KEXEC */
140 void disable_local_APIC(void)
141 @@ -1115,6 +1149,26 @@ asmlinkage void smp_error_interrupt(void
145 +void stop_apics(void)
147 + /* By resetting the APICs we disable the NMI watchdog */
150 + * Stop all CPUs and turn off local APICs and the IO-APIC, so
151 + * other OSs see a clean IRQ state.
155 + disable_local_APIC();
157 +#if defined(CONFIG_X86_IO_APIC)
158 + if (smp_found_config) {
162 + disconnect_bsp_APIC();
166 * This initializes the IO-APIC and APIC hardware if this is
168 --- linux-2.6.0-test1/arch/i386/kernel/dmi_scan.c~kexec-2.6.0-full 2003-07-13 21:32:44.000000000 -0600
169 +++ linux-2.6.0-test1-braam/arch/i386/kernel/dmi_scan.c 2003-07-22 00:54:04.000000000 -0600
170 @@ -222,31 +222,6 @@ static __init int set_bios_reboot(struct
175 - * Some machines require the "reboot=s" commandline option, this quirk makes that automatic.
177 -static __init int set_smp_reboot(struct dmi_blacklist *d)
180 - extern int reboot_smp;
181 - if (reboot_smp == 0)
184 - printk(KERN_INFO "%s series board detected. Selecting SMP-method for reboots.\n", d->ident);
191 - * Some machines require the "reboot=b,s" commandline option, this quirk makes that automatic.
193 -static __init int set_smp_bios_reboot(struct dmi_blacklist *d)
196 - set_bios_reboot(d);
201 * Some bioses have a broken protected mode poweroff and need to use realmode
202 @@ -527,7 +502,7 @@ static __initdata struct dmi_blacklist d
203 MATCH(DMI_BIOS_VERSION, "4.60 PGMA"),
204 MATCH(DMI_BIOS_DATE, "134526184"), NO_MATCH
206 - { set_smp_bios_reboot, "Dell PowerEdge 1300", { /* Handle problems with rebooting on Dell 1300's */
207 + { set_bios_reboot, "Dell PowerEdge 1300", { /* Handle problems with rebooting on Dell 1300's */
208 MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
209 MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"),
211 --- linux-2.6.0-test1/arch/i386/kernel/entry.S~kexec-2.6.0-full 2003-07-22 00:46:03.000000000 -0600
212 +++ linux-2.6.0-test1-braam/arch/i386/kernel/entry.S 2003-07-22 00:54:27.000000000 -0600
213 @@ -905,5 +905,6 @@ ENTRY(sys_call_table)
214 .long sys_tgkill /* 270 */
217 + .long sys_kexec_load
219 nr_syscalls=(.-sys_call_table)/4
220 --- linux-2.6.0-test1/arch/i386/kernel/i8259.c~kexec-2.6.0-full 2003-07-13 21:38:03.000000000 -0600
221 +++ linux-2.6.0-test1-braam/arch/i386/kernel/i8259.c 2003-07-22 00:54:04.000000000 -0600
222 @@ -244,9 +244,21 @@ static int i8259A_resume(struct sys_devi
226 +static int i8259A_shutdown(struct sys_device *dev)
228 + /* Put the i8259A into a quiescent state that
229 + * the kernel initialization code can get it
232 + outb(0xff, 0x21); /* mask all of 8259A-1 */
233 + outb(0xff, 0xA1); /* mask all of 8259A-2 */
237 static struct sysdev_class i8259_sysdev_class = {
238 set_kset_name("i8259"),
239 .resume = i8259A_resume,
240 + .shutdown = i8259A_shutdown,
243 static struct sys_device device_i8259A = {
244 --- linux-2.6.0-test1/arch/i386/kernel/io_apic.c~kexec-2.6.0-full 2003-07-22 00:46:03.000000000 -0600
245 +++ linux-2.6.0-test1-braam/arch/i386/kernel/io_apic.c 2003-07-22 00:54:04.000000000 -0600
246 @@ -1601,8 +1601,6 @@ void disable_IO_APIC(void)
247 * Clear the IO-APIC before rebooting:
251 - disconnect_bsp_APIC();
255 --- /dev/null 2003-01-30 03:24:37.000000000 -0700
256 +++ linux-2.6.0-test1-braam/arch/i386/kernel/machine_kexec.c 2003-07-22 00:54:04.000000000 -0600
258 +#include <linux/config.h>
259 +#include <linux/mm.h>
260 +#include <linux/kexec.h>
261 +#include <linux/delay.h>
262 +#include <asm/pgtable.h>
263 +#include <asm/pgalloc.h>
264 +#include <asm/tlbflush.h>
265 +#include <asm/mmu_context.h>
267 +#include <asm/apic.h>
272 + * =======================
276 +static void set_idt(void *newidt, __u16 limit)
278 + unsigned char curidt[6];
280 + /* ia32 supports unaligned loads & stores */
281 + (*(__u16 *)(curidt)) = limit;
282 + (*(__u32 *)(curidt +2)) = (unsigned long)(newidt);
284 + __asm__ __volatile__ (
291 +static void set_gdt(void *newgdt, __u16 limit)
293 + unsigned char curgdt[6];
295 + /* ia32 supports unaligned loads & stores */
296 + (*(__u16 *)(curgdt)) = limit;
297 + (*(__u32 *)(curgdt +2)) = (unsigned long)(newgdt);
299 + __asm__ __volatile__ (
305 +static void load_segments(void)
308 +#define STR(X) __STR(X)
310 + __asm__ __volatile__ (
311 + "\tljmp $"STR(__KERNEL_CS)",$1f\n"
313 + "\tmovl $"STR(__KERNEL_DS)",%eax\n"
314 + "\tmovl %eax,%ds\n"
315 + "\tmovl %eax,%es\n"
316 + "\tmovl %eax,%fs\n"
317 + "\tmovl %eax,%gs\n"
318 + "\tmovl %eax,%ss\n"
324 +typedef void (*relocate_new_kernel_t)(
325 + unsigned long indirection_page, unsigned long reboot_code_buffer,
326 + unsigned long start_address);
328 +const extern unsigned char relocate_new_kernel[];
329 +extern void relocate_new_kernel_end(void);
330 +const extern unsigned int relocate_new_kernel_size;
331 +extern void use_mm(struct mm_struct *mm);
333 +void machine_kexec(struct kimage *image)
335 + unsigned long indirection_page;
336 + unsigned long reboot_code_buffer;
337 + relocate_new_kernel_t rnk;
339 + /* switch to an mm where the reboot_code_buffer is identity mapped */
343 + /* Interrupts aren't acceptable while we reboot */
344 + local_irq_disable();
345 + reboot_code_buffer = page_to_pfn(image->reboot_code_pages) << PAGE_SHIFT;
346 + indirection_page = image->head & PAGE_MASK;
349 + memcpy((void *)reboot_code_buffer, relocate_new_kernel, relocate_new_kernel_size);
351 + /* The segment registers are funny things: they are
352 + * automatically loaded from a table in memory whenever you
353 + * set them to a specific selector, but this table is never
354 + * accessed again until you set the segment to a different selector.
356 + * The more common model is caches, where the behind
357 + * the scenes work is done, but is also dropped at arbitrary
360 + * I take advantage of this here by force loading the
361 + * segments, before I zap the gdt with an invalid value.
364 + /* The gdt & idt are now invalid.
365 + * If you want to load them you must set up your own idt & gdt.
367 + set_gdt(phys_to_virt(0),0);
368 + set_idt(phys_to_virt(0),0);
371 + rnk = (relocate_new_kernel_t) reboot_code_buffer;
372 + (*rnk)(indirection_page, reboot_code_buffer, image->start);
374 --- linux-2.6.0-test1/arch/i386/kernel/reboot.c~kexec-2.6.0-full 2003-07-22 00:46:03.000000000 -0600
375 +++ linux-2.6.0-test1-braam/arch/i386/kernel/reboot.c 2003-07-22 00:55:22.000000000 -0600
377 #include <linux/interrupt.h>
378 #include <linux/mc146818rtc.h>
379 #include <asm/uaccess.h>
380 +#include <asm/apic.h>
381 #include "mach_reboot.h"
384 @@ -20,8 +21,7 @@ static int reboot_mode;
385 int reboot_thru_bios;
389 -static int reboot_cpu = -1;
390 +int reboot_cpu = -1; /* specifies the internal linux cpu id, not the apicid */
391 /* shamelessly grabbed from lib/vsprintf.c for readability */
392 #define is_digit(c) ((c) >= '0' && (c) <= '9')
394 @@ -43,7 +43,6 @@ static int __init reboot_setup(char *str
397 case 's': /* "smp" reboot by executing reset on BSP or other CPU*/
399 if (is_digit(*(str+1))) {
400 reboot_cpu = (int) (*(str+1) - '0');
401 if (is_digit(*(str+2)))
402 @@ -215,42 +214,7 @@ void machine_real_restart(unsigned char
404 void machine_restart(char * __unused)
409 - cpuid = GET_APIC_ID(apic_read(APIC_ID));
413 - /* check to see if reboot_cpu is valid
414 - if its not, default to the BSP */
415 - if ((reboot_cpu == -1) ||
416 - (reboot_cpu > (NR_CPUS -1)) ||
417 - !cpu_isset(cpuid, phys_cpu_present_map))
418 - reboot_cpu = boot_cpu_physical_apicid;
420 - reboot_smp = 0; /* use this as a flag to only go through this once*/
421 - /* re-run this function on the other CPUs
422 - it will fall though this section since we have
423 - cleared reboot_smp, and do the reboot if it is the
424 - correct CPU, otherwise it halts. */
425 - if (reboot_cpu != cpuid)
426 - smp_call_function((void *)machine_restart , NULL, 1, 0);
429 - /* if reboot_cpu is still -1, then we want a tradional reboot,
430 - and if we are not running on the reboot_cpu,, halt */
431 - if ((reboot_cpu != -1) && (cpuid != reboot_cpu)) {
433 - __asm__ __volatile__ ("hlt");
436 - * Stop all CPUs and turn off local APICs and the IO-APIC, so
437 - * other OSs see a clean IRQ state.
444 if(!reboot_thru_bios) {
445 /* rebooting needs to touch the page at absolute addr 0 */
446 @@ -268,10 +232,12 @@ void machine_restart(char * __unused)
448 void machine_halt(void)
453 void machine_power_off(void)
459 --- /dev/null 2003-01-30 03:24:37.000000000 -0700
460 +++ linux-2.6.0-test1-braam/arch/i386/kernel/relocate_kernel.S 2003-07-22 00:54:04.000000000 -0600
462 +#include <linux/config.h>
463 +#include <linux/linkage.h>
465 + /* Must be relocatable PIC code callable as a C function, that once
466 + * it starts cannot use the previous process's stack.
469 + .globl relocate_new_kernel
470 +relocate_new_kernel:
471 + /* read the arguments and say goodbye to the stack */
472 + movl 4(%esp), %ebx /* indirection_page */
473 + movl 8(%esp), %ebp /* reboot_code_buffer */
474 + movl 12(%esp), %edx /* start address */
476 + /* zero out flags, and disable interrupts */
480 + /* set a new stack at the bottom of our page... */
481 + lea 4096(%ebp), %esp
483 + /* store the parameters back on the stack */
484 + pushl %edx /* store the start address */
486 + /* Set cr0 to a known state:
487 + * 31 0 == Paging disabled
488 + * 18 0 == Alignment check disabled
489 + * 16 0 == Write protect disabled
490 + * 3 0 == No task switch
491 + * 2 0 == Don't do FP software emulation.
492 + * 0 1 == Protected mode enabled
495 + andl $~((1<<31)|(1<<18)|(1<<16)|(1<<3)|(1<<2)), %eax
499 + /* Set cr4 to a known state:
500 + * Setting everything to zero seems safe.
509 + /* Flush the TLB (needed?) */
513 + /* Do the copies */
515 +0: /* top, read another word for the indirection page */
519 + testl $0x1, %ecx /* is it a destination page */
522 + andl $0xfffff000, %edi
525 + testl $0x2, %ecx /* is it an indirection page */
528 + andl $0xfffff000, %ebx
531 + testl $0x4, %ecx /* is it the done indicator */
535 + testl $0x8, %ecx /* is it the source indicator */
536 + jz 0b /* Ignore it otherwise */
537 + movl %ecx, %esi /* For every source page do a copy */
538 + andl $0xfffff000, %esi
546 + /* To be certain of avoiding problems with self modifying code
547 + * I need to execute a serializing instruction here.
548 + * So I flush the TLB, it's handy, and not processor dependent.
553 + /* set all of the registers to known values */
554 + /* leave %esp alone */
564 +relocate_new_kernel_end:
566 + .globl relocate_new_kernel_size
567 +relocate_new_kernel_size:
568 + .long relocate_new_kernel_end - relocate_new_kernel
569 --- linux-2.6.0-test1/arch/i386/kernel/smp.c~kexec-2.6.0-full 2003-07-22 00:52:14.000000000 -0600
570 +++ linux-2.6.0-test1-braam/arch/i386/kernel/smp.c 2003-07-22 00:54:04.000000000 -0600
571 @@ -587,6 +587,30 @@ void stop_this_cpu (void * dummy)
573 void smp_send_stop(void)
575 + extern int reboot_cpu;
578 + /* The boot cpu is always logical cpu 0 */
581 + /* See if there has been given a command line override.
583 + if ((reboot_cpu != -1) && !(reboot_cpu >= NR_CPUS) &&
584 + test_bit(reboot_cpu, &cpu_online_map)) {
585 + reboot_cpu_id = reboot_cpu;
588 + /* Make certain the cpu I'm rebooting on is online */
589 + if (!test_bit(reboot_cpu_id, &cpu_online_map)) {
590 + reboot_cpu_id = smp_processor_id();
593 + /* Make certain I only run on the appropriate processor */
594 + set_cpus_allowed(current, 1 << reboot_cpu_id);
596 + /* O.k. Now that I'm on the appropriate processor stop
597 + * all of the others.
599 smp_call_function(stop_this_cpu, NULL, 1, 0);
602 --- linux-2.6.0-test1/include/asm-i386/apic.h~kexec-2.6.0-full 2003-07-13 21:38:53.000000000 -0600
603 +++ linux-2.6.0-test1-braam/include/asm-i386/apic.h 2003-07-22 00:54:04.000000000 -0600
604 @@ -97,6 +97,9 @@ extern unsigned int nmi_watchdog;
605 #define NMI_LOCAL_APIC 2
606 #define NMI_INVALID 3
608 +extern void stop_apics(void);
610 +static inline void stop_apics(void) { }
611 #endif /* CONFIG_X86_LOCAL_APIC */
613 #endif /* __ASM_APIC_H */
614 --- linux-2.6.0-test1/include/asm-i386/apicdef.h~kexec-2.6.0-full 2003-07-13 21:34:40.000000000 -0600
615 +++ linux-2.6.0-test1-braam/include/asm-i386/apicdef.h 2003-07-22 00:54:04.000000000 -0600
617 #define APIC_LVT_REMOTE_IRR (1<<14)
618 #define APIC_INPUT_POLARITY (1<<13)
619 #define APIC_SEND_PENDING (1<<12)
620 +#define APIC_MODE_MASK 0x700
621 #define GET_APIC_DELIVERY_MODE(x) (((x)>>8)&0x7)
622 #define SET_APIC_DELIVERY_MODE(x,y) (((x)&~0x700)|((y)<<8))
623 #define APIC_MODE_FIXED 0x0
624 --- /dev/null 2003-01-30 03:24:37.000000000 -0700
625 +++ linux-2.6.0-test1-braam/include/asm-i386/kexec.h 2003-07-22 00:54:04.000000000 -0600
627 +#ifndef _I386_KEXEC_H
628 +#define _I386_KEXEC_H
630 +#include <asm/fixmap.h>
633 + * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
634 + * I.e. Maximum page that is mapped directly into kernel memory,
635 + * and kmap is not required.
637 + * Someone correct me if FIXADDR_START - PAGEOFFSET is not the correct
638 + * calculation for the amount of memory directly mappable into the
639 + * kernel memory space.
642 +/* Maximum physical address we can use pages from */
643 +#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
644 +/* Maximum address we can reach in physical address mode */
645 +#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
647 +#define KEXEC_REBOOT_CODE_SIZE 4096
649 +#endif /* _I386_KEXEC_H */
650 --- linux-2.6.0-test1/include/asm-i386/unistd.h~kexec-2.6.0-full 2003-07-22 00:46:07.000000000 -0600
651 +++ linux-2.6.0-test1-braam/include/asm-i386/unistd.h 2003-07-22 00:55:57.000000000 -0600
653 #define __NR_tgkill 270
654 #define __NR_utimes 271
655 #define __NR_mknod64 272
657 -#define NR_syscalls 273
658 +#define __NR_sys_kexec_load 273
660 +#define NR_syscalls 274
662 /* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
664 --- /dev/null 2003-01-30 03:24:37.000000000 -0700
665 +++ linux-2.6.0-test1-braam/include/linux/kexec.h 2003-07-22 00:54:04.000000000 -0600
667 +#ifndef LINUX_KEXEC_H
668 +#define LINUX_KEXEC_H
671 +#include <linux/types.h>
672 +#include <linux/list.h>
673 +#include <asm/kexec.h>
676 + * This structure is used to hold the arguments that are used when loading
680 +typedef unsigned long kimage_entry_t;
681 +#define IND_DESTINATION 0x1
682 +#define IND_INDIRECTION 0x2
683 +#define IND_DONE 0x4
684 +#define IND_SOURCE 0x8
686 +#define KEXEC_SEGMENT_MAX 8
687 +struct kexec_segment {
695 + kimage_entry_t head;
696 + kimage_entry_t *entry;
697 + kimage_entry_t *last_entry;
699 + unsigned long destination;
700 + unsigned long offset;
702 + unsigned long start;
703 + struct page *reboot_code_pages;
705 + unsigned long nr_segments;
706 + struct kexec_segment segment[KEXEC_SEGMENT_MAX+1];
708 + struct list_head dest_pages;
709 + struct list_head unuseable_pages;
713 +/* kexec interface functions */
714 +extern void machine_kexec(struct kimage *image);
715 +extern asmlinkage long sys_kexec(unsigned long entry, long nr_segments,
716 + struct kexec_segment *segments);
717 +extern struct kimage *kexec_image;
719 +#endif /* LINUX_KEXEC_H */
721 --- linux-2.6.0-test1/include/linux/reboot.h~kexec-2.6.0-full 2003-07-13 21:39:35.000000000 -0600
722 +++ linux-2.6.0-test1-braam/include/linux/reboot.h 2003-07-22 00:54:04.000000000 -0600
724 * POWER_OFF Stop OS and remove all power from system, if possible.
725 * RESTART2 Restart system using given command string.
726 * SW_SUSPEND Suspend system using Software Suspend if compiled in
727 + * KEXEC Restart the system using a different kernel.
730 #define LINUX_REBOOT_CMD_RESTART 0x01234567
732 #define LINUX_REBOOT_CMD_POWER_OFF 0x4321FEDC
733 #define LINUX_REBOOT_CMD_RESTART2 0xA1B2C3D4
734 #define LINUX_REBOOT_CMD_SW_SUSPEND 0xD000FCE2
735 +#define LINUX_REBOOT_CMD_KEXEC 0x45584543
739 --- linux-2.6.0-test1/kernel/Makefile~kexec-2.6.0-full 2003-07-22 00:46:07.000000000 -0600
740 +++ linux-2.6.0-test1-braam/kernel/Makefile 2003-07-22 00:54:04.000000000 -0600
741 @@ -19,6 +19,7 @@ obj-$(CONFIG_PM) += pm.o
742 obj-$(CONFIG_CPU_FREQ) += cpufreq.o
743 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
744 obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o
745 +obj-$(CONFIG_KEXEC) += kexec.o
746 obj-$(CONFIG_COMPAT) += compat.o
748 ifneq ($(CONFIG_IA64),y)
749 --- /dev/null 2003-01-30 03:24:37.000000000 -0700
750 +++ linux-2.6.0-test1-braam/kernel/kexec.c 2003-07-22 00:54:04.000000000 -0600
752 +#include <linux/mm.h>
753 +#include <linux/file.h>
754 +#include <linux/slab.h>
755 +#include <linux/fs.h>
756 +#include <linux/version.h>
757 +#include <linux/compile.h>
758 +#include <linux/kexec.h>
759 +#include <linux/spinlock.h>
760 +#include <linux/list.h>
761 +#include <linux/highmem.h>
762 +#include <net/checksum.h>
763 +#include <asm/page.h>
764 +#include <asm/uaccess.h>
766 +#include <asm/system.h>
768 +/* When kexec transitions to the new kernel there is a one to one
769 + * mapping between physical and virtual addresses. On processors
770 + * where you can disable the MMU this is trivial, and easy. For
771 + * others it is still a simple predictable page table to setup.
773 + * In that environment kexec copies the new kernel to its final
774 + * resting place. This means I can only support memory whose
775 + * physical address can fit in an unsigned long. In particular
776 + * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
777 + * If the assembly stub has more restrictive requirements
778 + * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DESTINATION_MEMORY_LIMIT can be
779 + * defined more restrictively in <asm/kexec.h>.
781 + * The code for the transition from the current kernel to the
782 + * new kernel is placed in the reboot_code_buffer, whose size
783 + * is given by KEXEC_REBOOT_CODE_SIZE. In the best case only a single
784 + * page of memory is necessary, but some architectures require more.
785 + * Because this memory must be identity mapped in the transition from
786 + * virtual to physical addresses it must live in the range
787 + * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
790 + * The assembly stub in the reboot code buffer is passed a linked list
791 + * of descriptor pages detailing the source pages of the new kernel,
792 + * and the destination addresses of those source pages. As this data
793 + * structure is not used in the context of the current OS, it must
794 + * be self contained.
796 + * The code has been made to work with highmem pages and will use a
797 + * destination page in its final resting place (if it happens
798 + * to allocate it). The end product of this is that most of the
799 + * physical address space, and most of ram can be used.
801 + * Future directions include:
802 + * - allocating a page table with the reboot code buffer identity
803 + * mapped, to simplify machine_kexec and make kexec_on_panic, more
805 + * - allocating the pages for a page table for machines that cannot
806 + * disable their MMUs. (Hammer, Alpha...)
809 +/* KIMAGE_NO_DEST is an impossible destination address..., for
810 + * allocating pages whose destination address we do not care about.
812 +#define KIMAGE_NO_DEST (-1UL)
814 +static int kimage_is_destination_range(
815 + struct kimage *image, unsigned long start, unsigned long end);
816 +static struct page *kimage_alloc_reboot_code_pages(struct kimage *image);
817 +static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long dest);
820 +static int kimage_alloc(struct kimage **rimage,
821 + unsigned long nr_segments, struct kexec_segment *segments)
824 + struct kimage *image;
825 + size_t segment_bytes;
826 + struct page *reboot_pages;
829 + /* Allocate a controlling structure */
831 + image = kmalloc(sizeof(*image), GFP_KERNEL);
835 + memset(image, 0, sizeof(*image));
837 + image->entry = &image->head;
838 + image->last_entry = &image->head;
840 + /* Initialize the list of destination pages */
841 + INIT_LIST_HEAD(&image->dest_pages);
843 + /* Initialize the list of unuseable pages */
844 + INIT_LIST_HEAD(&image->unuseable_pages);
846 + /* Read in the segments */
847 + image->nr_segments = nr_segments;
848 + segment_bytes = nr_segments * sizeof*segments;
849 + result = copy_from_user(image->segment, segments, segment_bytes);
853 + /* Verify we have good destination addresses. The caller is
854 + * responsible for making certain we don't attempt to load
855 + * the new image into invalid or reserved areas of RAM. This
856 + * just verifies it is an address we can use.
858 + result = -EADDRNOTAVAIL;
859 + for(i = 0; i < nr_segments; i++) {
860 + unsigned long mend;
861 + mend = ((unsigned long)(image->segment[i].mem)) +
862 + image->segment[i].memsz;
863 + if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
867 + /* Find a location for the reboot code buffer, and add it to
868 + * the vector of segments so that its pages will also be
869 + * counted as destination pages.
872 + reboot_pages = kimage_alloc_reboot_code_pages(image);
873 + if (!reboot_pages) {
874 + printk(KERN_ERR "Could not allocate reboot_code_buffer\n");
877 + image->reboot_code_pages = reboot_pages;
878 + image->segment[nr_segments].buf = 0;
879 + image->segment[nr_segments].bufsz = 0;
880 + image->segment[nr_segments].mem = (void *)(page_to_pfn(reboot_pages) << PAGE_SHIFT);
881 + image->segment[nr_segments].memsz = KEXEC_REBOOT_CODE_SIZE;
882 + image->nr_segments++;
894 +static int kimage_is_destination_range(
895 + struct kimage *image, unsigned long start, unsigned long end)
898 + for(i = 0; i < image->nr_segments; i++) {
899 + unsigned long mstart, mend;
900 + mstart = (unsigned long)image->segment[i].mem;
901 + mend = mstart + image->segment[i].memsz;
902 + if ((end > mstart) && (start < mend)) {
910 +static int identity_map_pages(struct page *pages, int order)
912 + struct mm_struct *mm;
913 + struct vm_area_struct *vma;
918 + down_write(&mm->mmap_sem);
920 + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
925 + memset(vma, 0, sizeof(vma));
927 + vma->vm_start = page_to_pfn(pages) << PAGE_SHIFT;
928 + vma->vm_end = vma->vm_start + (1 << (order + PAGE_SHIFT));
930 + vma->vm_flags = VM_SHARED \
931 + | VM_READ | VM_WRITE | VM_EXEC \
932 + | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC \
933 + | VM_DONTCOPY | VM_RESERVED;
934 + vma->vm_page_prot = protection_map[vma->vm_flags & 0xf];
935 + vma->vm_file = NULL;
936 + vma->vm_private_data = NULL;
937 + INIT_LIST_HEAD(&vma->shared);
938 + insert_vm_struct(mm, vma);
940 + error = remap_page_range(vma, vma->vm_start, vma->vm_start,
941 + vma->vm_end - vma->vm_start, vma->vm_page_prot);
948 + if (error && vma) {
949 + kmem_cache_free(vm_area_cachep, vma);
952 + up_write(&mm->mmap_sem);
957 +#define identity_map_pages(pages, order) 0
960 +struct page *kimage_alloc_reboot_code_pages(struct kimage *image)
962 + /* The reboot code buffer is special. It is the only set of
963 + * pages that must be allocated in their final resting place,
964 + * and the only set of pages whose final resting place we can
967 + * At worst this runs in O(N) of the image size.
969 + struct list_head extra_pages, *pos, *next;
970 + struct page *pages;
971 + unsigned long addr;
973 + order = get_order(KEXEC_REBOOT_CODE_SIZE);
974 + count = 1 << order;
975 + INIT_LIST_HEAD(&extra_pages);
978 + pages = alloc_pages(GFP_HIGHUSER, order);
981 + for(i = 0; i < count; i++) {
982 + SetPageReserved(pages +i);
984 + addr = page_to_pfn(pages) << PAGE_SHIFT;
985 + if ((page_to_pfn(pages) >= (TASK_SIZE >> PAGE_SHIFT)) ||
986 + kimage_is_destination_range(image, addr, addr + KEXEC_REBOOT_CODE_SIZE)) {
987 + list_add(&pages->list, &extra_pages);
993 + result = identity_map_pages(pages, order);
995 + list_add(&pages->list, &extra_pages);
999 + /* If I could convert a multi page allocation into a bunch of
1000 + * single page allocations I could add these pages to
1001 + * image->dest_pages. For now it is simpler to just free the
1004 + list_for_each_safe(pos, next, &extra_pages) {
1005 + struct page *page;
1007 + page = list_entry(pos, struct page, list);
1008 + for(i = 0; i < count; i++) {
1009 + ClearPageReserved(pages +i);
1011 + list_del(&extra_pages);
1012 + __free_pages(page, order);
1017 +static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
1019 + if (image->offset != 0) {
1022 + if (image->entry == image->last_entry) {
1023 + kimage_entry_t *ind_page;
1024 + struct page *page;
1025 + page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
1029 + ind_page = page_address(page);
1030 + *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
1031 + image->entry = ind_page;
1032 + image->last_entry =
1033 + ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
1035 + *image->entry = entry;
1037 + image->offset = 0;
1041 +static int kimage_set_destination(
1042 + struct kimage *image, unsigned long destination)
1045 + destination &= PAGE_MASK;
1046 + result = kimage_add_entry(image, destination | IND_DESTINATION);
1047 + if (result == 0) {
1048 + image->destination = destination;
1054 +static int kimage_add_page(struct kimage *image, unsigned long page)
1057 + page &= PAGE_MASK;
1058 + result = kimage_add_entry(image, page | IND_SOURCE);
1059 + if (result == 0) {
1060 + image->destination += PAGE_SIZE;
1066 +static void kimage_free_extra_pages(struct kimage *image)
1068 + /* Walk through and free any extra destination pages I may have */
1069 + struct list_head *pos, *next;
1070 + list_for_each_safe(pos, next, &image->dest_pages) {
1071 + struct page *page;
1072 + page = list_entry(pos, struct page, list);
1073 + list_del(&page->list);
1074 + ClearPageReserved(page);
1075 + __free_page(page);
1077 + /* Walk through and free any unuseable pages I have cached */
1078 + list_for_each_safe(pos, next, &image->unuseable_pages) {
1079 + struct page *page;
1080 + page = list_entry(pos, struct page, list);
1081 + list_del(&page->list);
1082 + ClearPageReserved(page);
1083 + __free_page(page);
1087 +static int kimage_terminate(struct kimage *image)
1090 + result = kimage_add_entry(image, IND_DONE);
1091 + if (result == 0) {
1092 + /* Point at the terminating element */
1094 + kimage_free_extra_pages(image);
1099 +#define for_each_kimage_entry(image, ptr, entry) \
1100 + for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
1101 + ptr = (entry & IND_INDIRECTION)? \
1102 + phys_to_virt((entry & PAGE_MASK)): ptr +1)
1104 +static void kimage_free(struct kimage *image)
1106 + kimage_entry_t *ptr, entry;
1107 + kimage_entry_t ind = 0;
1108 + int i, count, order;
1111 + kimage_free_extra_pages(image);
1112 + for_each_kimage_entry(image, ptr, entry) {
1113 + if (entry & IND_INDIRECTION) {
1114 + /* Free the previous indirection page */
1115 + if (ind & IND_INDIRECTION) {
1116 + free_page((unsigned long)phys_to_virt(ind & PAGE_MASK));
1118 + /* Save this indirection page until we are
1123 + else if (entry & IND_SOURCE) {
1124 + free_page((unsigned long)phys_to_virt(entry & PAGE_MASK));
1127 + order = get_order(KEXEC_REBOOT_CODE_SIZE);
1128 + count = 1 << order;
1129 + do_munmap(&init_mm,
1130 + page_to_pfn(image->reboot_code_pages) << PAGE_SHIFT,
1131 + count << PAGE_SHIFT);
1132 + for(i = 0; i < count; i++) {
1133 + ClearPageReserved(image->reboot_code_pages + i);
1135 + __free_pages(image->reboot_code_pages, order);
1139 +static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page)
1141 + kimage_entry_t *ptr, entry;
1142 + unsigned long destination = 0;
1143 + for_each_kimage_entry(image, ptr, entry) {
1144 + if (entry & IND_DESTINATION) {
1145 + destination = entry & PAGE_MASK;
1147 + else if (entry & IND_SOURCE) {
1148 + if (page == destination) {
1151 + destination += PAGE_SIZE;
1157 +static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long destination)
1159 + /* Here we implement safeguards to ensure that a source page
1160 + * is not copied to its destination page before the data on
1161 + * the destination page is no longer useful.
1163 + * To do this we maintain the invariant that a source page is
1164 + * either its own destination page, or it is not a
1165 + * destination page at all.
1167 + * That is slightly stronger than required, but the proof
1168 + * that no problems will occur is trivial, and the
1169 + * implementation is simple to verify.
1171 + * When allocating all pages normally this algorithm will run
1172 + * in O(N) time, but in the worst case it will run in O(N^2)
1173 + * time. If the runtime is a problem the data structures can
1176 + struct page *page;
1177 + unsigned long addr;
1179 + /* Walk through the list of destination pages, and see if I
1182 + list_for_each_entry(page, &image->dest_pages, list) {
1183 + addr = page_to_pfn(page) << PAGE_SHIFT;
1184 + if (addr == destination) {
1185 + list_del(&page->list);
1191 + kimage_entry_t *old;
1192 + /* Allocate a page, if we run out of memory give up */
1193 + page = alloc_page(gfp_mask);
1197 + SetPageReserved(page);
1198 + /* If the page cannot be used file it away */
1199 + if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
1200 + list_add(&page->list, &image->unuseable_pages);
1203 + addr = page_to_pfn(page) << PAGE_SHIFT;
1205 + /* If it is the destination page we want use it */
1206 + if (addr == destination)
1209 + /* If the page is not a destination page use it */
1210 + if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE))
1213 + /* I know that the page is someone's destination page.
1214 + * See if there is already a source page for this
1215 + * destination page, and if so swap the source pages.
1217 + old = kimage_dst_used(image, addr);
1219 + /* If so move it */
1220 + unsigned long old_addr;
1221 + struct page *old_page;
1223 + old_addr = *old & PAGE_MASK;
1224 + old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
1225 + copy_highpage(page, old_page);
1226 + *old = addr | (*old & ~PAGE_MASK);
1228 + /* The old page I have found cannot be a
1229 + * destination page, so return it.
1236 + /* Place the page on the destination list; I
1237 + * will use it later.
1239 + list_add(&page->list, &image->dest_pages);
1245 +static int kimage_load_segment(struct kimage *image,
1246 + struct kexec_segment *segment)
1248 + unsigned long mstart;
1250 + unsigned long offset;
1251 + unsigned long offset_end;
1252 + unsigned char *buf;
1255 + buf = segment->buf;
1256 + mstart = (unsigned long)segment->mem;
1258 + offset_end = segment->memsz;
1260 + result = kimage_set_destination(image, mstart);
1264 + for(offset = 0; offset < segment->memsz; offset += PAGE_SIZE) {
1265 + struct page *page;
1267 + size_t size, leader;
1268 + page = kimage_alloc_page(image, GFP_HIGHUSER, mstart + offset);
1273 + result = kimage_add_page(image, page_to_pfn(page) << PAGE_SHIFT);
1278 + if (segment->bufsz < offset) {
1279 + /* We are past the end, zero the whole page */
1280 + memset(ptr, 0, PAGE_SIZE);
1286 + if ((offset == 0)) {
1287 + leader = mstart & ~PAGE_MASK;
1290 + /* We are on the first page, zero the unused portion */
1291 + memset(ptr, 0, leader);
1295 + if (size > (segment->bufsz - offset)) {
1296 + size = segment->bufsz - offset;
1298 + if (size < (PAGE_SIZE - leader)) {
1299 + /* zero the trailing part of the page */
1300 + memset(ptr + size, 0, (PAGE_SIZE - leader) - size);
1302 + result = copy_from_user(ptr, buf + offset, size);
1305 + result = (result < 0)?result : -EIO;
1314 + * Exec Kernel system call: for obvious reasons only root may call it.
1316 + * This call breaks up into three pieces.
1317 + * - A generic part which loads the new kernel from the current
1318 + * address space, and very carefully places the data in the
1319 + * allocated pages.
1321 + * - A generic part that interacts with the kernel and tells all of
1322 + * the devices to shut down, preventing ongoing DMAs and placing
1323 + * the devices in a consistent state so a later kernel can
1324 + * reinitialize them.
1326 + * - A machine specific part that includes the syscall number
1327 + * and then copies the image to its final destination and
1328 + * jumps into the image at entry.
1330 + * kexec does not sync or unmount filesystems, so if you need
1331 + * that to happen you need to do that yourself.
1333 +struct kimage *kexec_image = 0;
1335 +asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
1336 + struct kexec_segment *segments, unsigned long flags)
1338 + struct kimage *image;
1341 + /* We only trust the superuser with rebooting the system. */
1342 + if (!capable(CAP_SYS_ADMIN))
1345 + /* In case we need just a little bit of special behavior for
1351 + if (nr_segments > KEXEC_SEGMENT_MAX)
1356 + if (nr_segments > 0) {
1358 + result = kimage_alloc(&image, nr_segments, segments);
1362 + image->start = entry;
1363 + for(i = 0; i < nr_segments; i++) {
1364 + result = kimage_load_segment(image, &segments[i]);
1369 + result = kimage_terminate(image);
1375 + image = xchg(&kexec_image, image);
1378 + kimage_free(image);
1381 --- linux-2.6.0-test1/kernel/sys.c~kexec-2.6.0-full 2003-07-22 00:46:07.000000000 -0600
1382 +++ linux-2.6.0-test1-braam/kernel/sys.c 2003-07-22 00:54:04.000000000 -0600
1384 #include <linux/init.h>
1385 #include <linux/highuid.h>
1386 #include <linux/fs.h>
1387 +#include <linux/kexec.h>
1388 #include <linux/workqueue.h>
1389 #include <linux/device.h>
1390 #include <linux/times.h>
1391 @@ -208,6 +209,7 @@ cond_syscall(sys_acct)
1392 cond_syscall(sys_lookup_dcookie)
1393 cond_syscall(sys_swapon)
1394 cond_syscall(sys_swapoff)
1395 +cond_syscall(sys_kexec_load)
1396 cond_syscall(sys_init_module)
1397 cond_syscall(sys_delete_module)
1398 cond_syscall(sys_socketpair)
1399 @@ -454,6 +456,27 @@ asmlinkage long sys_reboot(int magic1, i
1400 machine_restart(buffer);
1403 +#ifdef CONFIG_KEXEC
1404 + case LINUX_REBOOT_CMD_KEXEC:
1406 + struct kimage *image;
1411 + image = xchg(&kexec_image, 0);
1416 + notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
1417 + system_running = 0;
1418 + device_shutdown();
1419 + printk(KERN_EMERG "Starting new kernel\n");
1420 + machine_kexec(image);
1424 #ifdef CONFIG_SOFTWARE_SUSPEND
1425 case LINUX_REBOOT_CMD_SW_SUSPEND:
1426 if (!software_suspend_enabled) {