3 --- linux-2.6.0-test1/MAINTAINERS~kexec-2.6.0-full 2003-07-23 12:08:43.000000000 +0800
4 +++ linux-2.6.0-test1-root/MAINTAINERS 2003-07-23 12:08:54.000000000 +0800
5 @@ -1095,6 +1095,17 @@ W: http://nfs.sourceforge.net/
6 W: http://www.cse.unsw.edu.au/~neilb/patches/linux-devel/
11 +M: ebiederm@xmission.com
12 +M: ebiederman@lnxi.com
13 +W: http://www.xmission.com/~ebiederm/files/kexec/
16 +W: http://www.osdl.org/archive/andyp/bloom/Code/Linux/Kexec/
17 +L: linux-kernel@vger.kernel.org
20 LANMEDIA WAN CARD DRIVER
21 P: Andrew Stanley-Jones
23 --- linux-2.6.0-test1/arch/i386/Kconfig~kexec-2.6.0-full 2003-07-23 12:08:52.000000000 +0800
24 +++ linux-2.6.0-test1-root/arch/i386/Kconfig 2003-07-23 12:08:54.000000000 +0800
25 @@ -804,6 +804,23 @@ config BOOT_IOREMAP
26 depends on ((X86_SUMMIT || X86_GENERICARCH) && NUMA)
30 + bool "kexec system call (EXPERIMENTAL)"
31 + depends on EXPERIMENTAL
33 + kexec is a system call that implements the ability to shutdown your
34 + current kernel, and to start another kernel. It is like a reboot
35 +but it is independent of the system firmware. And like a reboot
36 + you can start any kernel with it not just Linux.
38 + The name comes from the similarity to the exec system call.
40 + It is an ongoing process to be certain the hardware in a machine
41 + is properly shutdown, so do not be surprised if this code does not
42 + initially work for you. It may help to enable device hotplugging
43 + support. As of this writing the exact hardware interface is
44 + strongly in flux, so no good recommendation can be made.
49 --- linux-2.6.0-test1/arch/i386/defconfig~kexec-2.6.0-full 2003-07-14 11:35:57.000000000 +0800
50 +++ linux-2.6.0-test1-root/arch/i386/defconfig 2003-07-23 12:08:54.000000000 +0800
51 @@ -72,6 +72,7 @@ CONFIG_SMP=y
52 CONFIG_X86_LOCAL_APIC=y
57 # CONFIG_X86_MCE_NONFATAL is not set
58 CONFIG_X86_MCE_P4THERMAL=y
59 --- linux-2.6.0-test1/arch/i386/kernel/Makefile~kexec-2.6.0-full 2003-07-23 12:08:31.000000000 +0800
60 +++ linux-2.6.0-test1-root/arch/i386/kernel/Makefile 2003-07-23 12:08:54.000000000 +0800
61 @@ -25,6 +25,7 @@ obj-$(CONFIG_X86_TRAMPOLINE) += trampoli
62 obj-$(CONFIG_X86_MPPARSE) += mpparse.o
63 obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
64 obj-$(CONFIG_X86_IO_APIC) += io_apic.o
65 +obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o
66 obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o suspend_asm.o
67 obj-$(CONFIG_X86_NUMAQ) += numaq.o
68 obj-$(CONFIG_X86_SUMMIT) += summit.o
69 --- linux-2.6.0-test1/arch/i386/kernel/apic.c~kexec-2.6.0-full 2003-07-23 12:08:30.000000000 +0800
70 +++ linux-2.6.0-test1-root/arch/i386/kernel/apic.c 2003-07-23 12:08:54.000000000 +0800
72 #include <linux/mc146818rtc.h>
73 #include <linux/kernel_stat.h>
74 #include <linux/sysdev.h>
75 +#include <linux/reboot.h>
77 #include <asm/atomic.h>
79 @@ -175,6 +176,39 @@ void disconnect_bsp_APIC(void)
85 + /* Go back to Virtual Wire compatibility mode */
86 + unsigned long value;
88 + /* For the spurious interrupt use vector F, and enable it */
89 + value = apic_read(APIC_SPIV);
90 + value &= ~APIC_VECTOR_MASK;
91 + value |= APIC_SPIV_APIC_ENABLED;
93 + apic_write_around(APIC_SPIV, value);
95 + /* For LVT0 make it edge triggered, active high, external and enabled */
96 + value = apic_read(APIC_LVT0);
97 + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
98 + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
99 + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
100 + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
101 + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXINT);
102 + apic_write_around(APIC_LVT0, value);
104 + /* For LVT1 make it edge triggered, active high, nmi and enabled */
105 + value = apic_read(APIC_LVT1);
107 + APIC_MODE_MASK | APIC_SEND_PENDING |
108 + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
109 + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
110 + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
111 + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
112 + apic_write_around(APIC_LVT1, value);
114 +#endif /* CONFIG_KEXEC */
118 void disable_local_APIC(void)
119 @@ -1115,6 +1149,26 @@ asmlinkage void smp_error_interrupt(void
123 +void stop_apics(void)
125 + /* By resetting the APIC's we disable the nmi watchdog */
128 + * Stop all CPUs and turn off local APICs and the IO-APIC, so
129 + * other OSs see a clean IRQ state.
133 + disable_local_APIC();
135 +#if defined(CONFIG_X86_IO_APIC)
136 + if (smp_found_config) {
140 + disconnect_bsp_APIC();
144 * This initializes the IO-APIC and APIC hardware if this is
146 --- linux-2.6.0-test1/arch/i386/kernel/dmi_scan.c~kexec-2.6.0-full 2003-07-14 11:32:44.000000000 +0800
147 +++ linux-2.6.0-test1-root/arch/i386/kernel/dmi_scan.c 2003-07-23 12:08:54.000000000 +0800
148 @@ -222,31 +222,6 @@ static __init int set_bios_reboot(struct
153 - * Some machines require the "reboot=s" commandline option, this quirk makes that automatic.
155 -static __init int set_smp_reboot(struct dmi_blacklist *d)
158 - extern int reboot_smp;
159 - if (reboot_smp == 0)
162 - printk(KERN_INFO "%s series board detected. Selecting SMP-method for reboots.\n", d->ident);
169 - * Some machines require the "reboot=b,s" commandline option, this quirk makes that automatic.
171 -static __init int set_smp_bios_reboot(struct dmi_blacklist *d)
174 - set_bios_reboot(d);
179 * Some bioses have a broken protected mode poweroff and need to use realmode
180 @@ -527,7 +502,7 @@ static __initdata struct dmi_blacklist d
181 MATCH(DMI_BIOS_VERSION, "4.60 PGMA"),
182 MATCH(DMI_BIOS_DATE, "134526184"), NO_MATCH
184 - { set_smp_bios_reboot, "Dell PowerEdge 1300", { /* Handle problems with rebooting on Dell 1300's */
185 + { set_bios_reboot, "Dell PowerEdge 1300", { /* Handle problems with rebooting on Dell 1300's */
186 MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
187 MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"),
189 --- linux-2.6.0-test1/arch/i386/kernel/entry.S~kexec-2.6.0-full 2003-07-23 12:08:30.000000000 +0800
190 +++ linux-2.6.0-test1-root/arch/i386/kernel/entry.S 2003-07-23 12:08:54.000000000 +0800
191 @@ -905,5 +905,6 @@ ENTRY(sys_call_table)
192 .long sys_tgkill /* 270 */
195 + .long sys_kexec_load
197 nr_syscalls=(.-sys_call_table)/4
198 --- linux-2.6.0-test1/arch/i386/kernel/i8259.c~kexec-2.6.0-full 2003-07-14 11:38:03.000000000 +0800
199 +++ linux-2.6.0-test1-root/arch/i386/kernel/i8259.c 2003-07-23 12:08:54.000000000 +0800
200 @@ -244,9 +244,21 @@ static int i8259A_resume(struct sys_devi
204 +static int i8259A_shutdown(struct sys_device *dev)
206 + /* Put the i8259A into a quiescent state that
207 + * the kernel initialization code can get it
210 + outb(0xff, 0x21); /* mask all of 8259A-1 */
211 + outb(0xff, 0xA1); /* mask all of 8259A-2 */
215 static struct sysdev_class i8259_sysdev_class = {
216 set_kset_name("i8259"),
217 .resume = i8259A_resume,
218 + .shutdown = i8259A_shutdown,
221 static struct sys_device device_i8259A = {
222 --- linux-2.6.0-test1/arch/i386/kernel/io_apic.c~kexec-2.6.0-full 2003-07-23 12:08:30.000000000 +0800
223 +++ linux-2.6.0-test1-root/arch/i386/kernel/io_apic.c 2003-07-23 12:08:54.000000000 +0800
224 @@ -1601,8 +1601,6 @@ void disable_IO_APIC(void)
225 * Clear the IO-APIC before rebooting:
229 - disconnect_bsp_APIC();
233 --- /dev/null 2002-08-31 07:31:37.000000000 +0800
234 +++ linux-2.6.0-test1-root/arch/i386/kernel/machine_kexec.c 2003-07-23 12:08:54.000000000 +0800
236 +#include <linux/config.h>
237 +#include <linux/mm.h>
238 +#include <linux/kexec.h>
239 +#include <linux/delay.h>
240 +#include <asm/pgtable.h>
241 +#include <asm/pgalloc.h>
242 +#include <asm/tlbflush.h>
243 +#include <asm/mmu_context.h>
245 +#include <asm/apic.h>
250 + * =======================
254 +static void set_idt(void *newidt, __u16 limit)
256 + unsigned char curidt[6];
258 + /* ia32 supports unaligned loads & stores */
259 + (*(__u16 *)(curidt)) = limit;
260 + (*(__u32 *)(curidt +2)) = (unsigned long)(newidt);
262 + __asm__ __volatile__ (
269 +static void set_gdt(void *newgdt, __u16 limit)
271 + unsigned char curgdt[6];
273 + /* ia32 supports unaligned loads & stores */
274 + (*(__u16 *)(curgdt)) = limit;
275 + (*(__u32 *)(curgdt +2)) = (unsigned long)(newgdt);
277 + __asm__ __volatile__ (
283 +static void load_segments(void)
286 +#define STR(X) __STR(X)
288 + __asm__ __volatile__ (
289 + "\tljmp $"STR(__KERNEL_CS)",$1f\n"
291 + "\tmovl $"STR(__KERNEL_DS)",%eax\n"
292 + "\tmovl %eax,%ds\n"
293 + "\tmovl %eax,%es\n"
294 + "\tmovl %eax,%fs\n"
295 + "\tmovl %eax,%gs\n"
296 + "\tmovl %eax,%ss\n"
302 +typedef void (*relocate_new_kernel_t)(
303 + unsigned long indirection_page, unsigned long reboot_code_buffer,
304 + unsigned long start_address);
306 +const extern unsigned char relocate_new_kernel[];
307 +extern void relocate_new_kernel_end(void);
308 +const extern unsigned int relocate_new_kernel_size;
309 +extern void use_mm(struct mm_struct *mm);
311 +void machine_kexec(struct kimage *image)
313 + unsigned long indirection_page;
314 + unsigned long reboot_code_buffer;
315 + relocate_new_kernel_t rnk;
317 + /* switch to an mm where the reboot_code_buffer is identity mapped */
321 + /* Interrupts aren't acceptable while we reboot */
322 + local_irq_disable();
323 + reboot_code_buffer = page_to_pfn(image->reboot_code_pages) << PAGE_SHIFT;
324 + indirection_page = image->head & PAGE_MASK;
327 + memcpy((void *)reboot_code_buffer, relocate_new_kernel, relocate_new_kernel_size);
329 + /* The segment registers are funny things, they are
330 + * automatically loaded from a table, in memory wherever you
331 + * set them to a specific selector, but this table is never
332 + * accessed again unless you set the segment to a different selector.
334 + * The more common model is a cache where the behind
335 + * the scenes work is done, but is also dropped at arbitrary
338 + * I take advantage of this here by force loading the
339 + * segments, before I zap the gdt with an invalid value.
342 + /* The gdt & idt are now invalid.
343 + * If you want to load them you must set up your own idt & gdt.
345 + set_gdt(phys_to_virt(0),0);
346 + set_idt(phys_to_virt(0),0);
349 + rnk = (relocate_new_kernel_t) reboot_code_buffer;
350 + (*rnk)(indirection_page, reboot_code_buffer, image->start);
352 --- linux-2.6.0-test1/arch/i386/kernel/reboot.c~kexec-2.6.0-full 2003-07-23 12:08:31.000000000 +0800
353 +++ linux-2.6.0-test1-root/arch/i386/kernel/reboot.c 2003-07-23 12:08:54.000000000 +0800
355 #include <linux/interrupt.h>
356 #include <linux/mc146818rtc.h>
357 #include <asm/uaccess.h>
358 +#include <asm/apic.h>
359 #include "mach_reboot.h"
362 @@ -20,8 +21,7 @@ static int reboot_mode;
363 int reboot_thru_bios;
367 -static int reboot_cpu = -1;
368 +int reboot_cpu = -1; /* specifies the internal linux cpu id, not the apicid */
369 /* shamelessly grabbed from lib/vsprintf.c for readability */
370 #define is_digit(c) ((c) >= '0' && (c) <= '9')
372 @@ -43,7 +43,6 @@ static int __init reboot_setup(char *str
375 case 's': /* "smp" reboot by executing reset on BSP or other CPU*/
377 if (is_digit(*(str+1))) {
378 reboot_cpu = (int) (*(str+1) - '0');
379 if (is_digit(*(str+2)))
380 @@ -215,42 +214,7 @@ void machine_real_restart(unsigned char
382 void machine_restart(char * __unused)
387 - cpuid = GET_APIC_ID(apic_read(APIC_ID));
391 - /* check to see if reboot_cpu is valid
392 - if its not, default to the BSP */
393 - if ((reboot_cpu == -1) ||
394 - (reboot_cpu > (NR_CPUS -1)) ||
395 - !cpu_isset(cpuid, phys_cpu_present_map))
396 - reboot_cpu = boot_cpu_physical_apicid;
398 - reboot_smp = 0; /* use this as a flag to only go through this once*/
399 - /* re-run this function on the other CPUs
400 - it will fall though this section since we have
401 - cleared reboot_smp, and do the reboot if it is the
402 - correct CPU, otherwise it halts. */
403 - if (reboot_cpu != cpuid)
404 - smp_call_function((void *)machine_restart , NULL, 1, 0);
407 - /* if reboot_cpu is still -1, then we want a tradional reboot,
408 - and if we are not running on the reboot_cpu,, halt */
409 - if ((reboot_cpu != -1) && (cpuid != reboot_cpu)) {
411 - __asm__ __volatile__ ("hlt");
414 - * Stop all CPUs and turn off local APICs and the IO-APIC, so
415 - * other OSs see a clean IRQ state.
422 if(!reboot_thru_bios) {
423 /* rebooting needs to touch the page at absolute addr 0 */
424 @@ -268,10 +232,12 @@ void machine_restart(char * __unused)
426 void machine_halt(void)
431 void machine_power_off(void)
437 --- /dev/null 2002-08-31 07:31:37.000000000 +0800
438 +++ linux-2.6.0-test1-root/arch/i386/kernel/relocate_kernel.S 2003-07-23 12:08:54.000000000 +0800
440 +#include <linux/config.h>
441 +#include <linux/linkage.h>
443 + /* Must be relocatable PIC code callable as a C function, that once
444 + * it starts can not use the previous process's stack.
447 + .globl relocate_new_kernel
448 +relocate_new_kernel:
449 + /* read the arguments and say goodbye to the stack */
450 + movl 4(%esp), %ebx /* indirection_page */
451 + movl 8(%esp), %ebp /* reboot_code_buffer */
452 + movl 12(%esp), %edx /* start address */
454 + /* zero out flags, and disable interrupts */
458 + /* set a new stack at the bottom of our page... */
459 + lea 4096(%ebp), %esp
461 + /* store the parameters back on the stack */
462 + pushl %edx /* store the start address */
464 + /* Set cr0 to a known state:
465 + * 31 0 == Paging disabled
466 + * 18 0 == Alignment check disabled
467 + * 16 0 == Write protect disabled
468 + * 3 0 == No task switch
469 + * 2 0 == Don't do FP software emulation.
470 + * 0 1 == Protected mode enabled
473 + andl $~((1<<31)|(1<<18)|(1<<16)|(1<<3)|(1<<2)), %eax
477 + /* Set cr4 to a known state:
478 + * Setting everything to zero seems safe.
487 + /* Flush the TLB (needed?) */
491 + /* Do the copies */
493 +0: /* top, read another word for the indirection page */
497 + testl $0x1, %ecx /* is it a destination page */
500 + andl $0xfffff000, %edi
503 + testl $0x2, %ecx /* is it an indirection page */
506 + andl $0xfffff000, %ebx
509 + testl $0x4, %ecx /* is it the done indicator */
513 + testl $0x8, %ecx /* is it the source indicator */
514 + jz 0b /* Ignore it otherwise */
515 + movl %ecx, %esi /* For every source page do a copy */
516 + andl $0xfffff000, %esi
524 + /* To be certain of avoiding problems with self modifying code
525 + * I need to execute a serializing instruction here.
526 + * So I flush the TLB, it's handy, and not processor dependent.
531 + /* set all of the registers to known values */
532 + /* leave %esp alone */
542 +relocate_new_kernel_end:
544 + .globl relocate_new_kernel_size
545 +relocate_new_kernel_size:
546 + .long relocate_new_kernel_end - relocate_new_kernel
547 --- linux-2.6.0-test1/arch/i386/kernel/smp.c~kexec-2.6.0-full 2003-07-23 12:08:52.000000000 +0800
548 +++ linux-2.6.0-test1-root/arch/i386/kernel/smp.c 2003-07-23 12:53:45.000000000 +0800
549 @@ -587,6 +587,30 @@ void stop_this_cpu (void * dummy)
551 void smp_send_stop(void)
553 + extern int reboot_cpu;
556 + /* The boot cpu is always logical cpu 0 */
559 + /* See if there has been given a command line override.
561 + if ((reboot_cpu != -1) && !(reboot_cpu >= NR_CPUS) &&
562 + test_bit(reboot_cpu, &cpu_online_map)) {
563 + reboot_cpu_id = reboot_cpu;
566 + /* Make certain the cpu I'm rebooting on is online */
567 + if (!test_bit(reboot_cpu_id, &cpu_online_map)) {
568 + reboot_cpu_id = smp_processor_id();
571 + /* Make certain I only run on the appropriate processor */
572 + set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
574 + /* O.k. Now that I'm on the appropriate processor stop
575 + * all of the others.
577 smp_call_function(stop_this_cpu, NULL, 1, 0);
580 --- linux-2.6.0-test1/include/asm-i386/apic.h~kexec-2.6.0-full 2003-07-14 11:38:53.000000000 +0800
581 +++ linux-2.6.0-test1-root/include/asm-i386/apic.h 2003-07-23 12:08:54.000000000 +0800
582 @@ -97,6 +97,9 @@ extern unsigned int nmi_watchdog;
583 #define NMI_LOCAL_APIC 2
584 #define NMI_INVALID 3
586 +extern void stop_apics(void);
588 +static inline void stop_apics(void) { }
589 #endif /* CONFIG_X86_LOCAL_APIC */
591 #endif /* __ASM_APIC_H */
592 --- linux-2.6.0-test1/include/asm-i386/apicdef.h~kexec-2.6.0-full 2003-07-14 11:34:40.000000000 +0800
593 +++ linux-2.6.0-test1-root/include/asm-i386/apicdef.h 2003-07-23 12:08:54.000000000 +0800
595 #define APIC_LVT_REMOTE_IRR (1<<14)
596 #define APIC_INPUT_POLARITY (1<<13)
597 #define APIC_SEND_PENDING (1<<12)
598 +#define APIC_MODE_MASK 0x700
599 #define GET_APIC_DELIVERY_MODE(x) (((x)>>8)&0x7)
600 #define SET_APIC_DELIVERY_MODE(x,y) (((x)&~0x700)|((y)<<8))
601 #define APIC_MODE_FIXED 0x0
602 --- /dev/null 2002-08-31 07:31:37.000000000 +0800
603 +++ linux-2.6.0-test1-root/include/asm-i386/kexec.h 2003-07-23 12:08:54.000000000 +0800
605 +#ifndef _I386_KEXEC_H
606 +#define _I386_KEXEC_H
608 +#include <asm/fixmap.h>
611 + * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
612 + * I.e. Maximum page that is mapped directly into kernel memory,
613 + * and kmap is not required.
615 + * Someone correct me if FIXADDR_START - PAGEOFFSET is not the correct
616 + * calculation for the amount of memory directly mappable into the
617 + * kernel memory space.
620 +/* Maximum physical address we can use pages from */
621 +#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
622 +/* Maximum address we can reach in physical address mode */
623 +#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
625 +#define KEXEC_REBOOT_CODE_SIZE 4096
627 +#endif /* _I386_KEXEC_H */
628 --- linux-2.6.0-test1/include/asm-i386/unistd.h~kexec-2.6.0-full 2003-07-23 12:08:42.000000000 +0800
629 +++ linux-2.6.0-test1-root/include/asm-i386/unistd.h 2003-07-23 12:08:54.000000000 +0800
631 #define __NR_tgkill 270
632 #define __NR_utimes 271
633 #define __NR_mknod64 272
635 -#define NR_syscalls 273
636 +#define __NR_sys_kexec_load 273
638 +#define NR_syscalls 274
640 /* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
642 --- /dev/null 2002-08-31 07:31:37.000000000 +0800
643 +++ linux-2.6.0-test1-root/include/linux/kexec.h 2003-07-23 12:08:54.000000000 +0800
645 +#ifndef LINUX_KEXEC_H
646 +#define LINUX_KEXEC_H
649 +#include <linux/types.h>
650 +#include <linux/list.h>
651 +#include <asm/kexec.h>
654 + * This structure is used to hold the arguments that are used when loading
658 +typedef unsigned long kimage_entry_t;
659 +#define IND_DESTINATION 0x1
660 +#define IND_INDIRECTION 0x2
661 +#define IND_DONE 0x4
662 +#define IND_SOURCE 0x8
664 +#define KEXEC_SEGMENT_MAX 8
665 +struct kexec_segment {
673 + kimage_entry_t head;
674 + kimage_entry_t *entry;
675 + kimage_entry_t *last_entry;
677 + unsigned long destination;
678 + unsigned long offset;
680 + unsigned long start;
681 + struct page *reboot_code_pages;
683 + unsigned long nr_segments;
684 + struct kexec_segment segment[KEXEC_SEGMENT_MAX+1];
686 + struct list_head dest_pages;
687 + struct list_head unuseable_pages;
691 +/* kexec interface functions */
692 +extern void machine_kexec(struct kimage *image);
693 +extern asmlinkage long sys_kexec(unsigned long entry, long nr_segments,
694 + struct kexec_segment *segments);
695 +extern struct kimage *kexec_image;
697 +#endif /* LINUX_KEXEC_H */
699 --- linux-2.6.0-test1/include/linux/reboot.h~kexec-2.6.0-full 2003-07-14 11:39:35.000000000 +0800
700 +++ linux-2.6.0-test1-root/include/linux/reboot.h 2003-07-23 12:08:54.000000000 +0800
702 * POWER_OFF Stop OS and remove all power from system, if possible.
703 * RESTART2 Restart system using given command string.
704 * SW_SUSPEND Suspend system using Software Suspend if compiled in
705 + * KEXEC Restart the system using a different kernel.
708 #define LINUX_REBOOT_CMD_RESTART 0x01234567
710 #define LINUX_REBOOT_CMD_POWER_OFF 0x4321FEDC
711 #define LINUX_REBOOT_CMD_RESTART2 0xA1B2C3D4
712 #define LINUX_REBOOT_CMD_SW_SUSPEND 0xD000FCE2
713 +#define LINUX_REBOOT_CMD_KEXEC 0x45584543
717 --- linux-2.6.0-test1/kernel/Makefile~kexec-2.6.0-full 2003-07-23 12:08:43.000000000 +0800
718 +++ linux-2.6.0-test1-root/kernel/Makefile 2003-07-23 12:08:54.000000000 +0800
719 @@ -19,6 +19,7 @@ obj-$(CONFIG_PM) += pm.o
720 obj-$(CONFIG_CPU_FREQ) += cpufreq.o
721 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
722 obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o
723 +obj-$(CONFIG_KEXEC) += kexec.o
724 obj-$(CONFIG_COMPAT) += compat.o
726 ifneq ($(CONFIG_IA64),y)
727 --- /dev/null 2002-08-31 07:31:37.000000000 +0800
728 +++ linux-2.6.0-test1-root/kernel/kexec.c 2003-07-23 12:08:54.000000000 +0800
730 +#include <linux/mm.h>
731 +#include <linux/file.h>
732 +#include <linux/slab.h>
733 +#include <linux/fs.h>
734 +#include <linux/version.h>
735 +#include <linux/compile.h>
736 +#include <linux/kexec.h>
737 +#include <linux/spinlock.h>
738 +#include <linux/list.h>
739 +#include <linux/highmem.h>
740 +#include <net/checksum.h>
741 +#include <asm/page.h>
742 +#include <asm/uaccess.h>
744 +#include <asm/system.h>
746 +/* When kexec transitions to the new kernel there is a one to one
747 + * mapping between physical and virtual addresses. On processors
748 + * where you can disable the MMU this is trivial, and easy. For
749 + * others it is still a simple predictable page table to setup.
751 + * In that environment kexec copies the new kernel to its final
752 + * resting place. This means I can only support memory whose
753 + * physical address can fit in an unsigned long. In particular
754 + * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
755 + * If the assembly stub has more restrictive requirements
756 + * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
757 + * defined more restrictively in <asm/kexec.h>.
759 + * The code for the transition from the current kernel to the
760 + * the new kernel is placed in the reboot_code_buffer, whose size
761 + * is given by KEXEC_REBOOT_CODE_SIZE. In the best case only a single
762 + * page of memory is necessary, but some architectures require more.
763 + * Because this memory must be identity mapped in the transition from
764 + * virtual to physical addresses it must live in the range
765 + * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
768 + * The assembly stub in the reboot code buffer is passed a linked list
769 + * of descriptor pages detailing the source pages of the new kernel,
770 + * and the destination addresses of those source pages. As this data
771 + * structure is not used in the context of the current OS, it must
772 + * be self contained.
774 + * The code has been made to work with highmem pages and will use a
775 + * destination page in its final resting place (if it happens
776 + * to allocate it). The end product of this is that most of the
777 + * physical address space, and most of ram can be used.
779 + * Future directions include:
780 + * - allocating a page table with the reboot code buffer identity
781 + * mapped, to simplify machine_kexec and make kexec_on_panic, more
783 + * - allocating the pages for a page table for machines that cannot
784 + * disable their MMUs. (Hammer, Alpha...)
787 +/* KIMAGE_NO_DEST is an impossible destination address..., for
788 + * allocating pages whose destination address we do not care about.
790 +#define KIMAGE_NO_DEST (-1UL)
792 +static int kimage_is_destination_range(
793 + struct kimage *image, unsigned long start, unsigned long end);
794 +static struct page *kimage_alloc_reboot_code_pages(struct kimage *image);
795 +static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long dest);
798 +static int kimage_alloc(struct kimage **rimage,
799 + unsigned long nr_segments, struct kexec_segment *segments)
802 + struct kimage *image;
803 + size_t segment_bytes;
804 + struct page *reboot_pages;
807 + /* Allocate a controlling structure */
809 + image = kmalloc(sizeof(*image), GFP_KERNEL);
813 + memset(image, 0, sizeof(*image));
815 + image->entry = &image->head;
816 + image->last_entry = &image->head;
818 + /* Initialize the list of destination pages */
819 + INIT_LIST_HEAD(&image->dest_pages);
821 + /* Initialize the list of unuseable pages */
822 + INIT_LIST_HEAD(&image->unuseable_pages);
824 + /* Read in the segments */
825 + image->nr_segments = nr_segments;
826 + segment_bytes = nr_segments * sizeof*segments;
827 + result = copy_from_user(image->segment, segments, segment_bytes);
831 + /* Verify we have good destination addresses. The caller is
832 + * responsible for making certain we don't attempt to load
833 + * the new image into invalid or reserved areas of RAM. This
834 + * just verifies it is an address we can use.
836 + result = -EADDRNOTAVAIL;
837 + for(i = 0; i < nr_segments; i++) {
838 + unsigned long mend;
839 + mend = ((unsigned long)(image->segment[i].mem)) +
840 + image->segment[i].memsz;
841 + if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
845 + /* Find a location for the reboot code buffer, and add it
846 + * the vector of segments so that it's pages will also be
847 + * counted as destination pages.
850 + reboot_pages = kimage_alloc_reboot_code_pages(image);
851 + if (!reboot_pages) {
852 + printk(KERN_ERR "Could not allocate reboot_code_buffer\n");
855 + image->reboot_code_pages = reboot_pages;
856 + image->segment[nr_segments].buf = 0;
857 + image->segment[nr_segments].bufsz = 0;
858 + image->segment[nr_segments].mem = (void *)(page_to_pfn(reboot_pages) << PAGE_SHIFT);
859 + image->segment[nr_segments].memsz = KEXEC_REBOOT_CODE_SIZE;
860 + image->nr_segments++;
872 +static int kimage_is_destination_range(
873 + struct kimage *image, unsigned long start, unsigned long end)
876 + for(i = 0; i < image->nr_segments; i++) {
877 + unsigned long mstart, mend;
878 + mstart = (unsigned long)image->segment[i].mem;
879 + mend = mstart + image->segment[i].memsz;
880 + if ((end > mstart) && (start < mend)) {
888 +static int identity_map_pages(struct page *pages, int order)
890 + struct mm_struct *mm;
891 + struct vm_area_struct *vma;
896 + down_write(&mm->mmap_sem);
898 + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
903 + memset(vma, 0, sizeof(vma));
905 + vma->vm_start = page_to_pfn(pages) << PAGE_SHIFT;
906 + vma->vm_end = vma->vm_start + (1 << (order + PAGE_SHIFT));
908 + vma->vm_flags = VM_SHARED \
909 + | VM_READ | VM_WRITE | VM_EXEC \
910 + | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC \
911 + | VM_DONTCOPY | VM_RESERVED;
912 + vma->vm_page_prot = protection_map[vma->vm_flags & 0xf];
913 + vma->vm_file = NULL;
914 + vma->vm_private_data = NULL;
915 + INIT_LIST_HEAD(&vma->shared);
916 + insert_vm_struct(mm, vma);
918 + error = remap_page_range(vma, vma->vm_start, vma->vm_start,
919 + vma->vm_end - vma->vm_start, vma->vm_page_prot);
926 + if (error && vma) {
927 + kmem_cache_free(vm_area_cachep, vma);
930 + up_write(&mm->mmap_sem);
935 +#define identity_map_pages(pages, order) 0
938 +struct page *kimage_alloc_reboot_code_pages(struct kimage *image)
940 + /* The reboot code buffer is special. It is the only set of
941 + * pages that must be allocated in their final resting place,
942 + * and the only set of pages whose final resting place we can
945 + * At worst this runs in O(N) of the image size.
947 + struct list_head extra_pages, *pos, *next;
948 + struct page *pages;
949 + unsigned long addr;
951 + order = get_order(KEXEC_REBOOT_CODE_SIZE);
952 + count = 1 << order;
953 + INIT_LIST_HEAD(&extra_pages);
956 + pages = alloc_pages(GFP_HIGHUSER, order);
959 + for(i = 0; i < count; i++) {
960 + SetPageReserved(pages +i);
962 + addr = page_to_pfn(pages) << PAGE_SHIFT;
963 + if ((page_to_pfn(pages) >= (TASK_SIZE >> PAGE_SHIFT)) ||
964 + kimage_is_destination_range(image, addr, addr + KEXEC_REBOOT_CODE_SIZE)) {
965 + list_add(&pages->list, &extra_pages);
971 + result = identity_map_pages(pages, order);
973 + list_add(&pages->list, &extra_pages);
977 + /* If I could convert a multi page allocation into a bunch of
978 + * single page allocations I could add these pages to
979 + * image->dest_pages. For now it is simpler to just free the
982 + list_for_each_safe(pos, next, &extra_pages) {
985 + page = list_entry(pos, struct page, list);
986 + for(i = 0; i < count; i++) {
987 + ClearPageReserved(pages +i);
989 + list_del(&extra_pages);
990 + __free_pages(page, order);
995 +static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
997 + if (image->offset != 0) {
1000 + if (image->entry == image->last_entry) {
1001 + kimage_entry_t *ind_page;
1002 + struct page *page;
1003 + page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
1007 + ind_page = page_address(page);
1008 + *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
1009 + image->entry = ind_page;
1010 + image->last_entry =
1011 + ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
1013 + *image->entry = entry;
1015 + image->offset = 0;
1019 +static int kimage_set_destination(
1020 + struct kimage *image, unsigned long destination)
1023 + destination &= PAGE_MASK;
1024 + result = kimage_add_entry(image, destination | IND_DESTINATION);
1025 + if (result == 0) {
1026 + image->destination = destination;
1032 +static int kimage_add_page(struct kimage *image, unsigned long page)
1035 + page &= PAGE_MASK;
1036 + result = kimage_add_entry(image, page | IND_SOURCE);
1037 + if (result == 0) {
1038 + image->destination += PAGE_SIZE;
1044 +static void kimage_free_extra_pages(struct kimage *image)
1046 + /* Walk through and free any extra destination pages I may have */
1047 + struct list_head *pos, *next;
1048 + list_for_each_safe(pos, next, &image->dest_pages) {
1049 + struct page *page;
1050 + page = list_entry(pos, struct page, list);
1051 + list_del(&page->list);
1052 + ClearPageReserved(page);
1053 + __free_page(page);
1055 + /* Walk through and free any unuseable pages I have cached */
1056 + list_for_each_safe(pos, next, &image->unuseable_pages) {
1057 + struct page *page;
1058 + page = list_entry(pos, struct page, list);
1059 + list_del(&page->list);
1060 + ClearPageReserved(page);
1061 + __free_page(page);
1065 +static int kimage_terminate(struct kimage *image)
1068 + result = kimage_add_entry(image, IND_DONE);
1069 + if (result == 0) {
1070 + /* Point at the terminating element */
1072 + kimage_free_extra_pages(image);
1077 +#define for_each_kimage_entry(image, ptr, entry) \
1078 + for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
1079 + ptr = (entry & IND_INDIRECTION)? \
1080 + phys_to_virt((entry & PAGE_MASK)): ptr +1)
1082 +static void kimage_free(struct kimage *image)
1084 + kimage_entry_t *ptr, entry;
1085 + kimage_entry_t ind = 0;
1086 + int i, count, order;
1089 + kimage_free_extra_pages(image);
1090 + for_each_kimage_entry(image, ptr, entry) {
1091 + if (entry & IND_INDIRECTION) {
1092 + /* Free the previous indirection page */
1093 + if (ind & IND_INDIRECTION) {
1094 + free_page((unsigned long)phys_to_virt(ind & PAGE_MASK));
1096 + /* Save this indirection page until we are
1101 + else if (entry & IND_SOURCE) {
1102 + free_page((unsigned long)phys_to_virt(entry & PAGE_MASK));
1105 + order = get_order(KEXEC_REBOOT_CODE_SIZE);
1106 + count = 1 << order;
1107 + do_munmap(&init_mm,
1108 + page_to_pfn(image->reboot_code_pages) << PAGE_SHIFT,
1109 + count << PAGE_SHIFT);
1110 + for(i = 0; i < count; i++) {
1111 + ClearPageReserved(image->reboot_code_pages + i);
1113 + __free_pages(image->reboot_code_pages, order);
1117 +static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page)
1119 + kimage_entry_t *ptr, entry;
1120 + unsigned long destination = 0;
1121 + for_each_kimage_entry(image, ptr, entry) {
1122 + if (entry & IND_DESTINATION) {
1123 + destination = entry & PAGE_MASK;
1125 + else if (entry & IND_SOURCE) {
1126 + if (page == destination) {
1129 + destination += PAGE_SIZE;
1135 +static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long destination)
1137 + /* Here we implement safeguards to ensure that a source page
1138 + * is not copied to its destination page before the data on
1139 + * the destination page is no longer useful.
1141 + * To do this we maintain the invariant that a source page is
1142 +	 * either its own destination page, or it is not a
1143 + * destination page at all.
1145 + * That is slightly stronger than required, but the proof
1146 +	 * that no problems will occur is trivial, and the
1147 +	 * implementation is simple to verify.
1149 + * When allocating all pages normally this algorithm will run
1150 + * in O(N) time, but in the worst case it will run in O(N^2)
1151 + * time. If the runtime is a problem the data structures can
1154 + struct page *page;
1155 + unsigned long addr;
1157 + /* Walk through the list of destination pages, and see if I
1160 + list_for_each_entry(page, &image->dest_pages, list) {
1161 + addr = page_to_pfn(page) << PAGE_SHIFT;
1162 + if (addr == destination) {
1163 + list_del(&page->list);
1169 + kimage_entry_t *old;
1170 + /* Allocate a page, if we run out of memory give up */
1171 + page = alloc_page(gfp_mask);
1175 + SetPageReserved(page);
1176 +		/* If the page cannot be used, file it away */
1177 + if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
1178 + list_add(&page->list, &image->unuseable_pages);
1181 + addr = page_to_pfn(page) << PAGE_SHIFT;
1183 +		/* If it is the destination page we want, use it */
1184 + if (addr == destination)
1187 + /* If the page is not a destination page use it */
1188 + if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE))
1191 +		/* I know that the page is someone's destination page.
1192 + * See if there is already a source page for this
1193 + * destination page. And if so swap the source pages.
1195 + old = kimage_dst_used(image, addr);
1197 + /* If so move it */
1198 + unsigned long old_addr;
1199 + struct page *old_page;
1201 + old_addr = *old & PAGE_MASK;
1202 + old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
1203 + copy_highpage(page, old_page);
1204 + *old = addr | (*old & ~PAGE_MASK);
1206 + /* The old page I have found cannot be a
1207 + * destination page, so return it.
1214 + /* Place the page on the destination list I
1215 + * will use it later.
1217 + list_add(&page->list, &image->dest_pages);
1223 +static int kimage_load_segment(struct kimage *image,
1224 + struct kexec_segment *segment)
1226 + unsigned long mstart;
1228 + unsigned long offset;
1229 + unsigned long offset_end;
1230 + unsigned char *buf;
1233 + buf = segment->buf;
1234 + mstart = (unsigned long)segment->mem;
1236 + offset_end = segment->memsz;
1238 + result = kimage_set_destination(image, mstart);
1242 + for(offset = 0; offset < segment->memsz; offset += PAGE_SIZE) {
1243 + struct page *page;
1245 + size_t size, leader;
1246 + page = kimage_alloc_page(image, GFP_HIGHUSER, mstart + offset);
1251 + result = kimage_add_page(image, page_to_pfn(page) << PAGE_SHIFT);
1256 + if (segment->bufsz < offset) {
1257 + /* We are past the end zero the whole page */
1258 + memset(ptr, 0, PAGE_SIZE);
1264 + if ((offset == 0)) {
1265 + leader = mstart & ~PAGE_MASK;
1268 + /* We are on the first page zero the unused portion */
1269 + memset(ptr, 0, leader);
1273 + if (size > (segment->bufsz - offset)) {
1274 + size = segment->bufsz - offset;
1276 + if (size < (PAGE_SIZE - leader)) {
1277 + /* zero the trailing part of the page */
1278 + memset(ptr + size, 0, (PAGE_SIZE - leader) - size);
1280 + result = copy_from_user(ptr, buf + offset, size);
1283 + result = (result < 0)?result : -EIO;
1292 + * Exec Kernel system call: for obvious reasons only root may call it.
1294 + * This call breaks up into three pieces.
1295 + * - A generic part which loads the new kernel from the current
1296 + * address space, and very carefully places the data in the
1297 + * allocated pages.
1299 + * - A generic part that interacts with the kernel and tells all of
1300 + * the devices to shut down. Preventing on-going dmas, and placing
1301 + * the devices in a consistent state so a later kernel can
1302 + * reinitialize them.
1304 + * - A machine specific part that includes the syscall number
1305 + *   and then copies the image to its final destination, and
1306 + * jumps into the image at entry.
1308 + * kexec does not sync, or unmount filesystems so if you need
1309 + * that to happen you need to do that yourself.
1311 +struct kimage *kexec_image = 0;
1313 +asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
1314 + struct kexec_segment *segments, unsigned long flags)
1316 + struct kimage *image;
1319 + /* We only trust the superuser with rebooting the system. */
1320 + if (!capable(CAP_SYS_ADMIN))
1323 + /* In case we need just a little bit of special behavior for
1329 + if (nr_segments > KEXEC_SEGMENT_MAX)
1334 + if (nr_segments > 0) {
1336 + result = kimage_alloc(&image, nr_segments, segments);
1340 + image->start = entry;
1341 + for(i = 0; i < nr_segments; i++) {
1342 + result = kimage_load_segment(image, &segments[i]);
1347 + result = kimage_terminate(image);
1353 + image = xchg(&kexec_image, image);
1356 + kimage_free(image);
1359 --- linux-2.6.0-test1/kernel/sys.c~kexec-2.6.0-full 2003-07-23 12:08:43.000000000 +0800
1360 +++ linux-2.6.0-test1-root/kernel/sys.c 2003-07-23 12:08:54.000000000 +0800
1362 #include <linux/init.h>
1363 #include <linux/highuid.h>
1364 #include <linux/fs.h>
1365 +#include <linux/kexec.h>
1366 #include <linux/workqueue.h>
1367 #include <linux/device.h>
1368 #include <linux/times.h>
1369 @@ -208,6 +209,7 @@ cond_syscall(sys_acct)
1370 cond_syscall(sys_lookup_dcookie)
1371 cond_syscall(sys_swapon)
1372 cond_syscall(sys_swapoff)
1373 +cond_syscall(sys_kexec_load)
1374 cond_syscall(sys_init_module)
1375 cond_syscall(sys_delete_module)
1376 cond_syscall(sys_socketpair)
1377 @@ -454,6 +456,27 @@ asmlinkage long sys_reboot(int magic1, i
1378 machine_restart(buffer);
1381 +#ifdef CONFIG_KEXEC
1382 + case LINUX_REBOOT_CMD_KEXEC:
1384 + struct kimage *image;
1389 + image = xchg(&kexec_image, 0);
1394 + notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
1395 + system_running = 0;
1396 + device_shutdown();
1397 + printk(KERN_EMERG "Starting new kernel\n");
1398 + machine_kexec(image);
1402 #ifdef CONFIG_SOFTWARE_SUSPEND
1403 case LINUX_REBOOT_CMD_SW_SUSPEND:
1404 if (!software_suspend_enabled) {