3 Index: linux-2.6.0-test6/MAINTAINERS
4 ===================================================================
5 --- linux-2.6.0-test6.orig/MAINTAINERS 2003-10-07 16:08:42.000000000 +0800
6 +++ linux-2.6.0-test6/MAINTAINERS 2003-10-07 16:09:00.000000000 +0800
8 W: http://www.cse.unsw.edu.au/~neilb/patches/linux-devel/
13 +M: ebiederm@xmission.com
14 +M: ebiederman@lnxi.com
15 +W: http://www.xmission.com/~ebiederm/files/kexec/
18 +W: http://www.osdl.org/archive/andyp/bloom/Code/Linux/Kexec/
19 +L: linux-kernel@vger.kernel.org
22 LANMEDIA WAN CARD DRIVER
23 P: Andrew Stanley-Jones
25 Index: linux-2.6.0-test6/arch/i386/Kconfig
26 ===================================================================
27 --- linux-2.6.0-test6.orig/arch/i386/Kconfig 2003-10-07 16:08:59.000000000 +0800
28 +++ linux-2.6.0-test6/arch/i386/Kconfig 2003-10-07 16:09:00.000000000 +0800
30 # depends on (((X86_SUMMIT || X86_GENERICARCH) && NUMA)) || X86_GENERICARCH
34 + bool "kexec system call (EXPERIMENTAL)"
35 + depends on EXPERIMENTAL
37 + kexec is a system call that implements the ability to shutdown your
38 + current kernel, and to start another kernel. It is like a reboot
39 + but it is independent of the system firmware. And like a reboot
40 + you can start any kernel with it, not just Linux.
42 + The name comes from the similarity to the exec system call.
44 + It is an ongoing process to be certain the hardware in a machine
45 + is properly shut down, so do not be surprised if this code does not
46 + initially work for you. It may help to enable device hotplugging
47 + support. As of this writing the exact hardware interface is
48 + strongly in flux, so no good recommendation can be made.
53 Index: linux-2.6.0-test6/arch/i386/defconfig
54 ===================================================================
55 --- linux-2.6.0-test6.orig/arch/i386/defconfig 2003-10-07 15:47:25.000000000 +0800
56 +++ linux-2.6.0-test6/arch/i386/defconfig 2003-10-07 16:09:00.000000000 +0800
58 # CONFIG_HUGETLB_PAGE is not set
63 CONFIG_X86_LOCAL_APIC=y
65 Index: linux-2.6.0-test6/arch/i386/kernel/Makefile
66 ===================================================================
67 --- linux-2.6.0-test6.orig/arch/i386/kernel/Makefile 2003-10-07 16:08:34.000000000 +0800
68 +++ linux-2.6.0-test6/arch/i386/kernel/Makefile 2003-10-07 16:09:00.000000000 +0800
70 obj-$(CONFIG_X86_MPPARSE) += mpparse.o
71 obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
72 obj-$(CONFIG_X86_IO_APIC) += io_apic.o
73 +obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o
74 obj-$(CONFIG_X86_NUMAQ) += numaq.o
75 obj-$(CONFIG_X86_SUMMIT) += summit.o
76 obj-$(CONFIG_EDD) += edd.o
77 Index: linux-2.6.0-test6/arch/i386/kernel/apic.c
78 ===================================================================
79 --- linux-2.6.0-test6.orig/arch/i386/kernel/apic.c 2003-10-07 15:47:25.000000000 +0800
80 +++ linux-2.6.0-test6/arch/i386/kernel/apic.c 2003-10-07 16:09:01.000000000 +0800
82 #include <linux/mc146818rtc.h>
83 #include <linux/kernel_stat.h>
84 #include <linux/sysdev.h>
85 +#include <linux/reboot.h>
87 #include <asm/atomic.h>
95 + /* Go back to Virtual Wire compatibility mode */
96 + unsigned long value;
98 + /* For the spurious interrupt use vector F, and enable it */
99 + value = apic_read(APIC_SPIV);
100 + value &= ~APIC_VECTOR_MASK;
101 + value |= APIC_SPIV_APIC_ENABLED;
103 + apic_write_around(APIC_SPIV, value);
105 + /* For LVT0 make it edge triggered, active high, external and enabled */
106 + value = apic_read(APIC_LVT0);
107 + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
108 + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
109 + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
110 + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
111 + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXINT);
112 + apic_write_around(APIC_LVT0, value);
114 + /* For LVT1 make it edge triggered, active high, nmi and enabled */
115 + value = apic_read(APIC_LVT1);
117 + APIC_MODE_MASK | APIC_SEND_PENDING |
118 + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
119 + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
120 + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
121 + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
122 + apic_write_around(APIC_LVT1, value);
124 +#endif /* CONFIG_KEXEC */
128 void disable_local_APIC(void)
129 @@ -1147,6 +1181,26 @@
133 +void stop_apics(void)
135 + /* By resetting the APIC's we disable the nmi watchdog */
138 + * Stop all CPUs and turn off local APICs and the IO-APIC, so
139 + * other OSs see a clean IRQ state.
143 + disable_local_APIC();
145 +#if defined(CONFIG_X86_IO_APIC)
146 + if (smp_found_config) {
150 + disconnect_bsp_APIC();
154 * This initializes the IO-APIC and APIC hardware if this is
156 Index: linux-2.6.0-test6/arch/i386/kernel/dmi_scan.c
157 ===================================================================
158 --- linux-2.6.0-test6.orig/arch/i386/kernel/dmi_scan.c 2003-10-07 16:08:34.000000000 +0800
159 +++ linux-2.6.0-test6/arch/i386/kernel/dmi_scan.c 2003-10-07 16:09:01.000000000 +0800
165 - * Some machines require the "reboot=s" commandline option, this quirk makes that automatic.
167 -static __init int set_smp_reboot(struct dmi_blacklist *d)
170 - extern int reboot_smp;
171 - if (reboot_smp == 0)
174 - printk(KERN_INFO "%s series board detected. Selecting SMP-method for reboots.\n", d->ident);
181 - * Some machines require the "reboot=b,s" commandline option, this quirk makes that automatic.
183 -static __init int set_smp_bios_reboot(struct dmi_blacklist *d)
186 - set_bios_reboot(d);
191 * Some bioses have a broken protected mode poweroff and need to use realmode
193 MATCH(DMI_BIOS_VERSION, "4.60 PGMA"),
194 MATCH(DMI_BIOS_DATE, "134526184"), NO_MATCH
196 - { set_smp_bios_reboot, "Dell PowerEdge 1300", { /* Handle problems with rebooting on Dell 1300's */
197 + { set_bios_reboot, "Dell PowerEdge 1300", { /* Handle problems with rebooting on Dell 1300's */
198 MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
199 MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"),
201 Index: linux-2.6.0-test6/arch/i386/kernel/entry.S
202 ===================================================================
203 --- linux-2.6.0-test6.orig/arch/i386/kernel/entry.S 2003-10-07 16:08:34.000000000 +0800
204 +++ linux-2.6.0-test6/arch/i386/kernel/entry.S 2003-10-07 16:09:01.000000000 +0800
205 @@ -1046,6 +1046,7 @@
207 .long sys_fadvise64_64
208 .long sys_ni_syscall /* sys_vserver */
209 + .long sys_kexec_load
211 nr_syscalls=(.-sys_call_table)/4
213 Index: linux-2.6.0-test6/arch/i386/kernel/i8259.c
214 ===================================================================
215 --- linux-2.6.0-test6.orig/arch/i386/kernel/i8259.c 2003-10-07 16:08:34.000000000 +0800
216 +++ linux-2.6.0-test6/arch/i386/kernel/i8259.c 2003-10-07 16:09:01.000000000 +0800
221 +static int i8259A_shutdown(struct sys_device *dev)
223 + /* Put the i8259A into a quiescent state that
224 + * the kernel initialization code can get it
227 + outb(0xff, 0x21); /* mask all of 8259A-1 */
228 + outb(0xff, 0xA1); /* mask all of 8259A-2 */
232 static struct sysdev_class i8259_sysdev_class = {
233 set_kset_name("i8259"),
234 .resume = i8259A_resume,
235 + .shutdown = i8259A_shutdown,
238 static struct sys_device device_i8259A = {
239 Index: linux-2.6.0-test6/arch/i386/kernel/io_apic.c
240 ===================================================================
241 --- linux-2.6.0-test6.orig/arch/i386/kernel/io_apic.c 2003-10-07 16:08:34.000000000 +0800
242 +++ linux-2.6.0-test6/arch/i386/kernel/io_apic.c 2003-10-07 16:09:01.000000000 +0800
243 @@ -1629,8 +1629,6 @@
244 * Clear the IO-APIC before rebooting:
248 - disconnect_bsp_APIC();
252 Index: linux-2.6.0-test6/arch/i386/kernel/machine_kexec.c
253 ===================================================================
254 --- linux-2.6.0-test6.orig/arch/i386/kernel/machine_kexec.c 2003-10-07 16:09:00.000000000 +0800
255 +++ linux-2.6.0-test6/arch/i386/kernel/machine_kexec.c 2003-10-07 16:09:01.000000000 +0800
257 +#include <linux/config.h>
258 +#include <linux/mm.h>
259 +#include <linux/kexec.h>
260 +#include <linux/delay.h>
261 +#include <asm/pgtable.h>
262 +#include <asm/pgalloc.h>
263 +#include <asm/tlbflush.h>
264 +#include <asm/mmu_context.h>
266 +#include <asm/apic.h>
271 + * =======================
275 +static void set_idt(void *newidt, __u16 limit)
277 + unsigned char curidt[6];
279 + /* ia32 supports unaligned loads & stores */
280 + (*(__u16 *)(curidt)) = limit;
281 + (*(__u32 *)(curidt +2)) = (unsigned long)(newidt);
283 + __asm__ __volatile__ (
290 +static void set_gdt(void *newgdt, __u16 limit)
292 + unsigned char curgdt[6];
294 + /* ia32 supports unaligned loads & stores */
295 + (*(__u16 *)(curgdt)) = limit;
296 + (*(__u32 *)(curgdt +2)) = (unsigned long)(newgdt);
298 + __asm__ __volatile__ (
304 +static void load_segments(void)
307 +#define STR(X) __STR(X)
309 + __asm__ __volatile__ (
310 + "\tljmp $"STR(__KERNEL_CS)",$1f\n"
312 + "\tmovl $"STR(__KERNEL_DS)",%eax\n"
313 + "\tmovl %eax,%ds\n"
314 + "\tmovl %eax,%es\n"
315 + "\tmovl %eax,%fs\n"
316 + "\tmovl %eax,%gs\n"
317 + "\tmovl %eax,%ss\n"
323 +typedef void (*relocate_new_kernel_t)(
324 + unsigned long indirection_page, unsigned long reboot_code_buffer,
325 + unsigned long start_address);
327 +const extern unsigned char relocate_new_kernel[];
328 +extern void relocate_new_kernel_end(void);
329 +const extern unsigned int relocate_new_kernel_size;
330 +extern void use_mm(struct mm_struct *mm);
332 +void machine_kexec(struct kimage *image)
334 + unsigned long indirection_page;
335 + unsigned long reboot_code_buffer;
336 + relocate_new_kernel_t rnk;
338 + /* switch to an mm where the reboot_code_buffer is identity mapped */
342 + /* Interrupts aren't acceptable while we reboot */
343 + local_irq_disable();
344 + reboot_code_buffer = page_to_pfn(image->reboot_code_pages) << PAGE_SHIFT;
345 + indirection_page = image->head & PAGE_MASK;
348 + memcpy((void *)reboot_code_buffer, relocate_new_kernel, relocate_new_kernel_size);
350 + /* The segment registers are funny things, they are
351 + * automatically loaded from a table, in memory wherever you
352 + * set them to a specific selector, but this table is never
353 + * accessed again until you set the segment to a different selector.
355 + * The more common model is a cache where the behind-
356 + * the scenes work is done, but is also dropped at arbitrary
359 + * I take advantage of this here by force loading the
360 + * segments, before I zap the gdt with an invalid value.
363 + /* The gdt & idt are now invalid.
364 + * If you want to load them you must set up your own idt & gdt.
366 + set_gdt(phys_to_virt(0),0);
367 + set_idt(phys_to_virt(0),0);
370 + rnk = (relocate_new_kernel_t) reboot_code_buffer;
371 + (*rnk)(indirection_page, reboot_code_buffer, image->start);
373 Index: linux-2.6.0-test6/arch/i386/kernel/reboot.c
374 ===================================================================
375 --- linux-2.6.0-test6.orig/arch/i386/kernel/reboot.c 2003-10-07 16:08:34.000000000 +0800
376 +++ linux-2.6.0-test6/arch/i386/kernel/reboot.c 2003-10-07 16:09:01.000000000 +0800
378 int reboot_thru_bios;
382 -static int reboot_cpu = -1;
383 +int reboot_cpu = -1; /* specifies the internal linux cpu id, not the apicid */
384 /* shamelessly grabbed from lib/vsprintf.c for readability */
385 #define is_digit(c) ((c) >= '0' && (c) <= '9')
390 case 's': /* "smp" reboot by executing reset on BSP or other CPU*/
392 if (is_digit(*(str+1))) {
393 reboot_cpu = (int) (*(str+1) - '0');
394 if (is_digit(*(str+2)))
397 void machine_restart(char * __unused)
402 - cpuid = GET_APIC_ID(apic_read(APIC_ID));
406 - /* check to see if reboot_cpu is valid
407 - if its not, default to the BSP */
408 - if ((reboot_cpu == -1) ||
409 - (reboot_cpu > (NR_CPUS -1)) ||
410 - !physid_isset(cpuid, phys_cpu_present_map))
411 - reboot_cpu = boot_cpu_physical_apicid;
413 - reboot_smp = 0; /* use this as a flag to only go through this once*/
414 - /* re-run this function on the other CPUs
415 - it will fall though this section since we have
416 - cleared reboot_smp, and do the reboot if it is the
417 - correct CPU, otherwise it halts. */
418 - if (reboot_cpu != cpuid)
419 - smp_call_function((void *)machine_restart , NULL, 1, 0);
422 - /* if reboot_cpu is still -1, then we want a tradional reboot,
423 - and if we are not running on the reboot_cpu,, halt */
424 - if ((reboot_cpu != -1) && (cpuid != reboot_cpu)) {
426 - __asm__ __volatile__ ("hlt");
429 - * Stop all CPUs and turn off local APICs and the IO-APIC, so
430 - * other OSs see a clean IRQ state.
433 -#elif defined(CONFIG_X86_LOCAL_APIC)
434 - if (cpu_has_apic) {
435 - local_irq_disable();
436 - disable_local_APIC();
437 - local_irq_enable();
440 -#ifdef CONFIG_X86_IO_APIC
445 if (!reboot_thru_bios) {
447 efi.reset_system(EFI_RESET_COLD, EFI_SUCCESS, 0, 0);
448 @@ -284,11 +238,13 @@
451 void machine_halt(void)
457 void machine_power_off(void)
461 efi.reset_system(EFI_RESET_SHUTDOWN, EFI_SUCCESS, 0, 0);
463 Index: linux-2.6.0-test6/arch/i386/kernel/relocate_kernel.S
464 ===================================================================
465 --- linux-2.6.0-test6.orig/arch/i386/kernel/relocate_kernel.S 2003-10-07 16:09:00.000000000 +0800
466 +++ linux-2.6.0-test6/arch/i386/kernel/relocate_kernel.S 2003-10-07 16:09:01.000000000 +0800
468 +#include <linux/config.h>
469 +#include <linux/linkage.h>
471 + /* Must be relocatable PIC code callable as a C function, that once
472 + * it starts it cannot use the previous process's stack.
475 + .globl relocate_new_kernel
476 +relocate_new_kernel:
477 + /* read the arguments and say goodbye to the stack */
478 + movl 4(%esp), %ebx /* indirection_page */
479 + movl 8(%esp), %ebp /* reboot_code_buffer */
480 + movl 12(%esp), %edx /* start address */
482 + /* zero out flags, and disable interrupts */
486 + /* set a new stack at the bottom of our page... */
487 + lea 4096(%ebp), %esp
489 + /* store the parameters back on the stack */
490 + pushl %edx /* store the start address */
492 + /* Set cr0 to a known state:
493 + * 31 0 == Paging disabled
494 + * 18 0 == Alignment check disabled
495 + * 16 0 == Write protect disabled
496 + * 3 0 == No task switch
497 + * 2 0 == Don't do FP software emulation.
498 + * 0 1 == Protected mode enabled
501 + andl $~((1<<31)|(1<<18)|(1<<16)|(1<<3)|(1<<2)), %eax
505 + /* Set cr4 to a known state:
506 + * Setting everything to zero seems safe.
515 + /* Flush the TLB (needed?) */
519 + /* Do the copies */
521 +0: /* top, read another word for the indirection page */
525 + testl $0x1, %ecx /* is it a destination page */
528 + andl $0xfffff000, %edi
531 + testl $0x2, %ecx /* is it an indirection page */
534 + andl $0xfffff000, %ebx
537 + testl $0x4, %ecx /* is it the done indicator */
541 + testl $0x8, %ecx /* is it the source indicator */
542 + jz 0b /* Ignore it otherwise */
543 + movl %ecx, %esi /* For every source page do a copy */
544 + andl $0xfffff000, %esi
552 + /* To be certain of avoiding problems with self modifying code
553 + * I need to execute a serializing instruction here.
554 + * So I flush the TLB, it's handy, and not processor dependent.
559 + /* set all of the registers to known values */
560 + /* leave %esp alone */
570 +relocate_new_kernel_end:
572 + .globl relocate_new_kernel_size
573 +relocate_new_kernel_size:
574 + .long relocate_new_kernel_end - relocate_new_kernel
575 Index: linux-2.6.0-test6/arch/i386/kernel/smp.c
576 ===================================================================
577 --- linux-2.6.0-test6.orig/arch/i386/kernel/smp.c 2003-10-07 16:08:59.000000000 +0800
578 +++ linux-2.6.0-test6/arch/i386/kernel/smp.c 2003-10-07 16:09:01.000000000 +0800
581 void smp_send_stop(void)
583 + extern int reboot_cpu;
586 + /* The boot cpu is always logical cpu 0 */
589 + /* See if a command line override has been given.
591 + if ((reboot_cpu != -1) && !(reboot_cpu >= NR_CPUS) &&
592 + test_bit(reboot_cpu, &cpu_online_map)) {
593 + reboot_cpu_id = reboot_cpu;
596 + /* Make certain the cpu I'm rebooting on is online */
597 + if (!test_bit(reboot_cpu_id, &cpu_online_map)) {
598 + reboot_cpu_id = smp_processor_id();
601 + /* Make certain I only run on the appropriate processor */
602 + set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
604 + /* O.k. Now that I'm on the appropriate processor stop
605 + * all of the others.
607 smp_call_function(stop_this_cpu, NULL, 1, 0);
610 Index: linux-2.6.0-test6/include/asm-i386/apic.h
611 ===================================================================
612 --- linux-2.6.0-test6.orig/include/asm-i386/apic.h 2003-10-07 15:47:25.000000000 +0800
613 +++ linux-2.6.0-test6/include/asm-i386/apic.h 2003-10-07 16:09:01.000000000 +0800
615 #define NMI_LOCAL_APIC 2
616 #define NMI_INVALID 3
618 +extern void stop_apics(void);
620 +static inline void stop_apics(void) { }
621 #endif /* CONFIG_X86_LOCAL_APIC */
623 #endif /* __ASM_APIC_H */
624 Index: linux-2.6.0-test6/include/asm-i386/apicdef.h
625 ===================================================================
626 --- linux-2.6.0-test6.orig/include/asm-i386/apicdef.h 2003-10-07 15:47:25.000000000 +0800
627 +++ linux-2.6.0-test6/include/asm-i386/apicdef.h 2003-10-07 16:09:01.000000000 +0800
629 #define APIC_LVT_REMOTE_IRR (1<<14)
630 #define APIC_INPUT_POLARITY (1<<13)
631 #define APIC_SEND_PENDING (1<<12)
632 +#define APIC_MODE_MASK 0x700
633 #define GET_APIC_DELIVERY_MODE(x) (((x)>>8)&0x7)
634 #define SET_APIC_DELIVERY_MODE(x,y) (((x)&~0x700)|((y)<<8))
635 #define APIC_MODE_FIXED 0x0
636 Index: linux-2.6.0-test6/include/asm-i386/kexec.h
637 ===================================================================
638 --- linux-2.6.0-test6.orig/include/asm-i386/kexec.h 2003-10-07 16:09:00.000000000 +0800
639 +++ linux-2.6.0-test6/include/asm-i386/kexec.h 2003-10-07 16:09:01.000000000 +0800
641 +#ifndef _I386_KEXEC_H
642 +#define _I386_KEXEC_H
644 +#include <asm/fixmap.h>
647 + * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
648 + * I.e. Maximum page that is mapped directly into kernel memory,
649 + * and kmap is not required.
651 + * Someone correct me if FIXADDR_START - PAGEOFFSET is not the correct
652 + * calculation for the amount of memory directly mappable into the
653 + * kernel memory space.
656 +/* Maximum physical address we can use pages from */
657 +#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
658 +/* Maximum address we can reach in physical address mode */
659 +#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
661 +#define KEXEC_REBOOT_CODE_SIZE 4096
663 +#endif /* _I386_KEXEC_H */
664 Index: linux-2.6.0-test6/include/asm-i386/unistd.h
665 ===================================================================
666 --- linux-2.6.0-test6.orig/include/asm-i386/unistd.h 2003-10-07 16:08:41.000000000 +0800
667 +++ linux-2.6.0-test6/include/asm-i386/unistd.h 2003-10-07 16:09:01.000000000 +0800
669 #define __NR_utimes 271
670 #define __NR_fadvise64_64 272
671 #define __NR_vserver 273
673 -#define NR_syscalls 274
674 +#define __NR_sys_kexec_load 274
676 +#define NR_syscalls 275
678 /* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
680 Index: linux-2.6.0-test6/include/linux/kexec.h
681 ===================================================================
682 --- linux-2.6.0-test6.orig/include/linux/kexec.h 2003-10-07 16:09:00.000000000 +0800
683 +++ linux-2.6.0-test6/include/linux/kexec.h 2003-10-07 16:09:01.000000000 +0800
685 +#ifndef LINUX_KEXEC_H
686 +#define LINUX_KEXEC_H
689 +#include <linux/types.h>
690 +#include <linux/list.h>
691 +#include <asm/kexec.h>
694 + * This structure is used to hold the arguments that are used when loading
698 +typedef unsigned long kimage_entry_t;
699 +#define IND_DESTINATION 0x1
700 +#define IND_INDIRECTION 0x2
701 +#define IND_DONE 0x4
702 +#define IND_SOURCE 0x8
704 +#define KEXEC_SEGMENT_MAX 8
705 +struct kexec_segment {
713 + kimage_entry_t head;
714 + kimage_entry_t *entry;
715 + kimage_entry_t *last_entry;
717 + unsigned long destination;
718 + unsigned long offset;
720 + unsigned long start;
721 + struct page *reboot_code_pages;
723 + unsigned long nr_segments;
724 + struct kexec_segment segment[KEXEC_SEGMENT_MAX+1];
726 + struct list_head dest_pages;
727 + struct list_head unuseable_pages;
731 +/* kexec interface functions */
732 +extern void machine_kexec(struct kimage *image);
733 +extern asmlinkage long sys_kexec(unsigned long entry, long nr_segments,
734 + struct kexec_segment *segments);
735 +extern struct kimage *kexec_image;
737 +#endif /* LINUX_KEXEC_H */
739 Index: linux-2.6.0-test6/include/linux/reboot.h
740 ===================================================================
741 --- linux-2.6.0-test6.orig/include/linux/reboot.h 2003-10-07 15:47:25.000000000 +0800
742 +++ linux-2.6.0-test6/include/linux/reboot.h 2003-10-07 16:09:01.000000000 +0800
744 * POWER_OFF Stop OS and remove all power from system, if possible.
745 * RESTART2 Restart system using given command string.
746 * SW_SUSPEND Suspend system using software suspend if compiled in.
747 + * KEXEC Restart the system using a different kernel.
750 #define LINUX_REBOOT_CMD_RESTART 0x01234567
752 #define LINUX_REBOOT_CMD_POWER_OFF 0x4321FEDC
753 #define LINUX_REBOOT_CMD_RESTART2 0xA1B2C3D4
754 #define LINUX_REBOOT_CMD_SW_SUSPEND 0xD000FCE2
755 +#define LINUX_REBOOT_CMD_KEXEC 0x45584543
759 Index: linux-2.6.0-test6/kernel/Makefile
760 ===================================================================
761 --- linux-2.6.0-test6.orig/kernel/Makefile 2003-10-07 16:08:42.000000000 +0800
762 +++ linux-2.6.0-test6/kernel/Makefile 2003-10-07 16:09:01.000000000 +0800
764 obj-$(CONFIG_KALLSYMS) += kallsyms.o
765 obj-$(CONFIG_PM) += power/
766 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
767 +obj-$(CONFIG_KEXEC) += kexec.o
768 obj-$(CONFIG_COMPAT) += compat.o
769 obj-$(CONFIG_IKCONFIG) += configs.o
770 obj-$(CONFIG_IKCONFIG_PROC) += configs.o
771 Index: linux-2.6.0-test6/kernel/kexec.c
772 ===================================================================
773 --- linux-2.6.0-test6.orig/kernel/kexec.c 2003-10-07 16:09:00.000000000 +0800
774 +++ linux-2.6.0-test6/kernel/kexec.c 2003-10-07 16:09:01.000000000 +0800
776 +#include <linux/mm.h>
777 +#include <linux/file.h>
778 +#include <linux/slab.h>
779 +#include <linux/fs.h>
780 +#include <linux/version.h>
781 +#include <linux/compile.h>
782 +#include <linux/kexec.h>
783 +#include <linux/spinlock.h>
784 +#include <linux/list.h>
785 +#include <linux/highmem.h>
786 +#include <net/checksum.h>
787 +#include <asm/page.h>
788 +#include <asm/uaccess.h>
790 +#include <asm/system.h>
792 +/* When kexec transitions to the new kernel there is a one to one
793 + * mapping between physical and virtual addresses. On processors
794 + * where you can disable the MMU this is trivial, and easy. For
795 + * others it is still a simple predictable page table to setup.
797 + * In that environment kexec copies the new kernel to its final
798 + * resting place. This means I can only support memory whose
799 + * physical address can fit in an unsigned long. In particular
800 + * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
801 + * If the assembly stub has more restrictive requirements
802 + * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
803 + * defined more restrictively in <asm/kexec.h>.
805 + * The code for the transition from the current kernel to the
806 + * the new kernel is placed in the reboot_code_buffer, whose size
807 + * is given by KEXEC_REBOOT_CODE_SIZE. In the best case only a single
808 + * page of memory is necessary, but some architectures require more.
809 + * Because this memory must be identity mapped in the transition from
810 + * virtual to physical addresses it must live in the range
811 + * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
814 + * The assembly stub in the reboot code buffer is passed a linked list
815 + * of descriptor pages detailing the source pages of the new kernel,
816 + * and the destination addresses of those source pages. As this data
817 + * structure is not used in the context of the current OS, it must
818 + * be self contained.
820 + * The code has been made to work with highmem pages and will use a
821 + * destination page in its final resting place (if it happens
822 + * to allocate it). The end product of this is that most of the
823 + * physical address space, and most of ram can be used.
825 + * Future directions include:
826 + * - allocating a page table with the reboot code buffer identity
827 + * mapped, to simplify machine_kexec and make kexec_on_panic, more
829 + * - allocating the pages for a page table for machines that cannot
830 + * disable their MMUs. (Hammer, Alpha...)
833 +/* KIMAGE_NO_DEST is an impossible destination address..., for
834 + * allocating pages whose destination address we do not care about.
836 +#define KIMAGE_NO_DEST (-1UL)
838 +static int kimage_is_destination_range(
839 + struct kimage *image, unsigned long start, unsigned long end);
840 +static struct page *kimage_alloc_reboot_code_pages(struct kimage *image);
841 +static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long dest);
844 +static int kimage_alloc(struct kimage **rimage,
845 + unsigned long nr_segments, struct kexec_segment *segments)
848 + struct kimage *image;
849 + size_t segment_bytes;
850 + struct page *reboot_pages;
853 + /* Allocate a controlling structure */
855 + image = kmalloc(sizeof(*image), GFP_KERNEL);
859 + memset(image, 0, sizeof(*image));
861 + image->entry = &image->head;
862 + image->last_entry = &image->head;
864 + /* Initialize the list of destination pages */
865 + INIT_LIST_HEAD(&image->dest_pages);
867 + /* Initialize the list of unuseable pages */
868 + INIT_LIST_HEAD(&image->unuseable_pages);
870 + /* Read in the segments */
871 + image->nr_segments = nr_segments;
872 + segment_bytes = nr_segments * sizeof*segments;
873 + result = copy_from_user(image->segment, segments, segment_bytes);
877 + /* Verify we have good destination addresses. The caller is
878 + * responsible for making certain we don't attempt to load
879 + * the new image into invalid or reserved areas of RAM. This
880 + * just verifies it is an address we can use.
882 + result = -EADDRNOTAVAIL;
883 + for(i = 0; i < nr_segments; i++) {
884 + unsigned long mend;
885 + mend = ((unsigned long)(image->segment[i].mem)) +
886 + image->segment[i].memsz;
887 + if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
891 + /* Find a location for the reboot code buffer, and add it
892 + * the vector of segments so that it's pages will also be
893 + * counted as destination pages.
896 + reboot_pages = kimage_alloc_reboot_code_pages(image);
897 + if (!reboot_pages) {
898 + printk(KERN_ERR "Could not allocate reboot_code_buffer\n");
901 + image->reboot_code_pages = reboot_pages;
902 + image->segment[nr_segments].buf = 0;
903 + image->segment[nr_segments].bufsz = 0;
904 + image->segment[nr_segments].mem = (void *)(page_to_pfn(reboot_pages) << PAGE_SHIFT);
905 + image->segment[nr_segments].memsz = KEXEC_REBOOT_CODE_SIZE;
906 + image->nr_segments++;
918 +static int kimage_is_destination_range(
919 + struct kimage *image, unsigned long start, unsigned long end)
922 + for(i = 0; i < image->nr_segments; i++) {
923 + unsigned long mstart, mend;
924 + mstart = (unsigned long)image->segment[i].mem;
925 + mend = mstart + image->segment[i].memsz;
926 + if ((end > mstart) && (start < mend)) {
934 +static int identity_map_pages(struct page *pages, int order)
936 + struct mm_struct *mm;
937 + struct vm_area_struct *vma;
942 + down_write(&mm->mmap_sem);
944 + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
949 + memset(vma, 0, sizeof(vma));
951 + vma->vm_start = page_to_pfn(pages) << PAGE_SHIFT;
952 + vma->vm_end = vma->vm_start + (1 << (order + PAGE_SHIFT));
954 + vma->vm_flags = VM_SHARED \
955 + | VM_READ | VM_WRITE | VM_EXEC \
956 + | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC \
957 + | VM_DONTCOPY | VM_RESERVED;
958 + vma->vm_page_prot = protection_map[vma->vm_flags & 0xf];
959 + vma->vm_file = NULL;
960 + vma->vm_private_data = NULL;
961 + INIT_LIST_HEAD(&vma->shared);
962 + insert_vm_struct(mm, vma);
964 + error = remap_page_range(vma, vma->vm_start, vma->vm_start,
965 + vma->vm_end - vma->vm_start, vma->vm_page_prot);
972 + if (error && vma) {
973 + kmem_cache_free(vm_area_cachep, vma);
976 + up_write(&mm->mmap_sem);
981 +#define identity_map_pages(pages, order) 0
984 +struct page *kimage_alloc_reboot_code_pages(struct kimage *image)
986 + /* The reboot code buffer is special. It is the only set of
987 + * pages that must be allocated in their final resting place,
988 + * and the only set of pages whose final resting place we can
991 + * At worst this runs in O(N) of the image size.
993 + struct list_head extra_pages, *pos, *next;
994 + struct page *pages;
995 + unsigned long addr;
997 + order = get_order(KEXEC_REBOOT_CODE_SIZE);
998 + count = 1 << order;
999 + INIT_LIST_HEAD(&extra_pages);
1002 + pages = alloc_pages(GFP_HIGHUSER, order);
1005 + for(i = 0; i < count; i++) {
1006 + SetPageReserved(pages +i);
1008 + addr = page_to_pfn(pages) << PAGE_SHIFT;
1009 + if ((page_to_pfn(pages) >= (TASK_SIZE >> PAGE_SHIFT)) ||
1010 + kimage_is_destination_range(image, addr, addr + KEXEC_REBOOT_CODE_SIZE)) {
1011 + list_add(&pages->list, &extra_pages);
1017 + result = identity_map_pages(pages, order);
1019 + list_add(&pages->list, &extra_pages);
1023 + /* If I could convert a multi page allocation into a bunch of
1024 + * single page allocations I could add these pages to
1025 + * image->dest_pages. For now it is simpler to just free the
1028 + list_for_each_safe(pos, next, &extra_pages) {
1029 + struct page *page;
1031 + page = list_entry(pos, struct page, list);
1032 + for(i = 0; i < count; i++) {
1033 + ClearPageReserved(pages +i);
1035 + list_del(&extra_pages);
1036 + __free_pages(page, order);
1041 +static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
1043 + if (image->offset != 0) {
1046 + if (image->entry == image->last_entry) {
1047 + kimage_entry_t *ind_page;
1048 + struct page *page;
1049 + page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
1053 + ind_page = page_address(page);
1054 + *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
1055 + image->entry = ind_page;
1056 + image->last_entry =
1057 + ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
1059 + *image->entry = entry;
1061 + image->offset = 0;
1065 +static int kimage_set_destination(
1066 + struct kimage *image, unsigned long destination)
1069 + destination &= PAGE_MASK;
1070 + result = kimage_add_entry(image, destination | IND_DESTINATION);
1071 + if (result == 0) {
1072 + image->destination = destination;
1078 +static int kimage_add_page(struct kimage *image, unsigned long page)
1081 + page &= PAGE_MASK;
1082 + result = kimage_add_entry(image, page | IND_SOURCE);
1083 + if (result == 0) {
1084 + image->destination += PAGE_SIZE;
1090 +static void kimage_free_extra_pages(struct kimage *image)
1092 + /* Walk through and free any extra destination pages I may have */
1093 + struct list_head *pos, *next;
1094 + list_for_each_safe(pos, next, &image->dest_pages) {
1095 + struct page *page;
1096 + page = list_entry(pos, struct page, list);
1097 + list_del(&page->list);
1098 + ClearPageReserved(page);
1099 + __free_page(page);
1101 + /* Walk through and free any unuseable pages I have cached */
1102 + list_for_each_safe(pos, next, &image->unuseable_pages) {
1103 + struct page *page;
1104 + page = list_entry(pos, struct page, list);
1105 + list_del(&page->list);
1106 + ClearPageReserved(page);
1107 + __free_page(page);
1111 +static int kimage_terminate(struct kimage *image)
1114 + result = kimage_add_entry(image, IND_DONE);
1115 + if (result == 0) {
1116 + /* Point at the terminating element */
1118 + kimage_free_extra_pages(image);
1123 +#define for_each_kimage_entry(image, ptr, entry) \
1124 + for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
1125 + ptr = (entry & IND_INDIRECTION)? \
1126 + phys_to_virt((entry & PAGE_MASK)): ptr +1)
1128 +static void kimage_free(struct kimage *image)
1130 + kimage_entry_t *ptr, entry;
1131 + kimage_entry_t ind = 0;
1132 + int i, count, order;
1135 + kimage_free_extra_pages(image);
1136 + for_each_kimage_entry(image, ptr, entry) {
1137 + if (entry & IND_INDIRECTION) {
1138 + /* Free the previous indirection page */
1139 + if (ind & IND_INDIRECTION) {
1140 + free_page((unsigned long)phys_to_virt(ind & PAGE_MASK));
1142 + /* Save this indirection page until we are
1147 + else if (entry & IND_SOURCE) {
1148 + free_page((unsigned long)phys_to_virt(entry & PAGE_MASK));
1151 + order = get_order(KEXEC_REBOOT_CODE_SIZE);
1152 + count = 1 << order;
1153 + do_munmap(&init_mm,
1154 + page_to_pfn(image->reboot_code_pages) << PAGE_SHIFT,
1155 + count << PAGE_SHIFT);
1156 + for(i = 0; i < count; i++) {
1157 + ClearPageReserved(image->reboot_code_pages + i);
1159 + __free_pages(image->reboot_code_pages, order);
1163 +static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page)
1165 + kimage_entry_t *ptr, entry;
1166 + unsigned long destination = 0;
1167 + for_each_kimage_entry(image, ptr, entry) {
1168 + if (entry & IND_DESTINATION) {
1169 + destination = entry & PAGE_MASK;
1171 + else if (entry & IND_SOURCE) {
1172 + if (page == destination) {
1175 + destination += PAGE_SIZE;
1181 +static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long destination)
1183 + /* Here we implement safeguards to ensure that a source page
1184 + * is not copied to its destination page before the data on
1185 + * the destination page is no longer useful.
1187 + * To do this we maintain the invariant that a source page is
1188 + * either its own destination page, or it is not a
1189 + * destination page at all.
1191 + * That is slightly stronger than required, but the proof
1192 + * that no problems will occur is trivial, and the
1193 + * implementation is simple to verify.
1195 + * When allocating all pages normally this algorithm will run
1196 + * in O(N) time, but in the worst case it will run in O(N^2)
1197 + * time. If the runtime is a problem the data structures can
1200 + struct page *page;
1201 + unsigned long addr;
1203 + /* Walk through the list of destination pages, and see if I
1206 + list_for_each_entry(page, &image->dest_pages, list) {
1207 + addr = page_to_pfn(page) << PAGE_SHIFT;
1208 + if (addr == destination) {
1209 + list_del(&page->list);
1215 + kimage_entry_t *old;
1216 + /* Allocate a page, if we run out of memory give up */
1217 + page = alloc_page(gfp_mask);
1221 + SetPageReserved(page);
1222 + /* If the page cannot be used file it away */
1223 + if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
1224 + list_add(&page->list, &image->unuseable_pages);
1227 + addr = page_to_pfn(page) << PAGE_SHIFT;
1229 + /* If it is the destination page we want, use it */
1230 + if (addr == destination)
1233 + /* If the page is not a destination page use it */
1234 + if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE))
1237 + /* I know that the page is someone's destination page.
1238 + * See if there is already a source page for this
1239 + * destination page. And if so swap the source pages.
1241 + old = kimage_dst_used(image, addr);
1243 + /* If so move it */
1244 + unsigned long old_addr;
1245 + struct page *old_page;
1247 + old_addr = *old & PAGE_MASK;
1248 + old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
1249 + copy_highpage(page, old_page);
1250 + *old = addr | (*old & ~PAGE_MASK);
1252 + /* The old page I have found cannot be a
1253 + * destination page, so return it.
1260 + /* Place the page on the destination list; I
1261 + * will use it later.
1263 + list_add(&page->list, &image->dest_pages);
1269 +static int kimage_load_segment(struct kimage *image,
1270 + struct kexec_segment *segment)
1272 + unsigned long mstart;
1274 + unsigned long offset;
1275 + unsigned long offset_end;
1276 + unsigned char *buf;
1279 + buf = segment->buf;
1280 + mstart = (unsigned long)segment->mem;
1282 + offset_end = segment->memsz;
1284 + result = kimage_set_destination(image, mstart);
1288 + for(offset = 0; offset < segment->memsz; offset += PAGE_SIZE) {
1289 + struct page *page;
1291 + size_t size, leader;
1292 + page = kimage_alloc_page(image, GFP_HIGHUSER, mstart + offset);
1297 + result = kimage_add_page(image, page_to_pfn(page) << PAGE_SHIFT);
1302 + if (segment->bufsz < offset) {
1303 + /* We are past the end; zero the whole page */
1304 + memset(ptr, 0, PAGE_SIZE);
1310 + if ((offset == 0)) {
1311 + leader = mstart & ~PAGE_MASK;
1314 + /* We are on the first page; zero the unused portion */
1315 + memset(ptr, 0, leader);
1319 + if (size > (segment->bufsz - offset)) {
1320 + size = segment->bufsz - offset;
1322 + if (size < (PAGE_SIZE - leader)) {
1323 + /* zero the trailing part of the page */
1324 + memset(ptr + size, 0, (PAGE_SIZE - leader) - size);
1326 + result = copy_from_user(ptr, buf + offset, size);
1329 + result = (result < 0)?result : -EIO;
1338 + * Exec Kernel system call: for obvious reasons only root may call it.
1340 + * This call breaks up into three pieces.
1341 + * - A generic part which loads the new kernel from the current
1342 + * address space, and very carefully places the data in the
1343 + * allocated pages.
1345 + * - A generic part that interacts with the kernel and tells all of
1346 + * the devices to shut down. Preventing on-going dmas, and placing
1347 + * the devices in a consistent state so a later kernel can
1348 + * reinitialize them.
1350 + * - A machine specific part that includes the syscall number
1351 + * and copies the image to its final destination. And
1352 + * jumps into the image at entry.
1354 + * kexec does not sync, or unmount filesystems so if you need
1355 + * that to happen you need to do that yourself.
1357 +struct kimage *kexec_image = 0;
1359 +asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
1360 + struct kexec_segment *segments, unsigned long flags)
1362 + struct kimage *image;
1365 + /* We only trust the superuser with rebooting the system. */
1366 + if (!capable(CAP_SYS_ADMIN))
1369 + /* In case we need just a little bit of special behavior for
1375 + if (nr_segments > KEXEC_SEGMENT_MAX)
1380 + if (nr_segments > 0) {
1382 + result = kimage_alloc(&image, nr_segments, segments);
1386 + image->start = entry;
1387 + for(i = 0; i < nr_segments; i++) {
1388 + result = kimage_load_segment(image, &segments[i]);
1393 + result = kimage_terminate(image);
1399 + image = xchg(&kexec_image, image);
1402 + kimage_free(image);
1405 Index: linux-2.6.0-test6/kernel/sys.c
1406 ===================================================================
1407 --- linux-2.6.0-test6.orig/kernel/sys.c 2003-10-07 16:08:42.000000000 +0800
1408 +++ linux-2.6.0-test6/kernel/sys.c 2003-10-09 18:38:57.000000000 +0800
1410 #include <linux/init.h>
1411 #include <linux/highuid.h>
1412 #include <linux/fs.h>
1413 +#include <linux/kexec.h>
1414 #include <linux/workqueue.h>
1415 #include <linux/device.h>
1416 #include <linux/times.h>
1418 int notifier_chain_register(struct notifier_block **list, struct notifier_block *n)
1420 write_lock(¬ifier_lock);
1421 + printk(KERN_EMERG"add noitifier nb %p \n", n);
1424 if(n->priority > (*list)->priority)
1429 + printk("nb %p notifier_call %p \n", nb, nb->notifier_call);
1430 ret=nb->notifier_call(nb,val,v);
1431 if(ret&NOTIFY_STOP_MASK)
1434 cond_syscall(sys_lookup_dcookie)
1435 cond_syscall(sys_swapon)
1436 cond_syscall(sys_swapoff)
1437 +cond_syscall(sys_kexec_load)
1438 cond_syscall(sys_init_module)
1439 cond_syscall(sys_delete_module)
1440 cond_syscall(sys_socketpair)
1441 @@ -454,6 +458,27 @@
1442 machine_restart(buffer);
1445 +#ifdef CONFIG_KEXEC
1446 + case LINUX_REBOOT_CMD_KEXEC:
1448 + struct kimage *image;
1453 + image = xchg(&kexec_image, 0);
1458 + notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
1459 + system_running = 0;
1460 + device_shutdown();
1461 + printk(KERN_EMERG "Starting new kernel\n");
1462 + machine_kexec(image);
1466 #ifdef CONFIG_SOFTWARE_SUSPEND
1467 case LINUX_REBOOT_CMD_SW_SUSPEND:
1468 if (!software_suspend_enabled) {
1469 Index: linux-2.6.0-test6/fs/aio.c
1470 ===================================================================
1471 --- linux-2.6.0-test6.orig/fs/aio.c 2003-10-07 16:08:40.000000000 +0800
1472 +++ linux-2.6.0-test6/fs/aio.c 2003-10-07 16:09:01.000000000 +0800
1474 * (Note: this routine is intended to be called only
1475 * from a kernel thread context)
1477 -static void use_mm(struct mm_struct *mm)
1478 +void use_mm(struct mm_struct *mm)
1480 struct mm_struct *active_mm;
1481 struct task_struct *tsk = current;