Whamcloud - gitweb
- put the exit(1)s back in after MPI_Abort() in the hopes that MPI
[fs/lustre-release.git] / lustre / kernel_patches / patches / kexec-2.6.0-test6-full.patch
1  0 files changed
2
3 Index: linux-2.6.0-test6/MAINTAINERS
4 ===================================================================
5 --- linux-2.6.0-test6.orig/MAINTAINERS  2003-10-07 16:08:42.000000000 +0800
6 +++ linux-2.6.0-test6/MAINTAINERS       2003-10-07 16:09:00.000000000 +0800
7 @@ -1174,6 +1174,17 @@
8  W:     http://www.cse.unsw.edu.au/~neilb/patches/linux-devel/
9  S:     Maintained
10  
11 +KEXEC
12 +P:     Eric Biederman
13 +M:     ebiederm@xmission.com
14 +M:     ebiederman@lnxi.com
15 +W:     http://www.xmission.com/~ebiederm/files/kexec/
16 +P:     Andy Pfiffer
17 +M:     andyp@osdl.org
18 +W:     http://www.osdl.org/archive/andyp/bloom/Code/Linux/Kexec/
19 +L:     linux-kernel@vger.kernel.org
20 +S:     Maintained
21 +
22  LANMEDIA WAN CARD DRIVER
23  P:     Andrew Stanley-Jones
24  M:     asj@lanmedia.com
25 Index: linux-2.6.0-test6/arch/i386/Kconfig
26 ===================================================================
27 --- linux-2.6.0-test6.orig/arch/i386/Kconfig    2003-10-07 16:08:59.000000000 +0800
28 +++ linux-2.6.0-test6/arch/i386/Kconfig 2003-10-07 16:09:00.000000000 +0800
29 @@ -845,6 +845,23 @@
30  # depends on (((X86_SUMMIT || X86_GENERICARCH) && NUMA)) || X86_GENERICARCH
31         default y
32  
33 +config KEXEC
34 +       bool "kexec system call (EXPERIMENTAL)"
35 +       depends on EXPERIMENTAL
36 +       help
37 +         kexec is a system call that implements the ability to  shutdown your
38 +         current kernel, and to start another kernel.  It is like a reboot
39 +         but it is independent of the system firmware.   And like a reboot
40 +         you can start any kernel with it not just Linux.  
41 +       
42 +         The name comes from the similarity to the exec system call. 
43 +       
44 +         It is an ongoing process to be certain the hardware in a machine
45 +         is properly shutdown, so do not be surprised if this code does not
46 +         initially work for you.  It may help to enable device hotplugging
47 +         support.  As of this writing the exact hardware interface is
48 +         strongly in flux, so no good recommendation can be made.
49 +
50  endmenu
51  
52  
53 Index: linux-2.6.0-test6/arch/i386/defconfig
54 ===================================================================
55 --- linux-2.6.0-test6.orig/arch/i386/defconfig  2003-10-07 15:47:25.000000000 +0800
56 +++ linux-2.6.0-test6/arch/i386/defconfig       2003-10-07 16:09:00.000000000 +0800
57 @@ -82,6 +82,7 @@
58  # CONFIG_HUGETLB_PAGE is not set
59  CONFIG_SMP=y
60  CONFIG_NR_CPUS=8
61 +CONFIG_KEXEC=y
62  CONFIG_PREEMPT=y
63  CONFIG_X86_LOCAL_APIC=y
64  CONFIG_X86_IO_APIC=y
65 Index: linux-2.6.0-test6/arch/i386/kernel/Makefile
66 ===================================================================
67 --- linux-2.6.0-test6.orig/arch/i386/kernel/Makefile    2003-10-07 16:08:34.000000000 +0800
68 +++ linux-2.6.0-test6/arch/i386/kernel/Makefile 2003-10-07 16:09:00.000000000 +0800
69 @@ -24,6 +24,7 @@
70  obj-$(CONFIG_X86_MPPARSE)      += mpparse.o
71  obj-$(CONFIG_X86_LOCAL_APIC)   += apic.o nmi.o
72  obj-$(CONFIG_X86_IO_APIC)      += io_apic.o
73 +obj-$(CONFIG_KEXEC)            += machine_kexec.o relocate_kernel.o
74  obj-$(CONFIG_X86_NUMAQ)                += numaq.o
75  obj-$(CONFIG_X86_SUMMIT)       += summit.o
76  obj-$(CONFIG_EDD)              += edd.o
77 Index: linux-2.6.0-test6/arch/i386/kernel/apic.c
78 ===================================================================
79 --- linux-2.6.0-test6.orig/arch/i386/kernel/apic.c      2003-10-07 15:47:25.000000000 +0800
80 +++ linux-2.6.0-test6/arch/i386/kernel/apic.c   2003-10-07 16:09:01.000000000 +0800
81 @@ -26,6 +26,7 @@
82  #include <linux/mc146818rtc.h>
83  #include <linux/kernel_stat.h>
84  #include <linux/sysdev.h>
85 +#include <linux/reboot.h>
86  
87  #include <asm/atomic.h>
88  #include <asm/smp.h>
89 @@ -183,6 +184,39 @@
90                 outb(0x70, 0x22);
91                 outb(0x00, 0x23);
92         }
93 +#ifdef         CONFIG_KEXEC
94 +       else {
95 +               /* Go back to Virtual Wire compatibility mode */
96 +               unsigned long value;
97 +
98 +               /* For the spurious interrupt use vector F, and enable it */
99 +               value = apic_read(APIC_SPIV);
100 +               value &= ~APIC_VECTOR_MASK; 
101 +               value |= APIC_SPIV_APIC_ENABLED;
102 +               value |= 0xf;
103 +               apic_write_around(APIC_SPIV, value);
104 +
105 +               /* For LVT0 make it edge triggered, active high, external and enabled */
106 +               value = apic_read(APIC_LVT0);
107 +               value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | 
108 +                       APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | 
109 +                       APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
110 +               value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
111 +               value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXINT);
112 +               apic_write_around(APIC_LVT0, value);
113 +               
114 +               /* For LVT1 make it edge triggered, active high, nmi and enabled */
115 +               value = apic_read(APIC_LVT1);
116 +               value &= ~(
117 +                       APIC_MODE_MASK | APIC_SEND_PENDING | 
118 +                       APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | 
119 +                       APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
120 +               value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
121 +               value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
122 +               apic_write_around(APIC_LVT1, value);
123 +       }
124 +#endif /* CONFIG_KEXEC */
125 +
126  }
127  
128  void disable_local_APIC(void)
129 @@ -1147,6 +1181,26 @@
130         irq_exit();
131  }
132  
133 +void stop_apics(void)
134 +{
135 +       /* By resetting the APIC's we disable the nmi watchdog */
136 +#if CONFIG_SMP
137 +       /*
138 +        * Stop all CPUs and turn off local APICs and the IO-APIC, so
139 +        * other OSs see a clean IRQ state.
140 +        */
141 +       smp_send_stop();
142 +#else
143 +       disable_local_APIC();
144 +#endif
145 +#if defined(CONFIG_X86_IO_APIC)
146 +       if (smp_found_config) {
147 +               disable_IO_APIC();
148 +       }
149 +#endif
150 +       disconnect_bsp_APIC();
151 +}
152 +
153  /*
154   * This initializes the IO-APIC and APIC hardware if this is
155   * a UP kernel.
156 Index: linux-2.6.0-test6/arch/i386/kernel/dmi_scan.c
157 ===================================================================
158 --- linux-2.6.0-test6.orig/arch/i386/kernel/dmi_scan.c  2003-10-07 16:08:34.000000000 +0800
159 +++ linux-2.6.0-test6/arch/i386/kernel/dmi_scan.c       2003-10-07 16:09:01.000000000 +0800
160 @@ -222,31 +222,6 @@
161         return 0;
162  }
163  
164 -/*
165 - * Some machines require the "reboot=s"  commandline option, this quirk makes that automatic.
166 - */
167 -static __init int set_smp_reboot(struct dmi_blacklist *d)
168 -{
169 -#ifdef CONFIG_SMP
170 -       extern int reboot_smp;
171 -       if (reboot_smp == 0)
172 -       {
173 -               reboot_smp = 1;
174 -               printk(KERN_INFO "%s series board detected. Selecting SMP-method for reboots.\n", d->ident);
175 -       }
176 -#endif
177 -       return 0;
178 -}
179 -
180 -/*
181 - * Some machines require the "reboot=b,s"  commandline option, this quirk makes that automatic.
182 - */
183 -static __init int set_smp_bios_reboot(struct dmi_blacklist *d)
184 -{
185 -       set_smp_reboot(d);
186 -       set_bios_reboot(d);
187 -       return 0;
188 -}
189  
190  /*
191   * Some bioses have a broken protected mode poweroff and need to use realmode
192 @@ -581,7 +556,7 @@
193                         MATCH(DMI_BIOS_VERSION, "4.60 PGMA"),
194                         MATCH(DMI_BIOS_DATE, "134526184"), NO_MATCH
195                         } },
196 -       { set_smp_bios_reboot, "Dell PowerEdge 1300", { /* Handle problems with rebooting on Dell 1300's */
197 +       { set_bios_reboot, "Dell PowerEdge 1300", {     /* Handle problems with rebooting on Dell 1300's */
198                         MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
199                         MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"),
200                         NO_MATCH, NO_MATCH
201 Index: linux-2.6.0-test6/arch/i386/kernel/entry.S
202 ===================================================================
203 --- linux-2.6.0-test6.orig/arch/i386/kernel/entry.S     2003-10-07 16:08:34.000000000 +0800
204 +++ linux-2.6.0-test6/arch/i386/kernel/entry.S  2003-10-07 16:09:01.000000000 +0800
205 @@ -1046,6 +1046,7 @@
206         .long sys_utimes
207         .long sys_fadvise64_64
208         .long sys_ni_syscall    /* sys_vserver */
209 +        .long sys_kexec_load
210  
211  nr_syscalls=(.-sys_call_table)/4
212  
213 Index: linux-2.6.0-test6/arch/i386/kernel/i8259.c
214 ===================================================================
215 --- linux-2.6.0-test6.orig/arch/i386/kernel/i8259.c     2003-10-07 16:08:34.000000000 +0800
216 +++ linux-2.6.0-test6/arch/i386/kernel/i8259.c  2003-10-07 16:09:01.000000000 +0800
217 @@ -244,9 +244,21 @@
218         return 0;
219  }
220  
221 +static int i8259A_shutdown(struct sys_device *dev)
222 +{
223 +       /* Put the i8259A into a quiescent state that
224 +        * the kernel initialization code can get it
225 +        * out of.
226 +        */
227 +       outb(0xff, 0x21);       /* mask all of 8259A-1 */
228 +       outb(0xff, 0xA1);       /* mask all of 8259A-2 */
229 +       return 0;
230 +}
231 +
232  static struct sysdev_class i8259_sysdev_class = {
233         set_kset_name("i8259"),
234         .resume = i8259A_resume,
235 +       .shutdown = i8259A_shutdown,
236  };
237  
238  static struct sys_device device_i8259A = {
239 Index: linux-2.6.0-test6/arch/i386/kernel/io_apic.c
240 ===================================================================
241 --- linux-2.6.0-test6.orig/arch/i386/kernel/io_apic.c   2003-10-07 16:08:34.000000000 +0800
242 +++ linux-2.6.0-test6/arch/i386/kernel/io_apic.c        2003-10-07 16:09:01.000000000 +0800
243 @@ -1629,8 +1629,6 @@
244          * Clear the IO-APIC before rebooting:
245          */
246         clear_IO_APIC();
247 -
248 -       disconnect_bsp_APIC();
249  }
250  
251  /*
252 Index: linux-2.6.0-test6/arch/i386/kernel/machine_kexec.c
253 ===================================================================
254 --- linux-2.6.0-test6.orig/arch/i386/kernel/machine_kexec.c     2003-10-07 16:09:00.000000000 +0800
255 +++ linux-2.6.0-test6/arch/i386/kernel/machine_kexec.c  2003-10-07 16:09:01.000000000 +0800
256 @@ -0,0 +1,116 @@
257 +#include <linux/config.h>
258 +#include <linux/mm.h>
259 +#include <linux/kexec.h>
260 +#include <linux/delay.h>
261 +#include <asm/pgtable.h>
262 +#include <asm/pgalloc.h>
263 +#include <asm/tlbflush.h>
264 +#include <asm/mmu_context.h>
265 +#include <asm/io.h>
266 +#include <asm/apic.h>
267 +
268 +
269 +/*
270 + * machine_kexec
271 + * =======================
272 + */
273 +
274 +
275 +static void set_idt(void *newidt, __u16 limit)
276 +{
277 +       unsigned char curidt[6];
278 +
279 +       /* ia32 supports unaligned loads & stores */
280 +       (*(__u16 *)(curidt)) = limit;
281 +       (*(__u32 *)(curidt +2)) = (unsigned long)(newidt);
282 +
283 +       __asm__ __volatile__ (
284 +               "lidt %0\n" 
285 +               : "=m" (curidt)
286 +               );
287 +};
288 +
289 +
290 +static void set_gdt(void *newgdt, __u16 limit)
291 +{
292 +       unsigned char curgdt[6];
293 +
294 +       /* ia32 supports unaligned loads & stores */
295 +       (*(__u16 *)(curgdt)) = limit;
296 +       (*(__u32 *)(curgdt +2)) = (unsigned long)(newgdt);
297 +
298 +       __asm__ __volatile__ (
299 +               "lgdt %0\n" 
300 +               : "=m" (curgdt)
301 +               );
302 +};
303 +
304 +static void load_segments(void)
305 +{
306 +#define __STR(X) #X
307 +#define STR(X) __STR(X)
308 +
309 +       __asm__ __volatile__ (
310 +               "\tljmp $"STR(__KERNEL_CS)",$1f\n"
311 +               "\t1:\n"
312 +               "\tmovl $"STR(__KERNEL_DS)",%eax\n"
313 +               "\tmovl %eax,%ds\n"
314 +               "\tmovl %eax,%es\n"
315 +               "\tmovl %eax,%fs\n"
316 +               "\tmovl %eax,%gs\n"
317 +               "\tmovl %eax,%ss\n"
318 +               );
319 +#undef STR
320 +#undef __STR
321 +}
322 +
323 +typedef void (*relocate_new_kernel_t)(
324 +       unsigned long indirection_page, unsigned long reboot_code_buffer,
325 +       unsigned long start_address);
326 +
327 +const extern unsigned char relocate_new_kernel[];
328 +extern void relocate_new_kernel_end(void);
329 +const extern unsigned int relocate_new_kernel_size;
330 +extern void use_mm(struct mm_struct *mm);
331 +
332 +void machine_kexec(struct kimage *image)
333 +{
334 +       unsigned long indirection_page;
335 +       unsigned long reboot_code_buffer;
336 +       relocate_new_kernel_t rnk;
337 +
338 +       /* switch to an mm where the reboot_code_buffer is identity mapped */
339 +       use_mm(&init_mm);
340 +       stop_apics();
341 +
342 +       /* Interrupts aren't acceptable while we reboot */
343 +       local_irq_disable();
344 +       reboot_code_buffer = page_to_pfn(image->reboot_code_pages) << PAGE_SHIFT;
345 +       indirection_page = image->head & PAGE_MASK;
346 +
347 +       /* copy it out */
348 +       memcpy((void *)reboot_code_buffer, relocate_new_kernel, relocate_new_kernel_size);
349 +
350 +       /* The segment registers are funny things, they are
351 +        * automatically loaded from a table, in memory wherever you
352 +        * set them to a specific selector, but this table is never
353 +        * accessed again until you set the segment to a different selector.
354 +        *
355 +        * The more common model is a cache where the behind
356 +        * the scenes work is done, but is also dropped at arbitrary
357 +        * times.
358 +        *
359 +        * I take advantage of this here by force loading the
360 +        * segments, before I zap the gdt with an invalid value.
361 +        */
362 +       load_segments();
363 +       /* The gdt & idt are now invalid.
364 +        * If you want to load them you must set up your own idt & gdt.
365 +        */
366 +       set_gdt(phys_to_virt(0),0);
367 +       set_idt(phys_to_virt(0),0);
368 +
369 +       /* now call it */
370 +       rnk = (relocate_new_kernel_t) reboot_code_buffer;
371 +       (*rnk)(indirection_page, reboot_code_buffer, image->start);
372 +}
373 Index: linux-2.6.0-test6/arch/i386/kernel/reboot.c
374 ===================================================================
375 --- linux-2.6.0-test6.orig/arch/i386/kernel/reboot.c    2003-10-07 16:08:34.000000000 +0800
376 +++ linux-2.6.0-test6/arch/i386/kernel/reboot.c 2003-10-07 16:09:01.000000000 +0800
377 @@ -21,8 +21,7 @@
378  int reboot_thru_bios;
379  
380  #ifdef CONFIG_SMP
381 -int reboot_smp = 0;
382 -static int reboot_cpu = -1;
383 +int reboot_cpu = -1;     /* specifies the internal linux cpu id, not the apicid */
384  /* shamelessly grabbed from lib/vsprintf.c for readability */
385  #define is_digit(c)    ((c) >= '0' && (c) <= '9')
386  #endif
387 @@ -44,7 +43,6 @@
388                         break;
389  #ifdef CONFIG_SMP
390                 case 's': /* "smp" reboot by executing reset on BSP or other CPU*/
391 -                       reboot_smp = 1;
392                         if (is_digit(*(str+1))) {
393                                 reboot_cpu = (int) (*(str+1) - '0');
394                                 if (is_digit(*(str+2))) 
395 @@ -217,51 +215,7 @@
396  
397  void machine_restart(char * __unused)
398  {
399 -#ifdef CONFIG_SMP
400 -       int cpuid;
401 -       
402 -       cpuid = GET_APIC_ID(apic_read(APIC_ID));
403 -
404 -       if (reboot_smp) {
405 -
406 -               /* check to see if reboot_cpu is valid 
407 -                  if its not, default to the BSP */
408 -               if ((reboot_cpu == -1) ||  
409 -                     (reboot_cpu > (NR_CPUS -1))  || 
410 -                     !physid_isset(cpuid, phys_cpu_present_map))
411 -                       reboot_cpu = boot_cpu_physical_apicid;
412 -
413 -               reboot_smp = 0;  /* use this as a flag to only go through this once*/
414 -               /* re-run this function on the other CPUs
415 -                  it will fall though this section since we have 
416 -                  cleared reboot_smp, and do the reboot if it is the
417 -                  correct CPU, otherwise it halts. */
418 -               if (reboot_cpu != cpuid)
419 -                       smp_call_function((void *)machine_restart , NULL, 1, 0);
420 -       }
421 -
422 -       /* if reboot_cpu is still -1, then we want a tradional reboot, 
423 -          and if we are not running on the reboot_cpu,, halt */
424 -       if ((reboot_cpu != -1) && (cpuid != reboot_cpu)) {
425 -               for (;;)
426 -               __asm__ __volatile__ ("hlt");
427 -       }
428 -       /*
429 -        * Stop all CPUs and turn off local APICs and the IO-APIC, so
430 -        * other OSs see a clean IRQ state.
431 -        */
432 -       smp_send_stop();
433 -#elif defined(CONFIG_X86_LOCAL_APIC)
434 -       if (cpu_has_apic) {
435 -               local_irq_disable();
436 -               disable_local_APIC();
437 -               local_irq_enable();
438 -       }
439 -#endif
440 -#ifdef CONFIG_X86_IO_APIC
441 -       disable_IO_APIC();
442 -#endif
443 -
444 +        stop_apics();
445         if (!reboot_thru_bios) {
446                 if (efi_enabled) {
447                         efi.reset_system(EFI_RESET_COLD, EFI_SUCCESS, 0, 0);
448 @@ -284,11 +238,13 @@
449  }
450  
451  void machine_halt(void)
452 -{
453 +{ 
454 +       stop_apics();
455  }
456  
457  void machine_power_off(void)
458  {
459 +       stop_apics();
460         if (efi_enabled)
461                 efi.reset_system(EFI_RESET_SHUTDOWN, EFI_SUCCESS, 0, 0);
462         if (pm_power_off)
463 Index: linux-2.6.0-test6/arch/i386/kernel/relocate_kernel.S
464 ===================================================================
465 --- linux-2.6.0-test6.orig/arch/i386/kernel/relocate_kernel.S   2003-10-07 16:09:00.000000000 +0800
466 +++ linux-2.6.0-test6/arch/i386/kernel/relocate_kernel.S        2003-10-07 16:09:01.000000000 +0800
467 @@ -0,0 +1,107 @@
468 +#include <linux/config.h>
469 +#include <linux/linkage.h>
470 +
471 +       /* Must be relocatable PIC code callable as a C function, that once
472 +        * it starts can not use the previous processes stack.
473 +        *
474 +        */
475 +       .globl relocate_new_kernel
476 +relocate_new_kernel:
477 +       /* read the arguments and say goodbye to the stack */
478 +       movl  4(%esp), %ebx /* indirection_page */
479 +       movl  8(%esp), %ebp /* reboot_code_buffer */
480 +       movl  12(%esp), %edx /* start address */
481 +
482 +       /* zero out flags, and disable interrupts */
483 +       pushl $0
484 +       popfl
485 +
486 +       /* set a new stack at the bottom of our page... */
487 +       lea   4096(%ebp), %esp
488 +
489 +       /* store the parameters back on the stack */
490 +       pushl   %edx /* store the start address */
491 +
492 +       /* Set cr0 to a known state:
493 +        * 31 0 == Paging disabled
494 +        * 18 0 == Alignment check disabled
495 +        * 16 0 == Write protect disabled
496 +        * 3  0 == No task switch
497 +        * 2  0 == Don't do FP software emulation.
498 +        * 0  1 == Protected mode enabled
499 +        */
500 +       movl    %cr0, %eax
501 +       andl    $~((1<<31)|(1<<18)|(1<<16)|(1<<3)|(1<<2)), %eax
502 +       orl     $(1<<0), %eax
503 +       movl    %eax, %cr0
504 +       
505 +       /* Set cr4 to a known state:
506 +        * Setting everything to zero seems safe.
507 +        */
508 +       movl    %cr4, %eax
509 +       andl    $0, %eax
510 +       movl    %eax, %cr4
511 +       
512 +       jmp 1f
513 +1:     
514 +
515 +       /* Flush the TLB (needed?) */
516 +       xorl    %eax, %eax
517 +       movl    %eax, %cr3
518 +
519 +       /* Do the copies */
520 +       cld
521 +0:     /* top, read another word for the indirection page */
522 +       movl    %ebx, %ecx
523 +       movl    (%ebx), %ecx
524 +       addl    $4, %ebx
525 +       testl   $0x1,   %ecx  /* is it a destination page */
526 +       jz      1f
527 +       movl    %ecx,   %edi
528 +       andl    $0xfffff000, %edi
529 +       jmp     0b
530 +1:
531 +       testl   $0x2,   %ecx  /* is it an indirection page */
532 +       jz      1f
533 +       movl    %ecx,   %ebx
534 +       andl    $0xfffff000, %ebx
535 +       jmp     0b
536 +1:
537 +       testl   $0x4,   %ecx /* is it the done indicator */
538 +       jz      1f
539 +       jmp     2f
540 +1:
541 +       testl   $0x8,   %ecx /* is it the source indicator */
542 +       jz      0b           /* Ignore it otherwise */
543 +       movl    %ecx,   %esi /* For every source page do a copy */
544 +       andl    $0xfffff000, %esi
545 +
546 +       movl    $1024, %ecx
547 +       rep ; movsl
548 +       jmp     0b
549 +
550 +2:
551 +
552 +       /* To be certain of avoiding problems with self modifying code
553 +        * I need to execute a serializing instruction here.
554 +        * So I flush the TLB, it's handy, and not processor dependent.
555 +        */
556 +       xorl    %eax, %eax
557 +       movl    %eax, %cr3
558 +       
559 +       /* set all of the registers to known values */
560 +       /* leave %esp alone */
561 +       
562 +       xorl    %eax, %eax
563 +       xorl    %ebx, %ebx
564 +       xorl    %ecx, %ecx
565 +       xorl    %edx, %edx
566 +       xorl    %esi, %esi
567 +       xorl    %edi, %edi
568 +       xorl    %ebp, %ebp
569 +       ret
570 +relocate_new_kernel_end:
571 +
572 +       .globl relocate_new_kernel_size
573 +relocate_new_kernel_size:      
574 +       .long relocate_new_kernel_end - relocate_new_kernel
575 Index: linux-2.6.0-test6/arch/i386/kernel/smp.c
576 ===================================================================
577 --- linux-2.6.0-test6.orig/arch/i386/kernel/smp.c       2003-10-07 16:08:59.000000000 +0800
578 +++ linux-2.6.0-test6/arch/i386/kernel/smp.c    2003-10-07 16:09:01.000000000 +0800
579 @@ -577,6 +577,30 @@
580  
581  void smp_send_stop(void)
582  {
583 +       extern int reboot_cpu;
584 +       int reboot_cpu_id;
585 +       
586 +       /* The boot cpu is always logical cpu 0 */
587 +       reboot_cpu_id = 0;
588 +
589 +       /* See if a command line override has been given.
590 +        */
591 +       if ((reboot_cpu != -1) && !(reboot_cpu >= NR_CPUS) && 
592 +               test_bit(reboot_cpu, &cpu_online_map)) {
593 +               reboot_cpu_id = reboot_cpu;
594 +       }
595 +        
596 +       /* Make certain the cpu I'm rebooting on is online */
597 +       if (!test_bit(reboot_cpu_id, &cpu_online_map)) {
598 +               reboot_cpu_id = smp_processor_id();
599 +       }
600 +
601 +       /* Make certain I only run on the appropriate processor */
602 +       set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
603 +
604 +       /* O.k. Now that I'm on the appropriate processor stop
605 +        * all of the others.
606 +        */
607         smp_call_function(stop_this_cpu, NULL, 1, 0);
608  
609         local_irq_disable();
610 Index: linux-2.6.0-test6/include/asm-i386/apic.h
611 ===================================================================
612 --- linux-2.6.0-test6.orig/include/asm-i386/apic.h      2003-10-07 15:47:25.000000000 +0800
613 +++ linux-2.6.0-test6/include/asm-i386/apic.h   2003-10-07 16:09:01.000000000 +0800
614 @@ -99,6 +99,9 @@
615  #define NMI_LOCAL_APIC 2
616  #define NMI_INVALID    3
617  
618 +extern void stop_apics(void);
619 +#else
620 +static inline void stop_apics(void) { }
621  #endif /* CONFIG_X86_LOCAL_APIC */
622  
623  #endif /* __ASM_APIC_H */
624 Index: linux-2.6.0-test6/include/asm-i386/apicdef.h
625 ===================================================================
626 --- linux-2.6.0-test6.orig/include/asm-i386/apicdef.h   2003-10-07 15:47:25.000000000 +0800
627 +++ linux-2.6.0-test6/include/asm-i386/apicdef.h        2003-10-07 16:09:01.000000000 +0800
628 @@ -86,6 +86,7 @@
629  #define                        APIC_LVT_REMOTE_IRR             (1<<14)
630  #define                        APIC_INPUT_POLARITY             (1<<13)
631  #define                        APIC_SEND_PENDING               (1<<12)
632 +#define                        APIC_MODE_MASK                  0x700
633  #define                        GET_APIC_DELIVERY_MODE(x)       (((x)>>8)&0x7)
634  #define                        SET_APIC_DELIVERY_MODE(x,y)     (((x)&~0x700)|((y)<<8))
635  #define                                APIC_MODE_FIXED         0x0
636 Index: linux-2.6.0-test6/include/asm-i386/kexec.h
637 ===================================================================
638 --- linux-2.6.0-test6.orig/include/asm-i386/kexec.h     2003-10-07 16:09:00.000000000 +0800
639 +++ linux-2.6.0-test6/include/asm-i386/kexec.h  2003-10-07 16:09:01.000000000 +0800
640 @@ -0,0 +1,23 @@
641 +#ifndef _I386_KEXEC_H
642 +#define _I386_KEXEC_H
643 +
644 +#include <asm/fixmap.h>
645 +
646 +/*
647 + * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
648 + * I.e. Maximum page that is mapped directly into kernel memory,
649 + * and kmap is not required.
650 + *
651 + * Someone correct me if FIXADDR_START - PAGEOFFSET is not the correct
652 + * calculation for the amount of memory directly mappable into the
653 + * kernel memory space.
654 + */
655 +
656 +/* Maximum physical address we can use pages from */
657 +#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
658 +/* Maximum address we can reach in physical address mode */
659 +#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
660 +
661 +#define KEXEC_REBOOT_CODE_SIZE 4096
662 +
663 +#endif /* _I386_KEXEC_H */
664 Index: linux-2.6.0-test6/include/asm-i386/unistd.h
665 ===================================================================
666 --- linux-2.6.0-test6.orig/include/asm-i386/unistd.h    2003-10-07 16:08:41.000000000 +0800
667 +++ linux-2.6.0-test6/include/asm-i386/unistd.h 2003-10-07 16:09:01.000000000 +0800
668 @@ -279,8 +279,9 @@
669  #define __NR_utimes            271
670  #define __NR_fadvise64_64      272
671  #define __NR_vserver           273
672 -
673 -#define NR_syscalls 274
674 +#define __NR_sys_kexec_load    274
675 +    
676 +#define NR_syscalls 275
677  
678  /* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
679  
680 Index: linux-2.6.0-test6/include/linux/kexec.h
681 ===================================================================
682 --- linux-2.6.0-test6.orig/include/linux/kexec.h        2003-10-07 16:09:00.000000000 +0800
683 +++ linux-2.6.0-test6/include/linux/kexec.h     2003-10-07 16:09:01.000000000 +0800
684 @@ -0,0 +1,54 @@
685 +#ifndef LINUX_KEXEC_H
686 +#define LINUX_KEXEC_H
687 +
688 +#if CONFIG_KEXEC
689 +#include <linux/types.h>
690 +#include <linux/list.h>
691 +#include <asm/kexec.h>
692 +
693 +/* 
694 + * This structure is used to hold the arguments that are used when loading
695 + * kernel binaries.
696 + */
697 +
698 +typedef unsigned long kimage_entry_t;
699 +#define IND_DESTINATION  0x1
700 +#define IND_INDIRECTION  0x2
701 +#define IND_DONE         0x4
702 +#define IND_SOURCE       0x8
703 +
704 +#define KEXEC_SEGMENT_MAX 8
705 +struct kexec_segment {
706 +       void *buf;
707 +       size_t bufsz;
708 +       void *mem;
709 +       size_t memsz;
710 +};
711 +
712 +struct kimage {
713 +       kimage_entry_t head;
714 +       kimage_entry_t *entry;
715 +       kimage_entry_t *last_entry;
716 +
717 +       unsigned long destination;
718 +       unsigned long offset;
719 +
720 +       unsigned long start;
721 +       struct page *reboot_code_pages;
722 +
723 +       unsigned long nr_segments;
724 +       struct kexec_segment segment[KEXEC_SEGMENT_MAX+1];
725 +
726 +       struct list_head dest_pages;
727 +       struct list_head unuseable_pages;
728 +};
729 +
730 +
731 +/* kexec interface functions */
732 +extern void machine_kexec(struct kimage *image);
733 +extern asmlinkage long sys_kexec(unsigned long entry, long nr_segments, 
734 +       struct kexec_segment *segments);
735 +extern struct kimage *kexec_image;
736 +#endif
737 +#endif /* LINUX_KEXEC_H */
738 +
739 Index: linux-2.6.0-test6/include/linux/reboot.h
740 ===================================================================
741 --- linux-2.6.0-test6.orig/include/linux/reboot.h       2003-10-07 15:47:25.000000000 +0800
742 +++ linux-2.6.0-test6/include/linux/reboot.h    2003-10-07 16:09:01.000000000 +0800
743 @@ -22,6 +22,7 @@
744   * POWER_OFF   Stop OS and remove all power from system, if possible.
745   * RESTART2    Restart system using given command string.
746   * SW_SUSPEND  Suspend system using software suspend if compiled in.
747 + * KEXEC       Restart the system using a different kernel.
748   */
749  
750  #define        LINUX_REBOOT_CMD_RESTART        0x01234567
751 @@ -31,6 +32,7 @@
752  #define        LINUX_REBOOT_CMD_POWER_OFF      0x4321FEDC
753  #define        LINUX_REBOOT_CMD_RESTART2       0xA1B2C3D4
754  #define        LINUX_REBOOT_CMD_SW_SUSPEND     0xD000FCE2
755 +#define LINUX_REBOOT_CMD_KEXEC         0x45584543
756  
757  
758  #ifdef __KERNEL__
759 Index: linux-2.6.0-test6/kernel/Makefile
760 ===================================================================
761 --- linux-2.6.0-test6.orig/kernel/Makefile      2003-10-07 16:08:42.000000000 +0800
762 +++ linux-2.6.0-test6/kernel/Makefile   2003-10-07 16:09:01.000000000 +0800
763 @@ -17,6 +17,7 @@
764  obj-$(CONFIG_KALLSYMS) += kallsyms.o
765  obj-$(CONFIG_PM) += power/
766  obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
767 +obj-$(CONFIG_KEXEC) += kexec.o
768  obj-$(CONFIG_COMPAT) += compat.o
769  obj-$(CONFIG_IKCONFIG) += configs.o
770  obj-$(CONFIG_IKCONFIG_PROC) += configs.o
771 Index: linux-2.6.0-test6/kernel/kexec.c
772 ===================================================================
773 --- linux-2.6.0-test6.orig/kernel/kexec.c       2003-10-07 16:09:00.000000000 +0800
774 +++ linux-2.6.0-test6/kernel/kexec.c    2003-10-07 16:09:01.000000000 +0800
775 @@ -0,0 +1,629 @@
776 +#include <linux/mm.h>
777 +#include <linux/file.h>
778 +#include <linux/slab.h>
779 +#include <linux/fs.h>
780 +#include <linux/version.h>
781 +#include <linux/compile.h>
782 +#include <linux/kexec.h>
783 +#include <linux/spinlock.h>
784 +#include <linux/list.h>
785 +#include <linux/highmem.h>
786 +#include <net/checksum.h>
787 +#include <asm/page.h>
788 +#include <asm/uaccess.h>
789 +#include <asm/io.h>
790 +#include <asm/system.h>
791 +
792 +/* When kexec transitions to the new kernel there is a one to one
793 + * mapping between physical and virtual addresses.  On processors
794 + * where you can disable the MMU this is trivial, and easy.  For
795 + * others it is still a simple predictable page table to setup.
796 + *
797 + * In that environment kexec copies the new kernel to its final
798 + * resting place.  This means I can only support memory whose
799 + * physical address can fit in an unsigned long.  In particular
800 + * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
801 + * If the assembly stub has more restrictive requirements
802 + * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
803 + * defined more restrictively in <asm/kexec.h>.
804 + *
805 + * The code for the transition from the current kernel to the 
806 + * the new kernel is placed in the reboot_code_buffer, whose size
807 + * is given by KEXEC_REBOOT_CODE_SIZE.  In the best case only a single
808 + * page of memory is necessary, but some architectures require more.
809 + * Because this memory must be identity mapped in the transition from
810 + * virtual to physical addresses it must live in the range
811 + * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
812 + * modifiable.
813 + *
814 + * The assembly stub in the reboot code buffer is passed a linked list
815 + * of descriptor pages detailing the source pages of the new kernel,
816 + * and the destination addresses of those source pages.  As this data
817 + * structure is not used in the context of the current OS, it must
818 + * be self contained.
819 + *
820 + * The code has been made to work with highmem pages and will use a
821 + * destination page in its final resting place (if it happens 
822 + * to allocate it).  The end product of this is that most of the
823 + * physical address space, and most of ram can be used.
824 + *
825 + * Future directions include:
826 + *  - allocating a page table with the reboot code buffer identity
827 + *    mapped, to simplify machine_kexec and make kexec_on_panic more
828 + *    reliable.  
829 + *  - allocating the pages for a page table for machines that cannot
830 + *    disable their MMUs.  (Hammer, Alpha...)
831 + */
832 +
833 +/* KIMAGE_NO_DEST is an impossible destination address..., for
834 + * allocating pages whose destination address we do not care about.
835 + */
836 +#define KIMAGE_NO_DEST (-1UL)
837 +
838 +static int kimage_is_destination_range(
839 +       struct kimage *image, unsigned long start, unsigned long end);
840 +static struct page *kimage_alloc_reboot_code_pages(struct kimage *image);
841 +static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long dest);
842 +
843 +
844 +static int kimage_alloc(struct kimage **rimage, 
845 +       unsigned long nr_segments, struct kexec_segment *segments)
846 +{
847 +       int result;
848 +       struct kimage *image;
849 +       size_t segment_bytes;
850 +       struct page *reboot_pages;
851 +       unsigned long i;
852 +
853 +       /* Allocate a controlling structure */
854 +       result = -ENOMEM;
855 +       image = kmalloc(sizeof(*image), GFP_KERNEL);
856 +       if (!image) {
857 +               goto out;
858 +       }
859 +       memset(image, 0, sizeof(*image));
860 +       image->head = 0;
861 +       image->entry = &image->head;
862 +       image->last_entry = &image->head;
863 +
864 +       /* Initialize the list of destination pages */
865 +       INIT_LIST_HEAD(&image->dest_pages);
866 +
867 +       /* Initialize the list of unuseable pages */
868 +       INIT_LIST_HEAD(&image->unuseable_pages);
869 +
870 +       /* Read in the segments */
871 +       image->nr_segments = nr_segments;
872 +       segment_bytes = nr_segments * sizeof*segments;
873 +       result = copy_from_user(image->segment, segments, segment_bytes);
874 +       if (result) 
875 +               goto out;
876 +
877 +       /* Verify we have good destination addresses.  The caller is
878 +        * responsible for making certain we don't attempt to load
879 +        * the new image into invalid or reserved areas of RAM.  This
880 +        * just verifies it is an address we can use. 
881 +        */
882 +       result = -EADDRNOTAVAIL;
883 +       for(i = 0; i < nr_segments; i++) {
884 +               unsigned long mend;
885 +               mend = ((unsigned long)(image->segment[i].mem)) + 
886 +                       image->segment[i].memsz;
887 +               if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
888 +                       goto out;
889 +       }
890 +
891 +       /* Find a location for the reboot code buffer, and add it
892 +        * to the vector of segments so that its pages will also be
893 +        * counted as destination pages.  
894 +        */
895 +       result = -ENOMEM;
896 +       reboot_pages = kimage_alloc_reboot_code_pages(image);
897 +       if (!reboot_pages) {
898 +               printk(KERN_ERR "Could not allocate reboot_code_buffer\n");
899 +               goto out;
900 +       }
901 +       image->reboot_code_pages = reboot_pages;
902 +       image->segment[nr_segments].buf = 0;
903 +       image->segment[nr_segments].bufsz = 0;
904 +       image->segment[nr_segments].mem = (void *)(page_to_pfn(reboot_pages) << PAGE_SHIFT);
905 +       image->segment[nr_segments].memsz = KEXEC_REBOOT_CODE_SIZE;
906 +       image->nr_segments++;
907 +
908 +       result = 0;
909 + out:
910 +       if (result == 0) {
911 +               *rimage = image;
912 +       } else {
913 +               kfree(image);
914 +       }
915 +       return result;
916 +}
917 +
918 +static int kimage_is_destination_range(
919 +       struct kimage *image, unsigned long start, unsigned long end)
920 +{
921 +       unsigned long i;
922 +       for(i = 0; i < image->nr_segments; i++) {
923 +               unsigned long mstart, mend;
924 +               mstart = (unsigned long)image->segment[i].mem;
925 +               mend   = mstart + image->segment[i].memsz;
926 +               if ((end > mstart) && (start < mend)) {
927 +                       return 1;
928 +               }
929 +       }
930 +       return 0;
931 +}
932 +
933 +#ifdef CONFIG_MMU
934 +static int identity_map_pages(struct page *pages, int order)
935 +{
936 +       struct mm_struct *mm;
937 +       struct vm_area_struct *vma;
938 +       int error;
939 +       mm = &init_mm;
940 +       vma = 0;
941 +
942 +       down_write(&mm->mmap_sem);
943 +       error = -ENOMEM;
944 +       vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
945 +       if (!vma) {
946 +               goto out;
947 +       }
948 +
949 +       memset(vma, 0, sizeof(vma));
950 +       vma->vm_mm = mm;
951 +       vma->vm_start = page_to_pfn(pages) << PAGE_SHIFT;
952 +       vma->vm_end = vma->vm_start + (1 << (order + PAGE_SHIFT));
953 +       vma->vm_ops = 0;
954 +       vma->vm_flags = VM_SHARED \
955 +               | VM_READ | VM_WRITE | VM_EXEC \
956 +               | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC \
957 +               | VM_DONTCOPY | VM_RESERVED;
958 +       vma->vm_page_prot = protection_map[vma->vm_flags & 0xf];
959 +       vma->vm_file = NULL;
960 +       vma->vm_private_data = NULL;
961 +       INIT_LIST_HEAD(&vma->shared);
962 +       insert_vm_struct(mm, vma);
963 +       
964 +       error = remap_page_range(vma, vma->vm_start, vma->vm_start,
965 +               vma->vm_end - vma->vm_start, vma->vm_page_prot);
966 +       if (error) {
967 +               goto out;
968 +       }
969 +
970 +       error = 0;
971 + out:
972 +       if (error && vma) {
973 +               kmem_cache_free(vm_area_cachep, vma);
974 +               vma = 0;
975 +       }
976 +       up_write(&mm->mmap_sem);
977 +
978 +       return error;
979 +}
980 +#else
981 +#define identity_map_pages(pages, order) 0
982 +#endif
983 +
984 +struct page *kimage_alloc_reboot_code_pages(struct kimage *image)
985 +{
986 +       /* The reboot code buffer is special.  It is the only set of
987 +        * pages that must be allocated in their final resting place,
988 +        * and the only set of pages whose final resting place we can
989 +        * pick. 
990 +        *
991 +        * At worst this runs in O(N) of the image size.
992 +        */
993 +       struct list_head extra_pages, *pos, *next;
994 +       struct page *pages;
995 +       unsigned long addr;
996 +       int order, count;
997 +       order = get_order(KEXEC_REBOOT_CODE_SIZE);
998 +       count = 1 << order;
999 +       INIT_LIST_HEAD(&extra_pages);
1000 +       do {
1001 +               int i;
1002 +               pages = alloc_pages(GFP_HIGHUSER, order);
1003 +               if (!pages)
1004 +                       break;
1005 +               for(i = 0; i < count; i++) {
1006 +                       SetPageReserved(pages +i);
1007 +               }
1008 +               addr = page_to_pfn(pages) << PAGE_SHIFT;
1009 +               if ((page_to_pfn(pages) >= (TASK_SIZE >> PAGE_SHIFT)) ||
1010 +                       kimage_is_destination_range(image, addr, addr + KEXEC_REBOOT_CODE_SIZE)) {
1011 +                       list_add(&pages->list, &extra_pages);
1012 +                       pages = 0;
1013 +               }
1014 +       } while(!pages);
1015 +       if (pages) {
1016 +               int result;
1017 +               result = identity_map_pages(pages, order);
1018 +               if (result < 0) {
1019 +                       list_add(&pages->list, &extra_pages);
1020 +                       pages = 0;
1021 +               }
1022 +       }
1023 +       /* If I could convert a multi page allocation into a bunch of
1024 +        * single page allocations I could add these pages to
1025 +        * image->dest_pages.  For now it is simpler to just free the
1026 +        * pages again.
1027 +        */
1028 +       list_for_each_safe(pos, next, &extra_pages) {
1029 +               struct page *page;
1030 +               int i;
1031 +               page = list_entry(pos, struct page, list);
1032 +               for(i = 0; i < count; i++) {
1033 +                       ClearPageReserved(pages +i);
1034 +               }
1035 +               list_del(&extra_pages);
1036 +               __free_pages(page, order);
1037 +       }
1038 +       return pages;
1039 +}
1040 +
1041 +static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
1042 +{
1043 +       if (image->offset != 0) {
1044 +               image->entry++;
1045 +       }
1046 +       if (image->entry == image->last_entry) {
1047 +               kimage_entry_t *ind_page;
1048 +               struct page *page;
1049 +               page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
1050 +               if (!page) {
1051 +                       return -ENOMEM;
1052 +               }
1053 +               ind_page = page_address(page);
1054 +               *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
1055 +               image->entry = ind_page;
1056 +               image->last_entry = 
1057 +                       ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
1058 +       }
1059 +       *image->entry = entry;
1060 +       image->entry++;
1061 +       image->offset = 0;
1062 +       return 0;
1063 +}
1064 +
1065 +static int kimage_set_destination(
1066 +       struct kimage *image, unsigned long destination) 
1067 +{
1068 +       int result;
1069 +       destination &= PAGE_MASK;
1070 +       result = kimage_add_entry(image, destination | IND_DESTINATION);
1071 +       if (result == 0) {
1072 +               image->destination = destination;
1073 +       }
1074 +       return result;
1075 +}
1076 +
1077 +
1078 +static int kimage_add_page(struct kimage *image, unsigned long page)
1079 +{
1080 +       int result;
1081 +       page &= PAGE_MASK;
1082 +       result = kimage_add_entry(image, page | IND_SOURCE);
1083 +       if (result == 0) {
1084 +               image->destination += PAGE_SIZE;
1085 +       }
1086 +       return result;
1087 +}
1088 +
1089 +
1090 +static void kimage_free_extra_pages(struct kimage *image)
1091 +{
1092 +       /* Walk through and free any extra destination pages I may have */
1093 +       struct list_head *pos, *next;
1094 +       list_for_each_safe(pos, next, &image->dest_pages) {
1095 +               struct page *page;
1096 +               page = list_entry(pos, struct page, list);
1097 +               list_del(&page->list);
1098 +               ClearPageReserved(page);
1099 +               __free_page(page);
1100 +       }
1101 +       /* Walk through and free any unuseable pages I have cached */
1102 +       list_for_each_safe(pos, next, &image->unuseable_pages) {
1103 +               struct page *page;
1104 +               page = list_entry(pos, struct page, list);
1105 +               list_del(&page->list);
1106 +               ClearPageReserved(page);
1107 +               __free_page(page);
1108 +       }
1109 +
1110 +}
1111 +static int kimage_terminate(struct kimage *image)
1112 +{
1113 +       int result;
1114 +       result = kimage_add_entry(image, IND_DONE);
1115 +       if (result == 0) {
1116 +               /* Point at the terminating element */
1117 +               image->entry--;
1118 +               kimage_free_extra_pages(image);
1119 +       }
1120 +       return result;
1121 +}
1122 +
1123 +#define for_each_kimage_entry(image, ptr, entry) \
1124 +       for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
1125 +               ptr = (entry & IND_INDIRECTION)? \
1126 +                       phys_to_virt((entry & PAGE_MASK)): ptr +1)
1127 +
1128 +static void kimage_free(struct kimage *image)
1129 +{
1130 +       kimage_entry_t *ptr, entry;
1131 +       kimage_entry_t ind = 0;
1132 +       int i, count, order;
1133 +       if (!image)
1134 +               return;
1135 +       kimage_free_extra_pages(image);
1136 +       for_each_kimage_entry(image, ptr, entry) {
1137 +               if (entry & IND_INDIRECTION) {
1138 +                       /* Free the previous indirection page */
1139 +                       if (ind & IND_INDIRECTION) {
1140 +                               free_page((unsigned long)phys_to_virt(ind & PAGE_MASK));
1141 +                       }
1142 +                       /* Save this indirection page until we are
1143 +                        * done with it.
1144 +                        */
1145 +                       ind = entry;
1146 +               }
1147 +               else if (entry & IND_SOURCE) {
1148 +                       free_page((unsigned long)phys_to_virt(entry & PAGE_MASK));
1149 +               }
1150 +       }
1151 +       order = get_order(KEXEC_REBOOT_CODE_SIZE);
1152 +       count = 1 << order;
1153 +       do_munmap(&init_mm, 
1154 +               page_to_pfn(image->reboot_code_pages) << PAGE_SHIFT, 
1155 +               count << PAGE_SHIFT);
1156 +       for(i = 0; i < count; i++) {
1157 +               ClearPageReserved(image->reboot_code_pages + i);
1158 +       }
1159 +       __free_pages(image->reboot_code_pages, order);
1160 +       kfree(image);
1161 +}
1162 +
1163 +static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page)
1164 +{
1165 +       kimage_entry_t *ptr, entry;
1166 +       unsigned long destination = 0;
1167 +       for_each_kimage_entry(image, ptr, entry) {
1168 +               if (entry & IND_DESTINATION) {
1169 +                       destination = entry & PAGE_MASK;
1170 +               }
1171 +               else if (entry & IND_SOURCE) {
1172 +                       if (page == destination) {
1173 +                               return ptr;
1174 +                       }
1175 +                       destination += PAGE_SIZE;
1176 +               }
1177 +       }
1178 +       return 0;
1179 +}
1180 +
1181 +static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long destination)
1182 +{
1183 +       /* Here we implement safeguards to ensure that a source page
1184 +        * is not copied to its destination page before the data on
1185 +        * the destination page is no longer useful.
1186 +        *
1187 +        * To do this we maintain the invariant that a source page is
1188 +        * either its own destination page, or it is not a
1189 +        * destination page at all.  
1190 +        *
1191 +        * That is slightly stronger than required, but the proof
1192 +        * that no problems will occur is trivial, and the
1193 +        * implementation is simple to verify.
1194 +        *
1195 +        * When allocating all pages normally this algorithm will run
1196 +        * in O(N) time, but in the worst case it will run in O(N^2)
1197 +        * time.   If the runtime is a problem the data structures can
1198 +        * be fixed.
1199 +        */
1200 +       struct page *page;
1201 +       unsigned long addr;
1202 +
1203 +       /* Walk through the list of destination pages, and see if I
1204 +        * have a match.
1205 +        */
1206 +       list_for_each_entry(page, &image->dest_pages, list) {
1207 +               addr = page_to_pfn(page) << PAGE_SHIFT;
1208 +               if (addr == destination) {
1209 +                       list_del(&page->list);
1210 +                       return page;
1211 +               }
1212 +       }
1213 +       page = 0;
1214 +       while(1) {
1215 +               kimage_entry_t *old;
1216 +               /* Allocate a page, if we run out of memory give up */
1217 +               page = alloc_page(gfp_mask);
1218 +               if (!page) {
1219 +                       return 0;
1220 +               }
1221 +               SetPageReserved(page);
1222 +               /* If the page cannot be used file it away */
1223 +               if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
1224 +                       list_add(&page->list, &image->unuseable_pages);
1225 +                       continue;
1226 +               }
1227 +               addr = page_to_pfn(page) << PAGE_SHIFT;
1228 +
1229 +               /* If it is the destination page we want, use it */
1230 +               if (addr == destination)
1231 +                       break;
1232 +
1233 +               /* If the page is not a destination page use it */
1234 +               if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE))
1235 +                       break;
1236 +
1237 +               /* I know that the page is someone's destination page.
1238 +                * See if there is already a source page for this
1239 +                * destination page.  And if so swap the source pages.
1240 +                */
1241 +               old = kimage_dst_used(image, addr);
1242 +               if (old) {
1243 +                       /* If so move it */
1244 +                       unsigned long old_addr;
1245 +                       struct page *old_page;
1246 +                       
1247 +                       old_addr = *old & PAGE_MASK;
1248 +                       old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
1249 +                       copy_highpage(page, old_page);
1250 +                       *old = addr | (*old & ~PAGE_MASK);
1251 +
1252 +                       /* The old page I have found cannot be a
1253 +                        * destination page, so return it.
1254 +                        */
1255 +                       addr = old_addr;
1256 +                       page = old_page;
1257 +                       break;
1258 +               }
1259 +               else {
1260 +                       /* Place the page on the destination list I
1261 +                        * will use it later.
1262 +                        */
1263 +                       list_add(&page->list, &image->dest_pages);
1264 +               }
1265 +       }
1266 +       return page;
1267 +}
1268 +
1269 +static int kimage_load_segment(struct kimage *image,
1270 +       struct kexec_segment *segment)
1271 +{      
1272 +       unsigned long mstart;
1273 +       int result;
1274 +       unsigned long offset;
1275 +       unsigned long offset_end;
1276 +       unsigned char *buf;
1277 +
1278 +       result = 0;
1279 +       buf = segment->buf;
1280 +       mstart = (unsigned long)segment->mem;
1281 +
1282 +       offset_end = segment->memsz;
1283 +
1284 +       result = kimage_set_destination(image, mstart);
1285 +       if (result < 0) {
1286 +               goto out;
1287 +       }
1288 +       for(offset = 0;  offset < segment->memsz; offset += PAGE_SIZE) {
1289 +               struct page *page;
1290 +               char *ptr;
1291 +               size_t size, leader;
1292 +               page = kimage_alloc_page(image, GFP_HIGHUSER, mstart + offset);
1293 +               if (page == 0) {
1294 +                       result  = -ENOMEM;
1295 +                       goto out;
1296 +               }
1297 +               result = kimage_add_page(image, page_to_pfn(page) << PAGE_SHIFT);
1298 +               if (result < 0) {
1299 +                       goto out;
1300 +               }
1301 +               ptr = kmap(page);
1302 +               if (segment->bufsz < offset) {
1303 +                       /* We are past the end, zero the whole page */
1304 +                       memset(ptr, 0, PAGE_SIZE);
1305 +                       kunmap(page);
1306 +                       continue;
1307 +               }
1308 +               size = PAGE_SIZE;
1309 +               leader = 0;
1310 +               if ((offset == 0)) {
1311 +                       leader = mstart & ~PAGE_MASK;
1312 +               }
1313 +               if (leader) {
1314 +                       /* We are on the first page, zero the unused portion */
1315 +                       memset(ptr, 0, leader);
1316 +                       size -= leader;
1317 +                       ptr += leader;
1318 +               }
1319 +               if (size > (segment->bufsz - offset)) {
1320 +                       size = segment->bufsz - offset;
1321 +               }
1322 +               if (size < (PAGE_SIZE - leader)) {
1323 +                       /* zero the trailing part of the page */
1324 +                       memset(ptr + size, 0, (PAGE_SIZE - leader) - size);
1325 +               }
1326 +               result = copy_from_user(ptr, buf + offset, size);
1327 +               kunmap(page);
1328 +               if (result) {
1329 +                       result = (result < 0)?result : -EIO;
1330 +                       goto out;
1331 +               }
1332 +       }
1333 + out:
1334 +       return result;
1335 +}
1336 +
1337 +/*
1338 + * Exec Kernel system call: for obvious reasons only root may call it.
1339 + * 
1340 + * This call breaks up into three pieces.  
1341 + * - A generic part which loads the new kernel from the current
1342 + *   address space, and very carefully places the data in the
1343 + *   allocated pages.
1344 + *
1345 + * - A generic part that interacts with the kernel and tells all of
1346 + *   the devices to shut down.  Preventing on-going dmas, and placing
1347 + *   the devices in a consistent state so a later kernel can
1348 + *   reinitialize them.
1349 + *
1350 + * - A machine specific part that includes the syscall number
1351 + *   and the copies the image to it's final destination.  And
1352 + *   jumps into the image at entry.
1353 + *
1354 + * kexec does not sync, or unmount filesystems so if you need
1355 + * that to happen you need to do that yourself.
1356 + */
1357 +struct kimage *kexec_image = 0;
1358 +
1359 +asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, 
1360 +       struct kexec_segment *segments, unsigned long flags)
1361 +{
1362 +       struct kimage *image;
1363 +       int result;
1364 +               
1365 +       /* We only trust the superuser with rebooting the system. */
1366 +       if (!capable(CAP_SYS_ADMIN))
1367 +               return -EPERM;
1368 +
1369 +       /* In case we need just a little bit of special behavior for
1370 +        * reboot on panic 
1371 +        */
1372 +       if (flags != 0)
1373 +               return -EINVAL;
1374 +
1375 +       if (nr_segments > KEXEC_SEGMENT_MAX)
1376 +               return -EINVAL;
1377 +       image = 0;
1378 +
1379 +       result = 0;
1380 +       if (nr_segments > 0) {
1381 +               unsigned long i;
1382 +               result = kimage_alloc(&image, nr_segments, segments);
1383 +               if (result) {
1384 +                       goto out;
1385 +               }
1386 +               image->start = entry;
1387 +               for(i = 0; i < nr_segments; i++) {
1388 +                       result = kimage_load_segment(image, &segments[i]);
1389 +                       if (result) {
1390 +                               goto out;
1391 +                       }
1392 +               }
1393 +               result = kimage_terminate(image);
1394 +               if (result) {
1395 +                       goto out;
1396 +               }
1397 +       }
1398 +
1399 +       image = xchg(&kexec_image, image);
1400 +
1401 + out:
1402 +       kimage_free(image);
1403 +       return result;
1404 +}
1405 Index: linux-2.6.0-test6/kernel/sys.c
1406 ===================================================================
1407 --- linux-2.6.0-test6.orig/kernel/sys.c 2003-10-07 16:08:42.000000000 +0800
1408 +++ linux-2.6.0-test6/kernel/sys.c      2003-10-09 18:38:57.000000000 +0800
1409 @@ -17,6 +17,7 @@
1410  #include <linux/init.h>
1411  #include <linux/highuid.h>
1412  #include <linux/fs.h>
1413 +#include <linux/kexec.h>
1414  #include <linux/workqueue.h>
1415  #include <linux/device.h>
1416  #include <linux/times.h>
1417 @@ -94,6 +95,7 @@
1418  int notifier_chain_register(struct notifier_block **list, struct notifier_block *n)
1419  {
1420         write_lock(&notifier_lock);
1421 +       printk(KERN_EMERG"add noitifier nb %p \n", n);
1422         while(*list)
1423         {
1424                 if(n->priority > (*list)->priority)
1425 @@ -156,6 +158,7 @@
1426  
1427         while(nb)
1428         {
1429 +               printk("nb %p notifier_call %p \n", nb, nb->notifier_call);
1430                 ret=nb->notifier_call(nb,val,v);
1431                 if(ret&NOTIFY_STOP_MASK)
1432                 {
1433 @@ -208,6 +211,7 @@
1434  cond_syscall(sys_lookup_dcookie)
1435  cond_syscall(sys_swapon)
1436  cond_syscall(sys_swapoff)
1437 +cond_syscall(sys_kexec_load)
1438  cond_syscall(sys_init_module)
1439  cond_syscall(sys_delete_module)
1440  cond_syscall(sys_socketpair)
1441 @@ -454,6 +458,27 @@
1442                 machine_restart(buffer);
1443                 break;
1444  
1445 +#ifdef CONFIG_KEXEC
1446 +       case LINUX_REBOOT_CMD_KEXEC:
1447 +       {
1448 +               struct kimage *image;
1449 +               if (arg) {
1450 +                       unlock_kernel();
1451 +                       return -EINVAL;
1452 +               }
1453 +               image = xchg(&kexec_image, 0);
1454 +               if (!image) {
1455 +                       unlock_kernel();
1456 +                       return -EINVAL;
1457 +               }
1458 +               notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
1459 +               system_running = 0;
1460 +               device_shutdown();
1461 +               printk(KERN_EMERG "Starting new kernel\n");
1462 +               machine_kexec(image);
1463 +               break;
1464 +       }
1465 +#endif
1466  #ifdef CONFIG_SOFTWARE_SUSPEND
1467         case LINUX_REBOOT_CMD_SW_SUSPEND:
1468                 if (!software_suspend_enabled) {
1469 Index: linux-2.6.0-test6/fs/aio.c
1470 ===================================================================
1471 --- linux-2.6.0-test6.orig/fs/aio.c     2003-10-07 16:08:40.000000000 +0800
1472 +++ linux-2.6.0-test6/fs/aio.c  2003-10-07 16:09:01.000000000 +0800
1473 @@ -560,7 +560,7 @@
1474   *     (Note: this routine is intended to be called only
1475   *     from a kernel thread context)
1476   */
1477 -static void use_mm(struct mm_struct *mm)
1478 +void use_mm(struct mm_struct *mm)
1479  {
1480         struct mm_struct *active_mm;
1481         struct task_struct *tsk = current;