8. SMP Boot

There are a few SMP related macros, like CONFIG_SMP, CONFIG_X86_LOCAL_APIC, CONFIG_X86_IO_APIC, CONFIG_MULTIQUAD and CONFIG_VISWS. I will ignore code that requires CONFIG_MULTIQUAD or CONFIG_VISWS, which most people don't care (if not using IBM high-end multiprocessor server or SGI Visual Workstation).

BSP executes start_kernel() -> smp_init() -> smp_boot_cpus() -> do_boot_cpu() -> wakeup_secondary_via_INIT() to trigger APs. Check MultiProcessor Specification and IA-32 Manual Vol.3 (Ch.7. Multile-Processor Management, and Ch.8. Advanced Programmable Interrupt Controller) for technical details.

8.1. Before smp_init()

Before calling smp_init(), start_kernel() did something to setup SMP environment:

start_kernel()
|-- setup_arch()
|   |-- parse_cmdline_early();  // SMP looks for "noht" and "acpismp=force"
|   |   `-- /* "noht" disables HyperThreading (2 logical cpus per Xeon) */
|   |       if (!memcmp(from, "noht", 4)) {
|   |           disable_x86_ht = 1;
|   |           set_bit(X86_FEATURE_HT, disabled_x86_caps);
|   |       }
|   |       /* "acpismp=force" forces parsing and use of the ACPI SMP table */
|   |       else if (!memcmp(from, "acpismp=force", 13))
|   |           enable_acpi_smp_table = 1;
|   |-- setup_memory();         // reserve memory for MP configuration table
|   |   |-- reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
|   |   `-- find_smp_config();
|   |       `-- find_intel_smp();
|   |           `-- smp_scan_config();
|   |               |-- set flag smp_found_config
|   |               |-- set MP floating pointer mpf_found
|   |               `-- reserve_bootmem(mpf_found, PAGE_SIZE);
|   |-- if (disable_x86_ht) {   // if HyperThreading feature disabled
|   |       clear_bit(X86_FEATURE_HT, &boot_cpu_data.x86_capability[0]);
|   |       set_bit(X86_FEATURE_HT, disabled_x86_caps);
|   |       enable_acpi_smp_table = 0;
|   |   }
|   |-- if (test_bit(X86_FEATURE_HT, &boot_cpu_data.x86_capability[0]))
|   |       enable_acpi_smp_table = 1;
|   |-- smp_alloc_memory();
|   |   `-- /* reserve AP processor's real-mode code space in low memory */
|   |       trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE);
|   `-- get_smp_config();     /* get boot-time MP configuration */
|       |-- config_acpi_tables();
|       |   |-- memset(&acpi_boot_ops, 0, sizeof(acpi_boot_ops));
|       |   |-- acpi_boot_ops[ACPI_APIC] = acpi_parse_madt;
|       |   `-- /* Set have_acpi_tables to indicate using
|       |        * MADT in the ACPI tables; Use MPS tables if failed. */
|       |       if (enable_acpi_smp_table && !acpi_tables_init())
|       |           have_acpi_tables = 1;
|       |-- set pic_mode
|       |   /* =1, if the IMCR is present and PIC Mode is implemented;
|       |    * =0, otherwise Virtual Wire Mode is implemented. */
|       |-- save local APIC address in mp_lapic_addr
|       `-- scan for MP configuration table entries, like
|             MP_PROCESSOR, MP_BUS, MP_IOAPIC, MP_INTSRC and MP_LINTSRC.
|-- trap_init();
|   `-- init_apic_mappings();   // setup PTE for APIC
|       |-- /* If no local APIC can be found then set up a fake all
|       |    * zeroes page to simulate the local APIC and another
|       |    * one for the IO-APIC. */
|       |   if (!smp_found_config && detect_init_APIC()) {
|       |       apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
|       |       apic_phys = __pa(apic_phys);
|       |   } else
|       |       apic_phys = mp_lapic_addr;
|       |-- /* map local APIC address,
|       |    *   mp_lapic_addr (0xfee00000) in most case,
|       |    *   to linear address FIXADDR_TOP (0xffffe000) */
|       |   set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
|       |-- /* Fetch the APIC ID of the BSP in case we have a
|       |    * default configuration (or the MP table is broken). */
|       |   if (boot_cpu_physical_apicid == -1U)
|       |       boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
|       `-- // map IOAPIC address to uncacheable linear address
|           set_fixmap_nocache(idx, ioapic_phys);
|       // Now we can use linear address to access APIC space.
|-- init_IRQ();
|   |-- init_ISA_irqs();
|   |   |-- /* An initial setup of the virtual wire mode. */
|   |   |   init_bsp_APIC();
|   |   `-- init_8259A(auto_eoi=0);
|   `-- setup SMP/APIC interrupt handlers, esp. IPI.
`-- mem_init();
    `-- /* delay zapping low mapping entries for SMP: zap_low_mappings() */

IPI (InterProcessor Interrupt), CPU-to-CPU interrupt through local APIC, is the mechanism used by BSP to trigger APs.

Be aware that "one local APIC per CPU is required" in an MP-compliant system. Processors do not share APIC local units address space (physical address 0xFEE00000 - 0xFEEFFFFF), but will share APIC I/O units (0xFEC00000 - 0xFECFFFFF). Both address spaces are uncacheable.

8.2. smp_init()

BSP calls start_kernel() -> smp_init() -> smp_boot_cpus() to setup data structures for each CPU and activate the rest APs.

///////////////////////////////////////////////////////////////////////////////
static void __init smp_init(void)
{
        /* Get other processors into their bootup holding patterns. */
        smp_boot_cpus();
        wait_init_idle = cpu_online_map;
        clear_bit(current->processor, &wait_init_idle); /* Don't wait on me! */

        smp_threads_ready=1;
        smp_commence() {
                /* Lets the callins below out of their loop. */
                Dprintk("Setting commenced=1, go go go\n");
                wmb();
                atomic_set(&smp_commenced,1);
        }

        /* Wait for the other cpus to set up their idle processes */
        printk("Waiting on wait_init_idle (map = 0x%lx)\n", wait_init_idle);
        while (wait_init_idle) {
                cpu_relax();    // i.e. "rep;nop"
                barrier();
        }
        printk("All processors have done init_idle\n");
}

///////////////////////////////////////////////////////////////////////////////
void __init smp_boot_cpus(void)
{
        // ... something not very interesting :-)

        /* Initialize the logical to physical CPU number mapping
         * and the per-CPU profiling router/multiplier */
        prof_counter[0..NR_CPUS-1] = 0;
        prof_old_multiplier[0..NR_CPUS-1] = 0;
        prof_multiplier[0..NR_CPUS-1] = 0;

        init_cpu_to_apicid() {
                physical_apicid_2_cpu[0..MAX_APICID-1] = -1;
                logical_apicid_2_cpu[0..MAX_APICID-1] = -1;
                cpu_2_physical_apicid[0..NR_CPUS-1] = 0;
                cpu_2_logical_apicid[0..NR_CPUS-1] = 0;
        }

        /* Setup boot CPU information */
        smp_store_cpu_info(0); /* Final full version of the data */
        printk("CPU%d: ", 0);
        print_cpu_info(&cpu_data[0]);

        /* We have the boot CPU online for sure. */
        set_bit(0, &cpu_online_map);
        boot_cpu_logical_apicid = logical_smp_processor_id() {
                GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
        }
        map_cpu_to_boot_apicid(0, boot_cpu_apicid) {
               physical_apicid_2_cpu[boot_cpu_apicid] = 0;
               cpu_2_physical_apicid[0] = boot_cpu_apicid;
        }

        global_irq_holder = 0;
        current->processor = 0;
        init_idle();    // will clear corresponding bit in wait_init_idle
        smp_tune_scheduling();

        // ... some conditions checked

        connect_bsp_APIC();     // enable APIC mode if used to be PIC mode
        setup_local_APIC();

        if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_physical_apicid)
                BUG();

        /* Scan the CPU present map and fire up the other CPUs
         *   via do_boot_cpu() */
        Dprintk("CPU present map: %lx\n", phys_cpu_present_map);
        for (bit = 0; bit < NR_CPUS; bit++) {
                apicid = cpu_present_to_apicid(bit);
                /* Don't even attempt to start the boot CPU! */
                if (apicid == boot_cpu_apicid)
                        continue;
                if (!(phys_cpu_present_map & (1 << bit)))
                        continue;
                if ((max_cpus >= 0) && (max_cpus <= cpucount+1))
                        continue;
                do_boot_cpu(apicid);
                /* Make sure we unmap all failed CPUs */
                if ((boot_apicid_to_cpu(apicid) == -1) &&
                                (phys_cpu_present_map & (1 << bit)))
                        printk("CPU #%d not responding - cannot use it.\n",
                                                                apicid);
        }

        // ... SMP BogoMIPS
        // ... B stepping processor warning
        // ... HyperThreading handling

        /* Set up all local APIC timers in the system */
        setup_APIC_clocks();

        /* Synchronize the TSC with the AP */
        if (cpu_has_tsc && cpucount)
                synchronize_tsc_bp();

smp_done:
        zap_low_mappings();
}

///////////////////////////////////////////////////////////////////////////////
static void __init do_boot_cpu (int apicid)
{
        cpu = ++cpucount;

        // 1. prepare "idle process" task struct for next AP

        /* We can't use kernel_thread since we must avoid to
         * reschedule the child. */
        if (fork_by_hand() < 0)
                panic("failed fork for CPU %d", cpu);
        /* We remove it from the pidhash and the runqueue
         * once we got the process: */
        idle = init_task.prev_task;
        if (!idle)
                panic("No idle process for CPU %d", cpu);

        /* we schedule the first task manually */
        idle->processor = cpu;
        idle->cpus_runnable = 1 << cpu; // only on this AP!

        map_cpu_to_boot_apicid(cpu, apicid) {
                physical_apicid_2_cpu[apicid] = cpu;
                cpu_2_physical_apicid[cpu] = apicid;
        }

        idle->thread.eip = (unsigned long) start_secondary;

        del_from_runqueue(idle);
        unhash_process(idle);
        init_tasks[cpu] = idle;

        // 2. prepare stack and code (CS:IP) for next AP

        /* start_eip had better be page-aligned! */
        start_eip = setup_trampoline() {
                memcpy(trampoline_base, trampoline_data,
                        trampoline_end - trampoline_data);
                /* trampoline_base was reserved in
                 * start_kernel() -> setup_arch() -> smp_alloc_memory(),
                 * and will be shared by all APs (one by one) */
                return virt_to_phys(trampoline_base);
        }

        /* So we see what's up */
        printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
        stack_start.esp = (void *) (1024 + PAGE_SIZE + (char *)idle);
        /* this value is used by next AP when it executes
         *   "lss stack_start,%esp" in
         *   linux/arch/i386/kernel/head.S:startup_32(). */

        /* This grunge runs the startup process for
         * the targeted processor. */
        atomic_set(&init_deasserted, 0);
        Dprintk("Setting warm reset code and vector.\n");

        CMOS_WRITE(0xa, 0xf);
        local_flush_tlb();
        Dprintk("1.\n");
        *((volatile unsigned short *) TRAMPOLINE_HIGH) = start_eip >> 4;
        Dprintk("2.\n");
        *((volatile unsigned short *) TRAMPOLINE_LOW) = start_eip & 0xf;
        Dprintk("3.\n");
        // we have setup 0:467 to start_eip (trampoline_base)

        // 3. kick AP to run (AP gets CS:IP from 0:467)

        // Starting actual IPI sequence...
        boot_error = wakeup_secondary_via_INIT(apicid, start_eip);
        if (!boot_error) {      // looks OK
                /* allow APs to start initializing. */
                set_bit(cpu, &cpu_callout_map);

                /* ... Wait 5s total for a response */

                // bit cpu in cpu_callin_map is set by AP in smp_callin()
                if (test_bit(cpu, &cpu_callin_map)) {
                        print_cpu_info(&cpu_data[cpu]);
                } else {
                        boot_error= 1;
                        // marker 0xA5 set by AP in trampoline_data()
                        if (*((volatile unsigned char *)phys_to_virt(8192))
                                        == 0xA5)
                                /* trampoline started but... */
                                printk("Stuck ??\n");
                        else
                                /* trampoline code not run */
                                printk("Not responding.\n");
                }
        }
        if (boot_error) {
                /* Try to put things back the way they were before ... */
                unmap_cpu_to_boot_apicid(cpu, apicid);
                clear_bit(cpu, &cpu_callout_map); /* set in do_boot_cpu() */
                clear_bit(cpu, &cpu_initialized); /* set in cpu_init() */
                clear_bit(cpu, &cpu_online_map);  /* set in smp_callin() */
                cpucount--;
        }

        /* mark "stuck" area as not stuck */
        *((volatile unsigned long *)phys_to_virt(8192)) = 0;
}

Don't confuse start_secondary() with trampoline_data(). The former is AP "idle" process task struct EIP value, and the latter is the real-mode code that AP runs after BSP kicks it (using wakeup_secondary_via_INIT()).

8.3. linux/arch/i386/kernel/trampoline.S

This file contains the 16-bit real-mode AP startup code. BSP reserved memory space trampoline_base in start_kernel() -> setup_arch() -> smp_alloc_memory(). Before BSP triggers AP, it copies the trampoline code, between trampoline_data and trampoline_end, to trampoline_base (in do_boot_cpu() -> setup_trampoline()). BSP sets up 0:467 to point to trampoline_base, so that AP will run from here.

///////////////////////////////////////////////////////////////////////////////
trampoline_data()
{
r_base:
        wbinvd;         // Needed for NUMA-Q should be harmless for other
        DS = CS;
        BX = 1;         // Flag an SMP trampoline
        cli;

        // write marker for master knows we're running
        trampoline_base = 0xA5A5A5A5;

        lidt idt_48;
        lgdt gdt_48;

        AX = 1;
        lmsw AX;        // protected mode!
        goto flush_instr;
flush_instr:
        goto CS:100000; // see linux/arch/i386/kernel/head.S:startup_32()
}

idt_48:
        .word   0                       # idt limit = 0
        .word   0, 0                    # idt base = 0L

gdt_48:
        .word   0x0800                  # gdt limit = 2048, 256 GDT entries
        .long   gdt_table-__PAGE_OFFSET # gdt base = gdt (first SMP CPU)

.globl SYMBOL_NAME(trampoline_end)
SYMBOL_NAME_LABEL(trampoline_end)

Note that BX=1 when AP jumps to linux/arch/i386/kernel/head.S:startup_32(), which is different from that of BSP (BX=0). See Section 6, “linux/arch/i386/kernel/head.S”.

8.4. initialize_secondary()

Unlike BSP, at the end of linux/arch/i386/kernel/head.S:startup_32() in Section 6.4, “Go Start Kernel”, AP will call initialize_secondary() instead of start_kernel().

/* Everything has been set up for the secondary
 * CPUs - they just need to reload everything
 * from the task structure
 * This function must not return. */
void __init initialize_secondary(void)
{
        /* We don't actually need to load the full TSS,
         * basically just the stack pointer and the eip. */
        asm volatile(
                "movl %0,%%esp\n\t"
                "jmp *%1"
                :
                :"r" (current->thread.esp),"r" (current->thread.eip));
}

As BSP called do_boot_cpu() to set thread.eip to start_secondary(), control of AP is passed to this function. AP uses a new stack frame, which was set up by BSP in do_boot_cpu() -> fork_by_hand() -> do_fork().

8.5. start_secondary()

All APs wait for signal smp_commenced from BSP, triggered in Section 8.2, “smp_init()” smp_init() -> smp_commence(). After getting this signal, they will run "idle" processes.

///////////////////////////////////////////////////////////////////////////////
int __init start_secondary(void *unused)
{
        /* Dont put anything before smp_callin(), SMP
         * booting is too fragile that we want to limit the
         * things done here to the most necessary things. */
        cpu_init();
        smp_callin();
        while (!atomic_read(&smp_commenced))
                rep_nop();
        /* low-memory mappings have been cleared, flush them from
         * the local TLBs too. */
        local_flush_tlb();
        return cpu_idle();      // never return, see Section 7.3, “cpu_idle()”
}

cpu_idle() -> init_idle() will clear corresponding bit in wait_init_idle, and finally make BSP finish smp_init() and continue with the following function in start_kernel() (i.e. rest_init()).

8.6. Reference