A bit about the context switch

Typical actions of the context switch

This is a very hardware-dependent action!

Usually context switches begin with a call to the kernel routine context_switch.
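
For orientation, here is a condensed sketch of that routine, based on kernel/sched/core.c from kernels of roughly this vintage (locking, accounting, and error paths trimmed); the exact body varies from version to version.

static inline void
context_switch(struct rq *rq, struct task_struct *prev,
	       struct task_struct *next)
{
	struct mm_struct *mm = next->mm;
	struct mm_struct *oldmm = prev->active_mm;

	prepare_task_switch(rq, prev, next);

	/* Switch the address space; kernel threads borrow the old mm. */
	if (!mm) {
		next->active_mm = oldmm;
		atomic_inc(&oldmm->mm_count);
		enter_lazy_tlb(oldmm, next);
	} else
		switch_mm(oldmm, mm, next);

	/*
	 * Switch the CPU state.  Control comes back here only when
	 * prev is eventually scheduled onto a CPU again.
	 */
	switch_to(prev, next, prev);

	barrier();
	finish_task_switch(this_rq(), prev);
}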

context_switch calls a machine-dependent macro, switch_to, which takes three arguments: switch_to(prev, next, last). prev is the process being switched out and next is the process being switched in. When the macro runs, the CPU switches to next, resuming it at the point where it was left waiting in its own earlier switch_to. The last argument is filled in later, when the waiting prev process is itself switched back in: it identifies the process that was running on the CPU just before control returned to prev.
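
The role of the last argument is easier to see outside the kernel. The program below is a minimal user-space analogy built with POSIX ucontext(3); it is not kernel code, and all of its names (switch_from_to, task_a, and so on) are invented for illustration. Before every switch, the task that is giving up the CPU records itself in a variable that plays the part of last, so whichever task is resumed can see who ran immediately before it.

#include <stdio.h>
#include <ucontext.h>

static ucontext_t main_ctx, task_a, task_b;
static const char *last = "main";	/* plays the role of switch_to()'s "last" */

static void switch_from_to(const char *me, ucontext_t *prev, ucontext_t *next)
{
	last = me;			/* remember who ran just before the switch */
	swapcontext(prev, next);	/* save prev, resume next where it was parked */
	printf("%s resumed; ran before me: %s\n", me, last);
}

static void task_a_func(void)
{
	printf("A started; ran before me: %s\n", last);
	switch_from_to("A", &task_a, &task_b);		/* hand the CPU to B */
	switch_from_to("A", &task_a, &main_ctx);	/* give it back to main */
}

static void task_b_func(void)
{
	printf("B started; ran before me: %s\n", last);
	switch_from_to("B", &task_b, &task_a);		/* resume A at its own swapcontext */
}

int main(void)
{
	static char stack_a[64 * 1024], stack_b[64 * 1024];

	getcontext(&task_a);
	task_a.uc_stack.ss_sp = stack_a;
	task_a.uc_stack.ss_size = sizeof(stack_a);
	task_a.uc_link = &main_ctx;
	makecontext(&task_a, task_a_func, 0);

	getcontext(&task_b);
	task_b.uc_stack.ss_sp = stack_b;
	task_b.uc_stack.ss_size = sizeof(stack_b);
	task_b.uc_link = &main_ctx;
	makecontext(&task_b, task_b_func, 0);

	switch_from_to("main", &main_ctx, &task_a);
	return 0;
}

Running it shows A starting from main, B starting from A, A resuming from B, and finally main resuming from A; that last piece of information is exactly what the last parameter carries back in the kernel.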

Implementation of the switch

switch_to is implemented by a macro in architecture-specific files.

Switching on the Intel x86

On the 32-bit Intel x86, the switch is done by a rather long macro containing assembler instructions embedded in C (inline assembly), in arch/x86/include/asm/switch_to.h.

#define switch_to(prev, next, last)					\
do {									\
	/*								\
	 * Context-switching clobbers all registers, so we clobber	\
	 * them explicitly, via unused output variables.		\
	 * (EAX and EBP is not listed because EBP is saved/restored	\
	 * explicitly for wchan access and EAX is the return value of	\
	 * __switch_to())						\
	 */								\
	unsigned long ebx, ecx, edx, esi, edi;				\
									\
	asm volatile("pushfl\n\t"		/* save    flags */	\
		     "pushl %%ebp\n\t"		/* save    EBP   */	\
		     "movl %%esp,%[prev_sp]\n\t"	/* save    ESP   */ \
		     "movl %[next_sp],%%esp\n\t"	/* restore ESP   */ \
		     "movl $1f,%[prev_ip]\n\t"	/* save    EIP   */	\
		     "pushl %[next_ip]\n\t"	/* restore EIP   */	\
		     __switch_canary					\
		     "jmp __switch_to\n"	/* regparm call  */	\
		     "1:\t"						\
		     "popl %%ebp\n\t"		/* restore EBP   */	\
		     "popfl\n"			/* restore flags */	\
									\
		     /* output parameters */				\
		     : [prev_sp] "=m" (prev->thread.sp),		\
		       [prev_ip] "=m" (prev->thread.ip),		\
		       "=a" (last),					\
									\
		       /* clobbered output registers: */		\
		       "=b" (ebx), "=c" (ecx), "=d" (edx),		\
		       "=S" (esi), "=D" (edi)				\
		       							\
		       __switch_canary_oparam				\
									\
		       /* input parameters: */				\
		     : [next_sp]  "m" (next->thread.sp),		\
		       [next_ip]  "m" (next->thread.ip),		\
		       							\
		       /* regparm parameters for __switch_to(): */	\
		       [prev]     "a" (prev),				\
		       [next]     "d" (next)				\
									\
		       __switch_canary_iparam				\
									\
		     : /* reloaded segment registers */			\
			"memory");					\
} while (0)

The macro ends with a jmp, rather than a call, to __switch_to: the pushl of next_ip has already placed the next task's saved instruction pointer on the new stack, so when __switch_to returns, execution resumes where next left off. __switch_to is defined in two versions, in arch/x86/kernel/process_32.c and arch/x86/kernel/process_64.c. Here is the version from process_32.c.

__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
				 *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	fpu_switch_t fpu;

	/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */

	fpu = switch_fpu_prepare(prev_p, next_p, cpu);

	/*
	 * Reload esp0.
	 */
	load_sp0(tss, next);

	/*
	 * Save away %gs. No need to save %fs, as it was saved on the
	 * stack on entry.  No need to save %es and %ds, as those are
	 * always kernel segments while inside the kernel.  Doing this
	 * before setting the new TLS descriptors avoids the situation
	 * where we temporarily have non-reloadable segments in %fs
	 * and %gs.  This could be an issue if the NMI handler ever
	 * used %fs or %gs (it does not today), or if the kernel is
	 * running inside of a hypervisor layer.
	 */
	lazy_save_gs(prev->gs);

	/*
	 * Load the per-thread Thread-Local Storage descriptor.
	 */
	load_TLS(next, cpu);

	/*
	 * Restore IOPL if needed.  In normal use, the flags restore
	 * in the switch assembly will handle this.  But if the kernel
	 * is running virtualized at a non-zero CPL, the popf will
	 * not restore flags, so it must be done in a separate step.
	 */
	if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
		set_iopl_mask(next->iopl);

	/*
	 * Now maybe handle debug registers and/or IO bitmaps
	 */
	if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
		     task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
		__switch_to_xtra(prev_p, next_p, tss);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Restore %gs if needed (which is common)
	 */
	if (prev->gs | next->gs)
		lazy_load_gs(next->gs);

	switch_fpu_finish(next_p, fpu);

	this_cpu_write(current_task, next_p);

	return prev_p;
}

Switching on the Alpha

On the Alpha, the file arch/alpha/include/asm/switch_to.h contains a simple macro that calls the assembler routine alpha_switch_to.

#define switch_to(P,N,L)						 \
  do {									 \
    (L) = alpha_switch_to(virt_to_phys(&task_thread_info(N)->pcb), (P)); \
    check_mmu_context();						 \
  } while (0)

The Alpha architecture uses a firmware layer called PALcode as an interface between the hardware and the operating system, which allows the switch itself to be written quite simply in assembly language. The code can be found in arch/alpha/kernel/entry.S.

	.align	4
	.globl	alpha_switch_to
	.ent	alpha_switch_to
alpha_switch_to:
	.prologue 0
	bsr	$1, do_switch_stack
	call_pal PAL_swpctx
	lda	$8, 0x3fff
	bsr	$1, undo_switch_stack
	bic	$sp, $8, $8
	mov	$17, $0
	ret
.end alpha_switch_to

Switching on the IA-64

The IA-64 (Itanium) is Intel's 64-bit architecture; it is a distinct instruction set, not the AMD64/x86-64 extension of the x86.

The switch_to macro is found in arch/ia64/include/asm/switch_to.h.

# define switch_to(prev,next,last) do {						\
	if (ia64_psr(task_pt_regs(prev))->mfh && ia64_is_local_fpu_owner(prev)) {				\
		ia64_psr(task_pt_regs(prev))->mfh = 0;			\
		(prev)->thread.flags |= IA64_THREAD_FPH_VALID;			\
		__ia64_save_fpu((prev)->thread.fph);				\
	}									\
	__switch_to(prev, next, last);						\
	/* "next" in old context is "current" in new context */			\
	if (unlikely((current->thread.flags & IA64_THREAD_MIGRATION) &&	       \
		     (task_cpu(current) !=				       \
		      		      task_thread_info(current)->last_cpu))) { \
		platform_migrate(current);				       \
		task_thread_info(current)->last_cpu = task_cpu(current);       \
	}								       \
} while (0)

The __switch_to macro is also found in this same file.

#define __switch_to(prev,next,last) do {							 \
	if (IA64_HAS_EXTRA_STATE(prev))								 \
		ia64_save_extra(prev);								 \
	if (IA64_HAS_EXTRA_STATE(next))								 \
		ia64_load_extra(next);								 \
	ia64_psr(task_pt_regs(next))->dfh = !ia64_is_local_fpu_owner(next);			 \
	(last) = ia64_switch_to((next));							 \
} while (0)

In arch/ia64/include/asm/paravirt_privop.h, we find two macro definitions that transform ia64_switch_to into paravirt_switch_to.

#define ia64_switch_to			IA64_PARAVIRT_ASM_FUNC(switch_to)
#define IA64_PARAVIRT_ASM_FUNC(name)	paravirt_ ## name
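
The effect of this token pasting is easy to verify in isolation. The stand-alone program below repeats the two defines next to a stub paravirt_switch_to (the stub and the program are purely illustrative); the preprocessor rewrites the apparent call to ia64_switch_to into a call to paravirt_switch_to.

#include <stdio.h>

#define IA64_PARAVIRT_ASM_FUNC(name)	paravirt_ ## name
#define ia64_switch_to			IA64_PARAVIRT_ASM_FUNC(switch_to)

/* Stub standing in for the real assembly routine, for illustration only. */
static void paravirt_switch_to(void)
{
	printf("paravirt_switch_to() reached\n");
}

int main(void)
{
	ia64_switch_to();	/* the preprocessor turns this into paravirt_switch_to() */
	return 0;
}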

The assembly language implementation of paravirt_switch_to can be found in arch/ia64/kernel/entry.S.