diff -X dontdiff -ruN linux-2.4.17/Documentation/Configure.help linux-2.4.17-lse02-D/Documentation/Configure.help --- linux-2.4.17/Documentation/Configure.help Fri Dec 21 09:41:53 2001 +++ linux-2.4.17-lse02-D/Documentation/Configure.help Thu Apr 4 17:37:16 2002 @@ -376,6 +376,12 @@ Select this if you have a 32-bit processor and more than 4 gigabytes of physical RAM. +HIGHMEM I/O support +CONFIG_HIGHIO + If you want to be able to do I/O to high memory pages, say Y. + Otherwise low memory pages are used as bounce buffers causing a + degradation in performance. + Normal floppy disk support CONFIG_BLK_DEV_FD If you want to use the floppy disk drive(s) of your PC under Linux, diff -X dontdiff -ruN linux-2.4.17/arch/i386/config.in linux-2.4.17-lse02-D/arch/i386/config.in --- linux-2.4.17/arch/i386/config.in Fri Dec 21 09:41:53 2001 +++ linux-2.4.17-lse02-D/arch/i386/config.in Thu Apr 4 17:37:16 2002 @@ -173,6 +173,12 @@ define_bool CONFIG_X86_PAE y fi +if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then + if [ "$CONFIG_NOHIGHMEM" != "y" ]; then + bool 'HIGHMEM I/O support (EXPERIMENTAL)' CONFIG_HIGHIO + fi +fi + bool 'Math emulation' CONFIG_MATH_EMULATION bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR bool 'Symmetric multi-processing support' CONFIG_SMP diff -X dontdiff -ruN linux-2.4.17/arch/i386/kernel/apic.c linux-2.4.17-lse02-D/arch/i386/kernel/apic.c --- linux-2.4.17/arch/i386/kernel/apic.c Fri Nov 9 14:12:55 2001 +++ linux-2.4.17-lse02-D/arch/i386/kernel/apic.c Thu Apr 4 17:37:16 2002 @@ -785,8 +785,7 @@ */ slice = clocks / (smp_num_cpus+1); - printk("cpu: %d, clocks: %d, slice: %d\n", - smp_processor_id(), clocks, slice); + printk("cpu: %d, clocks: %d, slice: %d\n", smp_processor_id(), clocks, slice); /* * Wait for IRQ0's slice: @@ -809,8 +808,7 @@ __setup_APIC_LVTT(clocks); - printk("CPU%d\n", - smp_processor_id(), t0, t1, delta, slice, clocks); + printk("CPU%d\n", smp_processor_id(), t0, t1, delta, slice, clocks); __restore_flags(flags); } @@ -913,6 +911,26 @@ 
smp_call_function(setup_APIC_timer, (void *)calibration_result, 1, 1); } +void __init disable_APIC_timer(void) +{ + if (using_apic_timer) { + unsigned long v; + + v = apic_read(APIC_LVTT); + apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); + } +} + +void enable_APIC_timer(void) +{ + if (using_apic_timer) { + unsigned long v; + + v = apic_read(APIC_LVTT); + apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED); + } +} + /* * the frequency of the profiling timer can be changed * by writing a multiplier value into /proc/profile. @@ -1031,7 +1049,9 @@ irq_enter(cpu, 0); smp_local_timer_interrupt(regs); irq_exit(cpu, 0); - +#if CONFIG_SMP + run_local_timers(); +#endif if (softirq_pending(cpu)) do_softirq(); } diff -X dontdiff -ruN linux-2.4.17/arch/i386/kernel/entry.S linux-2.4.17-lse02-D/arch/i386/kernel/entry.S --- linux-2.4.17/arch/i386/kernel/entry.S Fri Nov 2 17:18:49 2001 +++ linux-2.4.17-lse02-D/arch/i386/kernel/entry.S Thu Apr 4 17:37:16 2002 @@ -77,7 +77,7 @@ exec_domain = 16 need_resched = 20 tsk_ptrace = 24 -processor = 52 +cpu = 32 ENOSYS = 38 @@ -176,9 +176,11 @@ ENTRY(ret_from_fork) +#if CONFIG_SMP pushl %ebx call SYMBOL_NAME(schedule_tail) addl $4, %esp +#endif GET_CURRENT(%ebx) testb $0x02,tsk_ptrace(%ebx) # PT_TRACESYS jne tracesys_exit diff -X dontdiff -ruN linux-2.4.17/arch/i386/kernel/i8259.c linux-2.4.17-lse02-D/arch/i386/kernel/i8259.c --- linux-2.4.17/arch/i386/kernel/i8259.c Mon Sep 17 23:03:09 2001 +++ linux-2.4.17-lse02-D/arch/i386/kernel/i8259.c Thu Apr 4 17:37:16 2002 @@ -79,6 +79,7 @@ * through the ICC by us (IPIs) */ #ifdef CONFIG_SMP +BUILD_SMP_INTERRUPT(task_migration_interrupt,TASK_MIGRATION_VECTOR) BUILD_SMP_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) BUILD_SMP_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR) BUILD_SMP_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) @@ -472,6 +473,9 @@ * IPI, driven by wakeup. 
*/ set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); + + /* IPI for task migration */ + set_intr_gate(TASK_MIGRATION_VECTOR, task_migration_interrupt); /* IPI for invalidation */ set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); diff -X dontdiff -ruN linux-2.4.17/arch/i386/kernel/nmi.c linux-2.4.17-lse02-D/arch/i386/kernel/nmi.c --- linux-2.4.17/arch/i386/kernel/nmi.c Thu Sep 20 20:55:24 2001 +++ linux-2.4.17-lse02-D/arch/i386/kernel/nmi.c Thu Apr 4 17:37:16 2002 @@ -283,7 +283,7 @@ * to get a message out. */ bust_spinlocks(1); - printk("NMI Watchdog detected LOCKUP on CPU%d, registers:\n", cpu); + printk("NMI Watchdog detected LOCKUP on CPU%d, eip %08lx, registers:\n", cpu, regs->eip); show_registers(regs); printk("console shuts up ...\n"); console_silent(); diff -X dontdiff -ruN linux-2.4.17/arch/i386/kernel/process.c linux-2.4.17-lse02-D/arch/i386/kernel/process.c --- linux-2.4.17/arch/i386/kernel/process.c Thu Oct 4 18:42:54 2001 +++ linux-2.4.17-lse02-D/arch/i386/kernel/process.c Thu Apr 4 17:37:16 2002 @@ -123,15 +123,12 @@ void cpu_idle (void) { /* endless idle loop with no priority at all */ - init_idle(); - current->nice = 20; - current->counter = -100; while (1) { void (*idle)(void) = pm_idle; if (!idle) idle = default_idle; - while (!current->need_resched) + if (!current->need_resched) idle(); schedule(); check_pgt_cache(); @@ -694,15 +691,17 @@ asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs)); /* - * Restore %fs and %gs. + * Restore %fs and %gs if needed. 
*/ - loadsegment(fs, next->fs); - loadsegment(gs, next->gs); + if (unlikely(prev->fs | prev->gs | next->fs | next->gs)) { + loadsegment(fs, next->fs); + loadsegment(gs, next->gs); + } /* * Now maybe reload the debug registers */ - if (next->debugreg[7]){ + if (unlikely(next->debugreg[7])) { loaddebug(next, 0); loaddebug(next, 1); loaddebug(next, 2); @@ -712,7 +711,7 @@ loaddebug(next, 7); } - if (prev->ioperm || next->ioperm) { + if (unlikely(prev->ioperm || next->ioperm)) { if (next->ioperm) { /* * 4 cachelines copy ... not good, but not that diff -X dontdiff -ruN linux-2.4.17/arch/i386/kernel/setup.c linux-2.4.17-lse02-D/arch/i386/kernel/setup.c --- linux-2.4.17/arch/i386/kernel/setup.c Fri Dec 21 09:41:53 2001 +++ linux-2.4.17-lse02-D/arch/i386/kernel/setup.c Thu Apr 4 17:37:16 2002 @@ -161,6 +161,8 @@ static int disable_x86_serial_nr __initdata = 1; static int disable_x86_fxsr __initdata = 0; +extern int blk_nohighio; + int enable_acpi_smp_table; /* @@ -782,13 +784,17 @@ void __init setup_arch(char **cmdline_p) { unsigned long bootmap_size, low_mem_size; - unsigned long start_pfn, max_pfn, max_low_pfn; + unsigned long start_pfn, max_low_pfn; int i; #ifdef CONFIG_VISWS visws_get_board_type_and_rev(); #endif +#ifndef CONFIG_HIGHIO + blk_nohighio = 1; +#endif + ROOT_DEV = to_kdev_t(ORIG_ROOT_DEV); drive_info = DRIVE_INFO; screen_info = SCREEN_INFO; @@ -1067,6 +1073,14 @@ __setup("notsc", tsc_setup); #endif +static int __init highio_setup(char *str) +{ + printk("i386: disabling HIGHMEM block I/O\n"); + blk_nohighio = 1; + return 1; +} +__setup("nohighio", highio_setup); + static int __init get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; @@ -2922,9 +2936,10 @@ load_TR(nr); load_LDT(&init_mm); - /* - * Clear all 6 debug registers: - */ + /* Clear %fs and %gs. 
*/ + asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs"); + + /* Clear all 6 debug registers: */ #define CD(register) __asm__("movl %0,%%db" #register ::"r"(0) ); diff -X dontdiff -ruN linux-2.4.17/arch/i386/kernel/smp.c linux-2.4.17-lse02-D/arch/i386/kernel/smp.c --- linux-2.4.17/arch/i386/kernel/smp.c Fri Dec 21 09:41:53 2001 +++ linux-2.4.17-lse02-D/arch/i386/kernel/smp.c Thu Apr 4 17:37:16 2002 @@ -105,7 +105,7 @@ /* The 'big kernel lock' */ spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; -struct tlb_state cpu_tlbstate[NR_CPUS] = {[0 ... NR_CPUS-1] = { &init_mm, 0 }}; +struct tlb_state cpu_tlbstate[NR_CPUS] __cacheline_aligned = {[0 ... NR_CPUS-1] = { &init_mm, 0, }}; /* * the following functions deal with sending IPIs between CPUs. @@ -485,15 +485,54 @@ do_flush_tlb_all_local(); } +static spinlock_t migration_lock = SPIN_LOCK_UNLOCKED; +static task_t *new_task; + +/* + * This function sends a 'task migration' IPI to another CPU. + * Must be called from syscall contexts, with interrupts *enabled*. + */ +void smp_migrate_task(int cpu, task_t *p) +{ + /* + * The target CPU will unlock the migration spinlock: + */ + spin_lock(&migration_lock); + new_task = p; + send_IPI_mask(1 << cpu, TASK_MIGRATION_VECTOR); +} + +/* + * Task migration callback. + */ +asmlinkage void smp_task_migration_interrupt(void) +{ + task_t *p; + + ack_APIC_irq(); + p = new_task; + spin_unlock(&migration_lock); + sched_task_migrated(p); +} /* * this function sends a 'reschedule' IPI to another CPU. * it goes straight through and wastes no time serializing * anything. Worst case is that we lose a reschedule ... */ - void smp_send_reschedule(int cpu) { send_IPI_mask(1 << cpu, RESCHEDULE_VECTOR); +} + +/* + * this function sends a reschedule IPI to all (other) CPUs. + * This should only be used if some 'global' task became runnable, + * such as a RT task, that must be handled now. The first CPU + * that manages to grab the task will run it. 
+ */ +void smp_send_reschedule_all(void) +{ + send_IPI_allbutself(RESCHEDULE_VECTOR); } /* diff -X dontdiff -ruN linux-2.4.17/arch/i386/kernel/smpboot.c linux-2.4.17-lse02-D/arch/i386/kernel/smpboot.c --- linux-2.4.17/arch/i386/kernel/smpboot.c Fri Dec 21 09:41:53 2001 +++ linux-2.4.17-lse02-D/arch/i386/kernel/smpboot.c Thu Apr 4 17:37:16 2002 @@ -308,14 +308,14 @@ if (tsc_values[i] < avg) realdelta = -realdelta; - printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", - i, realdelta); + printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", i, realdelta); } sum += delta; } if (!buggy) printk("passed.\n"); + ; } static void __init synchronize_tsc_ap (void) @@ -365,7 +365,7 @@ * (This works even if the APIC is not enabled.) */ phys_id = GET_APIC_ID(apic_read(APIC_ID)); - cpuid = current->processor; + cpuid = cpu(); if (test_and_set_bit(cpuid, &cpu_online_map)) { printk("huh, phys CPU#%d, CPU#%d already present??\n", phys_id, cpuid); @@ -435,6 +435,7 @@ */ smp_store_cpu_info(cpuid); + disable_APIC_timer(); /* * Allow the master to continue. */ @@ -465,6 +466,7 @@ smp_callin(); while (!atomic_read(&smp_commenced)) rep_nop(); + enable_APIC_timer(); /* * low-memory mappings have been cleared, flush them from * the local TLBs too. @@ -803,16 +805,13 @@ if (!idle) panic("No idle process for CPU %d", cpu); - idle->processor = cpu; - idle->cpus_runnable = 1 << cpu; /* we schedule the first task manually */ + init_idle(idle, cpu); map_cpu_to_boot_apicid(cpu, apicid); idle->thread.eip = (unsigned long) start_secondary; - del_from_runqueue(idle); unhash_process(idle); - init_tasks[cpu] = idle; /* start_eip had better be page-aligned! 
*/ start_eip = setup_trampoline(); @@ -925,6 +924,7 @@ } cycles_t cacheflush_time; +unsigned long cache_decay_ticks; static void smp_tune_scheduling (void) { @@ -958,9 +958,13 @@ cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth; } + cache_decay_ticks = (long)cacheflush_time/cpu_khz * HZ / 1000; + printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n", (long)cacheflush_time/(cpu_khz/1000), ((long)cacheflush_time*100/(cpu_khz/1000)) % 100); + printk("task migration cache decay timeout: %ld msecs.\n", + (cache_decay_ticks + 1) * 1000 / HZ); } /* @@ -1020,8 +1024,7 @@ map_cpu_to_boot_apicid(0, boot_cpu_apicid); global_irq_holder = 0; - current->processor = 0; - init_idle(); + current->cpu = 0; smp_tune_scheduling(); /* diff -X dontdiff -ruN linux-2.4.17/arch/i386/mm/fault.c linux-2.4.17-lse02-D/arch/i386/mm/fault.c --- linux-2.4.17/arch/i386/mm/fault.c Tue Oct 9 15:13:03 2001 +++ linux-2.4.17-lse02-D/arch/i386/mm/fault.c Thu Apr 4 17:37:16 2002 @@ -88,23 +88,18 @@ out_of_memory: if (current->pid == 1) { - current->policy |= SCHED_YIELD; - schedule(); + yield(); goto survive; } goto bad_area; } -extern spinlock_t timerlist_lock; - /* * Unlock any spinlocks which will prevent us from getting the - * message out (timerlist_lock is acquired through the - * console unblank code) + * message out */ void bust_spinlocks(int yes) { - spin_lock_init(&timerlist_lock); if (yes) { oops_in_progress = 1; #ifdef CONFIG_SMP @@ -344,8 +339,7 @@ out_of_memory: up_read(&mm->mmap_sem); if (tsk->pid == 1) { - tsk->policy |= SCHED_YIELD; - schedule(); + yield(); down_read(&mm->mmap_sem); goto survive; } diff -X dontdiff -ruN linux-2.4.17/arch/sparc/kernel/irq.c linux-2.4.17-lse02-D/arch/sparc/kernel/irq.c --- linux-2.4.17/arch/sparc/kernel/irq.c Thu Jul 19 18:11:13 2001 +++ linux-2.4.17-lse02-D/arch/sparc/kernel/irq.c Thu Apr 4 17:37:16 2002 @@ -72,7 +72,7 @@ prom_halt(); } -void (*init_timers)(void (*)(int, void *,struct pt_regs *)) = +void (*sparc_init_timers)(void (*)(int, 
void *,struct pt_regs *)) = (void (*)(void (*)(int, void *,struct pt_regs *))) irq_panic; /* diff -X dontdiff -ruN linux-2.4.17/arch/sparc/kernel/sun4c_irq.c linux-2.4.17-lse02-D/arch/sparc/kernel/sun4c_irq.c --- linux-2.4.17/arch/sparc/kernel/sun4c_irq.c Thu Apr 26 22:17:26 2001 +++ linux-2.4.17-lse02-D/arch/sparc/kernel/sun4c_irq.c Thu Apr 4 17:37:16 2002 @@ -143,7 +143,7 @@ /* Errm.. not sure how to do this.. */ } -static void __init sun4c_init_timers(void (*counter_fn)(int, void *, struct pt_regs *)) +static void __init sun4c_sparc_init_timers(void (*counter_fn)(int, void *, struct pt_regs *)) { int irq; @@ -221,7 +221,7 @@ BTFIXUPSET_CALL(clear_profile_irq, sun4c_clear_profile_irq, BTFIXUPCALL_NOP); BTFIXUPSET_CALL(load_profile_irq, sun4c_load_profile_irq, BTFIXUPCALL_NOP); BTFIXUPSET_CALL(__irq_itoa, sun4m_irq_itoa, BTFIXUPCALL_NORM); - init_timers = sun4c_init_timers; + sparc_init_timers = sun4c_sparc_init_timers; #ifdef CONFIG_SMP BTFIXUPSET_CALL(set_cpu_int, sun4c_nop, BTFIXUPCALL_NOP); BTFIXUPSET_CALL(clear_cpu_int, sun4c_nop, BTFIXUPCALL_NOP); diff -X dontdiff -ruN linux-2.4.17/arch/sparc/kernel/sun4d_irq.c linux-2.4.17-lse02-D/arch/sparc/kernel/sun4d_irq.c --- linux-2.4.17/arch/sparc/kernel/sun4d_irq.c Thu Jul 19 18:11:13 2001 +++ linux-2.4.17-lse02-D/arch/sparc/kernel/sun4d_irq.c Thu Apr 4 17:37:16 2002 @@ -435,7 +435,7 @@ bw_set_prof_limit(cpu, limit); } -static void __init sun4d_init_timers(void (*counter_fn)(int, void *, struct pt_regs *)) +static void __init sun4d_sparc_init_timers(void (*counter_fn)(int, void *, struct pt_regs *)) { int irq; extern struct prom_cpuinfo linux_cpus[NR_CPUS]; @@ -546,7 +546,7 @@ BTFIXUPSET_CALL(clear_profile_irq, sun4d_clear_profile_irq, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(load_profile_irq, sun4d_load_profile_irq, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(__irq_itoa, sun4d_irq_itoa, BTFIXUPCALL_NORM); - init_timers = sun4d_init_timers; + sparc_init_timers = sun4d_sparc_init_timers; #ifdef CONFIG_SMP 
BTFIXUPSET_CALL(set_cpu_int, sun4d_set_cpu_int, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(clear_cpu_int, sun4d_clear_ipi, BTFIXUPCALL_NOP); diff -X dontdiff -ruN linux-2.4.17/arch/sparc/kernel/sun4m_irq.c linux-2.4.17-lse02-D/arch/sparc/kernel/sun4m_irq.c --- linux-2.4.17/arch/sparc/kernel/sun4m_irq.c Thu Apr 26 22:17:26 2001 +++ linux-2.4.17-lse02-D/arch/sparc/kernel/sun4m_irq.c Thu Apr 4 17:37:16 2002 @@ -223,7 +223,7 @@ return buff; } -static void __init sun4m_init_timers(void (*counter_fn)(int, void *, struct pt_regs *)) +static void __init sun4m_sparc_init_timers(void (*counter_fn)(int, void *, struct pt_regs *)) { int reg_count, irq, cpu; struct linux_prom_registers cnt_regs[PROMREG_MAX]; @@ -374,7 +374,7 @@ BTFIXUPSET_CALL(clear_profile_irq, sun4m_clear_profile_irq, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(load_profile_irq, sun4m_load_profile_irq, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(__irq_itoa, sun4m_irq_itoa, BTFIXUPCALL_NORM); - init_timers = sun4m_init_timers; + sparc_init_timers = sun4m_sparc_init_timers; #ifdef CONFIG_SMP BTFIXUPSET_CALL(set_cpu_int, sun4m_send_ipi, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(clear_cpu_int, sun4m_clear_ipi, BTFIXUPCALL_NORM); diff -X dontdiff -ruN linux-2.4.17/arch/sparc/kernel/time.c linux-2.4.17-lse02-D/arch/sparc/kernel/time.c --- linux-2.4.17/arch/sparc/kernel/time.c Tue Oct 30 15:08:11 2001 +++ linux-2.4.17-lse02-D/arch/sparc/kernel/time.c Thu Apr 4 17:37:16 2002 @@ -380,7 +380,7 @@ else clock_probe(); - init_timers(timer_interrupt); + sparc_init_timers(timer_interrupt); #ifdef CONFIG_SUN4 if(idprom->id_machtype == (SM_SUN4 | SM_4_330)) { diff -X dontdiff -ruN linux-2.4.17/arch/sparc64/kernel/irq.c linux-2.4.17-lse02-D/arch/sparc64/kernel/irq.c --- linux-2.4.17/arch/sparc64/kernel/irq.c Fri Dec 21 09:41:53 2001 +++ linux-2.4.17-lse02-D/arch/sparc64/kernel/irq.c Thu Apr 4 17:37:16 2002 @@ -1026,7 +1026,7 @@ } /* This is gets the master TICK_INT timer going. 
*/ -void init_timers(void (*cfunc)(int, void *, struct pt_regs *), +void sparc_init_timers(void (*cfunc)(int, void *, struct pt_regs *), unsigned long *clock) { unsigned long pstate; diff -X dontdiff -ruN linux-2.4.17/arch/sparc64/kernel/time.c linux-2.4.17-lse02-D/arch/sparc64/kernel/time.c --- linux-2.4.17/arch/sparc64/kernel/time.c Fri Dec 21 09:41:53 2001 +++ linux-2.4.17-lse02-D/arch/sparc64/kernel/time.c Thu Apr 4 17:37:16 2002 @@ -597,7 +597,7 @@ __restore_flags(flags); } -extern void init_timers(void (*func)(int, void *, struct pt_regs *), +extern void sparc_init_timers(void (*func)(int, void *, struct pt_regs *), unsigned long *); void __init time_init(void) @@ -608,7 +608,7 @@ */ unsigned long clock; - init_timers(timer_interrupt, &clock); + sparc_init_timers(timer_interrupt, &clock); timer_ticks_per_usec_quotient = ((1UL<<32) / (clock / 1000020)); } diff -X dontdiff -ruN linux-2.4.17/drivers/block/Makefile linux-2.4.17-lse02-D/drivers/block/Makefile --- linux-2.4.17/drivers/block/Makefile Sun Sep 9 12:00:55 2001 +++ linux-2.4.17-lse02-D/drivers/block/Makefile Thu Apr 4 17:37:16 2002 @@ -10,7 +10,7 @@ O_TARGET := block.o -export-objs := ll_rw_blk.o blkpg.o loop.o DAC960.o genhd.o +export-objs := ll_rw_blk.o blkpg.o loop.o DAC960.o genhd.o elevator.o obj-y := ll_rw_blk.o blkpg.o genhd.o elevator.o diff -X dontdiff -ruN linux-2.4.17/drivers/block/cciss.c linux-2.4.17-lse02-D/drivers/block/cciss.c --- linux-2.4.17/drivers/block/cciss.c Fri Dec 21 09:41:53 2001 +++ linux-2.4.17-lse02-D/drivers/block/cciss.c Thu Apr 4 17:37:16 2002 @@ -611,7 +611,7 @@ { buff = kmalloc(iocommand.buf_size, GFP_KERNEL); if( buff == NULL) - return -EFAULT; + return -ENOMEM; } if (iocommand.Request.Type.Direction == XFER_WRITE) { @@ -680,7 +680,7 @@ { kfree(buff); cmd_free(h, c, 0); - return( -EFAULT); + return( -EFAULT); } if (iocommand.Request.Type.Direction == XFER_READ) @@ -1125,20 +1125,22 @@ static inline void complete_command( CommandList_struct *cmd, int timeout) { int 
status = 1; - int i; + int i, ddir; u64bit temp64; if (timeout) status = 0; /* unmap the DMA mapping for all the scatter gather elements */ + if (cmd->Request.Type.Direction == XFER_READ) + ddir = PCI_DMA_FROMDEVICE; + else + ddir = PCI_DMA_TODEVICE; for(i=0; iHeader.SGList; i++) { temp64.val32.lower = cmd->SG[i].Addr.lower; temp64.val32.upper = cmd->SG[i].Addr.upper; - pci_unmap_single(hba[cmd->ctlr]->pdev, - temp64.val, cmd->SG[i].Len, - (cmd->Request.Type.Direction == XFER_READ) ? - PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE); + pci_unmap_page(hba[cmd->ctlr]->pdev, + temp64.val, cmd->SG[i].Len, ddir); } if(cmd->err_info->CommandStatus != 0) @@ -1215,7 +1217,6 @@ } } complete_buffers(cmd->rq->bh, status); - #ifdef CCISS_DEBUG printk("Done with %p\n", cmd->rq); #endif /* CCISS_DEBUG */ @@ -1236,7 +1237,7 @@ static int cpq_back_merge_fn(request_queue_t *q, struct request *rq, struct buffer_head *bh, int max_segments) { - if (rq->bhtail->b_data + rq->bhtail->b_size == bh->b_data) + if (BH_CONTIG(rq->bhtail, bh)) return 1; return cpq_new_segment(q, rq, max_segments); } @@ -1244,7 +1245,7 @@ static int cpq_front_merge_fn(request_queue_t *q, struct request *rq, struct buffer_head *bh, int max_segments) { - if (bh->b_data + bh->b_size == rq->bh->b_data) + if (BH_CONTIG(bh, rq->bh)) return 1; return cpq_new_segment(q, rq, max_segments); } @@ -1254,7 +1255,7 @@ { int total_segments = rq->nr_segments + nxt->nr_segments; - if (rq->bhtail->b_data + rq->bhtail->b_size == nxt->bh->b_data) + if (BH_CONTIG(rq->bhtail, nxt->bh)) total_segments--; if (total_segments > MAXSGENTRIES) @@ -1275,18 +1276,18 @@ ctlr_info_t *h= q->queuedata; CommandList_struct *c; int log_unit, start_blk, seg; - char *lastdataend; + unsigned long long lastdataend; struct buffer_head *bh; struct list_head *queue_head = &q->queue_head; struct request *creq; u64bit temp64; - struct my_sg tmp_sg[MAXSGENTRIES]; - int i; + struct scatterlist tmp_sg[MAXSGENTRIES]; + int i, ddir; if (q->plugged) goto startio; 
-queue_next: +next: if (list_empty(queue_head)) goto startio; @@ -1312,8 +1313,8 @@ spin_unlock_irq(&io_request_lock); c->cmd_type = CMD_RWREQ; - bh = creq->bh; c->rq = creq; + bh = creq->bh; /* fill in the request */ log_unit = MINOR(creq->rq_dev) >> NWD_SHIFT; @@ -1335,34 +1336,36 @@ printk(KERN_DEBUG "ciss: sector =%d nr_sectors=%d\n",(int) creq->sector, (int) creq->nr_sectors); #endif /* CCISS_DEBUG */ - seg = 0; - lastdataend = NULL; + seg = 0; + lastdataend = ~0ULL; while(bh) { - if (bh->b_data == lastdataend) + if (bh_phys(bh) == lastdataend) { // tack it on to the last segment - tmp_sg[seg-1].len +=bh->b_size; + tmp_sg[seg-1].length +=bh->b_size; lastdataend += bh->b_size; } else { if (seg == MAXSGENTRIES) BUG(); - tmp_sg[seg].len = bh->b_size; - tmp_sg[seg].start_addr = bh->b_data; - lastdataend = bh->b_data + bh->b_size; + tmp_sg[seg].page = bh->b_page; + tmp_sg[seg].length = bh->b_size; + tmp_sg[seg].offset = bh_offset(bh); + lastdataend = bh_phys(bh) + bh->b_size; seg++; } bh = bh->b_reqnext; } + /* get the DMA records for the setup */ - for (i=0; iSG[i].Len = tmp_sg[i].len; - temp64.val = (__u64) pci_map_single( h->pdev, - tmp_sg[i].start_addr, - tmp_sg[i].len, - (c->Request.Type.Direction == XFER_READ) ? 
- PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE); + if (c->Request.Type.Direction == XFER_READ) + ddir = PCI_DMA_FROMDEVICE; + else + ddir = PCI_DMA_TODEVICE; + for (i=0; iSG[i].Len = tmp_sg[i].length; + temp64.val = pci_map_page(h->pdev, tmp_sg[i].page, + tmp_sg[i].offset, tmp_sg[i].length, ddir); c->SG[i].Addr.lower = temp64.val32.lower; c->SG[i].Addr.upper = temp64.val32.upper; c->SG[i].Ext = 0; // we are not chaining @@ -1372,7 +1375,7 @@ h->maxSG = seg; #ifdef CCISS_DEBUG - printk(KERN_DEBUG "cciss: Submitting %d sectors in %d segments\n", creq->nr_sectors, seg); + printk(KERN_DEBUG "cciss: Submitting %d sectors in %d segments\n", sect, seg); #endif /* CCISS_DEBUG */ c->Header.SGList = c->Header.SGTotal = seg; @@ -1393,7 +1396,8 @@ if(h->Qdepth > h->maxQsinceinit) h->maxQsinceinit = h->Qdepth; - goto queue_next; + goto next; + startio: start_io(h); } @@ -1873,7 +1877,18 @@ sprintf(hba[i]->devname, "cciss%d", i); hba[i]->ctlr = i; hba[i]->pdev = pdev; - + + /* configure PCI DMA stuff */ + if (!pci_set_dma_mask(pdev, (u64) 0xffffffffffffffff)) + printk("cciss: using DAC cycles\n"); + else if (!pci_set_dma_mask(pdev, (u64) 0xffffffff)) + printk("cciss: not using DAC cycles\n"); + else { + printk("cciss: no suitable DMA available\n"); + free_hba(i); + return -ENODEV; + } + if( register_blkdev(MAJOR_NR+i, hba[i]->devname, &cciss_fops)) { printk(KERN_ERR "cciss: Unable to get major number " @@ -1941,9 +1956,10 @@ cciss_procinit(i); q = BLK_DEFAULT_QUEUE(MAJOR_NR + i); - q->queuedata = hba[i]; - blk_init_queue(q, do_cciss_request); - blk_queue_headactive(q, 0); + q->queuedata = hba[i]; + blk_init_queue(q, do_cciss_request); + blk_queue_bounce_limit(q, hba[i]->pdev->dma_mask); + blk_queue_headactive(q, 0); /* fill in the other Kernel structs */ blksize_size[MAJOR_NR+i] = hba[i]->blocksizes; diff -X dontdiff -ruN linux-2.4.17/drivers/block/cciss.h linux-2.4.17-lse02-D/drivers/block/cciss.h --- linux-2.4.17/drivers/block/cciss.h Tue May 22 10:23:16 2001 +++ 
linux-2.4.17-lse02-D/drivers/block/cciss.h Thu Apr 4 17:37:16 2002 @@ -15,11 +15,6 @@ #define MAJOR_NR COMPAQ_CISS_MAJOR -struct my_sg { - int len; - char *start_addr; -}; - struct ctlr_info; typedef struct ctlr_info ctlr_info_t; diff -X dontdiff -ruN linux-2.4.17/drivers/block/cpqarray.c linux-2.4.17-lse02-D/drivers/block/cpqarray.c --- linux-2.4.17/drivers/block/cpqarray.c Fri Nov 9 14:28:46 2001 +++ linux-2.4.17-lse02-D/drivers/block/cpqarray.c Thu Apr 4 17:37:16 2002 @@ -359,7 +359,7 @@ static int cpq_back_merge_fn(request_queue_t *q, struct request *rq, struct buffer_head *bh, int max_segments) { - if (rq->bhtail->b_data + rq->bhtail->b_size == bh->b_data) + if (BH_CONTIG(rq->bhtail, bh)) return 1; return cpq_new_segment(q, rq, max_segments); } @@ -367,7 +367,7 @@ static int cpq_front_merge_fn(request_queue_t *q, struct request *rq, struct buffer_head *bh, int max_segments) { - if (bh->b_data + bh->b_size == rq->bh->b_data) + if (BH_CONTIG(bh, rq->bh)) return 1; return cpq_new_segment(q, rq, max_segments); } @@ -377,7 +377,7 @@ { int total_segments = rq->nr_segments + nxt->nr_segments; - if (rq->bhtail->b_data + rq->bhtail->b_size == nxt->bh->b_data) + if (BH_CONTIG(rq->bhtail, nxt->bh)) total_segments--; if (total_segments > SG_MAX) @@ -524,6 +524,7 @@ q = BLK_DEFAULT_QUEUE(MAJOR_NR + i); q->queuedata = hba[i]; blk_init_queue(q, do_ida_request); + blk_queue_bounce_limit(q, hba[i]->pci_dev->dma_mask); blk_queue_headactive(q, 0); blksize_size[MAJOR_NR+i] = ida_blocksizes + (i*256); hardsect_size[MAJOR_NR+i] = ida_hardsizes + (i*256); @@ -911,17 +912,17 @@ { ctlr_info_t *h = q->queuedata; cmdlist_t *c; - char *lastdataend; + unsigned long lastdataend; struct list_head * queue_head = &q->queue_head; struct buffer_head *bh; struct request *creq; - struct my_sg tmp_sg[SG_MAX]; + struct scatterlist tmp_sg[SG_MAX]; int i, seg; if (q->plugged) goto startio; -queue_next: +next: if (list_empty(queue_head)) goto startio; @@ -961,17 +962,19 @@ printk("sector=%d, 
nr_sectors=%d\n", creq->sector, creq->nr_sectors); ); - seg = 0; lastdataend = NULL; + seg = 0; + lastdataend = ~0UL; while(bh) { - if (bh->b_data == lastdataend) { - tmp_sg[seg-1].size += bh->b_size; + if (bh_phys(bh) == lastdataend) { + tmp_sg[seg-1].length += bh->b_size; lastdataend += bh->b_size; } else { if (seg == SG_MAX) BUG(); - tmp_sg[seg].size = bh->b_size; - tmp_sg[seg].start_addr = bh->b_data; - lastdataend = bh->b_data + bh->b_size; + tmp_sg[seg].page = bh->b_page; + tmp_sg[seg].length = bh->b_size; + tmp_sg[seg].offset = bh_offset(bh); + lastdataend = bh_phys(bh) + bh->b_size; seg++; } bh = bh->b_reqnext; @@ -979,10 +982,10 @@ /* Now do all the DMA Mappings */ for( i=0; i < seg; i++) { - c->req.sg[i].size = tmp_sg[i].size; - c->req.sg[i].addr = (__u32) pci_map_single( - h->pci_dev, tmp_sg[i].start_addr, - tmp_sg[i].size, + c->req.sg[i].size = tmp_sg[i].length; + c->req.sg[i].addr = (__u32) pci_map_page( + h->pci_dev, tmp_sg[i].page, tmp_sg[i].offset, + tmp_sg[i].length, (creq->cmd == READ) ? PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE); } @@ -1000,7 +1003,7 @@ if (h->Qdepth > h->maxQsinceinit) h->maxQsinceinit = h->Qdepth; - goto queue_next; + goto next; startio: start_io(h); @@ -1076,17 +1079,14 @@ /* unmap the DMA mapping for all the scatter gather elements */ for(i=0; ireq.hdr.sg_cnt; i++) { - pci_unmap_single(hba[cmd->ctlr]->pci_dev, + pci_unmap_page(hba[cmd->ctlr]->pci_dev, cmd->req.sg[i].addr, cmd->req.sg[i].size, (cmd->req.hdr.cmd == IDA_READ) ? 
PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE); } complete_buffers(cmd->rq->bh, ok); - DBGPX(printk("Done with %p\n", cmd->rq);); end_that_request_last(cmd->rq); - - } /* diff -X dontdiff -ruN linux-2.4.17/drivers/block/cpqarray.h linux-2.4.17-lse02-D/drivers/block/cpqarray.h --- linux-2.4.17/drivers/block/cpqarray.h Tue May 22 10:23:16 2001 +++ linux-2.4.17-lse02-D/drivers/block/cpqarray.h Thu Apr 4 17:37:16 2002 @@ -56,11 +56,6 @@ #ifdef __KERNEL__ -struct my_sg { - int size; - char *start_addr; -}; - struct ctlr_info; typedef struct ctlr_info ctlr_info_t; diff -X dontdiff -ruN linux-2.4.17/drivers/block/elevator.c linux-2.4.17-lse02-D/drivers/block/elevator.c --- linux-2.4.17/drivers/block/elevator.c Thu Jul 19 20:59:41 2001 +++ linux-2.4.17-lse02-D/drivers/block/elevator.c Thu Apr 4 17:37:16 2002 @@ -110,7 +110,6 @@ break; } else if (__rq->sector - count == bh->b_rsector) { ret = ELEVATOR_FRONT_MERGE; - __rq->elevator_sequence -= count; *req = __rq; break; } @@ -220,3 +219,8 @@ *elevator = type; elevator->queue_ID = queue_ID++; } + +EXPORT_SYMBOL(elevator_linus_merge); +EXPORT_SYMBOL(elevator_linus_merge_cleanup); +EXPORT_SYMBOL(elevator_init); +EXPORT_SYMBOL(elevator_linus_merge_req); diff -X dontdiff -ruN linux-2.4.17/drivers/block/ll_rw_blk.c linux-2.4.17-lse02-D/drivers/block/ll_rw_blk.c --- linux-2.4.17/drivers/block/ll_rw_blk.c Mon Oct 29 12:11:17 2001 +++ linux-2.4.17-lse02-D/drivers/block/ll_rw_blk.c Thu Apr 4 17:37:16 2002 @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -121,9 +122,13 @@ * How many reqeusts do we allocate per queue, * and how many do we "batch" on freeing them? 
*/ -static int queue_nr_requests, batch_requests; +static int queue_nr_requests; +int batch_requests; -static inline int get_max_sectors(kdev_t dev) +unsigned long blk_max_low_pfn, blk_max_pfn; +int blk_nohighio = 0; + +inline int get_max_sectors(kdev_t dev) { if (!max_sectors[MAJOR(dev)]) return MAX_SECTORS; @@ -244,6 +249,55 @@ q->make_request_fn = mfn; } +/** + * blk_queue_bounce_limit - set bounce buffer limit for queue + * @q: the request queue for the device + * @dma_addr: bus address limit + * + * Description: + * Different hardware can have different requirements as to what pages + * it can do I/O directly to. A low level driver can call + * blk_queue_bounce_limit to have lower memory pages allocated as bounce + * buffers for doing I/O to pages residing above @dma_addr. By default + * the block layer sets this to the highest numbered "low" memory page. + **/ +void blk_queue_bounce_limit(request_queue_t *q, u64 dma_addr) +{ + unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT; + unsigned long mb = dma_addr >> 20; + static request_queue_t *old_q; + + /* + * keep this for debugging for now... + */ + if (dma_addr != BLK_BOUNCE_HIGH && q != old_q) { + old_q = q; + printk("blk: queue %p, ", q); + if (dma_addr == BLK_BOUNCE_ANY) + printk("no I/O memory limit\n"); + else + printk("I/O limit %luMb (mask 0x%Lx)\n", mb, (u64) dma_addr); + } + + q->bounce_pfn = bounce_pfn; +} + + +/* + * can we merge the two segments, or do we need to start a new one? 
+ */ +inline int blk_seg_merge_ok(request_queue_t *q, struct buffer_head *bh, + struct buffer_head *nxt) +{ + if (!BH_CONTIG(bh, nxt)) + return 0; + + if (BH_PHYS_4G(bh, nxt)) + return 1; + + return 0; +} + static inline int ll_new_segment(request_queue_t *q, struct request *req, int max_segments) { if (req->nr_segments < max_segments) { @@ -256,16 +310,18 @@ static int ll_back_merge_fn(request_queue_t *q, struct request *req, struct buffer_head *bh, int max_segments) { - if (req->bhtail->b_data + req->bhtail->b_size == bh->b_data) + if (blk_seg_merge_ok(q, req->bhtail, bh)) return 1; + return ll_new_segment(q, req, max_segments); } static int ll_front_merge_fn(request_queue_t *q, struct request *req, struct buffer_head *bh, int max_segments) { - if (bh->b_data + bh->b_size == req->bh->b_data) + if (blk_seg_merge_ok(q, bh, req->bh)) return 1; + return ll_new_segment(q, req, max_segments); } @@ -274,7 +330,7 @@ { int total_segments = req->nr_segments + next->nr_segments; - if (req->bhtail->b_data + req->bhtail->b_size == next->bh->b_data) + if (blk_seg_merge_ok(q, req->bhtail, next->bh)) total_segments--; if (total_segments > max_segments) @@ -326,7 +382,7 @@ spin_unlock_irqrestore(&io_request_lock, flags); } -static void blk_init_free_list(request_queue_t *q) +void blk_init_free_list(request_queue_t *q) { struct request *rq; int i; @@ -413,9 +469,10 @@ */ q->plug_device_fn = generic_plug_device; q->head_active = 1; + + blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); } -#define blkdev_free_rq(list) list_entry((list)->next, struct request, queue); /* * Get a free request. io_request_lock must be held and interrupts * disabled on the way in. 
@@ -452,7 +509,7 @@ if (q->rq[rw].count < batch_requests) schedule(); spin_lock_irq(&io_request_lock); - rq = get_request(q,rw); + rq = get_request(q, rw); spin_unlock_irq(&io_request_lock); } while (rq == NULL); remove_wait_queue(&q->wait_for_request, &wait); @@ -600,7 +657,7 @@ blkdev_release_request(next); } -static inline void attempt_back_merge(request_queue_t * q, +inline void attempt_back_merge(request_queue_t * q, struct request *req, int max_sectors, int max_segments) @@ -610,7 +667,7 @@ attempt_merge(q, req, max_sectors, max_segments); } -static inline void attempt_front_merge(request_queue_t * q, +inline void attempt_front_merge(request_queue_t * q, struct list_head * head, struct request *req, int max_sectors, @@ -664,9 +721,7 @@ * driver. Create a bounce buffer if the buffer data points into * high memory - keep the original buffer otherwise. */ -#if CONFIG_HIGHMEM - bh = create_bounce(rw, bh); -#endif + bh = blk_queue_bounce(q, rw, bh); /* look for a free request. */ /* @@ -711,8 +766,13 @@ elevator->elevator_merge_cleanup_fn(q, req, count); bh->b_reqnext = req->bh; req->bh = bh; + /* + * may not be valid, but queues not having bounce + * enabled for highmem pages must not look at + * ->buffer anyway + */ req->buffer = bh->b_data; - req->current_nr_sectors = count; + req->current_nr_sectors = req->hard_cur_sectors = count; req->sector = req->hard_sector = sector; req->nr_sectors = req->hard_nr_sectors += count; blk_started_io(count); @@ -762,7 +822,7 @@ req->errors = 0; req->hard_sector = req->sector = sector; req->hard_nr_sectors = req->nr_sectors = count; - req->current_nr_sectors = count; + req->current_nr_sectors = req->hard_cur_sectors = count; req->nr_segments = 1; /* Always 1 for a new request. */ req->nr_hw_segments = 1; /* Always 1 for a new request. 
*/ req->buffer = bh->b_data; @@ -915,6 +975,34 @@ } } +void submit_bh_blknr(int rw, struct buffer_head * bh) +{ + int count = bh->b_size >> 9; + + if (!test_bit(BH_Lock, &bh->b_state)) + BUG(); + + set_bit(BH_Req, &bh->b_state); + + /* + * First step, 'identity mapping' - RAID or LVM might + * further remap this. + */ + bh->b_rdev = bh->b_dev; + bh->b_rsector = bh->b_blocknr; + + generic_make_request(rw, bh); + + switch (rw) { + case WRITE: + kstat.pgpgout += count; + break; + default: + kstat.pgpgin += count; + break; + } +} + /** * ll_rw_block: low-level access to block devices * @rw: whether to %READ or %WRITE or maybe %READA (readahead) @@ -1067,6 +1155,7 @@ req->nr_sectors = req->hard_nr_sectors; req->current_nr_sectors = bh->b_size >> 9; + req->hard_cur_sectors = req->current_nr_sectors; if (req->nr_sectors < req->current_nr_sectors) { req->nr_sectors = req->current_nr_sectors; printk("end_request: buffer-list destroyed\n"); @@ -1115,14 +1204,19 @@ */ queue_nr_requests = 64; if (total_ram > MB(32)) - queue_nr_requests = 128; + queue_nr_requests = 256; /* * Batch frees according to queue length */ - batch_requests = queue_nr_requests/4; + if ((batch_requests = queue_nr_requests / 4) > 32) + batch_requests = 32; + printk("block: %d slots per queue, batch=%d\n", queue_nr_requests, batch_requests); + blk_max_low_pfn = max_low_pfn; + blk_max_pfn = max_pfn; + #ifdef CONFIG_AMIGA_Z2RAM z2_init(); #endif @@ -1241,3 +1335,14 @@ EXPORT_SYMBOL(generic_make_request); EXPORT_SYMBOL(blkdev_release_request); EXPORT_SYMBOL(generic_unplug_device); +EXPORT_SYMBOL(blk_queue_bounce_limit); +EXPORT_SYMBOL(blk_max_low_pfn); +EXPORT_SYMBOL(blk_max_pfn); +EXPORT_SYMBOL(blk_seg_merge_ok); +EXPORT_SYMBOL(blk_nohighio); +EXPORT_SYMBOL(blk_init_free_list); +EXPORT_SYMBOL(batch_requests); +EXPORT_SYMBOL(drive_stat_acct); +EXPORT_SYMBOL(get_max_sectors); +EXPORT_SYMBOL(attempt_back_merge); +EXPORT_SYMBOL(attempt_front_merge); diff -X dontdiff -ruN linux-2.4.17/drivers/block/loop.c 
linux-2.4.17-lse02-D/drivers/block/loop.c --- linux-2.4.17/drivers/block/loop.c Fri Dec 21 09:41:53 2001 +++ linux-2.4.17-lse02-D/drivers/block/loop.c Thu Apr 4 17:37:16 2002 @@ -480,9 +480,7 @@ goto err; } -#if CONFIG_HIGHMEM - rbh = create_bounce(rw, rbh); -#endif + rbh = blk_queue_bounce(q, rw, rbh); /* * file backed, queue for loop_thread to handle @@ -569,9 +567,6 @@ sigfillset(&current->blocked); flush_signals(current); spin_unlock_irq(&current->sigmask_lock); - - current->policy = SCHED_OTHER; - current->nice = -20; spin_lock_irq(&lo->lo_lock); lo->lo_state = Lo_bound; diff -X dontdiff -ruN linux-2.4.17/drivers/char/mwave/mwavedd.c linux-2.4.17-lse02-D/drivers/char/mwave/mwavedd.c --- linux-2.4.17/drivers/char/mwave/mwavedd.c Thu Oct 11 09:14:32 2001 +++ linux-2.4.17-lse02-D/drivers/char/mwave/mwavedd.c Thu Apr 4 17:37:16 2002 @@ -279,7 +279,6 @@ pDrvData->IPCs[ipcnum].bIsHere = FALSE; pDrvData->IPCs[ipcnum].bIsEnabled = TRUE; #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0) - current->nice = -20; /* boost to provide priority timing */ #else current->priority = 0x28; /* boost to provide priority timing */ #endif diff -X dontdiff -ruN linux-2.4.17/drivers/char/raw.c linux-2.4.17-lse02-D/drivers/char/raw.c --- linux-2.4.17/drivers/char/raw.c Sat Sep 22 20:35:43 2001 +++ linux-2.4.17-lse02-D/drivers/char/raw.c Thu Apr 4 17:37:16 2002 @@ -34,6 +34,9 @@ int raw_open(struct inode *, struct file *); int raw_release(struct inode *, struct file *); int raw_ctl_ioctl(struct inode *, struct file *, unsigned int, unsigned long); +ssize_t raw_readv(struct file *, const struct iovec *, unsigned long, loff_t *, size_t); +ssize_t raw_writev(struct file *, const struct iovec *, unsigned long, loff_t *,size_t); +static ssize_t rwvec_raw_dev(int rw, struct file *, const struct iovec *, unsigned long, loff_t *, size_t); static struct file_operations raw_fops = { @@ -41,6 +44,8 @@ write: raw_write, open: raw_open, release: raw_release, + readv: raw_readv, + writev: raw_writev, }; static
struct file_operations raw_ctl_fops = { @@ -48,6 +53,8 @@ open: raw_open, }; +extern int map_user_kiobuf_iovecs(int, struct kiobuf *, struct iovec *, ulong); + static int __init raw_init(void) { int i; @@ -86,7 +93,7 @@ } if (!filp->f_iobuf) { - err = alloc_kiovec(1, &filp->f_iobuf); + err = alloc_kiovec_raw(1, &filp->f_iobuf); if (err) return err; } @@ -297,7 +304,7 @@ * A parallel read/write is using the preallocated iobuf * so just run slow and allocate a new one. */ - err = alloc_kiovec(1, &iobuf); + err = alloc_kiovec_raw(1, &iobuf); if (err) goto out; new_iobuf = 1; @@ -348,10 +355,13 @@ if (err) break; - for (i=0; i < blocks; i++) - iobuf->blocks[i] = blocknr++; - - err = brw_kiovec(rw, 1, &iobuf, dev, iobuf->blocks, sector_size); + iobuf->blkno = blocknr; + iobuf->dovary = 1; + iobuf->pinfo = NULL; + + err = brw_kiovec(rw, 1, &iobuf, dev, NULL, sector_size); + + iobuf->dovary = 0; if (rw == READ && err > 0) mark_dirty_kiobuf(iobuf, err); @@ -360,6 +370,7 @@ transferred += err; size -= err; buf += err; + blocknr += (err/sector_size); } unmap_kiobuf(iobuf); @@ -380,4 +391,132 @@ free_kiovec(1, &iobuf); out: return err; +} + + + +ssize_t raw_readv(struct file *filp, const struct iovec *iov, + unsigned long nr, loff_t *offp, size_t tot_len) +{ + return rwvec_raw_dev(READ, filp, iov, nr, offp, tot_len); +} + +ssize_t raw_writev(struct file *filp, const struct iovec *iov, + unsigned long nr, loff_t *offp, size_t tot_len) +{ + return rwvec_raw_dev(WRITE, filp, iov, nr, offp, tot_len); +} + + +ssize_t rwvec_raw_dev(int rw, struct file * filp, const struct iovec *iov, ulong iov_count, loff_t *offp, size_t tot_len) +{ + kdev_t dev; + size_t transferred; + struct pinfo pinfo; + struct kiobuf * iobuf; + ulong blocknr, blocks, limit; + int iosize, i, minor, err, new_iobuf; + int sector_size, sector_bits, sector_mask, max_sectors; + + minor = MINOR(filp->f_dentry->d_inode->i_rdev); + + new_iobuf = 0; + iobuf = filp->f_iobuf; + if (test_and_set_bit(0, 
&filp->f_iobuf_lock)) { + /* + * A parallel read/write is using the preallocated iobuf + * so just run slow and allocate a new one. + */ + err = alloc_kiovec(1, &iobuf); + if (err) + goto out; + new_iobuf = 1; + } + + dev = to_kdev_t(raw_devices[minor].binding->bd_dev); + sector_size = raw_devices[minor].sector_size; + sector_bits = raw_devices[minor].sector_bits; + sector_mask = sector_size- 1; + max_sectors = KIO_MAX_SECTORS >> (sector_bits - 9); + + /* revert to rw_raw_dev if transfer size is too large */ + if ( (tot_len >> sector_bits) > max_sectors ) { + printk("rwvec_raw_dev: len %d is too large \n", tot_len); + err = -ENOSYS; + goto out_free; + } + + if (blk_size[MAJOR(dev)]) + limit = (((loff_t) blk_size[MAJOR(dev)][MINOR(dev)]) << BLOCK_SIZE_BITS) >> sector_bits; + else + limit = INT_MAX; + + dprintk("rw_raw_dev: dev %d:%d (+%d)\n", MAJOR(dev), MINOR(dev), limit); + + err = -EINVAL; + if ((*offp & sector_mask)||(tot_len & sector_mask)) + goto out_free; + err = 0; + if (tot_len) + err = -ENXIO; + if ((*offp >> sector_bits) >= limit) + goto out_free; + + /* + * Split the IO into KIO_MAX_SECTORS chunks, mapping and + * unmapping the single kiobuf as we go to perform each chunk of + * IO. 
+ */ + + transferred = 0; + blocknr = *offp >> sector_bits; + while (tot_len > 0) { + blocks = tot_len >> sector_bits; + if (blocks > max_sectors) + blocks = max_sectors; + if (blocks > limit - blocknr) + blocks = limit - blocknr; + if (!blocks) + break; + + iosize = blocks << sector_bits; + + iobuf->pinfo = &pinfo; + iobuf->dovary = 1; + iobuf->blkno = blocknr; + + err = map_user_kiobuf_iovecs(rw, iobuf, iov, iov_count); + if (err) + break; + + err = brw_kiovec(rw, 1, &iobuf, dev, NULL, sector_size); + if (rw == READ && err > 0) + mark_dirty_kiobuf_iovec(iobuf, err); + + if (err >= 0) { + transferred += err; + tot_len -= err; + blocknr += (err/sector_size); + } + + unmap_kiobuf(iobuf); + iobuf->pinfo = NULL; + iobuf->dovary = 0; + + if (err != iosize) + break; + } + + if (transferred) { + *offp += transferred; + err = transferred; + } + + out_free: + if (!new_iobuf) + clear_bit(0, &filp->f_iobuf_lock); + else + free_kiovec(1, &iobuf); + out: + return err; } diff -X dontdiff -ruN linux-2.4.17/drivers/ide/amd74xx.c linux-2.4.17-lse02-D/drivers/ide/amd74xx.c --- linux-2.4.17/drivers/ide/amd74xx.c Mon Aug 13 14:56:19 2001 +++ linux-2.4.17-lse02-D/drivers/ide/amd74xx.c Thu Apr 4 17:37:16 2002 @@ -469,8 +469,10 @@ if (hwif->dma_base) { hwif->dmaproc = &amd74xx_dmaproc; - if (!noautodma) + if (!noautodma) { hwif->autodma = 1; + hwif->highmem = 1; + } } else { hwif->autodma = 0; hwif->drives[0].autotune = 1; diff -X dontdiff -ruN linux-2.4.17/drivers/ide/ataraid.c linux-2.4.17-lse02-D/drivers/ide/ataraid.c --- linux-2.4.17/drivers/ide/ataraid.c Thu Oct 25 13:58:35 2001 +++ linux-2.4.17-lse02-D/drivers/ide/ataraid.c Thu Apr 4 17:37:16 2002 @@ -123,8 +123,7 @@ ptr=kmalloc(sizeof(struct buffer_head),GFP_NOIO); if (!ptr) { __set_current_state(TASK_RUNNING); - current->policy |= SCHED_YIELD; - schedule(); + yield(); } } return ptr; @@ -139,8 +138,7 @@ ptr=kmalloc(sizeof(struct ataraid_bh_private),GFP_NOIO); if (!ptr) { __set_current_state(TASK_RUNNING); - current->policy |= 
SCHED_YIELD; - schedule(); + yield(); } } return ptr; diff -X dontdiff -ruN linux-2.4.17/drivers/ide/hpt34x.c linux-2.4.17-lse02-D/drivers/ide/hpt34x.c --- linux-2.4.17/drivers/ide/hpt34x.c Sat May 19 17:43:06 2001 +++ linux-2.4.17-lse02-D/drivers/ide/hpt34x.c Thu Apr 4 17:37:16 2002 @@ -425,6 +425,7 @@ hwif->autodma = 0; hwif->dmaproc = &hpt34x_dmaproc; + hwif->highmem = 1; } else { hwif->drives[0].autotune = 1; hwif->drives[1].autotune = 1; diff -X dontdiff -ruN linux-2.4.17/drivers/ide/hpt366.c linux-2.4.17-lse02-D/drivers/ide/hpt366.c --- linux-2.4.17/drivers/ide/hpt366.c Tue Aug 14 20:01:07 2001 +++ linux-2.4.17-lse02-D/drivers/ide/hpt366.c Thu Apr 4 17:37:16 2002 @@ -730,6 +730,7 @@ hwif->autodma = 1; else hwif->autodma = 0; + hwif->highmem = 1; } else { hwif->autodma = 0; hwif->drives[0].autotune = 1; diff -X dontdiff -ruN linux-2.4.17/drivers/ide/ide-disk.c linux-2.4.17-lse02-D/drivers/ide/ide-disk.c --- linux-2.4.17/drivers/ide/ide-disk.c Fri Dec 21 09:41:54 2001 +++ linux-2.4.17-lse02-D/drivers/ide/ide-disk.c Thu Apr 4 17:37:16 2002 @@ -27,9 +27,10 @@ * Version 1.09 added increment of rq->sector in ide_multwrite * added UDMA 3/4 reporting * Version 1.10 request queue changes, Ultra DMA 100 + * Version 1.11 Highmem I/O support, Jens Axboe */ -#define IDEDISK_VERSION "1.10" +#define IDEDISK_VERSION "1.11" #undef REALLY_SLOW_IO /* most systems can safely undef this */ @@ -139,7 +140,9 @@ byte stat; int i; unsigned int msect, nsect; + unsigned long flags; struct request *rq; + char *to; /* new way for dealing with premature shared PCI interrupts */ if (!OK_STAT(stat=GET_STAT(),DATA_READY,BAD_R_STAT)) { @@ -150,8 +153,8 @@ ide_set_handler(drive, &read_intr, WAIT_CMD, NULL); return ide_started; } + msect = drive->mult_count; - read_next: rq = HWGROUP(drive)->rq; if (msect) { @@ -160,14 +163,15 @@ msect -= nsect; } else nsect = 1; - idedisk_input_data(drive, rq->buffer, nsect * SECTOR_WORDS); + to = ide_map_buffer(rq, &flags); + idedisk_input_data(drive, to, 
nsect * SECTOR_WORDS); #ifdef DEBUG printk("%s: read: sectors(%ld-%ld), buffer=0x%08lx, remaining=%ld\n", drive->name, rq->sector, rq->sector+nsect-1, (unsigned long) rq->buffer+(nsect<<9), rq->nr_sectors-nsect); #endif + ide_unmap_buffer(to, &flags); rq->sector += nsect; - rq->buffer += nsect<<9; rq->errors = 0; i = (rq->nr_sectors -= nsect); if (((long)(rq->current_nr_sectors -= nsect)) <= 0) @@ -201,14 +205,16 @@ #endif if ((rq->nr_sectors == 1) ^ ((stat & DRQ_STAT) != 0)) { rq->sector++; - rq->buffer += 512; rq->errors = 0; i = --rq->nr_sectors; --rq->current_nr_sectors; if (((long)rq->current_nr_sectors) <= 0) ide_end_request(1, hwgroup); if (i > 0) { - idedisk_output_data (drive, rq->buffer, SECTOR_WORDS); + unsigned long flags; + char *to = ide_map_buffer(rq, &flags); + idedisk_output_data (drive, to, SECTOR_WORDS); + ide_unmap_buffer(to, &flags); ide_set_handler (drive, &write_intr, WAIT_CMD, NULL); return ide_started; } @@ -238,14 +244,14 @@ do { char *buffer; int nsect = rq->current_nr_sectors; - + unsigned long flags; + if (nsect > mcount) nsect = mcount; mcount -= nsect; - buffer = rq->buffer; + buffer = ide_map_buffer(rq, &flags); rq->sector += nsect; - rq->buffer += nsect << 9; rq->nr_sectors -= nsect; rq->current_nr_sectors -= nsect; @@ -259,7 +265,7 @@ } else { rq->bh = bh; rq->current_nr_sectors = bh->b_size >> 9; - rq->buffer = bh->b_data; + rq->hard_cur_sectors = rq->current_nr_sectors; } } @@ -268,6 +274,7 @@ * re-entering us on the last transfer. 
*/ idedisk_output_data(drive, buffer, nsect<<7); + ide_unmap_buffer(buffer, &flags); } while (mcount); return 0; @@ -452,8 +459,11 @@ return ide_stopped; } } else { + unsigned long flags; + char *buffer = ide_map_buffer(rq, &flags); ide_set_handler (drive, &write_intr, WAIT_CMD, NULL); - idedisk_output_data(drive, rq->buffer, SECTOR_WORDS); + idedisk_output_data(drive, buffer, SECTOR_WORDS); + ide_unmap_buffer(buffer, &flags); } return ide_started; } diff -X dontdiff -ruN linux-2.4.17/drivers/ide/ide-dma.c linux-2.4.17-lse02-D/drivers/ide/ide-dma.c --- linux-2.4.17/drivers/ide/ide-dma.c Sun Sep 9 10:43:02 2001 +++ linux-2.4.17-lse02-D/drivers/ide/ide-dma.c Thu Apr 4 17:37:16 2002 @@ -251,33 +251,48 @@ { struct buffer_head *bh; struct scatterlist *sg = hwif->sg_table; + unsigned long lastdataend = ~0UL; int nents = 0; if (hwif->sg_dma_active) BUG(); - + if (rq->cmd == READ) hwif->sg_dma_direction = PCI_DMA_FROMDEVICE; else hwif->sg_dma_direction = PCI_DMA_TODEVICE; + bh = rq->bh; do { - unsigned char *virt_addr = bh->b_data; - unsigned int size = bh->b_size; + /* + * continue segment from before? 
+ */ + if (bh_phys(bh) == lastdataend) { + sg[nents - 1].length += bh->b_size; + lastdataend += bh->b_size; + } else { + struct scatterlist *sge; + /* + * start new segment + */ + if (nents >= PRD_ENTRIES) + return 0; - if (nents >= PRD_ENTRIES) - return 0; + sge = &sg[nents]; + memset(sge, 0, sizeof(*sge)); + if ((sge->page = bh->b_page)) + sge->offset = bh_offset(bh); + else { + if (((unsigned long) bh->b_data) < PAGE_SIZE) + BUG(); + sge->address = bh->b_data; + } - while ((bh = bh->b_reqnext) != NULL) { - if ((virt_addr + size) != (unsigned char *) bh->b_data) - break; - size += bh->b_size; + sge->length = bh->b_size; + lastdataend = bh_phys(bh) + bh->b_size; + nents++; } - memset(&sg[nents], 0, sizeof(*sg)); - sg[nents].address = virt_addr; - sg[nents].length = size; - nents++; - } while (bh != NULL); + } while ((bh = bh->b_reqnext) != NULL); return pci_map_sg(hwif->pci_dev, sg, nents, hwif->sg_dma_direction); } @@ -305,7 +320,7 @@ return 0; sg = HWIF(drive)->sg_table; - while (i && sg_dma_len(sg)) { + while (i) { u32 cur_addr; u32 cur_len; @@ -319,36 +334,35 @@ */ while (cur_len) { - if (count++ >= PRD_ENTRIES) { - printk("%s: DMA table too small\n", drive->name); - goto use_pio_instead; - } else { - u32 xcount, bcount = 0x10000 - (cur_addr & 0xffff); - - if (bcount > cur_len) - bcount = cur_len; - *table++ = cpu_to_le32(cur_addr); - xcount = bcount & 0xffff; - if (is_trm290_chipset) - xcount = ((xcount >> 2) - 1) << 16; - if (xcount == 0x0000) { - /* - * Most chipsets correctly interpret a length of 0x0000 as 64KB, - * but at least one (e.g. CS5530) misinterprets it as zero (!). - * So here we break the 64KB entry into two 32KB entries instead. 
- */ - if (count++ >= PRD_ENTRIES) { - printk("%s: DMA table too small\n", drive->name); - goto use_pio_instead; - } - *table++ = cpu_to_le32(0x8000); - *table++ = cpu_to_le32(cur_addr + 0x8000); - xcount = 0x8000; - } - *table++ = cpu_to_le32(xcount); - cur_addr += bcount; - cur_len -= bcount; + u32 xcount, bcount = 0x10000 - (cur_addr & 0xffff); + + if (count++ >= PRD_ENTRIES) + BUG(); + + if (bcount > cur_len) + bcount = cur_len; + *table++ = cpu_to_le32(cur_addr); + xcount = bcount & 0xffff; + if (is_trm290_chipset) + xcount = ((xcount >> 2) - 1) << 16; + if (xcount == 0x0000) { + /* + * Most chipsets correctly interpret a length + * of 0x0000 as 64KB, but at least one + * (e.g. CS5530) misinterprets it as zero (!). + * So here we break the 64KB entry into two + * 32KB entries instead. + */ + if (count++ >= PRD_ENTRIES) + goto use_pio_instead; + + *table++ = cpu_to_le32(0x8000); + *table++ = cpu_to_le32(cur_addr + 0x8000); + xcount = 0x8000; } + *table++ = cpu_to_le32(xcount); + cur_addr += bcount; + cur_len -= bcount; } sg++; @@ -532,6 +546,20 @@ } #endif /* CONFIG_BLK_DEV_IDEDMA_TIMEOUT */ +static inline void ide_toggle_bounce(ide_drive_t *drive, int on) +{ + dma64_addr_t addr = BLK_BOUNCE_HIGH; + + if (on && drive->media == ide_disk && HWIF(drive)->highmem) { + if (!PCI_DMA_BUS_IS_PHYS) + addr = BLK_BOUNCE_ANY; + else + addr = HWIF(drive)->pci_dev->dma_mask; + } + + blk_queue_bounce_limit(&drive->queue, addr); +} + /* * ide_dmaproc() initiates/aborts DMA read/write operations on a drive. 
* @@ -554,18 +582,20 @@ ide_hwif_t *hwif = HWIF(drive); unsigned long dma_base = hwif->dma_base; byte unit = (drive->select.b.unit & 0x01); - unsigned int count, reading = 0; + unsigned int count, reading = 0, set_high = 1; byte dma_stat; switch (func) { case ide_dma_off: printk("%s: DMA disabled\n", drive->name); + set_high = 0; case ide_dma_off_quietly: outb(inb(dma_base+2) & ~(1<<(5+unit)), dma_base+2); case ide_dma_on: drive->using_dma = (func == ide_dma_on); if (drive->using_dma) outb(inb(dma_base+2)|(1<<(5+unit)), dma_base+2); + ide_toggle_bounce(drive, set_high); return 0; case ide_dma_check: return config_drive_for_dma (drive); @@ -696,8 +726,8 @@ request_region(dma_base, num_ports, hwif->name); hwif->dma_base = dma_base; hwif->dmatable_cpu = pci_alloc_consistent(hwif->pci_dev, - PRD_ENTRIES * PRD_BYTES, - &hwif->dmatable_dma); + PRD_ENTRIES * PRD_BYTES, + &hwif->dmatable_dma); if (hwif->dmatable_cpu == NULL) goto dma_alloc_failure; diff -X dontdiff -ruN linux-2.4.17/drivers/ide/pdc202xx.c linux-2.4.17-lse02-D/drivers/ide/pdc202xx.c --- linux-2.4.17/drivers/ide/pdc202xx.c Wed Nov 14 11:44:03 2001 +++ linux-2.4.17-lse02-D/drivers/ide/pdc202xx.c Thu Apr 4 17:37:16 2002 @@ -893,6 +893,7 @@ #ifdef CONFIG_BLK_DEV_IDEDMA if (hwif->dma_base) { hwif->dmaproc = &pdc202xx_dmaproc; + hwif->highmem = 1; if (!noautodma) hwif->autodma = 1; } else { diff -X dontdiff -ruN linux-2.4.17/drivers/ide/piix.c linux-2.4.17-lse02-D/drivers/ide/piix.c --- linux-2.4.17/drivers/ide/piix.c Thu Oct 25 13:53:47 2001 +++ linux-2.4.17-lse02-D/drivers/ide/piix.c Thu Apr 4 17:37:16 2002 @@ -523,6 +523,7 @@ if (!hwif->dma_base) return; + hwif->highmem = 1; #ifndef CONFIG_BLK_DEV_IDEDMA hwif->autodma = 0; #else /* CONFIG_BLK_DEV_IDEDMA */ diff -X dontdiff -ruN linux-2.4.17/drivers/md/md.c linux-2.4.17-lse02-D/drivers/md/md.c --- linux-2.4.17/drivers/md/md.c Fri Dec 21 09:41:54 2001 +++ linux-2.4.17-lse02-D/drivers/md/md.c Thu Apr 4 17:37:16 2002 @@ -2930,8 +2930,6 @@ * bdflush, otherwise 
bdflush will deadlock if there are too * many dirty RAID5 blocks. */ - current->policy = SCHED_OTHER; - current->nice = -20; md_unlock_kernel(); complete(thread->event); @@ -3381,11 +3379,6 @@ "(but not more than %d KB/sec) for reconstruction.\n", sysctl_speed_limit_max); - /* - * Resync has low priority. - */ - current->nice = 19; - is_mddev_idle(mddev); /* this also initializes IO event counters */ for (m = 0; m < SYNC_MARKS; m++) { mark[m] = jiffies; @@ -3463,16 +3456,13 @@ currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1; if (currspeed > sysctl_speed_limit_min) { - current->nice = 19; - if ((currspeed > sysctl_speed_limit_max) || !is_mddev_idle(mddev)) { current->state = TASK_INTERRUPTIBLE; md_schedule_timeout(HZ/4); goto repeat; } - } else - current->nice = -20; + } } printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev)); err = 0; diff -X dontdiff -ruN linux-2.4.17/drivers/net/eepro100.c linux-2.4.17-lse02-D/drivers/net/eepro100.c --- linux-2.4.17/drivers/net/eepro100.c Fri Dec 21 09:41:54 2001 +++ linux-2.4.17-lse02-D/drivers/net/eepro100.c Thu Apr 4 17:37:16 2002 @@ -1123,9 +1123,6 @@ /* We must continue to monitor the media. */ sp->timer.expires = RUN_AT(2*HZ); /* 2.0 sec. */ add_timer(&sp->timer); -#if defined(timer_exit) - timer_exit(&sp->timer); -#endif } static void speedo_show_state(struct net_device *dev) diff -X dontdiff -ruN linux-2.4.17/drivers/net/slip.c linux-2.4.17-lse02-D/drivers/net/slip.c --- linux-2.4.17/drivers/net/slip.c Sun Sep 30 12:26:07 2001 +++ linux-2.4.17-lse02-D/drivers/net/slip.c Thu Apr 4 17:37:16 2002 @@ -1393,10 +1393,8 @@ /* First of all: check for active disciplines and hangup them. 
*/ do { - if (busy) { - current->counter = 0; - schedule(); - } + if (busy) + sys_sched_yield(); busy = 0; local_bh_disable(); diff -X dontdiff -ruN linux-2.4.17/drivers/scsi/aic7xxx/aic7xxx_linux_host.h linux-2.4.17-lse02-D/drivers/scsi/aic7xxx/aic7xxx_linux_host.h --- linux-2.4.17/drivers/scsi/aic7xxx/aic7xxx_linux_host.h Thu Oct 25 13:53:49 2001 +++ linux-2.4.17-lse02-D/drivers/scsi/aic7xxx/aic7xxx_linux_host.h Thu Apr 4 17:37:16 2002 @@ -89,7 +89,9 @@ present: 0, /* number of 7xxx's present */\ unchecked_isa_dma: 0, /* no memory DMA restrictions */\ use_clustering: ENABLE_CLUSTERING, \ - use_new_eh_code: 1 \ + use_new_eh_code: 1, \ + highmem_io: 1, \ + concurrent_queue: 1 \ } #endif /* _AIC7XXX_LINUX_HOST_H_ */ diff -X dontdiff -ruN linux-2.4.17/drivers/scsi/aic7xxx/aic7xxx_linux_pci.c linux-2.4.17-lse02-D/drivers/scsi/aic7xxx/aic7xxx_linux_pci.c --- linux-2.4.17/drivers/scsi/aic7xxx/aic7xxx_linux_pci.c Tue Nov 13 09:19:41 2001 +++ linux-2.4.17-lse02-D/drivers/scsi/aic7xxx/aic7xxx_linux_pci.c Thu Apr 4 17:37:16 2002 @@ -158,7 +158,7 @@ if (sizeof(bus_addr_t) > 4 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,3) && ahc_linux_get_memsize() > 0x80000000 - && pci_set_dma_mask(pdev, 0x7FFFFFFFFFULL) == 0) { + && pci_set_dma_mask(pdev, 0xFFFFFFFFULL) == 0) { #else && ahc_linux_get_memsize() > 0x80000000) { diff -X dontdiff -ruN linux-2.4.17/drivers/scsi/aic7xxx_old/aic7xxx.h linux-2.4.17-lse02-D/drivers/scsi/aic7xxx_old/aic7xxx.h --- linux-2.4.17/drivers/scsi/aic7xxx_old/aic7xxx.h Sun Mar 4 14:30:18 2001 +++ linux-2.4.17-lse02-D/drivers/scsi/aic7xxx_old/aic7xxx.h Thu Apr 4 17:37:17 2002 @@ -55,7 +55,8 @@ present: 0, /* number of 7xxx's present */\ unchecked_isa_dma: 0, /* no memory DMA restrictions */\ use_clustering: ENABLE_CLUSTERING, \ - use_new_eh_code: 0 \ + use_new_eh_code: 0, \ + highmem_io: 1 \ } extern int aic7xxx_queue(Scsi_Cmnd *, void (*)(Scsi_Cmnd *)); diff -X dontdiff -ruN linux-2.4.17/drivers/scsi/hosts.c linux-2.4.17-lse02-D/drivers/scsi/hosts.c --- 
linux-2.4.17/drivers/scsi/hosts.c Thu Jul 5 11:28:17 2001 +++ linux-2.4.17-lse02-D/drivers/scsi/hosts.c Thu Apr 4 17:37:17 2002 @@ -129,7 +129,7 @@ * once we are 100% sure that we want to use this host adapter - it is a * pain to reverse this, so we try to avoid it */ - +extern int blk_nohighio; struct Scsi_Host * scsi_register(Scsi_Host_Template * tpnt, int j){ struct Scsi_Host * retval, *shpnt, *o_shp; Scsi_Host_Name *shn, *shn2; @@ -235,6 +235,8 @@ retval->cmd_per_lun = tpnt->cmd_per_lun; retval->unchecked_isa_dma = tpnt->unchecked_isa_dma; retval->use_clustering = tpnt->use_clustering; + if (!blk_nohighio) + retval->highmem_io = tpnt->highmem_io; retval->select_queue_depths = tpnt->select_queue_depths; retval->max_sectors = tpnt->max_sectors; diff -X dontdiff -ruN linux-2.4.17/drivers/scsi/hosts.h linux-2.4.17-lse02-D/drivers/scsi/hosts.h --- linux-2.4.17/drivers/scsi/hosts.h Thu Nov 22 11:49:15 2001 +++ linux-2.4.17-lse02-D/drivers/scsi/hosts.h Thu Apr 4 17:37:17 2002 @@ -291,11 +291,15 @@ */ unsigned emulated:1; + unsigned highmem_io:1; + /* * Name of proc directory */ char *proc_name; + unsigned concurrent_queue:1; + } Scsi_Host_Template; /* @@ -390,6 +394,8 @@ unsigned in_recovery:1; unsigned unchecked_isa_dma:1; unsigned use_clustering:1; + unsigned highmem_io:1; + /* * True if this host was loaded as a loadable module */ diff -X dontdiff -ruN linux-2.4.17/drivers/scsi/ide-scsi.c linux-2.4.17-lse02-D/drivers/scsi/ide-scsi.c --- linux-2.4.17/drivers/scsi/ide-scsi.c Fri Dec 21 09:41:55 2001 +++ linux-2.4.17-lse02-D/drivers/scsi/ide-scsi.c Thu Apr 4 17:37:17 2002 @@ -695,7 +695,7 @@ int segments = pc->scsi_cmd->use_sg; struct scatterlist *sg = pc->scsi_cmd->request_buffer; - if (!drive->using_dma || !pc->request_transfer || pc->request_transfer % 1024) + if (!drive->using_dma || !pc->request_transfer || pc->request_transfer & 1023) return NULL; if (idescsi_set_direction(pc)) return NULL; @@ -706,12 +706,22 @@ printk ("ide-scsi: %s: building DMA table, %d 
segments, %dkB total\n", drive->name, segments, pc->request_transfer >> 10); #endif /* IDESCSI_DEBUG_LOG */ while (segments--) { - bh->b_data = sg->address; + if (sg->address) { + bh->b_page = virt_to_page(sg->address); + bh->b_data = (char *) ((unsigned long) sg->address & ~PAGE_MASK); + } else if (sg->page) { + bh->b_page = sg->page; + bh->b_data = (char *) sg->offset; + } + bh->b_size = sg->length; bh = bh->b_reqnext; sg++; } } else { + /* + * non-sg requests are guaranteed not to reside in highmem /jens + */ if ((first_bh = bh = idescsi_kmalloc_bh (1)) == NULL) return NULL; #if IDESCSI_DEBUG_LOG diff -X dontdiff -ruN linux-2.4.17/drivers/scsi/ips.c linux-2.4.17-lse02-D/drivers/scsi/ips.c --- linux-2.4.17/drivers/scsi/ips.c Fri Nov 9 14:05:06 2001 +++ linux-2.4.17-lse02-D/drivers/scsi/ips.c Thu Apr 4 17:38:11 2002 @@ -846,6 +846,13 @@ /* found a controller */ sh = scsi_register(SHT, sizeof(ips_ha_t)); + /* + * Set pci_dev and dma_mask + */ + pci_set_dma_mask(dev[i], (u64) 0xffffffff); + + scsi_set_pci_device(sh, dev[i]); + if (sh == NULL) { printk(KERN_WARNING "(%s%d) Unable to register controller with SCSI subsystem - skipping controller\n", ips_name, ips_next_controller); @@ -3568,7 +3575,7 @@ Scsi_Cmnd *p; Scsi_Cmnd *q; ips_copp_wait_item_t *item; - int ret; + int ret, sg_entries = 0; int intr_status; unsigned long cpu_flags; unsigned long cpu_flags2; @@ -3767,6 +3774,8 @@ int i; sg = SC->request_buffer; + scb->org_sg_list = sg; + sg_entries = pci_map_sg(ha->pcidev, sg, SC->use_sg, ips_command_direction[scb->scsi_cmd->cmnd[0]]); if (SC->use_sg == 1) { if (sg[0].length > ha->max_xfer) { @@ -3776,12 +3785,12 @@ scb->data_len = sg[0].length; scb->dcdb.transfer_length = scb->data_len; - scb->data_busaddr = VIRT_TO_BUS(sg[0].address); + scb->data_busaddr = sg_dma_address(&sg[0]); scb->sg_len = 0; } else { /* Check for the first Element being bigger than MAX_XFER */ if (sg[0].length > ha->max_xfer) { - scb->sg_list[0].address =
cpu_to_le32(VIRT_TO_BUS(sg[0].address)); + scb->sg_list[0].address = cpu_to_le32(sg_dma_address(&sg[0])); scb->sg_list[0].length = ha->max_xfer; scb->data_len = ha->max_xfer; scb->breakup = 0; @@ -3790,7 +3799,7 @@ } else { for (i = 0; i < SC->use_sg; i++) { - scb->sg_list[i].address = cpu_to_le32(VIRT_TO_BUS(sg[i].address)); + scb->sg_list[i].address = cpu_to_le32(sg_dma_address(&sg[i])); scb->sg_list[i].length = cpu_to_le32(sg[i].length); if (scb->data_len + sg[i].length > ha->max_xfer) { @@ -3805,7 +3814,7 @@ } if (!scb->breakup) - scb->sg_len = SC->use_sg; + scb->sg_len = sg_entries; else scb->sg_len = scb->breakup; } @@ -4465,11 +4474,11 @@ if (sg[0].length - (bk_save * ha->max_xfer) > ha->max_xfer) { /* Further breakup required */ scb->data_len = ha->max_xfer; - scb->data_busaddr = VIRT_TO_BUS(sg[0].address + (bk_save * ha->max_xfer)); + scb->data_busaddr = sg_dma_address(&sg[0]) + (bk_save * ha->max_xfer); scb->breakup = bk_save + 1; } else { scb->data_len = sg[0].length - (bk_save * ha->max_xfer); - scb->data_busaddr = VIRT_TO_BUS(sg[0].address + (bk_save * ha->max_xfer)); + scb->data_busaddr = sg_dma_address(&sg[0]) + (bk_save * ha->max_xfer); } scb->dcdb.transfer_length = scb->data_len; @@ -4486,7 +4495,7 @@ /* pointed to by bk_save */ if (scb->sg_break) { scb->sg_len = 1; - scb->sg_list[0].address = VIRT_TO_BUS(sg[bk_save].address+ha->max_xfer*scb->sg_break); + scb->sg_list[0].address = sg_dma_address(&sg[bk_save]) + ha->max_xfer*scb->sg_break; if (ha->max_xfer > sg[bk_save].length-ha->max_xfer * scb->sg_break) scb->sg_list[0].length = sg[bk_save].length-ha->max_xfer * scb->sg_break; else @@ -4504,7 +4513,7 @@ } else { /* ( sg_break == 0 ), so this is our first look at a new sg piece */ if (sg[bk_save].length > ha->max_xfer) { - scb->sg_list[0].address = cpu_to_le32(VIRT_TO_BUS(sg[bk_save].address)); + scb->sg_list[0].address = cpu_to_le32(sg_dma_address(&sg[bk_save])); scb->sg_list[0].length = ha->max_xfer; scb->breakup = bk_save; scb->sg_break = 1; @@
-4517,7 +4526,7 @@ scb->sg_break = 0; /* We're only doing full units here */ for (i = bk_save; i < scb->scsi_cmd->use_sg; i++) { - scb->sg_list[i - bk_save].address = cpu_to_le32(VIRT_TO_BUS(sg[i].address)); + scb->sg_list[i - bk_save].address = cpu_to_le32(sg_dma_address(&sg[i])); scb->sg_list[i - bk_save].length = cpu_to_le32(sg[i].length); if (scb->data_len + sg[i].length > ha->max_xfer) { scb->breakup = i; /* sneaky, if not more work, than breakup is 0 */ @@ -4585,6 +4594,7 @@ break; } /* end case */ + pci_unmap_sg(ha->pcidev, scb->org_sg_list, scb->sg_len, ips_command_direction[scb->scsi_cmd->cmnd[0]]); return ; } #ifndef NO_IPS_CMDLINE diff -X dontdiff -ruN linux-2.4.17/drivers/scsi/ips.h linux-2.4.17-lse02-D/drivers/scsi/ips.h --- linux-2.4.17/drivers/scsi/ips.h Sun Sep 30 12:26:08 2001 +++ linux-2.4.17-lse02-D/drivers/scsi/ips.h Thu Apr 4 17:38:11 2002 @@ -472,7 +472,9 @@ present : 0, \ unchecked_isa_dma : 0, \ use_clustering : ENABLE_CLUSTERING, \ - use_new_eh_code : 1 \ + use_new_eh_code : 1, \ + highmem_io : 1, \ + concurrent_queue : 1 \ } #endif @@ -1083,6 +1085,7 @@ u_int32_t flags; u_int32_t op_code; IPS_SG_LIST *sg_list; + struct scatterlist *org_sg_list; Scsi_Cmnd *scsi_cmd; struct ips_scb *q_next; ips_scb_callback callback; diff -X dontdiff -ruN linux-2.4.17/drivers/scsi/megaraid.h linux-2.4.17-lse02-D/drivers/scsi/megaraid.h --- linux-2.4.17/drivers/scsi/megaraid.h Thu Oct 25 13:53:51 2001 +++ linux-2.4.17-lse02-D/drivers/scsi/megaraid.h Thu Apr 4 17:37:17 2002 @@ -223,7 +223,8 @@ cmd_per_lun: MAX_CMD_PER_LUN, /* SCSI Commands per LUN */\ present: 0, /* Present */\ unchecked_isa_dma: 0, /* Default Unchecked ISA DMA */\ - use_clustering: ENABLE_CLUSTERING /* Enable Clustering */\ + use_clustering: ENABLE_CLUSTERING, /* Enable Clustering */\ + highmem_io: 1, /* enable HIGHMEM I/O */ \ } #endif diff -X dontdiff -ruN linux-2.4.17/drivers/scsi/qlogicfc.h linux-2.4.17-lse02-D/drivers/scsi/qlogicfc.h --- linux-2.4.17/drivers/scsi/qlogicfc.h Sun Oct 21 
10:36:54 2001 +++ linux-2.4.17-lse02-D/drivers/scsi/qlogicfc.h Thu Apr 4 17:37:17 2002 @@ -95,7 +95,8 @@ cmd_per_lun: QLOGICFC_CMD_PER_LUN, \ present: 0, \ unchecked_isa_dma: 0, \ - use_clustering: ENABLE_CLUSTERING \ + use_clustering: ENABLE_CLUSTERING, \ + highmem_io: 1 \ } #endif /* _QLOGICFC_H */ diff -X dontdiff -ruN linux-2.4.17/drivers/scsi/scsi.c linux-2.4.17-lse02-D/drivers/scsi/scsi.c --- linux-2.4.17/drivers/scsi/scsi.c Fri Dec 21 09:41:55 2001 +++ linux-2.4.17-lse02-D/drivers/scsi/scsi.c Thu Apr 4 17:37:17 2002 @@ -186,10 +186,13 @@ * handler in the list - ultimately they call scsi_request_fn * to do the dirty deed. */ -void scsi_initialize_queue(Scsi_Device * SDpnt, struct Scsi_Host * SHpnt) { - blk_init_queue(&SDpnt->request_queue, scsi_request_fn); - blk_queue_headactive(&SDpnt->request_queue, 0); - SDpnt->request_queue.queuedata = (void *) SDpnt; + +void scsi_initialize_queue(Scsi_Device * SDpnt, struct Scsi_Host * SHpnt) +{ + request_queue_t *q = &SDpnt->request_queue; + + scsi_init_queue(q, SHpnt->hostt->concurrent_queue); + q->queuedata = (void *) SDpnt; } #ifdef MODULE diff -X dontdiff -ruN linux-2.4.17/drivers/scsi/scsi.h linux-2.4.17-lse02-D/drivers/scsi/scsi.h --- linux-2.4.17/drivers/scsi/scsi.h Thu Nov 22 11:49:15 2001 +++ linux-2.4.17-lse02-D/drivers/scsi/scsi.h Thu Apr 4 17:37:17 2002 @@ -386,15 +386,6 @@ #define ASKED_FOR_SENSE 0x20 #define SYNC_RESET 0x40 -#if defined(__mc68000__) || defined(CONFIG_APUS) -#include -#define CONTIGUOUS_BUFFERS(X,Y) \ - (virt_to_phys((X)->b_data+(X)->b_size-1)+1==virt_to_phys((Y)->b_data)) -#else -#define CONTIGUOUS_BUFFERS(X,Y) ((X->b_data+X->b_size) == Y->b_data) -#endif - - /* * This is the crap from the old error handling code. We have it in a special * place so that we can more easily delete it later on. 
@@ -633,7 +624,7 @@ struct scatterlist *buffer; /* which buffer */ int buffers_residual; /* how many buffers left */ - dma_addr_t dma_handle; + dma_addr_t dma_handle; volatile int Status; volatile int Message; diff -X dontdiff -ruN linux-2.4.17/drivers/scsi/scsi_lib.c linux-2.4.17-lse02-D/drivers/scsi/scsi_lib.c --- linux-2.4.17/drivers/scsi/scsi_lib.c Fri Oct 12 15:35:54 2001 +++ linux-2.4.17-lse02-D/drivers/scsi/scsi_lib.c Thu Apr 4 17:37:17 2002 @@ -84,14 +84,16 @@ * head of the queue for things like a QUEUE_FULL message from a * device, or a host that is unable to accept a particular command. */ - spin_lock_irqsave(&io_request_lock, flags); + spin_lock_irqsave(&io_request_lock, flags); + rq_lock(q); if (at_head) list_add(&rq->queue, &q->queue_head); else list_add_tail(&rq->queue, &q->queue_head); q->request_fn(q); + rq_unlock(q); spin_unlock_irqrestore(&io_request_lock, flags); } @@ -262,13 +264,17 @@ * the bad sector. */ SCpnt->request.special = (void *) SCpnt; + rq_lock(q); list_add(&SCpnt->request.queue, &q->queue_head); + rq_unlock(q); } /* * Just hit the requeue function for the queue. */ + rq_lock(q); q->request_fn(q); + rq_unlock(q); SDpnt = (Scsi_Device *) q->queuedata; SHpnt = SDpnt->host; @@ -296,7 +302,9 @@ break; } q = &SDpnt->request_queue; + rq_lock(q); q->request_fn(q); + rq_unlock(q); } } @@ -321,7 +329,9 @@ continue; } q = &SDpnt->request_queue; + rq_lock(q); q->request_fn(q); + rq_unlock(q); all_clear = 0; } if (SDpnt == NULL && all_clear) { @@ -388,6 +398,7 @@ req->nr_sectors -= nsect; req->current_nr_sectors = bh->b_size >> 9; + req->hard_cur_sectors = req->current_nr_sectors; if (req->nr_sectors < req->current_nr_sectors) { req->nr_sectors = req->current_nr_sectors; printk("scsi_end_request: buffer-list destroyed\n"); @@ -410,7 +421,6 @@ q = &SCpnt->device->request_queue; - req->buffer = bh->b_data; /* * Bleah. Leftovers again. Stick the leftovers in * the front of the queue, and goose the queue again. 
@@ -489,6 +499,8 @@ */ static void scsi_release_buffers(Scsi_Cmnd * SCpnt) { + struct request *req = &SCpnt->request; + ASSERT_LOCK(&io_request_lock, 0); /* @@ -499,7 +511,7 @@ void **bbpnt; int i; - sgpnt = (struct scatterlist *) SCpnt->request_buffer; + sgpnt = (struct scatterlist *) req->buffer; bbpnt = SCpnt->bounce_buffers; if (bbpnt) { @@ -548,6 +560,7 @@ int result = SCpnt->result; int this_count = SCpnt->bufflen >> 9; request_queue_t *q = &SCpnt->device->request_queue; + struct request *req = &SCpnt->request; /* * We must do one of several things here: @@ -580,7 +593,7 @@ if (bbpnt) { for (i = 0; i < SCpnt->use_sg; i++) { if (bbpnt[i]) { - if (SCpnt->request.cmd == READ) { + if (req->cmd == READ) { memcpy(bbpnt[i], sgpnt[i].address, sgpnt[i].length); @@ -592,9 +605,11 @@ scsi_free(SCpnt->buffer, SCpnt->sglist_len); } else { if (SCpnt->buffer != SCpnt->request.buffer) { - if (SCpnt->request.cmd == READ) { - memcpy(SCpnt->request.buffer, SCpnt->buffer, - SCpnt->bufflen); + if (req->cmd == READ) { + unsigned long flags; + char *to = bh_kmap_irq(req->bh, &flags); + memcpy(to, SCpnt->buffer, SCpnt->bufflen); + bh_kunmap_irq(to, &flags); } scsi_free(SCpnt->buffer, SCpnt->bufflen); } @@ -619,7 +634,7 @@ good_sectors)); SCSI_LOG_HLCOMPLETE(1, printk("use_sg is %d\n ", SCpnt->use_sg)); - SCpnt->request.errors = 0; + req->errors = 0; /* * If multiple sectors are requested in one buffer, then * they will have been finished off by the first command. 
@@ -818,6 +833,234 @@ return NULL; } +extern int batch_requests; +extern int get_max_sectors(kdev_t); +extern void attempt_back_merge(request_queue_t *, struct request *, int, int); +extern void attempt_front_merge(request_queue_t *, struct list_head *, + struct request *, int, int); + +static void scsi_plug_device(request_queue_t *q, kdev_t dev) +{ + /* + * no need to replug device + */ + if (!list_empty(&q->queue_head) || q->plugged) + return; + + q->plugged = 1; + queue_task(&q->plug_tq, &tq_disk); +} + +static void scsi_unplug_device(void *data) +{ + request_queue_t *q = (request_queue_t *) data; + unsigned long flags; + + spin_lock_irqsave(&io_request_lock, flags); + rq_lock(q); + if (q->plugged) { + q->plugged = 0; + if (!list_empty(&q->queue_head)) + q->request_fn(q); + } + rq_unlock(q); + spin_unlock_irqrestore(&io_request_lock, flags); +} + +static inline struct request *scsi_get_request(request_queue_t *q, int rw) +{ + struct request *rq = NULL; + struct request_list *rl = q->rq + rw; + + if (!list_empty(&rl->free)) { + rq = blkdev_free_rq(&rl->free); + list_del(&rq->queue); + rl->count--; + rq->rq_status = RQ_ACTIVE; + rq->special = NULL; + rq->q = q; + } + + return rq; +} + +static struct request *scsi_get_request_wait(request_queue_t *q, int rw) +{ + register struct request *rq; + DECLARE_WAITQUEUE(wait, current); + + scsi_unplug_device(q); + add_wait_queue(&q->wait_for_request, &wait); + do { + set_current_state(TASK_UNINTERRUPTIBLE); + if (q->rq[rw].count < batch_requests) + schedule(); + xrq_lock_irq(q); + rq = scsi_get_request(q, rw); + xrq_unlock_irq(q); + } while (rq == NULL); + remove_wait_queue(&q->wait_for_request, &wait); + current->state = TASK_RUNNING; + return rq; +} + +static int scsi_make_request(request_queue_t * q, int rw, + struct buffer_head * bh) +{ + unsigned int sector, count; + int max_segments = MAX_SEGMENTS; + struct request * req, *freereq = NULL; + int rw_ahead, max_sectors, el_ret; + struct list_head *head, *insert_here; + 
int latency; + elevator_t *elevator = &q->elevator; + + count = bh->b_size >> 9; + sector = bh->b_rsector; + + rw_ahead = 0; /* normal case; gets changed below for READA */ + switch (rw) { + case READA: + rw_ahead = 1; + rw = READ; /* drop into READ */ + case READ: + case WRITE: + latency = elevator_request_latency(elevator, rw); + break; + default: + BUG(); + goto end_io; + } + + /* We'd better have a real physical mapping! + Check this bit only if the buffer was dirty and just locked + down by us so at this point flushpage will block and + won't clear the mapped bit under us. */ + if (!buffer_mapped(bh)) + BUG(); + + /* + * Temporary solution - in 2.5 this will be done by the lowlevel + * driver. Create a bounce buffer if the buffer data points into + * high memory - keep the original buffer otherwise. + */ + bh = blk_queue_bounce(q, rw, bh); + +/* look for a free request. */ + /* + * Try to coalesce the new request with old requests + */ + max_sectors = get_max_sectors(bh->b_rdev); + +again: + req = NULL; + head = &q->queue_head; + /* + * Now we acquire the request spinlock, we have to be mega careful + * not to schedule or do something nonatomic + */ + xrq_lock_irq(q); + + insert_here = head->prev; + if (list_empty(head)) { + q->plug_device_fn(q, bh->b_rdev); /* is atomic */ + goto get_rq; + } + + el_ret = elevator->elevator_merge_fn(q, &req, head, bh, rw,max_sectors); + switch (el_ret) { + + case ELEVATOR_BACK_MERGE: + if (!q->back_merge_fn(q, req, bh, max_segments)) + break; + elevator->elevator_merge_cleanup_fn(q, req, count); + req->bhtail->b_reqnext = bh; + req->bhtail = bh; + req->nr_sectors = req->hard_nr_sectors += count; + blk_started_io(count); + drive_stat_acct(req->rq_dev, req->cmd, count, 0); + attempt_back_merge(q, req, max_sectors, max_segments); + goto out; + + case ELEVATOR_FRONT_MERGE: + if (!q->front_merge_fn(q, req, bh, max_segments)) + break; + elevator->elevator_merge_cleanup_fn(q, req, count); + bh->b_reqnext = req->bh; + req->bh = bh; + 
req->buffer = bh->b_data; + req->current_nr_sectors = count; + req->sector = req->hard_sector = sector; + req->nr_sectors = req->hard_nr_sectors += count; + blk_started_io(count); + drive_stat_acct(req->rq_dev, req->cmd, count, 0); + attempt_front_merge(q, head, req, max_sectors, max_segments); + goto out; + + /* + * elevator says don't/can't merge. get new request + */ + case ELEVATOR_NO_MERGE: + /* + * use elevator hints as to where to insert the + * request. if no hints, just add it to the back + * of the queue + */ + if (req) + insert_here = &req->queue; + break; + + default: + printk("elevator returned crap (%d)\n", el_ret); + BUG(); + } + + /* + * Grab a free request from the freelist - if that is empty, check + * if we are doing read ahead and abort instead of blocking for + * a free slot. + */ +get_rq: + if (freereq) { + req = freereq; + freereq = NULL; + } else if ((req = scsi_get_request(q, rw)) == NULL) { + xrq_unlock_irq(q); + if (rw_ahead) + goto end_io; + + freereq = scsi_get_request_wait(q, rw); + goto again; + } + +/* fill up the request-info, and add it to the queue */ + req->elevator_sequence = latency; + req->cmd = rw; + req->errors = 0; + req->hard_sector = req->sector = sector; + req->hard_nr_sectors = req->nr_sectors = count; + req->current_nr_sectors = count; + req->nr_segments = 1; /* Always 1 for a new request. */ + req->nr_hw_segments = 1; /* Always 1 for a new request. 
*/ + req->buffer = bh->b_data; + req->waiting = NULL; + req->bh = bh; + req->bhtail = bh; + req->rq_dev = bh->b_rdev; + blk_started_io(count); + drive_stat_acct(req->rq_dev, req->cmd, req->nr_sectors, 1); + list_add(&req->queue, insert_here); + +out: + if (freereq) + blkdev_release_request(freereq); + xrq_unlock_irq(q); + return 0; +end_io: + bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state)); + return 0; +} + /* * Function: scsi_request_fn() * @@ -913,9 +1156,11 @@ */ SDpnt->was_reset = 0; if (SDpnt->removable && !in_interrupt()) { + rq_unlock(q); spin_unlock_irq(&io_request_lock); scsi_ioctl(SDpnt, SCSI_IOCTL_DOORLOCK, 0); spin_lock_irq(&io_request_lock); + rq_lock(q); continue; } } @@ -1024,7 +1269,6 @@ * another. */ req = NULL; - spin_unlock_irq(&io_request_lock); if (SCpnt->request.cmd != SPECIAL) { /* @@ -1054,7 +1298,6 @@ { panic("Should not have leftover blocks\n"); } - spin_lock_irq(&io_request_lock); SHpnt->host_busy--; SDpnt->device_busy--; continue; @@ -1070,7 +1313,6 @@ { panic("Should not have leftover blocks\n"); } - spin_lock_irq(&io_request_lock); SHpnt->host_busy--; SDpnt->device_busy--; continue; @@ -1085,14 +1327,30 @@ /* * Dispatch the command to the low-level driver. */ + rq_unlock(q); + spin_unlock_irq(&io_request_lock); scsi_dispatch_cmd(SCpnt); - - /* - * Now we need to grab the lock again. We are about to mess - * with the request queue and try to find another command. 
- */ spin_lock_irq(&io_request_lock); + rq_lock(q); } +} + +extern void blk_init_free_list(request_queue_t *); + +void scsi_init_queue(request_queue_t * q, unsigned concurrent_queue) +{ + INIT_LIST_HEAD(&q->queue_head); + elevator_init(&q->elevator, ELEVATOR_LINUS); + blk_init_free_list(q); + q->request_fn = scsi_request_fn; + q->make_request_fn = scsi_make_request; + q->plug_tq.sync = 0; + q->plug_tq.routine = scsi_unplug_device; + q->plug_tq.data = q; + q->plugged = 0; + q->plug_device_fn = scsi_plug_device; + q->head_active = 0; + q->concurrent = concurrent_queue; } /* diff -X dontdiff -ruN linux-2.4.17/drivers/scsi/scsi_merge.c linux-2.4.17-lse02-D/drivers/scsi/scsi_merge.c --- linux-2.4.17/drivers/scsi/scsi_merge.c Thu Oct 25 14:05:31 2001 +++ linux-2.4.17-lse02-D/drivers/scsi/scsi_merge.c Thu Apr 4 17:37:17 2002 @@ -6,6 +6,7 @@ * Based upon conversations with large numbers * of people at Linux Expo. * Support for dynamic DMA mapping: Jakub Jelinek (jakub@redhat.com). + * Support for highmem I/O: Jens Axboe */ /* @@ -48,7 +49,6 @@ #include #include - #define __KERNEL_SYSCALLS__ #include @@ -95,7 +95,7 @@ printk("Segment 0x%p, blocks %d, addr 0x%lx\n", bh, bh->b_size >> 9, - virt_to_phys(bh->b_data - 1)); + bh_phys(bh) - 1); } panic("Ththththaats all folks. Too dangerous to continue.\n"); } @@ -223,11 +223,10 @@ * DMA capable host, make sure that a segment doesn't span * the DMA threshold boundary. */ - if (dma_host && - virt_to_phys(bhnext->b_data) - 1 == ISA_DMA_THRESHOLD) { + if (dma_host && bh_phys(bhnext) - 1 == ISA_DMA_THRESHOLD) { ret++; reqsize = bhnext->b_size; - } else if (CONTIGUOUS_BUFFERS(bh, bhnext)) { + } else if (BH_CONTIG(bh, bhnext)) { /* * This one is OK. Let it go. */ @@ -241,8 +240,7 @@ * kind of screwed and we need to start * another segment. 
*/ - if( dma_host - && virt_to_phys(bh->b_data) - 1 >= ISA_DMA_THRESHOLD + if( dma_host && bh_phys(bh) - 1 >= ISA_DMA_THRESHOLD && reqsize + bhnext->b_size > PAGE_SIZE ) { ret++; @@ -304,7 +302,7 @@ } #define MERGEABLE_BUFFERS(X,Y) \ -(((((long)(X)->b_data+(X)->b_size)|((long)(Y)->b_data)) & \ +(((((long)bh_phys((X))+(X)->b_size)|((long)bh_phys((Y)))) & \ (DMA_CHUNK_SIZE - 1)) == 0) #ifdef DMA_CHUNK_SIZE @@ -420,6 +418,9 @@ if ((req->nr_sectors + (bh->b_size >> 9)) > SHpnt->max_sectors) return 0; + if (!BH_PHYS_4G(req->bhtail, bh)) + return 0; + if (use_clustering) { /* * See if we can do this without creating another @@ -427,14 +428,11 @@ * DMA capable host, make sure that a segment doesn't span * the DMA threshold boundary. */ - if (dma_host && - virt_to_phys(req->bhtail->b_data) - 1 == ISA_DMA_THRESHOLD) { + if (dma_host && bh_phys(req->bhtail) - 1 == ISA_DMA_THRESHOLD) goto new_end_segment; - } - if (CONTIGUOUS_BUFFERS(req->bhtail, bh)) { + if (BH_CONTIG(req->bhtail, bh)) { #ifdef DMA_SEGMENT_SIZE_LIMITED - if( dma_host - && virt_to_phys(bh->b_data) - 1 >= ISA_DMA_THRESHOLD ) { + if (dma_host && bh_phys(bh) - 1 >= ISA_DMA_THRESHOLD) { segment_size = 0; count = __count_segments(req, use_clustering, dma_host, &segment_size); if( segment_size + bh->b_size > PAGE_SIZE ) { @@ -479,6 +477,9 @@ if ((req->nr_sectors + (bh->b_size >> 9)) > SHpnt->max_sectors) return 0; + if (!BH_PHYS_4G(bh, req->bh)) + return 0; + if (use_clustering) { /* * See if we can do this without creating another @@ -486,14 +487,12 @@ * DMA capable host, make sure that a segment doesn't span * the DMA threshold boundary. 
*/ - if (dma_host && - virt_to_phys(bh->b_data) - 1 == ISA_DMA_THRESHOLD) { + if (dma_host && bh_phys(bh) - 1 == ISA_DMA_THRESHOLD) { goto new_start_segment; } - if (CONTIGUOUS_BUFFERS(bh, req->bh)) { + if (BH_CONTIG(bh, req->bh)) { #ifdef DMA_SEGMENT_SIZE_LIMITED - if( dma_host - && virt_to_phys(bh->b_data) - 1 >= ISA_DMA_THRESHOLD ) { + if (dma_host && bh_phys(bh) - 1 >= ISA_DMA_THRESHOLD) { segment_size = bh->b_size; count = __count_segments(req, use_clustering, dma_host, &segment_size); if( count != req->nr_segments ) { @@ -641,6 +640,9 @@ if ((req->nr_sectors + next->nr_sectors) > SHpnt->max_sectors) return 0; + if (!BH_PHYS_4G(req->bhtail, next->bh)) + return 0; + /* * The main question is whether the two segments at the boundaries * would be considered one or two. @@ -652,18 +654,15 @@ * DMA capable host, make sure that a segment doesn't span * the DMA threshold boundary. */ - if (dma_host && - virt_to_phys(req->bhtail->b_data) - 1 == ISA_DMA_THRESHOLD) { + if (dma_host && bh_phys(req->bhtail) - 1 == ISA_DMA_THRESHOLD) goto dont_combine; - } #ifdef DMA_SEGMENT_SIZE_LIMITED /* * We currently can only allocate scatter-gather bounce * buffers in chunks of PAGE_SIZE or less. */ - if (dma_host - && CONTIGUOUS_BUFFERS(req->bhtail, next->bh) - && virt_to_phys(req->bhtail->b_data) - 1 >= ISA_DMA_THRESHOLD ) + if (dma_host && BH_CONTIG(req->bhtail, next->bh) + && bh_phys(req->bhtail) - 1 >= ISA_DMA_THRESHOLD) { int segment_size = 0; int count = 0; @@ -675,7 +674,7 @@ } } #endif - if (CONTIGUOUS_BUFFERS(req->bhtail, next->bh)) { + if (BH_CONTIG(req->bhtail, next->bh)) { /* * This one is OK. Let it go. */ @@ -803,37 +802,13 @@ char * buff; int count; int i; - struct request * req; + struct request * req = &SCpnt->request; int sectors; struct scatterlist * sgpnt; int this_count; void ** bbpnt; /* - * FIXME(eric) - don't inline this - it doesn't depend on the - * integer flags. Come to think of it, I don't think this is even - * needed any more. 
Need to play with it and see if we hit the - * panic. If not, then don't bother. - */ - if (!SCpnt->request.bh) { - /* - * Case of page request (i.e. raw device), or unlinked buffer - * Typically used for swapping, but this isn't how we do - * swapping any more. - */ - panic("I believe this is dead code. If we hit this, I was wrong"); -#if 0 - SCpnt->request_bufflen = SCpnt->request.nr_sectors << 9; - SCpnt->request_buffer = SCpnt->request.buffer; - SCpnt->use_sg = 0; - /* - * FIXME(eric) - need to handle DMA here. - */ -#endif - return 1; - } - req = &SCpnt->request; - /* * First we need to know how many scatter gather segments are needed. */ if (!sg_count_valid) { @@ -848,21 +823,27 @@ * buffer. */ if (dma_host && scsi_dma_free_sectors <= 10) { - this_count = SCpnt->request.current_nr_sectors; + this_count = req->current_nr_sectors; goto single_segment; } /* - * Don't bother with scatter-gather if there is only one segment. - */ - if (count == 1) { - this_count = SCpnt->request.nr_sectors; + * we really want to use sg even for a single segment request, + * however some people just cannot be bothered to write decent + * driver code so we can't risk to break somebody making the + * assumption that sg requests will always contain at least 2 + * segments. if the driver is 32-bit dma safe, then use sg for + * 1 entry anyways. if not, don't rely on the driver handling this + * case. + */ + if (count == 1 && !SCpnt->host->highmem_io) { + this_count = req->current_nr_sectors; goto single_segment; } - SCpnt->use_sg = count; - /* - * Allocate the actual scatter-gather table itself. + /* + * for sane drivers, use sg even for 1 entry request */ + SCpnt->use_sg = count; SCpnt->sglist_len = (SCpnt->use_sg * sizeof(struct scatterlist)); /* If we could potentially require ISA bounce buffers, allocate @@ -887,7 +868,7 @@ * simply write the first buffer all by itself. 
*/ printk("Warning - running *really* short on DMA buffers\n"); - this_count = SCpnt->request.current_nr_sectors; + this_count = req->current_nr_sectors; goto single_segment; } /* @@ -907,13 +888,11 @@ SCpnt->bounce_buffers = bbpnt; - for (count = 0, bh = SCpnt->request.bh; - bh; bh = bh->b_reqnext) { + for (count = 0, bh = req->bh; bh; bh = bh->b_reqnext) { if (use_clustering && bhprev != NULL) { - if (dma_host && - virt_to_phys(bhprev->b_data) - 1 == ISA_DMA_THRESHOLD) { + if (dma_host && bh_phys(bhprev) - 1 == ISA_DMA_THRESHOLD) { /* Nothing - fall through */ - } else if (CONTIGUOUS_BUFFERS(bhprev, bh)) { + } else if (BH_CONTIG(bhprev, bh)) { /* * This one is OK. Let it go. Note that we * do not have the ability to allocate @@ -922,7 +901,7 @@ */ if( dma_host ) { #ifdef DMA_SEGMENT_SIZE_LIMITED - if( virt_to_phys(bh->b_data) - 1 < ISA_DMA_THRESHOLD + if (bh_phys(bh) - 1 < ISA_DMA_THRESHOLD || sgpnt[count - 1].length + bh->b_size <= PAGE_SIZE ) { sgpnt[count - 1].length += bh->b_size; bhprev = bh; @@ -941,13 +920,25 @@ } } } - count++; - sgpnt[count - 1].address = bh->b_data; - sgpnt[count - 1].page = NULL; - sgpnt[count - 1].length += bh->b_size; - if (!dma_host) { - SCpnt->request_bufflen += bh->b_size; + + if (SCpnt->host->highmem_io) { + sgpnt[count].page = bh->b_page; + sgpnt[count].offset = bh_offset(bh); + sgpnt[count].address = NULL; + } else { + if (PageHighMem(bh->b_page)) + BUG(); + + sgpnt[count].page = NULL; + sgpnt[count].address = bh->b_data; } + + sgpnt[count].length = bh->b_size; + + if (!dma_host) + SCpnt->request_bufflen += bh->b_size; + + count++; bhprev = bh; } @@ -970,6 +961,10 @@ for (i = 0; i < count; i++) { sectors = (sgpnt[i].length >> 9); SCpnt->request_bufflen += sgpnt[i].length; + /* + * only done for dma_host, in which case .page is not + * set since it's guarenteed to be a low memory page + */ if (virt_to_phys(sgpnt[i].address) + sgpnt[i].length - 1 > ISA_DMA_THRESHOLD) { if( scsi_dma_free_sectors - sectors <= 10 ) { @@ -1005,7 
+1000,7 @@ } break; } - if (SCpnt->request.cmd == WRITE) { + if (req->cmd == WRITE) { memcpy(sgpnt[i].address, bbpnt[i], sgpnt[i].length); } @@ -1050,8 +1045,7 @@ * single-block requests if we had hundreds of free sectors. */ if( scsi_dma_free_sectors > 30 ) { - for (this_count = 0, bh = SCpnt->request.bh; - bh; bh = bh->b_reqnext) { + for (this_count = 0, bh = req->bh; bh; bh = bh->b_reqnext) { if( scsi_dma_free_sectors - this_count < 30 || this_count == sectors ) { @@ -1064,7 +1058,7 @@ /* * Yow! Take the absolute minimum here. */ - this_count = SCpnt->request.current_nr_sectors; + this_count = req->current_nr_sectors; } /* @@ -1077,28 +1071,31 @@ * segment. Possibly the entire request, or possibly a small * chunk of the entire request. */ - bh = SCpnt->request.bh; - buff = SCpnt->request.buffer; + bh = req->bh; + buff = req->buffer = bh->b_data; - if (dma_host) { + if (dma_host || PageHighMem(bh->b_page)) { /* * Allocate a DMA bounce buffer. If the allocation fails, fall * back and allocate a really small one - enough to satisfy * the first buffer. 
*/ - if (virt_to_phys(SCpnt->request.bh->b_data) - + (this_count << 9) - 1 > ISA_DMA_THRESHOLD) { + if (bh_phys(bh) + (this_count << 9) - 1 > ISA_DMA_THRESHOLD) { buff = (char *) scsi_malloc(this_count << 9); if (!buff) { printk("Warning - running low on DMA memory\n"); - this_count = SCpnt->request.current_nr_sectors; + this_count = req->current_nr_sectors; buff = (char *) scsi_malloc(this_count << 9); if (!buff) { dma_exhausted(SCpnt, 0); } } - if (SCpnt->request.cmd == WRITE) - memcpy(buff, (char *) SCpnt->request.buffer, this_count << 9); + if (req->cmd == WRITE) { + unsigned long flags; + char *buf = bh_kmap_irq(bh, &flags); + memcpy(buff, buf, this_count << 9); + bh_kunmap_irq(buf, &flags); + } } } SCpnt->request_bufflen = this_count << 9; @@ -1139,21 +1136,11 @@ */ void initialize_merge_fn(Scsi_Device * SDpnt) { - request_queue_t *q; - struct Scsi_Host *SHpnt; - SHpnt = SDpnt->host; - - q = &SDpnt->request_queue; + struct Scsi_Host *SHpnt = SDpnt->host; + request_queue_t *q = &SDpnt->request_queue; + dma64_addr_t bounce_limit; /* - * If the host has already selected a merge manager, then don't - * pick a new one. - */ -#if 0 - if (q->back_merge_fn && q->front_merge_fn) - return; -#endif - /* * If this host has an unlimited tablesize, then don't bother with a * merge manager. The whole point of the operation is to make sure * that requests don't grow too large, and this host isn't picky. @@ -1185,4 +1172,20 @@ q->merge_requests_fn = scsi_merge_requests_fn_dc; SDpnt->scsi_init_io_fn = scsi_init_io_vdc; } + + /* + * now enable highmem I/O, if appropriate + */ + bounce_limit = BLK_BOUNCE_HIGH; + if (SHpnt->highmem_io && (SDpnt->type == TYPE_DISK)) { + if (!PCI_DMA_BUS_IS_PHYS) + /* Platforms with virtual-DMA translation + * hardware have no practical limit. 
+ */ + bounce_limit = BLK_BOUNCE_ANY; + else + bounce_limit = SHpnt->pci_dev->dma_mask; + } + + blk_queue_bounce_limit(q, bounce_limit); } diff -X dontdiff -ruN linux-2.4.17/drivers/scsi/sym53c8xx.h linux-2.4.17-lse02-D/drivers/scsi/sym53c8xx.h --- linux-2.4.17/drivers/scsi/sym53c8xx.h Fri Dec 21 09:41:55 2001 +++ linux-2.4.17-lse02-D/drivers/scsi/sym53c8xx.h Thu Apr 4 17:37:17 2002 @@ -97,7 +97,8 @@ sg_tablesize: SCSI_NCR_SG_TABLESIZE, \ cmd_per_lun: SCSI_NCR_CMD_PER_LUN, \ max_sectors: MAX_SEGMENTS*8, \ - use_clustering: DISABLE_CLUSTERING} + use_clustering: DISABLE_CLUSTERING, \ + highmem_io: 1} #else diff -X dontdiff -ruN linux-2.4.17/drivers/scsi/sym53c8xx_2/sym53c8xx.h linux-2.4.17-lse02-D/drivers/scsi/sym53c8xx_2/sym53c8xx.h --- linux-2.4.17/drivers/scsi/sym53c8xx_2/sym53c8xx.h Fri Dec 21 09:41:55 2001 +++ linux-2.4.17-lse02-D/drivers/scsi/sym53c8xx_2/sym53c8xx.h Thu Apr 4 17:37:17 2002 @@ -119,7 +119,8 @@ this_id: 7, \ sg_tablesize: 0, \ cmd_per_lun: 0, \ - use_clustering: DISABLE_CLUSTERING} + use_clustering: DISABLE_CLUSTERING, \ + highmem_io: 1} #endif /* defined(HOSTS_C) || defined(MODULE) */ diff -X dontdiff -ruN linux-2.4.17/drivers/scsi/sym53c8xx_2/sym_glue.c linux-2.4.17-lse02-D/drivers/scsi/sym53c8xx_2/sym_glue.c --- linux-2.4.17/drivers/scsi/sym53c8xx_2/sym_glue.c Fri Dec 21 09:41:55 2001 +++ linux-2.4.17-lse02-D/drivers/scsi/sym53c8xx_2/sym_glue.c Thu Apr 4 17:37:17 2002 @@ -2140,6 +2140,7 @@ instance->max_cmd_len = 16; #endif instance->select_queue_depths = sym53c8xx_select_queue_depths; + instance->highmem_io = 1; SYM_UNLOCK_HCB(np, flags); diff -X dontdiff -ruN linux-2.4.17/fs/binfmt_elf.c linux-2.4.17-lse02-D/fs/binfmt_elf.c --- linux-2.4.17/fs/binfmt_elf.c Fri Dec 21 09:41:55 2001 +++ linux-2.4.17-lse02-D/fs/binfmt_elf.c Thu Apr 4 17:37:17 2002 @@ -1143,7 +1143,7 @@ psinfo.pr_state = i; psinfo.pr_sname = (i < 0 || i > 5) ? '.' 
: "RSDZTD"[i]; psinfo.pr_zomb = psinfo.pr_sname == 'Z'; - psinfo.pr_nice = current->nice; + psinfo.pr_nice = task_nice(current); psinfo.pr_flag = current->flags; psinfo.pr_uid = NEW_TO_OLD_UID(current->uid); psinfo.pr_gid = NEW_TO_OLD_GID(current->gid); diff -X dontdiff -ruN linux-2.4.17/fs/buffer.c linux-2.4.17-lse02-D/fs/buffer.c --- linux-2.4.17/fs/buffer.c Fri Dec 21 09:41:55 2001 +++ linux-2.4.17-lse02-D/fs/buffer.c Thu Apr 4 17:37:17 2002 @@ -725,9 +725,8 @@ wakeup_bdflush(); try_to_free_pages(zone, GFP_NOFS, 0); run_task_queue(&tq_disk); - current->policy |= SCHED_YIELD; __set_current_state(TASK_RUNNING); - schedule(); + sys_sched_yield(); } void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) @@ -1262,16 +1261,14 @@ void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset) { - bh->b_page = page; if (offset >= PAGE_SIZE) BUG(); - if (PageHighMem(page)) - /* - * This catches illegal uses and preserves the offset: - */ - bh->b_data = (char *)(0 + offset); - else - bh->b_data = page_address(page) + offset; + + /* + * page_address will return NULL anyways for highmem pages + */ + bh->b_data = page_address(page) + offset; + bh->b_page = page; } EXPORT_SYMBOL(set_bh_page); @@ -2061,29 +2058,25 @@ * for them to complete. Clean up the buffer_heads afterwards. */ -static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size) +static int wait_kio(int rw, int nr, struct buffer_head *bh, int size) { int iosize, err; int i; - struct buffer_head *tmp; + struct buffer_head *tmp = bh; iosize = 0; err = 0; for (i = nr; --i >= 0; ) { - iosize += size; - tmp = bh[i]; if (buffer_locked(tmp)) { wait_on_buffer(tmp); } if (!buffer_uptodate(tmp)) { - /* We are traversing bh'es in reverse order so - clearing iosize on error calculates the - amount of IO before the first error. 
*/ - iosize = 0; err = -EIO; } + if (!err) iosize += tmp->b_size; + tmp = tmp->b_next_free; } if (iosize) @@ -2117,7 +2110,8 @@ unsigned long blocknr; struct kiobuf * iobuf = NULL; struct page * map; - struct buffer_head *tmp, **bhs = NULL; + struct buffer_head *tmp, *bhs = NULL; + int iosize = size; if (!nr) return 0; @@ -2143,6 +2137,7 @@ offset = iobuf->offset; length = iobuf->length; iobuf->errno = 0; + blocknr = iobuf->blkno; if (!bhs) bhs = iobuf->bh; @@ -2152,9 +2147,16 @@ err = -EFAULT; goto finished; } + + if (iobuf->pinfo) { + offset = iobuf->pinfo->poffset[pageind]; + length = iobuf->pinfo->plen[pageind]; + } while (length > 0) { - blocknr = b[bufind++]; + if (iobuf->blocks) + blocknr = b[bufind++]; + if (blocknr == -1UL) { if (rw == READ) { /* there was an hole in the filesystem */ @@ -2167,9 +2169,17 @@ } else BUG(); } - tmp = bhs[bhind++]; - tmp->b_size = size; + tmp = bhs; + bhs = bhs->b_next_free; + bhind++; + + if (iobuf->dovary) { + iosize = PAGE_SIZE - offset; + if (iosize > length) + iosize = length; + } + tmp->b_size = iosize; set_bh_page(tmp, map, offset); tmp->b_this_page = tmp; @@ -2185,13 +2195,16 @@ set_bit(BH_Uptodate, &tmp->b_state); atomic_inc(&iobuf->io_count); - submit_bh(rw, tmp); + if (iobuf->dovary) + submit_bh_blknr(rw, tmp); + else + submit_bh(rw, tmp); /* * Wait for IO if we have got too much */ if (bhind >= KIO_MAX_SECTORS) { kiobuf_wait_for_io(iobuf); /* wake-one */ - err = wait_kio(rw, bhind, bhs, size); + err = wait_kio(rw, bhind, iobuf->bh, size); if (err >= 0) transferred += err; else @@ -2200,8 +2213,9 @@ } skip_block: - length -= size; - offset += size; + length -= iosize; + offset += iosize; + blocknr += (iosize/size); if (offset >= PAGE_SIZE) { offset = 0; @@ -2214,7 +2228,7 @@ /* Is there any IO still left to submit? 
*/ if (bhind) { kiobuf_wait_for_io(iobuf); /* wake-one */ - err = wait_kio(rw, bhind, bhs, size); + err = wait_kio(rw, bhind, iobuf->bh, size); if (err >= 0) transferred += err; else diff -X dontdiff -ruN linux-2.4.17/fs/iobuf.c linux-2.4.17-lse02-D/fs/iobuf.c --- linux-2.4.17/fs/iobuf.c Fri Apr 27 14:23:25 2001 +++ linux-2.4.17-lse02-D/fs/iobuf.c Thu Apr 4 17:37:17 2002 @@ -30,29 +30,34 @@ iobuf->maplist = iobuf->map_array; } -int alloc_kiobuf_bhs(struct kiobuf * kiobuf) +int alloc_kiobuf_bhs(struct kiobuf * kiobuf, int nr) { int i; + struct buffer_head *bh; - for (i = 0; i < KIO_MAX_SECTORS; i++) - if (!(kiobuf->bh[i] = kmem_cache_alloc(bh_cachep, SLAB_KERNEL))) { - while (i--) { - kmem_cache_free(bh_cachep, kiobuf->bh[i]); - kiobuf->bh[i] = NULL; - } + for (i = 0; i < nr; i++) { + bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL); + if (!bh) { + free_kiobuf_bhs(kiobuf); return -ENOMEM; } + bh->b_next_free = kiobuf->bh; + kiobuf->bh = bh; + } return 0; } void free_kiobuf_bhs(struct kiobuf * kiobuf) { - int i; + struct buffer_head *bh, *bh_next; - for (i = 0; i < KIO_MAX_SECTORS; i++) { - kmem_cache_free(bh_cachep, kiobuf->bh[i]); - kiobuf->bh[i] = NULL; + bh = kiobuf->bh; + while (bh) { + bh_next = bh->b_next_free; + kmem_cache_free(bh_cachep, bh); + bh = bh_next; } + kiobuf->bh = NULL; } int alloc_kiovec(int nr, struct kiobuf **bufp) @@ -67,7 +72,36 @@ return -ENOMEM; } kiobuf_init(iobuf); - if (alloc_kiobuf_bhs(iobuf)) { + iobuf->blocks = kmalloc(KIO_MAX_SECTORS * sizeof(long), GFP_KERNEL); + if (!iobuf->blocks) { + vfree(iobuf); + free_kiovec(i, bufp); + return -ENOMEM; + } + if (alloc_kiobuf_bhs(iobuf, KIO_MAX_SECTORS )) { + vfree(iobuf); + free_kiovec(i, bufp); + return -ENOMEM; + } + bufp[i] = iobuf; + } + + return 0; +} + +int alloc_kiovec_raw(int nr, struct kiobuf **bufp) +{ + int i; + struct kiobuf *iobuf; + + for (i = 0; i < nr; i++) { + iobuf = vmalloc(sizeof(struct kiobuf)); + if (!iobuf) { + free_kiovec(i, bufp); + return -ENOMEM; + } + 
kiobuf_init(iobuf); + if (alloc_kiobuf_bhs(iobuf, KIO_STATIC_PAGES)) { vfree(iobuf); free_kiovec(i, bufp); return -ENOMEM; @@ -90,16 +124,23 @@ if (iobuf->array_len > KIO_STATIC_PAGES) kfree (iobuf->maplist); free_kiobuf_bhs(iobuf); + if (iobuf->blocks) + kfree (iobuf->blocks); vfree(bufp[i]); } } + int expand_kiobuf(struct kiobuf *iobuf, int wanted) { struct page ** maplist; if (iobuf->array_len >= wanted) return 0; + else if ((iobuf->pinfo) && (wanted > KIO_STATIC_INFO)) { + printk("expand_kiobuf: cannot expand to %d\n", wanted); + return -ENOMEM; + } maplist = (struct page **) kmalloc(wanted * sizeof(struct page **), GFP_KERNEL); diff -X dontdiff -ruN linux-2.4.17/fs/jbd/journal.c linux-2.4.17-lse02-D/fs/jbd/journal.c --- linux-2.4.17/fs/jbd/journal.c Fri Dec 21 09:41:55 2001 +++ linux-2.4.17-lse02-D/fs/jbd/journal.c Thu Apr 4 17:37:17 2002 @@ -460,8 +460,7 @@ printk (KERN_NOTICE __FUNCTION__ ": ENOMEM at get_unused_buffer_head, " "trying again.\n"); - current->policy |= SCHED_YIELD; - schedule(); + yield(); } } while (!new_bh); /* keep subsequent assertions sane */ @@ -1539,8 +1538,7 @@ last_warning = jiffies; } - current->policy |= SCHED_YIELD; - schedule(); + yield(); } } @@ -1598,8 +1596,7 @@ last_warning = jiffies; } while (ret == 0) { - current->policy |= SCHED_YIELD; - schedule(); + yield(); ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); } } diff -X dontdiff -ruN linux-2.4.17/fs/jbd/revoke.c linux-2.4.17-lse02-D/fs/jbd/revoke.c --- linux-2.4.17/fs/jbd/revoke.c Fri Dec 21 09:41:55 2001 +++ linux-2.4.17-lse02-D/fs/jbd/revoke.c Thu Apr 4 17:37:17 2002 @@ -137,8 +137,7 @@ if (!journal_oom_retry) return -ENOMEM; jbd_debug(1, "ENOMEM in " __FUNCTION__ ", retrying.\n"); - current->policy |= SCHED_YIELD; - schedule(); + yield(); goto repeat; } diff -X dontdiff -ruN linux-2.4.17/fs/jbd/transaction.c linux-2.4.17-lse02-D/fs/jbd/transaction.c --- linux-2.4.17/fs/jbd/transaction.c Fri Dec 21 09:41:55 2001 +++ linux-2.4.17-lse02-D/fs/jbd/transaction.c Thu Apr 4 
17:37:17 2002 @@ -1377,8 +1377,7 @@ do { old_handle_count = transaction->t_handle_count; set_current_state(TASK_RUNNING); - current->policy |= SCHED_YIELD; - schedule(); + yield(); } while (old_handle_count != transaction->t_handle_count); } diff -X dontdiff -ruN linux-2.4.17/fs/jffs2/background.c linux-2.4.17-lse02-D/fs/jffs2/background.c --- linux-2.4.17/fs/jffs2/background.c Thu Oct 25 00:07:09 2001 +++ linux-2.4.17-lse02-D/fs/jffs2/background.c Thu Apr 4 17:37:17 2002 @@ -106,9 +106,6 @@ sprintf(current->comm, "jffs2_gcd_mtd%d", c->mtd->index); - /* FIXME in the 2.2 backport */ - current->nice = 10; - for (;;) { spin_lock_irq(&current->sigmask_lock); siginitsetinv (&current->blocked, sigmask(SIGHUP) | sigmask(SIGKILL) | sigmask(SIGSTOP) | sigmask(SIGCONT)); diff -X dontdiff -ruN linux-2.4.17/fs/locks.c linux-2.4.17-lse02-D/fs/locks.c --- linux-2.4.17/fs/locks.c Thu Oct 11 07:52:18 2001 +++ linux-2.4.17-lse02-D/fs/locks.c Thu Apr 4 17:37:17 2002 @@ -445,8 +445,7 @@ /* Let the blocked process remove waiter from the * block list when it gets scheduled. */ - current->policy |= SCHED_YIELD; - schedule(); + yield(); } else { /* Remove waiter from the block list, because by the * time it wakes up blocker won't exist any more. diff -X dontdiff -ruN linux-2.4.17/fs/nfs/pagelist.c linux-2.4.17-lse02-D/fs/nfs/pagelist.c --- linux-2.4.17/fs/nfs/pagelist.c Fri Dec 21 09:41:55 2001 +++ linux-2.4.17-lse02-D/fs/nfs/pagelist.c Thu Apr 4 17:37:17 2002 @@ -96,8 +96,7 @@ continue; if (signalled() && (server->flags & NFS_MOUNT_INTR)) return ERR_PTR(-ERESTARTSYS); - current->policy = SCHED_YIELD; - schedule(); + yield(); } /* Initialize the request struct. 
Initially, we assume a diff -X dontdiff -ruN linux-2.4.17/fs/proc/array.c linux-2.4.17-lse02-D/fs/proc/array.c --- linux-2.4.17/fs/proc/array.c Thu Oct 11 09:00:01 2001 +++ linux-2.4.17-lse02-D/fs/proc/array.c Thu Apr 4 17:37:17 2002 @@ -335,9 +335,8 @@ /* scale priority and nice values from timeslices to -20..20 */ /* to make it look like a "normal" Unix priority/nice value */ - priority = task->counter; - priority = 20 - (priority * 10 + DEF_COUNTER / 2) / DEF_COUNTER; - nice = task->nice; + priority = task_prio(task); + nice = task_nice(task); read_lock(&tasklist_lock); ppid = task->pid ? task->p_opptr->pid : 0; @@ -387,7 +386,7 @@ task->nswap, task->cnswap, task->exit_signal, - task->processor); + task->cpu); if(mm) mmput(mm); return res; diff -X dontdiff -ruN linux-2.4.17/fs/proc/proc_misc.c linux-2.4.17-lse02-D/fs/proc/proc_misc.c --- linux-2.4.17/fs/proc/proc_misc.c Tue Nov 20 21:29:09 2001 +++ linux-2.4.17-lse02-D/fs/proc/proc_misc.c Thu Apr 4 17:37:17 2002 @@ -85,11 +85,11 @@ a = avenrun[0] + (FIXED_1/200); b = avenrun[1] + (FIXED_1/200); c = avenrun[2] + (FIXED_1/200); - len = sprintf(page,"%d.%02d %d.%02d %d.%02d %d/%d %d\n", + len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n", LOAD_INT(a), LOAD_FRAC(a), LOAD_INT(b), LOAD_FRAC(b), LOAD_INT(c), LOAD_FRAC(c), - nr_running, nr_threads, last_pid); + nr_running(), nr_threads, last_pid); return proc_calc_metrics(page, start, off, count, eof, len); } @@ -101,7 +101,7 @@ int len; uptime = jiffies; - idle = init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime; + idle = init_task.times.tms_utime + init_task.times.tms_stime; /* The formula for the fraction parts really is ((t * 100) / HZ) % 100, but that would overflow about every five days at HZ == 100. 
@@ -303,10 +303,10 @@ } len += sprintf(page + len, - "\nctxt %u\n" + "\nctxt %lu\n" "btime %lu\n" "processes %lu\n", - kstat.context_swtch, + nr_context_switches(), xtime.tv_sec - jif / HZ, total_forks); diff -X dontdiff -ruN linux-2.4.17/fs/read_write.c linux-2.4.17-lse02-D/fs/read_write.c --- linux-2.4.17/fs/read_write.c Sun Aug 5 13:12:41 2001 +++ linux-2.4.17-lse02-D/fs/read_write.c Thu Apr 4 17:37:17 2002 @@ -203,7 +203,7 @@ unsigned long count) { typedef ssize_t (*io_fn_t)(struct file *, char *, size_t, loff_t *); - typedef ssize_t (*iov_fn_t)(struct file *, const struct iovec *, unsigned long, loff_t *); + typedef ssize_t (*iov_fn_t)(struct file *, const struct iovec *, unsigned long, loff_t *, size_t); size_t tot_len; struct iovec iovstack[UIO_FASTIOV]; @@ -258,8 +258,9 @@ fnv = (type == VERIFY_WRITE ? file->f_op->readv : file->f_op->writev); if (fnv) { - ret = fnv(file, iov, count, &file->f_pos); - goto out; + ret = fnv(file, iov, count, &file->f_pos, tot_len); + if (ret != -ENOSYS) + goto out; } /* VERIFY_WRITE actually means a read, as we write to user space */ diff -X dontdiff -ruN linux-2.4.17/fs/reiserfs/buffer2.c linux-2.4.17-lse02-D/fs/reiserfs/buffer2.c --- linux-2.4.17/fs/reiserfs/buffer2.c Fri Dec 21 09:42:03 2001 +++ linux-2.4.17-lse02-D/fs/reiserfs/buffer2.c Thu Apr 4 17:37:17 2002 @@ -33,8 +33,7 @@ buffer_journal_dirty(bh) ? 
' ' : '!'); } run_task_queue(&tq_disk); - current->policy |= SCHED_YIELD; - schedule(); + yield(); } if (repeat_counter > 30000000) { reiserfs_warning("vs-3051: done waiting, ignore vs-3050 messages for (%b)\n", bh) ; @@ -52,11 +51,11 @@ struct buffer_head * reiserfs_bread (struct super_block *super, int n_block, int n_size) { struct buffer_head *result; - PROC_EXP( unsigned int ctx_switches = kstat.context_swtch ); + PROC_EXP( unsigned long ctx_switches = nr_context_switches() ); result = bread (super -> s_dev, n_block, n_size); PROC_INFO_INC( super, breads ); - PROC_EXP( if( kstat.context_swtch != ctx_switches ) + PROC_EXP( if( nr_context_switches() != ctx_switches ) PROC_INFO_INC( super, bread_miss ) ); return result; } diff -X dontdiff -ruN linux-2.4.17/fs/reiserfs/journal.c linux-2.4.17-lse02-D/fs/reiserfs/journal.c --- linux-2.4.17/fs/reiserfs/journal.c Fri Dec 21 09:42:03 2001 +++ linux-2.4.17-lse02-D/fs/reiserfs/journal.c Thu Apr 4 17:37:17 2002 @@ -149,8 +149,7 @@ } bn = allocate_bitmap_node(p_s_sb) ; if (!bn) { - current->policy |= SCHED_YIELD ; - schedule() ; + yield(); goto repeat ; } return bn ; diff -X dontdiff -ruN linux-2.4.17/fs/ufs/truncate.c linux-2.4.17-lse02-D/fs/ufs/truncate.c --- linux-2.4.17/fs/ufs/truncate.c Mon Nov 19 14:55:46 2001 +++ linux-2.4.17-lse02-D/fs/ufs/truncate.c Thu Apr 4 17:37:17 2002 @@ -448,10 +448,7 @@ if (IS_SYNC(inode) && (inode->i_state & I_DIRTY)) ufs_sync_inode (inode); run_task_queue(&tq_disk); - current->policy |= SCHED_YIELD; - schedule (); - - + yield(); } offset = inode->i_size & uspi->s_fshift; if (offset) { diff -X dontdiff -ruN linux-2.4.17/include/asm-alpha/io.h linux-2.4.17-lse02-D/include/asm-alpha/io.h --- linux-2.4.17/include/asm-alpha/io.h Fri Nov 9 13:45:35 2001 +++ linux-2.4.17-lse02-D/include/asm-alpha/io.h Thu Apr 4 17:37:17 2002 @@ -60,6 +60,8 @@ return (void *) (address + IDENT_ADDR); } +#define page_to_phys(page) (((page) - (page)->zone->zone_mem_map) << PAGE_SHIFT) + /* * Change addresses as seen 
by the kernel (virtual) to addresses as * seen by a device (bus), and vice versa. diff -X dontdiff -ruN linux-2.4.17/include/asm-i386/apic.h linux-2.4.17-lse02-D/include/asm-i386/apic.h --- linux-2.4.17/include/asm-i386/apic.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-lse02-D/include/asm-i386/apic.h Thu Apr 4 17:37:17 2002 @@ -79,6 +79,8 @@ extern void setup_apic_nmi_watchdog (void); extern inline void nmi_watchdog_tick (struct pt_regs * regs); extern int APIC_init_uniprocessor (void); +extern void disable_APIC_timer(void); +extern void enable_APIC_timer(void); extern struct pm_dev *apic_pm_register(pm_dev_t, unsigned long, pm_callback); extern void apic_pm_unregister(struct pm_dev*); diff -X dontdiff -ruN linux-2.4.17/include/asm-i386/bitops.h linux-2.4.17-lse02-D/include/asm-i386/bitops.h --- linux-2.4.17/include/asm-i386/bitops.h Thu Nov 22 11:46:18 2001 +++ linux-2.4.17-lse02-D/include/asm-i386/bitops.h Thu Apr 4 17:37:17 2002 @@ -75,6 +75,14 @@ :"=m" (ADDR) :"Ir" (nr)); } + +static __inline__ void __clear_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__( + "btrl %1,%0" + :"=m" (ADDR) + :"Ir" (nr)); +} #define smp_mb__before_clear_bit() barrier() #define smp_mb__after_clear_bit() barrier() @@ -284,6 +292,34 @@ } /** + * find_first_bit - find the first set bit in a memory region + * @addr: The address to start the search at + * @size: The maximum size to search + * + * Returns the bit-number of the first set bit, not the number of the byte + * containing a bit. + */ +static __inline__ int find_first_bit(void * addr, unsigned size) +{ + int d0, d1; + int res; + + /* This looks at memory. 
Mark it volatile to tell gcc not to move it around */ + __asm__ __volatile__( + "xorl %%eax,%%eax\n\t" + "repe; scasl\n\t" + "jz 1f\n\t" + "leal -4(%%edi),%%edi\n\t" + "bsfl (%%edi),%%eax\n" + "1:\tsubl %%ebx,%%edi\n\t" + "shll $3,%%edi\n\t" + "addl %%edi,%%eax" + :"=a" (res), "=&c" (d0), "=&D" (d1) + :"1" ((size + 31) >> 5), "2" (addr), "b" (addr)); + return res; +} + +/** * find_next_zero_bit - find the first zero bit in a memory region * @addr: The address to base the search on * @offset: The bitnumber to start searching at @@ -296,7 +332,7 @@ if (bit) { /* - * Look for zero in first byte + * Look for zero in the first 32 bits. */ __asm__("bsfl %1,%0\n\t" "jne 1f\n\t" @@ -317,6 +353,39 @@ } /** + * find_next_bit - find the first set bit in a memory region + * @addr: The address to base the search on + * @offset: The bitnumber to start searching at + * @size: The maximum size to search + */ +static __inline__ int find_next_bit (void * addr, int size, int offset) +{ + unsigned long * p = ((unsigned long *) addr) + (offset >> 5); + int set = 0, bit = offset & 31, res; + + if (bit) { + /* + * Look for nonzero in the first 32 bits: + */ + __asm__("bsfl %1,%0\n\t" + "jne 1f\n\t" + "movl $32, %0\n" + "1:" + : "=r" (set) + : "r" (*p >> bit)); + if (set < (32 - bit)) + return set + offset; + set = 32 - bit; + p++; + } + /* + * No set bit yet, search remaining full words for a bit + */ + res = find_first_bit (p, size - 32 * (p - (unsigned long *) addr)); + return (offset + set + res); +} + +/** * ffz - find first zero in word. * @word: The word to search * @@ -327,6 +396,20 @@ __asm__("bsfl %1,%0" :"=r" (word) :"r" (~word)); + return word; +} + +/** + * __ffs - find first bit in word. + * @word: The word to search + * + * Undefined if no bit exists, so code should check against 0 first. 
+ */ +static __inline__ unsigned long __ffs(unsigned long word) +{ + __asm__("bsfl %1,%0" + :"=r" (word) + :"rm" (word)); return word; } diff -X dontdiff -ruN linux-2.4.17/include/asm-i386/hw_irq.h linux-2.4.17-lse02-D/include/asm-i386/hw_irq.h --- linux-2.4.17/include/asm-i386/hw_irq.h Thu Nov 22 11:46:18 2001 +++ linux-2.4.17-lse02-D/include/asm-i386/hw_irq.h Thu Apr 4 17:37:17 2002 @@ -41,7 +41,8 @@ #define ERROR_APIC_VECTOR 0xfe #define INVALIDATE_TLB_VECTOR 0xfd #define RESCHEDULE_VECTOR 0xfc -#define CALL_FUNCTION_VECTOR 0xfb +#define TASK_MIGRATION_VECTOR 0xfb +#define CALL_FUNCTION_VECTOR 0xfa /* * Local APIC timer IRQ vector is on a different priority level, diff -X dontdiff -ruN linux-2.4.17/include/asm-i386/io.h linux-2.4.17-lse02-D/include/asm-i386/io.h --- linux-2.4.17/include/asm-i386/io.h Thu Nov 22 11:46:27 2001 +++ linux-2.4.17-lse02-D/include/asm-i386/io.h Thu Apr 4 17:37:17 2002 @@ -76,7 +76,11 @@ /* * Change "struct page" to physical address. */ +#ifdef CONFIG_HIGHMEM64G +#define page_to_phys(page) ((u64)(page - mem_map) << PAGE_SHIFT) +#else #define page_to_phys(page) ((page - mem_map) << PAGE_SHIFT) +#endif extern void * __ioremap(unsigned long offset, unsigned long size, unsigned long flags); diff -X dontdiff -ruN linux-2.4.17/include/asm-i386/kmap_types.h linux-2.4.17-lse02-D/include/asm-i386/kmap_types.h --- linux-2.4.17/include/asm-i386/kmap_types.h Mon Sep 17 13:16:30 2001 +++ linux-2.4.17-lse02-D/include/asm-i386/kmap_types.h Thu Apr 4 17:37:17 2002 @@ -7,6 +7,7 @@ KM_SKB_DATA_SOFTIRQ, KM_USER0, KM_USER1, + KM_BH_IRQ, KM_TYPE_NR }; diff -X dontdiff -ruN linux-2.4.17/include/asm-i386/mmu_context.h linux-2.4.17-lse02-D/include/asm-i386/mmu_context.h --- linux-2.4.17/include/asm-i386/mmu_context.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-lse02-D/include/asm-i386/mmu_context.h Thu Apr 4 17:37:17 2002 @@ -7,6 +7,25 @@ #include /* + * Every architecture must define this function. 
It's the fastest + * way of searching a 140-bit bitmap where the first 100 bits are + * unlikely to be set. It's guaranteed that at least one of the 140 + * bits is set. + */ +static inline int sched_find_first_bit(unsigned long *b) +{ + if (unlikely(b[0])) + return __ffs(b[0]); + if (unlikely(b[1])) + return __ffs(b[1]) + 32; + if (unlikely(b[2])) + return __ffs(b[2]) + 64; + if (b[3]) + return __ffs(b[3]) + 96; + return __ffs(b[4]) + 128; +} + +/* * possibly do the LDT unload here? */ #define destroy_context(mm) do { } while(0) @@ -27,13 +46,13 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk, unsigned cpu) { - if (prev != next) { + if (likely(prev != next)) { /* stop flush ipis for the previous mm */ clear_bit(cpu, &prev->cpu_vm_mask); /* * Re-load LDT if necessary */ - if (prev->context.segments != next->context.segments) + if (unlikely(prev->context.segments != next->context.segments)) load_LDT(next); #ifdef CONFIG_SMP cpu_tlbstate[cpu].state = TLBSTATE_OK; diff -X dontdiff -ruN linux-2.4.17/include/asm-i386/pgalloc.h linux-2.4.17-lse02-D/include/asm-i386/pgalloc.h --- linux-2.4.17/include/asm-i386/pgalloc.h Fri Dec 21 09:42:03 2001 +++ linux-2.4.17-lse02-D/include/asm-i386/pgalloc.h Thu Apr 4 17:37:17 2002 @@ -224,6 +224,7 @@ { struct mm_struct *active_mm; int state; + char __cacheline_padding[24]; }; extern struct tlb_state cpu_tlbstate[NR_CPUS]; diff -X dontdiff -ruN linux-2.4.17/include/asm-i386/scatterlist.h linux-2.4.17-lse02-D/include/asm-i386/scatterlist.h --- linux-2.4.17/include/asm-i386/scatterlist.h Fri Oct 12 15:35:54 2001 +++ linux-2.4.17-lse02-D/include/asm-i386/scatterlist.h Thu Apr 4 17:37:17 2002 @@ -1,6 +1,24 @@ #ifndef _I386_SCATTERLIST_H #define _I386_SCATTERLIST_H +/* + * Drivers must set either ->address or (preferred) ->page and ->offset + * to indicate where data must be transferred to/from. 
+ * + * Using ->page is recommended since it handles highmem data as well as + * low mem. ->address is restricted to data which has a virtual mapping, and + * it will go away in the future. Updating to ->page can be automated very + * easily -- something like + * + * sg->address = some_ptr; + * + * can be rewritten as + * + * sg->page = virt_to_page(some_ptr); + * sg->offset = (unsigned long) some_ptr & ~PAGE_MASK; + * + * and that's it. There's no excuse for not highmem enabling YOUR driver. /jens + */ struct scatterlist { char * address; /* Location data is to be transferred to, NULL for * highmem page */ diff -X dontdiff -ruN linux-2.4.17/include/asm-i386/smp.h linux-2.4.17-lse02-D/include/asm-i386/smp.h --- linux-2.4.17/include/asm-i386/smp.h Fri Dec 21 09:42:03 2001 +++ linux-2.4.17-lse02-D/include/asm-i386/smp.h Thu Apr 4 17:37:17 2002 @@ -63,6 +63,7 @@ extern void smp_flush_tlb(void); extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs); extern void smp_send_reschedule(int cpu); +extern void smp_send_reschedule_all(void); extern void smp_invalidate_rcv(void); /* Process an NMI */ extern void (*mtrr_hook) (void); extern void zap_low_mappings (void); @@ -104,7 +105,7 @@ * so this is correct in the x86 case. */ -#define smp_processor_id() (current->processor) +#define smp_processor_id() (current->cpu) static __inline int hard_smp_processor_id(void) { @@ -121,18 +122,6 @@ #endif /* !__ASSEMBLY__ */ #define NO_PROC_ID 0xFF /* No processor magic marker */ - -/* - * This magic constant controls our willingness to transfer - * a process across CPUs. Such a transfer incurs misses on the L1 - * cache, and on a P6 or P5 with multiple L2 caches L2 hits. My - * gut feeling is this will vary by board in value. For a board - * with separate L2 cache it probably depends also on the RSS, and - * for a board with shared L2 cache it ought to decay fast as other - * processes are run. 
- */ - -#define PROC_CHANGE_PENALTY 15 /* Schedule penalty */ #endif #endif diff -X dontdiff -ruN linux-2.4.17/include/asm-parisc/pgtable.h linux-2.4.17-lse02-D/include/asm-parisc/pgtable.h --- linux-2.4.17/include/asm-parisc/pgtable.h Sun Nov 11 10:20:21 2001 +++ linux-2.4.17-lse02-D/include/asm-parisc/pgtable.h Thu Apr 4 17:37:17 2002 @@ -275,7 +275,7 @@ * Permanent address of a page. Obviously must never be * called on a highmem page. */ -#define page_address(page) ({ if (!(page)->virtual) BUG(); (page)->virtual; }) +#define page_address(page) ((page)->virtual) #define __page_address(page) ({ if (PageHighMem(page)) BUG(); PAGE_OFFSET + (((page) - mem_map) << PAGE_SHIFT); }) #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) #define pte_page(x) (mem_map+pte_pagenr(x)) diff -X dontdiff -ruN linux-2.4.17/include/asm-ppc/kmap_types.h linux-2.4.17-lse02-D/include/asm-ppc/kmap_types.h --- linux-2.4.17/include/asm-ppc/kmap_types.h Mon Sep 17 13:16:30 2001 +++ linux-2.4.17-lse02-D/include/asm-ppc/kmap_types.h Thu Apr 4 17:37:17 2002 @@ -11,6 +11,7 @@ KM_SKB_DATA_SOFTIRQ, KM_USER0, KM_USER1, + KM_BH_IRQ, KM_TYPE_NR }; diff -X dontdiff -ruN linux-2.4.17/include/asm-sparc/irq.h linux-2.4.17-lse02-D/include/asm-sparc/irq.h --- linux-2.4.17/include/asm-sparc/irq.h Mon Aug 28 21:20:03 2000 +++ linux-2.4.17-lse02-D/include/asm-sparc/irq.h Thu Apr 4 17:37:17 2002 @@ -45,7 +45,7 @@ #define clear_profile_irq(cpu) BTFIXUP_CALL(clear_profile_irq)(cpu) #define load_profile_irq(cpu,limit) BTFIXUP_CALL(load_profile_irq)(cpu,limit) -extern void (*init_timers)(void (*lvl10_irq)(int, void *, struct pt_regs *)); +extern void (*sparc_init_timers)(void (*lvl10_irq)(int, void *, struct pt_regs *)); extern void claim_ticker14(void (*irq_handler)(int, void *, struct pt_regs *), int irq, unsigned int timeout); diff -X dontdiff -ruN linux-2.4.17/include/asm-sparc/kmap_types.h linux-2.4.17-lse02-D/include/asm-sparc/kmap_types.h --- linux-2.4.17/include/asm-sparc/kmap_types.h Mon Sep 17 13:16:30 2001 
+++ linux-2.4.17-lse02-D/include/asm-sparc/kmap_types.h Thu Apr 4 17:37:17 2002 @@ -7,6 +7,7 @@ KM_SKB_DATA_SOFTIRQ, KM_USER0, KM_USER1, + KM_BH_IRQ, KM_TYPE_NR }; diff -X dontdiff -ruN linux-2.4.17/include/asm-sparc64/irq.h linux-2.4.17-lse02-D/include/asm-sparc64/irq.h --- linux-2.4.17/include/asm-sparc64/irq.h Sun Mar 25 18:14:21 2001 +++ linux-2.4.17-lse02-D/include/asm-sparc64/irq.h Thu Apr 4 17:37:17 2002 @@ -116,7 +116,7 @@ extern void disable_irq(unsigned int); #define disable_irq_nosync disable_irq extern void enable_irq(unsigned int); -extern void init_timers(void (*lvl10_irq)(int, void *, struct pt_regs *), +extern void sparc_init_timers(void (*lvl10_irq)(int, void *, struct pt_regs *), unsigned long *); extern unsigned int build_irq(int pil, int inofixup, unsigned long iclr, unsigned long imap); extern unsigned int sbus_build_irq(void *sbus, unsigned int ino); diff -X dontdiff -ruN linux-2.4.17/include/linux/blkdev.h linux-2.4.17-lse02-D/include/linux/blkdev.h --- linux-2.4.17/include/linux/blkdev.h Mon Nov 26 05:29:17 2001 +++ linux-2.4.17-lse02-D/include/linux/blkdev.h Thu Apr 4 17:37:17 2002 @@ -7,6 +7,8 @@ #include #include +#include + struct request_queue; typedef struct request_queue request_queue_t; struct elevator_s; @@ -35,7 +37,7 @@ unsigned long hard_sector, hard_nr_sectors; unsigned int nr_segments; unsigned int nr_hw_segments; - unsigned long current_nr_sectors; + unsigned long current_nr_sectors, hard_cur_sectors; void * special; char * buffer; struct completion * waiting; @@ -112,6 +114,8 @@ */ char head_active; + unsigned long bounce_pfn; + /* * Is meant to protect the queue in the future instead of * io_request_lock @@ -122,8 +126,38 @@ * Tasks wait here for free request */ wait_queue_head_t wait_for_request; + + unsigned concurrent:1; }; +extern unsigned long blk_max_low_pfn, blk_max_pfn; + +#define BLK_BOUNCE_HIGH (blk_max_low_pfn << PAGE_SHIFT) +#define BLK_BOUNCE_ANY (blk_max_pfn << PAGE_SHIFT) + +extern void 
blk_queue_bounce_limit(request_queue_t *, u64); + +#ifdef CONFIG_HIGHMEM +extern struct buffer_head *create_bounce(int, struct buffer_head *); +extern inline struct buffer_head *blk_queue_bounce(request_queue_t *q, int rw, + struct buffer_head *bh) +{ + struct page *page = bh->b_page; + + if (page - mem_map <= q->bounce_pfn) + return bh; + + return create_bounce(rw, bh); +} +#else +#define blk_queue_bounce(q, rw, bh) (bh) +#endif + +#define bh_phys(bh) (page_to_phys((bh)->b_page) + bh_offset((bh))) + +#define BH_CONTIG(b1, b2) (bh_phys((b1)) + (b1)->b_size == bh_phys((b2))) +#define BH_PHYS_4G(b1, b2) ((bh_phys((b1)) | 0xffffffff) == ((bh_phys((b2)) + (b2)->b_size - 1) | 0xffffffff)) + struct blk_dev_struct { /* * queue_proc has to be atomic @@ -162,6 +196,8 @@ extern void blk_queue_headactive(request_queue_t *, int); extern void blk_queue_make_request(request_queue_t *, make_request_fn *); extern void generic_unplug_device(void *); +extern inline int blk_seg_merge_ok(request_queue_t *, struct buffer_head *, + struct buffer_head *); extern int * blk_size[MAX_BLKDEV]; @@ -185,6 +221,7 @@ #define blkdev_entry_prev_request(entry) blkdev_entry_to_request((entry)->prev) #define blkdev_next_request(req) blkdev_entry_to_request((req)->queue.next) #define blkdev_prev_request(req) blkdev_entry_to_request((req)->queue.prev) +#define blkdev_free_rq(list) list_entry((list)->next, struct request, queue) /* request at the head of @list */ extern void drive_stat_acct (kdev_t dev, int rw, unsigned long nr_sectors, int new_io); @@ -204,6 +241,20 @@ #define blk_finished_io(nsects) do { } while (0) #define blk_started_io(nsects) do { } while (0) + +#define xrq_lock_irq(q) \ + {if (q->concurrent) spin_lock_irq(&q->queue_lock); \ + else spin_lock_irq(&io_request_lock);} + +#define xrq_unlock_irq(q) \ + {if (q->concurrent) spin_unlock_irq(&q->queue_lock); \ + else spin_unlock_irq(&io_request_lock);} + +#define rq_lock(q) \ + {if (q->concurrent) spin_lock(&q->queue_lock);} + +#define rq_unlock(q) \ + {if (q->concurrent) 
spin_unlock(&q->queue_lock);} static inline unsigned int blksize_bits(unsigned int size) { diff -X dontdiff -ruN linux-2.4.17/include/linux/bootmem.h linux-2.4.17-lse02-D/include/linux/bootmem.h --- linux-2.4.17/include/linux/bootmem.h Thu Nov 22 11:47:23 2001 +++ linux-2.4.17-lse02-D/include/linux/bootmem.h Thu Apr 4 17:37:17 2002 @@ -16,6 +16,7 @@ extern unsigned long max_low_pfn; extern unsigned long min_low_pfn; +extern unsigned long max_pfn; /* * node_bootmem_map is a map pointer - the bits represent all physical diff -X dontdiff -ruN linux-2.4.17/include/linux/fs.h linux-2.4.17-lse02-D/include/linux/fs.h --- linux-2.4.17/include/linux/fs.h Fri Dec 21 09:42:03 2001 +++ linux-2.4.17-lse02-D/include/linux/fs.h Thu Apr 4 17:37:17 2002 @@ -828,8 +828,8 @@ int (*fsync) (struct file *, struct dentry *, int datasync); int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); - ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, loff_t *); - ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *); + ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, loff_t *, size_t); + ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *, size_t); ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); }; @@ -1350,6 +1350,7 @@ extern struct buffer_head * getblk(kdev_t, int, int); extern void ll_rw_block(int, int, struct buffer_head * bh[]); extern void submit_bh(int, struct buffer_head *); +extern void submit_bh_blknr(int, struct buffer_head *); extern int is_read_only(kdev_t); extern void __brelse(struct buffer_head *); static inline void brelse(struct buffer_head *buf) diff -X dontdiff -ruN linux-2.4.17/include/linux/highmem.h linux-2.4.17-lse02-D/include/linux/highmem.h --- linux-2.4.17/include/linux/highmem.h Fri Dec 21 
09:42:03 2001 +++ linux-2.4.17-lse02-D/include/linux/highmem.h Thu Apr 4 17:37:17 2002 @@ -13,8 +13,7 @@ /* declarations for linux/mm/highmem.c */ unsigned int nr_free_highpages(void); -extern struct buffer_head * create_bounce(int rw, struct buffer_head * bh_orig); - +extern struct buffer_head *create_bounce(int rw, struct buffer_head * bh_orig); static inline char *bh_kmap(struct buffer_head *bh) { @@ -26,6 +25,42 @@ kunmap(bh->b_page); } +/* + * remember to add offset! and never ever reenable interrupts between a + * bh_kmap_irq and bh_kunmap_irq!! + */ +static inline char *bh_kmap_irq(struct buffer_head *bh, unsigned long *flags) +{ + unsigned long addr; + + __save_flags(*flags); + + /* + * could be low + */ + if (!PageHighMem(bh->b_page)) + return bh->b_data; + + /* + * it's a highmem page + */ + __cli(); + addr = (unsigned long) kmap_atomic(bh->b_page, KM_BH_IRQ); + + if (addr & ~PAGE_MASK) + BUG(); + + return (char *) addr + bh_offset(bh); +} + +static inline void bh_kunmap_irq(char *buffer, unsigned long *flags) +{ + unsigned long ptr = (unsigned long) buffer & PAGE_MASK; + + kunmap_atomic((void *) ptr, KM_BH_IRQ); + __restore_flags(*flags); +} + #else /* CONFIG_HIGHMEM */ static inline unsigned int nr_free_highpages(void) { return 0; } @@ -37,8 +72,10 @@ #define kmap_atomic(page,idx) kmap(page) #define kunmap_atomic(page,idx) kunmap(page) -#define bh_kmap(bh) ((bh)->b_data) -#define bh_kunmap(bh) do { } while (0) +#define bh_kmap(bh) ((bh)->b_data) +#define bh_kunmap(bh) do { } while (0) +#define bh_kmap_irq(bh, flags) ((bh)->b_data) +#define bh_kunmap_irq(bh, flags) do { *(flags) = 0; } while (0) #endif /* CONFIG_HIGHMEM */ diff -X dontdiff -ruN linux-2.4.17/include/linux/ide.h linux-2.4.17-lse02-D/include/linux/ide.h --- linux-2.4.17/include/linux/ide.h Thu Nov 22 11:48:07 2001 +++ linux-2.4.17-lse02-D/include/linux/ide.h Thu Apr 4 17:37:17 2002 @@ -507,6 +507,7 @@ unsigned reset : 1; /* reset after probe */ unsigned autodma : 1; /* automatically try to 
enable DMA at boot */ unsigned udma_four : 1; /* 1=ATA-66 capable, 0=default */ + unsigned highmem : 1; /* can do full 32-bit dma */ byte channel; /* for dual-port chips: 0=primary, 1=secondary */ #ifdef CONFIG_BLK_DEV_IDEPCI struct pci_dev *pci_dev; /* for pci chipsets */ @@ -812,6 +813,21 @@ ide_preempt, /* insert rq in front of current request */ ide_end /* insert rq at end of list, but don't wait for it */ } ide_action_t; + +/* + * temporarily mapping a (possible) highmem bio + */ +#define ide_rq_offset(rq) (((rq)->hard_cur_sectors - (rq)->current_nr_sectors) << 9) + +extern inline void *ide_map_buffer(struct request *rq, unsigned long *flags) +{ + return bh_kmap_irq(rq->bh, flags) + ide_rq_offset(rq); +} + +extern inline void ide_unmap_buffer(char *buffer, unsigned long *flags) +{ + bh_kunmap_irq(buffer, flags); +} /* * This function issues a special IDE device request diff -X dontdiff -ruN linux-2.4.17/include/linux/iobuf.h linux-2.4.17-lse02-D/include/linux/iobuf.h --- linux-2.4.17/include/linux/iobuf.h Thu Nov 22 11:46:26 2001 +++ linux-2.4.17-lse02-D/include/linux/iobuf.h Thu Apr 4 17:37:17 2002 @@ -24,18 +24,26 @@ * entire iovec. */ +#define KIO_STATIC_INFO 256 #define KIO_MAX_ATOMIC_IO 512 /* in kb */ #define KIO_STATIC_PAGES (KIO_MAX_ATOMIC_IO / (PAGE_SIZE >> 10) + 1) #define KIO_MAX_SECTORS (KIO_MAX_ATOMIC_IO * 2) /* The main kiobuf struct used for all our IO! */ +struct pinfo +{ + unsigned short poffset[KIO_STATIC_INFO]; + unsigned short plen[KIO_STATIC_INFO]; +}; + struct kiobuf { int nr_pages; /* Pages actually referenced */ int array_len; /* Space in the allocated lists */ int offset; /* Offset to start of valid data */ int length; /* Number of valid bytes of data */ + int blkno; /* Block number for RAW IO */ /* Keep separate track of the physical addresses and page * structs involved. 
If we do IO to a memory-mapped device @@ -44,18 +52,21 @@ struct page ** maplist; - unsigned int locked : 1; /* If set, pages has been locked */ + unsigned int locked : 1, /* If set, pages has been locked */ + dovary : 1; /* If set, do variable length IO */ /* Always embed enough struct pages for atomic IO */ struct page * map_array[KIO_STATIC_PAGES]; - struct buffer_head * bh[KIO_MAX_SECTORS]; - unsigned long blocks[KIO_MAX_SECTORS]; + struct buffer_head *bh; + unsigned long *blocks; /* Dynamic state for IO completion: */ atomic_t io_count; /* IOs still in progress */ int errno; /* Status of completed IO */ void (*end_io) (struct kiobuf *); /* Completion callback */ wait_queue_head_t wait_queue; + struct pinfo *pinfo; + }; @@ -72,10 +83,11 @@ void end_kio_request(struct kiobuf *, int); void simple_wakeup_kiobuf(struct kiobuf *); int alloc_kiovec(int nr, struct kiobuf **); +int alloc_kiovec_raw(int nr, struct kiobuf **); void free_kiovec(int nr, struct kiobuf **); int expand_kiobuf(struct kiobuf *, int); void kiobuf_wait_for_io(struct kiobuf *); -extern int alloc_kiobuf_bhs(struct kiobuf *); +extern int alloc_kiobuf_bhs(struct kiobuf *, int); extern void free_kiobuf_bhs(struct kiobuf *); /* fs/buffer.c */ diff -X dontdiff -ruN linux-2.4.17/include/linux/kernel_stat.h linux-2.4.17-lse02-D/include/linux/kernel_stat.h --- linux-2.4.17/include/linux/kernel_stat.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-lse02-D/include/linux/kernel_stat.h Thu Apr 4 17:37:17 2002 @@ -32,10 +32,11 @@ unsigned int ipackets, opackets; unsigned int ierrors, oerrors; unsigned int collisions; - unsigned int context_swtch; }; extern struct kernel_stat kstat; + +extern unsigned long nr_context_switches(void); #if !defined(CONFIG_ARCH_S390) /* diff -X dontdiff -ruN linux-2.4.17/include/linux/list.h linux-2.4.17-lse02-D/include/linux/list.h --- linux-2.4.17/include/linux/list.h Fri Dec 21 09:42:03 2001 +++ linux-2.4.17-lse02-D/include/linux/list.h Thu Apr 4 17:37:17 2002 @@ -19,6 +19,8 @@ struct 
list_head *next, *prev; }; +typedef struct list_head list_t; + #define LIST_HEAD_INIT(name) { &(name), &(name) } #define LIST_HEAD(name) \ diff -X dontdiff -ruN linux-2.4.17/include/linux/sched.h linux-2.4.17-lse02-D/include/linux/sched.h --- linux-2.4.17/include/linux/sched.h Fri Dec 21 09:42:03 2001 +++ linux-2.4.17-lse02-D/include/linux/sched.h Thu Apr 4 17:37:17 2002 @@ -6,6 +6,7 @@ extern unsigned long event; #include +#include #include #include #include @@ -42,6 +43,7 @@ #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ #define CLONE_THREAD 0x00010000 /* Same thread group? */ +#define CLONE_NEWNS 0x00020000 /* New namespace group? */ #define CLONE_SIGNAL (CLONE_SIGHAND | CLONE_THREAD) @@ -72,8 +74,9 @@ #define CT_TO_SECS(x) ((x) / HZ) #define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ) -extern int nr_running, nr_threads; +extern int nr_threads; extern int last_pid; +extern unsigned long nr_running(void); #include #include @@ -116,12 +119,6 @@ #define SCHED_FIFO 1 #define SCHED_RR 2 -/* - * This is an additional bit set when we want to - * yield the CPU for one re-schedule.. - */ -#define SCHED_YIELD 0x10 - struct sched_param { int sched_priority; }; @@ -139,17 +136,22 @@ * a separate lock). 
*/ extern rwlock_t tasklist_lock; -extern spinlock_t runqueue_lock; extern spinlock_t mmlist_lock; +typedef struct task_struct task_t; + extern void sched_init(void); -extern void init_idle(void); +extern void init_idle(task_t *idle, int cpu); extern void show_state(void); extern void cpu_init (void); extern void trap_init(void); extern void update_process_times(int user); -extern void update_one_process(struct task_struct *p, unsigned long user, +extern void update_one_process(task_t *p, unsigned long user, unsigned long system, int cpu); +extern void scheduler_tick(int user_tick, int system); +extern void sched_task_migrated(task_t *p); +extern void smp_migrate_task(int cpu, task_t *task); +extern unsigned long cache_decay_ticks; #define MAX_SCHEDULE_TIMEOUT LONG_MAX extern signed long FASTCALL(schedule_timeout(signed long timeout)); @@ -166,6 +168,7 @@ */ #define NR_OPEN_DEFAULT BITS_PER_LONG +struct namespace; /* * Open file table structure */ @@ -278,6 +281,8 @@ extern struct user_struct root_user; #define INIT_USER (&root_user) +typedef struct prio_array prio_array_t; + struct task_struct { /* * offsets of these are hardcoded elsewhere - touch with care @@ -295,35 +300,26 @@ int lock_depth; /* Lock depth */ -/* - * offset 32 begins here on 32-bit platforms. We keep - * all fields in a single cacheline that are needed for - * the goodness() loop in schedule(). - */ - long counter; - long nice; - unsigned long policy; - struct mm_struct *mm; - int processor; /* - * cpus_runnable is ~0 if the process is not running on any - * CPU. It's (1 << cpu) if it's running on a CPU. This mask - * is updated under the runqueue lock. - * - * To determine whether a process might run on a CPU, this - * mask is AND-ed with cpus_allowed. + * offset 32 begins here on 32-bit platforms. */ - unsigned long cpus_runnable, cpus_allowed; - /* - * (only the 'next' pointer fits into the cacheline, but - * that's just fine.) 
- */ - struct list_head run_list; - unsigned long sleep_time; + unsigned int cpu; + int prio, static_prio; + list_t run_list; + prio_array_t *array; + + unsigned long sleep_avg; + unsigned long sleep_timestamp; + + unsigned long policy; + unsigned long cpus_allowed; + unsigned int time_slice; + + task_t *next_task, *prev_task; - struct task_struct *next_task, *prev_task; - struct mm_struct *active_mm; + struct mm_struct *mm, *active_mm; struct list_head local_pages; + unsigned int allocation_order, nr_local_pages; /* task state */ @@ -345,12 +341,12 @@ * older sibling, respectively. (p->father can be replaced with * p->p_pptr->pid) */ - struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr; + task_t *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr; struct list_head thread_group; /* PID hash table linkage. */ - struct task_struct *pidhash_next; - struct task_struct **pidhash_pprev; + task_t *pidhash_next; + task_t **pidhash_pprev; wait_queue_head_t wait_chldexit; /* for wait4() */ struct completion *vfork_done; /* for vfork() */ @@ -389,6 +385,8 @@ struct fs_struct *fs; /* open file information */ struct files_struct *files; +/* namespace */ + struct namespace *namespace; /* signal handlers */ spinlock_t sigmask_lock; /* Protects signal and blocked */ struct signal_struct *sig; @@ -446,10 +444,13 @@ */ #define _STK_LIM (8*1024*1024) -#define DEF_COUNTER (10*HZ/100) /* 100 ms time slice */ -#define MAX_COUNTER (20*HZ/100) -#define DEF_NICE (0) +extern void set_cpus_allowed(task_t *p, unsigned long new_mask); +extern void set_user_nice(task_t *p, long nice); +extern int task_prio(task_t *p); +extern int task_nice(task_t *p); +asmlinkage long sys_sched_yield(void); +#define yield() sys_sched_yield() /* * The default (Linux) execution domain. 
@@ -468,14 +469,14 @@ addr_limit: KERNEL_DS, \ exec_domain: &default_exec_domain, \ lock_depth: -1, \ - counter: DEF_COUNTER, \ - nice: DEF_NICE, \ + prio: 120, \ + static_prio: 120, \ policy: SCHED_OTHER, \ + cpus_allowed: -1, \ mm: NULL, \ active_mm: &init_mm, \ - cpus_runnable: -1, \ - cpus_allowed: -1, \ run_list: LIST_HEAD_INIT(tsk.run_list), \ + time_slice: HZ, \ next_task: &tsk, \ prev_task: &tsk, \ p_opptr: &tsk, \ @@ -509,24 +510,24 @@ #endif union task_union { - struct task_struct task; + task_t task; unsigned long stack[INIT_TASK_SIZE/sizeof(long)]; }; extern union task_union init_task_union; extern struct mm_struct init_mm; -extern struct task_struct *init_tasks[NR_CPUS]; +extern task_t *init_tasks[NR_CPUS]; /* PID hashing. (shouldnt this be dynamic?) */ #define PIDHASH_SZ (4096 >> 2) -extern struct task_struct *pidhash[PIDHASH_SZ]; +extern task_t *pidhash[PIDHASH_SZ]; #define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1)) -static inline void hash_pid(struct task_struct *p) +static inline void hash_pid(task_t *p) { - struct task_struct **htable = &pidhash[pid_hashfn(p->pid)]; + task_t **htable = &pidhash[pid_hashfn(p->pid)]; if((p->pidhash_next = *htable) != NULL) (*htable)->pidhash_pprev = &p->pidhash_next; @@ -534,16 +535,16 @@ p->pidhash_pprev = htable; } -static inline void unhash_pid(struct task_struct *p) +static inline void unhash_pid(task_t *p) { if(p->pidhash_next) p->pidhash_next->pidhash_pprev = p->pidhash_pprev; *p->pidhash_pprev = p->pidhash_next; } -static inline struct task_struct *find_task_by_pid(int pid) +static inline task_t *find_task_by_pid(int pid) { - struct task_struct *p, **htable = &pidhash[pid_hashfn(pid)]; + task_t *p, **htable = &pidhash[pid_hashfn(pid)]; for(p = *htable; p && p->pid != pid; p = p->pidhash_next) ; @@ -551,19 +552,6 @@ return p; } -#define task_has_cpu(tsk) ((tsk)->cpus_runnable != ~0UL) - -static inline void task_set_cpu(struct task_struct *tsk, unsigned int cpu) -{ - tsk->processor = cpu; - 
tsk->cpus_runnable = 1UL << cpu; -} - -static inline void task_release_cpu(struct task_struct *tsk) -{ - tsk->cpus_runnable = ~0UL; -} - /* per-UID process charging. */ extern struct user_struct * alloc_uid(uid_t); extern void free_uid(struct user_struct *); @@ -590,7 +578,9 @@ extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q)); extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q, signed long timeout)); -extern int FASTCALL(wake_up_process(struct task_struct * tsk)); +extern int FASTCALL(wake_up_process(task_t * tsk)); +extern void FASTCALL(wake_up_forked_process(task_t * tsk)); +extern void FASTCALL(sched_exit(task_t * p)); #define wake_up(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1) #define wake_up_nr(x, nr) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr) @@ -608,28 +598,28 @@ extern int in_egroup_p(gid_t); extern void proc_caches_init(void); -extern void flush_signals(struct task_struct *); -extern void flush_signal_handlers(struct task_struct *); +extern void flush_signals(task_t *); +extern void flush_signal_handlers(task_t *); extern int dequeue_signal(sigset_t *, siginfo_t *); extern void block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask); extern void unblock_all_signals(void); -extern int send_sig_info(int, struct siginfo *, struct task_struct *); -extern int force_sig_info(int, struct siginfo *, struct task_struct *); +extern int send_sig_info(int, struct siginfo *, task_t *); +extern int force_sig_info(int, struct siginfo *, task_t *); extern int kill_pg_info(int, struct siginfo *, pid_t); extern int kill_sl_info(int, struct siginfo *, pid_t); extern int kill_proc_info(int, struct siginfo *, pid_t); -extern void notify_parent(struct task_struct *, int); -extern void do_notify_parent(struct task_struct *, int); -extern void force_sig(int, struct task_struct *); -extern int send_sig(int, struct task_struct *, int); +extern void notify_parent(task_t *, int); +extern 
void do_notify_parent(task_t *, int); +extern void force_sig(int, task_t *); +extern int send_sig(int, task_t *, int); extern int kill_pg(pid_t, int, int); extern int kill_sl(pid_t, int, int); extern int kill_proc(pid_t, int, int); extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *); extern int do_sigaltstack(const stack_t *, stack_t *, unsigned long); -static inline int signal_pending(struct task_struct *p) +static inline int signal_pending(task_t *p) { return (p->sigpending != 0); } @@ -668,7 +658,7 @@ This is required every time the blocked sigset_t changes. All callers should have t->sigmask_lock. */ -static inline void recalc_sigpending(struct task_struct *t) +static inline void recalc_sigpending(task_t *t) { t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked); } @@ -775,16 +765,17 @@ extern int expand_fdset(struct files_struct *, int nr); extern void free_fdset(fd_set *, int); -extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); +extern int copy_thread(int, unsigned long, unsigned long, unsigned long, task_t *, struct pt_regs *); extern void flush_thread(void); extern void exit_thread(void); -extern void exit_mm(struct task_struct *); -extern void exit_files(struct task_struct *); -extern void exit_sighand(struct task_struct *); +extern void exit_mm(task_t *); +extern void exit_files(task_t *); +extern void exit_sighand(task_t *); extern void reparent_to_init(void); extern void daemonize(void); +extern task_t *child_reaper; extern int do_execve(char *, char **, char **, struct pt_regs *); extern int do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long); @@ -793,6 +784,9 @@ extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)); extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); +extern void wait_task_inactive(task_t * p); +extern void kick_if_running(task_t * p); + 
#define __wait_event(wq, condition) \ do { \ wait_queue_t __wait; \ @@ -871,24 +865,10 @@ for (p = &init_task ; (p = p->next_task) != &init_task ; ) #define next_thread(p) \ - list_entry((p)->thread_group.next, struct task_struct, thread_group) - -static inline void del_from_runqueue(struct task_struct * p) -{ - nr_running--; - p->sleep_time = jiffies; - list_del(&p->run_list); - p->run_list.next = NULL; -} - -static inline int task_on_runqueue(struct task_struct *p) -{ - return (p->run_list.next != NULL); -} + list_entry((p)->thread_group.next, task_t, thread_group) -static inline void unhash_process(struct task_struct *p) +static inline void unhash_process(task_t *p) { - if (task_on_runqueue(p)) BUG(); write_lock_irq(&tasklist_lock); nr_threads--; unhash_pid(p); @@ -898,12 +878,12 @@ } /* Protects ->fs, ->files, ->mm, and synchronises with wait4(). Nests inside tasklist_lock */ -static inline void task_lock(struct task_struct *p) +static inline void task_lock(task_t *p) { spin_lock(&p->alloc_lock); } -static inline void task_unlock(struct task_struct *p) +static inline void task_unlock(task_t *p) { spin_unlock(&p->alloc_lock); } diff -X dontdiff -ruN linux-2.4.17/include/linux/smp.h linux-2.4.17-lse02-D/include/linux/smp.h --- linux-2.4.17/include/linux/smp.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-lse02-D/include/linux/smp.h Thu Apr 4 17:37:17 2002 @@ -76,7 +76,8 @@ /* * These macros fold the SMP functionality into a single CPU system */ - + +#define NR_CPUS 1 #define smp_num_cpus 1 #define smp_processor_id() 0 #define hard_smp_processor_id() 0 @@ -86,6 +87,14 @@ #define cpu_number_map(cpu) 0 #define smp_call_function(func,info,retry,wait) ({ 0; }) #define cpu_online_map 1 +static inline void smp_send_reschedule(int cpu) { } +static inline void smp_send_reschedule_all(void) { } #endif + +/* + * Common definitions: + */ +#define cpu() smp_processor_id() + #endif diff -X dontdiff -ruN linux-2.4.17/include/linux/timer.h linux-2.4.17-lse02-D/include/linux/timer.h 
--- linux-2.4.17/include/linux/timer.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-lse02-D/include/linux/timer.h Thu Apr 4 17:37:17 2002 @@ -1,9 +1,6 @@ #ifndef _LINUX_TIMER_H #define _LINUX_TIMER_H -#include -#include - /* * In Linux 2.4, static timers have been removed from the kernel. * Timers may be dynamically created and destroyed, and should be initialized @@ -13,22 +10,77 @@ * timeouts. You can use this field to distinguish between the different * invocations. */ + +#include +#include +#include +#include + +/* + * Event timer code + */ +#define TVN_BITS 6 +#define TVR_BITS 8 +#define TVN_SIZE (1 << TVN_BITS) +#define TVR_SIZE (1 << TVR_BITS) +#define TVN_MASK (TVN_SIZE - 1) +#define TVR_MASK (TVR_SIZE - 1) + +typedef struct tvec_s { + int index; + struct list_head vec[TVN_SIZE]; +} tvec_t; + +typedef struct tvec_root_s { + int index; + struct list_head vec[TVR_SIZE]; +} tvec_root_t; + +#define NOOF_TVECS 5 + +typedef struct timer_list timer_t; + +typedef struct tvec_t_base_s { + spinlock_t lock; + unsigned long timer_jiffies; + volatile timer_t * volatile running_timer; + tvec_root_t tv1; + tvec_t tv2; + tvec_t tv3; + tvec_t tv4; + tvec_t tv5; +} tvec_base_t; + +/* + * This is the new and improved way of handling timers. + * + * The "data" field is in case you want to use the same + * timeout function for several timeouts. You can use this + * to distinguish between the different invocations. 
+ */ struct timer_list { struct list_head list; unsigned long expires; unsigned long data; void (*function)(unsigned long); + tvec_base_t *base; }; -extern void add_timer(struct timer_list * timer); -extern int del_timer(struct timer_list * timer); +extern void add_timer(timer_t * timer); +extern int del_timer(timer_t * timer); #ifdef CONFIG_SMP -extern int del_timer_sync(struct timer_list * timer); +extern int del_timer_sync(timer_t * timer); extern void sync_timers(void); +#define timer_enter(base, t) do { base->running_timer = t; mb(); } while (0) +#define timer_exit(base) do { base->running_timer = NULL; } while (0) +#define timer_is_running(base,t) (base->running_timer == t) +#define timer_synchronize(base,t) while (timer_is_running(base,t)) barrier() #else #define del_timer_sync(t) del_timer(t) #define sync_timers() do { } while (0) +#define timer_enter(base,t) do { } while (0) +#define timer_exit(base) do { } while (0) #endif /* @@ -38,17 +90,33 @@ * If the timer is known to be not pending (ie, in the handler), mod_timer * is less efficient than a->expires = b; add_timer(a). 
*/ -int mod_timer(struct timer_list *timer, unsigned long expires); +int mod_timer(timer_t *timer, unsigned long expires); extern void it_real_fn(unsigned long); -static inline void init_timer(struct timer_list * timer) +extern void init_timers(void); +extern void run_local_timers(void); + +extern tvec_base_t tvec_bases[NR_CPUS]; + +static inline void init_timer(timer_t * timer) { timer->list.next = timer->list.prev = NULL; + timer->base = tvec_bases + 0; } -static inline int timer_pending (const struct timer_list * timer) +#define TIMER_DEBUG 0 +#if TIMER_DEBUG +# define CHECK_BASE(base) \ + if (base && ((base < tvec_bases) || (base >= tvec_bases + NR_CPUS))) \ + BUG() +#else +# define CHECK_BASE(base) +#endif + +static inline int timer_pending(const timer_t * timer) { + CHECK_BASE(timer->base); return timer->list.next != NULL; } diff -X dontdiff -ruN linux-2.4.17/init/main.c linux-2.4.17-lse02-D/init/main.c --- linux-2.4.17/init/main.c Fri Dec 21 09:42:04 2001 +++ linux-2.4.17-lse02-D/init/main.c Thu Apr 4 17:37:17 2002 @@ -482,8 +482,6 @@ extern void setup_arch(char **); extern void cpu_idle(void); -unsigned long wait_init_idle; - #ifndef CONFIG_SMP #ifdef CONFIG_X86_LOCAL_APIC @@ -492,34 +490,24 @@ APIC_init_uniprocessor(); } #else -#define smp_init() do { } while (0) +#define smp_init() do { } while (0) #endif #else - /* Called by boot processor to activate the rest. */ static void __init smp_init(void) { /* Get other processors into their bootup holding patterns. */ smp_boot_cpus(); - wait_init_idle = cpu_online_map; - clear_bit(current->processor, &wait_init_idle); /* Don't wait on me! 
*/ smp_threads_ready=1; smp_commence(); - - /* Wait for the other cpus to set up their idle processes */ - printk("Waiting on wait_init_idle (map = 0x%lx)\n", wait_init_idle); - while (wait_init_idle) { - cpu_relax(); - barrier(); - } - printk("All processors have done init_idle\n"); } #endif + /* * We need to finalize in a non-__init function or else race conditions * between the root thread and the init thread may cause start_kernel to @@ -531,9 +519,8 @@ { kernel_thread(init, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); unlock_kernel(); - current->need_resched = 1; - cpu_idle(); -} + cpu_idle(); +} /* * Activate the first processor. @@ -611,14 +598,18 @@ ipc_init(); #endif check_bugs(); + printk("POSIX conformance testing by UNIFIX\n"); - /* - * We count on the initial thread going ok - * Like idlers init is an unlocked kernel thread, which will - * make syscalls (and thus be locked). + init_idle(current, smp_processor_id()); + /* + * We count on the initial thread going ok + * Like idlers init is an unlocked kernel thread, which will + * make syscalls (and thus be locked). 
*/ smp_init(); + + /* Do the rest non-__init'ed, we're now alive */ rest_init(); } @@ -779,12 +770,9 @@ int i, pid; pid = kernel_thread(do_linuxrc, "/linuxrc", SIGCHLD); - if (pid > 0) { - while (pid != wait(&i)) { - current->policy |= SCHED_YIELD; - schedule(); - } - } + if (pid > 0) + while (pid != wait(&i)) + yield(); if (MAJOR(real_root_dev) != RAMDISK_MAJOR || MINOR(real_root_dev) != 0) { error = change_root(real_root_dev,"/initrd"); diff -X dontdiff -ruN linux-2.4.17/kernel/capability.c linux-2.4.17-lse02-D/kernel/capability.c --- linux-2.4.17/kernel/capability.c Fri Jun 23 21:06:37 2000 +++ linux-2.4.17-lse02-D/kernel/capability.c Thu Apr 4 17:37:17 2002 @@ -8,6 +8,8 @@ #include #include +unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */ + kernel_cap_t cap_bset = CAP_INIT_EFF_SET; /* Note: never hold tasklist_lock while spinning for this one */ diff -X dontdiff -ruN linux-2.4.17/kernel/exit.c linux-2.4.17-lse02-D/kernel/exit.c --- linux-2.4.17/kernel/exit.c Wed Nov 21 14:42:27 2001 +++ linux-2.4.17-lse02-D/kernel/exit.c Thu Apr 4 17:37:18 2002 @@ -27,49 +27,22 @@ static void release_task(struct task_struct * p) { - if (p != current) { + if (p == current) + BUG(); #ifdef CONFIG_SMP - /* - * Wait to make sure the process isn't on the - * runqueue (active on some other CPU still) - */ - for (;;) { - task_lock(p); - if (!task_has_cpu(p)) - break; - task_unlock(p); - do { - cpu_relax(); - barrier(); - } while (task_has_cpu(p)); - } - task_unlock(p); + wait_task_inactive(p); #endif - atomic_dec(&p->user->processes); - free_uid(p->user); - unhash_process(p); - - release_thread(p); - current->cmin_flt += p->min_flt + p->cmin_flt; - current->cmaj_flt += p->maj_flt + p->cmaj_flt; - current->cnswap += p->nswap + p->cnswap; - /* - * Potentially available timeslices are retrieved - * here - this way the parent does not get penalized - * for creating too many processes. 
- * - * (this cannot be used to artificially 'generate' - * timeslices, because any timeslice recovered here - * was given away by the parent in the first place.) - */ - current->counter += p->counter; - if (current->counter >= MAX_COUNTER) - current->counter = MAX_COUNTER; - p->pid = 0; - free_task_struct(p); - } else { - printk("task releasing itself\n"); - } + atomic_dec(&p->user->processes); + free_uid(p->user); + unhash_process(p); + + release_thread(p); + current->cmin_flt += p->min_flt + p->cmin_flt; + current->cmaj_flt += p->maj_flt + p->cmaj_flt; + current->cnswap += p->nswap + p->cnswap; + sched_exit(p); + p->pid = 0; + free_task_struct(p); } /* @@ -147,6 +120,79 @@ } read_unlock(&tasklist_lock); return retval; +} + +/** + * reparent_to_init() - Reparent the calling kernel thread to the init task. + * + * If a kernel thread is launched as a result of a system call, or if + * it ever exits, it should generally reparent itself to init so that + * it is correctly cleaned up on exit. + * + * The various task state such as scheduling policy and priority may have + * been inherited from a user process, so we reset them to sane values here. + * + * NOTE that reparent_to_init() gives the caller full capabilities. + */ +void reparent_to_init(void) +{ + write_lock_irq(&tasklist_lock); + + /* Reparent to init */ + REMOVE_LINKS(current); + current->p_pptr = child_reaper; + current->p_opptr = child_reaper; + SET_LINKS(current); + + /* Set the exit signal to SIGCHLD so we signal init on exit */ + current->exit_signal = SIGCHLD; + + current->ptrace = 0; + if ((current->policy == SCHED_OTHER) && (task_nice(current) < 0)) + set_user_nice(current, 0); + /* cpus_allowed? */ + /* rt_priority? */ + /* signals? 
*/ + current->cap_effective = CAP_INIT_EFF_SET; + current->cap_inheritable = CAP_INIT_INH_SET; + current->cap_permitted = CAP_FULL_SET; + current->keep_capabilities = 0; + memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim))); + current->user = INIT_USER; + + write_unlock_irq(&tasklist_lock); +} + +/* + * Put all the gunge required to become a kernel thread without + * attached user resources in one place where it belongs. + */ + +void daemonize(void) +{ + struct fs_struct *fs; + + + /* + * If we were started as result of loading a module, close all of the + * user space pages. We don't need them, and if we didn't close them + * they would be locked into memory. + */ + exit_mm(current); + + current->session = 1; + current->pgrp = 1; + current->tty = NULL; + + /* Become as one with the init task */ + + exit_fs(current); /* current->fs->count--; */ + fs = init_task.fs; + current->fs = fs; + atomic_inc(&fs->count); + exit_files(current); + current->files = init_task.files; + atomic_inc(&current->files->count); } /* diff -X dontdiff -ruN linux-2.4.17/kernel/fork.c linux-2.4.17-lse02-D/kernel/fork.c --- linux-2.4.17/kernel/fork.c Wed Nov 21 10:18:42 2001 +++ linux-2.4.17-lse02-D/kernel/fork.c Thu Apr 4 17:37:18 2002 @@ -28,7 +28,6 @@ /* The idle threads do not count.. */ int nr_threads; -int nr_running; int max_threads; unsigned long total_forks; /* Handle normal Linux uptimes.
*/ @@ -36,6 +35,8 @@ struct task_struct *pidhash[PIDHASH_SZ]; +rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */ + void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait) { unsigned long flags; @@ -563,6 +564,7 @@ struct pt_regs *regs, unsigned long stack_size) { int retval; + unsigned long flags; struct task_struct *p; struct completion vfork; @@ -611,8 +613,7 @@ copy_flags(clone_flags, p); p->pid = get_pid(clone_flags); - p->run_list.next = NULL; - p->run_list.prev = NULL; + INIT_LIST_HEAD(&p->run_list); p->p_cptr = NULL; init_waitqueue_head(&p->wait_chldexit); @@ -638,14 +639,15 @@ #ifdef CONFIG_SMP { int i; - p->cpus_runnable = ~0UL; - p->processor = current->processor; + /* ?? should we just memset this ?? */ for(i = 0; i < smp_num_cpus; i++) - p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0; + p->per_cpu_utime[cpu_logical_map(i)] = + p->per_cpu_stime[cpu_logical_map(i)] = 0; spin_lock_init(&p->sigmask_lock); } #endif + p->array = NULL; p->lock_depth = -1; /* -1 = no lock */ p->start_time = jiffies; @@ -677,15 +679,27 @@ p->pdeath_signal = 0; /* - * "share" dynamic priority between parent and child, thus the - * total amount of dynamic priorities in the system doesnt change, - * more scheduling fairness. This is only important in the first - * timeslice, on the long run the scheduling behaviour is unchanged. + * Share the timeslice between parent and child, thus the + * total amount of pending timeslices in the system doesnt change, + * resulting in more scheduling fairness. */ - p->counter = (current->counter + 1) >> 1; - current->counter >>= 1; - if (!current->counter) - current->need_resched = 1; + __save_flags(flags); + __cli(); + if (!current->time_slice) + BUG(); + p->time_slice = (current->time_slice + 1) >> 1; + current->time_slice >>= 1; + if (!current->time_slice) { + /* + * This case is rare, it happens when the parent has only + * a single jiffy left from its timeslice. Taking the + * runqueue lock is not a problem. 
+ */ + current->time_slice = 1; + scheduler_tick(0,0); + } + p->sleep_timestamp = jiffies; + __restore_flags(flags); /* * Ok, add it to the run-queues and make it @@ -722,10 +736,23 @@ if (p->ptrace & PT_PTRACED) send_sig(SIGSTOP, p, 1); +#define RUN_CHILD_FIRST 1 +#if RUN_CHILD_FIRST + wake_up_forked_process(p); /* do this last */ +#else wake_up_process(p); /* do this last */ +#endif ++total_forks; if (clone_flags & CLONE_VFORK) wait_for_completion(&vfork); +#if RUN_CHILD_FIRST + else + /* + * Let the child process run first, to avoid most of the + * COW overhead when the child exec()s afterwards. + */ + current->need_resched = 1; +#endif fork_out: return retval; diff -X dontdiff -ruN linux-2.4.17/kernel/ksyms.c linux-2.4.17-lse02-D/kernel/ksyms.c --- linux-2.4.17/kernel/ksyms.c Fri Dec 21 09:42:04 2001 +++ linux-2.4.17-lse02-D/kernel/ksyms.c Thu Apr 4 17:37:18 2002 @@ -122,6 +122,8 @@ EXPORT_SYMBOL(kunmap_high); EXPORT_SYMBOL(highmem_start_page); EXPORT_SYMBOL(create_bounce); +EXPORT_SYMBOL(kmap_prot); +EXPORT_SYMBOL(kmap_pte); #endif /* filesystem internal functions */ @@ -378,6 +380,7 @@ EXPORT_SYMBOL(del_timer_sync); #endif EXPORT_SYMBOL(mod_timer); +EXPORT_SYMBOL(tvec_bases); EXPORT_SYMBOL(tq_timer); EXPORT_SYMBOL(tq_immediate); @@ -437,6 +440,9 @@ EXPORT_SYMBOL(interruptible_sleep_on_timeout); EXPORT_SYMBOL(schedule); EXPORT_SYMBOL(schedule_timeout); +EXPORT_SYMBOL(sys_sched_yield); +EXPORT_SYMBOL(set_user_nice); +EXPORT_SYMBOL(set_cpus_allowed); EXPORT_SYMBOL(jiffies); EXPORT_SYMBOL(xtime); EXPORT_SYMBOL(do_gettimeofday); @@ -448,6 +454,7 @@ EXPORT_SYMBOL(kstat); EXPORT_SYMBOL(nr_running); +EXPORT_SYMBOL(nr_context_switches); /* misc */ EXPORT_SYMBOL(panic); diff -X dontdiff -ruN linux-2.4.17/kernel/printk.c linux-2.4.17-lse02-D/kernel/printk.c --- linux-2.4.17/kernel/printk.c Fri Dec 21 09:42:04 2001 +++ linux-2.4.17-lse02-D/kernel/printk.c Thu Apr 4 17:37:18 2002 @@ -25,6 +25,7 @@ #include #include /* For in_interrupt() */ #include +#include #include diff 
-X dontdiff -ruN linux-2.4.17/kernel/ptrace.c linux-2.4.17-lse02-D/kernel/ptrace.c --- linux-2.4.17/kernel/ptrace.c Fri Dec 21 09:42:04 2001 +++ linux-2.4.17-lse02-D/kernel/ptrace.c Thu Apr 4 17:37:18 2002 @@ -31,20 +31,7 @@ if (child->state != TASK_STOPPED) return -ESRCH; #ifdef CONFIG_SMP - /* Make sure the child gets off its CPU.. */ - for (;;) { - task_lock(child); - if (!task_has_cpu(child)) - break; - task_unlock(child); - do { - if (child->state != TASK_STOPPED) - return -ESRCH; - barrier(); - cpu_relax(); - } while (task_has_cpu(child)); - } - task_unlock(child); + wait_task_inactive(child); #endif } diff -X dontdiff -ruN linux-2.4.17/kernel/sched.c linux-2.4.17-lse02-D/kernel/sched.c --- linux-2.4.17/kernel/sched.c Fri Dec 21 09:42:04 2001 +++ linux-2.4.17-lse02-D/kernel/sched.c Thu Apr 4 17:37:18 2002 @@ -12,333 +12,306 @@ * 1998-12-28 Implemented better SMP scheduling by Ingo Molnar */ -/* - * 'sched.c' is the main kernel file. It contains scheduling primitives - * (sleep_on, wakeup, schedule etc) as well as a number of simple system - * call functions (type getpid()), which just extract a field from - * current-task - */ - -#include #include +#include #include +#include #include -#include #include -#include -#include -#include -#include - -#include #include - -extern void timer_bh(void); -extern void tqueue_bh(void); -extern void immediate_bh(void); +#include /* - * scheduler variables + * Priority of a process goes from 0 to 139. The 0-99 + * priority range is allocated to RT tasks, the 100-139 + * range is for SCHED_OTHER tasks. Priority values are + * inverted: lower p->prio value means higher priority. */ - -unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */ - -extern void mem_use(void); +#define MAX_RT_PRIO 100 +#define MAX_PRIO (MAX_RT_PRIO + 40) /* - * Scheduling quanta. - * - * NOTE! The unix "nice" value influences how long a process - * gets. 
The nice value ranges from -20 to +19, where a -20 - * is a "high-priority" task, and a "+10" is a low-priority - * task. - * - * We want the time-slice to be around 50ms or so, so this - * calculation depends on the value of HZ. + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ 100 ... 139 (MAX_PRIO-1) ], + * and back. */ -#if HZ < 200 -#define TICK_SCALE(x) ((x) >> 2) -#elif HZ < 400 -#define TICK_SCALE(x) ((x) >> 1) -#elif HZ < 800 -#define TICK_SCALE(x) (x) -#elif HZ < 1600 -#define TICK_SCALE(x) ((x) << 1) -#else -#define TICK_SCALE(x) ((x) << 2) -#endif - -#define NICE_TO_TICKS(nice) (TICK_SCALE(20-(nice))+1) +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) +/* + * 'User priority' is the nice value converted to something we + * can work with better when scaling various scheduler parameters, + * it's a [ 0 ... 39 ] range. + */ +#define USER_PRIO(p) ((p)-MAX_RT_PRIO) +#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) +#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) /* - * Init task must be ok at boot for the ix86 as we will check its signals - * via the SMP irq return path. + * These are the 'tuning knobs' of the scheduler: + * + * Minimum timeslice is 10 msecs, default timeslice is 150 msecs, + * maximum timeslice is 300 msecs. Timeslices get refilled after + * they expire. */ - -struct task_struct * init_tasks[NR_CPUS] = {&init_task, }; +#define MIN_TIMESLICE ( 10 * HZ / 1000) +#define MAX_TIMESLICE (300 * HZ / 1000) +#define CHILD_PENALTY 95 +#define PARENT_PENALTY 100 +#define EXIT_WEIGHT 3 +#define PRIO_BONUS_RATIO 25 +#define INTERACTIVE_DELTA 2 +#define MAX_SLEEP_AVG (2*HZ) +#define STARVATION_LIMIT (2*HZ) /* - * The tasklist_lock protects the linked list of processes. + * If a task is 'interactive' then we reinsert it in the active + * array after it has expired its current timeslice. 
(it will not + * continue to run immediately, it will still roundrobin with + * other interactive tasks.) * - * The runqueue_lock locks the parts that actually access - * and change the run-queues, and have to be interrupt-safe. + * This part scales the interactivity limit depending on niceness. * - * If both locks are to be concurrently held, the runqueue_lock - * nests inside the tasklist_lock. + * We scale it linearly, offset by the INTERACTIVE_DELTA delta. + * Here are a few examples of different nice levels: * - * task->alloc_lock nests inside tasklist_lock. + * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] + * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] + * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] + * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] + * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] + * + * (the X axis represents the possible -5 ... 0 ... +5 dynamic + * priority range a task can explore, a value of '1' means the + * task is rated interactive.) + * + * Ie. nice +19 tasks can never get 'interactive' enough to be + * reinserted into the active array. And only heavily CPU-hog nice -20 + * tasks will be expired. Default nice 0 tasks are somewhere between, + * it takes some effort for them to get interactive, but it's not + * too hard. */ -spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */ -rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */ - -static LIST_HEAD(runqueue_head); -/* - * We align per-CPU scheduling data on cacheline boundaries, - * to prevent cacheline ping-pong. 
- */ -static union { - struct schedule_data { - struct task_struct * curr; - cycles_t last_schedule; - } schedule_data; - char __pad [SMP_CACHE_BYTES]; -} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}}; +#define SCALE(v1,v1_max,v2_max) \ + (v1) * (v2_max) / (v1_max) -#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr -#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule +#define DELTA(p) \ + (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \ + INTERACTIVE_DELTA) -struct kernel_stat kstat; -extern struct task_struct *child_reaper; +#define TASK_INTERACTIVE(p) \ + ((p)->prio <= (p)->static_prio - DELTA(p)) -#ifdef CONFIG_SMP +/* + * TASK_TIMESLICE scales user-nice values [ -20 ... 19 ] + * to time slice values. + * + * The higher a process's priority, the bigger timeslices + * it gets during one round of execution. But even the lowest + * priority process gets MIN_TIMESLICE worth of execution time. + */ -#define idle_task(cpu) (init_tasks[cpu_number_map(cpu)]) -#define can_schedule(p,cpu) \ - ((p)->cpus_runnable & (p)->cpus_allowed & (1 << cpu)) +#define TASK_TIMESLICE(p) (MIN_TIMESLICE + \ + ((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/39)) -#else +/* + * These are the runqueue data structures: + */ -#define idle_task(cpu) (&init_task) -#define can_schedule(p,cpu) (1) +#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) -#endif +typedef struct runqueue runqueue_t; -void scheduling_functions_start_here(void) { } +struct prio_array { + int nr_active; + spinlock_t *lock; + runqueue_t *rq; + unsigned long bitmap[BITMAP_SIZE]; + list_t queue[MAX_PRIO]; +}; /* - * This is the function that decides how desirable a process is.. - * You can weigh different processes against each other depending - * on what CPU they've run on lately etc to try to handle cache - * and TLB miss penalties. + * This is the main, per-CPU runqueue data structure. 
* - * Return values: - * -1000: never select this - * 0: out of time, recalculate counters (but it might still be - * selected) - * +ve: "goodness" value (the larger, the better) - * +1000: realtime process, select this. + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the process migration code), lock + * acquire operations must be ordered by ascending &runqueue. */ +struct runqueue { + spinlock_t lock; + unsigned long nr_running, nr_switches, expired_timestamp; + task_t *curr, *idle; + prio_array_t *active, *expired, arrays[2]; + int prev_nr_running[NR_CPUS]; +} ____cacheline_aligned; -static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm) -{ - int weight; +static struct runqueue runqueues[NR_CPUS] __cacheline_aligned; - /* - * select the current process after every other - * runnable process, but before the idle thread. - * Also, dont trigger a counter recalculation. - */ - weight = -1; - if (p->policy & SCHED_YIELD) - goto out; +#define cpu_rq(cpu) (runqueues + (cpu)) +#define this_rq() cpu_rq(smp_processor_id()) +#define task_rq(p) cpu_rq((p)->cpu) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) +#define rt_task(p) ((p)->prio < MAX_RT_PRIO) - /* - * Non-RT process - normal case first. - */ - if (p->policy == SCHED_OTHER) { - /* - * Give the process a first-approximation goodness value - * according to the number of clock-ticks it has left. - * - * Don't do any other calculations if the time slice is - * over.. - */ - weight = p->counter; - if (!weight) - goto out; - -#ifdef CONFIG_SMP - /* Give a largish advantage to the same processor... */ - /* (this is equivalent to penalizing other processors) */ - if (p->processor == this_cpu) - weight += PROC_CHANGE_PENALTY; -#endif +static inline runqueue_t *lock_task_rq(task_t *p, unsigned long *flags) +{ + struct runqueue *__rq; - /* .. 
and a slight advantage to the current MM */ - if (p->mm == this_mm || !p->mm) - weight += 1; - weight += 20 - p->nice; - goto out; +repeat_lock_task: + __rq = task_rq(p); + spin_lock_irqsave(&__rq->lock, *flags); + if (unlikely(__rq != task_rq(p))) { + spin_unlock_irqrestore(&__rq->lock, *flags); + goto repeat_lock_task; } + return __rq; +} - /* - * Realtime process, select the first one on the - * runqueue (taking priorities within processes - * into account). - */ - weight = 1000 + p->rt_priority; -out: - return weight; +static inline void unlock_task_rq(runqueue_t *rq, unsigned long *flags) +{ + spin_unlock_irqrestore(&rq->lock, *flags); } /* - * the 'goodness value' of replacing a process on a given CPU. - * positive value means 'replace', zero or negative means 'dont'. + * Adding/removing a task to/from a priority array: */ -static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu) +static inline void dequeue_task(struct task_struct *p, prio_array_t *array) { - return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm); + array->nr_active--; + list_del_init(&p->run_list); + if (list_empty(array->queue + p->prio)) + __clear_bit(p->prio, array->bitmap); } -/* - * This is ugly, but reschedule_idle() is very timing-critical. - * We are called with the runqueue spinlock held and we must - * not claim the tasklist_lock. 
- */ -static FASTCALL(void reschedule_idle(struct task_struct * p)); +static inline void enqueue_task(struct task_struct *p, prio_array_t *array) +{ + list_add_tail(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->array = array; +} -static void reschedule_idle(struct task_struct * p) +static inline int effective_prio(task_t *p) { -#ifdef CONFIG_SMP - int this_cpu = smp_processor_id(); - struct task_struct *tsk, *target_tsk; - int cpu, best_cpu, i, max_prio; - cycles_t oldest_idle; + int bonus, prio; /* - * shortcut if the woken up task's last CPU is - * idle now. + * Here we scale the actual sleep average [0 .... MAX_SLEEP_AVG] + * into the -5 ... 0 ... +5 bonus/penalty range. + * + * We use 25% of the full 0...39 priority range so that: + * + * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. + * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. + * + * Both properties are important to certain workloads. */ - best_cpu = p->processor; - if (can_schedule(p, best_cpu)) { - tsk = idle_task(best_cpu); - if (cpu_curr(best_cpu) == tsk) { - int need_resched; -send_now_idle: - /* - * If need_resched == -1 then we can skip sending - * the IPI altogether, tsk->need_resched is - * actively watched by the idle thread. - */ - need_resched = tsk->need_resched; - tsk->need_resched = 1; - if ((best_cpu != this_cpu) && !need_resched) - smp_send_reschedule(best_cpu); - return; - } - } + bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 - + MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2; - /* - * We know that the preferred CPU has a cache-affine current - * process, lets try to find a new idle CPU for the woken-up - * process. Select the least recently active idle CPU. (that - * one will have the least active cache context.) Also find - * the executing process which has the least priority. 
- */ - oldest_idle = (cycles_t) -1; - target_tsk = NULL; - max_prio = 0; + prio = p->static_prio - bonus; + if (prio < MAX_RT_PRIO) + prio = MAX_RT_PRIO; + if (prio > MAX_PRIO-1) + prio = MAX_PRIO-1; + return prio; +} - for (i = 0; i < smp_num_cpus; i++) { - cpu = cpu_logical_map(i); - if (!can_schedule(p, cpu)) - continue; - tsk = cpu_curr(cpu); +static inline void activate_task(task_t *p, runqueue_t *rq) +{ + unsigned long sleep_time = jiffies - p->sleep_timestamp; + prio_array_t *array = rq->active; + + if (!rt_task(p) && sleep_time) { /* - * We use the first available idle CPU. This creates - * a priority list between idle CPUs, but this is not - * a problem. + * This code gives a bonus to interactive tasks. We update + * an 'average sleep time' value here, based on + * sleep_timestamp. The more time a task spends sleeping, + * the higher the average gets - and the higher the priority + * boost gets as well. */ - if (tsk == idle_task(cpu)) { -#if defined(__i386__) && defined(CONFIG_SMP) - /* - * Check if two siblings are idle in the same - * physical package. Use them if found. 
- */ - if (smp_num_siblings == 2) { - if (cpu_curr(cpu_sibling_map[cpu]) == - idle_task(cpu_sibling_map[cpu])) { - oldest_idle = last_schedule(cpu); - target_tsk = tsk; - break; - } - - } -#endif - if (last_schedule(cpu) < oldest_idle) { - oldest_idle = last_schedule(cpu); - target_tsk = tsk; - } - } else { - if (oldest_idle == -1ULL) { - int prio = preemption_goodness(tsk, p, cpu); - - if (prio > max_prio) { - max_prio = prio; - target_tsk = tsk; - } - } - } - } - tsk = target_tsk; - if (tsk) { - if (oldest_idle != -1ULL) { - best_cpu = tsk->processor; - goto send_now_idle; - } - tsk->need_resched = 1; - if (tsk->processor != this_cpu) - smp_send_reschedule(tsk->processor); + p->sleep_avg += sleep_time; + if (p->sleep_avg > MAX_SLEEP_AVG) + p->sleep_avg = MAX_SLEEP_AVG; + p->prio = effective_prio(p); } - return; - + enqueue_task(p, array); + rq->nr_running++; +} -#else /* UP */ - int this_cpu = smp_processor_id(); - struct task_struct *tsk; - - tsk = cpu_curr(this_cpu); - if (preemption_goodness(tsk, p, this_cpu) > 0) - tsk->need_resched = 1; -#endif +static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) +{ + rq->nr_running--; + dequeue_task(p, p->array); + p->array = NULL; } +static inline void resched_task(task_t *p) +{ + int need_resched; + + need_resched = p->need_resched; + wmb(); + p->need_resched = 1; + if (!need_resched && (p->cpu != smp_processor_id())) + smp_send_reschedule(p->cpu); +} + +#ifdef CONFIG_SMP + /* - * Careful! - * - * This has to add the process to the _beginning_ of the - * run-queue, not the end. See the comment about "This is - * subtle" in the scheduler proper.. + * Wait for a process to unschedule. This is used by the exit() and + * ptrace() code. 
*/ -static inline void add_to_runqueue(struct task_struct * p) +void wait_task_inactive(task_t * p) { - list_add(&p->run_list, &runqueue_head); - nr_running++; + unsigned long flags; + runqueue_t *rq; + +repeat: + rq = task_rq(p); + while (unlikely(rq->curr == p)) { + cpu_relax(); + barrier(); + } + rq = lock_task_rq(p, &flags); + if (unlikely(rq->curr == p)) { + unlock_task_rq(rq, &flags); + goto repeat; + } + unlock_task_rq(rq, &flags); } -static inline void move_last_runqueue(struct task_struct * p) +/* + * The SMP message passing code calls this function whenever + * the new task has arrived at the target CPU. We move the + * new task into the local runqueue. + * + * This function must be called with interrupts disabled. + */ +void sched_task_migrated(task_t *new_task) { - list_del(&p->run_list); - list_add_tail(&p->run_list, &runqueue_head); + wait_task_inactive(new_task); + new_task->cpu = smp_processor_id(); + wake_up_process(new_task); } -static inline void move_first_runqueue(struct task_struct * p) +/* + * Kick the remote CPU if the task is running currently, + * this code is used by the signal code to signal tasks + * which are in user-mode as quickly as possible. + * + * (Note that we do this lockless - if the task does anything + * while the message is in flight then it will notice the + * sigpending condition anyway.) + */ +void kick_if_running(task_t * p) { - list_del(&p->run_list); - list_add(&p->run_list, &runqueue_head); + if (p == task_rq(p)->curr) + resched_task(p); } +#endif /* * Wake up a process. Put it on the run-queue if it's not @@ -348,392 +321,528 @@ * "current->state = TASK_RUNNING" to mark yourself runnable * without the overhead of this. */ -static inline int try_to_wake_up(struct task_struct * p, int synchronous) +static int try_to_wake_up(task_t * p, int synchronous) { unsigned long flags; int success = 0; + runqueue_t *rq; - /* - * We want the common case fall through straight, thus the goto. 
- */ - spin_lock_irqsave(&runqueue_lock, flags); + rq = lock_task_rq(p, &flags); p->state = TASK_RUNNING; - if (task_on_runqueue(p)) - goto out; - add_to_runqueue(p); - if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id()))) - reschedule_idle(p); - success = 1; -out: - spin_unlock_irqrestore(&runqueue_lock, flags); + if (!p->array) { + activate_task(p, rq); + if ((rq->curr == rq->idle) || (p->prio < rq->curr->prio)) + resched_task(rq->curr); + success = 1; + } + unlock_task_rq(rq, &flags); return success; } -inline int wake_up_process(struct task_struct * p) +int wake_up_process(task_t * p) { return try_to_wake_up(p, 0); } -static void process_timeout(unsigned long __data) +void wake_up_forked_process(task_t * p) { - struct task_struct * p = (struct task_struct *) __data; + runqueue_t *rq = this_rq(); - wake_up_process(p); + p->state = TASK_RUNNING; + if (!rt_task(p)) { + /* + * We decrease the sleep average of forking parents + * and children as well, to keep max-interactive tasks + * from forking tasks that are max-interactive. + */ + current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100; + p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100; + p->prio = effective_prio(p); + } + spin_lock_irq(&rq->lock); + p->cpu = smp_processor_id(); + activate_task(p, rq); + spin_unlock_irq(&rq->lock); } -/** - * schedule_timeout - sleep until timeout - * @timeout: timeout value in jiffies - * - * Make the current task sleep until @timeout jiffies have - * elapsed. The routine will return immediately unless - * the current task state has been set (see set_current_state()). - * - * You can set the task state as follows - - * - * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to - * pass before the routine returns. The routine will return 0 - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. 
In this case the remaining time - * in jiffies will be returned, or 0 if the timer expired in time - * - * The current task state is guaranteed to be TASK_RUNNING when this - * routine returns. - * - * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule - * the CPU away without a bound on the timeout. In this case the return - * value will be %MAX_SCHEDULE_TIMEOUT. +/* + * Potentially available exiting-child timeslices are + * retrieved here - this way the parent does not get + * penalized for creating too many processes. * - * In all cases the return value is guaranteed to be non-negative. + * (this cannot be used to 'generate' timeslices + * artificially, because any timeslice recovered here + * was given away by the parent in the first place.) */ -signed long schedule_timeout(signed long timeout) +void sched_exit(task_t * p) { - struct timer_list timer; - unsigned long expire; + __cli(); + current->time_slice += p->time_slice; + if (unlikely(current->time_slice > MAX_TIMESLICE)) + current->time_slice = MAX_TIMESLICE; + __sti(); + /* + * If the child was a (relative-) CPU hog then decrease + * the sleep_avg of the parent as well. + */ + if (p->sleep_avg < current->sleep_avg) + current->sleep_avg = (current->sleep_avg * EXIT_WEIGHT + + p->sleep_avg) / (EXIT_WEIGHT + 1); +} - switch (timeout) - { - case MAX_SCHEDULE_TIMEOUT: - /* - * These two special cases are useful to be comfortable - * in the caller. Nothing more. We could take - * MAX_SCHEDULE_TIMEOUT from one of the negative value - * but I' d like to return a valid offset (>=0) to allow - * the caller to do everything it want with the retval. - */ - schedule(); - goto out; - default: - /* - * Another bit of PARANOID. Note that the retval will be - * 0 since no piece of kernel is supposed to do a check - * for a negative retval of schedule_timeout() (since it - * should never happens anyway). You just have the printk() - * that will tell you if something is gone wrong and where. 
- */ - if (timeout < 0) - { - printk(KERN_ERR "schedule_timeout: wrong timeout " - "value %lx from %p\n", timeout, - __builtin_return_address(0)); - current->state = TASK_RUNNING; - goto out; - } - } +#if CONFIG_SMP +asmlinkage void schedule_tail(task_t *prev) +{ + spin_unlock_irq(&this_rq()->lock); +} +#endif - expire = timeout + jiffies; +static inline void context_switch(task_t *prev, task_t *next) +{ + struct mm_struct *mm = next->mm; + struct mm_struct *oldmm = prev->active_mm; - init_timer(&timer); - timer.expires = expire; - timer.data = (unsigned long) current; - timer.function = process_timeout; + prepare_to_switch(); - add_timer(&timer); - schedule(); - del_timer_sync(&timer); + if (unlikely(!mm)) { + next->active_mm = oldmm; + atomic_inc(&oldmm->mm_count); + enter_lazy_tlb(oldmm, next, smp_processor_id()); + } else + switch_mm(oldmm, mm, next, smp_processor_id()); - timeout = expire - jiffies; + if (unlikely(!prev->mm)) { + prev->active_mm = NULL; + mmdrop(oldmm); + } - out: - return timeout < 0 ? 0 : timeout; + /* + * Here we just switch the register state and the stack. There are + * 3 processes affected by a context switch: + * + * prev ==> .... ==> (last => next) + * + * It's the 'much more previous' 'prev' that is on next's stack, + * but prev is set to (the just run) 'last' process by switch_to(). + * This might sound slightly confusing but makes tons of sense. + */ + switch_to(prev, next, prev); } -/* - * schedule_tail() is getting called from the fork return path. This - * cleans up all remaining scheduler things, without impacting the - * common case. - */ -static inline void __schedule_tail(struct task_struct *prev) +unsigned long nr_running(void) { -#ifdef CONFIG_SMP - int policy; - - /* - * prev->policy can be written from here only before `prev' - * can be scheduled (before setting prev->cpus_runnable to ~0UL). 
- * Of course it must also be read before allowing prev - * to be rescheduled, but since the write depends on the read - * to complete, wmb() is enough. (the spin_lock() acquired - * before setting cpus_runnable is not enough because the spin_lock() - * common code semantics allows code outside the critical section - * to enter inside the critical section) - */ - policy = prev->policy; - prev->policy = policy & ~SCHED_YIELD; - wmb(); + unsigned long i, sum = 0; - /* - * fast path falls through. We have to clear cpus_runnable before - * checking prev->state to avoid a wakeup race. Protect against - * the task exiting early. - */ - task_lock(prev); - task_release_cpu(prev); - mb(); - if (prev->state == TASK_RUNNING) - goto needs_resched; + for (i = 0; i < smp_num_cpus; i++) + sum += cpu_rq(cpu_logical_map(i))->nr_running; -out_unlock: - task_unlock(prev); /* Synchronise here with release_task() if prev is TASK_ZOMBIE */ - return; + return sum; +} - /* - * Slow path - we 'push' the previous process and - * reschedule_idle() will attempt to find a new - * processor for it. (but it might preempt the - * current process as well.) We must take the runqueue - * lock and re-check prev->state to be correct. It might - * still happen that this process has a preemption - * 'in progress' already - but this is not a problem and - * might happen in other circumstances as well. 
- */ -needs_resched: - { - unsigned long flags; +unsigned long nr_context_switches(void) +{ + unsigned long i, sum = 0; - /* - * Avoid taking the runqueue lock in cases where - * no preemption-check is necessery: - */ - if ((prev == idle_task(smp_processor_id())) || - (policy & SCHED_YIELD)) - goto out_unlock; + for (i = 0; i < smp_num_cpus; i++) + sum += cpu_rq(cpu_logical_map(i))->nr_switches; - spin_lock_irqsave(&runqueue_lock, flags); - if ((prev->state == TASK_RUNNING) && !task_has_cpu(prev)) - reschedule_idle(prev); - spin_unlock_irqrestore(&runqueue_lock, flags); - goto out_unlock; - } -#else - prev->policy &= ~SCHED_YIELD; -#endif /* CONFIG_SMP */ + return sum; } -asmlinkage void schedule_tail(struct task_struct *prev) +#if CONFIG_SMP +/* + * Lock the busiest runqueue as well, this_rq is locked already. + * Recalculate nr_running if we have to drop the runqueue lock. + */ +static inline unsigned int double_lock_balance(runqueue_t *this_rq, + runqueue_t *busiest, int this_cpu, int idle, unsigned int nr_running) { - __schedule_tail(prev); + if (unlikely(!spin_trylock(&busiest->lock))) { + if (busiest < this_rq) { + spin_unlock(&this_rq->lock); + spin_lock(&busiest->lock); + spin_lock(&this_rq->lock); + /* Need to recalculate nr_running */ + if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu])) + nr_running = this_rq->nr_running; + else + nr_running = this_rq->prev_nr_running[this_cpu]; + } else + spin_lock(&busiest->lock); + } + return nr_running; } /* - * 'schedule()' is the scheduler function. It's a very simple and nice - * scheduler: it's not perfect, but certainly works for most things. + * Current runqueue is empty, or rebalance tick: if there is an + * imbalance (current runqueue is too short) then pull from + * busiest runqueue(s). * - * The goto is "interesting". - * - * NOTE!! Task 0 is the 'idle' task, which gets called when no other - * tasks can run. It can not be killed, and it cannot sleep.
The 'state' - * information in task[0] is never used. + * We call this with the current runqueue locked, + * irqs disabled. */ -asmlinkage void schedule(void) +static void load_balance(runqueue_t *this_rq, int idle) { - struct schedule_data * sched_data; - struct task_struct *prev, *next, *p; - struct list_head *tmp; - int this_cpu, c; + int imbalance, nr_running, load, max_load, + idx, i, this_cpu = smp_processor_id(); + task_t *next = this_rq->idle, *tmp; + runqueue_t *busiest, *rq_src; + prio_array_t *array; + list_t *head, *curr; + /* + * We search all runqueues to find the most busy one. + * We do this lockless to reduce cache-bouncing overhead, + * we re-check the 'best' source CPU later on again, with + * the lock held. + * + * We fend off statistical fluctuations in runqueue lengths by + * saving the runqueue length during the previous load-balancing + * operation and using the smaller one of the current and saved lengths. + * If a runqueue is long enough for a longer amount of time then + * we recognize it and pull tasks from it. + * + * The 'current runqueue length' is a statistical maximum variable, + * for that one we take the longer one - to avoid fluctuations in + * the other direction. So for a load-balance to happen it needs + * stable long runqueue on the target CPU and stable short runqueue + * on the local runqueue. + * + * We make an exception if this CPU is about to become idle - in + * that case we are less picky about moving a task across CPUs and + * take what can be taken.
+ */ + if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu])) + nr_running = this_rq->nr_running; + else + nr_running = this_rq->prev_nr_running[this_cpu]; - spin_lock_prefetch(&runqueue_lock); + busiest = NULL; + max_load = 1; + for (i = 0; i < smp_num_cpus; i++) { + rq_src = cpu_rq(cpu_logical_map(i)); + if (idle || (rq_src->nr_running < this_rq->prev_nr_running[i])) + load = rq_src->nr_running; + else + load = this_rq->prev_nr_running[i]; + this_rq->prev_nr_running[i] = rq_src->nr_running; + + if ((load > max_load) && (rq_src != this_rq)) { + busiest = rq_src; + max_load = load; + } + } - if (!current->active_mm) BUG(); -need_resched_back: - prev = current; - this_cpu = prev->processor; + if (likely(!busiest)) + return; - if (unlikely(in_interrupt())) { - printk("Scheduling in interrupt\n"); - BUG(); - } + imbalance = (max_load - nr_running) / 2; - release_kernel_lock(prev, this_cpu); + /* It needs an at least ~25% imbalance to trigger balancing. */ + if (!idle && (imbalance < (max_load + 3)/4)) + return; + nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running); /* - * 'sched_data' is protected by the fact that we can run - * only one process per CPU. + * Make sure nothing changed since we checked the + * runqueue length. */ - sched_data = & aligned_data[this_cpu].schedule_data; + if (busiest->nr_running <= this_rq->nr_running + 1) + goto out_unlock; - spin_lock_irq(&runqueue_lock); + /* + * We first consider expired tasks. Those will likely not be + * executed in the near future, and they are most likely to + * be cache-cold, thus switching CPUs has the least effect + * on them. + */ + if (busiest->expired->nr_active) + array = busiest->expired; + else + array = busiest->active; - /* move an exhausted RR process to be last.. 
*/ - if (unlikely(prev->policy == SCHED_RR)) - if (!prev->counter) { - prev->counter = NICE_TO_TICKS(prev->nice); - move_last_runqueue(prev); +new_array: + /* Start searching at priority 0: */ + idx = 0; +skip_bitmap: + if (!idx) + idx = sched_find_first_bit(array->bitmap); + else + idx = find_next_bit(array->bitmap, MAX_PRIO, idx); + if (idx == MAX_PRIO) { + if (array == busiest->expired) { + array = busiest->active; + goto new_array; } - - switch (prev->state) { - case TASK_INTERRUPTIBLE: - if (signal_pending(prev)) { - prev->state = TASK_RUNNING; - break; - } - default: - del_from_runqueue(prev); - case TASK_RUNNING:; + goto out_unlock; } - prev->need_resched = 0; - - /* - * this is the scheduler proper: - */ -repeat_schedule: - /* - * Default process to select.. - */ - next = idle_task(this_cpu); - c = -1000; - list_for_each(tmp, &runqueue_head) { - p = list_entry(tmp, struct task_struct, run_list); - if (can_schedule(p, this_cpu)) { - int weight = goodness(p, this_cpu, prev->active_mm); - if (weight > c) - c = weight, next = p; + head = array->queue + idx; + curr = head->prev; +skip_queue: + tmp = list_entry(curr, task_t, run_list); + + /* + * We do not migrate tasks that are: + * 1) running (obviously), or + * 2) cannot be migrated to this CPU due to cpus_allowed, or + * 3) are cache-hot on their current CPU. 
+ */ + +#define CAN_MIGRATE_TASK(p,rq,this_cpu) \ + ((jiffies - (p)->sleep_timestamp > cache_decay_ticks) && \ + ((p) != (rq)->curr) && \ + (tmp->cpus_allowed & (1 << (this_cpu)))) + + if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) { + curr = curr->next; + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } + next = tmp; + /* + * take the task out of the other runqueue and + * put it into this one: + */ + dequeue_task(next, array); + busiest->nr_running--; + next->cpu = this_cpu; + this_rq->nr_running++; + enqueue_task(next, this_rq->active); + if (next->prio < current->prio) + current->need_resched = 1; + if (!idle && --imbalance) { + if (array == busiest->expired) { + array = busiest->active; + goto new_array; } } +out_unlock: + spin_unlock(&busiest->lock); +} + +/* + * One of the idle_cpu_tick() or the busy_cpu_tick() function will + * get called every timer tick, on every CPU. Our balancing action + * frequency and balancing aggressiveness depend on whether the CPU is + * idle or not. + * + * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. (or on + * systems with HZ=100, every 10 msecs.) + */ +#define BUSY_REBALANCE_TICK (HZ/4 ?: 1) +#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) + +static inline void idle_tick(void) +{ + if (jiffies % IDLE_REBALANCE_TICK) + return; + spin_lock(&this_rq()->lock); + load_balance(this_rq(), 1); + spin_unlock(&this_rq()->lock); +} + +#endif + +/* + * We place interactive tasks back into the active array, if possible. + * + * To guarantee that this does not starve expired tasks we ignore the + * interactivity of a task if the first expired task had to wait more + * than a 'reasonable' amount of time.
This deadline timeout is + * load-dependent, as the frequency of array switched decreases with + * increasing number of running tasks: + */ +#define EXPIRED_STARVING(rq) \ + ((rq)->expired_timestamp && \ + (jiffies - (rq)->expired_timestamp >= \ + STARVATION_LIMIT * ((rq)->nr_running) + 1)) + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. + */ +void scheduler_tick(int user_tick, int system) +{ + int cpu = smp_processor_id(); + runqueue_t *rq = this_rq(); + task_t *p = current; - /* Do we need to re-calculate counters? */ - if (unlikely(!c)) { - struct task_struct *p; - - spin_unlock_irq(&runqueue_lock); - read_lock(&tasklist_lock); - for_each_task(p) - p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice); - read_unlock(&tasklist_lock); - spin_lock_irq(&runqueue_lock); - goto repeat_schedule; + if (p == rq->idle) { + if (local_bh_count(cpu) || local_irq_count(cpu) > 1) + kstat.per_cpu_system[cpu] += system; +#if CONFIG_SMP + idle_tick(); +#endif + return; } + if (TASK_NICE(p) > 0) + kstat.per_cpu_nice[cpu] += user_tick; + else + kstat.per_cpu_user[cpu] += user_tick; + kstat.per_cpu_system[cpu] += system; + /* Task might have expired already, but not scheduled off yet */ + if (p->array != rq->active) { + p->need_resched = 1; + return; + } + spin_lock(&rq->lock); + if (unlikely(rt_task(p))) { + /* + * RR tasks need a special form of timeslice management. + * FIFO tasks have no timeslices. + */ + if ((p->policy == SCHED_RR) && !--p->time_slice) { + p->time_slice = TASK_TIMESLICE(p); + p->need_resched = 1; + + /* put it at the end of the queue: */ + dequeue_task(p, rq->active); + enqueue_task(p, rq->active); + } + goto out; + } /* - * from this point on nothing can prevent us from - * switching to the next task, save this fact in - * sched_data. 
- */ - sched_data->curr = next; - task_set_cpu(next, this_cpu); - spin_unlock_irq(&runqueue_lock); - - if (unlikely(prev == next)) { - /* We won't go through the normal tail, so do this by hand */ - prev->policy &= ~SCHED_YIELD; - goto same_process; + * The task was running during this tick - update the + * time slice counter and the sleep average. Note: we + * do not update a process's priority until it either + * goes to sleep or uses up its timeslice. This makes + * it possible for interactive tasks to use up their + * timeslices at their highest priority levels. + */ + if (p->sleep_avg) + p->sleep_avg--; + if (!--p->time_slice) { + dequeue_task(p, rq->active); + p->need_resched = 1; + p->prio = effective_prio(p); + p->time_slice = TASK_TIMESLICE(p); + + if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { + if (!rq->expired_timestamp) + rq->expired_timestamp = jiffies; + enqueue_task(p, rq->expired); + } else + enqueue_task(p, rq->active); } +out: +#if CONFIG_SMP + if (!(jiffies % BUSY_REBALANCE_TICK)) + load_balance(rq, 0); +#endif + spin_unlock(&rq->lock); +} -#ifdef CONFIG_SMP - /* - * maintain the per-process 'last schedule' value. - * (this has to be recalculated even if we reschedule to - * the same process) Currently this is only used on SMP, - * and it's approximate, so we do not have to maintain - * it while holding the runqueue spinlock. - */ - sched_data->last_schedule = get_cycles(); +void scheduling_functions_start_here(void) { } - /* - * We drop the scheduler lock early (it's a global spinlock), - * thus we have to lock the previous process from getting - * rescheduled during switch_to(). - */ +/* + * 'schedule()' is the main scheduler function. 
+ */ +asmlinkage void schedule(void) +{ + task_t *prev = current, *next; + runqueue_t *rq = this_rq(); + prio_array_t *array; + list_t *queue; + int idx; -#endif /* CONFIG_SMP */ + if (unlikely(in_interrupt())) + BUG(); + release_kernel_lock(prev, smp_processor_id()); + prev->sleep_timestamp = jiffies; + spin_lock_irq(&rq->lock); - kstat.context_swtch++; - /* - * there are 3 processes which are affected by a context switch: - * - * prev == .... ==> (last => next) - * - * It's the 'much more previous' 'prev' that is on next's stack, - * but prev is set to (the just run) 'last' process by switch_to(). - * This might sound slightly confusing but makes tons of sense. - */ - prepare_to_switch(); - { - struct mm_struct *mm = next->mm; - struct mm_struct *oldmm = prev->active_mm; - if (!mm) { - if (next->active_mm) BUG(); - next->active_mm = oldmm; - atomic_inc(&oldmm->mm_count); - enter_lazy_tlb(oldmm, next, this_cpu); - } else { - if (next->active_mm != mm) BUG(); - switch_mm(oldmm, mm, next, this_cpu); + switch (prev->state) { + case TASK_INTERRUPTIBLE: + if (unlikely(signal_pending(prev))) { + prev->state = TASK_RUNNING; + break; } + default: + deactivate_task(prev, rq); + case TASK_RUNNING: + ; + } +#if CONFIG_SMP +pick_next_task: +#endif + if (unlikely(!rq->nr_running)) { +#if CONFIG_SMP + load_balance(rq, 1); + if (rq->nr_running) + goto pick_next_task; +#endif + next = rq->idle; + rq->expired_timestamp = 0; + goto switch_tasks; + } - if (!prev->mm) { - prev->active_mm = NULL; - mmdrop(oldmm); - } + array = rq->active; + if (unlikely(!array->nr_active)) { + /* + * Switch the active and expired arrays. + */ + rq->active = rq->expired; + rq->expired = array; + array = rq->active; + rq->expired_timestamp = 0; } - /* - * This just switches the register state and the - * stack. 
- */ - switch_to(prev, next, prev); - __schedule_tail(prev); + idx = sched_find_first_bit(array->bitmap); + queue = array->queue + idx; + next = list_entry(queue->next, task_t, run_list); + +switch_tasks: + prefetch(next); + prev->need_resched = 0; + + if (likely(prev != next)) { + rq->nr_switches++; + rq->curr = next; + context_switch(prev, next); + /* + * The runqueue pointer might be from another CPU + * if the new task was last running on a different + * CPU - thus re-load it. + */ + barrier(); + rq = this_rq(); + } + spin_unlock_irq(&rq->lock); -same_process: reacquire_kernel_lock(current); - if (current->need_resched) - goto need_resched_back; return; } /* - * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just wake everything - * up. If it's an exclusive wakeup (nr_exclusive == small +ve number) then we wake all the - * non-exclusive tasks and one exclusive task. + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve + * number) then we wake all the non-exclusive tasks and one exclusive task. * * There are circumstances in which we can try to wake a task which has already - * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns zero - * in this (rare) case, and we handle it by contonuing to scan the queue. + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns + * zero in this (rare) case, and we handle it by continuing to scan the queue. 
*/ static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode, int nr_exclusive, const int sync) { struct list_head *tmp; - struct task_struct *p; + task_t *p; - CHECK_MAGIC_WQHEAD(q); - WQ_CHECK_LIST_HEAD(&q->task_list); - list_for_each(tmp,&q->task_list) { unsigned int state; - wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); + wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); - CHECK_MAGIC(curr->__magic); p = curr->task; state = p->state; - if (state & mode) { - WQ_NOTE_WAKER(curr); - if (try_to_wake_up(p, sync) && (curr->flags&WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) - break; - } + if ((state & mode) && + try_to_wake_up(p, sync) && + ((curr->flags & WQ_FLAG_EXCLUSIVE) && + !--nr_exclusive)) + break; } } @@ -850,8 +959,71 @@ return timeout; } +/* + * Change the current task's CPU affinity. Migrate the process to a + * proper CPU and schedule away if the current CPU is removed from + * the allowed bitmask. + */ +void set_cpus_allowed(task_t *p, unsigned long new_mask) +{ + new_mask &= cpu_online_map; + if (!new_mask) + BUG(); + if (p != current) + BUG(); + + p->cpus_allowed = new_mask; + /* + * Can the task run on the current CPU? If not then + * migrate the process off to a proper CPU. + */ + if (new_mask & (1UL << smp_processor_id())) + return; +#if CONFIG_SMP + current->state = TASK_UNINTERRUPTIBLE; + smp_migrate_task(__ffs(new_mask), current); + + schedule(); +#endif +} + void scheduling_functions_end_here(void) { } +void set_user_nice(task_t *p, long nice) +{ + unsigned long flags; + prio_array_t *array; + runqueue_t *rq; + + if (TASK_NICE(p) == nice || nice < -20 || nice > 19) + return; + /* + * We have to be careful, if called from sys_setpriority(), + * the task might be in the middle of scheduling on another CPU. 
+ */ + rq = lock_task_rq(p, &flags); + if (rt_task(p)) { + p->static_prio = NICE_TO_PRIO(nice); + goto out_unlock; + } + array = p->array; + if (array) + dequeue_task(p, array); + p->static_prio = NICE_TO_PRIO(nice); + p->prio = NICE_TO_PRIO(nice); + if (array) { + enqueue_task(p, array); + /* + * If the task is running and lowered its priority, + * or increased its priority then reschedule its CPU: + */ + if ((NICE_TO_PRIO(nice) < p->static_prio) || (p == rq->curr)) + resched_task(rq->curr); + } +out_unlock: + unlock_task_rq(rq, &flags); +} + #ifndef __alpha__ /* @@ -862,7 +1034,7 @@ asmlinkage long sys_nice(int increment) { - long newprio; + long nice; /* * Setpriority might change our priority at the same moment. @@ -878,32 +1050,46 @@ if (increment > 40) increment = 40; - newprio = current->nice + increment; - if (newprio < -20) - newprio = -20; - if (newprio > 19) - newprio = 19; - current->nice = newprio; + nice = PRIO_TO_NICE(current->static_prio) + increment; + if (nice < -20) + nice = -20; + if (nice > 19) + nice = 19; + set_user_nice(current, nice); return 0; } #endif -static inline struct task_struct *find_process_by_pid(pid_t pid) +/* + * This is the priority value as seen by users in /proc + * + * RT tasks are offset by -200. Normal tasks are centered + * around 0, value goes from -16 to +15. + */ +int task_prio(task_t *p) { - struct task_struct *tsk = current; + return p->prio - 100; +} - if (pid) - tsk = find_task_by_pid(pid); - return tsk; +int task_nice(task_t *p) +{ + return TASK_NICE(p); } -static int setscheduler(pid_t pid, int policy, - struct sched_param *param) +static inline task_t *find_process_by_pid(pid_t pid) +{ + return pid ? 
find_task_by_pid(pid) : current; +} + +static int setscheduler(pid_t pid, int policy, struct sched_param *param) { struct sched_param lp; - struct task_struct *p; + prio_array_t *array; + unsigned long flags; + runqueue_t *rq; int retval; + task_t *p; retval = -EINVAL; if (!param || pid < 0) @@ -917,14 +1103,19 @@ * We play safe to avoid deadlocks. */ read_lock_irq(&tasklist_lock); - spin_lock(&runqueue_lock); p = find_process_by_pid(pid); retval = -ESRCH; if (!p) - goto out_unlock; - + goto out_unlock_tasklist; + + /* + * To be able to change p->policy safely, the appropriate + * runqueue lock must be held. + */ + rq = lock_task_rq(p, &flags); + if (policy < 0) policy = p->policy; else { @@ -945,30 +1136,36 @@ goto out_unlock; retval = -EPERM; - if ((policy == SCHED_FIFO || policy == SCHED_RR) && + if ((policy == SCHED_FIFO || policy == SCHED_RR) && !capable(CAP_SYS_NICE)) goto out_unlock; if ((current->euid != p->euid) && (current->euid != p->uid) && !capable(CAP_SYS_NICE)) goto out_unlock; + array = p->array; + if (array) + deactivate_task(p, task_rq(p)); retval = 0; p->policy = policy; p->rt_priority = lp.sched_priority; - if (task_on_runqueue(p)) - move_first_runqueue(p); - - current->need_resched = 1; + if (rt_task(p)) + p->prio = 99 - p->rt_priority; + else + p->prio = p->static_prio; + if (array) + activate_task(p, task_rq(p)); out_unlock: - spin_unlock(&runqueue_lock); + unlock_task_rq(rq, &flags); +out_unlock_tasklist: read_unlock_irq(&tasklist_lock); out_nounlock: return retval; } -asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, +asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, struct sched_param *param) { return setscheduler(pid, policy, param); @@ -981,7 +1178,7 @@ asmlinkage long sys_sched_getscheduler(pid_t pid) { - struct task_struct *p; + task_t *p; int retval; retval = -EINVAL; @@ -992,7 +1189,7 @@ read_lock(&tasklist_lock); p = find_process_by_pid(pid); if (p) - retval = p->policy & ~SCHED_YIELD; + retval = p->policy; 
read_unlock(&tasklist_lock); out_nounlock: @@ -1001,7 +1198,7 @@ asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param) { - struct task_struct *p; + task_t *p; struct sched_param lp; int retval; @@ -1032,42 +1229,64 @@ asmlinkage long sys_sched_yield(void) { + task_t *prev = current, *next; + runqueue_t *rq = this_rq(); + prio_array_t *array; + list_t *queue; + + if (unlikely(prev->state != TASK_RUNNING)) { + schedule(); + return 0; + } + release_kernel_lock(prev, smp_processor_id()); + prev->sleep_timestamp = jiffies; /* - * Trick. sched_yield() first counts the number of truly - * 'pending' runnable processes, then returns if it's - * only the current processes. (This test does not have - * to be atomic.) In threaded applications this optimization - * gets triggered quite often. + * Decrease the yielding task's priority by one, to avoid + * livelocks. This priority loss is temporary, it's recovered + * once the current timeslice expires. + * + * If priority is already MAX_PRIO-1 then we still + * roundrobin the task within the runlist. */ + spin_lock_irq(&rq->lock); + array = current->array; + /* + * If the task has reached maximum priority (or is a RT task) + * then just requeue the task to the end of the runqueue: + */ + if (likely(current->prio == MAX_PRIO-1 || rt_task(current))) { + list_del(&current->run_list); + list_add_tail(&current->run_list, array->queue + current->prio); + } else { + list_del(&current->run_list); + if (list_empty(array->queue + current->prio)) + __clear_bit(current->prio, array->bitmap); + current->prio++; + list_add_tail(&current->run_list, array->queue + current->prio); + __set_bit(current->prio, array->bitmap); + } + /* + * Context-switch manually. 
This is equivalent to + * calling schedule(), but faster, because yield() + * knows lots of things that can be optimized away + * from the generic scheduler path: + */ + queue = array->queue + sched_find_first_bit(array->bitmap); + next = list_entry(queue->next, task_t, run_list); + prefetch(next); - int nr_pending = nr_running; - -#if CONFIG_SMP - int i; - - // Subtract non-idle processes running on other CPUs. - for (i = 0; i < smp_num_cpus; i++) { - int cpu = cpu_logical_map(i); - if (aligned_data[cpu].schedule_data.curr != idle_task(cpu)) - nr_pending--; + prev->need_resched = 0; + if (likely(prev != next)) { + rq->nr_switches++; + rq->curr = next; + context_switch(prev, next); + barrier(); + rq = this_rq(); } -#else - // on UP this process is on the runqueue as well - nr_pending--; -#endif - if (nr_pending) { - /* - * This process can only be rescheduled by us, - * so this is safe without any locking. - */ - if (current->policy == SCHED_OTHER) - current->policy |= SCHED_YIELD; - current->need_resched = 1; + spin_unlock_irq(&rq->lock); + + reacquire_kernel_lock(current); - spin_lock_irq(&runqueue_lock); - move_last_runqueue(current); - spin_unlock_irq(&runqueue_lock); - } return 0; } @@ -1105,7 +1324,7 @@ asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval) { struct timespec t; - struct task_struct *p; + task_t *p; int retval = -EINVAL; if (pid < 0) @@ -1115,8 +1334,8 @@ read_lock(&tasklist_lock); p = find_process_by_pid(pid); if (p) - jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : NICE_TO_TICKS(p->nice), - &t); + jiffies_to_timespec(p->policy & SCHED_FIFO ? + 0 : TASK_TIMESLICE(p), &t); read_unlock(&tasklist_lock); if (p) retval = copy_to_user(interval, &t, sizeof(t)) ? 
-EFAULT : 0; @@ -1124,14 +1343,14 @@ return retval; } -static void show_task(struct task_struct * p) +static void show_task(task_t * p) { unsigned long free = 0; int state; static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" }; printk("%-13.13s ", p->comm); - state = p->state ? ffz(~p->state) + 1 : 0; + state = p->state ? __ffs(p->state) + 1 : 0; if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *)) printk(stat_nam[state]); else @@ -1172,7 +1391,7 @@ printk(" (NOTLB)\n"); { - extern void show_trace_task(struct task_struct *tsk); + extern void show_trace_task(task_t *tsk); show_trace_task(p); } } @@ -1194,7 +1413,7 @@ void show_state(void) { - struct task_struct *p; + task_t *p; #if (BITS_PER_LONG == 32) printk("\n" @@ -1217,122 +1436,91 @@ read_unlock(&tasklist_lock); } -/** - * reparent_to_init() - Reparent the calling kernel thread to the init task. - * - * If a kernel thread is launched as a result of a system call, or if - * it ever exits, it should generally reparent itself to init so that - * it is correctly cleaned up on exit. - * - * The various task state such as scheduling policy and priority may have - * been inherited fro a user process, so we reset them to sane values here. - * - * NOTE that reparent_to_init() gives the caller full capabilities. - */ -void reparent_to_init(void) +/* HERE */ + +static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) { - struct task_struct *this_task = current; - - write_lock_irq(&tasklist_lock); - - /* Reparent to init */ - REMOVE_LINKS(this_task); - this_task->p_pptr = child_reaper; - this_task->p_opptr = child_reaper; - SET_LINKS(this_task); - - /* Set the exit signal to SIGCHLD so we signal init on exit */ - this_task->exit_signal = SIGCHLD; - - /* We also take the runqueue_lock while altering task fields - * which affect scheduling decisions */ - spin_lock(&runqueue_lock); - - this_task->ptrace = 0; - this_task->nice = DEF_NICE; - this_task->policy = SCHED_OTHER; - /* cpus_allowed? 
*/ - /* rt_priority? */ - /* signals? */ - this_task->cap_effective = CAP_INIT_EFF_SET; - this_task->cap_inheritable = CAP_INIT_INH_SET; - this_task->cap_permitted = CAP_FULL_SET; - this_task->keep_capabilities = 0; - memcpy(this_task->rlim, init_task.rlim, sizeof(*(this_task->rlim))); - this_task->user = INIT_USER; - - spin_unlock(&runqueue_lock); - write_unlock_irq(&tasklist_lock); + if (rq1 == rq2) + spin_lock(&rq1->lock); + else { + if (rq1 < rq2) { + spin_lock(&rq1->lock); + spin_lock(&rq2->lock); + } else { + spin_lock(&rq2->lock); + spin_lock(&rq1->lock); + } + } } - -/* - * Put all the gunge required to become a kernel thread without - * attached user resources in one place where it belongs. - */ - -void daemonize(void) + +static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) { - struct fs_struct *fs; - - - /* - * If we were started as result of loading a module, close all of the - * user space pages. We don't need them, and if we didn't close them - * they would be locked into memory. - */ - exit_mm(current); - - current->session = 1; - current->pgrp = 1; - current->tty = NULL; - - /* Become as one with the init task */ - - exit_fs(current); /* current->fs->count--; */ - fs = init_task.fs; - current->fs = fs; - atomic_inc(&fs->count); - exit_files(current); - current->files = init_task.files; - atomic_inc(&current->files->count); + spin_unlock(&rq1->lock); + if (rq1 != rq2) + spin_unlock(&rq2->lock); } - -extern unsigned long wait_init_idle; - -void __init init_idle(void) + +void __init init_idle(task_t *idle, int cpu) { - struct schedule_data * sched_data; - sched_data = &aligned_data[smp_processor_id()].schedule_data; + runqueue_t *idle_rq = cpu_rq(cpu), *rq = idle->array->rq; + unsigned long flags; - if (current != &init_task && task_on_runqueue(current)) { - printk("UGH! 
(%d:%d) was on the runqueue, removing.\n", - smp_processor_id(), current->pid); - del_from_runqueue(current); - } - sched_data->curr = current; - sched_data->last_schedule = get_cycles(); - clear_bit(current->processor, &wait_init_idle); + __save_flags(flags); + __cli(); + double_rq_lock(idle_rq, rq); + + idle_rq->curr = idle_rq->idle = idle; + deactivate_task(idle, rq); + idle->array = NULL; + idle->prio = MAX_PRIO; + idle->state = TASK_RUNNING; + idle->cpu = cpu; + double_rq_unlock(idle_rq, rq); + idle->need_resched = 1; + __restore_flags(flags); } -extern void init_timervecs (void); - +extern void init_timervecs(void); +extern void timer_bh(void); +extern void tqueue_bh(void); +extern void immediate_bh(void); + void __init sched_init(void) { - /* - * We have to do a little magic to get the first - * process right in SMP mode. - */ - int cpu = smp_processor_id(); - int nr; + runqueue_t *rq; + int i, j, k; - init_task.processor = cpu; - - for(nr = 0; nr < PIDHASH_SZ; nr++) - pidhash[nr] = NULL; - - init_timervecs(); + for (i = 0; i < NR_CPUS; i++) { + runqueue_t *rq = cpu_rq(i); + prio_array_t *array; + + rq->active = rq->arrays + 0; + rq->expired = rq->arrays + 1; + spin_lock_init(&rq->lock); + + for (j = 0; j < 2; j++) { + array = rq->arrays + j; + array->rq = rq; + array->lock = &rq->lock; + for (k = 0; k < MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(MAX_PRIO, array->bitmap); + } + } + /* + * We have to do a little magic to get the first + * process right in SMP mode. 
+ */ + rq = this_rq(); + rq->curr = current; + rq->idle = current; + wake_up_process(current); + + init_timers(); - init_bh(TIMER_BH, timer_bh); init_bh(TQUEUE_BH, tqueue_bh); init_bh(IMMEDIATE_BH, immediate_bh); @@ -1340,5 +1528,5 @@ * The boot idle thread does lazy MMU switching as well: */ atomic_inc(&init_mm.mm_count); - enter_lazy_tlb(&init_mm, current, cpu); + enter_lazy_tlb(&init_mm, current, smp_processor_id()); } diff -X dontdiff -ruN linux-2.4.17/kernel/signal.c linux-2.4.17-lse02-D/kernel/signal.c --- linux-2.4.17/kernel/signal.c Wed Nov 21 16:26:27 2001 +++ linux-2.4.17-lse02-D/kernel/signal.c Thu Apr 4 17:37:18 2002 @@ -478,12 +478,9 @@ * process of changing - but no harm is done by that * other than doing an extra (lightweight) IPI interrupt. */ - spin_lock(&runqueue_lock); - if (task_has_cpu(t) && t->processor != smp_processor_id()) - smp_send_reschedule(t->processor); - spin_unlock(&runqueue_lock); -#endif /* CONFIG_SMP */ - + if ((t->state == TASK_RUNNING) && (t->cpu != cpu())) + kick_if_running(t); +#endif if (t->state & TASK_INTERRUPTIBLE) { wake_up_process(t); return; diff -X dontdiff -ruN linux-2.4.17/kernel/softirq.c linux-2.4.17-lse02-D/kernel/softirq.c --- linux-2.4.17/kernel/softirq.c Wed Oct 31 10:26:02 2001 +++ linux-2.4.17-lse02-D/kernel/softirq.c Thu Apr 4 17:37:18 2002 @@ -259,10 +259,9 @@ while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { current->state = TASK_RUNNING; - do { - current->policy |= SCHED_YIELD; - schedule(); - } while (test_bit(TASKLET_STATE_SCHED, &t->state)); + do + sys_sched_yield(); + while (test_bit(TASKLET_STATE_SCHED, &t->state)); } tasklet_unlock_wait(t); clear_bit(TASKLET_STATE_SCHED, &t->state); @@ -365,13 +364,13 @@ int cpu = cpu_logical_map(bind_cpu); daemonize(); - current->nice = 19; + set_user_nice(current, 19); sigfillset(&current->blocked); /* Migrate to the right CPU */ - current->cpus_allowed = 1UL << cpu; - while (smp_processor_id() != cpu) - schedule(); + set_cpus_allowed(current, 1UL << cpu); + 
if (cpu() != cpu) + BUG(); sprintf(current->comm, "ksoftirqd_CPU%d", bind_cpu); @@ -396,7 +395,7 @@ } } -static __init int spawn_ksoftirqd(void) +__init int spawn_ksoftirqd(void) { int cpu; @@ -405,10 +404,8 @@ CLONE_FS | CLONE_FILES | CLONE_SIGNAL) < 0) printk("spawn_ksoftirqd() failed for cpu %d\n", cpu); else { - while (!ksoftirqd_task(cpu_logical_map(cpu))) { - current->policy |= SCHED_YIELD; - schedule(); - } + while (!ksoftirqd_task(cpu_logical_map(cpu))) + sys_sched_yield(); } } diff -X dontdiff -ruN linux-2.4.17/kernel/sys.c linux-2.4.17-lse02-D/kernel/sys.c --- linux-2.4.17/kernel/sys.c Tue Sep 18 14:10:43 2001 +++ linux-2.4.17-lse02-D/kernel/sys.c Thu Apr 4 17:37:18 2002 @@ -220,10 +220,10 @@ } if (error == -ESRCH) error = 0; - if (niceval < p->nice && !capable(CAP_SYS_NICE)) + if (niceval < task_nice(p) && !capable(CAP_SYS_NICE)) error = -EACCES; else - p->nice = niceval; + set_user_nice(p, niceval); } read_unlock(&tasklist_lock); @@ -249,7 +249,7 @@ long niceval; if (!proc_sel(p, which, who)) continue; - niceval = 20 - p->nice; + niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; } diff -X dontdiff -ruN linux-2.4.17/kernel/timer.c linux-2.4.17-lse02-D/kernel/timer.c --- linux-2.4.17/kernel/timer.c Mon Oct 8 10:41:41 2001 +++ linux-2.4.17-lse02-D/kernel/timer.c Thu Apr 4 17:37:18 2002 @@ -13,10 +13,15 @@ * serialize accesses to xtime/lost_ticks). * Copyright (C) 1998 Andrea Arcangeli * 1999-03-10 Improved NTP compatibility by Ulrich Windl + * 2000-10-05 Implemented scalable SMP per-CPU timer handling. + * Copyright (C) 2000 Ingo Molnar + * Designed by David S. 
Miller, Alexey Kuznetsov and Ingo Molnar */ #include + #include +#include #include #include #include @@ -25,6 +30,8 @@ #include +struct kernel_stat kstat; + /* * Timekeeping variables */ @@ -71,83 +78,49 @@ unsigned long prof_len; unsigned long prof_shift; -/* - * Event timer code - */ -#define TVN_BITS 6 -#define TVR_BITS 8 -#define TVN_SIZE (1 << TVN_BITS) -#define TVR_SIZE (1 << TVR_BITS) -#define TVN_MASK (TVN_SIZE - 1) -#define TVR_MASK (TVR_SIZE - 1) - -struct timer_vec { - int index; - struct list_head vec[TVN_SIZE]; -}; - -struct timer_vec_root { - int index; - struct list_head vec[TVR_SIZE]; -}; - -static struct timer_vec tv5; -static struct timer_vec tv4; -static struct timer_vec tv3; -static struct timer_vec tv2; -static struct timer_vec_root tv1; - -static struct timer_vec * const tvecs[] = { - (struct timer_vec *)&tv1, &tv2, &tv3, &tv4, &tv5 -}; - -#define NOOF_TVECS (sizeof(tvecs) / sizeof(tvecs[0])) - -void init_timervecs (void) -{ - int i; +tvec_base_t tvec_bases[NR_CPUS]; - for (i = 0; i < TVN_SIZE; i++) { - INIT_LIST_HEAD(tv5.vec + i); - INIT_LIST_HEAD(tv4.vec + i); - INIT_LIST_HEAD(tv3.vec + i); - INIT_LIST_HEAD(tv2.vec + i); - } - for (i = 0; i < TVR_SIZE; i++) - INIT_LIST_HEAD(tv1.vec + i); -} +/* jiffies at the most recent update of wall time */ +unsigned long wall_jiffies; -static unsigned long timer_jiffies; +/* + * This spinlock protect us from races in SMP while playing with xtime. -arca + */ +rwlock_t xtime_lock = RW_LOCK_UNLOCKED; -static inline void internal_add_timer(struct timer_list *timer) +/* + * This is the 'global' timer BH. This gets called only if one of + * the local timer interrupts couldnt run timers. 
+ */ +static inline void internal_add_timer(tvec_base_t *base, timer_t *timer) { /* * must be cli-ed when calling this */ unsigned long expires = timer->expires; - unsigned long idx = expires - timer_jiffies; + unsigned long idx = expires - base->timer_jiffies; struct list_head * vec; if (idx < TVR_SIZE) { int i = expires & TVR_MASK; - vec = tv1.vec + i; + vec = base->tv1.vec + i; } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { int i = (expires >> TVR_BITS) & TVN_MASK; - vec = tv2.vec + i; + vec = base->tv2.vec + i; } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; - vec = tv3.vec + i; + vec = base->tv3.vec + i; } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; - vec = tv4.vec + i; + vec = base->tv4.vec + i; } else if ((signed long) idx < 0) { /* can happen if you add a timer with expires == jiffies, * or you set a timer to go off in the past */ - vec = tv1.vec + tv1.index; + vec = base->tv1.vec + base->tv1.index; } else if (idx <= 0xffffffffUL) { int i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; - vec = tv5.vec + i; + vec = base->tv5.vec + i; } else { /* Can only get here on architectures with 64-bit jiffies */ INIT_LIST_HEAD(&timer->list); @@ -159,37 +132,27 @@ list_add(&timer->list, vec->prev); } -/* Initialize both explicitly - let's try to have them in the same cache line */ -spinlock_t timerlist_lock = SPIN_LOCK_UNLOCKED; - -#ifdef CONFIG_SMP -volatile struct timer_list * volatile running_timer; -#define timer_enter(t) do { running_timer = t; mb(); } while (0) -#define timer_exit() do { running_timer = NULL; } while (0) -#define timer_is_running(t) (running_timer == t) -#define timer_synchronize(t) while (timer_is_running(t)) barrier() -#else -#define timer_enter(t) do { } while (0) -#define timer_exit() do { } while (0) -#endif - -void add_timer(struct timer_list *timer) +void add_timer(timer_t *timer) { + tvec_base_t * base = 
tvec_bases + smp_processor_id(); unsigned long flags; - spin_lock_irqsave(&timerlist_lock, flags); + CHECK_BASE(base); + CHECK_BASE(timer->base); + spin_lock_irqsave(&base->lock, flags); if (timer_pending(timer)) goto bug; - internal_add_timer(timer); - spin_unlock_irqrestore(&timerlist_lock, flags); + internal_add_timer(base, timer); + timer->base = base; + spin_unlock_irqrestore(&base->lock, flags); return; bug: - spin_unlock_irqrestore(&timerlist_lock, flags); + spin_unlock_irqrestore(&base->lock, flags); printk("bug: kernel timer added twice at %p.\n", __builtin_return_address(0)); } -static inline int detach_timer (struct timer_list *timer) +static inline int detach_timer(timer_t *timer) { if (!timer_pending(timer)) return 0; @@ -197,28 +160,81 @@ return 1; } -int mod_timer(struct timer_list *timer, unsigned long expires) +/* + * mod_timer() has subtle locking semantics because parallel + * calls to it must happen serialized. + */ +int mod_timer(timer_t *timer, unsigned long expires) { - int ret; + tvec_base_t *old_base, *new_base; unsigned long flags; + int ret; + + new_base = tvec_bases + smp_processor_id(); + CHECK_BASE(new_base); + + __save_flags(flags); + __cli(); +repeat: + old_base = timer->base; + CHECK_BASE(old_base); + + /* + * Prevent deadlocks via ordering by old_base < new_base. + */ + if (old_base && (new_base != old_base)) { + if (old_base < new_base) { + spin_lock(&new_base->lock); + spin_lock(&old_base->lock); + } else { + spin_lock(&old_base->lock); + spin_lock(&new_base->lock); + } + /* + * Subtle, we rely on timer->base being always + * valid and being updated atomically. 
+ */ + if (timer->base != old_base) { + spin_unlock(&new_base->lock); + spin_unlock(&old_base->lock); + goto repeat; + } + } else + spin_lock(&new_base->lock); - spin_lock_irqsave(&timerlist_lock, flags); timer->expires = expires; ret = detach_timer(timer); - internal_add_timer(timer); - spin_unlock_irqrestore(&timerlist_lock, flags); + internal_add_timer(new_base, timer); + timer->base = new_base; + + + if (old_base && (new_base != old_base)) + spin_unlock(&old_base->lock); + spin_unlock_irqrestore(&new_base->lock, flags); + return ret; } -int del_timer(struct timer_list * timer) +int del_timer(timer_t * timer) { - int ret; unsigned long flags; + tvec_base_t * base; + int ret; - spin_lock_irqsave(&timerlist_lock, flags); + CHECK_BASE(timer->base); + if (!timer->base) + return 0; +repeat: + base = timer->base; + spin_lock_irqsave(&base->lock, flags); + if (base != timer->base) { + spin_unlock_irqrestore(&base->lock, flags); + goto repeat; + } ret = detach_timer(timer); timer->list.next = timer->list.prev = NULL; - spin_unlock_irqrestore(&timerlist_lock, flags); + spin_unlock_irqrestore(&base->lock, flags); + return ret; } @@ -236,24 +252,34 @@ * (for reference counting). 
*/ -int del_timer_sync(struct timer_list * timer) +int del_timer_sync(timer_t * timer) { + tvec_base_t * base; int ret = 0; + CHECK_BASE(timer->base); + if (!timer->base) + return 0; for (;;) { unsigned long flags; int running; - spin_lock_irqsave(&timerlist_lock, flags); +repeat: + base = timer->base; + spin_lock_irqsave(&base->lock, flags); + if (base != timer->base) { + spin_unlock_irqrestore(&base->lock, flags); + goto repeat; + } ret += detach_timer(timer); timer->list.next = timer->list.prev = 0; - running = timer_is_running(timer); - spin_unlock_irqrestore(&timerlist_lock, flags); + running = timer_is_running(base, timer); + spin_unlock_irqrestore(&base->lock, flags); if (!running) break; - timer_synchronize(timer); + timer_synchronize(base, timer); } return ret; @@ -261,7 +287,7 @@ #endif -static inline void cascade_timers(struct timer_vec *tv) +static void cascade(tvec_base_t *base, tvec_t *tv) { /* cascade all the timers from tv up one level */ struct list_head *head, *curr, *next; @@ -273,54 +299,68 @@ * detach them individually, just clear the list afterwards. 
*/ while (curr != head) { - struct timer_list *tmp; + timer_t *tmp; - tmp = list_entry(curr, struct timer_list, list); + tmp = list_entry(curr, timer_t, list); + CHECK_BASE(tmp->base); + if (tmp->base != base) + BUG(); next = curr->next; list_del(curr); // not needed - internal_add_timer(tmp); + internal_add_timer(base, tmp); curr = next; } INIT_LIST_HEAD(head); tv->index = (tv->index + 1) & TVN_MASK; } -static inline void run_timer_list(void) +static void __run_timers(tvec_base_t *base) { - spin_lock_irq(&timerlist_lock); - while ((long)(jiffies - timer_jiffies) >= 0) { + unsigned long flags; + + spin_lock_irqsave(&base->lock, flags); + while ((long)(jiffies - base->timer_jiffies) >= 0) { struct list_head *head, *curr; - if (!tv1.index) { - int n = 1; - do { - cascade_timers(tvecs[n]); - } while (tvecs[n]->index == 1 && ++n < NOOF_TVECS); + + /* + * Cascade timers: + */ + if (!base->tv1.index) { + cascade(base, &base->tv2); + if (base->tv2.index == 1) { + cascade(base, &base->tv3); + if (base->tv3.index == 1) { + cascade(base, &base->tv4); + if (base->tv4.index == 1) + cascade(base, &base->tv5); + } + } } repeat: - head = tv1.vec + tv1.index; + head = base->tv1.vec + base->tv1.index; curr = head->next; if (curr != head) { - struct timer_list *timer; void (*fn)(unsigned long); unsigned long data; + timer_t *timer; - timer = list_entry(curr, struct timer_list, list); + timer = list_entry(curr, timer_t, list); fn = timer->function; - data= timer->data; + data = timer->data; detach_timer(timer); timer->list.next = timer->list.prev = NULL; - timer_enter(timer); - spin_unlock_irq(&timerlist_lock); + timer_enter(base, timer); + spin_unlock_irq(&base->lock); fn(data); - spin_lock_irq(&timerlist_lock); - timer_exit(); + spin_lock_irq(&base->lock); + timer_exit(base); goto repeat; } - ++timer_jiffies; - tv1.index = (tv1.index + 1) & TVR_MASK; + ++base->timer_jiffies; + base->tv1.index = (base->tv1.index + 1) & TVR_MASK; } - spin_unlock_irq(&timerlist_lock); + 
spin_unlock_irqrestore(&base->lock, flags); } spinlock_t tqueue_lock = SPIN_LOCK_UNLOCKED; @@ -582,18 +622,7 @@ int cpu = smp_processor_id(), system = user_tick ^ 1; update_one_process(p, user_tick, system, cpu); - if (p->pid) { - if (--p->counter <= 0) { - p->counter = 0; - p->need_resched = 1; - } - if (p->nice > 0) - kstat.per_cpu_nice[cpu] += user_tick; - else - kstat.per_cpu_user[cpu] += user_tick; - kstat.per_cpu_system[cpu] += system; - } else if (local_bh_count(cpu) || local_irq_count(cpu) > 1) - kstat.per_cpu_system[cpu] += system; + scheduler_tick(user_tick, system); } /* @@ -637,40 +666,76 @@ } } -/* jiffies at the most recent update of wall time */ -unsigned long wall_jiffies; +static void run_all_timers(void) +{ + int i; + + for (i = 0; i < smp_num_cpus; i++) { + tvec_base_t *base = tvec_bases + i; + if ((long)(jiffies - base->timer_jiffies) >= 0) + __run_timers(base); + } +} /* - * This spinlock protect us from races in SMP while playing with xtime. -arca + * Called by the local, per-CPU timer interrupt on SMP. + * + * This function has to do all sorts of locking to make legacy + * cli()-users and BH-disablers work. If locking doesnt succeed + * now then we fall back to TIMER_BH. */ -rwlock_t xtime_lock = RW_LOCK_UNLOCKED; +void run_local_timers(void) +{ + int cpu = smp_processor_id(); + tvec_base_t *base = tvec_bases + cpu; + + if (in_interrupt()) + goto out_mark; + + local_bh_disable(); + local_irq_disable(); + if (!spin_trylock(&global_bh_lock)) + goto out_enable_mark; + + if (!hardirq_trylock(cpu)) + goto out_unlock_enable_mark; + + if ((long)(jiffies - base->timer_jiffies) >= 0) + __run_timers(base); + + hardirq_endlock(cpu); + spin_unlock(&global_bh_lock); + local_irq_enable(); + local_bh_enable(); + return; + +out_unlock_enable_mark: + spin_unlock(&global_bh_lock); + +out_enable_mark: + local_irq_enable(); + local_bh_enable(); + +out_mark: + mark_bh(TIMER_BH); +} -static inline void update_times(void) +/* + * Called by the timer interrupt. 
xtime_lock must already be taken + * by the timer IRQ! + */ +static void update_times(void) { unsigned long ticks; - /* - * update_times() is run from the raw timer_bh handler so we - * just know that the irqs are locally enabled and so we don't - * need to save/restore the flags of the local CPU here. -arca - */ - write_lock_irq(&xtime_lock); - ticks = jiffies - wall_jiffies; if (ticks) { wall_jiffies += ticks; update_wall_time(ticks); } - write_unlock_irq(&xtime_lock); calc_load(ticks); } -void timer_bh(void) -{ - update_times(); - run_timer_list(); -} - void do_timer(struct pt_regs *regs) { (*(unsigned long *)&jiffies)++; @@ -678,8 +743,18 @@ /* SMP process accounting uses the local APIC timer */ update_process_times(user_mode(regs)); +#ifdef CONFIG_X86 + mark_bh(TIMER_BH); #endif +#endif + /* + * Right now only x86-SMP calls run_local_timers() from a + * per-CPU interrupt. + */ +#ifndef CONFIG_X86 mark_bh(TIMER_BH); +#endif + update_times(); if (TQ_ACTIVE(tq_timer)) mark_bh(TQUEUE_BH); } @@ -794,6 +869,89 @@ #endif +static void process_timeout(unsigned long __data) +{ + wake_up_process((task_t *)__data); +} + +/** + * schedule_timeout - sleep until timeout + * @timeout: timeout value in jiffies + * + * Make the current task sleep until @timeout jiffies have + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to + * pass before the routine returns. The routine will return 0 + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task. In this case the remaining time + * in jiffies will be returned, or 0 if the timer expired in time + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. 
+ * + * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule + * the CPU away without a bound on the timeout. In this case the return + * value will be %MAX_SCHEDULE_TIMEOUT. + * + * In all cases the return value is guaranteed to be non-negative. + */ +signed long schedule_timeout(signed long timeout) +{ + struct timer_list timer; + unsigned long expire; + + switch (timeout) + { + case MAX_SCHEDULE_TIMEOUT: + /* + * These two special cases are useful to be comfortable + * in the caller. Nothing more. We could take + * MAX_SCHEDULE_TIMEOUT from one of the negative value + * but I' d like to return a valid offset (>=0) to allow + * the caller to do everything it want with the retval. + */ + schedule(); + goto out; + default: + /* + * Another bit of PARANOID. Note that the retval will be + * 0 since no piece of kernel is supposed to do a check + * for a negative retval of schedule_timeout() (since it + * should never happens anyway). You just have the printk() + * that will tell you if something is gone wrong and where. + */ + if (timeout < 0) + { + printk(KERN_ERR "schedule_timeout: wrong timeout " + "value %lx from %p\n", timeout, + __builtin_return_address(0)); + current->state = TASK_RUNNING; + goto out; + } + } + + expire = timeout + jiffies; + + init_timer(&timer); + timer.expires = expire; + timer.data = (unsigned long) current; + timer.function = process_timeout; + + add_timer(&timer); + schedule(); + del_timer_sync(&timer); + + timeout = expire - jiffies; + + out: + return timeout < 0 ? 
0 : timeout; +} + /* Thread ID - the internal kernel "pid" */ asmlinkage long sys_gettid(void) { @@ -841,3 +999,22 @@ return 0; } +void __init init_timers(void) +{ + int i, j; + + for (i = 0; i < NR_CPUS; i++) { + tvec_base_t *base = tvec_bases + i; + + spin_lock_init(&base->lock); + for (j = 0; j < TVN_SIZE; j++) { + INIT_LIST_HEAD(base->tv5.vec + j); + INIT_LIST_HEAD(base->tv4.vec + j); + INIT_LIST_HEAD(base->tv3.vec + j); + INIT_LIST_HEAD(base->tv2.vec + j); + } + for (j = 0; j < TVR_SIZE; j++) + INIT_LIST_HEAD(base->tv1.vec + j); + } + init_bh(TIMER_BH, run_all_timers); +} diff -X dontdiff -ruN linux-2.4.17/mm/bootmem.c linux-2.4.17-lse02-D/mm/bootmem.c --- linux-2.4.17/mm/bootmem.c Fri Dec 21 09:42:04 2001 +++ linux-2.4.17-lse02-D/mm/bootmem.c Thu Apr 4 17:37:18 2002 @@ -25,6 +25,7 @@ */ unsigned long max_low_pfn; unsigned long min_low_pfn; +unsigned long max_pfn; /* return the number of _pages_ that will be allocated for the boot bitmap */ unsigned long __init bootmem_bootmap_pages (unsigned long pages) diff -X dontdiff -ruN linux-2.4.17/mm/highmem.c linux-2.4.17-lse02-D/mm/highmem.c --- linux-2.4.17/mm/highmem.c Fri Dec 21 09:42:05 2001 +++ linux-2.4.17-lse02-D/mm/highmem.c Thu Apr 4 17:37:18 2002 @@ -354,9 +354,8 @@ /* we need to wait I/O completion */ run_task_queue(&tq_disk); - current->policy |= SCHED_YIELD; __set_current_state(TASK_RUNNING); - schedule(); + yield(); goto repeat_alloc; } @@ -392,9 +391,8 @@ /* we need to wait I/O completion */ run_task_queue(&tq_disk); - current->policy |= SCHED_YIELD; __set_current_state(TASK_RUNNING); - schedule(); + yield(); goto repeat_alloc; } diff -X dontdiff -ruN linux-2.4.17/mm/memory.c linux-2.4.17-lse02-D/mm/memory.c --- linux-2.4.17/mm/memory.c Fri Dec 21 09:42:05 2001 +++ linux-2.4.17-lse02-D/mm/memory.c Thu Apr 4 17:37:18 2002 @@ -504,6 +504,8 @@ * Force in an entire range of pages from the current process's user VA, * and pin them in physical memory. */ + + #define dprintk(x...) 
int map_user_kiobuf(int rw, struct kiobuf *iobuf, unsigned long va, size_t len) @@ -551,6 +553,79 @@ return 0; } + +/* + * Force in an entire range of pages from the current process's user VA, + * and pin them in physical memory. + */ + +int map_user_kiobuf_iovecs(int rw, struct kiobuf *iobuf, struct iovec *iov, int iov_count) +{ + int i, j; + int pgcount, err, iovpages; + struct mm_struct * mm; + ulong va; + size_t len, plen; + int offset; + + /* Make sure the iobuf is not already mapped somewhere. */ + if (iobuf->nr_pages) + return -EINVAL; + + mm = current->mm; + dprintk ("map_user_kiobuf_iovecs: begin\n"); + + iobuf->locked = 0; + iobuf->length = 0; + for (i=0, pgcount=0; ilength += len; + + iovpages = (va + len + PAGE_SIZE - 1)/PAGE_SIZE - va/PAGE_SIZE; + + /* Try to fault in all of the necessary pages */ + down_read(&mm->mmap_sem); + /* rw==READ means read from disk, write into memory area */ + err = get_user_pages(current, mm, va, iovpages, + (rw==READ), 0, &iobuf->maplist[pgcount], NULL); + up_read(&mm->mmap_sem); + + if (err < 0) { + iobuf->nr_pages = pgcount; + unmap_kiobuf(iobuf); + dprintk ("map_user_kiobuf_iovecs: end %d\n", err); + return err; + } + + for (j=pgcount; jpinfo->poffset[j] = offset; + plen = PAGE_SIZE - offset; + if (plen > len) { + iobuf->pinfo->plen[j] = len; + } else { + iobuf->pinfo->plen[j] = plen; + len -= plen; + } + offset = 0; + } + pgcount += iovpages; + } + + if ( (!pgcount)||(pgcount > KIO_STATIC_PAGES) ) + BUG(); + iobuf->nr_pages = pgcount; + while (pgcount--) { + /* FIXME: flush superfluous for rw==READ, + * probably wrong function for rw==WRITE + */ + flush_dcache_page(iobuf->maplist[pgcount]); + } + dprintk ("map_user_kiobuf_iovecs: end OK\n"); + return 0; +} + /* * Mark all of the pages in a kiobuf as dirty * @@ -579,6 +654,31 @@ remaining -= (PAGE_SIZE - offset); offset = 0; + index++; + } +} + + +void mark_dirty_kiobuf_iovec(struct kiobuf *iobuf, int bytes) +{ + int index, offset, remaining; + struct page *page; +
size_t plen; + + index = 0; + remaining = bytes; + if (remaining > iobuf->length) + remaining = iobuf->length; + + while (remaining > 0 && index < iobuf->nr_pages) { + page = iobuf->maplist[index]; + offset = iobuf->pinfo->poffset[index]; + plen = iobuf->pinfo->plen[index]; + + if (!PageReserved(page)) + SetPageDirty(page); + + remaining -= plen; index++; } } diff -X dontdiff -ruN linux-2.4.17/mm/oom_kill.c linux-2.4.17-lse02-D/mm/oom_kill.c --- linux-2.4.17/mm/oom_kill.c Sat Nov 3 17:05:25 2001 +++ linux-2.4.17-lse02-D/mm/oom_kill.c Thu Apr 4 17:37:18 2002 @@ -82,7 +82,7 @@ * Niced processes are most likely less important, so double * their badness points. */ - if (p->nice > 0) + if (task_nice(p) > 0) points *= 2; /* @@ -149,7 +149,7 @@ * all the memory it needs. That way it should be able to * exit() and clear out its resources quickly... */ - p->counter = 5 * HZ; + p->time_slice = HZ; p->flags |= PF_MEMALLOC | PF_MEMDIE; /* This process has hardware access, be more careful. */ @@ -188,8 +188,7 @@ * killing itself before someone else gets the chance to ask * for more memory. */ - current->policy |= SCHED_YIELD; - schedule(); + yield(); return; } diff -X dontdiff -ruN linux-2.4.17/mm/page_alloc.c linux-2.4.17-lse02-D/mm/page_alloc.c --- linux-2.4.17/mm/page_alloc.c Mon Nov 19 16:35:40 2001 +++ linux-2.4.17-lse02-D/mm/page_alloc.c Thu Apr 4 17:37:18 2002 @@ -394,9 +394,8 @@ return NULL; /* Yield for kswapd, and try again */ - current->policy |= SCHED_YIELD; __set_current_state(TASK_RUNNING); - schedule(); + yield(); goto rebalance; } diff -X dontdiff -ruN linux-2.4.17/net/ipv4/tcp_output.c linux-2.4.17-lse02-D/net/ipv4/tcp_output.c --- linux-2.4.17/net/ipv4/tcp_output.c Fri Dec 21 09:42:05 2001 +++ linux-2.4.17-lse02-D/net/ipv4/tcp_output.c Thu Apr 4 17:37:18 2002 @@ -1009,8 +1009,7 @@ skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL); if (skb) break; - current->policy |= SCHED_YIELD; - schedule(); + yield(); } /* Reserve space for headers and prepare control bits. 
*/ diff -X dontdiff -ruN linux-2.4.17/net/sched/sch_generic.c linux-2.4.17-lse02-D/net/sched/sch_generic.c --- linux-2.4.17/net/sched/sch_generic.c Fri Aug 18 10:26:25 2000 +++ linux-2.4.17-lse02-D/net/sched/sch_generic.c Thu Apr 4 17:37:18 2002 @@ -475,10 +475,8 @@ dev_watchdog_down(dev); - while (test_bit(__LINK_STATE_SCHED, &dev->state)) { - current->policy |= SCHED_YIELD; - schedule(); - } + while (test_bit(__LINK_STATE_SCHED, &dev->state)) + yield(); spin_unlock_wait(&dev->xmit_lock); } diff -X dontdiff -ruN linux-2.4.17/net/socket.c linux-2.4.17-lse02-D/net/socket.c --- linux-2.4.17/net/socket.c Fri Dec 21 09:42:06 2001 +++ linux-2.4.17-lse02-D/net/socket.c Thu Apr 4 17:37:18 2002 @@ -148,8 +148,7 @@ while (atomic_read(&net_family_lockct) != 0) { spin_unlock(&net_family_lock); - current->policy |= SCHED_YIELD; - schedule(); + yield(); spin_lock(&net_family_lock); } diff -X dontdiff -ruN linux-2.4.17/net/sunrpc/sched.c linux-2.4.17-lse02-D/net/sunrpc/sched.c --- linux-2.4.17/net/sunrpc/sched.c Thu Oct 11 08:12:52 2001 +++ linux-2.4.17-lse02-D/net/sunrpc/sched.c Thu Apr 4 17:37:18 2002 @@ -772,8 +772,7 @@ } if (flags & RPC_TASK_ASYNC) return NULL; - current->policy |= SCHED_YIELD; - schedule(); + yield(); } while (!signalled()); return NULL; @@ -1114,8 +1113,7 @@ __rpc_schedule(); if (all_tasks) { dprintk("rpciod_killall: waiting for tasks to exit\n"); - current->policy |= SCHED_YIELD; - schedule(); + yield(); } } @@ -1185,8 +1183,7 @@ * wait briefly before checking the process id. */ current->sigpending = 0; - current->policy |= SCHED_YIELD; - schedule(); + yield(); /* * Display a message if we're going to wait longer. */ diff -X dontdiff -ruN linux-2.4.17/net/unix/af_unix.c linux-2.4.17-lse02-D/net/unix/af_unix.c --- linux-2.4.17/net/unix/af_unix.c Fri Dec 21 09:42:06 2001 +++ linux-2.4.17-lse02-D/net/unix/af_unix.c Thu Apr 4 17:37:18 2002 @@ -564,10 +564,8 @@ addr->hash)) { write_unlock(&unix_table_lock); /* Sanity yield. It is unusual case, but yet... 
*/ - if (!(ordernum&0xFF)) { - current->policy |= SCHED_YIELD; - schedule(); - } + if (!(ordernum&0xFF)) + yield(); goto retry; } addr->hash ^= sk->type;