diff -urN softirq-ref/arch/i386/kernel/entry.S softirq/arch/i386/kernel/entry.S
--- softirq-ref/arch/i386/kernel/entry.S	Tue Jun 12 21:01:26 2001
+++ softirq/arch/i386/kernel/entry.S	Tue Jun 12 15:46:51 2001
@@ -208,8 +208,18 @@
 	movl %eax,EAX(%esp)		# save the return value
 ENTRY(ret_from_sys_call)
 	cli				# need_resched and signals atomic test
+#ifdef CONFIG_SMP
+	movl processor(%ebx),%eax
+	shll $CONFIG_X86_L1_CACHE_SHIFT,%eax
+	cmpl $0, SYMBOL_NAME(irq_stat)(,%eax)	# softirq_pending
+#else
+	cmpl $0, SYMBOL_NAME(irq_stat)		# softirq_pending
+#endif
+	jne   handle_softirq
+handle_softirq_back:
 	cmpl $0,need_resched(%ebx)
 	jne reschedule
+reschedule_back:
 	cmpl $0,sigpending(%ebx)
 	jne signal_return
 restore_all:
@@ -260,9 +270,14 @@
 	jmp restore_all
 
 	ALIGN
+handle_softirq:
+	call SYMBOL_NAME(do_softirq)
+	jmp handle_softirq_back
+
+	ALIGN
 reschedule:
 	call SYMBOL_NAME(schedule)    # test
-	jmp ret_from_sys_call
+	jmp reschedule_back
 
 ENTRY(divide_error)
 	pushl $0		# no error code
diff -urN softirq-ref/include/asm-alpha/hardirq.h softirq/include/asm-alpha/hardirq.h
--- softirq-ref/include/asm-alpha/hardirq.h	Tue Jun 12 21:01:26 2001
+++ softirq/include/asm-alpha/hardirq.h	Tue Jun 12 18:27:06 2001
@@ -10,6 +10,7 @@
 	unsigned int __local_irq_count;
 	unsigned int __local_bh_count;
 	unsigned int __syscall_count;
+	struct task_struct * __ksoftirqd_task;
 } ____cacheline_aligned irq_cpustat_t;
 
 #include <linux/irq_cpustat.h>	/* Standard mappings for irq_cpustat_t above */
diff -urN softirq-ref/include/asm-alpha/softirq.h softirq/include/asm-alpha/softirq.h
--- softirq-ref/include/asm-alpha/softirq.h	Tue Jun 12 21:01:26 2001
+++ softirq/include/asm-alpha/softirq.h	Tue Jun 12 20:50:59 2001
@@ -8,21 +8,28 @@
 extern inline void cpu_bh_disable(int cpu)
 {
 	local_bh_count(cpu)++;
-	mb();
+	barrier();
 }
 
-extern inline void cpu_bh_enable(int cpu)
+extern inline void __cpu_bh_enable(int cpu)
 {
-	mb();
+	barrier();
 	local_bh_count(cpu)--;
 }
 
-#define local_bh_enable()	cpu_bh_enable(smp_processor_id())
-#define __local_bh_enable	local_bh_enable
+#define __local_bh_enable()	__cpu_bh_enable(smp_processor_id())
 #define local_bh_disable()	cpu_bh_disable(smp_processor_id())
 
-#define in_softirq() (local_bh_count(smp_processor_id()) != 0)
+#define local_bh_enable()					\
+do {								\
+	int cpu;						\
+								\
+	barrier();						\
+	cpu = smp_processor_id();				\
+	if (!--local_bh_count(cpu) && softirq_pending(cpu))	\
+		do_softirq();					\
+} while (0)
 
-#define __cpu_raise_softirq(cpu,nr) set_bit((nr), &softirq_pending(cpu))
+#define in_softirq() (local_bh_count(smp_processor_id()) != 0)
 
 #endif /* _ALPHA_SOFTIRQ_H */
diff -urN softirq-ref/include/asm-i386/hardirq.h softirq/include/asm-i386/hardirq.h
--- softirq-ref/include/asm-i386/hardirq.h	Tue Jun 12 21:01:26 2001
+++ softirq/include/asm-i386/hardirq.h	Tue Jun 12 19:00:31 2001
@@ -11,6 +11,7 @@
 	unsigned int __local_irq_count;
 	unsigned int __local_bh_count;
 	unsigned int __syscall_count;
+	struct task_struct * __ksoftirqd_task; /* waitqueue is too large */
 	unsigned int __nmi_count;	/* arch dependent */
 } ____cacheline_aligned irq_cpustat_t;
 
diff -urN softirq-ref/include/asm-i386/softirq.h softirq/include/asm-i386/softirq.h
--- softirq-ref/include/asm-i386/softirq.h	Tue Jun 12 21:01:26 2001
+++ softirq/include/asm-i386/softirq.h	Tue Jun 12 17:21:50 2001
@@ -11,8 +11,6 @@
 
 #define local_bh_disable()	cpu_bh_disable(smp_processor_id())
 #define __local_bh_enable()	__cpu_bh_enable(smp_processor_id())
-#define __cpu_raise_softirq(cpu,nr) set_bit((nr), &softirq_pending(cpu));
-#define raise_softirq(nr) __cpu_raise_softirq(smp_processor_id(), (nr))
 
 #define in_softirq() (local_bh_count(smp_processor_id()) != 0)
 
@@ -28,6 +26,7 @@
 do {									\
 	unsigned int *ptr = &local_bh_count(smp_processor_id());	\
 									\
+	barrier();							\
 	if (!--*ptr)							\
 		__asm__ __volatile__ (					\
 			"cmpl $0, -8(%0);"				\
diff -urN softirq-ref/include/asm-sparc/hardirq.h softirq/include/asm-sparc/hardirq.h
--- softirq-ref/include/asm-sparc/hardirq.h	Tue Jun 12 21:01:26 2001
+++ softirq/include/asm-sparc/hardirq.h	Tue Jun 12 15:46:49 2001
@@ -23,6 +23,7 @@
 #endif
 	unsigned int __local_bh_count;
 	unsigned int __syscall_count;
+	struct task_struct * __ksoftirqd_task;
 } ____cacheline_aligned irq_cpustat_t;
 
 #include <linux/irq_cpustat.h>	/* Standard mappings for irq_cpustat_t above */
diff -urN softirq-ref/include/asm-sparc64/hardirq.h softirq/include/asm-sparc64/hardirq.h
--- softirq-ref/include/asm-sparc64/hardirq.h	Tue Jun 12 21:01:26 2001
+++ softirq/include/asm-sparc64/hardirq.h	Tue Jun 12 15:46:49 2001
@@ -22,6 +22,7 @@
 #endif
 	unsigned int __local_bh_count;
 	unsigned int __syscall_count;
+	struct task_struct * __ksoftirqd_task;
 } ____cacheline_aligned irq_cpustat_t;
 
 #include <linux/irq_cpustat.h>	/* Standard mappings for irq_cpustat_t above */
diff -urN softirq-ref/include/linux/interrupt.h softirq/include/linux/interrupt.h
--- softirq-ref/include/linux/interrupt.h	Tue Jun 12 21:01:26 2001
+++ softirq/include/linux/interrupt.h	Tue Jun 12 16:46:09 2001
@@ -74,6 +74,22 @@
 asmlinkage void do_softirq(void);
 extern void open_softirq(int nr, void (*action)(struct softirq_action*), void *data);
 
+static inline void __cpu_raise_softirq(int cpu, int nr)
+{
+	softirq_pending(cpu) |= (1<<nr);
+}
+
+
+/* I do not want to use atomic variables for now, hence the cli/sti */
+static inline void raise_softirq(int nr)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__cpu_raise_softirq(smp_processor_id(), nr);
+	local_irq_restore(flags);
+}
+
 extern void softirq_init(void);
 
 
@@ -129,7 +145,7 @@
 extern struct tasklet_head tasklet_hi_vec[NR_CPUS];
 
 #define tasklet_trylock(t) (!test_and_set_bit(TASKLET_STATE_RUN, &(t)->state))
-#define tasklet_unlock(t) clear_bit(TASKLET_STATE_RUN, &(t)->state)
+#define tasklet_unlock(t) do { smp_mb__before_clear_bit(); clear_bit(TASKLET_STATE_RUN, &(t)->state); } while(0)
 #define tasklet_unlock_wait(t) while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); }
 
 extern void tasklet_schedule(struct tasklet_struct *t);
diff -urN softirq-ref/include/linux/irq_cpustat.h softirq/include/linux/irq_cpustat.h
--- softirq-ref/include/linux/irq_cpustat.h	Tue Jun 12 21:01:26 2001
+++ softirq/include/linux/irq_cpustat.h	Tue Jun 12 16:46:09 2001
@@ -30,6 +30,7 @@
 #define local_irq_count(cpu)	__IRQ_STAT((cpu), __local_irq_count)
 #define local_bh_count(cpu)	__IRQ_STAT((cpu), __local_bh_count)
 #define syscall_count(cpu)	__IRQ_STAT((cpu), __syscall_count)
+#define ksoftirqd_task(cpu)	__IRQ_STAT((cpu), __ksoftirqd_task)
   /* arch dependent irq_stat fields */
 #define nmi_count(cpu)		__IRQ_STAT((cpu), __nmi_count)		/* i386, ia64 */
 
diff -urN softirq-ref/kernel/ksyms.c softirq/kernel/ksyms.c
--- softirq-ref/kernel/ksyms.c	Tue Jun 12 21:01:20 2001
+++ softirq/kernel/ksyms.c	Tue Jun 12 20:37:33 2001
@@ -553,7 +553,7 @@
 EXPORT_SYMBOL(tasklet_init);
 EXPORT_SYMBOL(tasklet_kill);
 EXPORT_SYMBOL(__run_task_queue);
-EXPORT_SYMBOL(do_softirq);
+EXPORT_SYMBOL_NOVERS(do_softirq);
 EXPORT_SYMBOL(tasklet_schedule);
 EXPORT_SYMBOL(tasklet_hi_schedule);
 
diff -urN softirq-ref/kernel/softirq.c softirq/kernel/softirq.c
--- softirq-ref/kernel/softirq.c	Tue Jun 12 21:01:26 2001
+++ softirq/kernel/softirq.c	Tue Jun 12 20:55:19 2001
@@ -51,17 +51,20 @@
 {
 	int cpu = smp_processor_id();
 	__u32 pending;
+	long flags;
+	__u32 mask;
 
 	if (in_interrupt())
 		return;
 
-	local_irq_disable();
+	local_irq_save(flags);
 
 	pending = softirq_pending(cpu);
 
 	if (pending) {
 		struct softirq_action *h;
 
+		mask = ~pending;
 		local_bh_disable();
 restart:
 		/* Reset the pending bitmask before enabling irqs */
@@ -81,12 +84,26 @@
 		local_irq_disable();
 
 		pending = softirq_pending(cpu);
-		if (pending)
+		if (pending & mask) {
+			mask &= ~pending;
 			goto restart;
+		}
 		__local_bh_enable();
+
+		if (pending) {
+			/*
+			 * we cannot loop indefinitely here to avoid userspace starvation,
+			 * but we also don't want to introduce a worst case 1/HZ latency
+			 * to the pending events, so let the scheduler balance
+			 * the softirq load for us.
+			 */
+			struct task_struct * tsk = ksoftirqd_task(cpu);
+			if (tsk && tsk->state != TASK_RUNNING)
+				wake_up_process(tsk);
+		}
 	}
 
-	local_irq_enable();
+	local_irq_restore(flags);
 }
 
 
@@ -112,8 +129,7 @@
 	 * If nobody is running it then add it to this CPU's
 	 * tasklet queue.
 	 */
-	if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state) &&
-						tasklet_trylock(t)) {
+	if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
 		t->next = tasklet_vec[cpu].list;
 		tasklet_vec[cpu].list = t;
 		__cpu_raise_softirq(cpu, TASKLET_SOFTIRQ);
@@ -130,8 +146,7 @@
 	cpu = smp_processor_id();
 	local_irq_save(flags);
 
-	if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state) &&
-						tasklet_trylock(t)) {
+	if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
 		t->next = tasklet_hi_vec[cpu].list;
 		tasklet_hi_vec[cpu].list = t;
 		__cpu_raise_softirq(cpu, HI_SOFTIRQ);
@@ -148,37 +163,29 @@
 	local_irq_disable();
 	list = tasklet_vec[cpu].list;
 	tasklet_vec[cpu].list = NULL;
+	local_irq_enable();
 
 	while (list) {
 		struct tasklet_struct *t = list;
 
 		list = list->next;
 
-		/*
-		 * A tasklet is only added to the queue while it's
-		 * locked, so no other CPU can have this tasklet
-		 * pending:
-		 */
 		if (!tasklet_trylock(t))
 			BUG();
-repeat:
-		if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
-			BUG();
 		if (!atomic_read(&t->count)) {
-			local_irq_enable();
+			clear_bit(TASKLET_STATE_SCHED, &t->state);
 			t->func(t->data);
-			local_irq_disable();
-			/*
-			 * One more run if the tasklet got reactivated:
-			 */
-			if (test_bit(TASKLET_STATE_SCHED, &t->state))
-				goto repeat;
+			tasklet_unlock(t);
+			continue;
 		}
 		tasklet_unlock(t);
-		if (test_bit(TASKLET_STATE_SCHED, &t->state))
-			tasklet_schedule(t);
+
+		local_irq_disable();
+		t->next = tasklet_vec[cpu].list;
+		tasklet_vec[cpu].list = t;
+		__cpu_raise_softirq(cpu, TASKLET_SOFTIRQ);
+		local_irq_enable();
 	}
-	local_irq_enable();
 }
 
 
@@ -193,6 +200,7 @@
 	local_irq_disable();
 	list = tasklet_hi_vec[cpu].list;
 	tasklet_hi_vec[cpu].list = NULL;
+	local_irq_enable();
 
 	while (list) {
 		struct tasklet_struct *t = list;
@@ -201,21 +209,20 @@
 
 		if (!tasklet_trylock(t))
 			BUG();
-repeat:
-		if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
-			BUG();
 		if (!atomic_read(&t->count)) {
-			local_irq_enable();
+			clear_bit(TASKLET_STATE_SCHED, &t->state);
 			t->func(t->data);
-			local_irq_disable();
-			if (test_bit(TASKLET_STATE_SCHED, &t->state))
-				goto repeat;
+			tasklet_unlock(t);
+			continue;
 		}
 		tasklet_unlock(t);
-		if (test_bit(TASKLET_STATE_SCHED, &t->state))
-			tasklet_hi_schedule(t);
+
+		local_irq_disable();
+		t->next = tasklet_hi_vec[cpu].list;
+		tasklet_hi_vec[cpu].list = t;
+		__cpu_raise_softirq(cpu, HI_SOFTIRQ);
+		local_irq_enable();
 	}
-	local_irq_enable();
 }
 
 
@@ -335,3 +342,61 @@
 			f(data);
 	}
 }
+
+static int ksoftirqd(void * __bind_cpu)
+{
+	int bind_cpu = *(int *) __bind_cpu;
+	int cpu = cpu_logical_map(bind_cpu);
+
+	daemonize();
+	current->nice = 19;
+	sigfillset(&current->blocked);
+
+	/* Migrate to the right CPU */
+	current->cpus_allowed = 1UL << cpu;
+	while (smp_processor_id() != cpu)
+		schedule();
+
+	sprintf(current->comm, "ksoftirqd_CPU%d", bind_cpu);
+
+	__set_current_state(TASK_INTERRUPTIBLE);
+	mb();
+
+	ksoftirqd_task(cpu) = current;
+
+	for (;;) {
+		if (!softirq_pending(cpu))
+			schedule();
+
+		__set_current_state(TASK_RUNNING);
+
+		while (softirq_pending(cpu)) {
+			do_softirq();
+			if (current->need_resched)
+				schedule();
+		}
+
+		__set_current_state(TASK_INTERRUPTIBLE);
+	}
+}
+
+static __init int spawn_ksoftirqd(void)
+{
+	int cpu;
+
+	for (cpu = 0; cpu < smp_num_cpus; cpu++) {
+		if (kernel_thread(ksoftirqd, (void *) &cpu,
+				  CLONE_FS | CLONE_FILES | CLONE_SIGNAL) < 0)
+			printk("spawn_ksoftirqd() failed for cpu %d\n", cpu);
+		else {
+			while (!ksoftirqd_task(cpu_logical_map(cpu))) {
+				current->policy |= SCHED_YIELD;
+				schedule();
+			}
+		}
+	}
+
+	return 0;
+}
+
+__initcall(spawn_ksoftirqd);