diff -urpN -X /home/fletch/.diff.exclude 111-schedstat/kernel/sched.c 112-schedstat2/kernel/sched.c
--- 111-schedstat/kernel/sched.c	Sun Apr 20 21:11:58 2003
+++ 112-schedstat2/kernel/sched.c	Sun Apr 20 21:11:58 2003
@@ -246,16 +246,28 @@ struct schedstat {
 	/* load_balance stats */
 	unsigned long lb_imbalance;
 	unsigned long lb_idle;
+	unsigned long lb_busy;
 	unsigned long lb_resched;
 	unsigned long lb_cnt;
 	unsigned long lb_nobusy;
+	unsigned long lb_bnode;
+
+	/* pull_task stats */
+	unsigned long pt_gained;
+	unsigned long pt_lost;
+	unsigned long pt_node_gained;
+	unsigned long pt_node_lost;
+
+	/* balance_node stats */
+	unsigned long bn_cnt;
+	unsigned long bn_idle;
 } ____cacheline_aligned;
 
 /*
  * bump this up when changing the output format or the meaning of an existing
  * format, so that tools can adapt (or abort)
  */
-#define SCHEDSTAT_VERSION	1
+#define SCHEDSTAT_VERSION	2
 
 struct schedstat schedstats[NR_CPUS];
 
@@ -281,28 +293,44 @@ int schedstats_read_proc(char *page, cha
 		sums.yld_cnt += schedstats[i].yld_cnt;
 		sums.sched_noswitch += schedstats[i].sched_noswitch;
 		sums.sched_switch += schedstats[i].sched_switch;
-		sums.sched_switch += schedstats[i].sched_cnt;
+		sums.sched_cnt += schedstats[i].sched_cnt;
 		sums.lb_idle += schedstats[i].lb_idle;
+		sums.lb_busy += schedstats[i].lb_busy;
 		sums.lb_resched += schedstats[i].lb_resched;
 		sums.lb_cnt += schedstats[i].lb_cnt;
 		sums.lb_imbalance += schedstats[i].lb_imbalance;
 		sums.lb_nobusy += schedstats[i].lb_nobusy;
+		sums.lb_bnode += schedstats[i].lb_bnode;
+		sums.pt_node_gained += schedstats[i].pt_node_gained;
+		sums.pt_node_lost += schedstats[i].pt_node_lost;
+		sums.pt_gained += schedstats[i].pt_gained;
+		sums.pt_lost += schedstats[i].pt_lost;
+		sums.bn_cnt += schedstats[i].bn_cnt;
+		sums.bn_idle += schedstats[i].bn_idle;
 		len += sprintf(page + len,
-		    "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
+		    "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu "
+		    "%lu %lu %lu %lu %lu %lu %lu %lu\n",
 		    i, schedstats[i].yld_both_empty,
 		    schedstats[i].yld_act_empty, schedstats[i].yld_exp_empty,
 		    schedstats[i].yld_cnt, schedstats[i].sched_noswitch,
 		    schedstats[i].sched_switch, schedstats[i].sched_cnt,
-		    schedstats[i].lb_idle, schedstats[i].lb_resched,
+		    schedstats[i].lb_idle, schedstats[i].lb_busy,
+		    schedstats[i].lb_resched,
 		    schedstats[i].lb_cnt, schedstats[i].lb_imbalance,
-		    schedstats[i].lb_nobusy);
+		    schedstats[i].lb_nobusy, schedstats[i].lb_bnode,
+		    schedstats[i].pt_gained, schedstats[i].pt_lost,
+		    schedstats[i].pt_node_gained, schedstats[i].pt_node_lost,
+		    schedstats[i].bn_cnt, schedstats[i].bn_idle);
 	}
 	len += sprintf(page + len,
-	    "totals %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
+	    "totals %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu "
+	    "%lu %lu %lu %lu %lu %lu %lu\n",
 	    sums.yld_both_empty, sums.yld_act_empty, sums.yld_exp_empty,
 	    sums.yld_cnt, sums.sched_noswitch, sums.sched_switch,
-	    sums.sched_cnt, sums.lb_idle, sums.lb_resched, sums.lb_cnt,
-	    sums.lb_imbalance, sums.lb_nobusy);
+	    sums.sched_cnt, sums.lb_idle, sums.lb_busy, sums.lb_resched,
+	    sums.lb_cnt, sums.lb_imbalance, sums.lb_nobusy, sums.lb_bnode,
+	    sums.pt_gained, sums.pt_lost, sums.pt_node_gained,
+	    sums.pt_node_lost, sums.bn_cnt, sums.bn_idle);
 
 	return len;
 }
@@ -1050,6 +1078,12 @@ out:
  */
 static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu)
 {
+	if (cpu_to_node(this_cpu) != cpu_to_node(src_rq - runqueues)) {
+		schedstats[this_cpu].pt_node_gained++;
+		schedstats[src_rq - runqueues].pt_node_lost++;
+	}
+	schedstats[this_cpu].pt_gained++;
+	schedstats[src_rq - runqueues].pt_lost++;
 	dequeue_task(p, src_array);
 	nr_running_dec(src_rq);
 	set_task_cpu(p, this_cpu);
@@ -1085,8 +1119,6 @@ static void load_balance(runqueue_t *thi
 	task_t *tmp;
 
 	schedstats[this_cpu].lb_cnt++;
-	if (idle)
-		schedstats[this_cpu].lb_idle++;
 	busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask);
 	if (!busiest) {
 		schedstats[this_cpu].lb_nobusy++;
@@ -1181,9 +1213,13 @@ static void balance_node(runqueue_t *thi
 	int node = find_busiest_node(cpu_to_node(this_cpu));
 	unsigned long cpumask, this_cpumask = 1UL << this_cpu;
 
+	schedstats[this_cpu].bn_cnt++;
+	if (idle)
+		schedstats[this_cpu].bn_idle++;
 	if (node >= 0) {
 		cpumask = node_to_cpumask(node) | this_cpumask;
 		spin_lock(&this_rq->lock);
+		schedstats[this_cpu].lb_bnode++;
 		load_balance(this_rq, idle, cpumask);
 		spin_unlock(&this_rq->lock);
 	}
@@ -1192,9 +1228,7 @@ static void balance_node(runqueue_t *thi
 
 static void rebalance_tick(runqueue_t *this_rq, int idle)
 {
-#ifdef CONFIG_NUMA
 	int this_cpu = smp_processor_id();
-#endif
 	unsigned long j = jiffies;
 
 	/*
@@ -1212,6 +1246,7 @@ static void rebalance_tick(runqueue_t *t
 #endif
 		if (!(j % IDLE_REBALANCE_TICK)) {
 			spin_lock(&this_rq->lock);
+			schedstats[this_cpu].lb_idle++;
 			load_balance(this_rq, 0, cpu_to_node_mask(this_cpu));
 			spin_unlock(&this_rq->lock);
 		}
@@ -1223,6 +1258,7 @@ static void rebalance_tick(runqueue_t *t
 #endif
 	if (!(j % BUSY_REBALANCE_TICK)) {
 		spin_lock(&this_rq->lock);
+		schedstats[this_cpu].lb_busy++;
 		load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
 		spin_unlock(&this_rq->lock);
 	}
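
Not part of the patch: a minimal user-space sketch of how a monitoring tool might consume the
version-2 per-CPU lines that schedstats_read_proc() above emits. The field order follows the
sprintf() in the patch exactly (yld_both_empty through bn_idle, 20 counters after the cpu number);
the struct name schedstat_v2, parse_cpu_line() and the stdin-driven main() are illustrative
assumptions, not anything the kernel provides.

/* Sketch only: parse "cpuN ..." lines in the SCHEDSTAT_VERSION 2 layout. */
#include <stdio.h>

struct schedstat_v2 {
	unsigned long yld_both_empty, yld_act_empty, yld_exp_empty, yld_cnt;
	unsigned long sched_noswitch, sched_switch, sched_cnt;
	unsigned long lb_idle, lb_busy, lb_resched, lb_cnt, lb_imbalance;
	unsigned long lb_nobusy, lb_bnode;
	unsigned long pt_gained, pt_lost, pt_node_gained, pt_node_lost;
	unsigned long bn_cnt, bn_idle;
};

/* Returns 0 on success, -1 if the line does not match the expected layout. */
static int parse_cpu_line(const char *line, int *cpu, struct schedstat_v2 *s)
{
	int n = sscanf(line,
	    "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu "
	    "%lu %lu %lu %lu %lu %lu %lu %lu",
	    cpu, &s->yld_both_empty, &s->yld_act_empty, &s->yld_exp_empty,
	    &s->yld_cnt, &s->sched_noswitch, &s->sched_switch, &s->sched_cnt,
	    &s->lb_idle, &s->lb_busy, &s->lb_resched, &s->lb_cnt,
	    &s->lb_imbalance, &s->lb_nobusy, &s->lb_bnode,
	    &s->pt_gained, &s->pt_lost, &s->pt_node_gained, &s->pt_node_lost,
	    &s->bn_cnt, &s->bn_idle);
	return n == 21 ? 0 : -1;
}

int main(void)
{
	char line[512];
	int cpu;
	struct schedstat_v2 s;

	while (fgets(line, sizeof(line), stdin))
		if (parse_cpu_line(line, &cpu, &s) == 0)
			printf("cpu%d: lb_cnt=%lu pt_gained=%lu pt_lost=%lu\n",
			    cpu, s.lb_cnt, s.pt_gained, s.pt_lost);
	return 0;
}

As the comment above SCHEDSTAT_VERSION notes, any such tool should check the version and
field count rather than assume this layout, since the format is bumped whenever fields change.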