diff -urNp --exclude CVS --exclude BitKeeper x-ref/drivers/block/elevator.c x/drivers/block/elevator.c
--- x-ref/drivers/block/elevator.c	2003-05-11 05:06:25.000000000 +0200
+++ x/drivers/block/elevator.c	2003-05-14 01:29:31.000000000 +0200
@@ -80,30 +80,41 @@ int elevator_linus_merge(request_queue_t
 			 struct buffer_head *bh, int rw,
 			 int max_sectors)
 {
-	struct list_head *entry = &q->queue_head;
+	struct list_head *entry, *real_head;
 	unsigned int count = bh->b_size >> 9, ret = ELEVATOR_NO_MERGE;
 	struct request *__rq;
 	int backmerge_only = 0;
 
+	if (!bh_elv_seq(bh))
+		entry = &q->queue_head;
+	else
+		entry = &q->atomic_head;
+	real_head = entry;
+
 	while (!backmerge_only && (entry = entry->prev) != head) {
 		__rq = blkdev_entry_to_request(entry);
 
 		/*
 		 * we can't insert beyond a zero sequence point
 		 */
-		if (__rq->elevator_sequence <= 0)
+		if (__rq->elevator_sequence <= 0 && !bh_elv_seq(bh))
 			backmerge_only = 1;
 
 		if (__rq->waiting)
 			continue;
 		if (__rq->rq_dev != bh->b_rdev)
 			continue;
-		if (!*req && bh_rq_in_between(bh, __rq, &q->queue_head) && !backmerge_only)
+		if (!*req && bh_rq_in_between(bh, __rq, real_head) && !backmerge_only)
 			*req = __rq;
 		if (__rq->cmd != rw)
 			continue;
 		if (__rq->nr_sectors + count > max_sectors)
 			continue;
+		/*
+		 * possibly move this inside the merge path and make it a break
+		 */
+		if (bh_elv_seq(bh) != bh_elv_seq(__rq->bh))
+			continue;
 		if (__rq->sector + __rq->nr_sectors == bh->b_rsector) {
 			ret = ELEVATOR_BACK_MERGE;
 			*req = __rq;
@@ -124,7 +135,7 @@ int elevator_linus_merge(request_queue_t
 		int scan_cost = ret ? 1 : ELV_LINUS_SEEK_COST;
 		struct list_head *entry = &(*req)->queue;
 
-		while ((entry = entry->next) != &q->queue_head) {
+		while ((entry = entry->next) != real_head) {
 			__rq = blkdev_entry_to_request(entry);
 			__rq->elevator_sequence -= scan_cost;
 		}
@@ -147,13 +158,18 @@ int elevator_noop_merge(request_queue_t 
 			struct buffer_head *bh, int rw,
 			int max_sectors)
 {
-	struct list_head *entry;
+	struct list_head *entry, *real_head;
 	unsigned int count = bh->b_size >> 9;
 
-	if (list_empty(&q->queue_head))
+	if (!bh_elv_seq(bh))
+		entry = &q->queue_head;
+	else
+		entry = &q->atomic_head;
+	real_head = entry;
+
+	if (list_empty(real_head))
 		return ELEVATOR_NO_MERGE;
 
-	entry = &q->queue_head;
 	while ((entry = entry->prev) != head) {
 		struct request *__rq = blkdev_entry_to_request(entry);
 
@@ -165,6 +181,11 @@ int elevator_noop_merge(request_queue_t 
 			continue;
 		if (__rq->waiting)
 			continue;
+		/*
+		 * possibly move this inside the merge path and make it a break
+		 */
+		if (bh_elv_seq(bh) != bh_elv_seq(__rq->bh))
+			continue;
 		if (__rq->sector + __rq->nr_sectors == bh->b_rsector) {
 			*req = __rq;
 			return ELEVATOR_BACK_MERGE;
@@ -174,7 +195,7 @@ int elevator_noop_merge(request_queue_t 
 		}
 	}
 
-	*req = blkdev_entry_to_request(q->queue_head.prev);
+	*req = blkdev_entry_to_request(real_head->prev);
 	return ELEVATOR_NO_MERGE;
 }
 
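The elevator hunks above make every merge decision conditional on bh_elv_seq(): a buffer_head may only be merged into a request that carries the same elevator sequence, so the buffers of one atomic group can never be coalesced with unrelated I/O, and requests with a non-zero sequence are scanned on q->atomic_head rather than q->queue_head. A rough userspace sketch of the added merge rule, using invented stand-in types (fake_rq, fake_bh) rather than the real struct request and struct buffer_head:

#include <stdio.h>

struct fake_rq { int cmd; unsigned long sector, nr_sectors; unsigned int elv_seq; };
struct fake_bh { int rw; unsigned long rsector, sectors; unsigned int elv_seq; };

/* simplified version of the back-merge test in elevator_linus_merge() */
static int can_back_merge(const struct fake_rq *rq, const struct fake_bh *bh,
			  unsigned long max_sectors)
{
	if (rq->cmd != bh->rw)
		return 0;
	if (rq->nr_sectors + bh->sectors > max_sectors)
		return 0;
	/* the new rule: only identical atomic sequences may merge */
	if (rq->elv_seq != bh->elv_seq)
		return 0;
	return rq->sector + rq->nr_sectors == bh->rsector;
}

int main(void)
{
	struct fake_rq rq = { 1 /* WRITE */, 0, 8, 42 };
	struct fake_bh same = { 1, 8, 8, 42 }, other = { 1, 8, 8, 43 };

	printf("same sequence merges:      %d\n", can_back_merge(&rq, &same, 128));
	printf("different sequence merges: %d\n", can_back_merge(&rq, &other, 128));
	return 0;
}
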
diff -urNp --exclude CVS --exclude BitKeeper x-ref/drivers/block/ll_rw_blk.c x/drivers/block/ll_rw_blk.c
--- x-ref/drivers/block/ll_rw_blk.c	2003-05-14 01:29:28.000000000 +0200
+++ x/drivers/block/ll_rw_blk.c	2003-05-14 01:29:31.000000000 +0200
@@ -51,6 +51,8 @@ static kmem_cache_t *request_cachep;
  */
 DECLARE_TASK_QUEUE(tq_disk);
 
+LIST_HEAD(blk_atomic_head);
+
 /*
  * Protect the request list against multiple users..
  *
@@ -125,9 +127,63 @@ int * max_sectors[MAX_BLKDEV];
  */
 char * blkdev_varyio[MAX_BLKDEV];
 
+/*
+ * only allow merging of buffer_heads with an identical sequence, to
+ * transparently support atomic writes larger than a single bh can hold
+ */
+static unsigned int blk_atomic_seq;
+spinlock_cacheline_t blk_atomic_lock_cacheline = {SPIN_LOCK_UNLOCKED};
+spinlock_cacheline_t blk_atomic_queue_lock_cacheline = {SPIN_LOCK_UNLOCKED};
+
+#ifdef CONFIG_SMP
+struct blk_atomic_cpu {
+	unsigned int seq;
+	unsigned int left;
+} ____cacheline_aligned_in_smp;
+
+struct blk_atomic_cpu __cacheline_aligned_in_smp blk_atomic_cpu[NR_CPUS];
+
+#define BLK_ATOMIC_SEQ_GRAB	1024
+#endif
+
 unsigned long blk_max_low_pfn, blk_max_pfn;
 int blk_nohighio = 0;
 
+unsigned int blk_get_atomic_seq(void)
+{
+	unsigned int ret;
+
+#ifdef CONFIG_SMP
+	{
+		struct blk_atomic_cpu *bcpu = &blk_atomic_cpu[smp_processor_id()];
+
+restart:
+		if (unlikely(!bcpu->left)) {
+			spin_lock_irq(&blk_atomic_lock);
+			bcpu->seq = blk_atomic_seq;
+			blk_atomic_seq += BLK_ATOMIC_SEQ_GRAB;
+			spin_unlock_irq(&blk_atomic_lock);
+			bcpu->left = BLK_ATOMIC_SEQ_GRAB;
+		}
+		bcpu->seq++;
+		bcpu->left--;
+		if (unlikely(!bcpu->seq))
+			goto restart;
+
+		ret = bcpu->seq;
+	}
+#else
+	spin_lock_irq(&blk_atomic_lock);
+	ret = ++blk_atomic_seq;
+	if (unlikely(!ret)) {
+		ret = 1;
+		++blk_atomic_seq;
+	}
+	spin_unlock_irq(&blk_atomic_lock);
+#endif
+	return ret;
+}
+
 static inline int get_max_sectors(kdev_t dev)
 {
 	if (!max_sectors[MAJOR(dev)])
@@ -383,6 +439,91 @@ void generic_unplug_device(void *data)
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
+static void blk_atomic_add(request_queue_t *q)
+{
+	spin_lock_irq(&blk_atomic_queue_lock);
+	/* atomic_entry is empty only when q is not on the blk_atomic_head list */
+	if (list_empty(&q->atomic_entry))
+		list_add_tail(&q->atomic_entry, &blk_atomic_head);
+	spin_unlock_irq(&blk_atomic_queue_lock);
+}
+
+static struct list_head *blk_find_insert_point(request_queue_t *q,
+					       struct request *rq)
+{
+	struct list_head *head = &q->queue_head, *insert = q->queue_head.prev;
+	struct buffer_head *bh;
+	int elv_seq;
+	struct request *dummy;
+
+	if (list_empty(head))
+		goto done;
+	else if (q->head_active && !q->plugged)
+		head = head->next;
+
+	dummy = NULL;
+	bh = rq->bh;
+
+	elv_seq = bh_elv_seq(bh);
+	bh_elv_seq(bh) = 0;
+
+	q->elevator.elevator_merge_fn(q, &dummy, head, bh,
+				      -1 /* non cmd -> no merge */,
+				      0 /* too small max_sectors -> no merge */);
+
+	bh_elv_seq(bh) = elv_seq;
+
+	if (dummy)
+		insert = &dummy->queue;
+
+done:
+	return insert;
+}
+
+void blk_refile_atomic_queue(int sequence)
+{
+	request_queue_t *q;
+	struct request * rq;
+	unsigned long flags;
+	struct list_head * q_entry, * rq_entry;
+	int __sequence;
+
+	spin_lock_irqsave(&blk_atomic_queue_lock, flags);
+
+	q_entry = blk_atomic_head.next;
+	while (q_entry != &blk_atomic_head) {
+		q = list_entry(q_entry, request_queue_t, atomic_entry);
+		q_entry = q_entry->next;
+
+		spin_lock(q->queue_lock);
+		rq_entry = q->atomic_head.next;
+		while (rq_entry != &q->atomic_head) {
+			rq = list_entry(rq_entry, struct request, queue);
+			rq_entry = rq_entry->next;
+
+			BUG_ON(!rq->q);
+			BUG_ON(!rq->bh);
+			__sequence = bh_elv_seq(rq->bh);
+			BUG_ON(!__sequence);
+			if (__sequence == sequence) {
+				struct list_head *ipoint;
+
+				list_del(&rq->queue);
+				if (list_empty(&q->queue_head))
+					q->plug_device_fn(q, rq->bh->b_rdev);
+
+				ipoint = blk_find_insert_point(q, rq);
+				list_add(&rq->queue, ipoint);
+			}
+		}
+		if (list_empty(&q->atomic_head))
+			list_del_init(&q->atomic_entry);
+		spin_unlock(q->queue_lock);
+	}
+
+	spin_unlock_irqrestore(&blk_atomic_queue_lock, flags);
+}
+
 /** blk_grow_request_list
  *  @q: The &request_queue_t
  *  @nr_requests: how many requests are desired
@@ -500,6 +641,8 @@ static int __make_request(request_queue_
 void blk_init_queue(request_queue_t * q, request_fn_proc * rfn)
 {
 	INIT_LIST_HEAD(&q->queue_head);
+	INIT_LIST_HEAD(&q->atomic_head);
+	INIT_LIST_HEAD(&q->atomic_entry);
 	elevator_init(&q->elevator, ELEVATOR_LINUS);
 	q->queue_lock		= &io_request_lock;
 	blk_init_free_list(q);
@@ -862,11 +1005,6 @@ static inline void add_request(request_q
 {
 	drive_stat_acct(req->rq_dev, req->cmd, req->nr_sectors, 1);
 
-	if (!q->plugged && q->head_active && insert_here == &q->queue_head) {
-		spin_unlock_irq(q->queue_lock);
-		BUG();
-	}
-
 	/*
 	 * elevator indicated where it wants this request to be
 	 * inserted at elevator_merge time
@@ -916,6 +1054,8 @@ static void attempt_merge(request_queue_
 	    || req->nr_sectors + next->nr_sectors > max_sectors
 	    || next->waiting)
 		return;
+	if (bh_elv_seq(req->bh) != bh_elv_seq(next->bh))
+		return;
 	/*
 	 * If we are not allowed to merge these requests, then
 	 * return.  If we are allowed to merge, then the count
@@ -939,11 +1079,12 @@ static void attempt_merge(request_queue_
 }
 
 static inline void attempt_back_merge(request_queue_t * q,
+				      struct list_head * head,
 				      struct request *req,
 				      int max_sectors,
 				      int max_segments)
 {
-	if (&req->queue == q->queue_head.prev)
+	if (&req->queue == head->prev)
 		return;
 	attempt_merge(q, req, max_sectors, max_segments);
 }
@@ -969,9 +1110,10 @@ static int __make_request(request_queue_
 	int max_segments = MAX_SEGMENTS;
 	struct request * req, *freereq = NULL;
 	int rw_ahead, max_sectors, el_ret;
-	struct list_head *head, *insert_here;
+	struct list_head *head, *real_head, *insert_here;
 	int latency;
 	elevator_t *elevator = &q->elevator;
+	int atomic = bh_elv_seq(bh), atomic_add = 0;
 
 	count = bh->b_size >> 9;
 	sector = bh->b_rsector;
@@ -1013,7 +1155,7 @@ static int __make_request(request_queue_
 	max_sectors = get_max_sectors(bh->b_rdev);
 
 	req = NULL;
-	head = &q->queue_head;
+	real_head = head = !atomic ? &q->queue_head : &q->atomic_head;
 	/*
 	 * Now we acquire the request spinlock, we have to be mega careful
 	 * not to schedule or do something nonatomic
@@ -1022,11 +1164,14 @@ static int __make_request(request_queue_
 
 again:
 	insert_here = head->prev;
-	if (list_empty(head)) {
-		q->plug_device_fn(q, bh->b_rdev); /* is atomic */
+	if (!atomic) {
+		if (list_empty(head)) {
+			q->plug_device_fn(q, bh->b_rdev); /* is atomic */
+			goto get_rq;
+		} else if (q->head_active && !q->plugged)
+			head = head->next;
+	} else if (list_empty(head))
 		goto get_rq;
-	} else if (q->head_active && !q->plugged)
-		head = head->next;
 
 	el_ret = elevator->elevator_merge_fn(q, &req, head, bh, rw,max_sectors);
 	switch (el_ret) {
@@ -1042,7 +1187,7 @@ again:
 			blk_started_io(req, count);
 			drive_stat_acct(req->rq_dev, req->cmd, count, 0);
 			req_new_io(req, 1, count);
-			attempt_back_merge(q, req, max_sectors, max_segments);
+			attempt_back_merge(q, real_head, req, max_sectors, max_segments);
 			goto out;
 
 		case ELEVATOR_FRONT_MERGE:
@@ -1105,8 +1250,10 @@ get_rq:
 			req = get_request(q, rw);
 			if (req == NULL) {
 				spin_unlock_irq(q->queue_lock);
+				if (atomic)
+					blk_refile_atomic_queue(atomic);
 				freereq = __get_request_wait(q, rw);
-				head = &q->queue_head;
+				head = real_head;
 				spin_lock_irq(q->queue_lock);
 				get_request_wait_wakeup(q, rw);
 				goto again;
@@ -1132,10 +1279,13 @@ get_rq:
 	req_new_io(req, 0, count);
 	blk_started_io(req, count);
 	add_request(q, req, insert_here);
+	atomic_add = atomic;
 out:
 	if (freereq)
 		blkdev_release_request(freereq);
 	spin_unlock_irq(q->queue_lock);
+	if (atomic_add)
+		blk_atomic_add(q);
 	return 0;
 end_io:
 	bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
@@ -1184,6 +1334,8 @@ void generic_make_request (int rw, struc
 
 	if (!bh->b_end_io)
 		BUG();
+	if (!buffer_atomic(bh))
+		bh->b_elv_sequence = 0;
 
 	/* Test device size, when known. */
 	if (blk_size[major])
@@ -1470,6 +1622,10 @@ int __init blk_dev_init(void)
 	memset(max_readahead, 0, sizeof(max_readahead));
 	memset(max_sectors, 0, sizeof(max_sectors));
 
+#ifdef CONFIG_SMP
+	memset(blk_atomic_cpu, 0, sizeof(blk_atomic_cpu));
+#endif
+
 	blk_max_low_pfn = max_low_pfn - 1;
 	blk_max_pfn = max_pfn - 1;
 
@@ -1589,3 +1745,5 @@ EXPORT_SYMBOL(blk_max_low_pfn);
 EXPORT_SYMBOL(blk_max_pfn);
 EXPORT_SYMBOL(blk_seg_merge_ok);
 EXPORT_SYMBOL(blk_nohighio);
+EXPORT_SYMBOL(blk_get_atomic_seq);
+EXPORT_SYMBOL(blk_refile_atomic_queue);
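blk_get_atomic_seq() hands out the per-group sequence numbers: on SMP each CPU grabs a batch of BLK_ATOMIC_SEQ_GRAB values under blk_atomic_lock and then serves them without further locking, with 0 reserved to mean "not atomic". Requests tagged with a sequence are parked on q->atomic_head by __make_request() and only refiled onto the real queue_head, at the point picked by blk_find_insert_point(), once the submitter calls blk_refile_atomic_queue() for that sequence. A rough userspace model of the batching allocator, with a pthread mutex standing in for the spinlock and invented names (cpu_seq, get_atomic_seq):

#include <pthread.h>
#include <stdio.h>

#define SEQ_GRAB 1024				/* BLK_ATOMIC_SEQ_GRAB */

static unsigned int global_seq;
static pthread_mutex_t seq_lock = PTHREAD_MUTEX_INITIALIZER;

struct cpu_seq { unsigned int seq, left; };	/* one per CPU in the patch */

static unsigned int get_atomic_seq(struct cpu_seq *c)
{
restart:
	if (!c->left) {
		/* batch refill under the global lock */
		pthread_mutex_lock(&seq_lock);
		c->seq = global_seq;
		global_seq += SEQ_GRAB;
		pthread_mutex_unlock(&seq_lock);
		c->left = SEQ_GRAB;
	}
	c->seq++;
	c->left--;
	if (!c->seq)				/* 0 means "non-atomic", skip it */
		goto restart;
	return c->seq;
}

int main(void)
{
	struct cpu_seq cpu = { 0, 0 };

	for (int i = 0; i < 3; i++)
		printf("sequence %u\n", get_atomic_seq(&cpu));
	return 0;
}
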
diff -urNp --exclude CVS --exclude BitKeeper x-ref/drivers/md/md.c x/drivers/md/md.c
--- x-ref/drivers/md/md.c	2003-05-14 01:29:17.000000000 +0200
+++ x/drivers/md/md.c	2003-05-14 01:29:31.000000000 +0200
@@ -494,6 +494,7 @@ static int sync_page_io(kdev_t dev, unsi
 	bh.b_page = page;
 	bh.b_reqnext = NULL;
 	bh.b_data = page_address(page);
+	bh.b_elv_sequence = 0;
 	generic_make_request(rw, &bh);
 
 	run_task_queue(&tq_disk);
diff -urNp --exclude CVS --exclude BitKeeper x-ref/drivers/md/raid1.c x/drivers/md/raid1.c
--- x-ref/drivers/md/raid1.c	2003-05-11 05:06:25.000000000 +0200
+++ x/drivers/md/raid1.c	2003-05-14 01:29:31.000000000 +0200
@@ -686,6 +686,7 @@ static int raid1_make_request (mddev_t *
  		mbh->b_list       = BUF_LOCKED;
  		mbh->b_end_io     = raid1_end_request;
  		mbh->b_private    = r1_bh;
+		mbh->b_elv_sequence = bh->b_elv_sequence;
 
 		mbh->b_next = r1_bh->mirror_bh_list;
 		r1_bh->mirror_bh_list = mbh;
@@ -1455,6 +1456,7 @@ static int raid1_sync_request (mddev_t *
 	bh->b_private = r1_bh;
 	bh->b_blocknr = sector_nr;
 	bh->b_rsector = sector_nr;
+	bh->b_elv_sequence = 0;
 	init_waitqueue_head(&bh->b_wait);
 
 	generic_make_request(READ, bh);
diff -urNp --exclude CVS --exclude BitKeeper x-ref/drivers/md/raid5.c x/drivers/md/raid5.c
--- x-ref/drivers/md/raid5.c	2003-05-11 05:06:25.000000000 +0200
+++ x/drivers/md/raid5.c	2003-05-14 01:29:31.000000000 +0200
@@ -151,7 +151,7 @@ static void shrink_buffers(struct stripe
 			return;
 		sh->bh_cache[i] = NULL;
 		free_page((unsigned long) bh->b_data);
-		kfree(bh);
+		kmem_cache_free(bh_cachep, bh);
 	}
 }
 
@@ -162,7 +162,7 @@ static int grow_buffers(struct stripe_he
 
 	for (i=0; i<num; i++) {
 		struct page *page;
-		bh = kmalloc(sizeof(struct buffer_head), priority);
+		bh = kmem_cache_alloc(bh_cachep, priority);
 		if (!bh)
 			return 1;
 		memset(bh, 0, sizeof (struct buffer_head));
@@ -170,7 +170,7 @@ static int grow_buffers(struct stripe_he
 		if ((page = alloc_page(priority)))
 			bh->b_data = page_address(page);
 		else {
-			kfree(bh);
+			kmem_cache_free(bh_cachep, bh);
 			return 1;
 		}
 		atomic_set(&bh->b_count, 0);
@@ -474,6 +474,7 @@ static struct buffer_head *raid5_build_b
 	bh->b_state	= (1 << BH_Req) | (1 << BH_Mapped);
 	bh->b_size	= sh->size;
 	bh->b_list	= BUF_LOCKED;
+	bh->b_elv_sequence = 0;
 	return bh;
 }
 
diff -urNp --exclude CVS --exclude BitKeeper x-ref/fs/buffer.c x/fs/buffer.c
--- x-ref/fs/buffer.c	2003-05-14 01:29:28.000000000 +0200
+++ x/fs/buffer.c	2003-05-14 01:38:08.000000000 +0200
@@ -132,6 +132,7 @@ void unlock_buffer(struct buffer_head *b
 {
 	clear_bit(BH_Wait_IO, &bh->b_state);
 	clear_bit(BH_Launder, &bh->b_state);
+	clear_bit(BH_Atomic, &bh->b_state);
 	/*
 	 * When a locked buffer is visible to the I/O layer BH_Launder
 	 * is set. This means before unlocking we must clear BH_Launder,
@@ -2269,6 +2270,7 @@ int brw_kiovec(int rw, int nr, struct ki
 	struct page *	map;
 	struct buffer_head *tmp, **bhs = NULL;
 	int iosize = size;
+	unsigned int	atomic_seq;
 
 	if (!nr)
 		return 0;
@@ -2285,6 +2287,10 @@ int brw_kiovec(int rw, int nr, struct ki
 			panic("brw_kiovec: iobuf not initialised");
 	}
 
+	atomic_seq = 0;
+	if (rw == WRITE)
+		atomic_seq = blk_get_atomic_seq();
+
 	/* 
 	 * OK to walk down the iovec doing page IO on each page we find. 
 	 */
@@ -2341,7 +2347,8 @@ int brw_kiovec(int rw, int nr, struct ki
 				init_buffer(tmp, end_buffer_io_kiobuf, iobuf);
 				tmp->b_dev = dev;
 				tmp->b_blocknr = blocknr;
-				tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req);
+				tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req) | (1 << BH_Atomic);
+				bh_elv_seq(tmp) = atomic_seq;
 
 				if (rw == WRITE) {
 					set_bit(BH_Uptodate, &tmp->b_state);
@@ -2359,12 +2366,15 @@ int brw_kiovec(int rw, int nr, struct ki
 				 * Wait for IO if we have got too much 
 				 */
 				if (bhind >= KIO_MAX_SECTORS) {
+					blk_refile_atomic_queue(atomic_seq);
 					kiobuf_wait_for_io(iobuf); /* wake-one */
 					err = wait_kio(rw, bhind, bhs, size);
 					if (err >= 0)
 						transferred += err;
 					else
 						goto finished;
+					if (rw == WRITE)
+						atomic_seq = blk_get_atomic_seq();
 					bhind = 0;
 				}
 
@@ -2383,12 +2393,11 @@ int brw_kiovec(int rw, int nr, struct ki
 
 	/* Is there any IO still left to submit? */
 	if (bhind) {
+		blk_refile_atomic_queue(atomic_seq);
 		kiobuf_wait_for_io(iobuf); /* wake-one */
 		err = wait_kio(rw, bhind, bhs, size);
 		if (err >= 0)
 			transferred += err;
-		else
-			goto finished;
 	}
 
  finished:
@@ -3160,6 +3169,7 @@ int brw_kvec_async(int rw, kvec_cb_t cb,
 	int		length;
 	unsigned	sector_size = 1 << sector_shift;
 	int		i;
+	unsigned int	atomic_seq;
 
 	struct brw_cb	*brw_cb;
 
@@ -3200,6 +3210,10 @@ int brw_kvec_async(int rw, kvec_cb_t cb,
 	brw_cb->cb = cb;
 	brw_cb->nr = 0;
 
+	atomic_seq = 0;
+	if (rw == WRITE)
+		atomic_seq = blk_get_atomic_seq();
+
 	/* This is ugly.  FIXME. */
 	for (i=0, veclet=vec->veclet; i<vec->nr; i++,veclet++) {
 		struct page *page = veclet->page;
@@ -3224,8 +3238,9 @@ int brw_kvec_async(int rw, kvec_cb_t cb,
 			init_buffer(tmp, end_buffer_io_kiobuf_async, NULL);
 			tmp->b_dev = dev;
 			tmp->b_blocknr = blknr++;
-			tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock)
-					| (1 << BH_Req);
+			tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) |
+				       (1 << BH_Req) | (1 << BH_Atomic);
+			bh_elv_seq(tmp) = atomic_seq;
 			tmp->b_private = brw_cb;
 
 			if (rw == WRITE) {
@@ -3248,6 +3263,8 @@ int brw_kvec_async(int rw, kvec_cb_t cb,
 	} /* End of page loop */		
 
 submit:
+	blk_refile_atomic_queue(atomic_seq);
+
 	atomic_set(&brw_cb->io_count, brw_cb->nr+1);
 	/* okay, we've setup all our io requests, now fire them off! */
 	for (i=0; i<brw_cb->nr; i++) 
@@ -3257,6 +3274,8 @@ submit:
 	return 0;
 
 error:
+	blk_refile_atomic_queue(atomic_seq);
+
 	/* Walk brw_cb_table freeing all the goop associated with each kiobuf */
 	if (brw_cb) {
 		/* We got an error allocating the bh'es.  Just free the current
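The brw_kiovec()/brw_kvec_async() hunks show the protocol a submitter of atomic writes follows: take one sequence per batch of WRITE buffers, tag every buffer_head with it (together with BH_Atomic), and call blk_refile_atomic_queue() on that sequence before waiting, so the parked requests reach the device queue as one group that merges only with itself. A hedged outline of that flow; the stubbed functions below are placeholders that only print what the kernel primitives of the same name would do:

#include <stdio.h>

struct bh { unsigned int elv_seq; int atomic; };

static unsigned int next_seq = 1;
static unsigned int blk_get_atomic_seq(void) { return next_seq++; }
static void submit_bh_write(struct bh *bh) { printf("queue bh with seq %u\n", bh->elv_seq); }
static void blk_refile_atomic_queue(unsigned int seq) { printf("refile seq %u onto queue_head\n", seq); }
static void wait_for_io(void) { printf("wait for completion\n"); }

static void write_atomic_batch(struct bh *bhs, int nr)
{
	unsigned int seq = blk_get_atomic_seq();	/* one sequence per batch */
	int i;

	for (i = 0; i < nr; i++) {
		bhs[i].atomic = 1;			/* BH_Atomic in the patch */
		bhs[i].elv_seq = seq;			/* bh_elv_seq(tmp) = atomic_seq */
		submit_bh_write(&bhs[i]);
	}
	blk_refile_atomic_queue(seq);			/* before waiting, as brw_kiovec() does */
	wait_for_io();
}

int main(void)
{
	struct bh batch[4] = { { 0, 0 } };

	write_atomic_batch(batch, 4);
	return 0;
}
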
diff -urNp --exclude CVS --exclude BitKeeper x-ref/include/linux/blkdev.h x/include/linux/blkdev.h
--- x-ref/include/linux/blkdev.h	2003-05-14 01:29:28.000000000 +0200
+++ x/include/linux/blkdev.h	2003-05-14 01:29:31.000000000 +0200
@@ -111,6 +111,7 @@ struct request_queue
 	 * Together with queue_head for cacheline sharing
 	 */
 	struct list_head	queue_head;
+	struct list_head	atomic_head;
 	elevator_t		elevator;
 
 	request_fn_proc		* request_fn;
@@ -129,6 +130,7 @@ struct request_queue
 	 * This is used to remove the plug when tq_disk runs.
 	 */
 	struct tq_struct	plug_tq;
+	struct list_head	atomic_entry;
 
 	/*
 	 * Boolean that indicates whether this queue is plugged or not.
@@ -176,6 +178,14 @@ extern unsigned long blk_max_low_pfn, bl
 #define BLK_BOUNCE_HIGH		(blk_max_low_pfn << PAGE_SHIFT)
 #define BLK_BOUNCE_ANY		(blk_max_pfn << PAGE_SHIFT)
 
+/*
+ * Maximum guaranteed atomic I/O size while dealing with bounce buffers.
+ * highmemio-capable devices (pci64 in particular) can go well beyond
+ * this limit. Must be a multiple of 512 bytes.
+ */
+#define BLK_ATOMIC_BOUNCE_SIZE		32768
+#define BLK_ATOMIC_BOUNCE_ENTRIES	(BLK_ATOMIC_BOUNCE_SIZE >> 9)
+
 extern void blk_queue_bounce_limit(request_queue_t *, u64);
 
 #ifdef CONFIG_HIGHMEM
@@ -233,6 +243,13 @@ extern void generic_make_request(int rw,
 extern inline request_queue_t *blk_get_queue(kdev_t dev);
 extern void blkdev_release_request(struct request *);
 
+extern spinlock_cacheline_t blk_atomic_lock_cacheline;
+#define blk_atomic_lock (blk_atomic_lock_cacheline.lock)
+extern unsigned int blk_get_atomic_seq(void);
+extern spinlock_cacheline_t blk_atomic_queue_lock_cacheline;
+#define blk_atomic_queue_lock (blk_atomic_queue_lock_cacheline.lock)
+extern void FASTCALL(blk_refile_atomic_queue(int sequence));
+
 /*
  * Access functions for manipulating queue properties
  */
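With BLK_ATOMIC_BOUNCE_SIZE at 32768 bytes, BLK_ATOMIC_BOUNCE_ENTRIES works out to 32768 >> 9 = 64 reserved bounce slots, the largest atomic write the highmem bounce path guarantees to serve from its private pool. A trivial check of that arithmetic:

#include <stdio.h>

int main(void)
{
	unsigned int size = 32768;			/* BLK_ATOMIC_BOUNCE_SIZE */

	printf("bounce entries = %u\n", size >> 9);	/* 32768 / 512 = 64 */
	return 0;
}
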
diff -urNp --exclude CVS --exclude BitKeeper x-ref/include/linux/fs.h x/include/linux/fs.h
--- x-ref/include/linux/fs.h	2003-05-14 01:29:28.000000000 +0200
+++ x/include/linux/fs.h	2003-05-14 01:29:31.000000000 +0200
@@ -226,6 +226,7 @@ enum bh_state_bits {
 	BH_Attached,	/* 1 if b_inode_buffers is linked into a list */
 	BH_JBD,		/* 1 if it has an attached journal_head */
 	BH_Delay,	/* 1 if the buffer is delayed allocate */
+	BH_Atomic,	/* 1 if b_elv_sequence is valid */
 
 	BH_PrivateStart,/* not a state bit, but the first bit available
 			 * for private allocation by other entities
@@ -270,6 +271,7 @@ struct buffer_head {
  	void *b_private;		/* reserved for b_end_io */
 
 	unsigned long b_rsector;	/* Real buffer location on disk */
+	int b_elv_sequence;			/* for atomic blocks */
 	wait_queue_head_t b_wait;
 
 	struct list_head     b_inode_buffers;	/* doubly linked list of inode dirty buffers */
@@ -289,6 +291,7 @@ void init_buffer(struct buffer_head *, b
 #define buffer_async(bh)	__buffer_state(bh,Async)
 #define buffer_launder(bh)	__buffer_state(bh,Launder)
 #define buffer_delay(bh)	__buffer_state(bh,Delay)
+#define buffer_atomic(bh)	__buffer_state(bh,Atomic)
 
 #define bh_offset(bh)		((unsigned long)(bh)->b_data & ~PAGE_MASK)
 
@@ -296,6 +299,7 @@ extern void set_bh_page(struct buffer_he
 
 #define touch_buffer(bh)	mark_page_accessed(bh->b_page)
 
+#define bh_elv_seq(bh)		(bh)->b_elv_sequence
 
 #include <linux/pipe_fs_i.h>
 #include <linux/minix_fs_i.h>
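The fs.h additions tie the per-buffer state together: BH_Atomic means b_elv_sequence holds a valid value, and generic_make_request() zeroes the sequence whenever the bit is clear, so a stale sequence left in a recycled buffer_head can never group unrelated I/O. A tiny sketch of that invariant, using a simplified struct and flag rather than the kernel definitions:

#include <stdio.h>

#define BH_ATOMIC	0x1			/* stand-in for the BH_Atomic state bit */

struct bh { unsigned long state; int elv_sequence; };

/* what generic_make_request() now does before queueing the buffer */
static int effective_seq(struct bh *bh)
{
	if (!(bh->state & BH_ATOMIC))
		bh->elv_sequence = 0;		/* stale value: treat as non-atomic */
	return bh->elv_sequence;
}

int main(void)
{
	struct bh recycled = { 0, 42 };		/* left-over sequence, bit clear */
	struct bh tagged = { BH_ATOMIC, 42 };

	printf("recycled: %d, tagged: %d\n",
	       effective_seq(&recycled), effective_seq(&tagged));
	return 0;
}
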
diff -urNp --exclude CVS --exclude BitKeeper x-ref/mm/highmem.c x/mm/highmem.c
--- x-ref/mm/highmem.c	2003-05-14 01:29:20.000000000 +0200
+++ x/mm/highmem.c	2003-05-14 01:29:31.000000000 +0200
@@ -22,6 +22,7 @@
 #include <linux/swap.h>
 #include <linux/slab.h>
 #include <linux/interrupt.h>
+#include <linux/blkdev.h>
 #include <asm/pgalloc.h>
 
 /*
@@ -211,6 +212,14 @@ static LIST_HEAD(emergency_pages);
 int nr_emergency_bhs;
 static LIST_HEAD(emergency_bhs);
 
+int nr_atomic_emergency_pages;
+static LIST_HEAD(atomic_emergency_pages);
+
+int nr_atomic_emergency_bhs;
+static LIST_HEAD(atomic_emergency_bhs);
+
+int atomic_emergency_owner;
+
 /*
  * Simple bounce buffer support for highmem pages.
  * This will be moved to the block layer in 2.5.
@@ -250,35 +259,66 @@ static inline void bounce_end_io (struct
 	struct page *page;
 	struct buffer_head *bh_orig = (struct buffer_head *)(bh->b_private);
 	unsigned long flags;
+	int atomic = bh_elv_seq(bh);
 
 	bh_orig->b_end_io(bh_orig, uptodate);
 
 	page = bh->b_page;
 
 	spin_lock_irqsave(&emergency_lock, flags);
-	if (nr_emergency_pages >= POOL_SIZE)
-		__free_page(page);
-	else {
-		/*
-		 * We are abusing page->list to manage
-		 * the highmem emergency pool:
-		 */
-		list_add(&page->list, &emergency_pages);
-		nr_emergency_pages++;
-	}
-	
-	if (nr_emergency_bhs >= POOL_SIZE) {
+	if (!atomic) {
+		if (nr_emergency_pages >= POOL_SIZE)
+			__free_page(page);
+		else {
+			/*
+			 * We are abusing page->list to manage
+			 * the highmem emergency pool:
+			 */
+			list_add(&page->list, &emergency_pages);
+			nr_emergency_pages++;
+		}
+
+		if (nr_emergency_bhs >= POOL_SIZE) {
 #ifdef HIGHMEM_DEBUG
-		/* Don't clobber the constructed slab cache */
-		init_waitqueue_head(&bh->b_wait);
+			/* Don't clobber the constructed slab cache */
+			init_waitqueue_head(&bh->b_wait);
 #endif
-		kmem_cache_free(bh_cachep, bh);
+			kmem_cache_free(bh_cachep, bh);
+		} else {
+			/*
+			 * Ditto in the bh case, here we abuse b_inode_buffers:
+			 */
+			list_add(&bh->b_inode_buffers, &emergency_bhs);
+			nr_emergency_bhs++;
+		}
 	} else {
-		/*
-		 * Ditto in the bh case, here we abuse b_inode_buffers:
-		 */
-		list_add(&bh->b_inode_buffers, &emergency_bhs);
-		nr_emergency_bhs++;
+		if (nr_atomic_emergency_pages >= BLK_ATOMIC_BOUNCE_ENTRIES)
+			__free_page(page);
+		else {
+			/*
+			 * We are abusing page->list to manage
+			 * the highmem emergency pool:
+			 */
+			list_add(&page->list, &atomic_emergency_pages);
+			nr_atomic_emergency_pages++;
+		}
+
+		if (nr_atomic_emergency_bhs >= BLK_ATOMIC_BOUNCE_ENTRIES) {
+#ifdef HIGHMEM_DEBUG
+			/* Don't clobber the constructed slab cache */
+			init_waitqueue_head(&bh->b_wait);
+#endif
+			kmem_cache_free(bh_cachep, bh);
+		} else {
+			/*
+			 * Ditto in the bh case, here we abuse b_inode_buffers:
+			 */
+			list_add(&bh->b_inode_buffers, &atomic_emergency_bhs);
+			nr_atomic_emergency_bhs++;
+		}
+		BUG_ON(nr_atomic_emergency_pages != nr_atomic_emergency_bhs);
+		if (nr_atomic_emergency_pages >= BLK_ATOMIC_BOUNCE_ENTRIES)
+			atomic_emergency_owner = 0;
 	}
 	spin_unlock_irqrestore(&emergency_lock, flags);
 }
@@ -311,6 +351,24 @@ static __init int init_emergency_pool(vo
 		list_add(&bh->b_inode_buffers, &emergency_bhs);
 		nr_emergency_bhs++;
 	}
+	while (nr_atomic_emergency_pages < BLK_ATOMIC_BOUNCE_ENTRIES) {
+		struct page * page = alloc_page(GFP_ATOMIC);
+		if (!page) {
+			printk("couldn't refill highmem emergency pages");
+			break;
+		}
+		list_add(&page->list, &atomic_emergency_pages);
+		nr_atomic_emergency_pages++;
+	}
+	while (nr_atomic_emergency_bhs < BLK_ATOMIC_BOUNCE_ENTRIES) {
+		struct buffer_head * bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC);
+		if (!bh) {
+			printk("couldn't refill highmem emergency bhs");
+			break;
+		}
+		list_add(&bh->b_inode_buffers, &atomic_emergency_bhs);
+		nr_atomic_emergency_bhs++;
+	}
 	spin_unlock_irq(&emergency_lock);
 	printk("allocated %d pages and %d bhs reserved for the highmem bounces\n",
 	       nr_emergency_pages, nr_emergency_bhs);
@@ -334,7 +392,7 @@ static void bounce_end_io_read (struct b
 	bounce_end_io(bh, uptodate);
 }
 
-struct page *alloc_bounce_page (void)
+struct page *alloc_bounce_page (int atomic)
 {
 	struct list_head *tmp;
 	struct page *page;
@@ -352,17 +410,30 @@ repeat_alloc:
 	/*
 	 * Try to allocate from the emergency pool.
 	 */
-	tmp = &emergency_pages;
 	spin_lock_irq(&emergency_lock);
-	if (!list_empty(tmp)) {
-		page = list_entry(tmp->next, struct page, list);
-		list_del(tmp->next);
-		nr_emergency_pages--;
+	if (!atomic) {
+		tmp = &emergency_pages;
+		if (!list_empty(tmp)) {
+			page = list_entry(tmp->next, struct page, list);
+			list_del(tmp->next);
+			nr_emergency_pages--;
+		}
+	} else {
+		tmp = &atomic_emergency_pages;
+		if ((!atomic_emergency_owner || atomic_emergency_owner == atomic) &&
+		    !list_empty(tmp)) {
+			page = list_entry(tmp->next, struct page, list);
+			list_del(tmp->next);
+			nr_atomic_emergency_pages--;
+			atomic_emergency_owner = atomic;
+		}
 	}
 	spin_unlock_irq(&emergency_lock);
 	if (page)
 		return page;
 
+	if (atomic)
+		blk_refile_atomic_queue(atomic);
 	/* we need to wait I/O completion */
 	run_task_queue(&tq_disk);
 
@@ -370,7 +441,7 @@ repeat_alloc:
 	goto repeat_alloc;
 }
 
-struct buffer_head *alloc_bounce_bh (void)
+struct buffer_head *alloc_bounce_bh (int atomic)
 {
 	struct list_head *tmp;
 	struct buffer_head *bh;
@@ -388,17 +459,31 @@ repeat_alloc:
 	/*
 	 * Try to allocate from the emergency pool.
 	 */
-	tmp = &emergency_bhs;
 	spin_lock_irq(&emergency_lock);
-	if (!list_empty(tmp)) {
-		bh = list_entry(tmp->next, struct buffer_head, b_inode_buffers);
-		list_del(tmp->next);
-		nr_emergency_bhs--;
+	if (!atomic) {
+		tmp = &emergency_bhs;
+		if (!list_empty(tmp)) {
+			bh = list_entry(tmp->next, struct buffer_head, b_inode_buffers);
+			list_del(tmp->next);
+			nr_emergency_bhs--;
+		}
+	} else {
+		tmp = &atomic_emergency_bhs;
+		if ((!atomic_emergency_owner || atomic_emergency_owner == atomic) &&
+		    !list_empty(tmp)) {
+			bh = list_entry(tmp->next, struct buffer_head, b_inode_buffers);
+			list_del(tmp->next);
+			nr_atomic_emergency_bhs--;
+			atomic_emergency_owner = atomic;
+		}
+
 	}
 	spin_unlock_irq(&emergency_lock);
 	if (bh)
 		return bh;
 
+	if (atomic)
+		blk_refile_atomic_queue(atomic);
 	/* we need to wait I/O completion */
 	run_task_queue(&tq_disk);
 
@@ -414,14 +499,14 @@ struct buffer_head * create_bounce(int r
 	if (!PageHighMem(bh_orig->b_page))
 		return bh_orig;
 
-	bh = alloc_bounce_bh();
+	bh = alloc_bounce_bh(bh_elv_seq(bh_orig));
 	/*
 	 * This is wasteful for 1k buffers, but this is a stopgap measure
 	 * and we are being ineffective anyway. This approach simplifies
 	 * things immensly. On boxes with more than 4GB RAM this should
 	 * not be an issue anyway.
 	 */
-	page = alloc_bounce_page();
+	page = alloc_bounce_page(bh_elv_seq(bh_orig));
 
 	set_bh_page(bh, page, 0);
 
@@ -449,6 +534,7 @@ struct buffer_head * create_bounce(int r
 		bh->b_end_io = bounce_end_io_read;
 	bh->b_private = (void *)bh_orig;
 	bh->b_rsector = bh_orig->b_rsector;
+	bh_elv_seq(bh) = bh_elv_seq(bh_orig);
 #ifdef HIGHMEM_DEBUG
 	memset(&bh->b_wait, -1, sizeof(bh->b_wait));
 #endif
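The highmem changes split off a second emergency pool of BLK_ATOMIC_BOUNCE_ENTRIES pages and buffer_heads owned by one atomic sequence at a time: alloc_bounce_page() and alloc_bounce_bh() only dip into it when atomic_emergency_owner is 0 or already equals the caller's sequence, and bounce_end_io() drops ownership once the pool is full again, so a single atomic write of up to BLK_ATOMIC_BOUNCE_SIZE can always make progress without two atomic writers deadlocking on the reserve. A simplified userspace model of that ownership rule, with counters in place of the page/bh lists and invented names (take_slot, put_slot):

#include <stdio.h>

#define POOL_ENTRIES	64			/* BLK_ATOMIC_BOUNCE_ENTRIES */

static int pool_free = POOL_ENTRIES;
static unsigned int pool_owner;			/* 0 = unowned */

/* may sequence `seq' take one reserved bounce slot right now? */
static int take_slot(unsigned int seq)
{
	if ((pool_owner && pool_owner != seq) || !pool_free)
		return 0;			/* pool owned by someone else or empty */
	pool_owner = seq;
	pool_free--;
	return 1;
}

/* completion path: the slot returns, ownership drops once the pool is whole */
static void put_slot(void)
{
	if (++pool_free >= POOL_ENTRIES)
		pool_owner = 0;
}

int main(void)
{
	printf("seq 7 gets a slot: %d\n", take_slot(7));
	printf("seq 9 gets a slot: %d\n", take_slot(9));	/* refused: 7 owns the pool */
	put_slot();
	printf("seq 9 gets a slot: %d\n", take_slot(9));	/* pool whole again, 9 may own it */
	return 0;
}
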