From: Lou Langholtz <ldl@aros.net>

Here is an updated patch 6 (I called it 6.1) that fixes some additional
locking issues as well as fixing the header file so it can be used by user
applications too (like the nbd-client tool).  This patch is also
incremental against patch 5 (that cleaned up PARANOI stuff).  The updated
header file could be split in two files.  For now though, this is enough
change at once (IMO).




 drivers/block/nbd.c |  366 +++++++++++++++++++++++++++-------------------------
 include/linux/nbd.h |   65 ++++-----
 2 files changed, 229 insertions(+), 202 deletions(-)

diff -puN drivers/block/nbd.c~nbd-locking-fixes drivers/block/nbd.c
--- 25/drivers/block/nbd.c~nbd-locking-fixes	2003-06-26 18:39:48.000000000 -0700
+++ 25-akpm/drivers/block/nbd.c	2003-06-26 18:39:48.000000000 -0700
@@ -36,6 +36,11 @@
  * 03-06-24 Remove unneeded blksize_bits field from nbd_device struct.
  *   <ldl@aros.net>
  * 03-06-24 Cleanup PARANOIA usage & code. <ldl@aros.net>
+ * 03-06-24 Fix resource locking issues with ioctl user interface. Note that
+ *   this change is incompatible with existing user tools (nbd-client) and
+ *   require them to be updated. Actually, the linux 2.5 block layer redisign
+ *   already required existing user tools to be updated to properly set the
+ *   correct device size. <ldl@aros.net>
  *
  * possible FIXME: make set_sock / set_blksize / set_size / do_it one syscall
  * why not: would need verify_area and friends, would share yet another 
@@ -137,21 +142,26 @@ static const char *nbdcmd_to_ascii(int c
 }
 #endif /* NDEBUG */
 
-static void nbd_end_request(struct request *req)
+static void request_end_while_locked(struct request *req)
 {
 	int uptodate = (req->errors == 0) ? 1 : 0;
-	request_queue_t *q = req->q;
-	unsigned long flags;
 
 	dprintk(DBG_BLKDEV, "%s: request %p: %s\n", req->rq_disk->disk_name,
 			req, uptodate? "done": "failed");
+	if (!end_that_request_first(req, uptodate, req->nr_sectors))
+		end_that_request_last(req);
 #ifdef PARANOIA
 	requests_out++;
 #endif
+}
+
+static void request_end(struct request *req)
+{
+	unsigned long flags;
+	request_queue_t *q = req->q;
+
 	spin_lock_irqsave(q->queue_lock, flags);
-	if (!end_that_request_first(req, uptodate, req->nr_sectors)) {
-		end_that_request_last(req);
-	}
+	request_end_while_locked(req);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
@@ -239,7 +249,7 @@ static inline int sock_send_bvec(struct 
 	return result;
 }
 
-void nbd_send_req(struct nbd_device *lo, struct request *req)
+static void nbd_send_req(struct nbd_device *lo, struct request *req)
 {
 	int result, i, flags;
 	struct nbd_request request;
@@ -252,13 +262,7 @@ void nbd_send_req(struct nbd_device *lo,
 	request.len = htonl(size);
 	memcpy(request.handle, &req, sizeof(req));
 
-	down(&lo->tx_lock);
-
-	if (!sock || !lo->sock) {
-		printk(KERN_ERR "%s: Attempted send on closed socket\n",
-				lo->disk->disk_name);
-		goto error_out;
-	}
+	BUG_ON(sock == NULL);
 
 	dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%luB)\n",
 			lo->disk->disk_name, req,
@@ -297,11 +301,9 @@ void nbd_send_req(struct nbd_device *lo,
 			}
 		}
 	}
-	up(&lo->tx_lock);
 	return;
 
-      error_out:
-	up(&lo->tx_lock);
+error_out:
 	req->errors++;
 }
 
@@ -398,28 +400,15 @@ struct request *nbd_read_stat(struct nbd
 	return req;
 }
 
-void nbd_do_it(struct nbd_device *lo)
-{
-	struct request *req;
-
-#ifdef PARANOIA
-	BUG_ON(lo->magic != LO_MAGIC);
-#endif
-	while ((req = nbd_read_stat(lo)) != NULL)
-		nbd_end_request(req);
-	printk(KERN_NOTICE "%s: req should never be null\n",
-			lo->disk->disk_name);
-	return;
-}
-
-void nbd_clear_que(struct nbd_device *lo)
+static int nbd_clear_que(struct nbd_device *lo)
 {
 	struct request *req;
 
-#ifdef PARANOIA
-	BUG_ON(lo->magic != LO_MAGIC);
-#endif
-
+	down(&lo->tx_lock);
+	if (lo->sock) {
+		up(&lo->tx_lock);
+		return -EBUSY;
+	}
 	do {
 		req = NULL;
 		spin_lock(&lo->queue_lock);
@@ -430,16 +419,18 @@ void nbd_clear_que(struct nbd_device *lo
 		spin_unlock(&lo->queue_lock);
 		if (req) {
 			req->errors++;
-			nbd_end_request(req);
+			request_end(req);
 		}
 	} while (req);
+	up(&lo->tx_lock);
+	return 0;
 }
 
 /*
  * We always wait for result of write, for now. It would be nice to make it optional
  * in future
  * if ((req->cmd == WRITE) && (lo->flags & NBD_WRITE_NOCHK)) 
- *   { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); }
+ *   { printk( "Warning: Ignoring result!\n"); request_end( req ); }
  */
 
 static void do_nbd_request(request_queue_t * q)
@@ -448,75 +439,69 @@ static void do_nbd_request(request_queue
 	
 	while ((req = elv_next_request(q)) != NULL) {
 		struct nbd_device *lo;
+		int notready;
 
 		blkdev_dequeue_request(req);
+#ifdef PARANOIA
+		requests_in++;
+#endif
+		req->errors = 0;
 		dprintk(DBG_BLKDEV, "%s: request %p: dequeued (flags=%lx)\n",
 				req->rq_disk->disk_name, req, req->flags);
 
 		if (!(req->flags & REQ_CMD))
-			goto error_out;
+			goto fail_request;
 
 		lo = req->rq_disk->private_data;
 #ifdef PARANOIA
 		BUG_ON(lo->magic != LO_MAGIC);
 #endif
-		if (!lo->file) {
-			printk(KERN_ERR "%s: Request when not-ready\n",
-					lo->disk->disk_name);
-			goto error_out;
-		}
+
 		nbd_cmd(req) = NBD_CMD_READ;
 		if (rq_data_dir(req) == WRITE) {
 			nbd_cmd(req) = NBD_CMD_WRITE;
 			if (lo->flags & NBD_READ_ONLY) {
 				printk(KERN_ERR "%s: Write on read-only\n",
 						lo->disk->disk_name);
-				goto error_out;
+				goto fail_request;
 			}
 		}
-#ifdef PARANOIA
-		requests_in++;
-#endif
 
-		req->errors = 0;
 		spin_unlock_irq(q->queue_lock);
-
-		spin_lock(&lo->queue_lock);
-
-		if (!lo->file) {
-			spin_unlock(&lo->queue_lock);
-			printk(KERN_ERR "%s: failed between accept and semaphore, file lost\n",
-					lo->disk->disk_name);
-			req->errors++;
-			nbd_end_request(req);
-			spin_lock_irq(q->queue_lock);
-			continue;
-		}
-
-		list_add(&req->queuelist, &lo->queue_head);
-		spin_unlock(&lo->queue_lock);
-
-		nbd_send_req(lo, req);
-
-		if (req->errors) {
-			printk(KERN_ERR "%s: Request send failed\n",
-					lo->disk->disk_name);
+		down(&lo->tx_lock);
+		if (lo->sock) {
+			notready = 0;
 			spin_lock(&lo->queue_lock);
-			list_del_init(&req->queuelist);
+			list_add(&req->queuelist, &lo->queue_head);
 			spin_unlock(&lo->queue_lock);
-			nbd_end_request(req);
-			spin_lock_irq(q->queue_lock);
-			continue;
+			nbd_send_req(lo, req);
+			if (req->errors) {
+				printk(KERN_ERR "%s: Request send failed\n",
+						lo->disk->disk_name);
+				spin_lock(&lo->queue_lock);
+				list_del_init(&req->queuelist);
+				spin_unlock(&lo->queue_lock);
+			}
 		}
-
+		else
+			notready = 1;
+		up(&lo->tx_lock);
 		spin_lock_irq(q->queue_lock);
+		if (notready) {
+			printk(KERN_ERR "%s: Request when not-ready\n",
+					lo->disk->disk_name);
+			req->errors++;
+			request_end_while_locked(req);
+		}
 		continue;
 
-	      error_out:
+fail_request:
+		/*
+		 * Fail the request: anyone waiting on a read or write gets
+		 * an error and can move on to their close() call.
+		 */
 		req->errors++;
-		spin_unlock(q->queue_lock);
-		nbd_end_request(req);
-		spin_lock(q->queue_lock);
+		request_end_while_locked(req);
 	}
 	return;
 }
@@ -548,12 +533,128 @@ static int nbd_release(struct inode *ino
 	return 0;
 }
 
+static int nbd_do_it(struct nbd_device *lo, unsigned int fd)
+{
+	struct file *file;
+	struct inode *inode;
+	struct socket *sock;
+	struct request *req;
+
+	file = fget(fd);
+	if (!file)
+		return -EBADF;
+	inode = file->f_dentry->d_inode;
+	if (!inode->i_sock) {
+		fput(file);
+		return -ENOTSOCK;
+	}
+	sock = SOCKET_I(inode);
+	if (sock->type != SOCK_STREAM) {
+		fput(file);
+		return -EPROTONOSUPPORT;
+	}
+
+	down(&lo->tx_lock);
+	if (lo->sock) {
+		up(&lo->tx_lock);
+		fput(file);
+		return -EBUSY;
+	}
+	lo->sock = sock;
+	up(&lo->tx_lock);
+
+	/* no need for rx lock since here is the rx... */
+	dprintk(DBG_IOCTL, "%s: nbd_do_it: commencing read loop\n",
+			lo->disk->disk_name);
+	while ((req = nbd_read_stat(lo)) != NULL)
+		request_end(req);
+	dprintk(DBG_IOCTL, "%s: nbd_do_it: read loop finished\n",
+			lo->disk->disk_name);
+
+	down(&lo->tx_lock);
+	lo->sock = NULL;
+	up(&lo->tx_lock);
+
+	fput(file);
+	return lo->harderror;
+}
+
+static int nbd_disconnect(struct nbd_device *lo)
+{
+	int error;
+	struct request sreq;
+
+	printk(KERN_INFO "%s: NBD_DISCONNECT\n", lo->disk->disk_name);
+
+	sreq.flags = REQ_SPECIAL;
+	nbd_cmd(&sreq) = NBD_CMD_DISC;
+
+	/*
+	 * Set sreq to sane values in case server implementation
+	 * fails to check the request type first and also to keep
+	 * debugging output cleaner.
+	 */
+	sreq.sector = 0;
+	sreq.nr_sectors = 0;
+
+	down(&lo->tx_lock);
+	if (lo->sock) {
+		error = 0;
+		nbd_send_req(lo, &sreq);
+	}
+	else {
+		error = -ENOTCONN;
+	}
+	up(&lo->tx_lock);
+	return error;
+}
+
+static int nbd_set_blksize(struct nbd_device *lo, unsigned long arg)
+{
+	if ((arg & (arg-1)) || (arg < 512) || (arg > PAGE_SIZE))
+		return -EINVAL;
+	down(&lo->tx_lock);
+	if (lo->sock) {
+		up(&lo->tx_lock);
+		return -EBUSY;
+	}
+	lo->blksize = arg;
+	lo->bytesize &= ~(lo->blksize-1);
+	set_capacity(lo->disk, lo->bytesize >> 9);
+	up(&lo->tx_lock);
+	return 0;
+}
+
+static int nbd_set_size(struct nbd_device *lo, unsigned long arg)
+{
+	down(&lo->tx_lock);
+	if (lo->sock) {
+		up(&lo->tx_lock);
+		return -EBUSY;
+	}
+	lo->bytesize = arg & ~(lo->blksize-1);
+	set_capacity(lo->disk, lo->bytesize >> 9);
+	up(&lo->tx_lock);
+	return 0;
+}
+
+static int nbd_set_size_blocks(struct nbd_device *lo, unsigned long arg)
+{
+	down(&lo->tx_lock);
+	if (lo->sock) {
+		up(&lo->tx_lock);
+		return -EBUSY;
+	}
+	lo->bytesize = ((u64) arg) * lo->blksize;
+	set_capacity(lo->disk, lo->bytesize >> 9);
+	up(&lo->tx_lock);
+	return 0;
+}
+
 static int nbd_ioctl(struct inode *inode, struct file *file,
 		     unsigned int cmd, unsigned long arg)
 {
 	struct nbd_device *lo = inode->i_bdev->bd_disk->private_data;
-	int error;
-	struct request sreq ;
 
 #ifdef PARANOIA
 	BUG_ON(lo->magic != LO_MAGIC);
@@ -565,101 +666,25 @@ static int nbd_ioctl(struct inode *inode
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	switch (cmd) {
+	case NBD_CLEAR_SOCK: /* deprecated! */
+	case NBD_SET_SOCK: /* deprecated! */
+		printk(KERN_WARNING "%s: %s[%d] called deprecated ioctl %s\n",
+				lo->disk->disk_name,
+				current->comm, current->pid,
+				ioctl_cmd_to_ascii(cmd));
+		return -EINVAL;
 	case NBD_DISCONNECT:
-	        printk(KERN_INFO "%s: NBD_DISCONNECT\n", lo->disk->disk_name);
-		sreq.flags = REQ_SPECIAL;
-		nbd_cmd(&sreq) = NBD_CMD_DISC;
-		/*
-		 * Set these to sane values in case server implementation
-		 * fails to check the request type first and also to keep
-		 * debugging output cleaner.
-		 */
-		sreq.sector = 0;
-		sreq.nr_sectors = 0;
-                if (!lo->sock)
-			return -EINVAL;
-                nbd_send_req(lo, &sreq);
-                return 0 ;
- 
-	case NBD_CLEAR_SOCK:
-		nbd_clear_que(lo);
-		spin_lock(&lo->queue_lock);
-		if (!list_empty(&lo->queue_head)) {
-			spin_unlock(&lo->queue_lock);
-			printk(KERN_ERR "%s: Some requests are in progress -> can not turn off.\n",
-					lo->disk->disk_name);
-			return -EBUSY;
-		}
-		file = lo->file;
-		if (!file) {
-			spin_unlock(&lo->queue_lock);
-			return -EINVAL;
-		}
-		lo->file = NULL;
-		lo->sock = NULL;
-		spin_unlock(&lo->queue_lock);
-		fput(file);
-		return 0;
-	case NBD_SET_SOCK:
-		if (lo->file)
-			return -EBUSY;
-		error = -EINVAL;
-		file = fget(arg);
-		if (file) {
-			inode = file->f_dentry->d_inode;
-			if (inode->i_sock) {
-				lo->file = file;
-				lo->sock = SOCKET_I(inode);
-				error = 0;
-			} else {
-				fput(file);
-			}
-		}
-		return error;
+		return nbd_disconnect(lo);
 	case NBD_SET_BLKSIZE:
-		if ((arg & (arg-1)) || (arg < 512) || (arg > PAGE_SIZE))
-			return -EINVAL;
-		lo->blksize = arg;
-		lo->bytesize &= ~(lo->blksize-1); 
-		set_capacity(lo->disk, lo->bytesize >> 9);
-		return 0;
+		return nbd_set_blksize(lo, arg);
 	case NBD_SET_SIZE:
-		lo->bytesize = arg & ~(lo->blksize-1); 
-		set_capacity(lo->disk, lo->bytesize >> 9);
-		return 0;
+		return nbd_set_size(lo, arg);
 	case NBD_SET_SIZE_BLOCKS:
-		lo->bytesize = ((u64) arg) * lo->blksize;
-		set_capacity(lo->disk, lo->bytesize >> 9);
-		return 0;
+		return nbd_set_size_blocks(lo, arg);
 	case NBD_DO_IT:
-		if (!lo->file)
-			return -EINVAL;
-		nbd_do_it(lo);
-		/* on return tidy up in case we have a signal */
-		/* Forcibly shutdown the socket causing all listeners
-		 * to error
-		 *
-		 * FIXME: This code is duplicated from sys_shutdown, but
-		 * there should be a more generic interface rather than
-		 * calling socket ops directly here */
-		down(&lo->tx_lock);
-		printk(KERN_WARNING "%s: shutting down socket\n",
-				lo->disk->disk_name);
-		lo->sock->ops->shutdown(lo->sock, SEND_SHUTDOWN|RCV_SHUTDOWN);
-		lo->sock = NULL;
-		up(&lo->tx_lock);
-		spin_lock(&lo->queue_lock);
-		file = lo->file;
-		lo->file = NULL;
-		spin_unlock(&lo->queue_lock);
-		nbd_clear_que(lo);
-		printk(KERN_WARNING "%s: queue cleared\n", lo->disk->disk_name);
-		if (file)
-			fput(file);
-		return lo->harderror;
+		return nbd_do_it(lo, arg);
 	case NBD_CLEAR_QUE:
-		nbd_clear_que(lo);
-		return 0;
+		return nbd_clear_que(lo);
 	case NBD_PRINT_DEBUG:
 #ifdef PARANOIA
 		printk(KERN_INFO "%s: next = %p, prev = %p. Global: in %d, out %d\n",
@@ -731,7 +756,6 @@ static int __init nbd_init(void)
 	for (i = 0; i < MAX_NBD; i++) {
 		struct gendisk *disk = nbd_dev[i].disk;
 		nbd_dev[i].refcnt = 0;
-		nbd_dev[i].file = NULL;
 #ifdef PARANOIA
 		nbd_dev[i].magic = LO_MAGIC;
 #endif
diff -puN include/linux/nbd.h~nbd-locking-fixes include/linux/nbd.h
--- 25/include/linux/nbd.h~nbd-locking-fixes	2003-06-26 18:39:48.000000000 -0700
+++ 25-akpm/include/linux/nbd.h	2003-06-26 18:39:48.000000000 -0700
@@ -8,6 +8,9 @@
  * 2003/06/24 Louis D. Langholtz <ldl@aros.net>
  *            Removed unneeded blksize_bits field from nbd_device struct.
  *            Cleanup PARANOIA usage & code.
+ *            Removed unneeded file field from nbd_device struct & moved
+ *              kernel specific stuff within __KERNEL__. The non-kernel stuff
+ *              should probably be moved to its own file (someday).
  */
 
 #ifndef LINUX_NBD_H
@@ -23,6 +26,35 @@
 #define NBD_SET_SIZE_BLOCKS	_IO( 0xab, 7 )
 #define NBD_DISCONNECT  _IO( 0xab, 8 )
 
+/* These are send over network in request/reply magic field */
+#define NBD_REQUEST_MAGIC 0x25609513
+#define NBD_REPLY_MAGIC 0x67446698
+/* Do *not* use magics: 0x12560953 0x96744668. */
+
+/*
+ * This is packet used for communication between client and
+ * server. All data are in network byte order.
+ */
+struct nbd_request {
+	u32 magic;
+	u32 type;	/* == READ || == WRITE 	*/
+	char handle[8];
+	u64 from;
+	u32 len;
+}
+#ifdef __GNUC__
+	__attribute__ ((packed))
+#endif
+;
+
+struct nbd_reply {
+	u32 magic;
+	u32 error;		/* 0 = ok, else error	*/
+	char handle[8];		/* handle you got from request	*/
+};
+
+#ifdef __KERNEL__
+
 enum {
 	NBD_CMD_READ = 0,
 	NBD_CMD_WRITE = 1,
@@ -42,7 +74,6 @@ struct nbd_device {
 #define NBD_READ_ONLY 0x0001
 #define NBD_WRITE_NOCHK 0x0002
 	struct socket * sock;
-	struct file * file; 	/* If == NULL, device is not ready, yet	*/
 #ifdef PARANOIA
 	int magic;		/* FIXME: not if debugging is off	*/
 #endif
@@ -54,33 +85,5 @@ struct nbd_device {
 	u64 bytesize;
 };
 
-/* This now IS in some kind of include file...	*/
-
-/* These are send over network in request/reply magic field */
-
-#define NBD_REQUEST_MAGIC 0x25609513
-#define NBD_REPLY_MAGIC 0x67446698
-/* Do *not* use magics: 0x12560953 0x96744668. */
-
-/*
- * This is packet used for communication between client and
- * server. All data are in network byte order.
- */
-struct nbd_request {
-	u32 magic;
-	u32 type;	/* == READ || == WRITE 	*/
-	char handle[8];
-	u64 from;
-	u32 len;
-}
-#ifdef __GNUC__
-	__attribute__ ((packed))
-#endif
-;
-
-struct nbd_reply {
-	u32 magic;
-	u32 error;		/* 0 = ok, else error	*/
-	char handle[8];		/* handle you got from request	*/
-};
-#endif
+#endif /* __KERNEL__ */
+#endif /* LINUX_NBD_H */

_