diff -Nurb linux.orig/Documentation/netswap.txt linux/Documentation/netswap.txt --- linux.orig/Documentation/netswap.txt 1969-12-31 19:00:00.000000000 -0500 +++ linux/Documentation/netswap.txt 2004-05-31 02:18:03.000000000 -0400 @@ -0,0 +1,51 @@ + Swapping over network + +Support for this is enabled via the CONFIG_NETSWAP option, which is +automatically enabled when enabling swap files located on NFS volumes +(CONFIG_SWAP_VIA_NFS). + +When swapping to files located on a network file system like NFS or +CODA or others or to nbd (network block device, see `nbd.txt') +partitions there is the problem that this requires additional memory, +besides the page which is currently swapped in or out, probably at +least two more pages for each page in question. + +This means that not only does there need to be free space left in the swap +file or the swap partition, but in addition there must be enough free +memory left in the system to perform the swap out of pages. + +This is particularly painful as receiving data over the network itself +consumes memory, and this memory is allocated from an interrupt +context (i.e. in the interrupt handler of the network card). That +means that on a congested network there are chances that the machine +runs out of memory, simply because the network device's interrupt +routines allocate memory faster than it is freed by swapping via +network. + +To cope with this problem, there is a new socket option `SO_SWAPPING' +which has to be set on the `SOL_SOCKET' level with setsockopt() (see +setsockopt(2)). When this option is set on any network socket, then +the system will start to drop network packets it receives on any other +socket when the number of free pages falls below a certain threshold. 
+ +This threshold initially is 4 pages less than `freepages.min' (see +`Documentation/sysctl/vm.txt') but can be tuned using the sysctl +interface by writing to the file `/proc/sys/net/swapping/threshold'. + +There are two other files: + +`/proc/sys/net/swapping/dropped': + How many network packets have been dropped so far. This file is + writable; writing to it simply sets the counter to the given value + (useful for resetting the counter). + +`/proc/sys/net/swapping/sock_count': + How many network sockets have the `SO_SWAPPING' option set (read + only, of course). + +When using swap-files on NFS volumes, the `SO_SWAPPING' option is +set or cleared by swapon/swapoff system calls, so the user need not +care about it. + +Swapping over the network is insecure unless the data would be +encrypted, which is not the case with NFS. It is also very slow. diff -Nurb linux.orig/Documentation/nfsswap.txt linux/Documentation/nfsswap.txt --- linux.orig/Documentation/nfsswap.txt 1969-12-31 19:00:00.000000000 -0500 +++ linux/Documentation/nfsswap.txt 2004-05-31 02:18:03.000000000 -0400 @@ -0,0 +1,41 @@ + Swapping to files on NFS volumes + +To do this you have to say `Y' or `M' to the CONFIG_SWAP_VIA_NFS +configuration option. When compiling support for this as a module you +should read `Documentation/modules.txt'. For auto-loading of the +module during the `swapon' system call you have to place a line like + +alias swapfile-mod nfsswap + +in `/etc/modules.conf' (or `/etc/conf.modules', depending on your +setup). NFS volumes holding swapfiles should be mounted with `rsize' +and `wsize' set to something less than the size of a page, otherwise +deadlocks caused by memory fragmentation can happen, i.e. mount the +volume which is to hold the swapfiles with + +mount -t nfs -o rsize=2048,wsize=2048 NFS_SERVER_IP:/server_volume /mount_point + +or set the option in `/etc/fstab'. 
Read `Documentation/nfsroot.txt' to +learn how to set mount options for the root file system, if your swap +files are to be located on the root file system. + +Setting the `rsize' and `wsize' to anything less than PAGE_SIZE is a +performance hit, so you probably want to have at least two volumes +mounted, one for the swapfiles, one for the rest. + +You may want to read `Documentation/netswap.txt' as well. + +Swapfiles on NFS volumes can be treated like any other swapfile, +i.e. + +dd if=/dev/zero of=/swapfiles/SWAPFILE bs=1k count=20480 +mkswap /swapfiles/SWAPFILE +swapon /swapfiles/SWAPFILE + +will create a 20M swapfile and tell the system to use it. Actually, +one could use lseek(2) to create an empty swapfile. This is different +from swapfiles located on local harddisk. + +Swapping over the network is insecure unless the data would be +encrypted, which is not the case with NFS. It is also very slow. + diff -Nurb linux.orig/drivers/block/blkpg.c linux/drivers/block/blkpg.c --- linux.orig/drivers/block/blkpg.c 2003-07-04 04:11:31.000000000 -0400 +++ linux/drivers/block/blkpg.c 2004-05-31 02:18:03.000000000 -0400 @@ -34,7 +34,7 @@ #include <linux/blk.h> /* for set_device_ro() */ #include <linux/blkpg.h> #include <linux/genhd.h> -#include <linux/swap.h> /* for is_swap_partition() */ +#include <linux/swap.h> /* for swap_run_test() */ #include <linux/module.h> /* for EXPORT_SYMBOL */ #include <asm/uaccess.h> @@ -114,6 +114,29 @@ return 0; } +/* swap_run_test() applies this hook to all swapfiles until it returns + * "1". If it never returns "1", the result of swap_run_test() is "0", + * otherwise "1". + */ +static int is_swap_partition_hook(unsigned int flags, struct file *swap_file, + void *testdata) +{ + kdev_t swap_dev = S_ISBLK(swap_file->f_dentry->d_inode->i_mode) + ? 
swap_file->f_dentry->d_inode->i_rdev : 0; + kdev_t dev = *((kdev_t *)testdata); + + if (flags & SWP_USED && dev == swap_dev) { + return 1; + } else { + return 0; + } +} + +static inline int is_swap_partition(kdev_t dev) +{ + return swap_run_test(is_swap_partition_hook, &dev); +} + /* * Delete a partition given by partition number * diff -Nurb linux.orig/fs/Config.in linux/fs/Config.in --- linux.orig/fs/Config.in 2004-05-31 02:02:43.000000000 -0400 +++ linux/fs/Config.in 2004-05-31 02:18:03.000000000 -0400 @@ -4,6 +4,12 @@ mainmenu_option next_comment comment 'File systems' +if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then + tristate 'Swapping to block devices' CONFIG_BLKDEV_SWAP +else + define_bool CONFIG_BLKDEV_SWAP y +fi + bool 'Quota support' CONFIG_QUOTA tristate 'Kernel automounter support' CONFIG_AUTOFS_FS tristate 'Kernel automounter version 4 support (also supports v3)' CONFIG_AUTOFS4_FS @@ -110,6 +116,12 @@ dep_tristate 'NFS file system support' CONFIG_NFS_FS $CONFIG_INET dep_mbool ' Provide NFSv3 client support' CONFIG_NFS_V3 $CONFIG_NFS_FS dep_bool ' Root file system on NFS' CONFIG_ROOT_NFS $CONFIG_NFS_FS $CONFIG_IP_PNP + if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then + dep_tristate ' Swapping via NFS (EXPERIMENTAL)' CONFIG_SWAP_VIA_NFS $CONFIG_NFS_FS + if [ "$CONFIG_SWAP_VIA_NFS" = "y" -o "$CONFIG_SWAP_VIA_NFS" = "m" ]; then + define_bool CONFIG_NETSWAP y + fi + fi dep_tristate 'NFS server support' CONFIG_NFSD $CONFIG_INET dep_mbool ' Provide NFSv3 server support' CONFIG_NFSD_V3 $CONFIG_NFSD diff -Nurb linux.orig/fs/Makefile linux/fs/Makefile --- linux.orig/fs/Makefile 2004-05-31 02:02:42.000000000 -0400 +++ linux/fs/Makefile 2004-05-31 02:18:03.000000000 -0400 @@ -8,7 +8,7 @@ O_TARGET := fs.o export-objs := filesystems.o open.o dcache.o buffer.o -mod-subdirs := nls +mod-subdirs := nls nfs obj-y := open.o read_write.o devices.o file_table.o buffer.o \ super.o block_dev.o char_dev.o stat.o exec.o pipe.o namei.o \ @@ -70,6 +70,7 @@ subdir-$(CONFIG_JFS_FS) += jfs 
subdir-$(CONFIG_SQUASHFS) += squashfs +obj-$(CONFIG_BLKDEV_SWAP) += blkdev_swap.o obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o diff -Nurb linux.orig/fs/blkdev_swap.c linux/fs/blkdev_swap.c --- linux.orig/fs/blkdev_swap.c 1969-12-31 19:00:00.000000000 -0500 +++ linux/fs/blkdev_swap.c 2004-05-31 02:18:03.000000000 -0400 @@ -0,0 +1,309 @@ +/* + * Swapping to partitions or files located on partitions. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/locks.h> +#include <linux/blkdev.h> +#include <linux/pagemap.h> +#include <linux/swap.h> +#include <linux/fs.h> + +#ifdef DEBUG_BLKDEV_SWAP +# define dprintk(fmt...) printk(##fmt) +#else +# define dprintk(fmt...) do { /* */ } while (0) +#endif + +#define BLKDEV_SWAP_ID "blkdev" +#define BLKDEV_FILE_SWAP_ID "blkdev file" + +/* + * Helper function, copied here from buffer.c + */ + +/* + * Start I/O on a page. + * This function expects the page to be locked and may return + * before I/O is complete. You then have to check page->locked + * and page->uptodate. + * + * brw_page() is SMP-safe, although it's being called with the + * kernel lock held - but the code is ready. + * + * FIXME: we need a swapper_inode->get_block function to remove + * some of the bmap kludges and interface ugliness here. 
+ */ +int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size) +{ + struct buffer_head *head, *bh; + + if (!PageLocked(page)) + panic("brw_page: page not locked for I/O"); + + if (!page->buffers) + create_empty_buffers(page, dev, size); + head = bh = page->buffers; + + /* Stage 1: lock all the buffers */ + do { + lock_buffer(bh); + bh->b_blocknr = *(b++); + set_bit(BH_Mapped, &bh->b_state); + set_buffer_async_io(bh); + bh = bh->b_this_page; + } while (bh != head); + + /* Stage 2: start the IO */ + do { + struct buffer_head *next = bh->b_this_page; + submit_bh(rw, bh); + bh = next; + } while (bh != head); + return 0; +} + +/* + * We implement two methods: swapping to partitions, and swapping to files + * located on partitions. + */ + +struct blkdev_swap_data { + kdev_t dev; +}; + +struct test_data { + struct file * filp; + kdev_t dev; +}; + +static int is_blkdev_swapping(unsigned int flags, + struct file * swapf, + void *data) +{ + struct test_data *testdata = (struct test_data *) data; + struct file * filp = testdata->filp; + kdev_t dev = testdata->dev; + + /* Only check filp's that don't match the one already opened + * for us by sys_swapon(). Otherwise, we will always flag a + * busy swap file. 
+ */ + + if (swapf != filp) { + if (dev == swapf->f_dentry->d_inode->i_rdev) + return 1; + } + return 0; +} + +static int blkdev_swap_open(struct file * filp, void **dptr) +{ + int swapfilesize; + kdev_t dev; + struct blkdev_swap_data *data; + int error; + struct test_data testdata; + + MOD_INC_USE_COUNT; /* dropped again at bad_swap on every failure path */ + + if (!S_ISBLK(filp->f_dentry->d_inode->i_mode)) { + dprintk("blkdev_swap_open: can't handle this swap file: %s\n", + filp->f_dentry->d_name.name); + error = 0; /* not for us */ + goto bad_swap; + } + + dev = filp->f_dentry->d_inode->i_rdev; + set_blocksize(dev, PAGE_SIZE); + error = -ENODEV; + if (!dev || + (blk_size[MAJOR(dev)] && !blk_size[MAJOR(dev)][MINOR(dev)])) { + printk("blkdev_swap_open: blkdev weirdness for %s\n", + filp->f_dentry->d_name.name); + goto bad_swap; + } + + /* Check to make sure that we aren't already swapping. */ + error = -EBUSY; + testdata.filp = filp; + testdata.dev = dev; + if (swap_run_test(is_blkdev_swapping, &testdata)) { + printk("blkdev_swap_open: already swapping to %s\n", + filp->f_dentry->d_name.name); + goto bad_swap; + } + + swapfilesize = 0; + if (blk_size[MAJOR(dev)]) + swapfilesize = blk_size[MAJOR(dev)][MINOR(dev)] + >> (PAGE_SHIFT - 10); + + if ((data = kmalloc(sizeof(*data), GFP_KERNEL)) == NULL) { + printk("blkdev_swap_open: can't allocate data for %s\n", + filp->f_dentry->d_name.name); + error = -ENOMEM; + goto bad_swap; + } + data->dev = dev; + *dptr = data; + + dprintk("blkdev_swap_open: returning %d\n", swapfilesize); + return swapfilesize; + + bad_swap: + MOD_DEC_USE_COUNT; + return error; /* 0: not for us; negative: errno set above */ +} + +static int blkdev_swap_release(struct file * filp, void *data) +{ + dprintk("blkdev_swap_release: releasing swap device %s\n", + filp->f_dentry->d_name.name); + kfree(data); + MOD_DEC_USE_COUNT; + return 0; +} + +static int blkdev_rw_page(int rw, struct page *page, unsigned long offset, + void *ptr) +{ + struct blkdev_swap_data *data = (struct blkdev_swap_data *)ptr; + brw_page(rw, page, data->dev, 
(int *)&offset, PAGE_SIZE); + return 1; +} + +static struct swap_ops blkdev_swap_ops = { + blkdev_swap_open, + blkdev_swap_release, + blkdev_rw_page +}; + +struct blkdevfile_swap_data { + struct inode *swapf; +}; + +static int is_blkdevfile_swapping(unsigned int flags, + struct file * swapf, + void * data) +{ + struct file * filp = (struct file *) data; + + /* Only check filp's that don't match the one already opened + * for us by sys_swapon(). Otherwise, we will always flag a + * busy swap file. + */ + + if (swapf != filp) { + if (filp->f_dentry->d_inode == swapf->f_dentry->d_inode) + return 1; + } + return 0; +} + +static int blkdevfile_swap_open(struct file *swapf, void **dptr) +{ + int error = 0; + int swapfilesize; + struct blkdevfile_swap_data *data; + + MOD_INC_USE_COUNT; /* dropped again at bad_swap on every failure path */ + + /* first check whether this is a regular file located on a local + * hard disk + */ + if (!S_ISREG(swapf->f_dentry->d_inode->i_mode)) { + dprintk("blkdevfile_swap_open: " + "can't handle this swap file: %s\n", + swapf->f_dentry->d_name.name); + error = 0; /* not for us */ + goto bad_swap; + } + if (!swapf->f_dentry->d_inode->i_mapping->a_ops->bmap) { + dprintk("blkdevfile_swap_open: no bmap for file: %s\n", + swapf->f_dentry->d_name.name); + error = 0; /* not for us */ + goto bad_swap; + } + + if (swap_run_test(is_blkdevfile_swapping, swapf)) { + dprintk("blkdevfile_swap_open: already swapping to %s\n", + swapf->f_dentry->d_name.name); + error = -EBUSY; + goto bad_swap; + } + swapfilesize = swapf->f_dentry->d_inode->i_size >> PAGE_SHIFT; + if ((data = kmalloc(sizeof(*data), GFP_KERNEL)) == NULL) { + error = -ENOMEM; + goto bad_swap; + } + data->swapf = swapf->f_dentry->d_inode; + *dptr = data; + return swapfilesize; + + bad_swap: + MOD_DEC_USE_COUNT; + return error; /* 0: not for us; negative: errno set above */ +} + +static int blkdevfile_swap_release(struct file *swapf, void *data) +{ + kfree(data); + MOD_DEC_USE_COUNT; + return 0; +} + +static int blkdevfile_rw_page(int rw, struct page *page, unsigned long offset, + void *ptr) +{ + struct blkdevfile_swap_data 
*data = (struct blkdevfile_swap_data *)ptr; + struct inode * swapf = data->swapf; + int i, j; + unsigned int block = offset + << (PAGE_SHIFT - swapf->i_sb->s_blocksize_bits); + kdev_t dev = swapf->i_dev; + int block_size; + int zones[PAGE_SIZE/512]; + int zones_used; + + block_size = swapf->i_sb->s_blocksize; + for (i=0, j=0; j< PAGE_SIZE ; i++, j += block_size) + if (!(zones[i] = bmap(swapf,block++))) { + printk("blkdevfile_rw_page: bad swap file\n"); + return 0; + } + zones_used = i; + + /* block_size == PAGE_SIZE/zones_used */ + brw_page(rw, page, dev, zones, block_size); + return 1; +} + +static struct swap_ops blkdevfile_swap_ops = { + blkdevfile_swap_open, + blkdevfile_swap_release, + blkdevfile_rw_page + }; + +int __init blkdev_swap_init(void) +{ + (void)register_swap_method(BLKDEV_SWAP_ID, &blkdev_swap_ops); + (void)register_swap_method(BLKDEV_FILE_SWAP_ID, &blkdevfile_swap_ops); + return 0; +} + +void __exit blkdev_swap_exit(void) +{ + unregister_swap_method(BLKDEV_SWAP_ID); + unregister_swap_method(BLKDEV_FILE_SWAP_ID); +} + +module_init(blkdev_swap_init) +module_exit(blkdev_swap_exit) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Many. 
Stuffed into a module by cH (Claus-Justus Heine)"); +MODULE_DESCRIPTION("Swapping to partitions and files on local hard-disks"); diff -Nurb linux.orig/fs/buffer.c linux/fs/buffer.c --- linux.orig/fs/buffer.c 2003-07-04 04:12:05.000000000 -0400 +++ linux/fs/buffer.c 2004-05-31 02:21:05.000000000 -0400 @@ -743,7 +743,7 @@ bh->b_private = private; } -static void end_buffer_io_async(struct buffer_head * bh, int uptodate) +void end_buffer_io_async(struct buffer_head * bh, int uptodate) { static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED; unsigned long flags; @@ -2344,35 +2344,6 @@ return err; } -int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size) -{ - struct buffer_head *head, *bh; - - if (!PageLocked(page)) - panic("brw_page: page not locked for I/O"); - - if (!page->buffers) - create_empty_buffers(page, dev, size); - head = bh = page->buffers; - - /* Stage 1: lock all the buffers */ - do { - lock_buffer(bh); - bh->b_blocknr = *(b++); - set_bit(BH_Mapped, &bh->b_state); - set_buffer_async_io(bh); - bh = bh->b_this_page; - } while (bh != head); - - /* Stage 2: start the IO */ - do { - struct buffer_head *next = bh->b_this_page; - submit_bh(rw, bh); - bh = next; - } while (bh != head); - return 0; -} - int block_symlink(struct inode *inode, const char *symname, int len) { struct address_space *mapping = inode->i_mapping; diff -Nurb linux.orig/fs/nfs/Makefile linux/fs/nfs/Makefile --- linux.orig/fs/nfs/Makefile 2003-07-04 04:12:07.000000000 -0400 +++ linux/fs/nfs/Makefile 2004-05-31 02:18:03.000000000 -0400 @@ -15,6 +15,14 @@ obj-$(CONFIG_ROOT_NFS) += nfsroot.o mount_clnt.o obj-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o -obj-m := $(O_TARGET) +obj-$(CONFIG_SWAP_VIA_NFS) += nfsswap.o +ifeq ($(CONFIG_SWAP_VIA_NFS),m) +export-objs := nfs_syms.o +obj-y += nfs_syms.o +endif + +ifeq ($(CONFIG_NFS_FS),m) +obj-m += $(O_TARGET) +endif include $(TOPDIR)/Rules.make diff -Nurb linux.orig/fs/nfs/file.c linux/fs/nfs/file.c --- linux.orig/fs/nfs/file.c 2003-07-04 
04:12:07.000000000 -0400 +++ linux/fs/nfs/file.c 2004-05-31 02:18:03.000000000 -0400 @@ -58,11 +58,6 @@ setattr: nfs_notify_change, }; -/* Hack for future NFS swap support */ -#ifndef IS_SWAPFILE -# define IS_SWAPFILE(inode) (0) -#endif - /* * Flush all dirty pages, and check for write errors. * @@ -217,8 +212,6 @@ inode->i_ino, (unsigned long) count, (unsigned long) *ppos); result = -EBUSY; - if (IS_SWAPFILE(inode)) - goto out_swapfile; result = nfs_revalidate_inode(NFS_SERVER(inode), inode); if (result) goto out; @@ -230,10 +223,6 @@ result = generic_file_write(file, buf, count, ppos); out: return result; - -out_swapfile: - printk(KERN_INFO "NFS: attempt to write to active swap file!\n"); - goto out; } /* diff -Nurb linux.orig/fs/nfs/nfs_syms.c linux/fs/nfs/nfs_syms.c --- linux.orig/fs/nfs/nfs_syms.c 1969-12-31 19:00:00.000000000 -0500 +++ linux/fs/nfs/nfs_syms.c 2004-05-31 02:18:03.000000000 -0400 @@ -0,0 +1,10 @@ +#include <linux/config.h> +#define __NO_VERSION__ +#include <linux/module.h> +#include <linux/types.h> +#include <linux/sunrpc/clnt.h> +#include <linux/nfs_fs.h> + +EXPORT_SYMBOL(__nfs_refresh_inode); +EXPORT_SYMBOL(nfs_write_attributes); + diff -Nurb linux.orig/fs/nfs/nfsswap.c linux/fs/nfs/nfsswap.c --- linux.orig/fs/nfs/nfsswap.c 1969-12-31 19:00:00.000000000 -0500 +++ linux/fs/nfs/nfsswap.c 2004-05-31 02:18:03.000000000 -0400 @@ -0,0 +1,350 @@ +/* + * Swapping to files located on NFS mounted volumes + * Copyright (c) 2000 Claus-Justus Heine + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/swap.h> +#include <linux/pagemap.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/socket.h> +#include <linux/smp_lock.h> +#include <net/netswapping.h> +#include <net/sock.h> + +#include <linux/sunrpc/clnt.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_fs_sb.h> +#include <asm/uaccess.h> + +#define NFSDBG_FACILITY NFSDBG_SWAP + 
+#define NFS_SWAP_ID "nfs file" + +/* we cache some values here. In principle, we only need the file. + */ +struct nfs_swap_data { + struct file *file; + struct inode *inode; + struct nfs_server *server; + struct socket *socket; +}; + +/* Nearly a clone of nfs_readpage_sync() in read.c, but "struct page" does not + * contain information about the file offset when swapping. So. + */ +static int nfs_read_swap_page(struct page *page, + struct nfs_server *server, + struct inode *inode, + struct file *file) +{ + unsigned int rsize = server->rsize; + unsigned int count = PAGE_SIZE; + unsigned int offset = 0; /* always at start of page */ + int result, eof; + struct rpc_cred *cred; + struct nfs_fattr fattr; + + cred = nfs_file_cred(file); + + do { + if (count < rsize) + rsize = count; + + lock_kernel(); + result = NFS_PROTO(inode)->read(inode, cred, + &fattr, + NFS_RPC_SWAPFLAGS, + offset, rsize, page, &eof); + nfs_refresh_inode(inode, &fattr); + unlock_kernel(); + + /* + * Even if we had a partial success we can't mark the page + * cache valid. + */ + if (result < 0) { + if (result == -EISDIR) + result = -EINVAL; + goto io_error; + } + count -= result; + offset += result; + if (result < rsize) /* NFSv2ism */ + break; + } while (count); + + if (count) { + char *kaddr = kmap(page); + memset(kaddr + offset, 0, count); + kunmap(page); + } + flush_dcache_page(page); + result = 0; + +io_error: + return result; +} + +/* Like nfs_writepage_sync(), but when swapping page->index does not encode + * the offset in the swap file alone. 
+ * + */ +static int nfs_write_swap_page(struct page *page, + struct nfs_server *server, + struct inode *inode, + struct file *file) +{ + struct rpc_cred *cred; + unsigned int wsize = server->wsize; + unsigned int count = PAGE_SIZE; + unsigned int offset = 0; + int result; + struct nfs_writeverf verf; + struct nfs_fattr fattr; + + cred = nfs_file_cred(file); + + do { + if (count < wsize) + wsize = count; + + lock_kernel(); + result = NFS_PROTO(inode)->write(inode, cred, &fattr, + NFS_RW_SWAP|NFS_RW_SYNC, + offset, wsize, page, &verf); + nfs_write_attributes(inode, &fattr); + unlock_kernel(); + + if (result < 0) { + goto io_error; + } + if (result != wsize) + printk("NFS: short write, wsize=%u, result=%d\n", + wsize, result); + offset += wsize; + count -= wsize; + /* + * If we've extended the file, update the inode + * now so we don't invalidate the cache. + */ + if (offset > inode->i_size) + inode->i_size = offset; + } while (count); + + result = 0; + +io_error: + + return result; +} + +/* Unluckily (for us) from 2.4.19 -> 2.4.20 the nfs-proc's were + * changed and now expect a proper file-mapping page, where index + * encodes the offset alone. + * + * What we do: we save the original value of page->index, initialize + * page->index to what the NFS/sun-rpc subsystem expects and restore + * the index later. + */ +static int nfs_rw_swap_page(int rw, struct page *page, + unsigned long offset, void *dptr) +{ + int error; + struct nfs_swap_data *data = dptr; + unsigned long alloc_flag = current->flags & PF_MEMALLOC; + unsigned long page_index; + + if (!PageLocked(page)) + panic("nfs_rw_swap_page: page not locked for I/O"); + + /* prevent memory deadlocks */ + if (!(current->flags & PF_MEMALLOC)) { + dprintk("nfs_rw_swap_page: Setting PF_MEMALLOC\n"); + } + current->flags |= PF_MEMALLOC; + + /* now tweak the page->index field ... 
*/ + page_index = page->index; + page->index = ((loff_t)offset*(loff_t)PAGE_SIZE) >> PAGE_CACHE_SHIFT; + + if (rw == WRITE) { + error = nfs_write_swap_page(page, + data->server, + data->inode, + data->file); + } else { + error = nfs_read_swap_page(page, + data->server, + data->inode, + data->file); + } + + if (!alloc_flag) { + current->flags &= ~PF_MEMALLOC; + } + + /* now restore the page->index field ... */ + page->index = page_index; + + if (error) { + /* Must mark the page invalid after I/O error */ + SetPageError(page); + ClearPageUptodate(page); + } else { + ClearPageError(page); + SetPageUptodate(page); + } + + if (!error) { /* in case of an error rw_swap_page() likes to unlock + * itself. + */ + UnlockPage(page); + } + + return error < 0 ? 0 : 1; +} + +static int is_nfsfile_swapping(unsigned int flags, + struct file * swapf, + void * data) +{ + struct file * filp = (struct file *) data; + + /* Only check filp's that don't match the one already opened + * for us by sys_swapon(). Otherwise, we will always flag a + * busy swap file. 
+ */ + + if (swapf != filp) { + if (filp->f_dentry->d_inode == swapf->f_dentry->d_inode) + return 1; + } + return 0; +} + +static int nfs_swap_open(struct file *swapf, void **dptr) +{ + int error = 0; + int swapfilesize; + struct nfs_swap_data *data; + int on = 1; + mm_segment_t fs; + struct inode *inode = swapf->f_dentry->d_inode; + + MOD_INC_USE_COUNT; + + if (!S_ISREG(inode->i_mode)) { + dprintk("nfs_swap_open: can't handle this swap file: %s\n", + swapf->f_dentry->d_name.name); + error = 0; /* not for us */ + goto bad_swap; + } + /* determine whether this file really is located on an NFS mounted + * volume + */ + if (!inode->i_sb || inode->i_sb->s_magic != NFS_SUPER_MAGIC) { + dprintk("nfs_swap_open: %s is not an NFS file.\n", + swapf->f_dentry->d_name.name); + error = 0; /* not for us */ + goto bad_swap; + } + + if (swap_run_test(is_nfsfile_swapping, swapf)) { + dprintk("nfs_swap_open: already swapping to %s\n", + swapf->f_dentry->d_name.name); + error = -EBUSY; + goto bad_swap; + } + swapfilesize = inode->i_size >> PAGE_SHIFT; + if ((data = kmalloc(sizeof(*data), GFP_KERNEL)) == NULL) { + error = -ENOMEM; + goto bad_swap; + } + data->file = swapf; + data->inode = inode; + data->server = NFS_SERVER(inode); + data->socket = data->server->client->cl_xprt->sock; + + /* set socket option SO_SWAPPING */ + fs = get_fs(); + set_fs(KERNEL_DS); + error = sock_setsockopt(data->socket, SOL_SOCKET, SO_SWAPPING, + (char *)&on, sizeof(on)); + set_fs(fs); + if (error) { + dprintk("nfs_swap_open: error setting SO_SWAPPING\n"); + goto bad_swap_2; + } + + *dptr = data; + return swapfilesize; + + bad_swap_2: + kfree(data); + bad_swap: + MOD_DEC_USE_COUNT; + return error; +} + +static int nfs_swap_release(struct file *swapf, void *dptr) +{ + struct nfs_swap_data *data = (struct nfs_swap_data *)dptr; + int off = 0; + mm_segment_t fs; + int error; + +#if 1 + if (swapf != data->file || + swapf->f_dentry->d_inode != data->inode || + !swapf->f_dentry->d_inode->i_sb || + 
swapf->f_dentry->d_inode->i_sb->s_magic != NFS_SUPER_MAGIC || + NFS_SERVER(swapf->f_dentry->d_inode) != data->server || + data->socket != data->server->client->cl_xprt->sock) { + panic("nfs_swap_release: nfs swap data messed up"); + } +#endif + + /* remove socket option SO_SWAPPING */ + fs = get_fs(); + set_fs(KERNEL_DS); + error = sock_setsockopt(data->socket, SOL_SOCKET, SO_SWAPPING, + (char *)&off, sizeof(off)); + set_fs(fs); + if (error) { + dprintk("nfs_swap_release: error clearing SO_SWAPPING\n"); + } + kfree(data); + MOD_DEC_USE_COUNT; + return error; /* sock_setsockopt() result; data is freed either way */ +} + +static struct swap_ops nfs_swap_ops = { + open: nfs_swap_open, + release: nfs_swap_release, + rw_page: nfs_rw_swap_page +}; + +int __init nfs_swap_init(void) +{ + (void)register_swap_method(NFS_SWAP_ID, &nfs_swap_ops); + return 0; +} + +void __exit nfs_swap_exit(void) +{ + unregister_swap_method(NFS_SWAP_ID); +} + +module_init(nfs_swap_init) +module_exit(nfs_swap_exit) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("(c) 1996-2002 cH (Claus-Justus Heine)"); +MODULE_DESCRIPTION("Swapping to files located on volumes mounted via NFS"); diff -Nurb linux.orig/fs/nfs/read.c linux/fs/nfs/read.c --- linux.orig/fs/nfs/read.c 2003-07-04 04:12:08.000000000 -0400 +++ linux/fs/nfs/read.c 2004-05-31 02:18:03.000000000 -0400 @@ -50,11 +50,6 @@ */ static void nfs_readpage_result(struct rpc_task *task); -/* Hack for future NFS swap support */ -#ifndef IS_SWAPFILE -# define IS_SWAPFILE(inode) (0) -#endif - static kmem_cache_t *nfs_rdata_cachep; static __inline__ struct nfs_read_data *nfs_readdata_alloc(void) @@ -92,7 +87,6 @@ int rsize = NFS_SERVER(inode)->rsize; int result; int count = PAGE_CACHE_SIZE; - int flags = IS_SWAPFILE(inode)? 
NFS_RPC_SWAPFLAGS : 0; int eof; dprintk("NFS: nfs_readpage_sync(%p)\n", page); @@ -114,7 +108,7 @@ offset, rsize, page); lock_kernel(); - result = NFS_PROTO(inode)->read(inode, cred, &fattr, flags, + result = NFS_PROTO(inode)->read(inode, cred, &fattr, 0, offset, rsize, page, &eof); nfs_refresh_inode(inode, &fattr); unlock_kernel(); @@ -246,7 +240,7 @@ task = &data->task; /* N.B. Do we need to test? Never called for swapfile inode */ - flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0); + flags = RPC_TASK_ASYNC; nfs_read_rpcsetup(head, data); @@ -476,8 +470,6 @@ } error = nfs_readpage_sync(file, inode, page); - if (error < 0 && IS_SWAPFILE(inode)) - printk("Aiee.. nfs swap-in of page failed!\n"); out: return error; diff -Nurb linux.orig/fs/nfs/write.c linux/fs/nfs/write.c --- linux.orig/fs/nfs/write.c 2003-07-04 04:12:08.000000000 -0400 +++ linux/fs/nfs/write.c 2004-05-31 02:20:47.000000000 -0400 @@ -3,7 +3,6 @@ #include <linux/config.h> #include <linux/types.h> #include <linux/slab.h> -#include <linux/swap.h> #include <linux/pagemap.h> #include <linux/file.h> @@ -46,11 +45,6 @@ static void nfs_commit_done(struct rpc_task *); #endif -/* Hack for future NFS swap support */ -#ifndef IS_SWAPFILE -# define IS_SWAPFILE(inode) (0) -#endif - static kmem_cache_t *nfs_wdata_cachep; static __inline__ struct nfs_write_data *nfs_writedata_alloc(void) @@ -82,7 +76,7 @@ * For the moment, we just call nfs_refresh_inode(). */ static __inline__ int -nfs_write_attributes(struct inode *inode, struct nfs_fattr *fattr) +__nfs_write_attributes(struct inode *inode, struct nfs_fattr *fattr) { if ((fattr->valid & NFS_ATTR_FATTR) && !(fattr->valid & NFS_ATTR_WCC)) { fattr->pre_size = NFS_CACHE_ISIZE(inode); @@ -93,6 +87,11 @@ return nfs_refresh_inode(inode, fattr); } +int nfs_write_attributes(struct inode *inode, struct nfs_fattr *fattr) +{ + return __nfs_write_attributes(inode, fattr); +} + /* * Write a page synchronously. * Offset is the data offset within the page. 
@@ -104,8 +103,7 @@ struct rpc_cred *cred = NULL; loff_t base; unsigned int wsize = NFS_SERVER(inode)->wsize; - int result, refresh = 0, written = 0, flags; - u8 *buffer; + int result, refresh = 0, written = 0; struct nfs_fattr fattr; struct nfs_writeverf verf; @@ -121,15 +119,14 @@ base = page_offset(page) + offset; - flags = ((IS_SWAPFILE(inode)) ? NFS_RW_SWAP : 0) | NFS_RW_SYNC; - do { - if (count < wsize && !IS_SWAPFILE(inode)) + if (count < wsize) wsize = count; - result = NFS_PROTO(inode)->write(inode, cred, &fattr, flags, + result = NFS_PROTO(inode)->write(inode, cred, &fattr, + NFS_RW_SYNC, offset, wsize, page, &verf); - nfs_write_attributes(inode, &fattr); + __nfs_write_attributes(inode, &fattr); if (result < 0) { /* Must mark the page invalid after I/O error */ @@ -140,7 +137,6 @@ printk("NFS: short write, wsize=%u, result=%d\n", wsize, result); refresh = 1; - buffer += wsize; base += wsize; offset += wsize; written += wsize; @@ -979,7 +975,7 @@ } #endif - nfs_write_attributes(inode, resp->fattr); + __nfs_write_attributes(inode, resp->fattr); while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); @@ -1133,7 +1129,7 @@ if (nfs_async_handle_jukebox(task)) return; - nfs_write_attributes(inode, resp->fattr); + __nfs_write_attributes(inode, resp->fattr); while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); diff -Nurb linux.orig/include/linux/fs.h linux/include/linux/fs.h --- linux.orig/include/linux/fs.h 2004-05-31 02:06:19.000000000 -0400 +++ linux/include/linux/fs.h 2004-05-31 02:18:03.000000000 -0400 @@ -1500,6 +1500,10 @@ extern int inode_change_ok(struct inode *, struct iattr *); extern int inode_setattr(struct inode *, struct iattr *); +/* for swapping to block devices */ +void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize); +void end_buffer_io_async(struct buffer_head * bh, int uptodate); + /* * Common dentry functions 
for inclusion in the VFS * or in other stackable file systems. Some of these diff -Nurb linux.orig/include/linux/nfs_fs.h linux/include/linux/nfs_fs.h --- linux.orig/include/linux/nfs_fs.h 2004-05-31 02:06:28.000000000 -0400 +++ linux/include/linux/nfs_fs.h 2004-05-31 02:18:03.000000000 -0400 @@ -40,8 +40,8 @@ */ #define NFS_MAX_DIRCACHE 16 -#define NFS_MAX_FILE_IO_BUFFER_SIZE 32768 -#define NFS_DEF_FILE_IO_BUFFER_SIZE 4096 +#define NFS_MAX_FILE_IO_BUFFER_SIZE (8*PAGE_SIZE) +#define NFS_DEF_FILE_IO_BUFFER_SIZE PAGE_SIZE /* * The upper limit on timeouts for the exponential backoff algorithm. @@ -205,6 +205,8 @@ extern int nfs_writepage(struct page *); extern int nfs_flush_incompatible(struct file *file, struct page *page); extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); +extern int nfs_write_attributes(struct inode *inode, struct nfs_fattr *fattr); + /* * Try to write back everything synchronously (but check the * return value!) @@ -375,6 +377,7 @@ #define NFSDBG_XDR 0x0020 #define NFSDBG_FILE 0x0040 #define NFSDBG_ROOT 0x0080 +#define NFSDBG_SWAP 0x0100 #define NFSDBG_ALL 0xFFFF #ifdef __KERNEL__ diff -Nurb linux.orig/include/linux/slab.h linux/include/linux/slab.h --- linux.orig/include/linux/slab.h 2004-05-31 02:06:19.000000000 -0400 +++ linux/include/linux/slab.h 2004-05-31 02:18:03.000000000 -0400 @@ -39,6 +39,7 @@ #define SLAB_HWCACHE_ALIGN 0x00002000UL /* align objs on a h/w cache lines */ #define SLAB_CACHE_DMA 0x00004000UL /* use GFP_DMA memory */ #define SLAB_MUST_HWCACHE_ALIGN 0x00008000UL /* force alignment */ +#define SLAB_LOW_GFP_ORDER 0x00010000UL /* use as low a gfp order as possible */ /* flags passed to a constructor func */ #define SLAB_CTOR_CONSTRUCTOR 0x001UL /* if not set, then deconstructor */ diff -Nurb linux.orig/include/linux/swap.h linux/include/linux/swap.h --- linux.orig/include/linux/swap.h 2004-05-31 02:06:19.000000000 -0400 +++ linux/include/linux/swap.h 2004-05-31 02:18:03.000000000 -0400 @@ 
-58,15 +58,29 @@ #define SWAP_MAP_MAX 0x7fff #define SWAP_MAP_BAD 0x8000 +struct swap_ops { + int (*open)(struct file *swapf, void **data); + int (*release)(struct file *swapf, void *data); + int (*rw_page)(int rw, + struct page *page, unsigned long offset, void *data); +}; + +struct swap_method { + struct swap_method *next; + char * name; + struct swap_ops *ops; + int use_count; +}; + /* * The in-memory structure used to track swap areas. */ struct swap_info_struct { unsigned int flags; - kdev_t swap_device; + struct file *swap_file; + struct swap_method *method; + void *data; spinlock_t sdev_lock; - struct dentry * swap_file; - struct vfsmount *swap_vfsmnt; unsigned short * swap_map; unsigned int lowest_bit; unsigned int highest_bit; @@ -141,11 +155,15 @@ extern int total_swap_pages; extern unsigned int nr_swapfiles; extern struct swap_info_struct swap_info[]; -extern int is_swap_partition(kdev_t); +extern int register_swap_method(char *name, struct swap_ops *ops); +extern int unregister_swap_method(char *name); +extern int swap_run_test(int (*test_fct)(unsigned int flags, + struct file *swap_file, + void *testdata), void *testdata); extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(void); -extern void get_swaphandle_info(swp_entry_t, unsigned long *, kdev_t *, - struct inode **); +struct swap_method *get_swaphandle_info(swp_entry_t entry, + unsigned long *offset, void **data); extern int swap_duplicate(swp_entry_t); extern int swap_count(struct page *); extern int valid_swaphandles(swp_entry_t, unsigned long *); diff -Nurb linux.orig/include/net/netswapping.h linux/include/net/netswapping.h --- linux.orig/include/net/netswapping.h 1969-12-31 19:00:00.000000000 -0500 +++ linux/include/net/netswapping.h 2004-05-31 02:18:03.000000000 -0400 @@ -0,0 +1,47 @@ +#ifndef _LINUX_NETSWAPPING_H +#define _LINUX_NETSWAPPING_H + +#include <linux/swap.h> +#include <linux/init.h> + +/* It is a mess. 
Socket options are defined in asm-ARCH/socket.h */ + +#define SO_SWAPPING 0x00100000 /* hopefully not used by anybody else */ + +#ifdef __KERNEL__ + +#define CTL_NETSWAP 0x00100000 + +enum { + NET_SWAP_DROPPED = 1, + NET_SWAP_DROP_THRESHOLD = 2, + NET_SWAP_SOCK_COUNT = 3 +}; + +extern unsigned int netswap_free_pages_min; +extern int netswap_sock_count; +extern unsigned int netswap_dropped; + +/* this is "#defined" and not inline because sock.h includes us, but we need + * the "struct sock" definition. + */ +#define netswap_low_memory(sk, skb) \ +({ \ + int _ret = 0; \ + \ + if (netswap_sock_count > 0 && /* anybody swapping via network? */ \ + !(sk)->swapping && /* but we are not needed for swapping */ \ + nr_free_pages() < netswap_free_pages_min) { /* so drop us */ \ + printk("netswap_low_memory: " \ + "dropping skb 0x%p@0x%p\n", skb, sk); \ + netswap_dropped ++; \ + _ret = 1; \ + } \ + _ret; \ +}) + +extern int __init netswap_init(void); + +#endif + +#endif diff -Nurb linux.orig/include/net/sock.h linux/include/net/sock.h --- linux.orig/include/net/sock.h 2004-05-31 02:07:17.000000000 -0400 +++ linux/include/net/sock.h 2004-05-31 02:18:03.000000000 -0400 @@ -103,6 +103,10 @@ #include <linux/filter.h> #endif +#ifdef CONFIG_NETSWAP +#include <net/netswapping.h> +#endif + #include <asm/atomic.h> #include <net/dst.h> @@ -536,6 +540,12 @@ no_check, broadcast, bsdism; +#ifdef CONFIG_NETSWAP + /* Increased by SO_SWAPPING with arg != 0, decreased by + * SO_SWAPPING with arg 0 + */ + int swapping; +#endif unsigned char debug; unsigned char rcvtstamp; unsigned char use_write_queue; @@ -1165,6 +1175,11 @@ return err; /* Toss packet */ } #endif /* CONFIG_FILTER */ +#ifdef CONFIG_NETSWAP + /* an inline function defined in net/netswapping.h */ + if (netswap_low_memory(sk, skb)) + return -ENOMEM; +#endif /* CONFIG_NETSWAP */ skb->dev = NULL; skb_set_owner_r(skb, sk); diff -Nurb linux.orig/kernel/ksyms.c linux/kernel/ksyms.c --- linux.orig/kernel/ksyms.c 2004-05-31 
02:02:43.000000000 -0400 +++ linux/kernel/ksyms.c 2004-05-31 02:18:03.000000000 -0400 @@ -41,6 +41,7 @@ #include <linux/mm.h> #include <linux/capability.h> #include <linux/highuid.h> +#include <linux/swapctl.h> #include <linux/brlock.h> #include <linux/fs.h> #include <linux/tty.h> @@ -127,6 +128,11 @@ EXPORT_SYMBOL(kmap_prot); EXPORT_SYMBOL(kmap_pte); #endif +EXPORT_SYMBOL(nr_free_pages); +/* EXPORT_SYMBOL(freepages); */ +EXPORT_SYMBOL(register_swap_method); +EXPORT_SYMBOL(unregister_swap_method); +EXPORT_SYMBOL(swap_run_test); /* filesystem internal functions */ EXPORT_SYMBOL(def_blk_fops); @@ -531,7 +537,7 @@ EXPORT_SYMBOL(make_bad_inode); EXPORT_SYMBOL(is_bad_inode); EXPORT_SYMBOL(event); -EXPORT_SYMBOL(brw_page); +EXPORT_SYMBOL(end_buffer_io_async); EXPORT_SYMBOL(__inode_dir_notify); #ifdef CONFIG_UID16 diff -Nurb linux.orig/mm/page_io.c linux/mm/page_io.c --- linux.orig/mm/page_io.c 2003-07-04 04:12:29.000000000 -0400 +++ linux/mm/page_io.c 2004-05-31 02:18:03.000000000 -0400 @@ -36,11 +36,8 @@ static int rw_swap_page_base(int rw, swp_entry_t entry, struct page *page) { unsigned long offset; - int zones[PAGE_SIZE/512]; - int zones_used; - kdev_t dev = 0; - int block_size; - struct inode *swapf = 0; + struct swap_method *method; + void *data; if (rw == READ) { ClearPageUptodate(page); @@ -48,30 +45,11 @@ } else kstat.pswpout++; - get_swaphandle_info(entry, &offset, &dev, &swapf); - if (dev) { - zones[0] = offset; - zones_used = 1; - block_size = PAGE_SIZE; - } else if (swapf) { - int i, j; - unsigned int block = offset - << (PAGE_SHIFT - swapf->i_sb->s_blocksize_bits); - - block_size = swapf->i_sb->s_blocksize; - for (i=0, j=0; j< PAGE_SIZE ; i++, j += block_size) - if (!(zones[i] = bmap(swapf,block++))) { - printk("rw_swap_page: bad swap file\n"); - return 0; - } - zones_used = i; - dev = swapf->i_dev; - } else { + method = get_swaphandle_info(entry, &offset, &data); + if (!method || !method->ops->rw_page(rw, page, offset, data)) { return 0; } - /* block_size 
== PAGE_SIZE/zones_used */ - brw_page(rw, page, dev, zones, block_size); return 1; } diff -Nurb linux.orig/mm/slab.c linux/mm/slab.c --- linux.orig/mm/slab.c 2003-07-04 04:12:29.000000000 -0400 +++ linux/mm/slab.c 2004-05-31 02:18:03.000000000 -0400 @@ -111,10 +111,12 @@ # define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ SLAB_POISON | SLAB_HWCACHE_ALIGN | \ SLAB_NO_REAP | SLAB_CACHE_DMA | \ - SLAB_MUST_HWCACHE_ALIGN) + SLAB_MUST_HWCACHE_ALIGN | \ + SLAB_LOW_GFP_ORDER) #else # define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \ - SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN) + SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ + SLAB_LOW_GFP_ORDER) #endif /* @@ -247,8 +249,13 @@ }; /* internal c_flags */ -#define CFLGS_OFF_SLAB 0x010000UL /* slab management in own cache */ -#define CFLGS_OPTIMIZE 0x020000UL /* optimized slab lookup */ +#define CFLGS_OFF_SLAB 0x020000UL /* slab management in own cache */ +#define CFLGS_OPTIMIZE 0x040000UL /* optimized slab lookup */ +#define CFLGS_MASK (CFLGS_OFF_SLAB | CFLGS_OPTIMIZE) + +#if (CFLGS_MASK & CREATE_MASK) +# error BUG: internal and external SLAB flags overlap +#endif /* c_dflags (dynamic flags). Need to hold the spinlock to access this member */ #define DFLGS_GROWN 0x000001UL /* don't reap a recently grown */ @@ -452,7 +459,12 @@ snprintf(name, sizeof(name), "size-%Zd",sizes->cs_size); if (!(sizes->cs_cachep = kmem_cache_create(name, sizes->cs_size, - 0, SLAB_HWCACHE_ALIGN, NULL, NULL))) { + 0, +#if CONFIG_NETSWAP + SLAB_LOW_GFP_ORDER| /* sorry */ +#endif + SLAB_HWCACHE_ALIGN, + NULL, NULL))) { BUG(); } @@ -731,6 +743,8 @@ break; if (!cachep->num) goto next; + if (cachep->gfporder == 0 && (flags & SLAB_LOW_GFP_ORDER)) + break; if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit) { /* Oops, this num of objs will cause problems. 
*/ cachep->gfporder--; diff -Nurb linux.orig/mm/swapfile.c linux/mm/swapfile.c --- linux.orig/mm/swapfile.c 2003-07-04 04:12:29.000000000 -0400 +++ linux/mm/swapfile.c 2004-05-31 02:18:03.000000000 -0400 @@ -11,12 +11,17 @@ #include <linux/swap.h> #include <linux/swapctl.h> #include <linux/blkdev.h> /* for blk_size */ +#include <linux/file.h> #include <linux/vmalloc.h> #include <linux/pagemap.h> #include <linux/shm.h> #include <asm/pgtable.h> +#ifdef CONFIG_KMOD +#include <linux/kmod.h> +#endif + spinlock_t swaplock = SPIN_LOCK_UNLOCKED; unsigned int nr_swapfiles; int total_swap_pages; @@ -31,8 +36,78 @@ struct swap_info_struct swap_info[MAX_SWAPFILES]; +static struct swap_method *swap_methods = NULL; + #define SWAPFILE_CLUSTER 256 +int register_swap_method(char *name, struct swap_ops *ops) +{ + struct swap_method *pos; + struct swap_method *new; + int result = 0; + + lock_kernel(); + + for (pos = swap_methods; pos; pos = pos->next) { + if (strcmp(pos->name, name) == 0) { + printk(KERN_ERR "register_swap_method: " + "method %s already registered\n", name); + result = -EBUSY; + goto out; + } + } + + if (!(new = kmalloc(sizeof(*new), GFP_KERNEL))) { + printk(KERN_ERR "register_swap_method: " + "no memory for new method \"%s\"\n", name); + result = -ENOMEM; + goto out; + } + + new->name = name; + new->ops = ops; + new->use_count = 0; + + /* ok, insert at top of list */ + printk("register_swap_method: method %s\n", name); + new->next = swap_methods; + swap_methods = new; + out: + unlock_kernel(); + return result; +} + +int unregister_swap_method(char *name) +{ + struct swap_method **method, *next; + int result = 0; + + lock_kernel(); + + for (method = &swap_methods; *method; method = &(*method)->next) { + if (strcmp((*method)->name, name) == 0) { + if ((*method)->use_count > 0) { + printk(KERN_ERR "unregister_swap_method: " + "method \"%s\" is in use\n", name); + result = -EBUSY; + goto out; + } + + next = (*method)->next; + kfree(*method); + *method = next; + 
printk("unregister_swap_method: method %s\n", name); + goto out; + } + } + /* not found */ + printk("unregister_swap_method: no such method %s\n", name); + result = -ENOENT; + out: + unlock_kernel(); + return result; +} + static inline int scan_swap_map(struct swap_info_struct *si) { unsigned long offset; @@ -711,13 +786,14 @@ struct nameidata nd; int i, type, prev; int err; + struct file *swap_file; if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = user_path_walk(specialfile, &nd); if (err) - goto out; + return err; lock_kernel(); prev = -1; @@ -725,15 +801,20 @@ for (type = swap_list.head; type >= 0; type = swap_info[type].next) { p = swap_info + type; if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) { - if (p->swap_file == nd.dentry) + if (p->swap_file && + p->swap_file->f_dentry == nd.dentry) break; } prev = type; } err = -EINVAL; + /* p->swap_file contains all needed info, no need to keep nd, so + * release it now. + */ + path_release(&nd); if (type < 0) { swap_list_unlock(); - goto out_dput; + goto out; } if (prev < 0) { @@ -767,32 +848,30 @@ total_swap_pages += p->pages; p->flags = SWP_WRITEOK; swap_list_unlock(); - goto out_dput; + goto out; } - if (p->swap_device) - blkdev_put(p->swap_file->d_inode->i_bdev, BDEV_SWAP); - path_release(&nd); + if (p->method->ops->release) + p->method->ops->release(p->swap_file, p->data); swap_list_lock(); swap_device_lock(p); - nd.mnt = p->swap_vfsmnt; - nd.dentry = p->swap_file; - p->swap_vfsmnt = NULL; + p->method->use_count --; + p->method = NULL; + p->data = NULL; + swap_file = p->swap_file; p->swap_file = NULL; - p->swap_device = 0; p->max = 0; swap_map = p->swap_map; p->swap_map = NULL; p->flags = 0; swap_device_unlock(p); swap_list_unlock(); + filp_close(swap_file, NULL); vfree(swap_map); err = 0; -out_dput: - unlock_kernel(); - path_release(&nd); out: + unlock_kernel(); return err; } @@ -805,18 +884,17 @@ if (!page) return -ENOMEM; - len += sprintf(buf, "Filename\t\t\tType\t\tSize\tUsed\tPriority\n"); + len += 
sprintf(buf, "%-32s%-16s%-8s%-8sPriority\n", + "Filename", "Type", "Size", "Used"); for (i = 0 ; i < nr_swapfiles ; i++, ptr++) { if ((ptr->flags & SWP_USED) && ptr->swap_map) { - char * path = d_path(ptr->swap_file, ptr->swap_vfsmnt, + char * path = d_path(ptr->swap_file->f_dentry, + ptr->swap_file->f_vfsmnt, page, PAGE_SIZE); len += sprintf(buf + len, "%-31s ", path); - if (!ptr->swap_device) - len += sprintf(buf + len, "file\t\t"); - else - len += sprintf(buf + len, "partition\t"); + len += sprintf(buf + len, "%-15s ", ptr->method->name); usedswap = 0; for (j = 0; j < ptr->max; ++j) @@ -827,7 +905,7 @@ default: usedswap++; } - len += sprintf(buf + len, "%d\t%d\t%d\n", ptr->pages << (PAGE_SHIFT - 10), + len += sprintf(buf + len, "%-8d%-8d%d\n", ptr->pages << (PAGE_SHIFT - 10), usedswap << (PAGE_SHIFT - 10), ptr->prio); } } @@ -835,18 +913,55 @@ return len; } -int is_swap_partition(kdev_t dev) { +/* apply a test function to all active swap objects. E.g. for checking + * whether a partition is used for swapping + */ +int swap_run_test(int (*test_fct)(unsigned int flags, + struct file * swap_file, + void *testdata), void *testdata) +{ struct swap_info_struct *ptr = swap_info; int i; for (i = 0 ; i < nr_swapfiles ; i++, ptr++) { - if (ptr->flags & SWP_USED) - if (ptr->swap_device == dev) + if (ptr->swap_file && + test_fct(ptr->flags, ptr->swap_file, testdata)) return 1; } return 0; } +/* Walk through the list of known swap method until somebody wants to + * handle this file. Pick the first one which claims to be able to + * swap to this kind of file. 
+ * + * return value: < 0: error, 0: not found, > 0: swapfilesize + */ +int find_swap_method(struct file *swap_file, + struct swap_info_struct *p) +{ + int swapfilesize = 0; + struct swap_method *method; + + p->method = NULL; + for (method = swap_methods; method; method = method->next) { + swapfilesize = method->ops->open(swap_file, &p->data); + if (swapfilesize == 0) { + continue; + } + if (swapfilesize > 0) { + p->method = method; + p->method->use_count ++; + p->swap_file = swap_file; + break; + } + if (swapfilesize < 0) { + break; + } + } + return swapfilesize; +} + /* * Written 01/25/92 by Simmule Turner, heavily changed by Linus. * @@ -855,8 +970,6 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags) { struct swap_info_struct * p; - struct nameidata nd; - struct inode * swap_inode; unsigned int type; int i, j, prev; int error; @@ -866,8 +979,9 @@ int nr_good_pages = 0; unsigned long maxpages = 1; int swapfilesize; - struct block_device *bdev = NULL; unsigned short *swap_map; + char * tmp_specialfile; + struct file *swap_file; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -886,8 +1000,7 @@ nr_swapfiles = type+1; p->flags = SWP_USED; p->swap_file = NULL; - p->swap_vfsmnt = NULL; - p->swap_device = 0; + p->method = NULL; p->swap_map = NULL; p->lowest_bit = 0; p->highest_bit = 0; @@ -901,53 +1014,56 @@ p->prio = --least_priority; } swap_list_unlock(); - error = user_path_walk(specialfile, &nd); - if (error) + + /* Open the swap using filp_open. Bail out on any errors. 
*/ + tmp_specialfile = getname(specialfile); + if (IS_ERR(tmp_specialfile)) { + error = PTR_ERR(tmp_specialfile); goto bad_swap_2; + } + swap_file = filp_open(tmp_specialfile, O_RDWR, 0600); + putname(tmp_specialfile); + error = PTR_ERR(swap_file); + if (IS_ERR(swap_file)) + goto bad_swap_2; /* nothing was opened: p->swap_file stays NULL and filp_close() must be skipped */ + p->swap_file = swap_file; - p->swap_file = nd.dentry; - p->swap_vfsmnt = nd.mnt; - swap_inode = nd.dentry->d_inode; error = -EINVAL; - if (S_ISBLK(swap_inode->i_mode)) { - kdev_t dev = swap_inode->i_rdev; - struct block_device_operations *bdops; - devfs_handle_t de; - - p->swap_device = dev; - set_blocksize(dev, PAGE_SIZE); - - bd_acquire(swap_inode); - bdev = swap_inode->i_bdev; - de = devfs_get_handle_from_inode(swap_inode); - bdops = devfs_get_ops(de); /* Increments module use count */ - if (bdops) bdev->bd_op = bdops; - - error = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_SWAP); - devfs_put_ops(de);/*Decrement module use count now we're safe*/ - if (error) - goto bad_swap_2; - set_blocksize(dev, PAGE_SIZE); - error = -ENODEV; - if (!dev || (blk_size[MAJOR(dev)] && - !blk_size[MAJOR(dev)][MINOR(dev)])) - goto bad_swap; - swapfilesize = 0; - if (blk_size[MAJOR(dev)]) - swapfilesize = blk_size[MAJOR(dev)][MINOR(dev)] - >> (PAGE_SHIFT - 10); - } else if (S_ISREG(swap_inode->i_mode)) - swapfilesize = swap_inode->i_size >> PAGE_SHIFT; - else - goto bad_swap; + swapfilesize = find_swap_method(p->swap_file, p); + if (swapfilesize < 0) { + error = swapfilesize; + goto bad_swap_1; + } +#ifdef CONFIG_KMOD + if (swapfilesize == 0) { + (void)request_module("swapfile-mod"); + + swapfilesize = find_swap_method(p->swap_file, p); + if (swapfilesize < 0) { + error = swapfilesize; + goto bad_swap_1; + } + } +#endif + if (swapfilesize == 0) { + printk("Don't know how to swap to this kind of file\n"); + goto bad_swap_1; /* free swap map */ + } + + /* After this point, the swap-file has been opened by the swap + * method. We must make sure to use the bad_swap label for any + * errors. 
+ */ error = -EBUSY; for (i = 0 ; i < nr_swapfiles ; i++) { struct swap_info_struct *q = &swap_info[i]; if (i == type || !q->swap_file) continue; - if (swap_inode->i_mapping == q->swap_file->d_inode->i_mapping) + if (p->swap_file->f_dentry->d_inode->i_mapping + == + q->swap_file->f_dentry->d_inode->i_mapping) goto bad_swap; } @@ -1083,17 +1199,27 @@ swap_list_unlock(); error = 0; goto out; + bad_swap: - if (bdev) - blkdev_put(bdev, BDEV_SWAP); + if (p->method->ops->release) + p->method->ops->release(p->swap_file, p->data); + swap_list_lock(); + p->method->use_count --; + p->method = NULL; + p->data = NULL; + swap_list_unlock(); + +bad_swap_1: + swap_list_lock(); + swap_file = p->swap_file; + p->swap_file = NULL; + swap_list_unlock(); + filp_close(swap_file, NULL); + bad_swap_2: + swap_list_lock(); swap_map = p->swap_map; - nd.mnt = p->swap_vfsmnt; - nd.dentry = p->swap_file; - p->swap_device = 0; - p->swap_file = NULL; - p->swap_vfsmnt = NULL; p->swap_map = NULL; p->flags = 0; if (!(swap_flags & SWAP_FLAG_PREFER)) @@ -1101,7 +1227,7 @@ swap_list_unlock(); if (swap_map) vfree(swap_map); - path_release(&nd); + out: if (swap_header) free_page((long) swap_header); @@ -1217,8 +1343,8 @@ /* * Prior swap_duplicate protects against swap device deletion. 
*/ -void get_swaphandle_info(swp_entry_t entry, unsigned long *offset, - kdev_t *dev, struct inode **swapf) +struct swap_method *get_swaphandle_info(swp_entry_t entry, + unsigned long *offset, void **data) { unsigned long type; struct swap_info_struct *p; @@ -1226,32 +1352,26 @@ type = SWP_TYPE(entry); if (type >= nr_swapfiles) { printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_file, entry.val); - return; + return NULL; } p = &swap_info[type]; *offset = SWP_OFFSET(entry); if (*offset >= p->max && *offset != 0) { printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_offset, entry.val); - return; + return NULL; } if (p->swap_map && !p->swap_map[*offset]) { printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_offset, entry.val); - return; + return NULL; } if (!(p->flags & SWP_USED)) { printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_file, entry.val); - return; + return NULL; } - if (p->swap_device) { - *dev = p->swap_device; - } else if (p->swap_file) { - *swapf = p->swap_file->d_inode; - } else { - printk(KERN_ERR "rw_swap_page: no swap file or device\n"); - } - return; + *data = p->data; + return p->method; } /* diff -Nurb linux.orig/net/Config.in linux/net/Config.in --- linux.orig/net/Config.in 2003-07-04 04:12:29.000000000 -0400 +++ linux/net/Config.in 2004-05-31 02:18:03.000000000 -0400 @@ -16,6 +16,9 @@ fi bool 'Socket Filtering' CONFIG_FILTER tristate 'Unix domain sockets' CONFIG_UNIX +if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then + bool 'Swapping via network sockets (EXPERIMENTAL)' CONFIG_NETSWAP +fi bool 'TCP/IP networking' CONFIG_INET if [ "$CONFIG_INET" = "y" ]; then source net/ipv4/Config.in diff -Nurb linux.orig/net/Makefile linux/net/Makefile --- linux.orig/net/Makefile 2003-07-04 04:12:29.000000000 -0400 +++ linux/net/Makefile 2004-05-31 02:18:03.000000000 -0400 @@ -51,6 +51,7 @@ ifeq ($(CONFIG_NET),y) obj-$(CONFIG_MODULES) += netsyms.o obj-$(CONFIG_SYSCTL) += sysctl_net.o +obj-$(CONFIG_NETSWAP) += netswapping.o endif include $(TOPDIR)/Rules.make diff -Nurb 
linux.orig/net/core/sock.c linux/net/core/sock.c --- linux.orig/net/core/sock.c 2003-10-14 04:09:32.000000000 -0400 +++ linux/net/core/sock.c 2004-05-31 02:18:03.000000000 -0400 @@ -402,6 +402,26 @@ ret = -ENONET; break; #endif +#ifdef CONFIG_NETSWAP + case SO_SWAPPING: + if (valbool) { + /* privileged: exempts this socket from low-memory drops and enables dropping on all other sockets */ + if (!capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + break; + } + if (!sk->swapping) { + netswap_sock_count ++; + } + sk->swapping ++; + } else if (sk->swapping > 0) { + sk->swapping --; + if (!sk->swapping) { + netswap_sock_count --; + } + } + break; +#endif /* We implement the SO_SNDLOWAT etc to not be settable (1003.1g 5.3) */ default: @@ -552,6 +572,12 @@ goto lenout; } +#ifdef CONFIG_NETSWAP + case SO_SWAPPING: + v.val = sk->swapping; + break; +#endif + /* Dubious BSD thing... Probably nobody even uses it, but * the UNIX standard wants it for whatever reason... -DaveM */ diff -Nurb linux.orig/net/ipv4/tcp_ipv4.c linux/net/ipv4/tcp_ipv4.c --- linux.orig/net/ipv4/tcp_ipv4.c 2003-10-14 04:09:33.000000000 -0400 +++ linux/net/ipv4/tcp_ipv4.c 2004-05-31 02:18:03.000000000 -0400 @@ -1657,6 +1657,12 @@ if (filter && sk_filter(skb, filter)) goto discard; #endif /* CONFIG_FILTER */ +#ifdef CONFIG_NETSWAP + /* tcp doesn't use sock_queue_rcv_skb() ... */ + /* an inline function defined in net/netswapping.h */ + if (netswap_low_memory(sk, skb)) + goto discard; +#endif /* CONFIG_NETSWAP */ IP_INC_STATS_BH(IpInDelivers); diff -Nurb linux.orig/net/ipv6/tcp_ipv6.c linux/net/ipv6/tcp_ipv6.c --- linux.orig/net/ipv6/tcp_ipv6.c 2003-10-14 04:09:34.000000000 -0400 +++ linux/net/ipv6/tcp_ipv6.c 2004-05-31 02:18:03.000000000 -0400 @@ -1424,6 +1424,12 @@ if (filter && sk_filter(skb, filter)) goto discard; #endif /* CONFIG_FILTER */ +#ifdef CONFIG_NETSWAP + /* tcp doesn't use sock_queue_rcv_skb() ... 
*/ + /* an inline function defined in net/netswapping.h */ + if (netswap_low_memory(sk, skb)) + goto discard; +#endif /* CONFIG_NETSWAP */ /* * socket locking is here for SMP purposes as backlog rcv diff -Nurb linux.orig/net/netswapping.c linux/net/netswapping.c --- linux.orig/net/netswapping.c 1969-12-31 19:00:00.000000000 -0500 +++ linux/net/netswapping.c 2004-05-31 02:18:03.000000000 -0400 @@ -0,0 +1,76 @@ +/* + * linux/net/swapping.c + * + * Support paging over network connections (inet only) + * + * (c) 2000 Claus-Justus Heine <heine@instmath.rwth-aachen.de> + */ + +#include <linux/slab.h> +#include <linux/swap.h> +#include <linux/swapctl.h> +#include <linux/skbuff.h> +#include <linux/module.h> +#include <linux/sysctl.h> +#include <linux/init.h> +#include <net/netswapping.h> +#include <net/sock.h> +#include <asm/uaccess.h> + +unsigned int netswap_dropped; /* statistics */ +unsigned int netswap_free_pages_min; +int netswap_sock_count; /* how many sockets have swapping option set */ + +#ifdef CONFIG_SYSCTL + +static ctl_table netswap_table[] = { + {NET_SWAP_DROPPED, "dropped", + &netswap_dropped, sizeof(int), 0644, NULL, &proc_dointvec }, + {NET_SWAP_DROP_THRESHOLD, "threshold", + &netswap_free_pages_min, sizeof(int), 0644, NULL, &proc_dointvec }, + {NET_SWAP_SOCK_COUNT, "sock_count", + &netswap_sock_count, sizeof(int), 0444, NULL, &proc_dointvec }, + {0}, +}; + +static struct ctl_table_header *netswap_sysctl_header; + +static ctl_table netswap_net_table[] = { + {CTL_NETSWAP, "swapping", NULL, 0, 0555, netswap_table}, + {0} +}; + +static ctl_table netswap_root_table[] = { + {CTL_NET, "net", NULL, 0, 0555, netswap_net_table}, + {0} +}; + +#endif + +int __init netswap_init(void) +{ + /* drop packets when below this threshold */ + netswap_free_pages_min = 32 /* freepages.min */; +#ifdef CONFIG_SYSCTL + netswap_sysctl_header = register_sysctl_table(netswap_root_table, 0); +#endif + return 0; +} + +void __exit netswap_exit(void) +{ +#ifdef CONFIG_SYSCTL + 
unregister_sysctl_table(netswap_sysctl_header); +#endif +} + +/* linux/init.h -- VERY nice :-) + * + * On the other hand, we have no control over the order the initcalls + * are performed ... + * + * Actually, we are not compiled as module ... + */ + +module_init(netswap_init) +module_exit(netswap_exit) diff -Nurb linux.orig/net/netsyms.c linux/net/netsyms.c --- linux.orig/net/netsyms.c 2004-05-31 02:02:49.000000000 -0400 +++ linux/net/netsyms.c 2004-05-31 02:18:03.000000000 -0400 @@ -601,4 +601,10 @@ EXPORT_SYMBOL(wireless_send_event); #endif /* CONFIG_NET_RADIO || CONFIG_NET_PCMCIA_RADIO */ +#ifdef CONFIG_NETSWAP +EXPORT_SYMBOL(netswap_sock_count); +EXPORT_SYMBOL(netswap_free_pages_min); +EXPORT_SYMBOL(netswap_dropped); +#endif + #endif /* CONFIG_NET */ diff -Nurb linux.orig/net/packet/af_packet.c linux/net/packet/af_packet.c --- linux.orig/net/packet/af_packet.c 2003-10-14 04:09:35.000000000 -0400 +++ linux/net/packet/af_packet.c 2004-05-31 02:18:03.000000000 -0400 @@ -449,6 +449,12 @@ snaplen = res; } #endif /* CONFIG_FILTER */ +#ifdef CONFIG_NETSWAP + /* packet doesn't use sock_queue_rcv_skb() ... */ + /* an inline function defined in net/netswapping.h */ + if (netswap_low_memory(sk, skb)) + goto drop_n_restore; +#endif /* CONFIG_NETSWAP */ if (atomic_read(&sk->rmem_alloc) + skb->truesize >= (unsigned)sk->rcvbuf) goto drop_n_acct; @@ -496,7 +502,7 @@ po->stats.tp_drops++; spin_unlock(&sk->receive_queue.lock); -#ifdef CONFIG_FILTER +#if defined(CONFIG_FILTER) || defined(CONFIG_NETSWAP) drop_n_restore: #endif if (skb_head != skb->data && skb_shared(skb)) { @@ -557,6 +563,12 @@ snaplen = res; } #endif +#ifdef CONFIG_NETSWAP + /* packet doesn't use sock_queue_rcv_skb() ... 
*/ + /* an inline function defined in net/netswapping.h */ + if (netswap_low_memory(sk, skb)) + goto drop_n_restore; +#endif /* CONFIG_NETSWAP */ if (sk->type == SOCK_DGRAM) { macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16; diff -Nurb linux.orig/net/sunrpc/sched.c linux/net/sunrpc/sched.c --- linux.orig/net/sunrpc/sched.c 2003-07-04 04:12:33.000000000 -0400 +++ linux/net/sunrpc/sched.c 2004-05-31 02:18:03.000000000 -0400 @@ -79,10 +79,11 @@ */ static spinlock_t rpc_sched_lock = SPIN_LOCK_UNLOCKED; +#if CONFIG_SWAP_VIA_NFS || CONFIG_SWAP_VIA_NFS_MODULE /* * This is the last-ditch buffer for NFS swap requests */ -static u32 swap_buffer[PAGE_SIZE >> 2]; +static u32 swap_buffer[2*PAGE_SIZE >> 2]; static long swap_buffer_used; /* @@ -96,6 +97,7 @@ { clear_bit(1, &swap_buffer_used); } +#endif /* * Disable the timer for a given RPC task. Should be called with @@ -501,6 +503,7 @@ __rpc_execute(struct rpc_task *task) { int status = 0; + unsigned long alloc_flag = current->flags & PF_MEMALLOC; dprintk("RPC: %4d rpc_execute flgs %x\n", task->tk_pid, task->tk_flags); @@ -510,6 +513,13 @@ return 0; } + if (task->tk_flags & RPC_TASK_SWAPPER) { + if (!(current->flags & PF_MEMALLOC)) { /* parenthesised: '!' binds tighter than '&' */ + dprintk("__rpc_execute: Setting PF_MEMALLOC\n"); + } + current->flags |= PF_MEMALLOC; + } + restarted: while (1) { /* @@ -554,7 +564,8 @@ rpc_set_sleeping(task); if (RPC_IS_ASYNC(task)) { spin_unlock_bh(&rpc_queue_lock); - return 0; + status = 0; + goto out; } } spin_unlock_bh(&rpc_queue_lock); @@ -563,7 +574,12 @@ /* sync task: sleep here */ dprintk("RPC: %4d sync task going to sleep\n", task->tk_pid); - if (current->pid == rpciod_pid) + /* it's ok to wait for rpciod when swapping, + * because this means it needed memory and is + * doing the swap-out itself. 
+ */ + if (current->pid == rpciod_pid && + !(task->tk_flags & RPC_TASK_SWAPPER)) printk(KERN_ERR "RPC: rpciod waiting on sync task!\n"); __wait_event(task->tk_wait, !RPC_IS_SLEEPING(task)); @@ -608,6 +624,10 @@ /* Release all resources associated with the task */ rpc_release_task(task); + out: + if (!alloc_flag) { + current->flags &= ~PF_MEMALLOC; + } return status; } @@ -699,10 +719,16 @@ { u32 *buffer; int gfp; + unsigned long alloc_flag = current->flags & PF_MEMALLOC; + void *ret = NULL; - if (flags & RPC_TASK_SWAPPER) + if (flags & RPC_TASK_SWAPPER) { gfp = GFP_ATOMIC; - else if (flags & RPC_TASK_ASYNC) + if (!(current->flags & PF_MEMALLOC)) { + dprintk("rpc_allocate: Setting PF_MEMALLOC\n"); + } + current->flags |= PF_MEMALLOC; + } else if (flags & RPC_TASK_ASYNC) gfp = GFP_RPC; else gfp = GFP_KERNEL; @@ -710,29 +736,44 @@ do { if ((buffer = (u32 *) kmalloc(size, gfp)) != NULL) { dprintk("RPC: allocated buffer %p\n", buffer); - return buffer; + ret = buffer; + goto out; } +#if CONFIG_SWAP_VIA_NFS || CONFIG_SWAP_VIA_NFS_MODULE if ((flags & RPC_TASK_SWAPPER) && size <= sizeof(swap_buffer) && rpc_lock_swapbuf()) { dprintk("RPC: used last-ditch swap buffer\n"); - return swap_buffer; + ret = swap_buffer; + goto out; + } +#endif + if (flags & RPC_TASK_ASYNC) { + ret = NULL; + goto out; } - if (flags & RPC_TASK_ASYNC) - return NULL; yield(); } while (!signalled()); - return NULL; + out: + if (!alloc_flag) { + current->flags &= ~PF_MEMALLOC; + } + return ret; } void rpc_free(void *buffer) { +#if CONFIG_SWAP_VIA_NFS || CONFIG_SWAP_VIA_NFS_MODULE if (buffer != swap_buffer) { +#endif kfree(buffer); return; +#if CONFIG_SWAP_VIA_NFS || CONFIG_SWAP_VIA_NFS_MODULE } rpc_unlock_swapbuf(); + printk("RPC: Released swap buffer\n"); +#endif } /* diff -Nurb linux.orig/net/sunrpc/xprt.c linux/net/sunrpc/xprt.c --- linux.orig/net/sunrpc/xprt.c 2003-07-04 04:12:33.000000000 -0400 +++ linux/net/sunrpc/xprt.c 2004-05-31 02:18:03.000000000 -0400 @@ -139,7 +139,7 @@ 
__xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) { if (!xprt->snd_task) { - if (xprt->nocong || __xprt_get_cong(xprt, task)) + if (__xprt_get_cong(xprt, task)) xprt->snd_task = task; } if (xprt->snd_task != task) { @@ -179,7 +179,7 @@ if (!task) return; } - if (xprt->nocong || __xprt_get_cong(xprt, task)) + if (__xprt_get_cong(xprt, task)) xprt->snd_task = task; } @@ -276,6 +276,9 @@ { struct rpc_rqst *req = task->tk_rqstp; + if (xprt->nocong || RPC_IS_SWAPPER(task)) + return 1; + if (req->rq_cong) return 1; dprintk("RPC: %4d xprt_cwnd_limited cong = %ld cwnd = %ld\n",