mirror of
				git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
				synced 2025-09-04 20:19:47 +08:00 
			
		
		
		
	 3066a9670b
			
		
	
	
		3066a9670b
		
	
	
	
	
		
			
			Commit 42cb14b110 ("mm: migrate dirty page without
clear_page_dirty_for_io etc") simplified the migration of a PageDirty
pagecache page: one stat needs moving from zone to zone and that's about
all.
It's convenient and safest for it to shift the PageDirty bit from old
page to new, just before updating the zone stats: before copying data
and marking the new PageUptodate.  This is all done while both pages are
isolated and locked, just as before; and just as before, there's a
moment when the new page is visible in the radix_tree, but not yet
PageUptodate.  What's new is that it may now be briefly visible as
PageDirty before it is PageUptodate.
When I scoured the tree to see if this could cause a problem anywhere,
the only places I found were in two similar functions __r4w_get_page():
which look up a page with find_get_page() (not using page lock), then
claim it's uptodate if it's PageDirty or PageWriteback or PageUptodate.
I'm not sure whether that was right before, but now it might be wrong
(on rare occasions): only claim the page is uptodate if PageUptodate.
Or perhaps the page in question could never be migratable anyway?
Signed-off-by: Hugh Dickins <hughd@google.com>
Tested-by: Boaz Harrosh <ooo@electrozaur.com>
Cc: Benny Halevy <bhalevy@panasas.com>
Cc: Trond Myklebust <trond.myklebust@primarydata.com>
Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
		
	
			
		
			
				
	
	
		
			676 lines
		
	
	
		
			18 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			676 lines
		
	
	
		
			18 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /*
 | |
|  *  pNFS Objects layout implementation over open-osd initiator library
 | |
|  *
 | |
|  *  Copyright (C) 2009 Panasas Inc. [year of first publication]
 | |
|  *  All rights reserved.
 | |
|  *
 | |
|  *  Benny Halevy <bhalevy@panasas.com>
 | |
|  *  Boaz Harrosh <ooo@electrozaur.com>
 | |
|  *
 | |
|  *  This program is free software; you can redistribute it and/or modify
 | |
|  *  it under the terms of the GNU General Public License version 2
 | |
|  *  See the file COPYING included with this distribution for more details.
 | |
|  *
 | |
|  *  Redistribution and use in source and binary forms, with or without
 | |
|  *  modification, are permitted provided that the following conditions
 | |
|  *  are met:
 | |
|  *
 | |
|  *  1. Redistributions of source code must retain the above copyright
 | |
|  *     notice, this list of conditions and the following disclaimer.
 | |
|  *  2. Redistributions in binary form must reproduce the above copyright
 | |
|  *     notice, this list of conditions and the following disclaimer in the
 | |
|  *     documentation and/or other materials provided with the distribution.
 | |
|  *  3. Neither the name of the Panasas company nor the names of its
 | |
|  *     contributors may be used to endorse or promote products derived
 | |
|  *     from this software without specific prior written permission.
 | |
|  *
 | |
|  *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
 | |
|  *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 | |
|  *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 | |
|  *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 | |
|  *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 | |
|  *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 | |
|  *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 | |
|  *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 | |
|  *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 | |
|  *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 | |
|  *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | |
|  */
 | |
| 
 | |
| #include <linux/module.h>
 | |
| #include <scsi/osd_ore.h>
 | |
| 
 | |
| #include "objlayout.h"
 | |
| #include "../internal.h"
 | |
| 
 | |
| #define NFSDBG_FACILITY         NFSDBG_PNFS_LD
 | |
| 
 | |
| struct objio_dev_ent {
 | |
| 	struct nfs4_deviceid_node id_node;
 | |
| 	struct ore_dev od;
 | |
| };
 | |
| 
 | |
| static void
 | |
| objio_free_deviceid_node(struct nfs4_deviceid_node *d)
 | |
| {
 | |
| 	struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node);
 | |
| 
 | |
| 	dprintk("%s: free od=%p\n", __func__, de->od.od);
 | |
| 	osduld_put_device(de->od.od);
 | |
| 	kfree_rcu(d, rcu);
 | |
| }
 | |
| 
 | |
| struct objio_segment {
 | |
| 	struct pnfs_layout_segment lseg;
 | |
| 
 | |
| 	struct ore_layout layout;
 | |
| 	struct ore_components oc;
 | |
| };
 | |
| 
 | |
| static inline struct objio_segment *
 | |
| OBJIO_LSEG(struct pnfs_layout_segment *lseg)
 | |
| {
 | |
| 	return container_of(lseg, struct objio_segment, lseg);
 | |
| }
 | |
| 
 | |
| struct objio_state {
 | |
| 	/* Generic layer */
 | |
| 	struct objlayout_io_res oir;
 | |
| 
 | |
| 	bool sync;
 | |
| 	/*FIXME: Support for extra_bytes at ore_get_rw_state() */
 | |
| 	struct ore_io_state *ios;
 | |
| };
 | |
| 
 | |
| /* Send and wait for a get_device_info of devices in the layout,
 | |
|    then look them up with the osd_initiator library */
 | |
| struct nfs4_deviceid_node *
 | |
| objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
 | |
| 			gfp_t gfp_flags)
 | |
| {
 | |
| 	struct pnfs_osd_deviceaddr *deviceaddr;
 | |
| 	struct objio_dev_ent *ode = NULL;
 | |
| 	struct osd_dev *od;
 | |
| 	struct osd_dev_info odi;
 | |
| 	bool retry_flag = true;
 | |
| 	__be32 *p;
 | |
| 	int err;
 | |
| 
 | |
| 	deviceaddr = kzalloc(sizeof(*deviceaddr), gfp_flags);
 | |
| 	if (!deviceaddr)
 | |
| 		return NULL;
 | |
| 
 | |
| 	p = page_address(pdev->pages[0]);
 | |
| 	pnfs_osd_xdr_decode_deviceaddr(deviceaddr, p);
 | |
| 
 | |
| 	odi.systemid_len = deviceaddr->oda_systemid.len;
 | |
| 	if (odi.systemid_len > sizeof(odi.systemid)) {
 | |
| 		dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n",
 | |
| 			__func__, sizeof(odi.systemid));
 | |
| 		err = -EINVAL;
 | |
| 		goto out;
 | |
| 	} else if (odi.systemid_len)
 | |
| 		memcpy(odi.systemid, deviceaddr->oda_systemid.data,
 | |
| 		       odi.systemid_len);
 | |
| 	odi.osdname_len	 = deviceaddr->oda_osdname.len;
 | |
| 	odi.osdname	 = (u8 *)deviceaddr->oda_osdname.data;
 | |
| 
 | |
| 	if (!odi.osdname_len && !odi.systemid_len) {
 | |
| 		dprintk("%s: !odi.osdname_len && !odi.systemid_len\n",
 | |
| 			__func__);
 | |
| 		err = -ENODEV;
 | |
| 		goto out;
 | |
| 	}
 | |
| 
 | |
| retry_lookup:
 | |
| 	od = osduld_info_lookup(&odi);
 | |
| 	if (IS_ERR(od)) {
 | |
| 		err = PTR_ERR(od);
 | |
| 		dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
 | |
| 		if (err == -ENODEV && retry_flag) {
 | |
| 			err = objlayout_autologin(deviceaddr);
 | |
| 			if (likely(!err)) {
 | |
| 				retry_flag = false;
 | |
| 				goto retry_lookup;
 | |
| 			}
 | |
| 		}
 | |
| 		goto out;
 | |
| 	}
 | |
| 
 | |
| 	dprintk("Adding new dev_id(%llx:%llx)\n",
 | |
| 		_DEVID_LO(&pdev->dev_id), _DEVID_HI(&pdev->dev_id));
 | |
| 
 | |
| 	ode = kzalloc(sizeof(*ode), gfp_flags);
 | |
| 	if (!ode) {
 | |
| 		dprintk("%s: -ENOMEM od=%p\n", __func__, od);
 | |
| 		goto out;
 | |
| 	}
 | |
| 
 | |
| 	nfs4_init_deviceid_node(&ode->id_node, server, &pdev->dev_id);
 | |
| 	kfree(deviceaddr);
 | |
| 
 | |
| 	ode->od.od = od;
 | |
| 	return &ode->id_node;
 | |
| 
 | |
| out:
 | |
| 	kfree(deviceaddr);
 | |
| 	return NULL;
 | |
| }
 | |
| 
 | |
| static void copy_single_comp(struct ore_components *oc, unsigned c,
 | |
| 			     struct pnfs_osd_object_cred *src_comp)
 | |
| {
 | |
| 	struct ore_comp *ocomp = &oc->comps[c];
 | |
| 
 | |
| 	WARN_ON(src_comp->oc_cap_key.cred_len > 0); /* libosd is NO_SEC only */
 | |
| 	WARN_ON(src_comp->oc_cap.cred_len > sizeof(ocomp->cred));
 | |
| 
 | |
| 	ocomp->obj.partition = src_comp->oc_object_id.oid_partition_id;
 | |
| 	ocomp->obj.id = src_comp->oc_object_id.oid_object_id;
 | |
| 
 | |
| 	memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred));
 | |
| }
 | |
| 
 | |
| static int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags,
 | |
| 		       struct objio_segment **pseg)
 | |
| {
 | |
| /*	This is the in memory structure of the objio_segment
 | |
|  *
 | |
|  *	struct __alloc_objio_segment {
 | |
|  *		struct objio_segment olseg;
 | |
|  *		struct ore_dev *ods[numdevs];
 | |
|  *		struct ore_comp	comps[numdevs];
 | |
|  *	} *aolseg;
 | |
|  *	NOTE: The code as above compiles and runs perfectly. It is elegant,
 | |
|  *	type safe and compact. At some Past time Linus has decided he does not
 | |
|  *	like variable length arrays, For the sake of this principal we uglify
 | |
|  *	the code as below.
 | |
|  */
 | |
| 	struct objio_segment *lseg;
 | |
| 	size_t lseg_size = sizeof(*lseg) +
 | |
| 			numdevs * sizeof(lseg->oc.ods[0]) +
 | |
| 			numdevs * sizeof(*lseg->oc.comps);
 | |
| 
 | |
| 	lseg = kzalloc(lseg_size, gfp_flags);
 | |
| 	if (unlikely(!lseg)) {
 | |
| 		dprintk("%s: Failed allocation numdevs=%d size=%zd\n", __func__,
 | |
| 			numdevs, lseg_size);
 | |
| 		return -ENOMEM;
 | |
| 	}
 | |
| 
 | |
| 	lseg->oc.numdevs = numdevs;
 | |
| 	lseg->oc.single_comp = EC_MULTPLE_COMPS;
 | |
| 	lseg->oc.ods = (void *)(lseg + 1);
 | |
| 	lseg->oc.comps = (void *)(lseg->oc.ods + numdevs);
 | |
| 
 | |
| 	*pseg = lseg;
 | |
| 	return 0;
 | |
| }
 | |
| 
 | |
| int objio_alloc_lseg(struct pnfs_layout_segment **outp,
 | |
| 	struct pnfs_layout_hdr *pnfslay,
 | |
| 	struct pnfs_layout_range *range,
 | |
| 	struct xdr_stream *xdr,
 | |
| 	gfp_t gfp_flags)
 | |
| {
 | |
| 	struct nfs_server *server = NFS_SERVER(pnfslay->plh_inode);
 | |
| 	struct objio_segment *objio_seg;
 | |
| 	struct pnfs_osd_xdr_decode_layout_iter iter;
 | |
| 	struct pnfs_osd_layout layout;
 | |
| 	struct pnfs_osd_object_cred src_comp;
 | |
| 	unsigned cur_comp;
 | |
| 	int err;
 | |
| 
 | |
| 	err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
 | |
| 	if (unlikely(err))
 | |
| 		return err;
 | |
| 
 | |
| 	err = __alloc_objio_seg(layout.olo_num_comps, gfp_flags, &objio_seg);
 | |
| 	if (unlikely(err))
 | |
| 		return err;
 | |
| 
 | |
| 	objio_seg->layout.stripe_unit = layout.olo_map.odm_stripe_unit;
 | |
| 	objio_seg->layout.group_width = layout.olo_map.odm_group_width;
 | |
| 	objio_seg->layout.group_depth = layout.olo_map.odm_group_depth;
 | |
| 	objio_seg->layout.mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
 | |
| 	objio_seg->layout.raid_algorithm = layout.olo_map.odm_raid_algorithm;
 | |
| 
 | |
| 	err = ore_verify_layout(layout.olo_map.odm_num_comps,
 | |
| 					  &objio_seg->layout);
 | |
| 	if (unlikely(err))
 | |
| 		goto err;
 | |
| 
 | |
| 	objio_seg->oc.first_dev = layout.olo_comps_index;
 | |
| 	cur_comp = 0;
 | |
| 	while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) {
 | |
| 		struct nfs4_deviceid_node *d;
 | |
| 		struct objio_dev_ent *ode;
 | |
| 
 | |
| 		copy_single_comp(&objio_seg->oc, cur_comp, &src_comp);
 | |
| 
 | |
| 		d = nfs4_find_get_deviceid(server,
 | |
| 				&src_comp.oc_object_id.oid_device_id,
 | |
| 				pnfslay->plh_lc_cred, gfp_flags);
 | |
| 		if (!d) {
 | |
| 			err = -ENXIO;
 | |
| 			goto err;
 | |
| 		}
 | |
| 
 | |
| 		ode = container_of(d, struct objio_dev_ent, id_node);
 | |
| 		objio_seg->oc.ods[cur_comp++] = &ode->od;
 | |
| 	}
 | |
| 	/* pnfs_osd_xdr_decode_layout_comp returns false on error */
 | |
| 	if (unlikely(err))
 | |
| 		goto err;
 | |
| 
 | |
| 	*outp = &objio_seg->lseg;
 | |
| 	return 0;
 | |
| 
 | |
| err:
 | |
| 	kfree(objio_seg);
 | |
| 	dprintk("%s: Error: return %d\n", __func__, err);
 | |
| 	*outp = NULL;
 | |
| 	return err;
 | |
| }
 | |
| 
 | |
| void objio_free_lseg(struct pnfs_layout_segment *lseg)
 | |
| {
 | |
| 	int i;
 | |
| 	struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
 | |
| 
 | |
| 	for (i = 0; i < objio_seg->oc.numdevs; i++) {
 | |
| 		struct ore_dev *od = objio_seg->oc.ods[i];
 | |
| 		struct objio_dev_ent *ode;
 | |
| 
 | |
| 		if (!od)
 | |
| 			break;
 | |
| 		ode = container_of(od, typeof(*ode), od);
 | |
| 		nfs4_put_deviceid_node(&ode->id_node);
 | |
| 	}
 | |
| 	kfree(objio_seg);
 | |
| }
 | |
| 
 | |
| static int
 | |
| objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, bool is_reading,
 | |
| 	struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase,
 | |
| 	loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags,
 | |
| 	struct objio_state **outp)
 | |
| {
 | |
| 	struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
 | |
| 	struct ore_io_state *ios;
 | |
| 	int ret;
 | |
| 	struct __alloc_objio_state {
 | |
| 		struct objio_state objios;
 | |
| 		struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs];
 | |
| 	} *aos;
 | |
| 
 | |
| 	aos = kzalloc(sizeof(*aos), gfp_flags);
 | |
| 	if (unlikely(!aos))
 | |
| 		return -ENOMEM;
 | |
| 
 | |
| 	objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs,
 | |
| 			aos->ioerrs, rpcdata, pnfs_layout_type);
 | |
| 
 | |
| 	ret = ore_get_rw_state(&objio_seg->layout, &objio_seg->oc, is_reading,
 | |
| 			       offset, count, &ios);
 | |
| 	if (unlikely(ret)) {
 | |
| 		kfree(aos);
 | |
| 		return ret;
 | |
| 	}
 | |
| 
 | |
| 	ios->pages = pages;
 | |
| 	ios->pgbase = pgbase;
 | |
| 	ios->private = aos;
 | |
| 	BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT);
 | |
| 
 | |
| 	aos->objios.sync = 0;
 | |
| 	aos->objios.ios = ios;
 | |
| 	*outp = &aos->objios;
 | |
| 	return 0;
 | |
| }
 | |
| 
 | |
| void objio_free_result(struct objlayout_io_res *oir)
 | |
| {
 | |
| 	struct objio_state *objios = container_of(oir, struct objio_state, oir);
 | |
| 
 | |
| 	ore_put_io_state(objios->ios);
 | |
| 	kfree(objios);
 | |
| }
 | |
| 
 | |
| static enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
 | |
| {
 | |
| 	switch (oep) {
 | |
| 	case OSD_ERR_PRI_NO_ERROR:
 | |
| 		return (enum pnfs_osd_errno)0;
 | |
| 
 | |
| 	case OSD_ERR_PRI_CLEAR_PAGES:
 | |
| 		BUG_ON(1);
 | |
| 		return 0;
 | |
| 
 | |
| 	case OSD_ERR_PRI_RESOURCE:
 | |
| 		return PNFS_OSD_ERR_RESOURCE;
 | |
| 	case OSD_ERR_PRI_BAD_CRED:
 | |
| 		return PNFS_OSD_ERR_BAD_CRED;
 | |
| 	case OSD_ERR_PRI_NO_ACCESS:
 | |
| 		return PNFS_OSD_ERR_NO_ACCESS;
 | |
| 	case OSD_ERR_PRI_UNREACHABLE:
 | |
| 		return PNFS_OSD_ERR_UNREACHABLE;
 | |
| 	case OSD_ERR_PRI_NOT_FOUND:
 | |
| 		return PNFS_OSD_ERR_NOT_FOUND;
 | |
| 	case OSD_ERR_PRI_NO_SPACE:
 | |
| 		return PNFS_OSD_ERR_NO_SPACE;
 | |
| 	default:
 | |
| 		WARN_ON(1);
 | |
| 		/* fallthrough */
 | |
| 	case OSD_ERR_PRI_EIO:
 | |
| 		return PNFS_OSD_ERR_EIO;
 | |
| 	}
 | |
| }
 | |
| 
 | |
| static void __on_dev_error(struct ore_io_state *ios,
 | |
| 	struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep,
 | |
| 	u64 dev_offset, u64  dev_len)
 | |
| {
 | |
| 	struct objio_state *objios = ios->private;
 | |
| 	struct pnfs_osd_objid pooid;
 | |
| 	struct objio_dev_ent *ode = container_of(od, typeof(*ode), od);
 | |
| 	/* FIXME: what to do with more-then-one-group layouts. We need to
 | |
| 	 * translate from ore_io_state index to oc->comps index
 | |
| 	 */
 | |
| 	unsigned comp = dev_index;
 | |
| 
 | |
| 	pooid.oid_device_id = ode->id_node.deviceid;
 | |
| 	pooid.oid_partition_id = ios->oc->comps[comp].obj.partition;
 | |
| 	pooid.oid_object_id = ios->oc->comps[comp].obj.id;
 | |
| 
 | |
| 	objlayout_io_set_result(&objios->oir, comp,
 | |
| 				&pooid, osd_pri_2_pnfs_err(oep),
 | |
| 				dev_offset, dev_len, !ios->reading);
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * read
 | |
|  */
 | |
| static void _read_done(struct ore_io_state *ios, void *private)
 | |
| {
 | |
| 	struct objio_state *objios = private;
 | |
| 	ssize_t status;
 | |
| 	int ret = ore_check_io(ios, &__on_dev_error);
 | |
| 
 | |
| 	/* FIXME: _io_free(ios) can we dealocate the libosd resources; */
 | |
| 
 | |
| 	if (likely(!ret))
 | |
| 		status = ios->length;
 | |
| 	else
 | |
| 		status = ret;
 | |
| 
 | |
| 	objlayout_read_done(&objios->oir, status, objios->sync);
 | |
| }
 | |
| 
 | |
| int objio_read_pagelist(struct nfs_pgio_header *hdr)
 | |
| {
 | |
| 	struct objio_state *objios;
 | |
| 	int ret;
 | |
| 
 | |
| 	ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, true,
 | |
| 			hdr->lseg, hdr->args.pages, hdr->args.pgbase,
 | |
| 			hdr->args.offset, hdr->args.count, hdr,
 | |
| 			GFP_KERNEL, &objios);
 | |
| 	if (unlikely(ret))
 | |
| 		return ret;
 | |
| 
 | |
| 	objios->ios->done = _read_done;
 | |
| 	dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
 | |
| 		hdr->args.offset, hdr->args.count);
 | |
| 	ret = ore_read(objios->ios);
 | |
| 	if (unlikely(ret))
 | |
| 		objio_free_result(&objios->oir);
 | |
| 	return ret;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * write
 | |
|  */
 | |
| static void _write_done(struct ore_io_state *ios, void *private)
 | |
| {
 | |
| 	struct objio_state *objios = private;
 | |
| 	ssize_t status;
 | |
| 	int ret = ore_check_io(ios, &__on_dev_error);
 | |
| 
 | |
| 	/* FIXME: _io_free(ios) can we dealocate the libosd resources; */
 | |
| 
 | |
| 	if (likely(!ret)) {
 | |
| 		/* FIXME: should be based on the OSD's persistence model
 | |
| 		 * See OSD2r05 Section 4.13 Data persistence model */
 | |
| 		objios->oir.committed = NFS_FILE_SYNC;
 | |
| 		status = ios->length;
 | |
| 	} else {
 | |
| 		status = ret;
 | |
| 	}
 | |
| 
 | |
| 	objlayout_write_done(&objios->oir, status, objios->sync);
 | |
| }
 | |
| 
 | |
| static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
 | |
| {
 | |
| 	struct objio_state *objios = priv;
 | |
| 	struct nfs_pgio_header *hdr = objios->oir.rpcdata;
 | |
| 	struct address_space *mapping = hdr->inode->i_mapping;
 | |
| 	pgoff_t index = offset / PAGE_SIZE;
 | |
| 	struct page *page;
 | |
| 	loff_t i_size = i_size_read(hdr->inode);
 | |
| 
 | |
| 	if (offset >= i_size) {
 | |
| 		*uptodate = true;
 | |
| 		dprintk("%s: g_zero_page index=0x%lx\n", __func__, index);
 | |
| 		return ZERO_PAGE(0);
 | |
| 	}
 | |
| 
 | |
| 	page = find_get_page(mapping, index);
 | |
| 	if (!page) {
 | |
| 		page = find_or_create_page(mapping, index, GFP_NOFS);
 | |
| 		if (unlikely(!page)) {
 | |
| 			dprintk("%s: grab_cache_page Failed index=0x%lx\n",
 | |
| 				__func__, index);
 | |
| 			return NULL;
 | |
| 		}
 | |
| 		unlock_page(page);
 | |
| 	}
 | |
| 	*uptodate = PageUptodate(page);
 | |
| 	dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate);
 | |
| 	return page;
 | |
| }
 | |
| 
 | |
| static void __r4w_put_page(void *priv, struct page *page)
 | |
| {
 | |
| 	dprintk("%s: index=0x%lx\n", __func__,
 | |
| 		(page == ZERO_PAGE(0)) ? -1UL : page->index);
 | |
| 	if (ZERO_PAGE(0) != page)
 | |
| 		page_cache_release(page);
 | |
| 	return;
 | |
| }
 | |
| 
 | |
| static const struct _ore_r4w_op _r4w_op = {
 | |
| 	.get_page = &__r4w_get_page,
 | |
| 	.put_page = &__r4w_put_page,
 | |
| };
 | |
| 
 | |
| int objio_write_pagelist(struct nfs_pgio_header *hdr, int how)
 | |
| {
 | |
| 	struct objio_state *objios;
 | |
| 	int ret;
 | |
| 
 | |
| 	ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, false,
 | |
| 			hdr->lseg, hdr->args.pages, hdr->args.pgbase,
 | |
| 			hdr->args.offset, hdr->args.count, hdr, GFP_NOFS,
 | |
| 			&objios);
 | |
| 	if (unlikely(ret))
 | |
| 		return ret;
 | |
| 
 | |
| 	objios->sync = 0 != (how & FLUSH_SYNC);
 | |
| 	objios->ios->r4w = &_r4w_op;
 | |
| 
 | |
| 	if (!objios->sync)
 | |
| 		objios->ios->done = _write_done;
 | |
| 
 | |
| 	dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
 | |
| 		hdr->args.offset, hdr->args.count);
 | |
| 	ret = ore_write(objios->ios);
 | |
| 	if (unlikely(ret)) {
 | |
| 		objio_free_result(&objios->oir);
 | |
| 		return ret;
 | |
| 	}
 | |
| 
 | |
| 	if (objios->sync)
 | |
| 		_write_done(objios->ios, objios);
 | |
| 
 | |
| 	return 0;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
 | |
|  * of bytes (maximum @req->wb_bytes) that can be coalesced.
 | |
|  */
 | |
| static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio,
 | |
| 			  struct nfs_page *prev, struct nfs_page *req)
 | |
| {
 | |
| 	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(pgio);
 | |
| 	unsigned int size;
 | |
| 
 | |
| 	size = pnfs_generic_pg_test(pgio, prev, req);
 | |
| 
 | |
| 	if (!size || mirror->pg_count + req->wb_bytes >
 | |
| 	    (unsigned long)pgio->pg_layout_private)
 | |
| 		return 0;
 | |
| 
 | |
| 	return min(size, req->wb_bytes);
 | |
| }
 | |
| 
 | |
| static void objio_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
 | |
| {
 | |
| 	pnfs_generic_pg_init_read(pgio, req);
 | |
| 	if (unlikely(pgio->pg_lseg == NULL))
 | |
| 		return; /* Not pNFS */
 | |
| 
 | |
| 	pgio->pg_layout_private = (void *)
 | |
| 				OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length;
 | |
| }
 | |
| 
 | |
| static bool aligned_on_raid_stripe(u64 offset, struct ore_layout *layout,
 | |
| 				   unsigned long *stripe_end)
 | |
| {
 | |
| 	u32 stripe_off;
 | |
| 	unsigned stripe_size;
 | |
| 
 | |
| 	if (layout->raid_algorithm == PNFS_OSD_RAID_0)
 | |
| 		return true;
 | |
| 
 | |
| 	stripe_size = layout->stripe_unit *
 | |
| 				(layout->group_width - layout->parity);
 | |
| 
 | |
| 	div_u64_rem(offset, stripe_size, &stripe_off);
 | |
| 	if (!stripe_off)
 | |
| 		return true;
 | |
| 
 | |
| 	*stripe_end = stripe_size - stripe_off;
 | |
| 	return false;
 | |
| }
 | |
| 
 | |
| static void objio_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
 | |
| {
 | |
| 	unsigned long stripe_end = 0;
 | |
| 	u64 wb_size;
 | |
| 
 | |
| 	if (pgio->pg_dreq == NULL)
 | |
| 		wb_size = i_size_read(pgio->pg_inode) - req_offset(req);
 | |
| 	else
 | |
| 		wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
 | |
| 
 | |
| 	pnfs_generic_pg_init_write(pgio, req, wb_size);
 | |
| 	if (unlikely(pgio->pg_lseg == NULL))
 | |
| 		return; /* Not pNFS */
 | |
| 
 | |
| 	if (req->wb_offset ||
 | |
| 	    !aligned_on_raid_stripe(req->wb_index * PAGE_SIZE,
 | |
| 			       &OBJIO_LSEG(pgio->pg_lseg)->layout,
 | |
| 			       &stripe_end)) {
 | |
| 		pgio->pg_layout_private = (void *)stripe_end;
 | |
| 	} else {
 | |
| 		pgio->pg_layout_private = (void *)
 | |
| 				OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length;
 | |
| 	}
 | |
| }
 | |
| 
 | |
| static const struct nfs_pageio_ops objio_pg_read_ops = {
 | |
| 	.pg_init = objio_init_read,
 | |
| 	.pg_test = objio_pg_test,
 | |
| 	.pg_doio = pnfs_generic_pg_readpages,
 | |
| 	.pg_cleanup = pnfs_generic_pg_cleanup,
 | |
| };
 | |
| 
 | |
| static const struct nfs_pageio_ops objio_pg_write_ops = {
 | |
| 	.pg_init = objio_init_write,
 | |
| 	.pg_test = objio_pg_test,
 | |
| 	.pg_doio = pnfs_generic_pg_writepages,
 | |
| 	.pg_cleanup = pnfs_generic_pg_cleanup,
 | |
| };
 | |
| 
 | |
| static struct pnfs_layoutdriver_type objlayout_type = {
 | |
| 	.id = LAYOUT_OSD2_OBJECTS,
 | |
| 	.name = "LAYOUT_OSD2_OBJECTS",
 | |
| 	.flags                   = PNFS_LAYOUTRET_ON_SETATTR |
 | |
| 				   PNFS_LAYOUTRET_ON_ERROR,
 | |
| 
 | |
| 	.max_deviceinfo_size	 = PAGE_SIZE,
 | |
| 	.owner		       	 = THIS_MODULE,
 | |
| 	.alloc_layout_hdr        = objlayout_alloc_layout_hdr,
 | |
| 	.free_layout_hdr         = objlayout_free_layout_hdr,
 | |
| 
 | |
| 	.alloc_lseg              = objlayout_alloc_lseg,
 | |
| 	.free_lseg               = objlayout_free_lseg,
 | |
| 
 | |
| 	.read_pagelist           = objlayout_read_pagelist,
 | |
| 	.write_pagelist          = objlayout_write_pagelist,
 | |
| 	.pg_read_ops             = &objio_pg_read_ops,
 | |
| 	.pg_write_ops            = &objio_pg_write_ops,
 | |
| 
 | |
| 	.sync			 = pnfs_generic_sync,
 | |
| 
 | |
| 	.free_deviceid_node	 = objio_free_deviceid_node,
 | |
| 
 | |
| 	.encode_layoutcommit	 = objlayout_encode_layoutcommit,
 | |
| 	.encode_layoutreturn     = objlayout_encode_layoutreturn,
 | |
| };
 | |
| 
 | |
| MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
 | |
| MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>");
 | |
| MODULE_LICENSE("GPL");
 | |
| 
 | |
| static int __init
 | |
| objlayout_init(void)
 | |
| {
 | |
| 	int ret = pnfs_register_layoutdriver(&objlayout_type);
 | |
| 
 | |
| 	if (ret)
 | |
| 		printk(KERN_INFO
 | |
| 			"NFS: %s: Registering OSD pNFS Layout Driver failed: error=%d\n",
 | |
| 			__func__, ret);
 | |
| 	else
 | |
| 		printk(KERN_INFO "NFS: %s: Registered OSD pNFS Layout Driver\n",
 | |
| 			__func__);
 | |
| 	return ret;
 | |
| }
 | |
| 
 | |
| static void __exit
 | |
| objlayout_exit(void)
 | |
| {
 | |
| 	pnfs_unregister_layoutdriver(&objlayout_type);
 | |
| 	printk(KERN_INFO "NFS: %s: Unregistered OSD pNFS Layout Driver\n",
 | |
| 	       __func__);
 | |
| }
 | |
| 
 | |
| MODULE_ALIAS("nfs-layouttype4-2");
 | |
| 
 | |
| module_init(objlayout_init);
 | |
| module_exit(objlayout_exit);
 |