mirror of
				git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
				synced 2025-09-04 20:19:47 +08:00 
			
		
		
		
	 30656177c4
			
		
	
	
		30656177c4
		
	
	
	
	
		
			
			The ioeventfd here is actually irqfd handling of an ioeventfd such as supported in KVM. A user is able to pre-program a device write to occur when the eventfd triggers. This is yet another instance of eventfd-irqfd triggering between KVM and vfio. The impetus for this is high frequency writes to pages which are virtualized in QEMU. Enabling this near-direct write path for selected registers within the virtualized page can improve performance and reduce overhead. Specifically this is initially targeted at NVIDIA graphics cards where the driver issues a write to an MMIO register within a virtualized region in order to allow the MSI interrupt to re-trigger. Reviewed-by: Peter Xu <peterx@redhat.com> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
		
			
				
	
	
		
			389 lines
		
	
	
		
			8.2 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			389 lines
		
	
	
		
			8.2 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /*
 | |
|  * VFIO PCI I/O Port & MMIO access
 | |
|  *
 | |
|  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 | |
|  *     Author: Alex Williamson <alex.williamson@redhat.com>
 | |
|  *
 | |
|  * This program is free software; you can redistribute it and/or modify
 | |
|  * it under the terms of the GNU General Public License version 2 as
 | |
|  * published by the Free Software Foundation.
 | |
|  *
 | |
|  * Derived from original vfio:
 | |
|  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 | |
|  * Author: Tom Lyon, pugs@cisco.com
 | |
|  */
 | |
| 
 | |
| #include <linux/fs.h>
 | |
| #include <linux/pci.h>
 | |
| #include <linux/uaccess.h>
 | |
| #include <linux/io.h>
 | |
| #include <linux/vfio.h>
 | |
| #include <linux/vgaarb.h>
 | |
| 
 | |
| #include "vfio_pci_private.h"
 | |
| 
 | |
/*
 * Accessor aliases used by the read/write paths in this file.
 *
 * Single-byte accesses always map to ioread8/iowrite8.  The 16, 32 and
 * 64-bit aliases map to the plain io{read,write}N helpers when
 * __LITTLE_ENDIAN is defined and to the io{read,write}Nbe helpers otherwise.
 */
#define vfio_ioread8	ioread8
#define vfio_iowrite8	iowrite8

#ifdef __LITTLE_ENDIAN
#define vfio_ioread16	ioread16
#define vfio_iowrite16	iowrite16
#define vfio_ioread32	ioread32
#define vfio_iowrite32	iowrite32
#define vfio_ioread64	ioread64
#define vfio_iowrite64	iowrite64
#else
#define vfio_ioread16	ioread16be
#define vfio_iowrite16	iowrite16be
#define vfio_ioread32	ioread32be
#define vfio_iowrite32	iowrite32be
#define vfio_ioread64	ioread64be
#define vfio_iowrite64	iowrite64be
#endif
 | |
| 
 | |
| /*
 | |
|  * Read or write from an __iomem region (MMIO or I/O port) with an excluded
 | |
|  * range which is inaccessible.  The excluded range drops writes and fills
 | |
|  * reads with -1.  This is intended for handling MSI-X vector tables and
 | |
|  * leftover space for ROM BARs.
 | |
|  */
 | |
| static ssize_t do_io_rw(void __iomem *io, char __user *buf,
 | |
| 			loff_t off, size_t count, size_t x_start,
 | |
| 			size_t x_end, bool iswrite)
 | |
| {
 | |
| 	ssize_t done = 0;
 | |
| 
 | |
| 	while (count) {
 | |
| 		size_t fillable, filled;
 | |
| 
 | |
| 		if (off < x_start)
 | |
| 			fillable = min(count, (size_t)(x_start - off));
 | |
| 		else if (off >= x_end)
 | |
| 			fillable = count;
 | |
| 		else
 | |
| 			fillable = 0;
 | |
| 
 | |
| 		if (fillable >= 4 && !(off % 4)) {
 | |
| 			u32 val;
 | |
| 
 | |
| 			if (iswrite) {
 | |
| 				if (copy_from_user(&val, buf, 4))
 | |
| 					return -EFAULT;
 | |
| 
 | |
| 				vfio_iowrite32(val, io + off);
 | |
| 			} else {
 | |
| 				val = vfio_ioread32(io + off);
 | |
| 
 | |
| 				if (copy_to_user(buf, &val, 4))
 | |
| 					return -EFAULT;
 | |
| 			}
 | |
| 
 | |
| 			filled = 4;
 | |
| 		} else if (fillable >= 2 && !(off % 2)) {
 | |
| 			u16 val;
 | |
| 
 | |
| 			if (iswrite) {
 | |
| 				if (copy_from_user(&val, buf, 2))
 | |
| 					return -EFAULT;
 | |
| 
 | |
| 				vfio_iowrite16(val, io + off);
 | |
| 			} else {
 | |
| 				val = vfio_ioread16(io + off);
 | |
| 
 | |
| 				if (copy_to_user(buf, &val, 2))
 | |
| 					return -EFAULT;
 | |
| 			}
 | |
| 
 | |
| 			filled = 2;
 | |
| 		} else if (fillable) {
 | |
| 			u8 val;
 | |
| 
 | |
| 			if (iswrite) {
 | |
| 				if (copy_from_user(&val, buf, 1))
 | |
| 					return -EFAULT;
 | |
| 
 | |
| 				vfio_iowrite8(val, io + off);
 | |
| 			} else {
 | |
| 				val = vfio_ioread8(io + off);
 | |
| 
 | |
| 				if (copy_to_user(buf, &val, 1))
 | |
| 					return -EFAULT;
 | |
| 			}
 | |
| 
 | |
| 			filled = 1;
 | |
| 		} else {
 | |
| 			/* Fill reads with -1, drop writes */
 | |
| 			filled = min(count, (size_t)(x_end - off));
 | |
| 			if (!iswrite) {
 | |
| 				u8 val = 0xFF;
 | |
| 				size_t i;
 | |
| 
 | |
| 				for (i = 0; i < filled; i++)
 | |
| 					if (copy_to_user(buf + i, &val, 1))
 | |
| 						return -EFAULT;
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 		count -= filled;
 | |
| 		done += filled;
 | |
| 		off += filled;
 | |
| 		buf += filled;
 | |
| 	}
 | |
| 
 | |
| 	return done;
 | |
| }
 | |
| 
 | |
| static int vfio_pci_setup_barmap(struct vfio_pci_device *vdev, int bar)
 | |
| {
 | |
| 	struct pci_dev *pdev = vdev->pdev;
 | |
| 	int ret;
 | |
| 	void __iomem *io;
 | |
| 
 | |
| 	if (vdev->barmap[bar])
 | |
| 		return 0;
 | |
| 
 | |
| 	ret = pci_request_selected_regions(pdev, 1 << bar, "vfio");
 | |
| 	if (ret)
 | |
| 		return ret;
 | |
| 
 | |
| 	io = pci_iomap(pdev, bar, 0);
 | |
| 	if (!io) {
 | |
| 		pci_release_selected_regions(pdev, 1 << bar);
 | |
| 		return -ENOMEM;
 | |
| 	}
 | |
| 
 | |
| 	vdev->barmap[bar] = io;
 | |
| 
 | |
| 	return 0;
 | |
| }
 | |
| 
 | |
| ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf,
 | |
| 			size_t count, loff_t *ppos, bool iswrite)
 | |
| {
 | |
| 	struct pci_dev *pdev = vdev->pdev;
 | |
| 	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
 | |
| 	int bar = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
 | |
| 	size_t x_start = 0, x_end = 0;
 | |
| 	resource_size_t end;
 | |
| 	void __iomem *io;
 | |
| 	ssize_t done;
 | |
| 
 | |
| 	if (pci_resource_start(pdev, bar))
 | |
| 		end = pci_resource_len(pdev, bar);
 | |
| 	else if (bar == PCI_ROM_RESOURCE &&
 | |
| 		 pdev->resource[bar].flags & IORESOURCE_ROM_SHADOW)
 | |
| 		end = 0x20000;
 | |
| 	else
 | |
| 		return -EINVAL;
 | |
| 
 | |
| 	if (pos >= end)
 | |
| 		return -EINVAL;
 | |
| 
 | |
| 	count = min(count, (size_t)(end - pos));
 | |
| 
 | |
| 	if (bar == PCI_ROM_RESOURCE) {
 | |
| 		/*
 | |
| 		 * The ROM can fill less space than the BAR, so we start the
 | |
| 		 * excluded range at the end of the actual ROM.  This makes
 | |
| 		 * filling large ROM BARs much faster.
 | |
| 		 */
 | |
| 		io = pci_map_rom(pdev, &x_start);
 | |
| 		if (!io)
 | |
| 			return -ENOMEM;
 | |
| 		x_end = end;
 | |
| 	} else {
 | |
| 		int ret = vfio_pci_setup_barmap(vdev, bar);
 | |
| 		if (ret)
 | |
| 			return ret;
 | |
| 
 | |
| 		io = vdev->barmap[bar];
 | |
| 	}
 | |
| 
 | |
| 	if (bar == vdev->msix_bar) {
 | |
| 		x_start = vdev->msix_offset;
 | |
| 		x_end = vdev->msix_offset + vdev->msix_size;
 | |
| 	}
 | |
| 
 | |
| 	done = do_io_rw(io, buf, pos, count, x_start, x_end, iswrite);
 | |
| 
 | |
| 	if (done >= 0)
 | |
| 		*ppos += done;
 | |
| 
 | |
| 	if (bar == PCI_ROM_RESOURCE)
 | |
| 		pci_unmap_rom(pdev, io);
 | |
| 
 | |
| 	return done;
 | |
| }
 | |
| 
 | |
| ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf,
 | |
| 			       size_t count, loff_t *ppos, bool iswrite)
 | |
| {
 | |
| 	int ret;
 | |
| 	loff_t off, pos = *ppos & VFIO_PCI_OFFSET_MASK;
 | |
| 	void __iomem *iomem = NULL;
 | |
| 	unsigned int rsrc;
 | |
| 	bool is_ioport;
 | |
| 	ssize_t done;
 | |
| 
 | |
| 	if (!vdev->has_vga)
 | |
| 		return -EINVAL;
 | |
| 
 | |
| 	if (pos > 0xbfffful)
 | |
| 		return -EINVAL;
 | |
| 
 | |
| 	switch ((u32)pos) {
 | |
| 	case 0xa0000 ... 0xbffff:
 | |
| 		count = min(count, (size_t)(0xc0000 - pos));
 | |
| 		iomem = ioremap_nocache(0xa0000, 0xbffff - 0xa0000 + 1);
 | |
| 		off = pos - 0xa0000;
 | |
| 		rsrc = VGA_RSRC_LEGACY_MEM;
 | |
| 		is_ioport = false;
 | |
| 		break;
 | |
| 	case 0x3b0 ... 0x3bb:
 | |
| 		count = min(count, (size_t)(0x3bc - pos));
 | |
| 		iomem = ioport_map(0x3b0, 0x3bb - 0x3b0 + 1);
 | |
| 		off = pos - 0x3b0;
 | |
| 		rsrc = VGA_RSRC_LEGACY_IO;
 | |
| 		is_ioport = true;
 | |
| 		break;
 | |
| 	case 0x3c0 ... 0x3df:
 | |
| 		count = min(count, (size_t)(0x3e0 - pos));
 | |
| 		iomem = ioport_map(0x3c0, 0x3df - 0x3c0 + 1);
 | |
| 		off = pos - 0x3c0;
 | |
| 		rsrc = VGA_RSRC_LEGACY_IO;
 | |
| 		is_ioport = true;
 | |
| 		break;
 | |
| 	default:
 | |
| 		return -EINVAL;
 | |
| 	}
 | |
| 
 | |
| 	if (!iomem)
 | |
| 		return -ENOMEM;
 | |
| 
 | |
| 	ret = vga_get_interruptible(vdev->pdev, rsrc);
 | |
| 	if (ret) {
 | |
| 		is_ioport ? ioport_unmap(iomem) : iounmap(iomem);
 | |
| 		return ret;
 | |
| 	}
 | |
| 
 | |
| 	done = do_io_rw(iomem, buf, off, count, 0, 0, iswrite);
 | |
| 
 | |
| 	vga_put(vdev->pdev, rsrc);
 | |
| 
 | |
| 	is_ioport ? ioport_unmap(iomem) : iounmap(iomem);
 | |
| 
 | |
| 	if (done >= 0)
 | |
| 		*ppos += done;
 | |
| 
 | |
| 	return done;
 | |
| }
 | |
| 
 | |
| static int vfio_pci_ioeventfd_handler(void *opaque, void *unused)
 | |
| {
 | |
| 	struct vfio_pci_ioeventfd *ioeventfd = opaque;
 | |
| 
 | |
| 	switch (ioeventfd->count) {
 | |
| 	case 1:
 | |
| 		vfio_iowrite8(ioeventfd->data, ioeventfd->addr);
 | |
| 		break;
 | |
| 	case 2:
 | |
| 		vfio_iowrite16(ioeventfd->data, ioeventfd->addr);
 | |
| 		break;
 | |
| 	case 4:
 | |
| 		vfio_iowrite32(ioeventfd->data, ioeventfd->addr);
 | |
| 		break;
 | |
| #ifdef iowrite64
 | |
| 	case 8:
 | |
| 		vfio_iowrite64(ioeventfd->data, ioeventfd->addr);
 | |
| 		break;
 | |
| #endif
 | |
| 	}
 | |
| 
 | |
| 	return 0;
 | |
| }
 | |
| 
 | |
| long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset,
 | |
| 			uint64_t data, int count, int fd)
 | |
| {
 | |
| 	struct pci_dev *pdev = vdev->pdev;
 | |
| 	loff_t pos = offset & VFIO_PCI_OFFSET_MASK;
 | |
| 	int ret, bar = VFIO_PCI_OFFSET_TO_INDEX(offset);
 | |
| 	struct vfio_pci_ioeventfd *ioeventfd;
 | |
| 
 | |
| 	/* Only support ioeventfds into BARs */
 | |
| 	if (bar > VFIO_PCI_BAR5_REGION_INDEX)
 | |
| 		return -EINVAL;
 | |
| 
 | |
| 	if (pos + count > pci_resource_len(pdev, bar))
 | |
| 		return -EINVAL;
 | |
| 
 | |
| 	/* Disallow ioeventfds working around MSI-X table writes */
 | |
| 	if (bar == vdev->msix_bar &&
 | |
| 	    !(pos + count <= vdev->msix_offset ||
 | |
| 	      pos >= vdev->msix_offset + vdev->msix_size))
 | |
| 		return -EINVAL;
 | |
| 
 | |
| #ifndef iowrite64
 | |
| 	if (count == 8)
 | |
| 		return -EINVAL;
 | |
| #endif
 | |
| 
 | |
| 	ret = vfio_pci_setup_barmap(vdev, bar);
 | |
| 	if (ret)
 | |
| 		return ret;
 | |
| 
 | |
| 	mutex_lock(&vdev->ioeventfds_lock);
 | |
| 
 | |
| 	list_for_each_entry(ioeventfd, &vdev->ioeventfds_list, next) {
 | |
| 		if (ioeventfd->pos == pos && ioeventfd->bar == bar &&
 | |
| 		    ioeventfd->data == data && ioeventfd->count == count) {
 | |
| 			if (fd == -1) {
 | |
| 				vfio_virqfd_disable(&ioeventfd->virqfd);
 | |
| 				list_del(&ioeventfd->next);
 | |
| 				vdev->ioeventfds_nr--;
 | |
| 				kfree(ioeventfd);
 | |
| 				ret = 0;
 | |
| 			} else
 | |
| 				ret = -EEXIST;
 | |
| 
 | |
| 			goto out_unlock;
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	if (fd < 0) {
 | |
| 		ret = -ENODEV;
 | |
| 		goto out_unlock;
 | |
| 	}
 | |
| 
 | |
| 	if (vdev->ioeventfds_nr >= VFIO_PCI_IOEVENTFD_MAX) {
 | |
| 		ret = -ENOSPC;
 | |
| 		goto out_unlock;
 | |
| 	}
 | |
| 
 | |
| 	ioeventfd = kzalloc(sizeof(*ioeventfd), GFP_KERNEL);
 | |
| 	if (!ioeventfd) {
 | |
| 		ret = -ENOMEM;
 | |
| 		goto out_unlock;
 | |
| 	}
 | |
| 
 | |
| 	ioeventfd->addr = vdev->barmap[bar] + pos;
 | |
| 	ioeventfd->data = data;
 | |
| 	ioeventfd->pos = pos;
 | |
| 	ioeventfd->bar = bar;
 | |
| 	ioeventfd->count = count;
 | |
| 
 | |
| 	ret = vfio_virqfd_enable(ioeventfd, vfio_pci_ioeventfd_handler,
 | |
| 				 NULL, NULL, &ioeventfd->virqfd, fd);
 | |
| 	if (ret) {
 | |
| 		kfree(ioeventfd);
 | |
| 		goto out_unlock;
 | |
| 	}
 | |
| 
 | |
| 	list_add(&ioeventfd->next, &vdev->ioeventfds_list);
 | |
| 	vdev->ioeventfds_nr++;
 | |
| 
 | |
| out_unlock:
 | |
| 	mutex_unlock(&vdev->ioeventfds_lock);
 | |
| 
 | |
| 	return ret;
 | |
| }
 |