From 272dfe37a0026fafdb7cc31148c38fa4995c18a8 Mon Sep 17 00:00:00 2001 From: Justin M. Forbes Date: Feb 09 2010 17:10:07 +0000 Subject: Add vhost net support. --- diff --git a/qemu-exec-memory-notifiers.patch b/qemu-exec-memory-notifiers.patch new file mode 100644 index 0000000..09c0ea9 --- /dev/null +++ b/qemu-exec-memory-notifiers.patch @@ -0,0 +1,199 @@ +This adds notifiers for phys memory changes: a set of callbacks that +vhost can register and update kernel accordingly. Down the road, kvm +code can be switched to use these as well, instead of calling kvm code +directly from exec.c as is done now. + +Signed-off-by: Michael S. Tsirkin +--- + cpu-common.h | 19 ++++++++++ + exec.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- + 2 files changed, 130 insertions(+), 3 deletions(-) + +diff --git a/cpu-common.h b/cpu-common.h +index 5e59564..326513d 100644 +--- a/cpu-common.h ++++ b/cpu-common.h +@@ -8,6 +8,7 @@ + #endif + + #include "bswap.h" ++#include "qemu-queue.h" + + /* address in the RAM (different from a physical address) */ + typedef unsigned long ram_addr_t; +@@ -62,6 +63,24 @@ void cpu_physical_memory_unmap(void *buffer, target_phys_addr_t len, + void *cpu_register_map_client(void *opaque, void (*callback)(void *opaque)); + void cpu_unregister_map_client(void *cookie); + ++struct CPUPhysMemoryClient; ++typedef struct CPUPhysMemoryClient CPUPhysMemoryClient; ++struct CPUPhysMemoryClient { ++ void (*set_memory)(struct CPUPhysMemoryClient *client, ++ target_phys_addr_t start_addr, ++ ram_addr_t size, ++ ram_addr_t phys_offset); ++ int (*sync_dirty_bitmap)(struct CPUPhysMemoryClient *client, ++ target_phys_addr_t start_addr, ++ target_phys_addr_t end_addr); ++ int (*migration_log)(struct CPUPhysMemoryClient *client, ++ int enable); ++ QLIST_ENTRY(CPUPhysMemoryClient) list; ++}; ++ ++void cpu_register_phys_memory_client(CPUPhysMemoryClient *); ++void cpu_unregister_phys_memory_client(CPUPhysMemoryClient *); ++ + uint32_t 
ldub_phys(target_phys_addr_t addr); + uint32_t lduw_phys(target_phys_addr_t addr); + uint32_t ldl_phys(target_phys_addr_t addr); +diff --git a/exec.c b/exec.c +index 8f873ab..cbba15e 100644 +--- a/exec.c ++++ b/exec.c +@@ -1640,6 +1640,101 @@ const CPULogItem cpu_log_items[] = { + { 0, NULL, NULL }, + }; + ++#ifndef CONFIG_USER_ONLY ++static QLIST_HEAD(memory_client_list, CPUPhysMemoryClient) memory_client_list ++ = QLIST_HEAD_INITIALIZER(memory_client_list); ++ ++static void cpu_notify_set_memory(target_phys_addr_t start_addr, ++ ram_addr_t size, ++ ram_addr_t phys_offset) ++{ ++ CPUPhysMemoryClient *client; ++ QLIST_FOREACH(client, &memory_client_list, list) { ++ client->set_memory(client, start_addr, size, phys_offset); ++ } ++} ++ ++static int cpu_notify_sync_dirty_bitmap(target_phys_addr_t start, ++ target_phys_addr_t end) ++{ ++ CPUPhysMemoryClient *client; ++ QLIST_FOREACH(client, &memory_client_list, list) { ++ int r = client->sync_dirty_bitmap(client, start, end); ++ if (r < 0) ++ return r; ++ } ++ return 0; ++} ++ ++static int cpu_notify_migration_log(int enable) ++{ ++ CPUPhysMemoryClient *client; ++ QLIST_FOREACH(client, &memory_client_list, list) { ++ int r = client->migration_log(client, enable); ++ if (r < 0) ++ return r; ++ } ++ return 0; ++} ++ ++static void phys_page_for_each_in_l1_map(PhysPageDesc **phys_map, ++ CPUPhysMemoryClient *client) ++{ ++ PhysPageDesc *pd; ++ int l1, l2; ++ ++ for (l1 = 0; l1 < L1_SIZE; ++l1) { ++ pd = phys_map[l1]; ++ if (!pd) { ++ continue; ++ } ++ for (l2 = 0; l2 < L2_SIZE; ++l2) { ++ if (pd[l2].phys_offset == IO_MEM_UNASSIGNED) { ++ continue; ++ } ++ client->set_memory(client, pd[l2].region_offset, ++ TARGET_PAGE_SIZE, pd[l2].phys_offset); ++ } ++ } ++} ++ ++static void phys_page_for_each(CPUPhysMemoryClient *client) ++{ ++#if TARGET_PHYS_ADDR_SPACE_BITS > 32 ++ ++#if TARGET_PHYS_ADDR_SPACE_BITS > (32 + L1_BITS) ++#error unsupported TARGET_PHYS_ADDR_SPACE_BITS ++#endif ++ void **phys_map = (void **)l1_phys_map; ++ 
int l1; ++ if (!l1_phys_map) { ++ return; ++ } ++ for (l1 = 0; l1 < L1_SIZE; ++l1) { ++ if (phys_map[l1]) { ++ phys_page_for_each_in_l1_map(phys_map[l1], client); ++ } ++ } ++#else ++ if (!l1_phys_map) { ++ return; ++ } ++ phys_page_for_each_in_l1_map(l1_phys_map, client); ++#endif ++} ++ ++void cpu_register_phys_memory_client(CPUPhysMemoryClient *client) ++{ ++ QLIST_INSERT_HEAD(&memory_client_list, client, list); ++ phys_page_for_each(client); ++} ++ ++void cpu_unregister_phys_memory_client(CPUPhysMemoryClient *client) ++{ ++ QLIST_REMOVE(client, list); ++} ++#endif ++ + static int cmp1(const char *s1, int n, const char *s2) + { + if (strlen(s2) != n) +@@ -1899,10 +1994,16 @@ void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end, + + int cpu_physical_memory_set_dirty_tracking(int enable) + { ++ int ret = 0; ++ in_migration = enable; + if (kvm_enabled()) { +- return kvm_set_migration_log(enable); ++ ret = kvm_set_migration_log(enable); + } +- return 0; ++ if (ret < 0) { ++ return ret; ++ } ++ ret = cpu_notify_migration_log(!!enable); ++ return ret; + } + + int cpu_physical_memory_get_dirty_tracking(void) +@@ -1915,8 +2016,13 @@ int cpu_physical_sync_dirty_bitmap(target_phys_addr_t start_addr, + { + int ret = 0; + +- if (kvm_enabled()) ++ if (kvm_enabled()) { + ret = kvm_physical_sync_dirty_bitmap(start_addr, end_addr); ++ } ++ if (ret < 0) { ++ return ret; ++ } ++ ret = cpu_notify_sync_dirty_bitmap(start_addr, end_addr); + return ret; + } + +@@ -2331,6 +2437,8 @@ void cpu_register_physical_memory_offset(target_phys_addr_t start_addr, + if (kvm_enabled()) + kvm_set_phys_mem(start_addr, size, phys_offset); + ++ cpu_notify_set_memory(start_addr, size, phys_offset); ++ + if (phys_offset == IO_MEM_UNASSIGNED) { + region_offset = start_addr; + } +-- +1.6.6.144.g5c3af diff --git a/qemu-kvm-add-API-to-set-ioeventfd.patch b/qemu-kvm-add-API-to-set-ioeventfd.patch new file mode 100644 index 0000000..365bae8 --- /dev/null +++ 
b/qemu-kvm-add-API-to-set-ioeventfd.patch @@ -0,0 +1,81 @@ +This adds API to set ioeventfd to kvm, +as well as stubs for non-eventfd case, +making it possible for users to use this API +without ifdefs. + +Signed-off-by: Michael S. Tsirkin +--- + kvm-all.c | 20 ++++++++++++++++++++ + kvm.h | 16 ++++++++++++++++ + 2 files changed, 36 insertions(+), 0 deletions(-) + +diff --git a/kvm-all.c b/kvm-all.c +index 0423fff..efdf40c 100644 +--- a/kvm-all.c ++++ b/kvm-all.c +@@ -1102,4 +1102,24 @@ void kvm_remove_all_breakpoints(CPUState *current_env) + } + #endif /* !KVM_CAP_SET_GUEST_DEBUG */ + ++#ifdef KVM_IOEVENTFD ++int kvm_set_ioeventfd(uint16_t addr, uint16_t data, int fd, bool assigned) ++{ ++ struct kvm_ioeventfd kick = { ++ .datamatch = data, ++ .addr = addr, ++ .len = 2, ++ .flags = KVM_IOEVENTFD_FLAG_DATAMATCH | KVM_IOEVENTFD_FLAG_PIO, ++ .fd = fd, ++ }; ++ int r; ++ if (!assigned) ++ kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN; ++ r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick); ++ if (r < 0) ++ return r; ++ return 0; ++} ++#endif ++ + #include "qemu-kvm.c" +diff --git a/kvm.h b/kvm.h +index 9fa4e25..e98b5c8 100644 +--- a/kvm.h ++++ b/kvm.h +@@ -14,6 +14,8 @@ + #ifndef QEMU_KVM_H + #define QEMU_KVM_H + ++#include ++#include + #include "config.h" + #include "qemu-queue.h" + #include "qemu-kvm.h" +@@ -21,6 +23,10 @@ + #ifdef KVM_UPSTREAM + + #ifdef CONFIG_KVM ++#include ++#endif ++ ++#ifdef CONFIG_KVM + extern int kvm_allowed; + + #define kvm_enabled() (kvm_allowed) +@@ -151,4 +157,14 @@ static inline void cpu_synchronize_state(CPUState *env) + + #endif + ++#if defined(KVM_IOEVENTFD) && defined(CONFIG_KVM) ++int kvm_set_ioeventfd(uint16_t addr, uint16_t data, int fd, bool assigned); ++#else ++static inline ++int kvm_set_ioeventfd(uint16_t data, uint16_t addr, int fd, bool assigned) ++{ ++ return -ENOSYS; ++} ++#endif ++ + #endif +-- +1.6.6.144.g5c3af diff --git a/qemu-kvm-add-vhost.h-header.patch b/qemu-kvm-add-vhost.h-header.patch new file mode 100644 index 
0000000..e820504 --- /dev/null +++ b/qemu-kvm-add-vhost.h-header.patch @@ -0,0 +1,147 @@ +This makes it possible to build vhost support +on systems which do not have this header. + +Signed-off-by: Michael S. Tsirkin +--- + kvm/include/linux/vhost.h | 130 +++++++++++++++++++++++++++++++++++++++++++++ + 1 files changed, 130 insertions(+), 0 deletions(-) + create mode 100644 kvm/include/linux/vhost.h + +diff --git a/kvm/include/linux/vhost.h b/kvm/include/linux/vhost.h +new file mode 100644 +index 0000000..165a484 +--- /dev/null ++++ b/kvm/include/linux/vhost.h +@@ -0,0 +1,130 @@ ++#ifndef _LINUX_VHOST_H ++#define _LINUX_VHOST_H ++/* Userspace interface for in-kernel virtio accelerators. */ ++ ++/* vhost is used to reduce the number of system calls involved in virtio. ++ * ++ * Existing virtio net code is used in the guest without modification. ++ * ++ * This header includes interface used by userspace hypervisor for ++ * device configuration. ++ */ ++ ++#include ++ ++#include ++#include ++#include ++ ++struct vhost_vring_state { ++ unsigned int index; ++ unsigned int num; ++}; ++ ++struct vhost_vring_file { ++ unsigned int index; ++ int fd; /* Pass -1 to unbind from file. */ ++ ++}; ++ ++struct vhost_vring_addr { ++ unsigned int index; ++ /* Option flags. */ ++ unsigned int flags; ++ /* Flag values: */ ++ /* Whether log address is valid. If set enables logging. */ ++#define VHOST_VRING_F_LOG 0 ++ ++ /* Start of array of descriptors (virtually contiguous) */ ++ __u64 desc_user_addr; ++ /* Used structure address. Must be 32 bit aligned */ ++ __u64 used_user_addr; ++ /* Available structure address. Must be 16 bit aligned */ ++ __u64 avail_user_addr; ++ /* Logging support. */ ++ /* Log writes to used structure, at offset calculated from specified ++ * address. Address must be 32 bit aligned. 
*/ ++ __u64 log_guest_addr; ++}; ++ ++struct vhost_memory_region { ++ __u64 guest_phys_addr; ++ __u64 memory_size; /* bytes */ ++ __u64 userspace_addr; ++ __u64 flags_padding; /* No flags are currently specified. */ ++}; ++ ++/* All region addresses and sizes must be 4K aligned. */ ++#define VHOST_PAGE_SIZE 0x1000 ++ ++struct vhost_memory { ++ __u32 nregions; ++ __u32 padding; ++ struct vhost_memory_region regions[0]; ++}; ++ ++/* ioctls */ ++ ++#define VHOST_VIRTIO 0xAF ++ ++/* Features bitmask for forward compatibility. Transport bits are used for ++ * vhost specific features. */ ++#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64) ++#define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64) ++ ++/* Set current process as the (exclusive) owner of this file descriptor. This ++ * must be called before any other vhost command. Further calls to ++ * VHOST_SET_OWNER fail until VHOST_RESET_OWNER is called. */ ++#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01) ++/* Give up ownership, and reset the device to default values. ++ * Allows subsequent call to VHOST_SET_OWNER to succeed. */ ++#define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02) ++ ++/* Set up/modify memory layout */ ++#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory) ++ ++/* Write logging setup. */ ++/* Memory writes can optionally be logged by setting bit at an offset ++ * (calculated from the physical address) from specified log base. ++ * The bit is set using an atomic 32 bit operation. */ ++/* Set base address for logging. */ ++#define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64) ++/* Specify an eventfd file descriptor to signal on log write. */ ++#define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int) ++ ++/* Ring setup. */ ++/* Set number of descriptors in ring. This parameter can not ++ * be modified while ring is running (bound to a device). */ ++#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state) ++/* Set addresses for the ring.
*/ ++#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr) ++/* Base value where queue looks for available descriptors */ ++#define VHOST_SET_VRING_BASE _IOW(VHOST_VIRTIO, 0x12, struct vhost_vring_state) ++/* Get accessor: reads index, writes value in num */ ++#define VHOST_GET_VRING_BASE _IOWR(VHOST_VIRTIO, 0x12, struct vhost_vring_state) ++ ++/* The following ioctls use eventfd file descriptors to signal and poll ++ * for events. */ ++ ++/* Set eventfd to poll for added buffers */ ++#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file) ++/* Set eventfd to signal when buffers have been used */ ++#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file) ++/* Set eventfd to signal an error */ ++#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file) ++ ++/* VHOST_NET specific defines */ ++ ++/* Attach virtio net ring to a raw socket, or tap device. ++ * The socket must be already bound to an ethernet device, this device will be ++ * used for transmit. Pass fd -1 to unbind from the socket and the transmit ++ * device. This can be used to stop the ring (e.g. for migration). */ ++#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file) ++ ++/* Feature bits */ ++/* Log all write descriptors. Can be changed while device is active. */ ++#define VHOST_F_LOG_ALL 26 ++/* vhost-net should add virtio_net_hdr for RX, and strip for TX packets. */ ++#define VHOST_NET_F_VIRTIO_NET_HDR 27 ++ ++#endif +-- +1.6.6.144.g5c3af diff --git a/qemu-kvm-irqfd-support.patch b/qemu-kvm-irqfd-support.patch new file mode 100644 index 0000000..4f74d87 --- /dev/null +++ b/qemu-kvm-irqfd-support.patch @@ -0,0 +1,59 @@ +Add API to assign/deassign irqfd to kvm. +Add stub so that users do not have to use +ifdefs. + +Signed-off-by: Michael S.
Tsirkin +--- + kvm-all.c | 19 +++++++++++++++++++ + kvm.h | 10 ++++++++++ + 2 files changed, 29 insertions(+), 0 deletions(-) + +diff --git a/kvm-all.c b/kvm-all.c +index efdf40c..b3fdf29 100644 +--- a/kvm-all.c ++++ b/kvm-all.c +@@ -1122,4 +1122,23 @@ int kvm_set_ioeventfd(uint16_t addr, uint16_t data, int fd, bool assigned) + } + #endif + ++#if defined(KVM_IRQFD) ++int kvm_set_irqfd(int gsi, int fd, bool assigned) ++{ ++ struct kvm_irqfd irqfd = { ++ .fd = fd, ++ .gsi = gsi, ++ .flags = assigned ? 0 : KVM_IRQFD_FLAG_DEASSIGN, ++ }; ++ int r; ++ if (!kvm_irqchip_in_kernel()) ++ return -ENOSYS; ++ ++ r = kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd); ++ if (r < 0) ++ return r; ++ return 0; ++} ++#endif ++ + #include "qemu-kvm.c" +diff --git a/kvm.h b/kvm.h +index e98b5c8..ad8d122 100644 +--- a/kvm.h ++++ b/kvm.h +@@ -167,4 +167,14 @@ int kvm_set_ioeventfd(uint16_t data, uint16_t addr, int fd, bool assigned) + } + #endif + ++#if defined(KVM_IRQFD) && defined(CONFIG_KVM) ++int kvm_set_irqfd(int gsi, int fd, bool assigned); ++#else ++static inline ++int kvm_set_irqfd(int gsi, int fd, bool assigned) ++{ ++ return -ENOSYS; ++} ++#endif ++ + #endif +-- +1.6.6.144.g5c3af diff --git a/qemu-msix-add-mask-unmask-notifiers.patch b/qemu-msix-add-mask-unmask-notifiers.patch new file mode 100644 index 0000000..845bbc0 --- /dev/null +++ b/qemu-msix-add-mask-unmask-notifiers.patch @@ -0,0 +1,121 @@ +Support per-vector callbacks for msix mask/unmask. +Will be used for vhost net. + +Signed-off-by: Michael S. 
Tsirkin +--- + hw/msix.c | 36 +++++++++++++++++++++++++++++++++++- + hw/msix.h | 1 + + hw/pci.h | 6 ++++++ + 3 files changed, 42 insertions(+), 1 deletions(-) + +diff --git a/hw/msix.c b/hw/msix.c +index d117bcf..3fcf3a1 100644 +--- a/hw/msix.c ++++ b/hw/msix.c +@@ -318,6 +318,13 @@ static void msix_mmio_writel(void *opaque, target_phys_addr_t addr, + if (kvm_enabled() && kvm_irqchip_in_kernel()) { + kvm_msix_update(dev, vector, was_masked, msix_is_masked(dev, vector)); + } ++ if (was_masked != msix_is_masked(dev, vector) && ++ dev->msix_mask_notifier && dev->msix_mask_notifier_opaque[vector]) { ++ int r = dev->msix_mask_notifier(dev, vector, ++ dev->msix_mask_notifier_opaque[vector], ++ msix_is_masked(dev, vector)); ++ assert(r >= 0); ++ } + msix_handle_mask_update(dev, vector); + } + +@@ -356,10 +363,18 @@ void msix_mmio_map(PCIDevice *d, int region_num, + + static void msix_mask_all(struct PCIDevice *dev, unsigned nentries) + { +- int vector; ++ int vector, r; + for (vector = 0; vector < nentries; ++vector) { + unsigned offset = vector * MSIX_ENTRY_SIZE + MSIX_VECTOR_CTRL; ++ int was_masked = msix_is_masked(dev, vector); + dev->msix_table_page[offset] |= MSIX_VECTOR_MASK; ++ if (was_masked != msix_is_masked(dev, vector) && ++ dev->msix_mask_notifier && dev->msix_mask_notifier_opaque[vector]) { ++ r = dev->msix_mask_notifier(dev, vector, ++ dev->msix_mask_notifier_opaque[vector], ++ msix_is_masked(dev, vector)); ++ assert(r >= 0); ++ } + } + } + +@@ -382,6 +397,9 @@ int msix_init(struct PCIDevice *dev, unsigned short nentries, + sizeof *dev->msix_irq_entries); + } + #endif ++ dev->msix_mask_notifier_opaque = ++ qemu_mallocz(nentries * sizeof *dev->msix_mask_notifier_opaque); ++ dev->msix_mask_notifier = NULL; + dev->msix_entry_used = qemu_mallocz(MSIX_MAX_ENTRIES * + sizeof *dev->msix_entry_used); + +@@ -444,6 +462,8 @@ int msix_uninit(PCIDevice *dev) + dev->msix_entry_used = NULL; + qemu_free(dev->msix_irq_entries); + dev->msix_irq_entries = NULL; ++ 
qemu_free(dev->msix_mask_notifier_opaque); ++ dev->msix_mask_notifier_opaque = NULL; + dev->cap_present &= ~QEMU_PCI_CAP_MSIX; + return 0; + } +@@ -587,3 +607,17 @@ void msix_unuse_all_vectors(PCIDevice *dev) + return; + msix_free_irq_entries(dev); + } ++ ++int msix_set_mask_notifier(PCIDevice *dev, unsigned vector, void *opaque) ++{ ++ int r = 0; ++ if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector]) ++ return 0; ++ ++ if (dev->msix_mask_notifier) ++ r = dev->msix_mask_notifier(dev, vector, opaque, ++ msix_is_masked(dev, vector)); ++ if (r >= 0) ++ dev->msix_mask_notifier_opaque[vector] = opaque; ++ return r; ++} +diff --git a/hw/msix.h b/hw/msix.h +index a9f7993..f167231 100644 +--- a/hw/msix.h ++++ b/hw/msix.h +@@ -33,4 +33,5 @@ void msix_reset(PCIDevice *dev); + + extern int msix_supported; + ++int msix_set_mask_notifier(PCIDevice *dev, unsigned vector, void *opaque); + #endif +diff --git a/hw/pci.h b/hw/pci.h +index a225a6a..bf722ca 100644 +--- a/hw/pci.h ++++ b/hw/pci.h +@@ -217,6 +217,9 @@ enum { + #define PCI_CAPABILITY_CONFIG_MSI_LENGTH 0x10 + #define PCI_CAPABILITY_CONFIG_MSIX_LENGTH 0x10 + ++typedef int (*msix_mask_notifier_func)(PCIDevice *, unsigned vector, ++ void *opaque, int masked); ++ + struct PCIDevice { + DeviceState qdev; + /* PCI config space */ +@@ -282,6 +285,9 @@ struct PCIDevice { + + struct kvm_irq_routing_entry *msix_irq_entries; + ++ void **msix_mask_notifier_opaque; ++ msix_mask_notifier_func msix_mask_notifier; ++ + /* Device capability configuration space */ + struct { + int supported; +-- +1.6.6.144.g5c3af diff --git a/qemu-net-add-API-to-disable-enable-polling.patch b/qemu-net-add-API-to-disable-enable-polling.patch new file mode 100644 index 0000000..d41adef --- /dev/null +++ b/qemu-net-add-API-to-disable-enable-polling.patch @@ -0,0 +1,68 @@ +When vhost is bound to a backend device, we need to stop polling it when +vhost is started, and restart polling when vhost is stopped. 
+Add an API for that for use by vhost, and implement in tap backend. + +Signed-off-by: Michael S. Tsirkin +Signed-off-by: Anthony Liguori +(cherry picked from commit ceb696159d569db5b2a7659ce38752398c236742) +--- + net.h | 3 +++ + net/tap.c | 8 ++++++++ + 2 files changed, 11 insertions(+), 0 deletions(-) + +diff --git a/net.h b/net.h +index 4971fcb..116bb80 100644 +--- a/net.h ++++ b/net.h +@@ -1,6 +1,7 @@ + #ifndef QEMU_NET_H + #define QEMU_NET_H + ++#include + #include "qemu-queue.h" + #include "qemu-common.h" + #include "qdict.h" +@@ -36,6 +37,7 @@ typedef enum { + NET_CLIENT_TYPE_DUMP + } net_client_type; + ++typedef void (NetPoll)(VLANClientState *, bool enable); + typedef int (NetCanReceive)(VLANClientState *); + typedef ssize_t (NetReceive)(VLANClientState *, const uint8_t *, size_t); + typedef ssize_t (NetReceiveIOV)(VLANClientState *, const struct iovec *, int); +@@ -51,6 +53,7 @@ typedef struct NetClientInfo { + NetCanReceive *can_receive; + NetCleanup *cleanup; + LinkStatusChanged *link_status_changed; ++ NetPoll *poll; + } NetClientInfo; + + struct VLANClientState { +diff --git a/net/tap.c b/net/tap.c +index 0d8b424..d3492de 100644 +--- a/net/tap.c ++++ b/net/tap.c +@@ -262,6 +262,13 @@ static void tap_cleanup(VLANClientState *nc) + close(s->fd); + } + ++static void tap_poll(VLANClientState *nc, bool enable) ++{ ++ TAPState *s = DO_UPCAST(TAPState, nc, nc); ++ tap_read_poll(s, enable); ++ tap_write_poll(s, enable); ++} ++ + /* fd support */ + + static NetClientInfo net_tap_info = { +@@ -270,6 +277,7 @@ static NetClientInfo net_tap_info = { + .receive = tap_receive, + .receive_raw = tap_receive_raw, + .receive_iov = tap_receive_iov, ++ .poll = tap_poll, + .cleanup = tap_cleanup, + }; + +-- +1.6.6.144.g5c3af diff --git a/qemu-notifier-event-notifier-implementation.patch b/qemu-notifier-event-notifier-implementation.patch new file mode 100644 index 0000000..b5c2d27 --- /dev/null +++ b/qemu-notifier-event-notifier-implementation.patch @@ -0,0 +1,122 @@ 
+event notifiers are slightly generalized eventfd descriptors. Current +implementation depends on eventfd because vhost is the only user, and +vhost depends on eventfd anyway, but a stub is provided for non-eventfd +case. + +We'll be able to further generalize this when another user comes along +and we see how to best do this. + +Signed-off-by: Michael S. Tsirkin +--- + Makefile.target | 1 + + hw/notifier.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ + hw/notifier.h | 16 ++++++++++++++++ + qemu-common.h | 1 + + 4 files changed, 68 insertions(+), 0 deletions(-) + create mode 100644 hw/notifier.c + create mode 100644 hw/notifier.h + +diff --git a/Makefile.target b/Makefile.target +index 6037fed..0c844a9 100644 +--- a/Makefile.target ++++ b/Makefile.target +@@ -167,6 +167,7 @@ obj-y = vl.o async.o monitor.o pci.o pci_host.o pcie_host.o machine.o gdbstub.o + # virtio has to be here due to weird dependency between PCI and virtio-net. + # need to fix this properly + obj-y += virtio-blk.o virtio-balloon.o virtio-net.o virtio-pci.o virtio-serial-bus.o ++obj-y += notifier.o + obj-$(CONFIG_KVM) += kvm.o kvm-all.o + # MSI-X depends on kvm for interrupt injection, + # so moved it from Makefile.hw to Makefile.target for now +diff --git a/hw/notifier.c b/hw/notifier.c +new file mode 100644 +index 0000000..dff38de +--- /dev/null ++++ b/hw/notifier.c +@@ -0,0 +1,50 @@ ++#include "hw.h" ++#include "notifier.h" ++#ifdef CONFIG_EVENTFD ++#include ++#endif ++ ++int event_notifier_init(EventNotifier *e, int active) ++{ ++#ifdef CONFIG_EVENTFD ++ int fd = eventfd(!!active, EFD_NONBLOCK | EFD_CLOEXEC); ++ if (fd < 0) ++ return -errno; ++ e->fd = fd; ++ return 0; ++#else ++ return -ENOSYS; ++#endif ++} ++ ++void event_notifier_cleanup(EventNotifier *e) ++{ ++ close(e->fd); ++} ++ ++int event_notifier_get_fd(EventNotifier *e) ++{ ++ return e->fd; ++} ++ ++int event_notifier_test_and_clear(EventNotifier *e) ++{ ++ uint64_t value; ++ int r = read(e->fd, &value, sizeof value); ++ 
return r == sizeof value; ++} ++ ++int event_notifier_test(EventNotifier *e) ++{ ++ uint64_t value; ++ int r = read(e->fd, &value, sizeof value); ++ if (r == sizeof value) { ++ /* restore previous value. */ ++ int s = write(e->fd, &value, sizeof value); ++ /* never blocks because we use EFD_SEMAPHORE. ++ * If we didn't we'd get EAGAIN on overflow ++ * and we'd have to write code to ignore it. */ ++ assert(s == sizeof value); ++ } ++ return r == sizeof value; ++} +diff --git a/hw/notifier.h b/hw/notifier.h +new file mode 100644 +index 0000000..24117ea +--- /dev/null ++++ b/hw/notifier.h +@@ -0,0 +1,16 @@ ++#ifndef QEMU_EVENT_NOTIFIER_H ++#define QEMU_EVENT_NOTIFIER_H ++ ++#include "qemu-common.h" ++ ++struct EventNotifier { ++ int fd; ++}; ++ ++int event_notifier_init(EventNotifier *, int active); ++void event_notifier_cleanup(EventNotifier *); ++int event_notifier_get_fd(EventNotifier *); ++int event_notifier_test_and_clear(EventNotifier *); ++int event_notifier_test(EventNotifier *); ++ ++#endif +diff --git a/qemu-common.h b/qemu-common.h +index 5fbe0f9..cdead98 100644 +--- a/qemu-common.h ++++ b/qemu-common.h +@@ -217,6 +217,7 @@ typedef struct uWireSlave uWireSlave; + typedef struct I2SCodec I2SCodec; + typedef struct DeviceState DeviceState; + typedef struct SSIBus SSIBus; ++typedef struct EventNotifier EventNotifier; + + /* CPU save/load. */ + void cpu_save(QEMUFile *f, void *opaque); +-- +1.6.6.144.g5c3af diff --git a/qemu-qdev-add-bit-property-type.patch b/qemu-qdev-add-bit-property-type.patch new file mode 100644 index 0000000..35a5958 --- /dev/null +++ b/qemu-qdev-add-bit-property-type.patch @@ -0,0 +1,156 @@ +This adds "bit" property type, which is a boolean stored in a 32 bit +integer field, with legal values on and off. Will be used by virtio for +feature bits. + +Signed-off-by: Michael S. 
Tsirkin +Acked-by: Gerd Hoffmann +Signed-off-by: Anthony Liguori +(cherry picked from commit d2364ee424ebf9180afaf21128a71da55321ad00) +--- + hw/qdev-properties.c | 62 ++++++++++++++++++++++++++++++++++++++++++++----- + hw/qdev.h | 11 +++++++++ + 2 files changed, 66 insertions(+), 7 deletions(-) + +diff --git a/hw/qdev-properties.c b/hw/qdev-properties.c +index 217ddc0..9e123ae 100644 +--- a/hw/qdev-properties.c ++++ b/hw/qdev-properties.c +@@ -9,6 +9,59 @@ void *qdev_get_prop_ptr(DeviceState *dev, Property *prop) + return ptr; + } + ++static uint32_t qdev_get_prop_mask(Property *prop) ++{ ++ assert(prop->info->type == PROP_TYPE_BIT); ++ return 0x1 << prop->bitnr; ++} ++ ++static void bit_prop_set(DeviceState *dev, Property *props, bool val) ++{ ++ uint32_t *p = qdev_get_prop_ptr(dev, props); ++ uint32_t mask = qdev_get_prop_mask(props); ++ if (val) ++ *p |= ~mask; ++ else ++ *p &= ~mask; ++} ++ ++static void qdev_prop_cpy(DeviceState *dev, Property *props, void *src) ++{ ++ if (props->info->type == PROP_TYPE_BIT) { ++ bool *defval = src; ++ bit_prop_set(dev, props, *defval); ++ } else { ++ char *dst = qdev_get_prop_ptr(dev, props); ++ memcpy(dst, src, props->info->size); ++ } ++} ++ ++/* Bit */ ++static int parse_bit(DeviceState *dev, Property *prop, const char *str) ++{ ++ if (!strncasecmp(str, "on", 2)) ++ bit_prop_set(dev, prop, true); ++ else if (!strncasecmp(str, "off", 3)) ++ bit_prop_set(dev, prop, false); ++ else ++ return -1; ++ return 0; ++} ++ ++static int print_bit(DeviceState *dev, Property *prop, char *dest, size_t len) ++{ ++ uint8_t *p = qdev_get_prop_ptr(dev, prop); ++ return snprintf(dest, len, (*p & qdev_get_prop_mask(prop)) ? 
"on" : "off"); ++} ++ ++PropertyInfo qdev_prop_bit = { ++ .name = "on/off", ++ .type = PROP_TYPE_BIT, ++ .size = sizeof(uint32_t), ++ .parse = parse_bit, ++ .print = print_bit, ++}; ++ + /* --- 8bit integer --- */ + + static int parse_uint8(DeviceState *dev, Property *prop, const char *str) +@@ -511,7 +564,6 @@ int qdev_prop_parse(DeviceState *dev, const char *name, const char *value) + void qdev_prop_set(DeviceState *dev, const char *name, void *src, enum PropertyType type) + { + Property *prop; +- void *dst; + + prop = qdev_prop_find(dev, name); + if (!prop) { +@@ -524,8 +576,7 @@ void qdev_prop_set(DeviceState *dev, const char *name, void *src, enum PropertyT + __FUNCTION__, dev->info->name, name); + abort(); + } +- dst = qdev_get_prop_ptr(dev, prop); +- memcpy(dst, src, prop->info->size); ++ qdev_prop_cpy(dev, prop, src); + } + + void qdev_prop_set_uint8(DeviceState *dev, const char *name, uint8_t value) +@@ -585,14 +636,11 @@ void qdev_prop_set_ptr(DeviceState *dev, const char *name, void *value) + + void qdev_prop_set_defaults(DeviceState *dev, Property *props) + { +- char *dst; +- + if (!props) + return; + while (props->name) { + if (props->defval) { +- dst = qdev_get_prop_ptr(dev, props); +- memcpy(dst, props->defval, props->info->size); ++ qdev_prop_cpy(dev, props, props->defval); + } + props++; + } +diff --git a/hw/qdev.h b/hw/qdev.h +index bbcdba1..07b9603 100644 +--- a/hw/qdev.h ++++ b/hw/qdev.h +@@ -64,6 +64,7 @@ struct Property { + const char *name; + PropertyInfo *info; + int offset; ++ int bitnr; + void *defval; + }; + +@@ -82,6 +83,7 @@ enum PropertyType { + PROP_TYPE_NETDEV, + PROP_TYPE_VLAN, + PROP_TYPE_PTR, ++ PROP_TYPE_BIT, + }; + + struct PropertyInfo { +@@ -173,6 +175,7 @@ void do_device_del(Monitor *mon, const QDict *qdict); + + /*** qdev-properties.c ***/ + ++extern PropertyInfo qdev_prop_bit; + extern PropertyInfo qdev_prop_uint8; + extern PropertyInfo qdev_prop_uint16; + extern PropertyInfo qdev_prop_uint32; +@@ -202,6 +205,14 @@ extern 
PropertyInfo qdev_prop_pci_devfn; + + type_check(_type,typeof_field(_state, _field)), \ + .defval = (_type[]) { _defval }, \ + } ++#define DEFINE_PROP_BIT(_name, _state, _field, _bit, _defval) { \ ++ .name = (_name), \ ++ .info = &(qdev_prop_bit), \ ++ .bitnr = (_bit), \ ++ .offset = offsetof(_state, _field) \ ++ + type_check(uint32_t,typeof_field(_state, _field)), \ ++ .defval = (bool[]) { (_defval) }, \ ++ } + + #define DEFINE_PROP_UINT8(_n, _s, _f, _d) \ + DEFINE_PROP_DEFAULT(_n, _s, _f, _d, qdev_prop_uint8, uint8_t) +-- +1.6.6.144.g5c3af diff --git a/qemu-qdev-fix-thinko-leading-to-guest-crashes.patch b/qemu-qdev-fix-thinko-leading-to-guest-crashes.patch new file mode 100644 index 0000000..f6ed7dc --- /dev/null +++ b/qemu-qdev-fix-thinko-leading-to-guest-crashes.patch @@ -0,0 +1,24 @@ +Without this fix, guest crashes with drive=virtio. + +Signed-off-by: Michael S. Tsirkin +Signed-off-by: Anthony Liguori +(cherry picked from commit dbd483242c2e6dfaacb9fd3d20c333bbdad87243) +--- + hw/qdev-properties.c | 2 +- + 1 files changed, 1 insertions(+), 1 deletions(-) + +diff --git a/hw/qdev-properties.c b/hw/qdev-properties.c +index 9e123ae..277ff9e 100644 +--- a/hw/qdev-properties.c ++++ b/hw/qdev-properties.c +@@ -20,7 +20,7 @@ static void bit_prop_set(DeviceState *dev, Property *props, bool val) + uint32_t *p = qdev_get_prop_ptr(dev, props); + uint32_t mask = qdev_get_prop_mask(props); + if (val) +- *p |= ~mask; ++ *p |= mask; + else + *p &= ~mask; + } +-- +1.6.6.144.g5c3af diff --git a/qemu-tap-add-API-to-retrieve-vhost-net-header.patch b/qemu-tap-add-API-to-retrieve-vhost-net-header.patch new file mode 100644 index 0000000..1b10ed3 --- /dev/null +++ b/qemu-tap-add-API-to-retrieve-vhost-net-header.patch @@ -0,0 +1,37 @@ +will be used by virtio-net for vhost net support + +Signed-off-by: Michael S. 
Tsirkin +--- + net/tap.c | 7 +++++++ + net/tap.h | 3 +++ + 2 files changed, 10 insertions(+), 0 deletions(-) + +diff --git a/net/tap.c b/net/tap.c +index d9f2e41..166cf05 100644 +--- a/net/tap.c ++++ b/net/tap.c +@@ -491,3 +491,10 @@ int net_init_tap(QemuOpts *opts, Monitor *mon, const char *name, VLANState *vlan + + return 0; + } ++ ++struct vhost_net *tap_get_vhost_net(VLANClientState *nc) ++{ ++ TAPState *s = DO_UPCAST(TAPState, nc, nc); ++ assert(nc->info->type == NET_CLIENT_TYPE_TAP); ++ return s->vhost_net; ++} +diff --git a/net/tap.h b/net/tap.h +index a244b28..b8cec83 100644 +--- a/net/tap.h ++++ b/net/tap.h +@@ -50,4 +50,7 @@ void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo); + + int tap_get_fd(VLANClientState *vc); + ++struct vhost_net; ++struct vhost_net *tap_get_vhost_net(VLANClientState *vc); ++ + #endif /* QEMU_NET_TAP_H */ +-- +1.6.6.144.g5c3af diff --git a/qemu-tap-add-interface-to-get-device-fd.patch b/qemu-tap-add-interface-to-get-device-fd.patch new file mode 100644 index 0000000..6a198b3 --- /dev/null +++ b/qemu-tap-add-interface-to-get-device-fd.patch @@ -0,0 +1,39 @@ +Will be used by vhost to attach/detach to backend. + +Signed-off-by: Michael S. 
Tsirkin +--- + net/tap.c | 7 +++++++ + net/tap.h | 2 ++ + 2 files changed, 9 insertions(+), 0 deletions(-) + +diff --git a/net/tap.c b/net/tap.c +index d3492de..7e9ca79 100644 +--- a/net/tap.c ++++ b/net/tap.c +@@ -269,6 +269,13 @@ static void tap_poll(VLANClientState *nc, bool enable) + tap_write_poll(s, enable); + } + ++int tap_get_fd(VLANClientState *nc) ++{ ++ TAPState *s = DO_UPCAST(TAPState, nc, nc); ++ assert(nc->info->type == NET_CLIENT_TYPE_TAP); ++ return s->fd; ++} ++ + /* fd support */ + + static NetClientInfo net_tap_info = { +diff --git a/net/tap.h b/net/tap.h +index 538a562..a244b28 100644 +--- a/net/tap.h ++++ b/net/tap.h +@@ -48,4 +48,6 @@ int tap_probe_vnet_hdr(int fd); + int tap_probe_has_ufo(int fd); + void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo); + ++int tap_get_fd(VLANClientState *vc); ++ + #endif /* QEMU_NET_TAP_H */ +-- +1.6.6.144.g5c3af diff --git a/qemu-tap-add-vhost-vhostfd-options.patch b/qemu-tap-add-vhost-vhostfd-options.patch new file mode 100644 index 0000000..462d08f --- /dev/null +++ b/qemu-tap-add-vhost-vhostfd-options.patch @@ -0,0 +1,118 @@ +Signed-off-by: Michael S. 
Tsirkin +--- + net.c | 8 ++++++++ + net/tap.c | 29 +++++++++++++++++++++++++++++ + qemu-options.hx | 4 +++- + 3 files changed, 40 insertions(+), 1 deletions(-) + +diff --git a/net.c b/net.c +index 6ef93e6..b942d03 100644 +--- a/net.c ++++ b/net.c +@@ -976,6 +976,14 @@ static struct { + .name = "vnet_hdr", + .type = QEMU_OPT_BOOL, + .help = "enable the IFF_VNET_HDR flag on the tap interface" ++ }, { ++ .name = "vhost", ++ .type = QEMU_OPT_BOOL, ++ .help = "enable vhost-net network accelerator", ++ }, { ++ .name = "vhostfd", ++ .type = QEMU_OPT_STRING, ++ .help = "file descriptor of an already opened vhost net device", + }, + #endif /* _WIN32 */ + { /* end of list */ } +diff --git a/net/tap.c b/net/tap.c +index 7e9ca79..d9f2e41 100644 +--- a/net/tap.c ++++ b/net/tap.c +@@ -41,6 +41,8 @@ + + #include "net/tap-linux.h" + ++#include "hw/vhost_net.h" ++ + /* Maximum GSO packet size (64k) plus plenty of room for + * the ethernet and virtio_net headers + */ +@@ -57,6 +59,7 @@ typedef struct TAPState { + unsigned int has_vnet_hdr : 1; + unsigned int using_vnet_hdr : 1; + unsigned int has_ufo: 1; ++ struct vhost_net *vhost_net; + } TAPState; + + static int launch_script(const char *setup_script, const char *ifname, int fd); +@@ -252,6 +255,10 @@ static void tap_cleanup(VLANClientState *nc) + { + TAPState *s = DO_UPCAST(TAPState, nc, nc); + ++ if (s->vhost_net) { ++ vhost_net_cleanup(s->vhost_net); ++ } ++ + qemu_purge_queued_packets(nc); + + if (s->down_script[0]) +@@ -307,6 +314,7 @@ static TAPState *net_tap_fd_init(VLANState *vlan, + s->has_ufo = tap_probe_has_ufo(s->fd); + tap_set_offload(&s->nc, 0, 0, 0, 0, 0); + tap_read_poll(s, 1); ++ s->vhost_net = NULL; + return s; + } + +@@ -456,6 +464,27 @@ int net_init_tap(QemuOpts *opts, Monitor *mon, const char *name, VLANState *vlan + } + } + ++ if (qemu_opt_get_bool(opts, "vhost", 0)) { ++ int vhostfd, r; ++ if (qemu_opt_get(opts, "vhostfd")) { ++ r = net_handle_fd_param(mon, qemu_opt_get(opts, "vhostfd")); ++ if (r == -1) { 
++ return -1; ++ } ++ vhostfd = r; ++ } else { ++ vhostfd = -1; ++ } ++ s->vhost_net = vhost_net_init(&s->nc, vhostfd); ++ if (!s->vhost_net) { ++ qemu_error("vhost-net requested but could not be initialized\n"); ++ return -1; ++ } ++ } else if (qemu_opt_get(opts, "vhostfd")) { ++ qemu_error("vhostfd= is not valid without vhost\n"); ++ return -1; ++ } ++ + if (vlan) { + vlan->nb_host_devs++; + } +diff --git a/qemu-options.hx b/qemu-options.hx +index ca73ba5..2b3d9b8 100644 +--- a/qemu-options.hx ++++ b/qemu-options.hx +@@ -814,7 +814,7 @@ DEF("net", HAS_ARG, QEMU_OPTION_net, + "-net tap[,vlan=n][,name=str],ifname=name\n" + " connect the host TAP network interface to VLAN 'n'\n" + #else +- "-net tap[,vlan=n][,name=str][,fd=h][,ifname=name][,script=file][,downscript=dfile][,sndbuf=nbytes][,vnet_hdr=on|off]\n" ++ "-net tap[,vlan=n][,name=str][,fd=h][,ifname=name][,script=file][,downscript=dfile][,sndbuf=nbytes][,vnet_hdr=on|off][,vhost=on|off][,vhostfd=h]\n" + " connect the host TAP network interface to VLAN 'n' and use the\n" + " network scripts 'file' (default=%s)\n" + " and 'dfile' (default=%s);\n" +@@ -824,6 +824,8 @@ DEF("net", HAS_ARG, QEMU_OPTION_net, + " default of 'sndbuf=1048576' can be disabled using 'sndbuf=0'\n" + " use vnet_hdr=off to avoid enabling the IFF_VNET_HDR tap flag; use\n" + " vnet_hdr=on to make the lack of IFF_VNET_HDR support an error condition\n" ++ " use vhost=on to enable experimental in kernel accelerator\n" ++ " use 'vhostfd=h' to connect to an already opened vhost net device\n" + #endif + "-net socket[,vlan=n][,name=str][,fd=h][,listen=[host]:port][,connect=host:port]\n" + " connect the vlan 'n' to another VLAN using a socket connection\n" +-- +1.6.6.144.g5c3af diff --git a/qemu-vhost-add-configure-check.patch b/qemu-vhost-add-configure-check.patch new file mode 100644 index 0000000..28ad946 --- /dev/null +++ b/qemu-vhost-add-configure-check.patch @@ -0,0 +1,118 @@ +Teach configure to check for vhost.h +and disable vhost_net if not 
present. + +Signed-off-by: Michael S. Tsirkin ' + +--- + +diff --git a/Makefile.target b/Makefile.target +index 2ebd30c..38783da 100644 +--- a/Makefile.target ++++ b/Makefile.target +@@ -168,7 +168,8 @@ obj-y = vl.o async.o monitor.o pci.o pci_host.o pcie_host.o machine.o gdbstub.o + # need to fix this properly + obj-y += virtio-blk.o virtio-balloon.o virtio-net.o virtio-pci.o virtio-serial-bus.o + obj-y += notifier.o +-obj-y += vhost_net.o vhost.o ++obj-y += vhost_net.o ++obj-$(CONFIG_VHOST_NET) += vhost.o + obj-$(CONFIG_KVM) += kvm.o kvm-all.o + # MSI-X depends on kvm for interrupt injection, + # so moved it from Makefile.hw to Makefile.target for now +diff --git a/configure b/configure +index 88ba002..4994506 100755 +--- a/configure ++++ b/configure +@@ -1510,6 +1510,23 @@ EOF + fi + + ########################################## ++# test for vhost net ++ ++if test "$kvm" != "no"; then ++ cat > $TMPC < ++int main(void) { return 0; } ++EOF ++ if compile_prog "$kvm_cflags" "" ; then ++ vhost_net=yes ++ else ++ vhost_net=no ++ fi ++else ++ vhost_net=no ++fi ++ ++########################################## + # libpci probe for kvm_cap_device_assignment + if test $kvm_cap_device_assignment = "yes" ; then + cat > $TMPC << EOF +@@ -2058,6 +2075,7 @@ echo "fdt support $fdt" + echo "preadv support $preadv" + echo "fdatasync $fdatasync" + echo "uuid support $uuid" ++echo "vhost-net support $vhost_net" + + if test $sdl_too_old = "yes"; then + echo "-> Your SDL version is too old - please upgrade to have SDL support" +@@ -2593,6 +2611,9 @@ case "$target_arch2" in + if test $kvm_cap_device_assignment = "yes" ; then + echo "CONFIG_KVM_DEVICE_ASSIGNMENT=y" >> $config_target_mak + fi ++ if test $vhost_net = "yes" ; then ++ echo "CONFIG_VHOST_NET=y" >> $config_target_mak ++ fi + fi + esac + echo "TARGET_PHYS_ADDR_BITS=$target_phys_bits" >> $config_target_mak +diff --git a/hw/vhost_net.c b/hw/vhost_net.c +index c89ff40..cab9a0a 100644 +--- a/hw/vhost_net.c ++++ b/hw/vhost_net.c +@@ 
-16,9 +16,13 @@ + #include "net/tap.h" + + #include "virtio-net.h" +-#include "vhost.h" + #include "vhost_net.h" + ++#include "config.h" ++ ++#ifdef CONFIG_VHOST_NET ++#include "vhost.h" ++ + struct vhost_net { + struct vhost_dev dev; + struct vhost_virtqueue vqs[2]; +@@ -145,3 +149,31 @@ void vhost_net_cleanup(struct vhost_net *net) + qemu_free(net); + } + /* TODO: log */ ++#else ++struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd) ++{ ++ return NULL; ++} ++ ++int vhost_net_start(struct vhost_net *net, ++ VirtIODevice *dev) ++{ ++ return -ENOSYS; ++} ++void vhost_net_stop(struct vhost_net *net, ++ VirtIODevice *dev) ++{ ++} ++ ++void vhost_net_cleanup(struct vhost_net *net) ++{ ++} ++ ++unsigned vhost_net_get_features(struct vhost_net *net, unsigned features) ++{ ++ return features; ++} ++void vhost_net_ack_features(struct vhost_net *net, unsigned features) ++{ ++} ++#endif diff --git a/qemu-vhost-vhost-net-support.patch b/qemu-vhost-vhost-net-support.patch new file mode 100644 index 0000000..aea0418 --- /dev/null +++ b/qemu-vhost-vhost-net-support.patch @@ -0,0 +1,869 @@ +This adds vhost net support in qemu. Will be tied to tap device and +virtio by following patches. Raw backend is currently missing, will be +worked on/submitted separately. + +Signed-off-by: Michael S. 
Tsirkin +--- + Makefile.target | 1 + + hw/vhost.c | 603 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ + hw/vhost.h | 44 ++++ + hw/vhost_net.c | 147 ++++++++++++++ + hw/vhost_net.h | 20 ++ + 5 files changed, 815 insertions(+), 0 deletions(-) + create mode 100644 hw/vhost.c + create mode 100644 hw/vhost.h + create mode 100644 hw/vhost_net.c + create mode 100644 hw/vhost_net.h + +diff --git a/Makefile.target b/Makefile.target +index 0c844a9..2ebd30c 100644 +--- a/Makefile.target ++++ b/Makefile.target +@@ -168,6 +168,7 @@ obj-y = vl.o async.o monitor.o pci.o pci_host.o pcie_host.o machine.o gdbstub.o + # need to fix this properly + obj-y += virtio-blk.o virtio-balloon.o virtio-net.o virtio-pci.o virtio-serial-bus.o + obj-y += notifier.o ++obj-y += vhost_net.o vhost.o + obj-$(CONFIG_KVM) += kvm.o kvm-all.o + # MSI-X depends on kvm for interrupt injection, + # so moved it from Makefile.hw to Makefile.target for now +diff --git a/hw/vhost.c b/hw/vhost.c +new file mode 100644 +index 0000000..e5c1ead +--- /dev/null ++++ b/hw/vhost.c +@@ -0,0 +1,603 @@ ++#include "linux/vhost.h" ++#include ++#include ++#include "vhost.h" ++#include "hw/hw.h" ++/* For range_get_last */ ++#include "pci.h" ++ ++static void vhost_dev_sync_region(struct vhost_dev *dev, ++ uint64_t mfirst, uint64_t mlast, ++ uint64_t rfirst, uint64_t rlast) ++{ ++ uint64_t start = MAX(mfirst, rfirst); ++ uint64_t end = MIN(mlast, rlast); ++ vhost_log_chunk_t *from = dev->log + start / VHOST_LOG_CHUNK; ++ vhost_log_chunk_t *to = dev->log + end / VHOST_LOG_CHUNK + 1; ++ uint64_t addr = (start / VHOST_LOG_CHUNK) * VHOST_LOG_CHUNK; ++ ++ assert(end / VHOST_LOG_CHUNK < dev->log_size); ++ assert(start / VHOST_LOG_CHUNK < dev->log_size); ++ if (end < start) { ++ return; ++ } ++ for (;from < to; ++from) { ++ vhost_log_chunk_t log; ++ int bit; ++ /* We first check with non-atomic: much cheaper, ++ * and we expect non-dirty to be the common case. 
*/ ++ if (!*from) { ++ continue; ++ } ++ /* Data must be read atomically. We don't really ++ * need the barrier semantics of __sync ++ * builtins, but it's easier to use them than ++ * roll our own. */ ++ log = __sync_fetch_and_and(from, 0); ++ while ((bit = sizeof(log) > sizeof(int) ? ++ ffsll(log) : ffs(log))) { ++ bit -= 1; ++ cpu_physical_memory_set_dirty(addr + bit * VHOST_LOG_PAGE); ++ log &= ~(0x1ull << bit); ++ } ++ addr += VHOST_LOG_CHUNK; ++ } ++} ++ ++static int vhost_client_sync_dirty_bitmap(struct CPUPhysMemoryClient *client, ++ target_phys_addr_t start_addr, ++ target_phys_addr_t end_addr) ++{ ++ struct vhost_dev *dev = container_of(client, struct vhost_dev, client); ++ int i; ++ if (!dev->log_enabled || !dev->started) { ++ return 0; ++ } ++ for (i = 0; i < dev->mem->nregions; ++i) { ++ struct vhost_memory_region *reg = dev->mem->regions + i; ++ vhost_dev_sync_region(dev, start_addr, end_addr, ++ reg->guest_phys_addr, ++ range_get_last(reg->guest_phys_addr, ++ reg->memory_size)); ++ } ++ for (i = 0; i < dev->nvqs; ++i) { ++ struct vhost_virtqueue *vq = dev->vqs + i; ++ unsigned size = sizeof(struct vring_used_elem) * vq->num; ++ vhost_dev_sync_region(dev, start_addr, end_addr, vq->used_phys, ++ range_get_last(vq->used_phys, size)); ++ } ++ return 0; ++} ++ ++/* Assign/unassign. Keep an unsorted array of non-overlapping ++ * memory regions in dev->mem. */ ++static void vhost_dev_unassign_memory(struct vhost_dev *dev, ++ uint64_t start_addr, ++ uint64_t size) ++{ ++ int from, to, n = dev->mem->nregions; ++ /* Track overlapping/split regions for sanity checking. 
*/ ++ int overlap_start = 0, overlap_end = 0, overlap_middle = 0, split = 0; ++ ++ for (from = 0, to = 0; from < n; ++from, ++to) { ++ struct vhost_memory_region *reg = dev->mem->regions + to; ++ uint64_t reglast; ++ uint64_t memlast; ++ uint64_t change; ++ ++ /* clone old region */ ++ if (to != from) { ++ memcpy(reg, dev->mem->regions + from, sizeof *reg); ++ } ++ ++ /* No overlap is simple */ ++ if (!ranges_overlap(reg->guest_phys_addr, reg->memory_size, ++ start_addr, size)) { ++ continue; ++ } ++ ++ /* Split only happens if supplied region ++ * is in the middle of an existing one. Thus it can not ++ * overlap with any other existing region. */ ++ assert(!split); ++ ++ reglast = range_get_last(reg->guest_phys_addr, reg->memory_size); ++ memlast = range_get_last(start_addr, size); ++ ++ /* Remove whole region */ ++ if (start_addr <= reg->guest_phys_addr && memlast >= reglast) { ++ --dev->mem->nregions; ++ --to; ++ assert(to >= 0); ++ ++overlap_middle; ++ continue; ++ } ++ ++ /* Shrink region */ ++ if (memlast >= reglast) { ++ reg->memory_size = start_addr - reg->guest_phys_addr; ++ assert(reg->memory_size); ++ assert(!overlap_end); ++ ++overlap_end; ++ continue; ++ } ++ ++ /* Shift region */ ++ if (start_addr <= reg->guest_phys_addr) { ++ change = memlast + 1 - reg->guest_phys_addr; ++ reg->memory_size -= change; ++ reg->guest_phys_addr += change; ++ reg->userspace_addr += change; ++ assert(reg->memory_size); ++ assert(!overlap_start); ++ ++overlap_start; ++ continue; ++ } ++ ++ /* This only happens if supplied region ++ * is in the middle of an existing one. Thus it can not ++ * overlap with any other existing region. */ ++ assert(!overlap_start); ++ assert(!overlap_end); ++ assert(!overlap_middle); ++ /* Split region: shrink first part, shift second part. 
*/ ++ memcpy(dev->mem->regions + n, reg, sizeof *reg); ++ reg->memory_size = start_addr - reg->guest_phys_addr; ++ assert(reg->memory_size); ++ change = memlast + 1 - reg->guest_phys_addr; ++ reg = dev->mem->regions + n; ++ reg->memory_size -= change; ++ assert(reg->memory_size); ++ reg->guest_phys_addr += change; ++ reg->userspace_addr += change; ++ /* Never add more than 1 region */ ++ assert(dev->mem->nregions == n); ++ ++dev->mem->nregions; ++ ++split; ++ } ++} ++ ++/* Called after unassign, so no regions overlap the given range. */ ++static void vhost_dev_assign_memory(struct vhost_dev *dev, ++ uint64_t start_addr, ++ uint64_t size, ++ uint64_t uaddr) ++{ ++ int from, to; ++ struct vhost_memory_region *merged = NULL; ++ for (from = 0, to = 0; from < dev->mem->nregions; ++from, ++to) { ++ struct vhost_memory_region *reg = dev->mem->regions + to; ++ uint64_t prlast, urlast; ++ uint64_t pmlast, umlast; ++ uint64_t s, e, u; ++ ++ /* clone old region */ ++ if (to != from) { ++ memcpy(reg, dev->mem->regions + from, sizeof *reg); ++ } ++ prlast = range_get_last(reg->guest_phys_addr, reg->memory_size); ++ pmlast = range_get_last(start_addr, size); ++ urlast = range_get_last(reg->userspace_addr, reg->memory_size); ++ umlast = range_get_last(uaddr, size); ++ ++ /* check for overlapping regions: should never happen. */ ++ assert(prlast < start_addr || pmlast < reg->guest_phys_addr); ++ /* Not an adjacent or overlapping region - do not merge. 
*/ ++ if ((prlast + 1 != start_addr || urlast + 1 != uaddr) && ++ (pmlast + 1 != reg->guest_phys_addr || ++ umlast + 1 != reg->userspace_addr)) { ++ continue; ++ } ++ ++ if (merged) { ++ --to; ++ assert(to >= 0); ++ } else { ++ merged = reg; ++ } ++ u = MIN(uaddr, reg->userspace_addr); ++ s = MIN(start_addr, reg->guest_phys_addr); ++ e = MAX(pmlast, prlast); ++ uaddr = merged->userspace_addr = u; ++ start_addr = merged->guest_phys_addr = s; ++ size = merged->memory_size = e - s + 1; ++ assert(merged->memory_size); ++ } ++ ++ if (!merged) { ++ struct vhost_memory_region *reg = dev->mem->regions + to; ++ memset(reg, 0, sizeof *reg); ++ reg->memory_size = size; ++ assert(reg->memory_size); ++ reg->guest_phys_addr = start_addr; ++ reg->userspace_addr = uaddr; ++ ++to; ++ } ++ assert(to <= dev->mem->nregions + 1); ++ dev->mem->nregions = to; ++} ++ ++static uint64_t vhost_get_log_size(struct vhost_dev *dev) ++{ ++ uint64_t log_size = 0; ++ int i; ++ for (i = 0; i < dev->mem->nregions; ++i) { ++ struct vhost_memory_region *reg = dev->mem->regions + i; ++ uint64_t last = range_get_last(reg->guest_phys_addr, ++ reg->memory_size); ++ log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1); ++ } ++ for (i = 0; i < dev->nvqs; ++i) { ++ struct vhost_virtqueue *vq = dev->vqs + i; ++ uint64_t last = vq->used_phys + ++ sizeof(struct vring_used_elem) * vq->num - 1; ++ log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1); ++ } ++ return log_size; ++} ++ ++static inline void vhost_dev_log_resize(struct vhost_dev* dev, uint64_t size) ++{ ++ vhost_log_chunk_t *log; ++ int r; ++ if (size) { ++ log = qemu_mallocz(size * sizeof *log); ++ } else { ++ log = NULL; ++ } ++ r = ioctl(dev->control, VHOST_SET_LOG_BASE, ++ (uint64_t)(unsigned long)log); ++ assert(r >= 0); ++ vhost_client_sync_dirty_bitmap(&dev->client, 0, ++ (target_phys_addr_t)~0x0ull); ++ if (dev->log) { ++ qemu_free(dev->log); ++ } ++ dev->log = log; ++ dev->log_size = size; ++} ++ ++static void 
vhost_client_set_memory(CPUPhysMemoryClient *client, ++ target_phys_addr_t start_addr, ++ ram_addr_t size, ++ ram_addr_t phys_offset) ++{ ++ struct vhost_dev *dev = container_of(client, struct vhost_dev, client); ++ ram_addr_t flags = phys_offset & ~TARGET_PAGE_MASK; ++ int s = offsetof(struct vhost_memory, regions) + ++ (dev->mem->nregions + 1) * sizeof dev->mem->regions[0]; ++ uint64_t log_size; ++ int r; ++ dev->mem = qemu_realloc(dev->mem, s); ++ ++ assert(size); ++ ++ vhost_dev_unassign_memory(dev, start_addr, size); ++ if (flags == IO_MEM_RAM) { ++ /* Add given mapping, merging adjacent regions if any */ ++ vhost_dev_assign_memory(dev, start_addr, size, ++ (uintptr_t)qemu_get_ram_ptr(phys_offset)); ++ } else { ++ /* Remove old mapping for this memory, if any. */ ++ vhost_dev_unassign_memory(dev, start_addr, size); ++ } ++ ++ if (!dev->started) { ++ return; ++ } ++ if (!dev->log_enabled) { ++ r = ioctl(dev->control, VHOST_SET_MEM_TABLE, dev->mem); ++ assert(r >= 0); ++ return; ++ } ++ log_size = vhost_get_log_size(dev); ++ /* We allocate an extra 4K bytes to log, ++ * to reduce the * number of reallocations. */ ++#define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log) ++ /* To log more, must increase log size before table update. */ ++ if (dev->log_size < log_size) { ++ vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER); ++ } ++ r = ioctl(dev->control, VHOST_SET_MEM_TABLE, dev->mem); ++ assert(r >= 0); ++ /* To log less, can only decrease log size after table update. */ ++ if (dev->log_size > log_size + VHOST_LOG_BUFFER) { ++ vhost_dev_log_resize(dev, log_size); ++ } ++} ++ ++static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log) ++{ ++ uint64_t features = dev->acked_features; ++ int r; ++ if (dev->log_enabled) { ++ features |= 0x1 << VHOST_F_LOG_ALL; ++ } ++ r = ioctl(dev->control, VHOST_SET_FEATURES, &features); ++ return r < 0 ? 
-errno : 0; ++} ++ ++static int vhost_client_migration_log(struct CPUPhysMemoryClient *client, ++ int enable) ++{ ++ struct vhost_dev *dev = container_of(client, struct vhost_dev, client); ++ int r; ++ if (!!enable == dev->log_enabled) { ++ return 0; ++ } ++ if (!dev->started) { ++ dev->log_enabled = enable; ++ return 0; ++ } ++ if (!enable) { ++ r = vhost_dev_set_log(dev, false); ++ if (r < 0) { ++ return r; ++ } ++ if (dev->log) { ++ qemu_free(dev->log); ++ } ++ dev->log = NULL; ++ dev->log_size = 0; ++ } else { ++ vhost_dev_log_resize(dev, vhost_get_log_size(dev)); ++ r = vhost_dev_set_log(dev, false); ++ if (r < 0) { ++ return r; ++ } ++ } ++ dev->log_enabled = enable; ++ return 0; ++} ++ ++static int vhost_virtqueue_set_addr(struct vhost_dev *dev, ++ struct vhost_virtqueue *vq, ++ unsigned idx, bool enable_log) ++{ ++ struct vhost_vring_addr addr = { ++ .index = idx, ++ .desc_user_addr = (u_int64_t)(unsigned long)vq->desc, ++ .avail_user_addr = (u_int64_t)(unsigned long)vq->avail, ++ .used_user_addr = (u_int64_t)(unsigned long)vq->used, ++ .log_guest_addr = vq->used_phys, ++ .flags = enable_log ? 
(1 << VHOST_VRING_F_LOG) : 0, ++ }; ++ int r = ioctl(dev->control, VHOST_SET_VRING_ADDR, &addr); ++ if (r < 0) { ++ return -errno; ++ } ++ return 0; ++} ++ ++static int vhost_virtqueue_init(struct vhost_dev *dev, ++ struct VirtIODevice *vdev, ++ struct vhost_virtqueue *vq, ++ unsigned idx) ++{ ++ target_phys_addr_t s, l, a; ++ int r; ++ struct vhost_vring_file file = { ++ .index = idx, ++ }; ++ struct vhost_vring_state state = { ++ .index = idx, ++ }; ++ struct VirtQueue *q = virtio_queue(vdev, idx); ++ ++ vq->num = state.num = virtio_queue_get_num(vdev, idx); ++ r = ioctl(dev->control, VHOST_SET_VRING_NUM, &state); ++ if (r) { ++ return -errno; ++ } ++ ++ state.num = virtio_queue_last_avail_idx(vdev, idx); ++ r = ioctl(dev->control, VHOST_SET_VRING_BASE, &state); ++ if (r) { ++ return -errno; ++ } ++ ++ s = l = sizeof(struct vring_desc) * vq->num; ++ a = virtio_queue_get_desc(vdev, idx); ++ vq->desc = cpu_physical_memory_map(a, &l, 0); ++ if (!vq->desc || l != s) { ++ r = -ENOMEM; ++ goto fail_alloc; ++ } ++ s = l = offsetof(struct vring_avail, ring) + ++ sizeof(u_int64_t) * vq->num; ++ a = virtio_queue_get_avail(vdev, idx); ++ vq->avail = cpu_physical_memory_map(a, &l, 0); ++ if (!vq->avail || l != s) { ++ r = -ENOMEM; ++ goto fail_alloc; ++ } ++ s = l = offsetof(struct vring_used, ring) + ++ sizeof(struct vring_used_elem) * vq->num; ++ vq->used_phys = a = virtio_queue_get_used(vdev, idx); ++ vq->used = cpu_physical_memory_map(a, &l, 1); ++ if (!vq->used || l != s) { ++ r = -ENOMEM; ++ goto fail_alloc; ++ } ++ ++ r = vhost_virtqueue_set_addr(dev, vq, idx, dev->log_enabled); ++ if (r < 0) { ++ r = -errno; ++ goto fail_alloc; ++ } ++ if (!vdev->binding->guest_notifier || !vdev->binding->host_notifier) { ++ fprintf(stderr, "binding does not support irqfd/queuefd\n"); ++ r = -ENOSYS; ++ goto fail_alloc; ++ } ++ r = vdev->binding->guest_notifier(vdev->binding_opaque, idx, true); ++ if (r < 0) { ++ fprintf(stderr, "Error binding guest notifier: %d\n", -r); ++ goto 
fail_guest_notifier; ++ } ++ ++ r = vdev->binding->host_notifier(vdev->binding_opaque, idx, true); ++ if (r < 0) { ++ fprintf(stderr, "Error binding host notifier: %d\n", -r); ++ goto fail_host_notifier; ++ } ++ ++ file.fd = event_notifier_get_fd(virtio_queue_host_notifier(q)); ++ r = ioctl(dev->control, VHOST_SET_VRING_KICK, &file); ++ if (r) { ++ goto fail_kick; ++ } ++ ++ file.fd = event_notifier_get_fd(virtio_queue_guest_notifier(q)); ++ r = ioctl(dev->control, VHOST_SET_VRING_CALL, &file); ++ if (r) { ++ goto fail_call; ++ } ++ ++ return 0; ++ ++fail_call: ++fail_kick: ++ vdev->binding->host_notifier(vdev->binding_opaque, idx, false); ++fail_host_notifier: ++ vdev->binding->guest_notifier(vdev->binding_opaque, idx, false); ++fail_guest_notifier: ++fail_alloc: ++ return r; ++} ++ ++static void vhost_virtqueue_cleanup(struct vhost_dev *dev, ++ struct VirtIODevice *vdev, ++ struct vhost_virtqueue *vq, ++ unsigned idx) ++{ ++ struct vhost_vring_state state = { ++ .index = idx, ++ }; ++ int r; ++ r = vdev->binding->guest_notifier(vdev->binding_opaque, idx, false); ++ if (r < 0) { ++ fprintf(stderr, "vhost VQ %d guest cleanup failed: %d\n", idx, r); ++ fflush(stderr); ++ } ++ assert (r >= 0); ++ ++ r = vdev->binding->host_notifier(vdev->binding_opaque, idx, false); ++ if (r < 0) { ++ fprintf(stderr, "vhost VQ %d host cleanup failed: %d\n", idx, r); ++ fflush(stderr); ++ } ++ assert (r >= 0); ++ r = ioctl(dev->control, VHOST_GET_VRING_BASE, &state); ++ if (r < 0) { ++ fprintf(stderr, "vhost VQ %d ring restore failed: %d\n", idx, r); ++ fflush(stderr); ++ } ++ virtio_queue_set_last_avail_idx(vdev, idx, state.num); ++ assert (r >= 0); ++} ++ ++int vhost_dev_init(struct vhost_dev *hdev, int devfd) ++{ ++ uint64_t features; ++ int r; ++ if (devfd >= 0) { ++ hdev->control = devfd; ++ } else { ++ hdev->control = open("/dev/vhost-net", O_RDWR); ++ if (hdev->control < 0) ++ return -errno; ++ } ++ r = ioctl(hdev->control, VHOST_SET_OWNER, NULL); ++ if (r < 0) ++ goto fail; ++ 
++ r = ioctl(hdev->control, VHOST_GET_FEATURES, &features); ++ if (r < 0) ++ goto fail; ++ hdev->features = features; ++ ++ hdev->client.set_memory = vhost_client_set_memory; ++ hdev->client.sync_dirty_bitmap = vhost_client_sync_dirty_bitmap; ++ hdev->client.migration_log = vhost_client_migration_log; ++ hdev->mem = qemu_mallocz(offsetof(struct vhost_memory, regions)); ++ hdev->log = NULL; ++ hdev->log_size = 0; ++ hdev->log_enabled = false; ++ hdev->started = false; ++ cpu_register_phys_memory_client(&hdev->client); ++ return 0; ++fail: ++ r = -errno; ++ close(hdev->control); ++ return r; ++} ++ ++void vhost_dev_cleanup(struct vhost_dev *hdev) ++{ ++ cpu_unregister_phys_memory_client(&hdev->client); ++ qemu_free(hdev->mem); ++ close(hdev->control); ++} ++ ++int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev) ++{ ++ int i, r; ++ ++ r = vhost_dev_set_log(hdev, hdev->log_enabled); ++ if (r < 0) ++ goto fail; ++ r = ioctl(hdev->control, VHOST_SET_MEM_TABLE, hdev->mem); ++ if (r < 0) { ++ r = -errno; ++ goto fail; ++ } ++ if (hdev->log_enabled) { ++ hdev->log_size = vhost_get_log_size(hdev); ++ hdev->log = hdev->log_size ? 
++ qemu_mallocz(hdev->log_size * sizeof *hdev->log) : NULL; ++ r = ioctl(hdev->control, VHOST_SET_LOG_BASE, ++ (uint64_t)(unsigned long)hdev->log); ++ if (r < 0) { ++ r = -errno; ++ goto fail; ++ } ++ } ++ ++ for (i = 0; i < hdev->nvqs; ++i) { ++ r = vhost_virtqueue_init(hdev, ++ vdev, ++ hdev->vqs + i, ++ i); ++ if (r < 0) ++ goto fail_vq; ++ } ++ hdev->started = true; ++ ++ return 0; ++fail_vq: ++ while (--i >= 0) { ++ vhost_virtqueue_cleanup(hdev, ++ vdev, ++ hdev->vqs + i, ++ i); ++ } ++fail: ++ return r; ++} ++ ++void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev) ++{ ++ int i; ++ for (i = 0; i < hdev->nvqs; ++i) { ++ vhost_virtqueue_cleanup(hdev, ++ vdev, ++ hdev->vqs + i, ++ i); ++ } ++ vhost_client_sync_dirty_bitmap(&hdev->client, 0, ++ (target_phys_addr_t)~0x0ull); ++ hdev->started = false; ++ qemu_free(hdev->log); ++ hdev->log_size = 0; ++} +diff --git a/hw/vhost.h b/hw/vhost.h +new file mode 100644 +index 0000000..2ed3933 +--- /dev/null ++++ b/hw/vhost.h +@@ -0,0 +1,44 @@ ++#ifndef VHOST_H ++#define VHOST_H ++ ++#include "hw/hw.h" ++#include "hw/virtio.h" ++ ++/* Generic structures common for any vhost based device. 
*/ ++struct vhost_virtqueue { ++ int kick; ++ int call; ++ void *desc; ++ void *avail; ++ void *used; ++ int num; ++ unsigned long long used_phys; ++}; ++ ++typedef unsigned long vhost_log_chunk_t; ++#define VHOST_LOG_PAGE 0x1000 ++#define VHOST_LOG_BITS (8 * sizeof(vhost_log_chunk_t)) ++#define VHOST_LOG_CHUNK (VHOST_LOG_PAGE * VHOST_LOG_BITS) ++ ++struct vhost_memory; ++struct vhost_dev { ++ CPUPhysMemoryClient client; ++ int control; ++ struct vhost_memory *mem; ++ struct vhost_virtqueue *vqs; ++ int nvqs; ++ unsigned long long features; ++ unsigned long long acked_features; ++ unsigned long long backend_features; ++ bool started; ++ bool log_enabled; ++ vhost_log_chunk_t *log; ++ unsigned long long log_size; ++}; ++ ++int vhost_dev_init(struct vhost_dev *hdev, int devfd); ++void vhost_dev_cleanup(struct vhost_dev *hdev); ++int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev); ++void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev); ++ ++#endif +diff --git a/hw/vhost_net.c b/hw/vhost_net.c +new file mode 100644 +index 0000000..c89ff40 +--- /dev/null ++++ b/hw/vhost_net.c +@@ -0,0 +1,147 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include "net.h" ++#include "net/tap.h" ++ ++#include "virtio-net.h" ++#include "vhost.h" ++#include "vhost_net.h" ++ ++struct vhost_net { ++ struct vhost_dev dev; ++ struct vhost_virtqueue vqs[2]; ++ int backend; ++ VLANClientState *vc; ++}; ++ ++unsigned vhost_net_get_features(struct vhost_net *net, unsigned features) ++{ ++ /* Clear features not supported by host kernel. 
*/ ++ if (!(net->dev.features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY))) ++ features &= ~(1 << VIRTIO_F_NOTIFY_ON_EMPTY); ++ if (!(net->dev.features & (1 << VIRTIO_RING_F_INDIRECT_DESC))) ++ features &= ~(1 << VIRTIO_RING_F_INDIRECT_DESC); ++ if (!(net->dev.features & (1 << VIRTIO_NET_F_MRG_RXBUF))) ++ features &= ~(1 << VIRTIO_NET_F_MRG_RXBUF); ++ return features; ++} ++ ++void vhost_net_ack_features(struct vhost_net *net, unsigned features) ++{ ++ net->dev.acked_features = net->dev.backend_features; ++ if (features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY)) ++ net->dev.acked_features |= (1 << VIRTIO_F_NOTIFY_ON_EMPTY); ++ if (features & (1 << VIRTIO_RING_F_INDIRECT_DESC)) ++ net->dev.acked_features |= (1 << VIRTIO_RING_F_INDIRECT_DESC); ++} ++ ++static int vhost_net_get_fd(VLANClientState *backend) ++{ ++ switch (backend->info->type) { ++ case NET_CLIENT_TYPE_TAP: ++ return tap_get_fd(backend); ++ default: ++ fprintf(stderr, "vhost-net requires tap backend\n"); ++ return -EBADFD; ++ } ++} ++ ++struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd) ++{ ++ int r; ++ struct vhost_net *net = qemu_malloc(sizeof *net); ++ if (!backend) { ++ fprintf(stderr, "vhost-net requires backend to be setup\n"); ++ goto fail; ++ } ++ r = vhost_net_get_fd(backend); ++ if (r < 0) ++ goto fail; ++ net->vc = backend; ++ net->dev.backend_features = tap_has_vnet_hdr(backend) ? 0 : ++ (1 << VHOST_NET_F_VIRTIO_NET_HDR); ++ net->backend = r; ++ ++ r = vhost_dev_init(&net->dev, devfd); ++ if (r < 0) ++ goto fail; ++ if (~net->dev.features & net->dev.backend_features) { ++ fprintf(stderr, "vhost lacks feature mask %llu for backend\n", ++ ~net->dev.features & net->dev.backend_features); ++ vhost_dev_cleanup(&net->dev); ++ goto fail; ++ } ++ ++ /* Set sane init value. Override when guest acks. 
*/ ++ vhost_net_ack_features(net, 0); ++ return net; ++fail: ++ qemu_free(net); ++ return NULL; ++} ++ ++int vhost_net_start(struct vhost_net *net, ++ VirtIODevice *dev) ++{ ++ struct vhost_vring_file file = { }; ++ int r; ++ ++ net->dev.nvqs = 2; ++ net->dev.vqs = net->vqs; ++ r = vhost_dev_start(&net->dev, dev); ++ if (r < 0) ++ return r; ++ ++ net->vc->info->poll(net->vc, false); ++ qemu_set_fd_handler(net->backend, NULL, NULL, NULL); ++ file.fd = net->backend; ++ for (file.index = 0; file.index < net->dev.nvqs; ++file.index) { ++ r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND, &file); ++ if (r < 0) { ++ r = -errno; ++ goto fail; ++ } ++ } ++ return 0; ++fail: ++ file.fd = -1; ++ while (--file.index >= 0) { ++ int r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND, &file); ++ assert(r >= 0); ++ } ++ net->vc->info->poll(net->vc, true); ++ vhost_dev_stop(&net->dev, dev); ++ return r; ++} ++ ++void vhost_net_stop(struct vhost_net *net, ++ VirtIODevice *dev) ++{ ++ struct vhost_vring_file file = { .fd = -1 }; ++ ++ for (file.index = 0; file.index < net->dev.nvqs; ++file.index) { ++ int r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND, &file); ++ assert(r >= 0); ++ } ++ net->vc->info->poll(net->vc, true); ++ vhost_dev_stop(&net->dev, dev); ++} ++ ++void vhost_net_cleanup(struct vhost_net *net) ++{ ++ vhost_dev_cleanup(&net->dev); ++ qemu_free(net); ++} ++/* TODO: log */ +diff --git a/hw/vhost_net.h b/hw/vhost_net.h +new file mode 100644 +index 0000000..21f0277 +--- /dev/null ++++ b/hw/vhost_net.h +@@ -0,0 +1,20 @@ ++#ifndef VHOST_NET_H ++#define VHOST_NET_H ++ ++#include "net.h" ++ ++struct vhost_net; ++ ++struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd); ++ ++int vhost_net_start(struct vhost_net *net, ++ VirtIODevice *dev); ++void vhost_net_stop(struct vhost_net *net, ++ VirtIODevice *dev); ++ ++void vhost_net_cleanup(struct vhost_net *net); ++ ++unsigned vhost_net_get_features(struct vhost_net *net, unsigned features); ++void 
vhost_net_ack_features(struct vhost_net *net, unsigned features); ++ ++#endif +-- +1.6.6.144.g5c3af diff --git a/qemu-virtio-add-APIs-for-queue-fields.patch b/qemu-virtio-add-APIs-for-queue-fields.patch new file mode 100644 index 0000000..e2a57ed --- /dev/null +++ b/qemu-virtio-add-APIs-for-queue-fields.patch @@ -0,0 +1,111 @@ +vhost needs physical addresses for ring and other queue fields, +so add APIs for these. + +Signed-off-by: Michael S. Tsirkin +--- + hw/virtio.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++---- + hw/virtio.h | 10 +++++++++- + 2 files changed, 56 insertions(+), 5 deletions(-) + +diff --git a/hw/virtio.c b/hw/virtio.c +index c2b80aa..b16ee1a 100644 +--- a/hw/virtio.c ++++ b/hw/virtio.c +@@ -73,6 +73,9 @@ struct VirtQueue + int inuse; + uint16_t vector; + void (*handle_output)(VirtIODevice *vdev, VirtQueue *vq); ++ VirtIODevice *vdev; ++ EventNotifier guest_notifier; ++ EventNotifier host_notifier; + }; + + /* virt queue functions */ +@@ -594,10 +597,10 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size, + return &vdev->vq[i]; + } + +-void virtio_irq(VirtIODevice *vdev, VirtQueue *vq) ++void virtio_irq(VirtQueue *vq) + { +- vdev->isr |= 0x01; +- virtio_notify_vector(vdev, vq->vector); ++ vq->vdev->isr |= 0x01; ++ virtio_notify_vector(vq->vdev, vq->vector); + } + + void virtio_notify(VirtIODevice *vdev, VirtQueue *vq) +@@ -608,7 +611,8 @@ void virtio_notify(VirtIODevice *vdev, VirtQueue *vq) + (vq->inuse || vring_avail_idx(vq) != vq->last_avail_idx))) + return; + +- virtio_irq(vdev, vq); ++ vdev->isr |= 0x01; ++ virtio_notify_vector(vdev, vq->vector); + } + + void virtio_notify_config(VirtIODevice *vdev) +@@ -742,3 +746,42 @@ void virtio_bind_device(VirtIODevice *vdev, const VirtIOBindings *binding, + vdev->binding = binding; + vdev->binding_opaque = opaque; + } ++ ++target_phys_addr_t virtio_queue_get_desc(VirtIODevice *vdev, int n) ++{ ++ return vdev->vq[n].vring.desc; ++} ++ ++target_phys_addr_t 
virtio_queue_get_avail(VirtIODevice *vdev, int n) ++{ ++ return vdev->vq[n].vring.avail; ++} ++ ++target_phys_addr_t virtio_queue_get_used(VirtIODevice *vdev, int n) ++{ ++ return vdev->vq[n].vring.used; ++} ++ ++uint16_t virtio_queue_last_avail_idx(VirtIODevice *vdev, int n) ++{ ++ return vdev->vq[n].last_avail_idx; ++} ++ ++void virtio_queue_set_last_avail_idx(VirtIODevice *vdev, int n, uint16_t idx) ++{ ++ vdev->vq[n].last_avail_idx = idx; ++} ++ ++VirtQueue *virtio_queue(VirtIODevice *vdev, int n) ++{ ++ return vdev->vq + n; ++} ++ ++EventNotifier *virtio_queue_guest_notifier(VirtQueue *vq) ++{ ++ return &vq->guest_notifier; ++} ++EventNotifier *virtio_queue_host_notifier(VirtQueue *vq) ++{ ++ return &vq->host_notifier; ++} +diff --git a/hw/virtio.h b/hw/virtio.h +index 10a0959..f140ca3 100644 +--- a/hw/virtio.h ++++ b/hw/virtio.h +@@ -183,5 +183,13 @@ void virtio_net_exit(VirtIODevice *vdev); + DEFINE_PROP_BIT("indirect_desc", _state, _field, \ + VIRTIO_RING_F_INDIRECT_DESC, true) + +-void virtio_irq(VirtIODevice *vdev, VirtQueue *vq); ++target_phys_addr_t virtio_queue_get_desc(VirtIODevice *vdev, int n); ++target_phys_addr_t virtio_queue_get_avail(VirtIODevice *vdev, int n); ++target_phys_addr_t virtio_queue_get_used(VirtIODevice *vdev, int n); ++uint16_t virtio_queue_last_avail_idx(VirtIODevice *vdev, int n); ++void virtio_queue_set_last_avail_idx(VirtIODevice *vdev, int n, uint16_t idx); ++VirtQueue *virtio_queue(VirtIODevice *vdev, int n); ++EventNotifier *virtio_queue_guest_notifier(VirtQueue *vq); ++EventNotifier *virtio_queue_host_notifier(VirtQueue *vq); ++void virtio_irq(VirtQueue *vq); + #endif +-- +1.6.6.144.g5c3af diff --git a/qemu-virtio-add-features-as-qdev-properties.patch b/qemu-virtio-add-features-as-qdev-properties.patch new file mode 100644 index 0000000..fc9bcac --- /dev/null +++ b/qemu-virtio-add-features-as-qdev-properties.patch @@ -0,0 +1,400 @@ +Add feature bits as properties to virtio. This makes it possible to e.g. 
define +machine without indirect buffer support, which is required for 0.10 +compatibility, or without hardware checksum support, which is required for 0.11 +compatibility. Since default values for optional features are now set by qdev, +get_features callback has been modified: it sets non-optional bits, and clears +bits not supported by host. + +Signed-off-by: Michael S. Tsirkin +Acked-by: Gerd Hoffmann +Signed-off-by: Anthony Liguori +(cherry picked from commit 8172539d21a03e982aa7f139ddc1607dc1422045) +--- + hw/s390-virtio-bus.c | 12 +++++++++--- + hw/s390-virtio-bus.h | 1 + + hw/syborg_virtio.c | 13 ++++++++----- + hw/virtio-balloon.c | 4 ++-- + hw/virtio-blk.c | 6 +----- + hw/virtio-blk.h | 8 ++++++++ + hw/virtio-net.c | 39 ++++++++++++++++----------------------- + hw/virtio-net.h | 20 ++++++++++++++++++++ + hw/virtio-pci.c | 25 +++++++++++++++++-------- + hw/virtio.c | 2 +- + hw/virtio.h | 7 ++++++- + 12 files changed, 91 insertions(+), 50 deletions(-) + +--- a/hw/s390-virtio-bus.c 2010-02-09 00:18:58.000000000 -0600 ++++ b/hw/s390-virtio-bus.c 2010-02-09 00:02:12.000000000 -0600 +@@ -101,6 +101,7 @@ static int s390_virtio_device_init(VirtI + bus->dev_offs += dev_len; + + virtio_bind_device(vdev, &virtio_s390_bindings, dev); ++ dev->host_features = vdev->get_features(vdev, dev->host_features); + s390_virtio_device_sync(dev); + + return 0; +@@ -222,9 +223,7 @@ static void s390_virtio_device_sync(Virt + cur_offs += num_vq * VIRTIO_VQCONFIG_LEN; + + /* Sync feature bitmap */ +- if (dev->vdev->get_features) { +- stl_phys(cur_offs, dev->vdev->get_features(dev->vdev)); +- } ++ stl_phys(cur_offs, dev->host_features); + + dev->feat_offs = cur_offs + dev->feat_len; + cur_offs += dev->feat_len * 2; +@@ -310,10 +309,17 @@ static void virtio_s390_notify(void *opa + kvm_s390_virtio_irq(s390_cpu_addr2state(0), 0, token); + } + ++static unsigned virtio_s390_get_features(void *opaque) ++{ ++ VirtIOS390Device *dev = (VirtIOS390Device*)opaque; ++ return dev->host_features; ++} 
++ + /**************** S390 Virtio Bus Device Descriptions *******************/ + + static const VirtIOBindings virtio_s390_bindings = { + .notify = virtio_s390_notify, ++ .get_features = virtio_s390_get_features, + }; + + static VirtIOS390DeviceInfo s390_virtio_net = { +--- a/hw/s390-virtio-bus.h 2010-02-09 00:18:58.000000000 -0600 ++++ b/hw/s390-virtio-bus.h 2010-02-09 00:18:16.000000000 -0600 +@@ -40,6 +40,7 @@ typedef struct VirtIOS390Device { + VirtIODevice *vdev; + DriveInfo *dinfo; + NICConf nic; ++ uint32_t host_features; + /* Max. number of ports we can have for a the virtio-serial device */ + uint32_t max_virtserial_ports; + } VirtIOS390Device; +--- a/hw/syborg_virtio.c 2010-02-09 00:18:58.000000000 -0600 ++++ b/hw/syborg_virtio.c 2010-02-09 00:02:12.000000000 -0600 +@@ -25,6 +25,7 @@ + #include "syborg.h" + #include "sysbus.h" + #include "virtio.h" ++#include "virtio-net.h" + #include "sysemu.h" + + //#define DEBUG_SYBORG_VIRTIO +@@ -66,6 +67,7 @@ typedef struct { + uint32_t int_enable; + uint32_t id; + NICConf nic; ++ uint32_t host_features; + } SyborgVirtIOProxy; + + static uint32_t syborg_virtio_readl(void *opaque, target_phys_addr_t offset) +@@ -86,8 +88,7 @@ static uint32_t syborg_virtio_readl(void + ret = s->id; + break; + case SYBORG_VIRTIO_HOST_FEATURES: +- ret = vdev->get_features(vdev); +- ret |= vdev->binding->get_features(s); ++ ret = s->host_features; + break; + case SYBORG_VIRTIO_GUEST_FEATURES: + ret = vdev->guest_features; +@@ -244,9 +245,8 @@ static void syborg_virtio_update_irq(voi + + static unsigned syborg_virtio_get_features(void *opaque) + { +- unsigned ret = 0; +- ret |= (1 << VIRTIO_F_NOTIFY_ON_EMPTY); +- return ret; ++ SyborgVirtIOProxy *proxy = opaque; ++ return proxy->host_features; + } + + static VirtIOBindings syborg_virtio_bindings = { +@@ -272,6 +272,8 @@ static int syborg_virtio_init(SyborgVirt + qemu_register_reset(virtio_reset, vdev); + + virtio_bind_device(vdev, &syborg_virtio_bindings, proxy); ++ proxy->host_features |=
(0x1 << VIRTIO_F_NOTIFY_ON_EMPTY); ++ proxy->host_features = vdev->get_features(vdev, proxy->host_features); + return 0; + } + +@@ -292,6 +294,7 @@ static SysBusDeviceInfo syborg_virtio_ne + .qdev.size = sizeof(SyborgVirtIOProxy), + .qdev.props = (Property[]) { + DEFINE_NIC_PROPERTIES(SyborgVirtIOProxy, nic), ++ DEFINE_VIRTIO_NET_FEATURES(SyborgVirtIOProxy, host_features), + DEFINE_PROP_END_OF_LIST(), + } + }; +--- a/hw/virtio-balloon.c 2010-01-18 12:48:25.000000000 -0600 ++++ b/hw/virtio-balloon.c 2010-02-09 00:02:12.000000000 -0600 +@@ -125,9 +125,9 @@ static void virtio_balloon_set_config(Vi + dev->actual = config.actual; + } + +-static uint32_t virtio_balloon_get_features(VirtIODevice *vdev) ++static uint32_t virtio_balloon_get_features(VirtIODevice *vdev, uint32_t f) + { +- return 0; ++ return f; + } + + static ram_addr_t virtio_balloon_to_target(void *opaque, ram_addr_t target) +--- a/hw/virtio-blk.c 2010-01-18 12:48:25.000000000 -0600 ++++ b/hw/virtio-blk.c 2010-02-09 00:02:12.000000000 -0600 +@@ -432,19 +432,15 @@ static void virtio_blk_update_config(Vir + memcpy(config, &blkcfg, s->config_size); + } + +-static uint32_t virtio_blk_get_features(VirtIODevice *vdev) ++static uint32_t virtio_blk_get_features(VirtIODevice *vdev, uint32_t features) + { + VirtIOBlock *s = to_virtio_blk(vdev); +- uint32_t features = 0; + + features |= (1 << VIRTIO_BLK_F_SEG_MAX); + features |= (1 << VIRTIO_BLK_F_GEOMETRY); + + if (bdrv_enable_write_cache(s->bs)) + features |= (1 << VIRTIO_BLK_F_WCACHE); +-#ifdef __linux__ +- features |= (1 << VIRTIO_BLK_F_SCSI); +-#endif + if (strcmp(s->serial_str, "0")) + features |= 1 << VIRTIO_BLK_F_IDENTIFY; + +--- a/hw/virtio-blk.h 2010-01-18 12:48:25.000000000 -0600 ++++ b/hw/virtio-blk.h 2010-02-09 00:02:12.000000000 -0600 +@@ -92,4 +92,12 @@ struct virtio_scsi_inhdr + uint32_t residual; + }; + ++#ifdef __linux__ ++#define DEFINE_VIRTIO_BLK_FEATURES(_state, _field) \ ++ DEFINE_VIRTIO_COMMON_FEATURES(_state, _field), \ ++ 
DEFINE_PROP_BIT("scsi", _state, _field, VIRTIO_BLK_F_SCSI, true) ++#else ++#define DEFINE_VIRTIO_BLK_FEATURES(_state, _field) \ ++ DEFINE_VIRTIO_COMMON_FEATURES(_state, _field) ++#endif + #endif +--- a/hw/virtio.c 2010-02-09 00:18:58.000000000 -0600 ++++ b/hw/virtio.c 2010-02-09 00:02:12.000000000 -0600 +@@ -650,7 +650,7 @@ int virtio_load(VirtIODevice *vdev, QEMU + { + int num, i, ret; + uint32_t features; +- uint32_t supported_features = vdev->get_features(vdev) | ++ uint32_t supported_features = + vdev->binding->get_features(vdev->binding_opaque); + + if (vdev->binding->load_config) { +--- a/hw/virtio.h 2010-02-09 00:18:58.000000000 -0600 ++++ b/hw/virtio.h 2010-02-09 00:02:12.000000000 -0600 +@@ -105,7 +105,7 @@ struct VirtIODevice + void *config; + uint16_t config_vector; + int nvectors; +- uint32_t (*get_features)(VirtIODevice *vdev); ++ uint32_t (*get_features)(VirtIODevice *vdev, uint32_t requested_features); + uint32_t (*bad_features)(VirtIODevice *vdev); + void (*set_features)(VirtIODevice *vdev, uint32_t val); + void (*get_config)(VirtIODevice *vdev, uint8_t *config); +@@ -176,4 +176,9 @@ VirtIODevice *virtio_balloon_init(Device + + void virtio_net_exit(VirtIODevice *vdev); + ++#define DEFINE_VIRTIO_COMMON_FEATURES(_state, _field) \ ++ DEFINE_PROP_BIT("indirect_desc", _state, _field, \ ++ VIRTIO_RING_F_INDIRECT_DESC, true) ++ ++ + #endif +--- a/hw/virtio-net.c 2010-02-09 00:18:58.000000000 -0600 ++++ b/hw/virtio-net.c 2010-02-09 00:02:12.000000000 -0600 +@@ -147,34 +147,27 @@ static int peer_has_ufo(VirtIONet *n) + return n->has_ufo; + } + +-static uint32_t virtio_net_get_features(VirtIODevice *vdev) ++static uint32_t virtio_net_get_features(VirtIODevice *vdev, uint32_t features) + { + VirtIONet *n = to_virtio_net(vdev); +- uint32_t features = (1 << VIRTIO_NET_F_MAC) | +- (1 << VIRTIO_NET_F_MRG_RXBUF) | +- (1 << VIRTIO_NET_F_STATUS) | +- (1 << VIRTIO_NET_F_CTRL_VQ) | +- (1 << VIRTIO_NET_F_CTRL_RX) | +- (1 << VIRTIO_NET_F_CTRL_VLAN) | +- (1 <<
VIRTIO_NET_F_CTRL_RX_EXTRA); + + if (peer_has_vnet_hdr(n)) { + tap_using_vnet_hdr(n->nic->nc.peer, 1); ++ } else { ++ features &= ~(0x1 << VIRTIO_NET_F_CSUM); ++ features &= ~(0x1 << VIRTIO_NET_F_HOST_TSO4); ++ features &= ~(0x1 << VIRTIO_NET_F_HOST_TSO6); ++ features &= ~(0x1 << VIRTIO_NET_F_HOST_ECN); ++ ++ features &= ~(0x1 << VIRTIO_NET_F_GUEST_CSUM); ++ features &= ~(0x1 << VIRTIO_NET_F_GUEST_TSO4); ++ features &= ~(0x1 << VIRTIO_NET_F_GUEST_TSO6); ++ features &= ~(0x1 << VIRTIO_NET_F_GUEST_ECN); ++ } + +- features |= (1 << VIRTIO_NET_F_CSUM); +- features |= (1 << VIRTIO_NET_F_HOST_TSO4); +- features |= (1 << VIRTIO_NET_F_HOST_TSO6); +- features |= (1 << VIRTIO_NET_F_HOST_ECN); +- +- features |= (1 << VIRTIO_NET_F_GUEST_CSUM); +- features |= (1 << VIRTIO_NET_F_GUEST_TSO4); +- features |= (1 << VIRTIO_NET_F_GUEST_TSO6); +- features |= (1 << VIRTIO_NET_F_GUEST_ECN); +- +- if (peer_has_ufo(n)) { +- features |= (1 << VIRTIO_NET_F_GUEST_UFO); +- features |= (1 << VIRTIO_NET_F_HOST_UFO); +- } ++ if (!peer_has_vnet_hdr(n) || !peer_has_ufo(n)) { ++ features &= ~(0x1 << VIRTIO_NET_F_GUEST_UFO); ++ features &= ~(0x1 << VIRTIO_NET_F_HOST_UFO); + } + + return features; +@@ -192,7 +185,7 @@ static uint32_t virtio_net_bad_features( + features |= (1 << VIRTIO_NET_F_HOST_TSO6); + features |= (1 << VIRTIO_NET_F_HOST_ECN); + +- return features & virtio_net_get_features(vdev); ++ return features; + } + + static void virtio_net_set_features(VirtIODevice *vdev, uint32_t features) +--- a/hw/virtio-net.h 2010-01-18 12:48:25.000000000 -0600 ++++ b/hw/virtio-net.h 2010-02-09 00:02:12.000000000 -0600 +@@ -153,4 +153,24 @@ struct virtio_net_ctrl_mac { + #define VIRTIO_NET_CTRL_VLAN_ADD 0 + #define VIRTIO_NET_CTRL_VLAN_DEL 1 + ++#define DEFINE_VIRTIO_NET_FEATURES(_state, _field) \ ++ DEFINE_VIRTIO_COMMON_FEATURES(_state, _field), \ ++ DEFINE_PROP_BIT("csum", _state, _field, VIRTIO_NET_F_CSUM, true), \ ++ DEFINE_PROP_BIT("guest_csum", _state, _field, VIRTIO_NET_F_GUEST_CSUM, true), \ ++ 
DEFINE_PROP_BIT("mac", _state, _field, VIRTIO_NET_F_MAC, true), \ ++ DEFINE_PROP_BIT("gso", _state, _field, VIRTIO_NET_F_GSO, true), \ ++ DEFINE_PROP_BIT("guest_tso4", _state, _field, VIRTIO_NET_F_GUEST_TSO4, true), \ ++ DEFINE_PROP_BIT("guest_tso6", _state, _field, VIRTIO_NET_F_GUEST_TSO6, true), \ ++ DEFINE_PROP_BIT("guest_ecn", _state, _field, VIRTIO_NET_F_GUEST_ECN, true), \ ++ DEFINE_PROP_BIT("guest_ufo", _state, _field, VIRTIO_NET_F_GUEST_UFO, true), \ ++ DEFINE_PROP_BIT("host_tso4", _state, _field, VIRTIO_NET_F_HOST_TSO4, true), \ ++ DEFINE_PROP_BIT("host_tso6", _state, _field, VIRTIO_NET_F_HOST_TSO6, true), \ ++ DEFINE_PROP_BIT("host_ecn", _state, _field, VIRTIO_NET_F_HOST_ECN, true), \ ++ DEFINE_PROP_BIT("host_ufo", _state, _field, VIRTIO_NET_F_HOST_UFO, true), \ ++ DEFINE_PROP_BIT("mrg_rxbuf", _state, _field, VIRTIO_NET_F_MRG_RXBUF, true), \ ++ DEFINE_PROP_BIT("status", _state, _field, VIRTIO_NET_F_STATUS, true), \ ++ DEFINE_PROP_BIT("ctrl_vq", _state, _field, VIRTIO_NET_F_CTRL_VQ, true), \ ++ DEFINE_PROP_BIT("ctrl_rx", _state, _field, VIRTIO_NET_F_CTRL_RX, true), \ ++ DEFINE_PROP_BIT("ctrl_vlan", _state, _field, VIRTIO_NET_F_CTRL_VLAN, true), \ ++ DEFINE_PROP_BIT("ctrl_rx_extra", _state, _field, VIRTIO_NET_F_CTRL_RX_EXTRA, true) + #endif +--- a/hw/virtio-pci.c 2010-02-09 00:18:58.000000000 -0600 ++++ b/hw/virtio-pci.c 2010-02-09 00:16:13.000000000 -0600 +@@ -16,6 +16,8 @@ + #include <inttypes.h> + + #include "virtio.h" ++#include "virtio-blk.h" ++#include "virtio-net.h" + #include "pci.h" + #include "sysemu.h" + #include "msix.h" +@@ -92,6 +94,7 @@ typedef struct { + uint32_t nvectors; + DriveInfo *dinfo; + NICConf nic; ++ uint32_t host_features; + /* Max. number of ports we can have for a the virtio-serial device */ + uint32_t max_virtserial_ports; + } VirtIOPCIProxy; +@@ -177,7 +180,7 @@ static void virtio_ioport_write(void *op + /* Guest does not negotiate properly? We have to assume nothing.
*/ + if (val & (1 << VIRTIO_F_BAD_FEATURE)) { + if (vdev->bad_features) +- val = vdev->bad_features(vdev); ++ val = proxy->host_features & vdev->bad_features(vdev); + else + val = 0; + } +@@ -237,8 +240,7 @@ static uint32_t virtio_ioport_read(VirtI + + switch (addr) { + case VIRTIO_PCI_HOST_FEATURES: +- ret = vdev->get_features(vdev); +- ret |= vdev->binding->get_features(proxy); ++ ret = proxy->host_features; + break; + case VIRTIO_PCI_GUEST_FEATURES: + ret = vdev->guest_features; +@@ -384,11 +386,8 @@ static void virtio_write_config(PCIDevic + + static unsigned virtio_pci_get_features(void *opaque) + { +- unsigned ret = 0; +- ret |= (1 << VIRTIO_F_NOTIFY_ON_EMPTY); +- ret |= (1 << VIRTIO_RING_F_INDIRECT_DESC); +- ret |= (1 << VIRTIO_F_BAD_FEATURE); +- return ret; ++ VirtIOPCIProxy *proxy = opaque; ++ return proxy->host_features; + } + + static const VirtIOBindings virtio_pci_bindings = { +@@ -444,6 +443,9 @@ static void virtio_init_pci(VirtIOPCIPro + virtio_map); + + virtio_bind_device(vdev, &virtio_pci_bindings, proxy); ++ proxy->host_features |= 0x1 << VIRTIO_F_NOTIFY_ON_EMPTY; ++ proxy->host_features |= 0x1 << VIRTIO_F_BAD_FEATURE; ++ proxy->host_features = vdev->get_features(vdev, proxy->host_features); + } + + static int virtio_blk_init_pci(PCIDevice *pci_dev) +@@ -558,6 +560,7 @@ static PCIDeviceInfo virtio_info[] = { + DEFINE_PROP_HEX32("class", VirtIOPCIProxy, class_code, 0), + DEFINE_PROP_DRIVE("drive", VirtIOPCIProxy, dinfo), + DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 2), ++ DEFINE_VIRTIO_BLK_FEATURES(VirtIOPCIProxy, host_features), + DEFINE_PROP_END_OF_LIST(), + }, + .qdev.reset = virtio_pci_reset, +@@ -569,6 +572,7 @@ static PCIDeviceInfo virtio_info[] = { + .romfile = "pxe-virtio.bin", + .qdev.props = (Property[]) { + DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 3), ++ DEFINE_VIRTIO_NET_FEATURES(VirtIOPCIProxy, host_features), + DEFINE_NIC_PROPERTIES(VirtIOPCIProxy, nic), + DEFINE_PROP_END_OF_LIST(), + }, +@@ -582,6 +586,7 
@@ static PCIDeviceInfo virtio_info[] = { + .qdev.props = (Property[]) { + DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 0), + DEFINE_PROP_HEX32("class", VirtIOPCIProxy, class_code, 0), ++ DEFINE_VIRTIO_COMMON_FEATURES(VirtIOPCIProxy, host_features), + DEFINE_PROP_UINT32("max_ports", VirtIOPCIProxy, max_virtserial_ports, + 31), + DEFINE_PROP_END_OF_LIST(), +@@ -592,6 +597,10 @@ static PCIDeviceInfo virtio_info[] = { + .qdev.size = sizeof(VirtIOPCIProxy), + .init = virtio_balloon_init_pci, + .exit = virtio_exit_pci, ++ .qdev.props = (Property[]) { ++ DEFINE_VIRTIO_COMMON_FEATURES(VirtIOPCIProxy, host_features), ++ DEFINE_PROP_END_OF_LIST(), ++ }, + .qdev.reset = virtio_pci_reset, + },{ + /* end of list */ diff --git a/qemu-virtio-add-notifier-support.patch b/qemu-virtio-add-notifier-support.patch new file mode 100644 index 0000000..7a58ad2 --- /dev/null +++ b/qemu-virtio-add-notifier-support.patch @@ -0,0 +1,78 @@ +Add binding API to set host/guest notifiers. +Will be used by vhost. + +Signed-off-by: Michael S. 
Tsirkin +--- + hw/virtio.c | 13 ++++++++++--- + hw/virtio.h | 5 ++++- + 2 files changed, 14 insertions(+), 4 deletions(-) + +diff --git a/hw/virtio.c b/hw/virtio.c +index fa7184a..c2b80aa 100644 +--- a/hw/virtio.c ++++ b/hw/virtio.c +@@ -594,6 +594,12 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size, + return &vdev->vq[i]; + } + ++void virtio_irq(VirtIODevice *vdev, VirtQueue *vq) ++{ ++ vdev->isr |= 0x01; ++ virtio_notify_vector(vdev, vq->vector); ++} ++ + void virtio_notify(VirtIODevice *vdev, VirtQueue *vq) + { + /* Always notify when queue is empty (when feature acknowledge) */ +@@ -602,8 +608,7 @@ void virtio_notify(VirtIODevice *vdev, VirtQueue *vq) + (vq->inuse || vring_avail_idx(vq) != vq->last_avail_idx))) + return; + +- vdev->isr |= 0x01; +- virtio_notify_vector(vdev, vq->vector); ++ virtio_irq(vdev, vq); + } + + void virtio_notify_config(VirtIODevice *vdev) +@@ -716,8 +721,10 @@ VirtIODevice *virtio_common_init(const char *name, uint16_t device_id, + vdev->queue_sel = 0; + vdev->config_vector = VIRTIO_NO_VECTOR; + vdev->vq = qemu_mallocz(sizeof(VirtQueue) * VIRTIO_PCI_QUEUE_MAX); +- for(i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) ++ for(i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) { + vdev->vq[i].vector = VIRTIO_NO_VECTOR; ++ vdev->vq[i].vdev = vdev; ++ } + + vdev->name = name; + vdev->config_len = config_size; +diff --git a/hw/virtio.h b/hw/virtio.h +index 3994cc9..10a0959 100644 +--- a/hw/virtio.h ++++ b/hw/virtio.h +@@ -18,6 +18,7 @@ + #include "net.h" + #include "qdev.h" + #include "sysemu.h" ++#include "notifier.h" + + /* from Linux's linux/virtio_config.h */ + +@@ -88,6 +89,8 @@ typedef struct { + int (*load_config)(void * opaque, QEMUFile *f); + int (*load_queue)(void * opaque, int n, QEMUFile *f); + unsigned (*get_features)(void * opaque); ++ int (*guest_notifier)(void * opaque, int n, bool assigned); ++ int (*host_notifier)(void * opaque, int n, bool assigned); + } VirtIOBindings; + + #define VIRTIO_PCI_QUEUE_MAX 64 +@@ -180,5 +183,5 @@ void 
virtio_net_exit(VirtIODevice *vdev); + DEFINE_PROP_BIT("indirect_desc", _state, _field, \ + VIRTIO_RING_F_INDIRECT_DESC, true) + +- ++void virtio_irq(VirtIODevice *vdev, VirtQueue *vq); + #endif +-- +1.6.6.144.g5c3af diff --git a/qemu-virtio-add-status-change-callback.patch b/qemu-virtio-add-status-change-callback.patch new file mode 100644 index 0000000..3a85748 --- /dev/null +++ b/qemu-virtio-add-status-change-callback.patch @@ -0,0 +1,76 @@ +vhost net backend needs to be notified when +frontend status changes. Add a callback. + +Signed-off-by: Michael S. Tsirkin +--- + hw/s390-virtio-bus.c | 3 +++ + hw/syborg_virtio.c | 2 ++ + hw/virtio-pci.c | 6 ++++++ + hw/virtio.h | 1 + + 4 files changed, 12 insertions(+), 0 deletions(-) + +diff --git a/hw/s390-virtio-bus.c b/hw/s390-virtio-bus.c +index 980e7eb..f45b67d 100644 +--- a/hw/s390-virtio-bus.c ++++ b/hw/s390-virtio-bus.c +@@ -243,6 +243,9 @@ void s390_virtio_device_update_status(VirtIOS390Device *dev) + uint32_t features; + + vdev->status = ldub_phys(dev->dev_offs + VIRTIO_DEV_OFFS_STATUS); ++ if (vdev->set_status) { ++ vdev->set_status(vdev); ++ } + + /* Update guest supported feature bitmap */ + +diff --git a/hw/syborg_virtio.c b/hw/syborg_virtio.c +index 65239a0..19f6473 100644 +--- a/hw/syborg_virtio.c ++++ b/hw/syborg_virtio.c +@@ -152,6 +152,8 @@ static void syborg_virtio_writel(void *opaque, target_phys_addr_t offset, + vdev->status = value & 0xFF; + if (vdev->status == 0) + virtio_reset(vdev); ++ if (vdev->set_status) ++ vdev->set_status(vdev); + break; + case SYBORG_VIRTIO_INT_ENABLE: + s->int_enable = value; +diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c +index 573c98a..05898c8 100644 +--- a/hw/virtio-pci.c ++++ b/hw/virtio-pci.c +@@ -208,6 +208,9 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val) + virtio_reset(proxy->vdev); + msix_unuse_all_vectors(&proxy->pci_dev); + } ++ if (vdev->set_status) { ++ vdev->set_status(vdev); ++ } + break; + case VIRTIO_MSI_CONFIG_VECTOR: + 
msix_vector_unuse(&proxy->pci_dev, vdev->config_vector); +@@ -375,6 +378,9 @@ static void virtio_write_config(PCIDevice *pci_dev, uint32_t address, + if (PCI_COMMAND == address) { + if (!(val & PCI_COMMAND_MASTER)) { + proxy->vdev->status &= ~VIRTIO_CONFIG_S_DRIVER_OK; ++ if (proxy->vdev->set_status) { ++ proxy->vdev->set_status(proxy->vdev); ++ } + } + } + +diff --git a/hw/virtio.h b/hw/virtio.h +index f140ca3..39d0763 100644 +--- a/hw/virtio.h ++++ b/hw/virtio.h +@@ -114,6 +114,7 @@ struct VirtIODevice + void (*get_config)(VirtIODevice *vdev, uint8_t *config); + void (*set_config)(VirtIODevice *vdev, const uint8_t *config); + void (*reset)(VirtIODevice *vdev); ++ void (*set_status)(VirtIODevice *vdev); + VirtQueue *vq; + const VirtIOBindings *binding; + void *binding_opaque; +-- +1.6.6.144.g5c3af diff --git a/qemu-virtio-avoid-crash-with-non-tap-backends.patch b/qemu-virtio-avoid-crash-with-non-tap-backends.patch new file mode 100644 index 0000000..de869d0 --- /dev/null +++ b/qemu-virtio-avoid-crash-with-non-tap-backends.patch @@ -0,0 +1,25 @@ +verify that peer is tap before checking for vhost net + +Reported-by: Shirley Ma +Signed-off-by: Michael S. 
Tsirkin +--- + hw/virtio-net.c | 4 ++++ + 1 files changed, 4 insertions(+), 0 deletions(-) + +diff --git a/hw/virtio-net.c b/hw/virtio-net.c +index 088029b..b28fd92 100644 +--- a/hw/virtio-net.c ++++ b/hw/virtio-net.c +@@ -179,6 +179,10 @@ static uint32_t virtio_net_get_features(VirtIODevice *vdev, uint32_t features) + features &= ~(0x1 << VIRTIO_NET_F_HOST_UFO); + } + ++ if (!n->nic->nc.peer || ++ n->nic->nc.peer->info->type != NET_CLIENT_TYPE_TAP) { ++ return features; ++ } + if (!tap_get_vhost_net(n->nic->nc.peer)) { + return features; + } +-- +1.6.6.144.g5c3af diff --git a/qemu-virtio-move-typedef-to-qemu-common.patch b/qemu-virtio-move-typedef-to-qemu-common.patch new file mode 100644 index 0000000..6b50335 --- /dev/null +++ b/qemu-virtio-move-typedef-to-qemu-common.patch @@ -0,0 +1,34 @@ +make it possible to use type without header include + +Signed-off-by: Michael S. Tsirkin +--- + hw/virtio.h | 1 - + qemu-common.h | 1 + + 2 files changed, 1 insertions(+), 1 deletions(-) + +diff --git a/hw/virtio.h b/hw/virtio.h +index 39d0763..a5bd0ba 100644 +--- a/hw/virtio.h ++++ b/hw/virtio.h +@@ -68,7 +68,6 @@ static inline target_phys_addr_t vring_align(target_phys_addr_t addr, + } + + typedef struct VirtQueue VirtQueue; +-typedef struct VirtIODevice VirtIODevice; + + #define VIRTQUEUE_MAX_SIZE 1024 + +diff --git a/qemu-common.h b/qemu-common.h +index cdead98..1a54f9e 100644 +--- a/qemu-common.h ++++ b/qemu-common.h +@@ -218,6 +218,7 @@ typedef struct I2SCodec I2SCodec; + typedef struct DeviceState DeviceState; + typedef struct SSIBus SSIBus; + typedef struct EventNotifier EventNotifier; ++typedef struct VirtIODevice VirtIODevice; + + /* CPU save/load. 
*/ + void cpu_save(QEMUFile *f, void *opaque); +-- +1.6.6.144.g5c3af diff --git a/qemu-virtio-net-mac-property-is-mandatory.patch b/qemu-virtio-net-mac-property-is-mandatory.patch new file mode 100644 index 0000000..8058b14 --- /dev/null +++ b/qemu-virtio-net-mac-property-is-mandatory.patch @@ -0,0 +1,40 @@ +Mac feature bit isn't going to work as all network cards already have a +'mac' property to set the mac address. Remove it from mask and add in +get_features. + +Reported-by: Gerd Hoffmann +Signed-off-by: Michael S. Tsirkin +Signed-off-by: Anthony Liguori +(cherry picked from commit c9f79a3f79a48de28b4575cb5644bcf45d3754d0) +--- + hw/virtio-net.c | 2 ++ + hw/virtio-net.h | 1 - + 2 files changed, 2 insertions(+), 1 deletions(-) + +diff --git a/hw/virtio-net.c b/hw/virtio-net.c +index c2a389f..02d9180 100644 +--- a/hw/virtio-net.c ++++ b/hw/virtio-net.c +@@ -151,6 +151,8 @@ static uint32_t virtio_net_get_features(VirtIODevice *vdev, uint32_t features) + { + VirtIONet *n = to_virtio_net(vdev); + ++ features |= (1 << VIRTIO_NET_F_MAC); ++ + if (peer_has_vnet_hdr(n)) { + tap_using_vnet_hdr(n->nic->nc.peer, 1); + } else { +diff --git a/hw/virtio-net.h b/hw/virtio-net.h +index 9130d75..e55119b 100644 +--- a/hw/virtio-net.h ++++ b/hw/virtio-net.h +@@ -157,7 +157,6 @@ struct virtio_net_ctrl_mac { + DEFINE_VIRTIO_COMMON_FEATURES(_state, _field), \ + DEFINE_PROP_BIT("csum", _state, _field, VIRTIO_NET_F_CSUM, true), \ + DEFINE_PROP_BIT("guest_csum", _state, _field, VIRTIO_NET_F_GUEST_CSUM, true), \ +- DEFINE_PROP_BIT("mac", _state, _field, VIRTIO_NET_F_MAC, true), \ + DEFINE_PROP_BIT("gso", _state, _field, VIRTIO_NET_F_GSO, true), \ + DEFINE_PROP_BIT("guest_tso4", _state, _field, VIRTIO_NET_F_GUEST_TSO4, true), \ + DEFINE_PROP_BIT("guest_tso6", _state, _field, VIRTIO_NET_F_GUEST_TSO6, true), \ +-- +1.6.6.144.g5c3af diff --git a/qemu-virtio-net-vhost-net-support.patch b/qemu-virtio-net-vhost-net-support.patch new file mode 100644 index 0000000..147fb82 --- /dev/null +++ 
b/qemu-virtio-net-vhost-net-support.patch @@ -0,0 +1,153 @@ +This connects virtio-net to vhost net backend. +The code is structured in a way analogous to what we have with vnet +header capability in tap. We start/stop backend on driver start/stop as +well as on save and vm start (for migration). + +Signed-off-by: Michael S. Tsirkin +--- + hw/virtio-net.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++- + 1 files changed, 65 insertions(+), 2 deletions(-) + +diff --git a/hw/virtio-net.c b/hw/virtio-net.c +index 02d9180..088029b 100644 +--- a/hw/virtio-net.c ++++ b/hw/virtio-net.c +@@ -17,6 +17,7 @@ + #include "net/tap.h" + #include "qemu-timer.h" + #include "virtio-net.h" ++#include "vhost_net.h" + + #define VIRTIO_NET_VM_VERSION 11 + +@@ -47,6 +48,8 @@ typedef struct VirtIONet + uint8_t nomulti; + uint8_t nouni; + uint8_t nobcast; ++ uint8_t vhost_started; ++ VMChangeStateEntry *vmstate; + struct { + int in_use; + int first_multi; +@@ -114,6 +117,10 @@ static void virtio_net_reset(VirtIODevice *vdev) + n->nomulti = 0; + n->nouni = 0; + n->nobcast = 0; ++ if (n->vhost_started) { ++ vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev); ++ n->vhost_started = 0; ++ } + + /* Flush any MAC and VLAN filter table state */ + n->mac_table.in_use = 0; +@@ -172,7 +179,10 @@ static uint32_t virtio_net_get_features(VirtIODevice *vdev, uint32_t features) + features &= ~(0x1 << VIRTIO_NET_F_HOST_UFO); + } + +- return features; ++ if (!tap_get_vhost_net(n->nic->nc.peer)) { ++ return features; ++ } ++ return vhost_net_get_features(tap_get_vhost_net(n->nic->nc.peer), features); + } + + static uint32_t virtio_net_bad_features(VirtIODevice *vdev) +@@ -690,6 +700,12 @@ static void virtio_net_save(QEMUFile *f, void *opaque) + { + VirtIONet *n = opaque; + ++ if (n->vhost_started) { ++ /* TODO: should we really stop the backend? ++ * If we don't, it might keep writing to memory. 
*/ ++ vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), &n->vdev); ++ n->vhost_started = 0; ++ } + virtio_save(&n->vdev, f); + + qemu_put_buffer(f, n->mac, ETH_ALEN); +@@ -802,7 +818,6 @@ static int virtio_net_load(QEMUFile *f, void *opaque, int version_id) + qemu_mod_timer(n->tx_timer, + qemu_get_clock(vm_clock) + TX_TIMER_INTERVAL); + } +- + return 0; + } + +@@ -822,6 +837,47 @@ static NetClientInfo net_virtio_info = { + .link_status_changed = virtio_net_set_link_status, + }; + ++static void virtio_net_set_status(struct VirtIODevice *vdev) ++{ ++ VirtIONet *n = to_virtio_net(vdev); ++ if (!n->nic->nc.peer) { ++ return; ++ } ++ if (n->nic->nc.peer->info->type != NET_CLIENT_TYPE_TAP) { ++ return; ++ } ++ ++ if (!tap_get_vhost_net(n->nic->nc.peer)) { ++ return; ++ } ++ if (!!n->vhost_started == !!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) { ++ return; ++ } ++ if (vdev->status & VIRTIO_CONFIG_S_DRIVER_OK) { ++ int r = vhost_net_start(tap_get_vhost_net(n->nic->nc.peer), vdev); ++ if (r < 0) { ++ fprintf(stderr, "unable to start vhost net: %d: " ++ "falling back on userspace virtio\n", -r); ++ } else { ++ n->vhost_started = 1; ++ } ++ } else { ++ vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev); ++ n->vhost_started = 0; ++ } ++} ++ ++static void virtio_net_vmstate_change(void *opaque, int running, int reason) ++{ ++ VirtIONet *n = opaque; ++ if (!running) { ++ return; ++ } ++ /* This is called when vm is started, it will start vhost backend if ++ * appropriate, e.g. after migration.
*/ ++ virtio_net_set_status(&n->vdev); ++} ++ + VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf) + { + VirtIONet *n; +@@ -837,6 +893,7 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf) + n->vdev.set_features = virtio_net_set_features; + n->vdev.bad_features = virtio_net_bad_features; + n->vdev.reset = virtio_net_reset; ++ n->vdev.set_status = virtio_net_set_status; + n->rx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_rx); + n->tx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_tx); + n->ctrl_vq = virtio_add_queue(&n->vdev, 64, virtio_net_handle_ctrl); +@@ -859,6 +916,7 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf) + + register_savevm("virtio-net", virtio_net_id++, VIRTIO_NET_VM_VERSION, + virtio_net_save, virtio_net_load, n); ++ n->vmstate = qemu_add_vm_change_state_handler(virtio_net_vmstate_change, n); + + return &n->vdev; + } +@@ -866,6 +924,11 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf) + void virtio_net_exit(VirtIODevice *vdev) + { + VirtIONet *n = DO_UPCAST(VirtIONet, vdev, vdev); ++ qemu_del_vm_change_state_handler(n->vmstate); ++ ++ if (n->vhost_started) { ++ vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev); ++ } + + qemu_purge_queued_packets(&n->nic->nc); + +-- +1.6.6.144.g5c3af diff --git a/qemu-virtio-pci-fill-in-notifier-support.patch b/qemu-virtio-pci-fill-in-notifier-support.patch new file mode 100644 index 0000000..f294576 --- /dev/null +++ b/qemu-virtio-pci-fill-in-notifier-support.patch @@ -0,0 +1,98 @@ +Support host/guest notifiers in virtio-pci. +The last one only with kvm, that's okay +because vhost relies on kvm anyway. + +Signed-off-by: Michael S. 
Tsirkin +--- + hw/virtio-pci.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 files changed, 62 insertions(+), 0 deletions(-) + +diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c +index 05898c8..c454093 100644 +--- a/hw/virtio-pci.c ++++ b/hw/virtio-pci.c +@@ -23,6 +23,7 @@ + #include "msix.h" + #include "net.h" + #include "loader.h" ++#include "kvm.h" + + /* from Linux's linux/virtio_pci.h */ + +@@ -394,6 +395,65 @@ static unsigned virtio_pci_get_features(void *opaque) + return proxy->host_features; + } + ++static void virtio_pci_guest_notifier_read(void *opaque) ++{ ++ VirtQueue *vq = opaque; ++ EventNotifier *n = virtio_queue_guest_notifier(vq); ++ if (event_notifier_test_and_clear(n)) { ++ virtio_irq(vq); ++ } ++} ++ ++static int virtio_pci_guest_notifier(void *opaque, int n, bool assign) ++{ ++ VirtIOPCIProxy *proxy = opaque; ++ VirtQueue *vq = virtio_queue(proxy->vdev, n); ++ EventNotifier *notifier = virtio_queue_guest_notifier(vq); ++ ++ if (assign) { ++ int r = event_notifier_init(notifier, 0); ++ if (r < 0) ++ return r; ++ qemu_set_fd_handler(event_notifier_get_fd(notifier), ++ virtio_pci_guest_notifier_read, NULL, vq); ++ } else { ++ qemu_set_fd_handler(event_notifier_get_fd(notifier), ++ NULL, NULL, vq); ++ event_notifier_cleanup(notifier); ++ } ++ ++ return 0; ++} ++ ++static int virtio_pci_host_notifier(void *opaque, int n, bool assign) ++{ ++ VirtIOPCIProxy *proxy = opaque; ++ VirtQueue *vq = virtio_queue(proxy->vdev, n); ++ EventNotifier *notifier = virtio_queue_host_notifier(vq); ++ int r; ++ if (assign) { ++ r = event_notifier_init(notifier, 1); ++ if (r < 0) { ++ return r; ++ } ++ r = kvm_set_ioeventfd(proxy->addr + VIRTIO_PCI_QUEUE_NOTIFY, ++ n, event_notifier_get_fd(notifier), ++ assign); ++ if (r < 0) { ++ event_notifier_cleanup(notifier); ++ } ++ } else { ++ r = kvm_set_ioeventfd(proxy->addr + VIRTIO_PCI_QUEUE_NOTIFY, ++ n, event_notifier_get_fd(notifier), ++ assign); ++ if (r < 0) { ++ return r; ++ } ++ 
event_notifier_cleanup(notifier); ++ } ++ return r; ++} ++ + static const VirtIOBindings virtio_pci_bindings = { + .notify = virtio_pci_notify, + .save_config = virtio_pci_save_config, +@@ -401,6 +461,8 @@ static const VirtIOBindings virtio_pci_bindings = { + .save_queue = virtio_pci_save_queue, + .load_queue = virtio_pci_load_queue, + .get_features = virtio_pci_get_features, ++ .host_notifier = virtio_pci_host_notifier, ++ .guest_notifier = virtio_pci_guest_notifier, + }; + + static void virtio_init_pci(VirtIOPCIProxy *proxy, VirtIODevice *vdev, +-- +1.6.6.144.g5c3af diff --git a/qemu-virtio-pci-irqfd-fix-nonkvm-build.patch b/qemu-virtio-pci-irqfd-fix-nonkvm-build.patch new file mode 100644 index 0000000..cbdd372 --- /dev/null +++ b/qemu-virtio-pci-irqfd-fix-nonkvm-build.patch @@ -0,0 +1,34 @@ +Signed-off-by: Michael S. Tsirkin + +--- + +diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c +index e8e0d82..fd0c73f 100644 +--- a/hw/virtio-pci.c ++++ b/hw/virtio-pci.c +@@ -404,6 +404,7 @@ static void virtio_pci_guest_notifier_read(void *opaque) + } + } + ++#ifdef CONFIG_KVM + static int virtio_pci_mask_notifier(PCIDevice *dev, unsigned vector, + void *opaque, int masked) + { +@@ -424,6 +425,7 @@ static int virtio_pci_mask_notifier(PCIDevice *dev, unsigned vector, + } + return 0; + } ++#endif + + static int virtio_pci_guest_notifier(void *opaque, int n, bool assign) + { +@@ -526,7 +528,9 @@ static void virtio_init_pci(VirtIOPCIProxy *proxy, VirtIODevice *vdev, + + proxy->pci_dev.config_write = virtio_write_config; + ++#ifdef CONFIG_KVM + proxy->pci_dev.msix_mask_notifier = virtio_pci_mask_notifier; ++#endif + + size = VIRTIO_PCI_REGION_SIZE(&proxy->pci_dev) + vdev->config_len; + if (size & (size-1)) diff --git a/qemu-virtio-pci-irqfd-support.patch b/qemu-virtio-pci-irqfd-support.patch new file mode 100644 index 0000000..385f2a4 --- /dev/null +++ b/qemu-virtio-pci-irqfd-support.patch @@ -0,0 +1,70 @@ +Use irqfd when supported by kernel. 
+This uses msix mask notifiers: when vector is masked, we poll it from +userspace. When it is unmasked, we poll it from kernel. + +Signed-off-by: Michael S. Tsirkin +--- + hw/virtio-pci.c | 31 +++++++++++++++++++++++++++++-- + 1 files changed, 29 insertions(+), 2 deletions(-) + +diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c +index c454093..e8e0d82 100644 +--- a/hw/virtio-pci.c ++++ b/hw/virtio-pci.c +@@ -404,6 +404,27 @@ static void virtio_pci_guest_notifier_read(void *opaque) + } + } + ++static int virtio_pci_mask_notifier(PCIDevice *dev, unsigned vector, ++ void *opaque, int masked) ++{ ++ VirtQueue *vq = opaque; ++ EventNotifier *notifier = virtio_queue_guest_notifier(vq); ++ int r = kvm_set_irqfd(dev->msix_irq_entries[vector].gsi, ++ event_notifier_get_fd(notifier), ++ !masked); ++ if (r < 0) { ++ return (r == -ENOSYS) ? 0 : r; ++ } ++ if (masked) { ++ qemu_set_fd_handler(event_notifier_get_fd(notifier), ++ virtio_pci_guest_notifier_read, NULL, vq); ++ } else { ++ qemu_set_fd_handler(event_notifier_get_fd(notifier), ++ NULL, NULL, vq); ++ } ++ return 0; ++} ++ + static int virtio_pci_guest_notifier(void *opaque, int n, bool assign) + { + VirtIOPCIProxy *proxy = opaque; +@@ -412,11 +433,15 @@ static int virtio_pci_guest_notifier(void *opaque, int n, bool assign) + + if (assign) { + int r = event_notifier_init(notifier, 0); +- if (r < 0) +- return r; ++ if (r < 0) ++ return r; + qemu_set_fd_handler(event_notifier_get_fd(notifier), + virtio_pci_guest_notifier_read, NULL, vq); ++ msix_set_mask_notifier(&proxy->pci_dev, ++ virtio_queue_vector(proxy->vdev, n), vq); + } else { ++ msix_set_mask_notifier(&proxy->pci_dev, ++ virtio_queue_vector(proxy->vdev, n), NULL); + qemu_set_fd_handler(event_notifier_get_fd(notifier), + NULL, NULL, vq); + event_notifier_cleanup(notifier); +@@ -501,6 +526,8 @@ static void virtio_init_pci(VirtIOPCIProxy *proxy, VirtIODevice *vdev, + + proxy->pci_dev.config_write = virtio_write_config; + ++ proxy->pci_dev.msix_mask_notifier = 
virtio_pci_mask_notifier; ++ + size = VIRTIO_PCI_REGION_SIZE(&proxy->pci_dev) + vdev->config_len; + if (size & (size-1)) + size = 1 << qemu_fls(size); +-- +1.6.6.144.g5c3af diff --git a/qemu-virtio-rename-features-guest_features.patch b/qemu-virtio-rename-features-guest_features.patch new file mode 100644 index 0000000..8ffc084 --- /dev/null +++ b/qemu-virtio-rename-features-guest_features.patch @@ -0,0 +1,149 @@ +Rename features->guest_features. This is +what they are, avoid confusion with +host features which we also need to keep around. + +Signed-off-by: Michael S. Tsirkin +Signed-off-by: Anthony Liguori +(cherry picked from commit 704a76fcd24372a683652651b4597f6654084975) +--- + hw/s390-virtio-bus.c | 2 +- + hw/syborg_virtio.c | 4 ++-- + hw/virtio-net.c | 10 +++++----- + hw/virtio-pci.c | 4 ++-- + hw/virtio.c | 8 ++++---- + hw/virtio.h | 2 +- + 6 files changed, 15 insertions(+), 15 deletions(-) + +diff --git a/hw/s390-virtio-bus.c b/hw/s390-virtio-bus.c +index dc154ed..6c0da11 100644 +--- a/hw/s390-virtio-bus.c ++++ b/hw/s390-virtio-bus.c +@@ -251,7 +251,7 @@ void s390_virtio_device_update_status(VirtIOS390Device *dev) + if (vdev->set_features) { + vdev->set_features(vdev, features); + } +- vdev->features = features; ++ vdev->guest_features = features; + } + + VirtIOS390Device *s390_virtio_bus_console(VirtIOS390Bus *bus) +diff --git a/hw/syborg_virtio.c b/hw/syborg_virtio.c +index a84206a..fe6fc23 100644 +--- a/hw/syborg_virtio.c ++++ b/hw/syborg_virtio.c +@@ -90,7 +90,7 @@ static uint32_t syborg_virtio_readl(void *opaque, target_phys_addr_t offset) + ret |= vdev->binding->get_features(s); + break; + case SYBORG_VIRTIO_GUEST_FEATURES: +- ret = vdev->features; ++ ret = vdev->guest_features; + break; + case SYBORG_VIRTIO_QUEUE_BASE: + ret = virtio_queue_get_addr(vdev, vdev->queue_sel); +@@ -132,7 +132,7 @@ static void syborg_virtio_writel(void *opaque, target_phys_addr_t offset, + case SYBORG_VIRTIO_GUEST_FEATURES: + if (vdev->set_features) + 
vdev->set_features(vdev, value); +- vdev->features = value; ++ vdev->guest_features = value; + break; + case SYBORG_VIRTIO_QUEUE_BASE: + if (value == 0) +diff --git a/hw/virtio-net.c b/hw/virtio-net.c +index 2f201ff..ab20a33 100644 +--- a/hw/virtio-net.c ++++ b/hw/virtio-net.c +@@ -768,11 +768,11 @@ static int virtio_net_load(QEMUFile *f, void *opaque, int version_id) + if (n->has_vnet_hdr) { + tap_using_vnet_hdr(n->nic->nc.peer, 1); + tap_set_offload(n->nic->nc.peer, +- (n->vdev.features >> VIRTIO_NET_F_GUEST_CSUM) & 1, +- (n->vdev.features >> VIRTIO_NET_F_GUEST_TSO4) & 1, +- (n->vdev.features >> VIRTIO_NET_F_GUEST_TSO6) & 1, +- (n->vdev.features >> VIRTIO_NET_F_GUEST_ECN) & 1, +- (n->vdev.features >> VIRTIO_NET_F_GUEST_UFO) & 1); ++ (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_CSUM) & 1, ++ (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_TSO4) & 1, ++ (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_TSO6) & 1, ++ (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_ECN) & 1, ++ (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_UFO) & 1); + } + } + +diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c +index 3594152..c23dbc0 100644 +--- a/hw/virtio-pci.c ++++ b/hw/virtio-pci.c +@@ -181,7 +181,7 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val) + } + if (vdev->set_features) + vdev->set_features(vdev, val); +- vdev->features = val; ++ vdev->guest_features = val; + break; + case VIRTIO_PCI_QUEUE_PFN: + pa = (target_phys_addr_t)val << VIRTIO_PCI_QUEUE_ADDR_SHIFT; +@@ -239,7 +239,7 @@ static uint32_t virtio_ioport_read(VirtIOPCIProxy *proxy, uint32_t addr) + ret |= vdev->binding->get_features(proxy); + break; + case VIRTIO_PCI_GUEST_FEATURES: +- ret = vdev->features; ++ ret = vdev->guest_features; + break; + case VIRTIO_PCI_QUEUE_PFN: + ret = virtio_queue_get_addr(vdev, vdev->queue_sel) +diff --git a/hw/virtio.c b/hw/virtio.c +index cecd0dc..c25a5f1 100644 +--- a/hw/virtio.c ++++ b/hw/virtio.c +@@ -445,7 +445,7 @@ void virtio_reset(void *opaque) + if 
(vdev->reset) + vdev->reset(vdev); + +- vdev->features = 0; ++ vdev->guest_features = 0; + vdev->queue_sel = 0; + vdev->status = 0; + vdev->isr = 0; +@@ -598,7 +598,7 @@ void virtio_notify(VirtIODevice *vdev, VirtQueue *vq) + { + /* Always notify when queue is empty (when feature acknowledge) */ + if ((vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT) && +- (!(vdev->features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY)) || ++ (!(vdev->guest_features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY)) || + (vq->inuse || vring_avail_idx(vq) != vq->last_avail_idx))) + return; + +@@ -625,7 +625,7 @@ void virtio_save(VirtIODevice *vdev, QEMUFile *f) + qemu_put_8s(f, &vdev->status); + qemu_put_8s(f, &vdev->isr); + qemu_put_be16s(f, &vdev->queue_sel); +- qemu_put_be32s(f, &vdev->features); ++ qemu_put_be32s(f, &vdev->guest_features); + qemu_put_be32(f, vdev->config_len); + qemu_put_buffer(f, vdev->config, vdev->config_len); + +@@ -670,7 +670,7 @@ int virtio_load(VirtIODevice *vdev, QEMUFile *f) + features, supported_features); + return -1; + } +- vdev->features = features; ++ vdev->guest_features = features; + vdev->config_len = qemu_get_be32(f); + qemu_get_buffer(f, vdev->config, vdev->config_len); + +diff --git a/hw/virtio.h b/hw/virtio.h +index 35532a6..85ef171 100644 +--- a/hw/virtio.h ++++ b/hw/virtio.h +@@ -100,7 +100,7 @@ struct VirtIODevice + uint8_t status; + uint8_t isr; + uint16_t queue_sel; +- uint32_t features; ++ uint32_t guest_features; + size_t config_len; + void *config; + uint16_t config_vector; +-- +1.6.6.144.g5c3af diff --git a/qemu-virtio-serial-features-build-fix.patch b/qemu-virtio-serial-features-build-fix.patch new file mode 100644 index 0000000..b05629e --- /dev/null +++ b/qemu-virtio-serial-features-build-fix.patch @@ -0,0 +1,25 @@ +--- a/hw/virtio-serial-bus.c 2010-02-09 00:41:21.000000000 -0600 ++++ b/hw/virtio-serial-bus.c 2010-02-09 00:07:13.000000000 -0600 +@@ -68,7 +68,7 @@ static VirtIOSerialPort *find_port_by_vq + + static bool use_multiport(VirtIOSerial *vser) + 
{ +- return vser->vdev.features & (1 << VIRTIO_CONSOLE_F_MULTIPORT); ++ return vser->vdev.guest_features & (1 << VIRTIO_CONSOLE_F_MULTIPORT); + } + + static size_t write_to_port(VirtIOSerialPort *port, +@@ -333,9 +333,11 @@ static void handle_input(VirtIODevice *v + { + } + +-static uint32_t get_features(VirtIODevice *vdev) ++static uint32_t get_features(VirtIODevice *vdev, uint32_t features) + { +- return 1 << VIRTIO_CONSOLE_F_MULTIPORT; ++ features |= (1 << VIRTIO_CONSOLE_F_MULTIPORT); ++ ++ return features; + } + + /* Guest requested config info */ diff --git a/qemu.spec b/qemu.spec index 8e35eb4..53d381b 100644 --- a/qemu.spec +++ b/qemu.spec @@ -1,7 +1,7 @@ Summary: QEMU is a FAST! processor emulator Name: qemu Version: 0.12.2 -Release: 5%{?dist} +Release: 6%{?dist} # Epoch because we pushed a qemu-1.0 package Epoch: 2 License: GPLv2+ and LGPLv2+ and BSD @@ -37,6 +37,35 @@ Patch09: qemu-virtio-console-Rename-virtio-serial.c-back-to-virti.patch Patch10: qemu-v2-block-avoid-creating-too-large-iovecs-in-multiwrite_merge.patch +# VHostNet Patches +Patch11: qemu-net-add-API-to-disable-enable-polling.patch +Patch12: qemu-virtio-rename-features-guest_features.patch +Patch13: qemu-qdev-add-bit-property-type.patch +Patch14: qemu-qdev-fix-thinko-leading-to-guest-crashes.patch +Patch15: qemu-virtio-add-features-as-qdev-properties.patch +Patch16: qemu-virtio-net-mac-property-is-mandatory.patch +Patch17: qemu-exec-memory-notifiers.patch +Patch18: qemu-kvm-add-API-to-set-ioeventfd.patch +Patch19: qemu-notifier-event-notifier-implementation.patch +Patch20: qemu-virtio-add-notifier-support.patch +Patch21: qemu-virtio-add-APIs-for-queue-fields.patch +Patch22: qemu-virtio-add-status-change-callback.patch +Patch23: qemu-virtio-move-typedef-to-qemu-common.patch +Patch24: qemu-virtio-pci-fill-in-notifier-support.patch +Patch25: qemu-tap-add-interface-to-get-device-fd.patch +Patch26: qemu-vhost-vhost-net-support.patch +Patch27: qemu-tap-add-vhost-vhostfd-options.patch +Patch28: 
qemu-tap-add-API-to-retrieve-vhost-net-header.patch +Patch29: qemu-virtio-net-vhost-net-support.patch +Patch30: qemu-kvm-add-vhost.h-header.patch +Patch31: qemu-kvm-irqfd-support.patch +Patch32: qemu-msix-add-mask-unmask-notifiers.patch +Patch33: qemu-virtio-pci-irqfd-support.patch +Patch34: qemu-virtio-avoid-crash-with-non-tap-backends.patch +Patch35: qemu-virtio-serial-features-build-fix.patch +Patch36: qemu-virtio-pci-irqfd-fix-nonkvm-build.patch +Patch37: qemu-vhost-add-configure-check.patch + BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) BuildRequires: SDL-devel zlib-devel which texi2html gnutls-devel cyrus-sasl-devel @@ -235,6 +264,33 @@ such as kvmtrace and kvm_stat. %patch08 -p1 %patch09 -p1 %patch10 -p1 +%patch11 -p1 +%patch12 -p1 +%patch13 -p1 +%patch14 -p1 +%patch15 -p1 +%patch16 -p1 +%patch17 -p1 +%patch18 -p1 +%patch19 -p1 +%patch20 -p1 +%patch21 -p1 +%patch22 -p1 +%patch23 -p1 +%patch24 -p1 +%patch25 -p1 +%patch26 -p1 +%patch27 -p1 +%patch28 -p1 +%patch29 -p1 +%patch30 -p1 +%patch31 -p1 +%patch32 -p1 +%patch33 -p1 +%patch34 -p1 +%patch35 -p1 +%patch36 -p1 +%patch37 -p1 %build # --build-id option is used fedora 8 onwards for giving info to the debug packages. @@ -518,6 +574,9 @@ fi %{_mandir}/man1/qemu-img.1* %changelog +* Tue Feb 09 2010 Justin M. Forbes - 2:0.12.2-6 +- Add vhost net support. + * Thu Feb 04 2010 Justin M. Forbes - 2:0.12.2-5 - Avoid creating too large iovecs in multiwrite merge (#559717) - Don't try to set max_kernel_pages during ksm init on newer kernels (#558281)