From 55289fb036481396466d7825fa01d829c891108c Mon Sep 17 00:00:00 2001 From: Pavel Butsykin Date: Tue, 19 Sep 2017 15:07:33 +0300 Subject: virtio-serial: add enable_backend callback We should guarantee that RAM will not be modified while VM has a stopped state, otherwise it can lead to negative consequences during post-copy migration. In RUN_STATE_FINISH_MIGRATE step, it's expected that RAM on source side will not be modified as this could lead to non-consistent vm state on the destination side. Also RAM access during postcopy-ram migration with enabled release-ram capability can lead to sad consequences. Let's add enable_backend() callback to avoid undesirable virtioqueue changes in the guest memory. Signed-off-by: Pavel Butsykin Message-Id: <20170919120733.22020-1-pbutsykin@virtuozzo.com> Signed-off-by: Paolo Bonzini --- hw/char/virtio-console.c | 21 +++++++++++++++++++++ hw/char/virtio-serial-bus.c | 7 +++++++ include/hw/virtio/virtio-serial.h | 3 +++ 3 files changed, 31 insertions(+) diff --git a/hw/char/virtio-console.c b/hw/char/virtio-console.c index 198b2a89c0..172c72d06c 100644 --- a/hw/char/virtio-console.c +++ b/hw/char/virtio-console.c @@ -187,6 +187,26 @@ static int chr_be_change(void *opaque) return 0; } +static void virtconsole_enable_backend(VirtIOSerialPort *port, bool enable) +{ + VirtConsole *vcon = VIRTIO_CONSOLE(port); + + if (!qemu_chr_fe_backend_connected(&vcon->chr)) { + return; + } + + if (enable) { + VirtIOSerialPortClass *k = VIRTIO_SERIAL_PORT_GET_CLASS(port); + + qemu_chr_fe_set_handlers(&vcon->chr, chr_can_read, chr_read, + k->is_console ? NULL : chr_event, + chr_be_change, vcon, NULL, false); + } else { + qemu_chr_fe_set_handlers(&vcon->chr, NULL, NULL, NULL, + NULL, NULL, NULL, false); + } +} + static void virtconsole_realize(DeviceState *dev, Error **errp) { VirtIOSerialPort *port = VIRTIO_SERIAL_PORT(dev); @@ -258,6 +278,7 @@ static void virtserialport_class_init(ObjectClass *klass, void *data) k->unrealize = virtconsole_unrealize; k->have_data = flush_buf; k->set_guest_connected = set_guest_connected; + k->enable_backend = virtconsole_enable_backend; k->guest_writable = guest_writable; dc->props = virtserialport_properties; } diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c index 17a1bb008a..9470bd7be7 100644 --- a/hw/char/virtio-serial-bus.c +++ b/hw/char/virtio-serial-bus.c @@ -637,6 +637,13 @@ static void set_status(VirtIODevice *vdev, uint8_t status) if (!(status & VIRTIO_CONFIG_S_DRIVER_OK)) { guest_reset(vser); } + + QTAILQ_FOREACH(port, &vser->ports, next) { + VirtIOSerialPortClass *vsc = VIRTIO_SERIAL_PORT_GET_CLASS(port); + if (vsc->enable_backend) { + vsc->enable_backend(port, vdev->vm_running); + } + } } static void vser_reset(VirtIODevice *vdev) diff --git a/include/hw/virtio/virtio-serial.h b/include/hw/virtio/virtio-serial.h index b19c44727f..12657a9f39 100644 --- a/include/hw/virtio/virtio-serial.h +++ b/include/hw/virtio/virtio-serial.h @@ -58,6 +58,9 @@ typedef struct VirtIOSerialPortClass { /* Guest opened/closed device. */ void (*set_guest_connected)(VirtIOSerialPort *port, int guest_connected); + /* Enable/disable backend for virtio serial port */ + void (*enable_backend)(VirtIOSerialPort *port, bool enable); + /* Guest is now ready to accept data (virtqueues set up). */ void (*guest_ready)(VirtIOSerialPort *port); -- cgit v1.2.3-70-g09d2 From 3110cdbd8a4845c5b5fb861b0a664c56d993dd3c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Sep 2017 16:50:25 +0200 Subject: kvm: drop wrong assertion creating problems with pflash pflash toggles mr->romd_mode. So this assert does not always hold. 1) a device was added with !mr->romd_mode, therefore effectively not creating a kvm slot as we want to trap every access (add = false). 2) mr->romd_mode was toggled on before remove it. There is now actually no slot to remove and the assert is wrong. So let's just drop the assert. Reported-by: Gerd Hoffmann Signed-off-by: David Hildenbrand Message-Id: <20170920145025.19403-1-david@redhat.com> Tested-by: Gerd Hoffmann Signed-off-by: Paolo Bonzini --- accel/kvm/kvm-all.c | 1 - 1 file changed, 1 deletion(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index b0181d7220..4f1997deec 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -722,7 +722,6 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml, mem = kvm_lookup_matching_slot(kml, start_addr, size); if (!add) { if (!mem) { - g_assert(!memory_region_is_ram(mr) && !writeable && !mr->romd_mode); return; } if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { -- cgit v1.2.3-70-g09d2 From 05e015f73c3b5c50c237d3d8e555e25cfa543a5c Mon Sep 17 00:00:00 2001 From: KONRAD Frederic Date: Thu, 21 Sep 2017 12:04:20 +0200 Subject: memory: avoid a name clash with access macro This avoids a name clash with the access macro on windows 64: make CHK version_gen.h CC aarch64-softmmu/memory.o /home/konrad/qemu/memory.c: In function 'access_with_adjusted_size': /home/konrad/qemu/memory.c:591:73: error: macro "access" passed 7 arguments, \ but takes just 2 (size - access_size - i) * 8, access_mask, attrs); ^ Signed-off-by: KONRAD Frederic Message-Id: <1505988260-8483-1-git-send-email-frederic.konrad@adacore.com> Signed-off-by: Paolo Bonzini --- memory.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/memory.c b/memory.c index b9920a6540..2b90117c60 100644 --- a/memory.c +++ b/memory.c @@ -560,13 +560,14 @@ static MemTxResult access_with_adjusted_size(hwaddr addr, unsigned size, unsigned access_size_min, unsigned access_size_max, - MemTxResult (*access)(MemoryRegion *mr, - hwaddr addr, - uint64_t *value, - unsigned size, - unsigned shift, - uint64_t mask, - MemTxAttrs attrs), + MemTxResult (*access_fn) + (MemoryRegion *mr, + hwaddr addr, + uint64_t *value, + unsigned size, + unsigned shift, + uint64_t mask, + MemTxAttrs attrs), MemoryRegion *mr, MemTxAttrs attrs) { @@ -587,12 +588,12 @@ static MemTxResult access_with_adjusted_size(hwaddr addr, access_mask = -1ULL >> (64 - access_size * 8); if (memory_region_big_endian(mr)) { for (i = 0; i < size; i += access_size) { - r |= access(mr, addr + i, value, access_size, + r |= access_fn(mr, addr + i, value, access_size, (size - access_size - i) * 8, access_mask, attrs); } } else { for (i = 0; i < size; i += access_size) { - r |= access(mr, addr + i, value, access_size, i * 8, + r |= access_fn(mr, addr + i, value, access_size, i * 8, access_mask, attrs); } } -- cgit v1.2.3-70-g09d2 From db81b9953761cac71906728fb3dfefce661ab903 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 21 Sep 2017 14:44:08 +0200 Subject: atomic: update documentation Signed-off-by: Paolo Bonzini --- docs/devel/atomics.txt | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/docs/devel/atomics.txt b/docs/devel/atomics.txt index 3ef5d85b1b..048e5f23cb 100644 --- a/docs/devel/atomics.txt +++ b/docs/devel/atomics.txt @@ -63,11 +63,22 @@ operations: typeof(*ptr) atomic_fetch_sub(ptr, val) typeof(*ptr) atomic_fetch_and(ptr, val) typeof(*ptr) atomic_fetch_or(ptr, val) + typeof(*ptr) atomic_fetch_xor(ptr, val) typeof(*ptr) atomic_xchg(ptr, val) typeof(*ptr) atomic_cmpxchg(ptr, old, new) all of which return the old value of *ptr. These operations are -polymorphic; they operate on any type that is as wide as an int. +polymorphic; they operate on any type that is as wide as a pointer. + +Similar operations return the new value of *ptr: + + typeof(*ptr) atomic_inc_fetch(ptr) + typeof(*ptr) atomic_dec_fetch(ptr) + typeof(*ptr) atomic_add_fetch(ptr, val) + typeof(*ptr) atomic_sub_fetch(ptr, val) + typeof(*ptr) atomic_and_fetch(ptr, val) + typeof(*ptr) atomic_or_fetch(ptr, val) + typeof(*ptr) atomic_xor_fetch(ptr, val) Sequentially consistent loads and stores can be done using: -- cgit v1.2.3-70-g09d2 From 447b0d0b9ee8a0ac216c3186e0f3c427a1001f0c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 21 Sep 2017 14:32:47 +0200 Subject: memory: avoid "resurrection" of dead FlatViews It's possible for address_space_get_flatview() as it currently stands to cause a use-after-free for the returned FlatView, if the reference count is incremented after the FlatView has been replaced by a writer: thread 1 thread 2 RCU thread ------------------------------------------------------------- rcu_read_lock read as->current_map set as->current_map flatview_unref '--> call_rcu flatview_ref [ref=1] rcu_read_unlock flatview_destroy Since FlatViews are not updated very often, we can just detect the situation using a new atomic op atomic_fetch_inc_nonzero, similar to Linux's atomic_inc_not_zero, which performs the refcount increment only if it hasn't already hit zero. This is similar to Linux commit de09a9771a53 ("CRED: Fix get_task_cred() and task_state() to not resurrect dead credentials", 2010-07-29). Signed-off-by: Paolo Bonzini --- docs/devel/atomics.txt | 1 + include/qemu/atomic.h | 8 ++++++++ memory.c | 12 ++++++++---- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/docs/devel/atomics.txt b/docs/devel/atomics.txt index 048e5f23cb..10c5fa37e8 100644 --- a/docs/devel/atomics.txt +++ b/docs/devel/atomics.txt @@ -64,6 +64,7 @@ operations: typeof(*ptr) atomic_fetch_and(ptr, val) typeof(*ptr) atomic_fetch_or(ptr, val) typeof(*ptr) atomic_fetch_xor(ptr, val) + typeof(*ptr) atomic_fetch_inc_nonzero(ptr) typeof(*ptr) atomic_xchg(ptr, val) typeof(*ptr) atomic_cmpxchg(ptr, old, new) diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h index b6b62fb771..d73c9e14d7 100644 --- a/include/qemu/atomic.h +++ b/include/qemu/atomic.h @@ -442,4 +442,12 @@ } while(0) #endif +#define atomic_fetch_inc_nonzero(ptr) ({ \ + typeof_strip_qual(*ptr) _oldn = atomic_read(ptr); \ + while (_oldn && atomic_cmpxchg(ptr, _oldn, _oldn + 1) != _oldn) { \ + _oldn = atomic_read(ptr); \ + } \ + _oldn; \ +}) + #endif /* QEMU_ATOMIC_H */ diff --git a/memory.c b/memory.c index 2b90117c60..51f54ab430 100644 --- a/memory.c +++ b/memory.c @@ -294,9 +294,9 @@ static void flatview_destroy(FlatView *view) g_free(view); } -static void flatview_ref(FlatView *view) +static bool flatview_ref(FlatView *view) { - atomic_inc(&view->ref); + return atomic_fetch_inc_nonzero(&view->ref) > 0; } static void flatview_unref(FlatView *view) @@ -773,8 +773,12 @@ static FlatView *address_space_get_flatview(AddressSpace *as) FlatView *view; rcu_read_lock(); - view = atomic_rcu_read(&as->current_map); - flatview_ref(view); + do { + view = atomic_rcu_read(&as->current_map); + /* If somebody has replaced as->current_map concurrently, + * flatview_ref returns false. + */ + } while (!flatview_ref(view)); rcu_read_unlock(); return view; } -- cgit v1.2.3-70-g09d2 From e76bb18f7e430e0c50fb38d051feacf268bd78f4 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 21 Sep 2017 18:50:53 +1000 Subject: exec: Explicitly export target AS from address_space_translate_internal This adds an AS** parameter to address_space_do_translate() to make it easier for the next patch to share FlatViews. This should cause no behavioural change. Signed-off-by: Alexey Kardashevskiy Message-Id: <20170921085110.25598-2-aik@ozlabs.ru> Signed-off-by: Paolo Bonzini --- exec.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/exec.c b/exec.c index a25a4c6018..fd8994b25d 100644 --- a/exec.c +++ b/exec.c @@ -476,7 +476,8 @@ static MemoryRegionSection address_space_do_translate(AddressSpace *as, hwaddr *xlat, hwaddr *plen, bool is_write, - bool is_mmio) + bool is_mmio, + AddressSpace **target_as) { IOMMUTLBEntry iotlb; MemoryRegionSection *section; @@ -503,6 +504,7 @@ static MemoryRegionSection address_space_do_translate(AddressSpace *as, } as = iotlb.target_as; + *target_as = iotlb.target_as; } *xlat = addr; @@ -525,7 +527,7 @@ IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr, /* This can never be MMIO. */ section = address_space_do_translate(as, addr, &xlat, &plen, - is_write, false); + is_write, false, &as); /* Illegal translation */ if (section.mr == &io_mem_unassigned) { @@ -548,7 +550,7 @@ IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr, plen -= 1; return (IOMMUTLBEntry) { - .target_as = section.address_space, + .target_as = as, .iova = addr & ~plen, .translated_addr = xlat & ~plen, .addr_mask = plen, @@ -569,7 +571,8 @@ MemoryRegion *address_space_translate(AddressSpace *as, hwaddr addr, MemoryRegionSection section; /* This can be MMIO, so setup MMIO bit. */ - section = address_space_do_translate(as, addr, xlat, plen, is_write, true); + section = address_space_do_translate(as, addr, xlat, plen, is_write, true, + &as); mr = section.mr; if (xen_enabled() && memory_access_is_direct(mr, is_write)) { -- cgit v1.2.3-70-g09d2 From 9a62e24f45bc97f8eaf198caf58906b47c50a8d5 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 21 Sep 2017 18:50:54 +1000 Subject: memory: Open code FlatView rendering We are going to share FlatView's between AddressSpace's and per-AS memory listeners won't suit the purpose anymore so open code the dispatch tree rendering. Since there is a good chance that dispatch_listener was the only listener, this avoids address_space_update_topology_pass() if there is no registered listeners; this should improve starting time. This should cause no behavioural change. Signed-off-by: Alexey Kardashevskiy Message-Id: <20170921085110.25598-3-aik@ozlabs.ru> Signed-off-by: Paolo Bonzini --- exec.c | 27 +++------------------------ include/exec/memory-internal.h | 6 ++++-- include/exec/memory.h | 1 - memory.c | 19 ++++++++++++++----- 4 files changed, 21 insertions(+), 32 deletions(-) diff --git a/exec.c b/exec.c index fd8994b25d..1626d254bb 100644 --- a/exec.c +++ b/exec.c @@ -1347,9 +1347,8 @@ static void register_multipage(AddressSpaceDispatch *d, phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index); } -static void mem_add(MemoryListener *listener, MemoryRegionSection *section) +void mem_add(AddressSpace *as, MemoryRegionSection *section) { - AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener); AddressSpaceDispatch *d = as->next_dispatch; MemoryRegionSection now = *section, remain = *section; Int128 page_size = int128_make64(TARGET_PAGE_SIZE); @@ -2673,9 +2672,8 @@ static void io_mem_init(void) NULL, UINT64_MAX); } -static void mem_begin(MemoryListener *listener) +void mem_begin(AddressSpace *as) { - AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener); AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1); uint16_t n; @@ -2699,9 +2697,8 @@ static void address_space_dispatch_free(AddressSpaceDispatch *d) g_free(d); } -static void mem_commit(MemoryListener *listener) +void mem_commit(AddressSpace *as) { - AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener); AddressSpaceDispatch *cur = as->dispatch; AddressSpaceDispatch *next = as->next_dispatch; @@ -2731,24 +2728,6 @@ static void tcg_commit(MemoryListener *listener) tlb_flush(cpuas->cpu); } -void address_space_init_dispatch(AddressSpace *as) -{ - as->dispatch = NULL; - as->dispatch_listener = (MemoryListener) { - .begin = mem_begin, - .commit = mem_commit, - .region_add = mem_add, - .region_nop = mem_add, - .priority = 0, - }; - memory_listener_register(&as->dispatch_listener, as); -} - -void address_space_unregister(AddressSpace *as) -{ - memory_listener_unregister(&as->dispatch_listener); -} - void address_space_destroy_dispatch(AddressSpace *as) { AddressSpaceDispatch *d = as->dispatch; diff --git a/include/exec/memory-internal.h b/include/exec/memory-internal.h index fb467acdba..9abde2f11c 100644 --- a/include/exec/memory-internal.h +++ b/include/exec/memory-internal.h @@ -22,8 +22,6 @@ #ifndef CONFIG_USER_ONLY typedef struct AddressSpaceDispatch AddressSpaceDispatch; -void address_space_init_dispatch(AddressSpace *as); -void address_space_unregister(AddressSpace *as); void address_space_destroy_dispatch(AddressSpace *as); extern const MemoryRegionOps unassigned_mem_ops; @@ -31,5 +29,9 @@ extern const MemoryRegionOps unassigned_mem_ops; bool memory_region_access_valid(MemoryRegion *mr, hwaddr addr, unsigned size, bool is_write); +void mem_add(AddressSpace *as, MemoryRegionSection *section); +void mem_begin(AddressSpace *as); +void mem_commit(AddressSpace *as); + #endif #endif diff --git a/include/exec/memory.h b/include/exec/memory.h index 1dcd3122d7..9581f7a7db 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -318,7 +318,6 @@ struct AddressSpace { struct MemoryRegionIoeventfd *ioeventfds; struct AddressSpaceDispatch *dispatch; struct AddressSpaceDispatch *next_dispatch; - MemoryListener dispatch_listener; QTAILQ_HEAD(memory_listeners_as, MemoryListener) listeners; QTAILQ_ENTRY(AddressSpace) address_spaces_link; }; diff --git a/memory.c b/memory.c index 51f54ab430..3241e449e7 100644 --- a/memory.c +++ b/memory.c @@ -884,14 +884,24 @@ static void address_space_update_topology_pass(AddressSpace *as, } } - static void address_space_update_topology(AddressSpace *as) { FlatView *old_view = address_space_get_flatview(as); FlatView *new_view = generate_memory_topology(as->root); + int i; - address_space_update_topology_pass(as, old_view, new_view, false); - address_space_update_topology_pass(as, old_view, new_view, true); + mem_begin(as); + for (i = 0; i < new_view->nr; i++) { + MemoryRegionSection mrs = + section_from_flat_range(&new_view->ranges[i], as); + mem_add(as, &mrs); + } + mem_commit(as); + + if (!QTAILQ_EMPTY(&as->listeners)) { + address_space_update_topology_pass(as, old_view, new_view, false); + address_space_update_topology_pass(as, old_view, new_view, true); + } /* Writes are protected by the BQL. */ atomic_rcu_set(&as->current_map, new_view); @@ -2626,7 +2636,7 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name) QTAILQ_INIT(&as->listeners); QTAILQ_INSERT_TAIL(&address_spaces, as, address_spaces_link); as->name = g_strdup(name ? name : "anonymous"); - address_space_init_dispatch(as); + as->dispatch = NULL; memory_region_update_pending |= root->enabled; memory_region_transaction_commit(); } @@ -2677,7 +2687,6 @@ void address_space_destroy(AddressSpace *as) as->root = NULL; memory_region_transaction_commit(); QTAILQ_REMOVE(&address_spaces, as, address_spaces_link); - address_space_unregister(as); /* At this point, as->dispatch and as->current_map are dummy * entries that the guest should never use. Wait for the old -- cgit v1.2.3-70-g09d2 From cc94cd6d36602d976a5e7bc29134d1eaefb4102e Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 21 Sep 2017 18:50:55 +1000 Subject: memory: Move FlatView allocation to a helper This moves a FlatView allocation and initialization to a helper. While we are nere, replace g_new with g_new0 to not to bother if we add new fields in the future. This should cause no behavioural change. Signed-off-by: Alexey Kardashevskiy Message-Id: <20170921085110.25598-4-aik@ozlabs.ru> Signed-off-by: Paolo Bonzini --- memory.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/memory.c b/memory.c index 3241e449e7..eec668eec7 100644 --- a/memory.c +++ b/memory.c @@ -258,12 +258,14 @@ static bool flatrange_equal(FlatRange *a, FlatRange *b) && a->readonly == b->readonly; } -static void flatview_init(FlatView *view) +static FlatView *flatview_new(void) { + FlatView *view; + + view = g_new0(FlatView, 1); view->ref = 1; - view->ranges = NULL; - view->nr = 0; - view->nr_allocated = 0; + + return view; } /* Insert a range into a given position. Caller is responsible for maintaining @@ -707,8 +709,7 @@ static FlatView *generate_memory_topology(MemoryRegion *mr) { FlatView *view; - view = g_new(FlatView, 1); - flatview_init(view); + view = flatview_new(); if (mr) { render_memory_region(view, mr, int128_zero(), @@ -2629,8 +2630,7 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name) as->ref_count = 1; as->root = root; as->malloced = false; - as->current_map = g_new(FlatView, 1); - flatview_init(as->current_map); + as->current_map = flatview_new(); as->ioeventfd_nb = 0; as->ioeventfds = NULL; QTAILQ_INIT(&as->listeners); -- cgit v1.2.3-70-g09d2 From 66a6df1dc6d5b28cc3e65db0d71683fbdddc6b62 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 21 Sep 2017 18:50:56 +1000 Subject: memory: Move AddressSpaceDispatch from AddressSpace to FlatView As we are going to share FlatView's between AddressSpace's, and AddressSpaceDispatch is a structure to perform quick lookup in FlatView, this moves ASD to FlatView. After previosly open coded ASD rendering, we can also remove as->next_dispatch as the new FlatView pointer is stored on a stack and set to an AS atomically. flatview_destroy() is executed under RCU instead of address_space_dispatch_free() now. This makes mem_begin/mem_commit to work with ASD and mem_add with FV as later on mem_add will be taking FV as an argument anyway. This should cause no behavioural change. Signed-off-by: Alexey Kardashevskiy Message-Id: <20170921085110.25598-5-aik@ozlabs.ru> Signed-off-by: Paolo Bonzini --- exec.c | 41 +++++++++++------------------------------ include/exec/memory-internal.h | 12 +++++++----- include/exec/memory.h | 2 -- memory.c | 31 ++++++++++++++++++++++++------- 4 files changed, 42 insertions(+), 44 deletions(-) diff --git a/exec.c b/exec.c index 1626d254bb..afd64127e6 100644 --- a/exec.c +++ b/exec.c @@ -187,8 +187,6 @@ typedef struct PhysPageMap { } PhysPageMap; struct AddressSpaceDispatch { - struct rcu_head rcu; - MemoryRegionSection *mru_section; /* This is a multi-level map on the physical address space. * The bottom level has pointers to MemoryRegionSections. @@ -485,7 +483,7 @@ static MemoryRegionSection address_space_do_translate(AddressSpace *as, IOMMUMemoryRegionClass *imrc; for (;;) { - AddressSpaceDispatch *d = atomic_rcu_read(&as->dispatch); + AddressSpaceDispatch *d = address_space_to_dispatch(as); section = address_space_translate_internal(d, addr, &addr, plen, is_mmio); iommu_mr = memory_region_get_iommu(section->mr); @@ -1222,7 +1220,7 @@ hwaddr memory_region_section_get_iotlb(CPUState *cpu, } else { AddressSpaceDispatch *d; - d = atomic_rcu_read(§ion->address_space->dispatch); + d = address_space_to_dispatch(section->address_space); iotlb = section - d->map.sections; iotlb += xlat; } @@ -1347,9 +1345,9 @@ static void register_multipage(AddressSpaceDispatch *d, phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index); } -void mem_add(AddressSpace *as, MemoryRegionSection *section) +void mem_add(AddressSpace *as, FlatView *fv, MemoryRegionSection *section) { - AddressSpaceDispatch *d = as->next_dispatch; + AddressSpaceDispatch *d = flatview_to_dispatch(fv); MemoryRegionSection now = *section, remain = *section; Int128 page_size = int128_make64(TARGET_PAGE_SIZE); @@ -2672,7 +2670,7 @@ static void io_mem_init(void) NULL, UINT64_MAX); } -void mem_begin(AddressSpace *as) +AddressSpaceDispatch *mem_begin(AddressSpace *as) { AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1); uint16_t n; @@ -2688,26 +2686,19 @@ void mem_begin(AddressSpace *as) d->phys_map = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 }; d->as = as; - as->next_dispatch = d; + + return d; } -static void address_space_dispatch_free(AddressSpaceDispatch *d) +void address_space_dispatch_free(AddressSpaceDispatch *d) { phys_sections_free(&d->map); g_free(d); } -void mem_commit(AddressSpace *as) +void mem_commit(AddressSpaceDispatch *d) { - AddressSpaceDispatch *cur = as->dispatch; - AddressSpaceDispatch *next = as->next_dispatch; - - phys_page_compact_all(next, next->map.nodes_nb); - - atomic_rcu_set(&as->dispatch, next); - if (cur) { - call_rcu(cur, address_space_dispatch_free, rcu); - } + phys_page_compact_all(d, d->map.nodes_nb); } static void tcg_commit(MemoryListener *listener) @@ -2723,21 +2714,11 @@ static void tcg_commit(MemoryListener *listener) * We reload the dispatch pointer now because cpu_reloading_memory_map() * may have split the RCU critical section. */ - d = atomic_rcu_read(&cpuas->as->dispatch); + d = address_space_to_dispatch(cpuas->as); atomic_rcu_set(&cpuas->memory_dispatch, d); tlb_flush(cpuas->cpu); } -void address_space_destroy_dispatch(AddressSpace *as) -{ - AddressSpaceDispatch *d = as->dispatch; - - atomic_rcu_set(&as->dispatch, NULL); - if (d) { - call_rcu(d, address_space_dispatch_free, rcu); - } -} - static void memory_map_init(void) { system_memory = g_malloc(sizeof(*system_memory)); diff --git a/include/exec/memory-internal.h b/include/exec/memory-internal.h index 9abde2f11c..6e08eda256 100644 --- a/include/exec/memory-internal.h +++ b/include/exec/memory-internal.h @@ -22,16 +22,18 @@ #ifndef CONFIG_USER_ONLY typedef struct AddressSpaceDispatch AddressSpaceDispatch; -void address_space_destroy_dispatch(AddressSpace *as); - extern const MemoryRegionOps unassigned_mem_ops; bool memory_region_access_valid(MemoryRegion *mr, hwaddr addr, unsigned size, bool is_write); -void mem_add(AddressSpace *as, MemoryRegionSection *section); -void mem_begin(AddressSpace *as); -void mem_commit(AddressSpace *as); +void mem_add(AddressSpace *as, FlatView *fv, MemoryRegionSection *section); +AddressSpaceDispatch *mem_begin(AddressSpace *as); +void mem_commit(AddressSpaceDispatch *d); + +AddressSpaceDispatch *address_space_to_dispatch(AddressSpace *as); +AddressSpaceDispatch *flatview_to_dispatch(FlatView *fv); +void address_space_dispatch_free(AddressSpaceDispatch *d); #endif #endif diff --git a/include/exec/memory.h b/include/exec/memory.h index 9581f7a7db..2346f8b863 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -316,8 +316,6 @@ struct AddressSpace { int ioeventfd_nb; struct MemoryRegionIoeventfd *ioeventfds; - struct AddressSpaceDispatch *dispatch; - struct AddressSpaceDispatch *next_dispatch; QTAILQ_HEAD(memory_listeners_as, MemoryListener) listeners; QTAILQ_ENTRY(AddressSpace) address_spaces_link; }; diff --git a/memory.c b/memory.c index eec668eec7..962e9b961f 100644 --- a/memory.c +++ b/memory.c @@ -229,6 +229,7 @@ struct FlatView { FlatRange *ranges; unsigned nr; unsigned nr_allocated; + struct AddressSpaceDispatch *dispatch; }; typedef struct AddressSpaceOps AddressSpaceOps; @@ -289,6 +290,9 @@ static void flatview_destroy(FlatView *view) { int i; + if (view->dispatch) { + address_space_dispatch_free(view->dispatch); + } for (i = 0; i < view->nr; i++) { memory_region_unref(view->ranges[i].mr); } @@ -304,10 +308,25 @@ static bool flatview_ref(FlatView *view) static void flatview_unref(FlatView *view) { if (atomic_fetch_dec(&view->ref) == 1) { - flatview_destroy(view); + call_rcu(view, flatview_destroy, rcu); } } +static FlatView *address_space_to_flatview(AddressSpace *as) +{ + return atomic_rcu_read(&as->current_map); +} + +AddressSpaceDispatch *flatview_to_dispatch(FlatView *fv) +{ + return fv->dispatch; +} + +AddressSpaceDispatch *address_space_to_dispatch(AddressSpace *as) +{ + return flatview_to_dispatch(address_space_to_flatview(as)); +} + static bool can_merge(FlatRange *r1, FlatRange *r2) { return int128_eq(addrrange_end(r1->addr), r2->addr.start) @@ -891,13 +910,13 @@ static void address_space_update_topology(AddressSpace *as) FlatView *new_view = generate_memory_topology(as->root); int i; - mem_begin(as); + new_view->dispatch = mem_begin(as); for (i = 0; i < new_view->nr; i++) { MemoryRegionSection mrs = section_from_flat_range(&new_view->ranges[i], as); - mem_add(as, &mrs); + mem_add(as, new_view, &mrs); } - mem_commit(as); + mem_commit(new_view->dispatch); if (!QTAILQ_EMPTY(&as->listeners)) { address_space_update_topology_pass(as, old_view, new_view, false); @@ -906,7 +925,7 @@ static void address_space_update_topology(AddressSpace *as) /* Writes are protected by the BQL. */ atomic_rcu_set(&as->current_map, new_view); - call_rcu(old_view, flatview_unref, rcu); + flatview_unref(old_view); /* Note that all the old MemoryRegions are still alive up to this * point. This relieves most MemoryListeners from the need to @@ -2636,7 +2655,6 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name) QTAILQ_INIT(&as->listeners); QTAILQ_INSERT_TAIL(&address_spaces, as, address_spaces_link); as->name = g_strdup(name ? name : "anonymous"); - as->dispatch = NULL; memory_region_update_pending |= root->enabled; memory_region_transaction_commit(); } @@ -2645,7 +2663,6 @@ static void do_address_space_destroy(AddressSpace *as) { bool do_free = as->malloced; - address_space_destroy_dispatch(as); assert(QTAILQ_EMPTY(&as->listeners)); flatview_unref(as->current_map); -- cgit v1.2.3-70-g09d2 From c7752523787dc148f5ee976162e80ab594c386a1 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 21 Sep 2017 18:50:57 +1000 Subject: memory: Remove AddressSpace pointer from AddressSpaceDispatch AS in ASD is only used to pass AS from mem_begin() to register_subpage() to store it in MemoryRegionSection, we can do this directly now. This should cause no behavioural change. Signed-off-by: Alexey Kardashevskiy Message-Id: <20170921085110.25598-6-aik@ozlabs.ru> Signed-off-by: Paolo Bonzini --- exec.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/exec.c b/exec.c index afd64127e6..a54dde7835 100644 --- a/exec.c +++ b/exec.c @@ -193,7 +193,6 @@ struct AddressSpaceDispatch { */ PhysPageEntry phys_map; PhysPageMap map; - AddressSpace *as; }; #define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK) @@ -1303,7 +1302,8 @@ static void phys_sections_free(PhysPageMap *map) g_free(map->nodes); } -static void register_subpage(AddressSpaceDispatch *d, MemoryRegionSection *section) +static void register_subpage(AddressSpace *as, AddressSpaceDispatch *d, + MemoryRegionSection *section) { subpage_t *subpage; hwaddr base = section->offset_within_address_space @@ -1318,8 +1318,8 @@ static void register_subpage(AddressSpaceDispatch *d, MemoryRegionSection *secti assert(existing->mr->subpage || existing->mr == &io_mem_unassigned); if (!(existing->mr->subpage)) { - subpage = subpage_init(d->as, base); - subsection.address_space = d->as; + subpage = subpage_init(as, base); + subsection.address_space = as; subsection.mr = &subpage->iomem; phys_page_set(d, base >> TARGET_PAGE_BITS, 1, phys_section_add(&d->map, &subsection)); @@ -1356,7 +1356,7 @@ void mem_add(AddressSpace *as, FlatView *fv, MemoryRegionSection *section) - now.offset_within_address_space; now.size = int128_min(int128_make64(left), now.size); - register_subpage(d, &now); + register_subpage(as, d, &now); } else { now.size = int128_zero(); } @@ -1366,10 +1366,10 @@ void mem_add(AddressSpace *as, FlatView *fv, MemoryRegionSection *section) remain.offset_within_region += int128_get64(now.size); now = remain; if (int128_lt(remain.size, page_size)) { - register_subpage(d, &now); + register_subpage(as, d, &now); } else if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) { now.size = page_size; - register_subpage(d, &now); + register_subpage(as, d, &now); } else { now.size = int128_and(now.size, int128_neg(page_size)); register_multipage(d, &now); @@ -2685,7 +2685,6 @@ AddressSpaceDispatch *mem_begin(AddressSpace *as) assert(n == PHYS_SECTION_WATCH); d->phys_map = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 }; - d->as = as; return d; } -- cgit v1.2.3-70-g09d2 From 166206845f7fd75e720e6feea0bb01957c8da07f Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 21 Sep 2017 18:50:58 +1000 Subject: memory: Switch memory from using AddressSpace to FlatView FlatView's will be shared between AddressSpace's and subpage_t and MemoryRegionSection cannot store AS anymore, hence this change. In particular, for: typedef struct subpage_t { MemoryRegion iomem; - AddressSpace *as; + FlatView *fv; hwaddr base; uint16_t sub_section[]; } subpage_t; struct MemoryRegionSection { MemoryRegion *mr; - AddressSpace *address_space; + FlatView *fv; hwaddr offset_within_region; Int128 size; hwaddr offset_within_address_space; bool readonly; }; This should cause no behavioural change. Signed-off-by: Alexey Kardashevskiy Message-Id: <20170921085110.25598-7-aik@ozlabs.ru> Signed-off-by: Paolo Bonzini --- exec.c | 180 ++++++++++++++++++++++++----------------- hw/intc/openpic_kvm.c | 2 +- include/exec/memory-internal.h | 2 +- include/exec/memory.h | 51 ++++++++---- memory.c | 33 ++++---- 5 files changed, 159 insertions(+), 109 deletions(-) diff --git a/exec.c b/exec.c index a54dde7835..d2b9f60494 100644 --- a/exec.c +++ b/exec.c @@ -198,7 +198,7 @@ struct AddressSpaceDispatch { #define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK) typedef struct subpage_t { MemoryRegion iomem; - AddressSpace *as; + FlatView *fv; hwaddr base; uint16_t sub_section[]; } subpage_t; @@ -468,13 +468,13 @@ address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *x } /* Called from RCU critical section */ -static MemoryRegionSection address_space_do_translate(AddressSpace *as, - hwaddr addr, - hwaddr *xlat, - hwaddr *plen, - bool is_write, - bool is_mmio, - AddressSpace **target_as) +static MemoryRegionSection flatview_do_translate(FlatView *fv, + hwaddr addr, + hwaddr *xlat, + hwaddr *plen, + bool is_write, + bool is_mmio, + AddressSpace **target_as) { IOMMUTLBEntry iotlb; MemoryRegionSection *section; @@ -482,8 +482,9 @@ static MemoryRegionSection address_space_do_translate(AddressSpace *as, IOMMUMemoryRegionClass *imrc; for (;;) { - AddressSpaceDispatch *d = address_space_to_dispatch(as); - section = address_space_translate_internal(d, addr, &addr, plen, is_mmio); + section = address_space_translate_internal( + flatview_to_dispatch(fv), addr, &addr, + plen, is_mmio); iommu_mr = memory_region_get_iommu(section->mr); if (!iommu_mr) { @@ -500,7 +501,7 @@ static MemoryRegionSection address_space_do_translate(AddressSpace *as, goto translate_fail; } - as = iotlb.target_as; + fv = address_space_to_flatview(iotlb.target_as); *target_as = iotlb.target_as; } @@ -523,8 +524,8 @@ IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr, plen = (hwaddr)-1; /* This can never be MMIO. */ - section = address_space_do_translate(as, addr, &xlat, &plen, - is_write, false, &as); + section = flatview_do_translate(address_space_to_flatview(as), addr, + &xlat, &plen, is_write, false, &as); /* Illegal translation */ if (section.mr == &io_mem_unassigned) { @@ -560,16 +561,15 @@ iotlb_fail: } /* Called from RCU critical section */ -MemoryRegion *address_space_translate(AddressSpace *as, hwaddr addr, - hwaddr *xlat, hwaddr *plen, - bool is_write) +MemoryRegion *flatview_translate(FlatView *fv, hwaddr addr, hwaddr *xlat, + hwaddr *plen, bool is_write) { MemoryRegion *mr; MemoryRegionSection section; + AddressSpace *as = NULL; /* This can be MMIO, so setup MMIO bit. */ - section = address_space_do_translate(as, addr, xlat, plen, is_write, true, - &as); + section = flatview_do_translate(fv, addr, xlat, plen, is_write, true, &as); mr = section.mr; if (xen_enabled() && memory_access_is_direct(mr, is_write)) { @@ -1219,7 +1219,7 @@ hwaddr memory_region_section_get_iotlb(CPUState *cpu, } else { AddressSpaceDispatch *d; - d = address_space_to_dispatch(section->address_space); + d = flatview_to_dispatch(section->fv); iotlb = section - d->map.sections; iotlb += xlat; } @@ -1245,7 +1245,7 @@ hwaddr memory_region_section_get_iotlb(CPUState *cpu, static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end, uint16_t section); -static subpage_t *subpage_init(AddressSpace *as, hwaddr base); +static subpage_t *subpage_init(FlatView *fv, hwaddr base); static void *(*phys_mem_alloc)(size_t size, uint64_t *align) = qemu_anon_ram_alloc; @@ -1302,7 +1302,7 @@ static void phys_sections_free(PhysPageMap *map) g_free(map->nodes); } -static void register_subpage(AddressSpace *as, AddressSpaceDispatch *d, +static void register_subpage(FlatView *fv, AddressSpaceDispatch *d, MemoryRegionSection *section) { subpage_t *subpage; @@ -1318,8 +1318,8 @@ static void register_subpage(AddressSpace *as, AddressSpaceDispatch *d, assert(existing->mr->subpage || existing->mr == &io_mem_unassigned); if (!(existing->mr->subpage)) { - subpage = subpage_init(as, base); - subsection.address_space = as; + subpage = subpage_init(fv, base); + subsection.fv = fv; subsection.mr = &subpage->iomem; phys_page_set(d, base >> TARGET_PAGE_BITS, 1, phys_section_add(&d->map, &subsection)); @@ -1345,7 +1345,7 @@ static void register_multipage(AddressSpaceDispatch *d, phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index); } -void mem_add(AddressSpace *as, FlatView *fv, MemoryRegionSection *section) +void mem_add(FlatView *fv, MemoryRegionSection *section) { AddressSpaceDispatch *d = flatview_to_dispatch(fv); MemoryRegionSection now = *section, remain = *section; @@ -1356,7 +1356,7 @@ void mem_add(AddressSpace *as, FlatView *fv, MemoryRegionSection *section) - now.offset_within_address_space; now.size = int128_min(int128_make64(left), now.size); - register_subpage(as, d, &now); + register_subpage(fv, d, &now); } else { now.size = int128_zero(); } @@ -1366,10 +1366,10 @@ void mem_add(AddressSpace *as, FlatView *fv, MemoryRegionSection *section) remain.offset_within_region += int128_get64(now.size); now = remain; if (int128_lt(remain.size, page_size)) { - register_subpage(as, d, &now); + register_subpage(fv, d, &now); } else if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) { now.size = page_size; - register_subpage(as, d, &now); + register_subpage(fv, d, &now); } else { now.size = int128_and(now.size, int128_neg(page_size)); register_multipage(d, &now); @@ -2500,6 +2500,11 @@ static const MemoryRegionOps watch_mem_ops = { .endianness = DEVICE_NATIVE_ENDIAN, }; +static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs, + const uint8_t *buf, int len); +static bool flatview_access_valid(FlatView *fv, hwaddr addr, int len, + bool is_write); + static MemTxResult subpage_read(void *opaque, hwaddr addr, uint64_t *data, unsigned len, MemTxAttrs attrs) { @@ -2511,8 +2516,7 @@ static MemTxResult subpage_read(void *opaque, hwaddr addr, uint64_t *data, printf("%s: subpage %p len %u addr " TARGET_FMT_plx "\n", __func__, subpage, len, addr); #endif - res = address_space_read(subpage->as, addr + subpage->base, - attrs, buf, len); + res = flatview_read(subpage->fv, addr + subpage->base, attrs, buf, len); if (res) { return res; } @@ -2561,8 +2565,7 @@ static MemTxResult subpage_write(void *opaque, hwaddr addr, default: abort(); } - return address_space_write(subpage->as, addr + subpage->base, - attrs, buf, len); + return flatview_write(subpage->fv, addr + subpage->base, attrs, buf, len); } static bool subpage_accepts(void *opaque, hwaddr addr, @@ -2574,8 +2577,8 @@ static bool subpage_accepts(void *opaque, hwaddr addr, __func__, subpage, is_write ? 'w' : 'r', len, addr); #endif - return address_space_access_valid(subpage->as, addr + subpage->base, - len, is_write); + return flatview_access_valid(subpage->fv, addr + subpage->base, + len, is_write); } static const MemoryRegionOps subpage_ops = { @@ -2609,12 +2612,12 @@ static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end, return 0; } -static subpage_t *subpage_init(AddressSpace *as, hwaddr base) +static subpage_t *subpage_init(FlatView *fv, hwaddr base) { subpage_t *mmio; mmio = g_malloc0(sizeof(subpage_t) + TARGET_PAGE_SIZE * sizeof(uint16_t)); - mmio->as = as; + mmio->fv = fv; mmio->base = base; memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio, NULL, TARGET_PAGE_SIZE); @@ -2628,12 +2631,11 @@ static subpage_t *subpage_init(AddressSpace *as, hwaddr base) return mmio; } -static uint16_t dummy_section(PhysPageMap *map, AddressSpace *as, - MemoryRegion *mr) +static uint16_t dummy_section(PhysPageMap *map, FlatView *fv, MemoryRegion *mr) { - assert(as); + assert(fv); MemoryRegionSection section = { - .address_space = as, + .fv = fv, .mr = mr, .offset_within_address_space = 0, .offset_within_region = 0, @@ -2672,16 +2674,17 @@ static void io_mem_init(void) AddressSpaceDispatch *mem_begin(AddressSpace *as) { + FlatView *fv = address_space_to_flatview(as); AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1); uint16_t n; - n = dummy_section(&d->map, as, &io_mem_unassigned); + n = dummy_section(&d->map, fv, &io_mem_unassigned); assert(n == PHYS_SECTION_UNASSIGNED); - n = dummy_section(&d->map, as, &io_mem_notdirty); + n = dummy_section(&d->map, fv, &io_mem_notdirty); assert(n == PHYS_SECTION_NOTDIRTY); - n = dummy_section(&d->map, as, &io_mem_rom); + n = dummy_section(&d->map, fv, &io_mem_rom); assert(n == PHYS_SECTION_ROM); - n = dummy_section(&d->map, as, &io_mem_watch); + n = dummy_section(&d->map, fv, &io_mem_watch); assert(n == PHYS_SECTION_WATCH); d->phys_map = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 }; @@ -2861,11 +2864,11 @@ static bool prepare_mmio_access(MemoryRegion *mr) } /* Called within RCU critical section. */ -static MemTxResult address_space_write_continue(AddressSpace *as, hwaddr addr, - MemTxAttrs attrs, - const uint8_t *buf, - int len, hwaddr addr1, - hwaddr l, MemoryRegion *mr) +static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr, + MemTxAttrs attrs, + const uint8_t *buf, + int len, hwaddr addr1, + hwaddr l, MemoryRegion *mr) { uint8_t *ptr; uint64_t val; @@ -2927,14 +2930,14 @@ static MemTxResult address_space_write_continue(AddressSpace *as, hwaddr addr, } l = len; - mr = address_space_translate(as, addr, &addr1, &l, true); + mr = flatview_translate(fv, addr, &addr1, &l, true); } return result; } -MemTxResult address_space_write(AddressSpace *as, hwaddr addr, MemTxAttrs attrs, - const uint8_t *buf, int len) +static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs, + const uint8_t *buf, int len) { hwaddr l; hwaddr addr1; @@ -2944,20 +2947,27 @@ MemTxResult address_space_write(AddressSpace *as, hwaddr addr, MemTxAttrs attrs, if (len > 0) { rcu_read_lock(); l = len; - mr = address_space_translate(as, addr, &addr1, &l, true); - result = address_space_write_continue(as, addr, attrs, buf, len, - addr1, l, mr); + mr = flatview_translate(fv, addr, &addr1, &l, true); + result = flatview_write_continue(fv, addr, attrs, buf, len, + addr1, l, mr); rcu_read_unlock(); } return result; } +MemTxResult address_space_write(AddressSpace *as, hwaddr addr, + MemTxAttrs attrs, + const uint8_t *buf, int len) +{ + return flatview_write(address_space_to_flatview(as), addr, attrs, buf, len); +} + /* Called within RCU critical section. */ -MemTxResult address_space_read_continue(AddressSpace *as, hwaddr addr, - MemTxAttrs attrs, uint8_t *buf, - int len, hwaddr addr1, hwaddr l, - MemoryRegion *mr) +MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr, + MemTxAttrs attrs, uint8_t *buf, + int len, hwaddr addr1, hwaddr l, + MemoryRegion *mr) { uint8_t *ptr; uint64_t val; @@ -3017,14 +3027,14 @@ MemTxResult address_space_read_continue(AddressSpace *as, hwaddr addr, } l = len; - mr = address_space_translate(as, addr, &addr1, &l, false); + mr = flatview_translate(fv, addr, &addr1, &l, false); } return result; } -MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr, - MemTxAttrs attrs, uint8_t *buf, int len) +MemTxResult flatview_read_full(FlatView *fv, hwaddr addr, + MemTxAttrs attrs, uint8_t *buf, int len) { hwaddr l; hwaddr addr1; @@ -3034,25 +3044,33 @@ MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr, if (len > 0) { rcu_read_lock(); l = len; - mr = address_space_translate(as, addr, &addr1, &l, false); - result = address_space_read_continue(as, addr, attrs, buf, len, - addr1, l, mr); + mr = flatview_translate(fv, addr, &addr1, &l, false); + result = flatview_read_continue(fv, addr, attrs, buf, len, + addr1, l, mr); rcu_read_unlock(); } return result; } -MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs, - uint8_t *buf, int len, bool is_write) +static MemTxResult flatview_rw(FlatView *fv, hwaddr addr, MemTxAttrs attrs, + uint8_t *buf, int len, bool is_write) { if (is_write) { - return address_space_write(as, addr, attrs, (uint8_t *)buf, len); + return flatview_write(fv, addr, attrs, (uint8_t *)buf, len); } else { - return address_space_read(as, addr, attrs, (uint8_t *)buf, len); + return flatview_read(fv, addr, attrs, (uint8_t *)buf, len); } } +MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, + MemTxAttrs attrs, uint8_t *buf, + int len, bool is_write) +{ + return flatview_rw(address_space_to_flatview(as), + addr, attrs, buf, len, is_write); +} + void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf, int len, int is_write) { @@ -3210,7 +3228,8 @@ static void cpu_notify_map_clients(void) qemu_mutex_unlock(&map_client_list_lock); } -bool address_space_access_valid(AddressSpace *as, hwaddr addr, int len, bool is_write) +static bool flatview_access_valid(FlatView *fv, hwaddr addr, int len, + bool is_write) { MemoryRegion *mr; hwaddr l, xlat; @@ -3218,7 +3237,7 @@ bool address_space_access_valid(AddressSpace *as, hwaddr addr, int len, bool is_ rcu_read_lock(); while (len > 0) { l = len; - mr = address_space_translate(as, addr, &xlat, &l, is_write); + mr = flatview_translate(fv, addr, &xlat, &l, is_write); if (!memory_access_is_direct(mr, is_write)) { l = memory_access_size(mr, l, addr); if (!memory_region_access_valid(mr, xlat, l, is_write)) { @@ -3234,8 +3253,16 @@ bool address_space_access_valid(AddressSpace *as, hwaddr addr, int len, bool is_ return true; } +bool address_space_access_valid(AddressSpace *as, hwaddr addr, + int len, bool is_write) +{ + return flatview_access_valid(address_space_to_flatview(as), + addr, len, is_write); +} + static hwaddr -address_space_extend_translation(AddressSpace *as, hwaddr addr, hwaddr target_len, +flatview_extend_translation(FlatView *fv, hwaddr addr, + hwaddr target_len, MemoryRegion *mr, hwaddr base, hwaddr len, bool is_write) { @@ -3252,7 +3279,8 @@ address_space_extend_translation(AddressSpace *as, hwaddr addr, hwaddr target_le } len = target_len; - this_mr = address_space_translate(as, addr, &xlat, &len, is_write); + this_mr = flatview_translate(fv, addr, &xlat, + &len, is_write); if (this_mr != mr || xlat != base + done) { return done; } @@ -3275,6 +3303,7 @@ void *address_space_map(AddressSpace *as, hwaddr l, xlat; MemoryRegion *mr; void *ptr; + FlatView *fv = address_space_to_flatview(as); if (len == 0) { return NULL; @@ -3282,7 +3311,7 @@ void *address_space_map(AddressSpace *as, l = len; rcu_read_lock(); - mr = address_space_translate(as, addr, &xlat, &l, is_write); + mr = flatview_translate(fv, addr, &xlat, &l, is_write); if (!memory_access_is_direct(mr, is_write)) { if (atomic_xchg(&bounce.in_use, true)) { @@ -3298,7 +3327,7 @@ void *address_space_map(AddressSpace *as, memory_region_ref(mr); bounce.mr = mr; if (!is_write) { - address_space_read(as, addr, MEMTXATTRS_UNSPECIFIED, + flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED, bounce.buffer, l); } @@ -3309,7 +3338,8 @@ void *address_space_map(AddressSpace *as, memory_region_ref(mr); - *plen = address_space_extend_translation(as, addr, len, mr, xlat, l, is_write); + *plen = flatview_extend_translation(fv, addr, len, mr, xlat, + l, is_write); ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen, true); rcu_read_unlock(); diff --git a/hw/intc/openpic_kvm.c b/hw/intc/openpic_kvm.c index 0518e017c4..fa83420254 100644 --- a/hw/intc/openpic_kvm.c +++ b/hw/intc/openpic_kvm.c @@ -124,7 +124,7 @@ static void kvm_openpic_region_add(MemoryListener *listener, uint64_t reg_base; int ret; - if (section->address_space != &address_space_memory) { + if (section->fv != address_space_to_flatview(&address_space_memory)) { abort(); } diff --git a/include/exec/memory-internal.h b/include/exec/memory-internal.h index 6e08eda256..1cf8ad9869 100644 --- a/include/exec/memory-internal.h +++ b/include/exec/memory-internal.h @@ -27,7 +27,7 @@ extern const MemoryRegionOps unassigned_mem_ops; bool memory_region_access_valid(MemoryRegion *mr, hwaddr addr, unsigned size, bool is_write); -void mem_add(AddressSpace *as, FlatView *fv, MemoryRegionSection *section); +void mem_add(FlatView *fv, MemoryRegionSection *section); AddressSpaceDispatch *mem_begin(AddressSpace *as); void mem_commit(AddressSpaceDispatch *d); diff --git a/include/exec/memory.h b/include/exec/memory.h index 2346f8b863..7816e5d655 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -48,6 +48,7 @@ typedef struct MemoryRegionOps MemoryRegionOps; typedef struct MemoryRegionMmio MemoryRegionMmio; +typedef struct FlatView FlatView; struct MemoryRegionMmio { CPUReadMemoryFunc *read[3]; @@ -320,6 +321,8 @@ struct AddressSpace { QTAILQ_ENTRY(AddressSpace) address_spaces_link; }; +FlatView *address_space_to_flatview(AddressSpace *as); + /** * MemoryRegionSection: describes a fragment of a #MemoryRegion * @@ -333,7 +336,7 @@ struct AddressSpace { */ struct MemoryRegionSection { MemoryRegion *mr; - AddressSpace *address_space; + FlatView *fv; hwaddr offset_within_region; Int128 size; hwaddr offset_within_address_space; @@ -1842,9 +1845,17 @@ IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr, * @len: pointer to length * @is_write: indicates the transfer direction */ -MemoryRegion *address_space_translate(AddressSpace *as, hwaddr addr, - hwaddr *xlat, hwaddr *len, - bool is_write); +MemoryRegion *flatview_translate(FlatView *fv, + hwaddr addr, hwaddr *xlat, + hwaddr *len, bool is_write); + +static inline MemoryRegion *address_space_translate(AddressSpace *as, + hwaddr addr, hwaddr *xlat, + hwaddr *len, bool is_write) +{ + return flatview_translate(address_space_to_flatview(as), + addr, xlat, len, is_write); +} /* address_space_access_valid: check for validity of accessing an address * space range @@ -1895,12 +1906,13 @@ void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len, /* Internal functions, part of the implementation of address_space_read. */ -MemTxResult address_space_read_continue(AddressSpace *as, hwaddr addr, - MemTxAttrs attrs, uint8_t *buf, - int len, hwaddr addr1, hwaddr l, - MemoryRegion *mr); -MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr, - MemTxAttrs attrs, uint8_t *buf, int len); +MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr, + MemTxAttrs attrs, uint8_t *buf, + int len, hwaddr addr1, hwaddr l, + MemoryRegion *mr); + +MemTxResult flatview_read_full(FlatView *fv, hwaddr addr, + MemTxAttrs attrs, uint8_t *buf, int len); void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr); static inline bool memory_access_is_direct(MemoryRegion *mr, bool is_write) @@ -1927,8 +1939,8 @@ static inline bool memory_access_is_direct(MemoryRegion *mr, bool is_write) * @buf: buffer with the data transferred */ static inline __attribute__((__always_inline__)) -MemTxResult address_space_read(AddressSpace *as, hwaddr addr, MemTxAttrs attrs, - uint8_t *buf, int len) +MemTxResult flatview_read(FlatView *fv, hwaddr addr, MemTxAttrs attrs, + uint8_t *buf, int len) { MemTxResult result = MEMTX_OK; hwaddr l, addr1; @@ -1939,22 +1951,29 @@ MemTxResult address_space_read(AddressSpace *as, hwaddr addr, MemTxAttrs attrs, if (len) { rcu_read_lock(); l = len; - mr = address_space_translate(as, addr, &addr1, &l, false); + mr = flatview_translate(fv, addr, &addr1, &l, false); if (len == l && memory_access_is_direct(mr, false)) { ptr = qemu_map_ram_ptr(mr->ram_block, addr1); memcpy(buf, ptr, len); } else { - result = address_space_read_continue(as, addr, attrs, buf, len, - addr1, l, mr); + result = flatview_read_continue(fv, addr, attrs, buf, len, + addr1, l, mr); } rcu_read_unlock(); } } else { - result = address_space_read_full(as, addr, attrs, buf, len); + result = flatview_read_full(fv, addr, attrs, buf, len); } return result; } +static inline MemTxResult address_space_read(AddressSpace *as, hwaddr addr, + MemTxAttrs attrs, uint8_t *buf, + int len) +{ + return flatview_read(address_space_to_flatview(as), addr, attrs, buf, len); +} + /** * address_space_read_cached: read from a cached RAM region * diff --git a/memory.c b/memory.c index 962e9b961f..bf3728ae2f 100644 --- a/memory.c +++ b/memory.c @@ -154,7 +154,8 @@ enum ListenerDirection { Forward, Reverse }; /* No need to ref/unref .mr, the FlatRange keeps it alive. */ #define MEMORY_LISTENER_UPDATE_REGION(fr, as, dir, callback, _args...) \ do { \ - MemoryRegionSection mrs = section_from_flat_range(fr, as); \ + MemoryRegionSection mrs = section_from_flat_range(fr, \ + address_space_to_flatview(as)); \ MEMORY_LISTENER_CALL(as, callback, dir, &mrs, ##_args); \ } while(0) @@ -208,7 +209,6 @@ static bool memory_region_ioeventfd_equal(MemoryRegionIoeventfd a, } typedef struct FlatRange FlatRange; -typedef struct FlatView FlatView; /* Range of memory in the global map. Addresses are absolute. */ struct FlatRange { @@ -238,11 +238,11 @@ typedef struct AddressSpaceOps AddressSpaceOps; for (var = (view)->ranges; var < (view)->ranges + (view)->nr; ++var) static inline MemoryRegionSection -section_from_flat_range(FlatRange *fr, AddressSpace *as) +section_from_flat_range(FlatRange *fr, FlatView *fv) { return (MemoryRegionSection) { .mr = fr->mr, - .address_space = as, + .fv = fv, .offset_within_region = fr->offset_in_region, .size = fr->addr.size, .offset_within_address_space = int128_get64(fr->addr.start), @@ -312,7 +312,7 @@ static void flatview_unref(FlatView *view) } } -static FlatView *address_space_to_flatview(AddressSpace *as) +FlatView *address_space_to_flatview(AddressSpace *as) { return atomic_rcu_read(&as->current_map); } @@ -761,7 +761,7 @@ static void address_space_add_del_ioeventfds(AddressSpace *as, fds_new[inew]))) { fd = &fds_old[iold]; section = (MemoryRegionSection) { - .address_space = as, + .fv = address_space_to_flatview(as), .offset_within_address_space = int128_get64(fd->addr.start), .size = fd->addr.size, }; @@ -774,7 +774,7 @@ static void address_space_add_del_ioeventfds(AddressSpace *as, fds_old[iold]))) { fd = &fds_new[inew]; section = (MemoryRegionSection) { - .address_space = as, + .fv = address_space_to_flatview(as), .offset_within_address_space = int128_get64(fd->addr.start), .size = fd->addr.size, }; @@ -794,7 +794,7 @@ static FlatView *address_space_get_flatview(AddressSpace *as) rcu_read_lock(); do { - view = atomic_rcu_read(&as->current_map); + view = address_space_to_flatview(as); /* If somebody has replaced as->current_map concurrently, * flatview_ref returns false. */ @@ -913,8 +913,8 @@ static void address_space_update_topology(AddressSpace *as) new_view->dispatch = mem_begin(as); for (i = 0; i < new_view->nr; i++) { MemoryRegionSection mrs = - section_from_flat_range(&new_view->ranges[i], as); - mem_add(as, new_view, &mrs); + section_from_flat_range(&new_view->ranges[i], new_view); + mem_add(new_view, &mrs); } mem_commit(new_view->dispatch); @@ -1870,7 +1870,7 @@ void memory_region_sync_dirty_bitmap(MemoryRegion *mr) view = address_space_get_flatview(as); FOR_EACH_FLAT_RANGE(fr, view) { if (fr->mr == mr) { - MemoryRegionSection mrs = section_from_flat_range(fr, as); + MemoryRegionSection mrs = section_from_flat_range(fr, view); listener->log_sync(listener, &mrs); } } @@ -1973,7 +1973,7 @@ static void memory_region_update_coalesced_range_as(MemoryRegion *mr, AddressSpa FOR_EACH_FLAT_RANGE(fr, view) { if (fr->mr == mr) { section = (MemoryRegionSection) { - .address_space = as, + .fv = view, .offset_within_address_space = int128_get64(fr->addr.start), .size = fr->addr.size, }; @@ -2324,7 +2324,7 @@ static MemoryRegionSection memory_region_find_rcu(MemoryRegion *mr, } range = addrrange_make(int128_make64(addr), int128_make64(size)); - view = atomic_rcu_read(&as->current_map); + view = address_space_to_flatview(as); fr = flatview_lookup(view, range); if (!fr) { return ret; @@ -2335,7 +2335,7 @@ static MemoryRegionSection memory_region_find_rcu(MemoryRegion *mr, } ret.mr = fr->mr; - ret.address_space = as; + ret.fv = view; range = addrrange_intersection(range, fr->addr); ret.offset_within_region = fr->offset_in_region; ret.offset_within_region += int128_get64(int128_sub(range.start, @@ -2384,7 +2384,8 @@ void memory_global_dirty_log_sync(void) view = address_space_get_flatview(as); FOR_EACH_FLAT_RANGE(fr, view) { if (fr->dirty_log_mask) { - MemoryRegionSection mrs = section_from_flat_range(fr, as); + MemoryRegionSection mrs = section_from_flat_range(fr, view); + listener->log_sync(listener, &mrs); } } @@ -2469,7 +2470,7 @@ static void listener_add_address_space(MemoryListener *listener, FOR_EACH_FLAT_RANGE(fr, view) { MemoryRegionSection section = { .mr = fr->mr, - .address_space = as, + .fv = view, .offset_within_region = fr->offset_in_region, .size = fr->addr.size, .offset_within_address_space = int128_get64(fr->addr.start), -- cgit v1.2.3-70-g09d2 From 9950322a593ff900a860fb52938159461798a831 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 21 Sep 2017 18:50:59 +1000 Subject: memory: Cleanup after switching to FlatView We store AddressSpaceDispatch* in FlatView anyway so there is no need to carry it from mem_add() to register_subpage/register_multipage. This should cause no behavioural change. Signed-off-by: Alexey Kardashevskiy Message-Id: <20170921085110.25598-8-aik@ozlabs.ru> Signed-off-by: Paolo Bonzini --- exec.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/exec.c b/exec.c index d2b9f60494..548ec71b4c 100644 --- a/exec.c +++ b/exec.c @@ -1302,9 +1302,9 @@ static void phys_sections_free(PhysPageMap *map) g_free(map->nodes); } -static void register_subpage(FlatView *fv, AddressSpaceDispatch *d, - MemoryRegionSection *section) +static void register_subpage(FlatView *fv, MemoryRegionSection *section) { + AddressSpaceDispatch *d = flatview_to_dispatch(fv); subpage_t *subpage; hwaddr base = section->offset_within_address_space & TARGET_PAGE_MASK; @@ -1333,9 +1333,10 @@ static void register_subpage(FlatView *fv, AddressSpaceDispatch *d, } -static void register_multipage(AddressSpaceDispatch *d, +static void register_multipage(FlatView *fv, MemoryRegionSection *section) { + AddressSpaceDispatch *d = flatview_to_dispatch(fv); hwaddr start_addr = section->offset_within_address_space; uint16_t section_index = phys_section_add(&d->map, section); uint64_t num_pages = int128_get64(int128_rshift(section->size, @@ -1347,7 +1348,6 @@ static void register_multipage(AddressSpaceDispatch *d, void mem_add(FlatView *fv, MemoryRegionSection *section) { - AddressSpaceDispatch *d = flatview_to_dispatch(fv); MemoryRegionSection now = *section, remain = *section; Int128 page_size = int128_make64(TARGET_PAGE_SIZE); @@ -1356,7 +1356,7 @@ void mem_add(FlatView *fv, MemoryRegionSection *section) - now.offset_within_address_space; now.size = int128_min(int128_make64(left), now.size); - register_subpage(fv, d, &now); + register_subpage(fv, &now); } else { now.size = int128_zero(); } @@ -1366,13 +1366,13 @@ void mem_add(FlatView *fv, MemoryRegionSection *section) remain.offset_within_region += int128_get64(now.size); now = remain; if (int128_lt(remain.size, page_size)) { - register_subpage(fv, d, &now); + register_subpage(fv, &now); } else if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) { now.size = page_size; - register_subpage(fv, d, &now); + register_subpage(fv, &now); } else { now.size = int128_and(now.size, int128_neg(page_size)); - register_multipage(d, &now); + register_multipage(fv, &now); } } } -- cgit v1.2.3-70-g09d2 From 8629d3fcb77e9775e44d9051bad0fb5187925eae Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 21 Sep 2017 18:51:00 +1000 Subject: memory: Rename mem_begin/mem_commit/mem_add helpers This renames some helpers to reflect better what they do. This should cause no behavioural change. Signed-off-by: Alexey Kardashevskiy Message-Id: <20170921085110.25598-9-aik@ozlabs.ru> Signed-off-by: Paolo Bonzini --- exec.c | 12 +++--------- include/exec/memory-internal.h | 6 +++--- memory.c | 6 +++--- 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/exec.c b/exec.c index 548ec71b4c..b085f82503 100644 --- a/exec.c +++ b/exec.c @@ -358,7 +358,7 @@ static void phys_page_compact(PhysPageEntry *lp, Node *nodes) } } -static void phys_page_compact_all(AddressSpaceDispatch *d, int nodes_nb) +void address_space_dispatch_compact(AddressSpaceDispatch *d) { if (d->phys_map.skip) { phys_page_compact(&d->phys_map, d->map.nodes); @@ -1346,7 +1346,7 @@ static void register_multipage(FlatView *fv, phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index); } -void mem_add(FlatView *fv, MemoryRegionSection *section) +void flatview_add_to_dispatch(FlatView *fv, MemoryRegionSection *section) { MemoryRegionSection now = *section, remain = *section; Int128 page_size = int128_make64(TARGET_PAGE_SIZE); @@ -2672,9 +2672,8 @@ static void io_mem_init(void) NULL, UINT64_MAX); } -AddressSpaceDispatch *mem_begin(AddressSpace *as) +AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv) { - FlatView *fv = address_space_to_flatview(as); AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1); uint16_t n; @@ -2698,11 +2697,6 @@ void address_space_dispatch_free(AddressSpaceDispatch *d) g_free(d); } -void mem_commit(AddressSpaceDispatch *d) -{ - phys_page_compact_all(d, d->map.nodes_nb); -} - static void tcg_commit(MemoryListener *listener) { CPUAddressSpace *cpuas; diff --git a/include/exec/memory-internal.h b/include/exec/memory-internal.h index 1cf8ad9869..d4a35c6e96 100644 --- a/include/exec/memory-internal.h +++ b/include/exec/memory-internal.h @@ -27,9 +27,9 @@ extern const MemoryRegionOps unassigned_mem_ops; bool memory_region_access_valid(MemoryRegion *mr, hwaddr addr, unsigned size, bool is_write); -void mem_add(FlatView *fv, MemoryRegionSection *section); -AddressSpaceDispatch *mem_begin(AddressSpace *as); -void mem_commit(AddressSpaceDispatch *d); +void flatview_add_to_dispatch(FlatView *fv, MemoryRegionSection *section); +AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv); +void address_space_dispatch_compact(AddressSpaceDispatch *d); AddressSpaceDispatch *address_space_to_dispatch(AddressSpace *as); AddressSpaceDispatch *flatview_to_dispatch(FlatView *fv); diff --git a/memory.c b/memory.c index bf3728ae2f..c374317ba2 100644 --- a/memory.c +++ b/memory.c @@ -910,13 +910,13 @@ static void address_space_update_topology(AddressSpace *as) FlatView *new_view = generate_memory_topology(as->root); int i; - new_view->dispatch = mem_begin(as); + new_view->dispatch = address_space_dispatch_new(new_view); for (i = 0; i < new_view->nr; i++) { MemoryRegionSection mrs = section_from_flat_range(&new_view->ranges[i], new_view); - mem_add(new_view, &mrs); + flatview_add_to_dispatch(new_view, &mrs); } - mem_commit(new_view->dispatch); + address_space_dispatch_compact(new_view->dispatch); if (!QTAILQ_EMPTY(&as->listeners)) { address_space_update_topology_pass(as, old_view, new_view, false); -- cgit v1.2.3-70-g09d2 From 89c177bbdd6cf8e50b3fd4831697d50e195d6432 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 21 Sep 2017 18:51:01 +1000 Subject: memory: Store physical root MR in FlatView Address spaces get to keep a root MR (alias or not) but FlatView stores the actual MR as this is going to be used later on to decide whether to share a particular FlatView or not. Signed-off-by: Alexey Kardashevskiy Message-Id: <20170921085110.25598-10-aik@ozlabs.ru> Signed-off-by: Paolo Bonzini --- memory.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/memory.c b/memory.c index c374317ba2..15724db7d6 100644 --- a/memory.c +++ b/memory.c @@ -230,6 +230,7 @@ struct FlatView { unsigned nr; unsigned nr_allocated; struct AddressSpaceDispatch *dispatch; + MemoryRegion *root; }; typedef struct AddressSpaceOps AddressSpaceOps; @@ -259,12 +260,14 @@ static bool flatrange_equal(FlatRange *a, FlatRange *b) && a->readonly == b->readonly; } -static FlatView *flatview_new(void) +static FlatView *flatview_new(MemoryRegion *mr_root) { FlatView *view; view = g_new0(FlatView, 1); view->ref = 1; + view->root = mr_root; + memory_region_ref(mr_root); return view; } @@ -297,6 +300,7 @@ static void flatview_destroy(FlatView *view) memory_region_unref(view->ranges[i].mr); } g_free(view->ranges); + memory_region_unref(view->root); g_free(view); } @@ -723,12 +727,25 @@ static void render_memory_region(FlatView *view, } } +static MemoryRegion *memory_region_get_flatview_root(MemoryRegion *mr) +{ + while (mr->alias && !mr->alias_offset && + int128_ge(mr->size, mr->alias->size)) { + /* The alias is included in its entirety. Use it as + * the "real" root, so that we can share more FlatViews. + */ + mr = mr->alias; + } + + return mr; +} + /* Render a memory topology into a list of disjoint absolute ranges. */ static FlatView *generate_memory_topology(MemoryRegion *mr) { FlatView *view; - view = flatview_new(); + view = flatview_new(mr); if (mr) { render_memory_region(view, mr, int128_zero(), @@ -907,7 +924,8 @@ static void address_space_update_topology_pass(AddressSpace *as, static void address_space_update_topology(AddressSpace *as) { FlatView *old_view = address_space_get_flatview(as); - FlatView *new_view = generate_memory_topology(as->root); + MemoryRegion *physmr = memory_region_get_flatview_root(old_view->root); + FlatView *new_view = generate_memory_topology(physmr); int i; new_view->dispatch = address_space_dispatch_new(new_view); @@ -2650,7 +2668,7 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name) as->ref_count = 1; as->root = root; as->malloced = false; - as->current_map = flatview_new(); + as->current_map = flatview_new(root); as->ioeventfd_nb = 0; as->ioeventfds = NULL; QTAILQ_INIT(&as->listeners); -- cgit v1.2.3-70-g09d2 From 9bf561e36cf8fed9565011a19ba9ea0100e1811e Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 21 Sep 2017 18:51:02 +1000 Subject: memory: Alloc dispatch tree where topology is generared This is to make next patches simpler. Signed-off-by: Alexey Kardashevskiy Message-Id: <20170921085110.25598-11-aik@ozlabs.ru> Signed-off-by: Paolo Bonzini --- memory.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/memory.c b/memory.c index 15724db7d6..6f6c2332ca 100644 --- a/memory.c +++ b/memory.c @@ -743,6 +743,7 @@ static MemoryRegion *memory_region_get_flatview_root(MemoryRegion *mr) /* Render a memory topology into a list of disjoint absolute ranges. */ static FlatView *generate_memory_topology(MemoryRegion *mr) { + int i; FlatView *view; view = flatview_new(mr); @@ -753,6 +754,14 @@ static FlatView *generate_memory_topology(MemoryRegion *mr) } flatview_simplify(view); + view->dispatch = address_space_dispatch_new(view); + for (i = 0; i < view->nr; i++) { + MemoryRegionSection mrs = + section_from_flat_range(&view->ranges[i], view); + flatview_add_to_dispatch(view, &mrs); + } + address_space_dispatch_compact(view->dispatch); + return view; } @@ -926,15 +935,6 @@ static void address_space_update_topology(AddressSpace *as) FlatView *old_view = address_space_get_flatview(as); MemoryRegion *physmr = memory_region_get_flatview_root(old_view->root); FlatView *new_view = generate_memory_topology(physmr); - int i; - - new_view->dispatch = address_space_dispatch_new(new_view); - for (i = 0; i < new_view->nr; i++) { - MemoryRegionSection mrs = - section_from_flat_range(&new_view->ranges[i], new_view); - flatview_add_to_dispatch(new_view, &mrs); - } - address_space_dispatch_compact(new_view->dispatch); if (!QTAILQ_EMPTY(&as->listeners)) { address_space_update_topology_pass(as, old_view, new_view, false); -- cgit v1.2.3-70-g09d2 From 02218487649558ed66c3689d4cc55250a42601d8 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 21 Sep 2017 18:51:03 +1000 Subject: memory: Move address_space_update_ioeventfds So it is called (twice) from the same function. This is to make the next patches a bit simpler. Signed-off-by: Alexey Kardashevskiy Message-Id: <20170921085110.25598-12-aik@ozlabs.ru> Signed-off-by: Paolo Bonzini --- memory.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/memory.c b/memory.c index 6f6c2332ca..bb582d052a 100644 --- a/memory.c +++ b/memory.c @@ -952,8 +952,6 @@ static void address_space_update_topology(AddressSpace *as) * counting is necessary. */ flatview_unref(old_view); - - address_space_update_ioeventfds(as); } void memory_region_transaction_begin(void) @@ -976,6 +974,7 @@ void memory_region_transaction_commit(void) QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { address_space_update_topology(as); + address_space_update_ioeventfds(as); } memory_region_update_pending = false; MEMORY_LISTENER_CALL_GLOBAL(commit, Forward); -- cgit v1.2.3-70-g09d2 From 967dc9b1194a9281124b2e1ce67b6c3359a2138f Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 21 Sep 2017 18:51:04 +1000 Subject: memory: Share FlatView's and dispatch trees between address spaces This allows sharing flat views between address spaces (AS) when the same root memory region is used when creating a new address space. This is done by walking through all ASes and caching one FlatView per a physical root MR (i.e. not aliased). This removes search for duplicates from address_space_init_shareable() as FlatViews are shared elsewhere and keeping as::ref_count correct seems an unnecessary and useless complication. This should cause no change and memory use or boot time yet. Signed-off-by: Alexey Kardashevskiy Message-Id: <20170921085110.25598-13-aik@ozlabs.ru> Signed-off-by: Paolo Bonzini --- memory.c | 56 +++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 45 insertions(+), 11 deletions(-) diff --git a/memory.c b/memory.c index bb582d052a..8ee3c81862 100644 --- a/memory.c +++ b/memory.c @@ -47,6 +47,8 @@ static QTAILQ_HEAD(memory_listeners, MemoryListener) memory_listeners static QTAILQ_HEAD(, AddressSpace) address_spaces = QTAILQ_HEAD_INITIALIZER(address_spaces); +static GHashTable *flat_views; + typedef struct AddrRange AddrRange; /* @@ -761,6 +763,7 @@ static FlatView *generate_memory_topology(MemoryRegion *mr) flatview_add_to_dispatch(view, &mrs); } address_space_dispatch_compact(view->dispatch); + g_hash_table_replace(flat_views, mr, view); return view; } @@ -930,11 +933,47 @@ static void address_space_update_topology_pass(AddressSpace *as, } } -static void address_space_update_topology(AddressSpace *as) +static void flatviews_init(void) +{ + if (flat_views) { + return; + } + + flat_views = g_hash_table_new_full(g_direct_hash, g_direct_equal, NULL, + (GDestroyNotify) flatview_unref); +} + +static void flatviews_reset(void) +{ + AddressSpace *as; + + if (flat_views) { + g_hash_table_unref(flat_views); + flat_views = NULL; + } + flatviews_init(); + + /* Render unique FVs */ + QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { + MemoryRegion *physmr = memory_region_get_flatview_root(as->root); + + if (g_hash_table_lookup(flat_views, physmr)) { + continue; + } + + generate_memory_topology(physmr); + } +} + +static void address_space_set_flatview(AddressSpace *as) { FlatView *old_view = address_space_get_flatview(as); - MemoryRegion *physmr = memory_region_get_flatview_root(old_view->root); - FlatView *new_view = generate_memory_topology(physmr); + MemoryRegion *physmr = memory_region_get_flatview_root(as->root); + FlatView *new_view = g_hash_table_lookup(flat_views, physmr); + + assert(new_view); + + flatview_ref(new_view); if (!QTAILQ_EMPTY(&as->listeners)) { address_space_update_topology_pass(as, old_view, new_view, false); @@ -970,10 +1009,12 @@ void memory_region_transaction_commit(void) --memory_region_transaction_depth; if (!memory_region_transaction_depth) { if (memory_region_update_pending) { + flatviews_reset(); + MEMORY_LISTENER_CALL_GLOBAL(begin, Forward); QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { - address_space_update_topology(as); + address_space_set_flatview(as); address_space_update_ioeventfds(as); } memory_region_update_pending = false; @@ -2696,13 +2737,6 @@ AddressSpace *address_space_init_shareable(MemoryRegion *root, const char *name) { AddressSpace *as; - QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { - if (root == as->root && as->malloced) { - as->ref_count++; - return as; - } - } - as = g_malloc0(sizeof *as); address_space_init(as, root, name); as->malloced = true; -- cgit v1.2.3-70-g09d2 From 67ace39b253ed5ae465275bc870f7e495547658b Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 21 Sep 2017 18:51:05 +1000 Subject: memory: Do not allocate FlatView in address_space_init This creates a new AS object without any FlatView as memory_region_transaction_commit() may want to reuse the empty FV. Signed-off-by: Alexey Kardashevskiy Message-Id: <20170921085110.25598-14-aik@ozlabs.ru> Signed-off-by: Paolo Bonzini --- memory.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/memory.c b/memory.c index 8ee3c81862..3f6dd40303 100644 --- a/memory.c +++ b/memory.c @@ -967,22 +967,37 @@ static void flatviews_reset(void) static void address_space_set_flatview(AddressSpace *as) { - FlatView *old_view = address_space_get_flatview(as); + FlatView *old_view = address_space_to_flatview(as); MemoryRegion *physmr = memory_region_get_flatview_root(as->root); FlatView *new_view = g_hash_table_lookup(flat_views, physmr); assert(new_view); + if (old_view == new_view) { + return; + } + + if (old_view) { + flatview_ref(old_view); + } + flatview_ref(new_view); if (!QTAILQ_EMPTY(&as->listeners)) { - address_space_update_topology_pass(as, old_view, new_view, false); - address_space_update_topology_pass(as, old_view, new_view, true); + FlatView tmpview = { .nr = 0 }, *old_view2 = old_view; + + if (!old_view2) { + old_view2 = &tmpview; + } + address_space_update_topology_pass(as, old_view2, new_view, false); + address_space_update_topology_pass(as, old_view2, new_view, true); } /* Writes are protected by the BQL. */ atomic_rcu_set(&as->current_map, new_view); - flatview_unref(old_view); + if (old_view) { + flatview_unref(old_view); + } /* Note that all the old MemoryRegions are still alive up to this * point. This relieves most MemoryListeners from the need to @@ -990,7 +1005,9 @@ static void address_space_set_flatview(AddressSpace *as) * outside the iothread mutex, in which case precise reference * counting is necessary. */ - flatview_unref(old_view); + if (old_view) { + flatview_unref(old_view); + } } void memory_region_transaction_begin(void) @@ -2708,7 +2725,7 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name) as->ref_count = 1; as->root = root; as->malloced = false; - as->current_map = flatview_new(root); + as->current_map = NULL; as->ioeventfd_nb = 0; as->ioeventfds = NULL; QTAILQ_INIT(&as->listeners); -- cgit v1.2.3-70-g09d2 From 5e8fd947e2670c3c18f139de6a83fafcb56abbcc Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 21 Sep 2017 18:51:06 +1000 Subject: memory: Rework "info mtree" to print flat views and dispatch trees This adds a new "-d" switch to "info mtree" to print dispatch tree internals. This changes the way "-f" is handled - it prints now flat views and associated address spaces. Signed-off-by: Alexey Kardashevskiy Message-Id: <20170921085110.25598-15-aik@ozlabs.ru> Signed-off-by: Paolo Bonzini --- exec.c | 84 +++++++++++++++++++++++++++++++++++++++ hmp-commands-info.hx | 7 ++-- include/exec/memory-internal.h | 4 ++ include/exec/memory.h | 3 +- memory.c | 90 +++++++++++++++++++++++++++++++++++++----- monitor.c | 3 +- 6 files changed, 176 insertions(+), 15 deletions(-) diff --git a/exec.c b/exec.c index b085f82503..7a80460725 100644 --- a/exec.c +++ b/exec.c @@ -3616,3 +3616,87 @@ void page_size_init(void) } qemu_host_page_mask = -(intptr_t)qemu_host_page_size; } + +#if !defined(CONFIG_USER_ONLY) + +static void mtree_print_phys_entries(fprintf_function mon, void *f, + int start, int end, int skip, int ptr) +{ + if (start == end - 1) { + mon(f, "\t%3d ", start); + } else { + mon(f, "\t%3d..%-3d ", start, end - 1); + } + mon(f, " skip=%d ", skip); + if (ptr == PHYS_MAP_NODE_NIL) { + mon(f, " ptr=NIL"); + } else if (!skip) { + mon(f, " ptr=#%d", ptr); + } else { + mon(f, " ptr=[%d]", ptr); + } + mon(f, "\n"); +} + +#define MR_SIZE(size) (int128_nz(size) ? (hwaddr)int128_get64( \ + int128_sub((size), int128_one())) : 0) + +void mtree_print_dispatch(fprintf_function mon, void *f, + AddressSpaceDispatch *d, MemoryRegion *root) +{ + int i; + + mon(f, " Dispatch\n"); + mon(f, " Physical sections\n"); + + for (i = 0; i < d->map.sections_nb; ++i) { + MemoryRegionSection *s = d->map.sections + i; + const char *names[] = { " [unassigned]", " [not dirty]", + " [ROM]", " [watch]" }; + + mon(f, " #%d @" TARGET_FMT_plx ".." TARGET_FMT_plx " %s%s%s%s%s", + i, + s->offset_within_address_space, + s->offset_within_address_space + MR_SIZE(s->mr->size), + s->mr->name ? s->mr->name : "(noname)", + i < ARRAY_SIZE(names) ? names[i] : "", + s->mr == root ? " [ROOT]" : "", + s == d->mru_section ? " [MRU]" : "", + s->mr->is_iommu ? " [iommu]" : ""); + + if (s->mr->alias) { + mon(f, " alias=%s", s->mr->alias->name ? + s->mr->alias->name : "noname"); + } + mon(f, "\n"); + } + + mon(f, " Nodes (%d bits per level, %d levels) ptr=[%d] skip=%d\n", + P_L2_BITS, P_L2_LEVELS, d->phys_map.ptr, d->phys_map.skip); + for (i = 0; i < d->map.nodes_nb; ++i) { + int j, jprev; + PhysPageEntry prev; + Node *n = d->map.nodes + i; + + mon(f, " [%d]\n", i); + + for (j = 0, jprev = 0, prev = *n[0]; j < ARRAY_SIZE(*n); ++j) { + PhysPageEntry *pe = *n + j; + + if (pe->ptr == prev.ptr && pe->skip == prev.skip) { + continue; + } + + mtree_print_phys_entries(mon, f, jprev, j, prev.skip, prev.ptr); + + jprev = j; + prev = *pe; + } + + if (jprev != ARRAY_SIZE(*n)) { + mtree_print_phys_entries(mon, f, jprev, j, prev.skip, prev.ptr); + } + } +} + +#endif diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx index 1c6772597d..4f1ece93e5 100644 --- a/hmp-commands-info.hx +++ b/hmp-commands-info.hx @@ -250,9 +250,10 @@ ETEXI { .name = "mtree", - .args_type = "flatview:-f", - .params = "[-f]", - .help = "show memory tree (-f: dump flat view for address spaces)", + .args_type = "flatview:-f,dispatch_tree:-d", + .params = "[-f][-d]", + .help = "show memory tree (-f: dump flat view for address spaces;" + "-d: dump dispatch tree, valid with -f only)", .cmd = hmp_info_mtree, }, diff --git a/include/exec/memory-internal.h b/include/exec/memory-internal.h index d4a35c6e96..647e9bd5c4 100644 --- a/include/exec/memory-internal.h +++ b/include/exec/memory-internal.h @@ -35,5 +35,9 @@ AddressSpaceDispatch *address_space_to_dispatch(AddressSpace *as); AddressSpaceDispatch *flatview_to_dispatch(FlatView *fv); void address_space_dispatch_free(AddressSpaceDispatch *d); +void mtree_print_dispatch(fprintf_function mon, void *f, + struct AddressSpaceDispatch *d, + MemoryRegion *root); + #endif #endif diff --git a/include/exec/memory.h b/include/exec/memory.h index 7816e5d655..2f4f56cf40 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -1515,7 +1515,8 @@ void memory_global_dirty_log_start(void); */ void memory_global_dirty_log_stop(void); -void mtree_info(fprintf_function mon_printf, void *f, bool flatview); +void mtree_info(fprintf_function mon_printf, void *f, bool flatview, + bool dispatch_tree); /** * memory_region_request_mmio_ptr: request a pointer to an mmio diff --git a/memory.c b/memory.c index 3f6dd40303..6729fb3ab3 100644 --- a/memory.c +++ b/memory.c @@ -2907,18 +2907,44 @@ static void mtree_print_mr(fprintf_function mon_printf, void *f, } } -static void mtree_print_flatview(fprintf_function p, void *f, - AddressSpace *as) +struct FlatViewInfo { + fprintf_function mon_printf; + void *f; + int counter; + bool dispatch_tree; +}; + +static void mtree_print_flatview(gpointer key, gpointer value, + gpointer user_data) { - FlatView *view = address_space_get_flatview(as); + FlatView *view = key; + GArray *fv_address_spaces = value; + struct FlatViewInfo *fvi = user_data; + fprintf_function p = fvi->mon_printf; + void *f = fvi->f; FlatRange *range = &view->ranges[0]; MemoryRegion *mr; int n = view->nr; + int i; + AddressSpace *as; + + p(f, "FlatView #%d\n", fvi->counter); + ++fvi->counter; + + for (i = 0; i < fv_address_spaces->len; ++i) { + as = g_array_index(fv_address_spaces, AddressSpace*, i); + p(f, " AS \"%s\", root: %s", as->name, memory_region_name(as->root)); + if (as->root->alias) { + p(f, ", alias %s", memory_region_name(as->root->alias)); + } + p(f, "\n"); + } + + p(f, " Root memory region: %s\n", + view->root ? memory_region_name(view->root) : "(none)"); if (n <= 0) { - p(f, MTREE_INDENT "No rendered FlatView for " - "address space '%s'\n", as->name); - flatview_unref(view); + p(f, MTREE_INDENT "No rendered FlatView\n\n"); return; } @@ -2945,21 +2971,65 @@ static void mtree_print_flatview(fprintf_function p, void *f, range++; } +#if !defined(CONFIG_USER_ONLY) + if (fvi->dispatch_tree && view->root) { + mtree_print_dispatch(p, f, view->dispatch, view->root); + } +#endif + + p(f, "\n"); +} + +static gboolean mtree_info_flatview_free(gpointer key, gpointer value, + gpointer user_data) +{ + FlatView *view = key; + GArray *fv_address_spaces = value; + + g_array_unref(fv_address_spaces); flatview_unref(view); + + return true; } -void mtree_info(fprintf_function mon_printf, void *f, bool flatview) +void mtree_info(fprintf_function mon_printf, void *f, bool flatview, + bool dispatch_tree) { MemoryRegionListHead ml_head; MemoryRegionList *ml, *ml2; AddressSpace *as; if (flatview) { + FlatView *view; + struct FlatViewInfo fvi = { + .mon_printf = mon_printf, + .f = f, + .counter = 0, + .dispatch_tree = dispatch_tree + }; + GArray *fv_address_spaces; + GHashTable *views = g_hash_table_new(g_direct_hash, g_direct_equal); + + /* Gather all FVs in one table */ QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { - mon_printf(f, "address-space (flat view): %s\n", as->name); - mtree_print_flatview(mon_printf, f, as); - mon_printf(f, "\n"); + view = address_space_get_flatview(as); + + fv_address_spaces = g_hash_table_lookup(views, view); + if (!fv_address_spaces) { + fv_address_spaces = g_array_new(false, false, sizeof(as)); + g_hash_table_insert(views, view, fv_address_spaces); + } + + g_array_append_val(fv_address_spaces, as); } + + /* Print */ + g_hash_table_foreach(views, mtree_print_flatview, &fvi); + + /* Free */ + g_hash_table_foreach_remove(views, mtree_info_flatview_free, 0); + g_hash_table_unref(views); + return; } diff --git a/monitor.c b/monitor.c index 058045b3cb..f4856b9268 100644 --- a/monitor.c +++ b/monitor.c @@ -1703,8 +1703,9 @@ static void hmp_boot_set(Monitor *mon, const QDict *qdict) static void hmp_info_mtree(Monitor *mon, const QDict *qdict) { bool flatview = qdict_get_try_bool(qdict, "flatview", false); + bool dispatch_tree = qdict_get_try_bool(qdict, "dispatch_tree", false); - mtree_info((fprintf_function)monitor_printf, mon, flatview); + mtree_info((fprintf_function)monitor_printf, mon, flatview, dispatch_tree); } static void hmp_info_numa(Monitor *mon, const QDict *qdict) -- cgit v1.2.3-70-g09d2 From b516572f31c0ea0937cd9d11d9bd72dd83809886 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 21 Sep 2017 18:51:08 +1000 Subject: memory: Get rid of address_space_init_shareable Since FlatViews are shared now and ASes not, this gets rid of address_space_init_shareable(). This should cause no behavioural change. Signed-off-by: Alexey Kardashevskiy Message-Id: <20170921085110.25598-17-aik@ozlabs.ru> Signed-off-by: Paolo Bonzini --- cpus.c | 5 +++-- hw/arm/armv7m.c | 9 ++++----- include/exec/memory.h | 19 ------------------- include/hw/arm/armv7m.h | 2 +- memory.c | 21 --------------------- target/arm/cpu.c | 16 ++++++++-------- target/i386/cpu.c | 5 +++-- 7 files changed, 19 insertions(+), 58 deletions(-) diff --git a/cpus.c b/cpus.c index 9bed61eefc..c9a624003a 100644 --- a/cpus.c +++ b/cpus.c @@ -1764,8 +1764,9 @@ void qemu_init_vcpu(CPUState *cpu) /* If the target cpu hasn't set up any address spaces itself, * give it the default one. */ - AddressSpace *as = address_space_init_shareable(cpu->memory, - "cpu-memory"); + AddressSpace *as = g_new0(AddressSpace, 1); + + address_space_init(as, cpu->memory, "cpu-memory"); cpu->num_ases = 1; cpu_address_space_init(cpu, as, 0); } diff --git a/hw/arm/armv7m.c b/hw/arm/armv7m.c index 57a680687a..bb2dfc942b 100644 --- a/hw/arm/armv7m.c +++ b/hw/arm/armv7m.c @@ -41,7 +41,7 @@ static MemTxResult bitband_read(void *opaque, hwaddr offset, /* Find address in underlying memory and round down to multiple of size */ addr = bitband_addr(s, offset) & (-size); - res = address_space_read(s->source_as, addr, attrs, buf, size); + res = address_space_read(&s->source_as, addr, attrs, buf, size); if (res) { return res; } @@ -66,7 +66,7 @@ static MemTxResult bitband_write(void *opaque, hwaddr offset, uint64_t value, /* Find address in underlying memory and round down to multiple of size */ addr = bitband_addr(s, offset) & (-size); - res = address_space_read(s->source_as, addr, attrs, buf, size); + res = address_space_read(&s->source_as, addr, attrs, buf, size); if (res) { return res; } @@ -79,7 +79,7 @@ static MemTxResult bitband_write(void *opaque, hwaddr offset, uint64_t value, } else { buf[bitpos >> 3] &= ~bit; } - return address_space_write(s->source_as, addr, attrs, buf, size); + return address_space_write(&s->source_as, addr, attrs, buf, size); } static const MemoryRegionOps bitband_ops = { @@ -111,8 +111,7 @@ static void bitband_realize(DeviceState *dev, Error **errp) return; } - s->source_as = address_space_init_shareable(s->source_memory, - "bitband-source"); + address_space_init(&s->source_as, s->source_memory, "bitband-source"); } /* Board init. */ diff --git a/include/exec/memory.h b/include/exec/memory.h index 2f4f56cf40..402824c6f2 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -309,8 +309,6 @@ struct AddressSpace { struct rcu_head rcu; char *name; MemoryRegion *root; - int ref_count; - bool malloced; /* Accessed via RCU. */ struct FlatView *current_map; @@ -1585,23 +1583,6 @@ MemTxResult memory_region_dispatch_write(MemoryRegion *mr, */ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name); -/** - * address_space_init_shareable: return an address space for a memory region, - * creating it if it does not already exist - * - * @root: a #MemoryRegion that routes addresses for the address space - * @name: an address space name. The name is only used for debugging - * output. - * - * This function will return a pointer to an existing AddressSpace - * which was initialized with the specified MemoryRegion, or it will - * create and initialize one if it does not already exist. The ASes - * are reference-counted, so the memory will be freed automatically - * when the AddressSpace is destroyed via address_space_destroy. - */ -AddressSpace *address_space_init_shareable(MemoryRegion *root, - const char *name); - /** * address_space_destroy: destroy an address space * diff --git a/include/hw/arm/armv7m.h b/include/hw/arm/armv7m.h index 9ad316c76e..35ab757264 100644 --- a/include/hw/arm/armv7m.h +++ b/include/hw/arm/armv7m.h @@ -21,7 +21,7 @@ typedef struct { SysBusDevice parent_obj; /*< public >*/ - AddressSpace *source_as; + AddressSpace source_as; MemoryRegion iomem; uint32_t base; MemoryRegion *source_memory; diff --git a/memory.c b/memory.c index 6729fb3ab3..aa7355bafb 100644 --- a/memory.c +++ b/memory.c @@ -2722,9 +2722,7 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name) { memory_region_ref(root); memory_region_transaction_begin(); - as->ref_count = 1; as->root = root; - as->malloced = false; as->current_map = NULL; as->ioeventfd_nb = 0; as->ioeventfds = NULL; @@ -2737,37 +2735,18 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name) static void do_address_space_destroy(AddressSpace *as) { - bool do_free = as->malloced; - assert(QTAILQ_EMPTY(&as->listeners)); flatview_unref(as->current_map); g_free(as->name); g_free(as->ioeventfds); memory_region_unref(as->root); - if (do_free) { - g_free(as); - } -} - -AddressSpace *address_space_init_shareable(MemoryRegion *root, const char *name) -{ - AddressSpace *as; - - as = g_malloc0(sizeof *as); - address_space_init(as, root, name); - as->malloced = true; - return as; } void address_space_destroy(AddressSpace *as) { MemoryRegion *root = as->root; - as->ref_count--; - if (as->ref_count) { - return; - } /* Flush out anything from MemoryListeners listening in on this */ memory_region_transaction_begin(); as->root = NULL; diff --git a/target/arm/cpu.c b/target/arm/cpu.c index 20a3445bda..f61ca660e6 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -684,6 +684,9 @@ static void arm_cpu_realizefn(DeviceState *dev, Error **errp) CPUARMState *env = &cpu->env; int pagebits; Error *local_err = NULL; +#ifndef CONFIG_USER_ONLY + AddressSpace *as; +#endif cpu_exec_realizefn(cs, &local_err); if (local_err != NULL) { @@ -874,24 +877,21 @@ static void arm_cpu_realizefn(DeviceState *dev, Error **errp) #ifndef CONFIG_USER_ONLY if (cpu->has_el3 || arm_feature(env, ARM_FEATURE_M_SECURITY)) { - AddressSpace *as; + as = g_new0(AddressSpace, 1); cs->num_ases = 2; if (!cpu->secure_memory) { cpu->secure_memory = cs->memory; } - as = address_space_init_shareable(cpu->secure_memory, - "cpu-secure-memory"); + address_space_init(as, cpu->secure_memory, "cpu-secure-memory"); cpu_address_space_init(cs, as, ARMASIdx_S); } else { cs->num_ases = 1; } - - cpu_address_space_init(cs, - address_space_init_shareable(cs->memory, - "cpu-memory"), - ARMASIdx_NS); + as = g_new0(AddressSpace, 1); + address_space_init(as, cs->memory, "cpu-memory"); + cpu_address_space_init(cs, as, ARMASIdx_NS); #endif qemu_init_vcpu(cs); diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 0aa28fc775..98732cd65f 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -3738,10 +3738,11 @@ static void x86_cpu_realizefn(DeviceState *dev, Error **errp) #ifndef CONFIG_USER_ONLY if (tcg_enabled()) { - AddressSpace *as_normal = address_space_init_shareable(cs->memory, - "cpu-memory"); + AddressSpace *as_normal = g_new0(AddressSpace, 1); AddressSpace *as_smm = g_new(AddressSpace, 1); + address_space_init(as_normal, cs->memory, "cpu-memory"); + cpu->cpu_as_mem = g_new(MemoryRegion, 1); cpu->cpu_as_root = g_new(MemoryRegion, 1); -- cgit v1.2.3-70-g09d2 From 202fc01b05572ecb258fdf4c5bd56cf6de8140c7 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 21 Sep 2017 18:51:09 +1000 Subject: memory: Create FlatView directly This avoids usual memory_region_transaction_commit() which rebuilds all FVs. On POWER8 with 255 CPUs, 255 virtio-net, 40 PCI bridges guest this brings down the boot time from 25s to 20s and reduces the amount of temporary FVs allocated during machine constructon (~800000 -> ~640000) and amount of temporary dispatch trees (~370000 -> ~300000), the total memory footprint goes down (18G -> 17G). Signed-off-by: Alexey Kardashevskiy Message-Id: <20170921085110.25598-18-aik@ozlabs.ru> Signed-off-by: Paolo Bonzini --- memory.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/memory.c b/memory.c index aa7355bafb..706c38508f 100644 --- a/memory.c +++ b/memory.c @@ -1010,6 +1010,17 @@ static void address_space_set_flatview(AddressSpace *as) } } +static void address_space_update_topology(AddressSpace *as) +{ + MemoryRegion *physmr = memory_region_get_flatview_root(as->root); + + flatviews_init(); + if (!g_hash_table_lookup(flat_views, physmr)) { + generate_memory_topology(physmr); + } + address_space_set_flatview(as); +} + void memory_region_transaction_begin(void) { qemu_flush_coalesced_mmio_buffer(); @@ -2721,7 +2732,6 @@ void memory_region_invalidate_mmio_ptr(MemoryRegion *mr, hwaddr offset, void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name) { memory_region_ref(root); - memory_region_transaction_begin(); as->root = root; as->current_map = NULL; as->ioeventfd_nb = 0; @@ -2729,8 +2739,8 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name) QTAILQ_INIT(&as->listeners); QTAILQ_INSERT_TAIL(&address_spaces, as, address_spaces_link); as->name = g_strdup(name ? name : "anonymous"); - memory_region_update_pending |= root->enabled; - memory_region_transaction_commit(); + address_space_update_topology(as); + address_space_update_ioeventfds(as); } static void do_address_space_destroy(AddressSpace *as) -- cgit v1.2.3-70-g09d2 From 02d9651d6a46479e9d70b72dca34e43605d06cda Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 21 Sep 2017 12:34:00 +0200 Subject: memory: trace FlatView creation and destruction Signed-off-by: Paolo Bonzini --- include/exec/memory.h | 1 - include/qemu/typedefs.h | 1 + memory.c | 3 +++ trace-events | 3 +++ 4 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/exec/memory.h b/include/exec/memory.h index 402824c6f2..5ed4042f87 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -48,7 +48,6 @@ typedef struct MemoryRegionOps MemoryRegionOps; typedef struct MemoryRegionMmio MemoryRegionMmio; -typedef struct FlatView FlatView; struct MemoryRegionMmio { CPUReadMemoryFunc *read[3]; diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h index 163550214c..980d2b330e 100644 --- a/include/qemu/typedefs.h +++ b/include/qemu/typedefs.h @@ -30,6 +30,7 @@ typedef struct DisplaySurface DisplaySurface; typedef struct DriveInfo DriveInfo; typedef struct Error Error; typedef struct EventNotifier EventNotifier; +typedef struct FlatView FlatView; typedef struct FWCfgEntry FWCfgEntry; typedef struct FWCfgIoState FWCfgIoState; typedef struct FWCfgMemState FWCfgMemState; diff --git a/memory.c b/memory.c index 706c38508f..68cdf8bad0 100644 --- a/memory.c +++ b/memory.c @@ -270,6 +270,7 @@ static FlatView *flatview_new(MemoryRegion *mr_root) view->ref = 1; view->root = mr_root; memory_region_ref(mr_root); + trace_flatview_new(view, mr_root); return view; } @@ -295,6 +296,7 @@ static void flatview_destroy(FlatView *view) { int i; + trace_flatview_destroy(view, view->root); if (view->dispatch) { address_space_dispatch_free(view->dispatch); } @@ -314,6 +316,7 @@ static bool flatview_ref(FlatView *view) static void flatview_unref(FlatView *view) { if (atomic_fetch_dec(&view->ref) == 1) { + trace_flatview_destroy_rcu(view, view->root); call_rcu(view, flatview_destroy, rcu); } } diff --git a/trace-events b/trace-events index 1f50f56d9d..1d2eb5d3e4 100644 --- a/trace-events +++ b/trace-events @@ -64,6 +64,9 @@ memory_region_tb_read(int cpu_index, uint64_t addr, uint64_t value, unsigned siz memory_region_tb_write(int cpu_index, uint64_t addr, uint64_t value, unsigned size) "cpu %d addr 0x%"PRIx64" value 0x%"PRIx64" size %u" memory_region_ram_device_read(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u" memory_region_ram_device_write(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u" +flatview_new(FlatView *view, MemoryRegion *root) "%p (root %p)" +flatview_destroy(FlatView *view, MemoryRegion *root) "%p (root %p)" +flatview_destroy_rcu(FlatView *view, MemoryRegion *root) "%p (root %p)" ### Guest events, keep at bottom -- cgit v1.2.3-70-g09d2 From e673ba9af9bf8fd8e0f44025ac738b8285b3ed27 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 21 Sep 2017 12:28:16 +0200 Subject: memory: seek FlatView sharing candidates among children subregions A container can be used instead of an alias to allow switching between multiple subregions. In this case we cannot directly share the subregions (since they only belong to a single parent), but if the subregions are aliases we can in turn walk those. This is not enough to remove all source of quadratic FlatView creation, but it enables sharing of the PCI bus master FlatViews (and their AddressSpaceDispatch structures) across all PCI devices. For 112 virtio-net-pci devices, boot time is reduced from 25 to 10 seconds and memory consumption from 1.4 to 1 G. Signed-off-by: Paolo Bonzini --- memory.c | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/memory.c b/memory.c index 68cdf8bad0..15b1bd7d89 100644 --- a/memory.c +++ b/memory.c @@ -734,12 +734,40 @@ static void render_memory_region(FlatView *view, static MemoryRegion *memory_region_get_flatview_root(MemoryRegion *mr) { - while (mr->alias && !mr->alias_offset && - int128_ge(mr->size, mr->alias->size)) { - /* The alias is included in its entirety. Use it as - * the "real" root, so that we can share more FlatViews. - */ - mr = mr->alias; + while (mr->enabled) { + if (mr->alias) { + if (!mr->alias_offset && int128_ge(mr->size, mr->alias->size)) { + /* The alias is included in its entirety. Use it as + * the "real" root, so that we can share more FlatViews. + */ + mr = mr->alias; + continue; + } + } else if (!mr->terminates) { + unsigned int found = 0; + MemoryRegion *child, *next = NULL; + QTAILQ_FOREACH(child, &mr->subregions, subregions_link) { + if (child->enabled) { + if (++found > 1) { + next = NULL; + break; + } + if (!child->addr && int128_ge(mr->size, child->size)) { + /* A child is included in its entirety. If it's the only + * enabled one, use it in the hope of finding an alias down the + * way. This will also let us share FlatViews. + */ + next = child; + } + } + } + if (next) { + mr = next; + continue; + } + } + + break; } return mr; -- cgit v1.2.3-70-g09d2 From 092aa2fc65b7a35121616aad8f39d47b8f921618 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 21 Sep 2017 18:51:07 +1000 Subject: memory: Share special empty FlatView This shares an cached empty FlatView among address spaces. The empty FV is used every time when a root MR renders into a FV without memory sections which happens when MR or its children are not enabled or zero-sized. The empty_view is not NULL to keep the rest of memory API intact; it also has a dispatch tree for the same reason. On POWER8 with 255 CPUs, 255 virtio-net, 40 PCI bridges guest this halves the amount of FlatView's in use (557 -> 260) and dispatch tables (~800000 -> ~370000). In an unrelated experiment with 112 non-virtio devices on x86 ("-M pc"), only 4 FlatViews are alive, and about ~2000 are created at startup. Signed-off-by: Alexey Kardashevskiy Message-Id: <20170921085110.25598-16-aik@ozlabs.ru> Signed-off-by: Paolo Bonzini --- memory.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/memory.c b/memory.c index 15b1bd7d89..5e6351a6c1 100644 --- a/memory.c +++ b/memory.c @@ -317,6 +317,7 @@ static void flatview_unref(FlatView *view) { if (atomic_fetch_dec(&view->ref) == 1) { trace_flatview_destroy_rcu(view, view->root); + assert(view->root); call_rcu(view, flatview_destroy, rcu); } } @@ -761,16 +762,19 @@ static MemoryRegion *memory_region_get_flatview_root(MemoryRegion *mr) } } } + if (found == 0) { + return NULL; + } if (next) { mr = next; continue; } } - break; + return mr; } - return mr; + return NULL; } /* Render a memory topology into a list of disjoint absolute ranges. */ @@ -966,12 +970,22 @@ static void address_space_update_topology_pass(AddressSpace *as, static void flatviews_init(void) { + static FlatView *empty_view; + if (flat_views) { return; } flat_views = g_hash_table_new_full(g_direct_hash, g_direct_equal, NULL, (GDestroyNotify) flatview_unref); + if (!empty_view) { + empty_view = generate_memory_topology(NULL); + /* We keep it alive forever in the global variable. */ + flatview_ref(empty_view); + } else { + g_hash_table_replace(flat_views, NULL, empty_view); + flatview_ref(empty_view); + } } static void flatviews_reset(void) -- cgit v1.2.3-70-g09d2 From 7c9e527659c67d4d7b41d9504f93d2d7ee482488 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 21 Aug 2017 18:58:56 +0200 Subject: scsi, file-posix: add support for persistent reservation management It is a common requirement for virtual machine to send persistent reservations, but this currently requires either running QEMU with CAP_SYS_RAWIO, or using out-of-tree patches that let an unprivileged QEMU bypass Linux's filter on SG_IO commands. As an alternative mechanism, the next patches will introduce a privileged helper to run persistent reservation commands without expanding QEMU's attack surface unnecessarily. The helper is invoked through a "pr-manager" QOM object, to which file-posix.c passes SG_IO requests for PERSISTENT RESERVE OUT and PERSISTENT RESERVE IN commands. For example: $ qemu-system-x86_64 -device virtio-scsi \ -object pr-manager-helper,id=helper0,path=/var/run/qemu-pr-helper.sock -drive if=none,id=hd,driver=raw,file.filename=/dev/sdb,file.pr-manager=helper0 -device scsi-block,drive=hd or: $ qemu-system-x86_64 -device virtio-scsi \ -object pr-manager-helper,id=helper0,path=/var/run/qemu-pr-helper.sock -blockdev node-name=hd,driver=raw,file.driver=host_device,file.filename=/dev/sdb,file.pr-manager=helper0 -device scsi-block,drive=hd Multiple pr-manager implementations are conceivable and possible, though only one is implemented right now. For example, a pr-manager could: - talk directly to the multipath daemon from a privileged QEMU (i.e. QEMU links to libmpathpersist); this makes reservation work properly with multipath, but still requires CAP_SYS_RAWIO - use the Linux IOC_PR_* ioctls (they require CAP_SYS_ADMIN though) - more interestingly, implement reservations directly in QEMU through file system locks or a shared database (e.g. sqlite) Signed-off-by: Paolo Bonzini --- Makefile.objs | 1 + block/file-posix.c | 30 +++++++++++++ docs/pr-manager.rst | 51 ++++++++++++++++++++++ include/scsi/pr-manager.h | 56 ++++++++++++++++++++++++ qapi/block-core.json | 4 ++ scsi/Makefile.objs | 2 + scsi/pr-manager.c | 109 ++++++++++++++++++++++++++++++++++++++++++++++ scsi/trace-events | 3 ++ vl.c | 3 +- 9 files changed, 258 insertions(+), 1 deletion(-) create mode 100644 docs/pr-manager.rst create mode 100644 include/scsi/pr-manager.h create mode 100644 scsi/pr-manager.c create mode 100644 scsi/trace-events diff --git a/Makefile.objs b/Makefile.objs index 9e89100302..bdfa3b6177 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -171,6 +171,7 @@ trace-events-subdirs += qapi trace-events-subdirs += accel/tcg trace-events-subdirs += accel/kvm trace-events-subdirs += nbd +trace-events-subdirs += scsi trace-events-files = $(SRC_PATH)/trace-events $(trace-events-subdirs:%=$(SRC_PATH)/%/trace-events) diff --git a/block/file-posix.c b/block/file-posix.c index 6acbd56238..ab12a2b591 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -33,6 +33,9 @@ #include "block/raw-aio.h" #include "qapi/qmp/qstring.h" +#include "scsi/pr-manager.h" +#include "scsi/constants.h" + #if defined(__APPLE__) && (__MACH__) #include #include @@ -155,6 +158,8 @@ typedef struct BDRVRawState { bool page_cache_inconsistent:1; bool has_fallocate; bool needs_alignment; + + PRManager *pr_mgr; } BDRVRawState; typedef struct BDRVRawReopenState { @@ -402,6 +407,11 @@ static QemuOptsList raw_runtime_opts = { .type = QEMU_OPT_STRING, .help = "file locking mode (on/off/auto, default: auto)", }, + { + .name = "pr-manager", + .type = QEMU_OPT_STRING, + .help = "id of persistent reservation manager object (default: none)", + }, { /* end of list */ } }, }; @@ -413,6 +423,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, QemuOpts *opts; Error *local_err = NULL; const char *filename = NULL; + const char *str; BlockdevAioOptions aio, aio_default; int fd, ret; struct stat st; @@ -476,6 +487,16 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, abort(); } + str = qemu_opt_get(opts, "pr-manager"); + if (str) { + s->pr_mgr = pr_manager_lookup(str, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto fail; + } + } + s->open_flags = open_flags; raw_parse_flags(bdrv_flags, &s->open_flags); @@ -2597,6 +2618,15 @@ static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs, if (fd_open(bs) < 0) return NULL; + if (req == SG_IO && s->pr_mgr) { + struct sg_io_hdr *io_hdr = buf; + if (io_hdr->cmdp[0] == PERSISTENT_RESERVE_OUT || + io_hdr->cmdp[0] == PERSISTENT_RESERVE_IN) { + return pr_manager_execute(s->pr_mgr, bdrv_get_aio_context(bs), + s->fd, io_hdr, cb, opaque); + } + } + acb = g_new(RawPosixAIOData, 1); acb->bs = bs; acb->aio_type = QEMU_AIO_IOCTL; diff --git a/docs/pr-manager.rst b/docs/pr-manager.rst new file mode 100644 index 0000000000..b6089fb57c --- /dev/null +++ b/docs/pr-manager.rst @@ -0,0 +1,51 @@ +====================================== +Persistent reservation managers +====================================== + +SCSI persistent Reservations allow restricting access to block devices +to specific initiators in a shared storage setup. When implementing +clustering of virtual machines, it is a common requirement for virtual +machines to send persistent reservation SCSI commands. However, +the operating system restricts sending these commands to unprivileged +programs because incorrect usage can disrupt regular operation of the +storage fabric. + +For this reason, QEMU's SCSI passthrough devices, ``scsi-block`` +and ``scsi-generic`` (both are only available on Linux) can delegate +implementation of persistent reservations to a separate object, +the "persistent reservation manager". Only PERSISTENT RESERVE OUT and +PERSISTENT RESERVE IN commands are passed to the persistent reservation +manager object; other commands are processed by QEMU as usual. + +----------------------------------------- +Defining a persistent reservation manager +----------------------------------------- + +A persistent reservation manager is an instance of a subclass of the +"pr-manager" QOM class. + +Right now only one subclass is defined, ``pr-manager-helper``, which +forwards the commands to an external privileged helper program +over Unix sockets. The helper program only allows sending persistent +reservation commands to devices for which QEMU has a file descriptor, +so that QEMU will not be able to effect persistent reservations +unless it has access to both the socket and the device. + +``pr-manager-helper`` has a single string property, ``path``, which +accepts the path to the helper program's Unix socket. For example, +the following command line defines a ``pr-manager-helper`` object and +attaches it to a SCSI passthrough device:: + + $ qemu-system-x86_64 + -device virtio-scsi \ + -object pr-manager-helper,id=helper0,path=/var/run/qemu-pr-helper.sock + -drive if=none,id=hd,driver=raw,file.filename=/dev/sdb,file.pr-manager=helper0 + -device scsi-block,drive=hd + +Alternatively, using ``-blockdev``:: + + $ qemu-system-x86_64 + -device virtio-scsi \ + -object pr-manager-helper,id=helper0,path=/var/run/qemu-pr-helper.sock + -blockdev node-name=hd,driver=raw,file.driver=host_device,file.filename=/dev/sdb,file.pr-manager=helper0 + -device scsi-block,drive=hd diff --git a/include/scsi/pr-manager.h b/include/scsi/pr-manager.h new file mode 100644 index 0000000000..b2b37d63bc --- /dev/null +++ b/include/scsi/pr-manager.h @@ -0,0 +1,56 @@ +#ifndef PR_MANAGER_H +#define PR_MANAGER_H + +#include "qom/object.h" +#include "qapi/qmp/qdict.h" +#include "qapi/visitor.h" +#include "qom/object_interfaces.h" +#include "block/aio.h" + +#define TYPE_PR_MANAGER "pr-manager" + +#define PR_MANAGER_CLASS(klass) \ + OBJECT_CLASS_CHECK(PRManagerClass, (klass), TYPE_PR_MANAGER) +#define PR_MANAGER_GET_CLASS(obj) \ + OBJECT_GET_CLASS(PRManagerClass, (obj), TYPE_PR_MANAGER) +#define PR_MANAGER(obj) \ + OBJECT_CHECK(PRManager, (obj), TYPE_PR_MANAGER) + +struct sg_io_hdr; + +typedef struct PRManager { + /* */ + Object parent; +} PRManager; + +/** + * PRManagerClass: + * @parent_class: the base class + * @run: callback invoked in thread pool context + */ +typedef struct PRManagerClass { + /* */ + ObjectClass parent_class; + + /* */ + int (*run)(PRManager *pr_mgr, int fd, struct sg_io_hdr *hdr); +} PRManagerClass; + +BlockAIOCB *pr_manager_execute(PRManager *pr_mgr, + AioContext *ctx, int fd, + struct sg_io_hdr *hdr, + BlockCompletionFunc *complete, + void *opaque); + +#ifdef CONFIG_LINUX +PRManager *pr_manager_lookup(const char *id, Error **errp); +#else +static inline PRManager *pr_manager_lookup(const char *id, Error **errp) +{ + /* The classes do not exist at all! */ + error_setg(errp, "No persistent reservation manager with id '%s'", id); + return NULL; +} +#endif + +#endif diff --git a/qapi/block-core.json b/qapi/block-core.json index bb11815608..c69a395804 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -2241,6 +2241,9 @@ # Driver specific block device options for the file backend. # # @filename: path to the image file +# @pr-manager: the id for the object that will handle persistent reservations +# for this device (default: none, forward the commands via SG_IO; +# since 2.11) # @aio: AIO backend (default: threads) (since: 2.8) # @locking: whether to enable file locking. If set to 'auto', only enable # when Open File Descriptor (OFD) locking API is available @@ -2250,6 +2253,7 @@ ## { 'struct': 'BlockdevOptionsFile', 'data': { 'filename': 'str', + '*pr-manager': 'str', '*locking': 'OnOffAuto', '*aio': 'BlockdevAioOptions' } } diff --git a/scsi/Makefile.objs b/scsi/Makefile.objs index 31b82a5a36..5496d2ae6a 100644 --- a/scsi/Makefile.objs +++ b/scsi/Makefile.objs @@ -1 +1,3 @@ block-obj-y += utils.o + +block-obj-$(CONFIG_LINUX) += pr-manager.o diff --git a/scsi/pr-manager.c b/scsi/pr-manager.c new file mode 100644 index 0000000000..87c45db5d4 --- /dev/null +++ b/scsi/pr-manager.c @@ -0,0 +1,109 @@ +/* + * Persistent reservation manager abstract class + * + * Copyright (c) 2017 Red Hat, Inc. + * + * Author: Paolo Bonzini + * + * This code is licensed under the LGPL. + * + */ + +#include "qemu/osdep.h" +#include + +#include "qapi/error.h" +#include "block/aio.h" +#include "block/thread-pool.h" +#include "scsi/pr-manager.h" +#include "trace.h" + +typedef struct PRManagerData { + PRManager *pr_mgr; + struct sg_io_hdr *hdr; + int fd; +} PRManagerData; + +static int pr_manager_worker(void *opaque) +{ + PRManagerData *data = opaque; + PRManager *pr_mgr = data->pr_mgr; + PRManagerClass *pr_mgr_class = + PR_MANAGER_GET_CLASS(pr_mgr); + struct sg_io_hdr *hdr = data->hdr; + int fd = data->fd; + int r; + + g_free(data); + trace_pr_manager_run(fd, hdr->cmdp[0], hdr->cmdp[1]); + + /* The reference was taken in pr_manager_execute. */ + r = pr_mgr_class->run(pr_mgr, fd, hdr); + object_unref(OBJECT(pr_mgr)); + return r; +} + + +BlockAIOCB *pr_manager_execute(PRManager *pr_mgr, + AioContext *ctx, int fd, + struct sg_io_hdr *hdr, + BlockCompletionFunc *complete, + void *opaque) +{ + PRManagerData *data = g_new(PRManagerData, 1); + ThreadPool *pool = aio_get_thread_pool(ctx); + + trace_pr_manager_execute(fd, hdr->cmdp[0], hdr->cmdp[1], opaque); + data->pr_mgr = pr_mgr; + data->fd = fd; + data->hdr = hdr; + + /* The matching object_unref is in pr_manager_worker. */ + object_ref(OBJECT(pr_mgr)); + return thread_pool_submit_aio(pool, pr_manager_worker, + data, complete, opaque); +} + +static const TypeInfo pr_manager_info = { + .parent = TYPE_OBJECT, + .name = TYPE_PR_MANAGER, + .class_size = sizeof(PRManagerClass), + .abstract = true, + .interfaces = (InterfaceInfo[]) { + { TYPE_USER_CREATABLE }, + { } + } +}; + +PRManager *pr_manager_lookup(const char *id, Error **errp) +{ + Object *obj; + PRManager *pr_mgr; + + obj = object_resolve_path_component(object_get_objects_root(), id); + if (!obj) { + error_setg(errp, "No persistent reservation manager with id '%s'", id); + return NULL; + } + + pr_mgr = (PRManager *) + object_dynamic_cast(obj, + TYPE_PR_MANAGER); + if (!pr_mgr) { + error_setg(errp, + "Object with id '%s' is not a persistent reservation manager", + id); + return NULL; + } + + return pr_mgr; +} + +static void +pr_manager_register_types(void) +{ + type_register_static(&pr_manager_info); +} + + +type_init(pr_manager_register_types); diff --git a/scsi/trace-events b/scsi/trace-events new file mode 100644 index 0000000000..45f5b6e49b --- /dev/null +++ b/scsi/trace-events @@ -0,0 +1,3 @@ +# scsi/pr-manager.c +pr_manager_execute(int fd, int cmd, int sa, void *opaque) "fd=%d cmd=0x%02x service action=0x%02x opaque=%p" +pr_manager_run(int fd, int cmd, int sa) "fd=%d cmd=0x%02x service action=0x%02x" diff --git a/vl.c b/vl.c index 9bb5058c3a..a121a65731 100644 --- a/vl.c +++ b/vl.c @@ -2893,7 +2893,8 @@ static int machine_set_property(void *opaque, */ static bool object_create_initial(const char *type) { - if (g_str_equal(type, "rng-egd")) { + if (g_str_equal(type, "rng-egd") || + g_str_has_prefix(type, "pr-manager-")) { return false; } -- cgit v1.2.3-70-g09d2 From b855f8d175a0a26c9798cbc5962bb8c0d9538231 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 22 Aug 2017 06:50:18 +0200 Subject: scsi: build qemu-pr-helper Introduce a privileged helper to run persistent reservation commands. This lets virtual machines send persistent reservations without using CAP_SYS_RAWIO or out-of-tree patches. The helper uses Unix permissions and SCM_RIGHTS to restrict access to processes that can access its socket and prove that they have an open file descriptor for a raw SCSI device. The next patch will also correct the usage of persistent reservations with multipath devices. It would also be possible to support for Linux's IOC_PR_* ioctls in the future, to support NVMe devices. For now, however, only SCSI is supported. Signed-off-by: Paolo Bonzini --- Makefile | 4 +- configure | 14 +- docs/interop/pr-helper.rst | 83 +++++ docs/pr-manager.rst | 33 ++ scsi/pr-helper.h | 41 +++ scsi/qemu-pr-helper.c | 735 +++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 905 insertions(+), 5 deletions(-) create mode 100644 docs/interop/pr-helper.rst create mode 100644 scsi/pr-helper.h create mode 100644 scsi/qemu-pr-helper.c diff --git a/Makefile b/Makefile index eb831b98d1..8406aeb8cb 100644 --- a/Makefile +++ b/Makefile @@ -372,6 +372,8 @@ qemu-bridge-helper$(EXESUF): qemu-bridge-helper.o $(COMMON_LDADDS) fsdev/virtfs-proxy-helper$(EXESUF): fsdev/virtfs-proxy-helper.o fsdev/9p-marshal.o fsdev/9p-iov-marshal.o $(COMMON_LDADDS) fsdev/virtfs-proxy-helper$(EXESUF): LIBS += -lcap +scsi/qemu-pr-helper$(EXESUF): scsi/qemu-pr-helper.o scsi/utils.o $(crypto-obj-y) $(io-obj-y) $(qom-obj-y) $(COMMON_LDADDS) + qemu-img-cmds.h: $(SRC_PATH)/qemu-img-cmds.hx $(SRC_PATH)/scripts/hxtool $(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -h < $< > $@,"GEN","$@") @@ -488,7 +490,7 @@ clean: rm -f *.msi find . \( -name '*.so' -o -name '*.dll' -o -name '*.mo' -o -name '*.[oda]' \) -type f -exec rm {} + rm -f $(filter-out %.tlb,$(TOOLS)) $(HELPERS-y) qemu-ga TAGS cscope.* *.pod *~ */*~ - rm -f fsdev/*.pod + rm -f fsdev/*.pod scsi/*.pod rm -f qemu-img-cmds.h rm -f ui/shader/*-vert.h ui/shader/*-frag.h @# May not be present in GENERATED_FILES diff --git a/configure b/configure index cb0f7ed0ab..becc21a0fe 100755 --- a/configure +++ b/configure @@ -5034,16 +5034,22 @@ if test "$want_tools" = "yes" ; then fi fi if test "$softmmu" = yes ; then - if test "$virtfs" != no ; then - if test "$cap" = yes && test "$linux" = yes && test "$attr" = yes ; then + if test "$linux" = yes; then + if test "$virtfs" != no && test "$cap" = yes && test "$attr" = yes ; then virtfs=yes tools="$tools fsdev/virtfs-proxy-helper\$(EXESUF)" else if test "$virtfs" = yes; then - error_exit "VirtFS is supported only on Linux and requires libcap devel and libattr devel" + error_exit "VirtFS requires libcap devel and libattr devel" fi virtfs=no fi + tools="$tools scsi/qemu-pr-helper\$(EXESUF)" + else + if test "$virtfs" = yes; then + error_exit "VirtFS is supported only on Linux" + fi + virtfs=no fi fi @@ -6506,7 +6512,7 @@ fi # build tree in object directory in case the source is not in the current directory DIRS="tests tests/tcg tests/tcg/cris tests/tcg/lm32 tests/libqos tests/qapi-schema tests/tcg/xtensa tests/qemu-iotests" -DIRS="$DIRS docs docs/interop fsdev" +DIRS="$DIRS docs docs/interop fsdev scsi" DIRS="$DIRS pc-bios/optionrom pc-bios/spapr-rtas pc-bios/s390-ccw" DIRS="$DIRS roms/seabios roms/vgabios" DIRS="$DIRS qapi-generated" diff --git a/docs/interop/pr-helper.rst b/docs/interop/pr-helper.rst new file mode 100644 index 0000000000..9f76d5bcf9 --- /dev/null +++ b/docs/interop/pr-helper.rst @@ -0,0 +1,83 @@ +.. + +====================================== +Persistent reservation helper protocol +====================================== + +QEMU's SCSI passthrough devices, ``scsi-block`` and ``scsi-generic``, +can delegate implementation of persistent reservations to an external +(and typically privileged) program. Persistent Reservations allow +restricting access to block devices to specific initiators in a shared +storage setup. + +For a more detailed reference please refer the the SCSI Primary +Commands standard, specifically the section on Reservations and the +"PERSISTENT RESERVE IN" and "PERSISTENT RESERVE OUT" commands. + +This document describes the socket protocol used between QEMU's +``pr-manager-helper`` object and the external program. + +.. contents:: + +Connection and initialization +----------------------------- + +All data transmitted on the socket is big-endian. + +After connecting to the helper program's socket, the helper starts a simple +feature negotiation process by writing four bytes corresponding to +the features it exposes (``supported_features``). QEMU reads it, +then writes four bytes corresponding to the desired features of the +helper program (``requested_features``). + +If a bit is 1 in ``requested_features`` and 0 in ``supported_features``, +the corresponding feature is not supported by the helper and the connection +is closed. On the other hand, it is acceptable for a bit to be 0 in +``requested_features`` and 1 in ``supported_features``; in this case, +the helper will not enable the feature. + +Right now no feature is defined, so the two parties always write four +zero bytes. + +Command format +-------------- + +It is invalid to send multiple commands concurrently on the same +socket. It is however possible to connect multiple sockets to the +helper and send multiple commands to the helper for one or more +file descriptors. + +A command consists of a request and a response. A request consists +of a 16-byte SCSI CDB. A file descriptor must be passed to the helper +together with the SCSI CDB using ancillary data. + +The CDB has the following limitations: + +- the command (stored in the first byte) must be one of 0x5E + (PERSISTENT RESERVE IN) or 0x5F (PERSISTENT RESERVE OUT). + +- the allocation length (stored in bytes 7-8 of the CDB for PERSISTENT + RESERVE IN) or parameter list length (stored in bytes 5-8 of the CDB + for PERSISTENT RESERVE OUT) is limited to 8 KiB. + +For PERSISTENT RESERVE OUT, the parameter list is sent right after the +CDB. The length of the parameter list is taken from the CDB itself. + +The helper's reply has the following structure: + +- 4 bytes for the SCSI status + +- 4 bytes for the payload size (nonzero only for PERSISTENT RESERVE IN + and only if the SCSI status is 0x00, i.e. GOOD) + +- 96 bytes for the SCSI sense data + +- if the size is nonzero, the payload follows + +The sense data is always sent to keep the protocol simple, even though +it is only valid if the SCSI status is CHECK CONDITION (0x02). + +The payload size is always less than or equal to the allocation length +specified in the CDB for the PERSISTENT RESERVE IN command. + +If the protocol is violated, the helper closes the socket. diff --git a/docs/pr-manager.rst b/docs/pr-manager.rst index b6089fb57c..7107e59fb8 100644 --- a/docs/pr-manager.rst +++ b/docs/pr-manager.rst @@ -49,3 +49,36 @@ Alternatively, using ``-blockdev``:: -object pr-manager-helper,id=helper0,path=/var/run/qemu-pr-helper.sock -blockdev node-name=hd,driver=raw,file.driver=host_device,file.filename=/dev/sdb,file.pr-manager=helper0 -device scsi-block,drive=hd + +---------------------------------- +Invoking :program:`qemu-pr-helper` +---------------------------------- + +QEMU provides an implementation of the persistent reservation helper, +called :program:`qemu-pr-helper`. The helper should be started as a +system service and supports the following option: + +-d, --daemon run in the background +-q, --quiet decrease verbosity +-f, --pidfile=path PID file when running as a daemon +-k, --socket=path path to the socket +-T, --trace=trace-opts tracing options + +By default, the socket and PID file are placed in the runtime state +directory, for example :file:`/var/run/qemu-pr-helper.sock` and +:file:`/var/run/qemu-pr-helper.pid`. The PID file is not created +unless :option:`-d` is passed too. + +:program:`qemu-pr-helper` can also use the systemd socket activation +protocol. In this case, the systemd socket unit should specify a +Unix stream socket, like this:: + + [Socket] + ListenStream=/var/run/qemu-pr-helper.sock + +After connecting to the socket, :program:`qemu-pr-helper`` can optionally drop +root privileges, except for those capabilities that are needed for +its operation. To do this, add the following options: + +-u, --user=user user to drop privileges to +-g, --group=group group to drop privileges to diff --git a/scsi/pr-helper.h b/scsi/pr-helper.h new file mode 100644 index 0000000000..96c50a9e5f --- /dev/null +++ b/scsi/pr-helper.h @@ -0,0 +1,41 @@ +/* Definitions for QEMU's persistent reservation helper daemon + * + * Copyright (C) 2017 Red Hat, Inc. + * + * Author: + * Paolo Bonzini + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifndef QEMU_PR_HELPER_H +#define QEMU_PR_HELPER_H 1 + +#include + +#define PR_HELPER_CDB_SIZE 16 +#define PR_HELPER_SENSE_SIZE 96 +#define PR_HELPER_DATA_SIZE 8192 + +typedef struct PRHelperResponse { + int32_t result; + int32_t sz; + uint8_t sense[PR_HELPER_SENSE_SIZE]; +} PRHelperResponse; + +#endif diff --git a/scsi/qemu-pr-helper.c b/scsi/qemu-pr-helper.c new file mode 100644 index 0000000000..f46266f733 --- /dev/null +++ b/scsi/qemu-pr-helper.c @@ -0,0 +1,735 @@ +/* + * Privileged helper to handle persistent reservation commands for QEMU + * + * Copyright (C) 2017 Red Hat, Inc. + * + * Author: Paolo Bonzini + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; under version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include "qemu/osdep.h" +#include +#include +#include +#include + +#ifdef CONFIG_LIBCAP +#include +#endif +#include +#include + +#include "qapi/error.h" +#include "qemu-common.h" +#include "qemu/cutils.h" +#include "qemu/main-loop.h" +#include "qemu/error-report.h" +#include "qemu/config-file.h" +#include "qemu/bswap.h" +#include "qemu/log.h" +#include "qemu/systemd.h" +#include "qapi/util.h" +#include "qapi/qmp/qstring.h" +#include "io/channel-socket.h" +#include "trace/control.h" +#include "qemu-version.h" + +#include "block/aio.h" +#include "block/thread-pool.h" + +#include "scsi/constants.h" +#include "scsi/utils.h" +#include "pr-helper.h" + +#define PR_OUT_FIXED_PARAM_SIZE 24 + +static char *socket_path; +static char *pidfile; +static enum { RUNNING, TERMINATE, TERMINATING } state; +static QIOChannelSocket *server_ioc; +static int server_watch; +static int num_active_sockets = 1; +static int verbose; + +#ifdef CONFIG_LIBCAP +static int uid = -1; +static int gid = -1; +#endif + +static void usage(const char *name) +{ + (printf) ( +"Usage: %s [OPTIONS] FILE\n" +"Persistent Reservation helper program for QEMU\n" +"\n" +" -h, --help display this help and exit\n" +" -V, --version output version information and exit\n" +"\n" +" -d, --daemon run in the background\n" +" -f, --pidfile=PATH PID file when running as a daemon\n" +" (default '%s')\n" +" -k, --socket=PATH path to the unix socket\n" +" (default '%s')\n" +" -T, --trace [[enable=]][,events=][,file=]\n" +" specify tracing options\n" +#ifdef CONFIG_LIBCAP +" -u, --user=USER user to drop privileges to\n" +" -g, --group=GROUP group to drop privileges to\n" +#endif +"\n" +QEMU_HELP_BOTTOM "\n" + , name, pidfile, socket_path); +} + +static void version(const char *name) +{ + printf( +"%s " QEMU_VERSION QEMU_PKGVERSION "\n" +"Written by Paolo Bonzini.\n" +"\n" +QEMU_COPYRIGHT "\n" +"This is free software; see the source for copying conditions. There is NO\n" +"warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n" + , name); +} + +static void write_pidfile(void) +{ + int pidfd; + char pidstr[32]; + + pidfd = qemu_open(pidfile, O_CREAT|O_WRONLY, S_IRUSR|S_IWUSR); + if (pidfd == -1) { + error_report("Cannot open pid file, %s", strerror(errno)); + exit(EXIT_FAILURE); + } + + if (lockf(pidfd, F_TLOCK, 0)) { + error_report("Cannot lock pid file, %s", strerror(errno)); + goto fail; + } + if (ftruncate(pidfd, 0)) { + error_report("Failed to truncate pid file"); + goto fail; + } + + snprintf(pidstr, sizeof(pidstr), "%d\n", getpid()); + if (write(pidfd, pidstr, strlen(pidstr)) != strlen(pidstr)) { + error_report("Failed to write pid file"); + goto fail; + } + return; + +fail: + unlink(pidfile); + close(pidfd); + exit(EXIT_FAILURE); +} + +/* SG_IO support */ + +typedef struct PRHelperSGIOData { + int fd; + const uint8_t *cdb; + uint8_t *sense; + uint8_t *buf; + int sz; /* input/output */ + int dir; +} PRHelperSGIOData; + +static int do_sgio_worker(void *opaque) +{ + PRHelperSGIOData *data = opaque; + struct sg_io_hdr io_hdr; + int ret; + int status; + SCSISense sense_code; + + memset(data->sense, 0, PR_HELPER_SENSE_SIZE); + memset(&io_hdr, 0, sizeof(io_hdr)); + io_hdr.interface_id = 'S'; + io_hdr.cmd_len = PR_HELPER_CDB_SIZE; + io_hdr.cmdp = (uint8_t *)data->cdb; + io_hdr.sbp = data->sense; + io_hdr.mx_sb_len = PR_HELPER_SENSE_SIZE; + io_hdr.timeout = 1; + io_hdr.dxfer_direction = data->dir; + io_hdr.dxferp = (char *)data->buf; + io_hdr.dxfer_len = data->sz; + ret = ioctl(data->fd, SG_IO, &io_hdr); + status = sg_io_sense_from_errno(ret < 0 ? errno : 0, &io_hdr, + &sense_code); + if (status == GOOD) { + data->sz -= io_hdr.resid; + } else { + data->sz = 0; + } + + if (status == CHECK_CONDITION && + !(io_hdr.driver_status & SG_ERR_DRIVER_SENSE)) { + scsi_build_sense(data->sense, sense_code); + } + + return status; +} + +static int do_sgio(int fd, const uint8_t *cdb, uint8_t *sense, + uint8_t *buf, int *sz, int dir) +{ + ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context()); + int r; + + PRHelperSGIOData data = { + .fd = fd, + .cdb = cdb, + .sense = sense, + .buf = buf, + .sz = *sz, + .dir = dir, + }; + + r = thread_pool_submit_co(pool, do_sgio_worker, &data); + *sz = data.sz; + return r; +} + +static int do_pr_in(int fd, const uint8_t *cdb, uint8_t *sense, + uint8_t *data, int *resp_sz) +{ + return do_sgio(fd, cdb, sense, data, resp_sz, + SG_DXFER_FROM_DEV); +} + +static int do_pr_out(int fd, const uint8_t *cdb, uint8_t *sense, + const uint8_t *param, int sz) +{ + int resp_sz = sz; + return do_sgio(fd, cdb, sense, (uint8_t *)param, &resp_sz, + SG_DXFER_TO_DEV); +} + +/* Client */ + +typedef struct PRHelperClient { + QIOChannelSocket *ioc; + Coroutine *co; + int fd; + uint8_t data[PR_HELPER_DATA_SIZE]; +} PRHelperClient; + +typedef struct PRHelperRequest { + int fd; + size_t sz; + uint8_t cdb[PR_HELPER_CDB_SIZE]; +} PRHelperRequest; + +static int coroutine_fn prh_read(PRHelperClient *client, void *buf, int sz, + Error **errp) +{ + int ret = 0; + + while (sz > 0) { + int *fds = NULL; + size_t nfds = 0; + int i; + struct iovec iov; + ssize_t n_read; + + iov.iov_base = buf; + iov.iov_len = sz; + n_read = qio_channel_readv_full(QIO_CHANNEL(client->ioc), &iov, 1, + &fds, &nfds, errp); + + if (n_read == QIO_CHANNEL_ERR_BLOCK) { + qio_channel_yield(QIO_CHANNEL(client->ioc), G_IO_IN); + continue; + } + if (n_read <= 0) { + ret = n_read ? n_read : -1; + goto err; + } + + /* Stash one file descriptor per request. */ + if (nfds) { + bool too_many = false; + for (i = 0; i < nfds; i++) { + if (client->fd == -1) { + client->fd = fds[i]; + } else { + close(fds[i]); + too_many = true; + } + } + g_free(fds); + if (too_many) { + ret = -1; + goto err; + } + } + + buf += n_read; + sz -= n_read; + } + + return 0; + +err: + if (client->fd != -1) { + close(client->fd); + client->fd = -1; + } + return ret; +} + +static int coroutine_fn prh_read_request(PRHelperClient *client, + PRHelperRequest *req, + PRHelperResponse *resp, Error **errp) +{ + uint32_t sz; + + if (prh_read(client, req->cdb, sizeof(req->cdb), NULL) < 0) { + return -1; + } + + if (client->fd == -1) { + error_setg(errp, "No file descriptor in request."); + return -1; + } + + if (req->cdb[0] != PERSISTENT_RESERVE_OUT && + req->cdb[0] != PERSISTENT_RESERVE_IN) { + error_setg(errp, "Invalid CDB, closing socket."); + goto out_close; + } + + sz = scsi_cdb_xfer(req->cdb); + if (sz > sizeof(client->data)) { + goto out_close; + } + + if (req->cdb[0] == PERSISTENT_RESERVE_OUT) { + if (qio_channel_read_all(QIO_CHANNEL(client->ioc), + (char *)client->data, sz, + errp) < 0) { + goto out_close; + } + if ((fcntl(client->fd, F_GETFL) & O_ACCMODE) == O_RDONLY) { + scsi_build_sense(resp->sense, SENSE_CODE(INVALID_OPCODE)); + sz = 0; + } else if (sz < PR_OUT_FIXED_PARAM_SIZE) { + /* Illegal request, Parameter list length error. This isn't fatal; + * we have read the data, send an error without closing the socket. + */ + scsi_build_sense(resp->sense, SENSE_CODE(INVALID_PARAM_LEN)); + sz = 0; + } + if (sz == 0) { + resp->result = CHECK_CONDITION; + close(client->fd); + client->fd = -1; + } + } + + req->fd = client->fd; + req->sz = sz; + client->fd = -1; + return sz; + +out_close: + close(client->fd); + client->fd = -1; + return -1; +} + +static int coroutine_fn prh_write_response(PRHelperClient *client, + PRHelperRequest *req, + PRHelperResponse *resp, Error **errp) +{ + ssize_t r; + size_t sz; + + if (req->cdb[0] == PERSISTENT_RESERVE_IN && resp->result == GOOD) { + assert(resp->sz <= req->sz && resp->sz <= sizeof(client->data)); + } else { + assert(resp->sz == 0); + } + + sz = resp->sz; + + resp->result = cpu_to_be32(resp->result); + resp->sz = cpu_to_be32(resp->sz); + r = qio_channel_write_all(QIO_CHANNEL(client->ioc), + (char *) resp, sizeof(*resp), errp); + if (r < 0) { + return r; + } + + r = qio_channel_write_all(QIO_CHANNEL(client->ioc), + (char *) client->data, + sz, errp); + return r < 0 ? r : 0; +} + +static void coroutine_fn prh_co_entry(void *opaque) +{ + PRHelperClient *client = opaque; + Error *local_err = NULL; + uint32_t flags; + int r; + + qio_channel_set_blocking(QIO_CHANNEL(client->ioc), + false, NULL); + qio_channel_attach_aio_context(QIO_CHANNEL(client->ioc), + qemu_get_aio_context()); + + /* A very simple negotiation for future extensibility. No features + * are defined so write 0. + */ + flags = cpu_to_be32(0); + r = qio_channel_write_all(QIO_CHANNEL(client->ioc), + (char *) &flags, sizeof(flags), NULL); + if (r < 0) { + goto out; + } + + r = qio_channel_read_all(QIO_CHANNEL(client->ioc), + (char *) &flags, sizeof(flags), NULL); + if (be32_to_cpu(flags) != 0 || r < 0) { + goto out; + } + + while (atomic_read(&state) == RUNNING) { + PRHelperRequest req; + PRHelperResponse resp; + int sz; + + sz = prh_read_request(client, &req, &resp, &local_err); + if (sz < 0) { + break; + } + + if (sz > 0) { + num_active_sockets++; + if (req.cdb[0] == PERSISTENT_RESERVE_OUT) { + r = do_pr_out(req.fd, req.cdb, resp.sense, + client->data, sz); + resp.sz = 0; + } else { + resp.sz = sizeof(client->data); + r = do_pr_in(req.fd, req.cdb, resp.sense, + client->data, &resp.sz); + resp.sz = MIN(resp.sz, sz); + } + num_active_sockets--; + close(req.fd); + if (r == -1) { + break; + } + resp.result = r; + } + + if (prh_write_response(client, &req, &resp, &local_err) < 0) { + break; + } + } + + if (local_err) { + if (verbose == 0) { + error_free(local_err); + } else { + error_report_err(local_err); + } + } + +out: + qio_channel_detach_aio_context(QIO_CHANNEL(client->ioc)); + object_unref(OBJECT(client->ioc)); + g_free(client); +} + +static gboolean accept_client(QIOChannel *ioc, GIOCondition cond, gpointer opaque) +{ + QIOChannelSocket *cioc; + PRHelperClient *prh; + + cioc = qio_channel_socket_accept(QIO_CHANNEL_SOCKET(ioc), + NULL); + if (!cioc) { + return TRUE; + } + + prh = g_new(PRHelperClient, 1); + prh->ioc = cioc; + prh->fd = -1; + prh->co = qemu_coroutine_create(prh_co_entry, prh); + qemu_coroutine_enter(prh->co); + + return TRUE; +} + + +/* + * Check socket parameters compatibility when socket activation is used. + */ +static const char *socket_activation_validate_opts(void) +{ + if (socket_path != NULL) { + return "Unix socket can't be set when using socket activation"; + } + + return NULL; +} + +static void compute_default_paths(void) +{ + if (!socket_path) { + socket_path = qemu_get_local_state_pathname("run/qemu-pr-helper.sock"); + } +} + +static void termsig_handler(int signum) +{ + atomic_cmpxchg(&state, RUNNING, TERMINATE); + qemu_notify_event(); +} + +static void close_server_socket(void) +{ + assert(server_ioc); + + g_source_remove(server_watch); + server_watch = -1; + object_unref(OBJECT(server_ioc)); + num_active_sockets--; +} + +#ifdef CONFIG_LIBCAP +static int drop_privileges(void) +{ + /* clear all capabilities */ + capng_clear(CAPNG_SELECT_BOTH); + + if (capng_update(CAPNG_ADD, CAPNG_EFFECTIVE | CAPNG_PERMITTED, + CAP_SYS_RAWIO) < 0) { + return -1; + } + + /* Change user/group id, retaining the capabilities. Because file descriptors + * are passed via SCM_RIGHTS, we don't need supplementary groups (and in + * fact the helper can run as "nobody"). + */ + if (capng_change_id(uid != -1 ? uid : getuid(), + gid != -1 ? gid : getgid(), + CAPNG_DROP_SUPP_GRP | CAPNG_CLEAR_BOUNDING)) { + return -1; + } + + return 0; +} +#endif + +int main(int argc, char **argv) +{ + const char *sopt = "hVk:fdT:u:g:q"; + struct option lopt[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, 'V' }, + { "socket", required_argument, NULL, 'k' }, + { "pidfile", no_argument, NULL, 'f' }, + { "daemon", no_argument, NULL, 'd' }, + { "trace", required_argument, NULL, 'T' }, + { "user", required_argument, NULL, 'u' }, + { "group", required_argument, NULL, 'g' }, + { "quiet", no_argument, NULL, 'q' }, + { NULL, 0, NULL, 0 } + }; + int opt_ind = 0; + int quiet = 0; + int ch; + Error *local_err = NULL; + char *trace_file = NULL; + bool daemonize = false; + unsigned socket_activation; + + struct sigaction sa_sigterm; + memset(&sa_sigterm, 0, sizeof(sa_sigterm)); + sa_sigterm.sa_handler = termsig_handler; + sigaction(SIGTERM, &sa_sigterm, NULL); + sigaction(SIGINT, &sa_sigterm, NULL); + sigaction(SIGHUP, &sa_sigterm, NULL); + + signal(SIGPIPE, SIG_IGN); + + module_call_init(MODULE_INIT_TRACE); + module_call_init(MODULE_INIT_QOM); + qemu_add_opts(&qemu_trace_opts); + qemu_init_exec_dir(argv[0]); + + pidfile = qemu_get_local_state_pathname("run/qemu-pr-helper.pid"); + + while ((ch = getopt_long(argc, argv, sopt, lopt, &opt_ind)) != -1) { + switch (ch) { + case 'k': + socket_path = optarg; + if (socket_path[0] != '/') { + error_report("socket path must be absolute"); + exit(EXIT_FAILURE); + } + break; + case 'f': + pidfile = optarg; + break; +#ifdef CONFIG_LIBCAP + case 'u': { + unsigned long res; + struct passwd *userinfo = getpwnam(optarg); + if (userinfo) { + uid = userinfo->pw_uid; + } else if (qemu_strtoul(optarg, NULL, 10, &res) == 0 && + (uid_t)res == res) { + uid = res; + } else { + error_report("invalid user '%s'", optarg); + exit(EXIT_FAILURE); + } + break; + } + case 'g': { + unsigned long res; + struct group *groupinfo = getgrnam(optarg); + if (groupinfo) { + gid = groupinfo->gr_gid; + } else if (qemu_strtoul(optarg, NULL, 10, &res) == 0 && + (gid_t)res == res) { + gid = res; + } else { + error_report("invalid group '%s'", optarg); + exit(EXIT_FAILURE); + } + break; + } +#else + case 'u': + case 'g': + error_report("-%c not supported by this %s", ch, argv[0]); + exit(1); +#endif + case 'd': + daemonize = true; + break; + case 'q': + quiet = 1; + break; + case 'T': + g_free(trace_file); + trace_file = trace_opt_parse(optarg); + break; + case 'V': + version(argv[0]); + exit(EXIT_SUCCESS); + break; + case 'h': + usage(argv[0]); + exit(EXIT_SUCCESS); + break; + case '?': + error_report("Try `%s --help' for more information.", argv[0]); + exit(EXIT_FAILURE); + } + } + + /* set verbosity */ + verbose = !quiet; + + if (!trace_init_backends()) { + exit(EXIT_FAILURE); + } + trace_init_file(trace_file); + qemu_set_log(LOG_TRACE); + + socket_activation = check_socket_activation(); + if (socket_activation == 0) { + SocketAddress saddr; + compute_default_paths(); + saddr = (SocketAddress){ + .type = SOCKET_ADDRESS_TYPE_UNIX, + .u.q_unix.path = g_strdup(socket_path) + }; + server_ioc = qio_channel_socket_new(); + if (qio_channel_socket_listen_sync(server_ioc, &saddr, &local_err) < 0) { + object_unref(OBJECT(server_ioc)); + error_report_err(local_err); + return 1; + } + g_free(saddr.u.q_unix.path); + } else { + /* Using socket activation - check user didn't use -p etc. */ + const char *err_msg = socket_activation_validate_opts(); + if (err_msg != NULL) { + error_report("%s", err_msg); + exit(EXIT_FAILURE); + } + + /* Can only listen on a single socket. */ + if (socket_activation > 1) { + error_report("%s does not support socket activation with LISTEN_FDS > 1", + argv[0]); + exit(EXIT_FAILURE); + } + server_ioc = qio_channel_socket_new_fd(FIRST_SOCKET_ACTIVATION_FD, + &local_err); + if (server_ioc == NULL) { + error_report("Failed to use socket activation: %s", + error_get_pretty(local_err)); + exit(EXIT_FAILURE); + } + socket_path = NULL; + } + + if (qemu_init_main_loop(&local_err)) { + error_report_err(local_err); + exit(EXIT_FAILURE); + } + + server_watch = qio_channel_add_watch(QIO_CHANNEL(server_ioc), + G_IO_IN, + accept_client, + NULL, NULL); + +#ifdef CONFIG_LIBCAP + if (drop_privileges() < 0) { + error_report("Failed to drop privileges: %s", strerror(errno)); + exit(EXIT_FAILURE); + } +#endif + + if (daemonize) { + if (daemon(0, 0) < 0) { + error_report("Failed to daemonize: %s", strerror(errno)); + exit(EXIT_FAILURE); + } + write_pidfile(); + } + + state = RUNNING; + do { + main_loop_wait(false); + if (state == TERMINATE) { + state = TERMINATING; + close_server_socket(); + } + } while (num_active_sockets > 0); + + exit(EXIT_SUCCESS); +} -- cgit v1.2.3-70-g09d2 From fe8fc5ae5c808e037fa4746cbfeb3c07ffe0af81 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 22 Aug 2017 06:50:55 +0200 Subject: scsi: add multipath support to qemu-pr-helper Proper support of persistent reservation for multipath devices requires communication with the multipath daemon, so that the reservation is registered and applied when a path comes up. The device mapper utilities provide a library to do so; this patch makes qemu-pr-helper.c detect multipath devices and, when one is found, delegate the operation to libmpathpersist. Signed-off-by: Paolo Bonzini --- Makefile | 3 + configure | 46 +++++++ docs/pr-manager.rst | 27 ++++ include/scsi/utils.h | 4 + scsi/qemu-pr-helper.c | 346 +++++++++++++++++++++++++++++++++++++++++++++++++- scsi/utils.c | 10 ++ 6 files changed, 433 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 8406aeb8cb..4eb40376d2 100644 --- a/Makefile +++ b/Makefile @@ -373,6 +373,9 @@ fsdev/virtfs-proxy-helper$(EXESUF): fsdev/virtfs-proxy-helper.o fsdev/9p-marshal fsdev/virtfs-proxy-helper$(EXESUF): LIBS += -lcap scsi/qemu-pr-helper$(EXESUF): scsi/qemu-pr-helper.o scsi/utils.o $(crypto-obj-y) $(io-obj-y) $(qom-obj-y) $(COMMON_LDADDS) +ifdef CONFIG_MPATH +scsi/qemu-pr-helper$(EXESUF): LIBS += -ludev -lmultipath -lmpathpersist +endif qemu-img-cmds.h: $(SRC_PATH)/qemu-img-cmds.hx $(SRC_PATH)/scripts/hxtool $(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -h < $< > $@,"GEN","$@") diff --git a/configure b/configure index becc21a0fe..f6edc2a33f 100755 --- a/configure +++ b/configure @@ -290,6 +290,7 @@ netmap="no" sdl="" sdlabi="" virtfs="" +mpath="" vnc="yes" sparse="no" vde="" @@ -936,6 +937,10 @@ for opt do ;; --enable-virtfs) virtfs="yes" ;; + --disable-mpath) mpath="no" + ;; + --enable-mpath) mpath="yes" + ;; --disable-vnc) vnc="no" ;; --enable-vnc) vnc="yes" @@ -1479,6 +1484,7 @@ disabled with --disable-FEATURE, default is enabled if available: vnc-png PNG compression for VNC server cocoa Cocoa UI (Mac OS X only) virtfs VirtFS + mpath Multipath persistent reservation passthrough xen xen backend driver support xen-pci-passthrough brlapi BrlAPI (Braile) @@ -3299,6 +3305,30 @@ else "Please install the pixman devel package." fi +########################################## +# libmpathpersist probe + +if test "$mpath" != "no" ; then + cat > $TMPC < +#include +unsigned mpath_mx_alloc_len = 1024; +int logsink; +int main(void) { + struct udev *udev = udev_new(); + mpath_lib_init(udev); + return 0; +} +EOF + if compile_prog "" "-ludev -lmultipath -lmpathpersist" ; then + mpathpersist=yes + else + mpathpersist=no + fi +else + mpathpersist=no +fi + ########################################## # libcap probe @@ -5044,12 +5074,24 @@ if test "$softmmu" = yes ; then fi virtfs=no fi + if test "$mpath" != no && test "$mpathpersist" = yes ; then + mpath=yes + else + if test "$mpath" = yes; then + error_exit "Multipath requires libmpathpersist devel" + fi + mpath=no + fi tools="$tools scsi/qemu-pr-helper\$(EXESUF)" else if test "$virtfs" = yes; then error_exit "VirtFS is supported only on Linux" fi virtfs=no + if test "$mpath" = yes; then + error_exit "Multipath is supported only on Linux" + fi + mpath=no fi fi @@ -5295,6 +5337,7 @@ echo "Audio drivers $audio_drv_list" echo "Block whitelist (rw) $block_drv_rw_whitelist" echo "Block whitelist (ro) $block_drv_ro_whitelist" echo "VirtFS support $virtfs" +echo "Multipath support $mpath" echo "VNC support $vnc" if test "$vnc" = "yes" ; then echo "VNC SASL support $vnc_sasl" @@ -5738,6 +5781,9 @@ fi if test "$virtfs" = "yes" ; then echo "CONFIG_VIRTFS=y" >> $config_host_mak fi +if test "$mpath" = "yes" ; then + echo "CONFIG_MPATH=y" >> $config_host_mak +fi if test "$vhost_scsi" = "yes" ; then echo "CONFIG_VHOST_SCSI=y" >> $config_host_mak fi diff --git a/docs/pr-manager.rst b/docs/pr-manager.rst index 7107e59fb8..9b1de198b1 100644 --- a/docs/pr-manager.rst +++ b/docs/pr-manager.rst @@ -60,6 +60,7 @@ system service and supports the following option: -d, --daemon run in the background -q, --quiet decrease verbosity +-v, --verbose increase verbosity -f, --pidfile=path PID file when running as a daemon -k, --socket=path path to the socket -T, --trace=trace-opts tracing options @@ -82,3 +83,29 @@ its operation. To do this, add the following options: -u, --user=user user to drop privileges to -g, --group=group group to drop privileges to + +--------------------------------------------- +Multipath devices and persistent reservations +--------------------------------------------- + +Proper support of persistent reservation for multipath devices requires +communication with the multipath daemon, so that the reservation is +registered and applied when a path is newly discovered or becomes online +again. :command:`qemu-pr-helper` can do this if the ``libmpathpersist`` +library was available on the system at build time. + +As of August 2017, a reservation key must be specified in ``multipath.conf`` +for ``multipathd`` to check for persistent reservation for newly +discovered paths or reinstated paths. The attribute can be added +to the ``defaults`` section or the ``multipaths`` section; for example:: + + multipaths { + multipath { + wwid XXXXXXXXXXXXXXXX + alias yellow + reservation_key 0x123abc + } + } + +Linking :program:`qemu-pr-helper` to ``libmpathpersist`` does not impede +its usage on regular SCSI devices. diff --git a/include/scsi/utils.h b/include/scsi/utils.h index d301b31768..00a4bdb080 100644 --- a/include/scsi/utils.h +++ b/include/scsi/utils.h @@ -72,10 +72,14 @@ extern const struct SCSISense sense_code_IO_ERROR; extern const struct SCSISense sense_code_I_T_NEXUS_LOSS; /* Command aborted, Logical Unit failure */ extern const struct SCSISense sense_code_LUN_FAILURE; +/* Command aborted, LUN Communication failure */ +extern const struct SCSISense sense_code_LUN_COMM_FAILURE; /* Command aborted, Overlapped Commands Attempted */ extern const struct SCSISense sense_code_OVERLAPPED_COMMANDS; /* LUN not ready, Capacity data has changed */ extern const struct SCSISense sense_code_CAPACITY_CHANGED; +/* Unit attention, SCSI bus reset */ +extern const struct SCSISense sense_code_SCSI_BUS_RESET; /* LUN not ready, Medium not present */ extern const struct SCSISense sense_code_UNIT_ATTENTION_NO_MEDIUM; /* Unit attention, Power on, reset or bus device reset occurred */ diff --git a/scsi/qemu-pr-helper.c b/scsi/qemu-pr-helper.c index f46266f733..d58184833f 100644 --- a/scsi/qemu-pr-helper.c +++ b/scsi/qemu-pr-helper.c @@ -30,6 +30,12 @@ #include #include +#ifdef CONFIG_MPATH +#include +#include +#include +#endif + #include "qapi/error.h" #include "qemu-common.h" #include "qemu/cutils.h" @@ -60,6 +66,7 @@ static enum { RUNNING, TERMINATE, TERMINATING } state; static QIOChannelSocket *server_ioc; static int server_watch; static int num_active_sockets = 1; +static int noisy; static int verbose; #ifdef CONFIG_LIBCAP @@ -204,9 +211,316 @@ static int do_sgio(int fd, const uint8_t *cdb, uint8_t *sense, return r; } +/* Device mapper interface */ + +#ifdef CONFIG_MPATH +#define CONTROL_PATH "/dev/mapper/control" + +typedef struct DMData { + struct dm_ioctl dm; + uint8_t data[1024]; +} DMData; + +static int control_fd; + +static void *dm_ioctl(int ioc, struct dm_ioctl *dm) +{ + static DMData d; + memcpy(&d.dm, dm, sizeof(d.dm)); + QEMU_BUILD_BUG_ON(sizeof(d.data) < sizeof(struct dm_target_spec)); + + d.dm.version[0] = DM_VERSION_MAJOR; + d.dm.version[1] = 0; + d.dm.version[2] = 0; + d.dm.data_size = 1024; + d.dm.data_start = offsetof(DMData, data); + if (ioctl(control_fd, ioc, &d) < 0) { + return NULL; + } + memcpy(dm, &d.dm, sizeof(d.dm)); + return &d.data; +} + +static void *dm_dev_ioctl(int fd, int ioc, struct dm_ioctl *dm) +{ + struct stat st; + int r; + + r = fstat(fd, &st); + if (r < 0) { + perror("fstat"); + exit(1); + } + + dm->dev = st.st_rdev; + return dm_ioctl(ioc, dm); +} + +static void dm_init(void) +{ + control_fd = open(CONTROL_PATH, O_RDWR); + if (control_fd < 0) { + perror("Cannot open " CONTROL_PATH); + exit(1); + } + struct dm_ioctl dm = { 0 }; + if (!dm_ioctl(DM_VERSION, &dm)) { + perror("ioctl"); + exit(1); + } + if (dm.version[0] != DM_VERSION_MAJOR) { + fprintf(stderr, "Unsupported device mapper interface"); + exit(1); + } +} + +/* Variables required by libmultipath and libmpathpersist. */ +QEMU_BUILD_BUG_ON(PR_HELPER_DATA_SIZE > MPATH_MAX_PARAM_LEN); +unsigned mpath_mx_alloc_len = PR_HELPER_DATA_SIZE; +int logsink; + +static void multipath_pr_init(void) +{ + static struct udev *udev; + + udev = udev_new(); + mpath_lib_init(udev); +} + +static int is_mpath(int fd) +{ + struct dm_ioctl dm = { .flags = DM_NOFLUSH_FLAG }; + struct dm_target_spec *tgt; + + tgt = dm_dev_ioctl(fd, DM_TABLE_STATUS, &dm); + if (!tgt) { + if (errno == ENXIO) { + return 0; + } + perror("ioctl"); + exit(EXIT_FAILURE); + } + return !strncmp(tgt->target_type, "multipath", DM_MAX_TYPE_NAME); +} + +static int mpath_reconstruct_sense(int fd, int r, uint8_t *sense) +{ + switch (r) { + case MPATH_PR_SUCCESS: + return GOOD; + case MPATH_PR_SENSE_NOT_READY: + case MPATH_PR_SENSE_MEDIUM_ERROR: + case MPATH_PR_SENSE_HARDWARE_ERROR: + case MPATH_PR_SENSE_ABORTED_COMMAND: + { + /* libmpathpersist ate the exact sense. Try to find it by + * issuing TEST UNIT READY. + */ + uint8_t cdb[6] = { TEST_UNIT_READY }; + int sz = 0; + return do_sgio(fd, cdb, sense, NULL, &sz, SG_DXFER_NONE); + } + + case MPATH_PR_SENSE_UNIT_ATTENTION: + /* Congratulations libmpathpersist, you ruined the Unit Attention... + * Return a heavyweight one. + */ + scsi_build_sense(sense, SENSE_CODE(SCSI_BUS_RESET)); + return CHECK_CONDITION; + case MPATH_PR_SENSE_INVALID_OP: + /* Only one valid sense. */ + scsi_build_sense(sense, SENSE_CODE(INVALID_OPCODE)); + return CHECK_CONDITION; + case MPATH_PR_ILLEGAL_REQ: + /* Guess. */ + scsi_build_sense(sense, SENSE_CODE(INVALID_PARAM)); + return CHECK_CONDITION; + case MPATH_PR_NO_SENSE: + scsi_build_sense(sense, SENSE_CODE(NO_SENSE)); + return CHECK_CONDITION; + + case MPATH_PR_RESERV_CONFLICT: + return RESERVATION_CONFLICT; + + case MPATH_PR_OTHER: + default: + scsi_build_sense(sense, SENSE_CODE(LUN_COMM_FAILURE)); + return CHECK_CONDITION; + } +} + +static int multipath_pr_in(int fd, const uint8_t *cdb, uint8_t *sense, + uint8_t *data, int sz) +{ + int rq_servact = cdb[1]; + struct prin_resp resp; + size_t written; + int r; + + switch (rq_servact) { + case MPATH_PRIN_RKEY_SA: + case MPATH_PRIN_RRES_SA: + case MPATH_PRIN_RCAP_SA: + break; + case MPATH_PRIN_RFSTAT_SA: + /* Nobody implements it anyway, so bail out. */ + default: + /* Cannot parse any other output. */ + scsi_build_sense(sense, SENSE_CODE(INVALID_FIELD)); + return CHECK_CONDITION; + } + + r = mpath_persistent_reserve_in(fd, rq_servact, &resp, noisy, verbose); + if (r == MPATH_PR_SUCCESS) { + switch (rq_servact) { + case MPATH_PRIN_RKEY_SA: + case MPATH_PRIN_RRES_SA: { + struct prin_readdescr *out = &resp.prin_descriptor.prin_readkeys; + assert(sz >= 8); + written = MIN(out->additional_length + 8, sz); + stl_be_p(&data[0], out->prgeneration); + stl_be_p(&data[4], out->additional_length); + memcpy(&data[8], out->key_list, written - 8); + break; + } + case MPATH_PRIN_RCAP_SA: { + struct prin_capdescr *out = &resp.prin_descriptor.prin_readcap; + assert(sz >= 6); + written = 6; + stw_be_p(&data[0], out->length); + data[2] = out->flags[0]; + data[3] = out->flags[1]; + stw_be_p(&data[4], out->pr_type_mask); + break; + } + default: + scsi_build_sense(sense, SENSE_CODE(INVALID_OPCODE)); + return CHECK_CONDITION; + } + assert(written <= sz); + memset(data + written, 0, sz - written); + } + + return mpath_reconstruct_sense(fd, r, sense); +} + +static int multipath_pr_out(int fd, const uint8_t *cdb, uint8_t *sense, + const uint8_t *param, int sz) +{ + int rq_servact = cdb[1]; + int rq_scope = cdb[2] >> 4; + int rq_type = cdb[2] & 0xf; + struct prout_param_descriptor paramp; + char transportids[PR_HELPER_DATA_SIZE]; + int r; + + switch (rq_servact) { + case MPATH_PROUT_REG_SA: + case MPATH_PROUT_RES_SA: + case MPATH_PROUT_REL_SA: + case MPATH_PROUT_CLEAR_SA: + case MPATH_PROUT_PREE_SA: + case MPATH_PROUT_PREE_AB_SA: + case MPATH_PROUT_REG_IGN_SA: + break; + case MPATH_PROUT_REG_MOV_SA: + /* Not supported by struct prout_param_descriptor. */ + default: + /* Cannot parse any other input. */ + scsi_build_sense(sense, SENSE_CODE(INVALID_FIELD)); + return CHECK_CONDITION; + } + + /* Convert input data, especially transport IDs, to the structs + * used by libmpathpersist (which, of course, will immediately + * do the opposite). + */ + memset(¶mp, 0, sizeof(paramp)); + memcpy(¶mp.key, ¶m[0], 8); + memcpy(¶mp.sa_key, ¶m[8], 8); + paramp.sa_flags = param[10]; + if (sz > PR_OUT_FIXED_PARAM_SIZE) { + size_t transportid_len; + int i, j; + if (sz < PR_OUT_FIXED_PARAM_SIZE + 4) { + scsi_build_sense(sense, SENSE_CODE(INVALID_PARAM_LEN)); + return CHECK_CONDITION; + } + transportid_len = ldl_be_p(¶m[24]) + PR_OUT_FIXED_PARAM_SIZE + 4; + if (transportid_len > sz) { + scsi_build_sense(sense, SENSE_CODE(INVALID_PARAM)); + return CHECK_CONDITION; + } + for (i = PR_OUT_FIXED_PARAM_SIZE + 4, j = 0; i < transportid_len; ) { + struct transportid *id = (struct transportid *) &transportids[j]; + int len; + + id->format_code = param[i] & 0xc0; + id->protocol_id = param[i] & 0x0f; + switch (param[i] & 0xcf) { + case 0: + /* FC transport. */ + if (i + 24 > transportid_len) { + goto illegal_req; + } + memcpy(id->n_port_name, ¶m[i + 8], 8); + j += offsetof(struct transportid, n_port_name[8]); + i += 24; + break; + case 3: + case 0x43: + /* iSCSI transport. */ + len = lduw_be_p(¶m[i + 2]); + if (len > 252 || (len & 3) || i + len + 4 > transportid_len) { + /* For format code 00, the standard says the maximum is 223 + * plus the NUL terminator. For format code 01 there is no + * maximum length, but libmpathpersist ignores the first + * byte of id->iscsi_name so our maximum is 252. + */ + goto illegal_req; + } + if (memchr(¶m[i + 4], 0, len) == NULL) { + goto illegal_req; + } + memcpy(id->iscsi_name, ¶m[i + 2], len + 2); + j += offsetof(struct transportid, iscsi_name[len + 2]); + i += len + 4; + break; + case 6: + /* SAS transport. */ + if (i + 24 > transportid_len) { + goto illegal_req; + } + memcpy(id->sas_address, ¶m[i + 4], 8); + j += offsetof(struct transportid, sas_address[8]); + i += 24; + break; + default: + illegal_req: + scsi_build_sense(sense, SENSE_CODE(INVALID_PARAM)); + return CHECK_CONDITION; + } + + paramp.trnptid_list[paramp.num_transportid++] = id; + } + } + + r = mpath_persistent_reserve_out(fd, rq_servact, rq_scope, rq_type, + ¶mp, noisy, verbose); + return mpath_reconstruct_sense(fd, r, sense); +} +#endif + static int do_pr_in(int fd, const uint8_t *cdb, uint8_t *sense, uint8_t *data, int *resp_sz) { +#ifdef CONFIG_MPATH + if (is_mpath(fd)) { + /* multipath_pr_in fills the whole input buffer. */ + return multipath_pr_in(fd, cdb, sense, data, *resp_sz); + } +#endif + return do_sgio(fd, cdb, sense, data, resp_sz, SG_DXFER_FROM_DEV); } @@ -214,7 +528,14 @@ static int do_pr_in(int fd, const uint8_t *cdb, uint8_t *sense, static int do_pr_out(int fd, const uint8_t *cdb, uint8_t *sense, const uint8_t *param, int sz) { - int resp_sz = sz; + int resp_sz; +#ifdef CONFIG_MPATH + if (is_mpath(fd)) { + return multipath_pr_out(fd, cdb, sense, param, sz); + } +#endif + + resp_sz = sz; return do_sgio(fd, cdb, sense, (uint8_t *)param, &resp_sz, SG_DXFER_TO_DEV); } @@ -525,6 +846,14 @@ static int drop_privileges(void) return -1; } +#ifdef CONFIG_MPATH + /* For /dev/mapper/control ioctls */ + if (capng_update(CAPNG_ADD, CAPNG_EFFECTIVE | CAPNG_PERMITTED, + CAP_SYS_ADMIN) < 0) { + return -1; + } +#endif + /* Change user/group id, retaining the capabilities. Because file descriptors * are passed via SCM_RIGHTS, we don't need supplementary groups (and in * fact the helper can run as "nobody"). @@ -541,7 +870,7 @@ static int drop_privileges(void) int main(int argc, char **argv) { - const char *sopt = "hVk:fdT:u:g:q"; + const char *sopt = "hVk:fdT:u:g:vq"; struct option lopt[] = { { "help", no_argument, NULL, 'h' }, { "version", no_argument, NULL, 'V' }, @@ -551,10 +880,12 @@ int main(int argc, char **argv) { "trace", required_argument, NULL, 'T' }, { "user", required_argument, NULL, 'u' }, { "group", required_argument, NULL, 'g' }, + { "verbose", no_argument, NULL, 'v' }, { "quiet", no_argument, NULL, 'q' }, { NULL, 0, NULL, 0 } }; int opt_ind = 0; + int loglevel = 1; int quiet = 0; int ch; Error *local_err = NULL; @@ -631,6 +962,9 @@ int main(int argc, char **argv) case 'q': quiet = 1; break; + case 'v': + ++loglevel; + break; case 'T': g_free(trace_file); trace_file = trace_opt_parse(optarg); @@ -650,7 +984,8 @@ int main(int argc, char **argv) } /* set verbosity */ - verbose = !quiet; + noisy = !quiet && (loglevel >= 3); + verbose = quiet ? 0 : MIN(loglevel, 3); if (!trace_init_backends()) { exit(EXIT_FAILURE); @@ -658,6 +993,11 @@ int main(int argc, char **argv) trace_init_file(trace_file); qemu_set_log(LOG_TRACE); +#ifdef CONFIG_MPATH + dm_init(); + multipath_pr_init(); +#endif + socket_activation = check_socket_activation(); if (socket_activation == 0) { SocketAddress saddr; diff --git a/scsi/utils.c b/scsi/utils.c index fab60bdf20..5684951b12 100644 --- a/scsi/utils.c +++ b/scsi/utils.c @@ -206,6 +206,11 @@ const struct SCSISense sense_code_OVERLAPPED_COMMANDS = { .key = ABORTED_COMMAND, .asc = 0x4e, .ascq = 0x00 }; +/* Command aborted, LUN Communication Failure */ +const struct SCSISense sense_code_LUN_COMM_FAILURE = { + .key = ABORTED_COMMAND, .asc = 0x08, .ascq = 0x00 +}; + /* Unit attention, Capacity data has changed */ const struct SCSISense sense_code_CAPACITY_CHANGED = { .key = UNIT_ATTENTION, .asc = 0x2a, .ascq = 0x09 @@ -216,6 +221,11 @@ const struct SCSISense sense_code_RESET = { .key = UNIT_ATTENTION, .asc = 0x29, .ascq = 0x00 }; +/* Unit attention, SCSI bus reset */ +const struct SCSISense sense_code_SCSI_BUS_RESET = { + .key = UNIT_ATTENTION, .asc = 0x29, .ascq = 0x02 +}; + /* Unit attention, No medium */ const struct SCSISense sense_code_UNIT_ATTENTION_NO_MEDIUM = { .key = UNIT_ATTENTION, .asc = 0x3a, .ascq = 0x00 -- cgit v1.2.3-70-g09d2 From 9bad2a6b9d0aeb2dcf91a07652cc63bbb6e73141 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 21 Aug 2017 18:58:56 +0200 Subject: scsi: add persistent reservation manager using qemu-pr-helper This adds a concrete subclass of pr-manager that talks to qemu-pr-helper. Signed-off-by: Paolo Bonzini --- scsi/Makefile.objs | 2 +- scsi/pr-manager-helper.c | 302 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 303 insertions(+), 1 deletion(-) create mode 100644 scsi/pr-manager-helper.c diff --git a/scsi/Makefile.objs b/scsi/Makefile.objs index 5496d2ae6a..4d25e476cf 100644 --- a/scsi/Makefile.objs +++ b/scsi/Makefile.objs @@ -1,3 +1,3 @@ block-obj-y += utils.o -block-obj-$(CONFIG_LINUX) += pr-manager.o +block-obj-$(CONFIG_LINUX) += pr-manager.o pr-manager-helper.o diff --git a/scsi/pr-manager-helper.c b/scsi/pr-manager-helper.c new file mode 100644 index 0000000000..82ff6b6123 --- /dev/null +++ b/scsi/pr-manager-helper.c @@ -0,0 +1,302 @@ +/* + * Persistent reservation manager that talks to qemu-pr-helper + * + * Copyright (c) 2017 Red Hat, Inc. + * + * Author: Paolo Bonzini + * + * This code is licensed under the LGPL v2.1 or later. + * + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "scsi/constants.h" +#include "scsi/pr-manager.h" +#include "scsi/utils.h" +#include "io/channel.h" +#include "io/channel-socket.h" +#include "pr-helper.h" + +#include + +#define PR_MAX_RECONNECT_ATTEMPTS 5 + +#define TYPE_PR_MANAGER_HELPER "pr-manager-helper" + +#define PR_MANAGER_HELPER(obj) \ + OBJECT_CHECK(PRManagerHelper, (obj), \ + TYPE_PR_MANAGER_HELPER) + +typedef struct PRManagerHelper { + /* */ + PRManager parent; + + char *path; + + QemuMutex lock; + QIOChannel *ioc; +} PRManagerHelper; + +/* Called with lock held. */ +static int pr_manager_helper_read(PRManagerHelper *pr_mgr, + void *buf, int sz, Error **errp) +{ + ssize_t r = qio_channel_read_all(pr_mgr->ioc, buf, sz, errp); + + if (r < 0) { + object_unref(OBJECT(pr_mgr->ioc)); + pr_mgr->ioc = NULL; + return -EINVAL; + } + + return 0; +} + +/* Called with lock held. */ +static int pr_manager_helper_write(PRManagerHelper *pr_mgr, + int fd, + const void *buf, int sz, Error **errp) +{ + size_t nfds = (fd != -1); + while (sz > 0) { + struct iovec iov; + ssize_t n_written; + + iov.iov_base = (void *)buf; + iov.iov_len = sz; + n_written = qio_channel_writev_full(QIO_CHANNEL(pr_mgr->ioc), &iov, 1, + nfds ? &fd : NULL, nfds, errp); + + if (n_written <= 0) { + assert(n_written != QIO_CHANNEL_ERR_BLOCK); + object_unref(OBJECT(pr_mgr->ioc)); + return n_written < 0 ? -EINVAL : 0; + } + + nfds = 0; + buf += n_written; + sz -= n_written; + } + + return 0; +} + +/* Called with lock held. */ +static int pr_manager_helper_initialize(PRManagerHelper *pr_mgr, + Error **errp) +{ + char *path = g_strdup(pr_mgr->path); + SocketAddress saddr = { + .type = SOCKET_ADDRESS_TYPE_UNIX, + .u.q_unix.path = path + }; + QIOChannelSocket *sioc = qio_channel_socket_new(); + Error *local_err = NULL; + + uint32_t flags; + int r; + + assert(!pr_mgr->ioc); + qio_channel_set_name(QIO_CHANNEL(sioc), "pr-manager-helper"); + qio_channel_socket_connect_sync(sioc, + &saddr, + &local_err); + g_free(path); + if (local_err) { + object_unref(OBJECT(sioc)); + error_propagate(errp, local_err); + return -ENOTCONN; + } + + qio_channel_set_delay(QIO_CHANNEL(sioc), false); + pr_mgr->ioc = QIO_CHANNEL(sioc); + + /* A simple feature negotation protocol, even though there is + * no optional feature right now. + */ + r = pr_manager_helper_read(pr_mgr, &flags, sizeof(flags), errp); + if (r < 0) { + goto out_close; + } + + flags = 0; + r = pr_manager_helper_write(pr_mgr, -1, &flags, sizeof(flags), errp); + if (r < 0) { + goto out_close; + } + + return 0; + +out_close: + object_unref(OBJECT(pr_mgr->ioc)); + pr_mgr->ioc = NULL; + return r; +} + +static int pr_manager_helper_run(PRManager *p, + int fd, struct sg_io_hdr *io_hdr) +{ + PRManagerHelper *pr_mgr = PR_MANAGER_HELPER(p); + + uint32_t len; + PRHelperResponse resp; + int ret; + int expected_dir; + int attempts; + uint8_t cdb[PR_HELPER_CDB_SIZE] = { 0 }; + + if (!io_hdr->cmd_len || io_hdr->cmd_len > PR_HELPER_CDB_SIZE) { + return -EINVAL; + } + + memcpy(cdb, io_hdr->cmdp, io_hdr->cmd_len); + assert(cdb[0] == PERSISTENT_RESERVE_OUT || cdb[0] == PERSISTENT_RESERVE_IN); + expected_dir = + (cdb[0] == PERSISTENT_RESERVE_OUT ? SG_DXFER_TO_DEV : SG_DXFER_FROM_DEV); + if (io_hdr->dxfer_direction != expected_dir) { + return -EINVAL; + } + + len = scsi_cdb_xfer(cdb); + if (io_hdr->dxfer_len < len || len > PR_HELPER_DATA_SIZE) { + return -EINVAL; + } + + qemu_mutex_lock(&pr_mgr->lock); + + /* Try to reconnect while sending the CDB. */ + for (attempts = 0; attempts < PR_MAX_RECONNECT_ATTEMPTS; attempts++) { + if (!pr_mgr->ioc) { + ret = pr_manager_helper_initialize(pr_mgr, NULL); + if (ret < 0) { + qemu_mutex_unlock(&pr_mgr->lock); + g_usleep(G_USEC_PER_SEC); + qemu_mutex_lock(&pr_mgr->lock); + continue; + } + } + + ret = pr_manager_helper_write(pr_mgr, fd, cdb, ARRAY_SIZE(cdb), NULL); + if (ret >= 0) { + break; + } + } + if (ret < 0) { + goto out; + } + + /* After sending the CDB, any communications failure causes the + * command to fail. The failure is transient, retrying the command + * will invoke pr_manager_helper_initialize again. + */ + if (expected_dir == SG_DXFER_TO_DEV) { + io_hdr->resid = io_hdr->dxfer_len - len; + ret = pr_manager_helper_write(pr_mgr, -1, io_hdr->dxferp, len, NULL); + if (ret < 0) { + goto out; + } + } + ret = pr_manager_helper_read(pr_mgr, &resp, sizeof(resp), NULL); + if (ret < 0) { + goto out; + } + + resp.result = be32_to_cpu(resp.result); + resp.sz = be32_to_cpu(resp.sz); + if (io_hdr->dxfer_direction == SG_DXFER_FROM_DEV) { + assert(resp.sz <= io_hdr->dxfer_len); + ret = pr_manager_helper_read(pr_mgr, io_hdr->dxferp, resp.sz, NULL); + if (ret < 0) { + goto out; + } + io_hdr->resid = io_hdr->dxfer_len - resp.sz; + } else { + assert(resp.sz == 0); + } + + io_hdr->status = resp.result; + if (resp.result == CHECK_CONDITION) { + io_hdr->driver_status = SG_ERR_DRIVER_SENSE; + io_hdr->sb_len_wr = MIN(io_hdr->mx_sb_len, PR_HELPER_SENSE_SIZE); + memcpy(io_hdr->sbp, resp.sense, io_hdr->sb_len_wr); + } + +out: + if (ret < 0) { + int sense_len = scsi_build_sense(io_hdr->sbp, + SENSE_CODE(LUN_COMM_FAILURE)); + io_hdr->driver_status = SG_ERR_DRIVER_SENSE; + io_hdr->sb_len_wr = MIN(io_hdr->mx_sb_len, sense_len); + io_hdr->status = CHECK_CONDITION; + } + qemu_mutex_unlock(&pr_mgr->lock); + return ret; +} + +static void pr_manager_helper_complete(UserCreatable *uc, Error **errp) +{ + PRManagerHelper *pr_mgr = PR_MANAGER_HELPER(uc); + + qemu_mutex_lock(&pr_mgr->lock); + pr_manager_helper_initialize(pr_mgr, errp); + qemu_mutex_unlock(&pr_mgr->lock); +} + +static char *get_path(Object *obj, Error **errp) +{ + PRManagerHelper *pr_mgr = PR_MANAGER_HELPER(obj); + + return g_strdup(pr_mgr->path); +} + +static void set_path(Object *obj, const char *str, Error **errp) +{ + PRManagerHelper *pr_mgr = PR_MANAGER_HELPER(obj); + + g_free(pr_mgr->path); + pr_mgr->path = g_strdup(str); +} + +static void pr_manager_helper_instance_finalize(Object *obj) +{ + PRManagerHelper *pr_mgr = PR_MANAGER_HELPER(obj); + + object_unref(OBJECT(pr_mgr->ioc)); + qemu_mutex_destroy(&pr_mgr->lock); +} + +static void pr_manager_helper_instance_init(Object *obj) +{ + PRManagerHelper *pr_mgr = PR_MANAGER_HELPER(obj); + + qemu_mutex_init(&pr_mgr->lock); +} + +static void pr_manager_helper_class_init(ObjectClass *klass, + void *class_data G_GNUC_UNUSED) +{ + PRManagerClass *prmgr_klass = PR_MANAGER_CLASS(klass); + UserCreatableClass *uc_klass = USER_CREATABLE_CLASS(klass); + + object_class_property_add_str(klass, "path", get_path, set_path, + &error_abort); + uc_klass->complete = pr_manager_helper_complete; + prmgr_klass->run = pr_manager_helper_run; +} + +static const TypeInfo pr_manager_helper_info = { + .parent = TYPE_PR_MANAGER, + .name = TYPE_PR_MANAGER_HELPER, + .instance_size = sizeof(PRManagerHelper), + .instance_init = pr_manager_helper_instance_init, + .instance_finalize = pr_manager_helper_instance_finalize, + .class_init = pr_manager_helper_class_init, +}; + +static void pr_manager_helper_register_types(void) +{ + type_register_static(&pr_manager_helper_info); +} + +type_init(pr_manager_helper_register_types); -- cgit v1.2.3-70-g09d2 From 07241c205c2c0b3e5e17e883c8ae523e90d172c2 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 21 Sep 2017 14:35:51 +0800 Subject: chardev: new qemu_chr_be_update_read_handlers() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a wrapper for the chr_update_read_handler(). Signed-off-by: Peter Xu Message-Id: <1505975754-21555-2-git-send-email-peterx@redhat.com> Reviewed-by: Marc-André Lureau Signed-off-by: Paolo Bonzini --- chardev/char-fe.c | 7 ++----- chardev/char.c | 10 ++++++++++ include/chardev/char.h | 10 ++++++++++ 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/chardev/char-fe.c b/chardev/char-fe.c index f3af6ae584..ee6d596100 100644 --- a/chardev/char-fe.c +++ b/chardev/char-fe.c @@ -253,7 +253,6 @@ void qemu_chr_fe_set_handlers(CharBackend *b, bool set_open) { Chardev *s; - ChardevClass *cc; int fe_open; s = b->chr; @@ -261,7 +260,6 @@ void qemu_chr_fe_set_handlers(CharBackend *b, return; } - cc = CHARDEV_GET_CLASS(s); if (!opaque && !fd_can_read && !fd_read && !fd_event) { fe_open = 0; remove_fd_in_watch(s); @@ -273,9 +271,8 @@ void qemu_chr_fe_set_handlers(CharBackend *b, b->chr_event = fd_event; b->chr_be_change = be_change; b->opaque = opaque; - if (cc->chr_update_read_handler) { - cc->chr_update_read_handler(s, context); - } + + qemu_chr_be_update_read_handlers(s, context); if (set_open) { qemu_chr_fe_set_open(b, fe_open); diff --git a/chardev/char.c b/chardev/char.c index b6fd5eb6a6..e090dd5a27 100644 --- a/chardev/char.c +++ b/chardev/char.c @@ -180,6 +180,16 @@ void qemu_chr_be_write(Chardev *s, uint8_t *buf, int len) } } +void qemu_chr_be_update_read_handlers(Chardev *s, + GMainContext *context) +{ + ChardevClass *cc = CHARDEV_GET_CLASS(s); + + if (cc->chr_update_read_handler) { + cc->chr_update_read_handler(s, context); + } +} + int qemu_chr_add_client(Chardev *s, int fd) { return CHARDEV_GET_CLASS(s)->chr_add_client ? diff --git a/include/chardev/char.h b/include/chardev/char.h index 66dde4637e..2068ea496d 100644 --- a/include/chardev/char.h +++ b/include/chardev/char.h @@ -168,6 +168,16 @@ void qemu_chr_be_write(Chardev *s, uint8_t *buf, int len); */ void qemu_chr_be_write_impl(Chardev *s, uint8_t *buf, int len); +/** + * @qemu_chr_be_update_read_handlers: + * + * Invoked when frontend read handlers are setup + * + * @context the gcontext that will be used to attach the watch sources + */ +void qemu_chr_be_update_read_handlers(Chardev *s, + GMainContext *context); + /** * @qemu_chr_be_event: * -- cgit v1.2.3-70-g09d2 From 95eeeba669dca94492d708b2893f296839652c84 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 21 Sep 2017 14:35:52 +0800 Subject: chardev: add Chardev.gcontext field MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It caches the gcontext that is used to poll the chardev IO. Before this patch, we only passed it in via chr_update_read_handlers(). However that may not be enough if the char backend is disconnected and reconnected afterward. There are chardev codes that still assumed the context be NULL (which is the main context). Will fix that up in following up patches. Signed-off-by: Peter Xu Message-Id: <1505975754-21555-3-git-send-email-peterx@redhat.com> Reviewed-by: Marc-André Lureau Signed-off-by: Paolo Bonzini --- chardev/char.c | 1 + include/chardev/char.h | 1 + 2 files changed, 2 insertions(+) diff --git a/chardev/char.c b/chardev/char.c index e090dd5a27..89eabea5ac 100644 --- a/chardev/char.c +++ b/chardev/char.c @@ -185,6 +185,7 @@ void qemu_chr_be_update_read_handlers(Chardev *s, { ChardevClass *cc = CHARDEV_GET_CLASS(s); + s->gcontext = context; if (cc->chr_update_read_handler) { cc->chr_update_read_handler(s, context); } diff --git a/include/chardev/char.h b/include/chardev/char.h index 2068ea496d..84fb773591 100644 --- a/include/chardev/char.h +++ b/include/chardev/char.h @@ -55,6 +55,7 @@ struct Chardev { int logfd; int be_open; GSource *gsource; + GMainContext *gcontext; DECLARE_BITMAP(features, QEMU_CHAR_FEATURE_LAST); }; -- cgit v1.2.3-70-g09d2 From 6bbb6c0644f76b58012bd7ed4279d44c59bb43ab Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 21 Sep 2017 14:35:53 +0800 Subject: chardev: use per-dev context for io_add_watch_poll MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It was only passed in by chr_update_read_handlers(). However when reconnect, we'll lose that context information. So if a chardev was running on another context (rather than the default context, the NULL pointer), it'll switch back to the default context if reconnection happens. But, it should really stick to the old context. Convert all the callers of io_add_watch_poll() to use the internally cached gcontext. Then the context should be able to survive even after reconnections. Signed-off-by: Peter Xu Message-Id: <1505975754-21555-4-git-send-email-peterx@redhat.com> Reviewed-by: Marc-André Lureau Signed-off-by: Paolo Bonzini --- chardev/char-fd.c | 2 +- chardev/char-pty.c | 2 +- chardev/char-socket.c | 4 ++-- chardev/char-udp.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/chardev/char-fd.c b/chardev/char-fd.c index 6a62a545f2..09fbb078af 100644 --- a/chardev/char-fd.c +++ b/chardev/char-fd.c @@ -94,7 +94,7 @@ static void fd_chr_update_read_handler(Chardev *chr, chr->gsource = io_add_watch_poll(chr, s->ioc_in, fd_chr_read_poll, fd_chr_read, chr, - context); + chr->gcontext); } } diff --git a/chardev/char-pty.c b/chardev/char-pty.c index e5d20a0e6a..d239c04bc3 100644 --- a/chardev/char-pty.c +++ b/chardev/char-pty.c @@ -219,7 +219,7 @@ static void pty_chr_state(Chardev *chr, int connected) chr->gsource = io_add_watch_poll(chr, s->ioc, pty_chr_read_poll, pty_chr_read, - chr, NULL); + chr, chr->gcontext); } } } diff --git a/chardev/char-socket.c b/chardev/char-socket.c index 1ae730a4cb..ee71cbed5b 100644 --- a/chardev/char-socket.c +++ b/chardev/char-socket.c @@ -516,7 +516,7 @@ static void tcp_chr_connect(void *opaque) chr->gsource = io_add_watch_poll(chr, s->ioc, tcp_chr_read_poll, tcp_chr_read, - chr, NULL); + chr, chr->gcontext); } qemu_chr_be_event(chr, CHR_EVENT_OPENED); } @@ -535,7 +535,7 @@ static void tcp_chr_update_read_handler(Chardev *chr, chr->gsource = io_add_watch_poll(chr, s->ioc, tcp_chr_read_poll, tcp_chr_read, chr, - context); + chr->gcontext); } } diff --git a/chardev/char-udp.c b/chardev/char-udp.c index 4ee11d3ebf..106dee1a29 100644 --- a/chardev/char-udp.c +++ b/chardev/char-udp.c @@ -110,7 +110,7 @@ static void udp_chr_update_read_handler(Chardev *chr, chr->gsource = io_add_watch_poll(chr, s->ioc, udp_chr_read_poll, udp_chr_read, chr, - context); + chr->gcontext); } } -- cgit v1.2.3-70-g09d2 From bb86d05f4afab3ebfee2e897e295d61dbd8cc28e Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 21 Sep 2017 14:35:54 +0800 Subject: chardev: remove context in chr_update_read_handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We had a per-chardev cache for context, then we don't need this parameter to be passed in every time when chr_update_read_handler() called. As long as we are calling chr_update_read_handler() using qemu_chr_be_update_read_handlers() we'll be fine. Signed-off-by: Peter Xu Message-Id: <1505975754-21555-5-git-send-email-peterx@redhat.com> Reviewed-by: Marc-André Lureau Signed-off-by: Paolo Bonzini --- chardev/char-fd.c | 3 +-- chardev/char-pty.c | 3 +-- chardev/char-socket.c | 3 +-- chardev/char-udp.c | 3 +-- chardev/char.c | 2 +- include/chardev/char.h | 2 +- 6 files changed, 6 insertions(+), 10 deletions(-) diff --git a/chardev/char-fd.c b/chardev/char-fd.c index 09fbb078af..2c9b2ce567 100644 --- a/chardev/char-fd.c +++ b/chardev/char-fd.c @@ -84,8 +84,7 @@ static GSource *fd_chr_add_watch(Chardev *chr, GIOCondition cond) return qio_channel_create_watch(s->ioc_out, cond); } -static void fd_chr_update_read_handler(Chardev *chr, - GMainContext *context) +static void fd_chr_update_read_handler(Chardev *chr) { FDChardev *s = FD_CHARDEV(chr); diff --git a/chardev/char-pty.c b/chardev/char-pty.c index d239c04bc3..761ae6dec1 100644 --- a/chardev/char-pty.c +++ b/chardev/char-pty.c @@ -112,8 +112,7 @@ static void pty_chr_update_read_handler_locked(Chardev *chr) } } -static void pty_chr_update_read_handler(Chardev *chr, - GMainContext *context) +static void pty_chr_update_read_handler(Chardev *chr) { qemu_mutex_lock(&chr->chr_write_lock); pty_chr_update_read_handler_locked(chr); diff --git a/chardev/char-socket.c b/chardev/char-socket.c index ee71cbed5b..e65148fe97 100644 --- a/chardev/char-socket.c +++ b/chardev/char-socket.c @@ -521,8 +521,7 @@ static void tcp_chr_connect(void *opaque) qemu_chr_be_event(chr, CHR_EVENT_OPENED); } -static void tcp_chr_update_read_handler(Chardev *chr, - GMainContext *context) +static void tcp_chr_update_read_handler(Chardev *chr) { SocketChardev *s = SOCKET_CHARDEV(chr); diff --git a/chardev/char-udp.c b/chardev/char-udp.c index 106dee1a29..d46ff7ab53 100644 --- a/chardev/char-udp.c +++ b/chardev/char-udp.c @@ -100,8 +100,7 @@ static gboolean udp_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque) return TRUE; } -static void udp_chr_update_read_handler(Chardev *chr, - GMainContext *context) +static void udp_chr_update_read_handler(Chardev *chr) { UdpChardev *s = UDP_CHARDEV(chr); diff --git a/chardev/char.c b/chardev/char.c index 89eabea5ac..2ae4f465ec 100644 --- a/chardev/char.c +++ b/chardev/char.c @@ -187,7 +187,7 @@ void qemu_chr_be_update_read_handlers(Chardev *s, s->gcontext = context; if (cc->chr_update_read_handler) { - cc->chr_update_read_handler(s, context); + cc->chr_update_read_handler(s); } } diff --git a/include/chardev/char.h b/include/chardev/char.h index 84fb773591..43aabccef5 100644 --- a/include/chardev/char.h +++ b/include/chardev/char.h @@ -238,7 +238,7 @@ typedef struct ChardevClass { int (*chr_write)(Chardev *s, const uint8_t *buf, int len); int (*chr_sync_read)(Chardev *s, const uint8_t *buf, int len); GSource *(*chr_add_watch)(Chardev *s, GIOCondition cond); - void (*chr_update_read_handler)(Chardev *s, GMainContext *context); + void (*chr_update_read_handler)(Chardev *s); int (*chr_ioctl)(Chardev *s, int cmd, void *arg); int (*get_msgfds)(Chardev *s, int* fds, int num); int (*set_msgfds)(Chardev *s, int *fds, int num); -- cgit v1.2.3-70-g09d2