summaryrefslogtreecommitdiff
path: root/arch/s390
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-08-04 14:59:54 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-08-04 14:59:54 -0700
commit7c5c3a6177fa9646884114fc7f2e970b0bc50dc9 (patch)
tree956857522574ae7cb07d2227dc16e53d7e9e00e7 /arch/s390
parentf0a892f599c46af673e47418c47c15e69a7b67f4 (diff)
parent281106f938d3daaea6f8b6723a8217a2a1ef6936 (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm updates from Paolo Bonzini: "Quite a large pull request due to a selftest API overhaul and some patches that had come in too late for 5.19. ARM: - Unwinder implementations for both nVHE modes (classic and protected), complete with an overflow stack - Rework of the sysreg access from userspace, with a complete rewrite of the vgic-v3 view to align with the rest of the infrastructure - Disaggregation of the vcpu flags in separate sets to better track their use model. - A fix for the GICv2-on-v3 selftest - A small set of cosmetic fixes RISC-V: - Track ISA extensions used by Guest using bitmap - Added system instruction emulation framework - Added CSR emulation framework - Added gfp_custom flag in struct kvm_mmu_memory_cache - Added G-stage ioremap() and iounmap() functions - Added support for Svpbmt inside Guest s390: - add an interface to provide a hypervisor dump for secure guests - improve selftests to use TAP interface - enable interpretive execution of zPCI instructions (for PCI passthrough) - First part of deferred teardown - CPU Topology - PV attestation - Minor fixes x86: - Permit guests to ignore single-bit ECC errors - Intel IPI virtualization - Allow getting/setting pending triple fault with KVM_GET/SET_VCPU_EVENTS - PEBS virtualization - Simplify PMU emulation by just using PERF_TYPE_RAW events - More accurate event reinjection on SVM (avoid retrying instructions) - Allow getting/setting the state of the speaker port data bit - Refuse starting the kvm-intel module if VM-Entry/VM-Exit controls are inconsistent - "Notify" VM exit (detect microarchitectural hangs) for Intel - Use try_cmpxchg64 instead of cmpxchg64 - Ignore benign host accesses to PMU MSRs when PMU is disabled - Allow disabling KVM's "MONITOR/MWAIT are NOPs!" 
behavior - Allow NX huge page mitigation to be disabled on a per-vm basis - Port eager page splitting to shadow MMU as well - Enable CMCI capability by default and handle injected UCNA errors - Expose pid of vcpu threads in debugfs - x2AVIC support for AMD - cleanup PIO emulation - Fixes for LLDT/LTR emulation - Don't require refcounted "struct page" to create huge SPTEs - Miscellaneous cleanups: - MCE MSR emulation - Use separate namespaces for guest PTEs and shadow PTEs bitmasks - PIO emulation - Reorganize rmap API, mostly around rmap destruction - Do not workaround very old KVM bugs for L0 that runs with nesting enabled - new selftests API for CPUID Generic: - Fix races in gfn->pfn cache refresh; do not pin pages tracked by the cache - new selftests API using struct kvm_vcpu instead of a (vm, id) tuple" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (606 commits) selftests: kvm: set rax before vmcall selftests: KVM: Add exponent check for boolean stats selftests: KVM: Provide descriptive assertions in kvm_binary_stats_test selftests: KVM: Check stat name before other fields KVM: x86/mmu: remove unused variable RISC-V: KVM: Add support for Svpbmt inside Guest/VM RISC-V: KVM: Use PAGE_KERNEL_IO in kvm_riscv_gstage_ioremap() RISC-V: KVM: Add G-stage ioremap() and iounmap() functions KVM: Add gfp_custom flag in struct kvm_mmu_memory_cache RISC-V: KVM: Add extensible CSR emulation framework RISC-V: KVM: Add extensible system instruction emulation framework RISC-V: KVM: Factor-out instruction emulation into separate sources RISC-V: KVM: move preempt_disable() call in kvm_arch_vcpu_ioctl_run RISC-V: KVM: Make kvm_riscv_guest_timer_init a void function RISC-V: KVM: Fix variable spelling mistake RISC-V: KVM: Improve ISA extension by using a bitmap KVM, x86/mmu: Fix the comment around kvm_tdp_mmu_zap_leafs() KVM: SVM: Dump Virtual Machine Save Area (VMSA) to klog KVM: x86/mmu: Treat NX as a valid SPTE bit for NPT KVM: x86: Do not block APIC write for non 
ICR registers ...
Diffstat (limited to 'arch/s390')
-rw-r--r--arch/s390/boot/uv.c6
-rw-r--r--arch/s390/include/asm/airq.h7
-rw-r--r--arch/s390/include/asm/gmap.h39
-rw-r--r--arch/s390/include/asm/kvm_host.h44
-rw-r--r--arch/s390/include/asm/mmu.h2
-rw-r--r--arch/s390/include/asm/mmu_context.h2
-rw-r--r--arch/s390/include/asm/pci.h11
-rw-r--r--arch/s390/include/asm/pci_clp.h9
-rw-r--r--arch/s390/include/asm/pci_insn.h29
-rw-r--r--arch/s390/include/asm/pgtable.h21
-rw-r--r--arch/s390/include/asm/sclp.h4
-rw-r--r--arch/s390/include/asm/tpi.h13
-rw-r--r--arch/s390/include/asm/uv.h51
-rw-r--r--arch/s390/include/uapi/asm/kvm.h1
-rw-r--r--arch/s390/kernel/uv.c156
-rw-r--r--arch/s390/kvm/Kconfig1
-rw-r--r--arch/s390/kvm/Makefile1
-rw-r--r--arch/s390/kvm/gaccess.c96
-rw-r--r--arch/s390/kvm/gaccess.h6
-rw-r--r--arch/s390/kvm/intercept.c15
-rw-r--r--arch/s390/kvm/interrupt.c98
-rw-r--r--arch/s390/kvm/kvm-s390.c482
-rw-r--r--arch/s390/kvm/kvm-s390.h16
-rw-r--r--arch/s390/kvm/pci.c690
-rw-r--r--arch/s390/kvm/pci.h87
-rw-r--r--arch/s390/kvm/priv.c26
-rw-r--r--arch/s390/kvm/pv.c269
-rw-r--r--arch/s390/kvm/sigp.c4
-rw-r--r--arch/s390/kvm/vsie.c8
-rw-r--r--arch/s390/mm/fault.c23
-rw-r--r--arch/s390/mm/gmap.c177
-rw-r--r--arch/s390/pci/pci.c16
-rw-r--r--arch/s390/pci/pci_clp.c7
-rw-r--r--arch/s390/pci/pci_insn.c4
-rw-r--r--arch/s390/pci/pci_irq.c48
-rw-r--r--arch/s390/tools/gen_facilities.c1
36 files changed, 2315 insertions, 155 deletions
diff --git a/arch/s390/boot/uv.c b/arch/s390/boot/uv.c
index e6be155ab2e5..a5fa667160b2 100644
--- a/arch/s390/boot/uv.c
+++ b/arch/s390/boot/uv.c
@@ -41,6 +41,12 @@ void uv_query_info(void)
uv_info.max_num_sec_conf = uvcb.max_num_sec_conf;
uv_info.max_guest_cpu_id = uvcb.max_guest_cpu_id;
uv_info.uv_feature_indications = uvcb.uv_feature_indications;
+ uv_info.supp_se_hdr_ver = uvcb.supp_se_hdr_versions;
+ uv_info.supp_se_hdr_pcf = uvcb.supp_se_hdr_pcf;
+ uv_info.conf_dump_storage_state_len = uvcb.conf_dump_storage_state_len;
+ uv_info.conf_dump_finalize_len = uvcb.conf_dump_finalize_len;
+ uv_info.supp_att_req_hdr_ver = uvcb.supp_att_req_hdr_ver;
+ uv_info.supp_att_pflags = uvcb.supp_att_pflags;
}
#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
diff --git a/arch/s390/include/asm/airq.h b/arch/s390/include/asm/airq.h
index 01936fdfaddb..e82e5626e139 100644
--- a/arch/s390/include/asm/airq.h
+++ b/arch/s390/include/asm/airq.h
@@ -12,10 +12,11 @@
#include <linux/bit_spinlock.h>
#include <linux/dma-mapping.h>
+#include <asm/tpi.h>
struct airq_struct {
struct hlist_node list; /* Handler queueing. */
- void (*handler)(struct airq_struct *airq, bool floating);
+ void (*handler)(struct airq_struct *airq, struct tpi_info *tpi_info);
u8 *lsi_ptr; /* Local-Summary-Indicator pointer */
u8 lsi_mask; /* Local-Summary-Indicator mask */
u8 isc; /* Interrupt-subclass */
@@ -46,8 +47,10 @@ struct airq_iv {
#define AIRQ_IV_PTR 4 /* Allocate the ptr array */
#define AIRQ_IV_DATA 8 /* Allocate the data array */
#define AIRQ_IV_CACHELINE 16 /* Cacheline alignment for the vector */
+#define AIRQ_IV_GUESTVEC 32 /* Vector is a pinned guest page */
-struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags);
+struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags,
+ unsigned long *vec);
void airq_iv_release(struct airq_iv *iv);
unsigned long airq_iv_alloc(struct airq_iv *iv, unsigned long num);
void airq_iv_free(struct airq_iv *iv, unsigned long bit, unsigned long num);
diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index 40264f60b0da..5cc46e0dde62 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -147,5 +147,42 @@ int gmap_mprotect_notify(struct gmap *, unsigned long start,
void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4],
unsigned long gaddr, unsigned long vmaddr);
int gmap_mark_unmergeable(void);
-void s390_reset_acc(struct mm_struct *mm);
+void s390_unlist_old_asce(struct gmap *gmap);
+int s390_replace_asce(struct gmap *gmap);
+void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns);
+int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, bool interruptible);
+
+/**
+ * s390_uv_destroy_range - Destroy a range of pages in the given mm.
+ * @mm: the mm to operate on
+ * @start: the start of the range
+ * @end: the end of the range
+ *
+ * This function will call cond_resched(), so it should not generate stalls,
+ * but it will otherwise only return once it has completed.
+ */
+static inline void s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end)
+{
+ (void)__s390_uv_destroy_range(mm, start, end, false);
+}
+
+/**
+ * s390_uv_destroy_range_interruptible - Destroy a range of pages in the
+ * given mm, but stop when a fatal signal is received.
+ * @mm: the mm to operate on
+ * @start: the start of the range
+ * @end: the end of the range
+ *
+ * This function will call cond_resched(), so it should not generate stalls. If
+ * a fatal signal is received, it will return with -EINTR immediately,
+ * without finishing destroying the whole range. Upon successful
+ * completion, 0 is returned.
+ */
+static inline int s390_uv_destroy_range_interruptible(struct mm_struct *mm, unsigned long start,
+ unsigned long end)
+{
+ return __s390_uv_destroy_range(mm, start, end, true);
+}
#endif /* _ASM_S390_GMAP_H */
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 766028d54a3e..f39092e0ceaa 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -19,6 +19,8 @@
#include <linux/kvm.h>
#include <linux/seqlock.h>
#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/mmu_notifier.h>
#include <asm/debug.h>
#include <asm/cpu.h>
#include <asm/fpu/api.h>
@@ -93,19 +95,30 @@ union ipte_control {
};
};
+union sca_utility {
+ __u16 val;
+ struct {
+ __u16 mtcr : 1;
+ __u16 reserved : 15;
+ };
+};
+
struct bsca_block {
union ipte_control ipte_control;
__u64 reserved[5];
__u64 mcn;
- __u64 reserved2;
+ union sca_utility utility;
+ __u8 reserved2[6];
struct bsca_entry cpu[KVM_S390_BSCA_CPU_SLOTS];
};
struct esca_block {
union ipte_control ipte_control;
- __u64 reserved1[7];
+ __u64 reserved1[6];
+ union sca_utility utility;
+ __u8 reserved2[6];
__u64 mcn[4];
- __u64 reserved2[20];
+ __u64 reserved3[20];
struct esca_entry cpu[KVM_S390_ESCA_CPU_SLOTS];
};
@@ -249,12 +262,16 @@ struct kvm_s390_sie_block {
#define ECB_SPECI 0x08
#define ECB_SRSI 0x04
#define ECB_HOSTPROTINT 0x02
+#define ECB_PTF 0x01
__u8 ecb; /* 0x0061 */
#define ECB2_CMMA 0x80
#define ECB2_IEP 0x20
#define ECB2_PFMFI 0x08
#define ECB2_ESCA 0x04
+#define ECB2_ZPCI_LSI 0x02
__u8 ecb2; /* 0x0062 */
+#define ECB3_AISI 0x20
+#define ECB3_AISII 0x10
#define ECB3_DEA 0x08
#define ECB3_AES 0x04
#define ECB3_RI 0x01
@@ -759,6 +776,7 @@ struct kvm_vm_stat {
u64 inject_pfault_done;
u64 inject_service_signal;
u64 inject_virtio;
+ u64 aen_forward;
};
struct kvm_arch_memory_slot {
@@ -923,6 +941,8 @@ struct kvm_s390_pv {
u64 guest_len;
unsigned long stor_base;
void *stor_var;
+ bool dumping;
+ struct mmu_notifier mmu_notifier;
};
struct kvm_arch{
@@ -939,6 +959,7 @@ struct kvm_arch{
int use_cmma;
int use_pfmfi;
int use_skf;
+ int use_zpci_interp;
int user_cpu_state_ctrl;
int user_sigp;
int user_stsi;
@@ -962,6 +983,8 @@ struct kvm_arch{
DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS);
struct kvm_s390_gisa_interrupt gisa_int;
struct kvm_s390_pv pv;
+ struct list_head kzdev_list;
+ spinlock_t kzdev_list_lock;
};
#define KVM_HVA_ERR_BAD (-1UL)
@@ -1012,4 +1035,19 @@ static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
+#define __KVM_HAVE_ARCH_VM_FREE
+void kvm_arch_free_vm(struct kvm *kvm);
+
+#ifdef CONFIG_VFIO_PCI_ZDEV_KVM
+int kvm_s390_pci_register_kvm(struct zpci_dev *zdev, struct kvm *kvm);
+void kvm_s390_pci_unregister_kvm(struct zpci_dev *zdev);
+#else
+static inline int kvm_s390_pci_register_kvm(struct zpci_dev *dev,
+ struct kvm *kvm)
+{
+ return -EPERM;
+}
+static inline void kvm_s390_pci_unregister_kvm(struct zpci_dev *dev) {}
+#endif
+
#endif
diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h
index 82aae78e1315..1572b3634cdd 100644
--- a/arch/s390/include/asm/mmu.h
+++ b/arch/s390/include/asm/mmu.h
@@ -18,7 +18,7 @@ typedef struct {
unsigned long asce_limit;
unsigned long vdso_base;
/* The mmu context belongs to a secure guest. */
- atomic_t is_protected;
+ atomic_t protected_count;
/*
* The following bitfields need a down_write on the mm
* semaphore when they are written to. As they are only
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index c7937f369e62..2a38af5a00c2 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -26,7 +26,7 @@ static inline int init_new_context(struct task_struct *tsk,
INIT_LIST_HEAD(&mm->context.gmap_list);
cpumask_clear(&mm->context.cpu_attach_mask);
atomic_set(&mm->context.flush_count, 0);
- atomic_set(&mm->context.is_protected, 0);
+ atomic_set(&mm->context.protected_count, 0);
mm->context.gmap_asce = 0;
mm->context.flush_mm = 0;
#ifdef CONFIG_PGSTE
diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
index fdb9745ee998..85eb0ef9d4c3 100644
--- a/arch/s390/include/asm/pci.h
+++ b/arch/s390/include/asm/pci.h
@@ -9,6 +9,7 @@
#include <asm-generic/pci.h>
#include <asm/pci_clp.h>
#include <asm/pci_debug.h>
+#include <asm/pci_insn.h>
#include <asm/sclp.h>
#define PCIBIOS_MIN_IO 0x1000
@@ -97,6 +98,7 @@ struct zpci_bar_struct {
};
struct s390_domain;
+struct kvm_zdev;
#define ZPCI_FUNCTIONS_PER_BUS 256
struct zpci_bus {
@@ -123,11 +125,14 @@ struct zpci_dev {
enum zpci_state state;
u32 fid; /* function ID, used by sclp */
u32 fh; /* function handle, used by insn's */
+ u32 gisa; /* GISA designation for passthrough */
u16 vfn; /* virtual function number */
u16 pchid; /* physical channel ID */
+ u16 maxstbl; /* Maximum store block size */
u8 pfgid; /* function group ID */
u8 pft; /* pci function type */
u8 port;
+ u8 dtsm; /* Supported DT mask */
u8 rid_available : 1;
u8 has_hp_slot : 1;
u8 has_resources : 1;
@@ -186,7 +191,10 @@ struct zpci_dev {
struct dentry *debugfs_dev;
+ /* IOMMU and passthrough */
struct s390_domain *s390_domain; /* s390 IOMMU domain data */
+ struct kvm_zdev *kzdev;
+ struct mutex kzdev_lock;
};
static inline bool zdev_enabled(struct zpci_dev *zdev)
@@ -198,6 +206,9 @@ extern const struct attribute_group *zpci_attr_groups[];
extern unsigned int s390_pci_force_floating __initdata;
extern unsigned int s390_pci_no_rid;
+extern union zpci_sic_iib *zpci_aipb;
+extern struct airq_iv *zpci_aif_sbv;
+
/* -----------------------------------------------------------------------------
Prototypes
----------------------------------------------------------------------------- */
diff --git a/arch/s390/include/asm/pci_clp.h b/arch/s390/include/asm/pci_clp.h
index 1f4b666e85ee..d6189ed14f84 100644
--- a/arch/s390/include/asm/pci_clp.h
+++ b/arch/s390/include/asm/pci_clp.h
@@ -153,9 +153,11 @@ struct clp_rsp_query_pci_grp {
u8 : 6;
u8 frame : 1;
u8 refresh : 1; /* TLB refresh mode */
- u16 reserved2;
+ u16 : 3;
+ u16 maxstbl : 13; /* Maximum store block size */
u16 mui;
- u16 : 16;
+ u8 dtsm; /* Supported DT mask */
+ u8 reserved3;
u16 maxfaal;
u16 : 4;
u16 dnoi : 12;
@@ -173,7 +175,8 @@ struct clp_req_set_pci {
u16 reserved2;
u8 oc; /* operation controls */
u8 ndas; /* number of dma spaces */
- u64 reserved3;
+ u32 reserved3;
+ u32 gisa; /* GISA designation */
} __packed;
/* Set PCI function response */
diff --git a/arch/s390/include/asm/pci_insn.h b/arch/s390/include/asm/pci_insn.h
index 61cf9531f68f..e5f57cfe1d45 100644
--- a/arch/s390/include/asm/pci_insn.h
+++ b/arch/s390/include/asm/pci_insn.h
@@ -98,6 +98,15 @@ struct zpci_fib {
u32 gd;
} __packed __aligned(8);
+/* Set Interruption Controls Operation Controls */
+#define SIC_IRQ_MODE_ALL 0
+#define SIC_IRQ_MODE_SINGLE 1
+#define SIC_SET_AENI_CONTROLS 2
+#define SIC_IRQ_MODE_DIRECT 4
+#define SIC_IRQ_MODE_D_ALL 16
+#define SIC_IRQ_MODE_D_SINGLE 17
+#define SIC_IRQ_MODE_SET_CPU 18
+
/* directed interruption information block */
struct zpci_diib {
u32 : 1;
@@ -119,9 +128,20 @@ struct zpci_cdiib {
u64 : 64;
} __packed __aligned(8);
+/* adapter interruption parameters block */
+struct zpci_aipb {
+ u64 faisb;
+ u64 gait;
+ u16 : 13;
+ u16 afi : 3;
+ u32 : 32;
+ u16 faal;
+} __packed __aligned(8);
+
union zpci_sic_iib {
struct zpci_diib diib;
struct zpci_cdiib cdiib;
+ struct zpci_aipb aipb;
};
DECLARE_STATIC_KEY_FALSE(have_mio);
@@ -134,13 +154,6 @@ int __zpci_store(u64 data, u64 req, u64 offset);
int zpci_store(const volatile void __iomem *addr, u64 data, unsigned long len);
int __zpci_store_block(const u64 *data, u64 req, u64 offset);
void zpci_barrier(void);
-int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib);
-
-static inline int zpci_set_irq_ctrl(u16 ctl, u8 isc)
-{
- union zpci_sic_iib iib = {{0}};
-
- return __zpci_set_irq_ctrl(ctl, isc, &iib);
-}
+int zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib);
#endif
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index a397b072a580..cf81acf3879c 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -525,7 +525,7 @@ static inline int mm_has_pgste(struct mm_struct *mm)
static inline int mm_is_protected(struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
- if (unlikely(atomic_read(&mm->context.is_protected)))
+ if (unlikely(atomic_read(&mm->context.protected_count)))
return 1;
#endif
return 0;
@@ -1182,9 +1182,22 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
} else {
res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID));
}
- /* At this point the reference through the mapping is still present */
- if (mm_is_protected(mm) && pte_present(res))
- uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK);
+ /* Nothing to do */
+ if (!mm_is_protected(mm) || !pte_present(res))
+ return res;
+ /*
+ * At this point the reference through the mapping is still present.
+ * The notifier should have destroyed all protected vCPUs at this
+ * point, so the destroy should be successful.
+ */
+ if (full && !uv_destroy_owned_page(pte_val(res) & PAGE_MASK))
+ return res;
+ /*
+ * If something went wrong and the page could not be destroyed, or
+ * if this is not a mm teardown, the slower export is used as
+ * fallback instead.
+ */
+ uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK);
return res;
}
diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
index 236b34b75ddb..addefe8ccdba 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -88,6 +88,10 @@ struct sclp_info {
unsigned char has_sipl : 1;
unsigned char has_dirq : 1;
unsigned char has_iplcc : 1;
+ unsigned char has_zpci_lsi : 1;
+ unsigned char has_aisii : 1;
+ unsigned char has_aeni : 1;
+ unsigned char has_aisi : 1;
unsigned int ibc;
unsigned int mtid;
unsigned int mtid_cp;
diff --git a/arch/s390/include/asm/tpi.h b/arch/s390/include/asm/tpi.h
index 1ac538b8cbf5..f76e5fdff23a 100644
--- a/arch/s390/include/asm/tpi.h
+++ b/arch/s390/include/asm/tpi.h
@@ -19,6 +19,19 @@ struct tpi_info {
u32 :12;
} __packed __aligned(4);
+/* I/O-Interruption Code as stored by TPI for an Adapter I/O */
+struct tpi_adapter_info {
+ u32 aism:8;
+ u32 :22;
+ u32 error:1;
+ u32 forward:1;
+ u32 reserved;
+ u32 adapter_IO:1;
+ u32 directed_irq:1;
+ u32 isc:3;
+ u32 :27;
+} __packed __aligned(4);
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_S390_TPI_H */
diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h
index cfea7b77a5b8..be3ef9dd6972 100644
--- a/arch/s390/include/asm/uv.h
+++ b/arch/s390/include/asm/uv.h
@@ -50,6 +50,10 @@
#define UVC_CMD_SET_UNSHARE_ALL 0x0340
#define UVC_CMD_PIN_PAGE_SHARED 0x0341
#define UVC_CMD_UNPIN_PAGE_SHARED 0x0342
+#define UVC_CMD_DUMP_INIT 0x0400
+#define UVC_CMD_DUMP_CONF_STOR_STATE 0x0401
+#define UVC_CMD_DUMP_CPU 0x0402
+#define UVC_CMD_DUMP_COMPLETE 0x0403
#define UVC_CMD_SET_SHARED_ACCESS 0x1000
#define UVC_CMD_REMOVE_SHARED_ACCESS 0x1001
#define UVC_CMD_RETR_ATTEST 0x1020
@@ -77,6 +81,10 @@ enum uv_cmds_inst {
BIT_UVC_CMD_UNSHARE_ALL = 20,
BIT_UVC_CMD_PIN_PAGE_SHARED = 21,
BIT_UVC_CMD_UNPIN_PAGE_SHARED = 22,
+ BIT_UVC_CMD_DUMP_INIT = 24,
+ BIT_UVC_CMD_DUMP_CONFIG_STOR_STATE = 25,
+ BIT_UVC_CMD_DUMP_CPU = 26,
+ BIT_UVC_CMD_DUMP_COMPLETE = 27,
BIT_UVC_CMD_RETR_ATTEST = 28,
};
@@ -110,7 +118,16 @@ struct uv_cb_qui {
u8 reserved88[158 - 136]; /* 0x0088 */
u16 max_guest_cpu_id; /* 0x009e */
u64 uv_feature_indications; /* 0x00a0 */
- u8 reserveda8[200 - 168]; /* 0x00a8 */
+ u64 reserveda8; /* 0x00a8 */
+ u64 supp_se_hdr_versions; /* 0x00b0 */
+ u64 supp_se_hdr_pcf; /* 0x00b8 */
+ u64 reservedc0; /* 0x00c0 */
+ u64 conf_dump_storage_state_len; /* 0x00c8 */
+ u64 conf_dump_finalize_len; /* 0x00d0 */
+ u64 reservedd8; /* 0x00d8 */
+ u64 supp_att_req_hdr_ver; /* 0x00e0 */
+ u64 supp_att_pflags; /* 0x00e8 */
+ u8 reservedf0[256 - 240]; /* 0x00f0 */
} __packed __aligned(8);
/* Initialize Ultravisor */
@@ -240,6 +257,31 @@ struct uv_cb_attest {
u64 reserved168[4]; /* 0x0168 */
} __packed __aligned(8);
+struct uv_cb_dump_cpu {
+ struct uv_cb_header header;
+ u64 reserved08[2];
+ u64 cpu_handle;
+ u64 dump_area_origin;
+ u64 reserved28[5];
+} __packed __aligned(8);
+
+struct uv_cb_dump_stor_state {
+ struct uv_cb_header header;
+ u64 reserved08[2];
+ u64 config_handle;
+ u64 dump_area_origin;
+ u64 gaddr;
+ u64 reserved28[4];
+} __packed __aligned(8);
+
+struct uv_cb_dump_complete {
+ struct uv_cb_header header;
+ u64 reserved08[2];
+ u64 config_handle;
+ u64 dump_area_origin;
+ u64 reserved30[5];
+} __packed __aligned(8);
+
static inline int __uv_call(unsigned long r1, unsigned long r2)
{
int cc;
@@ -307,6 +349,12 @@ struct uv_info {
unsigned int max_num_sec_conf;
unsigned short max_guest_cpu_id;
unsigned long uv_feature_indications;
+ unsigned long supp_se_hdr_ver;
+ unsigned long supp_se_hdr_pcf;
+ unsigned long conf_dump_storage_state_len;
+ unsigned long conf_dump_finalize_len;
+ unsigned long supp_att_req_hdr_ver;
+ unsigned long supp_att_pflags;
};
extern struct uv_info uv_info;
@@ -378,6 +426,7 @@ static inline int is_prot_virt_host(void)
}
int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb);
+int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr);
int uv_destroy_owned_page(unsigned long paddr);
int uv_convert_from_secure(unsigned long paddr);
int uv_convert_owned_from_secure(unsigned long paddr);
diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index 7a6b14874d65..a73cf01a1606 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -74,6 +74,7 @@ struct kvm_s390_io_adapter_req {
#define KVM_S390_VM_CRYPTO 2
#define KVM_S390_VM_CPU_MODEL 3
#define KVM_S390_VM_MIGRATION 4
+#define KVM_S390_VM_CPU_TOPOLOGY 5
/* kvm attributes for mem_ctrl */
#define KVM_S390_VM_MEM_ENABLE_CMMA 0
diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
index a5425075dd25..f9810d2a267c 100644
--- a/arch/s390/kernel/uv.c
+++ b/arch/s390/kernel/uv.c
@@ -234,6 +234,32 @@ static int make_secure_pte(pte_t *ptep, unsigned long addr,
return uvcb->rc == 0x10a ? -ENXIO : -EINVAL;
}
+/**
+ * should_export_before_import - Determine whether an export is needed
+ * before an import-like operation
+ * @uvcb: the Ultravisor control block of the UVC to be performed
+ * @mm: the mm of the process
+ *
+ * Returns whether an export is needed before every import-like operation.
+ * This is needed for shared pages, which don't trigger a secure storage
+ * exception when accessed from a different guest.
+ *
+ * Although considered as one, the Unpin Page UVC is not an actual import,
+ * so it is not affected.
+ *
+ * Likewise, no export is needed when there is only one protected VM, because the
+ * page cannot belong to the wrong VM in that case (there is no "other VM"
+ * it can belong to).
+ *
+ * Return: true if an export is needed before every import, otherwise false.
+ */
+static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm)
+{
+ if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED)
+ return false;
+ return atomic_read(&mm->context.protected_count) > 1;
+}
+
/*
* Requests the Ultravisor to make a page accessible to a guest.
* If it's brought in the first time, it will be cleared. If
@@ -277,6 +303,8 @@ again:
lock_page(page);
ptep = get_locked_pte(gmap->mm, uaddr, &ptelock);
+ if (should_export_before_import(uvcb, gmap->mm))
+ uv_convert_from_secure(page_to_phys(page));
rc = make_secure_pte(ptep, uaddr, page, uvcb);
pte_unmap_unlock(ptep, ptelock);
unlock_page(page);
@@ -334,6 +362,61 @@ int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr)
}
EXPORT_SYMBOL_GPL(gmap_convert_to_secure);
+/**
+ * gmap_destroy_page - Destroy a guest page.
+ * @gmap: the gmap of the guest
+ * @gaddr: the guest address to destroy
+ *
+ * An attempt will be made to destroy the given guest page. If the attempt
+ * fails, an attempt is made to export the page. If both attempts fail, an
+ * appropriate error is returned.
+ */
+int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr)
+{
+ struct vm_area_struct *vma;
+ unsigned long uaddr;
+ struct page *page;
+ int rc;
+
+ rc = -EFAULT;
+ mmap_read_lock(gmap->mm);
+
+ uaddr = __gmap_translate(gmap, gaddr);
+ if (IS_ERR_VALUE(uaddr))
+ goto out;
+ vma = vma_lookup(gmap->mm, uaddr);
+ if (!vma)
+ goto out;
+ /*
+ * Huge pages should not be able to become secure
+ */
+ if (is_vm_hugetlb_page(vma))
+ goto out;
+
+ rc = 0;
+ /* we take an extra reference here */
+ page = follow_page(vma, uaddr, FOLL_WRITE | FOLL_GET);
+ if (IS_ERR_OR_NULL(page))
+ goto out;
+ rc = uv_destroy_owned_page(page_to_phys(page));
+ /*
+ * Fault handlers can race; it is possible that two CPUs will fault
+ * on the same secure page. One CPU can destroy the page, reboot,
+ * re-enter secure mode and import it, while the second CPU was
+ * stuck at the beginning of the handler. At some point the second
+ * CPU will be able to progress, and it will not be able to destroy
+ * the page. In that case we do not want to terminate the process,
+ * we instead try to export the page.
+ */
+ if (rc)
+ rc = uv_convert_owned_from_secure(page_to_phys(page));
+ put_page(page);
+out:
+ mmap_read_unlock(gmap->mm);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_destroy_page);
+
/*
* To be called with the page locked or with an extra reference! This will
* prevent gmap_make_secure from touching the page concurrently. Having 2
@@ -392,6 +475,54 @@ static ssize_t uv_query_facilities(struct kobject *kobj,
static struct kobj_attribute uv_query_facilities_attr =
__ATTR(facilities, 0444, uv_query_facilities, NULL);
+static ssize_t uv_query_supp_se_hdr_ver(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.supp_se_hdr_ver);
+}
+
+static struct kobj_attribute uv_query_supp_se_hdr_ver_attr =
+ __ATTR(supp_se_hdr_ver, 0444, uv_query_supp_se_hdr_ver, NULL);
+
+static ssize_t uv_query_supp_se_hdr_pcf(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.supp_se_hdr_pcf);
+}
+
+static struct kobj_attribute uv_query_supp_se_hdr_pcf_attr =
+ __ATTR(supp_se_hdr_pcf, 0444, uv_query_supp_se_hdr_pcf, NULL);
+
+static ssize_t uv_query_dump_cpu_len(struct kobject *kobj,
+ struct kobj_attribute *attr, char *page)
+{
+ return scnprintf(page, PAGE_SIZE, "%lx\n",
+ uv_info.guest_cpu_stor_len);
+}
+
+static struct kobj_attribute uv_query_dump_cpu_len_attr =
+ __ATTR(uv_query_dump_cpu_len, 0444, uv_query_dump_cpu_len, NULL);
+
+static ssize_t uv_query_dump_storage_state_len(struct kobject *kobj,
+ struct kobj_attribute *attr, char *page)
+{
+ return scnprintf(page, PAGE_SIZE, "%lx\n",
+ uv_info.conf_dump_storage_state_len);
+}
+
+static struct kobj_attribute uv_query_dump_storage_state_len_attr =
+ __ATTR(dump_storage_state_len, 0444, uv_query_dump_storage_state_len, NULL);
+
+static ssize_t uv_query_dump_finalize_len(struct kobject *kobj,
+ struct kobj_attribute *attr, char *page)
+{
+ return scnprintf(page, PAGE_SIZE, "%lx\n",
+ uv_info.conf_dump_finalize_len);
+}
+
+static struct kobj_attribute uv_query_dump_finalize_len_attr =
+ __ATTR(dump_finalize_len, 0444, uv_query_dump_finalize_len, NULL);
+
static ssize_t uv_query_feature_indications(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -431,12 +562,37 @@ static ssize_t uv_query_max_guest_addr(struct kobject *kobj,
static struct kobj_attribute uv_query_max_guest_addr_attr =
__ATTR(max_address, 0444, uv_query_max_guest_addr, NULL);
+static ssize_t uv_query_supp_att_req_hdr_ver(struct kobject *kobj,
+ struct kobj_attribute *attr, char *page)
+{
+ return scnprintf(page, PAGE_SIZE, "%lx\n", uv_info.supp_att_req_hdr_ver);
+}
+
+static struct kobj_attribute uv_query_supp_att_req_hdr_ver_attr =
+ __ATTR(supp_att_req_hdr_ver, 0444, uv_query_supp_att_req_hdr_ver, NULL);
+
+static ssize_t uv_query_supp_att_pflags(struct kobject *kobj,
+ struct kobj_attribute *attr, char *page)
+{
+ return scnprintf(page, PAGE_SIZE, "%lx\n", uv_info.supp_att_pflags);
+}
+
+static struct kobj_attribute uv_query_supp_att_pflags_attr =
+ __ATTR(supp_att_pflags, 0444, uv_query_supp_att_pflags, NULL);
+
static struct attribute *uv_query_attrs[] = {
&uv_query_facilities_attr.attr,
&uv_query_feature_indications_attr.attr,
&uv_query_max_guest_cpus_attr.attr,
&uv_query_max_guest_vms_attr.attr,
&uv_query_max_guest_addr_attr.attr,
+ &uv_query_supp_se_hdr_ver_attr.attr,
+ &uv_query_supp_se_hdr_pcf_attr.attr,
+ &uv_query_dump_storage_state_len_attr.attr,
+ &uv_query_dump_finalize_len_attr.attr,
+ &uv_query_dump_cpu_len_attr.attr,
+ &uv_query_supp_att_req_hdr_ver_attr.attr,
+ &uv_query_supp_att_pflags_attr.attr,
NULL,
};
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index 2e84d3922f7c..33f4ff909476 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -34,6 +34,7 @@ config KVM
select SRCU
select KVM_VFIO
select INTERVAL_TREE
+ select MMU_NOTIFIER
help
Support hosting paravirtualized guest machines using the SIE
virtualization capability on the mainframe. This should work
diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
index 26f4a74e5ce4..02217fb4ae10 100644
--- a/arch/s390/kvm/Makefile
+++ b/arch/s390/kvm/Makefile
@@ -10,4 +10,5 @@ ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o
kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o
+kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o
obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 227ed0009354..082ec5f2c3a5 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -262,77 +262,77 @@ struct aste {
/* .. more fields there */
};
-int ipte_lock_held(struct kvm_vcpu *vcpu)
+int ipte_lock_held(struct kvm *kvm)
{
- if (vcpu->arch.sie_block->eca & ECA_SII) {
+ if (sclp.has_siif) {
int rc;
- read_lock(&vcpu->kvm->arch.sca_lock);
- rc = kvm_s390_get_ipte_control(vcpu->kvm)->kh != 0;
- read_unlock(&vcpu->kvm->arch.sca_lock);
+ read_lock(&kvm->arch.sca_lock);
+ rc = kvm_s390_get_ipte_control(kvm)->kh != 0;
+ read_unlock(&kvm->arch.sca_lock);
return rc;
}
- return vcpu->kvm->arch.ipte_lock_count != 0;
+ return kvm->arch.ipte_lock_count != 0;
}
-static void ipte_lock_simple(struct kvm_vcpu *vcpu)
+static void ipte_lock_simple(struct kvm *kvm)
{
union ipte_control old, new, *ic;
- mutex_lock(&vcpu->kvm->arch.ipte_mutex);
- vcpu->kvm->arch.ipte_lock_count++;
- if (vcpu->kvm->arch.ipte_lock_count > 1)
+ mutex_lock(&kvm->arch.ipte_mutex);
+ kvm->arch.ipte_lock_count++;
+ if (kvm->arch.ipte_lock_count > 1)
goto out;
retry:
- read_lock(&vcpu->kvm->arch.sca_lock);
- ic = kvm_s390_get_ipte_control(vcpu->kvm);
+ read_lock(&kvm->arch.sca_lock);
+ ic = kvm_s390_get_ipte_control(kvm);
do {
old = READ_ONCE(*ic);
if (old.k) {
- read_unlock(&vcpu->kvm->arch.sca_lock);
+ read_unlock(&kvm->arch.sca_lock);
cond_resched();
goto retry;
}
new = old;
new.k = 1;
} while (cmpxchg(&ic->val, old.val, new.val) != old.val);
- read_unlock(&vcpu->kvm->arch.sca_lock);
+ read_unlock(&kvm->arch.sca_lock);
out:
- mutex_unlock(&vcpu->kvm->arch.ipte_mutex);
+ mutex_unlock(&kvm->arch.ipte_mutex);
}
-static void ipte_unlock_simple(struct kvm_vcpu *vcpu)
+static void ipte_unlock_simple(struct kvm *kvm)
{
union ipte_control old, new, *ic;
- mutex_lock(&vcpu->kvm->arch.ipte_mutex);
- vcpu->kvm->arch.ipte_lock_count--;
- if (vcpu->kvm->arch.ipte_lock_count)
+ mutex_lock(&kvm->arch.ipte_mutex);
+ kvm->arch.ipte_lock_count--;
+ if (kvm->arch.ipte_lock_count)
goto out;
- read_lock(&vcpu->kvm->arch.sca_lock);
- ic = kvm_s390_get_ipte_control(vcpu->kvm);
+ read_lock(&kvm->arch.sca_lock);
+ ic = kvm_s390_get_ipte_control(kvm);
do {
old = READ_ONCE(*ic);
new = old;
new.k = 0;
} while (cmpxchg(&ic->val, old.val, new.val) != old.val);
- read_unlock(&vcpu->kvm->arch.sca_lock);
- wake_up(&vcpu->kvm->arch.ipte_wq);
+ read_unlock(&kvm->arch.sca_lock);
+ wake_up(&kvm->arch.ipte_wq);
out:
- mutex_unlock(&vcpu->kvm->arch.ipte_mutex);
+ mutex_unlock(&kvm->arch.ipte_mutex);
}
-static void ipte_lock_siif(struct kvm_vcpu *vcpu)
+static void ipte_lock_siif(struct kvm *kvm)
{
union ipte_control old, new, *ic;
retry:
- read_lock(&vcpu->kvm->arch.sca_lock);
- ic = kvm_s390_get_ipte_control(vcpu->kvm);
+ read_lock(&kvm->arch.sca_lock);
+ ic = kvm_s390_get_ipte_control(kvm);
do {
old = READ_ONCE(*ic);
if (old.kg) {
- read_unlock(&vcpu->kvm->arch.sca_lock);
+ read_unlock(&kvm->arch.sca_lock);
cond_resched();
goto retry;
}
@@ -340,15 +340,15 @@ retry:
new.k = 1;
new.kh++;
} while (cmpxchg(&ic->val, old.val, new.val) != old.val);
- read_unlock(&vcpu->kvm->arch.sca_lock);
+ read_unlock(&kvm->arch.sca_lock);
}
-static void ipte_unlock_siif(struct kvm_vcpu *vcpu)
+static void ipte_unlock_siif(struct kvm *kvm)
{
union ipte_control old, new, *ic;
- read_lock(&vcpu->kvm->arch.sca_lock);
- ic = kvm_s390_get_ipte_control(vcpu->kvm);
+ read_lock(&kvm->arch.sca_lock);
+ ic = kvm_s390_get_ipte_control(kvm);
do {
old = READ_ONCE(*ic);
new = old;
@@ -356,25 +356,25 @@ static void ipte_unlock_siif(struct kvm_vcpu *vcpu)
if (!new.kh)
new.k = 0;
} while (cmpxchg(&ic->val, old.val, new.val) != old.val);
- read_unlock(&vcpu->kvm->arch.sca_lock);
+ read_unlock(&kvm->arch.sca_lock);
if (!new.kh)
- wake_up(&vcpu->kvm->arch.ipte_wq);
+ wake_up(&kvm->arch.ipte_wq);
}
-void ipte_lock(struct kvm_vcpu *vcpu)
+void ipte_lock(struct kvm *kvm)
{
- if (vcpu->arch.sie_block->eca & ECA_SII)
- ipte_lock_siif(vcpu);
+ if (sclp.has_siif)
+ ipte_lock_siif(kvm);
else
- ipte_lock_simple(vcpu);
+ ipte_lock_simple(kvm);
}
-void ipte_unlock(struct kvm_vcpu *vcpu)
+void ipte_unlock(struct kvm *kvm)
{
- if (vcpu->arch.sie_block->eca & ECA_SII)
- ipte_unlock_siif(vcpu);
+ if (sclp.has_siif)
+ ipte_unlock_siif(kvm);
else
- ipte_unlock_simple(vcpu);
+ ipte_unlock_simple(kvm);
}
static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, u8 ar,
@@ -1086,7 +1086,7 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
try_storage_prot_override = storage_prot_override_applicable(vcpu);
need_ipte_lock = psw_bits(*psw).dat && !asce.r;
if (need_ipte_lock)
- ipte_lock(vcpu);
+ ipte_lock(vcpu->kvm);
/*
* Since we do the access further down ultimately via a move instruction
* that does key checking and returns an error in case of a protection
@@ -1127,7 +1127,7 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
}
out_unlock:
if (need_ipte_lock)
- ipte_unlock(vcpu);
+ ipte_unlock(vcpu->kvm);
if (nr_pages > ARRAY_SIZE(gpa_array))
vfree(gpas);
return rc;
@@ -1199,10 +1199,10 @@ int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode);
if (rc)
return rc;
- ipte_lock(vcpu);
+ ipte_lock(vcpu->kvm);
rc = guest_range_to_gpas(vcpu, gva, ar, NULL, length, asce, mode,
access_key);
- ipte_unlock(vcpu);
+ ipte_unlock(vcpu->kvm);
return rc;
}
@@ -1465,7 +1465,7 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
* tables/pointers we read stay valid - unshadowing is however
* always possible - only guest_table_lock protects us.
*/
- ipte_lock(vcpu);
+ ipte_lock(vcpu->kvm);
rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake);
if (rc)
@@ -1499,7 +1499,7 @@ shadow_page:
pte.p |= dat_protection;
if (!rc)
rc = gmap_shadow_page(sg, saddr, __pte(pte.val));
- ipte_unlock(vcpu);
+ ipte_unlock(vcpu->kvm);
mmap_read_unlock(sg->mm);
return rc;
}
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index 1124ff282012..9408d6cc8e2c 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -440,9 +440,9 @@ int read_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data,
return access_guest_real(vcpu, gra, data, len, 0);
}
-void ipte_lock(struct kvm_vcpu *vcpu);
-void ipte_unlock(struct kvm_vcpu *vcpu);
-int ipte_lock_held(struct kvm_vcpu *vcpu);
+void ipte_lock(struct kvm *kvm);
+void ipte_unlock(struct kvm *kvm);
+int ipte_lock_held(struct kvm *kvm);
int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra);
/* MVPG PEI indication bits */
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 8bd42a20d924..88112065d941 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -528,12 +528,27 @@ static int handle_pv_uvc(struct kvm_vcpu *vcpu)
+/*
+ * Dispatch a PV notification intercept according to the instruction code
+ * in sie_block->ipa: 0xb210, 0xb220 and 0xb9a4 go to their dedicated PV
+ * handlers; opcode 0xae is first offered to kvm_s390_handle_sigp_pei().
+ * Everything else, including 0xae requests for which that handler
+ * returned non-zero, is passed on to handle_instruction().
+ */
static int handle_pv_notification(struct kvm_vcpu *vcpu)
{
+ int ret;
+
if (vcpu->arch.sie_block->ipa == 0xb210)
return handle_pv_spx(vcpu);
if (vcpu->arch.sie_block->ipa == 0xb220)
return handle_pv_sclp(vcpu);
if (vcpu->arch.sie_block->ipa == 0xb9a4)
return handle_pv_uvc(vcpu);
+ /* Only the upper byte of ipa (the opcode) is compared here */
+ if (vcpu->arch.sie_block->ipa >> 8 == 0xae) {
+ /*
+ * Besides external call, other SIGP orders also cause a
+ * 108 (pv notify) intercept. In contrast to external call,
+ * these orders need to be emulated and hence the appropriate
+ * place to handle them is in handle_instruction().
+ * So first try kvm_s390_handle_sigp_pei() and if that isn't
+ * successful, go on with handle_instruction().
+ */
+ ret = kvm_s390_handle_sigp_pei(vcpu);
+ if (!ret)
+ return ret;
+ }
return handle_instruction(vcpu);
}
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index af96dc0549a4..b9c944b262c7 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -28,9 +28,11 @@
#include <asm/switch_to.h>
#include <asm/nmi.h>
#include <asm/airq.h>
+#include <asm/tpi.h>
#include "kvm-s390.h"
#include "gaccess.h"
#include "trace-s390.h"
+#include "pci.h"
#define PFAULT_INIT 0x0600
#define PFAULT_DONE 0x0680
@@ -702,7 +704,7 @@ static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu)
/*
* We indicate floating repressible conditions along with
* other pending conditions. Channel Report Pending and Channel
- * Subsystem damage are the only two and and are indicated by
+ * Subsystem damage are the only two and are indicated by
* bits in mcic and masked in cr14.
*/
if (test_and_clear_bit(IRQ_PEND_MCHK_REP, &fi->pending_irqs)) {
@@ -3311,10 +3313,87 @@ out:
}
EXPORT_SYMBOL_GPL(kvm_s390_gisc_unregister);
-static void gib_alert_irq_handler(struct airq_struct *airq, bool floating)
+/*
+ * Forward the adapter event with summary index @si to its guest.
+ *
+ * Looks up the GAIT entry for @si; entries with a zero count are ignored.
+ * If the entry carries a summary bit address, that bit is set. The owning
+ * VM is then looked up and, unless the entry's guest ISC is masked in both
+ * simm and nimm of the GISA, the ISC is marked pending in the GISA and the
+ * GISA timer is (re)started with a zero relative expiry.
+ */
+static void aen_host_forward(unsigned long si)
{
+ struct kvm_s390_gisa_interrupt *gi;
+ struct zpci_gaite *gaite;
+ struct kvm *kvm;
+
+ /*
+ * NOTE(review): the cast binds tighter than '+', so the offset below is
+ * scaled by sizeof(struct zpci_gaite) twice (explicitly and again by
+ * the pointer arithmetic). Confirm the code that populates GAIT
+ * entries indexes the table the same way and that the table is large
+ * enough for the highest si.
+ */
+ gaite = (struct zpci_gaite *)aift->gait +
+ (si * sizeof(struct zpci_gaite));
+ if (gaite->count == 0)
+ return;
+ if (gaite->aisb != 0)
+ set_bit_inv(gaite->aisbo, (unsigned long *)gaite->aisb);
+
+ kvm = kvm_s390_pci_si_to_kvm(aift, si);
+ if (!kvm)
+ return;
+ gi = &kvm->arch.gisa_int;
+
+ /* Skip delivery only when the ISC bit is set in both simm and nimm */
+ if (!(gi->origin->g1.simm & AIS_MODE_MASK(gaite->gisc)) ||
+ !(gi->origin->g1.nimm & AIS_MODE_MASK(gaite->gisc))) {
+ gisa_set_ipm_gisc(gi->origin, gaite->gisc);
+ /* Drop any pending expiry so the timer can be restarted right away */
+ if (hrtimer_active(&gi->timer))
+ hrtimer_cancel(&gi->timer);
+ hrtimer_start(&gi->timer, 0, HRTIMER_MODE_REL);
+ kvm->stat.aen_forward++;
+ }
+}
+
+/*
+ * Drain the adapter summary bit vector and forward every indicated
+ * event to its guest via aen_host_forward().
+ *
+ * The whole scan runs under aift->gait_lock with interrupts saved and
+ * disabled; nothing is done if no GAIT is installed. Whenever the end of
+ * the vector is reached on the first pass, or after at least one bit was
+ * found since the last re-enable, interrupts are re-enabled for @isc and
+ * the vector is scanned once more from the start. The loop ends on the
+ * first complete pass that finds nothing after such a re-enable.
+ */
+static void aen_process_gait(u8 isc)
+{
+	union zpci_sic_iib iib = {{0}};
+	unsigned long flags, si = 0;
+	/* true on the very first pass and after any bit has been handled */
+	bool rearm = true;
+
+	spin_lock_irqsave(&aift->gait_lock, flags);
+
+	if (!aift->gait)
+		goto unlock;
+
+	while (true) {
+		/* Scan adapter summary indicator bit vector */
+		si = airq_iv_scan(aift->sbv, si, airq_iv_end(aift->sbv));
+		if (si != -1UL) {
+			rearm = true;
+			aen_host_forward(si);
+			continue;
+		}
+
+		/* Interrupts on and all bits processed */
+		if (!rearm)
+			break;
+
+		/* Re-enable interrupts. */
+		zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, isc, &iib);
+		rearm = false;
+		/* Scan again after re-enabling interrupts */
+		si = 0;
+	}
+
+unlock:
+	spin_unlock_irqrestore(&aift->gait_lock, flags);
+}
+
+/*
+ * Adapter interrupt handler registered for the GIB alert interruption.
+ *
+ * @tpi_info is interpreted as a struct tpi_adapter_info. If it reports a
+ * forwarded or erroneous adapter event and CONFIG_VFIO_PCI_ZDEV_KVM is
+ * enabled, the GAIT is processed for the reported ISC and the GIB alert
+ * list is only processed when info->aism is non-zero. In every other case
+ * the GIB alert list is processed directly.
+ */
+static void gib_alert_irq_handler(struct airq_struct *airq,
+ struct tpi_info *tpi_info)
+{
+ struct tpi_adapter_info *info = (struct tpi_adapter_info *)tpi_info;
+
inc_irq_stat(IRQIO_GAL);
- process_gib_alert_list();
+
+ if ((info->forward || info->error) &&
+ IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) {
+ aen_process_gait(info->isc);
+ if (info->aism != 0)
+ process_gib_alert_list();
+ } else {
+ process_gib_alert_list();
+ }
}
static struct airq_struct gib_alert_irq = {
@@ -3326,6 +3405,11 @@ void kvm_s390_gib_destroy(void)
{
if (!gib)
return;
+ if (kvm_s390_pci_interp_allowed() && aift) {
+ mutex_lock(&aift->aift_lock);
+ kvm_s390_pci_aen_exit();
+ mutex_unlock(&aift->aift_lock);
+ }
chsc_sgib(0);
unregister_adapter_interrupt(&gib_alert_irq);
free_page((unsigned long)gib);
@@ -3363,6 +3447,14 @@ int kvm_s390_gib_init(u8 nisc)
goto out_unreg_gal;
}
+ if (kvm_s390_pci_interp_allowed()) {
+ if (kvm_s390_pci_aen_init(nisc)) {
+ pr_err("Initializing AEN for PCI failed\n");
+ rc = -EIO;
+ goto out_unreg_gal;
+ }
+ }
+
KVM_EVENT(3, "gib 0x%pK (nisc=%d) initialized", gib, gib->nisc);
goto out;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 8fcb56141689..edfd4bbd0cba 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -31,6 +31,7 @@
#include <linux/sched/signal.h>
#include <linux/string.h>
#include <linux/pgtable.h>
+#include <linux/mmu_notifier.h>
#include <asm/asm-offsets.h>
#include <asm/lowcore.h>
@@ -47,6 +48,7 @@
#include <asm/fpu/api.h>
#include "kvm-s390.h"
#include "gaccess.h"
+#include "pci.h"
#define CREATE_TRACE_POINTS
#include "trace.h"
@@ -63,7 +65,8 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
STATS_DESC_COUNTER(VM, inject_float_mchk),
STATS_DESC_COUNTER(VM, inject_pfault_done),
STATS_DESC_COUNTER(VM, inject_service_signal),
- STATS_DESC_COUNTER(VM, inject_virtio)
+ STATS_DESC_COUNTER(VM, inject_virtio),
+ STATS_DESC_COUNTER(VM, aen_forward)
};
const struct kvm_stats_header kvm_vm_stats_header = {
@@ -502,6 +505,14 @@ int kvm_arch_init(void *opaque)
goto out;
}
+ if (kvm_s390_pci_interp_allowed()) {
+ rc = kvm_s390_pci_init();
+ if (rc) {
+ pr_err("Unable to allocate AIFT for PCI\n");
+ goto out;
+ }
+ }
+
rc = kvm_s390_gib_init(GAL_ISC);
if (rc)
goto out;
@@ -516,6 +527,8 @@ out:
void kvm_arch_exit(void)
{
kvm_s390_gib_destroy();
+ if (kvm_s390_pci_interp_allowed())
+ kvm_s390_pci_exit();
debug_unregister(kvm_s390_dbf);
debug_unregister(kvm_s390_dbf_uv);
}
@@ -606,6 +619,32 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_S390_PROTECTED:
r = is_prot_virt_host();
break;
+ case KVM_CAP_S390_PROTECTED_DUMP: {
+ u64 pv_cmds_dump[] = {
+ BIT_UVC_CMD_DUMP_INIT,
+ BIT_UVC_CMD_DUMP_CONFIG_STOR_STATE,
+ BIT_UVC_CMD_DUMP_CPU,
+ BIT_UVC_CMD_DUMP_COMPLETE,
+ };
+ int i;
+
+ r = is_prot_virt_host();
+
+ for (i = 0; i < ARRAY_SIZE(pv_cmds_dump); i++) {
+ if (!test_bit_inv(pv_cmds_dump[i],
+ (unsigned long *)&uv_info.inst_calls_list)) {
+ r = 0;
+ break;
+ }
+ }
+ break;
+ }
+ case KVM_CAP_S390_ZPCI_OP:
+ r = kvm_s390_pci_interp_allowed();
+ break;
+ case KVM_CAP_S390_CPU_TOPOLOGY:
+ r = test_facility(11);
+ break;
default:
r = 0;
}
@@ -817,6 +856,20 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
icpt_operexc_on_all_vcpus(kvm);
r = 0;
break;
+ case KVM_CAP_S390_CPU_TOPOLOGY:
+ r = -EINVAL;
+ mutex_lock(&kvm->lock);
+ if (kvm->created_vcpus) {
+ r = -EBUSY;
+ } else if (test_facility(11)) {
+ set_kvm_facility(kvm->arch.model.fac_mask, 11);
+ set_kvm_facility(kvm->arch.model.fac_list, 11);
+ r = 0;
+ }
+ mutex_unlock(&kvm->lock);
+ VM_EVENT(kvm, 3, "ENABLE: CAP_S390_CPU_TOPOLOGY %s",
+ r ? "(not available)" : "(success)");
+ break;
default:
r = -EINVAL;
break;
@@ -1019,6 +1072,42 @@ static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr)
return 0;
}
+/*
+ * Set the zPCI interpretation controls (ECB2_ZPCI_LSI, ECB3_AISII and
+ * ECB3_AISI) in the SIE block of @vcpu. This is a no-op as long as the VM
+ * has not been flagged with arch.use_zpci_interp.
+ */
+static void kvm_s390_vcpu_pci_setup(struct kvm_vcpu *vcpu)
+{
+	/* Only set the ECB bits after guest requests zPCI interpretation */
+	if (vcpu->kvm->arch.use_zpci_interp) {
+		vcpu->arch.sie_block->ecb2 |= ECB2_ZPCI_LSI;
+		vcpu->arch.sie_block->ecb3 |= ECB3_AISII + ECB3_AISI;
+	}
+}
+
+/*
+ * Enable zPCI interpretation for @kvm and apply it to all existing vcpus.
+ *
+ * Must be called with kvm->lock held (asserted via lockdep). Does nothing
+ * if kvm_s390_pci_interp_allowed() reports that interpretation is not
+ * allowed. Otherwise arch.use_zpci_interp is set and, with all vcpus
+ * blocked, each vcpu gets its PCI controls set up and a synchronous
+ * KVM_REQ_VSIE_RESTART request.
+ */
+void kvm_s390_vcpu_pci_enable_interp(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ lockdep_assert_held(&kvm->lock);
+
+ if (!kvm_s390_pci_interp_allowed())
+ return;
+
+ /*
+ * If host is configured for PCI and the necessary facilities are
+ * available, turn on interpretation for the life of this guest
+ */
+ kvm->arch.use_zpci_interp = 1;
+
+ /* Keep every vcpu blocked while its SIE control bits are modified */
+ kvm_s390_vcpu_block_all(kvm);
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ kvm_s390_vcpu_pci_setup(vcpu);
+ kvm_s390_sync_request(KVM_REQ_VSIE_RESTART, vcpu);
+ }
+
+ kvm_s390_vcpu_unblock_all(kvm);
+}
+
static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req)
{
unsigned long cx;
@@ -1691,6 +1780,57 @@ static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
return ret;
}
+/**
+ * kvm_s390_update_topology_change_report - update CPU topology change report
+ * @kvm: guest KVM description
+ * @val: set or clear the MTCR bit
+ *
+ * Rewrites the Multiprocessor Topology-Change-Report (MTCR) bit in the SCA
+ * utility field to signal the guest with a topology change. The update is
+ * done with a compare-and-swap retry loop under the SCA read lock.
+ * This is only relevant if the topology facility is present.
+ *
+ * The SCA version, bsca or esca, doesn't matter as offset is the same.
+ */
+static void kvm_s390_update_topology_change_report(struct kvm *kvm, bool val)
+{
+	struct bsca_block *sca;
+	union sca_utility cur, upd;
+
+	read_lock(&kvm->arch.sca_lock);
+	sca = kvm->arch.sca;
+	for (;;) {
+		cur = READ_ONCE(sca->utility);
+		upd = cur;
+		upd.mtcr = val;
+		/* Done once the swap saw the value we based the update on */
+		if (cmpxchg(&sca->utility.val, cur.val, upd.val) == cur.val)
+			break;
+	}
+	read_unlock(&kvm->arch.sca_lock);
+}
+
+/*
+ * KVM_S390_VM_CPU_TOPOLOGY set handler: sets the topology change report
+ * bit when attr->attr is non-zero and clears it otherwise.
+ *
+ * Return: 0 on success, -ENXIO if facility 11 is not enabled for @kvm.
+ */
+static int kvm_s390_set_topo_change_indication(struct kvm *kvm,
+					       struct kvm_device_attr *attr)
+{
+	if (test_kvm_facility(kvm, 11)) {
+		kvm_s390_update_topology_change_report(kvm, !!attr->attr);
+		return 0;
+	}
+
+	return -ENXIO;
+}
+
+/*
+ * KVM_S390_VM_CPU_TOPOLOGY get handler: reads the current MTCR bit from
+ * the SCA utility field under the SCA read lock and stores it as a u8 at
+ * the user address given in attr->addr.
+ *
+ * Return: the put_user() result, or -ENXIO if facility 11 is not enabled
+ * for @kvm.
+ */
+static int kvm_s390_get_topo_change_indication(struct kvm *kvm,
+					       struct kvm_device_attr *attr)
+{
+	u8 __user *uaddr = (u8 __user *)attr->addr;
+	struct bsca_block *sca;
+	u8 mtcr;
+
+	if (!test_kvm_facility(kvm, 11))
+		return -ENXIO;
+
+	read_lock(&kvm->arch.sca_lock);
+	sca = (struct bsca_block *)kvm->arch.sca;
+	mtcr = sca->utility.mtcr;
+	read_unlock(&kvm->arch.sca_lock);
+
+	return put_user(mtcr, uaddr);
+}
+
static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
{
int ret;
@@ -1711,6 +1851,9 @@ static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
case KVM_S390_VM_MIGRATION:
ret = kvm_s390_vm_set_migration(kvm, attr);
break;
+ case KVM_S390_VM_CPU_TOPOLOGY:
+ ret = kvm_s390_set_topo_change_indication(kvm, attr);
+ break;
default:
ret = -ENXIO;
break;
@@ -1736,6 +1879,9 @@ static int kvm_s390_vm_get_attr(struct kvm *kvm, struct kvm_device_attr *attr)
case KVM_S390_VM_MIGRATION:
ret = kvm_s390_vm_get_migration(kvm, attr);
break;
+ case KVM_S390_VM_CPU_TOPOLOGY:
+ ret = kvm_s390_get_topo_change_indication(kvm, attr);
+ break;
default:
ret = -ENXIO;
break;
@@ -1809,6 +1955,9 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
case KVM_S390_VM_MIGRATION:
ret = 0;
break;
+ case KVM_S390_VM_CPU_TOPOLOGY:
+ ret = test_kvm_facility(kvm, 11) ? 0 : -ENXIO;
+ break;
default:
ret = -ENXIO;
break;
@@ -2166,12 +2315,25 @@ out:
return r;
}
-static int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rcp, u16 *rrcp)
+/**
+ * kvm_s390_cpus_from_pv - Convert all protected vCPUs in a protected VM to
+ * non protected.
+ * @kvm: the VM whose protected vCPUs are to be converted
+ * @rc: return value for the RC field of the UVC (in case of error)
+ * @rrc: return value for the RRC field of the UVC (in case of error)
+ *
+ * Does not stop in case of error, tries to convert as many
+ * CPUs as possible. In case of error, the RC and RRC of the last error are
+ * returned.
+ *
+ * Return: 0 in case of success, otherwise -EIO
+ */
+int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc)
{
struct kvm_vcpu *vcpu;
- u16 rc, rrc;
- int ret = 0;
unsigned long i;
+ u16 _rc, _rrc;
+ int ret = 0;
/*
* We ignore failures and try to destroy as many CPUs as possible.
@@ -2183,9 +2345,9 @@ static int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rcp, u16 *rrcp)
*/
kvm_for_each_vcpu(i, vcpu, kvm) {
mutex_lock(&vcpu->mutex);
- if (kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc) && !ret) {
- *rcp = rc;
- *rrcp = rrc;
+ if (kvm_s390_pv_destroy_cpu(vcpu, &_rc, &_rrc) && !ret) {
+ *rc = _rc;
+ *rrc = _rrc;
ret = -EIO;
}
mutex_unlock(&vcpu->mutex);
@@ -2196,6 +2358,17 @@ static int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rcp, u16 *rrcp)
return ret;
}
+/**
+ * kvm_s390_cpus_to_pv - Convert all non-protected vCPUs in a protected VM
+ * to protected.
+ * @kvm: the VM whose protected vCPUs are to be converted
+ * @rc: return value for the RC field of the UVC (in case of error)
+ * @rrc: return value for the RRC field of the UVC (in case of error)
+ *
+ * Tries to undo the conversion in case of error.
+ *
+ * Return: 0 in case of success, otherwise -EIO
+ */
static int kvm_s390_cpus_to_pv(struct kvm *kvm, u16 *rc, u16 *rrc)
{
unsigned long i;
@@ -2220,6 +2393,115 @@ static int kvm_s390_cpus_to_pv(struct kvm *kvm, u16 *rc, u16 *rrc)
return r;
}
+/*
+ * Here we provide user space with a direct interface to query UV
+ * related data like UV maxima and available features as well as
+ * feature specific data.
+ *
+ * To facilitate future extension of the data structures we'll try to
+ * write data up to the maximum requested length.
+ *
+ * The block selected by info->header.id is filled from uv_info. The
+ * request fails with -EINVAL for an unknown id or when header.len_max is
+ * smaller than the header plus the selected block; otherwise the number
+ * of valid bytes in @info (header included) is returned.
+ */
+static ssize_t kvm_s390_handle_pv_info(struct kvm_s390_pv_info *info)
+{
+	ssize_t len_min;
+
+	if (info->header.id == KVM_PV_INFO_VM) {
+		len_min = sizeof(info->header) + sizeof(info->vm);
+		if (info->header.len_max < len_min)
+			return -EINVAL;
+
+		memcpy(info->vm.inst_calls_list,
+		       uv_info.inst_calls_list,
+		       sizeof(uv_info.inst_calls_list));
+
+		/* It's max cpuid not max cpus, so it's off by one */
+		info->vm.max_cpus = uv_info.max_guest_cpu_id + 1;
+		info->vm.max_guests = uv_info.max_num_sec_conf;
+		info->vm.max_guest_addr = uv_info.max_sec_stor_addr;
+		info->vm.feature_indication = uv_info.uv_feature_indications;
+
+		return len_min;
+	}
+
+	if (info->header.id == KVM_PV_INFO_DUMP) {
+		len_min = sizeof(info->header) + sizeof(info->dump);
+		if (info->header.len_max < len_min)
+			return -EINVAL;
+
+		info->dump.dump_cpu_buffer_len = uv_info.guest_cpu_stor_len;
+		info->dump.dump_config_mem_buffer_per_1m = uv_info.conf_dump_storage_state_len;
+		info->dump.dump_config_finalize_len = uv_info.conf_dump_finalize_len;
+
+		return len_min;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * Handle the VM-scope KVM_PV_DUMP sub-commands.
+ *
+ * @kvm: the VM being dumped
+ * @cmd: PV command; cmd->rc / cmd->rrc receive the UV return codes
+ * @dmp: dump request (sub-command, user buffer address/length, gaddr)
+ *
+ * KVM_PV_DUMP_INIT is rejected while a dump is already in progress; the
+ * other sub-commands are rejected unless one is. Unknown sub-commands
+ * yield -ENOTTY, state and length violations yield -EINVAL, otherwise the
+ * result of the respective helper is returned.
+ *
+ * NOTE(review): @dmp is passed by value, so the update made through
+ * &dmp.gaddr below only changes this function's private copy. Confirm
+ * whether the caller's copy-back really returns the advanced gaddr to
+ * user space as the comment in KVM_PV_DUMP_CONFIG_STOR_STATE assumes.
+ */
+static int kvm_s390_pv_dmp(struct kvm *kvm, struct kvm_pv_cmd *cmd,
+ struct kvm_s390_pv_dmp dmp)
+{
+ int r = -EINVAL;
+ void __user *result_buff = (void __user *)dmp.buff_addr;
+
+ switch (dmp.subcmd) {
+ case KVM_PV_DUMP_INIT: {
+ if (kvm->arch.pv.dumping)
+ break;
+
+ /*
+ * Block SIE entry as concurrent dump UVCs could lead
+ * to validities.
+ */
+ kvm_s390_vcpu_block_all(kvm);
+
+ r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
+ UVC_CMD_DUMP_INIT, &cmd->rc, &cmd->rrc);
+ KVM_UV_EVENT(kvm, 3, "PROTVIRT DUMP INIT: rc %x rrc %x",
+ cmd->rc, cmd->rrc);
+ /* On success the vcpus stay blocked; only a failed init unblocks them */
+ if (!r) {
+ kvm->arch.pv.dumping = true;
+ } else {
+ kvm_s390_vcpu_unblock_all(kvm);
+ r = -EINVAL;
+ }
+ break;
+ }
+ case KVM_PV_DUMP_CONFIG_STOR_STATE: {
+ if (!kvm->arch.pv.dumping)
+ break;
+
+ /*
+ * gaddr is an output parameter since we might stop
+ * early. As dmp will be copied back in our caller, we
+ * don't need to do it ourselves.
+ */
+ r = kvm_s390_pv_dump_stor_state(kvm, result_buff, &dmp.gaddr, dmp.buff_len,
+ &cmd->rc, &cmd->rrc);
+ break;
+ }
+ case KVM_PV_DUMP_COMPLETE: {
+ if (!kvm->arch.pv.dumping)
+ break;
+
+ /* The user buffer must be able to hold the finalize data */
+ r = -EINVAL;
+ if (dmp.buff_len < uv_info.conf_dump_finalize_len)
+ break;
+
+ r = kvm_s390_pv_dump_complete(kvm, result_buff,
+ &cmd->rc, &cmd->rrc);
+ break;
+ }
+ default:
+ r = -ENOTTY;
+ break;
+ }
+
+ return r;
+}
+
static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
{
int r = 0;
@@ -2356,6 +2638,68 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
cmd->rc, cmd->rrc);
break;
}
+ case KVM_PV_INFO: {
+ struct kvm_s390_pv_info info = {};
+ ssize_t data_len;
+
+ /*
+ * No need to check the VM protection here.
+ *
+ * Maybe user space wants to query some of the data
+ * when the VM is still unprotected. If we see the
+ * need to fence a new data command we can still
+ * return an error in the info handler.
+ */
+
+ r = -EFAULT;
+ if (copy_from_user(&info, argp, sizeof(info.header)))
+ break;
+
+ r = -EINVAL;
+ if (info.header.len_max < sizeof(info.header))
+ break;
+
+ data_len = kvm_s390_handle_pv_info(&info);
+ if (data_len < 0) {
+ r = data_len;
+ break;
+ }
+ /*
+ * If a data command struct is extended (multiple
+ * times) this can be used to determine how much of it
+ * is valid.
+ */
+ info.header.len_written = data_len;
+
+ r = -EFAULT;
+ if (copy_to_user(argp, &info, data_len))
+ break;
+
+ r = 0;
+ break;
+ }
+ case KVM_PV_DUMP: {
+ struct kvm_s390_pv_dmp dmp;
+
+ r = -EINVAL;
+ if (!kvm_s390_pv_is_protected(kvm))
+ break;
+
+ r = -EFAULT;
+ if (copy_from_user(&dmp, argp, sizeof(dmp)))
+ break;
+
+ r = kvm_s390_pv_dmp(kvm, cmd, dmp);
+ if (r)
+ break;
+
+ if (copy_to_user(argp, &dmp, sizeof(dmp))) {
+ r = -EFAULT;
+ break;
+ }
+
+ break;
+ }
default:
r = -ENOTTY;
}
@@ -2581,6 +2925,19 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = -EFAULT;
break;
}
+ case KVM_S390_ZPCI_OP: {
+ struct kvm_s390_zpci_op args;
+
+ r = -EINVAL;
+ if (!IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM))
+ break;
+ if (copy_from_user(&args, argp, sizeof(args))) {
+ r = -EFAULT;
+ break;
+ }
+ r = kvm_s390_pci_zpci_op(kvm, &args);
+ break;
+ }
default:
r = -ENOTTY;
}
@@ -2742,6 +3099,14 @@ static void sca_dispose(struct kvm *kvm)
kvm->arch.sca = NULL;
}
+/*
+ * Free a VM structure: when CONFIG_VFIO_PCI_ZDEV_KVM is enabled the VM's
+ * PCI list is cleared first, then __kvm_arch_free_vm() is called.
+ */
+void kvm_arch_free_vm(struct kvm *kvm)
+{
+ if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM))
+ kvm_s390_pci_clear_list(kvm);
+
+ __kvm_arch_free_vm(kvm);
+}
+
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
gfp_t alloc_flags = GFP_KERNEL_ACCOUNT;
@@ -2824,6 +3189,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm_s390_crypto_init(kvm);
+ if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) {
+ mutex_lock(&kvm->lock);
+ kvm_s390_pci_init_list(kvm);
+ kvm_s390_vcpu_pci_enable_interp(kvm);
+ mutex_unlock(&kvm->lock);
+ }
+
mutex_init(&kvm->arch.float_int.ais_lock);
spin_lock_init(&kvm->arch.float_int.lock);
for (i = 0; i < FIRQ_LIST_COUNT; i++)
@@ -2877,6 +3249,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
kvm_clear_async_pf_completion_queue(vcpu);
if (!kvm_is_ucontrol(vcpu->kvm))
sca_del_vcpu(vcpu);
+ kvm_s390_update_topology_change_report(vcpu->kvm, 1);
if (kvm_is_ucontrol(vcpu->kvm))
gmap_remove(vcpu->arch.gmap);
@@ -2904,6 +3277,15 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
*/
if (kvm_s390_pv_get_handle(kvm))
kvm_s390_pv_deinit_vm(kvm, &rc, &rrc);
+ /*
+ * Remove the mmu notifier only when the whole KVM VM is torn down,
+ * and only if one was registered to begin with. If the VM is
+ * currently not protected, but has previously been protected,
+ * then it's possible that the notifier is still registered.
+ */
+ if (kvm->arch.pv.mmu_notifier.ops)
+ mmu_notifier_unregister(&kvm->arch.pv.mmu_notifier, kvm->mm);
+
debug_unregister(kvm->arch.dbf);
free_page((unsigned long)kvm->arch.sie_page2);
if (!kvm_is_ucontrol(kvm))
@@ -3047,9 +3429,7 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id)
if (!sclp.has_esca || !sclp.has_64bscao)
return false;
- mutex_lock(&kvm->lock);
rc = kvm->arch.use_esca ? 0 : sca_switch_to_extended(kvm);
- mutex_unlock(&kvm->lock);
return rc == 0 && id < KVM_S390_ESCA_CPU_SLOTS;
}
@@ -3272,6 +3652,8 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu)
vcpu->arch.sie_block->ecb |= ECB_HOSTPROTINT;
if (test_kvm_facility(vcpu->kvm, 9))
vcpu->arch.sie_block->ecb |= ECB_SRSI;
+ if (test_kvm_facility(vcpu->kvm, 11))
+ vcpu->arch.sie_block->ecb |= ECB_PTF;
if (test_kvm_facility(vcpu->kvm, 73))
vcpu->arch.sie_block->ecb |= ECB_TE;
if (!kvm_is_ucontrol(vcpu->kvm))
@@ -3324,6 +3706,8 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu)
kvm_s390_vcpu_crypto_setup(vcpu);
+ kvm_s390_vcpu_pci_setup(vcpu);
+
mutex_lock(&vcpu->kvm->lock);
if (kvm_s390_pv_is_protected(vcpu->kvm)) {
rc = kvm_s390_pv_create_cpu(vcpu, &uvrc, &uvrrc);
@@ -3403,6 +3787,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
rc = kvm_s390_vcpu_setup(vcpu);
if (rc)
goto out_ucontrol_uninit;
+
+ kvm_s390_update_topology_change_report(vcpu->kvm, 1);
return 0;
out_ucontrol_uninit:
@@ -4473,6 +4859,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
struct kvm_run *kvm_run = vcpu->run;
int rc;
+ /*
+ * Running a VM while dumping always has the potential to
+ * produce inconsistent dump data. But for PV vcpus a SIE
+ * entry while dumping could also lead to a fatal validity
+ * intercept which we absolutely want to avoid.
+ */
+ if (vcpu->kvm->arch.pv.dumping)
+ return -EINVAL;
+
if (kvm_run->immediate_exit)
return -EINTR;
@@ -4912,6 +5307,48 @@ long kvm_arch_vcpu_async_ioctl(struct file *filp,
return -ENOIOCTLCMD;
}
+/*
+ * Handle KVM_PV_DUMP issued on a vcpu: dump the state of @vcpu into a
+ * kernel buffer and copy it to the user buffer named in the request.
+ *
+ * The request (struct kvm_s390_pv_dmp) is read from the user address in
+ * cmd->data. cmd->rc / cmd->rrc receive the UV return codes.
+ *
+ * Return: 0 on success; -EINVAL if no dump is in progress, the
+ * sub-command is not KVM_PV_DUMP_CPU, the buffer length differs from
+ * uv_info.guest_cpu_stor_len or the dump itself fails; -EFAULT on a
+ * failed user copy; -ENOMEM if the kernel buffer cannot be allocated.
+ */
+static int kvm_s390_handle_pv_vcpu_dump(struct kvm_vcpu *vcpu,
+					struct kvm_pv_cmd *cmd)
+{
+	struct kvm_s390_pv_dmp dmp;
+	void *buf;
+	int err;
+
+	/* Dump initialization is a prerequisite */
+	if (!vcpu->kvm->arch.pv.dumping)
+		return -EINVAL;
+
+	if (copy_from_user(&dmp, (__u8 __user *)cmd->data, sizeof(dmp)))
+		return -EFAULT;
+
+	/* We only handle this subcmd right now */
+	if (dmp.subcmd != KVM_PV_DUMP_CPU)
+		return -EINVAL;
+
+	/* CPU dump length is the same as create cpu storage donation. */
+	if (dmp.buff_len != uv_info.guest_cpu_stor_len)
+		return -EINVAL;
+
+	buf = kvzalloc(uv_info.guest_cpu_stor_len, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	err = kvm_s390_pv_dump_cpu(vcpu, buf, &cmd->rc, &cmd->rrc);
+
+	VCPU_EVENT(vcpu, 3, "PROTVIRT DUMP CPU %d rc %x rrc %x",
+		   vcpu->vcpu_id, cmd->rc, cmd->rrc);
+
+	if (err) {
+		/* Any dump failure is reported as -EINVAL */
+		err = -EINVAL;
+	} else if (copy_to_user((__u8 __user *)dmp.buff_addr, buf,
+				uv_info.guest_cpu_stor_len)) {
+		/* On success copy over the dump data */
+		err = -EFAULT;
+	}
+
+	kvfree(buf);
+	return err;
+}
+
long kvm_arch_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -5076,6 +5513,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
irq_state.len);
break;
}
+	/*
+	 * Per-vcpu protected virtualization command. Only available on a
+	 * protected virtualization host, requires cmd.flags to be zero and
+	 * currently only implements KVM_PV_DUMP. Once the dump handler has
+	 * run, the UV rc / rrc are reported back to user space regardless
+	 * of the handler's result.
+	 */
+	case KVM_S390_PV_CPU_COMMAND: {
+		struct kvm_pv_cmd __user *ucmd = argp;
+		struct kvm_pv_cmd cmd;
+
+		r = -EINVAL;
+		if (!is_prot_virt_host())
+			break;
+
+		r = -EFAULT;
+		if (copy_from_user(&cmd, argp, sizeof(cmd)))
+			break;
+
+		r = -EINVAL;
+		if (cmd.flags)
+			break;
+
+		/* We only handle this cmd right now */
+		if (cmd.cmd != KVM_PV_DUMP)
+			break;
+
+		r = kvm_s390_handle_pv_vcpu_dump(vcpu, &cmd);
+
+		/*
+		 * Always copy over UV rc / rrc data. The destination has to
+		 * be the rc member of the user's struct, not the start of
+		 * the struct: writing at argp itself would overwrite the
+		 * user's cmd field and leave rc / rrc untouched. rc and rrc
+		 * are copied together, as before.
+		 */
+		if (copy_to_user(&ucmd->rc, &cmd.rc,
+				 sizeof(cmd.rc) + sizeof(cmd.rrc)))
+			r = -EFAULT;
+		break;
+	}
default:
r = -ENOTTY;
}
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 497d52a83c78..f6fd668f887e 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -250,6 +250,11 @@ int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc,
int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size,
unsigned long tweak, u16 *rc, u16 *rrc);
int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state);
+int kvm_s390_pv_dump_cpu(struct kvm_vcpu *vcpu, void *buff, u16 *rc, u16 *rrc);
+int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user,
+ u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc);
+int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user,
+ u16 *rc, u16 *rrc);
static inline u64 kvm_s390_pv_get_handle(struct kvm *kvm)
{
@@ -374,6 +379,7 @@ int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu);
void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu);
void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm);
__u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu);
+int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc);
/* implemented in diag.c */
int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
@@ -508,6 +514,16 @@ void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu,
void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm);
/**
+ * kvm_s390_vcpu_pci_enable_interp
+ *
+ * Set the associated PCI attributes for each vcpu to allow for zPCI Load/Store
+ * interpretation as well as adapter interruption forwarding.
+ *
+ * @kvm: the KVM guest
+ */
+void kvm_s390_vcpu_pci_enable_interp(struct kvm *kvm);
+
+/**
* diag9c_forwarding_hz
*
* Set the maximum number of diag9c forwarding per second
diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c
new file mode 100644
index 000000000000..4946fb7757d6
--- /dev/null
+++ b/arch/s390/kvm/pci.c
@@ -0,0 +1,690 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * s390 kvm PCI passthrough support
+ *
+ * Copyright IBM Corp. 2022
+ *
+ * Author(s): Matthew Rosato <mjrosato@linux.ibm.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/pci.h>
+#include <asm/pci.h>
+#include <asm/pci_insn.h>
+#include <asm/pci_io.h>
+#include <asm/sclp.h>
+#include "pci.h"
+#include "kvm-s390.h"
+
+/*
+ * Global AIFT state shared by the functions in this file. Allocated by
+ * kvm_s390_pci_init() and freed by kvm_s390_pci_exit().
+ */
+struct zpci_aift *aift;
+
+/* Call zpci_set_irq_ctrl() for @ctl/@isc with a zero-filled IIB. */
+static inline int __set_irq_noiib(u16 ctl, u8 isc)
+{
+	union zpci_sic_iib empty_iib = {{0}};
+	int rc;
+
+	rc = zpci_set_irq_ctrl(ctl, isc, &empty_iib);
+
+	return rc;
+}
+
+/*
+ * Drop the AIFT's view of the AEN structures. The caller must hold
+ * aift->aift_lock.
+ */
+void kvm_s390_pci_aen_exit(void)
+{
+	struct kvm_zdev **stale_kzdev;
+	unsigned long irq_flags;
+
+	lockdep_assert_held(&aift->aift_lock);
+
+	/*
+	 * The aipb contents stay registered for the lifetime of the host
+	 * kernel; zpci_aipb and zpci_aif_sbv keep that information around in
+	 * case the KVM module is inserted again. Only forget the AIFT
+	 * pointers here and free what was never registered with the
+	 * underlying firmware.
+	 */
+	spin_lock_irqsave(&aift->gait_lock, irq_flags);
+	stale_kzdev = aift->kzdev;
+	aift->kzdev = NULL;
+	aift->sbv = NULL;
+	aift->gait = NULL;
+	spin_unlock_irqrestore(&aift->gait_lock, irq_flags);
+
+	/* Free outside of the spinlock */
+	kfree(stale_kzdev);
+}
+
+/*
+ * First-time AEN setup: allocate the aipb, the summary bit vector and the
+ * GAIT, then register them via SIC_SET_AENI_CONTROLS.
+ *
+ * @nisc: forwarding ISC, stored in the aipb's afi field
+ *
+ * Returns 0 on success, -ENOMEM if an allocation fails or -EIO if the
+ * registration is rejected. On failure everything allocated here is released
+ * and zpci_aipb, zpci_aif_sbv, aift->sbv and aift->gait are reset to NULL so
+ * that a later attempt starts from a clean state.
+ */
+static int zpci_setup_aipb(u8 nisc)
+{
+	struct page *page;
+	int order, rc;
+
+	zpci_aipb = kzalloc(sizeof(*zpci_aipb), GFP_KERNEL);
+	if (!zpci_aipb)
+		return -ENOMEM;
+
+	aift->sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC, NULL);
+	if (!aift->sbv) {
+		rc = -ENOMEM;
+		goto free_aipb;
+	}
+	zpci_aif_sbv = aift->sbv;
+
+	order = get_order(PAGE_ALIGN(ZPCI_NR_DEVICES *
+				     sizeof(struct zpci_gaite)));
+	page = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
+	if (!page) {
+		rc = -ENOMEM;
+		goto free_sbv;
+	}
+	/* Keep a virtual address; only the aipb gets the physical one */
+	aift->gait = (struct zpci_gaite *)page_to_virt(page);
+
+	zpci_aipb->aipb.faisb = virt_to_phys(aift->sbv->vector);
+	zpci_aipb->aipb.gait = virt_to_phys(aift->gait);
+	zpci_aipb->aipb.afi = nisc;
+	zpci_aipb->aipb.faal = ZPCI_NR_DEVICES;
+
+	/* Setup Adapter Event Notification Interpretation */
+	if (zpci_set_irq_ctrl(SIC_SET_AENI_CONTROLS, 0, zpci_aipb)) {
+		rc = -EIO;
+		goto free_gait;
+	}
+
+	return 0;
+
+free_gait:
+	free_pages((unsigned long)aift->gait, order);
+	aift->gait = NULL;
+free_sbv:
+	airq_iv_release(aift->sbv);
+	/* Do not leave a dangling pointer behind for the "already enabled" check */
+	aift->sbv = NULL;
+	zpci_aif_sbv = NULL;
+free_aipb:
+	kfree(zpci_aipb);
+	zpci_aipb = NULL;
+
+	return rc;
+}
+
+/*
+ * Re-attach the AIFT to an aipb that was registered earlier.
+ *
+ * @nisc: forwarding ISC; must match the one recorded in the existing aipb
+ *
+ * Returns 0 on success or -EINVAL if @nisc differs from zpci_aipb->aipb.afi.
+ */
+static int zpci_reset_aipb(u8 nisc)
+{
+	/*
+	 * AEN registration can only happen once per system boot. If
+	 * an aipb already exists then AEN was already registered and
+	 * we can re-use the aipb contents. This can only happen if
+	 * the KVM module was removed and re-inserted. However, we must
+	 * ensure that the same forwarding ISC is used as this is assigned
+	 * during KVM module load.
+	 */
+	if (zpci_aipb->aipb.afi != nisc)
+		return -EINVAL;
+
+	aift->sbv = zpci_aif_sbv;
+	/* The aipb holds the physical GAIT address; the AIFT needs a pointer */
+	aift->gait = phys_to_virt(zpci_aipb->aipb.gait);
+
+	return 0;
+}
+
+/*
+ * Enable adapter event notification forwarding for the host.
+ *
+ * @nisc: forwarding ISC
+ *
+ * Returns 0 on success, -EPERM if AEN is already set up, -ENOMEM if the
+ * kzdev lookup array cannot be allocated, -EIO if floating interrupts cannot
+ * be enabled, or the error returned by zpci_setup_aipb()/zpci_reset_aipb().
+ */
+int kvm_s390_pci_aen_init(u8 nisc)
+{
+	int rc = 0;
+
+	mutex_lock(&aift->aift_lock);
+
+	/* If already enabled for AEN, bail out now (checked under the lock) */
+	if (aift->gait || aift->sbv) {
+		rc = -EPERM;
+		goto unlock;
+	}
+
+	/* kzdev is an array of pointers, one slot per possible device */
+	aift->kzdev = kcalloc(ZPCI_NR_DEVICES, sizeof(*aift->kzdev),
+			      GFP_KERNEL);
+	if (!aift->kzdev) {
+		rc = -ENOMEM;
+		goto unlock;
+	}
+
+	if (!zpci_aipb)
+		rc = zpci_setup_aipb(nisc);
+	else
+		rc = zpci_reset_aipb(nisc);
+	if (rc)
+		goto free_zdev;
+
+	/* Enable floating IRQs */
+	if (__set_irq_noiib(SIC_IRQ_MODE_SINGLE, nisc)) {
+		rc = -EIO;
+		/* Also frees and clears aift->kzdev */
+		kvm_s390_pci_aen_exit();
+	}
+
+	goto unlock;
+
+free_zdev:
+	kfree(aift->kzdev);
+	/* kvm_s390_pci_si_to_kvm() tests this pointer, so never leave it stale */
+	aift->kzdev = NULL;
+unlock:
+	mutex_unlock(&aift->aift_lock);
+	return rc;
+}
+
+/*
+ * Modify PCI: Register floating adapter interruption forwarding
+ *
+ * Builds a format-0 FIB from the values stored in @zdev and its kzdev and
+ * issues ZPCI_MOD_FC_REG_INT. Returns 0 on success or -EIO if zpci_mod_fc()
+ * reports a non-zero condition code.
+ */
+static int kvm_zpci_set_airq(struct zpci_dev *zdev)
+{
+	u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_REG_INT);
+	struct zpci_fib fib = {};
+	u8 status;
+
+	fib.fmt0.isc = zdev->kzdev->fib.fmt0.isc;
+	fib.fmt0.sum = 1; /* enable summary notifications */
+	fib.fmt0.noi = airq_iv_end(zdev->aibv);
+	fib.fmt0.aibv = virt_to_phys(zdev->aibv->vector);
+	fib.fmt0.aibvo = 0;
+	/*
+	 * Byte offset of the 64-bit word holding the summary bit. Add it to
+	 * the physical address rather than to the unsigned long pointer, as
+	 * pointer arithmetic would scale the offset by the word size again.
+	 */
+	fib.fmt0.aisb = virt_to_phys(aift->sbv->vector) + (zdev->aisb / 64) * 8;
+	fib.fmt0.aisbo = zdev->aisb & 63;
+	fib.gd = zdev->gisa;
+
+	return zpci_mod_fc(req, &fib, &status) ? -EIO : 0;
+}
+
+/*
+ * Modify PCI: Unregister floating adapter interruption forwarding
+ *
+ * Returns 0 if the deregistration worked or was not needed, -EIO otherwise.
+ */
+static int kvm_zpci_clear_airq(struct zpci_dev *zdev)
+{
+	u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_DEREG_INT);
+	struct zpci_fib fib = {};
+	u8 cc, status;
+
+	fib.gd = zdev->gisa;
+	cc = zpci_mod_fc(req, &fib, &status);
+
+	/* Function already gone or IRQs already deregistered. */
+	if (cc == 3)
+		return 0;
+	if (cc == 1 && status == 24)
+		return 0;
+
+	return cc ? -EIO : 0;
+}
+
+/*
+ * Subtract @nr_pages from the current user's locked_vm counter and from the
+ * current mm's pinned_vm counter (if the task still has an mm).
+ */
+static inline void unaccount_mem(unsigned long nr_pages)
+{
+	struct user_struct *user = get_uid(current_user());
+
+	if (user)
+		atomic_long_sub(nr_pages, &user->locked_vm);
+	if (current->mm)
+		atomic64_sub(nr_pages, &current->mm->pinned_vm);
+
+	/* Drop the reference taken by get_uid() above; NULL is tolerated */
+	free_uid(user);
+}
+
+/*
+ * Charge @nr_pages to the current user's locked_vm counter, honouring
+ * RLIMIT_MEMLOCK, and to the current mm's pinned_vm counter.
+ *
+ * Returns 0 on success or -ENOMEM if the charge would exceed the limit, in
+ * which case nothing is accounted.
+ */
+static inline int account_mem(unsigned long nr_pages)
+{
+	struct user_struct *user = get_uid(current_user());
+	unsigned long page_limit, cur_pages, new_pages;
+	int rc = 0;
+
+	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+	/* Lock-free update: retry if locked_vm changed underneath us */
+	do {
+		cur_pages = atomic_long_read(&user->locked_vm);
+		new_pages = cur_pages + nr_pages;
+		if (new_pages > page_limit) {
+			rc = -ENOMEM;
+			goto out;
+		}
+	} while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
+				     new_pages) != cur_pages);
+
+	atomic64_add(nr_pages, &current->mm->pinned_vm);
+
+out:
+	/* Drop the reference taken by get_uid() above on every path */
+	free_uid(user);
+	return rc;
+}
+
+/*
+ * Enable adapter interruption forwarding for an interpreted device.
+ *
+ * @zdev:   device to enable; must already be set up for interpretation
+ *          (zdev->gisa != 0)
+ * @fib:    guest FIB values (aibv, aisb, aisbo, isc, noi, sum); rewritten
+ *          with the host values on success
+ * @assist: if true the GAIT entry points at the guest's GISA, otherwise the
+ *          host receives all alerts
+ *
+ * Pins the guest AIBV (and AISB when fib->fmt0.sum == 1), accounts the pinned
+ * pages, allocates a summary bit, fills in the GAIT entry and finally
+ * registers the interrupt via kvm_zpci_set_airq().
+ *
+ * Returns 0 on success, -EINVAL if the device is not interpreted, a negative
+ * value from kvm_s390_gisc_register(), -EIO if pinning fails, -ENOMEM if the
+ * pages cannot be accounted or the AIBV cannot be set up, -ENOSPC if no
+ * summary bit is free, or the result of kvm_zpci_set_airq(). Every failure
+ * before the final registration step undoes the GISC registration, the
+ * accounting and the pinning.
+ */
+static int kvm_s390_pci_aif_enable(struct zpci_dev *zdev, struct zpci_fib *fib,
+				   bool assist)
+{
+	struct page *pages[1], *aibv_page, *aisb_page = NULL;
+	unsigned int msi_vecs, idx;
+	struct zpci_gaite *gaite;
+	unsigned long hva, bit;
+	struct kvm *kvm;
+	phys_addr_t gaddr;
+	int rc = 0, gisc, npages, pcount = 0;
+
+	/*
+	 * Interrupt forwarding is only applicable if the device is already
+	 * enabled for interpretation
+	 */
+	if (zdev->gisa == 0)
+		return -EINVAL;
+
+	kvm = zdev->kzdev->kvm;
+	msi_vecs = min_t(unsigned int, fib->fmt0.noi, zdev->max_msi);
+
+	/* Get the associated forwarding ISC - if invalid, return the error */
+	gisc = kvm_s390_gisc_register(kvm, fib->fmt0.isc);
+	if (gisc < 0)
+		return gisc;
+
+	/* Replace AIBV address */
+	idx = srcu_read_lock(&kvm->srcu);
+	hva = gfn_to_hva(kvm, gpa_to_gfn((gpa_t)fib->fmt0.aibv));
+	npages = pin_user_pages_fast(hva, 1, FOLL_WRITE | FOLL_LONGTERM, pages);
+	srcu_read_unlock(&kvm->srcu, idx);
+	if (npages < 1) {
+		rc = -EIO;
+		goto out;
+	}
+	aibv_page = pages[0];
+	pcount++;
+	gaddr = page_to_phys(aibv_page) + (fib->fmt0.aibv & ~PAGE_MASK);
+	fib->fmt0.aibv = gaddr;
+
+	/* Pin the guest AISB if one was specified */
+	if (fib->fmt0.sum == 1) {
+		idx = srcu_read_lock(&kvm->srcu);
+		hva = gfn_to_hva(kvm, gpa_to_gfn((gpa_t)fib->fmt0.aisb));
+		npages = pin_user_pages_fast(hva, 1, FOLL_WRITE | FOLL_LONGTERM,
+					     pages);
+		srcu_read_unlock(&kvm->srcu, idx);
+		if (npages < 1) {
+			rc = -EIO;
+			goto unpin1;
+		}
+		aisb_page = pages[0];
+		pcount++;
+	}
+
+	/* Account for pinned pages, roll back on failure */
+	rc = account_mem(pcount);
+	if (rc)
+		goto unpin2;
+
+	/* AISB must be allocated before we can fill in GAITE */
+	mutex_lock(&aift->aift_lock);
+	bit = airq_iv_alloc_bit(aift->sbv);
+	if (bit == -1UL) {
+		rc = -ENOSPC;
+		goto unlock;
+	}
+	zdev->aisb = bit; /* store the summary bit number */
+	zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA |
+				    AIRQ_IV_BITLOCK |
+				    AIRQ_IV_GUESTVEC,
+				    phys_to_virt(fib->fmt0.aibv));
+	if (!zdev->aibv) {
+		rc = -ENOMEM;
+		goto free_bit;
+	}
+
+	spin_lock_irq(&aift->gait_lock);
+	/*
+	 * NOTE(review): the index is already multiplied by the entry size and
+	 * is then scaled again by the pointer arithmetic. Kept unchanged
+	 * because kvm_s390_pci_aif_disable() locates the entry with the same
+	 * expression - confirm against the GAIT layout.
+	 */
+	gaite = (struct zpci_gaite *)aift->gait + (zdev->aisb *
+						   sizeof(struct zpci_gaite));
+
+	/* If assist not requested, host will get all alerts */
+	if (assist)
+		gaite->gisa = (u32)virt_to_phys(&kvm->arch.sie_page2->gisa);
+	else
+		gaite->gisa = 0;
+
+	gaite->gisc = fib->fmt0.isc;
+	gaite->count++;
+	gaite->aisbo = fib->fmt0.aisbo;
+	/* Without a guest AISB there is no page to translate; record 0 */
+	if (aisb_page)
+		gaite->aisb = virt_to_phys(page_address(aisb_page) +
+					   (fib->fmt0.aisb & ~PAGE_MASK));
+	else
+		gaite->aisb = 0;
+	aift->kzdev[zdev->aisb] = zdev->kzdev;
+	spin_unlock_irq(&aift->gait_lock);
+
+	/* Update guest FIB for re-issue */
+	fib->fmt0.aisbo = zdev->aisb & 63;
+	/* Byte offset is added to the address, not to the word pointer */
+	fib->fmt0.aisb = virt_to_phys(aift->sbv->vector) + (zdev->aisb / 64) * 8;
+	fib->fmt0.isc = gisc;
+
+	/* Save some guest fib values in the host for later use */
+	zdev->kzdev->fib.fmt0.isc = fib->fmt0.isc;
+	zdev->kzdev->fib.fmt0.aibv = fib->fmt0.aibv;
+	mutex_unlock(&aift->aift_lock);
+
+	/* Issue the clp to setup the irq now */
+	rc = kvm_zpci_set_airq(zdev);
+	return rc;
+
+free_bit:
+	airq_iv_free_bit(aift->sbv, zdev->aisb);
+	zdev->aisb = 0;
+unlock:
+	mutex_unlock(&aift->aift_lock);
+	unaccount_mem(pcount);
+unpin2:
+	if (aisb_page)
+		unpin_user_page(aisb_page);
+unpin1:
+	unpin_user_page(aibv_page);
+out:
+	/* fib->fmt0.isc still holds the guest ISC on every error path */
+	kvm_s390_gisc_unregister(kvm, fib->fmt0.isc);
+	return rc;
+}
+
+/*
+ * Disable adapter interruption forwarding for @zdev and release what
+ * kvm_s390_pci_aif_enable() set up.
+ *
+ * @zdev:  device to disable; zdev->gisa must be non-zero
+ * @force: continue with the cleanup even if deregistering the interrupt
+ *         with kvm_zpci_clear_airq() fails
+ *
+ * Returns -EINVAL if the device is not interpreted, otherwise the result of
+ * kvm_zpci_clear_airq() (which is also returned when @force made the cleanup
+ * proceed after a failure).
+ */
+static int kvm_s390_pci_aif_disable(struct zpci_dev *zdev, bool force)
+{
+ struct kvm_zdev *kzdev = zdev->kzdev;
+ struct zpci_gaite *gaite;
+ struct page *vpage = NULL, *spage = NULL;
+ int rc, pcount = 0;
+ u8 isc;
+
+ if (zdev->gisa == 0)
+ return -EINVAL;
+
+ mutex_lock(&aift->aift_lock);
+
+ /*
+ * If the clear fails due to an error, leave now unless we know this
+ * device is about to go away (force) -- In that case clear the GAITE
+ * regardless.
+ */
+ rc = kvm_zpci_clear_airq(zdev);
+ if (rc && !force)
+ goto out;
+
+ /* No saved AIBV means forwarding was never enabled: nothing to undo */
+ if (zdev->kzdev->fib.fmt0.aibv == 0)
+ goto out;
+ spin_lock_irq(&aift->gait_lock);
+ /*
+ * NOTE(review): same entry lookup as in kvm_s390_pci_aif_enable(); the
+ * index is scaled by the entry size twice - confirm this is intended.
+ */
+ gaite = (struct zpci_gaite *)aift->gait + (zdev->aisb *
+ sizeof(struct zpci_gaite));
+ isc = gaite->gisc;
+ gaite->count--;
+ if (gaite->count == 0) {
+ /* Release guest AIBV and AISB */
+ vpage = phys_to_page(kzdev->fib.fmt0.aibv);
+ if (gaite->aisb != 0)
+ spage = phys_to_page(gaite->aisb);
+ /* Clear the GAIT entry */
+ gaite->aisb = 0;
+ gaite->gisc = 0;
+ gaite->aisbo = 0;
+ gaite->gisa = 0;
+ aift->kzdev[zdev->aisb] = 0;
+ /* Clear zdev info */
+ airq_iv_free_bit(aift->sbv, zdev->aisb);
+ airq_iv_release(zdev->aibv);
+ zdev->aisb = 0;
+ zdev->aibv = NULL;
+ }
+ spin_unlock_irq(&aift->gait_lock);
+ /* isc was read from the GAIT entry before it was cleared above */
+ kvm_s390_gisc_unregister(kzdev->kvm, isc);
+ kzdev->fib.fmt0.isc = 0;
+ kzdev->fib.fmt0.aibv = 0;
+
+ /* Unpin outside of the spinlock and count the pages to unaccount */
+ if (vpage) {
+ unpin_user_page(vpage);
+ pcount++;
+ }
+ if (spage) {
+ unpin_user_page(spage);
+ pcount++;
+ }
+ if (pcount > 0)
+ unaccount_mem(pcount);
+out:
+ mutex_unlock(&aift->aift_lock);
+
+ return rc;
+}
+
+/*
+ * Allocate a zeroed kvm_zdev and link it with @zdev in both directions.
+ * Returns 0 on success or -ENOMEM.
+ */
+static int kvm_s390_pci_dev_open(struct zpci_dev *zdev)
+{
+	struct kvm_zdev *new_kzdev = kzalloc(sizeof(*new_kzdev), GFP_KERNEL);
+
+	if (!new_kzdev)
+		return -ENOMEM;
+
+	new_kzdev->zdev = zdev;
+	zdev->kzdev = new_kzdev;
+
+	return 0;
+}
+
+/* Unlink the kvm_zdev from @zdev and free it. */
+static void kvm_s390_pci_dev_release(struct zpci_dev *zdev)
+{
+	struct kvm_zdev *old_kzdev = zdev->kzdev;
+
+	/* The back pointer is expected to reference this very device */
+	WARN_ON(old_kzdev->zdev != zdev);
+	zdev->kzdev = NULL;
+	kfree(old_kzdev);
+}
+
+
+/*
+ * Register device with the specified KVM. If interpretation facilities are
+ * available, enable them and let userspace indicate whether or not they will
+ * be used (specify SHM bit to disable).
+ *
+ * Takes a reference on @kvm that is kept on success and dropped again on
+ * failure. Returns 0 on success, -EINVAL for a missing @zdev/@kvm or a device
+ * that already has a kzdev or a gisa, or the error from kvm_s390_pci_dev_open(),
+ * zpci_disable_device(), zpci_enable_device() or zpci_register_ioat().
+ */
+int kvm_s390_pci_register_kvm(struct zpci_dev *zdev, struct kvm *kvm)
+{
+ int rc;
+
+ if (!zdev)
+ return -EINVAL;
+
+ mutex_lock(&zdev->kzdev_lock);
+
+ if (zdev->kzdev || zdev->gisa != 0 || !kvm) {
+ mutex_unlock(&zdev->kzdev_lock);
+ return -EINVAL;
+ }
+
+ kvm_get_kvm(kvm);
+
+ /* Lock order: zdev->kzdev_lock first, then kvm->lock */
+ mutex_lock(&kvm->lock);
+
+ rc = kvm_s390_pci_dev_open(zdev);
+ if (rc)
+ goto err;
+
+ /*
+ * If interpretation facilities aren't available, add the device to
+ * the kzdev list but don't enable for interpretation.
+ */
+ if (!kvm_s390_pci_interp_allowed())
+ goto out;
+
+ /*
+ * If this is the first request to use an interpreted device, make the
+ * necessary vcpu changes
+ */
+ if (!kvm->arch.use_zpci_interp)
+ kvm_s390_vcpu_pci_enable_interp(kvm);
+
+ /* The device is disabled and re-enabled below with the gisa set */
+ if (zdev_enabled(zdev)) {
+ rc = zpci_disable_device(zdev);
+ if (rc)
+ goto err;
+ }
+
+ /*
+ * Store information about the identity of the kvm guest allowed to
+ * access this device via interpretation to be used by host CLP
+ */
+ zdev->gisa = (u32)virt_to_phys(&kvm->arch.sie_page2->gisa);
+
+ rc = zpci_enable_device(zdev);
+ if (rc)
+ goto clear_gisa;
+
+ /* Re-register the IOMMU that was already created */
+ rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
+ virt_to_phys(zdev->dma_table));
+ if (rc)
+ goto clear_gisa;
+
+out:
+ zdev->kzdev->kvm = kvm;
+
+ spin_lock(&kvm->arch.kzdev_list_lock);
+ list_add_tail(&zdev->kzdev->entry, &kvm->arch.kzdev_list);
+ spin_unlock(&kvm->arch.kzdev_list_lock);
+
+ mutex_unlock(&kvm->lock);
+ mutex_unlock(&zdev->kzdev_lock);
+ return 0;
+
+clear_gisa:
+ zdev->gisa = 0;
+err:
+ /* kzdev is NULL here only if kvm_s390_pci_dev_open() itself failed */
+ if (zdev->kzdev)
+ kvm_s390_pci_dev_release(zdev);
+ mutex_unlock(&kvm->lock);
+ mutex_unlock(&zdev->kzdev_lock);
+ kvm_put_kvm(kvm);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(kvm_s390_pci_register_kvm);
+
+/*
+ * Undo kvm_s390_pci_register_kvm(): turn off forwarding and interpretation
+ * for @zdev (if they were enabled), remove it from the KVM's kzdev list,
+ * free its kzdev and drop the KVM reference. A NULL @zdev is ignored.
+ */
+void kvm_s390_pci_unregister_kvm(struct zpci_dev *zdev)
+{
+ struct kvm *kvm;
+
+ if (!zdev)
+ return;
+
+ mutex_lock(&zdev->kzdev_lock);
+
+ /* Unregistering a device that was never registered is a caller bug */
+ if (WARN_ON(!zdev->kzdev)) {
+ mutex_unlock(&zdev->kzdev_lock);
+ return;
+ }
+
+ kvm = zdev->kzdev->kvm;
+ mutex_lock(&kvm->lock);
+
+ /*
+ * A 0 gisa means interpretation was never enabled, just remove the
+ * device from the list.
+ */
+ if (zdev->gisa == 0)
+ goto out;
+
+ /* Forwarding must be turned off before interpretation */
+ if (zdev->kzdev->fib.fmt0.aibv != 0)
+ kvm_s390_pci_aif_disable(zdev, true);
+
+ /* Remove the host CLP guest designation */
+ zdev->gisa = 0;
+
+ /* Failures below are not reported; the device is still unlinked */
+ if (zdev_enabled(zdev)) {
+ if (zpci_disable_device(zdev))
+ goto out;
+ }
+
+ if (zpci_enable_device(zdev))
+ goto out;
+
+ /* Re-register the IOMMU that was already created */
+ zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
+ virt_to_phys(zdev->dma_table));
+
+out:
+ spin_lock(&kvm->arch.kzdev_list_lock);
+ list_del(&zdev->kzdev->entry);
+ spin_unlock(&kvm->arch.kzdev_list_lock);
+ kvm_s390_pci_dev_release(zdev);
+
+ mutex_unlock(&kvm->lock);
+ mutex_unlock(&zdev->kzdev_lock);
+
+ /* Pairs with the kvm_get_kvm() in kvm_s390_pci_register_kvm() */
+ kvm_put_kvm(kvm);
+}
+EXPORT_SYMBOL_GPL(kvm_s390_pci_unregister_kvm);
+
+/* Prepare the per-VM list of registered zPCI devices and its lock. */
+void kvm_s390_pci_init_list(struct kvm *kvm)
+{
+	INIT_LIST_HEAD(&kvm->arch.kzdev_list);
+	spin_lock_init(&kvm->arch.kzdev_list_lock);
+}
+
+/* Warn (once) if the per-VM device list still has entries. */
+void kvm_s390_pci_clear_list(struct kvm *kvm)
+{
+	bool leftovers;
+
+	/*
+	 * This list should already be empty, either via vfio device closures
+	 * or kvm fd cleanup.
+	 */
+	spin_lock(&kvm->arch.kzdev_list_lock);
+	leftovers = !list_empty(&kvm->arch.kzdev_list);
+	WARN_ON_ONCE(leftovers);
+	spin_unlock(&kvm->arch.kzdev_list_lock);
+}
+
+/*
+ * Look up the device with function handle @fh among the devices registered
+ * with @kvm. Returns the zpci_dev or NULL if there is no match.
+ */
+static struct zpci_dev *get_zdev_from_kvm_by_fh(struct kvm *kvm, u32 fh)
+{
+	struct zpci_dev *match = NULL;
+	struct kvm_zdev *cur;
+
+	spin_lock(&kvm->arch.kzdev_list_lock);
+	list_for_each_entry(cur, &kvm->arch.kzdev_list, entry) {
+		if (cur->zdev->fh != fh)
+			continue;
+		match = cur->zdev;
+		break;
+	}
+	spin_unlock(&kvm->arch.kzdev_list_lock);
+
+	return match;
+}
+
+/*
+ * Translate the KVM_S390_ZPCIOP_REG_AEN arguments into a FIB and enable
+ * forwarding. Assist is requested unless userspace set
+ * KVM_S390_ZPCIOP_REGAEN_HOST.
+ */
+static int kvm_s390_pci_zpci_reg_aen(struct zpci_dev *zdev,
+				     struct kvm_s390_zpci_op *args)
+{
+	struct zpci_fib fib = {};
+	bool assist;
+
+	fib.fmt0.aibv = args->u.reg_aen.ibv;
+	fib.fmt0.isc = args->u.reg_aen.isc;
+	fib.fmt0.noi = args->u.reg_aen.noi;
+
+	/* A summary bit is optional; without one all three fields stay zero */
+	if (args->u.reg_aen.sb == 0) {
+		fib.fmt0.aisb = 0;
+		fib.fmt0.aisbo = 0;
+		fib.fmt0.sum = 0;
+	} else {
+		fib.fmt0.aisb = args->u.reg_aen.sb;
+		fib.fmt0.aisbo = args->u.reg_aen.sbo;
+		fib.fmt0.sum = 1;
+	}
+
+	assist = !(args->u.reg_aen.flags & KVM_S390_ZPCIOP_REGAEN_HOST);
+
+	return kvm_s390_pci_aif_enable(zdev, &fib, assist);
+}
+
+/*
+ * Dispatch a zPCI operation requested by userspace for the device with
+ * function handle args->fh.
+ *
+ * Returns -ENODEV if the device is unknown to @kvm or has no kzdev, -EPERM if
+ * it belongs to a different KVM, -EINVAL for unknown operations or flags,
+ * otherwise the result of the requested operation.
+ */
+int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args)
+{
+	struct zpci_dev *zdev = get_zdev_from_kvm_by_fh(kvm, args->fh);
+	struct kvm_zdev *kzdev;
+	int r;
+
+	if (!zdev)
+		return -ENODEV;
+
+	mutex_lock(&zdev->kzdev_lock);
+	mutex_lock(&kvm->lock);
+
+	kzdev = zdev->kzdev;
+	if (!kzdev) {
+		r = -ENODEV;
+	} else if (kzdev->kvm != kvm) {
+		r = -EPERM;
+	} else if (args->op == KVM_S390_ZPCIOP_REG_AEN) {
+		/* Fail on unknown flags */
+		if (args->u.reg_aen.flags & ~KVM_S390_ZPCIOP_REGAEN_HOST)
+			r = -EINVAL;
+		else
+			r = kvm_s390_pci_zpci_reg_aen(zdev, args);
+	} else if (args->op == KVM_S390_ZPCIOP_DEREG_AEN) {
+		r = kvm_s390_pci_aif_disable(zdev, false);
+	} else {
+		r = -EINVAL;
+	}
+
+	mutex_unlock(&kvm->lock);
+	mutex_unlock(&zdev->kzdev_lock);
+	return r;
+}
+
+/* Allocate the global AIFT and initialize its locks. Returns 0 or -ENOMEM. */
+int kvm_s390_pci_init(void)
+{
+	aift = kzalloc(sizeof(*aift), GFP_KERNEL);
+	if (aift == NULL)
+		return -ENOMEM;
+
+	mutex_init(&aift->aift_lock);
+	spin_lock_init(&aift->gait_lock);
+
+	return 0;
+}
+
+/* Tear down and free the global AIFT allocated by kvm_s390_pci_init(). */
+void kvm_s390_pci_exit(void)
+{
+	mutex_destroy(&aift->aift_lock);
+
+	kfree(aift);
+	/* The pointer is global; do not leave it referring to freed memory */
+	aift = NULL;
+}
diff --git a/arch/s390/kvm/pci.h b/arch/s390/kvm/pci.h
new file mode 100644
index 000000000000..3a3606c3a0fe
--- /dev/null
+++ b/arch/s390/kvm/pci.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * s390 kvm PCI passthrough support
+ *
+ * Copyright IBM Corp. 2022
+ *
+ * Author(s): Matthew Rosato <mjrosato@linux.ibm.com>
+ */
+
+#ifndef __KVM_S390_PCI_H
+#define __KVM_S390_PCI_H
+
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <asm/airq.h>
+#include <asm/cpu.h>
+
+/* KVM-side state of a zPCI device that is registered with a VM. */
+struct kvm_zdev {
+ struct zpci_dev *zdev; /* back pointer to the host device */
+ struct kvm *kvm; /* VM the device is registered with */
+ struct zpci_fib fib; /* fib values (isc, aibv) saved when forwarding is enabled */
+ struct list_head entry; /* link in kvm->arch.kzdev_list */
+};
+
+/*
+ * One entry of the GAIT, filled in by kvm_s390_pci_aif_enable().
+ * NOTE(review): presumably this layout is dictated by firmware - confirm
+ * before changing field order or sizes.
+ */
+struct zpci_gaite {
+ u32 gisa; /* physical address of the guest's GISA, or 0 if the host takes all alerts */
+ u8 gisc; /* guest ISC */
+ u8 count; /* number of users of this entry */
+ u8 reserved;
+ u8 aisbo; /* guest AISB bit offset */
+ u64 aisb; /* host physical address of the pinned guest AISB */
+};
+
+/* Host-wide bookkeeping for adapter interruption forwarding. */
+struct zpci_aift {
+ struct zpci_gaite *gait; /* table of zpci_gaite entries */
+ struct airq_iv *sbv; /* summary bit vector; one bit is allocated per forwarding device */
+ struct kvm_zdev **kzdev; /* kzdev lookup array, indexed by summary bit number */
+ spinlock_t gait_lock; /* Protects the gait, used during AEN forward */
+ struct mutex aift_lock; /* Protects the other structures in aift */
+};
+
+extern struct zpci_aift *aift;
+
+/*
+ * Map summary bit index @si to the KVM that owns the forwarding device.
+ * Returns NULL if CONFIG_VFIO_PCI_ZDEV_KVM is disabled, if the lookup array
+ * does not exist or if no device is registered for @si.
+ */
+static inline struct kvm *kvm_s390_pci_si_to_kvm(struct zpci_aift *aift,
+						 unsigned long si)
+{
+	if (!IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM) || !aift->kzdev ||
+	    !aift->kzdev[si])
+		return NULL;
+	return aift->kzdev[si]->kvm;
+}
+
+int kvm_s390_pci_aen_init(u8 nisc);
+void kvm_s390_pci_aen_exit(void);
+
+void kvm_s390_pci_init_list(struct kvm *kvm);
+void kvm_s390_pci_clear_list(struct kvm *kvm);
+
+int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args);
+
+int kvm_s390_pci_init(void);
+void kvm_s390_pci_exit(void);
+
+/*
+ * Report whether zPCI interpretation may be used on this host: never on the
+ * machine types listed below, otherwise only with CONFIG_VFIO_PCI_ZDEV_KVM
+ * and all of the required sclp facilities.
+ */
+static inline bool kvm_s390_pci_interp_allowed(void)
+{
+	struct cpuid cpu_id;
+
+	get_cpu_id(&cpu_id);
+	switch (cpu_id.machine) {
+	case 0x2817:
+	case 0x2818:
+	case 0x2827:
+	case 0x2828:
+	case 0x2964:
+	case 0x2965:
+		/* No SHM on certain machines */
+		return false;
+	default:
+		break;
+	}
+
+	if (!IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM))
+		return false;
+
+	return sclp.has_zpci_lsi && sclp.has_aeni && sclp.has_aisi &&
+	       sclp.has_aisii;
+}
+
+#endif /* __KVM_S390_PCI_H */
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 83bb5cf97282..3335fa09b6f1 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -442,7 +442,7 @@ static int handle_ipte_interlock(struct kvm_vcpu *vcpu)
vcpu->stat.instruction_ipte_interlock++;
if (psw_bits(vcpu->arch.sie_block->gpsw).pstate)
return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
- wait_event(vcpu->kvm->arch.ipte_wq, !ipte_lock_held(vcpu));
+ wait_event(vcpu->kvm->arch.ipte_wq, !ipte_lock_held(vcpu->kvm));
kvm_s390_retry_instr(vcpu);
VCPU_EVENT(vcpu, 4, "%s", "retrying ipte interlock operation");
return 0;
@@ -873,10 +873,18 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
- if (fc > 3) {
- kvm_s390_set_psw_cc(vcpu, 3);
- return 0;
- }
+ /* Bailout forbidden function codes */
+ if (fc > 3 && fc != 15)
+ goto out_no_data;
+
+ /*
+ * fc 15 is provided only with
+ * - PTF/CPU topology support through facility 11
+ * - KVM_CAP_S390_USER_STSI
+ */
+ if (fc == 15 && (!test_kvm_facility(vcpu->kvm, 11) ||
+ !vcpu->kvm->arch.user_stsi))
+ goto out_no_data;
if (vcpu->run->s.regs.gprs[0] & 0x0fffff00
|| vcpu->run->s.regs.gprs[1] & 0xffff0000)
@@ -910,6 +918,10 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
goto out_no_data;
handle_stsi_3_2_2(vcpu, (void *) mem);
break;
+ case 15: /* fc 15 is fully handled in userspace */
+ insert_stsi_usr_data(vcpu, operand2, ar, fc, sel1, sel2);
+ trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2);
+ return -EREMOTE;
}
if (kvm_s390_pv_cpu_is_protected(vcpu)) {
memcpy((void *)sida_origin(vcpu->arch.sie_block), (void *)mem,
@@ -1471,7 +1483,7 @@ static int handle_tprot(struct kvm_vcpu *vcpu)
access_key = (operand2 & 0xf0) >> 4;
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT)
- ipte_lock(vcpu);
+ ipte_lock(vcpu->kvm);
ret = guest_translate_address_with_key(vcpu, address, ar, &gpa,
GACC_STORE, access_key);
@@ -1508,7 +1520,7 @@ static int handle_tprot(struct kvm_vcpu *vcpu)
}
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT)
- ipte_unlock(vcpu);
+ ipte_unlock(vcpu->kvm);
return ret;
}
diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
index cc7c9599f43e..7cb7799a0acb 100644
--- a/arch/s390/kvm/pv.c
+++ b/arch/s390/kvm/pv.c
@@ -7,13 +7,25 @@
*/
#include <linux/kvm.h>
#include <linux/kvm_host.h>
+#include <linux/minmax.h>
#include <linux/pagemap.h>
#include <linux/sched/signal.h>
#include <asm/gmap.h>
#include <asm/uv.h>
#include <asm/mman.h>
+#include <linux/pagewalk.h>
+#include <linux/sched/mm.h>
+#include <linux/mmu_notifier.h>
#include "kvm-s390.h"
+/*
+ * Reset the protected-VM bookkeeping after the VM's secure resources were
+ * freed. Fields are cleared individually (instead of wiping the whole
+ * structure) because kvm->arch.pv also embeds the mmu notifier, which has to
+ * survive. The dumping flag is reset as well so that a dump that was in
+ * progress does not outlive the destroyed configuration.
+ */
+static void kvm_s390_clear_pv_state(struct kvm *kvm)
+{
+	kvm->arch.pv.handle = 0;
+	kvm->arch.pv.guest_len = 0;
+	kvm->arch.pv.stor_base = 0;
+	kvm->arch.pv.stor_var = NULL;
+	kvm->arch.pv.dumping = false;
+}
+
int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc)
{
int cc;
@@ -108,7 +120,7 @@ static void kvm_s390_pv_dealloc_vm(struct kvm *kvm)
vfree(kvm->arch.pv.stor_var);
free_pages(kvm->arch.pv.stor_base,
get_order(uv_info.guest_base_stor_len));
- memset(&kvm->arch.pv, 0, sizeof(kvm->arch.pv));
+ kvm_s390_clear_pv_state(kvm);
}
static int kvm_s390_pv_alloc_vm(struct kvm *kvm)
@@ -152,21 +164,51 @@ int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
{
int cc;
- /* make all pages accessible before destroying the guest */
- s390_reset_acc(kvm->mm);
-
cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
UVC_CMD_DESTROY_SEC_CONF, rc, rrc);
WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
- atomic_set(&kvm->mm->context.is_protected, 0);
+ /*
+ * if the mm still has a mapping, make all its pages accessible
+ * before destroying the guest
+ */
+ if (mmget_not_zero(kvm->mm)) {
+ s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE);
+ mmput(kvm->mm);
+ }
+
+ if (!cc) {
+ atomic_dec(&kvm->mm->context.protected_count);
+ kvm_s390_pv_dealloc_vm(kvm);
+ } else {
+ /* Intended memory leak on "impossible" error */
+ s390_replace_asce(kvm->arch.gmap);
+ }
KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM: rc %x rrc %x", *rc, *rrc);
WARN_ONCE(cc, "protvirt destroy vm failed rc %x rrc %x", *rc, *rrc);
- /* Inteded memory leak on "impossible" error */
- if (!cc)
- kvm_s390_pv_dealloc_vm(kvm);
+
return cc ? -EIO : 0;
}
+/*
+ * mmu notifier ->release callback: convert all vcpus of the owning VM from
+ * protected mode via kvm_s390_cpus_from_pv(). The return codes are discarded.
+ */
+static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription,
+					     struct mm_struct *mm)
+{
+	struct kvm *kvm;
+	u16 dummy;
+
+	kvm = container_of(subscription, struct kvm, arch.pv.mmu_notifier);
+
+	/*
+	 * No locking is needed since this is the last thread of the last user of this
+	 * struct mm.
+	 * When the struct kvm gets deinitialized, this notifier is also
+	 * unregistered. This means that if this notifier runs, then the
+	 * struct kvm is still valid.
+	 */
+	kvm_s390_cpus_from_pv(kvm, &dummy, &dummy);
+}
+
+/* Notifier ops for protected VMs; only the ->release callback is provided. */
+static const struct mmu_notifier_ops kvm_s390_pv_mmu_notifier_ops = {
+ .release = kvm_s390_pv_mmu_notifier_release,
+};
+
int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
{
struct uv_cb_cgc uvcb = {
@@ -197,14 +239,22 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
/* Outputs */
kvm->arch.pv.handle = uvcb.guest_handle;
+ atomic_inc(&kvm->mm->context.protected_count);
if (cc) {
- if (uvcb.header.rc & UVC_RC_NEED_DESTROY)
+ if (uvcb.header.rc & UVC_RC_NEED_DESTROY) {
kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy);
- else
+ } else {
+ atomic_dec(&kvm->mm->context.protected_count);
kvm_s390_pv_dealloc_vm(kvm);
+ }
return -EIO;
}
kvm->arch.gmap->guest_handle = uvcb.guest_handle;
+ /* Add the notifier only once. No races because we hold kvm->lock */
+ if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) {
+ kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops;
+ mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm);
+ }
return 0;
}
@@ -224,8 +274,6 @@ int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc,
*rrc = uvcb.header.rrc;
KVM_UV_EVENT(kvm, 3, "PROTVIRT VM SET PARMS: rc %x rrc %x",
*rc, *rrc);
- if (!cc)
- atomic_set(&kvm->mm->context.is_protected, 1);
return cc ? -EINVAL : 0;
}
@@ -298,3 +346,200 @@ int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state)
return -EINVAL;
return 0;
}
+
+/*
+ * Issue UVC_CMD_DUMP_CPU for @vcpu with @buff as the dump area. The uvcb
+ * return code and return reason code are stored in @rc and @rrc; the
+ * condition code of the UV call is returned.
+ */
+int kvm_s390_pv_dump_cpu(struct kvm_vcpu *vcpu, void *buff, u16 *rc, u16 *rrc)
+{
+	struct uv_cb_dump_cpu uvcb = {
+		.header.cmd = UVC_CMD_DUMP_CPU,
+		.header.len = sizeof(uvcb),
+		.cpu_handle = vcpu->arch.pv.handle,
+		.dump_area_origin = (u64)buff,
+	};
+	int cc = uv_call_sched(0, (u64)&uvcb);
+
+	*rrc = uvcb.header.rrc;
+	*rc = uvcb.header.rc;
+
+	return cc;
+}
+
+/* Size of the cache for the storage state dump data. 1MB for now */
+#define DUMP_BUFF_LEN HPAGE_SIZE
+
+/**
+ * kvm_s390_pv_dump_stor_state
+ *
+ * @kvm: pointer to the guest's KVM struct
+ * @buff_user: Userspace pointer where we will write the results to
+ * @gaddr: Starting absolute guest address for which the storage state
+ * is requested.
+ * @buff_user_len: Length of the buff_user buffer
+ * @rc: Pointer to where the uvcb return code is stored
+ * @rrc: Pointer to where the uvcb return reason code is stored
+ *
+ * Stores buff_user_len bytes of tweak component values to buff_user
+ * starting with the 1MB block specified by the absolute guest address
+ * (gaddr). The gaddr pointer will be updated with the last address
+ * for which data was written when returning to userspace. buff_user
+ * might be written to even if an error rc is returned. For instance
+ * if we encounter a fault after writing the first page of data.
+ *
+ * Context: kvm->lock needs to be held
+ *
+ * Return:
+ * 0 on success
+ * -ENOMEM if allocating the cache fails
+ * -EINVAL if gaddr is not aligned to 1MB
+ * -EINVAL if buff_user_len is not aligned to uv_info.conf_dump_storage_state_len
+ * -EINVAL if the UV call fails, rc and rrc will be set in this case
+ * -EFAULT if copying the result to buff_user failed
+ */
+int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user,
+ u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc)
+{
+ struct uv_cb_dump_stor_state uvcb = {
+ .header.cmd = UVC_CMD_DUMP_CONF_STOR_STATE,
+ .header.len = sizeof(uvcb),
+ .config_handle = kvm->arch.pv.handle,
+ .gaddr = *gaddr,
+ .dump_area_origin = 0,
+ };
+ /* Amount of dump data the UV produces per call */
+ const u64 increment_len = uv_info.conf_dump_storage_state_len;
+ size_t buff_kvm_size;
+ /* Bytes currently buffered in buff_kvm and not yet copied to userspace */
+ size_t size_done = 0;
+ u8 *buff_kvm = NULL;
+ int cc, ret;
+
+ ret = -EINVAL;
+ /* UV call processes 1MB guest storage chunks at a time */
+ if (!IS_ALIGNED(*gaddr, HPAGE_SIZE))
+ goto out;
+
+ /*
+ * We provide the storage state for 1MB chunks of guest
+ * storage. The buffer will need to be aligned to
+ * conf_dump_storage_state_len so we don't end on a partial
+ * chunk.
+ */
+ if (!buff_user_len ||
+ !IS_ALIGNED(buff_user_len, increment_len))
+ goto out;
+
+ /*
+ * Allocate a buffer from which we will later copy to the user
+ * process. We don't want userspace to dictate our buffer size
+ * so we limit it to DUMP_BUFF_LEN.
+ */
+ ret = -ENOMEM;
+ buff_kvm_size = min_t(u64, buff_user_len, DUMP_BUFF_LEN);
+ buff_kvm = vzalloc(buff_kvm_size);
+ if (!buff_kvm)
+ goto out;
+
+ ret = 0;
+ uvcb.dump_area_origin = (u64)buff_kvm;
+ /* We will loop until the user buffer is filled or an error occurs */
+ do {
+ /* Get 1MB worth of guest storage state data */
+ cc = uv_call_sched(0, (u64)&uvcb);
+
+ /* All or nothing */
+ if (cc) {
+ ret = -EINVAL;
+ break;
+ }
+
+ /* Advance the kernel buffer position and the guest address */
+ size_done += increment_len;
+ uvcb.dump_area_origin += increment_len;
+ buff_user_len -= increment_len;
+ uvcb.gaddr += HPAGE_SIZE;
+
+ /* KVM Buffer full, time to copy to the process */
+ if (!buff_user_len || size_done == DUMP_BUFF_LEN) {
+ if (copy_to_user(buff_user, buff_kvm, size_done)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ /* Start refilling the kernel buffer from the beginning */
+ buff_user += size_done;
+ size_done = 0;
+ uvcb.dump_area_origin = (u64)buff_kvm;
+ }
+ } while (buff_user_len);
+
+ /* Report back where we ended dumping */
+ *gaddr = uvcb.gaddr;
+
+ /* Lets only log errors, we don't want to spam */
+out:
+ if (ret)
+ KVM_UV_EVENT(kvm, 3,
+ "PROTVIRT DUMP STORAGE STATE: addr %llx ret %d, uvcb rc %x rrc %x",
+ uvcb.gaddr, ret, uvcb.header.rc, uvcb.header.rrc);
+ /* rc/rrc are reported on every path, including the early exits */
+ *rc = uvcb.header.rc;
+ *rrc = uvcb.header.rrc;
+ vfree(buff_kvm);
+
+ return ret;
+}
+
+/**
+ * kvm_s390_pv_dump_complete
+ *
+ * @kvm: pointer to the guest's KVM struct
+ * @buff_user: Userspace pointer where we will write the results to
+ * @rc: Pointer to where the uvcb return code is stored
+ * @rrc: Pointer to where the uvcb return reason code is stored
+ *
+ * Completes the dumping operation and writes the completion data to
+ * user space.
+ *
+ * Context: kvm->lock needs to be held
+ *
+ * Return:
+ * 0 on success
+ * -ENOMEM if allocating the completion buffer fails
+ * -EINVAL if the UV call fails, rc and rrc will be set in this case
+ * -EFAULT if copying the result to buff_user failed
+ */
+int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user,
+			      u16 *rc, u16 *rrc)
+{
+	struct uv_cb_dump_complete complete = {
+		.header.len = sizeof(complete),
+		.header.cmd = UVC_CMD_DUMP_COMPLETE,
+		.config_handle = kvm_s390_pv_get_handle(kvm),
+	};
+	u64 *compl_data;
+	int ret;
+
+	/* Allocate dump area */
+	compl_data = vzalloc(uv_info.conf_dump_finalize_len);
+	if (!compl_data)
+		return -ENOMEM;
+	complete.dump_area_origin = (u64)compl_data;
+
+	ret = uv_call_sched(0, (u64)&complete);
+	*rc = complete.header.rc;
+	*rrc = complete.header.rrc;
+	KVM_UV_EVENT(kvm, 3, "PROTVIRT DUMP COMPLETE: rc %x rrc %x",
+		     complete.header.rc, complete.header.rrc);
+
+	if (ret) {
+		/* If the UVC returned an error, translate it to -EINVAL */
+		if (ret > 0)
+			ret = -EINVAL;
+		goto out_free;
+	}
+
+	/* The dump is finished: leave dump mode and let the vcpus run again */
+	kvm->arch.pv.dumping = false;
+	kvm_s390_vcpu_unblock_all(kvm);
+	if (copy_to_user(buff_user, compl_data, uv_info.conf_dump_finalize_len))
+		ret = -EFAULT;
+
+out_free:
+	vfree(compl_data);
+	return ret;
+}
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index 8aaee2892ec3..cb747bf6c798 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -480,9 +480,9 @@ int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu)
struct kvm_vcpu *dest_vcpu;
u8 order_code = kvm_s390_get_base_disp_rs(vcpu, NULL);
- trace_kvm_s390_handle_sigp_pei(vcpu, order_code, cpu_addr);
-
if (order_code == SIGP_EXTERNAL_CALL) {
+ trace_kvm_s390_handle_sigp_pei(vcpu, order_code, cpu_addr);
+
dest_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, cpu_addr);
BUG_ON(dest_vcpu == NULL);
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index dada78b92691..94138f8f0c1c 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -503,6 +503,14 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
/* Host-protection-interruption introduced with ESOP */
if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP))
scb_s->ecb |= scb_o->ecb & ECB_HOSTPROTINT;
+ /*
+ * CPU Topology
+ * This facility only uses the utility field of the SCA and none of
+ * the cpu entries that are problematic with the other interpretation
+ * facilities so we can pass it through
+ */
+ if (test_kvm_facility(vcpu->kvm, 11))
+ scb_s->ecb |= scb_o->ecb & ECB_PTF;
/* transactional execution */
if (test_kvm_facility(vcpu->kvm, 73) && wants_tx) {
/* remap the prefix is tx is toggled on */
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index e173b6187ad5..ee7871f770fb 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -754,6 +754,7 @@ void do_secure_storage_access(struct pt_regs *regs)
struct vm_area_struct *vma;
struct mm_struct *mm;
struct page *page;
+ struct gmap *gmap;
int rc;
/*
@@ -783,6 +784,17 @@ void do_secure_storage_access(struct pt_regs *regs)
}
switch (get_fault_type(regs)) {
+ case GMAP_FAULT:
+ mm = current->mm;
+ gmap = (struct gmap *)S390_lowcore.gmap;
+ mmap_read_lock(mm);
+ addr = __gmap_translate(gmap, addr);
+ mmap_read_unlock(mm);
+ if (IS_ERR_VALUE(addr)) {
+ do_fault_error(regs, VM_ACCESS_FLAGS, VM_FAULT_BADMAP);
+ break;
+ }
+ fallthrough;
case USER_FAULT:
mm = current->mm;
mmap_read_lock(mm);
@@ -811,7 +823,6 @@ void do_secure_storage_access(struct pt_regs *regs)
if (rc)
BUG();
break;
- case GMAP_FAULT:
default:
do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
WARN_ON_ONCE(1);
@@ -837,6 +848,16 @@ NOKPROBE_SYMBOL(do_non_secure_storage_access);
void do_secure_storage_violation(struct pt_regs *regs)
{
+ unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK;
+ struct gmap *gmap = (struct gmap *)S390_lowcore.gmap;
+
+ /*
+ * If the VM has been rebooted, its address space might still contain
+ * secure pages from the previous boot.
+ * Clear the page so it can be reused.
+ */
+ if (!gmap_destroy_page(gmap, gaddr))
+ return;
/*
* Either KVM messed up the secure guest mapping or the same
* page is mapped into multiple secure guests.
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index b8ae4a4aa2ba..62758cb5872f 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -2697,41 +2697,168 @@ void s390_reset_cmma(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(s390_reset_cmma);
+#define GATHER_GET_PAGES 32
+
+struct reset_walk_state {
+ unsigned long next;
+ unsigned long count;
+ unsigned long pfns[GATHER_GET_PAGES];
+};
+
+static int s390_gather_pages(pte_t *ptep, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct reset_walk_state *p = walk->private;
+ pte_t pte = READ_ONCE(*ptep);
+
+ if (pte_present(pte)) {
+ /* we have a reference from the mapping, take an extra one */
+ get_page(phys_to_page(pte_val(pte)));
+ p->pfns[p->count] = phys_to_pfn(pte_val(pte));
+ p->next = next;
+ p->count++;
+ }
+ return p->count >= GATHER_GET_PAGES;
+}
+
+static const struct mm_walk_ops gather_pages_ops = {
+ .pte_entry = s390_gather_pages,
+};
+
/*
- * make inaccessible pages accessible again
+ * Call the Destroy secure page UVC on each page in the given array of PFNs.
+ * Each page needs to have an extra reference, which will be released here.
*/
-static int __s390_reset_acc(pte_t *ptep, unsigned long addr,
- unsigned long next, struct mm_walk *walk)
+void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns)
{
- pte_t pte = READ_ONCE(*ptep);
+ unsigned long i;
- /* There is a reference through the mapping */
- if (pte_present(pte))
- WARN_ON_ONCE(uv_destroy_owned_page(pte_val(pte) & PAGE_MASK));
+ for (i = 0; i < count; i++) {
+ /* we always have an extra reference */
+ uv_destroy_owned_page(pfn_to_phys(pfns[i]));
+ /* get rid of the extra reference */
+ put_page(pfn_to_page(pfns[i]));
+ cond_resched();
+ }
+}
+EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns);
+/**
+ * __s390_uv_destroy_range - Call the destroy secure page UVC on each page
+ * in the given range of the given address space.
+ * @mm: the mm to operate on
+ * @start: the start of the range
+ * @end: the end of the range
+ * @interruptible: if true, stop when a fatal signal is pending
+ *
+ * Walk the given range of the given address space and call the destroy
+ * secure page UVC on each page. Optionally exit early if a fatal signal is
+ * pending.
+ *
+ * Return: 0 on success, -EINTR if the function stopped before completing
+ */
+int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, bool interruptible)
+{
+ struct reset_walk_state state = { .next = start };
+ int r = 1;
+
+ while (r > 0) {
+ state.count = 0;
+ mmap_read_lock(mm);
+ r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state);
+ mmap_read_unlock(mm);
+ cond_resched();
+ s390_uv_destroy_pfns(state.count, state.pfns);
+ if (interruptible && fatal_signal_pending(current))
+ return -EINTR;
+ }
return 0;
}
+EXPORT_SYMBOL_GPL(__s390_uv_destroy_range);
-static const struct mm_walk_ops reset_acc_walk_ops = {
- .pte_entry = __s390_reset_acc,
-};
+/**
+ * s390_unlist_old_asce - Remove the topmost level of page tables from the
+ * list of page tables of the gmap.
+ * @gmap: the gmap whose table is to be removed
+ *
+ * On s390x, KVM keeps a list of all pages containing the page tables of the
+ * gmap (the CRST list). This list is used at tear down time to free all
+ * pages that are no longer needed.
+ *
+ * This function removes the topmost page of the tree (the one pointed to by
+ * the ASCE) from the CRST list.
+ *
+ * This means that it will not be freed when the VM is torn down, and needs
+ * to be handled separately by the caller, unless a leak is actually
+ * intended. Notice that this function will only remove the page from the
+ * list, the page will still be used as a top level page table (and ASCE).
+ */
+void s390_unlist_old_asce(struct gmap *gmap)
+{
+ struct page *old;
-#include <linux/sched/mm.h>
-void s390_reset_acc(struct mm_struct *mm)
+ old = virt_to_page(gmap->table);
+ spin_lock(&gmap->guest_table_lock);
+ list_del(&old->lru);
+ /*
+ * Sometimes the topmost page might need to be "removed" multiple
+ * times, for example if the VM is rebooted into secure mode several
+ * times concurrently, or if s390_replace_asce fails after calling
+ * s390_unlist_old_asce and is attempted again later. In that case
+ * the old asce has been removed from the list, and therefore it
+ * will not be freed when the VM terminates, but the ASCE is still
+ * in use and still pointed to.
+ * A subsequent call to s390_replace_asce will follow the pointer and try
+ * to remove the same page from the list again.
+ * Therefore it's necessary that the page of the ASCE has valid
+ * pointers, so list_del can work (and do nothing) without
+ * dereferencing stale or invalid pointers.
+ */
+ INIT_LIST_HEAD(&old->lru);
+ spin_unlock(&gmap->guest_table_lock);
+}
+EXPORT_SYMBOL_GPL(s390_unlist_old_asce);
+
+/**
+ * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy
+ * @gmap: the gmap whose ASCE needs to be replaced
+ *
+ * If the allocation of the new top level page table fails, the ASCE is not
+ * replaced.
+ * In any case, the old ASCE is always removed from the gmap CRST list.
+ * Therefore the caller has to make sure to save a pointer to it
+ * beforehand, unless a leak is actually intended.
+ */
+int s390_replace_asce(struct gmap *gmap)
{
- if (!mm_is_protected(mm))
- return;
+ unsigned long asce;
+ struct page *page;
+ void *table;
+
+ s390_unlist_old_asce(gmap);
+
+ page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
+ if (!page)
+ return -ENOMEM;
+ table = page_to_virt(page);
+ memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT));
+
/*
- * we might be called during
- * reset: we walk the pages and clear
- * close of all kvm file descriptors: we walk the pages and clear
- * exit of process on fd closure: vma already gone, do nothing
+ * The caller has to deal with the old ASCE, but here we make sure
+ * the new one is properly added to the CRST list, so that
+ * it will be freed when the VM is torn down.
*/
- if (!mmget_not_zero(mm))
- return;
- mmap_read_lock(mm);
- walk_page_range(mm, 0, TASK_SIZE, &reset_acc_walk_ops, NULL);
- mmap_read_unlock(mm);
- mmput(mm);
+ spin_lock(&gmap->guest_table_lock);
+ list_add(&page->lru, &gmap->crst_list);
+ spin_unlock(&gmap->guest_table_lock);
+
+ /* Set new table origin while preserving existing ASCE control bits */
+ asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table);
+ WRITE_ONCE(gmap->asce, asce);
+ WRITE_ONCE(gmap->mm->context.gmap_asce, asce);
+ WRITE_ONCE(gmap->table, table);
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(s390_reset_acc);
+EXPORT_SYMBOL_GPL(s390_replace_asce);
diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index bc980fd313d5..73cdc5539384 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -61,6 +61,12 @@ DEFINE_STATIC_KEY_FALSE(have_mio);
static struct kmem_cache *zdev_fmb_cache;
+/* AEN structures that must be preserved over KVM module re-insertion */
+union zpci_sic_iib *zpci_aipb;
+EXPORT_SYMBOL_GPL(zpci_aipb);
+struct airq_iv *zpci_aif_sbv;
+EXPORT_SYMBOL_GPL(zpci_aif_sbv);
+
struct zpci_dev *get_zdev_by_fid(u32 fid)
{
struct zpci_dev *tmp, *zdev = NULL;
@@ -120,11 +126,13 @@ int zpci_register_ioat(struct zpci_dev *zdev, u8 dmaas,
fib.pba = base;
fib.pal = limit;
fib.iota = iota | ZPCI_IOTA_RTTO_FLAG;
+ fib.gd = zdev->gisa;
cc = zpci_mod_fc(req, &fib, &status);
if (cc)
zpci_dbg(3, "reg ioat fid:%x, cc:%d, status:%d\n", zdev->fid, cc, status);
return cc;
}
+EXPORT_SYMBOL_GPL(zpci_register_ioat);
/* Modify PCI: Unregister I/O address translation parameters */
int zpci_unregister_ioat(struct zpci_dev *zdev, u8 dmaas)
@@ -133,6 +141,8 @@ int zpci_unregister_ioat(struct zpci_dev *zdev, u8 dmaas)
struct zpci_fib fib = {0};
u8 cc, status;
+ fib.gd = zdev->gisa;
+
cc = zpci_mod_fc(req, &fib, &status);
if (cc)
zpci_dbg(3, "unreg ioat fid:%x, cc:%d, status:%d\n", zdev->fid, cc, status);
@@ -160,6 +170,7 @@ int zpci_fmb_enable_device(struct zpci_dev *zdev)
atomic64_set(&zdev->unmapped_pages, 0);
fib.fmb_addr = virt_to_phys(zdev->fmb);
+ fib.gd = zdev->gisa;
cc = zpci_mod_fc(req, &fib, &status);
if (cc) {
kmem_cache_free(zdev_fmb_cache, zdev->fmb);
@@ -178,6 +189,8 @@ int zpci_fmb_disable_device(struct zpci_dev *zdev)
if (!zdev->fmb)
return -EINVAL;
+ fib.gd = zdev->gisa;
+
/* Function measurement is disabled if fmb address is zero */
cc = zpci_mod_fc(req, &fib, &status);
if (cc == 3) /* Function already gone. */
@@ -700,6 +713,7 @@ int zpci_enable_device(struct zpci_dev *zdev)
zpci_update_fh(zdev, fh);
return rc;
}
+EXPORT_SYMBOL_GPL(zpci_enable_device);
int zpci_disable_device(struct zpci_dev *zdev)
{
@@ -723,6 +737,7 @@ int zpci_disable_device(struct zpci_dev *zdev)
}
return rc;
}
+EXPORT_SYMBOL_GPL(zpci_disable_device);
/**
* zpci_hot_reset_device - perform a reset of the given zPCI function
@@ -816,6 +831,7 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state)
kref_init(&zdev->kref);
mutex_init(&zdev->lock);
+ mutex_init(&zdev->kzdev_lock);
rc = zpci_init_iommu(zdev);
if (rc)
diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c
index 375e0a5120bc..ee367798e388 100644
--- a/arch/s390/pci/pci_clp.c
+++ b/arch/s390/pci/pci_clp.c
@@ -106,6 +106,8 @@ static void clp_store_query_pci_fngrp(struct zpci_dev *zdev,
zdev->max_msi = response->noi;
zdev->fmb_update = response->mui;
zdev->version = response->version;
+ zdev->maxstbl = response->maxstbl;
+ zdev->dtsm = response->dtsm;
switch (response->version) {
case 1:
@@ -229,12 +231,16 @@ static int clp_set_pci_fn(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as, u8 comma
{
struct clp_req_rsp_set_pci *rrb;
int rc, retries = 100;
+ u32 gisa = 0;
*fh = 0;
rrb = clp_alloc_block(GFP_KERNEL);
if (!rrb)
return -ENOMEM;
+ if (command != CLP_SET_DISABLE_PCI_FN)
+ gisa = zdev->gisa;
+
do {
memset(rrb, 0, sizeof(*rrb));
rrb->request.hdr.len = sizeof(rrb->request);
@@ -243,6 +249,7 @@ static int clp_set_pci_fn(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as, u8 comma
rrb->request.fh = zdev->fh;
rrb->request.oc = command;
rrb->request.ndas = nr_dma_as;
+ rrb->request.gisa = gisa;
rc = clp_req(rrb, CLP_LPS_PCI);
if (rrb->response.hdr.rsp == CLP_RC_SETPCIFN_BUSY) {
diff --git a/arch/s390/pci/pci_insn.c b/arch/s390/pci/pci_insn.c
index 1a822b7799f8..56480be48244 100644
--- a/arch/s390/pci/pci_insn.c
+++ b/arch/s390/pci/pci_insn.c
@@ -92,6 +92,7 @@ u8 zpci_mod_fc(u64 req, struct zpci_fib *fib, u8 *status)
return cc;
}
+EXPORT_SYMBOL_GPL(zpci_mod_fc);
/* Refresh PCI Translations */
static inline u8 __rpcit(u64 fn, u64 addr, u64 range, u8 *status)
@@ -138,7 +139,7 @@ int zpci_refresh_trans(u64 fn, u64 addr, u64 range)
}
/* Set Interruption Controls */
-int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib)
+int zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib)
{
if (!test_facility(72))
return -EIO;
@@ -149,6 +150,7 @@ int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib)
return 0;
}
+EXPORT_SYMBOL_GPL(zpci_set_irq_ctrl);
/* PCI Load */
static inline int ____pcilg(u64 *data, u64 req, u64 offset, u8 *status)
diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c
index 500cd2dbdf53..a2b42a63a53b 100644
--- a/arch/s390/pci/pci_irq.c
+++ b/arch/s390/pci/pci_irq.c
@@ -11,16 +11,10 @@
#include <asm/isc.h>
#include <asm/airq.h>
+#include <asm/tpi.h>
static enum {FLOATING, DIRECTED} irq_delivery;
-#define SIC_IRQ_MODE_ALL 0
-#define SIC_IRQ_MODE_SINGLE 1
-#define SIC_IRQ_MODE_DIRECT 4
-#define SIC_IRQ_MODE_D_ALL 16
-#define SIC_IRQ_MODE_D_SINGLE 17
-#define SIC_IRQ_MODE_SET_CPU 18
-
/*
* summary bit vector
* FLOATING - summary bit per function
@@ -49,6 +43,7 @@ static int zpci_set_airq(struct zpci_dev *zdev)
fib.fmt0.aibvo = 0; /* each zdev has its own interrupt vector */
fib.fmt0.aisb = virt_to_phys(zpci_sbv->vector) + (zdev->aisb / 64) * 8;
fib.fmt0.aisbo = zdev->aisb & 63;
+ fib.gd = zdev->gisa;
return zpci_mod_fc(req, &fib, &status) ? -EIO : 0;
}
@@ -60,6 +55,8 @@ static int zpci_clear_airq(struct zpci_dev *zdev)
struct zpci_fib fib = {0};
u8 cc, status;
+ fib.gd = zdev->gisa;
+
cc = zpci_mod_fc(req, &fib, &status);
if (cc == 3 || (cc == 1 && status == 24))
/* Function already gone or IRQs already deregistered. */
@@ -78,6 +75,7 @@ static int zpci_set_directed_irq(struct zpci_dev *zdev)
fib.fmt = 1;
fib.fmt1.noi = zdev->msi_nr_irqs;
fib.fmt1.dibvo = zdev->msi_first_bit;
+ fib.gd = zdev->gisa;
return zpci_mod_fc(req, &fib, &status) ? -EIO : 0;
}
@@ -90,6 +88,7 @@ static int zpci_clear_directed_irq(struct zpci_dev *zdev)
u8 cc, status;
fib.fmt = 1;
+ fib.gd = zdev->gisa;
cc = zpci_mod_fc(req, &fib, &status);
if (cc == 3 || (cc == 1 && status == 24))
/* Function already gone or IRQs already deregistered. */
@@ -153,6 +152,7 @@ static struct irq_chip zpci_irq_chip = {
static void zpci_handle_cpu_local_irq(bool rescan)
{
struct airq_iv *dibv = zpci_ibv[smp_processor_id()];
+ union zpci_sic_iib iib = {{0}};
unsigned long bit;
int irqs_on = 0;
@@ -164,7 +164,7 @@ static void zpci_handle_cpu_local_irq(bool rescan)
/* End of second scan with interrupts on. */
break;
/* First scan complete, reenable interrupts. */
- if (zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC))
+ if (zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC, &iib))
break;
bit = 0;
continue;
@@ -192,6 +192,7 @@ static void zpci_handle_remote_irq(void *data)
static void zpci_handle_fallback_irq(void)
{
struct cpu_irq_data *cpu_data;
+ union zpci_sic_iib iib = {{0}};
unsigned long cpu;
int irqs_on = 0;
@@ -202,7 +203,7 @@ static void zpci_handle_fallback_irq(void)
/* End of second scan with interrupts on. */
break;
/* First scan complete, reenable interrupts. */
- if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC))
+ if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib))
break;
cpu = 0;
continue;
@@ -216,8 +217,11 @@ static void zpci_handle_fallback_irq(void)
}
}
-static void zpci_directed_irq_handler(struct airq_struct *airq, bool floating)
+static void zpci_directed_irq_handler(struct airq_struct *airq,
+ struct tpi_info *tpi_info)
{
+ bool floating = !tpi_info->directed_irq;
+
if (floating) {
inc_irq_stat(IRQIO_PCF);
zpci_handle_fallback_irq();
@@ -227,8 +231,10 @@ static void zpci_directed_irq_handler(struct airq_struct *airq, bool floating)
}
}
-static void zpci_floating_irq_handler(struct airq_struct *airq, bool floating)
+static void zpci_floating_irq_handler(struct airq_struct *airq,
+ struct tpi_info *tpi_info)
{
+ union zpci_sic_iib iib = {{0}};
unsigned long si, ai;
struct airq_iv *aibv;
int irqs_on = 0;
@@ -242,7 +248,7 @@ static void zpci_floating_irq_handler(struct airq_struct *airq, bool floating)
/* End of second scan with interrupts on. */
break;
/* First scan complete, reenable interrupts. */
- if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC))
+ if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib))
break;
si = 0;
continue;
@@ -291,7 +297,7 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
zdev->aisb = bit;
/* Create adapter interrupt vector */
- zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK);
+ zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK, NULL);
if (!zdev->aibv)
return -ENOMEM;
@@ -402,11 +408,12 @@ static struct airq_struct zpci_airq = {
static void __init cpu_enable_directed_irq(void *unused)
{
union zpci_sic_iib iib = {{0}};
+ union zpci_sic_iib ziib = {{0}};
iib.cdiib.dibv_addr = (u64) zpci_ibv[smp_processor_id()]->vector;
- __zpci_set_irq_ctrl(SIC_IRQ_MODE_SET_CPU, 0, &iib);
- zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC);
+ zpci_set_irq_ctrl(SIC_IRQ_MODE_SET_CPU, 0, &iib);
+ zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC, &ziib);
}
static int __init zpci_directed_irq_init(void)
@@ -414,14 +421,14 @@ static int __init zpci_directed_irq_init(void)
union zpci_sic_iib iib = {{0}};
unsigned int cpu;
- zpci_sbv = airq_iv_create(num_possible_cpus(), 0);
+ zpci_sbv = airq_iv_create(num_possible_cpus(), 0, NULL);
if (!zpci_sbv)
return -ENOMEM;
iib.diib.isc = PCI_ISC;
iib.diib.nr_cpus = num_possible_cpus();
iib.diib.disb_addr = virt_to_phys(zpci_sbv->vector);
- __zpci_set_irq_ctrl(SIC_IRQ_MODE_DIRECT, 0, &iib);
+ zpci_set_irq_ctrl(SIC_IRQ_MODE_DIRECT, 0, &iib);
zpci_ibv = kcalloc(num_possible_cpus(), sizeof(*zpci_ibv),
GFP_KERNEL);
@@ -436,7 +443,7 @@ static int __init zpci_directed_irq_init(void)
zpci_ibv[cpu] = airq_iv_create(cache_line_size() * BITS_PER_BYTE,
AIRQ_IV_DATA |
AIRQ_IV_CACHELINE |
- (!cpu ? AIRQ_IV_ALLOC : 0));
+ (!cpu ? AIRQ_IV_ALLOC : 0), NULL);
if (!zpci_ibv[cpu])
return -ENOMEM;
}
@@ -453,7 +460,7 @@ static int __init zpci_floating_irq_init(void)
if (!zpci_ibv)
return -ENOMEM;
- zpci_sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC);
+ zpci_sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC, NULL);
if (!zpci_sbv)
goto out_free;
@@ -466,6 +473,7 @@ out_free:
int __init zpci_irq_init(void)
{
+ union zpci_sic_iib iib = {{0}};
int rc;
irq_delivery = sclp.has_dirq ? DIRECTED : FLOATING;
@@ -497,7 +505,7 @@ int __init zpci_irq_init(void)
* Enable floating IRQs (with suppression after one IRQ). When using
* directed IRQs this enables the fallback path.
*/
- zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC);
+ zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib);
return 0;
out_airq:
diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c
index 530dd941d140..cb0aff5c0187 100644
--- a/arch/s390/tools/gen_facilities.c
+++ b/arch/s390/tools/gen_facilities.c
@@ -111,6 +111,7 @@ static struct facility_def facility_defs[] = {
193, /* bear enhancement facility */
194, /* rdp enhancement facility */
196, /* processor activity instrumentation facility */
+ 197, /* processor activity instrumentation extension 1 */
-1 /* END */
}
},