140 files changed, 1298 insertions, 16591 deletions
diff --git a/0001-automation-Remove-clang-8-from-Debian-unstable-conta.patch b/0001-automation-Remove-clang-8-from-Debian-unstable-conta.patch new file mode 100644 index 0000000..765ff82 --- /dev/null +++ b/0001-automation-Remove-clang-8-from-Debian-unstable-conta.patch @@ -0,0 +1,84 @@ +From fa875574b73618daf3bc70e6ff4d342493fa11d9 Mon Sep 17 00:00:00 2001 +From: Anthony PERARD <anthony.perard@citrix.com> +Date: Tue, 21 Feb 2023 16:55:38 +0000 +Subject: [PATCH 01/13] automation: Remove clang-8 from Debian unstable + container + +First, apt complain that it isn't the right way to add keys anymore, +but hopefully that's just a warning. + +Second, we can't install clang-8: +The following packages have unmet dependencies: + clang-8 : Depends: libstdc++-8-dev but it is not installable + Depends: libgcc-8-dev but it is not installable + Depends: libobjc-8-dev but it is not installable + Recommends: llvm-8-dev but it is not going to be installed + Recommends: libomp-8-dev but it is not going to be installed + libllvm8 : Depends: libffi7 (>= 3.3~20180313) but it is not installable +E: Unable to correct problems, you have held broken packages. + +clang on Debian unstable is now version 14.0.6. + +Signed-off-by: Anthony PERARD <anthony.perard@citrix.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +(cherry picked from commit a6b1e2b80fe2053b1c9c9843fb086a668513ea36) +--- + automation/build/debian/unstable-llvm-8.list | 3 --- + automation/build/debian/unstable.dockerfile | 12 ------------ + automation/gitlab-ci/build.yaml | 10 ---------- + 3 files changed, 25 deletions(-) + delete mode 100644 automation/build/debian/unstable-llvm-8.list + +diff --git a/automation/build/debian/unstable-llvm-8.list b/automation/build/debian/unstable-llvm-8.list +deleted file mode 100644 +index dc119fa0b4..0000000000 +--- a/automation/build/debian/unstable-llvm-8.list ++++ /dev/null +@@ -1,3 +0,0 @@ +-# Unstable LLVM 8 repos +-deb http://apt.llvm.org/unstable/ llvm-toolchain-8 main +-deb-src http://apt.llvm.org/unstable/ llvm-toolchain-8 main +diff --git a/automation/build/debian/unstable.dockerfile b/automation/build/debian/unstable.dockerfile +index 9a10ee08d6..1200245c9b 100644 +--- a/automation/build/debian/unstable.dockerfile ++++ b/automation/build/debian/unstable.dockerfile +@@ -51,15 +51,3 @@ RUN apt-get update && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* /tmp/* /var/tmp/* +- +-RUN wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key|apt-key add - +-COPY unstable-llvm-8.list /etc/apt/sources.list.d/ +- +-RUN apt-get update && \ +- apt-get --quiet --yes install \ +- clang-8 \ +- lld-8 \ +- && \ +- apt-get autoremove -y && \ +- apt-get clean && \ +- rm -rf /var/lib/apt/lists* /tmp/* /var/tmp/* +diff --git a/automation/gitlab-ci/build.yaml b/automation/gitlab-ci/build.yaml +index 23ab81d892..6cca2122aa 100644 +--- a/automation/gitlab-ci/build.yaml ++++ b/automation/gitlab-ci/build.yaml +@@ -276,16 +276,6 @@ debian-unstable-clang-debug: + variables: + CONTAINER: debian:unstable + +-debian-unstable-clang-8: +- extends: .clang-8-x86-64-build +- variables: +- CONTAINER: debian:unstable +- +-debian-unstable-clang-8-debug: +- extends: .clang-8-x86-64-build-debug +- variables: +- CONTAINER: debian:unstable +- + debian-unstable-gcc: + extends: .gcc-x86-64-build + variables: +-- +2.40.0 + diff --git a/0001-build-fix-exported-variable-name-CFLAGS_stack_bounda.patch b/0001-build-fix-exported-variable-name-CFLAGS_stack_bounda.patch deleted file mode 100644 index 4b643e1..0000000 
--- a/0001-build-fix-exported-variable-name-CFLAGS_stack_bounda.patch +++ /dev/null @@ -1,68 +0,0 @@ -From f6e26ce7d9317abc41130ead6dc2443a7e2dde00 Mon Sep 17 00:00:00 2001 -From: Anthony PERARD <anthony.perard@citrix.com> -Date: Tue, 12 Jul 2022 11:20:46 +0200 -Subject: [PATCH 001/126] build: fix exported variable name - CFLAGS_stack_boundary - -Exporting a variable with a dash doesn't work reliably, they may be -striped from the environment when calling a sub-make or sub-shell. - -CFLAGS-stack-boundary start to be removed from env in patch "build: -set ALL_OBJS in main Makefile; move prelink.o to main Makefile" when -running `make "ALL_OBJS=.."` due to the addition of the quote. At -least in my empirical tests. - -Fixes: 2740d96efd ("xen/build: have the root Makefile generates the CFLAGS") -Signed-off-by: Anthony PERARD <anthony.perard@citrix.com> -Acked-by: Jan Beulich <jbeulich@suse.com> -master commit: aa390d513a67a6ec0a069eea7478e5ecd54a7ea6 -master date: 2022-01-28 11:44:33 +0100 ---- - xen/arch/x86/Rules.mk | 4 ++-- - xen/arch/x86/arch.mk | 4 ++-- - xen/arch/x86/efi/Makefile | 2 +- - 3 files changed, 5 insertions(+), 5 deletions(-) - -diff --git a/xen/arch/x86/Rules.mk b/xen/arch/x86/Rules.mk -index 56fe22c979ea..7aef93f5f3a0 100644 ---- a/xen/arch/x86/Rules.mk -+++ b/xen/arch/x86/Rules.mk -@@ -6,5 +6,5 @@ object_label_flags = '-D__OBJECT_LABEL__=$(subst $(BASEDIR)/,,$(CURDIR))/$@' - else - object_label_flags = '-D__OBJECT_LABEL__=$(subst /,$$,$(subst -,_,$(subst $(BASEDIR)/,,$(CURDIR))/$@))' - endif --c_flags += $(object_label_flags) $(CFLAGS-stack-boundary) --a_flags += $(object_label_flags) $(CFLAGS-stack-boundary) -+c_flags += $(object_label_flags) $(CFLAGS_stack_boundary) -+a_flags += $(object_label_flags) $(CFLAGS_stack_boundary) -diff --git a/xen/arch/x86/arch.mk b/xen/arch/x86/arch.mk -index 033048ab6b2d..456e5d5c1ad7 100644 ---- a/xen/arch/x86/arch.mk -+++ b/xen/arch/x86/arch.mk -@@ -57,8 +57,8 @@ endif - - # If supported by the compiler, reduce stack alignment to 8 bytes. But allow - # this to be overridden elsewhere. --$(call cc-option-add,CFLAGS-stack-boundary,CC,-mpreferred-stack-boundary=3) --export CFLAGS-stack-boundary -+$(call cc-option-add,CFLAGS_stack_boundary,CC,-mpreferred-stack-boundary=3) -+export CFLAGS_stack_boundary - - ifeq ($(CONFIG_UBSAN),y) - # Don't enable alignment sanitisation. 
x86 has efficient unaligned accesses, -diff --git a/xen/arch/x86/efi/Makefile b/xen/arch/x86/efi/Makefile -index e857c0f2cc2c..a5b2041f9b96 100644 ---- a/xen/arch/x86/efi/Makefile -+++ b/xen/arch/x86/efi/Makefile -@@ -11,7 +11,7 @@ boot.init.o: buildid.o - EFIOBJ := boot.init.o pe.init.o ebmalloc.o compat.o runtime.o - - $(call cc-option-add,cflags-stack-boundary,CC,-mpreferred-stack-boundary=4) --$(EFIOBJ): CFLAGS-stack-boundary := $(cflags-stack-boundary) -+$(EFIOBJ): CFLAGS_stack_boundary := $(cflags-stack-boundary) - - obj-y := stub.o - obj-$(XEN_BUILD_EFI) := $(filter-out %.init.o,$(EFIOBJ)) --- -2.37.4 - diff --git a/0002-IOMMU-x86-work-around-bogus-gcc12-warning-in-hvm_gsi.patch b/0002-IOMMU-x86-work-around-bogus-gcc12-warning-in-hvm_gsi.patch deleted file mode 100644 index edc6857..0000000 --- a/0002-IOMMU-x86-work-around-bogus-gcc12-warning-in-hvm_gsi.patch +++ /dev/null @@ -1,52 +0,0 @@ -From b89b932cfe86556c5de4ad56702aed83142e22a3 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 12 Jul 2022 11:21:14 +0200 -Subject: [PATCH 002/126] IOMMU/x86: work around bogus gcc12 warning in - hvm_gsi_eoi() -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -As per [1] the expansion of the pirq_dpci() macro causes a -Waddress -controlled warning (enabled implicitly in our builds, if not by default) -tying the middle part of the involved conditional expression to the -surrounding boolean context. Work around this by introducing a local -inline function in the affected source file. - -Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Roger Pau Monné <roger.pau@citrix.com> - -[1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102967 -master commit: 80ad8db8a4d9bb24952f0aea788ce6f47566fa76 -master date: 2022-06-15 10:19:32 +0200 ---- - xen/drivers/passthrough/x86/hvm.c | 12 ++++++++++++ - 1 file changed, 12 insertions(+) - -diff --git a/xen/drivers/passthrough/x86/hvm.c b/xen/drivers/passthrough/x86/hvm.c -index 9544f3234e65..50865eec2c04 100644 ---- a/xen/drivers/passthrough/x86/hvm.c -+++ b/xen/drivers/passthrough/x86/hvm.c -@@ -25,6 +25,18 @@ - #include <asm/hvm/support.h> - #include <asm/io_apic.h> - -+/* -+ * Gcc12 takes issue with pirq_dpci() being used in boolean context (see gcc -+ * bug 102967). While we can't replace the macro definition in the header by an -+ * inline function, we can do so here. -+ */ -+static inline struct hvm_pirq_dpci *_pirq_dpci(struct pirq *pirq) -+{ -+ return pirq_dpci(pirq); -+} -+#undef pirq_dpci -+#define pirq_dpci(pirq) _pirq_dpci(pirq) -+ - static DEFINE_PER_CPU(struct list_head, dpci_list); - - /* --- -2.37.4 - diff --git a/0002-x86-shadow-account-for-log-dirty-mode-when-pre-alloc.patch b/0002-x86-shadow-account-for-log-dirty-mode-when-pre-alloc.patch new file mode 100644 index 0000000..0b5d582 --- /dev/null +++ b/0002-x86-shadow-account-for-log-dirty-mode-when-pre-alloc.patch @@ -0,0 +1,105 @@ +From 3a0b7fb38a3e40fcf82c10980775f0fecab667b5 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 21 Mar 2023 12:00:02 +0000 +Subject: [PATCH 02/13] x86/shadow: account for log-dirty mode when + pre-allocating + +Pre-allocation is intended to ensure that in the course of constructing +or updating shadows there won't be any risk of just made shadows or +shadows being acted upon can disappear under our feet. 
The amount of +pages pre-allocated then, however, needs to account for all possible +subsequent allocations. While the use in sh_page_fault() accounts for +all shadows which may need making, so far it didn't account for +allocations coming from log-dirty tracking (which piggybacks onto the +P2M allocation functions). + +Since shadow_prealloc() takes a count of shadows (or other data +structures) rather than a count of pages, putting the adjustment at the +call site of this function won't work very well: We simply can't express +the correct count that way in all cases. Instead take care of this in +the function itself, by "snooping" for L1 type requests. (While not +applicable right now, future new request sites of L1 tables would then +also be covered right away.) + +It is relevant to note here that pre-allocations like the one done from +shadow_alloc_p2m_page() are benign when they fall in the "scope" of an +earlier pre-alloc which already included that count: The inner call will +simply find enough pages available then; it'll bail right away. + +This is CVE-2022-42332 / XSA-427. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Tim Deegan <tim@xen.org> +(cherry picked from commit 91767a71061035ae42be93de495cd976f863a41a) +--- + xen/arch/x86/mm/paging.c | 1 + + xen/arch/x86/mm/shadow/common.c | 12 +++++++++++- + xen/arch/x86/mm/shadow/private.h | 1 + + xen/include/asm-x86/paging.h | 4 ++++ + 4 files changed, 17 insertions(+), 1 deletion(-) + +diff --git a/xen/arch/x86/mm/paging.c b/xen/arch/x86/mm/paging.c +index 579d01c161..ab1cdf1e72 100644 +--- a/xen/arch/x86/mm/paging.c ++++ b/xen/arch/x86/mm/paging.c +@@ -280,6 +280,7 @@ void paging_mark_pfn_dirty(struct domain *d, pfn_t pfn) + if ( unlikely(!VALID_M2P(pfn_x(pfn))) ) + return; + ++ BUILD_BUG_ON(paging_logdirty_levels() != 4); + i1 = L1_LOGDIRTY_IDX(pfn); + i2 = L2_LOGDIRTY_IDX(pfn); + i3 = L3_LOGDIRTY_IDX(pfn); +diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c +index e36d49d1fc..e73931573b 100644 +--- a/xen/arch/x86/mm/shadow/common.c ++++ b/xen/arch/x86/mm/shadow/common.c +@@ -1014,7 +1014,17 @@ bool shadow_prealloc(struct domain *d, unsigned int type, unsigned int count) + if ( unlikely(d->is_dying) ) + return false; + +- ret = _shadow_prealloc(d, shadow_size(type) * count); ++ count *= shadow_size(type); ++ /* ++ * Log-dirty handling may result in allocations when populating its ++ * tracking structures. Tie this to the caller requesting space for L1 ++ * shadows. ++ */ ++ if ( paging_mode_log_dirty(d) && ++ ((SHF_L1_ANY | SHF_FL1_ANY) & (1u << type)) ) ++ count += paging_logdirty_levels(); ++ ++ ret = _shadow_prealloc(d, count); + if ( !ret && (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) ) + /* + * Failing to allocate memory required for shadow usage can only result in +diff --git a/xen/arch/x86/mm/shadow/private.h b/xen/arch/x86/mm/shadow/private.h +index 3fe0388e7c..1be84fc951 100644 +--- a/xen/arch/x86/mm/shadow/private.h ++++ b/xen/arch/x86/mm/shadow/private.h +@@ -269,6 +269,7 @@ static inline void sh_terminate_list(struct page_list_head *tmp_list) + #define SHF_64 (SHF_L1_64|SHF_FL1_64|SHF_L2_64|SHF_L2H_64|SHF_L3_64|SHF_L4_64) + + #define SHF_L1_ANY (SHF_L1_32|SHF_L1_PAE|SHF_L1_64) ++#define SHF_FL1_ANY (SHF_FL1_32|SHF_FL1_PAE|SHF_FL1_64) + + #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + /* Marks a guest L1 page table which is shadowed but not write-protected. 
+diff --git a/xen/include/asm-x86/paging.h b/xen/include/asm-x86/paging.h +index eb23652978..5ec508a351 100644 +--- a/xen/include/asm-x86/paging.h ++++ b/xen/include/asm-x86/paging.h +@@ -190,6 +190,10 @@ int paging_mfn_is_dirty(struct domain *d, mfn_t gmfn); + #define L4_LOGDIRTY_IDX(pfn) ((pfn_x(pfn) >> (PAGE_SHIFT + 3 + PAGETABLE_ORDER * 2)) & \ + (LOGDIRTY_NODE_ENTRIES-1)) + ++#define paging_logdirty_levels() \ ++ (DIV_ROUND_UP(PADDR_BITS - PAGE_SHIFT - (PAGE_SHIFT + 3), \ ++ PAGE_SHIFT - ilog2(sizeof(mfn_t))) + 1) ++ + #ifdef CONFIG_HVM + /* VRAM dirty tracking support */ + struct sh_dirty_vram { +-- +2.40.0 + diff --git a/0003-ehci-dbgp-fix-selecting-n-th-ehci-controller.patch b/0003-ehci-dbgp-fix-selecting-n-th-ehci-controller.patch deleted file mode 100644 index fd460e0..0000000 --- a/0003-ehci-dbgp-fix-selecting-n-th-ehci-controller.patch +++ /dev/null @@ -1,36 +0,0 @@ -From b53df5b4341fa97614ad064a7c8e781c88b6ed71 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= - <marmarek@invisiblethingslab.com> -Date: Tue, 12 Jul 2022 11:22:09 +0200 -Subject: [PATCH 003/126] ehci-dbgp: fix selecting n-th ehci controller -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The ehci<n> number was parsed but ignored. - -Fixes: 322ecbe4ac85 ("console: add EHCI debug port based serial console") -Signed-off-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: d6d0cb659fda64430d4649f8680c5cead32da8fd -master date: 2022-06-16 14:23:37 +0100 ---- - xen/drivers/char/ehci-dbgp.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/xen/drivers/char/ehci-dbgp.c b/xen/drivers/char/ehci-dbgp.c -index c893d246defa..66b4811af24a 100644 ---- a/xen/drivers/char/ehci-dbgp.c -+++ b/xen/drivers/char/ehci-dbgp.c -@@ -1478,7 +1478,7 @@ void __init ehci_dbgp_init(void) - unsigned int num = 0; - - if ( opt_dbgp[4] ) -- simple_strtoul(opt_dbgp + 4, &e, 10); -+ num = simple_strtoul(opt_dbgp + 4, &e, 10); - - dbgp->cap = find_dbgp(dbgp, num); - if ( !dbgp->cap ) --- -2.37.4 - diff --git a/0003-x86-HVM-bound-number-of-pinned-cache-attribute-regio.patch b/0003-x86-HVM-bound-number-of-pinned-cache-attribute-regio.patch new file mode 100644 index 0000000..9974108 --- /dev/null +++ b/0003-x86-HVM-bound-number-of-pinned-cache-attribute-regio.patch @@ -0,0 +1,50 @@ +From 887ba097cfcd4454d4707e1bee6504322335ea79 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 21 Mar 2023 12:01:01 +0000 +Subject: [PATCH 03/13] x86/HVM: bound number of pinned cache attribute regions + +This is exposed via DMOP, i.e. to potentially not fully privileged +device models. With that we may not permit registration of an (almost) +unbounded amount of such regions. + +This is CVE-2022-42333 / part of XSA-428. 
+ +Fixes: 642123c5123f ("x86/hvm: provide XEN_DMOP_pin_memory_cacheattr") +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> +(cherry picked from commit a5e768640f786b681063f4e08af45d0c4e91debf) +--- + xen/arch/x86/hvm/mtrr.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/xen/arch/x86/hvm/mtrr.c b/xen/arch/x86/hvm/mtrr.c +index fb051d59c3..09a51f415d 100644 +--- a/xen/arch/x86/hvm/mtrr.c ++++ b/xen/arch/x86/hvm/mtrr.c +@@ -596,6 +596,7 @@ int hvm_set_mem_pinned_cacheattr(struct domain *d, uint64_t gfn_start, + uint64_t gfn_end, uint32_t type) + { + struct hvm_mem_pinned_cacheattr_range *range; ++ unsigned int nr = 0; + int rc = 1; + + if ( !is_hvm_domain(d) ) +@@ -667,11 +668,15 @@ int hvm_set_mem_pinned_cacheattr(struct domain *d, uint64_t gfn_start, + rc = -EBUSY; + break; + } ++ ++nr; + } + rcu_read_unlock(&pinned_cacheattr_rcu_lock); + if ( rc <= 0 ) + return rc; + ++ if ( nr >= 64 /* The limit is arbitrary. */ ) ++ return -ENOSPC; ++ + range = xzalloc(struct hvm_mem_pinned_cacheattr_range); + if ( range == NULL ) + return -ENOMEM; +-- +2.40.0 + diff --git a/0004-tools-xenstored-Harden-corrupt.patch b/0004-tools-xenstored-Harden-corrupt.patch deleted file mode 100644 index c9e6852..0000000 --- a/0004-tools-xenstored-Harden-corrupt.patch +++ /dev/null @@ -1,44 +0,0 @@ -From 7fe638c28fa693d8bb8f9419de1220d4359a1b2d Mon Sep 17 00:00:00 2001 -From: Julien Grall <jgrall@amazon.com> -Date: Tue, 12 Jul 2022 11:23:01 +0200 -Subject: [PATCH 004/126] tools/xenstored: Harden corrupt() - -At the moment, corrupt() is neither checking for allocation failure -nor freeing the allocated memory. - -Harden the code by printing ENOMEM if the allocation failed and -free 'str' after the last use. - -This is not considered to be a security issue because corrupt() should -only be called when Xenstored thinks the database is corrupted. Note -that the trigger (i.e. a guest reliably provoking the call) would be -a security issue. - -Fixes: 06d17943f0cd ("Added a basic integrity checker, and some basic ability to recover from store") -Signed-off-by: Julien Grall <jgrall@amazon.com> -Reviewed-by: Juergen Gross <jgross@suse.com> -master commit: db3382dd4f468c763512d6bf91c96773395058fb -master date: 2022-06-23 13:44:10 +0100 ---- - tools/xenstore/xenstored_core.c | 5 ++++- - 1 file changed, 4 insertions(+), 1 deletion(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 8033c1e0eb28..9172dd767140 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -1988,7 +1988,10 @@ void corrupt(struct connection *conn, const char *fmt, ...) - va_end(arglist); - - log("corruption detected by connection %i: err %s: %s", -- conn ? (int)conn->id : -1, strerror(saved_errno), str); -+ conn ? 
(int)conn->id : -1, strerror(saved_errno), -+ str ?: "ENOMEM"); -+ -+ talloc_free(str); - - check_store(); - } --- -2.37.4 - diff --git a/0004-x86-HVM-serialize-pinned-cache-attribute-list-manipu.patch b/0004-x86-HVM-serialize-pinned-cache-attribute-list-manipu.patch new file mode 100644 index 0000000..b655f04 --- /dev/null +++ b/0004-x86-HVM-serialize-pinned-cache-attribute-list-manipu.patch @@ -0,0 +1,126 @@ +From fc7dfd94432f4c55a97e86ff63387bfac5da58e3 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 21 Mar 2023 12:01:01 +0000 +Subject: [PATCH 04/13] x86/HVM: serialize pinned cache attribute list + manipulation + +While the RCU variants of list insertion and removal allow lockless list +traversal (with RCU just read-locked), insertions and removals still +need serializing amongst themselves. To keep things simple, use the +domain lock for this purpose. + +This is CVE-2022-42334 / part of XSA-428. + +Fixes: 642123c5123f ("x86/hvm: provide XEN_DMOP_pin_memory_cacheattr") +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit 829ec245cf66560e3b50d140ccb3168e7fb7c945) +--- + xen/arch/x86/hvm/mtrr.c | 51 +++++++++++++++++++++++++---------------- + 1 file changed, 31 insertions(+), 20 deletions(-) + +diff --git a/xen/arch/x86/hvm/mtrr.c b/xen/arch/x86/hvm/mtrr.c +index 09a51f415d..1d31def3c3 100644 +--- a/xen/arch/x86/hvm/mtrr.c ++++ b/xen/arch/x86/hvm/mtrr.c +@@ -595,7 +595,7 @@ static void free_pinned_cacheattr_entry(struct rcu_head *rcu) + int hvm_set_mem_pinned_cacheattr(struct domain *d, uint64_t gfn_start, + uint64_t gfn_end, uint32_t type) + { +- struct hvm_mem_pinned_cacheattr_range *range; ++ struct hvm_mem_pinned_cacheattr_range *range, *newr; + unsigned int nr = 0; + int rc = 1; + +@@ -609,14 +609,15 @@ int hvm_set_mem_pinned_cacheattr(struct domain *d, uint64_t gfn_start, + { + case XEN_DOMCTL_DELETE_MEM_CACHEATTR: + /* Remove the requested range. */ +- rcu_read_lock(&pinned_cacheattr_rcu_lock); +- list_for_each_entry_rcu ( range, +- &d->arch.hvm.pinned_cacheattr_ranges, +- list ) ++ domain_lock(d); ++ list_for_each_entry ( range, ++ &d->arch.hvm.pinned_cacheattr_ranges, ++ list ) + if ( range->start == gfn_start && range->end == gfn_end ) + { +- rcu_read_unlock(&pinned_cacheattr_rcu_lock); + list_del_rcu(&range->list); ++ domain_unlock(d); ++ + type = range->type; + call_rcu(&range->rcu, free_pinned_cacheattr_entry); + p2m_memory_type_changed(d); +@@ -637,7 +638,7 @@ int hvm_set_mem_pinned_cacheattr(struct domain *d, uint64_t gfn_start, + } + return 0; + } +- rcu_read_unlock(&pinned_cacheattr_rcu_lock); ++ domain_unlock(d); + return -ENOENT; + + case PAT_TYPE_UC_MINUS: +@@ -652,7 +653,10 @@ int hvm_set_mem_pinned_cacheattr(struct domain *d, uint64_t gfn_start, + return -EINVAL; + } + +- rcu_read_lock(&pinned_cacheattr_rcu_lock); ++ newr = xzalloc(struct hvm_mem_pinned_cacheattr_range); ++ ++ domain_lock(d); ++ + list_for_each_entry_rcu ( range, + &d->arch.hvm.pinned_cacheattr_ranges, + list ) +@@ -670,27 +674,34 @@ int hvm_set_mem_pinned_cacheattr(struct domain *d, uint64_t gfn_start, + } + ++nr; + } +- rcu_read_unlock(&pinned_cacheattr_rcu_lock); ++ + if ( rc <= 0 ) +- return rc; ++ /* nothing */; ++ else if ( nr >= 64 /* The limit is arbitrary. */ ) ++ rc = -ENOSPC; ++ else if ( !newr ) ++ rc = -ENOMEM; ++ else ++ { ++ newr->start = gfn_start; ++ newr->end = gfn_end; ++ newr->type = type; + +- if ( nr >= 64 /* The limit is arbitrary. 
*/ ) +- return -ENOSPC; ++ list_add_rcu(&newr->list, &d->arch.hvm.pinned_cacheattr_ranges); + +- range = xzalloc(struct hvm_mem_pinned_cacheattr_range); +- if ( range == NULL ) +- return -ENOMEM; ++ newr = NULL; ++ rc = 0; ++ } ++ ++ domain_unlock(d); + +- range->start = gfn_start; +- range->end = gfn_end; +- range->type = type; ++ xfree(newr); + +- list_add_rcu(&range->list, &d->arch.hvm.pinned_cacheattr_ranges); + p2m_memory_type_changed(d); + if ( type != PAT_TYPE_WRBACK ) + flush_all(FLUSH_CACHE); + +- return 0; ++ return rc; + } + + static int hvm_save_mtrr_msr(struct vcpu *v, hvm_domain_context_t *h) +-- +2.40.0 + diff --git a/0005-x86-spec-ctrl-Defer-CR4_PV32_RESTORE-on-the-cstar_en.patch b/0005-x86-spec-ctrl-Defer-CR4_PV32_RESTORE-on-the-cstar_en.patch new file mode 100644 index 0000000..5a65bda --- /dev/null +++ b/0005-x86-spec-ctrl-Defer-CR4_PV32_RESTORE-on-the-cstar_en.patch @@ -0,0 +1,56 @@ +From 11193e13e5359ba1896be46be3e9b468154c1295 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Fri, 10 Feb 2023 21:11:14 +0000 +Subject: [PATCH 05/13] x86/spec-ctrl: Defer CR4_PV32_RESTORE on the + cstar_enter path + +As stated (correctly) by the comment next to SPEC_CTRL_ENTRY_FROM_PV, between +the two hunks visible in the patch, RET's are not safe prior to this point. + +CR4_PV32_RESTORE hides a CALL/RET pair in certain configurations (PV32 +compiled in, SMEP or SMAP active), and the RET can be attacked with one of +several known speculative issues. + +Furthermore, CR4_PV32_RESTORE also hides a reference to the cr4_pv32_mask +global variable, which is not safe when XPTI is active before restoring Xen's +full pagetables. + +This crash has gone unnoticed because it is only AMD CPUs which permit the +SYSCALL instruction in compatibility mode, and these are not vulnerable to +Meltdown so don't activate XPTI by default. + +This is XSA-429 / CVE-2022-42331 + +Fixes: 5e7962901131 ("x86/entry: Organise the use of MSR_SPEC_CTRL at each entry/exit point") +Fixes: 5784de3e2067 ("x86: Meltdown band-aid against malicious 64-bit PV guests") +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +(cherry picked from commit df5b055b12116d9e63ced59ae5389e69a2a3de48) +--- + xen/arch/x86/x86_64/compat/entry.S | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S +index 5c999271e6..09a86768ac 100644 +--- a/xen/arch/x86/x86_64/compat/entry.S ++++ b/xen/arch/x86/x86_64/compat/entry.S +@@ -206,7 +206,6 @@ ENTRY(cstar_enter) + ALTERNATIVE "", "setssbsy", X86_FEATURE_XEN_SHSTK + #endif + push %rax /* Guest %rsp */ +- CR4_PV32_RESTORE + movq 8(%rsp), %rax /* Restore guest %rax. */ + movq $FLAT_USER_SS32, 8(%rsp) /* Assume a 64bit domain. Compat handled lower. 
*/ + pushq %r11 +@@ -230,6 +229,8 @@ ENTRY(cstar_enter) + .Lcstar_cr3_okay: + sti + ++ CR4_PV32_RESTORE ++ + movq STACK_CPUINFO_FIELD(current_vcpu)(%rbx), %rbx + movq VCPU_domain(%rbx),%rcx + cmpb $0,DOMAIN_is_32bit_pv(%rcx) +-- +2.40.0 + diff --git a/0005-x86-spec-ctrl-Only-adjust-MSR_SPEC_CTRL-for-idle-wit.patch b/0005-x86-spec-ctrl-Only-adjust-MSR_SPEC_CTRL-for-idle-wit.patch deleted file mode 100644 index dcfc447..0000000 --- a/0005-x86-spec-ctrl-Only-adjust-MSR_SPEC_CTRL-for-idle-wit.patch +++ /dev/null @@ -1,93 +0,0 @@ -From 799a8d49237a62ea0d33c3756a6a7f665b8389b2 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 12 Jul 2022 11:23:32 +0200 -Subject: [PATCH 005/126] x86/spec-ctrl: Only adjust MSR_SPEC_CTRL for idle - with legacy IBRS -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Back at the time of the original Spectre-v2 fixes, it was recommended to clear -MSR_SPEC_CTRL when going idle. This is because of the side effects on the -sibling thread caused by the microcode IBRS and STIBP implementations which -were retrofitted to existing CPUs. - -However, there are no relevant cross-thread impacts for the hardware -IBRS/STIBP implementations, so this logic should not be used on Intel CPUs -supporting eIBRS, or any AMD CPUs; doing so only adds unnecessary latency to -the idle path. - -Furthermore, there's no point playing with MSR_SPEC_CTRL in the idle paths if -SMT is disabled for other reasons. - -Fixes: 8d03080d2a33 ("x86/spec-ctrl: Cease using thunk=lfence on AMD") -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> -master commit: ffc7694e0c99eea158c32aa164b7d1e1bb1dc46b -master date: 2022-06-30 18:07:13 +0100 ---- - xen/arch/x86/spec_ctrl.c | 10 ++++++++-- - xen/include/asm-x86/cpufeatures.h | 2 +- - xen/include/asm-x86/spec_ctrl.h | 5 +++-- - 3 files changed, 12 insertions(+), 5 deletions(-) - -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 83b856fa9158..eb7fb70e86f9 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -1103,8 +1103,14 @@ void __init init_speculation_mitigations(void) - /* (Re)init BSP state now that default_spec_ctrl_flags has been calculated. */ - init_shadow_spec_ctrl_state(); - -- /* If Xen is using any MSR_SPEC_CTRL settings, adjust the idle path. */ -- if ( default_xen_spec_ctrl ) -+ /* -+ * For microcoded IBRS only (i.e. Intel, pre eIBRS), it is recommended to -+ * clear MSR_SPEC_CTRL before going idle, to avoid impacting sibling -+ * threads. Activate this if SMT is enabled, and Xen is using a non-zero -+ * MSR_SPEC_CTRL setting. 
-+ */ -+ if ( boot_cpu_has(X86_FEATURE_IBRSB) && !(caps & ARCH_CAPS_IBRS_ALL) && -+ hw_smt_enabled && default_xen_spec_ctrl ) - setup_force_cpu_cap(X86_FEATURE_SC_MSR_IDLE); - - xpti_init_default(caps); -diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h -index 9eaab7a2a1fa..f7488d3ccbfa 100644 ---- a/xen/include/asm-x86/cpufeatures.h -+++ b/xen/include/asm-x86/cpufeatures.h -@@ -33,7 +33,7 @@ XEN_CPUFEATURE(SC_MSR_HVM, X86_SYNTH(17)) /* MSR_SPEC_CTRL used by Xen fo - XEN_CPUFEATURE(SC_RSB_PV, X86_SYNTH(18)) /* RSB overwrite needed for PV */ - XEN_CPUFEATURE(SC_RSB_HVM, X86_SYNTH(19)) /* RSB overwrite needed for HVM */ - XEN_CPUFEATURE(XEN_SELFSNOOP, X86_SYNTH(20)) /* SELFSNOOP gets used by Xen itself */ --XEN_CPUFEATURE(SC_MSR_IDLE, X86_SYNTH(21)) /* (SC_MSR_PV || SC_MSR_HVM) && default_xen_spec_ctrl */ -+XEN_CPUFEATURE(SC_MSR_IDLE, X86_SYNTH(21)) /* Clear MSR_SPEC_CTRL on idle */ - XEN_CPUFEATURE(XEN_LBR, X86_SYNTH(22)) /* Xen uses MSR_DEBUGCTL.LBR */ - /* Bits 23,24 unused. */ - XEN_CPUFEATURE(SC_VERW_IDLE, X86_SYNTH(25)) /* VERW used by Xen for idle */ -diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h -index 68f6c46c470c..12283573cdd5 100644 ---- a/xen/include/asm-x86/spec_ctrl.h -+++ b/xen/include/asm-x86/spec_ctrl.h -@@ -78,7 +78,8 @@ static always_inline void spec_ctrl_enter_idle(struct cpu_info *info) - uint32_t val = 0; - - /* -- * Branch Target Injection: -+ * It is recommended in some cases to clear MSR_SPEC_CTRL when going idle, -+ * to avoid impacting sibling threads. - * - * Latch the new shadow value, then enable shadowing, then update the MSR. - * There are no SMP issues here; only local processor ordering concerns. -@@ -114,7 +115,7 @@ static always_inline void spec_ctrl_exit_idle(struct cpu_info *info) - uint32_t val = info->xen_spec_ctrl; - - /* -- * Branch Target Injection: -+ * Restore MSR_SPEC_CTRL on exit from idle. - * - * Disable shadowing before updating the MSR. There are no SMP issues - * here; only local processor ordering concerns. --- -2.37.4 - diff --git a/0006-tools-Drop-gettext-as-a-build-dependency.patch b/0006-tools-Drop-gettext-as-a-build-dependency.patch new file mode 100644 index 0000000..9aaae10 --- /dev/null +++ b/0006-tools-Drop-gettext-as-a-build-dependency.patch @@ -0,0 +1,173 @@ +From 708b0d75e77a8305a756b55eefa0f226212f4d36 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Fri, 26 Mar 2021 11:25:07 +0000 +Subject: [PATCH 06/13] tools: Drop gettext as a build dependency +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +It has not been a dependency since at least 4.13. Remove its mandatory check +from ./configure. + +Annotate the dependency in the CI dockerfiles, and drop them from CirrusCI and +TravisCI. 
+ +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Acked-by: Roger Pau Monné <roger.pau@citrix.com> +(cherry picked from commit e21a6a4f966a7e91cb0bb014dbe15d15cc0502ad) +--- + .cirrus.yml | 2 +- + .travis.yml | 1 - + README | 1 - + tools/configure | 49 ---------------------------------------------- + tools/configure.ac | 2 -- + 5 files changed, 1 insertion(+), 54 deletions(-) + +diff --git a/.cirrus.yml b/.cirrus.yml +index 0efff6fa98..fdb1c9c94d 100644 +--- a/.cirrus.yml ++++ b/.cirrus.yml +@@ -4,7 +4,7 @@ freebsd_template: &FREEBSD_TEMPLATE + APPEND_LIB: /usr/local/lib + APPEND_INCLUDES: /usr/local/include + +- install_script: pkg install -y seabios gettext-tools gmake ++ install_script: pkg install -y seabios gmake + pkgconf python libiconv bison perl5 + yajl lzo2 pixman argp-standalone + libxml2 glib git +diff --git a/.travis.yml b/.travis.yml +index 2362475f7a..f3cd15b79f 100644 +--- a/.travis.yml ++++ b/.travis.yml +@@ -54,7 +54,6 @@ addons: + - pkg-config + - flex + - bison +- - gettext + - acpica-tools + - bin86 + - bcc +diff --git a/README b/README +index de579080d7..efaa1451cb 100644 +--- a/README ++++ b/README +@@ -63,7 +63,6 @@ provided by your OS distributor: + * bridge-utils package (/sbin/brctl) + * iproute package (/sbin/ip) + * GNU bison and GNU flex +- * GNU gettext + * ACPI ASL compiler (iasl) + + In addition to the above there are a number of optional build +diff --git a/tools/configure b/tools/configure +index bb5acf9d43..5df30df9b3 100755 +--- a/tools/configure ++++ b/tools/configure +@@ -689,7 +689,6 @@ INSTALL_PROGRAM + SET_MAKE + AWK + IASL +-XGETTEXT + FLEX + BISON + PERL +@@ -847,7 +846,6 @@ PYTHON + PERL + BISON + FLEX +-XGETTEXT + AS86 + LD86 + BCC +@@ -1597,7 +1595,6 @@ Some influential environment variables: + PERL Path to Perl parser + BISON Path to Bison parser generator + FLEX Path to Flex lexical analyser generator +- XGETTEXT Path to xgetttext tool + AS86 Path to as86 tool + LD86 Path to ld86 tool + BCC Path to bcc tool +@@ -4738,7 +4735,6 @@ LDFLAGS="$PREPEND_LDFLAGS $LDFLAGS $APPEND_LDFLAGS" + + + +- + # Checks for programs. + ac_ext=c + ac_cpp='$CPP $CPPFLAGS' +@@ -7846,51 +7842,6 @@ fi + + if ! $rump; then + +-# Extract the first word of "xgettext", so it can be a program name with args. +-set dummy xgettext; ac_word=$2 +-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +-$as_echo_n "checking for $ac_word... " >&6; } +-if ${ac_cv_path_XGETTEXT+:} false; then : +- $as_echo_n "(cached) " >&6 +-else +- case $XGETTEXT in +- [\\/]* | ?:[\\/]*) +- ac_cv_path_XGETTEXT="$XGETTEXT" # Let the user override the test with a path. +- ;; +- *) +- as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +-for as_dir in $PATH +-do +- IFS=$as_save_IFS +- test -z "$as_dir" && as_dir=. +- for ac_exec_ext in '' $ac_executable_extensions; do +- if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then +- ac_cv_path_XGETTEXT="$as_dir/$ac_word$ac_exec_ext" +- $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 +- break 2 +- fi +-done +- done +-IFS=$as_save_IFS +- +- test -z "$ac_cv_path_XGETTEXT" && ac_cv_path_XGETTEXT="no" +- ;; +-esac +-fi +-XGETTEXT=$ac_cv_path_XGETTEXT +-if test -n "$XGETTEXT"; then +- { $as_echo "$as_me:${as_lineno-$LINENO}: result: $XGETTEXT" >&5 +-$as_echo "$XGETTEXT" >&6; } +-else +- { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +-$as_echo "no" >&6; } +-fi +- +- +-if test x"${XGETTEXT}" = x"no" +-then +- as_fn_error $? 
"Unable to find xgettext, please install xgettext" "$LINENO" 5 +-fi + case "$host_cpu" in + i[3456]86|x86_64|aarch64) + # Extract the first word of "iasl", so it can be a program name with args. +diff --git a/tools/configure.ac b/tools/configure.ac +index 636e7077be..6414fcbb44 100644 +--- a/tools/configure.ac ++++ b/tools/configure.ac +@@ -298,7 +298,6 @@ AC_ARG_VAR([PYTHON], [Path to the Python parser]) + AC_ARG_VAR([PERL], [Path to Perl parser]) + AC_ARG_VAR([BISON], [Path to Bison parser generator]) + AC_ARG_VAR([FLEX], [Path to Flex lexical analyser generator]) +-AC_ARG_VAR([XGETTEXT], [Path to xgetttext tool]) + AC_ARG_VAR([AS86], [Path to as86 tool]) + AC_ARG_VAR([LD86], [Path to ld86 tool]) + AC_ARG_VAR([BCC], [Path to bcc tool]) +@@ -381,7 +380,6 @@ AS_IF([test "$cross_compiling" != yes], [ + + if ! $rump; then + +-AX_PATH_PROG_OR_FAIL([XGETTEXT], [xgettext]) + dnl as86, ld86, bcc and iasl are only required when the host system is x86*. + dnl "host" here means the platform on which the hypervisor and tools is + dnl going to run, not the platform on which we are building (known as +-- +2.40.0 + diff --git a/0006-x86-spec-ctrl-Knobs-for-STIBP-and-PSFD-and-follow-ha.patch b/0006-x86-spec-ctrl-Knobs-for-STIBP-and-PSFD-and-follow-ha.patch deleted file mode 100644 index 177d677..0000000 --- a/0006-x86-spec-ctrl-Knobs-for-STIBP-and-PSFD-and-follow-ha.patch +++ /dev/null @@ -1,234 +0,0 @@ -From cd5081e8c31651e623d86532306b4c56bbcb6e6d Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 12 Jul 2022 11:24:11 +0200 -Subject: [PATCH 006/126] x86/spec-ctrl: Knobs for STIBP and PSFD, and follow - hardware STIBP hint -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -STIBP and PSFD are slightly weird bits, because they're both implied by other -bits in MSR_SPEC_CTRL. Add fine grain controls for them, and take the -implications into account when setting IBRS/SSBD. - -Rearrange the IBPB text/variables/logic to keep all the MSR_SPEC_CTRL bits -together, for consistency. - -However, AMD have a hardware hint CPUID bit recommending that STIBP be set -unilaterally. This is advertised on Zen3, so follow the recommendation. -Furthermore, in such cases, set STIBP behind the guest's back for now. This -has negligible overhead for the guest, but saves a WRMSR on vmentry. This is -the only default change. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> -master commit: fef244b179c06fcdfa581f7d57fa6e578c49ff50 -master date: 2022-06-30 18:07:13 +0100 ---- - docs/misc/xen-command-line.pandoc | 21 +++++++--- - xen/arch/x86/hvm/svm/vmcb.c | 9 +++++ - xen/arch/x86/spec_ctrl.c | 67 ++++++++++++++++++++++++++----- - 3 files changed, 82 insertions(+), 15 deletions(-) - -diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc -index e17a835ed254..1db3da9ef78e 100644 ---- a/docs/misc/xen-command-line.pandoc -+++ b/docs/misc/xen-command-line.pandoc -@@ -2170,8 +2170,9 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`). 
- - ### spec-ctrl (x86) - > `= List of [ <bool>, xen=<bool>, {pv,hvm,msr-sc,rsb,md-clear}=<bool>, --> bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,eager-fpu, --> l1d-flush,branch-harden,srb-lock,unpriv-mmio}=<bool> ]` -+> bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd, -+> eager-fpu,l1d-flush,branch-harden,srb-lock, -+> unpriv-mmio}=<bool> ]` - - Controls for speculative execution sidechannel mitigations. By default, Xen - will pick the most appropriate mitigations based on compiled in support, -@@ -2221,9 +2222,10 @@ On hardware supporting IBRS (Indirect Branch Restricted Speculation), the - If Xen is not using IBRS itself, functionality is still set up so IBRS can be - virtualised for guests. - --On hardware supporting IBPB (Indirect Branch Prediction Barrier), the `ibpb=` --option can be used to force (the default) or prevent Xen from issuing branch --prediction barriers on vcpu context switches. -+On hardware supporting STIBP (Single Thread Indirect Branch Predictors), the -+`stibp=` option can be used to force or prevent Xen using the feature itself. -+By default, Xen will use STIBP when IBRS is in use (IBRS implies STIBP), and -+when hardware hints recommend using it as a blanket setting. - - On hardware supporting SSBD (Speculative Store Bypass Disable), the `ssbd=` - option can be used to force or prevent Xen using the feature itself. On AMD -@@ -2231,6 +2233,15 @@ hardware, this is a global option applied at boot, and not virtualised for - guest use. On Intel hardware, the feature is virtualised for guests, - independently of Xen's choice of setting. - -+On hardware supporting PSFD (Predictive Store Forwarding Disable), the `psfd=` -+option can be used to force or prevent Xen using the feature itself. By -+default, Xen will not use PSFD. PSFD is implied by SSBD, and SSBD is off by -+default. -+ -+On hardware supporting IBPB (Indirect Branch Prediction Barrier), the `ibpb=` -+option can be used to force (the default) or prevent Xen from issuing branch -+prediction barriers on vcpu context switches. -+ - On all hardware, the `eager-fpu=` option can be used to force or prevent Xen - from using fully eager FPU context switches. This is currently implemented as - a global control. By default, Xen will choose to use fully eager context -diff --git a/xen/arch/x86/hvm/svm/vmcb.c b/xen/arch/x86/hvm/svm/vmcb.c -index 55da9302e5d7..a0bf9f4e056a 100644 ---- a/xen/arch/x86/hvm/svm/vmcb.c -+++ b/xen/arch/x86/hvm/svm/vmcb.c -@@ -29,6 +29,7 @@ - #include <asm/hvm/support.h> - #include <asm/hvm/svm/svm.h> - #include <asm/hvm/svm/svmdebug.h> -+#include <asm/spec_ctrl.h> - - struct vmcb_struct *alloc_vmcb(void) - { -@@ -175,6 +176,14 @@ static int construct_vmcb(struct vcpu *v) - vmcb->_pause_filter_thresh = SVM_PAUSETHRESH_INIT; - } - -+ /* -+ * When default_xen_spec_ctrl simply SPEC_CTRL_STIBP, default this behind -+ * the back of the VM too. Our SMT topology isn't accurate, the overhead -+ * is neglegable, and doing this saves a WRMSR on the vmentry path. 
-+ */ -+ if ( default_xen_spec_ctrl == SPEC_CTRL_STIBP ) -+ v->arch.msrs->spec_ctrl.raw = SPEC_CTRL_STIBP; -+ - return 0; - } - -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index eb7fb70e86f9..8212227ee02a 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -48,9 +48,13 @@ static enum ind_thunk { - THUNK_LFENCE, - THUNK_JMP, - } opt_thunk __initdata = THUNK_DEFAULT; -+ - static int8_t __initdata opt_ibrs = -1; -+int8_t __initdata opt_stibp = -1; -+bool __read_mostly opt_ssbd; -+int8_t __initdata opt_psfd = -1; -+ - bool __read_mostly opt_ibpb = true; --bool __read_mostly opt_ssbd = false; - int8_t __read_mostly opt_eager_fpu = -1; - int8_t __read_mostly opt_l1d_flush = -1; - bool __read_mostly opt_branch_harden = true; -@@ -172,12 +176,20 @@ static int __init parse_spec_ctrl(const char *s) - else - rc = -EINVAL; - } -+ -+ /* Bits in MSR_SPEC_CTRL. */ - else if ( (val = parse_boolean("ibrs", s, ss)) >= 0 ) - opt_ibrs = val; -- else if ( (val = parse_boolean("ibpb", s, ss)) >= 0 ) -- opt_ibpb = val; -+ else if ( (val = parse_boolean("stibp", s, ss)) >= 0 ) -+ opt_stibp = val; - else if ( (val = parse_boolean("ssbd", s, ss)) >= 0 ) - opt_ssbd = val; -+ else if ( (val = parse_boolean("psfd", s, ss)) >= 0 ) -+ opt_psfd = val; -+ -+ /* Misc settings. */ -+ else if ( (val = parse_boolean("ibpb", s, ss)) >= 0 ) -+ opt_ibpb = val; - else if ( (val = parse_boolean("eager-fpu", s, ss)) >= 0 ) - opt_eager_fpu = val; - else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 ) -@@ -376,7 +388,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - "\n"); - - /* Settings for Xen's protection, irrespective of guests. */ -- printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s%s, Other:%s%s%s%s%s\n", -+ printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s\n", - thunk == THUNK_NONE ? "N/A" : - thunk == THUNK_RETPOLINE ? "RETPOLINE" : - thunk == THUNK_LFENCE ? "LFENCE" : -@@ -390,6 +402,9 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - (!boot_cpu_has(X86_FEATURE_SSBD) && - !boot_cpu_has(X86_FEATURE_AMD_SSBD)) ? "" : - (default_xen_spec_ctrl & SPEC_CTRL_SSBD) ? " SSBD+" : " SSBD-", -+ (!boot_cpu_has(X86_FEATURE_PSFD) && -+ !boot_cpu_has(X86_FEATURE_INTEL_PSFD)) ? "" : -+ (default_xen_spec_ctrl & SPEC_CTRL_PSFD) ? " PSFD+" : " PSFD-", - !(caps & ARCH_CAPS_TSX_CTRL) ? "" : - (opt_tsx & 1) ? " TSX+" : " TSX-", - !cpu_has_srbds_ctrl ? "" : -@@ -950,10 +965,7 @@ void __init init_speculation_mitigations(void) - if ( !has_spec_ctrl ) - printk(XENLOG_WARNING "?!? CET active, but no MSR_SPEC_CTRL?\n"); - else if ( opt_ibrs == -1 ) -- { - opt_ibrs = ibrs = true; -- default_xen_spec_ctrl |= SPEC_CTRL_IBRS | SPEC_CTRL_STIBP; -- } - - if ( opt_thunk == THUNK_DEFAULT || opt_thunk == THUNK_RETPOLINE ) - thunk = THUNK_JMP; -@@ -1057,14 +1069,49 @@ void __init init_speculation_mitigations(void) - setup_force_cpu_cap(X86_FEATURE_SC_MSR_HVM); - } - -- /* If we have IBRS available, see whether we should use it. */ -+ /* Figure out default_xen_spec_ctrl. */ - if ( has_spec_ctrl && ibrs ) -- default_xen_spec_ctrl |= SPEC_CTRL_IBRS; -+ { -+ /* IBRS implies STIBP. */ -+ if ( opt_stibp == -1 ) -+ opt_stibp = 1; -+ -+ default_xen_spec_ctrl |= SPEC_CTRL_IBRS; -+ } -+ -+ /* -+ * Use STIBP by default if the hardware hint is set. Otherwise, leave it -+ * off as it a severe performance pentalty on pre-eIBRS Intel hardware -+ * where it was retrofitted in microcode. 
-+ */ -+ if ( opt_stibp == -1 ) -+ opt_stibp = !!boot_cpu_has(X86_FEATURE_STIBP_ALWAYS); -+ -+ if ( opt_stibp && (boot_cpu_has(X86_FEATURE_STIBP) || -+ boot_cpu_has(X86_FEATURE_AMD_STIBP)) ) -+ default_xen_spec_ctrl |= SPEC_CTRL_STIBP; - -- /* If we have SSBD available, see whether we should use it. */ - if ( opt_ssbd && (boot_cpu_has(X86_FEATURE_SSBD) || - boot_cpu_has(X86_FEATURE_AMD_SSBD)) ) -+ { -+ /* SSBD implies PSFD */ -+ if ( opt_psfd == -1 ) -+ opt_psfd = 1; -+ - default_xen_spec_ctrl |= SPEC_CTRL_SSBD; -+ } -+ -+ /* -+ * Don't use PSFD by default. AMD designed the predictor to -+ * auto-clear on privilege change. PSFD is implied by SSBD, which is -+ * off by default. -+ */ -+ if ( opt_psfd == -1 ) -+ opt_psfd = 0; -+ -+ if ( opt_psfd && (boot_cpu_has(X86_FEATURE_PSFD) || -+ boot_cpu_has(X86_FEATURE_INTEL_PSFD)) ) -+ default_xen_spec_ctrl |= SPEC_CTRL_PSFD; - - /* - * PV guests can poison the RSB to any virtual address from which --- -2.37.4 - diff --git a/0007-CI-Drop-TravisCI.patch b/0007-CI-Drop-TravisCI.patch new file mode 100644 index 0000000..bb31ecf --- /dev/null +++ b/0007-CI-Drop-TravisCI.patch @@ -0,0 +1,177 @@ +From e006948dd27b320602a718e2728678160c61593f Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Wed, 21 Apr 2021 10:16:13 +0100 +Subject: [PATCH 07/13] CI: Drop TravisCI + +Travis-ci.org is shutting down shortly. The arm cross-compile testing has +been broken for a long time now, and all testing has now been superseded by +our Gitlab infrastructure. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Acked-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Wei Liu <wl@xen.org> +(cherry picked from commit e0dc9b095e7c73dcf6dbfe5c87c33c4708da4d1f) + +CI: Drop more TravisCI remnants + +This was missed from previous attempts to remove Travis. 
+ +Fixes: e0dc9b095e7c ("CI: Drop TravisCI") +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Acked-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Stefano Stabellini <sstabellini@kernel.org> +Release-acked-by: Henry Wang <Henry.Wang@arm.com> +(cherry picked from commit bad4832710c7261fad1abe2d0e8e2e1d259b3e8d) +--- + .travis.yml | 86 -------------------------------------------- + MAINTAINERS | 1 - + scripts/travis-build | 32 ----------------- + 3 files changed, 119 deletions(-) + delete mode 100644 .travis.yml + delete mode 100755 scripts/travis-build + +diff --git a/.travis.yml b/.travis.yml +deleted file mode 100644 +index f3cd15b79f..0000000000 +--- a/.travis.yml ++++ /dev/null +@@ -1,86 +0,0 @@ +-language: c +-dist: trusty +-sudo: required +-# don't test master, smoke and coverity branches +-branches: +- except: +- - master +- - smoke +- - /^coverity-tested\/.*/ +- - /^stable-.*/ +-matrix: +- include: +- - compiler: gcc +- env: XEN_TARGET_ARCH=x86_64 debug=n +- - compiler: gcc +- env: XEN_TARGET_ARCH=x86_64 XEN_CONFIG_EXPERT=y RANDCONFIG=y debug=n +- - compiler: gcc-5 +- env: XEN_TARGET_ARCH=x86_64 debug=n +- - compiler: gcc +- env: XEN_TARGET_ARCH=x86_64 debug=y +- - compiler: gcc-5 +- env: XEN_TARGET_ARCH=x86_64 debug=y +- - compiler: clang +- env: XEN_TARGET_ARCH=x86_64 clang=y debug=n +- - compiler: clang +- env: XEN_TARGET_ARCH=x86_64 clang=y debug=y +- - compiler: gcc +- env: XEN_TARGET_ARCH=arm32 CROSS_COMPILE=arm-linux-gnueabihf- debug=n +- - compiler: gcc +- env: XEN_TARGET_ARCH=arm32 CROSS_COMPILE=arm-linux-gnueabihf- XEN_CONFIG_EXPERT=y RANDCONFIG=y debug=n +- - compiler: gcc +- env: XEN_TARGET_ARCH=arm32 CROSS_COMPILE=arm-linux-gnueabihf- debug=y +- - compiler: gcc +- env: XEN_TARGET_ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- debug=n +- - compiler: gcc +- env: XEN_TARGET_ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- XEN_CONFIG_EXPERT=y RANDCONFIG=y debug=n +- - compiler: gcc +- env: XEN_TARGET_ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- debug=y +-addons: +- apt: +- sources: +- - ubuntu-toolchain-r-test +- packages: +- - zlib1g-dev +- - libncurses5-dev +- - libssl-dev +- - python-dev +- - xorg-dev +- - uuid-dev +- - libyajl-dev +- - libaio-dev +- - libglib2.0-dev +- - libpixman-1-dev +- - pkg-config +- - flex +- - bison +- - acpica-tools +- - bin86 +- - bcc +- - libnl-3-dev +- - ocaml-nox +- - libfindlib-ocaml-dev +- - transfig +- - pandoc +- - gcc-arm-linux-gnueabihf +- - gcc-aarch64-linux-gnu +- - gcc-5 +- - g++-5 +- - seabios +- - checkpolicy +- - ghostscript +-# we must set CXX manually instead of using 'language: cpp' due to +-# travis-ci/travis-ci#3871 +-before_script: +- - export CXX=${CC/cc/++} +- - export CXX=${CXX/clang/clang++} +-script: +- - ./scripts/travis-build +-after_script: +- - cat xen/.config +- - cat tools/config.log +- - cat docs/config.log +-notifications: +- irc: +- channels: +- - secure: 
"mPIFllF6eW3F3talvccMy55Tfcid66IPkkXZYCxDKRF2DQrMyvmg4qt0xN6gGZsdfOBMNr+/YfO5PxusBCUkVdBGBzd3QhFoIDYZbJZgzVh3yNDQ+x4L7p1cZNrwJ2loMmSX6KxGKZxZX9NRStrTUkVyp0jGZB9xkwT8Rl6jXj7EQkgQ95K1Wqafx0ycLfyDQmzX9bzi/3KIBFKMGmK18AFMh+R30zK0FPUUsS4+VhepIkVqO5puU3OYePd34wRnWlt7hjU2Vj5vYmVXp3UOE+E8/Lf9IGVAhitDi+EC35b8zo2BHJ9z6xZARYPvfSqbXcXV20RycabI+e3ufZJ40eatssly5QjWH+HhKS42C4gV1psmQhkTCNCM62Ty5uf6R1hsZJQuiOZrc8ojdje8ey2MxJk4R+Xz+Igg1/kD6+WX9/Y6Y3iRuj5HL1xCYfpTbK4mC7ofw0SofW2aAGI68jHpCqJdQCDzMl6748PlDMM0eKe0MPKIEenYHcoBnOEC/jciXUDa6wduV75EEip7oq2i+m44MopcsEDTpdliH077GhKapF0ActjvBTLpyoTRSfkKm0NZol/dgwd3PGG/mY8clIoeXWRb4opk93ejPC967KmSNC68SlfwaJmFZS5T9vAgb6k7r6i9G3dmYtrLKzws8IV1CPWqLzk58+v4pRk=" +diff --git a/MAINTAINERS b/MAINTAINERS +index 91064c09f9..37aa60dd64 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -195,7 +195,6 @@ S: Supported + F: .gitlab-ci.yml + F: .travis.yml + F: automation/ +-F: scripts/travis-build + + CPU POOLS + M: Juergen Gross <jgross@suse.com> +diff --git a/scripts/travis-build b/scripts/travis-build +deleted file mode 100755 +index 84d74266a0..0000000000 +--- a/scripts/travis-build ++++ /dev/null +@@ -1,32 +0,0 @@ +-#!/bin/bash -ex +- +-$CC --version +- +-# random config or default config +-if [[ "${RANDCONFIG}" == "y" ]]; then +- make -C xen KCONFIG_ALLCONFIG=tools/kconfig/allrandom.config randconfig +-else +- make -C xen defconfig +-fi +- +-# build up our configure options +-cfgargs=() +-cfgargs+=("--disable-stubdom") # more work needed into building this +-cfgargs+=("--disable-rombios") +-cfgargs+=("--enable-docs") +-cfgargs+=("--with-system-seabios=/usr/share/seabios/bios.bin") +- +-# Qemu requires Python 3.5 or later +-if ! type python3 || python3 -c "import sys; res = sys.version_info < (3, 5); exit(not(res))"; then +- cfgargs+=("--with-system-qemu=/bin/false") +-fi +- +-if [[ "${XEN_TARGET_ARCH}" == "x86_64" ]]; then +- cfgargs+=("--enable-tools") +-else +- cfgargs+=("--disable-tools") # we don't have the cross depends installed +-fi +- +-./configure "${cfgargs[@]}" +- +-make dist +-- +2.40.0 + diff --git a/0007-libxc-fix-compilation-error-with-gcc13.patch b/0007-libxc-fix-compilation-error-with-gcc13.patch deleted file mode 100644 index 388111e..0000000 --- a/0007-libxc-fix-compilation-error-with-gcc13.patch +++ /dev/null @@ -1,33 +0,0 @@ -From 77deab4233b5d9ec5cf214fdc1652424fd4fc9d6 Mon Sep 17 00:00:00 2001 -From: Charles Arnold <carnold@suse.com> -Date: Tue, 12 Jul 2022 11:24:39 +0200 -Subject: [PATCH 007/126] libxc: fix compilation error with gcc13 - -xc_psr.c:161:5: error: conflicting types for 'xc_psr_cmt_get_data' -due to enum/integer mismatch; - -Signed-off-by: Charles Arnold <carnold@suse.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Anthony PERARD <anthony.perard@citrix.com> -master commit: 8eeae8c2b4efefda8e946461e86cf2ae9c18e5a9 -master date: 2022-07-06 13:06:40 +0200 ---- - tools/include/xenctrl.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/tools/include/xenctrl.h b/tools/include/xenctrl.h -index 318920166c5e..2013200b9eff 100644 ---- a/tools/include/xenctrl.h -+++ b/tools/include/xenctrl.h -@@ -2577,7 +2577,7 @@ int xc_psr_cmt_get_l3_event_mask(xc_interface *xch, uint32_t *event_mask); - int xc_psr_cmt_get_l3_cache_size(xc_interface *xch, uint32_t cpu, - uint32_t *l3_cache_size); - int xc_psr_cmt_get_data(xc_interface *xch, uint32_t rmid, uint32_t cpu, -- uint32_t psr_cmt_type, uint64_t *monitor_data, -+ xc_psr_cmt_type type, uint64_t *monitor_data, - uint64_t *tsc); - int xc_psr_cmt_enabled(xc_interface *xch); - 
--- -2.37.4 - diff --git a/0008-build-add-full-to-version.sh-to-guess-XEN_FULLVERSIO.patch b/0008-build-add-full-to-version.sh-to-guess-XEN_FULLVERSIO.patch new file mode 100644 index 0000000..779d805 --- /dev/null +++ b/0008-build-add-full-to-version.sh-to-guess-XEN_FULLVERSIO.patch @@ -0,0 +1,95 @@ +From b461db64b37f9a04ba5a726b0e474e870226f2ac Mon Sep 17 00:00:00 2001 +From: Anthony PERARD <anthony.perard@citrix.com> +Date: Thu, 9 Sep 2021 15:33:06 +0100 +Subject: [PATCH 08/13] build: add --full to version.sh to guess + $(XEN_FULLVERSION) + +Running $(MAKE) like that in a $(shell ) while parsing the Makefile +doesn't work reliably. In some case, make will complain with +"jobserver unavailable: using -j1. Add '+' to parent make rule.". +Also, it isn't possible to distinguish between the output produced by +the target "xenversion" and `make`'s own output. + +Instead of running make, this patch "improve" `version.sh` to try to +guess the output of `make xenversion`. + +In order to have version.sh works in more scenario, it will use +XEN_EXTRAVERSION and XEN_VENDORVERSION from the environment when +present. As for the cases were those two variables are overridden by a +make command line arguments, we export them when invoking version.sh +via a new $(XEN_FULLVERSION) macro. + +That should hopefully get us to having ./version.sh returning the same +value that `make xenversion` would. + +This fix GitLab CI's build job "debian-unstable-gcc-arm64". + +Signed-off-by: Anthony PERARD <anthony.perard@citrix.com> +Reviewed-by: Daniel P. Smith <dpsmith@apertussolutions.com> +Reviewed-by: Ian Jackson <iwj@xenproject.org> +(cherry picked from commit ab4a83023eda9f04ad864877c1956b087ec6fc4f) +--- + tools/Rules.mk | 5 +++++ + tools/flask/policy/Makefile.common | 2 +- + version.sh | 18 +++++++++++++++++- + 3 files changed, 23 insertions(+), 2 deletions(-) + +diff --git a/tools/Rules.mk b/tools/Rules.mk +index 444e5bacdd..051a5d3555 100644 +--- a/tools/Rules.mk ++++ b/tools/Rules.mk +@@ -6,6 +6,11 @@ all: + -include $(XEN_ROOT)/config/Tools.mk + include $(XEN_ROOT)/Config.mk + ++XEN_FULLVERSION=$(shell env \ ++ XEN_EXTRAVERSION=$(XEN_EXTRAVERSION) \ ++ XEN_VENDORVERSION=$(XEN_VENDORVERSION) \ ++ $(SHELL) $(XEN_ROOT)/version.sh --full $(XEN_ROOT)/xen/Makefile) ++ + export _INSTALL := $(INSTALL) + INSTALL = $(XEN_ROOT)/tools/cross-install + +diff --git a/tools/flask/policy/Makefile.common b/tools/flask/policy/Makefile.common +index bea5ba4b6a..e5ed58200e 100644 +--- a/tools/flask/policy/Makefile.common ++++ b/tools/flask/policy/Makefile.common +@@ -35,7 +35,7 @@ OUTPUT_POLICY ?= $(BEST_POLICY_VER) + # + ######################################## + +-POLICY_FILENAME = $(FLASK_BUILD_DIR)/xenpolicy-$(shell $(MAKE) -C $(XEN_ROOT)/xen xenversion --no-print-directory) ++POLICY_FILENAME = $(FLASK_BUILD_DIR)/xenpolicy-$(XEN_FULLVERSION) + POLICY_LOADPATH = /boot + + # List of policy versions supported by the hypervisor +diff --git a/version.sh b/version.sh +index e894ee7e04..c6a5692c19 100755 +--- a/version.sh ++++ b/version.sh +@@ -1,5 +1,21 @@ + #!/bin/sh + ++opt_full=false ++while [ $# -gt 1 ]; do ++ case "$1" in ++ --full) opt_full=true ;; ++ *) break ;; ++ esac ++ shift ++done ++ + MAJOR=`grep "export XEN_VERSION" $1 | sed 's/.*=//g' | tr -s " "` + MINOR=`grep "export XEN_SUBVERSION" $1 | sed 's/.*=//g' | tr -s " "` +-printf "%d.%d" $MAJOR $MINOR ++ ++if $opt_full; then ++ extraversion=$(grep "export XEN_EXTRAVERSION" $1 | sed 's/^.* ?=\s\+//; s/\$([^)]*)//g; s/ //g') ++ : 
${XEN_EXTRAVERSION:=${extraversion}${XEN_VENDORVERSION}} ++else ++ unset XEN_EXTRAVERSION ++fi ++printf "%d.%d%s" $MAJOR $MINOR $XEN_EXTRAVERSION +-- +2.40.0 + diff --git a/0008-x86-spec-ctrl-Honour-spec-ctrl-0-for-unpriv-mmio-sub.patch b/0008-x86-spec-ctrl-Honour-spec-ctrl-0-for-unpriv-mmio-sub.patch deleted file mode 100644 index 18ec7de..0000000 --- a/0008-x86-spec-ctrl-Honour-spec-ctrl-0-for-unpriv-mmio-sub.patch +++ /dev/null @@ -1,32 +0,0 @@ -From 5be1f46f435f8b05608b1eae029cb17d8bd3a560 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 12 Jul 2022 11:25:05 +0200 -Subject: [PATCH 008/126] x86/spec-ctrl: Honour spec-ctrl=0 for unpriv-mmio - sub-option - -This was an oversight from when unpriv-mmio was introduced. - -Fixes: 8c24b70fedcb ("x86/spec-ctrl: Add spec-ctrl=unpriv-mmio") -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: 4cdb519d797c19ebb8fadc5938cdb47479d5a21b -master date: 2022-07-11 15:21:35 +0100 ---- - xen/arch/x86/spec_ctrl.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 8212227ee02a..06790897e496 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -122,6 +122,7 @@ static int __init parse_spec_ctrl(const char *s) - opt_l1d_flush = 0; - opt_branch_harden = false; - opt_srb_lock = 0; -+ opt_unpriv_mmio = false; - } - else if ( val > 0 ) - rc = -EINVAL; --- -2.37.4 - diff --git a/0009-bump-default-SeaBIOS-version-to-1.16.0.patch b/0009-bump-default-SeaBIOS-version-to-1.16.0.patch new file mode 100644 index 0000000..e96cb31 --- /dev/null +++ b/0009-bump-default-SeaBIOS-version-to-1.16.0.patch @@ -0,0 +1,28 @@ +From b11fc96b1e6e474da614b87b6ac97183273c6ebd Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Fri, 6 May 2022 14:46:52 +0200 +Subject: [PATCH 09/13] bump default SeaBIOS version to 1.16.0 + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit 944e389daa133dd310d87c4eebacba9f6da76018) +--- + Config.mk | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/Config.mk b/Config.mk +index 9696e1a151..381bdf17b1 100644 +--- a/Config.mk ++++ b/Config.mk +@@ -247,7 +247,7 @@ OVMF_UPSTREAM_REVISION ?= a3741780fe3535e19e02efa869a7cac481891129 + QEMU_UPSTREAM_REVISION ?= qemu-xen-4.15.4 + MINIOS_UPSTREAM_REVISION ?= xen-RELEASE-4.15.4 + +-SEABIOS_UPSTREAM_REVISION ?= rel-1.14.0 ++SEABIOS_UPSTREAM_REVISION ?= rel-1.16.0 + + ETHERBOOT_NICS ?= rtl8139 8086100e + +-- +2.40.0 + diff --git a/0009-xen-cmdline-Extend-parse_boolean-to-signal-a-name-ma.patch b/0009-xen-cmdline-Extend-parse_boolean-to-signal-a-name-ma.patch deleted file mode 100644 index bfae8e2..0000000 --- a/0009-xen-cmdline-Extend-parse_boolean-to-signal-a-name-ma.patch +++ /dev/null @@ -1,87 +0,0 @@ -From ae417706870333bb52ebcf33c527809cdd2d7265 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 12 Jul 2022 11:25:40 +0200 -Subject: [PATCH 009/126] xen/cmdline: Extend parse_boolean() to signal a name - match - -This will help parsing a sub-option which has boolean and non-boolean options -available. - -First, rework 'int val' into 'bool has_neg_prefix'. This inverts it's value, -but the resulting logic is far easier to follow. - -Second, reject anything of the form 'no-$FOO=' which excludes ambiguous -constructs such as 'no-$foo=yes' which have never been valid. 
- -This just leaves the case where everything is otherwise fine, but parse_bool() -can't interpret the provided string. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: 382326cac528dd1eb0d04efd5c05363c453e29f4 -master date: 2022-07-11 15:21:35 +0100 ---- - xen/common/kernel.c | 20 ++++++++++++++++---- - xen/include/xen/lib.h | 3 ++- - 2 files changed, 18 insertions(+), 5 deletions(-) - -diff --git a/xen/common/kernel.c b/xen/common/kernel.c -index 7a345ae45e1e..daf965266573 100644 ---- a/xen/common/kernel.c -+++ b/xen/common/kernel.c -@@ -272,9 +272,9 @@ int parse_bool(const char *s, const char *e) - int parse_boolean(const char *name, const char *s, const char *e) - { - size_t slen, nlen; -- int val = !!strncmp(s, "no-", 3); -+ bool has_neg_prefix = !strncmp(s, "no-", 3); - -- if ( !val ) -+ if ( has_neg_prefix ) - s += 3; - - slen = e ? ({ ASSERT(e >= s); e - s; }) : strlen(s); -@@ -286,11 +286,23 @@ int parse_boolean(const char *name, const char *s, const char *e) - - /* Exact, unadorned name? Result depends on the 'no-' prefix. */ - if ( slen == nlen ) -- return val; -+ return !has_neg_prefix; -+ -+ /* Inexact match with a 'no-' prefix? Not valid. */ -+ if ( has_neg_prefix ) -+ return -1; - - /* =$SOMETHING? Defer to the regular boolean parsing. */ - if ( s[nlen] == '=' ) -- return parse_bool(&s[nlen + 1], e); -+ { -+ int b = parse_bool(&s[nlen + 1], e); -+ -+ if ( b >= 0 ) -+ return b; -+ -+ /* Not a boolean, but the name matched. Signal specially. */ -+ return -2; -+ } - - /* Unrecognised. Give up. */ - return -1; -diff --git a/xen/include/xen/lib.h b/xen/include/xen/lib.h -index 1198c7c0b207..be7498135170 100644 ---- a/xen/include/xen/lib.h -+++ b/xen/include/xen/lib.h -@@ -80,7 +80,8 @@ int parse_bool(const char *s, const char *e); - /** - * Given a specific name, parses a string of the form: - * [no-]$NAME[=...] -- * returning 0 or 1 for a recognised boolean, or -1 for an error. -+ * returning 0 or 1 for a recognised boolean. Returns -1 for general errors, -+ * and -2 for "not a boolean, but $NAME= matches". - */ - int parse_boolean(const char *name, const char *s, const char *e); - --- -2.37.4 - diff --git a/0010-CI-Drop-automation-configs.patch b/0010-CI-Drop-automation-configs.patch new file mode 100644 index 0000000..ac5a0eb --- /dev/null +++ b/0010-CI-Drop-automation-configs.patch @@ -0,0 +1,87 @@ +From 8455998e32d07e103a44826ac7f721639a4ebc26 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Thu, 29 Dec 2022 15:39:13 +0000 +Subject: [PATCH 10/13] CI: Drop automation/configs/ + +Having 3 extra hypervisor builds on the end of a full build is deeply +confusing to debug if one of them fails, because the .config file presented in +the artefacts is not the one which caused a build failure. Also, the log +tends to be truncated in the UI. + +PV-only is tested as part of PV-Shim in a full build anyway, so doesn't need +repeating. HVM-only and neither appear frequently in randconfig, so drop all +the logic here to simplify things. 
+ +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Michal Orzel <michal.orzel@amd.com> +Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> +(cherry picked from commit 7b20009a812f26e74bdbde2ab96165376b3dad34) +--- + automation/configs/x86/hvm_only_config | 3 --- + automation/configs/x86/no_hvm_pv_config | 3 --- + automation/configs/x86/pv_only_config | 3 --- + automation/scripts/build | 21 --------------------- + 4 files changed, 30 deletions(-) + delete mode 100644 automation/configs/x86/hvm_only_config + delete mode 100644 automation/configs/x86/no_hvm_pv_config + delete mode 100644 automation/configs/x86/pv_only_config + +diff --git a/automation/configs/x86/hvm_only_config b/automation/configs/x86/hvm_only_config +deleted file mode 100644 +index 9efbddd535..0000000000 +--- a/automation/configs/x86/hvm_only_config ++++ /dev/null +@@ -1,3 +0,0 @@ +-CONFIG_HVM=y +-# CONFIG_PV is not set +-# CONFIG_DEBUG is not set +diff --git a/automation/configs/x86/no_hvm_pv_config b/automation/configs/x86/no_hvm_pv_config +deleted file mode 100644 +index 0bf6a8e468..0000000000 +--- a/automation/configs/x86/no_hvm_pv_config ++++ /dev/null +@@ -1,3 +0,0 @@ +-# CONFIG_HVM is not set +-# CONFIG_PV is not set +-# CONFIG_DEBUG is not set +diff --git a/automation/configs/x86/pv_only_config b/automation/configs/x86/pv_only_config +deleted file mode 100644 +index e9d8b4a7c7..0000000000 +--- a/automation/configs/x86/pv_only_config ++++ /dev/null +@@ -1,3 +0,0 @@ +-CONFIG_PV=y +-# CONFIG_HVM is not set +-# CONFIG_DEBUG is not set +diff --git a/automation/scripts/build b/automation/scripts/build +index 1b752edfe6..fd87b02036 100755 +--- a/automation/scripts/build ++++ b/automation/scripts/build +@@ -62,24 +62,3 @@ if [[ "${XEN_TARGET_ARCH}" != "x86_32" ]]; then + cp -r dist binaries/ + fi + fi +- +-if [[ "${hypervisor_only}" == "y" ]]; then +- # If we are build testing a specific Kconfig exit now, there's no point in +- # testing all the possible configs. +- exit 0 +-fi +- +-# Build all the configs we care about +-case ${XEN_TARGET_ARCH} in +- x86_64) arch=x86 ;; +- *) exit 0 ;; +-esac +- +-cfg_dir="automation/configs/${arch}" +-for cfg in `ls ${cfg_dir}`; do +- echo "Building $cfg" +- make -j$(nproc) -C xen clean +- rm -f xen/.config +- make -C xen KBUILD_DEFCONFIG=../../../../${cfg_dir}/${cfg} XEN_CONFIG_EXPERT=y defconfig +- make -j$(nproc) -C xen XEN_CONFIG_EXPERT=y +-done +-- +2.40.0 + diff --git a/0010-x86-spec-ctrl-Add-fine-grained-cmdline-suboptions-fo.patch b/0010-x86-spec-ctrl-Add-fine-grained-cmdline-suboptions-fo.patch deleted file mode 100644 index 621d372..0000000 --- a/0010-x86-spec-ctrl-Add-fine-grained-cmdline-suboptions-fo.patch +++ /dev/null @@ -1,137 +0,0 @@ -From 08bfd4d01185e94fda1be9dd79a981d890a9085e Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 12 Jul 2022 11:26:14 +0200 -Subject: [PATCH 010/126] x86/spec-ctrl: Add fine-grained cmdline suboptions - for primitives - -Support controling the PV/HVM suboption of msr-sc/rsb/md-clear, which -previously wasn't possible. 
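As an aside (not part of the patch series), below is a minimal, self-contained C sketch of the parsing pattern that this change and the parse_boolean() extension above rely on: a helper returns 0/1 for a plain boolean, -1 for no match, and -2 for "name matched but the value is not a boolean", so the caller can fall through to nested pv=/hvm= parsing for tokens such as `rsb=no-hvm`. The helper here is a simplified stand-in for Xen's parse_boolean()/parse_bool(), not the real implementation, and the token handling is deliberately reduced to single NUL-terminated strings.

/*
 * Simplified illustration of the boolean sub-option pattern described in the
 * surrounding patches.  Not Xen code; names and behaviour are assumptions.
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static int parse_bool_str(const char *s)
{
    if (!strcmp(s, "yes") || !strcmp(s, "on") || !strcmp(s, "1") || !strcmp(s, "true"))
        return 1;
    if (!strcmp(s, "no") || !strcmp(s, "off") || !strcmp(s, "0") || !strcmp(s, "false"))
        return 0;
    return -1;
}

/* Returns 1/0 for "[no-]name", -1 for no match or error, -2 for "name=<non-bool>". */
static int parse_boolean(const char *name, const char *s)
{
    size_t nlen = strlen(name);
    bool neg = !strncmp(s, "no-", 3);

    if (neg)
        s += 3;
    if (strncmp(s, name, nlen))
        return -1;
    if (s[nlen] == '\0')
        return !neg;            /* Exact "[no-]name". */
    if (neg)
        return -1;              /* "no-name=..." is ambiguous, reject. */
    if (s[nlen] == '=') {
        int b = parse_bool_str(&s[nlen + 1]);
        return (b >= 0) ? b : -2;   /* -2: name matched, value isn't a boolean. */
    }
    return -1;
}

int main(void)
{
    bool rsb_pv = true, rsb_hvm = true;
    const char *token = "rsb=no-hvm";   /* e.g. from "spec-ctrl=rsb=no-hvm" */
    int val = parse_boolean("rsb", token);

    if (val == 0 || val == 1) {
        rsb_pv = rsb_hvm = val;
    } else if (val == -2) {
        /* Name matched but the value is not a boolean: try the nested form. */
        const char *sub = token + strlen("rsb=");
        if ((val = parse_boolean("pv", sub)) >= 0)
            rsb_pv = val;
        else if ((val = parse_boolean("hvm", sub)) >= 0)
            rsb_hvm = val;
    }

    printf("rsb_pv=%d rsb_hvm=%d\n", rsb_pv, rsb_hvm);
    return 0;
}

With the token above this prints "rsb_pv=1 rsb_hvm=0", which mirrors the intended effect of `spec-ctrl=rsb=no-hvm` as documented in the deleted patch below.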
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: 27357c394ba6e1571a89105b840ce1c6f026485c -master date: 2022-07-11 15:21:35 +0100 ---- - docs/misc/xen-command-line.pandoc | 12 ++++-- - xen/arch/x86/spec_ctrl.c | 66 ++++++++++++++++++++++++++----- - 2 files changed, 66 insertions(+), 12 deletions(-) - -diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc -index 1db3da9ef78e..b06db5f654e5 100644 ---- a/docs/misc/xen-command-line.pandoc -+++ b/docs/misc/xen-command-line.pandoc -@@ -2169,7 +2169,8 @@ not be able to control the state of the mitigation. - By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`). - - ### spec-ctrl (x86) --> `= List of [ <bool>, xen=<bool>, {pv,hvm,msr-sc,rsb,md-clear}=<bool>, -+> `= List of [ <bool>, xen=<bool>, {pv,hvm}=<bool>, -+> {msr-sc,rsb,md-clear}=<bool>|{pv,hvm}=<bool>, - > bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd, - > eager-fpu,l1d-flush,branch-harden,srb-lock, - > unpriv-mmio}=<bool> ]` -@@ -2194,12 +2195,17 @@ in place for guests to use. - - Use of a positive boolean value for either of these options is invalid. - --The booleans `pv=`, `hvm=`, `msr-sc=`, `rsb=` and `md-clear=` offer fine -+The `pv=`, `hvm=`, `msr-sc=`, `rsb=` and `md-clear=` options offer fine - grained control over the primitives by Xen. These impact Xen's ability to --protect itself, and Xen's ability to virtualise support for guests to use. -+protect itself, and/or Xen's ability to virtualise support for guests to use. - - * `pv=` and `hvm=` offer control over all suboptions for PV and HVM guests - respectively. -+* Each other option can be used either as a plain boolean -+ (e.g. `spec-ctrl=rsb` to control both the PV and HVM sub-options), or with -+ `pv=` or `hvm=` subsuboptions (e.g. `spec-ctrl=rsb=no-hvm` to disable HVM -+ RSB only). -+ - * `msr-sc=` offers control over Xen's support for manipulating `MSR_SPEC_CTRL` - on entry and exit. These blocks are necessary to virtualise support for - guests and if disabled, guests will be unable to use IBRS/STIBP/SSBD/etc. 
-diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 06790897e496..225fe08259b3 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -147,20 +147,68 @@ static int __init parse_spec_ctrl(const char *s) - opt_rsb_hvm = val; - opt_md_clear_hvm = val; - } -- else if ( (val = parse_boolean("msr-sc", s, ss)) >= 0 ) -+ else if ( (val = parse_boolean("msr-sc", s, ss)) != -1 ) - { -- opt_msr_sc_pv = val; -- opt_msr_sc_hvm = val; -+ switch ( val ) -+ { -+ case 0: -+ case 1: -+ opt_msr_sc_pv = opt_msr_sc_hvm = val; -+ break; -+ -+ case -2: -+ s += strlen("msr-sc="); -+ if ( (val = parse_boolean("pv", s, ss)) >= 0 ) -+ opt_msr_sc_pv = val; -+ else if ( (val = parse_boolean("hvm", s, ss)) >= 0 ) -+ opt_msr_sc_hvm = val; -+ else -+ default: -+ rc = -EINVAL; -+ break; -+ } - } -- else if ( (val = parse_boolean("rsb", s, ss)) >= 0 ) -+ else if ( (val = parse_boolean("rsb", s, ss)) != -1 ) - { -- opt_rsb_pv = val; -- opt_rsb_hvm = val; -+ switch ( val ) -+ { -+ case 0: -+ case 1: -+ opt_rsb_pv = opt_rsb_hvm = val; -+ break; -+ -+ case -2: -+ s += strlen("rsb="); -+ if ( (val = parse_boolean("pv", s, ss)) >= 0 ) -+ opt_rsb_pv = val; -+ else if ( (val = parse_boolean("hvm", s, ss)) >= 0 ) -+ opt_rsb_hvm = val; -+ else -+ default: -+ rc = -EINVAL; -+ break; -+ } - } -- else if ( (val = parse_boolean("md-clear", s, ss)) >= 0 ) -+ else if ( (val = parse_boolean("md-clear", s, ss)) != -1 ) - { -- opt_md_clear_pv = val; -- opt_md_clear_hvm = val; -+ switch ( val ) -+ { -+ case 0: -+ case 1: -+ opt_md_clear_pv = opt_md_clear_hvm = val; -+ break; -+ -+ case -2: -+ s += strlen("md-clear="); -+ if ( (val = parse_boolean("pv", s, ss)) >= 0 ) -+ opt_md_clear_pv = val; -+ else if ( (val = parse_boolean("hvm", s, ss)) >= 0 ) -+ opt_md_clear_hvm = val; -+ else -+ default: -+ rc = -EINVAL; -+ break; -+ } - } - - /* Xen's speculative sidechannel mitigation settings. */ --- -2.37.4 - diff --git a/0011-automation-Remove-CentOS-7.2-containers-and-builds.patch b/0011-automation-Remove-CentOS-7.2-containers-and-builds.patch new file mode 100644 index 0000000..a37b6cf --- /dev/null +++ b/0011-automation-Remove-CentOS-7.2-containers-and-builds.patch @@ -0,0 +1,144 @@ +From c1367de50a304dcb327890d5c69fd7d2f66f1beb Mon Sep 17 00:00:00 2001 +From: Anthony PERARD <anthony.perard@citrix.com> +Date: Tue, 21 Feb 2023 16:55:36 +0000 +Subject: [PATCH 11/13] automation: Remove CentOS 7.2 containers and builds + +We already have a container which track the latest CentOS 7, no need +for this one as well. + +Also, 7.2 have outdated root certificate which prevent connection to +website which use Let's Encrypt. 
+ +Signed-off-by: Anthony PERARD <anthony.perard@citrix.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +(cherry picked from commit ba512629f76dfddb39ea9133ee51cdd9e392a927) +--- + automation/build/centos/7.2.dockerfile | 51 ------------------------- + automation/build/centos/CentOS-7.2.repo | 35 ----------------- + automation/gitlab-ci/build.yaml | 10 ----- + 3 files changed, 96 deletions(-) + delete mode 100644 automation/build/centos/7.2.dockerfile + delete mode 100644 automation/build/centos/CentOS-7.2.repo + +diff --git a/automation/build/centos/7.2.dockerfile b/automation/build/centos/7.2.dockerfile +deleted file mode 100644 +index af672a0be1..0000000000 +--- a/automation/build/centos/7.2.dockerfile ++++ /dev/null +@@ -1,51 +0,0 @@ +-FROM centos:7.2.1511 +-LABEL maintainer.name="The Xen Project" \ +- maintainer.email="xen-devel@lists.xenproject.org" +- +-# ensure we only get bits from the vault for +-# the version we want +-COPY CentOS-7.2.repo /etc/yum.repos.d/CentOS-Base.repo +- +-# install EPEL for dev86, xz-devel and possibly other packages +-RUN yum -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm && \ +- yum clean all +- +-RUN mkdir /build +-WORKDIR /build +- +-# work around https://github.com/moby/moby/issues/10180 +-# and install Xen depends +-RUN rpm --rebuilddb && \ +- yum -y install \ +- yum-plugin-ovl \ +- gcc \ +- gcc-c++ \ +- ncurses-devel \ +- zlib-devel \ +- openssl-devel \ +- python-devel \ +- libuuid-devel \ +- pkgconfig \ +- gettext \ +- flex \ +- bison \ +- libaio-devel \ +- glib2-devel \ +- yajl-devel \ +- pixman-devel \ +- glibc-devel \ +- # glibc-devel.i686 for Xen < 4.15 +- glibc-devel.i686 \ +- make \ +- binutils \ +- git \ +- wget \ +- acpica-tools \ +- python-markdown \ +- patch \ +- checkpolicy \ +- dev86 \ +- xz-devel \ +- bzip2 \ +- nasm \ +- && yum clean all +diff --git a/automation/build/centos/CentOS-7.2.repo b/automation/build/centos/CentOS-7.2.repo +deleted file mode 100644 +index 4da27faeb5..0000000000 +--- a/automation/build/centos/CentOS-7.2.repo ++++ /dev/null +@@ -1,35 +0,0 @@ +-# CentOS-Base.repo +-# +-# This is a replacement file that pins things to just use CentOS 7.2 +-# from the CentOS Vault. 
+-# +- +-[base] +-name=CentOS-7.2.1511 - Base +-baseurl=http://vault.centos.org/7.2.1511/os/$basearch/ +-gpgcheck=1 +-gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-7 +- +-#released updates +-[updates] +-name=CentOS-7.2.1511 - Updates +-baseurl=http://vault.centos.org/7.2.1511/updates/$basearch/ +-gpgcheck=1 +-gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-7 +- +-#additional packages that may be useful +-[extras] +-name=CentOS-7.2.1511 - Extras +-baseurl=http://vault.centos.org/7.2.1511/extras/$basearch/ +-gpgcheck=1 +-gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-7 +- +-#additional packages that extend functionality of existing packages +-[centosplus] +-name=CentOS-7.2.1511 - Plus +-baseurl=http://vault.centos.org/7.2.1511/centosplus/$basearch/ +-gpgcheck=1 +-gpgcheck=1 +-enabled=0 +-gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-7 +- +diff --git a/automation/gitlab-ci/build.yaml b/automation/gitlab-ci/build.yaml +index 6cca2122aa..35a6643c3d 100644 +--- a/automation/gitlab-ci/build.yaml ++++ b/automation/gitlab-ci/build.yaml +@@ -156,16 +156,6 @@ archlinux-gcc-debug: + variables: + CONTAINER: archlinux:current + +-centos-7-2-gcc: +- extends: .gcc-x86-64-build +- variables: +- CONTAINER: centos:7.2 +- +-centos-7-2-gcc-debug: +- extends: .gcc-x86-64-build-debug +- variables: +- CONTAINER: centos:7.2 +- + centos-7-gcc: + extends: .gcc-x86-64-build + variables: +-- +2.40.0 + diff --git a/0011-tools-helpers-fix-build-of-xen-init-dom0-with-Werror.patch b/0011-tools-helpers-fix-build-of-xen-init-dom0-with-Werror.patch deleted file mode 100644 index 34acad9..0000000 --- a/0011-tools-helpers-fix-build-of-xen-init-dom0-with-Werror.patch +++ /dev/null @@ -1,29 +0,0 @@ -From f241cc48dabeef6cb0b381db62f2562b0a3970eb Mon Sep 17 00:00:00 2001 -From: Anthony PERARD <anthony.perard@citrix.com> -Date: Tue, 12 Jul 2022 11:26:47 +0200 -Subject: [PATCH 011/126] tools/helpers: fix build of xen-init-dom0 with - -Werror - -Missing prototype of asprintf() without _GNU_SOURCE. - -Signed-off-by: Anthony PERARD <anthony.perard@citrix.com> -Reviewed-by: Henry Wang <Henry.Wang@arm.com> -master commit: d693b22733044d68e9974766b5c9e6259c9b1708 -master date: 2022-07-12 08:38:35 +0200 ---- - tools/helpers/xen-init-dom0.c | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/tools/helpers/xen-init-dom0.c b/tools/helpers/xen-init-dom0.c -index c99224a4b607..b4861c9e8041 100644 ---- a/tools/helpers/xen-init-dom0.c -+++ b/tools/helpers/xen-init-dom0.c -@@ -1,3 +1,5 @@ -+#define _GNU_SOURCE -+ - #include <stdlib.h> - #include <stdint.h> - #include <string.h> --- -2.37.4 - diff --git a/0012-automation-Remove-non-debug-x86_32-build-jobs.patch b/0012-automation-Remove-non-debug-x86_32-build-jobs.patch new file mode 100644 index 0000000..5b5aa10 --- /dev/null +++ b/0012-automation-Remove-non-debug-x86_32-build-jobs.patch @@ -0,0 +1,67 @@ +From 7fa798d8615df5e3451fda020d6e1ebc850819c2 Mon Sep 17 00:00:00 2001 +From: Anthony PERARD <anthony.perard@citrix.com> +Date: Fri, 24 Feb 2023 17:29:15 +0000 +Subject: [PATCH 12/13] automation: Remove non-debug x86_32 build jobs + +In the interest of having less jobs, we remove the x86_32 build jobs +that do release build. Debug build is very likely to be enough to find +32bit build issues. 
+ +Signed-off-by: Anthony PERARD <anthony.perard@citrix.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +(cherry picked from commit 7b66792ea7f77fb9e587e1e9c530a7c869eecba1) +--- + automation/gitlab-ci/build.yaml | 20 -------------------- + 1 file changed, 20 deletions(-) + +diff --git a/automation/gitlab-ci/build.yaml b/automation/gitlab-ci/build.yaml +index 35a6643c3d..44f8bc6725 100644 +--- a/automation/gitlab-ci/build.yaml ++++ b/automation/gitlab-ci/build.yaml +@@ -236,21 +236,11 @@ debian-stretch-gcc-debug: + variables: + CONTAINER: debian:stretch + +-debian-stretch-32-clang: +- extends: .clang-x86-32-build +- variables: +- CONTAINER: debian:stretch-i386 +- + debian-stretch-32-clang-debug: + extends: .clang-x86-32-build-debug + variables: + CONTAINER: debian:stretch-i386 + +-debian-stretch-32-gcc: +- extends: .gcc-x86-32-build +- variables: +- CONTAINER: debian:stretch-i386 +- + debian-stretch-32-gcc-debug: + extends: .gcc-x86-32-build-debug + variables: +@@ -288,21 +278,11 @@ debian-unstable-gcc-debug-randconfig: + CONTAINER: debian:unstable + RANDCONFIG: y + +-debian-unstable-32-clang: +- extends: .clang-x86-32-build +- variables: +- CONTAINER: debian:unstable-i386 +- + debian-unstable-32-clang-debug: + extends: .clang-x86-32-build-debug + variables: + CONTAINER: debian:unstable-i386 + +-debian-unstable-32-gcc: +- extends: .gcc-x86-32-build +- variables: +- CONTAINER: debian:unstable-i386 +- + debian-unstable-32-gcc-debug: + extends: .gcc-x86-32-build-debug + variables: +-- +2.40.0 + diff --git a/0012-libxl-check-return-value-of-libxl__xs_directory-in-n.patch b/0012-libxl-check-return-value-of-libxl__xs_directory-in-n.patch deleted file mode 100644 index 1ca34af..0000000 --- a/0012-libxl-check-return-value-of-libxl__xs_directory-in-n.patch +++ /dev/null @@ -1,38 +0,0 @@ -From d470a54087e0fbd813dae4d773ad0b830eeec4a1 Mon Sep 17 00:00:00 2001 -From: Anthony PERARD <anthony.perard@citrix.com> -Date: Tue, 12 Jul 2022 11:26:58 +0200 -Subject: [PATCH 012/126] libxl: check return value of libxl__xs_directory in - name2bdf - -libxl__xs_directory() can potentially return NULL without setting `n`. -As `n` isn't initialised, we need to check libxl__xs_directory() -return value before checking `n`. Otherwise, `n` might be non-zero -with `bdfs` NULL which would lead to a segv. - -Fixes: 57bff091f4 ("libxl: add 'name' field to 'libxl_device_pci' in the IDL...") -Reported-by: "G.R." <firemeteor@users.sourceforge.net> -Signed-off-by: Anthony PERARD <anthony.perard@citrix.com> -Reviewed-by: Juergen Gross <jgross@suse.com> -Tested-by: "G.R." 
<firemeteor@users.sourceforge.net> -master commit: d778089ac70e5b8e3bdea0c85fc8c0b9ed0eaf2f -master date: 2022-07-12 08:38:51 +0200 ---- - tools/libs/light/libxl_pci.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/tools/libs/light/libxl_pci.c b/tools/libs/light/libxl_pci.c -index 92bf86b2bebd..a5f5cdf62b80 100644 ---- a/tools/libs/light/libxl_pci.c -+++ b/tools/libs/light/libxl_pci.c -@@ -859,7 +859,7 @@ static int name2bdf(libxl__gc *gc, libxl_device_pci *pci) - int rc = ERROR_NOTFOUND; - - bdfs = libxl__xs_directory(gc, XBT_NULL, PCI_INFO_PATH, &n); -- if (!n) -+ if (!bdfs || !n) - goto out; - - for (i = 0; i < n; i++) { --- -2.37.4 - diff --git a/0013-CI-Remove-llvm-8-from-the-Debian-Stretch-container.patch b/0013-CI-Remove-llvm-8-from-the-Debian-Stretch-container.patch new file mode 100644 index 0000000..c4c6226 --- /dev/null +++ b/0013-CI-Remove-llvm-8-from-the-Debian-Stretch-container.patch @@ -0,0 +1,103 @@ +From 7963cdbf91d8a8d2f8338171adab3807b20f658a Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Fri, 24 Mar 2023 17:59:56 +0000 +Subject: [PATCH 13/13] CI: Remove llvm-8 from the Debian Stretch container + +For similar reasons to c/s a6b1e2b80fe20. While this container is still +build-able for now, all the other problems with explicitly-versioned compilers +remain. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> +(cherry picked from commit 7a298375721636290a57f31bb0f7c2a5a38956a4) +--- + automation/build/debian/stretch-llvm-8.list | 3 --- + automation/build/debian/stretch.dockerfile | 12 --------- + automation/gitlab-ci/build.yaml | 27 --------------------- + 3 files changed, 42 deletions(-) + delete mode 100644 automation/build/debian/stretch-llvm-8.list + +diff --git a/automation/build/debian/stretch-llvm-8.list b/automation/build/debian/stretch-llvm-8.list +deleted file mode 100644 +index 09fe843fb2..0000000000 +--- a/automation/build/debian/stretch-llvm-8.list ++++ /dev/null +@@ -1,3 +0,0 @@ +-# Strech LLVM 8 repos +-deb http://apt.llvm.org/stretch/ llvm-toolchain-stretch-8 main +-deb-src http://apt.llvm.org/stretch/ llvm-toolchain-stretch-8 main +diff --git a/automation/build/debian/stretch.dockerfile b/automation/build/debian/stretch.dockerfile +index e3bace1f87..09e8a522ea 100644 +--- a/automation/build/debian/stretch.dockerfile ++++ b/automation/build/debian/stretch.dockerfile +@@ -50,15 +50,3 @@ RUN apt-get update && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* /tmp/* /var/tmp/* +- +-RUN wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - +-COPY stretch-llvm-8.list /etc/apt/sources.list.d/ +- +-RUN apt-get update && \ +- apt-get --quiet --yes install \ +- clang-8 \ +- lld-8 \ +- && \ +- apt-get autoremove -y && \ +- apt-get clean && \ +- rm -rf /var/lib/apt/lists* /tmp/* /var/tmp/* +diff --git a/automation/gitlab-ci/build.yaml b/automation/gitlab-ci/build.yaml +index 44f8bc6725..321e167b57 100644 +--- a/automation/gitlab-ci/build.yaml ++++ b/automation/gitlab-ci/build.yaml +@@ -26,13 +26,6 @@ + CXX: clang++ + clang: y + +-.clang-8-tmpl: +- variables: &clang-8 +- CC: clang-8 +- CXX: clang++-8 +- LD: ld.lld-8 +- clang: y +- + .x86-64-build-tmpl: + <<: *build + variables: +@@ -97,16 +90,6 @@ + variables: + <<: *clang + +-.clang-8-x86-64-build: +- extends: .x86-64-build +- variables: +- <<: *clang-8 +- +-.clang-8-x86-64-build-debug: +- extends: .x86-64-build-debug +- variables: +- <<: *clang-8 +- + 
.clang-x86-32-build: + extends: .x86-32-build + variables: +@@ -216,16 +199,6 @@ debian-stretch-clang-debug: + variables: + CONTAINER: debian:stretch + +-debian-stretch-clang-8: +- extends: .clang-8-x86-64-build +- variables: +- CONTAINER: debian:stretch +- +-debian-stretch-clang-8-debug: +- extends: .clang-8-x86-64-build-debug +- variables: +- CONTAINER: debian:stretch +- + debian-stretch-gcc: + extends: .gcc-x86-64-build + variables: +-- +2.40.0 + diff --git a/0013-update-Xen-version-to-4.15.4-pre.patch b/0013-update-Xen-version-to-4.15.4-pre.patch deleted file mode 100644 index 6e8c05b..0000000 --- a/0013-update-Xen-version-to-4.15.4-pre.patch +++ /dev/null @@ -1,25 +0,0 @@ -From 505771bb1dffdf6f763fad18ee49a913b98abfea Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 12 Jul 2022 11:28:33 +0200 -Subject: [PATCH 013/126] update Xen version to 4.15.4-pre - ---- - xen/Makefile | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/xen/Makefile b/xen/Makefile -index e9a88325c467..cd66bb3b1c84 100644 ---- a/xen/Makefile -+++ b/xen/Makefile -@@ -2,7 +2,7 @@ - # All other places this is stored (eg. compile.h) should be autogenerated. - export XEN_VERSION = 4 - export XEN_SUBVERSION = 15 --export XEN_EXTRAVERSION ?= .3$(XEN_VENDORVERSION) -+export XEN_EXTRAVERSION ?= .4-pre$(XEN_VENDORVERSION) - export XEN_FULLVERSION = $(XEN_VERSION).$(XEN_SUBVERSION)$(XEN_EXTRAVERSION) - -include xen-version - --- -2.37.4 - diff --git a/0014-x86-spec-ctrl-Rework-spec_ctrl_flags-context-switchi.patch b/0014-x86-spec-ctrl-Rework-spec_ctrl_flags-context-switchi.patch deleted file mode 100644 index 1c237f2..0000000 --- a/0014-x86-spec-ctrl-Rework-spec_ctrl_flags-context-switchi.patch +++ /dev/null @@ -1,168 +0,0 @@ -From 156ab775769d39b2dfb048ccd34dee7e86ba83a2 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Fri, 1 Jul 2022 15:59:40 +0100 -Subject: [PATCH 014/126] x86/spec-ctrl: Rework spec_ctrl_flags context - switching - -We are shortly going to need to context switch new bits in both the vcpu and -S3 paths. Introduce SCF_IST_MASK and SCF_DOM_MASK, and rework d->arch.verw -into d->arch.spec_ctrl_flags to accommodate. - -No functional change. - -This is part of XSA-407. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 5796912f7279d9348a3166655588d30eae9f72cc) ---- - xen/arch/x86/acpi/power.c | 8 ++++---- - xen/arch/x86/domain.c | 8 ++++---- - xen/arch/x86/spec_ctrl.c | 9 ++++++--- - xen/include/asm-x86/domain.h | 3 +-- - xen/include/asm-x86/spec_ctrl.h | 30 ++++++++++++++++++++++++++++- - xen/include/asm-x86/spec_ctrl_asm.h | 3 --- - 6 files changed, 44 insertions(+), 17 deletions(-) - -diff --git a/xen/arch/x86/acpi/power.c b/xen/arch/x86/acpi/power.c -index 5eaa77f66a28..dd397f713067 100644 ---- a/xen/arch/x86/acpi/power.c -+++ b/xen/arch/x86/acpi/power.c -@@ -248,8 +248,8 @@ static int enter_state(u32 state) - error = 0; - - ci = get_cpu_info(); -- /* Avoid NMI/#MC using MSR_SPEC_CTRL until we've reloaded microcode. */ -- ci->spec_ctrl_flags &= ~SCF_ist_wrmsr; -+ /* Avoid NMI/#MC using unsafe MSRs until we've reloaded microcode. */ -+ ci->spec_ctrl_flags &= ~SCF_IST_MASK; - - ACPI_FLUSH_CPU_CACHE(); - -@@ -292,8 +292,8 @@ static int enter_state(u32 state) - if ( !recheck_cpu_features(0) ) - panic("Missing previously available feature(s)\n"); - -- /* Re-enabled default NMI/#MC use of MSR_SPEC_CTRL. 
*/ -- ci->spec_ctrl_flags |= (default_spec_ctrl_flags & SCF_ist_wrmsr); -+ /* Re-enabled default NMI/#MC use of MSRs now microcode is loaded. */ -+ ci->spec_ctrl_flags |= (default_spec_ctrl_flags & SCF_IST_MASK); - - if ( boot_cpu_has(X86_FEATURE_IBRSB) || boot_cpu_has(X86_FEATURE_IBRS) ) - { -diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c -index 4a61e951facf..79f2c6ab19b8 100644 ---- a/xen/arch/x86/domain.c -+++ b/xen/arch/x86/domain.c -@@ -2069,10 +2069,10 @@ void context_switch(struct vcpu *prev, struct vcpu *next) - } - } - -- /* Update the top-of-stack block with the VERW disposition. */ -- info->spec_ctrl_flags &= ~SCF_verw; -- if ( nextd->arch.verw ) -- info->spec_ctrl_flags |= SCF_verw; -+ /* Update the top-of-stack block with the new spec_ctrl settings. */ -+ info->spec_ctrl_flags = -+ (info->spec_ctrl_flags & ~SCF_DOM_MASK) | -+ (nextd->arch.spec_ctrl_flags & SCF_DOM_MASK); - } - - sched_context_switched(prev, next); -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 225fe08259b3..0fabfbe2a9f4 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -981,9 +981,12 @@ void spec_ctrl_init_domain(struct domain *d) - { - bool pv = is_pv_domain(d); - -- d->arch.verw = -- (pv ? opt_md_clear_pv : opt_md_clear_hvm) || -- (opt_fb_clear_mmio && is_iommu_enabled(d)); -+ bool verw = ((pv ? opt_md_clear_pv : opt_md_clear_hvm) || -+ (opt_fb_clear_mmio && is_iommu_enabled(d))); -+ -+ d->arch.spec_ctrl_flags = -+ (verw ? SCF_verw : 0) | -+ 0; - } - - void __init init_speculation_mitigations(void) -diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h -index d0df7f83aa0c..7d6483f21bb1 100644 ---- a/xen/include/asm-x86/domain.h -+++ b/xen/include/asm-x86/domain.h -@@ -319,8 +319,7 @@ struct arch_domain - uint32_t pci_cf8; - uint8_t cmos_idx; - -- /* Use VERW on return-to-guest for its flushing side effect. */ -- bool verw; -+ uint8_t spec_ctrl_flags; /* See SCF_DOM_MASK */ - - union { - struct pv_domain pv; -diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h -index 12283573cdd5..60d6d2dc9407 100644 ---- a/xen/include/asm-x86/spec_ctrl.h -+++ b/xen/include/asm-x86/spec_ctrl.h -@@ -20,12 +20,40 @@ - #ifndef __X86_SPEC_CTRL_H__ - #define __X86_SPEC_CTRL_H__ - --/* Encoding of cpuinfo.spec_ctrl_flags */ -+/* -+ * Encoding of: -+ * cpuinfo.spec_ctrl_flags -+ * default_spec_ctrl_flags -+ * domain.spec_ctrl_flags -+ * -+ * Live settings are in the top-of-stack block, because they need to be -+ * accessable when XPTI is active. Some settings are fixed from boot, some -+ * context switched per domain, and some inhibited in the S3 path. -+ */ - #define SCF_use_shadow (1 << 0) - #define SCF_ist_wrmsr (1 << 1) - #define SCF_ist_rsb (1 << 2) - #define SCF_verw (1 << 3) - -+/* -+ * The IST paths (NMI/#MC) can interrupt any arbitrary context. Some -+ * functionality requires updated microcode to work. -+ * -+ * On boot, this is easy; we load microcode before figuring out which -+ * speculative protections to apply. However, on the S3 resume path, we must -+ * be able to disable the configured mitigations until microcode is reloaded. -+ * -+ * These are the controls to inhibit on the S3 resume path until microcode has -+ * been reloaded. -+ */ -+#define SCF_IST_MASK (SCF_ist_wrmsr) -+ -+/* -+ * Some speculative protections are per-domain. These settings are merged -+ * into the top-of-stack block in the context switch path. 
-+ */ -+#define SCF_DOM_MASK (SCF_verw) -+ - #ifndef __ASSEMBLY__ - - #include <asm/alternative.h> -diff --git a/xen/include/asm-x86/spec_ctrl_asm.h b/xen/include/asm-x86/spec_ctrl_asm.h -index 5a590bac44aa..66b00d511fc6 100644 ---- a/xen/include/asm-x86/spec_ctrl_asm.h -+++ b/xen/include/asm-x86/spec_ctrl_asm.h -@@ -248,9 +248,6 @@ - - /* - * Use in IST interrupt/exception context. May interrupt Xen or PV context. -- * Fine grain control of SCF_ist_wrmsr is needed for safety in the S3 resume -- * path to avoid using MSR_SPEC_CTRL before the microcode introducing it has -- * been reloaded. - */ - .macro SPEC_CTRL_ENTRY_FROM_INTR_IST - /* --- -2.37.4 - diff --git a/0015-x86-spec-ctrl-Rename-SCF_ist_wrmsr-to-SCF_ist_sc_msr.patch b/0015-x86-spec-ctrl-Rename-SCF_ist_wrmsr-to-SCF_ist_sc_msr.patch deleted file mode 100644 index a9cc63f..0000000 --- a/0015-x86-spec-ctrl-Rename-SCF_ist_wrmsr-to-SCF_ist_sc_msr.patch +++ /dev/null @@ -1,110 +0,0 @@ -From 2cfbca32b9dc3a8d6520549ff468a7f550daf1b1 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 28 Jun 2022 14:36:56 +0100 -Subject: [PATCH 015/126] x86/spec-ctrl: Rename SCF_ist_wrmsr to SCF_ist_sc_msr - -We are about to introduce SCF_ist_ibpb, at which point SCF_ist_wrmsr becomes -ambiguous. - -No functional change. - -This is part of XSA-407. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 76d6a36f645dfdbad8830559d4d52caf36efc75e) ---- - xen/arch/x86/spec_ctrl.c | 6 +++--- - xen/include/asm-x86/spec_ctrl.h | 4 ++-- - xen/include/asm-x86/spec_ctrl_asm.h | 8 ++++---- - 3 files changed, 9 insertions(+), 9 deletions(-) - -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 0fabfbe2a9f4..a6def47061e8 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -1086,7 +1086,7 @@ void __init init_speculation_mitigations(void) - { - if ( opt_msr_sc_pv ) - { -- default_spec_ctrl_flags |= SCF_ist_wrmsr; -+ default_spec_ctrl_flags |= SCF_ist_sc_msr; - setup_force_cpu_cap(X86_FEATURE_SC_MSR_PV); - } - -@@ -1097,7 +1097,7 @@ void __init init_speculation_mitigations(void) - * Xen's value is not restored atomically. An early NMI hitting - * the VMExit path needs to restore Xen's value for safety. - */ -- default_spec_ctrl_flags |= SCF_ist_wrmsr; -+ default_spec_ctrl_flags |= SCF_ist_sc_msr; - setup_force_cpu_cap(X86_FEATURE_SC_MSR_HVM); - } - } -@@ -1110,7 +1110,7 @@ void __init init_speculation_mitigations(void) - * on real hardware matches the availability of MSR_SPEC_CTRL in the - * first place. - * -- * No need for SCF_ist_wrmsr because Xen's value is restored -+ * No need for SCF_ist_sc_msr because Xen's value is restored - * atomically WRT NMIs in the VMExit path. - * - * TODO: Adjust cpu_has_svm_spec_ctrl to be usable earlier on boot. -diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h -index 60d6d2dc9407..6f8b0e09348e 100644 ---- a/xen/include/asm-x86/spec_ctrl.h -+++ b/xen/include/asm-x86/spec_ctrl.h -@@ -31,7 +31,7 @@ - * context switched per domain, and some inhibited in the S3 path. - */ - #define SCF_use_shadow (1 << 0) --#define SCF_ist_wrmsr (1 << 1) -+#define SCF_ist_sc_msr (1 << 1) - #define SCF_ist_rsb (1 << 2) - #define SCF_verw (1 << 3) - -@@ -46,7 +46,7 @@ - * These are the controls to inhibit on the S3 resume path until microcode has - * been reloaded. 
- */ --#define SCF_IST_MASK (SCF_ist_wrmsr) -+#define SCF_IST_MASK (SCF_ist_sc_msr) - - /* - * Some speculative protections are per-domain. These settings are merged -diff --git a/xen/include/asm-x86/spec_ctrl_asm.h b/xen/include/asm-x86/spec_ctrl_asm.h -index 66b00d511fc6..0ff1b118f882 100644 ---- a/xen/include/asm-x86/spec_ctrl_asm.h -+++ b/xen/include/asm-x86/spec_ctrl_asm.h -@@ -266,8 +266,8 @@ - - .L\@_skip_rsb: - -- test $SCF_ist_wrmsr, %al -- jz .L\@_skip_wrmsr -+ test $SCF_ist_sc_msr, %al -+ jz .L\@_skip_msr_spec_ctrl - - xor %edx, %edx - testb $3, UREGS_cs(%rsp) -@@ -290,7 +290,7 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): - * to speculate around the WRMSR. As a result, we need a dispatch - * serialising instruction in the else clause. - */ --.L\@_skip_wrmsr: -+.L\@_skip_msr_spec_ctrl: - lfence - UNLIKELY_END(\@_serialise) - .endm -@@ -301,7 +301,7 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): - * Requires %rbx=stack_end - * Clobbers %rax, %rcx, %rdx - */ -- testb $SCF_ist_wrmsr, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%rbx) -+ testb $SCF_ist_sc_msr, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%rbx) - jz .L\@_skip - - DO_SPEC_CTRL_EXIT_TO_XEN --- -2.37.4 - diff --git a/0016-x86-spec-ctrl-Rename-opt_ibpb-to-opt_ibpb_ctxt_switc.patch b/0016-x86-spec-ctrl-Rename-opt_ibpb-to-opt_ibpb_ctxt_switc.patch deleted file mode 100644 index cfe270c..0000000 --- a/0016-x86-spec-ctrl-Rename-opt_ibpb-to-opt_ibpb_ctxt_switc.patch +++ /dev/null @@ -1,98 +0,0 @@ -From c707015bf118df2c43e3a48b3774916322fca50a Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Mon, 4 Jul 2022 21:32:17 +0100 -Subject: [PATCH 016/126] x86/spec-ctrl: Rename opt_ibpb to - opt_ibpb_ctxt_switch - -We are about to introduce the use of IBPB at different points in Xen, making -opt_ibpb ambiguous. Rename it to opt_ibpb_ctxt_switch. - -No functional change. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit a8e5ef079d6f5c88c472e3e620db5a8d1402a50d) ---- - xen/arch/x86/domain.c | 2 +- - xen/arch/x86/spec_ctrl.c | 10 +++++----- - xen/include/asm-x86/spec_ctrl.h | 2 +- - 3 files changed, 7 insertions(+), 7 deletions(-) - -diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c -index 79f2c6ab19b8..2838f976d729 100644 ---- a/xen/arch/x86/domain.c -+++ b/xen/arch/x86/domain.c -@@ -2041,7 +2041,7 @@ void context_switch(struct vcpu *prev, struct vcpu *next) - - ctxt_switch_levelling(next); - -- if ( opt_ibpb && !is_idle_domain(nextd) ) -+ if ( opt_ibpb_ctxt_switch && !is_idle_domain(nextd) ) - { - static DEFINE_PER_CPU(unsigned int, last); - unsigned int *last_id = &this_cpu(last); -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index a6def47061e8..ced0f8c2aea6 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -54,7 +54,7 @@ int8_t __initdata opt_stibp = -1; - bool __read_mostly opt_ssbd; - int8_t __initdata opt_psfd = -1; - --bool __read_mostly opt_ibpb = true; -+bool __read_mostly opt_ibpb_ctxt_switch = true; - int8_t __read_mostly opt_eager_fpu = -1; - int8_t __read_mostly opt_l1d_flush = -1; - bool __read_mostly opt_branch_harden = true; -@@ -117,7 +117,7 @@ static int __init parse_spec_ctrl(const char *s) - - opt_thunk = THUNK_JMP; - opt_ibrs = 0; -- opt_ibpb = false; -+ opt_ibpb_ctxt_switch = false; - opt_ssbd = false; - opt_l1d_flush = 0; - opt_branch_harden = false; -@@ -238,7 +238,7 @@ static int __init parse_spec_ctrl(const char *s) - - /* Misc settings. 
*/ - else if ( (val = parse_boolean("ibpb", s, ss)) >= 0 ) -- opt_ibpb = val; -+ opt_ibpb_ctxt_switch = val; - else if ( (val = parse_boolean("eager-fpu", s, ss)) >= 0 ) - opt_eager_fpu = val; - else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 ) -@@ -458,7 +458,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - (opt_tsx & 1) ? " TSX+" : " TSX-", - !cpu_has_srbds_ctrl ? "" : - opt_srb_lock ? " SRB_LOCK+" : " SRB_LOCK-", -- opt_ibpb ? " IBPB" : "", -+ opt_ibpb_ctxt_switch ? " IBPB-ctxt" : "", - opt_l1d_flush ? " L1D_FLUSH" : "", - opt_md_clear_pv || opt_md_clear_hvm || - opt_fb_clear_mmio ? " VERW" : "", -@@ -1193,7 +1193,7 @@ void __init init_speculation_mitigations(void) - - /* Check we have hardware IBPB support before using it... */ - if ( !boot_cpu_has(X86_FEATURE_IBRSB) && !boot_cpu_has(X86_FEATURE_IBPB) ) -- opt_ibpb = false; -+ opt_ibpb_ctxt_switch = false; - - /* Check whether Eager FPU should be enabled by default. */ - if ( opt_eager_fpu == -1 ) -diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h -index 6f8b0e09348e..fd8162ca9ab9 100644 ---- a/xen/include/asm-x86/spec_ctrl.h -+++ b/xen/include/asm-x86/spec_ctrl.h -@@ -63,7 +63,7 @@ - void init_speculation_mitigations(void); - void spec_ctrl_init_domain(struct domain *d); - --extern bool opt_ibpb; -+extern bool opt_ibpb_ctxt_switch; - extern bool opt_ssbd; - extern int8_t opt_eager_fpu; - extern int8_t opt_l1d_flush; --- -2.37.4 - diff --git a/0017-x86-spec-ctrl-Rework-SPEC_CTRL_ENTRY_FROM_INTR_IST.patch b/0017-x86-spec-ctrl-Rework-SPEC_CTRL_ENTRY_FROM_INTR_IST.patch deleted file mode 100644 index 5a6bfa5..0000000 --- a/0017-x86-spec-ctrl-Rework-SPEC_CTRL_ENTRY_FROM_INTR_IST.patch +++ /dev/null @@ -1,106 +0,0 @@ -From d7f5fb1e2abd0d56cada9bfcf96ab530d214d9aa Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Fri, 1 Jul 2022 15:59:40 +0100 -Subject: [PATCH 017/126] x86/spec-ctrl: Rework SPEC_CTRL_ENTRY_FROM_INTR_IST - -We are shortly going to add a conditional IBPB in this path. - -Therefore, we cannot hold spec_ctrl_flags in %eax, and rely on only clobbering -it after we're done with its contents. %rbx is available for use, and the -more normal register to hold preserved information in. - -With %rax freed up, use it instead of %rdx for the RSB tmp register, and for -the adjustment to spec_ctrl_flags. - -This leaves no use of %rdx, except as 0 for the upper half of WRMSR. In -practice, %rdx is 0 from SAVE_ALL on all paths and isn't likely to change in -the foreseeable future, so update the macro entry requirements to state this -dependency. This marginal optimisation can be revisited if circumstances -change. - -No practical change. - -This is part of XSA-407. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit e9b8d31981f184c6539f91ec54bd9cae29cdae36) ---- - xen/arch/x86/x86_64/entry.S | 4 ++-- - xen/include/asm-x86/spec_ctrl_asm.h | 21 ++++++++++----------- - 2 files changed, 12 insertions(+), 13 deletions(-) - -diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S -index 2f3f48ff27c3..9bfc5964a911 100644 ---- a/xen/arch/x86/x86_64/entry.S -+++ b/xen/arch/x86/x86_64/entry.S -@@ -874,7 +874,7 @@ ENTRY(double_fault) - - GET_STACK_END(14) - -- SPEC_CTRL_ENTRY_FROM_INTR_IST /* Req: %rsp=regs, %r14=end, Clob: acd */ -+ SPEC_CTRL_ENTRY_FROM_INTR_IST /* Req: %rsp=regs, %r14=end, %rdx=0, Clob: abcd */ - /* WARNING! 
`ret`, `call *`, `jmp *` not safe before this point. */ - - mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rbx -@@ -910,7 +910,7 @@ handle_ist_exception: - - GET_STACK_END(14) - -- SPEC_CTRL_ENTRY_FROM_INTR_IST /* Req: %rsp=regs, %r14=end, Clob: acd */ -+ SPEC_CTRL_ENTRY_FROM_INTR_IST /* Req: %rsp=regs, %r14=end, %rdx=0, Clob: abcd */ - /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ - - mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx -diff --git a/xen/include/asm-x86/spec_ctrl_asm.h b/xen/include/asm-x86/spec_ctrl_asm.h -index 0ff1b118f882..15e24cde00d1 100644 ---- a/xen/include/asm-x86/spec_ctrl_asm.h -+++ b/xen/include/asm-x86/spec_ctrl_asm.h -@@ -251,34 +251,33 @@ - */ - .macro SPEC_CTRL_ENTRY_FROM_INTR_IST - /* -- * Requires %rsp=regs, %r14=stack_end -- * Clobbers %rax, %rcx, %rdx -+ * Requires %rsp=regs, %r14=stack_end, %rdx=0 -+ * Clobbers %rax, %rbx, %rcx, %rdx - * - * This is logical merge of DO_OVERWRITE_RSB and DO_SPEC_CTRL_ENTRY - * maybexen=1, but with conditionals rather than alternatives. - */ -- movzbl STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14), %eax -+ movzbl STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14), %ebx - -- test $SCF_ist_rsb, %al -+ test $SCF_ist_rsb, %bl - jz .L\@_skip_rsb - -- DO_OVERWRITE_RSB tmp=rdx /* Clobbers %rcx/%rdx */ -+ DO_OVERWRITE_RSB /* Clobbers %rax/%rcx */ - - .L\@_skip_rsb: - -- test $SCF_ist_sc_msr, %al -+ test $SCF_ist_sc_msr, %bl - jz .L\@_skip_msr_spec_ctrl - -- xor %edx, %edx -+ xor %eax, %eax - testb $3, UREGS_cs(%rsp) -- setnz %dl -- not %edx -- and %dl, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14) -+ setnz %al -+ not %eax -+ and %al, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14) - - /* Load Xen's intended value. */ - mov $MSR_SPEC_CTRL, %ecx - movzbl STACK_CPUINFO_FIELD(xen_spec_ctrl)(%r14), %eax -- xor %edx, %edx - wrmsr - - /* Opencoded UNLIKELY_START() with no condition. */ --- -2.37.4 - diff --git a/0018-x86-spec-ctrl-Support-IBPB-on-entry.patch b/0018-x86-spec-ctrl-Support-IBPB-on-entry.patch deleted file mode 100644 index 43b2d76..0000000 --- a/0018-x86-spec-ctrl-Support-IBPB-on-entry.patch +++ /dev/null @@ -1,300 +0,0 @@ -From f0d78e0c11d3984c74f34a7325f862dee93a5835 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Thu, 24 Feb 2022 13:44:33 +0000 -Subject: [PATCH 018/126] x86/spec-ctrl: Support IBPB-on-entry - -We are going to need this to mitigate Branch Type Confusion on AMD/Hygon CPUs, -but as we've talked about using it in other cases too, arrange to support it -generally. However, this is also very expensive in some cases, so we're going -to want per-domain controls. - -Introduce SCF_ist_ibpb and SCF_entry_ibpb controls, adding them to the IST and -DOM masks as appropriate. Also introduce X86_FEATURE_IBPB_ENTRY_{PV,HVM} to -to patch the code blocks. - -For SVM, the STGI is serialising enough to protect against Spectre-v1 attacks, -so no "else lfence" is necessary. VT-x will use use the MSR host load list, -so doesn't need any code in the VMExit path. - -For the IST path, we can't safely check CPL==0 to skip a flush, as we might -have hit an entry path before it's IBPB. As IST hitting Xen is rare, flush -irrespective of CPL. A later path, SCF_ist_sc_msr, provides Spectre-v1 -safety. - -For the PV paths, we know we're interrupting CPL>0, while for the INTR paths, -we can safely check CPL==0. Only flush when interrupting guest context. 
- -An "else lfence" is needed for safety, but we want to be able to skip it on -unaffected CPUs, so the block wants to be an alternative, which means the -lfence has to be inline rather than UNLIKELY() (the replacement block doesn't -have displacements fixed up for anything other than the first instruction). - -As with SPEC_CTRL_ENTRY_FROM_INTR_IST, %rdx is 0 on entry so rely on this to -shrink the logic marginally. Update the comments to specify this new -dependency. - -This is part of XSA-407. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 53a570b285694947776d5190f591a0d5b9b18de7) ---- - xen/arch/x86/hvm/svm/entry.S | 18 ++++++++++- - xen/arch/x86/hvm/vmx/vmcs.c | 4 +++ - xen/arch/x86/x86_64/compat/entry.S | 4 +-- - xen/arch/x86/x86_64/entry.S | 10 +++--- - xen/include/asm-x86/cpufeatures.h | 2 ++ - xen/include/asm-x86/spec_ctrl.h | 6 ++-- - xen/include/asm-x86/spec_ctrl_asm.h | 49 +++++++++++++++++++++++++++-- - 7 files changed, 81 insertions(+), 12 deletions(-) - -diff --git a/xen/arch/x86/hvm/svm/entry.S b/xen/arch/x86/hvm/svm/entry.S -index 4ae55a2ef605..0ff4008060fa 100644 ---- a/xen/arch/x86/hvm/svm/entry.S -+++ b/xen/arch/x86/hvm/svm/entry.S -@@ -97,7 +97,19 @@ __UNLIKELY_END(nsvm_hap) - - GET_CURRENT(bx) - -- /* SPEC_CTRL_ENTRY_FROM_SVM Req: %rsp=regs/cpuinfo Clob: acd */ -+ /* SPEC_CTRL_ENTRY_FROM_SVM Req: %rsp=regs/cpuinfo, %rdx=0 Clob: acd */ -+ -+ .macro svm_vmexit_cond_ibpb -+ testb $SCF_entry_ibpb, CPUINFO_xen_spec_ctrl(%rsp) -+ jz .L_skip_ibpb -+ -+ mov $MSR_PRED_CMD, %ecx -+ mov $PRED_CMD_IBPB, %eax -+ wrmsr -+.L_skip_ibpb: -+ .endm -+ ALTERNATIVE "", svm_vmexit_cond_ibpb, X86_FEATURE_IBPB_ENTRY_HVM -+ - ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_HVM - - .macro svm_vmexit_spec_ctrl -@@ -114,6 +126,10 @@ __UNLIKELY_END(nsvm_hap) - ALTERNATIVE "", svm_vmexit_spec_ctrl, X86_FEATURE_SC_MSR_HVM - /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ - -+ /* -+ * STGI is executed unconditionally, and is sufficiently serialising -+ * to safely resolve any Spectre-v1 concerns in the above logic. -+ */ - stgi - GLOBAL(svm_stgi_label) - mov %rsp,%rdi -diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c -index f9f9bc18cdbc..dd817cee4e69 100644 ---- a/xen/arch/x86/hvm/vmx/vmcs.c -+++ b/xen/arch/x86/hvm/vmx/vmcs.c -@@ -1345,6 +1345,10 @@ static int construct_vmcs(struct vcpu *v) - rc = vmx_add_msr(v, MSR_FLUSH_CMD, FLUSH_CMD_L1D, - VMX_MSR_GUEST_LOADONLY); - -+ if ( !rc && (d->arch.spec_ctrl_flags & SCF_entry_ibpb) ) -+ rc = vmx_add_msr(v, MSR_PRED_CMD, PRED_CMD_IBPB, -+ VMX_MSR_HOST); -+ - out: - vmx_vmcs_exit(v); - -diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S -index 0cfe95314249..5c999271e617 100644 ---- a/xen/arch/x86/x86_64/compat/entry.S -+++ b/xen/arch/x86/x86_64/compat/entry.S -@@ -20,7 +20,7 @@ ENTRY(entry_int82) - movl $HYPERCALL_VECTOR, 4(%rsp) - SAVE_ALL compat=1 /* DPL1 gate, restricted to 32bit PV guests only. */ - -- SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, Clob: acd */ -+ SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ - /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ - - CR4_PV32_RESTORE -@@ -216,7 +216,7 @@ ENTRY(cstar_enter) - movl $TRAP_syscall, 4(%rsp) - SAVE_ALL - -- SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, Clob: acd */ -+ SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ - /* WARNING! 
`ret`, `call *`, `jmp *` not safe before this point. */ - - GET_STACK_END(bx) -diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S -index 9bfc5964a911..3c8593325606 100644 ---- a/xen/arch/x86/x86_64/entry.S -+++ b/xen/arch/x86/x86_64/entry.S -@@ -260,7 +260,7 @@ ENTRY(lstar_enter) - movl $TRAP_syscall, 4(%rsp) - SAVE_ALL - -- SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, Clob: acd */ -+ SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ - /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ - - GET_STACK_END(bx) -@@ -299,7 +299,7 @@ GLOBAL(sysenter_eflags_saved) - movl $TRAP_syscall, 4(%rsp) - SAVE_ALL - -- SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, Clob: acd */ -+ SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ - /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ - - GET_STACK_END(bx) -@@ -351,7 +351,7 @@ ENTRY(int80_direct_trap) - movl $0x80, 4(%rsp) - SAVE_ALL - -- SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, Clob: acd */ -+ SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ - /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ - - GET_STACK_END(bx) -@@ -618,7 +618,7 @@ ENTRY(common_interrupt) - - GET_STACK_END(14) - -- SPEC_CTRL_ENTRY_FROM_INTR /* Req: %rsp=regs, %r14=end, Clob: acd */ -+ SPEC_CTRL_ENTRY_FROM_INTR /* Req: %rsp=regs, %r14=end, %rdx=0, Clob: acd */ - /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ - - mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx -@@ -652,7 +652,7 @@ GLOBAL(handle_exception) - - GET_STACK_END(14) - -- SPEC_CTRL_ENTRY_FROM_INTR /* Req: %rsp=regs, %r14=end, Clob: acd */ -+ SPEC_CTRL_ENTRY_FROM_INTR /* Req: %rsp=regs, %r14=end, %rdx=0, Clob: acd */ - /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ - - mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx -diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h -index f7488d3ccbfa..b233e5835fb5 100644 ---- a/xen/include/asm-x86/cpufeatures.h -+++ b/xen/include/asm-x86/cpufeatures.h -@@ -39,6 +39,8 @@ XEN_CPUFEATURE(XEN_LBR, X86_SYNTH(22)) /* Xen uses MSR_DEBUGCTL.LBR */ - XEN_CPUFEATURE(SC_VERW_IDLE, X86_SYNTH(25)) /* VERW used by Xen for idle */ - XEN_CPUFEATURE(XEN_SHSTK, X86_SYNTH(26)) /* Xen uses CET Shadow Stacks */ - XEN_CPUFEATURE(XEN_IBT, X86_SYNTH(27)) /* Xen uses CET Indirect Branch Tracking */ -+XEN_CPUFEATURE(IBPB_ENTRY_PV, X86_SYNTH(28)) /* MSR_PRED_CMD used by Xen for PV */ -+XEN_CPUFEATURE(IBPB_ENTRY_HVM, X86_SYNTH(29)) /* MSR_PRED_CMD used by Xen for HVM */ - - /* Bug words follow the synthetic words. */ - #define X86_NR_BUG 1 -diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h -index fd8162ca9ab9..10cd0cd2518f 100644 ---- a/xen/include/asm-x86/spec_ctrl.h -+++ b/xen/include/asm-x86/spec_ctrl.h -@@ -34,6 +34,8 @@ - #define SCF_ist_sc_msr (1 << 1) - #define SCF_ist_rsb (1 << 2) - #define SCF_verw (1 << 3) -+#define SCF_ist_ibpb (1 << 4) -+#define SCF_entry_ibpb (1 << 5) - - /* - * The IST paths (NMI/#MC) can interrupt any arbitrary context. Some -@@ -46,13 +48,13 @@ - * These are the controls to inhibit on the S3 resume path until microcode has - * been reloaded. - */ --#define SCF_IST_MASK (SCF_ist_sc_msr) -+#define SCF_IST_MASK (SCF_ist_sc_msr | SCF_ist_ibpb) - - /* - * Some speculative protections are per-domain. These settings are merged - * into the top-of-stack block in the context switch path. 
- */ --#define SCF_DOM_MASK (SCF_verw) -+#define SCF_DOM_MASK (SCF_verw | SCF_entry_ibpb) - - #ifndef __ASSEMBLY__ - -diff --git a/xen/include/asm-x86/spec_ctrl_asm.h b/xen/include/asm-x86/spec_ctrl_asm.h -index 15e24cde00d1..9eb4ad9ab71d 100644 ---- a/xen/include/asm-x86/spec_ctrl_asm.h -+++ b/xen/include/asm-x86/spec_ctrl_asm.h -@@ -88,6 +88,35 @@ - * - SPEC_CTRL_EXIT_TO_{SVM,VMX} - */ - -+.macro DO_SPEC_CTRL_COND_IBPB maybexen:req -+/* -+ * Requires %rsp=regs (also cpuinfo if !maybexen) -+ * Requires %r14=stack_end (if maybexen), %rdx=0 -+ * Clobbers %rax, %rcx, %rdx -+ * -+ * Conditionally issue IBPB if SCF_entry_ibpb is active. In the maybexen -+ * case, we can safely look at UREGS_cs to skip taking the hit when -+ * interrupting Xen. -+ */ -+ .if \maybexen -+ testb $SCF_entry_ibpb, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14) -+ jz .L\@_skip -+ testb $3, UREGS_cs(%rsp) -+ .else -+ testb $SCF_entry_ibpb, CPUINFO_xen_spec_ctrl(%rsp) -+ .endif -+ jz .L\@_skip -+ -+ mov $MSR_PRED_CMD, %ecx -+ mov $PRED_CMD_IBPB, %eax -+ wrmsr -+ jmp .L\@_done -+ -+.L\@_skip: -+ lfence -+.L\@_done: -+.endm -+ - .macro DO_OVERWRITE_RSB tmp=rax - /* - * Requires nothing -@@ -225,12 +254,16 @@ - - /* Use after an entry from PV context (syscall/sysenter/int80/int82/etc). */ - #define SPEC_CTRL_ENTRY_FROM_PV \ -+ ALTERNATIVE "", __stringify(DO_SPEC_CTRL_COND_IBPB maybexen=0), \ -+ X86_FEATURE_IBPB_ENTRY_PV; \ - ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_PV; \ - ALTERNATIVE "", __stringify(DO_SPEC_CTRL_ENTRY maybexen=0), \ - X86_FEATURE_SC_MSR_PV - - /* Use in interrupt/exception context. May interrupt Xen or PV context. */ - #define SPEC_CTRL_ENTRY_FROM_INTR \ -+ ALTERNATIVE "", __stringify(DO_SPEC_CTRL_COND_IBPB maybexen=1), \ -+ X86_FEATURE_IBPB_ENTRY_PV; \ - ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_PV; \ - ALTERNATIVE "", __stringify(DO_SPEC_CTRL_ENTRY maybexen=1), \ - X86_FEATURE_SC_MSR_PV -@@ -254,11 +287,23 @@ - * Requires %rsp=regs, %r14=stack_end, %rdx=0 - * Clobbers %rax, %rbx, %rcx, %rdx - * -- * This is logical merge of DO_OVERWRITE_RSB and DO_SPEC_CTRL_ENTRY -- * maybexen=1, but with conditionals rather than alternatives. -+ * This is logical merge of: -+ * DO_SPEC_CTRL_COND_IBPB maybexen=0 -+ * DO_OVERWRITE_RSB -+ * DO_SPEC_CTRL_ENTRY maybexen=1 -+ * but with conditionals rather than alternatives. - */ - movzbl STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14), %ebx - -+ test $SCF_ist_ibpb, %bl -+ jz .L\@_skip_ibpb -+ -+ mov $MSR_PRED_CMD, %ecx -+ mov $PRED_CMD_IBPB, %eax -+ wrmsr -+ -+.L\@_skip_ibpb: -+ - test $SCF_ist_rsb, %bl - jz .L\@_skip_rsb - --- -2.37.4 - diff --git a/0019-x86-cpuid-Enumeration-for-BTC_NO.patch b/0019-x86-cpuid-Enumeration-for-BTC_NO.patch deleted file mode 100644 index 626bfd8..0000000 --- a/0019-x86-cpuid-Enumeration-for-BTC_NO.patch +++ /dev/null @@ -1,106 +0,0 @@ -From 2b29ac476fa0c91655906fac3512202e514ecbed Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Mon, 16 May 2022 15:48:24 +0100 -Subject: [PATCH 019/126] x86/cpuid: Enumeration for BTC_NO - -BTC_NO indicates that hardware is not succeptable to Branch Type Confusion. - -Zen3 CPUs don't suffer BTC. - -This is part of XSA-407. 
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 76cb04ad64f3ab9ae785988c40655a71dde9c319) ---- - tools/libs/light/libxl_cpuid.c | 1 + - tools/misc/xen-cpuid.c | 2 +- - xen/arch/x86/cpu/amd.c | 10 ++++++++++ - xen/arch/x86/spec_ctrl.c | 5 +++-- - xen/include/public/arch-x86/cpufeatureset.h | 1 + - 5 files changed, 16 insertions(+), 3 deletions(-) - -diff --git a/tools/libs/light/libxl_cpuid.c b/tools/libs/light/libxl_cpuid.c -index 9a4eb8015a43..2632efc6adb0 100644 ---- a/tools/libs/light/libxl_cpuid.c -+++ b/tools/libs/light/libxl_cpuid.c -@@ -283,6 +283,7 @@ int libxl_cpuid_parse_config(libxl_cpuid_policy_list *cpuid, const char* str) - {"virt-ssbd", 0x80000008, NA, CPUID_REG_EBX, 25, 1}, - {"ssb-no", 0x80000008, NA, CPUID_REG_EBX, 26, 1}, - {"psfd", 0x80000008, NA, CPUID_REG_EBX, 28, 1}, -+ {"btc-no", 0x80000008, NA, CPUID_REG_EBX, 29, 1}, - - {"nc", 0x80000008, NA, CPUID_REG_ECX, 0, 8}, - {"apicidsize", 0x80000008, NA, CPUID_REG_ECX, 12, 4}, -diff --git a/tools/misc/xen-cpuid.c b/tools/misc/xen-cpuid.c -index 12111fe12d16..e83bc4793d6e 100644 ---- a/tools/misc/xen-cpuid.c -+++ b/tools/misc/xen-cpuid.c -@@ -157,7 +157,7 @@ static const char *const str_e8b[32] = - /* [22] */ [23] = "ppin", - [24] = "amd-ssbd", [25] = "virt-ssbd", - [26] = "ssb-no", -- [28] = "psfd", -+ [28] = "psfd", [29] = "btc-no", - }; - - static const char *const str_7d0[32] = -diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c -index 986672a072b7..675b877f193c 100644 ---- a/xen/arch/x86/cpu/amd.c -+++ b/xen/arch/x86/cpu/amd.c -@@ -822,6 +822,16 @@ static void init_amd(struct cpuinfo_x86 *c) - warning_add(text); - } - break; -+ -+ case 0x19: -+ /* -+ * Zen3 (Fam19h model < 0x10) parts are not susceptible to -+ * Branch Type Confusion, but predate the allocation of the -+ * BTC_NO bit. Fill it back in if we're not virtualised. -+ */ -+ if (!cpu_has_hypervisor && !cpu_has(c, X86_FEATURE_BTC_NO)) -+ __set_bit(X86_FEATURE_BTC_NO, c->x86_capability); -+ break; - } - - display_cacheinfo(c); -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index ced0f8c2aea6..9f66c715516c 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -388,7 +388,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - * Hardware read-only information, stating immunity to certain issues, or - * suggestions of which mitigation to use. - */ -- printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", -+ printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", - (caps & ARCH_CAPS_RDCL_NO) ? " RDCL_NO" : "", - (caps & ARCH_CAPS_IBRS_ALL) ? " IBRS_ALL" : "", - (caps & ARCH_CAPS_RSBA) ? " RSBA" : "", -@@ -403,7 +403,8 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - (e8b & cpufeat_mask(X86_FEATURE_IBRS_ALWAYS)) ? " IBRS_ALWAYS" : "", - (e8b & cpufeat_mask(X86_FEATURE_STIBP_ALWAYS)) ? " STIBP_ALWAYS" : "", - (e8b & cpufeat_mask(X86_FEATURE_IBRS_FAST)) ? " IBRS_FAST" : "", -- (e8b & cpufeat_mask(X86_FEATURE_IBRS_SAME_MODE)) ? " IBRS_SAME_MODE" : ""); -+ (e8b & cpufeat_mask(X86_FEATURE_IBRS_SAME_MODE)) ? " IBRS_SAME_MODE" : "", -+ (e8b & cpufeat_mask(X86_FEATURE_BTC_NO)) ? " BTC_NO" : ""); - - /* Hardware features which need driving to mitigate issues. 
*/ - printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s\n", -diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h -index 9686c82ed75c..1bbc7da4b53c 100644 ---- a/xen/include/public/arch-x86/cpufeatureset.h -+++ b/xen/include/public/arch-x86/cpufeatureset.h -@@ -265,6 +265,7 @@ XEN_CPUFEATURE(AMD_SSBD, 8*32+24) /*S MSR_SPEC_CTRL.SSBD available */ - XEN_CPUFEATURE(VIRT_SSBD, 8*32+25) /* MSR_VIRT_SPEC_CTRL.SSBD */ - XEN_CPUFEATURE(SSB_NO, 8*32+26) /*A Hardware not vulnerable to SSB */ - XEN_CPUFEATURE(PSFD, 8*32+28) /*S MSR_SPEC_CTRL.PSFD */ -+XEN_CPUFEATURE(BTC_NO, 8*32+29) /*A Hardware not vulnerable to Branch Type Confusion */ - - /* Intel-defined CPU features, CPUID level 0x00000007:0.edx, word 9 */ - XEN_CPUFEATURE(AVX512_4VNNIW, 9*32+ 2) /*A AVX512 Neural Network Instructions */ --- -2.37.4 - diff --git a/0020-x86-spec-ctrl-Enable-Zen2-chickenbit.patch b/0020-x86-spec-ctrl-Enable-Zen2-chickenbit.patch deleted file mode 100644 index 933660d..0000000 --- a/0020-x86-spec-ctrl-Enable-Zen2-chickenbit.patch +++ /dev/null @@ -1,105 +0,0 @@ -From 409976bed91f61fb7b053d536d2fc87cf3ad7018 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 15 Mar 2022 18:30:25 +0000 -Subject: [PATCH 020/126] x86/spec-ctrl: Enable Zen2 chickenbit - -... as instructed in the Branch Type Confusion whitepaper. - -This is part of XSA-407. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -(cherry picked from commit 9deaf2d932f08c16c6b96a1c426e4b1142c0cdbe) ---- - xen/arch/x86/cpu/amd.c | 28 ++++++++++++++++++++++++++++ - xen/arch/x86/cpu/cpu.h | 1 + - xen/arch/x86/cpu/hygon.c | 6 ++++++ - xen/include/asm-x86/msr-index.h | 1 + - 4 files changed, 36 insertions(+) - -diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c -index 675b877f193c..60dbe61a61ca 100644 ---- a/xen/arch/x86/cpu/amd.c -+++ b/xen/arch/x86/cpu/amd.c -@@ -731,6 +731,31 @@ void amd_init_ssbd(const struct cpuinfo_x86 *c) - printk_once(XENLOG_ERR "No SSBD controls available\n"); - } - -+/* -+ * On Zen2 we offer this chicken (bit) on the altar of Speculation. -+ * -+ * Refer to the AMD Branch Type Confusion whitepaper: -+ * https://XXX -+ * -+ * Setting this unnamed bit supposedly causes prediction information on -+ * non-branch instructions to be ignored. It is to be set unilaterally in -+ * newer microcode. -+ * -+ * This chickenbit is something unrelated on Zen1, and Zen1 vs Zen2 isn't a -+ * simple model number comparison, so use STIBP as a heuristic to separate the -+ * two uarches in Fam17h(AMD)/18h(Hygon). 
-+ */ -+void amd_init_spectral_chicken(void) -+{ -+ uint64_t val, chickenbit = 1 << 1; -+ -+ if (cpu_has_hypervisor || !boot_cpu_has(X86_FEATURE_AMD_STIBP)) -+ return; -+ -+ if (rdmsr_safe(MSR_AMD64_DE_CFG2, val) == 0 && !(val & chickenbit)) -+ wrmsr_safe(MSR_AMD64_DE_CFG2, val | chickenbit); -+} -+ - static void init_amd(struct cpuinfo_x86 *c) - { - u32 l, h; -@@ -783,6 +808,9 @@ static void init_amd(struct cpuinfo_x86 *c) - - amd_init_ssbd(c); - -+ if (c->x86 == 0x17) -+ amd_init_spectral_chicken(); -+ - /* MFENCE stops RDTSC speculation */ - if (!cpu_has_lfence_dispatch) - __set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability); -diff --git a/xen/arch/x86/cpu/cpu.h b/xen/arch/x86/cpu/cpu.h -index 1a5b3918b37e..e76ab5ce1ae2 100644 ---- a/xen/arch/x86/cpu/cpu.h -+++ b/xen/arch/x86/cpu/cpu.h -@@ -22,3 +22,4 @@ void early_init_amd(struct cpuinfo_x86 *c); - void amd_log_freq(const struct cpuinfo_x86 *c); - void amd_init_lfence(struct cpuinfo_x86 *c); - void amd_init_ssbd(const struct cpuinfo_x86 *c); -+void amd_init_spectral_chicken(void); -diff --git a/xen/arch/x86/cpu/hygon.c b/xen/arch/x86/cpu/hygon.c -index 3845e0cf0e89..0cb0e7d55e61 100644 ---- a/xen/arch/x86/cpu/hygon.c -+++ b/xen/arch/x86/cpu/hygon.c -@@ -36,6 +36,12 @@ static void init_hygon(struct cpuinfo_x86 *c) - - amd_init_ssbd(c); - -+ /* -+ * TODO: Check heuristic safety with Hygon first -+ if (c->x86 == 0x18) -+ amd_init_spectral_chicken(); -+ */ -+ - /* MFENCE stops RDTSC speculation */ - if (!cpu_has_lfence_dispatch) - __set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability); -diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h -index 1e743461e91d..b4a360723b14 100644 ---- a/xen/include/asm-x86/msr-index.h -+++ b/xen/include/asm-x86/msr-index.h -@@ -359,6 +359,7 @@ - #define MSR_AMD64_DE_CFG 0xc0011029 - #define AMD64_DE_CFG_LFENCE_SERIALISE (_AC(1, ULL) << 1) - #define MSR_AMD64_EX_CFG 0xc001102c -+#define MSR_AMD64_DE_CFG2 0xc00110e3 - - #define MSR_AMD64_DR0_ADDRESS_MASK 0xc0011027 - #define MSR_AMD64_DR1_ADDRESS_MASK 0xc0011019 --- -2.37.4 - diff --git a/0021-x86-spec-ctrl-Mitigate-Branch-Type-Confusion-when-po.patch b/0021-x86-spec-ctrl-Mitigate-Branch-Type-Confusion-when-po.patch deleted file mode 100644 index 01be575..0000000 --- a/0021-x86-spec-ctrl-Mitigate-Branch-Type-Confusion-when-po.patch +++ /dev/null @@ -1,305 +0,0 @@ -From 35bf91d30f1a480dcf5bfd99b79384b2b283da7f Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Mon, 27 Jun 2022 19:29:40 +0100 -Subject: [PATCH 021/126] x86/spec-ctrl: Mitigate Branch Type Confusion when - possible - -Branch Type Confusion affects AMD/Hygon CPUs on Zen2 and earlier. To -mitigate, we require SMT safety (STIBP on Zen2, no-SMT on Zen1), and to issue -an IBPB on each entry to Xen, to flush the BTB. - -Due to performance concerns, dom0 (which is trusted in most configurations) is -excluded from protections by default. - -Therefore: - * Use STIBP by default on Zen2 too, which now means we want it on by default - on all hardware supporting STIBP. - * Break the current IBPB logic out into a new function, extending it with - IBPB-at-entry logic. - * Change the existing IBPB-at-ctxt-switch boolean to be tristate, and disable - it by default when IBPB-at-entry is providing sufficient safety. - -If all PV guests on the system are trusted, then it is recommended to boot -with `spec-ctrl=ibpb-entry=no-pv`, as this will provide an additional marginal -perf improvement. - -This is part of XSA-407 / CVE-2022-23825. 
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit d8cb7e0f069e0f106d24941355b59b45a731eabe) ---- - docs/misc/xen-command-line.pandoc | 14 ++-- - xen/arch/x86/spec_ctrl.c | 113 ++++++++++++++++++++++++++---- - xen/include/asm-x86/spec_ctrl.h | 2 +- - 3 files changed, 112 insertions(+), 17 deletions(-) - -diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc -index b06db5f654e5..b73c4a605011 100644 ---- a/docs/misc/xen-command-line.pandoc -+++ b/docs/misc/xen-command-line.pandoc -@@ -2170,7 +2170,7 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`). - - ### spec-ctrl (x86) - > `= List of [ <bool>, xen=<bool>, {pv,hvm}=<bool>, --> {msr-sc,rsb,md-clear}=<bool>|{pv,hvm}=<bool>, -+> {msr-sc,rsb,md-clear,ibpb-entry}=<bool>|{pv,hvm}=<bool>, - > bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd, - > eager-fpu,l1d-flush,branch-harden,srb-lock, - > unpriv-mmio}=<bool> ]` -@@ -2195,9 +2195,10 @@ in place for guests to use. - - Use of a positive boolean value for either of these options is invalid. - --The `pv=`, `hvm=`, `msr-sc=`, `rsb=` and `md-clear=` options offer fine --grained control over the primitives by Xen. These impact Xen's ability to --protect itself, and/or Xen's ability to virtualise support for guests to use. -+The `pv=`, `hvm=`, `msr-sc=`, `rsb=`, `md-clear=` and `ibpb-entry=` options -+offer fine grained control over the primitives by Xen. These impact Xen's -+ability to protect itself, and/or Xen's ability to virtualise support for -+guests to use. - - * `pv=` and `hvm=` offer control over all suboptions for PV and HVM guests - respectively. -@@ -2216,6 +2217,11 @@ protect itself, and/or Xen's ability to virtualise support for guests to use. - compatibility with development versions of this fix, `mds=` is also accepted - on Xen 4.12 and earlier as an alias. Consult vendor documentation in - preference to here.* -+* `ibpb-entry=` offers control over whether IBPB (Indirect Branch Prediction -+ Barrier) is used on entry to Xen. This is used by default on hardware -+ vulnerable to Branch Type Confusion, but for performance reasons, dom0 is -+ unprotected by default. If it necessary to protect dom0 too, boot with -+ `spec-ctrl=ibpb-entry`. - - If Xen was compiled with INDIRECT_THUNK support, `bti-thunk=` can be used to - select which of the thunks gets patched into the `__x86_indirect_thunk_%reg` -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 9f66c715516c..563519ce0e31 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -39,6 +39,10 @@ static bool __initdata opt_rsb_hvm = true; - static int8_t __read_mostly opt_md_clear_pv = -1; - static int8_t __read_mostly opt_md_clear_hvm = -1; - -+static int8_t __read_mostly opt_ibpb_entry_pv = -1; -+static int8_t __read_mostly opt_ibpb_entry_hvm = -1; -+static bool __read_mostly opt_ibpb_entry_dom0; -+ - /* Cmdline controls for Xen's speculative settings. */ - static enum ind_thunk { - THUNK_DEFAULT, /* Decide which thunk to use at boot time. 
*/ -@@ -54,7 +58,7 @@ int8_t __initdata opt_stibp = -1; - bool __read_mostly opt_ssbd; - int8_t __initdata opt_psfd = -1; - --bool __read_mostly opt_ibpb_ctxt_switch = true; -+int8_t __read_mostly opt_ibpb_ctxt_switch = -1; - int8_t __read_mostly opt_eager_fpu = -1; - int8_t __read_mostly opt_l1d_flush = -1; - bool __read_mostly opt_branch_harden = true; -@@ -114,6 +118,9 @@ static int __init parse_spec_ctrl(const char *s) - opt_rsb_hvm = false; - opt_md_clear_pv = 0; - opt_md_clear_hvm = 0; -+ opt_ibpb_entry_pv = 0; -+ opt_ibpb_entry_hvm = 0; -+ opt_ibpb_entry_dom0 = false; - - opt_thunk = THUNK_JMP; - opt_ibrs = 0; -@@ -140,12 +147,14 @@ static int __init parse_spec_ctrl(const char *s) - opt_msr_sc_pv = val; - opt_rsb_pv = val; - opt_md_clear_pv = val; -+ opt_ibpb_entry_pv = val; - } - else if ( (val = parse_boolean("hvm", s, ss)) >= 0 ) - { - opt_msr_sc_hvm = val; - opt_rsb_hvm = val; - opt_md_clear_hvm = val; -+ opt_ibpb_entry_hvm = val; - } - else if ( (val = parse_boolean("msr-sc", s, ss)) != -1 ) - { -@@ -210,6 +219,28 @@ static int __init parse_spec_ctrl(const char *s) - break; - } - } -+ else if ( (val = parse_boolean("ibpb-entry", s, ss)) != -1 ) -+ { -+ switch ( val ) -+ { -+ case 0: -+ case 1: -+ opt_ibpb_entry_pv = opt_ibpb_entry_hvm = -+ opt_ibpb_entry_dom0 = val; -+ break; -+ -+ case -2: -+ s += strlen("ibpb-entry="); -+ if ( (val = parse_boolean("pv", s, ss)) >= 0 ) -+ opt_ibpb_entry_pv = val; -+ else if ( (val = parse_boolean("hvm", s, ss)) >= 0 ) -+ opt_ibpb_entry_hvm = val; -+ else -+ default: -+ rc = -EINVAL; -+ break; -+ } -+ } - - /* Xen's speculative sidechannel mitigation settings. */ - else if ( !strncmp(s, "bti-thunk=", 10) ) -@@ -477,27 +508,31 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - * mitigation support for guests. - */ - #ifdef CONFIG_HVM -- printk(" Support for HVM VMs:%s%s%s%s%s\n", -+ printk(" Support for HVM VMs:%s%s%s%s%s%s\n", - (boot_cpu_has(X86_FEATURE_SC_MSR_HVM) || - boot_cpu_has(X86_FEATURE_SC_RSB_HVM) || - boot_cpu_has(X86_FEATURE_MD_CLEAR) || -+ boot_cpu_has(X86_FEATURE_IBPB_ENTRY_HVM) || - opt_eager_fpu) ? "" : " None", - boot_cpu_has(X86_FEATURE_SC_MSR_HVM) ? " MSR_SPEC_CTRL" : "", - boot_cpu_has(X86_FEATURE_SC_RSB_HVM) ? " RSB" : "", - opt_eager_fpu ? " EAGER_FPU" : "", -- boot_cpu_has(X86_FEATURE_MD_CLEAR) ? " MD_CLEAR" : ""); -+ boot_cpu_has(X86_FEATURE_MD_CLEAR) ? " MD_CLEAR" : "", -+ boot_cpu_has(X86_FEATURE_IBPB_ENTRY_HVM) ? " IBPB-entry" : ""); - - #endif - #ifdef CONFIG_PV -- printk(" Support for PV VMs:%s%s%s%s%s\n", -+ printk(" Support for PV VMs:%s%s%s%s%s%s\n", - (boot_cpu_has(X86_FEATURE_SC_MSR_PV) || - boot_cpu_has(X86_FEATURE_SC_RSB_PV) || - boot_cpu_has(X86_FEATURE_MD_CLEAR) || -+ boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) || - opt_eager_fpu) ? "" : " None", - boot_cpu_has(X86_FEATURE_SC_MSR_PV) ? " MSR_SPEC_CTRL" : "", - boot_cpu_has(X86_FEATURE_SC_RSB_PV) ? " RSB" : "", - opt_eager_fpu ? " EAGER_FPU" : "", -- boot_cpu_has(X86_FEATURE_MD_CLEAR) ? " MD_CLEAR" : ""); -+ boot_cpu_has(X86_FEATURE_MD_CLEAR) ? " MD_CLEAR" : "", -+ boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) ? " IBPB-entry" : ""); - - printk(" XPTI (64-bit PV only): Dom0 %s, DomU %s (with%s PCID)\n", - opt_xpti_hwdom ? "enabled" : "disabled", -@@ -730,6 +765,55 @@ static bool __init should_use_eager_fpu(void) - } - } - -+static void __init ibpb_calculations(void) -+{ -+ /* Check we have hardware IBPB support before using it... 
*/ -+ if ( !boot_cpu_has(X86_FEATURE_IBRSB) && !boot_cpu_has(X86_FEATURE_IBPB) ) -+ { -+ opt_ibpb_entry_hvm = opt_ibpb_entry_pv = opt_ibpb_ctxt_switch = 0; -+ opt_ibpb_entry_dom0 = false; -+ return; -+ } -+ -+ /* -+ * IBPB-on-entry mitigations for Branch Type Confusion. -+ * -+ * IBPB && !BTC_NO selects all AMD/Hygon hardware, not known to be safe, -+ * that we can provide some form of mitigation on. -+ */ -+ if ( opt_ibpb_entry_pv == -1 ) -+ opt_ibpb_entry_pv = (IS_ENABLED(CONFIG_PV) && -+ boot_cpu_has(X86_FEATURE_IBPB) && -+ !boot_cpu_has(X86_FEATURE_BTC_NO)); -+ if ( opt_ibpb_entry_hvm == -1 ) -+ opt_ibpb_entry_hvm = (IS_ENABLED(CONFIG_HVM) && -+ boot_cpu_has(X86_FEATURE_IBPB) && -+ !boot_cpu_has(X86_FEATURE_BTC_NO)); -+ -+ if ( opt_ibpb_entry_pv ) -+ { -+ setup_force_cpu_cap(X86_FEATURE_IBPB_ENTRY_PV); -+ -+ /* -+ * We only need to flush in IST context if we're protecting against PV -+ * guests. HVM IBPB-on-entry protections are both atomic with -+ * NMI/#MC, so can't interrupt Xen ahead of having already flushed the -+ * BTB. -+ */ -+ default_spec_ctrl_flags |= SCF_ist_ibpb; -+ } -+ if ( opt_ibpb_entry_hvm ) -+ setup_force_cpu_cap(X86_FEATURE_IBPB_ENTRY_HVM); -+ -+ /* -+ * If we're using IBPB-on-entry to protect against PV and HVM guests -+ * (ignoring dom0 if trusted), then there's no need to also issue IBPB on -+ * context switch too. -+ */ -+ if ( opt_ibpb_ctxt_switch == -1 ) -+ opt_ibpb_ctxt_switch = !(opt_ibpb_entry_hvm && opt_ibpb_entry_pv); -+} -+ - /* Calculate whether this CPU is vulnerable to L1TF. */ - static __init void l1tf_calculations(uint64_t caps) - { -@@ -985,8 +1069,12 @@ void spec_ctrl_init_domain(struct domain *d) - bool verw = ((pv ? opt_md_clear_pv : opt_md_clear_hvm) || - (opt_fb_clear_mmio && is_iommu_enabled(d))); - -+ bool ibpb = ((pv ? opt_ibpb_entry_pv : opt_ibpb_entry_hvm) && -+ (d->domain_id != 0 || opt_ibpb_entry_dom0)); -+ - d->arch.spec_ctrl_flags = - (verw ? SCF_verw : 0) | -+ (ibpb ? SCF_entry_ibpb : 0) | - 0; - } - -@@ -1133,12 +1221,15 @@ void __init init_speculation_mitigations(void) - } - - /* -- * Use STIBP by default if the hardware hint is set. Otherwise, leave it -- * off as it a severe performance pentalty on pre-eIBRS Intel hardware -- * where it was retrofitted in microcode. -+ * Use STIBP by default on all AMD systems. Zen3 and later enumerate -+ * STIBP_ALWAYS, but STIBP is needed on Zen2 as part of the mitigations -+ * for Branch Type Confusion. -+ * -+ * Leave STIBP off by default on Intel. Pre-eIBRS systems suffer a -+ * substantial perf hit when it was implemented in microcode. - */ - if ( opt_stibp == -1 ) -- opt_stibp = !!boot_cpu_has(X86_FEATURE_STIBP_ALWAYS); -+ opt_stibp = !!boot_cpu_has(X86_FEATURE_AMD_STIBP); - - if ( opt_stibp && (boot_cpu_has(X86_FEATURE_STIBP) || - boot_cpu_has(X86_FEATURE_AMD_STIBP)) ) -@@ -1192,9 +1283,7 @@ void __init init_speculation_mitigations(void) - if ( opt_rsb_hvm ) - setup_force_cpu_cap(X86_FEATURE_SC_RSB_HVM); - -- /* Check we have hardware IBPB support before using it... */ -- if ( !boot_cpu_has(X86_FEATURE_IBRSB) && !boot_cpu_has(X86_FEATURE_IBPB) ) -- opt_ibpb_ctxt_switch = false; -+ ibpb_calculations(); - - /* Check whether Eager FPU should be enabled by default. 
*/ - if ( opt_eager_fpu == -1 ) -diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h -index 10cd0cd2518f..33e845991b0a 100644 ---- a/xen/include/asm-x86/spec_ctrl.h -+++ b/xen/include/asm-x86/spec_ctrl.h -@@ -65,7 +65,7 @@ - void init_speculation_mitigations(void); - void spec_ctrl_init_domain(struct domain *d); - --extern bool opt_ibpb_ctxt_switch; -+extern int8_t opt_ibpb_ctxt_switch; - extern bool opt_ssbd; - extern int8_t opt_eager_fpu; - extern int8_t opt_l1d_flush; --- -2.37.4 - diff --git a/0022-x86-mm-correct-TLB-flush-condition-in-_get_page_type.patch b/0022-x86-mm-correct-TLB-flush-condition-in-_get_page_type.patch deleted file mode 100644 index 5b038c4..0000000 --- a/0022-x86-mm-correct-TLB-flush-condition-in-_get_page_type.patch +++ /dev/null @@ -1,46 +0,0 @@ -From 3859f3ee7e37323ae5e0014c07ba8d3a4d7890b2 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 26 Jul 2022 15:03:14 +0200 -Subject: [PATCH 022/126] x86/mm: correct TLB flush condition in - _get_page_type() - -When this logic was moved, it was moved across the point where nx is -updated to hold the new type for the page. IOW originally it was -equivalent to using x (and perhaps x would better have been used), but -now it isn't anymore. Switch to using x, which then brings things in -line again with the slightly earlier comment there (now) talking about -transitions _from_ writable. - -I have to confess though that I cannot make a direct connection between -the reported observed behavior of guests leaving several pages around -with pending general references and the change here. Repeated testing, -nevertheless, confirms the reported issue is no longer there. - -This is CVE-2022-33745 / XSA-408. - -Reported-by: Charles Arnold <carnold@suse.com> -Fixes: 8cc5036bc385 ("x86/pv: Fix ABAC cmpxchg() race in _get_page_type()") -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: a9949efb288fd6e21bbaf9d5826207c7c41cda27 -master date: 2022-07-26 14:54:34 +0200 ---- - xen/arch/x86/mm.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index 7d0747017db5..c88dc749d431 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -2992,7 +2992,7 @@ static int _get_page_type(struct page_info *page, unsigned long type, - if ( unlikely(!cpumask_empty(mask)) && - /* Shadow mode: track only writable pages. */ - (!shadow_mode_enabled(d) || -- ((nx & PGT_type_mask) == PGT_writable_page)) ) -+ ((x & PGT_type_mask) == PGT_writable_page)) ) - { - perfc_incr(need_flush_tlb_flush); - /* --- -2.37.4 - diff --git a/0023-xl-relax-freemem-s-retry-calculation.patch b/0023-xl-relax-freemem-s-retry-calculation.patch deleted file mode 100644 index 1879884..0000000 --- a/0023-xl-relax-freemem-s-retry-calculation.patch +++ /dev/null @@ -1,80 +0,0 @@ -From 2173d9c8be28d5f33c0e299a363ac994867d111b Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Wed, 27 Jul 2022 09:28:46 +0200 -Subject: [PATCH 023/126] xl: relax freemem()'s retry calculation - -While in principle possible also under other conditions as long as other -parallel operations potentially consuming memory aren't "locked out", in -particular with IOMMU large page mappings used in Dom0 (for PV when in -strict mode; for PVH when not sharing page tables with HAP) ballooning -out of individual pages can actually lead to less free memory available -afterwards. 
This is because to split a large page, one or more page -table pages are necessary (one per level that is split). - -When rebooting a guest I've observed freemem() to fail: A single page -was required to be ballooned out (presumably because of heap -fragmentation in the hypervisor). This ballooning out of a single page -of course went fast, but freemem() then found that it would require to -balloon out another page. This repeating just another time leads to the -function to signal failure to the caller - without having come anywhere -near the designated 30s that the whole process is allowed to not make -any progress at all. - -Convert from a simple retry count to actually calculating elapsed time, -subtracting from an initial credit of 30s. Don't go as far as limiting -the "wait_secs" value passed to libxl_wait_for_memory_target(), though. -While this leads to the overall process now possibly taking longer (if -the previous iteration ended very close to the intended 30s), this -compensates to some degree for the value passed really meaning "allowed -to run for this long without making progress". - -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Anthony PERARD <anthony.perard@citrix.com> -master commit: e58370df76eacf1f7ca0340e9b96430c77b41a79 -master date: 2022-07-12 15:25:00 +0200 ---- - tools/xl/xl_vmcontrol.c | 10 +++++++--- - 1 file changed, 7 insertions(+), 3 deletions(-) - -diff --git a/tools/xl/xl_vmcontrol.c b/tools/xl/xl_vmcontrol.c -index 435155a03396..5dee7730ca76 100644 ---- a/tools/xl/xl_vmcontrol.c -+++ b/tools/xl/xl_vmcontrol.c -@@ -321,7 +321,8 @@ static int domain_wait_event(uint32_t domid, libxl_event **event_r) - */ - static bool freemem(uint32_t domid, libxl_domain_config *d_config) - { -- int rc, retries = 3; -+ int rc; -+ double credit = 30; - uint64_t need_memkb, free_memkb; - - if (!autoballoon) -@@ -332,6 +333,8 @@ static bool freemem(uint32_t domid, libxl_domain_config *d_config) - return false; - - do { -+ time_t start; -+ - rc = libxl_get_free_memory(ctx, &free_memkb); - if (rc < 0) - return false; -@@ -345,12 +348,13 @@ static bool freemem(uint32_t domid, libxl_domain_config *d_config) - - /* wait until dom0 reaches its target, as long as we are making - * progress */ -+ start = time(NULL); - rc = libxl_wait_for_memory_target(ctx, 0, 10); - if (rc < 0) - return false; - -- retries--; -- } while (retries > 0); -+ credit -= difftime(time(NULL), start); -+ } while (credit > 0); - - return false; - } --- -2.37.4 - diff --git a/0024-tools-init-xenstore-domain-fix-memory-map-for-PVH-st.patch b/0024-tools-init-xenstore-domain-fix-memory-map-for-PVH-st.patch deleted file mode 100644 index ccde751..0000000 --- a/0024-tools-init-xenstore-domain-fix-memory-map-for-PVH-st.patch +++ /dev/null @@ -1,59 +0,0 @@ -From a2684d9cbbfb02b268be7e551674f709db0617a4 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Wed, 27 Jul 2022 09:29:08 +0200 -Subject: [PATCH 024/126] tools/init-xenstore-domain: fix memory map for PVH - stubdom - -In case of maxmem != memsize the E820 map of the PVH stubdom is wrong, -as it is missing the RAM above memsize. - -Additionally the memory map should only specify the Xen special pages -as reserved. 
- -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Anthony PERARD <anthony.perard@citrix.com> -master commit: 134d53f577076d4f26091e25762f27cc3c73bf58 -master date: 2022-07-12 15:25:20 +0200 ---- - tools/helpers/init-xenstore-domain.c | 14 +++++++++----- - 1 file changed, 9 insertions(+), 5 deletions(-) - -diff --git a/tools/helpers/init-xenstore-domain.c b/tools/helpers/init-xenstore-domain.c -index 6836002f0bad..32689abd7479 100644 ---- a/tools/helpers/init-xenstore-domain.c -+++ b/tools/helpers/init-xenstore-domain.c -@@ -72,8 +72,9 @@ static int build(xc_interface *xch) - char cmdline[512]; - int rv, xs_fd; - struct xc_dom_image *dom = NULL; -- int limit_kb = (maxmem ? : (memory + 1)) * 1024; -+ int limit_kb = (maxmem ? : memory) * 1024 + X86_HVM_NR_SPECIAL_PAGES * 4; - uint64_t mem_size = MB(memory); -+ uint64_t max_size = MB(maxmem ? : memory); - struct e820entry e820[3]; - struct xen_domctl_createdomain config = { - .ssidref = SECINITSID_DOMU, -@@ -157,13 +158,16 @@ static int build(xc_interface *xch) - dom->mmio_start = LAPIC_BASE_ADDRESS; - dom->max_vcpus = 1; - e820[0].addr = 0; -- e820[0].size = dom->lowmem_end; -+ e820[0].size = (max_size > LAPIC_BASE_ADDRESS) ? -+ LAPIC_BASE_ADDRESS : max_size; - e820[0].type = E820_RAM; -- e820[1].addr = LAPIC_BASE_ADDRESS; -- e820[1].size = dom->mmio_size; -+ e820[1].addr = (X86_HVM_END_SPECIAL_REGION - -+ X86_HVM_NR_SPECIAL_PAGES) << XC_PAGE_SHIFT; -+ e820[1].size = X86_HVM_NR_SPECIAL_PAGES << XC_PAGE_SHIFT; - e820[1].type = E820_RESERVED; - e820[2].addr = GB(4); -- e820[2].size = dom->highmem_end - GB(4); -+ e820[2].size = (max_size > LAPIC_BASE_ADDRESS) ? -+ max_size - LAPIC_BASE_ADDRESS : 0; - e820[2].type = E820_RAM; - } - --- -2.37.4 - diff --git a/0025-xl-move-freemem-s-credit-expired-loop-exit.patch b/0025-xl-move-freemem-s-credit-expired-loop-exit.patch deleted file mode 100644 index a3b2e2b..0000000 --- a/0025-xl-move-freemem-s-credit-expired-loop-exit.patch +++ /dev/null @@ -1,55 +0,0 @@ -From c37099426ea678c1d5b6c99ae5ad6834f4edd2e6 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Wed, 27 Jul 2022 09:29:31 +0200 -Subject: [PATCH 025/126] xl: move freemem()'s "credit expired" loop exit - -Move the "credit expired" loop exit to the middle of the loop, -immediately after "return true". This way having reached the goal on the -last iteration would be reported as success to the caller, rather than -as "timed out". 
- -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Anthony PERARD <anthony.perard@citrix.com> -master commit: d8f8cb8bdd02fad3b6986ae93511f750fa7f7e6a -master date: 2022-07-18 17:48:18 +0200 ---- - tools/xl/xl_vmcontrol.c | 9 +++++---- - 1 file changed, 5 insertions(+), 4 deletions(-) - -diff --git a/tools/xl/xl_vmcontrol.c b/tools/xl/xl_vmcontrol.c -index 5dee7730ca76..d1c6f8aae67a 100644 ---- a/tools/xl/xl_vmcontrol.c -+++ b/tools/xl/xl_vmcontrol.c -@@ -332,7 +332,7 @@ static bool freemem(uint32_t domid, libxl_domain_config *d_config) - if (rc < 0) - return false; - -- do { -+ for (;;) { - time_t start; - - rc = libxl_get_free_memory(ctx, &free_memkb); -@@ -342,6 +342,9 @@ static bool freemem(uint32_t domid, libxl_domain_config *d_config) - if (free_memkb >= need_memkb) - return true; - -+ if (credit <= 0) -+ return false; -+ - rc = libxl_set_memory_target(ctx, 0, free_memkb - need_memkb, 1, 0); - if (rc < 0) - return false; -@@ -354,9 +357,7 @@ static bool freemem(uint32_t domid, libxl_domain_config *d_config) - return false; - - credit -= difftime(time(NULL), start); -- } while (credit > 0); -- -- return false; -+ } - } - - static void reload_domain_config(uint32_t domid, --- -2.37.4 - diff --git a/0026-x86-spec-ctrl-correct-per-guest-type-reporting-of-MD.patch b/0026-x86-spec-ctrl-correct-per-guest-type-reporting-of-MD.patch deleted file mode 100644 index fbbf450..0000000 --- a/0026-x86-spec-ctrl-correct-per-guest-type-reporting-of-MD.patch +++ /dev/null @@ -1,56 +0,0 @@ -From 5f1d0179e15d726622a49044a825894d5010df15 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Wed, 27 Jul 2022 09:29:54 +0200 -Subject: [PATCH 026/126] x86/spec-ctrl: correct per-guest-type reporting of - MD_CLEAR - -There are command line controls for this and the default also isn't "always -enable when hardware supports it", which logging should take into account. - -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: fdbf8bdfebc2ed323c521848f642cc4f6b8cb662 -master date: 2022-07-19 08:36:53 +0200 ---- - xen/arch/x86/spec_ctrl.c | 10 ++++------ - 1 file changed, 4 insertions(+), 6 deletions(-) - -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 563519ce0e31..f7b0251c42bc 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -511,13 +511,12 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - printk(" Support for HVM VMs:%s%s%s%s%s%s\n", - (boot_cpu_has(X86_FEATURE_SC_MSR_HVM) || - boot_cpu_has(X86_FEATURE_SC_RSB_HVM) || -- boot_cpu_has(X86_FEATURE_MD_CLEAR) || - boot_cpu_has(X86_FEATURE_IBPB_ENTRY_HVM) || -- opt_eager_fpu) ? "" : " None", -+ opt_eager_fpu || opt_md_clear_hvm) ? "" : " None", - boot_cpu_has(X86_FEATURE_SC_MSR_HVM) ? " MSR_SPEC_CTRL" : "", - boot_cpu_has(X86_FEATURE_SC_RSB_HVM) ? " RSB" : "", - opt_eager_fpu ? " EAGER_FPU" : "", -- boot_cpu_has(X86_FEATURE_MD_CLEAR) ? " MD_CLEAR" : "", -+ opt_md_clear_hvm ? " MD_CLEAR" : "", - boot_cpu_has(X86_FEATURE_IBPB_ENTRY_HVM) ? " IBPB-entry" : ""); - - #endif -@@ -525,13 +524,12 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - printk(" Support for PV VMs:%s%s%s%s%s%s\n", - (boot_cpu_has(X86_FEATURE_SC_MSR_PV) || - boot_cpu_has(X86_FEATURE_SC_RSB_PV) || -- boot_cpu_has(X86_FEATURE_MD_CLEAR) || - boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) || -- opt_eager_fpu) ? "" : " None", -+ opt_eager_fpu || opt_md_clear_pv) ? "" : " None", - boot_cpu_has(X86_FEATURE_SC_MSR_PV) ? 
" MSR_SPEC_CTRL" : "", - boot_cpu_has(X86_FEATURE_SC_RSB_PV) ? " RSB" : "", - opt_eager_fpu ? " EAGER_FPU" : "", -- boot_cpu_has(X86_FEATURE_MD_CLEAR) ? " MD_CLEAR" : "", -+ opt_md_clear_pv ? " MD_CLEAR" : "", - boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) ? " IBPB-entry" : ""); - - printk(" XPTI (64-bit PV only): Dom0 %s, DomU %s (with%s PCID)\n", --- -2.37.4 - diff --git a/0027-x86-deal-with-gcc12-release-build-issues.patch b/0027-x86-deal-with-gcc12-release-build-issues.patch deleted file mode 100644 index b30c65b..0000000 --- a/0027-x86-deal-with-gcc12-release-build-issues.patch +++ /dev/null @@ -1,65 +0,0 @@ -From a095c6cde8a717325cc31bb393c547cad5e16e35 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Wed, 27 Jul 2022 09:30:24 +0200 -Subject: [PATCH 027/126] x86: deal with gcc12 release build issues - -While a number of issues we previously had with pre-release gcc12 were -fixed in the final release, we continue to have one issue (with multiple -instances) when doing release builds (i.e. at higher optimization -levels): The compiler takes issue with subtracting (always 1 in our -case) from artifical labels (expressed as array) marking the end of -certain regions. This isn't an unreasonable position to take. Simply -hide the "array-ness" by casting to an integer type. To keep things -looking consistently, apply the same cast also on the respective -expressions dealing with the starting addresses. (Note how -efi_arch_memory_setup()'s l2_table_offset() invocations avoid a similar -issue by already having the necessary casts.) In is_xen_fixed_mfn() -further switch from __pa() to virt_to_maddr() to better match the left -sides of the <= operators. - -Reported-by: Charles Arnold <carnold@suse.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: 9723507daf2120131410c91980d4e4d9b0d0aa90 -master date: 2022-07-19 08:37:29 +0200 ---- - xen/arch/x86/efi/efi-boot.h | 6 +++--- - xen/include/asm-x86/mm.h | 4 ++-- - 2 files changed, 5 insertions(+), 5 deletions(-) - -diff --git a/xen/arch/x86/efi/efi-boot.h b/xen/arch/x86/efi/efi-boot.h -index 2541ba1f320a..84fd77931456 100644 ---- a/xen/arch/x86/efi/efi-boot.h -+++ b/xen/arch/x86/efi/efi-boot.h -@@ -624,10 +624,10 @@ static void __init efi_arch_memory_setup(void) - * appropriate l2 slots to map. 
- */ - #define l2_4G_offset(a) \ -- (((UINTN)(a) >> L2_PAGETABLE_SHIFT) & (4 * L2_PAGETABLE_ENTRIES - 1)) -+ (((a) >> L2_PAGETABLE_SHIFT) & (4 * L2_PAGETABLE_ENTRIES - 1)) - -- for ( i = l2_4G_offset(_start); -- i <= l2_4G_offset(_end - 1); ++i ) -+ for ( i = l2_4G_offset((UINTN)_start); -+ i <= l2_4G_offset((UINTN)_end - 1); ++i ) - { - l2_pgentry_t pte = l2e_from_paddr(i << L2_PAGETABLE_SHIFT, - __PAGE_HYPERVISOR | _PAGE_PSE); -diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h -index 5c19b71eca70..71dd28f126c3 100644 ---- a/xen/include/asm-x86/mm.h -+++ b/xen/include/asm-x86/mm.h -@@ -309,8 +309,8 @@ struct page_info - #define is_xen_heap_mfn(mfn) \ - (mfn_valid(mfn) && is_xen_heap_page(mfn_to_page(mfn))) - #define is_xen_fixed_mfn(mfn) \ -- (((mfn_to_maddr(mfn)) >= __pa(_stext)) && \ -- ((mfn_to_maddr(mfn)) <= __pa(__2M_rwdata_end - 1))) -+ (((mfn_to_maddr(mfn)) >= virt_to_maddr((unsigned long)_stext)) && \ -+ ((mfn_to_maddr(mfn)) <= virt_to_maddr((unsigned long)__2M_rwdata_end - 1))) - - #define PRtype_info "016lx"/* should only be used for printk's */ - --- -2.37.4 - diff --git a/0028-x86emul-add-memory-operand-low-bits-checks-for-ENQCM.patch b/0028-x86emul-add-memory-operand-low-bits-checks-for-ENQCM.patch deleted file mode 100644 index 1a63be4..0000000 --- a/0028-x86emul-add-memory-operand-low-bits-checks-for-ENQCM.patch +++ /dev/null @@ -1,45 +0,0 @@ -From 4799a202a9017360708c18aa8cd699bd8d6be08b Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Wed, 27 Jul 2022 09:31:01 +0200 -Subject: [PATCH 028/126] x86emul: add memory operand low bits checks for - ENQCMD{,S} - -Already ISE rev 044 added text to this effect; rev 045 further dropped -leftover earlier text indicating the contrary: -- ENQCMD requires the low 32 bits of the memory operand to be clear, -- ENDCMDS requires bits 20...30 of the memory operand to be clear. 
- -Fixes: d27385968741 ("x86emul: support ENQCMD insns") -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: d620c66bdbe5510c3bae89be8cc7ca9a2a6cbaba -master date: 2022-07-20 15:46:48 +0200 ---- - xen/arch/x86/x86_emulate/x86_emulate.c | 4 +++- - 1 file changed, 3 insertions(+), 1 deletion(-) - -diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c b/xen/arch/x86/x86_emulate/x86_emulate.c -index 5e297f797187..247c14dc4e68 100644 ---- a/xen/arch/x86/x86_emulate/x86_emulate.c -+++ b/xen/arch/x86/x86_emulate/x86_emulate.c -@@ -10464,6 +10464,7 @@ x86_emulate( - goto done; - if ( vex.pfx == vex_f2 ) /* enqcmd */ - { -+ generate_exception_if(mmvalp->data32[0], EXC_GP, 0); - fail_if(!ops->read_msr); - if ( (rc = ops->read_msr(MSR_PASID, &msr_val, - ctxt)) != X86EMUL_OKAY ) -@@ -10471,7 +10472,8 @@ x86_emulate( - generate_exception_if(!(msr_val & PASID_VALID), EXC_GP, 0); - mmvalp->data32[0] = MASK_EXTR(msr_val, PASID_PASID_MASK); - } -- mmvalp->data32[0] &= ~0x7ff00000; -+ else -+ generate_exception_if(mmvalp->data32[0] & 0x7ff00000, EXC_GP, 0); - state->blk = blk_enqcmd; - if ( (rc = ops->blk(x86_seg_es, src.val, mmvalp, 64, &_regs.eflags, - state, ctxt)) != X86EMUL_OKAY ) --- -2.37.4 - diff --git a/0029-x86-also-suppress-use-of-MMX-insns.patch b/0029-x86-also-suppress-use-of-MMX-insns.patch deleted file mode 100644 index d954cdd..0000000 --- a/0029-x86-also-suppress-use-of-MMX-insns.patch +++ /dev/null @@ -1,39 +0,0 @@ -From 30d3de4c61c297e12662df1fdb89af335947e59d Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Wed, 27 Jul 2022 09:31:31 +0200 -Subject: [PATCH 029/126] x86: also suppress use of MMX insns - -Passing -mno-sse alone is not enough: The compiler may still find -(questionable) reasons to use MMX insns. In particular with gcc12 use -of MOVD+PUNPCKLDQ+MOVQ was observed in an apparent attempt to auto- -vectorize the storing of two adjacent zeroes, 32 bits each. - -Reported-by: ChrisD <chris@dalessio.org> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: 6fe2e39a0243bddba60f83b77b972a5922d25eb8 -master date: 2022-07-20 15:48:49 +0200 ---- - xen/arch/x86/arch.mk | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/xen/arch/x86/arch.mk b/xen/arch/x86/arch.mk -index 456e5d5c1ad7..c4337a1a118c 100644 ---- a/xen/arch/x86/arch.mk -+++ b/xen/arch/x86/arch.mk -@@ -37,9 +37,9 @@ $(call as-option-add,CFLAGS,CC,\ - - CFLAGS += -mno-red-zone -fpic - --# Xen doesn't use SSE interally. If the compiler supports it, also skip the --# SSE setup for variadic function calls. --CFLAGS += -mno-sse $(call cc-option,$(CC),-mskip-rax-setup) -+# Xen doesn't use MMX or SSE interally. If the compiler supports it, also skip -+# the SSE setup for variadic function calls. -+CFLAGS += -mno-mmx -mno-sse $(call cc-option,$(CC),-mskip-rax-setup) - - # Compile with thunk-extern, indirect-branch-register if avaiable. 
- CFLAGS-$(CONFIG_INDIRECT_THUNK) += -mindirect-branch=thunk-extern --- -2.37.4 - diff --git a/0030-common-memory-Fix-ifdefs-for-ptdom_max_order.patch b/0030-common-memory-Fix-ifdefs-for-ptdom_max_order.patch deleted file mode 100644 index b4f6881..0000000 --- a/0030-common-memory-Fix-ifdefs-for-ptdom_max_order.patch +++ /dev/null @@ -1,52 +0,0 @@ -From b64f1c9e3e3a2a416c7bb5aab77ba5d2cba98638 Mon Sep 17 00:00:00 2001 -From: Luca Fancellu <luca.fancellu@arm.com> -Date: Wed, 27 Jul 2022 09:31:49 +0200 -Subject: [PATCH 030/126] common/memory: Fix ifdefs for ptdom_max_order - -In common/memory.c the ifdef code surrounding ptdom_max_order is -using HAS_PASSTHROUGH instead of CONFIG_HAS_PASSTHROUGH, fix the -problem using the correct macro. - -Fixes: e0d44c1f9461 ("build: convert HAS_PASSTHROUGH use to Kconfig") -Signed-off-by: Luca Fancellu <luca.fancellu@arm.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: 5707470bf3103ebae43697a7ac2faced6cd35f92 -master date: 2022-07-26 08:33:46 +0200 ---- - xen/common/memory.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/xen/common/memory.c b/xen/common/memory.c -index 297b98a562b2..95b2b934e4a2 100644 ---- a/xen/common/memory.c -+++ b/xen/common/memory.c -@@ -58,7 +58,7 @@ struct memop_args { - static unsigned int __read_mostly domu_max_order = CONFIG_DOMU_MAX_ORDER; - static unsigned int __read_mostly ctldom_max_order = CONFIG_CTLDOM_MAX_ORDER; - static unsigned int __read_mostly hwdom_max_order = CONFIG_HWDOM_MAX_ORDER; --#ifdef HAS_PASSTHROUGH -+#ifdef CONFIG_HAS_PASSTHROUGH - static unsigned int __read_mostly ptdom_max_order = CONFIG_PTDOM_MAX_ORDER; - #endif - -@@ -70,7 +70,7 @@ static int __init parse_max_order(const char *s) - ctldom_max_order = simple_strtoul(s, &s, 0); - if ( *s == ',' && *++s != ',' ) - hwdom_max_order = simple_strtoul(s, &s, 0); --#ifdef HAS_PASSTHROUGH -+#ifdef CONFIG_HAS_PASSTHROUGH - if ( *s == ',' && *++s != ',' ) - ptdom_max_order = simple_strtoul(s, &s, 0); - #endif -@@ -83,7 +83,7 @@ static unsigned int max_order(const struct domain *d) - { - unsigned int order = domu_max_order; - --#ifdef HAS_PASSTHROUGH -+#ifdef CONFIG_HAS_PASSTHROUGH - if ( cache_flush_permitted(d) && order < ptdom_max_order ) - order = ptdom_max_order; - #endif --- -2.37.4 - diff --git a/0031-tools-libxl-env-variable-to-signal-whether-disk-nic-.patch b/0031-tools-libxl-env-variable-to-signal-whether-disk-nic-.patch deleted file mode 100644 index 65fe05b..0000000 --- a/0031-tools-libxl-env-variable-to-signal-whether-disk-nic-.patch +++ /dev/null @@ -1,107 +0,0 @@ -From 1b9845dcf959421db3a071a6bc0aa9d8edbffb50 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Wed, 3 Aug 2022 12:41:18 +0200 -Subject: [PATCH 031/126] tools/libxl: env variable to signal whether disk/nic - backend is trusted -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Introduce support in libxl for fetching the default backend trusted -option for disk and nic devices. - -Users can set LIBXL_{DISK,NIC}_BACKEND_UNTRUSTED environment variable -to notify libxl of whether the backends for disk and nic devices -should be trusted. Such information is passed into the frontend so it -can take the appropriate measures. - -This is part of XSA-403. 
- -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Signed-off-by: Anthony PERARD <anthony.perard@citrix.com> ---- - docs/man/xl.1.pod.in | 18 ++++++++++++++++++ - tools/libs/light/libxl_disk.c | 5 +++++ - tools/libs/light/libxl_nic.c | 7 +++++++ - 3 files changed, 30 insertions(+) - -diff --git a/docs/man/xl.1.pod.in b/docs/man/xl.1.pod.in -index e2176bd696cb..45e1430aeb74 100644 ---- a/docs/man/xl.1.pod.in -+++ b/docs/man/xl.1.pod.in -@@ -1946,6 +1946,24 @@ shows the decimal value. For non-linear mode, it shows hexadecimal value. - - =back - -+=head1 ENVIRONMENT -+ -+=over 4 -+ -+=item B<LIBXL_DISK_BACKEND_UNTRUSTED> -+ -+Set this environment variable to "1" to suggest to the guest that the disk -+backend shouldn't be trusted. If the variable is absent or set to "0", the -+backend will be trusted. -+ -+=item B<LIBXL_NIC_BACKEND_UNTRUSTED> -+ -+Set this environment variable to "1" to suggest to the guest that the network -+backend shouldn't be trusted. If the variable is absent or set to "0", the -+backend will be trusted. -+ -+=back -+ - =head1 IGNORED FOR COMPATIBILITY WITH XM - - xl is mostly command-line compatible with the old xm utility used with -diff --git a/tools/libs/light/libxl_disk.c b/tools/libs/light/libxl_disk.c -index 93936d0dd0f8..67d1cc18578f 100644 ---- a/tools/libs/light/libxl_disk.c -+++ b/tools/libs/light/libxl_disk.c -@@ -246,6 +246,7 @@ static void device_disk_add(libxl__egc *egc, uint32_t domid, - libxl_domain_config d_config; - libxl_device_disk disk_saved; - libxl__flock *lock = NULL; -+ const char *envvar; - - libxl_domain_config_init(&d_config); - libxl_device_disk_init(&disk_saved); -@@ -395,6 +396,10 @@ static void device_disk_add(libxl__egc *egc, uint32_t domid, - flexarray_append(front, GCSPRINTF("%d", device->devid)); - flexarray_append(front, "device-type"); - flexarray_append(front, disk->is_cdrom ? "cdrom" : "disk"); -+ flexarray_append(front, "trusted"); -+ envvar = getenv("LIBXL_DISK_BACKEND_UNTRUSTED"); -+ /* Set "trusted=1" if envvar missing or is "0". */ -+ flexarray_append(front, !envvar || !strcmp("0", envvar) ? "1" : "0"); - - /* - * Old PV kernel disk frontends before 2.6.26 rely on tool stack to -diff --git a/tools/libs/light/libxl_nic.c b/tools/libs/light/libxl_nic.c -index 0b9e70c9d13d..f87890d1d65f 100644 ---- a/tools/libs/light/libxl_nic.c -+++ b/tools/libs/light/libxl_nic.c -@@ -132,6 +132,8 @@ static int libxl__set_xenstore_nic(libxl__gc *gc, uint32_t domid, - flexarray_t *back, flexarray_t *front, - flexarray_t *ro_front) - { -+ const char *envvar; -+ - flexarray_grow(back, 2); - - if (nic->script) -@@ -255,6 +257,11 @@ static int libxl__set_xenstore_nic(libxl__gc *gc, uint32_t domid, - flexarray_append(back, "hotplug-status"); - flexarray_append(back, ""); - -+ flexarray_append(front, "trusted"); -+ envvar = getenv("LIBXL_NIC_BACKEND_UNTRUSTED"); -+ /* Set "trusted=1" if envvar missing or is "0". */ -+ flexarray_append(front, !envvar || !strcmp("0", envvar) ? 
"1" : "0"); -+ - return 0; - } - --- -2.37.4 - diff --git a/0032-x86-msr-fix-X2APIC_LAST.patch b/0032-x86-msr-fix-X2APIC_LAST.patch deleted file mode 100644 index 4046822..0000000 --- a/0032-x86-msr-fix-X2APIC_LAST.patch +++ /dev/null @@ -1,66 +0,0 @@ -From df3395f6b2d759aba39fb67a7bc0fe49147c8b39 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> -Date: Wed, 3 Aug 2022 12:41:49 +0200 -Subject: [PATCH 032/126] x86/msr: fix X2APIC_LAST -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The latest Intel manual now says the X2APIC reserved range is only -0x800 to 0x8ff (NOT 0xbff). -This changed between SDM 68 (Nov 2018) and SDM 69 (Jan 2019). -The AMD manual documents 0x800-0x8ff too. - -There are non-X2APIC MSRs in the 0x900-0xbff range now: -e.g. 0x981 is IA32_TME_CAPABILITY, an architectural MSR. - -The new MSR in this range appears to have been introduced in Icelake, -so this commit should be backported to Xen versions supporting Icelake. - -Signed-off-by: Edwin Török <edvin.torok@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: 13316827faadbb4f72ae6c625af9938d8f976f86 -master date: 2022-07-27 12:57:10 +0200 ---- - xen/arch/x86/hvm/vmx/vmx.c | 4 ++-- - xen/include/asm-x86/msr-index.h | 2 +- - 2 files changed, 3 insertions(+), 3 deletions(-) - -diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c -index 868151a2e533..775b36433e24 100644 ---- a/xen/arch/x86/hvm/vmx/vmx.c -+++ b/xen/arch/x86/hvm/vmx/vmx.c -@@ -3401,7 +3401,7 @@ void vmx_vlapic_msr_changed(struct vcpu *v) - if ( cpu_has_vmx_apic_reg_virt ) - { - for ( msr = MSR_X2APIC_FIRST; -- msr <= MSR_X2APIC_FIRST + 0xff; msr++ ) -+ msr <= MSR_X2APIC_LAST; msr++ ) - vmx_clear_msr_intercept(v, msr, VMX_MSR_R); - - vmx_set_msr_intercept(v, MSR_X2APIC_PPR, VMX_MSR_R); -@@ -3422,7 +3422,7 @@ void vmx_vlapic_msr_changed(struct vcpu *v) - if ( !(v->arch.hvm.vmx.secondary_exec_control & - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE) ) - for ( msr = MSR_X2APIC_FIRST; -- msr <= MSR_X2APIC_FIRST + 0xff; msr++ ) -+ msr <= MSR_X2APIC_LAST; msr++ ) - vmx_set_msr_intercept(v, msr, VMX_MSR_RW); - - vmx_update_secondary_exec_control(v); -diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h -index b4a360723b14..f1b2cf5460c1 100644 ---- a/xen/include/asm-x86/msr-index.h -+++ b/xen/include/asm-x86/msr-index.h -@@ -459,7 +459,7 @@ - #define MSR_IA32_TSC_ADJUST 0x0000003b - - #define MSR_X2APIC_FIRST 0x00000800 --#define MSR_X2APIC_LAST 0x00000bff -+#define MSR_X2APIC_LAST 0x000008ff - - #define MSR_X2APIC_TPR 0x00000808 - #define MSR_X2APIC_PPR 0x0000080a --- -2.37.4 - diff --git a/0033-x86-spec-ctrl-Use-IST-RSB-protection-for-SVM-systems.patch b/0033-x86-spec-ctrl-Use-IST-RSB-protection-for-SVM-systems.patch deleted file mode 100644 index f1400b8..0000000 --- a/0033-x86-spec-ctrl-Use-IST-RSB-protection-for-SVM-systems.patch +++ /dev/null @@ -1,55 +0,0 @@ -From 8ae0b4d1331c14fb9e30a42987c0152c9b00f530 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Mon, 15 Aug 2022 15:40:05 +0200 -Subject: [PATCH 033/126] x86/spec-ctrl: Use IST RSB protection for !SVM - systems - -There is a corner case where a VT-x guest which manages to reliably trigger -non-fatal #MC's could evade the rogue RSB speculation protections that were -supposed to be in place. 
- -This is a lack of defence in depth; Xen does not architecturally execute more -RET than CALL instructions, so an attacker would have to locate a different -gadget (e.g. SpectreRSB) first to execute a transient path of excess RET -instructions. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: e570e8d520ab542d8d35666b95cb3a0125b7b110 -master date: 2022-08-05 12:16:24 +0100 ---- - xen/arch/x86/spec_ctrl.c | 16 ++++++++++++++++ - 1 file changed, 16 insertions(+) - -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index f7b0251c42bc..ac73806eacd8 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -1279,8 +1279,24 @@ void __init init_speculation_mitigations(void) - * mappings. - */ - if ( opt_rsb_hvm ) -+ { - setup_force_cpu_cap(X86_FEATURE_SC_RSB_HVM); - -+ /* -+ * For SVM, Xen's RSB safety actions are performed before STGI, so -+ * behave atomically with respect to IST sources. -+ * -+ * For VT-x, NMIs are atomic with VMExit (the NMI gets queued but not -+ * delivered) whereas other IST sources are not atomic. Specifically, -+ * #MC can hit ahead the RSB safety action in the vmexit path. -+ * -+ * Therefore, it is necessary for the IST logic to protect Xen against -+ * possible rogue RSB speculation. -+ */ -+ if ( !cpu_has_svm ) -+ default_spec_ctrl_flags |= SCF_ist_rsb; -+ } -+ - ibpb_calculations(); - - /* Check whether Eager FPU should be enabled by default. */ --- -2.37.4 - diff --git a/0034-x86-Expose-more-MSR_ARCH_CAPS-to-hwdom.patch b/0034-x86-Expose-more-MSR_ARCH_CAPS-to-hwdom.patch deleted file mode 100644 index 5433ddb..0000000 --- a/0034-x86-Expose-more-MSR_ARCH_CAPS-to-hwdom.patch +++ /dev/null @@ -1,68 +0,0 @@ -From 5efcae1eb30ff24e100954e00889a568c1745ea1 Mon Sep 17 00:00:00 2001 -From: Jason Andryuk <jandryuk@gmail.com> -Date: Mon, 15 Aug 2022 15:40:47 +0200 -Subject: [PATCH 034/126] x86: Expose more MSR_ARCH_CAPS to hwdom - -commit e46474278a0e ("x86/intel: Expose MSR_ARCH_CAPS to dom0") started -exposing MSR_ARCH_CAPS to dom0. More bits in MSR_ARCH_CAPS have since -been defined, but they haven't been exposed. Update the list to allow -them through. - -As one example, this allows a Linux Dom0 to know that it has the -appropriate microcode via FB_CLEAR. Notably, and with the updated -microcode, this changes dom0's -/sys/devices/system/cpu/vulnerabilities/mmio_stale_data changes from: - - "Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown" - -to: - - "Mitigation: Clear CPU buffers; SMT Host state unknown" - -This exposes the MMIO Stale Data and Intel Branch History Injection -(BHI) controls as well as the page size change MCE issue bit. 
- -Fixes: commit 2ebe8fe9b7e0 ("x86/spec-ctrl: Enumeration for MMIO Stale Data controls") -Fixes: commit cea9ae062295 ("x86/spec-ctrl: Enumeration for new Intel BHI controls") -Fixes: commit 59e89cdabc71 ("x86/vtx: Disable executable EPT superpages to work around CVE-2018-12207") -Signed-off-by: Jason Andryuk <jandryuk@gmail.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: e83cd54611fec5b7a539fa1281a14319143490e6 -master date: 2022-08-09 16:35:25 +0100 ---- - xen/arch/x86/msr.c | 5 ++++- - xen/include/asm-x86/msr-index.h | 2 ++ - 2 files changed, 6 insertions(+), 1 deletion(-) - -diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c -index 0739d00e74f1..aa9face9aad3 100644 ---- a/xen/arch/x86/msr.c -+++ b/xen/arch/x86/msr.c -@@ -145,7 +145,10 @@ int init_domain_msr_policy(struct domain *d) - - mp->arch_caps.raw = val & - (ARCH_CAPS_RDCL_NO | ARCH_CAPS_IBRS_ALL | ARCH_CAPS_RSBA | -- ARCH_CAPS_SSB_NO | ARCH_CAPS_MDS_NO | ARCH_CAPS_TAA_NO); -+ ARCH_CAPS_SSB_NO | ARCH_CAPS_MDS_NO | ARCH_CAPS_IF_PSCHANGE_MC_NO | -+ ARCH_CAPS_TAA_NO | ARCH_CAPS_SBDR_SSDP_NO | ARCH_CAPS_FBSDP_NO | -+ ARCH_CAPS_PSDP_NO | ARCH_CAPS_FB_CLEAR | ARCH_CAPS_RRSBA | -+ ARCH_CAPS_BHI_NO); - } - - d->arch.msr = mp; -diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h -index f1b2cf5460c1..49ca1f1845e6 100644 ---- a/xen/include/asm-x86/msr-index.h -+++ b/xen/include/asm-x86/msr-index.h -@@ -64,6 +64,8 @@ - #define ARCH_CAPS_PSDP_NO (_AC(1, ULL) << 15) - #define ARCH_CAPS_FB_CLEAR (_AC(1, ULL) << 17) - #define ARCH_CAPS_FB_CLEAR_CTRL (_AC(1, ULL) << 18) -+#define ARCH_CAPS_RRSBA (_AC(1, ULL) << 19) -+#define ARCH_CAPS_BHI_NO (_AC(1, ULL) << 20) - - #define MSR_FLUSH_CMD 0x0000010b - #define FLUSH_CMD_L1D (_AC(1, ULL) << 0) --- -2.37.4 - diff --git a/0035-xen-sched-setup-dom0-vCPUs-affinity-only-once.patch b/0035-xen-sched-setup-dom0-vCPUs-affinity-only-once.patch deleted file mode 100644 index 150de40..0000000 --- a/0035-xen-sched-setup-dom0-vCPUs-affinity-only-once.patch +++ /dev/null @@ -1,123 +0,0 @@ -From 1e31848cdd8d2ff3cb76f364f04f9771f9b3a8b1 Mon Sep 17 00:00:00 2001 -From: Dario Faggioli <dfaggioli@suse.com> -Date: Mon, 15 Aug 2022 15:41:25 +0200 -Subject: [PATCH 035/126] xen/sched: setup dom0 vCPUs affinity only once - -Right now, affinity for dom0 vCPUs is setup in two steps. This is a -problem as, at least in Credit2, unit_insert() sees and uses the -"intermediate" affinity, and place the vCPUs on CPUs where they cannot -be run. And this in turn results in boot hangs, if the "dom0_nodes" -parameter is used. - -Fix this by setting up the affinity properly once and for all, in -sched_init_vcpu() called by create_vcpu(). - -Note that, unless a soft-affinity is explicitly specified for dom0 (by -using the relaxed mode of "dom0_nodes") we set it to the default, which -is all CPUs, instead of computing it basing on hard affinity (if any). -This is because hard and soft affinity should be considered as -independent user controlled properties. In fact, if we dor derive dom0's -soft-affinity from its boot-time hard-affinity, such computed value will -continue to be used even if later the user changes the hard-affinity. -And this could result in the vCPUs behaving differently than what the -user wanted and expects. 
- -Fixes: dafd936dddbd ("Make credit2 the default scheduler") -Reported-by: Olaf Hering <ohering@suse.de> -Signed-off-by: Dario Faggioli <dfaggioli@suse.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: c79e4d209be3ed2a6b8e97c35944786ed2a66b94 -master date: 2022-08-11 11:46:22 +0200 ---- - xen/common/sched/core.c | 63 +++++++++++++++++++++++++---------------- - 1 file changed, 39 insertions(+), 24 deletions(-) - -diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c -index 8f4b1ca10d1c..f07bd2681fcb 100644 ---- a/xen/common/sched/core.c -+++ b/xen/common/sched/core.c -@@ -571,12 +571,46 @@ int sched_init_vcpu(struct vcpu *v) - return 1; - } - -- /* -- * Initialize affinity settings. The idler, and potentially -- * domain-0 VCPUs, are pinned onto their respective physical CPUs. -- */ -- if ( is_idle_domain(d) || (is_hardware_domain(d) && opt_dom0_vcpus_pin) ) -+ if ( is_idle_domain(d) ) -+ { -+ /* Idle vCPUs are always pinned onto their respective pCPUs */ - sched_set_affinity(unit, cpumask_of(processor), &cpumask_all); -+ } -+ else if ( pv_shim && v->vcpu_id == 0 ) -+ { -+ /* -+ * PV-shim: vcpus are pinned 1:1. Initially only 1 cpu is online, -+ * others will be dealt with when onlining them. This avoids pinning -+ * a vcpu to a not yet online cpu here. -+ */ -+ sched_set_affinity(unit, cpumask_of(0), cpumask_of(0)); -+ } -+ else if ( is_hardware_domain(d) && opt_dom0_vcpus_pin ) -+ { -+ /* -+ * If dom0_vcpus_pin is specified, dom0 vCPUs are pinned 1:1 to -+ * their respective pCPUs too. -+ */ -+ sched_set_affinity(unit, cpumask_of(processor), &cpumask_all); -+ } -+#ifdef CONFIG_X86 -+ else if ( d->domain_id == 0 ) -+ { -+ /* -+ * In absence of dom0_vcpus_pin instead, the hard and soft affinity of -+ * dom0 is controlled by the (x86 only) dom0_nodes parameter. At this -+ * point it has been parsed and decoded into the dom0_cpus mask. -+ * -+ * Note that we always honor what user explicitly requested, for both -+ * hard and soft affinity, without doing any dynamic computation of -+ * either of them. -+ */ -+ if ( !dom0_affinity_relaxed ) -+ sched_set_affinity(unit, &dom0_cpus, &cpumask_all); -+ else -+ sched_set_affinity(unit, &cpumask_all, &dom0_cpus); -+ } -+#endif - else - sched_set_affinity(unit, &cpumask_all, &cpumask_all); - -@@ -3386,29 +3420,10 @@ void wait(void) - void __init sched_setup_dom0_vcpus(struct domain *d) - { - unsigned int i; -- struct sched_unit *unit; - - for ( i = 1; i < d->max_vcpus; i++ ) - vcpu_create(d, i); - -- /* -- * PV-shim: vcpus are pinned 1:1. -- * Initially only 1 cpu is online, others will be dealt with when -- * onlining them. This avoids pinning a vcpu to a not yet online cpu here. 
-- */ -- if ( pv_shim ) -- sched_set_affinity(d->vcpu[0]->sched_unit, -- cpumask_of(0), cpumask_of(0)); -- else -- { -- for_each_sched_unit ( d, unit ) -- { -- if ( !opt_dom0_vcpus_pin && !dom0_affinity_relaxed ) -- sched_set_affinity(unit, &dom0_cpus, NULL); -- sched_set_affinity(unit, NULL, &dom0_cpus); -- } -- } -- - domain_update_node_affinity(d); - } - #endif --- -2.37.4 - diff --git a/0036-tools-libxl-Replace-deprecated-sdl-option-on-QEMU-co.patch b/0036-tools-libxl-Replace-deprecated-sdl-option-on-QEMU-co.patch deleted file mode 100644 index bd1b1cb..0000000 --- a/0036-tools-libxl-Replace-deprecated-sdl-option-on-QEMU-co.patch +++ /dev/null @@ -1,38 +0,0 @@ -From c373ad3d084614a93c55e25dc20e70ffc7574971 Mon Sep 17 00:00:00 2001 -From: Anthony PERARD <anthony.perard@citrix.com> -Date: Mon, 15 Aug 2022 15:42:09 +0200 -Subject: [PATCH 036/126] tools/libxl: Replace deprecated -sdl option on QEMU - command line - -"-sdl" is deprecated upstream since 6695e4c0fd9e ("softmmu/vl: -Deprecate the -sdl and -curses option"), QEMU v6.2, and the option is -removed by 707d93d4abc6 ("ui: Remove deprecated options "-sdl" and -"-curses""), in upcoming QEMU v7.1. - -Instead, use "-display sdl", available since 1472a95bab1e ("Introduce --display argument"), before QEMU v1.0. - -Signed-off-by: Anthony PERARD <anthony.perard@citrix.com> -Reviewed-by: Jason Andryuk <jandryuk@gmail.com> -master commit: 41fcb3af8ad6d4c9f65a9d72798e6d18afec55ac -master date: 2022-08-11 11:47:11 +0200 ---- - tools/libs/light/libxl_dm.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/tools/libs/light/libxl_dm.c b/tools/libs/light/libxl_dm.c -index 24f6e73b0a77..ae5f35e0c3fd 100644 ---- a/tools/libs/light/libxl_dm.c -+++ b/tools/libs/light/libxl_dm.c -@@ -1349,7 +1349,7 @@ static int libxl__build_device_model_args_new(libxl__gc *gc, - flexarray_append_pair(dm_args, "-display", "none"); - - if (sdl && !is_stubdom) { -- flexarray_append(dm_args, "-sdl"); -+ flexarray_append_pair(dm_args, "-display", "sdl"); - if (sdl->display) - flexarray_append_pair(dm_envs, "DISPLAY", sdl->display); - if (sdl->xauthority) --- -2.37.4 - diff --git a/0037-x86-spec-ctrl-Enumeration-for-PBRSB_NO.patch b/0037-x86-spec-ctrl-Enumeration-for-PBRSB_NO.patch deleted file mode 100644 index bfd812b..0000000 --- a/0037-x86-spec-ctrl-Enumeration-for-PBRSB_NO.patch +++ /dev/null @@ -1,67 +0,0 @@ -From fba0c22e79922085c46527eb1391123aadfb24d1 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Mon, 15 Aug 2022 15:42:31 +0200 -Subject: [PATCH 037/126] x86/spec-ctrl: Enumeration for PBRSB_NO - -The PBRSB_NO bit indicates that the CPU is not vulnerable to the Post-Barrier -RSB speculative vulnerability. 
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: b874e47eb13feb75be3ee7b5dc4ae9c97d80d774 -master date: 2022-08-11 16:19:50 +0100 ---- - xen/arch/x86/msr.c | 2 +- - xen/arch/x86/spec_ctrl.c | 3 ++- - xen/include/asm-x86/msr-index.h | 1 + - 3 files changed, 4 insertions(+), 2 deletions(-) - -diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c -index aa9face9aad3..9bced8d36caa 100644 ---- a/xen/arch/x86/msr.c -+++ b/xen/arch/x86/msr.c -@@ -148,7 +148,7 @@ int init_domain_msr_policy(struct domain *d) - ARCH_CAPS_SSB_NO | ARCH_CAPS_MDS_NO | ARCH_CAPS_IF_PSCHANGE_MC_NO | - ARCH_CAPS_TAA_NO | ARCH_CAPS_SBDR_SSDP_NO | ARCH_CAPS_FBSDP_NO | - ARCH_CAPS_PSDP_NO | ARCH_CAPS_FB_CLEAR | ARCH_CAPS_RRSBA | -- ARCH_CAPS_BHI_NO); -+ ARCH_CAPS_BHI_NO | ARCH_CAPS_PBRSB_NO); - } - - d->arch.msr = mp; -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index ac73806eacd8..3ff602bd0281 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -419,7 +419,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - * Hardware read-only information, stating immunity to certain issues, or - * suggestions of which mitigation to use. - */ -- printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", -+ printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", - (caps & ARCH_CAPS_RDCL_NO) ? " RDCL_NO" : "", - (caps & ARCH_CAPS_IBRS_ALL) ? " IBRS_ALL" : "", - (caps & ARCH_CAPS_RSBA) ? " RSBA" : "", -@@ -431,6 +431,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - (caps & ARCH_CAPS_SBDR_SSDP_NO) ? " SBDR_SSDP_NO" : "", - (caps & ARCH_CAPS_FBSDP_NO) ? " FBSDP_NO" : "", - (caps & ARCH_CAPS_PSDP_NO) ? " PSDP_NO" : "", -+ (caps & ARCH_CAPS_PBRSB_NO) ? " PBRSB_NO" : "", - (e8b & cpufeat_mask(X86_FEATURE_IBRS_ALWAYS)) ? " IBRS_ALWAYS" : "", - (e8b & cpufeat_mask(X86_FEATURE_STIBP_ALWAYS)) ? " STIBP_ALWAYS" : "", - (e8b & cpufeat_mask(X86_FEATURE_IBRS_FAST)) ? " IBRS_FAST" : "", -diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h -index 49ca1f1845e6..5a830f76a8d4 100644 ---- a/xen/include/asm-x86/msr-index.h -+++ b/xen/include/asm-x86/msr-index.h -@@ -66,6 +66,7 @@ - #define ARCH_CAPS_FB_CLEAR_CTRL (_AC(1, ULL) << 18) - #define ARCH_CAPS_RRSBA (_AC(1, ULL) << 19) - #define ARCH_CAPS_BHI_NO (_AC(1, ULL) << 20) -+#define ARCH_CAPS_PBRSB_NO (_AC(1, ULL) << 24) - - #define MSR_FLUSH_CMD 0x0000010b - #define FLUSH_CMD_L1D (_AC(1, ULL) << 0) --- -2.37.4 - diff --git a/0038-x86-amd-only-call-setup_force_cpu_cap-for-boot-CPU.patch b/0038-x86-amd-only-call-setup_force_cpu_cap-for-boot-CPU.patch deleted file mode 100644 index e3d159b..0000000 --- a/0038-x86-amd-only-call-setup_force_cpu_cap-for-boot-CPU.patch +++ /dev/null @@ -1,33 +0,0 @@ -From 104a54a307b08945365faf6d285cd5a02f94a80f Mon Sep 17 00:00:00 2001 -From: Ross Lagerwall <ross.lagerwall@citrix.com> -Date: Mon, 15 Aug 2022 15:43:08 +0200 -Subject: [PATCH 038/126] x86/amd: only call setup_force_cpu_cap for boot CPU - -This should only be called for the boot CPU to avoid calling _init code -after it has been unloaded. 
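[Editor's note] The fix restricts a one-shot, init-time action to the boot CPU so that CPU-hotplug paths cannot re-enter code that may already have been discarded. The standalone C sketch below shows only the general shape of that guard; the structure and function names are invented for illustration and are not the Xen implementation.

    #include <stdbool.h>
    #include <stdio.h>

    struct cpuinfo {
        unsigned int index;
        bool has_clflushopt;
    };

    /* Identity of the boot CPU; secondary CPUs get their own structures. */
    static struct cpuinfo boot_cpu_data = { .index = 0, .has_clflushopt = false };

    /* One-shot action that must only run once, during early boot. */
    static void force_workaround(void)
    {
        printf("forcing ordering workaround\n");
    }

    /* Per-CPU init: the one-shot path is taken for the boot CPU only. */
    static void init_cpu(const struct cpuinfo *c)
    {
        if (c == &boot_cpu_data && !c->has_clflushopt)
            force_workaround();
        /* secondary/hotplugged CPUs skip the one-shot path entirely */
    }

    int main(void)
    {
        struct cpuinfo secondary = { .index = 1, .has_clflushopt = false };

        init_cpu(&boot_cpu_data);   /* triggers the workaround once */
        init_cpu(&secondary);       /* no effect for later CPUs */
        return 0;
    }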
- -Fixes: 062868a5a8b4 ("x86/amd: Work around CLFLUSH ordering on older parts") -Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: 31b41ce858c8bd5159212d40969f8e0b7124bbf0 -master date: 2022-08-11 17:44:26 +0200 ---- - xen/arch/x86/cpu/amd.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c -index 60dbe61a61ca..a8d2fb8a1590 100644 ---- a/xen/arch/x86/cpu/amd.c -+++ b/xen/arch/x86/cpu/amd.c -@@ -820,7 +820,7 @@ static void init_amd(struct cpuinfo_x86 *c) - * everything, including reads and writes to address, and - * LFENCE/SFENCE instructions. - */ -- if (!cpu_has_clflushopt) -+ if (c == &boot_cpu_data && !cpu_has_clflushopt) - setup_force_cpu_cap(X86_BUG_CLFLUSH_MFENCE); - - switch(c->x86) --- -2.37.4 - diff --git a/0039-build-x86-suppress-GNU-ld-2.39-warning-about-RWX-loa.patch b/0039-build-x86-suppress-GNU-ld-2.39-warning-about-RWX-loa.patch deleted file mode 100644 index f6e62b7..0000000 --- a/0039-build-x86-suppress-GNU-ld-2.39-warning-about-RWX-loa.patch +++ /dev/null @@ -1,38 +0,0 @@ -From a075900cf768fe45f270b6f1d09c4e99281da142 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Mon, 15 Aug 2022 15:43:56 +0200 -Subject: [PATCH 039/126] build/x86: suppress GNU ld 2.39 warning about RWX - load segments - -Commit 68f5aac012b9 ("build: suppress future GNU ld warning about RWX -load segments") didn't quite cover all the cases: Apparently I missed -ones in the building of 32-bit helper objects because of only looking at -incremental builds (where those wouldn't normally be re-built). Clone -the workaround there to the specific Makefile in question. - -Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: 3eb1865ae305772b558757904d81951e31de43de -master date: 2022-08-11 17:45:12 +0200 ---- - xen/arch/x86/boot/build32.mk | 3 +++ - 1 file changed, 3 insertions(+) - -diff --git a/xen/arch/x86/boot/build32.mk b/xen/arch/x86/boot/build32.mk -index e90680cd9f52..d2fae5cf9eee 100644 ---- a/xen/arch/x86/boot/build32.mk -+++ b/xen/arch/x86/boot/build32.mk -@@ -8,6 +8,9 @@ CFLAGS += -Werror -fno-builtin -g0 -msoft-float - CFLAGS += -I$(BASEDIR)/include - CFLAGS := $(filter-out -flto,$(CFLAGS)) - -+LDFLAGS_DIRECT-$(shell $(LD) -v --warn-rwx-segments >/dev/null 2>&1 && echo y) := --no-warn-rwx-segments -+LDFLAGS_DIRECT += $(LDFLAGS_DIRECT-y) -+ - # NB. awk invocation is a portable alternative to 'head -n -1' - %.S: %.bin - (od -v -t x $< | tr -s ' ' | awk 'NR > 1 {print s} {s=$$0}' | \ --- -2.37.4 - diff --git a/0040-PCI-simplify-and-thus-correct-pci_get_pdev-_by_domai.patch b/0040-PCI-simplify-and-thus-correct-pci_get_pdev-_by_domai.patch deleted file mode 100644 index 1de5d0d..0000000 --- a/0040-PCI-simplify-and-thus-correct-pci_get_pdev-_by_domai.patch +++ /dev/null @@ -1,153 +0,0 @@ -From 9acedc3c58c31930737edbe212f2ccf437a0b757 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Mon, 15 Aug 2022 15:44:23 +0200 -Subject: [PATCH 040/126] PCI: simplify (and thus correct) - pci_get_pdev{,_by_domain}() - -The last "wildcard" use of either function went away with f591755823a7 -("IOMMU/PCI: don't let domain cleanup continue when device de-assignment -failed"). Don't allow them to be called this way anymore. 
Besides -simplifying the code this also fixes two bugs: - -1) When seg != -1, the outer loops should have been terminated after the - first iteration, or else a device with the same BDF but on another - segment could be found / returned. - -Reported-by: Rahul Singh <rahul.singh@arm.com> - -2) When seg == -1 calling get_pseg() is bogus. The function (taking a - u16) would look for segment 0xffff, which might exist. If it exists, - we might then find / return a wrong device. - -In pci_get_pdev_by_domain() also switch from using the per-segment list -to using the per-domain one, with the exception of the hardware domain -(see the code comment there). - -While there also constify "pseg" and drop "pdev"'s already previously -unnecessary initializer. - -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Rahul Singh <rahul.singh@arm.com> -Tested-by: Rahul Singh <rahul.singh@arm.com> -master commit: 8cf6e0738906fc269af40135ed82a07815dd3b9c -master date: 2022-08-12 08:34:33 +0200 ---- - xen/drivers/passthrough/pci.c | 61 +++++++++++++++-------------------- - xen/include/xen/pci.h | 6 ++-- - 2 files changed, 29 insertions(+), 38 deletions(-) - -diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c -index bbacbe41dac4..9b81b941c8bb 100644 ---- a/xen/drivers/passthrough/pci.c -+++ b/xen/drivers/passthrough/pci.c -@@ -528,30 +528,19 @@ int __init pci_ro_device(int seg, int bus, int devfn) - return 0; - } - --struct pci_dev *pci_get_pdev(int seg, int bus, int devfn) -+struct pci_dev *pci_get_pdev(uint16_t seg, uint8_t bus, uint8_t devfn) - { -- struct pci_seg *pseg = get_pseg(seg); -- struct pci_dev *pdev = NULL; -+ const struct pci_seg *pseg = get_pseg(seg); -+ struct pci_dev *pdev; - - ASSERT(pcidevs_locked()); -- ASSERT(seg != -1 || bus == -1); -- ASSERT(bus != -1 || devfn == -1); - - if ( !pseg ) -- { -- if ( seg == -1 ) -- radix_tree_gang_lookup(&pci_segments, (void **)&pseg, 0, 1); -- if ( !pseg ) -- return NULL; -- } -+ return NULL; - -- do { -- list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list ) -- if ( (pdev->bus == bus || bus == -1) && -- (pdev->devfn == devfn || devfn == -1) ) -- return pdev; -- } while ( radix_tree_gang_lookup(&pci_segments, (void **)&pseg, -- pseg->nr + 1, 1) ); -+ list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list ) -+ if ( pdev->bus == bus && pdev->devfn == devfn ) -+ return pdev; - - return NULL; - } -@@ -577,31 +566,33 @@ struct pci_dev *pci_get_real_pdev(int seg, int bus, int devfn) - return pdev; - } - --struct pci_dev *pci_get_pdev_by_domain(const struct domain *d, int seg, -- int bus, int devfn) -+struct pci_dev *pci_get_pdev_by_domain(const struct domain *d, uint16_t seg, -+ uint8_t bus, uint8_t devfn) - { -- struct pci_seg *pseg = get_pseg(seg); -- struct pci_dev *pdev = NULL; -+ struct pci_dev *pdev; - -- ASSERT(seg != -1 || bus == -1); -- ASSERT(bus != -1 || devfn == -1); -- -- if ( !pseg ) -+ /* -+ * The hardware domain owns the majority of the devices in the system. -+ * When there are multiple segments, traversing the per-segment list is -+ * likely going to be faster, whereas for a single segment the difference -+ * shouldn't be that large. 
-+ */ -+ if ( is_hardware_domain(d) ) - { -- if ( seg == -1 ) -- radix_tree_gang_lookup(&pci_segments, (void **)&pseg, 0, 1); -+ const struct pci_seg *pseg = get_pseg(seg); -+ - if ( !pseg ) - return NULL; -- } - -- do { - list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list ) -- if ( (pdev->bus == bus || bus == -1) && -- (pdev->devfn == devfn || devfn == -1) && -- (pdev->domain == d) ) -+ if ( pdev->bus == bus && pdev->devfn == devfn && -+ pdev->domain == d ) -+ return pdev; -+ } -+ else -+ list_for_each_entry ( pdev, &d->pdev_list, domain_list ) -+ if ( pdev->bus == bus && pdev->devfn == devfn ) - return pdev; -- } while ( radix_tree_gang_lookup(&pci_segments, (void **)&pseg, -- pseg->nr + 1, 1) ); - - return NULL; - } -diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h -index 8e3d4d94543a..cd238ae852b0 100644 ---- a/xen/include/xen/pci.h -+++ b/xen/include/xen/pci.h -@@ -166,10 +166,10 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn, - int pci_remove_device(u16 seg, u8 bus, u8 devfn); - int pci_ro_device(int seg, int bus, int devfn); - int pci_hide_device(unsigned int seg, unsigned int bus, unsigned int devfn); --struct pci_dev *pci_get_pdev(int seg, int bus, int devfn); -+struct pci_dev *pci_get_pdev(uint16_t seg, uint8_t bus, uint8_t devfn); - struct pci_dev *pci_get_real_pdev(int seg, int bus, int devfn); --struct pci_dev *pci_get_pdev_by_domain(const struct domain *, int seg, -- int bus, int devfn); -+struct pci_dev *pci_get_pdev_by_domain(const struct domain *, uint16_t seg, -+ uint8_t bus, uint8_t devfn); - void pci_check_disable_device(u16 seg, u8 bus, u8 devfn); - - uint8_t pci_conf_read8(pci_sbdf_t sbdf, unsigned int reg); --- -2.37.4 - diff --git a/0041-xen-arm-p2m-Prevent-adding-mapping-when-domain-is-dy.patch b/0041-xen-arm-p2m-Prevent-adding-mapping-when-domain-is-dy.patch deleted file mode 100644 index e695f96..0000000 --- a/0041-xen-arm-p2m-Prevent-adding-mapping-when-domain-is-dy.patch +++ /dev/null @@ -1,62 +0,0 @@ -From 09fc590c15773c2471946a78740c6b02e8c34a45 Mon Sep 17 00:00:00 2001 -From: Julien Grall <jgrall@amazon.com> -Date: Tue, 11 Oct 2022 15:05:53 +0200 -Subject: [PATCH 041/126] xen/arm: p2m: Prevent adding mapping when domain is - dying - -During the domain destroy process, the domain will still be accessible -until it is fully destroyed. So does the P2M because we don't bail -out early if is_dying is non-zero. If a domain has permission to -modify the other domain's P2M (i.e. dom0, or a stubdomain), then -foreign mapping can be added past relinquish_p2m_mapping(). - -Therefore, we need to prevent mapping to be added when the domain -is dying. This commit prevents such adding of mapping by adding the -d->is_dying check to p2m_set_entry(). Also this commit enhances the -check in relinquish_p2m_mapping() to make sure that no mappings can -be added in the P2M after the P2M lock is released. - -This is part of CVE-2022-33746 / XSA-410. 
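[Editor's note] The core of this fix is a "no new work once teardown has started" check at the single entry point that mutates the P2M. The standalone C sketch below illustrates that pattern only, with invented structure names and a placeholder error code; it is not the Xen API.

    #include <errno.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct domain { bool is_dying; };
    struct p2m    { struct domain *domain; unsigned long entries; };

    /* Any mutation of the P2M bails out once the owning domain is dying,
     * so no new mappings can appear after relinquish has dropped them. */
    static int p2m_set_entry_sketch(struct p2m *p2m, unsigned long gfn)
    {
        if (p2m->domain->is_dying)
            return -ENOMEM;     /* refuse: teardown already in progress */

        (void)gfn;
        p2m->entries++;         /* stand-in for the real table update */
        return 0;
    }

    int main(void)
    {
        struct domain d = { .is_dying = false };
        struct p2m p2m = { .domain = &d, .entries = 0 };

        printf("live:  %d\n", p2m_set_entry_sketch(&p2m, 0x1000));  /* 0 */
        d.is_dying = true;
        printf("dying: %d\n", p2m_set_entry_sketch(&p2m, 0x2000));  /* -ENOMEM */
        return 0;
    }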
- -Signed-off-by: Julien Grall <jgrall@amazon.com> -Signed-off-by: Henry Wang <Henry.Wang@arm.com> -Tested-by: Henry Wang <Henry.Wang@arm.com> -Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> -master commit: 3ebe773293e3b945460a3d6f54f3b91915397bab -master date: 2022-10-11 14:20:18 +0200 ---- - xen/arch/arm/p2m.c | 11 +++++++++++ - 1 file changed, 11 insertions(+) - -diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c -index 2ddd06801a82..8398251c518b 100644 ---- a/xen/arch/arm/p2m.c -+++ b/xen/arch/arm/p2m.c -@@ -1093,6 +1093,15 @@ int p2m_set_entry(struct p2m_domain *p2m, - { - int rc = 0; - -+ /* -+ * Any reference taken by the P2M mappings (e.g. foreign mapping) will -+ * be dropped in relinquish_p2m_mapping(). As the P2M will still -+ * be accessible after, we need to prevent mapping to be added when the -+ * domain is dying. -+ */ -+ if ( unlikely(p2m->domain->is_dying) ) -+ return -ENOMEM; -+ - while ( nr ) - { - unsigned long mask; -@@ -1613,6 +1622,8 @@ int relinquish_p2m_mapping(struct domain *d) - unsigned int order; - gfn_t start, end; - -+ BUG_ON(!d->is_dying); -+ /* No mappings can be added in the P2M after the P2M lock is released. */ - p2m_write_lock(p2m); - - start = p2m->lowest_mapped_gfn; --- -2.37.4 - diff --git a/0042-xen-arm-p2m-Handle-preemption-when-freeing-intermedi.patch b/0042-xen-arm-p2m-Handle-preemption-when-freeing-intermedi.patch deleted file mode 100644 index 96b8528..0000000 --- a/0042-xen-arm-p2m-Handle-preemption-when-freeing-intermedi.patch +++ /dev/null @@ -1,167 +0,0 @@ -From 0d805f9fba4bc155d15047685024f7d842e925e4 Mon Sep 17 00:00:00 2001 -From: Julien Grall <jgrall@amazon.com> -Date: Tue, 11 Oct 2022 15:06:36 +0200 -Subject: [PATCH 042/126] xen/arm: p2m: Handle preemption when freeing - intermediate page tables - -At the moment the P2M page tables will be freed when the domain structure -is freed without any preemption. As the P2M is quite large, iterating -through this may take more time than it is reasonable without intermediate -preemption (to run softirqs and perhaps scheduler). - -Split p2m_teardown() in two parts: one preemptible and called when -relinquishing the resources, the other one non-preemptible and called -when freeing the domain structure. - -As we are now freeing the P2M pages early, we also need to prevent -further allocation if someone call p2m_set_entry() past p2m_teardown() -(I wasn't able to prove this will never happen). This is done by -the checking domain->is_dying from previous patch in p2m_set_entry(). - -Similarly, we want to make sure that no-one can accessed the free -pages. Therefore the root is cleared before freeing pages. - -This is part of CVE-2022-33746 / XSA-410. - -Signed-off-by: Julien Grall <jgrall@amazon.com> -Signed-off-by: Henry Wang <Henry.Wang@arm.com> -Tested-by: Henry Wang <Henry.Wang@arm.com> -Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> -master commit: 3202084566bba0ef0c45caf8c24302f83d92f9c8 -master date: 2022-10-11 14:20:56 +0200 ---- - xen/arch/arm/domain.c | 10 +++++++-- - xen/arch/arm/p2m.c | 47 ++++++++++++++++++++++++++++++++++++--- - xen/include/asm-arm/p2m.h | 13 +++++++++-- - 3 files changed, 63 insertions(+), 7 deletions(-) - -diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c -index 5eaf4c718ec3..223ec9694df1 100644 ---- a/xen/arch/arm/domain.c -+++ b/xen/arch/arm/domain.c -@@ -779,10 +779,10 @@ fail: - void arch_domain_destroy(struct domain *d) - { - /* IOMMU page table is shared with P2M, always call -- * iommu_domain_destroy() before p2m_teardown(). 
-+ * iommu_domain_destroy() before p2m_final_teardown(). - */ - iommu_domain_destroy(d); -- p2m_teardown(d); -+ p2m_final_teardown(d); - domain_vgic_free(d); - domain_vuart_free(d); - free_xenheap_page(d->shared_info); -@@ -984,6 +984,7 @@ enum { - PROG_xen, - PROG_page, - PROG_mapping, -+ PROG_p2m, - PROG_done, - }; - -@@ -1038,6 +1039,11 @@ int domain_relinquish_resources(struct domain *d) - if ( ret ) - return ret; - -+ PROGRESS(p2m): -+ ret = p2m_teardown(d); -+ if ( ret ) -+ return ret; -+ - PROGRESS(done): - break; - -diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c -index 8398251c518b..4ad3e0606e9c 100644 ---- a/xen/arch/arm/p2m.c -+++ b/xen/arch/arm/p2m.c -@@ -1530,17 +1530,58 @@ static void p2m_free_vmid(struct domain *d) - spin_unlock(&vmid_alloc_lock); - } - --void p2m_teardown(struct domain *d) -+int p2m_teardown(struct domain *d) - { - struct p2m_domain *p2m = p2m_get_hostp2m(d); -+ unsigned long count = 0; - struct page_info *pg; -+ unsigned int i; -+ int rc = 0; -+ -+ p2m_write_lock(p2m); -+ -+ /* -+ * We are about to free the intermediate page-tables, so clear the -+ * root to prevent any walk to use them. -+ */ -+ for ( i = 0; i < P2M_ROOT_PAGES; i++ ) -+ clear_and_clean_page(p2m->root + i); -+ -+ /* -+ * The domain will not be scheduled anymore, so in theory we should -+ * not need to flush the TLBs. Do it for safety purpose. -+ * -+ * Note that all the devices have already been de-assigned. So we don't -+ * need to flush the IOMMU TLB here. -+ */ -+ p2m_force_tlb_flush_sync(p2m); -+ -+ while ( (pg = page_list_remove_head(&p2m->pages)) ) -+ { -+ free_domheap_page(pg); -+ count++; -+ /* Arbitrarily preempt every 512 iterations */ -+ if ( !(count % 512) && hypercall_preempt_check() ) -+ { -+ rc = -ERESTART; -+ break; -+ } -+ } -+ -+ p2m_write_unlock(p2m); -+ -+ return rc; -+} -+ -+void p2m_final_teardown(struct domain *d) -+{ -+ struct p2m_domain *p2m = p2m_get_hostp2m(d); - - /* p2m not actually initialized */ - if ( !p2m->domain ) - return; - -- while ( (pg = page_list_remove_head(&p2m->pages)) ) -- free_domheap_page(pg); -+ ASSERT(page_list_empty(&p2m->pages)); - - if ( p2m->root ) - free_domheap_pages(p2m->root, P2M_ROOT_ORDER); -diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h -index 6a2108398fd7..3a2d51b35d71 100644 ---- a/xen/include/asm-arm/p2m.h -+++ b/xen/include/asm-arm/p2m.h -@@ -192,8 +192,17 @@ void setup_virt_paging(void); - /* Init the datastructures for later use by the p2m code */ - int p2m_init(struct domain *d); - --/* Return all the p2m resources to Xen. */ --void p2m_teardown(struct domain *d); -+/* -+ * The P2M resources are freed in two parts: -+ * - p2m_teardown() will be called when relinquish the resources. It -+ * will free large resources (e.g. intermediate page-tables) that -+ * requires preemption. -+ * - p2m_final_teardown() will be called when domain struct is been -+ * freed. This *cannot* be preempted and therefore one small -+ * resources should be freed here. 
-+ */ -+int p2m_teardown(struct domain *d); -+void p2m_final_teardown(struct domain *d); - - /* - * Remove mapping refcount on each mapping page in the p2m --- -2.37.4 - diff --git a/0043-x86-p2m-add-option-to-skip-root-pagetable-removal-in.patch b/0043-x86-p2m-add-option-to-skip-root-pagetable-removal-in.patch deleted file mode 100644 index f8d61bb..0000000 --- a/0043-x86-p2m-add-option-to-skip-root-pagetable-removal-in.patch +++ /dev/null @@ -1,138 +0,0 @@ -From 0f3eab90f327210d91e8e31a769376f286e8819a Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 11 Oct 2022 15:07:25 +0200 -Subject: [PATCH 043/126] x86/p2m: add option to skip root pagetable removal in - p2m_teardown() -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Add a new parameter to p2m_teardown() in order to select whether the -root page table should also be freed. Note that all users are -adjusted to pass the parameter to remove the root page tables, so -behavior is not modified. - -No functional change intended. - -This is part of CVE-2022-33746 / XSA-410. - -Suggested-by: Julien Grall <julien@xen.org> -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Tim Deegan <tim@xen.org> -master commit: 1df52a270225527ae27bfa2fc40347bf93b78357 -master date: 2022-10-11 14:21:23 +0200 ---- - xen/arch/x86/mm/hap/hap.c | 6 +++--- - xen/arch/x86/mm/p2m.c | 20 ++++++++++++++++---- - xen/arch/x86/mm/shadow/common.c | 4 ++-- - xen/include/asm-x86/p2m.h | 2 +- - 4 files changed, 22 insertions(+), 10 deletions(-) - -diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c -index 47a7487fa7a3..a8f5a19da917 100644 ---- a/xen/arch/x86/mm/hap/hap.c -+++ b/xen/arch/x86/mm/hap/hap.c -@@ -541,18 +541,18 @@ void hap_final_teardown(struct domain *d) - } - - for ( i = 0; i < MAX_ALTP2M; i++ ) -- p2m_teardown(d->arch.altp2m_p2m[i]); -+ p2m_teardown(d->arch.altp2m_p2m[i], true); - } - - /* Destroy nestedp2m's first */ - for (i = 0; i < MAX_NESTEDP2M; i++) { -- p2m_teardown(d->arch.nested_p2m[i]); -+ p2m_teardown(d->arch.nested_p2m[i], true); - } - - if ( d->arch.paging.hap.total_pages != 0 ) - hap_teardown(d, NULL); - -- p2m_teardown(p2m_get_hostp2m(d)); -+ p2m_teardown(p2m_get_hostp2m(d), true); - /* Free any memory that the p2m teardown released */ - paging_lock(d); - hap_set_allocation(d, 0, NULL); -diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c -index 85681dee2623..8ba73082c1bf 100644 ---- a/xen/arch/x86/mm/p2m.c -+++ b/xen/arch/x86/mm/p2m.c -@@ -741,11 +741,11 @@ int p2m_alloc_table(struct p2m_domain *p2m) - * hvm fixme: when adding support for pvh non-hardware domains, this path must - * cleanup any foreign p2m types (release refcnts on them). - */ --void p2m_teardown(struct p2m_domain *p2m) -+void p2m_teardown(struct p2m_domain *p2m, bool remove_root) - /* Return all the p2m pages to Xen. 
- * We know we don't have any extra mappings to these pages */ - { -- struct page_info *pg; -+ struct page_info *pg, *root_pg = NULL; - struct domain *d; - - if (p2m == NULL) -@@ -755,10 +755,22 @@ void p2m_teardown(struct p2m_domain *p2m) - - p2m_lock(p2m); - ASSERT(atomic_read(&d->shr_pages) == 0); -- p2m->phys_table = pagetable_null(); -+ -+ if ( remove_root ) -+ p2m->phys_table = pagetable_null(); -+ else if ( !pagetable_is_null(p2m->phys_table) ) -+ { -+ root_pg = pagetable_get_page(p2m->phys_table); -+ clear_domain_page(pagetable_get_mfn(p2m->phys_table)); -+ } - - while ( (pg = page_list_remove_head(&p2m->pages)) ) -- d->arch.paging.free_page(d, pg); -+ if ( pg != root_pg ) -+ d->arch.paging.free_page(d, pg); -+ -+ if ( root_pg ) -+ page_list_add(root_pg, &p2m->pages); -+ - p2m_unlock(p2m); - } - -diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c -index 4a8882430b3f..abe6d4334382 100644 ---- a/xen/arch/x86/mm/shadow/common.c -+++ b/xen/arch/x86/mm/shadow/common.c -@@ -2768,7 +2768,7 @@ int shadow_enable(struct domain *d, u32 mode) - paging_unlock(d); - out_unlocked: - if ( rv != 0 && !pagetable_is_null(p2m_get_pagetable(p2m)) ) -- p2m_teardown(p2m); -+ p2m_teardown(p2m, true); - if ( rv != 0 && pg != NULL ) - { - pg->count_info &= ~PGC_count_mask; -@@ -2933,7 +2933,7 @@ void shadow_final_teardown(struct domain *d) - shadow_teardown(d, NULL); - - /* It is now safe to pull down the p2m map. */ -- p2m_teardown(p2m_get_hostp2m(d)); -+ p2m_teardown(p2m_get_hostp2m(d), true); - /* Free any shadow memory that the p2m teardown released */ - paging_lock(d); - shadow_set_allocation(d, 0, NULL); -diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h -index 46e8b94a49df..46eb51d44cf5 100644 ---- a/xen/include/asm-x86/p2m.h -+++ b/xen/include/asm-x86/p2m.h -@@ -619,7 +619,7 @@ int p2m_init(struct domain *d); - int p2m_alloc_table(struct p2m_domain *p2m); - - /* Return all the p2m resources to Xen. */ --void p2m_teardown(struct p2m_domain *p2m); -+void p2m_teardown(struct p2m_domain *p2m, bool remove_root); - void p2m_final_teardown(struct domain *d); - - /* Add a page to a domain's p2m table */ --- -2.37.4 - diff --git a/0044-x86-HAP-adjust-monitor-table-related-error-handling.patch b/0044-x86-HAP-adjust-monitor-table-related-error-handling.patch deleted file mode 100644 index 97a55a5..0000000 --- a/0044-x86-HAP-adjust-monitor-table-related-error-handling.patch +++ /dev/null @@ -1,77 +0,0 @@ -From d24a10a91d46a56e1d406239643ec651a31033d4 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 11 Oct 2022 15:07:42 +0200 -Subject: [PATCH 044/126] x86/HAP: adjust monitor table related error handling -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -hap_make_monitor_table() will return INVALID_MFN if it encounters an -error condition, but hap_update_paging_modes() wasn’t handling this -value, resulting in an inappropriate value being stored in -monitor_table. This would subsequently misguide at least -hap_vcpu_teardown(). Avoid this by bailing early. - -Further, when a domain has/was already crashed or (perhaps less -important as there's no such path known to lead here) is already dying, -avoid calling domain_crash() on it again - that's at best confusing. - -This is part of CVE-2022-33746 / XSA-410. 
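[Editor's note] The change described above is a sentinel-return check: an allocation helper that can fail reports failure through a reserved "invalid" value, and the caller must test for it before storing or using the result. The standalone C sketch below shows that idiom only; the names and values are illustrative, not the real Xen types.

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t mfn_t;
    #define INVALID_MFN UINT64_MAX   /* reserved sentinel, never a real frame */

    /* May fail; failure is reported via the sentinel instead of crashing. */
    static mfn_t make_monitor_table_sketch(int out_of_memory)
    {
        return out_of_memory ? INVALID_MFN : 0x1234;
    }

    /* Caller bails out early rather than storing a bogus table pointer. */
    static int update_paging_mode_sketch(int out_of_memory)
    {
        mfn_t mmfn = make_monitor_table_sketch(out_of_memory);

        if (mmfn == INVALID_MFN)
            return -1;

        printf("installed monitor table at mfn %#llx\n",
               (unsigned long long)mmfn);
        return 0;
    }

    int main(void)
    {
        update_paging_mode_sketch(0);   /* success path */
        update_paging_mode_sketch(1);   /* failure path: nothing bogus stored */
        return 0;
    }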
- -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> -master commit: 5b44a61180f4f2e4f490a28400c884dd357ff45d -master date: 2022-10-11 14:21:56 +0200 ---- - xen/arch/x86/mm/hap/hap.c | 14 ++++++++++++-- - 1 file changed, 12 insertions(+), 2 deletions(-) - -diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c -index a8f5a19da917..d75dc2b9ed3d 100644 ---- a/xen/arch/x86/mm/hap/hap.c -+++ b/xen/arch/x86/mm/hap/hap.c -@@ -39,6 +39,7 @@ - #include <asm/domain.h> - #include <xen/numa.h> - #include <asm/hvm/nestedhvm.h> -+#include <public/sched.h> - - #include "private.h" - -@@ -405,8 +406,13 @@ static mfn_t hap_make_monitor_table(struct vcpu *v) - return m4mfn; - - oom: -- printk(XENLOG_G_ERR "out of memory building monitor pagetable\n"); -- domain_crash(d); -+ if ( !d->is_dying && -+ (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) ) -+ { -+ printk(XENLOG_G_ERR "%pd: out of memory building monitor pagetable\n", -+ d); -+ domain_crash(d); -+ } - return INVALID_MFN; - } - -@@ -766,6 +772,9 @@ static void hap_update_paging_modes(struct vcpu *v) - if ( pagetable_is_null(v->arch.hvm.monitor_table) ) - { - mfn_t mmfn = hap_make_monitor_table(v); -+ -+ if ( mfn_eq(mmfn, INVALID_MFN) ) -+ goto unlock; - v->arch.hvm.monitor_table = pagetable_from_mfn(mmfn); - make_cr3(v, mmfn); - hvm_update_host_cr3(v); -@@ -774,6 +783,7 @@ static void hap_update_paging_modes(struct vcpu *v) - /* CR3 is effectively updated by a mode change. Flush ASIDs, etc. */ - hap_update_cr3(v, 0, false); - -+ unlock: - paging_unlock(d); - put_gfn(d, cr3_gfn); - } --- -2.37.4 - diff --git a/0045-x86-shadow-tolerate-failure-of-sh_set_toplevel_shado.patch b/0045-x86-shadow-tolerate-failure-of-sh_set_toplevel_shado.patch deleted file mode 100644 index 08ff309..0000000 --- a/0045-x86-shadow-tolerate-failure-of-sh_set_toplevel_shado.patch +++ /dev/null @@ -1,76 +0,0 @@ -From 95f6d555ec84383f7daaf3374f65bec5ff4351f5 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 11 Oct 2022 15:07:57 +0200 -Subject: [PATCH 045/126] x86/shadow: tolerate failure of - sh_set_toplevel_shadow() - -Subsequently sh_set_toplevel_shadow() will be adjusted to install a -blank entry in case prealloc fails. There are, in fact, pre-existing -error paths which would put in place a blank entry. The 4- and 2-level -code in sh_update_cr3(), however, assume the top level entry to be -valid. - -Hence bail from the function in the unlikely event that it's not. Note -that 3-level logic works differently: In particular a guest is free to -supply a PDPTR pointing at 4 non-present (or otherwise deemed invalid) -entries. The guest will crash, but we already cope with that. - -Really mfn_valid() is likely wrong to use in sh_set_toplevel_shadow(), -and it should instead be !mfn_eq(gmfn, INVALID_MFN). Avoid such a change -in security context, but add a respective assertion. - -This is part of CVE-2022-33746 / XSA-410. 
- -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Tim Deegan <tim@xen.org> -Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: eac000978c1feb5a9ee3236ab0c0da9a477e5336 -master date: 2022-10-11 14:22:24 +0200 ---- - xen/arch/x86/mm/shadow/common.c | 1 + - xen/arch/x86/mm/shadow/multi.c | 10 ++++++++++ - 2 files changed, 11 insertions(+) - -diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c -index abe6d4334382..0ab2ac6b7a3c 100644 ---- a/xen/arch/x86/mm/shadow/common.c -+++ b/xen/arch/x86/mm/shadow/common.c -@@ -2583,6 +2583,7 @@ void sh_set_toplevel_shadow(struct vcpu *v, - /* Now figure out the new contents: is this a valid guest MFN? */ - if ( !mfn_valid(gmfn) ) - { -+ ASSERT(mfn_eq(gmfn, INVALID_MFN)); - new_entry = pagetable_null(); - goto install_new_entry; - } -diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c -index 9b43cb116c47..7e0494cf7faa 100644 ---- a/xen/arch/x86/mm/shadow/multi.c -+++ b/xen/arch/x86/mm/shadow/multi.c -@@ -3697,6 +3697,11 @@ sh_update_cr3(struct vcpu *v, int do_locking, bool noflush) - if ( sh_remove_write_access(d, gmfn, 4, 0) != 0 ) - guest_flush_tlb_mask(d, d->dirty_cpumask); - sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow, sh_make_shadow); -+ if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) ) -+ { -+ ASSERT(d->is_dying || d->is_shutting_down); -+ return; -+ } - if ( !shadow_mode_external(d) && !is_pv_32bit_domain(d) ) - { - mfn_t smfn = pagetable_get_mfn(v->arch.paging.shadow.shadow_table[0]); -@@ -3757,6 +3762,11 @@ sh_update_cr3(struct vcpu *v, int do_locking, bool noflush) - if ( sh_remove_write_access(d, gmfn, 2, 0) != 0 ) - guest_flush_tlb_mask(d, d->dirty_cpumask); - sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow, sh_make_shadow); -+ if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) ) -+ { -+ ASSERT(d->is_dying || d->is_shutting_down); -+ return; -+ } - #else - #error This should never happen - #endif --- -2.37.4 - diff --git a/0046-x86-shadow-tolerate-failure-in-shadow_prealloc.patch b/0046-x86-shadow-tolerate-failure-in-shadow_prealloc.patch deleted file mode 100644 index 4773eef..0000000 --- a/0046-x86-shadow-tolerate-failure-in-shadow_prealloc.patch +++ /dev/null @@ -1,279 +0,0 @@ -From 1e26afa846fb9a00b9155280eeae3b8cb8375dd6 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 11 Oct 2022 15:08:14 +0200 -Subject: [PATCH 046/126] x86/shadow: tolerate failure in shadow_prealloc() -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Prevent _shadow_prealloc() from calling BUG() when unable to fulfill -the pre-allocation and instead return true/false. Modify -shadow_prealloc() to crash the domain on allocation failure (if the -domain is not already dying), as shadow cannot operate normally after -that. Modify callers to also gracefully handle {_,}shadow_prealloc() -failing to fulfill the request. - -Note this in turn requires adjusting the callers of -sh_make_monitor_table() also to handle it returning INVALID_MFN. -sh_update_paging_modes() is also modified to add additional error -paths in case of allocation failure, some of those will return with -null monitor page tables (and the domain likely crashed). This is no -different that current error paths, but the newly introduced ones are -more likely to trigger. 
- -The now added failure points in sh_update_paging_modes() also require -that on some error return paths the previous structures are cleared, -and thus monitor table is null. - -While there adjust the 'type' parameter type of shadow_prealloc() to -unsigned int rather than u32. - -This is part of CVE-2022-33746 / XSA-410. - -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Tim Deegan <tim@xen.org> -master commit: b7f93c6afb12b6061e2d19de2f39ea09b569ac68 -master date: 2022-10-11 14:22:53 +0200 ---- - xen/arch/x86/mm/shadow/common.c | 69 ++++++++++++++++++++++++-------- - xen/arch/x86/mm/shadow/hvm.c | 4 +- - xen/arch/x86/mm/shadow/multi.c | 11 +++-- - xen/arch/x86/mm/shadow/private.h | 3 +- - 4 files changed, 66 insertions(+), 21 deletions(-) - -diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c -index 0ab2ac6b7a3c..fc4f7f78ce43 100644 ---- a/xen/arch/x86/mm/shadow/common.c -+++ b/xen/arch/x86/mm/shadow/common.c -@@ -36,6 +36,7 @@ - #include <asm/flushtlb.h> - #include <asm/shadow.h> - #include <xen/numa.h> -+#include <public/sched.h> - #include "private.h" - - DEFINE_PER_CPU(uint32_t,trace_shadow_path_flags); -@@ -927,14 +928,15 @@ static inline void trace_shadow_prealloc_unpin(struct domain *d, mfn_t smfn) - - /* Make sure there are at least count order-sized pages - * available in the shadow page pool. */ --static void _shadow_prealloc(struct domain *d, unsigned int pages) -+static bool __must_check _shadow_prealloc(struct domain *d, unsigned int pages) - { - struct vcpu *v; - struct page_info *sp, *t; - mfn_t smfn; - int i; - -- if ( d->arch.paging.shadow.free_pages >= pages ) return; -+ if ( d->arch.paging.shadow.free_pages >= pages ) -+ return true; - - /* Shouldn't have enabled shadows if we've no vcpus. */ - ASSERT(d->vcpu && d->vcpu[0]); -@@ -950,7 +952,8 @@ static void _shadow_prealloc(struct domain *d, unsigned int pages) - sh_unpin(d, smfn); - - /* See if that freed up enough space */ -- if ( d->arch.paging.shadow.free_pages >= pages ) return; -+ if ( d->arch.paging.shadow.free_pages >= pages ) -+ return true; - } - - /* Stage two: all shadow pages are in use in hierarchies that are -@@ -973,7 +976,7 @@ static void _shadow_prealloc(struct domain *d, unsigned int pages) - if ( d->arch.paging.shadow.free_pages >= pages ) - { - guest_flush_tlb_mask(d, d->dirty_cpumask); -- return; -+ return true; - } - } - } -@@ -986,7 +989,12 @@ static void _shadow_prealloc(struct domain *d, unsigned int pages) - d->arch.paging.shadow.total_pages, - d->arch.paging.shadow.free_pages, - d->arch.paging.shadow.p2m_pages); -- BUG(); -+ -+ ASSERT(d->is_dying); -+ -+ guest_flush_tlb_mask(d, d->dirty_cpumask); -+ -+ return false; - } - - /* Make sure there are at least count pages of the order according to -@@ -994,9 +1002,19 @@ static void _shadow_prealloc(struct domain *d, unsigned int pages) - * This must be called before any calls to shadow_alloc(). Since this - * will free existing shadows to make room, it must be called early enough - * to avoid freeing shadows that the caller is currently working on. 
*/ --void shadow_prealloc(struct domain *d, u32 type, unsigned int count) -+bool shadow_prealloc(struct domain *d, unsigned int type, unsigned int count) - { -- return _shadow_prealloc(d, shadow_size(type) * count); -+ bool ret = _shadow_prealloc(d, shadow_size(type) * count); -+ -+ if ( !ret && !d->is_dying && -+ (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) ) -+ /* -+ * Failing to allocate memory required for shadow usage can only result in -+ * a domain crash, do it here rather that relying on every caller to do it. -+ */ -+ domain_crash(d); -+ -+ return ret; - } - - /* Deliberately free all the memory we can: this will tear down all of -@@ -1215,7 +1233,7 @@ void shadow_free(struct domain *d, mfn_t smfn) - static struct page_info * - shadow_alloc_p2m_page(struct domain *d) - { -- struct page_info *pg; -+ struct page_info *pg = NULL; - - /* This is called both from the p2m code (which never holds the - * paging lock) and the log-dirty code (which always does). */ -@@ -1233,16 +1251,18 @@ shadow_alloc_p2m_page(struct domain *d) - d->arch.paging.shadow.p2m_pages, - shadow_min_acceptable_pages(d)); - } -- paging_unlock(d); -- return NULL; -+ goto out; - } - -- shadow_prealloc(d, SH_type_p2m_table, 1); -+ if ( !shadow_prealloc(d, SH_type_p2m_table, 1) ) -+ goto out; -+ - pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0)); - d->arch.paging.shadow.p2m_pages++; - d->arch.paging.shadow.total_pages--; - ASSERT(!page_get_owner(pg) && !(pg->count_info & PGC_count_mask)); - -+ out: - paging_unlock(d); - - return pg; -@@ -1333,7 +1353,9 @@ int shadow_set_allocation(struct domain *d, unsigned int pages, bool *preempted) - else if ( d->arch.paging.shadow.total_pages > pages ) - { - /* Need to return memory to domheap */ -- _shadow_prealloc(d, 1); -+ if ( !_shadow_prealloc(d, 1) ) -+ return -ENOMEM; -+ - sp = page_list_remove_head(&d->arch.paging.shadow.freelist); - ASSERT(sp); - /* -@@ -2401,12 +2423,13 @@ static void sh_update_paging_modes(struct vcpu *v) - if ( mfn_eq(v->arch.paging.shadow.oos_snapshot[0], INVALID_MFN) ) - { - int i; -+ -+ if ( !shadow_prealloc(d, SH_type_oos_snapshot, SHADOW_OOS_PAGES) ) -+ return; -+ - for(i = 0; i < SHADOW_OOS_PAGES; i++) -- { -- shadow_prealloc(d, SH_type_oos_snapshot, 1); - v->arch.paging.shadow.oos_snapshot[i] = - shadow_alloc(d, SH_type_oos_snapshot, 0); -- } - } - #endif /* OOS */ - -@@ -2470,6 +2493,9 @@ static void sh_update_paging_modes(struct vcpu *v) - mfn_t mmfn = sh_make_monitor_table( - v, v->arch.paging.mode->shadow.shadow_levels); - -+ if ( mfn_eq(mmfn, INVALID_MFN) ) -+ return; -+ - v->arch.hvm.monitor_table = pagetable_from_mfn(mmfn); - make_cr3(v, mmfn); - hvm_update_host_cr3(v); -@@ -2508,6 +2534,12 @@ static void sh_update_paging_modes(struct vcpu *v) - v->arch.hvm.monitor_table = pagetable_null(); - new_mfn = sh_make_monitor_table( - v, v->arch.paging.mode->shadow.shadow_levels); -+ if ( mfn_eq(new_mfn, INVALID_MFN) ) -+ { -+ sh_destroy_monitor_table(v, old_mfn, -+ old_mode->shadow.shadow_levels); -+ return; -+ } - v->arch.hvm.monitor_table = pagetable_from_mfn(new_mfn); - SHADOW_PRINTK("new monitor table %"PRI_mfn "\n", - mfn_x(new_mfn)); -@@ -2593,7 +2625,12 @@ void sh_set_toplevel_shadow(struct vcpu *v, - if ( !mfn_valid(smfn) ) - { - /* Make sure there's enough free shadow memory. */ -- shadow_prealloc(d, root_type, 1); -+ if ( !shadow_prealloc(d, root_type, 1) ) -+ { -+ new_entry = pagetable_null(); -+ goto install_new_entry; -+ } -+ - /* Shadow the page. 
*/ - smfn = make_shadow(v, gmfn, root_type); - } -diff --git a/xen/arch/x86/mm/shadow/hvm.c b/xen/arch/x86/mm/shadow/hvm.c -index 87fc57704f25..d68796c495b7 100644 ---- a/xen/arch/x86/mm/shadow/hvm.c -+++ b/xen/arch/x86/mm/shadow/hvm.c -@@ -700,7 +700,9 @@ mfn_t sh_make_monitor_table(const struct vcpu *v, unsigned int shadow_levels) - ASSERT(!pagetable_get_pfn(v->arch.hvm.monitor_table)); - - /* Guarantee we can get the memory we need */ -- shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS); -+ if ( !shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS) ) -+ return INVALID_MFN; -+ - m4mfn = shadow_alloc(d, SH_type_monitor_table, 0); - mfn_to_page(m4mfn)->shadow_flags = 4; - -diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c -index 7e0494cf7faa..6a9f82d39ce6 100644 ---- a/xen/arch/x86/mm/shadow/multi.c -+++ b/xen/arch/x86/mm/shadow/multi.c -@@ -2825,9 +2825,14 @@ static int sh_page_fault(struct vcpu *v, - * Preallocate shadow pages *before* removing writable accesses - * otherwhise an OOS L1 might be demoted and promoted again with - * writable mappings. */ -- shadow_prealloc(d, -- SH_type_l1_shadow, -- GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1); -+ if ( !shadow_prealloc(d, SH_type_l1_shadow, -+ GUEST_PAGING_LEVELS < 4 -+ ? 1 : GUEST_PAGING_LEVELS - 1) ) -+ { -+ paging_unlock(d); -+ put_gfn(d, gfn_x(gfn)); -+ return 0; -+ } - - rc = gw_remove_write_accesses(v, va, &gw); - -diff --git a/xen/arch/x86/mm/shadow/private.h b/xen/arch/x86/mm/shadow/private.h -index 911db46e7399..3fe0388e7c4f 100644 ---- a/xen/arch/x86/mm/shadow/private.h -+++ b/xen/arch/x86/mm/shadow/private.h -@@ -351,7 +351,8 @@ void shadow_promote(struct domain *d, mfn_t gmfn, u32 type); - void shadow_demote(struct domain *d, mfn_t gmfn, u32 type); - - /* Shadow page allocation functions */ --void shadow_prealloc(struct domain *d, u32 shadow_type, unsigned int count); -+bool __must_check shadow_prealloc(struct domain *d, unsigned int shadow_type, -+ unsigned int count); - mfn_t shadow_alloc(struct domain *d, - u32 shadow_type, - unsigned long backpointer); --- -2.37.4 - diff --git a/0047-x86-p2m-refuse-new-allocations-for-dying-domains.patch b/0047-x86-p2m-refuse-new-allocations-for-dying-domains.patch deleted file mode 100644 index 880b68d..0000000 --- a/0047-x86-p2m-refuse-new-allocations-for-dying-domains.patch +++ /dev/null @@ -1,100 +0,0 @@ -From 4f9b535194f70582863f2a78f113547d8822b2b9 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 11 Oct 2022 15:08:28 +0200 -Subject: [PATCH 047/126] x86/p2m: refuse new allocations for dying domains -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -This will in particular prevent any attempts to add entries to the p2m, -once - in a subsequent change - non-root entries have been removed. - -This is part of CVE-2022-33746 / XSA-410. 
- -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Tim Deegan <tim@xen.org> -master commit: ff600a8cf8e36f8ecbffecf96a035952e022ab87 -master date: 2022-10-11 14:23:22 +0200 ---- - xen/arch/x86/mm/hap/hap.c | 5 ++++- - xen/arch/x86/mm/shadow/common.c | 18 ++++++++++++++---- - 2 files changed, 18 insertions(+), 5 deletions(-) - -diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c -index d75dc2b9ed3d..787991233e53 100644 ---- a/xen/arch/x86/mm/hap/hap.c -+++ b/xen/arch/x86/mm/hap/hap.c -@@ -245,6 +245,9 @@ static struct page_info *hap_alloc(struct domain *d) - - ASSERT(paging_locked_by_me(d)); - -+ if ( unlikely(d->is_dying) ) -+ return NULL; -+ - pg = page_list_remove_head(&d->arch.paging.hap.freelist); - if ( unlikely(!pg) ) - return NULL; -@@ -281,7 +284,7 @@ static struct page_info *hap_alloc_p2m_page(struct domain *d) - d->arch.paging.hap.p2m_pages++; - ASSERT(!page_get_owner(pg) && !(pg->count_info & PGC_count_mask)); - } -- else if ( !d->arch.paging.p2m_alloc_failed ) -+ else if ( !d->arch.paging.p2m_alloc_failed && !d->is_dying ) - { - d->arch.paging.p2m_alloc_failed = 1; - dprintk(XENLOG_ERR, "d%i failed to allocate from HAP pool\n", -diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c -index fc4f7f78ce43..9ad7e5a88650 100644 ---- a/xen/arch/x86/mm/shadow/common.c -+++ b/xen/arch/x86/mm/shadow/common.c -@@ -938,6 +938,10 @@ static bool __must_check _shadow_prealloc(struct domain *d, unsigned int pages) - if ( d->arch.paging.shadow.free_pages >= pages ) - return true; - -+ if ( unlikely(d->is_dying) ) -+ /* No reclaim when the domain is dying, teardown will take care of it. */ -+ return false; -+ - /* Shouldn't have enabled shadows if we've no vcpus. */ - ASSERT(d->vcpu && d->vcpu[0]); - -@@ -990,7 +994,7 @@ static bool __must_check _shadow_prealloc(struct domain *d, unsigned int pages) - d->arch.paging.shadow.free_pages, - d->arch.paging.shadow.p2m_pages); - -- ASSERT(d->is_dying); -+ ASSERT_UNREACHABLE(); - - guest_flush_tlb_mask(d, d->dirty_cpumask); - -@@ -1004,10 +1008,13 @@ static bool __must_check _shadow_prealloc(struct domain *d, unsigned int pages) - * to avoid freeing shadows that the caller is currently working on. */ - bool shadow_prealloc(struct domain *d, unsigned int type, unsigned int count) - { -- bool ret = _shadow_prealloc(d, shadow_size(type) * count); -+ bool ret; - -- if ( !ret && !d->is_dying && -- (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) ) -+ if ( unlikely(d->is_dying) ) -+ return false; -+ -+ ret = _shadow_prealloc(d, shadow_size(type) * count); -+ if ( !ret && (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) ) - /* - * Failing to allocate memory required for shadow usage can only result in - * a domain crash, do it here rather that relying on every caller to do it. -@@ -1235,6 +1242,9 @@ shadow_alloc_p2m_page(struct domain *d) - { - struct page_info *pg = NULL; - -+ if ( unlikely(d->is_dying) ) -+ return NULL; -+ - /* This is called both from the p2m code (which never holds the - * paging lock) and the log-dirty code (which always does). 
*/ - paging_lock_recursive(d); --- -2.37.4 - diff --git a/0048-x86-p2m-truly-free-paging-pool-memory-for-dying-doma.patch b/0048-x86-p2m-truly-free-paging-pool-memory-for-dying-doma.patch deleted file mode 100644 index 280b6d8..0000000 --- a/0048-x86-p2m-truly-free-paging-pool-memory-for-dying-doma.patch +++ /dev/null @@ -1,115 +0,0 @@ -From 7f055b011a657f8f16b0df242301efb312058eea Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 11 Oct 2022 15:08:42 +0200 -Subject: [PATCH 048/126] x86/p2m: truly free paging pool memory for dying - domains -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Modify {hap,shadow}_free to free the page immediately if the domain is -dying, so that pages don't accumulate in the pool when -{shadow,hap}_final_teardown() get called. This is to limit the amount of -work which needs to be done there (in a non-preemptable manner). - -Note the call to shadow_free() in shadow_free_p2m_page() is moved after -increasing total_pages, so that the decrease done in shadow_free() in -case the domain is dying doesn't underflow the counter, even if just for -a short interval. - -This is part of CVE-2022-33746 / XSA-410. - -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Tim Deegan <tim@xen.org> -master commit: f50a2c0e1d057c00d6061f40ae24d068226052ad -master date: 2022-10-11 14:23:51 +0200 ---- - xen/arch/x86/mm/hap/hap.c | 12 ++++++++++++ - xen/arch/x86/mm/shadow/common.c | 28 +++++++++++++++++++++++++--- - 2 files changed, 37 insertions(+), 3 deletions(-) - -diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c -index 787991233e53..aef2297450e1 100644 ---- a/xen/arch/x86/mm/hap/hap.c -+++ b/xen/arch/x86/mm/hap/hap.c -@@ -265,6 +265,18 @@ static void hap_free(struct domain *d, mfn_t mfn) - - ASSERT(paging_locked_by_me(d)); - -+ /* -+ * For dying domains, actually free the memory here. This way less work is -+ * left to hap_final_teardown(), which cannot easily have preemption checks -+ * added. -+ */ -+ if ( unlikely(d->is_dying) ) -+ { -+ free_domheap_page(pg); -+ d->arch.paging.hap.total_pages--; -+ return; -+ } -+ - d->arch.paging.hap.free_pages++; - page_list_add_tail(pg, &d->arch.paging.hap.freelist); - } -diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c -index 9ad7e5a88650..366956c146aa 100644 ---- a/xen/arch/x86/mm/shadow/common.c -+++ b/xen/arch/x86/mm/shadow/common.c -@@ -1184,6 +1184,7 @@ mfn_t shadow_alloc(struct domain *d, - void shadow_free(struct domain *d, mfn_t smfn) - { - struct page_info *next = NULL, *sp = mfn_to_page(smfn); -+ bool dying = ACCESS_ONCE(d->is_dying); - struct page_list_head *pin_list; - unsigned int pages; - u32 shadow_type; -@@ -1226,11 +1227,32 @@ void shadow_free(struct domain *d, mfn_t smfn) - * just before the allocator hands the page out again. */ - page_set_tlbflush_timestamp(sp); - perfc_decr(shadow_alloc_count); -- page_list_add_tail(sp, &d->arch.paging.shadow.freelist); -+ -+ /* -+ * For dying domains, actually free the memory here. This way less -+ * work is left to shadow_final_teardown(), which cannot easily have -+ * preemption checks added. -+ */ -+ if ( unlikely(dying) ) -+ { -+ /* -+ * The backpointer field (sh.back) used by shadow code aliases the -+ * domain owner field, unconditionally clear it here to avoid -+ * free_domheap_page() attempting to parse it. 
-+ */ -+ page_set_owner(sp, NULL); -+ free_domheap_page(sp); -+ } -+ else -+ page_list_add_tail(sp, &d->arch.paging.shadow.freelist); -+ - sp = next; - } - -- d->arch.paging.shadow.free_pages += pages; -+ if ( unlikely(dying) ) -+ d->arch.paging.shadow.total_pages -= pages; -+ else -+ d->arch.paging.shadow.free_pages += pages; - } - - /* Divert a page from the pool to be used by the p2m mapping. -@@ -1300,9 +1322,9 @@ shadow_free_p2m_page(struct domain *d, struct page_info *pg) - * paging lock) and the log-dirty code (which always does). */ - paging_lock_recursive(d); - -- shadow_free(d, page_to_mfn(pg)); - d->arch.paging.shadow.p2m_pages--; - d->arch.paging.shadow.total_pages++; -+ shadow_free(d, page_to_mfn(pg)); - - paging_unlock(d); - } --- -2.37.4 - diff --git a/0049-x86-p2m-free-the-paging-memory-pool-preemptively.patch b/0049-x86-p2m-free-the-paging-memory-pool-preemptively.patch deleted file mode 100644 index aef6a24..0000000 --- a/0049-x86-p2m-free-the-paging-memory-pool-preemptively.patch +++ /dev/null @@ -1,181 +0,0 @@ -From 686c920fa9389fe2b6b619643024ed98b4b7d51f Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 11 Oct 2022 15:08:58 +0200 -Subject: [PATCH 049/126] x86/p2m: free the paging memory pool preemptively -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The paging memory pool is currently freed in two different places: -from {shadow,hap}_teardown() via domain_relinquish_resources() and -from {shadow,hap}_final_teardown() via complete_domain_destroy(). -While the former does handle preemption, the later doesn't. - -Attempt to move as much p2m related freeing as possible to happen -before the call to {shadow,hap}_teardown(), so that most memory can be -freed in a preemptive way. In order to avoid causing issues to -existing callers leave the root p2m page tables set and free them in -{hap,shadow}_final_teardown(). Also modify {hap,shadow}_free to free -the page immediately if the domain is dying, so that pages don't -accumulate in the pool when {shadow,hap}_final_teardown() get called. - -Move altp2m_vcpu_disable_ve() to be done in hap_teardown(), as that's -the place where altp2m_active gets disabled now. - -This is part of CVE-2022-33746 / XSA-410. 
- -Reported-by: Julien Grall <jgrall@amazon.com> -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Tim Deegan <tim@xen.org> -master commit: e7aa55c0aab36d994bf627c92bd5386ae167e16e -master date: 2022-10-11 14:24:21 +0200 ---- - xen/arch/x86/domain.c | 7 ------ - xen/arch/x86/mm/hap/hap.c | 42 ++++++++++++++++++++------------- - xen/arch/x86/mm/shadow/common.c | 12 ++++++++++ - 3 files changed, 38 insertions(+), 23 deletions(-) - -diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c -index 2838f976d729..ce6ddcf31397 100644 ---- a/xen/arch/x86/domain.c -+++ b/xen/arch/x86/domain.c -@@ -38,7 +38,6 @@ - #include <xen/livepatch.h> - #include <public/sysctl.h> - #include <public/hvm/hvm_vcpu.h> --#include <asm/altp2m.h> - #include <asm/regs.h> - #include <asm/mc146818rtc.h> - #include <asm/system.h> -@@ -2358,12 +2357,6 @@ int domain_relinquish_resources(struct domain *d) - vpmu_destroy(v); - } - -- if ( altp2m_active(d) ) -- { -- for_each_vcpu ( d, v ) -- altp2m_vcpu_disable_ve(v); -- } -- - if ( is_pv_domain(d) ) - { - for_each_vcpu ( d, v ) -diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c -index aef2297450e1..a44fcfd95e1e 100644 ---- a/xen/arch/x86/mm/hap/hap.c -+++ b/xen/arch/x86/mm/hap/hap.c -@@ -28,6 +28,7 @@ - #include <xen/domain_page.h> - #include <xen/guest_access.h> - #include <xen/keyhandler.h> -+#include <asm/altp2m.h> - #include <asm/event.h> - #include <asm/page.h> - #include <asm/current.h> -@@ -546,24 +547,8 @@ void hap_final_teardown(struct domain *d) - unsigned int i; - - if ( hvm_altp2m_supported() ) -- { -- d->arch.altp2m_active = 0; -- -- if ( d->arch.altp2m_eptp ) -- { -- free_xenheap_page(d->arch.altp2m_eptp); -- d->arch.altp2m_eptp = NULL; -- } -- -- if ( d->arch.altp2m_visible_eptp ) -- { -- free_xenheap_page(d->arch.altp2m_visible_eptp); -- d->arch.altp2m_visible_eptp = NULL; -- } -- - for ( i = 0; i < MAX_ALTP2M; i++ ) - p2m_teardown(d->arch.altp2m_p2m[i], true); -- } - - /* Destroy nestedp2m's first */ - for (i = 0; i < MAX_NESTEDP2M; i++) { -@@ -578,6 +563,8 @@ void hap_final_teardown(struct domain *d) - paging_lock(d); - hap_set_allocation(d, 0, NULL); - ASSERT(d->arch.paging.hap.p2m_pages == 0); -+ ASSERT(d->arch.paging.hap.free_pages == 0); -+ ASSERT(d->arch.paging.hap.total_pages == 0); - paging_unlock(d); - } - -@@ -603,6 +590,7 @@ void hap_vcpu_teardown(struct vcpu *v) - void hap_teardown(struct domain *d, bool *preempted) - { - struct vcpu *v; -+ unsigned int i; - - ASSERT(d->is_dying); - ASSERT(d != current->domain); -@@ -611,6 +599,28 @@ void hap_teardown(struct domain *d, bool *preempted) - for_each_vcpu ( d, v ) - hap_vcpu_teardown(v); - -+ /* Leave the root pt in case we get further attempts to modify the p2m. */ -+ if ( hvm_altp2m_supported() ) -+ { -+ if ( altp2m_active(d) ) -+ for_each_vcpu ( d, v ) -+ altp2m_vcpu_disable_ve(v); -+ -+ d->arch.altp2m_active = 0; -+ -+ FREE_XENHEAP_PAGE(d->arch.altp2m_eptp); -+ FREE_XENHEAP_PAGE(d->arch.altp2m_visible_eptp); -+ -+ for ( i = 0; i < MAX_ALTP2M; i++ ) -+ p2m_teardown(d->arch.altp2m_p2m[i], false); -+ } -+ -+ /* Destroy nestedp2m's after altp2m. 
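[Editor's note] A key part of the change above is freeing pool pages eagerly once the domain is dying, instead of returning them to the paging pool, so that the final non-preemptible teardown has little left to do. The standalone C sketch below shows only that "recycle normally, release immediately when dying" pattern; the structures and counters are invented for illustration.

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct page { struct page *next; };

    struct domain {
        bool is_dying;
        struct page *freelist;      /* paging-pool freelist */
        unsigned int total_pages;
        unsigned int free_pages;
    };

    /* Normally a freed page goes back onto the pool; once the domain is
     * dying it is released to the system immediately, so the pool does not
     * keep growing while teardown is in progress. */
    static void pool_free(struct domain *d, struct page *pg)
    {
        if (d->is_dying) {
            free(pg);
            d->total_pages--;
            return;
        }
        pg->next = d->freelist;
        d->freelist = pg;
        d->free_pages++;
    }

    int main(void)
    {
        struct domain d = { .is_dying = false, .total_pages = 2 };
        struct page *a = malloc(sizeof(*a)), *b = malloc(sizeof(*b));

        if (!a || !b)
            return 1;

        pool_free(&d, a);           /* recycled into the pool */
        d.is_dying = true;
        pool_free(&d, b);           /* released immediately */
        printf("total=%u free=%u\n", d.total_pages, d.free_pages);

        while (d.freelist) {        /* drain whatever stayed in the pool */
            struct page *pg = d.freelist;
            d.freelist = pg->next;
            free(pg);
        }
        return 0;
    }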
*/ -+ for ( i = 0; i < MAX_NESTEDP2M; i++ ) -+ p2m_teardown(d->arch.nested_p2m[i], false); -+ -+ p2m_teardown(p2m_get_hostp2m(d), false); -+ - paging_lock(d); /* Keep various asserts happy */ - - if ( d->arch.paging.hap.total_pages != 0 ) -diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c -index 366956c146aa..680766fd5170 100644 ---- a/xen/arch/x86/mm/shadow/common.c -+++ b/xen/arch/x86/mm/shadow/common.c -@@ -2891,8 +2891,17 @@ void shadow_teardown(struct domain *d, bool *preempted) - for_each_vcpu ( d, v ) - shadow_vcpu_teardown(v); - -+ p2m_teardown(p2m_get_hostp2m(d), false); -+ - paging_lock(d); - -+ /* -+ * Reclaim all shadow memory so that shadow_set_allocation() doesn't find -+ * in-use pages, as _shadow_prealloc() will no longer try to reclaim pages -+ * because the domain is dying. -+ */ -+ shadow_blow_tables(d); -+ - #if (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC)) - /* Free the virtual-TLB array attached to each vcpu */ - for_each_vcpu(d, v) -@@ -3013,6 +3022,9 @@ void shadow_final_teardown(struct domain *d) - d->arch.paging.shadow.total_pages, - d->arch.paging.shadow.free_pages, - d->arch.paging.shadow.p2m_pages); -+ ASSERT(!d->arch.paging.shadow.total_pages); -+ ASSERT(!d->arch.paging.shadow.free_pages); -+ ASSERT(!d->arch.paging.shadow.p2m_pages); - paging_unlock(d); - } - --- -2.37.4 - diff --git a/0050-xen-x86-p2m-Add-preemption-in-p2m_teardown.patch b/0050-xen-x86-p2m-Add-preemption-in-p2m_teardown.patch deleted file mode 100644 index 8ab565d..0000000 --- a/0050-xen-x86-p2m-Add-preemption-in-p2m_teardown.patch +++ /dev/null @@ -1,197 +0,0 @@ -From b03074bb47d10c9373688b3661c7c31da01c21a3 Mon Sep 17 00:00:00 2001 -From: Julien Grall <jgrall@amazon.com> -Date: Tue, 11 Oct 2022 15:09:12 +0200 -Subject: [PATCH 050/126] xen/x86: p2m: Add preemption in p2m_teardown() - -The list p2m->pages contain all the pages used by the P2M. On large -instance this can be quite large and the time spent to call -d->arch.paging.free_page() will take more than 1ms for a 80GB guest -on a Xen running in nested environment on a c5.metal. - -By extrapolation, it would take > 100ms for a 8TB guest (what we -current security support). So add some preemption in p2m_teardown() -and propagate to the callers. Note there are 3 places where -the preemption is not enabled: - - hap_final_teardown()/shadow_final_teardown(): We are - preventing update the P2M once the domain is dying (so - no more pages could be allocated) and most of the P2M pages - will be freed in preemptive manneer when relinquishing the - resources. So this is fine to disable preemption. - - shadow_enable(): This is fine because it will undo the allocation - that may have been made by p2m_alloc_table() (so only the root - page table). - -The preemption is arbitrarily checked every 1024 iterations. - -We now need to include <xen/event.h> in p2m-basic in order to -import the definition for local_events_need_delivery() used by -general_preempt_check(). Ideally, the inclusion should happen in -xen/sched.h but it opened a can of worms. - -Note that with the current approach, Xen doesn't keep track on whether -the alt/nested P2Ms have been cleared. So there are some redundant work. -However, this is not expected to incurr too much overhead (the P2M lock -shouldn't be contended during teardown). So this is optimization is -left outside of the security event. - -This is part of CVE-2022-33746 / XSA-410. 
- -Signed-off-by: Julien Grall <jgrall@amazon.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -master commit: 8a2111250b424edc49c65c4d41b276766d30635c -master date: 2022-10-11 14:24:48 +0200 ---- - xen/arch/x86/mm/hap/hap.c | 22 ++++++++++++++++------ - xen/arch/x86/mm/p2m.c | 18 +++++++++++++++--- - xen/arch/x86/mm/shadow/common.c | 12 +++++++++--- - xen/include/asm-x86/p2m.h | 2 +- - 4 files changed, 41 insertions(+), 13 deletions(-) - -diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c -index a44fcfd95e1e..1f9a157a0c34 100644 ---- a/xen/arch/x86/mm/hap/hap.c -+++ b/xen/arch/x86/mm/hap/hap.c -@@ -548,17 +548,17 @@ void hap_final_teardown(struct domain *d) - - if ( hvm_altp2m_supported() ) - for ( i = 0; i < MAX_ALTP2M; i++ ) -- p2m_teardown(d->arch.altp2m_p2m[i], true); -+ p2m_teardown(d->arch.altp2m_p2m[i], true, NULL); - - /* Destroy nestedp2m's first */ - for (i = 0; i < MAX_NESTEDP2M; i++) { -- p2m_teardown(d->arch.nested_p2m[i], true); -+ p2m_teardown(d->arch.nested_p2m[i], true, NULL); - } - - if ( d->arch.paging.hap.total_pages != 0 ) - hap_teardown(d, NULL); - -- p2m_teardown(p2m_get_hostp2m(d), true); -+ p2m_teardown(p2m_get_hostp2m(d), true, NULL); - /* Free any memory that the p2m teardown released */ - paging_lock(d); - hap_set_allocation(d, 0, NULL); -@@ -612,14 +612,24 @@ void hap_teardown(struct domain *d, bool *preempted) - FREE_XENHEAP_PAGE(d->arch.altp2m_visible_eptp); - - for ( i = 0; i < MAX_ALTP2M; i++ ) -- p2m_teardown(d->arch.altp2m_p2m[i], false); -+ { -+ p2m_teardown(d->arch.altp2m_p2m[i], false, preempted); -+ if ( preempted && *preempted ) -+ return; -+ } - } - - /* Destroy nestedp2m's after altp2m. */ - for ( i = 0; i < MAX_NESTEDP2M; i++ ) -- p2m_teardown(d->arch.nested_p2m[i], false); -+ { -+ p2m_teardown(d->arch.nested_p2m[i], false, preempted); -+ if ( preempted && *preempted ) -+ return; -+ } - -- p2m_teardown(p2m_get_hostp2m(d), false); -+ p2m_teardown(p2m_get_hostp2m(d), false, preempted); -+ if ( preempted && *preempted ) -+ return; - - paging_lock(d); /* Keep various asserts happy */ - -diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c -index 8ba73082c1bf..107f6778a6e1 100644 ---- a/xen/arch/x86/mm/p2m.c -+++ b/xen/arch/x86/mm/p2m.c -@@ -741,12 +741,13 @@ int p2m_alloc_table(struct p2m_domain *p2m) - * hvm fixme: when adding support for pvh non-hardware domains, this path must - * cleanup any foreign p2m types (release refcnts on them). - */ --void p2m_teardown(struct p2m_domain *p2m, bool remove_root) -+void p2m_teardown(struct p2m_domain *p2m, bool remove_root, bool *preempted) - /* Return all the p2m pages to Xen. 
- * We know we don't have any extra mappings to these pages */ - { - struct page_info *pg, *root_pg = NULL; - struct domain *d; -+ unsigned int i = 0; - - if (p2m == NULL) - return; -@@ -765,8 +766,19 @@ void p2m_teardown(struct p2m_domain *p2m, bool remove_root) - } - - while ( (pg = page_list_remove_head(&p2m->pages)) ) -- if ( pg != root_pg ) -- d->arch.paging.free_page(d, pg); -+ { -+ if ( pg == root_pg ) -+ continue; -+ -+ d->arch.paging.free_page(d, pg); -+ -+ /* Arbitrarily check preemption every 1024 iterations */ -+ if ( preempted && !(++i % 1024) && general_preempt_check() ) -+ { -+ *preempted = true; -+ break; -+ } -+ } - - if ( root_pg ) - page_list_add(root_pg, &p2m->pages); -diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c -index 680766fd5170..8f7fddcee1e5 100644 ---- a/xen/arch/x86/mm/shadow/common.c -+++ b/xen/arch/x86/mm/shadow/common.c -@@ -2837,8 +2837,12 @@ int shadow_enable(struct domain *d, u32 mode) - out_locked: - paging_unlock(d); - out_unlocked: -+ /* -+ * This is fine to ignore the preemption here because only the root -+ * will be allocated by p2m_alloc_table(). -+ */ - if ( rv != 0 && !pagetable_is_null(p2m_get_pagetable(p2m)) ) -- p2m_teardown(p2m, true); -+ p2m_teardown(p2m, true, NULL); - if ( rv != 0 && pg != NULL ) - { - pg->count_info &= ~PGC_count_mask; -@@ -2891,7 +2895,9 @@ void shadow_teardown(struct domain *d, bool *preempted) - for_each_vcpu ( d, v ) - shadow_vcpu_teardown(v); - -- p2m_teardown(p2m_get_hostp2m(d), false); -+ p2m_teardown(p2m_get_hostp2m(d), false, preempted); -+ if ( preempted && *preempted ) -+ return; - - paging_lock(d); - -@@ -3012,7 +3018,7 @@ void shadow_final_teardown(struct domain *d) - shadow_teardown(d, NULL); - - /* It is now safe to pull down the p2m map. */ -- p2m_teardown(p2m_get_hostp2m(d), true); -+ p2m_teardown(p2m_get_hostp2m(d), true, NULL); - /* Free any shadow memory that the p2m teardown released */ - paging_lock(d); - shadow_set_allocation(d, 0, NULL); -diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h -index 46eb51d44cf5..edbe4cee2717 100644 ---- a/xen/include/asm-x86/p2m.h -+++ b/xen/include/asm-x86/p2m.h -@@ -619,7 +619,7 @@ int p2m_init(struct domain *d); - int p2m_alloc_table(struct p2m_domain *p2m); - - /* Return all the p2m resources to Xen. */ --void p2m_teardown(struct p2m_domain *p2m, bool remove_root); -+void p2m_teardown(struct p2m_domain *p2m, bool remove_root, bool *preempted); - void p2m_final_teardown(struct domain *d); - - /* Add a page to a domain's p2m table */ --- -2.37.4 - diff --git a/0051-libxl-docs-Use-arch-specific-default-paging-memory.patch b/0051-libxl-docs-Use-arch-specific-default-paging-memory.patch deleted file mode 100644 index 4ec35bf..0000000 --- a/0051-libxl-docs-Use-arch-specific-default-paging-memory.patch +++ /dev/null @@ -1,147 +0,0 @@ -From 0c0680d6e7953ca4c91699e60060c732f9ead5c1 Mon Sep 17 00:00:00 2001 -From: Henry Wang <Henry.Wang@arm.com> -Date: Tue, 11 Oct 2022 15:09:32 +0200 -Subject: [PATCH 051/126] libxl, docs: Use arch-specific default paging memory - -The default paging memory (descibed in `shadow_memory` entry in xl -config) in libxl is used to determine the memory pool size for xl -guests. Currently this size is only used for x86, and contains a part -of RAM to shadow the resident processes. Since on Arm there is no -shadow mode guests, so the part of RAM to shadow the resident processes -is not necessary. 
Therefore, this commit splits the function -`libxl_get_required_shadow_memory()` to arch specific helpers and -renamed the helper to `libxl__arch_get_required_paging_memory()`. - -On x86, this helper calls the original value from -`libxl_get_required_shadow_memory()` so no functional change intended. - -On Arm, this helper returns 1MB per vcpu plus 4KB per MiB of RAM -for the P2M map and additional 512KB. - -Also update the xl.cfg documentation to add Arm documentation -according to code changes and correct the comment style following Xen -coding style. - -This is part of CVE-2022-33747 / XSA-409. - -Suggested-by: Julien Grall <jgrall@amazon.com> -Signed-off-by: Henry Wang <Henry.Wang@arm.com> -Reviewed-by: Anthony PERARD <anthony.perard@citrix.com> -master commit: 156a239ea288972425f967ac807b3cb5b5e14874 -master date: 2022-10-11 14:28:37 +0200 ---- - docs/man/xl.cfg.5.pod.in | 5 +++++ - tools/libs/light/libxl_arch.h | 4 ++++ - tools/libs/light/libxl_arm.c | 12 ++++++++++++ - tools/libs/light/libxl_utils.c | 9 ++------- - tools/libs/light/libxl_x86.c | 13 +++++++++++++ - 5 files changed, 36 insertions(+), 7 deletions(-) - -diff --git a/docs/man/xl.cfg.5.pod.in b/docs/man/xl.cfg.5.pod.in -index 56370a37dbb1..af7fae7c52f9 100644 ---- a/docs/man/xl.cfg.5.pod.in -+++ b/docs/man/xl.cfg.5.pod.in -@@ -1746,6 +1746,11 @@ are not using hardware assisted paging (i.e. you are using shadow - mode) and your guest workload consists of a very large number of - similar processes then increasing this value may improve performance. - -+On Arm, this field is used to determine the size of the guest P2M pages -+pool, and the default value is 1MB per vCPU plus 4KB per MB of RAM for -+the P2M map. Users should adjust this value if bigger P2M pool size is -+needed. -+ - =back - - =head3 Processor and Platform Features -diff --git a/tools/libs/light/libxl_arch.h b/tools/libs/light/libxl_arch.h -index 8527fc5c6c23..6741b7f6f457 100644 ---- a/tools/libs/light/libxl_arch.h -+++ b/tools/libs/light/libxl_arch.h -@@ -90,6 +90,10 @@ void libxl__arch_update_domain_config(libxl__gc *gc, - libxl_domain_config *dst, - const libxl_domain_config *src); - -+_hidden -+unsigned long libxl__arch_get_required_paging_memory(unsigned long maxmem_kb, -+ unsigned int smp_cpus); -+ - #if defined(__i386__) || defined(__x86_64__) - - #define LAPIC_BASE_ADDRESS 0xfee00000 -diff --git a/tools/libs/light/libxl_arm.c b/tools/libs/light/libxl_arm.c -index e2901f13b724..d59b464192c2 100644 ---- a/tools/libs/light/libxl_arm.c -+++ b/tools/libs/light/libxl_arm.c -@@ -154,6 +154,18 @@ out: - return rc; - } - -+unsigned long libxl__arch_get_required_paging_memory(unsigned long maxmem_kb, -+ unsigned int smp_cpus) -+{ -+ /* -+ * 256 pages (1MB) per vcpu, -+ * plus 1 page per MiB of RAM for the P2M map, -+ * This is higher than the minimum that Xen would allocate if no value -+ * were given (but the Xen minimum is for safety, not performance). 
-+ */ -+ return 4 * (256 * smp_cpus + maxmem_kb / 1024); -+} -+ - static struct arch_info { - const char *guest_type; - const char *timer_compat; -diff --git a/tools/libs/light/libxl_utils.c b/tools/libs/light/libxl_utils.c -index 4699c4a0a36f..e276c0ee9cc3 100644 ---- a/tools/libs/light/libxl_utils.c -+++ b/tools/libs/light/libxl_utils.c -@@ -18,6 +18,7 @@ - #include <ctype.h> - - #include "libxl_internal.h" -+#include "libxl_arch.h" - #include "_paths.h" - - #ifndef LIBXL_HAVE_NONCONST_LIBXL_BASENAME_RETURN_VALUE -@@ -39,13 +40,7 @@ char *libxl_basename(const char *name) - - unsigned long libxl_get_required_shadow_memory(unsigned long maxmem_kb, unsigned int smp_cpus) - { -- /* 256 pages (1MB) per vcpu, -- plus 1 page per MiB of RAM for the P2M map, -- plus 1 page per MiB of RAM to shadow the resident processes. -- This is higher than the minimum that Xen would allocate if no value -- were given (but the Xen minimum is for safety, not performance). -- */ -- return 4 * (256 * smp_cpus + 2 * (maxmem_kb / 1024)); -+ return libxl__arch_get_required_paging_memory(maxmem_kb, smp_cpus); - } - - char *libxl_domid_to_name(libxl_ctx *ctx, uint32_t domid) -diff --git a/tools/libs/light/libxl_x86.c b/tools/libs/light/libxl_x86.c -index 18c3c77ccde3..4d66478fe9dd 100644 ---- a/tools/libs/light/libxl_x86.c -+++ b/tools/libs/light/libxl_x86.c -@@ -882,6 +882,19 @@ void libxl__arch_update_domain_config(libxl__gc *gc, - libxl_defbool_val(src->b_info.arch_x86.msr_relaxed)); - } - -+unsigned long libxl__arch_get_required_paging_memory(unsigned long maxmem_kb, -+ unsigned int smp_cpus) -+{ -+ /* -+ * 256 pages (1MB) per vcpu, -+ * plus 1 page per MiB of RAM for the P2M map, -+ * plus 1 page per MiB of RAM to shadow the resident processes. -+ * This is higher than the minimum that Xen would allocate if no value -+ * were given (but the Xen minimum is for safety, not performance). -+ */ -+ return 4 * (256 * smp_cpus + 2 * (maxmem_kb / 1024)); -+} -+ - /* - * Local variables: - * mode: C --- -2.37.4 - diff --git a/0052-xen-arm-Construct-the-P2M-pages-pool-for-guests.patch b/0052-xen-arm-Construct-the-P2M-pages-pool-for-guests.patch deleted file mode 100644 index a17ad53..0000000 --- a/0052-xen-arm-Construct-the-P2M-pages-pool-for-guests.patch +++ /dev/null @@ -1,189 +0,0 @@ -From 45336d8f88725aec65ee177b1b09abf6eef1dc8d Mon Sep 17 00:00:00 2001 -From: Henry Wang <Henry.Wang@arm.com> -Date: Tue, 11 Oct 2022 15:09:58 +0200 -Subject: [PATCH 052/126] xen/arm: Construct the P2M pages pool for guests - -This commit constructs the p2m pages pool for guests from the -data structure and helper perspective. - -This is implemented by: - -- Adding a `struct paging_domain` which contains a freelist, a -counter variable and a spinlock to `struct arch_domain` to -indicate the free p2m pages and the number of p2m total pages in -the p2m pages pool. - -- Adding a helper `p2m_get_allocation` to get the p2m pool size. - -- Adding a helper `p2m_set_allocation` to set the p2m pages pool -size. This helper should be called before allocating memory for -a guest. - -- Adding a helper `p2m_teardown_allocation` to free the p2m pages -pool. This helper should be called during the xl domain destory. - -This is part of CVE-2022-33747 / XSA-409. 
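As a quick arithmetic check of the libxl sizing helpers shown a little earlier in this series (libxl__arch_get_required_paging_memory() for Arm and x86), the stand-alone program below evaluates both formulas exactly as they appear in the hunks; the 4-vCPU, 4 GiB guest is purely illustrative.

#include <stdio.h>

/*
 * Formulas copied from the libxl hunks above; results are in KiB
 * (4 KiB per page).  The guest size used below is only an example.
 */
static unsigned long arm_paging_kb(unsigned long maxmem_kb, unsigned int vcpus)
{
    return 4 * (256 * vcpus + maxmem_kb / 1024);
}

static unsigned long x86_shadow_kb(unsigned long maxmem_kb, unsigned int vcpus)
{
    return 4 * (256 * vcpus + 2 * (maxmem_kb / 1024));
}

int main(void)
{
    unsigned long maxmem_kb = 4UL * 1024 * 1024;  /* hypothetical 4 GiB guest */
    unsigned int vcpus = 4;

    printf("Arm default P2M pool:    %lu KiB (%lu MiB)\n",
           arm_paging_kb(maxmem_kb, vcpus),
           arm_paging_kb(maxmem_kb, vcpus) / 1024);
    printf("x86 default shadow pool: %lu KiB (%lu MiB)\n",
           x86_shadow_kb(maxmem_kb, vcpus),
           x86_shadow_kb(maxmem_kb, vcpus) / 1024);
    return 0;
}

For that guest the Arm formula works out to 20 MiB and the x86 default to 36 MiB, the difference being the extra page per MiB of RAM reserved for shadowing resident processes on x86.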
- -Signed-off-by: Henry Wang <Henry.Wang@arm.com> -Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> -master commit: 55914f7fc91a468649b8a3ec3f53ae1c4aca6670 -master date: 2022-10-11 14:28:39 +0200 ---- - xen/arch/arm/p2m.c | 88 ++++++++++++++++++++++++++++++++++++ - xen/include/asm-arm/domain.h | 10 ++++ - xen/include/asm-arm/p2m.h | 4 ++ - 3 files changed, 102 insertions(+) - -diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c -index 4ad3e0606e9c..6883d8627702 100644 ---- a/xen/arch/arm/p2m.c -+++ b/xen/arch/arm/p2m.c -@@ -50,6 +50,92 @@ static uint64_t generate_vttbr(uint16_t vmid, mfn_t root_mfn) - return (mfn_to_maddr(root_mfn) | ((uint64_t)vmid << 48)); - } - -+/* Return the size of the pool, rounded up to the nearest MB */ -+unsigned int p2m_get_allocation(struct domain *d) -+{ -+ unsigned long nr_pages = ACCESS_ONCE(d->arch.paging.p2m_total_pages); -+ -+ return ROUNDUP(nr_pages, 1 << (20 - PAGE_SHIFT)) >> (20 - PAGE_SHIFT); -+} -+ -+/* -+ * Set the pool of pages to the required number of pages. -+ * Returns 0 for success, non-zero for failure. -+ * Call with d->arch.paging.lock held. -+ */ -+int p2m_set_allocation(struct domain *d, unsigned long pages, bool *preempted) -+{ -+ struct page_info *pg; -+ -+ ASSERT(spin_is_locked(&d->arch.paging.lock)); -+ -+ for ( ; ; ) -+ { -+ if ( d->arch.paging.p2m_total_pages < pages ) -+ { -+ /* Need to allocate more memory from domheap */ -+ pg = alloc_domheap_page(NULL, 0); -+ if ( pg == NULL ) -+ { -+ printk(XENLOG_ERR "Failed to allocate P2M pages.\n"); -+ return -ENOMEM; -+ } -+ ACCESS_ONCE(d->arch.paging.p2m_total_pages) = -+ d->arch.paging.p2m_total_pages + 1; -+ page_list_add_tail(pg, &d->arch.paging.p2m_freelist); -+ } -+ else if ( d->arch.paging.p2m_total_pages > pages ) -+ { -+ /* Need to return memory to domheap */ -+ pg = page_list_remove_head(&d->arch.paging.p2m_freelist); -+ if( pg ) -+ { -+ ACCESS_ONCE(d->arch.paging.p2m_total_pages) = -+ d->arch.paging.p2m_total_pages - 1; -+ free_domheap_page(pg); -+ } -+ else -+ { -+ printk(XENLOG_ERR -+ "Failed to free P2M pages, P2M freelist is empty.\n"); -+ return -ENOMEM; -+ } -+ } -+ else -+ break; -+ -+ /* Check to see if we need to yield and try again */ -+ if ( preempted && general_preempt_check() ) -+ { -+ *preempted = true; -+ return -ERESTART; -+ } -+ } -+ -+ return 0; -+} -+ -+int p2m_teardown_allocation(struct domain *d) -+{ -+ int ret = 0; -+ bool preempted = false; -+ -+ spin_lock(&d->arch.paging.lock); -+ if ( d->arch.paging.p2m_total_pages != 0 ) -+ { -+ ret = p2m_set_allocation(d, 0, &preempted); -+ if ( preempted ) -+ { -+ spin_unlock(&d->arch.paging.lock); -+ return -ERESTART; -+ } -+ ASSERT(d->arch.paging.p2m_total_pages == 0); -+ } -+ spin_unlock(&d->arch.paging.lock); -+ -+ return ret; -+} -+ - /* Unlock the flush and do a P2M TLB flush if necessary */ - void p2m_write_unlock(struct p2m_domain *p2m) - { -@@ -1602,7 +1688,9 @@ int p2m_init(struct domain *d) - unsigned int cpu; - - rwlock_init(&p2m->lock); -+ spin_lock_init(&d->arch.paging.lock); - INIT_PAGE_LIST_HEAD(&p2m->pages); -+ INIT_PAGE_LIST_HEAD(&d->arch.paging.p2m_freelist); - - p2m->vmid = INVALID_VMID; - -diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h -index bb0a6adbe00b..1d8935778f3b 100644 ---- a/xen/include/asm-arm/domain.h -+++ b/xen/include/asm-arm/domain.h -@@ -40,6 +40,14 @@ struct vtimer { - uint64_t cval; - }; - -+struct paging_domain { -+ spinlock_t lock; -+ /* Free P2M pages from the pre-allocated P2M pool */ -+ struct page_list_head p2m_freelist; -+ /* Number of 
pages from the pre-allocated P2M pool */ -+ unsigned long p2m_total_pages; -+}; -+ - struct arch_domain - { - #ifdef CONFIG_ARM_64 -@@ -51,6 +59,8 @@ struct arch_domain - - struct hvm_domain hvm; - -+ struct paging_domain paging; -+ - struct vmmio vmmio; - - /* Continuable domain_relinquish_resources(). */ -diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h -index 3a2d51b35d71..18675b234570 100644 ---- a/xen/include/asm-arm/p2m.h -+++ b/xen/include/asm-arm/p2m.h -@@ -218,6 +218,10 @@ void p2m_restore_state(struct vcpu *n); - /* Print debugging/statistial info about a domain's p2m */ - void p2m_dump_info(struct domain *d); - -+unsigned int p2m_get_allocation(struct domain *d); -+int p2m_set_allocation(struct domain *d, unsigned long pages, bool *preempted); -+int p2m_teardown_allocation(struct domain *d); -+ - static inline void p2m_write_lock(struct p2m_domain *p2m) - { - write_lock(&p2m->lock); --- -2.37.4 - diff --git a/0053-xen-arm-libxl-Implement-XEN_DOMCTL_shadow_op-for-Arm.patch b/0053-xen-arm-libxl-Implement-XEN_DOMCTL_shadow_op-for-Arm.patch deleted file mode 100644 index c4e543d..0000000 --- a/0053-xen-arm-libxl-Implement-XEN_DOMCTL_shadow_op-for-Arm.patch +++ /dev/null @@ -1,109 +0,0 @@ -From c5215044578e88b401a1296ed6302df05c113c5f Mon Sep 17 00:00:00 2001 -From: Henry Wang <Henry.Wang@arm.com> -Date: Tue, 11 Oct 2022 15:10:16 +0200 -Subject: [PATCH 053/126] xen/arm, libxl: Implement XEN_DOMCTL_shadow_op for - Arm - -This commit implements the `XEN_DOMCTL_shadow_op` support in Xen -for Arm. The p2m pages pool size for xl guests is supposed to be -determined by `XEN_DOMCTL_shadow_op`. Hence, this commit: - -- Introduces a function `p2m_domctl` and implements the subops -`XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION` and -`XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION` of `XEN_DOMCTL_shadow_op`. - -- Adds the `XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION` support in libxl. - -Therefore enabling the setting of shadow memory pool size -when creating a guest from xl and getting shadow memory pool size -from Xen. - -Note that the `XEN_DOMCTL_shadow_op` added in this commit is only -a dummy op, and the functionality of setting/getting p2m memory pool -size for xl guests will be added in following commits. - -This is part of CVE-2022-33747 / XSA-409. 
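The pages-to-MiB conversion used by p2m_get_allocation() in the hunk above is simply a round-up to the next multiple of 256 4-KiB pages. A small stand-alone check follows, with a locally defined ROUNDUP() that matches the power-of-two rounding being relied on and arbitrary sample page counts; it is an illustration, not the hypervisor macro.

#include <stdio.h>

#define PAGE_SHIFT 12
/* Round x up to a multiple of a; a must be a power of two here. */
#define ROUNDUP(x, a) (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

/*
 * Mirrors the calculation in p2m_get_allocation() above: report the pool
 * size in MiB, rounded up to the nearest MiB (256 pages of 4 KiB each).
 */
static unsigned int pool_mb(unsigned long nr_pages)
{
    return ROUNDUP(nr_pages, 1UL << (20 - PAGE_SHIFT)) >> (20 - PAGE_SHIFT);
}

int main(void)
{
    /* Arbitrary sample page counts. */
    unsigned long samples[] = { 0, 1, 255, 256, 257, 5120 };
    unsigned int i;

    for ( i = 0; i < sizeof(samples) / sizeof(samples[0]); i++ )
        printf("%5lu pages -> %u MiB\n", samples[i], pool_mb(samples[i]));

    return 0;
}

So 1, 255 and 256 pages all report as 1 MiB, 257 pages as 2 MiB, and 5120 pages as exactly 20 MiB.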
- -Signed-off-by: Henry Wang <Henry.Wang@arm.com> -Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> -master commit: cf2a68d2ffbc3ce95e01449d46180bddb10d24a0 -master date: 2022-10-11 14:28:42 +0200 ---- - tools/libs/light/libxl_arm.c | 12 ++++++++++++ - xen/arch/arm/domctl.c | 32 ++++++++++++++++++++++++++++++++ - 2 files changed, 44 insertions(+) - -diff --git a/tools/libs/light/libxl_arm.c b/tools/libs/light/libxl_arm.c -index d59b464192c2..d21f614ed788 100644 ---- a/tools/libs/light/libxl_arm.c -+++ b/tools/libs/light/libxl_arm.c -@@ -131,6 +131,18 @@ int libxl__arch_domain_create(libxl__gc *gc, - libxl__domain_build_state *state, - uint32_t domid) - { -+ libxl_ctx *ctx = libxl__gc_owner(gc); -+ unsigned int shadow_mb = DIV_ROUNDUP(d_config->b_info.shadow_memkb, 1024); -+ -+ int r = xc_shadow_control(ctx->xch, domid, -+ XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION, -+ &shadow_mb, 0); -+ if (r) { -+ LOGED(ERROR, domid, -+ "Failed to set %u MiB shadow allocation", shadow_mb); -+ return ERROR_FAIL; -+ } -+ - return 0; - } - -diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c -index a8c48b0beaab..a049bc7f3e52 100644 ---- a/xen/arch/arm/domctl.c -+++ b/xen/arch/arm/domctl.c -@@ -45,11 +45,43 @@ static int handle_vuart_init(struct domain *d, - return rc; - } - -+static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc, -+ XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) -+{ -+ if ( unlikely(d == current->domain) ) -+ { -+ printk(XENLOG_ERR "Tried to do a p2m domctl op on itself.\n"); -+ return -EINVAL; -+ } -+ -+ if ( unlikely(d->is_dying) ) -+ { -+ printk(XENLOG_ERR "Tried to do a p2m domctl op on dying domain %u\n", -+ d->domain_id); -+ return -EINVAL; -+ } -+ -+ switch ( sc->op ) -+ { -+ case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION: -+ return 0; -+ case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION: -+ return 0; -+ default: -+ { -+ printk(XENLOG_ERR "Bad p2m domctl op %u\n", sc->op); -+ return -EINVAL; -+ } -+ } -+} -+ - long arch_do_domctl(struct xen_domctl *domctl, struct domain *d, - XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) - { - switch ( domctl->cmd ) - { -+ case XEN_DOMCTL_shadow_op: -+ return p2m_domctl(d, &domctl->u.shadow_op, u_domctl); - case XEN_DOMCTL_cacheflush: - { - gfn_t s = _gfn(domctl->u.cacheflush.start_pfn); --- -2.37.4 - diff --git a/0054-xen-arm-Allocate-and-free-P2M-pages-from-the-P2M-poo.patch b/0054-xen-arm-Allocate-and-free-P2M-pages-from-the-P2M-poo.patch deleted file mode 100644 index 78ce712..0000000 --- a/0054-xen-arm-Allocate-and-free-P2M-pages-from-the-P2M-poo.patch +++ /dev/null @@ -1,290 +0,0 @@ -From 7ad38a39f08aadc1578bdb46ccabaad79ed0faee Mon Sep 17 00:00:00 2001 -From: Henry Wang <Henry.Wang@arm.com> -Date: Tue, 11 Oct 2022 15:10:34 +0200 -Subject: [PATCH 054/126] xen/arm: Allocate and free P2M pages from the P2M - pool - -This commit sets/tearsdown of p2m pages pool for non-privileged Arm -guests by calling `p2m_set_allocation` and `p2m_teardown_allocation`. - -- For dom0, P2M pages should come from heap directly instead of p2m -pool, so that the kernel may take advantage of the extended regions. - -- For xl guests, the setting of the p2m pool is called in -`XEN_DOMCTL_shadow_op` and the p2m pool is destroyed in -`domain_relinquish_resources`. Note that domctl->u.shadow_op.mb is -updated with the new size when setting the p2m pool. - -- For dom0less domUs, the setting of the p2m pool is called before -allocating memory during domain creation. Users can specify the p2m -pool size by `xen,domain-p2m-mem-mb` dts property. 
- -To actually allocate/free pages from the p2m pool, this commit adds -two helper functions namely `p2m_alloc_page` and `p2m_free_page` to -`struct p2m_domain`. By replacing the `alloc_domheap_page` and -`free_domheap_page` with these two helper functions, p2m pages can -be added/removed from the list of p2m pool rather than from the heap. - -Since page from `p2m_alloc_page` is cleaned, take the opportunity -to remove the redundant `clean_page` in `p2m_create_table`. - -This is part of CVE-2022-33747 / XSA-409. - -Signed-off-by: Henry Wang <Henry.Wang@arm.com> -Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> -master commit: cbea5a1149ca7fd4b7cdbfa3ec2e4f109b601ff7 -master date: 2022-10-11 14:28:44 +0200 ---- - docs/misc/arm/device-tree/booting.txt | 8 ++++ - xen/arch/arm/domain.c | 6 +++ - xen/arch/arm/domain_build.c | 29 ++++++++++++++ - xen/arch/arm/domctl.c | 23 ++++++++++- - xen/arch/arm/p2m.c | 57 +++++++++++++++++++++++++-- - 5 files changed, 118 insertions(+), 5 deletions(-) - -diff --git a/docs/misc/arm/device-tree/booting.txt b/docs/misc/arm/device-tree/booting.txt -index 5243bc7fd344..470c9491a781 100644 ---- a/docs/misc/arm/device-tree/booting.txt -+++ b/docs/misc/arm/device-tree/booting.txt -@@ -164,6 +164,14 @@ with the following properties: - Both #address-cells and #size-cells need to be specified because - both sub-nodes (described shortly) have reg properties. - -+- xen,domain-p2m-mem-mb -+ -+ Optional. A 32-bit integer specifying the amount of megabytes of RAM -+ used for the domain P2M pool. This is in-sync with the shadow_memory -+ option in xl.cfg. Leaving this field empty in device tree will lead to -+ the default size of domain P2M pool, i.e. 1MB per guest vCPU plus 4KB -+ per MB of guest RAM plus 512KB for guest extended regions. -+ - Under the "xen,domain" compatible node, one or more sub-nodes are present - for the DomU kernel and ramdisk. - -diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c -index 223ec9694df1..a5ffd952ecd0 100644 ---- a/xen/arch/arm/domain.c -+++ b/xen/arch/arm/domain.c -@@ -985,6 +985,7 @@ enum { - PROG_page, - PROG_mapping, - PROG_p2m, -+ PROG_p2m_pool, - PROG_done, - }; - -@@ -1044,6 +1045,11 @@ int domain_relinquish_resources(struct domain *d) - if ( ret ) - return ret; - -+ PROGRESS(p2m_pool): -+ ret = p2m_teardown_allocation(d); -+ if( ret ) -+ return ret; -+ - PROGRESS(done): - break; - -diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c -index 26c13429488d..df0ec84f034c 100644 ---- a/xen/arch/arm/domain_build.c -+++ b/xen/arch/arm/domain_build.c -@@ -2333,6 +2333,21 @@ static void __init find_gnttab_region(struct domain *d, - kinfo->gnttab_start, kinfo->gnttab_start + kinfo->gnttab_size); - } - -+static unsigned long __init domain_p2m_pages(unsigned long maxmem_kb, -+ unsigned int smp_cpus) -+{ -+ /* -+ * Keep in sync with libxl__get_required_paging_memory(). -+ * 256 pages (1MB) per vcpu, plus 1 page per MiB of RAM for the P2M map, -+ * plus 128 pages to cover extended regions. 
-+ */ -+ unsigned long memkb = 4 * (256 * smp_cpus + (maxmem_kb / 1024) + 128); -+ -+ BUILD_BUG_ON(PAGE_SIZE != SZ_4K); -+ -+ return DIV_ROUND_UP(memkb, 1024) << (20 - PAGE_SHIFT); -+} -+ - static int __init construct_domain(struct domain *d, struct kernel_info *kinfo) - { - unsigned int i; -@@ -2424,6 +2439,8 @@ static int __init construct_domU(struct domain *d, - struct kernel_info kinfo = {}; - int rc; - u64 mem; -+ u32 p2m_mem_mb; -+ unsigned long p2m_pages; - - rc = dt_property_read_u64(node, "memory", &mem); - if ( !rc ) -@@ -2433,6 +2450,18 @@ static int __init construct_domU(struct domain *d, - } - kinfo.unassigned_mem = (paddr_t)mem * SZ_1K; - -+ rc = dt_property_read_u32(node, "xen,domain-p2m-mem-mb", &p2m_mem_mb); -+ /* If xen,domain-p2m-mem-mb is not specified, use the default value. */ -+ p2m_pages = rc ? -+ p2m_mem_mb << (20 - PAGE_SHIFT) : -+ domain_p2m_pages(mem, d->max_vcpus); -+ -+ spin_lock(&d->arch.paging.lock); -+ rc = p2m_set_allocation(d, p2m_pages, NULL); -+ spin_unlock(&d->arch.paging.lock); -+ if ( rc != 0 ) -+ return rc; -+ - printk("*** LOADING DOMU cpus=%u memory=%"PRIx64"KB ***\n", d->max_vcpus, mem); - - kinfo.vpl011 = dt_property_read_bool(node, "vpl011"); -diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c -index a049bc7f3e52..4ab5ed4ab24d 100644 ---- a/xen/arch/arm/domctl.c -+++ b/xen/arch/arm/domctl.c -@@ -48,6 +48,9 @@ static int handle_vuart_init(struct domain *d, - static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc, - XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) - { -+ long rc; -+ bool preempted = false; -+ - if ( unlikely(d == current->domain) ) - { - printk(XENLOG_ERR "Tried to do a p2m domctl op on itself.\n"); -@@ -64,9 +67,27 @@ static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc, - switch ( sc->op ) - { - case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION: -- return 0; -+ { -+ /* Allow and handle preemption */ -+ spin_lock(&d->arch.paging.lock); -+ rc = p2m_set_allocation(d, sc->mb << (20 - PAGE_SHIFT), &preempted); -+ spin_unlock(&d->arch.paging.lock); -+ -+ if ( preempted ) -+ /* Not finished. Set up to re-run the call. */ -+ rc = hypercall_create_continuation(__HYPERVISOR_domctl, "h", -+ u_domctl); -+ else -+ /* Finished. Return the new allocation. */ -+ sc->mb = p2m_get_allocation(d); -+ -+ return rc; -+ } - case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION: -+ { -+ sc->mb = p2m_get_allocation(d); - return 0; -+ } - default: - { - printk(XENLOG_ERR "Bad p2m domctl op %u\n", sc->op); -diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c -index 6883d8627702..c1055ff2a745 100644 ---- a/xen/arch/arm/p2m.c -+++ b/xen/arch/arm/p2m.c -@@ -50,6 +50,54 @@ static uint64_t generate_vttbr(uint16_t vmid, mfn_t root_mfn) - return (mfn_to_maddr(root_mfn) | ((uint64_t)vmid << 48)); - } - -+static struct page_info *p2m_alloc_page(struct domain *d) -+{ -+ struct page_info *pg; -+ -+ spin_lock(&d->arch.paging.lock); -+ /* -+ * For hardware domain, there should be no limit in the number of pages that -+ * can be allocated, so that the kernel may take advantage of the extended -+ * regions. Hence, allocate p2m pages for hardware domains from heap. 
-+ */ -+ if ( is_hardware_domain(d) ) -+ { -+ pg = alloc_domheap_page(NULL, 0); -+ if ( pg == NULL ) -+ { -+ printk(XENLOG_G_ERR "Failed to allocate P2M pages for hwdom.\n"); -+ spin_unlock(&d->arch.paging.lock); -+ return NULL; -+ } -+ } -+ else -+ { -+ pg = page_list_remove_head(&d->arch.paging.p2m_freelist); -+ if ( unlikely(!pg) ) -+ { -+ spin_unlock(&d->arch.paging.lock); -+ return NULL; -+ } -+ d->arch.paging.p2m_total_pages--; -+ } -+ spin_unlock(&d->arch.paging.lock); -+ -+ return pg; -+} -+ -+static void p2m_free_page(struct domain *d, struct page_info *pg) -+{ -+ spin_lock(&d->arch.paging.lock); -+ if ( is_hardware_domain(d) ) -+ free_domheap_page(pg); -+ else -+ { -+ d->arch.paging.p2m_total_pages++; -+ page_list_add_tail(pg, &d->arch.paging.p2m_freelist); -+ } -+ spin_unlock(&d->arch.paging.lock); -+} -+ - /* Return the size of the pool, rounded up to the nearest MB */ - unsigned int p2m_get_allocation(struct domain *d) - { -@@ -751,7 +799,7 @@ static int p2m_create_table(struct p2m_domain *p2m, lpae_t *entry) - - ASSERT(!p2m_is_valid(*entry)); - -- page = alloc_domheap_page(NULL, 0); -+ page = p2m_alloc_page(p2m->domain); - if ( page == NULL ) - return -ENOMEM; - -@@ -878,7 +926,7 @@ static void p2m_free_entry(struct p2m_domain *p2m, - pg = mfn_to_page(mfn); - - page_list_del(pg, &p2m->pages); -- free_domheap_page(pg); -+ p2m_free_page(p2m->domain, pg); - } - - static bool p2m_split_superpage(struct p2m_domain *p2m, lpae_t *entry, -@@ -902,7 +950,7 @@ static bool p2m_split_superpage(struct p2m_domain *p2m, lpae_t *entry, - ASSERT(level < target); - ASSERT(p2m_is_superpage(*entry, level)); - -- page = alloc_domheap_page(NULL, 0); -+ page = p2m_alloc_page(p2m->domain); - if ( !page ) - return false; - -@@ -1644,7 +1692,7 @@ int p2m_teardown(struct domain *d) - - while ( (pg = page_list_remove_head(&p2m->pages)) ) - { -- free_domheap_page(pg); -+ p2m_free_page(p2m->domain, pg); - count++; - /* Arbitrarily preempt every 512 iterations */ - if ( !(count % 512) && hypercall_preempt_check() ) -@@ -1668,6 +1716,7 @@ void p2m_final_teardown(struct domain *d) - return; - - ASSERT(page_list_empty(&p2m->pages)); -+ ASSERT(page_list_empty(&d->arch.paging.p2m_freelist)); - - if ( p2m->root ) - free_domheap_pages(p2m->root, P2M_ROOT_ORDER); --- -2.37.4 - diff --git a/0055-gnttab-correct-locking-on-transitive-grant-copy-erro.patch b/0055-gnttab-correct-locking-on-transitive-grant-copy-erro.patch deleted file mode 100644 index 5b8a7ea..0000000 --- a/0055-gnttab-correct-locking-on-transitive-grant-copy-erro.patch +++ /dev/null @@ -1,66 +0,0 @@ -From bb43a10fefe494ab747b020fef3e823b63fc566d Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 11 Oct 2022 15:11:01 +0200 -Subject: [PATCH 055/126] gnttab: correct locking on transitive grant copy - error path - -While the comment next to the lock dropping in preparation of -recursively calling acquire_grant_for_copy() mistakenly talks about the -rd == td case (excluded a few lines further up), the same concerns apply -to the calling of release_grant_for_copy() on a subsequent error path. - -This is CVE-2022-33748 / XSA-411. 
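The locking rule being applied here is the usual defence against lock-order inversion: never call into code that takes another table's lock while still holding your own; drop, call, re-acquire, and then re-derive anything that may have changed in between. Below is a generic pthread illustration of that pattern — the lock and function names are invented for the sketch and are not the grant-table code, which follows in the patch body.

#include <pthread.h>
#include <stdio.h>

/*
 * Two table locks with no fixed ordering between them; the names are made
 * up for this illustration.
 */
static pthread_mutex_t local_table = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t remote_table = PTHREAD_MUTEX_INITIALIZER;

/* Helper that internally needs the remote table's lock. */
static void update_remote_state(void)
{
    pthread_mutex_lock(&remote_table);
    /* ... adjust the remote side's pin counts / status flags ... */
    pthread_mutex_unlock(&remote_table);
}

static void error_path(void)
{
    pthread_mutex_lock(&local_table);
    /* ... detect the mismatch that forces the undo ... */

    /*
     * Drop our own lock before calling into code that takes the other
     * table's lock, then re-acquire it.  Keeping local_table held across
     * the call could deadlock against a thread doing the mirror-image
     * operation in the opposite direction.
     */
    pthread_mutex_unlock(&local_table);
    update_remote_state();
    pthread_mutex_lock(&local_table);

    /* ... re-derive any state that may have changed while unlocked ... */
    pthread_mutex_unlock(&local_table);
}

int main(void)
{
    error_path();
    puts("done");
    return 0;
}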
- -Fixes: ad48fb963dbf ("gnttab: fix transitive grant handling") -Signed-off-by: Jan Beulich <jbeulich@suse.com> -master commit: 6e3aab858eef614a21a782a3b73acc88e74690ea -master date: 2022-10-11 14:29:30 +0200 ---- - xen/common/grant_table.c | 19 ++++++++++++++++--- - 1 file changed, 16 insertions(+), 3 deletions(-) - -diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c -index 77bba9806937..0523beb9b734 100644 ---- a/xen/common/grant_table.c -+++ b/xen/common/grant_table.c -@@ -2608,9 +2608,8 @@ acquire_grant_for_copy( - trans_domid); - - /* -- * acquire_grant_for_copy() could take the lock on the -- * remote table (if rd == td), so we have to drop the lock -- * here and reacquire. -+ * acquire_grant_for_copy() will take the lock on the remote table, -+ * so we have to drop the lock here and reacquire. - */ - active_entry_release(act); - grant_read_unlock(rgt); -@@ -2647,11 +2646,25 @@ acquire_grant_for_copy( - act->trans_gref != trans_gref || - !act->is_sub_page)) ) - { -+ /* -+ * Like above for acquire_grant_for_copy() we need to drop and then -+ * re-acquire the locks here to prevent lock order inversion issues. -+ * Unlike for acquire_grant_for_copy() we don't need to re-check -+ * anything, as release_grant_for_copy() doesn't depend on the grant -+ * table entry: It only updates internal state and the status flags. -+ */ -+ active_entry_release(act); -+ grant_read_unlock(rgt); -+ - release_grant_for_copy(td, trans_gref, readonly); - rcu_unlock_domain(td); -+ -+ grant_read_lock(rgt); -+ act = active_entry_acquire(rgt, gref); - reduce_status_for_pin(rd, act, status, readonly); - active_entry_release(act); - grant_read_unlock(rgt); -+ - put_page(*page); - *page = NULL; - return ERESTART; --- -2.37.4 - diff --git a/0056-tools-libxl-Replace-deprecated-soundhw-on-QEMU-comma.patch b/0056-tools-libxl-Replace-deprecated-soundhw-on-QEMU-comma.patch deleted file mode 100644 index 80a1923..0000000 --- a/0056-tools-libxl-Replace-deprecated-soundhw-on-QEMU-comma.patch +++ /dev/null @@ -1,112 +0,0 @@ -From d65ebacb78901b695bc5e8a075ad1ad865a78928 Mon Sep 17 00:00:00 2001 -From: Anthony PERARD <anthony.perard@citrix.com> -Date: Tue, 11 Oct 2022 15:13:15 +0200 -Subject: [PATCH 056/126] tools/libxl: Replace deprecated -soundhw on QEMU - command line - --soundhw is deprecated since 825ff02911c9 ("audio: add soundhw -deprecation notice"), QEMU v5.1, and is been remove for upcoming v7.1 -by 039a68373c45 ("introduce -audio as a replacement for -soundhw"). - -Instead we can just add the sound card with "-device", for most option -that "-soundhw" could handle. "-device" is an option that existed -before QEMU 1.0, and could already be used to add audio hardware. - -The list of possible option for libxl's "soundhw" is taken the list -from QEMU 7.0. - -The list of options for "soundhw" are listed in order of preference in -the manual. The first three (hda, ac97, es1370) are PCI devices and -easy to test on Linux, and the last four are ISA devices which doesn't -seems to work out of the box on linux. - -The sound card 'pcspk' isn't listed even if it used to be accepted by -'-soundhw' because QEMU crash when trying to add it to a Xen domain. -Also, it wouldn't work with "-device" might need to be "-machine -pcspk-audiodev=default" instead. 
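The resulting QEMU arguments are easiest to see as a table: every supported card becomes a plain "-device" argument, except hda, which expands to the intel-hda controller plus an hda-duplex codec. The small demonstration program below mirrors the mapping in the libxl hunk further down; the card list and device names are the ones from the patch, nothing here introduces new QEMU syntax.

#include <stdio.h>
#include <string.h>

/*
 * Mirrors the mapping in the libxl hunk below: every card becomes a plain
 * "-device" argument, except hda, which expands to the intel-hda controller
 * plus an hda-duplex codec.
 */
static void print_qemu_args(const char *soundhw)
{
    if ( !strcmp(soundhw, "hda") )
        printf("-device intel-hda -device hda-duplex\n");
    else
        printf("-device %s\n", soundhw);
}

int main(void)
{
    const char *cards[] = { "hda", "ac97", "es1370", "adlib",
                            "cs4231a", "gus", "sb16" };
    unsigned int i;

    for ( i = 0; i < sizeof(cards) / sizeof(cards[0]); i++ )
    {
        printf("soundhw=%-8s -> ", cards[i]);
        print_qemu_args(cards[i]);
    }

    return 0;
}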
- -Signed-off-by: Anthony PERARD <anthony.perard@citrix.com> -Reviewed-by: Jason Andryuk <jandryuk@gmail.com> -master commit: 62ca138c2c052187783aca3957d3f47c4dcfd683 -master date: 2022-08-18 09:25:50 +0200 ---- - docs/man/xl.cfg.5.pod.in | 6 +++--- - tools/libs/light/libxl_dm.c | 19 ++++++++++++++++++- - tools/libs/light/libxl_types_internal.idl | 10 ++++++++++ - 3 files changed, 31 insertions(+), 4 deletions(-) - -diff --git a/docs/man/xl.cfg.5.pod.in b/docs/man/xl.cfg.5.pod.in -index af7fae7c52f9..ef9505f91341 100644 ---- a/docs/man/xl.cfg.5.pod.in -+++ b/docs/man/xl.cfg.5.pod.in -@@ -2523,9 +2523,9 @@ The form serial=DEVICE is also accepted for backwards compatibility. - - =item B<soundhw="DEVICE"> - --Select the virtual sound card to expose to the guest. The valid --devices are defined by the device model configuration, please see the --B<qemu(1)> manpage for details. The default is not to export any sound -+Select the virtual sound card to expose to the guest. The valid devices are -+B<hda>, B<ac97>, B<es1370>, B<adlib>, B<cs4231a>, B<gus>, B<sb16> if there are -+available with the device model QEMU. The default is not to export any sound - device. - - =item B<vkb_device=BOOLEAN> -diff --git a/tools/libs/light/libxl_dm.c b/tools/libs/light/libxl_dm.c -index ae5f35e0c3fd..b86e8ccc858f 100644 ---- a/tools/libs/light/libxl_dm.c -+++ b/tools/libs/light/libxl_dm.c -@@ -1204,6 +1204,7 @@ static int libxl__build_device_model_args_new(libxl__gc *gc, - uint64_t ram_size; - const char *path, *chardev; - bool is_stubdom = libxl_defbool_val(b_info->device_model_stubdomain); -+ int rc; - - dm_args = flexarray_make(gc, 16, 1); - dm_envs = flexarray_make(gc, 16, 1); -@@ -1531,7 +1532,23 @@ static int libxl__build_device_model_args_new(libxl__gc *gc, - } - } - if (b_info->u.hvm.soundhw) { -- flexarray_vappend(dm_args, "-soundhw", b_info->u.hvm.soundhw, NULL); -+ libxl__qemu_soundhw soundhw; -+ -+ rc = libxl__qemu_soundhw_from_string(b_info->u.hvm.soundhw, &soundhw); -+ if (rc) { -+ LOGD(ERROR, guest_domid, "Unknown soundhw option '%s'", b_info->u.hvm.soundhw); -+ return ERROR_INVAL; -+ } -+ -+ switch (soundhw) { -+ case LIBXL__QEMU_SOUNDHW_HDA: -+ flexarray_vappend(dm_args, "-device", "intel-hda", -+ "-device", "hda-duplex", NULL); -+ break; -+ default: -+ flexarray_append_pair(dm_args, "-device", -+ (char*)libxl__qemu_soundhw_to_string(soundhw)); -+ } - } - if (!libxl__acpi_defbool_val(b_info)) { - flexarray_append(dm_args, "-no-acpi"); -diff --git a/tools/libs/light/libxl_types_internal.idl b/tools/libs/light/libxl_types_internal.idl -index 3593e21dbb64..caa08d3229cd 100644 ---- a/tools/libs/light/libxl_types_internal.idl -+++ b/tools/libs/light/libxl_types_internal.idl -@@ -55,3 +55,13 @@ libxl__device_action = Enumeration("device_action", [ - (1, "ADD"), - (2, "REMOVE"), - ]) -+ -+libxl__qemu_soundhw = Enumeration("qemu_soundhw", [ -+ (1, "ac97"), -+ (2, "adlib"), -+ (3, "cs4231a"), -+ (4, "es1370"), -+ (5, "gus"), -+ (6, "hda"), -+ (7, "sb16"), -+ ]) --- -2.37.4 - diff --git a/0057-x86-CPUID-surface-suitable-value-in-EBX-of-XSTATE-su.patch b/0057-x86-CPUID-surface-suitable-value-in-EBX-of-XSTATE-su.patch deleted file mode 100644 index 2949fb0..0000000 --- a/0057-x86-CPUID-surface-suitable-value-in-EBX-of-XSTATE-su.patch +++ /dev/null @@ -1,44 +0,0 @@ -From 7923ea47e578bca30a6e45951a9da09e827ff028 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 11 Oct 2022 15:14:05 +0200 -Subject: [PATCH 057/126] x86/CPUID: surface suitable value in EBX of XSTATE - subleaf 1 - -While 
the SDM isn't very clear about this, our present behavior make -Linux 5.19 unhappy. As of commit 8ad7e8f69695 ("x86/fpu/xsave: Support -XSAVEC in the kernel") they're using this CPUID output also to size -the compacted area used by XSAVEC. Getting back zero there isn't really -liked, yet for PV that's the default on capable hardware: XSAVES isn't -exposed to PV domains. - -Considering that the size reported is that of the compacted save area, -I view Linux'es assumption as appropriate (short of the SDM properly -considering the case). Therefore we need to populate the field also when -only XSAVEC is supported for a guest. - -Fixes: 460b9a4b3630 ("x86/xsaves: enable xsaves/xrstors for hvm guest") -Fixes: 8d050ed1097c ("x86: don't expose XSAVES capability to PV guests") -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: c3bd0b83ea5b7c0da6542687436042eeea1e7909 -master date: 2022-08-24 14:23:59 +0200 ---- - xen/arch/x86/cpuid.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c -index ee2c4ea03a89..11c95178f110 100644 ---- a/xen/arch/x86/cpuid.c -+++ b/xen/arch/x86/cpuid.c -@@ -1052,7 +1052,7 @@ void guest_cpuid(const struct vcpu *v, uint32_t leaf, - switch ( subleaf ) - { - case 1: -- if ( p->xstate.xsaves ) -+ if ( p->xstate.xsavec || p->xstate.xsaves ) - { - /* - * TODO: Figure out what to do for XSS state. VT-x manages --- -2.37.4 - diff --git a/0058-xen-sched-introduce-cpupool_update_node_affinity.patch b/0058-xen-sched-introduce-cpupool_update_node_affinity.patch deleted file mode 100644 index c2cf0b8..0000000 --- a/0058-xen-sched-introduce-cpupool_update_node_affinity.patch +++ /dev/null @@ -1,257 +0,0 @@ -From 735b10844489babf52d3193193285a7311cf2c39 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 11 Oct 2022 15:14:22 +0200 -Subject: [PATCH 058/126] xen/sched: introduce cpupool_update_node_affinity() - -For updating the node affinities of all domains in a cpupool add a new -function cpupool_update_node_affinity(). - -In order to avoid multiple allocations of cpumasks carve out memory -allocation and freeing from domain_update_node_affinity() into new -helpers, which can be used by cpupool_update_node_affinity(). - -Modify domain_update_node_affinity() to take an additional parameter -for passing the allocated memory in and to allocate and free the memory -via the new helpers in case NULL was passed. - -This will help later to pre-allocate the cpumasks in order to avoid -allocations in stop-machine context. 
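The shape of the change described above is a common one: a helper that used to allocate its own scratch cpumasks now accepts an optional pre-allocated set, so callers that will later run in stop-machine context can hand one in, while ordinary callers keep the convenient allocate-internally path. Below is a simplified stand-alone sketch of that calling convention; struct scratch is a made-up stand-in for struct affinity_masks, not the Xen type.

#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for struct affinity_masks: just one scratch buffer. */
struct scratch {
    unsigned long *bits;
};

static int scratch_alloc(struct scratch *s, size_t words)
{
    s->bits = calloc(words, sizeof(*s->bits));
    return s->bits ? 0 : -1;
}

static void scratch_free(struct scratch *s)
{
    free(s->bits);
    s->bits = NULL;
}

/*
 * Calling convention used by domain_update_node_aff() in the hunk below:
 * accept an optional pre-allocated scratch area so callers running where
 * allocation is forbidden (e.g. stop-machine context) can pass one in, and
 * fall back to a local allocation otherwise.
 */
static void update_one(struct scratch *scratch)
{
    struct scratch local;

    if ( !scratch )
    {
        if ( scratch_alloc(&local, 4) )
            return;
        scratch = &local;
    }

    /* ... accumulate the affinity into scratch->bits ... */

    if ( scratch == &local )
        scratch_free(scratch);
}

int main(void)
{
    struct scratch pre;

    update_one(NULL);               /* convenient path: allocates internally */

    if ( scratch_alloc(&pre, 4) == 0 )
    {
        update_one(&pre);           /* allocation-free path */
        scratch_free(&pre);
    }

    return 0;
}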
- -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -Tested-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: a83fa1e2b96ace65b45dde6954d67012633a082b -master date: 2022-09-05 11:42:30 +0100 ---- - xen/common/sched/core.c | 54 ++++++++++++++++++++++++++------------ - xen/common/sched/cpupool.c | 39 +++++++++++++++------------ - xen/common/sched/private.h | 7 +++++ - xen/include/xen/sched.h | 9 ++++++- - 4 files changed, 74 insertions(+), 35 deletions(-) - -diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c -index f07bd2681fcb..065a83eca912 100644 ---- a/xen/common/sched/core.c -+++ b/xen/common/sched/core.c -@@ -1824,9 +1824,28 @@ int vcpu_affinity_domctl(struct domain *d, uint32_t cmd, - return ret; - } - --void domain_update_node_affinity(struct domain *d) -+bool alloc_affinity_masks(struct affinity_masks *affinity) - { -- cpumask_var_t dom_cpumask, dom_cpumask_soft; -+ if ( !alloc_cpumask_var(&affinity->hard) ) -+ return false; -+ if ( !alloc_cpumask_var(&affinity->soft) ) -+ { -+ free_cpumask_var(affinity->hard); -+ return false; -+ } -+ -+ return true; -+} -+ -+void free_affinity_masks(struct affinity_masks *affinity) -+{ -+ free_cpumask_var(affinity->soft); -+ free_cpumask_var(affinity->hard); -+} -+ -+void domain_update_node_aff(struct domain *d, struct affinity_masks *affinity) -+{ -+ struct affinity_masks masks; - cpumask_t *dom_affinity; - const cpumask_t *online; - struct sched_unit *unit; -@@ -1836,14 +1855,16 @@ void domain_update_node_affinity(struct domain *d) - if ( !d->vcpu || !d->vcpu[0] ) - return; - -- if ( !zalloc_cpumask_var(&dom_cpumask) ) -- return; -- if ( !zalloc_cpumask_var(&dom_cpumask_soft) ) -+ if ( !affinity ) - { -- free_cpumask_var(dom_cpumask); -- return; -+ affinity = &masks; -+ if ( !alloc_affinity_masks(affinity) ) -+ return; - } - -+ cpumask_clear(affinity->hard); -+ cpumask_clear(affinity->soft); -+ - online = cpupool_domain_master_cpumask(d); - - spin_lock(&d->node_affinity_lock); -@@ -1864,22 +1885,21 @@ void domain_update_node_affinity(struct domain *d) - */ - for_each_sched_unit ( d, unit ) - { -- cpumask_or(dom_cpumask, dom_cpumask, unit->cpu_hard_affinity); -- cpumask_or(dom_cpumask_soft, dom_cpumask_soft, -- unit->cpu_soft_affinity); -+ cpumask_or(affinity->hard, affinity->hard, unit->cpu_hard_affinity); -+ cpumask_or(affinity->soft, affinity->soft, unit->cpu_soft_affinity); - } - /* Filter out non-online cpus */ -- cpumask_and(dom_cpumask, dom_cpumask, online); -- ASSERT(!cpumask_empty(dom_cpumask)); -+ cpumask_and(affinity->hard, affinity->hard, online); -+ ASSERT(!cpumask_empty(affinity->hard)); - /* And compute the intersection between hard, online and soft */ -- cpumask_and(dom_cpumask_soft, dom_cpumask_soft, dom_cpumask); -+ cpumask_and(affinity->soft, affinity->soft, affinity->hard); - - /* - * If not empty, the intersection of hard, soft and online is the - * narrowest set we want. If empty, we fall back to hard&online. - */ -- dom_affinity = cpumask_empty(dom_cpumask_soft) ? -- dom_cpumask : dom_cpumask_soft; -+ dom_affinity = cpumask_empty(affinity->soft) ? 
affinity->hard -+ : affinity->soft; - - nodes_clear(d->node_affinity); - for_each_cpu ( cpu, dom_affinity ) -@@ -1888,8 +1908,8 @@ void domain_update_node_affinity(struct domain *d) - - spin_unlock(&d->node_affinity_lock); - -- free_cpumask_var(dom_cpumask_soft); -- free_cpumask_var(dom_cpumask); -+ if ( affinity == &masks ) -+ free_affinity_masks(affinity); - } - - typedef long ret_t; -diff --git a/xen/common/sched/cpupool.c b/xen/common/sched/cpupool.c -index 8c6e6eb9ccd5..45b6ff99561a 100644 ---- a/xen/common/sched/cpupool.c -+++ b/xen/common/sched/cpupool.c -@@ -401,6 +401,25 @@ int cpupool_move_domain(struct domain *d, struct cpupool *c) - return ret; - } - -+/* Update affinities of all domains in a cpupool. */ -+static void cpupool_update_node_affinity(const struct cpupool *c) -+{ -+ struct affinity_masks masks; -+ struct domain *d; -+ -+ if ( !alloc_affinity_masks(&masks) ) -+ return; -+ -+ rcu_read_lock(&domlist_read_lock); -+ -+ for_each_domain_in_cpupool(d, c) -+ domain_update_node_aff(d, &masks); -+ -+ rcu_read_unlock(&domlist_read_lock); -+ -+ free_affinity_masks(&masks); -+} -+ - /* - * assign a specific cpu to a cpupool - * cpupool_lock must be held -@@ -408,7 +427,6 @@ int cpupool_move_domain(struct domain *d, struct cpupool *c) - static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu) - { - int ret; -- struct domain *d; - const cpumask_t *cpus; - - cpus = sched_get_opt_cpumask(c->gran, cpu); -@@ -433,12 +451,7 @@ static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu) - - rcu_read_unlock(&sched_res_rculock); - -- rcu_read_lock(&domlist_read_lock); -- for_each_domain_in_cpupool(d, c) -- { -- domain_update_node_affinity(d); -- } -- rcu_read_unlock(&domlist_read_lock); -+ cpupool_update_node_affinity(c); - - return 0; - } -@@ -447,18 +460,14 @@ static int cpupool_unassign_cpu_finish(struct cpupool *c) - { - int cpu = cpupool_moving_cpu; - const cpumask_t *cpus; -- struct domain *d; - int ret; - - if ( c != cpupool_cpu_moving ) - return -EADDRNOTAVAIL; - -- /* -- * We need this for scanning the domain list, both in -- * cpu_disable_scheduler(), and at the bottom of this function. 
-- */ - rcu_read_lock(&domlist_read_lock); - ret = cpu_disable_scheduler(cpu); -+ rcu_read_unlock(&domlist_read_lock); - - rcu_read_lock(&sched_res_rculock); - cpus = get_sched_res(cpu)->cpus; -@@ -485,11 +494,7 @@ static int cpupool_unassign_cpu_finish(struct cpupool *c) - } - rcu_read_unlock(&sched_res_rculock); - -- for_each_domain_in_cpupool(d, c) -- { -- domain_update_node_affinity(d); -- } -- rcu_read_unlock(&domlist_read_lock); -+ cpupool_update_node_affinity(c); - - return ret; - } -diff --git a/xen/common/sched/private.h b/xen/common/sched/private.h -index 92d0d4961063..6e036f8c8077 100644 ---- a/xen/common/sched/private.h -+++ b/xen/common/sched/private.h -@@ -593,6 +593,13 @@ affinity_balance_cpumask(const struct sched_unit *unit, int step, - cpumask_copy(mask, unit->cpu_hard_affinity); - } - -+struct affinity_masks { -+ cpumask_var_t hard; -+ cpumask_var_t soft; -+}; -+ -+bool alloc_affinity_masks(struct affinity_masks *affinity); -+void free_affinity_masks(struct affinity_masks *affinity); - void sched_rm_cpu(unsigned int cpu); - const cpumask_t *sched_get_opt_cpumask(enum sched_gran opt, unsigned int cpu); - void schedule_dump(struct cpupool *c); -diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h -index 701963f84cb8..4e25627d9685 100644 ---- a/xen/include/xen/sched.h -+++ b/xen/include/xen/sched.h -@@ -649,8 +649,15 @@ static inline void get_knownalive_domain(struct domain *d) - ASSERT(!(atomic_read(&d->refcnt) & DOMAIN_DESTROYED)); - } - -+struct affinity_masks; -+ - int domain_set_node_affinity(struct domain *d, const nodemask_t *affinity); --void domain_update_node_affinity(struct domain *d); -+void domain_update_node_aff(struct domain *d, struct affinity_masks *affinity); -+ -+static inline void domain_update_node_affinity(struct domain *d) -+{ -+ domain_update_node_aff(d, NULL); -+} - - /* - * To be implemented by each architecture, sanity checking the configuration --- -2.37.4 - diff --git a/0059-xen-sched-carve-out-memory-allocation-and-freeing-fr.patch b/0059-xen-sched-carve-out-memory-allocation-and-freeing-fr.patch deleted file mode 100644 index 7e81f53..0000000 --- a/0059-xen-sched-carve-out-memory-allocation-and-freeing-fr.patch +++ /dev/null @@ -1,263 +0,0 @@ -From d638c2085f71f694344b34e70eb1b371c86b00f0 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 11 Oct 2022 15:15:14 +0200 -Subject: [PATCH 059/126] xen/sched: carve out memory allocation and freeing - from schedule_cpu_rm() - -In order to prepare not allocating or freeing memory from -schedule_cpu_rm(), move this functionality to dedicated functions. - -For now call those functions from schedule_cpu_rm(). - -No change of behavior expected. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: d42be6f83480b3ada286dc18444331a816be88a3 -master date: 2022-09-05 11:42:30 +0100 ---- - xen/common/sched/core.c | 143 ++++++++++++++++++++++--------------- - xen/common/sched/private.h | 11 +++ - 2 files changed, 98 insertions(+), 56 deletions(-) - -diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c -index 065a83eca912..2decb1161a63 100644 ---- a/xen/common/sched/core.c -+++ b/xen/common/sched/core.c -@@ -3221,6 +3221,75 @@ out: - return ret; - } - -+/* -+ * Allocate all memory needed for free_cpu_rm_data(), as allocations cannot -+ * be made in stop_machine() context. 
-+ * -+ * Between alloc_cpu_rm_data() and the real cpu removal action the relevant -+ * contents of struct sched_resource can't change, as the cpu in question is -+ * locked against any other movement to or from cpupools, and the data copied -+ * by alloc_cpu_rm_data() is modified only in case the cpu in question is -+ * being moved from or to a cpupool. -+ */ -+struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu) -+{ -+ struct cpu_rm_data *data; -+ const struct sched_resource *sr; -+ unsigned int idx; -+ -+ rcu_read_lock(&sched_res_rculock); -+ -+ sr = get_sched_res(cpu); -+ data = xmalloc_flex_struct(struct cpu_rm_data, sr, sr->granularity - 1); -+ if ( !data ) -+ goto out; -+ -+ data->old_ops = sr->scheduler; -+ data->vpriv_old = idle_vcpu[cpu]->sched_unit->priv; -+ data->ppriv_old = sr->sched_priv; -+ -+ for ( idx = 0; idx < sr->granularity - 1; idx++ ) -+ { -+ data->sr[idx] = sched_alloc_res(); -+ if ( data->sr[idx] ) -+ { -+ data->sr[idx]->sched_unit_idle = sched_alloc_unit_mem(); -+ if ( !data->sr[idx]->sched_unit_idle ) -+ { -+ sched_res_free(&data->sr[idx]->rcu); -+ data->sr[idx] = NULL; -+ } -+ } -+ if ( !data->sr[idx] ) -+ { -+ while ( idx > 0 ) -+ sched_res_free(&data->sr[--idx]->rcu); -+ XFREE(data); -+ goto out; -+ } -+ -+ data->sr[idx]->curr = data->sr[idx]->sched_unit_idle; -+ data->sr[idx]->scheduler = &sched_idle_ops; -+ data->sr[idx]->granularity = 1; -+ -+ /* We want the lock not to change when replacing the resource. */ -+ data->sr[idx]->schedule_lock = sr->schedule_lock; -+ } -+ -+ out: -+ rcu_read_unlock(&sched_res_rculock); -+ -+ return data; -+} -+ -+void free_cpu_rm_data(struct cpu_rm_data *mem, unsigned int cpu) -+{ -+ sched_free_udata(mem->old_ops, mem->vpriv_old); -+ sched_free_pdata(mem->old_ops, mem->ppriv_old, cpu); -+ -+ xfree(mem); -+} -+ - /* - * Remove a pCPU from its cpupool. Its scheduler becomes &sched_idle_ops - * (the idle scheduler). -@@ -3229,53 +3298,23 @@ out: - */ - int schedule_cpu_rm(unsigned int cpu) - { -- void *ppriv_old, *vpriv_old; -- struct sched_resource *sr, **sr_new = NULL; -+ struct sched_resource *sr; -+ struct cpu_rm_data *data; - struct sched_unit *unit; -- struct scheduler *old_ops; - spinlock_t *old_lock; - unsigned long flags; -- int idx, ret = -ENOMEM; -+ int idx = 0; - unsigned int cpu_iter; - -+ data = alloc_cpu_rm_data(cpu); -+ if ( !data ) -+ return -ENOMEM; -+ - rcu_read_lock(&sched_res_rculock); - - sr = get_sched_res(cpu); -- old_ops = sr->scheduler; - -- if ( sr->granularity > 1 ) -- { -- sr_new = xmalloc_array(struct sched_resource *, sr->granularity - 1); -- if ( !sr_new ) -- goto out; -- for ( idx = 0; idx < sr->granularity - 1; idx++ ) -- { -- sr_new[idx] = sched_alloc_res(); -- if ( sr_new[idx] ) -- { -- sr_new[idx]->sched_unit_idle = sched_alloc_unit_mem(); -- if ( !sr_new[idx]->sched_unit_idle ) -- { -- sched_res_free(&sr_new[idx]->rcu); -- sr_new[idx] = NULL; -- } -- } -- if ( !sr_new[idx] ) -- { -- for ( idx--; idx >= 0; idx-- ) -- sched_res_free(&sr_new[idx]->rcu); -- goto out; -- } -- sr_new[idx]->curr = sr_new[idx]->sched_unit_idle; -- sr_new[idx]->scheduler = &sched_idle_ops; -- sr_new[idx]->granularity = 1; -- -- /* We want the lock not to change when replacing the resource. 
*/ -- sr_new[idx]->schedule_lock = sr->schedule_lock; -- } -- } -- -- ret = 0; -+ ASSERT(sr->granularity); - ASSERT(sr->cpupool != NULL); - ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus)); - ASSERT(!cpumask_test_cpu(cpu, sr->cpupool->cpu_valid)); -@@ -3283,10 +3322,6 @@ int schedule_cpu_rm(unsigned int cpu) - /* See comment in schedule_cpu_add() regarding lock switching. */ - old_lock = pcpu_schedule_lock_irqsave(cpu, &flags); - -- vpriv_old = idle_vcpu[cpu]->sched_unit->priv; -- ppriv_old = sr->sched_priv; -- -- idx = 0; - for_each_cpu ( cpu_iter, sr->cpus ) - { - per_cpu(sched_res_idx, cpu_iter) = 0; -@@ -3300,27 +3335,27 @@ int schedule_cpu_rm(unsigned int cpu) - else - { - /* Initialize unit. */ -- unit = sr_new[idx]->sched_unit_idle; -- unit->res = sr_new[idx]; -+ unit = data->sr[idx]->sched_unit_idle; -+ unit->res = data->sr[idx]; - unit->is_running = true; - sched_unit_add_vcpu(unit, idle_vcpu[cpu_iter]); - sched_domain_insert_unit(unit, idle_vcpu[cpu_iter]->domain); - - /* Adjust cpu masks of resources (old and new). */ - cpumask_clear_cpu(cpu_iter, sr->cpus); -- cpumask_set_cpu(cpu_iter, sr_new[idx]->cpus); -+ cpumask_set_cpu(cpu_iter, data->sr[idx]->cpus); - cpumask_set_cpu(cpu_iter, &sched_res_mask); - - /* Init timer. */ -- init_timer(&sr_new[idx]->s_timer, s_timer_fn, NULL, cpu_iter); -+ init_timer(&data->sr[idx]->s_timer, s_timer_fn, NULL, cpu_iter); - - /* Last resource initializations and insert resource pointer. */ -- sr_new[idx]->master_cpu = cpu_iter; -- set_sched_res(cpu_iter, sr_new[idx]); -+ data->sr[idx]->master_cpu = cpu_iter; -+ set_sched_res(cpu_iter, data->sr[idx]); - - /* Last action: set the new lock pointer. */ - smp_mb(); -- sr_new[idx]->schedule_lock = &sched_free_cpu_lock; -+ data->sr[idx]->schedule_lock = &sched_free_cpu_lock; - - idx++; - } -@@ -3336,16 +3371,12 @@ int schedule_cpu_rm(unsigned int cpu) - /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! */ - spin_unlock_irqrestore(old_lock, flags); - -- sched_deinit_pdata(old_ops, ppriv_old, cpu); -+ sched_deinit_pdata(data->old_ops, data->ppriv_old, cpu); - -- sched_free_udata(old_ops, vpriv_old); -- sched_free_pdata(old_ops, ppriv_old, cpu); -- --out: - rcu_read_unlock(&sched_res_rculock); -- xfree(sr_new); -+ free_cpu_rm_data(data, cpu); - -- return ret; -+ return 0; - } - - struct scheduler *scheduler_get_default(void) -diff --git a/xen/common/sched/private.h b/xen/common/sched/private.h -index 6e036f8c8077..ff3185425219 100644 ---- a/xen/common/sched/private.h -+++ b/xen/common/sched/private.h -@@ -600,6 +600,15 @@ struct affinity_masks { - - bool alloc_affinity_masks(struct affinity_masks *affinity); - void free_affinity_masks(struct affinity_masks *affinity); -+ -+/* Memory allocation related data for schedule_cpu_rm(). 
*/ -+struct cpu_rm_data { -+ const struct scheduler *old_ops; -+ void *ppriv_old; -+ void *vpriv_old; -+ struct sched_resource *sr[]; -+}; -+ - void sched_rm_cpu(unsigned int cpu); - const cpumask_t *sched_get_opt_cpumask(enum sched_gran opt, unsigned int cpu); - void schedule_dump(struct cpupool *c); -@@ -608,6 +617,8 @@ struct scheduler *scheduler_alloc(unsigned int sched_id); - void scheduler_free(struct scheduler *sched); - int cpu_disable_scheduler(unsigned int cpu); - int schedule_cpu_add(unsigned int cpu, struct cpupool *c); -+struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu); -+void free_cpu_rm_data(struct cpu_rm_data *mem, unsigned int cpu); - int schedule_cpu_rm(unsigned int cpu); - int sched_move_domain(struct domain *d, struct cpupool *c); - struct cpupool *cpupool_get_by_id(unsigned int poolid); --- -2.37.4 - diff --git a/0060-xen-sched-fix-cpu-hotplug.patch b/0060-xen-sched-fix-cpu-hotplug.patch deleted file mode 100644 index 264c8ef..0000000 --- a/0060-xen-sched-fix-cpu-hotplug.patch +++ /dev/null @@ -1,307 +0,0 @@ -From d17680808b4c8015e31070c971e1ee548170ae34 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 11 Oct 2022 15:15:41 +0200 -Subject: [PATCH 060/126] xen/sched: fix cpu hotplug - -Cpu unplugging is calling schedule_cpu_rm() via stop_machine_run() with -interrupts disabled, thus any memory allocation or freeing must be -avoided. - -Since commit 5047cd1d5dea ("xen/common: Use enhanced -ASSERT_ALLOC_CONTEXT in xmalloc()") this restriction is being enforced -via an assertion, which will now fail. - -Fix this by allocating needed memory before entering stop_machine_run() -and freeing any memory only after having finished stop_machine_run(). - -Fixes: 1ec410112cdd ("xen/sched: support differing granularity in schedule_cpu_[add/rm]()") -Reported-by: Gao Ruifeng <ruifeng.gao@intel.com> -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -Tested-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: d84473689611eed32fd90b27e614f28af767fa3f -master date: 2022-09-05 11:42:30 +0100 ---- - xen/common/sched/core.c | 25 +++++++++++--- - xen/common/sched/cpupool.c | 69 +++++++++++++++++++++++++++++--------- - xen/common/sched/private.h | 5 +-- - 3 files changed, 77 insertions(+), 22 deletions(-) - -diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c -index 2decb1161a63..900aab8f66a7 100644 ---- a/xen/common/sched/core.c -+++ b/xen/common/sched/core.c -@@ -3231,7 +3231,7 @@ out: - * by alloc_cpu_rm_data() is modified only in case the cpu in question is - * being moved from or to a cpupool. 
- */ --struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu) -+struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu, bool aff_alloc) - { - struct cpu_rm_data *data; - const struct sched_resource *sr; -@@ -3244,6 +3244,17 @@ struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu) - if ( !data ) - goto out; - -+ if ( aff_alloc ) -+ { -+ if ( !alloc_affinity_masks(&data->affinity) ) -+ { -+ XFREE(data); -+ goto out; -+ } -+ } -+ else -+ memset(&data->affinity, 0, sizeof(data->affinity)); -+ - data->old_ops = sr->scheduler; - data->vpriv_old = idle_vcpu[cpu]->sched_unit->priv; - data->ppriv_old = sr->sched_priv; -@@ -3264,6 +3275,7 @@ struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu) - { - while ( idx > 0 ) - sched_res_free(&data->sr[--idx]->rcu); -+ free_affinity_masks(&data->affinity); - XFREE(data); - goto out; - } -@@ -3286,6 +3298,7 @@ void free_cpu_rm_data(struct cpu_rm_data *mem, unsigned int cpu) - { - sched_free_udata(mem->old_ops, mem->vpriv_old); - sched_free_pdata(mem->old_ops, mem->ppriv_old, cpu); -+ free_affinity_masks(&mem->affinity); - - xfree(mem); - } -@@ -3296,17 +3309,18 @@ void free_cpu_rm_data(struct cpu_rm_data *mem, unsigned int cpu) - * The cpu is already marked as "free" and not valid any longer for its - * cpupool. - */ --int schedule_cpu_rm(unsigned int cpu) -+int schedule_cpu_rm(unsigned int cpu, struct cpu_rm_data *data) - { - struct sched_resource *sr; -- struct cpu_rm_data *data; - struct sched_unit *unit; - spinlock_t *old_lock; - unsigned long flags; - int idx = 0; - unsigned int cpu_iter; -+ bool free_data = !data; - -- data = alloc_cpu_rm_data(cpu); -+ if ( !data ) -+ data = alloc_cpu_rm_data(cpu, false); - if ( !data ) - return -ENOMEM; - -@@ -3374,7 +3388,8 @@ int schedule_cpu_rm(unsigned int cpu) - sched_deinit_pdata(data->old_ops, data->ppriv_old, cpu); - - rcu_read_unlock(&sched_res_rculock); -- free_cpu_rm_data(data, cpu); -+ if ( free_data ) -+ free_cpu_rm_data(data, cpu); - - return 0; - } -diff --git a/xen/common/sched/cpupool.c b/xen/common/sched/cpupool.c -index 45b6ff99561a..b5a948639aad 100644 ---- a/xen/common/sched/cpupool.c -+++ b/xen/common/sched/cpupool.c -@@ -402,22 +402,28 @@ int cpupool_move_domain(struct domain *d, struct cpupool *c) - } - - /* Update affinities of all domains in a cpupool. */ --static void cpupool_update_node_affinity(const struct cpupool *c) -+static void cpupool_update_node_affinity(const struct cpupool *c, -+ struct affinity_masks *masks) - { -- struct affinity_masks masks; -+ struct affinity_masks local_masks; - struct domain *d; - -- if ( !alloc_affinity_masks(&masks) ) -- return; -+ if ( !masks ) -+ { -+ if ( !alloc_affinity_masks(&local_masks) ) -+ return; -+ masks = &local_masks; -+ } - - rcu_read_lock(&domlist_read_lock); - - for_each_domain_in_cpupool(d, c) -- domain_update_node_aff(d, &masks); -+ domain_update_node_aff(d, masks); - - rcu_read_unlock(&domlist_read_lock); - -- free_affinity_masks(&masks); -+ if ( masks == &local_masks ) -+ free_affinity_masks(masks); - } - - /* -@@ -451,15 +457,17 @@ static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu) - - rcu_read_unlock(&sched_res_rculock); - -- cpupool_update_node_affinity(c); -+ cpupool_update_node_affinity(c, NULL); - - return 0; - } - --static int cpupool_unassign_cpu_finish(struct cpupool *c) -+static int cpupool_unassign_cpu_finish(struct cpupool *c, -+ struct cpu_rm_data *mem) - { - int cpu = cpupool_moving_cpu; - const cpumask_t *cpus; -+ struct affinity_masks *masks = mem ? 
&mem->affinity : NULL; - int ret; - - if ( c != cpupool_cpu_moving ) -@@ -482,7 +490,7 @@ static int cpupool_unassign_cpu_finish(struct cpupool *c) - */ - if ( !ret ) - { -- ret = schedule_cpu_rm(cpu); -+ ret = schedule_cpu_rm(cpu, mem); - if ( ret ) - cpumask_andnot(&cpupool_free_cpus, &cpupool_free_cpus, cpus); - else -@@ -494,7 +502,7 @@ static int cpupool_unassign_cpu_finish(struct cpupool *c) - } - rcu_read_unlock(&sched_res_rculock); - -- cpupool_update_node_affinity(c); -+ cpupool_update_node_affinity(c, masks); - - return ret; - } -@@ -558,7 +566,7 @@ static long cpupool_unassign_cpu_helper(void *info) - cpupool_cpu_moving->cpupool_id, cpupool_moving_cpu); - spin_lock(&cpupool_lock); - -- ret = cpupool_unassign_cpu_finish(c); -+ ret = cpupool_unassign_cpu_finish(c, NULL); - - spin_unlock(&cpupool_lock); - debugtrace_printk("cpupool_unassign_cpu ret=%ld\n", ret); -@@ -701,7 +709,7 @@ static int cpupool_cpu_add(unsigned int cpu) - * This function is called in stop_machine context, so we can be sure no - * non-idle vcpu is active on the system. - */ --static void cpupool_cpu_remove(unsigned int cpu) -+static void cpupool_cpu_remove(unsigned int cpu, struct cpu_rm_data *mem) - { - int ret; - -@@ -709,7 +717,7 @@ static void cpupool_cpu_remove(unsigned int cpu) - - if ( !cpumask_test_cpu(cpu, &cpupool_free_cpus) ) - { -- ret = cpupool_unassign_cpu_finish(cpupool0); -+ ret = cpupool_unassign_cpu_finish(cpupool0, mem); - BUG_ON(ret); - } - cpumask_clear_cpu(cpu, &cpupool_free_cpus); -@@ -775,7 +783,7 @@ static void cpupool_cpu_remove_forced(unsigned int cpu) - { - ret = cpupool_unassign_cpu_start(c, master_cpu); - BUG_ON(ret); -- ret = cpupool_unassign_cpu_finish(c); -+ ret = cpupool_unassign_cpu_finish(c, NULL); - BUG_ON(ret); - } - } -@@ -993,12 +1001,24 @@ void dump_runq(unsigned char key) - static int cpu_callback( - struct notifier_block *nfb, unsigned long action, void *hcpu) - { -+ static struct cpu_rm_data *mem; -+ - unsigned int cpu = (unsigned long)hcpu; - int rc = 0; - - switch ( action ) - { - case CPU_DOWN_FAILED: -+ if ( system_state <= SYS_STATE_active ) -+ { -+ if ( mem ) -+ { -+ free_cpu_rm_data(mem, cpu); -+ mem = NULL; -+ } -+ rc = cpupool_cpu_add(cpu); -+ } -+ break; - case CPU_ONLINE: - if ( system_state <= SYS_STATE_active ) - rc = cpupool_cpu_add(cpu); -@@ -1006,12 +1026,31 @@ static int cpu_callback( - case CPU_DOWN_PREPARE: - /* Suspend/Resume don't change assignments of cpus to cpupools. */ - if ( system_state <= SYS_STATE_active ) -+ { - rc = cpupool_cpu_remove_prologue(cpu); -+ if ( !rc ) -+ { -+ ASSERT(!mem); -+ mem = alloc_cpu_rm_data(cpu, true); -+ rc = mem ? 0 : -ENOMEM; -+ } -+ } - break; - case CPU_DYING: - /* Suspend/Resume don't change assignments of cpus to cpupools. */ - if ( system_state <= SYS_STATE_active ) -- cpupool_cpu_remove(cpu); -+ { -+ ASSERT(mem); -+ cpupool_cpu_remove(cpu, mem); -+ } -+ break; -+ case CPU_DEAD: -+ if ( system_state <= SYS_STATE_active ) -+ { -+ ASSERT(mem); -+ free_cpu_rm_data(mem, cpu); -+ mem = NULL; -+ } - break; - case CPU_RESUME_FAILED: - cpupool_cpu_remove_forced(cpu); -diff --git a/xen/common/sched/private.h b/xen/common/sched/private.h -index ff3185425219..3bab78ccb240 100644 ---- a/xen/common/sched/private.h -+++ b/xen/common/sched/private.h -@@ -603,6 +603,7 @@ void free_affinity_masks(struct affinity_masks *affinity); - - /* Memory allocation related data for schedule_cpu_rm(). 
*/ - struct cpu_rm_data { -+ struct affinity_masks affinity; - const struct scheduler *old_ops; - void *ppriv_old; - void *vpriv_old; -@@ -617,9 +618,9 @@ struct scheduler *scheduler_alloc(unsigned int sched_id); - void scheduler_free(struct scheduler *sched); - int cpu_disable_scheduler(unsigned int cpu); - int schedule_cpu_add(unsigned int cpu, struct cpupool *c); --struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu); -+struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu, bool aff_alloc); - void free_cpu_rm_data(struct cpu_rm_data *mem, unsigned int cpu); --int schedule_cpu_rm(unsigned int cpu); -+int schedule_cpu_rm(unsigned int cpu, struct cpu_rm_data *mem); - int sched_move_domain(struct domain *d, struct cpupool *c); - struct cpupool *cpupool_get_by_id(unsigned int poolid); - void cpupool_put(struct cpupool *pool); --- -2.37.4 - diff --git a/0061-Config.mk-correct-PIE-related-option-s-in-EMBEDDED_E.patch b/0061-Config.mk-correct-PIE-related-option-s-in-EMBEDDED_E.patch deleted file mode 100644 index 64144fe..0000000 --- a/0061-Config.mk-correct-PIE-related-option-s-in-EMBEDDED_E.patch +++ /dev/null @@ -1,58 +0,0 @@ -From 19cf28b515f21da02df80e68f901ad7650daaa37 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 11 Oct 2022 15:15:55 +0200 -Subject: [PATCH 061/126] Config.mk: correct PIE-related option(s) in - EMBEDDED_EXTRA_CFLAGS - -I haven't been able to find evidence of "-nopie" ever having been a -supported compiler option. The correct spelling is "-no-pie". -Furthermore like "-pie" this is an option which is solely passed to the -linker. The compiler only recognizes "-fpie" / "-fPIE" / "-fno-pie", and -it doesn't infer these options from "-pie" / "-no-pie". - -Add the compiler recognized form, but for the possible case of the -variable also being used somewhere for linking keep the linker option as -well (with corrected spelling). - -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Julien Grall <jgrall@amazon.com> - -Build: Drop -no-pie from EMBEDDED_EXTRA_CFLAGS - -This breaks all Clang builds, as demostrated by Gitlab CI. - -Contrary to the description in ecd6b9759919, -no-pie is not even an option -passed to the linker. GCC's actual behaviour is to inhibit the passing of --pie to the linker, as well as selecting different cr0 artefacts to be linked. - -EMBEDDED_EXTRA_CFLAGS is not used for $(CC)-doing-linking, and not liable to -gain such a usecase. 
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Acked-by: Jan Beulich <jbeulich@suse.com> -Tested-by: Stefano Stabellini <sstabellini@kernel.org> -Fixes: ecd6b9759919 ("Config.mk: correct PIE-related option(s) in EMBEDDED_EXTRA_CFLAGS") -master commit: ecd6b9759919fa6335b0be1b5fc5cce29a30c4f1 -master date: 2022-09-08 09:25:26 +0200 -master commit: 13a7c0074ac8fb31f6c0485429b7a20a1946cb22 -master date: 2022-09-27 15:40:42 -0700 ---- - Config.mk | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/Config.mk b/Config.mk -index 96d89b2f7dfc..9f87608f6602 100644 ---- a/Config.mk -+++ b/Config.mk -@@ -203,7 +203,7 @@ endif - APPEND_LDFLAGS += $(foreach i, $(APPEND_LIB), -L$(i)) - APPEND_CFLAGS += $(foreach i, $(APPEND_INCLUDES), -I$(i)) - --EMBEDDED_EXTRA_CFLAGS := -nopie -fno-stack-protector -fno-stack-protector-all -+EMBEDDED_EXTRA_CFLAGS := -fno-pie -fno-stack-protector -fno-stack-protector-all - EMBEDDED_EXTRA_CFLAGS += -fno-exceptions -fno-asynchronous-unwind-tables - - XEN_EXTFILES_URL ?= http://xenbits.xen.org/xen-extfiles --- -2.37.4 - diff --git a/0062-tools-xenstore-minor-fix-of-the-migration-stream-doc.patch b/0062-tools-xenstore-minor-fix-of-the-migration-stream-doc.patch deleted file mode 100644 index c2299bf..0000000 --- a/0062-tools-xenstore-minor-fix-of-the-migration-stream-doc.patch +++ /dev/null @@ -1,41 +0,0 @@ -From 182f8bb503b9dd3db5dd9118dc763d241787c6fc Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 11 Oct 2022 15:16:09 +0200 -Subject: [PATCH 062/126] tools/xenstore: minor fix of the migration stream doc - -Drop mentioning the non-existent read-only socket in the migration -stream description document. - -The related record field was removed in commit 8868a0e3f674 ("docs: -update the xenstore migration stream documentation). - -Signed-off-by: Juergen Gross <jgross@suse.com> -Acked-by: Julien Grall <jgrall@amazon.com> -master commit: ace1d2eff80d3d66c37ae765dae3e3cb5697e5a4 -master date: 2022-09-08 09:25:58 +0200 ---- - docs/designs/xenstore-migration.md | 8 +++----- - 1 file changed, 3 insertions(+), 5 deletions(-) - -diff --git a/docs/designs/xenstore-migration.md b/docs/designs/xenstore-migration.md -index 5f1155273ec3..78530bbb0ef4 100644 ---- a/docs/designs/xenstore-migration.md -+++ b/docs/designs/xenstore-migration.md -@@ -129,11 +129,9 @@ xenstored state that needs to be restored. - | `evtchn-fd` | The file descriptor used to communicate with | - | | the event channel driver | - --xenstored will resume in the original process context. Hence `rw-socket-fd` and --`ro-socket-fd` simply specify the file descriptors of the sockets. Sockets --are not always used, however, and so -1 will be used to denote an unused --socket. -- -+xenstored will resume in the original process context. Hence `rw-socket-fd` -+simply specifies the file descriptor of the socket. Sockets are not always -+used, however, and so -1 will be used to denote an unused socket. 
- - \pagebreak - --- -2.37.4 - diff --git a/0063-xen-gnttab-fix-gnttab_acquire_resource.patch b/0063-xen-gnttab-fix-gnttab_acquire_resource.patch deleted file mode 100644 index 9087ddb..0000000 --- a/0063-xen-gnttab-fix-gnttab_acquire_resource.patch +++ /dev/null @@ -1,69 +0,0 @@ -From 3ac64b3751837a117ee3dfb3e2cc27057a83d0f7 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 11 Oct 2022 15:16:53 +0200 -Subject: [PATCH 063/126] xen/gnttab: fix gnttab_acquire_resource() - -Commit 9dc46386d89d ("gnttab: work around "may be used uninitialized" -warning") was wrong, as vaddrs can legitimately be NULL in case -XENMEM_resource_grant_table_id_status was specified for a grant table -v1. This would result in crashes in debug builds due to -ASSERT_UNREACHABLE() triggering. - -Check vaddrs only to be NULL in the rc == 0 case. - -Expand the tests in tools/tests/resource to tickle this path, and verify that -using XENMEM_resource_grant_table_id_status on a v1 grant table fails. - -Fixes: 9dc46386d89d ("gnttab: work around "may be used uninitialized" warning") -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> # xen -Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: 52daa6a8483e4fbd6757c9d1b791e23931791608 -master date: 2022-09-09 16:28:38 +0100 ---- - tools/tests/resource/test-resource.c | 15 +++++++++++++++ - xen/common/grant_table.c | 2 +- - 2 files changed, 16 insertions(+), 1 deletion(-) - -diff --git a/tools/tests/resource/test-resource.c b/tools/tests/resource/test-resource.c -index 1caaa60e62d9..bf485baff2b4 100644 ---- a/tools/tests/resource/test-resource.c -+++ b/tools/tests/resource/test-resource.c -@@ -63,6 +63,21 @@ static void test_gnttab(uint32_t domid, unsigned int nr_frames) - rc = xenforeignmemory_unmap_resource(fh, res); - if ( rc ) - return fail(" Fail: Unmap %d - %s\n", errno, strerror(errno)); -+ -+ /* -+ * Verify that an attempt to map the status frames fails, as the domain is -+ * in gnttab v1 mode. -+ */ -+ res = xenforeignmemory_map_resource( -+ fh, domid, XENMEM_resource_grant_table, -+ XENMEM_resource_grant_table_id_status, 0, 1, -+ (void **)&gnttab, PROT_READ | PROT_WRITE, 0); -+ -+ if ( res ) -+ { -+ fail(" Fail: Managed to map gnttab v2 status frames in v1 mode\n"); -+ xenforeignmemory_unmap_resource(fh, res); -+ } - } - - static void test_domain_configurations(void) -diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c -index 0523beb9b734..01e426c67fb6 100644 ---- a/xen/common/grant_table.c -+++ b/xen/common/grant_table.c -@@ -4138,7 +4138,7 @@ int gnttab_acquire_resource( - * on non-error paths, and hence it needs setting to NULL at the top of the - * function. Leave some runtime safety. 
- */ -- if ( !vaddrs ) -+ if ( !rc && !vaddrs ) - { - ASSERT_UNREACHABLE(); - rc = -ENODATA; --- -2.37.4 - diff --git a/0064-x86-wire-up-VCPUOP_register_vcpu_time_memory_area-fo.patch b/0064-x86-wire-up-VCPUOP_register_vcpu_time_memory_area-fo.patch deleted file mode 100644 index 738df82..0000000 --- a/0064-x86-wire-up-VCPUOP_register_vcpu_time_memory_area-fo.patch +++ /dev/null @@ -1,59 +0,0 @@ -From 62e534d17cdd838828bfd75d3d845e31524dd336 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 11 Oct 2022 15:17:12 +0200 -Subject: [PATCH 064/126] x86: wire up VCPUOP_register_vcpu_time_memory_area - for 32-bit guests -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Forever sinced its introduction VCPUOP_register_vcpu_time_memory_area -was available only to native domains. Linux, for example, would attempt -to use it irrespective of guest bitness (including in its so called -PVHVM mode) as long as it finds XEN_PVCLOCK_TSC_STABLE_BIT set (which we -set only for clocksource=tsc, which in turn needs engaging via command -line option). - -Fixes: a5d39947cb89 ("Allow guests to register secondary vcpu_time_info") -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Roger Pau Monné <roger.pau@citrix.com> -master commit: b726541d94bd0a80b5864d17a2cd2e6d73a3fe0a -master date: 2022-09-29 14:47:45 +0200 ---- - xen/arch/x86/x86_64/domain.c | 20 ++++++++++++++++++++ - 1 file changed, 20 insertions(+) - -diff --git a/xen/arch/x86/x86_64/domain.c b/xen/arch/x86/x86_64/domain.c -index c46dccc25a54..d51d99344796 100644 ---- a/xen/arch/x86/x86_64/domain.c -+++ b/xen/arch/x86/x86_64/domain.c -@@ -54,6 +54,26 @@ arch_compat_vcpu_op( - break; - } - -+ case VCPUOP_register_vcpu_time_memory_area: -+ { -+ struct compat_vcpu_register_time_memory_area area = { .addr.p = 0 }; -+ -+ rc = -EFAULT; -+ if ( copy_from_guest(&area.addr.h, arg, 1) ) -+ break; -+ -+ if ( area.addr.h.c != area.addr.p || -+ !compat_handle_okay(area.addr.h, 1) ) -+ break; -+ -+ rc = 0; -+ guest_from_compat_handle(v->arch.time_info_guest, area.addr.h); -+ -+ force_update_vcpu_system_time(v); -+ -+ break; -+ } -+ - case VCPUOP_get_physid: - rc = arch_do_vcpu_op(cmd, v, arg); - break; --- -2.37.4 - diff --git a/0065-x86-vpmu-Fix-race-condition-in-vpmu_load.patch b/0065-x86-vpmu-Fix-race-condition-in-vpmu_load.patch deleted file mode 100644 index 84edf5d..0000000 --- a/0065-x86-vpmu-Fix-race-condition-in-vpmu_load.patch +++ /dev/null @@ -1,97 +0,0 @@ -From 9690bb261d5fa09cb281e1fa124d93db7b84fda5 Mon Sep 17 00:00:00 2001 -From: Tamas K Lengyel <tamas.lengyel@intel.com> -Date: Tue, 11 Oct 2022 15:17:42 +0200 -Subject: [PATCH 065/126] x86/vpmu: Fix race-condition in vpmu_load - -The vPMU code-bases attempts to perform an optimization on saving/reloading the -PMU context by keeping track of what vCPU ran on each pCPU. When a pCPU is -getting scheduled, checks if the previous vCPU isn't the current one. If so, -attempts a call to vpmu_save_force. Unfortunately if the previous vCPU is -already getting scheduled to run on another pCPU its state will be already -runnable, which results in an ASSERT failure. - -Fix this by always performing a pmu context save in vpmu_save when called from -vpmu_switch_from, and do a vpmu_load when called from vpmu_switch_to. - -While this presents a minimal overhead in case the same vCPU is getting -rescheduled on the same pCPU, the ASSERT failure is avoided and the code is a -lot easier to reason about. 
- -Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com> -Acked-by: Jan Beulich <jbeulich@suse.com> -master commit: defa4e51d20a143bdd4395a075bf0933bb38a9a4 -master date: 2022-09-30 09:53:49 +0200 ---- - xen/arch/x86/cpu/vpmu.c | 42 ++++------------------------------------- - 1 file changed, 4 insertions(+), 38 deletions(-) - -diff --git a/xen/arch/x86/cpu/vpmu.c b/xen/arch/x86/cpu/vpmu.c -index fb1b296a6cc1..800eff87dc03 100644 ---- a/xen/arch/x86/cpu/vpmu.c -+++ b/xen/arch/x86/cpu/vpmu.c -@@ -364,58 +364,24 @@ void vpmu_save(struct vcpu *v) - vpmu->last_pcpu = pcpu; - per_cpu(last_vcpu, pcpu) = v; - -+ vpmu_set(vpmu, VPMU_CONTEXT_SAVE); -+ - if ( vpmu->arch_vpmu_ops ) - if ( vpmu->arch_vpmu_ops->arch_vpmu_save(v, 0) ) - vpmu_reset(vpmu, VPMU_CONTEXT_LOADED); - -+ vpmu_reset(vpmu, VPMU_CONTEXT_SAVE); -+ - apic_write(APIC_LVTPC, PMU_APIC_VECTOR | APIC_LVT_MASKED); - } - - int vpmu_load(struct vcpu *v, bool_t from_guest) - { - struct vpmu_struct *vpmu = vcpu_vpmu(v); -- int pcpu = smp_processor_id(); -- struct vcpu *prev = NULL; - - if ( !vpmu_is_set(vpmu, VPMU_CONTEXT_ALLOCATED) ) - return 0; - -- /* First time this VCPU is running here */ -- if ( vpmu->last_pcpu != pcpu ) -- { -- /* -- * Get the context from last pcpu that we ran on. Note that if another -- * VCPU is running there it must have saved this VPCU's context before -- * startig to run (see below). -- * There should be no race since remote pcpu will disable interrupts -- * before saving the context. -- */ -- if ( vpmu_is_set(vpmu, VPMU_CONTEXT_LOADED) ) -- { -- on_selected_cpus(cpumask_of(vpmu->last_pcpu), -- vpmu_save_force, (void *)v, 1); -- vpmu_reset(vpmu, VPMU_CONTEXT_LOADED); -- } -- } -- -- /* Prevent forced context save from remote CPU */ -- local_irq_disable(); -- -- prev = per_cpu(last_vcpu, pcpu); -- -- if ( prev != v && prev ) -- { -- vpmu = vcpu_vpmu(prev); -- -- /* Someone ran here before us */ -- vpmu_save_force(prev); -- vpmu_reset(vpmu, VPMU_CONTEXT_LOADED); -- -- vpmu = vcpu_vpmu(v); -- } -- -- local_irq_enable(); -- - /* Only when PMU is counting, we load PMU context immediately. */ - if ( !vpmu_is_set(vpmu, VPMU_RUNNING) || - (!has_vlapic(vpmu_vcpu(vpmu)->domain) && --- -2.37.4 - diff --git a/0066-tools-tests-fix-wrong-backport-of-upstream-commit-52.patch b/0066-tools-tests-fix-wrong-backport-of-upstream-commit-52.patch deleted file mode 100644 index 8578e02..0000000 --- a/0066-tools-tests-fix-wrong-backport-of-upstream-commit-52.patch +++ /dev/null @@ -1,31 +0,0 @@ -From 0d233924d4b0f676056856096e8761205add3ee8 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Wed, 12 Oct 2022 17:31:44 +0200 -Subject: [PATCH 066/126] tools/tests: fix wrong backport of upstream commit - 52daa6a8483e4 - -The backport of upstream commit 52daa6a8483e4 had a bug, correct it. 
- -Fixes: 3ac64b375183 ("xen/gnttab: fix gnttab_acquire_resource()") -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> ---- - tools/tests/resource/test-resource.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/tools/tests/resource/test-resource.c b/tools/tests/resource/test-resource.c -index bf485baff2b4..51a8f4a000f6 100644 ---- a/tools/tests/resource/test-resource.c -+++ b/tools/tests/resource/test-resource.c -@@ -71,7 +71,7 @@ static void test_gnttab(uint32_t domid, unsigned int nr_frames) - res = xenforeignmemory_map_resource( - fh, domid, XENMEM_resource_grant_table, - XENMEM_resource_grant_table_id_status, 0, 1, -- (void **)&gnttab, PROT_READ | PROT_WRITE, 0); -+ &addr, PROT_READ | PROT_WRITE, 0); - - if ( res ) - { --- -2.37.4 - diff --git a/0067-libxl-Arm-correct-xc_shadow_control-invocation-to-fi.patch b/0067-libxl-Arm-correct-xc_shadow_control-invocation-to-fi.patch deleted file mode 100644 index 6e75a84..0000000 --- a/0067-libxl-Arm-correct-xc_shadow_control-invocation-to-fi.patch +++ /dev/null @@ -1,42 +0,0 @@ -From 816580afdd1730d4f85f64477a242a439af1cdf8 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Wed, 12 Oct 2022 17:33:40 +0200 -Subject: [PATCH 067/126] libxl/Arm: correct xc_shadow_control() invocation to - fix build - -The backport didn't adapt to the earlier function prototype taking more -(unused here) arguments. - -Fixes: c5215044578e ("xen/arm, libxl: Implement XEN_DOMCTL_shadow_op for Arm") -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Henry Wang <Henry.Wang@arm.com> -Acked-by: Anthony PERARD <anthony.perard@citrix.com> ---- - tools/libs/light/libxl_arm.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/tools/libs/light/libxl_arm.c b/tools/libs/light/libxl_arm.c -index d21f614ed788..ba548befdd25 100644 ---- a/tools/libs/light/libxl_arm.c -+++ b/tools/libs/light/libxl_arm.c -@@ -132,14 +132,14 @@ int libxl__arch_domain_create(libxl__gc *gc, - uint32_t domid) - { - libxl_ctx *ctx = libxl__gc_owner(gc); -- unsigned int shadow_mb = DIV_ROUNDUP(d_config->b_info.shadow_memkb, 1024); -+ unsigned long shadow_mb = DIV_ROUNDUP(d_config->b_info.shadow_memkb, 1024); - - int r = xc_shadow_control(ctx->xch, domid, - XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION, -- &shadow_mb, 0); -+ NULL, 0, &shadow_mb, 0, NULL); - if (r) { - LOGED(ERROR, domid, -- "Failed to set %u MiB shadow allocation", shadow_mb); -+ "Failed to set %lu MiB shadow allocation", shadow_mb); - return ERROR_FAIL; - } - --- -2.37.4 - diff --git a/0068-arm-p2m-Rework-p2m_init.patch b/0068-arm-p2m-Rework-p2m_init.patch deleted file mode 100644 index cc80d52..0000000 --- a/0068-arm-p2m-Rework-p2m_init.patch +++ /dev/null @@ -1,88 +0,0 @@ -From 6f948fd1929c01b82a119f03670cab38ffebb47e Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 25 Oct 2022 09:21:11 +0000 -Subject: [PATCH 068/126] arm/p2m: Rework p2m_init() - -p2m_init() is mostly trivial initialisation, but has two fallible operations -which are on either side of the backpointer trigger for teardown to take -actions. - -p2m_free_vmid() is idempotent with a failed p2m_alloc_vmid(), so rearrange -p2m_init() to perform all trivial setup, then set the backpointer, then -perform all fallible setup. - -This will simplify a future bugfix which needs to add a third fallible -operation. - -No practical change. 
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -Reviewed-by: Bertrand Marquis <bertrand.marquis@arm.com> -(cherry picked from commit: 3783e583319fa1ce75e414d851f0fde191a14753) ---- - xen/arch/arm/p2m.c | 24 ++++++++++++------------ - 1 file changed, 12 insertions(+), 12 deletions(-) - -diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c -index c1055ff2a745..25eb1d84cbc1 100644 ---- a/xen/arch/arm/p2m.c -+++ b/xen/arch/arm/p2m.c -@@ -1733,7 +1733,7 @@ void p2m_final_teardown(struct domain *d) - int p2m_init(struct domain *d) - { - struct p2m_domain *p2m = p2m_get_hostp2m(d); -- int rc = 0; -+ int rc; - unsigned int cpu; - - rwlock_init(&p2m->lock); -@@ -1742,11 +1742,6 @@ int p2m_init(struct domain *d) - INIT_PAGE_LIST_HEAD(&d->arch.paging.p2m_freelist); - - p2m->vmid = INVALID_VMID; -- -- rc = p2m_alloc_vmid(d); -- if ( rc != 0 ) -- return rc; -- - p2m->max_mapped_gfn = _gfn(0); - p2m->lowest_mapped_gfn = _gfn(ULONG_MAX); - -@@ -1762,8 +1757,6 @@ int p2m_init(struct domain *d) - p2m->clean_pte = is_iommu_enabled(d) && - !iommu_has_feature(d, IOMMU_FEAT_COHERENT_WALK); - -- rc = p2m_alloc_table(d); -- - /* - * Make sure that the type chosen to is able to store the an vCPU ID - * between 0 and the maximum of virtual CPUS supported as long as -@@ -1776,13 +1769,20 @@ int p2m_init(struct domain *d) - p2m->last_vcpu_ran[cpu] = INVALID_VCPU_ID; - - /* -- * Besides getting a domain when we only have the p2m in hand, -- * the back pointer to domain is also used in p2m_teardown() -- * as an end-of-initialization indicator. -+ * "Trivial" initialisation is now complete. Set the backpointer so -+ * p2m_teardown() and friends know to do something. - */ - p2m->domain = d; - -- return rc; -+ rc = p2m_alloc_vmid(d); -+ if ( rc ) -+ return rc; -+ -+ rc = p2m_alloc_table(d); -+ if ( rc ) -+ return rc; -+ -+ return 0; - } - - /* --- -2.37.4 - diff --git a/0069-xen-arm-p2m-Populate-pages-for-GICv2-mapping-in-p2m_.patch b/0069-xen-arm-p2m-Populate-pages-for-GICv2-mapping-in-p2m_.patch deleted file mode 100644 index 67cdb7a..0000000 --- a/0069-xen-arm-p2m-Populate-pages-for-GICv2-mapping-in-p2m_.patch +++ /dev/null @@ -1,169 +0,0 @@ -From f8915cd5dbe0f51e9bb31a54fe40600b839dd707 Mon Sep 17 00:00:00 2001 -From: Henry Wang <Henry.Wang@arm.com> -Date: Tue, 25 Oct 2022 09:21:12 +0000 -Subject: [PATCH 069/126] xen/arm: p2m: Populate pages for GICv2 mapping in - p2m_init() - -Hardware using GICv2 needs to create a P2M mapping of 8KB GICv2 area -when the domain is created. Considering the worst case of page tables -which requires 6 P2M pages as the two pages will be consecutive but not -necessarily in the same L3 page table and keep a buffer, populate 16 -pages as the default value to the P2M pages pool in p2m_init() at the -domain creation stage to satisfy the GICv2 requirement. For GICv3, the -above-mentioned P2M mapping is not necessary, but since the allocated -16 pages here would not be lost, hence populate these pages -unconditionally. - -With the default 16 P2M pages populated, there would be a case that -failures would happen in the domain creation with P2M pages already in -use. To properly free the P2M for this case, firstly support the -optionally preemption of p2m_teardown(), then call p2m_teardown() and -p2m_set_allocation(d, 0, NULL) non-preemptively in p2m_final_teardown(). -As non-preemptive p2m_teardown() should only return 0, use a -BUG_ON to confirm that. 
- -Since p2m_final_teardown() is called either after -domain_relinquish_resources() where relinquish_p2m_mapping() has been -called, or from failure path of domain_create()/arch_domain_create() -where mappings that require p2m_put_l3_page() should never be created, -relinquish_p2m_mapping() is not added in p2m_final_teardown(), add -in-code comments to refer this. - -Fixes: cbea5a1149ca ("xen/arm: Allocate and free P2M pages from the P2M pool") -Suggested-by: Julien Grall <jgrall@amazon.com> -Signed-off-by: Henry Wang <Henry.Wang@arm.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -Reviewed-by: Bertrand Marquis <bertrand.marquis@arm.com> -(cherry picked from commit: c7cff1188802646eaa38e918e5738da0e84949be) ---- - xen/arch/arm/domain.c | 2 +- - xen/arch/arm/p2m.c | 34 ++++++++++++++++++++++++++++++++-- - xen/include/asm-arm/p2m.h | 14 ++++++++++---- - 3 files changed, 43 insertions(+), 7 deletions(-) - -diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c -index a5ffd952ecd0..b11359b8cca3 100644 ---- a/xen/arch/arm/domain.c -+++ b/xen/arch/arm/domain.c -@@ -1041,7 +1041,7 @@ int domain_relinquish_resources(struct domain *d) - return ret; - - PROGRESS(p2m): -- ret = p2m_teardown(d); -+ ret = p2m_teardown(d, true); - if ( ret ) - return ret; - -diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c -index 25eb1d84cbc1..f6012f2a538f 100644 ---- a/xen/arch/arm/p2m.c -+++ b/xen/arch/arm/p2m.c -@@ -1664,7 +1664,7 @@ static void p2m_free_vmid(struct domain *d) - spin_unlock(&vmid_alloc_lock); - } - --int p2m_teardown(struct domain *d) -+int p2m_teardown(struct domain *d, bool allow_preemption) - { - struct p2m_domain *p2m = p2m_get_hostp2m(d); - unsigned long count = 0; -@@ -1672,6 +1672,9 @@ int p2m_teardown(struct domain *d) - unsigned int i; - int rc = 0; - -+ if ( page_list_empty(&p2m->pages) ) -+ return 0; -+ - p2m_write_lock(p2m); - - /* -@@ -1695,7 +1698,7 @@ int p2m_teardown(struct domain *d) - p2m_free_page(p2m->domain, pg); - count++; - /* Arbitrarily preempt every 512 iterations */ -- if ( !(count % 512) && hypercall_preempt_check() ) -+ if ( allow_preemption && !(count % 512) && hypercall_preempt_check() ) - { - rc = -ERESTART; - break; -@@ -1715,7 +1718,20 @@ void p2m_final_teardown(struct domain *d) - if ( !p2m->domain ) - return; - -+ /* -+ * No need to call relinquish_p2m_mapping() here because -+ * p2m_final_teardown() is called either after domain_relinquish_resources() -+ * where relinquish_p2m_mapping() has been called, or from failure path of -+ * domain_create()/arch_domain_create() where mappings that require -+ * p2m_put_l3_page() should never be created. For the latter case, also see -+ * comment on top of the p2m_set_entry() for more info. -+ */ -+ -+ BUG_ON(p2m_teardown(d, false)); - ASSERT(page_list_empty(&p2m->pages)); -+ -+ while ( p2m_teardown_allocation(d) == -ERESTART ) -+ continue; /* No preemption support here */ - ASSERT(page_list_empty(&d->arch.paging.p2m_freelist)); - - if ( p2m->root ) -@@ -1782,6 +1798,20 @@ int p2m_init(struct domain *d) - if ( rc ) - return rc; - -+ /* -+ * Hardware using GICv2 needs to create a P2M mapping of 8KB GICv2 area -+ * when the domain is created. Considering the worst case for page -+ * tables and keep a buffer, populate 16 pages to the P2M pages pool here. -+ * For GICv3, the above-mentioned P2M mapping is not necessary, but since -+ * the allocated 16 pages here would not be lost, hence populate these -+ * pages unconditionally. 
-+ */ -+ spin_lock(&d->arch.paging.lock); -+ rc = p2m_set_allocation(d, 16, NULL); -+ spin_unlock(&d->arch.paging.lock); -+ if ( rc ) -+ return rc; -+ - return 0; - } - -diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h -index 18675b234570..ea7ca41d82b2 100644 ---- a/xen/include/asm-arm/p2m.h -+++ b/xen/include/asm-arm/p2m.h -@@ -194,14 +194,18 @@ int p2m_init(struct domain *d); - - /* - * The P2M resources are freed in two parts: -- * - p2m_teardown() will be called when relinquish the resources. It -- * will free large resources (e.g. intermediate page-tables) that -- * requires preemption. -+ * - p2m_teardown() will be called preemptively when relinquish the -+ * resources, in which case it will free large resources (e.g. intermediate -+ * page-tables) that requires preemption. - * - p2m_final_teardown() will be called when domain struct is been - * freed. This *cannot* be preempted and therefore one small - * resources should be freed here. -+ * Note that p2m_final_teardown() will also call p2m_teardown(), to properly -+ * free the P2M when failures happen in the domain creation with P2M pages -+ * already in use. In this case p2m_teardown() is called non-preemptively and -+ * p2m_teardown() will always return 0. - */ --int p2m_teardown(struct domain *d); -+int p2m_teardown(struct domain *d, bool allow_preemption); - void p2m_final_teardown(struct domain *d); - - /* -@@ -266,6 +270,8 @@ mfn_t p2m_get_entry(struct p2m_domain *p2m, gfn_t gfn, - /* - * Direct set a p2m entry: only for use by the P2M code. - * The P2M write lock should be taken. -+ * TODO: Add a check in __p2m_set_entry() to avoid creating a mapping in -+ * arch_domain_create() that requires p2m_put_l3_page() to be called. - */ - int p2m_set_entry(struct p2m_domain *p2m, - gfn_t sgfn, --- -2.37.4 - diff --git a/0070-VMX-correct-error-handling-in-vmx_create_vmcs.patch b/0070-VMX-correct-error-handling-in-vmx_create_vmcs.patch deleted file mode 100644 index 4823c64..0000000 --- a/0070-VMX-correct-error-handling-in-vmx_create_vmcs.patch +++ /dev/null @@ -1,38 +0,0 @@ -From 3885fa42349c3c6f31f0e0eec3b4605dca7fdda9 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Mon, 31 Oct 2022 13:31:26 +0100 -Subject: [PATCH 070/126] VMX: correct error handling in vmx_create_vmcs() - -With the addition of vmx_add_msr() calls to construct_vmcs() there are -now cases where simply freeing the VMCS isn't enough: The MSR bitmap -page as well as one of the MSR area ones (if it's the 2nd vmx_add_msr() -which fails) may also need freeing. Switch to using vmx_destroy_vmcs() -instead. 
- -Fixes: 3bd36952dab6 ("x86/spec-ctrl: Introduce an option to control L1D_FLUSH for HVM HAP guests") -Fixes: 53a570b28569 ("x86/spec-ctrl: Support IBPB-on-entry") -Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Kevin Tian <kevin.tian@intel.com> -master commit: 448d28309f1a966bdc850aff1a637e0b79a03e43 -master date: 2022-10-12 17:57:56 +0200 ---- - xen/arch/x86/hvm/vmx/vmcs.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c -index dd817cee4e69..237b13459d4f 100644 ---- a/xen/arch/x86/hvm/vmx/vmcs.c -+++ b/xen/arch/x86/hvm/vmx/vmcs.c -@@ -1831,7 +1831,7 @@ int vmx_create_vmcs(struct vcpu *v) - - if ( (rc = construct_vmcs(v)) != 0 ) - { -- vmx_free_vmcs(vmx->vmcs_pa); -+ vmx_destroy_vmcs(v); - return rc; - } - --- -2.37.4 - diff --git a/0071-argo-Remove-reachable-ASSERT_UNREACHABLE.patch b/0071-argo-Remove-reachable-ASSERT_UNREACHABLE.patch deleted file mode 100644 index d1563bd..0000000 --- a/0071-argo-Remove-reachable-ASSERT_UNREACHABLE.patch +++ /dev/null @@ -1,41 +0,0 @@ -From 916668baf9252ac30260e3394278a098712c5d34 Mon Sep 17 00:00:00 2001 -From: Jason Andryuk <jandryuk@gmail.com> -Date: Mon, 31 Oct 2022 13:32:59 +0100 -Subject: [PATCH 071/126] argo: Remove reachable ASSERT_UNREACHABLE - -I observed this ASSERT_UNREACHABLE in partner_rings_remove consistently -trip. It was in OpenXT with the viptables patch applied. - -dom10 shuts down. -dom7 is REJECTED sending to dom10. -dom7 shuts down and this ASSERT trips for dom10. - -The argo_send_info has a domid, but there is no refcount taken on -the domain. Therefore it's not appropriate to ASSERT that the domain -can be looked up via domid. Replace with a debug message. - -Signed-off-by: Jason Andryuk <jandryuk@gmail.com> -Reviewed-by: Christopher Clark <christopher.w.clark@gmail.com> -master commit: 197f612b77c5afe04e60df2100a855370d720ad7 -master date: 2022-10-14 14:45:41 +0100 ---- - xen/common/argo.c | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/xen/common/argo.c b/xen/common/argo.c -index 49be715f638e..2b0d980d4bba 100644 ---- a/xen/common/argo.c -+++ b/xen/common/argo.c -@@ -1299,7 +1299,8 @@ partner_rings_remove(struct domain *src_d) - ASSERT_UNREACHABLE(); - } - else -- ASSERT_UNREACHABLE(); -+ argo_dprintk("%pd has entry for stale partner d%u\n", -+ src_d, send_info->id.domain_id); - - if ( dst_d ) - rcu_unlock_domain(dst_d); --- -2.37.4 - diff --git a/0072-EFI-don-t-convert-memory-marked-for-runtime-use-to-o.patch b/0072-EFI-don-t-convert-memory-marked-for-runtime-use-to-o.patch deleted file mode 100644 index 7993482..0000000 --- a/0072-EFI-don-t-convert-memory-marked-for-runtime-use-to-o.patch +++ /dev/null @@ -1,64 +0,0 @@ -From b833014293f3fa5a7c48756ce0c8c9f3e4a666ff Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Mon, 31 Oct 2022 13:33:29 +0100 -Subject: [PATCH 072/126] EFI: don't convert memory marked for runtime use to - ordinary RAM -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -efi_init_memory() in both relevant places is treating EFI_MEMORY_RUNTIME -higher priority than the type of the range. To avoid accessing memory at -runtime which was re-used for other purposes, make -efi_arch_process_memory_map() follow suit. 
While in theory the same would -apply to EfiACPIReclaimMemory, we don't actually "reclaim" or clobber -that memory (converted to E820_ACPI on x86) there (and it would be a bug -if the Dom0 kernel tried to reclaim the range, bypassing Xen's memory -management, plus it would be at least bogus if it clobbered that space), -hence that type's handling can be left alone. - -Fixes: bf6501a62e80 ("x86-64: EFI boot code") -Fixes: facac0af87ef ("x86-64: EFI runtime code") -Fixes: 6d70ea10d49f ("Add ARM EFI boot support") -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -master commit: f324300c8347b6aa6f9c0b18e0a90bbf44011a9a -master date: 2022-10-21 12:30:24 +0200 ---- - xen/arch/arm/efi/efi-boot.h | 3 ++- - xen/arch/x86/efi/efi-boot.h | 4 +++- - 2 files changed, 5 insertions(+), 2 deletions(-) - -diff --git a/xen/arch/arm/efi/efi-boot.h b/xen/arch/arm/efi/efi-boot.h -index cf9c37153fea..37d7ebd59ae2 100644 ---- a/xen/arch/arm/efi/efi-boot.h -+++ b/xen/arch/arm/efi/efi-boot.h -@@ -149,7 +149,8 @@ static EFI_STATUS __init efi_process_memory_map_bootinfo(EFI_MEMORY_DESCRIPTOR * - - for ( Index = 0; Index < (mmap_size / desc_size); Index++ ) - { -- if ( desc_ptr->Attribute & EFI_MEMORY_WB && -+ if ( !(desc_ptr->Attribute & EFI_MEMORY_RUNTIME) && -+ (desc_ptr->Attribute & EFI_MEMORY_WB) && - (desc_ptr->Type == EfiConventionalMemory || - desc_ptr->Type == EfiLoaderCode || - desc_ptr->Type == EfiLoaderData || -diff --git a/xen/arch/x86/efi/efi-boot.h b/xen/arch/x86/efi/efi-boot.h -index 84fd77931456..3c3b3ab936f4 100644 ---- a/xen/arch/x86/efi/efi-boot.h -+++ b/xen/arch/x86/efi/efi-boot.h -@@ -183,7 +183,9 @@ static void __init efi_arch_process_memory_map(EFI_SYSTEM_TABLE *SystemTable, - /* fall through */ - case EfiLoaderCode: - case EfiLoaderData: -- if ( desc->Attribute & EFI_MEMORY_WB ) -+ if ( desc->Attribute & EFI_MEMORY_RUNTIME ) -+ type = E820_RESERVED; -+ else if ( desc->Attribute & EFI_MEMORY_WB ) - type = E820_RAM; - else - case EfiUnusableMemory: --- -2.37.4 - diff --git a/0073-xen-sched-fix-race-in-RTDS-scheduler.patch b/0073-xen-sched-fix-race-in-RTDS-scheduler.patch deleted file mode 100644 index bb456ca..0000000 --- a/0073-xen-sched-fix-race-in-RTDS-scheduler.patch +++ /dev/null @@ -1,42 +0,0 @@ -From 1f679f084fef76810762ee69a584fc1b524be0b6 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Mon, 31 Oct 2022 13:33:59 +0100 -Subject: [PATCH 073/126] xen/sched: fix race in RTDS scheduler - -When a domain gets paused the unit runnable state can change to "not -runnable" without the scheduling lock being involved. This means that -a specific scheduler isn't involved in this change of runnable state. - -In the RTDS scheduler this can result in an inconsistency in case a -unit is losing its "runnable" capability while the RTDS scheduler's -scheduling function is active. RTDS will remove the unit from the run -queue, but doesn't do so for the replenish queue, leading to hitting -an ASSERT() in replq_insert() later when the domain is unpaused again. - -Fix that by removing the unit from the replenish queue as well in this -case. 
- -Fixes: 7c7b407e7772 ("xen/sched: introduce unit_runnable_state()") -Signed-off-by: Juergen Gross <jgross@suse.com> -Acked-by: Dario Faggioli <dfaggioli@suse.com> -master commit: 73c62927f64ecb48f27d06176befdf76b879f340 -master date: 2022-10-21 12:32:23 +0200 ---- - xen/common/sched/rt.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/xen/common/sched/rt.c b/xen/common/sched/rt.c -index c24cd2ac3200..ec2ca1bebc26 100644 ---- a/xen/common/sched/rt.c -+++ b/xen/common/sched/rt.c -@@ -1087,6 +1087,7 @@ rt_schedule(const struct scheduler *ops, struct sched_unit *currunit, - else if ( !unit_runnable_state(snext->unit) ) - { - q_remove(snext); -+ replq_remove(ops, snext); - snext = rt_unit(sched_idle_unit(sched_cpu)); - } - --- -2.37.4 - diff --git a/0074-xen-sched-fix-restore_vcpu_affinity-by-removing-it.patch b/0074-xen-sched-fix-restore_vcpu_affinity-by-removing-it.patch deleted file mode 100644 index 9085f67..0000000 --- a/0074-xen-sched-fix-restore_vcpu_affinity-by-removing-it.patch +++ /dev/null @@ -1,158 +0,0 @@ -From 9c5114696c6f7773b7f3691f27aaa7a0636c916d Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Mon, 31 Oct 2022 13:34:28 +0100 -Subject: [PATCH 074/126] xen/sched: fix restore_vcpu_affinity() by removing it -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -When the system is coming up after having been suspended, -restore_vcpu_affinity() is called for each domain in order to adjust -the vcpu's affinity settings in case a cpu didn't come to live again. - -The way restore_vcpu_affinity() is doing that is wrong, because the -specific scheduler isn't being informed about a possible migration of -the vcpu to another cpu. Additionally the migration is often even -happening if all cpus are running again, as it is done without check -whether it is really needed. - -As cpupool management is already calling cpu_disable_scheduler() for -cpus not having come up again, and cpu_disable_scheduler() is taking -care of eventually needed vcpu migration in the proper way, there is -simply no need for restore_vcpu_affinity(). - -So just remove restore_vcpu_affinity() completely, together with the -no longer used sched_reset_affinity_broken(). 
- -Fixes: 8a04eaa8ea83 ("xen/sched: move some per-vcpu items to struct sched_unit") -Reported-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> -Signed-off-by: Juergen Gross <jgross@suse.com> -Acked-by: Dario Faggioli <dfaggioli@suse.com> -Tested-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> -master commit: fce1f381f7388daaa3e96dbb0d67d7a3e4bb2d2d -master date: 2022-10-24 11:16:27 +0100 ---- - xen/arch/x86/acpi/power.c | 3 -- - xen/common/sched/core.c | 78 --------------------------------------- - xen/include/xen/sched.h | 1 - - 3 files changed, 82 deletions(-) - -diff --git a/xen/arch/x86/acpi/power.c b/xen/arch/x86/acpi/power.c -index dd397f713067..1a7baeebe6d0 100644 ---- a/xen/arch/x86/acpi/power.c -+++ b/xen/arch/x86/acpi/power.c -@@ -159,10 +159,7 @@ static void thaw_domains(void) - - rcu_read_lock(&domlist_read_lock); - for_each_domain ( d ) -- { -- restore_vcpu_affinity(d); - domain_unpause(d); -- } - rcu_read_unlock(&domlist_read_lock); - } - -diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c -index 900aab8f66a7..9173cf690c72 100644 ---- a/xen/common/sched/core.c -+++ b/xen/common/sched/core.c -@@ -1188,84 +1188,6 @@ static bool sched_check_affinity_broken(const struct sched_unit *unit) - return false; - } - --static void sched_reset_affinity_broken(const struct sched_unit *unit) --{ -- struct vcpu *v; -- -- for_each_sched_unit_vcpu ( unit, v ) -- v->affinity_broken = false; --} -- --void restore_vcpu_affinity(struct domain *d) --{ -- unsigned int cpu = smp_processor_id(); -- struct sched_unit *unit; -- -- ASSERT(system_state == SYS_STATE_resume); -- -- rcu_read_lock(&sched_res_rculock); -- -- for_each_sched_unit ( d, unit ) -- { -- spinlock_t *lock; -- unsigned int old_cpu = sched_unit_master(unit); -- struct sched_resource *res; -- -- ASSERT(!unit_runnable(unit)); -- -- /* -- * Re-assign the initial processor as after resume we have no -- * guarantee the old processor has come back to life again. -- * -- * Therefore, here, before actually unpausing the domains, we should -- * set v->processor of each of their vCPUs to something that will -- * make sense for the scheduler of the cpupool in which they are in. -- */ -- lock = unit_schedule_lock_irq(unit); -- -- cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, -- cpupool_domain_master_cpumask(d)); -- if ( cpumask_empty(cpumask_scratch_cpu(cpu)) ) -- { -- if ( sched_check_affinity_broken(unit) ) -- { -- sched_set_affinity(unit, unit->cpu_hard_affinity_saved, NULL); -- sched_reset_affinity_broken(unit); -- cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, -- cpupool_domain_master_cpumask(d)); -- } -- -- if ( cpumask_empty(cpumask_scratch_cpu(cpu)) ) -- { -- /* Affinity settings of one vcpu are for the complete unit. */ -- printk(XENLOG_DEBUG "Breaking affinity for %pv\n", -- unit->vcpu_list); -- sched_set_affinity(unit, &cpumask_all, NULL); -- cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, -- cpupool_domain_master_cpumask(d)); -- } -- } -- -- res = get_sched_res(cpumask_any(cpumask_scratch_cpu(cpu))); -- sched_set_res(unit, res); -- -- spin_unlock_irq(lock); -- -- /* v->processor might have changed, so reacquire the lock. 
*/ -- lock = unit_schedule_lock_irq(unit); -- res = sched_pick_resource(unit_scheduler(unit), unit); -- sched_set_res(unit, res); -- spin_unlock_irq(lock); -- -- if ( old_cpu != sched_unit_master(unit) ) -- sched_move_irqs(unit); -- } -- -- rcu_read_unlock(&sched_res_rculock); -- -- domain_update_node_affinity(d); --} -- - /* - * This function is used by cpu_hotplug code via cpu notifier chain - * and from cpupools to switch schedulers on a cpu. -diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h -index 4e25627d9685..bb05d167ae0f 100644 ---- a/xen/include/xen/sched.h -+++ b/xen/include/xen/sched.h -@@ -993,7 +993,6 @@ void vcpu_set_periodic_timer(struct vcpu *v, s_time_t value); - void sched_setup_dom0_vcpus(struct domain *d); - int vcpu_temporary_affinity(struct vcpu *v, unsigned int cpu, uint8_t reason); - int vcpu_set_hard_affinity(struct vcpu *v, const cpumask_t *affinity); --void restore_vcpu_affinity(struct domain *d); - int vcpu_affinity_domctl(struct domain *d, uint32_t cmd, - struct xen_domctl_vcpuaffinity *vcpuaff); - --- -2.37.4 - diff --git a/0075-x86-shadow-drop-replace-bogus-assertions.patch b/0075-x86-shadow-drop-replace-bogus-assertions.patch deleted file mode 100644 index 183dc68..0000000 --- a/0075-x86-shadow-drop-replace-bogus-assertions.patch +++ /dev/null @@ -1,71 +0,0 @@ -From 08bc78b4eecaef33250038f7e484bdf01ea1017c Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Mon, 31 Oct 2022 13:35:06 +0100 -Subject: [PATCH 075/126] x86/shadow: drop (replace) bogus assertions -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The addition of a call to shadow_blow_tables() from shadow_teardown() -has resulted in the "no vcpus" related assertion becoming triggerable: -If domain_create() fails with at least one page successfully allocated -in the course of shadow_enable(), or if domain_create() succeeds and -the domain is then killed without ever invoking XEN_DOMCTL_max_vcpus. -Note that in-tree tests (test-resource and test-tsx) do exactly the -latter of these two. - -The assertion's comment was bogus anyway: Shadow mode has been getting -enabled before allocation of vCPU-s for quite some time. Convert the -assertion to a conditional: As long as there are no vCPU-s, there's -nothing to blow away. - -Fixes: e7aa55c0aab3 ("x86/p2m: free the paging memory pool preemptively") -Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> - -A similar assertion/comment pair exists in _shadow_prealloc(); the -comment is similarly bogus, and the assertion could in principle trigger -e.g. when shadow_alloc_p2m_page() is called early enough. Replace those -at the same time by a similar early return, here indicating failure to -the caller (which will generally lead to the domain being crashed in -shadow_prealloc()). 
- -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Roger Pau Monné <roger.pau@citrix.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: a92dc2bb30ba65ae25d2f417677eb7ef9a6a0fef -master date: 2022-10-24 15:46:11 +0200 ---- - xen/arch/x86/mm/shadow/common.c | 10 ++++++---- - 1 file changed, 6 insertions(+), 4 deletions(-) - -diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c -index 8f7fddcee1e5..e36d49d1fcba 100644 ---- a/xen/arch/x86/mm/shadow/common.c -+++ b/xen/arch/x86/mm/shadow/common.c -@@ -942,8 +942,9 @@ static bool __must_check _shadow_prealloc(struct domain *d, unsigned int pages) - /* No reclaim when the domain is dying, teardown will take care of it. */ - return false; - -- /* Shouldn't have enabled shadows if we've no vcpus. */ -- ASSERT(d->vcpu && d->vcpu[0]); -+ /* Nothing to reclaim when there are no vcpus yet. */ -+ if ( !d->vcpu[0] ) -+ return false; - - /* Stage one: walk the list of pinned pages, unpinning them */ - perfc_incr(shadow_prealloc_1); -@@ -1033,8 +1034,9 @@ void shadow_blow_tables(struct domain *d) - mfn_t smfn; - int i; - -- /* Shouldn't have enabled shadows if we've no vcpus. */ -- ASSERT(d->vcpu && d->vcpu[0]); -+ /* Nothing to do when there are no vcpus yet. */ -+ if ( !d->vcpu[0] ) -+ return; - - /* Pass one: unpin all pinned pages */ - foreach_pinned_shadow(d, sp, t) --- -2.37.4 - diff --git a/0076-vpci-don-t-assume-that-vpci-per-device-data-exists-u.patch b/0076-vpci-don-t-assume-that-vpci-per-device-data-exists-u.patch deleted file mode 100644 index 0350771..0000000 --- a/0076-vpci-don-t-assume-that-vpci-per-device-data-exists-u.patch +++ /dev/null @@ -1,61 +0,0 @@ -From 6b035f4f5829eb213cb9fcbe83b5dfae05c857a6 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Mon, 31 Oct 2022 13:35:33 +0100 -Subject: [PATCH 076/126] vpci: don't assume that vpci per-device data exists - unconditionally -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -It's possible for a device to be assigned to a domain but have no -vpci structure if vpci_process_pending() failed and called -vpci_remove_device() as a result. The unconditional accesses done by -vpci_{read,write}() and vpci_remove_device() to pdev->vpci would -then trigger a NULL pointer dereference. - -Add checks for pdev->vpci presence in the affected functions. - -Fixes: 9c244fdef7 ('vpci: add header handlers') -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: 6ccb5e308ceeb895fbccd87a528a8bd24325aa39 -master date: 2022-10-26 14:55:30 +0200 ---- - xen/drivers/vpci/vpci.c | 7 +++++-- - 1 file changed, 5 insertions(+), 2 deletions(-) - -diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c -index a27c9e600df1..6b90e4fa32dc 100644 ---- a/xen/drivers/vpci/vpci.c -+++ b/xen/drivers/vpci/vpci.c -@@ -37,6 +37,9 @@ extern vpci_register_init_t *const __end_vpci_array[]; - - void vpci_remove_device(struct pci_dev *pdev) - { -+ if ( !pdev->vpci ) -+ return; -+ - spin_lock(&pdev->vpci->lock); - while ( !list_empty(&pdev->vpci->handlers) ) - { -@@ -320,7 +323,7 @@ uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size) - - /* Find the PCI dev matching the address. 
*/ - pdev = pci_get_pdev_by_domain(d, sbdf.seg, sbdf.bus, sbdf.devfn); -- if ( !pdev ) -+ if ( !pdev || !pdev->vpci ) - return vpci_read_hw(sbdf, reg, size); - - spin_lock(&pdev->vpci->lock); -@@ -430,7 +433,7 @@ void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size, - * Passthrough everything that's not trapped. - */ - pdev = pci_get_pdev_by_domain(d, sbdf.seg, sbdf.bus, sbdf.devfn); -- if ( !pdev ) -+ if ( !pdev || !pdev->vpci ) - { - vpci_write_hw(sbdf, reg, size, data); - return; --- -2.37.4 - diff --git a/0077-vpci-msix-remove-from-table-list-on-detach.patch b/0077-vpci-msix-remove-from-table-list-on-detach.patch deleted file mode 100644 index 2e60831..0000000 --- a/0077-vpci-msix-remove-from-table-list-on-detach.patch +++ /dev/null @@ -1,47 +0,0 @@ -From bff4c4457950abb498270d921d728f654876f944 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Mon, 31 Oct 2022 13:35:59 +0100 -Subject: [PATCH 077/126] vpci/msix: remove from table list on detach -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Teardown of MSIX vPCI related data doesn't currently remove the MSIX -device data from the list of MSIX tables handled by the domain, -leading to a use-after-free of the data in the msix structure. - -Remove the structure from the list before freeing in order to solve -it. - -Reported-by: Jan Beulich <jbeulich@suse.com> -Fixes: d6281be9d0 ('vpci/msix: add MSI-X handlers') -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: c14aea137eab29eb9c30bfad745a00c65ad21066 -master date: 2022-10-26 14:56:58 +0200 ---- - xen/drivers/vpci/vpci.c | 8 ++++++-- - 1 file changed, 6 insertions(+), 2 deletions(-) - -diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c -index 6b90e4fa32dc..75edbbee4025 100644 ---- a/xen/drivers/vpci/vpci.c -+++ b/xen/drivers/vpci/vpci.c -@@ -51,8 +51,12 @@ void vpci_remove_device(struct pci_dev *pdev) - xfree(r); - } - spin_unlock(&pdev->vpci->lock); -- if ( pdev->vpci->msix && pdev->vpci->msix->pba ) -- iounmap(pdev->vpci->msix->pba); -+ if ( pdev->vpci->msix ) -+ { -+ list_del(&pdev->vpci->msix->next); -+ if ( pdev->vpci->msix->pba ) -+ iounmap(pdev->vpci->msix->pba); -+ } - xfree(pdev->vpci->msix); - xfree(pdev->vpci->msi); - xfree(pdev->vpci); --- -2.37.4 - diff --git a/0078-x86-also-zap-secondary-time-area-handles-during-soft.patch b/0078-x86-also-zap-secondary-time-area-handles-during-soft.patch deleted file mode 100644 index e3db6ad..0000000 --- a/0078-x86-also-zap-secondary-time-area-handles-during-soft.patch +++ /dev/null @@ -1,49 +0,0 @@ -From 9b8b65c827169eca2d0e500150009ac0f857d455 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Mon, 31 Oct 2022 13:36:25 +0100 -Subject: [PATCH 078/126] x86: also zap secondary time area handles during soft - reset -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Just like domain_soft_reset() properly zaps runstate area handles, the -secondary time area ones also need discarding to prevent guest memory -corruption once the guest is re-started. 
- -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> -master commit: b80d4f8d2ea6418e32fb4f20d1304ace6d6566e3 -master date: 2022-10-27 11:49:09 +0200 ---- - xen/arch/x86/domain.c | 6 ++++++ - 1 file changed, 6 insertions(+) - -diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c -index ce6ddcf31397..e9b8ed4c96c2 100644 ---- a/xen/arch/x86/domain.c -+++ b/xen/arch/x86/domain.c -@@ -927,6 +927,7 @@ int arch_domain_soft_reset(struct domain *d) - struct page_info *page = virt_to_page(d->shared_info), *new_page; - int ret = 0; - struct domain *owner; -+ struct vcpu *v; - mfn_t mfn; - gfn_t gfn; - p2m_type_t p2mt; -@@ -1006,7 +1007,12 @@ int arch_domain_soft_reset(struct domain *d) - "Failed to add a page to replace %pd's shared_info frame %"PRI_gfn"\n", - d, gfn_x(gfn)); - free_domheap_page(new_page); -+ goto exit_put_gfn; - } -+ -+ for_each_vcpu ( d, v ) -+ set_xen_guest_handle(v->arch.time_info_guest, NULL); -+ - exit_put_gfn: - put_gfn(d, gfn_x(gfn)); - exit_put_page: --- -2.37.4 - diff --git a/0079-common-map_vcpu_info-wants-to-unshare-the-underlying.patch b/0079-common-map_vcpu_info-wants-to-unshare-the-underlying.patch deleted file mode 100644 index 2944a80..0000000 --- a/0079-common-map_vcpu_info-wants-to-unshare-the-underlying.patch +++ /dev/null @@ -1,41 +0,0 @@ -From 317894fa6a067a7903199bc5c1e3e06a0436caf8 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Mon, 31 Oct 2022 13:36:50 +0100 -Subject: [PATCH 079/126] common: map_vcpu_info() wants to unshare the - underlying page -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Not passing P2M_UNSHARE to get_page_from_gfn() means there won't even be -an attempt to unshare the referenced page, without any indication to the -caller (e.g. -EAGAIN). Note that guests have no direct control over -which of their pages are shared (or paged out), and hence they have no -way to make sure all on their own that the subsequent obtaining of a -writable type reference can actually succeed. 
- -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> -Acked-by: Julien Grall <jgrall@amazon.com> -master commit: 48980cf24d5cf41fd644600f99c753419505e735 -master date: 2022-10-28 11:38:32 +0200 ---- - xen/common/domain.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/xen/common/domain.c b/xen/common/domain.c -index 17cc32fde373..0fb7f9a6225c 100644 ---- a/xen/common/domain.c -+++ b/xen/common/domain.c -@@ -1454,7 +1454,7 @@ int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned offset) - if ( (v != current) && !(v->pause_flags & VPF_down) ) - return -EINVAL; - -- page = get_page_from_gfn(d, gfn, NULL, P2M_ALLOC); -+ page = get_page_from_gfn(d, gfn, NULL, P2M_UNSHARE); - if ( !page ) - return -EINVAL; - --- -2.37.4 - diff --git a/0080-x86-pv-shim-correctly-ignore-empty-onlining-requests.patch b/0080-x86-pv-shim-correctly-ignore-empty-onlining-requests.patch deleted file mode 100644 index 31aa812..0000000 --- a/0080-x86-pv-shim-correctly-ignore-empty-onlining-requests.patch +++ /dev/null @@ -1,43 +0,0 @@ -From a46f01fad17173afe3809ac1980cbe4b67a9a8b5 Mon Sep 17 00:00:00 2001 -From: Igor Druzhinin <igor.druzhinin@citrix.com> -Date: Mon, 31 Oct 2022 13:37:17 +0100 -Subject: [PATCH 080/126] x86/pv-shim: correctly ignore empty onlining requests -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Mem-op requests may have zero extents. Such requests need treating as -no-ops. pv_shim_online_memory(), however, would have tried to take 2³²-1 -order-sized pages from its balloon list (to then populate them), -typically ending when the entire set of ballooned pages of this order -was consumed. - -Note that pv_shim_offline_memory() does not have such an issue. - -Fixes: b2245acc60c3 ("xen/pvshim: memory hotplug") -Signed-off-by: Igor Druzhinin <igor.druzhinin@citrix.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: 9272225ca72801fd9fa5b268a2d1c5adebd19cd9 -master date: 2022-10-28 15:47:59 +0200 ---- - xen/arch/x86/pv/shim.c | 3 +++ - 1 file changed, 3 insertions(+) - -diff --git a/xen/arch/x86/pv/shim.c b/xen/arch/x86/pv/shim.c -index b4e83e077891..104357e2c398 100644 ---- a/xen/arch/x86/pv/shim.c -+++ b/xen/arch/x86/pv/shim.c -@@ -922,6 +922,9 @@ void pv_shim_online_memory(unsigned int nr, unsigned int order) - struct page_info *page, *tmp; - PAGE_LIST_HEAD(list); - -+ if ( !nr ) -+ return; -+ - spin_lock(&balloon_lock); - page_list_for_each_safe ( page, tmp, &balloon ) - { --- -2.37.4 - diff --git a/0081-x86-pv-shim-correct-ballooning-up-for-compat-guests.patch b/0081-x86-pv-shim-correct-ballooning-up-for-compat-guests.patch deleted file mode 100644 index cd97334..0000000 --- a/0081-x86-pv-shim-correct-ballooning-up-for-compat-guests.patch +++ /dev/null @@ -1,55 +0,0 @@ -From b68e3fda8a76fb3ab582b5633727ac5545e4e8b9 Mon Sep 17 00:00:00 2001 -From: Igor Druzhinin <igor.druzhinin@citrix.com> -Date: Mon, 31 Oct 2022 13:37:42 +0100 -Subject: [PATCH 081/126] x86/pv-shim: correct ballooning up for compat guests - -The compat layer for multi-extent memory ops may need to split incoming -requests. Since the guest handles in the interface structures may not be -altered, it does so by leveraging do_memory_op()'s continuation -handling: It hands on non-initial requests with a non-zero start extent, -with the (native) handle suitably adjusted down. 
As a result -do_memory_op() sees only the first of potentially several requests with -start extent being zero. It's only that case when the function would -issue a call to pv_shim_online_memory(), yet the range then covers only -the first sub-range that results from the split. - -Address that breakage by making a complementary call to -pv_shim_online_memory() in compat layer. - -Fixes: b2245acc60c3 ("xen/pvshim: memory hotplug") -Signed-off-by: Igor Druzhinin <igor.druzhinin@citrix.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: a0bfdd201ea12aa5679bb8944d63a4e0d3c23160 -master date: 2022-10-28 15:48:50 +0200 ---- - xen/common/compat/memory.c | 6 +++++- - 1 file changed, 5 insertions(+), 1 deletion(-) - -diff --git a/xen/common/compat/memory.c b/xen/common/compat/memory.c -index c43fa97cf15f..a0e0562a4033 100644 ---- a/xen/common/compat/memory.c -+++ b/xen/common/compat/memory.c -@@ -7,6 +7,7 @@ EMIT_FILE; - #include <xen/event.h> - #include <xen/mem_access.h> - #include <asm/current.h> -+#include <asm/guest.h> - #include <compat/memory.h> - - #define xen_domid_t domid_t -@@ -146,7 +147,10 @@ int compat_memory_op(unsigned int cmd, XEN_GUEST_HANDLE_PARAM(void) compat) - nat.rsrv->nr_extents = end_extent; - ++split; - } -- -+ /* Avoid calling pv_shim_online_memory() when in a continuation. */ -+ if ( pv_shim && op != XENMEM_decrease_reservation && !start_extent ) -+ pv_shim_online_memory(cmp.rsrv.nr_extents - nat.rsrv->nr_extents, -+ cmp.rsrv.extent_order); - break; - - case XENMEM_exchange: --- -2.37.4 - diff --git a/0082-x86-pv-shim-correct-ballooning-down-for-compat-guest.patch b/0082-x86-pv-shim-correct-ballooning-down-for-compat-guest.patch deleted file mode 100644 index a6d895f..0000000 --- a/0082-x86-pv-shim-correct-ballooning-down-for-compat-guest.patch +++ /dev/null @@ -1,73 +0,0 @@ -From ddab5b1e001366258c0bfc7d5995b9d548e6042b Mon Sep 17 00:00:00 2001 -From: Igor Druzhinin <igor.druzhinin@citrix.com> -Date: Mon, 31 Oct 2022 13:38:05 +0100 -Subject: [PATCH 082/126] x86/pv-shim: correct ballooning down for compat - guests - -The compat layer for multi-extent memory ops may need to split incoming -requests. Since the guest handles in the interface structures may not be -altered, it does so by leveraging do_memory_op()'s continuation -handling: It hands on non-initial requests with a non-zero start extent, -with the (native) handle suitably adjusted down. As a result -do_memory_op() sees only the first of potentially several requests with -start extent being zero. In order to be usable as overall result, the -function accumulates args.nr_done, i.e. it initialized the field with -the start extent. Therefore non-initial requests resulting from the -split would pass too large a number into pv_shim_offline_memory(). - -Address that breakage by always calling pv_shim_offline_memory() -regardless of current hypercall preemption status, with a suitably -adjusted first argument. Note that this is correct also for the native -guest case: We now simply "commit" what was completed right away, rather -than at the end of a series of preemption/re-start cycles. In fact this -improves overall preemption behavior: There's no longer a potentially -big chunk of work done non-preemptively at the end of the last -"iteration". 
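A worked example with made-up numbers may help: suppose a compat request of 1024 extents is split by the compat layer into two chunks of 512. On the second chunk start_extent is 512 and the cumulative nr_done ends at 1024, so passing nr_done alone would offline the first 512 extents twice; passing the difference charges only the work done in this invocation.

#include <stdio.h>

int main(void)
{
    unsigned int start_extent = 512;  /* second chunk of the split request */
    unsigned int nr_done = 1024;      /* cumulative count kept by do_memory_op() */

    printf("old argument: %u extents\n", nr_done);                 /* 1024: double-counts */
    printf("new argument: %u extents\n", nr_done - start_extent);  /* 512: correct */
    return 0;
}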
- -Fixes: b2245acc60c3 ("xen/pvshim: memory hotplug") -Signed-off-by: Igor Druzhinin <igor.druzhinin@citrix.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: 1d7fbc535d1d37bdc2cc53ede360b0f6651f7de1 -master date: 2022-10-28 15:49:33 +0200 ---- - xen/common/memory.c | 19 +++++++------------ - 1 file changed, 7 insertions(+), 12 deletions(-) - -diff --git a/xen/common/memory.c b/xen/common/memory.c -index 95b2b934e4a2..a958d94ac3cd 100644 ---- a/xen/common/memory.c -+++ b/xen/common/memory.c -@@ -1407,22 +1407,17 @@ long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg) - - rc = args.nr_done; - -- if ( args.preempted ) -- return hypercall_create_continuation( -- __HYPERVISOR_memory_op, "lh", -- op | (rc << MEMOP_EXTENT_SHIFT), arg); -- - #ifdef CONFIG_X86 - if ( pv_shim && op == XENMEM_decrease_reservation ) -- /* -- * Only call pv_shim_offline_memory when the hypercall has -- * finished. Note that nr_done is used to cope in case the -- * hypercall has failed and only part of the extents where -- * processed. -- */ -- pv_shim_offline_memory(args.nr_done, args.extent_order); -+ pv_shim_offline_memory(args.nr_done - start_extent, -+ args.extent_order); - #endif - -+ if ( args.preempted ) -+ return hypercall_create_continuation( -+ __HYPERVISOR_memory_op, "lh", -+ op | (rc << MEMOP_EXTENT_SHIFT), arg); -+ - break; - - case XENMEM_exchange: --- -2.37.4 - diff --git a/0083-tools-xenstore-create_node-Don-t-defer-work-to-undo-.patch b/0083-tools-xenstore-create_node-Don-t-defer-work-to-undo-.patch deleted file mode 100644 index 5204b3f..0000000 --- a/0083-tools-xenstore-create_node-Don-t-defer-work-to-undo-.patch +++ /dev/null @@ -1,120 +0,0 @@ -From ee03d9b56e6141422b4ef2444f93cf2e88e6a26c Mon Sep 17 00:00:00 2001 -From: Julien Grall <jgrall@amazon.com> -Date: Tue, 13 Sep 2022 07:35:06 +0200 -Subject: [PATCH 083/126] tools/xenstore: create_node: Don't defer work to undo - any changes on failure - -XSA-115 extended destroy_node() to update the node accounting for the -connection. The implementation is assuming the connection is the parent -of the node, however all the nodes are allocated using a separate context -(see process_message()). This will result to crash (or corrupt) xenstored -as the pointer is wrongly used. - -In case of an error, any changes to the database or update to the -accounting will now be reverted in create_node() by calling directly -destroy_node(). This has the nice advantage to remove the loop to unset -the destructors in case of success. - -Take the opportunity to free the nodes right now as they are not -going to be reachable (the function returns NULL) and are just wasting -resources. - -This is XSA-414 / CVE-2022-42309. 
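In outline, the accounting problem above is that the node is allocated on a per-request context, so walking back to its allocation parent does not reach the connection whose quota has to be adjusted. A toy sketch of the corrected shape (simplified types, not the real xenstored structures shown in the hunk below):

struct connection { int nbentry; };   /* toy quota counter */

static void domain_entry_dec(struct connection *conn)
{
    conn->nbentry--;
}

/* After the fix the caller hands the connection in explicitly,
 * instead of guessing it from the node's allocation parent. */
static void destroy_node_sketch(struct connection *conn)
{
    /* ... delete the node from the database ... */
    domain_entry_dec(conn);
}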
- -Fixes: 0bfb2101f243 ("tools/xenstore: fix node accounting after failed node creation") -Signed-off-by: Julien Grall <jgrall@amazon.com> -Reviewed-by: Juergen Gross <jgross@suse.com> -(cherry picked from commit 1cd3cc7ea27cda7640a8d895e09617b61c265697) ---- - tools/xenstore/xenstored_core.c | 47 ++++++++++++++++++++++----------- - 1 file changed, 32 insertions(+), 15 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 9172dd767140..a00c49e404a1 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -1054,9 +1054,8 @@ nomem: - return NULL; - } - --static int destroy_node(void *_node) -+static int destroy_node(struct connection *conn, struct node *node) - { -- struct node *node = _node; - TDB_DATA key; - - if (streq(node->name, "/")) -@@ -1065,7 +1064,7 @@ static int destroy_node(void *_node) - set_tdb_key(node->name, &key); - tdb_delete(tdb_ctx, key); - -- domain_entry_dec(talloc_parent(node), node); -+ domain_entry_dec(conn, node); - - return 0; - } -@@ -1074,7 +1073,8 @@ static struct node *create_node(struct connection *conn, const void *ctx, - const char *name, - void *data, unsigned int datalen) - { -- struct node *node, *i; -+ struct node *node, *i, *j; -+ int ret; - - node = construct_node(conn, ctx, name); - if (!node) -@@ -1096,23 +1096,40 @@ static struct node *create_node(struct connection *conn, const void *ctx, - /* i->parent is set for each new node, so check quota. */ - if (i->parent && - domain_entry(conn) >= quota_nb_entry_per_domain) { -- errno = ENOSPC; -- return NULL; -+ ret = ENOSPC; -+ goto err; - } -- if (write_node(conn, i, false)) -- return NULL; - -- /* Account for new node, set destructor for error case. */ -- if (i->parent) { -+ ret = write_node(conn, i, false); -+ if (ret) -+ goto err; -+ -+ /* Account for new node */ -+ if (i->parent) - domain_entry_inc(conn, i); -- talloc_set_destructor(i, destroy_node); -- } - } - -- /* OK, now remove destructors so they stay around */ -- for (i = node; i->parent; i = i->parent) -- talloc_set_destructor(i, NULL); - return node; -+ -+err: -+ /* -+ * We failed to update TDB for some of the nodes. Undo any work that -+ * have already been done. -+ */ -+ for (j = node; j != i; j = j->parent) -+ destroy_node(conn, j); -+ -+ /* We don't need to keep the nodes around, so free them. */ -+ i = node; -+ while (i) { -+ j = i; -+ i = i->parent; -+ talloc_free(j); -+ } -+ -+ errno = ret; -+ -+ return NULL; - } - - /* path, data... */ --- -2.37.4 - diff --git a/0084-tools-xenstore-Fail-a-transaction-if-it-is-not-possi.patch b/0084-tools-xenstore-Fail-a-transaction-if-it-is-not-possi.patch deleted file mode 100644 index 05936ea..0000000 --- a/0084-tools-xenstore-Fail-a-transaction-if-it-is-not-possi.patch +++ /dev/null @@ -1,145 +0,0 @@ -From 579e7334b909c22efc65c5df22e8afe414882154 Mon Sep 17 00:00:00 2001 -From: Julien Grall <jgrall@amazon.com> -Date: Tue, 13 Sep 2022 07:35:06 +0200 -Subject: [PATCH 084/126] tools/xenstore: Fail a transaction if it is not - possible to create a node - -Commit f2bebf72c4d5 "xenstore: rework of transaction handling" moved -out from copying the entire database everytime a new transaction is -opened to track the list of nodes changed. - -The content of all the nodes accessed during a transaction will be -temporarily stored in TDB using a different key. - -The function create_node() may write/update multiple nodes if the child -doesn't exist. 
In case of a failure, the function will revert any -changes (this include any update to TDB). Unfortunately, the function -which reverts the changes (i.e. destroy_node()) will not use the correct -key to delete any update or even request the transaction to fail. - -This means that if a client decide to go ahead with committing the -transaction, orphan nodes will be created because they were not linked -to an existing node (create_node() will write the nodes backwards). - -Once some nodes have been partially updated in a transaction, it is not -easily possible to undo any changes. So rather than continuing and hit -weird issue while committing, it is much saner to fail the transaction. - -This will have an impact on any client that decides to commit even if it -can't write a node. Although, it is not clear why a normal client would -want to do that... - -Lastly, update destroy_node() to use the correct key for deleting the -node. Rather than recreating it (this will allocate memory and -therefore fail), stash the key in the structure node. - -This is XSA-415 / CVE-2022-42310. - -Signed-off-by: Julien Grall <jgrall@amazon.com> -Reviewed-by: Juergen Gross <jgross@suse.com> -(cherry picked from commit 5d71766bd1a4a3a8b2fe952ca2be80e02fe48f34) ---- - tools/xenstore/xenstored_core.c | 23 +++++++++++++++-------- - tools/xenstore/xenstored_core.h | 2 ++ - tools/xenstore/xenstored_transaction.c | 5 +++++ - tools/xenstore/xenstored_transaction.h | 3 +++ - 4 files changed, 25 insertions(+), 8 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index a00c49e404a1..b28c2c66b53b 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -531,15 +531,17 @@ int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, - return 0; - } - -+/* -+ * Write the node. If the node is written, caller can find the key used in -+ * node->key. This can later be used if the change needs to be reverted. -+ */ - static int write_node(struct connection *conn, struct node *node, - bool no_quota_check) - { -- TDB_DATA key; -- -- if (access_node(conn, node, NODE_ACCESS_WRITE, &key)) -+ if (access_node(conn, node, NODE_ACCESS_WRITE, &node->key)) - return errno; - -- return write_node_raw(conn, &key, node, no_quota_check); -+ return write_node_raw(conn, &node->key, node, no_quota_check); - } - - enum xs_perm_type perm_for_conn(struct connection *conn, -@@ -1056,16 +1058,21 @@ nomem: - - static int destroy_node(struct connection *conn, struct node *node) - { -- TDB_DATA key; -- - if (streq(node->name, "/")) - corrupt(NULL, "Destroying root node!"); - -- set_tdb_key(node->name, &key); -- tdb_delete(tdb_ctx, key); -+ tdb_delete(tdb_ctx, node->key); - - domain_entry_dec(conn, node); - -+ /* -+ * It is not possible to easily revert the changes in a transaction. -+ * So if the failure happens in a transaction, mark it as fail to -+ * prevent any commit. 
-+ */ -+ if ( conn->transaction ) -+ fail_transaction(conn->transaction); -+ - return 0; - } - -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index 0c9a0961b57e..900336afa426 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -148,6 +148,8 @@ struct node_perms { - - struct node { - const char *name; -+ /* Key used to update TDB */ -+ TDB_DATA key; - - /* Parent (optional) */ - struct node *parent; -diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c -index cd07fb0f218b..faf6c930e42a 100644 ---- a/tools/xenstore/xenstored_transaction.c -+++ b/tools/xenstore/xenstored_transaction.c -@@ -580,6 +580,11 @@ void transaction_entry_dec(struct transaction *trans, unsigned int domid) - list_add_tail(&d->list, &trans->changed_domains); - } - -+void fail_transaction(struct transaction *trans) -+{ -+ trans->fail = true; -+} -+ - void conn_delete_all_transactions(struct connection *conn) - { - struct transaction *trans; -diff --git a/tools/xenstore/xenstored_transaction.h b/tools/xenstore/xenstored_transaction.h -index 43a162bea3f3..14062730e3c9 100644 ---- a/tools/xenstore/xenstored_transaction.h -+++ b/tools/xenstore/xenstored_transaction.h -@@ -46,6 +46,9 @@ int access_node(struct connection *conn, struct node *node, - int transaction_prepend(struct connection *conn, const char *name, - TDB_DATA *key); - -+/* Mark the transaction as failed. This will prevent it to be committed. */ -+void fail_transaction(struct transaction *trans); -+ - void conn_delete_all_transactions(struct connection *conn); - int check_transactions(struct hashtable *hash); - --- -2.37.4 - diff --git a/0085-tools-xenstore-split-up-send_reply.patch b/0085-tools-xenstore-split-up-send_reply.patch deleted file mode 100644 index 7420f93..0000000 --- a/0085-tools-xenstore-split-up-send_reply.patch +++ /dev/null @@ -1,213 +0,0 @@ -From 0d8bea403d4d1763dddb0c1c81d30efebafb6962 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:07 +0200 -Subject: [PATCH 085/126] tools/xenstore: split up send_reply() - -Today send_reply() is used for both, normal request replies and watch -events. - -Split it up into send_reply() and send_event(). This will be used to -add some event specific handling. - -add_event() can be merged into send_event(), removing the need for an -intermediate memory allocation. - -This is part of XSA-326. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 9bfde319dbac2a1321898d2f75a3f075c3eb7b32) ---- - tools/xenstore/xenstored_core.c | 74 +++++++++++++++++++------------- - tools/xenstore/xenstored_core.h | 1 + - tools/xenstore/xenstored_watch.c | 39 +++-------------- - 3 files changed, 52 insertions(+), 62 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index b28c2c66b53b..01d4a2e440ec 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -733,49 +733,32 @@ static void send_error(struct connection *conn, int error) - void send_reply(struct connection *conn, enum xsd_sockmsg_type type, - const void *data, unsigned int len) - { -- struct buffered_data *bdata; -+ struct buffered_data *bdata = conn->in; -+ -+ assert(type != XS_WATCH_EVENT); - - if ( len > XENSTORE_PAYLOAD_MAX ) { - send_error(conn, E2BIG); - return; - } - -- /* Replies reuse the request buffer, events need a new one. 
*/ -- if (type != XS_WATCH_EVENT) { -- bdata = conn->in; -- /* Drop asynchronous responses, e.g. errors for watch events. */ -- if (!bdata) -- return; -- bdata->inhdr = true; -- bdata->used = 0; -- conn->in = NULL; -- } else { -- /* Message is a child of the connection for auto-cleanup. */ -- bdata = new_buffer(conn); -+ if (!bdata) -+ return; -+ bdata->inhdr = true; -+ bdata->used = 0; - -- /* -- * Allocation failure here is unfortunate: we have no way to -- * tell anybody about it. -- */ -- if (!bdata) -- return; -- } - if (len <= DEFAULT_BUFFER_SIZE) - bdata->buffer = bdata->default_buffer; -- else -+ else { - bdata->buffer = talloc_array(bdata, char, len); -- if (!bdata->buffer) { -- if (type == XS_WATCH_EVENT) { -- /* Same as above: no way to tell someone. */ -- talloc_free(bdata); -+ if (!bdata->buffer) { -+ send_error(conn, ENOMEM); - return; - } -- /* re-establish request buffer for sending ENOMEM. */ -- conn->in = bdata; -- send_error(conn, ENOMEM); -- return; - } - -+ conn->in = NULL; -+ - /* Update relevant header fields and fill in the message body. */ - bdata->hdr.msg.type = type; - bdata->hdr.msg.len = len; -@@ -783,8 +766,39 @@ void send_reply(struct connection *conn, enum xsd_sockmsg_type type, - - /* Queue for later transmission. */ - list_add_tail(&bdata->list, &conn->out_list); -+} - -- return; -+/* -+ * Send a watch event. -+ * As this is not directly related to the current command, errors can't be -+ * reported. -+ */ -+void send_event(struct connection *conn, const char *path, const char *token) -+{ -+ struct buffered_data *bdata; -+ unsigned int len; -+ -+ len = strlen(path) + 1 + strlen(token) + 1; -+ /* Don't try to send over-long events. */ -+ if (len > XENSTORE_PAYLOAD_MAX) -+ return; -+ -+ bdata = new_buffer(conn); -+ if (!bdata) -+ return; -+ -+ bdata->buffer = talloc_array(bdata, char, len); -+ if (!bdata->buffer) { -+ talloc_free(bdata); -+ return; -+ } -+ strcpy(bdata->buffer, path); -+ strcpy(bdata->buffer + strlen(path) + 1, token); -+ bdata->hdr.msg.type = XS_WATCH_EVENT; -+ bdata->hdr.msg.len = len; -+ -+ /* Queue for later transmission. */ -+ list_add_tail(&bdata->list, &conn->out_list); - } - - /* Some routines (write, mkdir, etc) just need a non-error return */ -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index 900336afa426..38d97fa081a6 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -180,6 +180,7 @@ unsigned int get_string(const struct buffered_data *data, unsigned int offset); - - void send_reply(struct connection *conn, enum xsd_sockmsg_type type, - const void *data, unsigned int len); -+void send_event(struct connection *conn, const char *path, const char *token); - - /* Some routines (write, mkdir, etc) just need a non-error return */ - void send_ack(struct connection *conn, enum xsd_sockmsg_type type); -diff --git a/tools/xenstore/xenstored_watch.c b/tools/xenstore/xenstored_watch.c -index db89e0141fce..a116f967dc66 100644 ---- a/tools/xenstore/xenstored_watch.c -+++ b/tools/xenstore/xenstored_watch.c -@@ -85,35 +85,6 @@ static const char *get_watch_path(const struct watch *watch, const char *name) - return path; - } - --/* -- * Send a watch event. -- * Temporary memory allocations are done with ctx. -- */ --static void add_event(struct connection *conn, -- const void *ctx, -- struct watch *watch, -- const char *name) --{ -- /* Data to send (node\0token\0). 
*/ -- unsigned int len; -- char *data; -- -- name = get_watch_path(watch, name); -- -- len = strlen(name) + 1 + strlen(watch->token) + 1; -- /* Don't try to send over-long events. */ -- if (len > XENSTORE_PAYLOAD_MAX) -- return; -- -- data = talloc_array(ctx, char, len); -- if (!data) -- return; -- strcpy(data, name); -- strcpy(data + strlen(name) + 1, watch->token); -- send_reply(conn, XS_WATCH_EVENT, data, len); -- talloc_free(data); --} -- - /* - * Check permissions of a specific watch to fire: - * Either the node itself or its parent have to be readable by the connection -@@ -190,10 +161,14 @@ void fire_watches(struct connection *conn, const void *ctx, const char *name, - list_for_each_entry(watch, &i->watches, list) { - if (exact) { - if (streq(name, watch->node)) -- add_event(i, ctx, watch, name); -+ send_event(i, -+ get_watch_path(watch, name), -+ watch->token); - } else { - if (is_child(name, watch->node)) -- add_event(i, ctx, watch, name); -+ send_event(i, -+ get_watch_path(watch, name), -+ watch->token); - } - } - } -@@ -292,7 +267,7 @@ int do_watch(struct connection *conn, struct buffered_data *in) - send_ack(conn, XS_WATCH); - - /* We fire once up front: simplifies clients and restart. */ -- add_event(conn, in, watch, watch->node); -+ send_event(conn, get_watch_path(watch, watch->node), watch->token); - - return 0; - } --- -2.37.4 - diff --git a/0086-tools-xenstore-add-helpers-to-free-struct-buffered_d.patch b/0086-tools-xenstore-add-helpers-to-free-struct-buffered_d.patch deleted file mode 100644 index 46ae2d3..0000000 --- a/0086-tools-xenstore-add-helpers-to-free-struct-buffered_d.patch +++ /dev/null @@ -1,117 +0,0 @@ -From b322923894ea23f397efc58a938cb9213d7dc617 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:07 +0200 -Subject: [PATCH 086/126] tools/xenstore: add helpers to free struct - buffered_data - -Add two helpers for freeing struct buffered_data: free_buffered_data() -for freeing one instance and conn_free_buffered_data() for freeing all -instances for a connection. - -This is avoiding duplicated code and will help later when more actions -are needed when freeing a struct buffered_data. - -This is part of XSA-326. 
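The refactoring described above boils down to one helper that frees a single queued buffer and one that drains a connection's whole output list. A self-contained sketch with a toy singly-linked list (the real code uses a doubly-linked list and talloc):

#include <stdlib.h>

struct buf  { struct buf *next; };
struct conn { struct buf *out_list; };

/* Free the first queued buffer; later patches hang extra teardown here. */
static void free_one_buffer(struct conn *c)
{
    struct buf *b = c->out_list;
    c->out_list = b->next;
    free(b);
}

static void free_all_buffers(struct conn *c)
{
    while (c->out_list)
        free_one_buffer(c);
}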
- -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit ead062a68a9c201a95488e84750a70a107f7b317) ---- - tools/xenstore/xenstored_core.c | 26 +++++++++++++++++--------- - tools/xenstore/xenstored_core.h | 2 ++ - tools/xenstore/xenstored_domain.c | 7 +------ - 3 files changed, 20 insertions(+), 15 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 01d4a2e440ec..6498bf603666 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -211,6 +211,21 @@ void reopen_log(void) - } - } - -+static void free_buffered_data(struct buffered_data *out, -+ struct connection *conn) -+{ -+ list_del(&out->list); -+ talloc_free(out); -+} -+ -+void conn_free_buffered_data(struct connection *conn) -+{ -+ struct buffered_data *out; -+ -+ while ((out = list_top(&conn->out_list, struct buffered_data, list))) -+ free_buffered_data(out, conn); -+} -+ - static bool write_messages(struct connection *conn) - { - int ret; -@@ -254,8 +269,7 @@ static bool write_messages(struct connection *conn) - - trace_io(conn, out, 1); - -- list_del(&out->list); -- talloc_free(out); -+ free_buffered_data(out, conn); - - return true; - } -@@ -1472,18 +1486,12 @@ static struct { - */ - static void ignore_connection(struct connection *conn) - { -- struct buffered_data *out, *tmp; -- - trace("CONN %p ignored\n", conn); - - conn->is_ignored = true; - conn_delete_all_watches(conn); - conn_delete_all_transactions(conn); -- -- list_for_each_entry_safe(out, tmp, &conn->out_list, list) { -- list_del(&out->list); -- talloc_free(out); -- } -+ conn_free_buffered_data(conn); - - talloc_free(conn->in); - conn->in = NULL; -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index 38d97fa081a6..0ba5b783d4d1 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -270,6 +270,8 @@ int remember_string(struct hashtable *hash, const char *str); - - void set_tdb_key(const char *name, TDB_DATA *key); - -+void conn_free_buffered_data(struct connection *conn); -+ - const char *dump_state_global(FILE *fp); - const char *dump_state_buffered_data(FILE *fp, const struct connection *c, - const struct connection *conn, -diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c -index 3d4d0649a243..72a5cd3b9aaf 100644 ---- a/tools/xenstore/xenstored_domain.c -+++ b/tools/xenstore/xenstored_domain.c -@@ -417,15 +417,10 @@ static struct domain *find_domain_by_domid(unsigned int domid) - static void domain_conn_reset(struct domain *domain) - { - struct connection *conn = domain->conn; -- struct buffered_data *out; - - conn_delete_all_watches(conn); - conn_delete_all_transactions(conn); -- -- while ((out = list_top(&conn->out_list, struct buffered_data, list))) { -- list_del(&out->list); -- talloc_free(out); -- } -+ conn_free_buffered_data(conn); - - talloc_free(conn->in); - --- -2.37.4 - diff --git a/0087-tools-xenstore-reduce-number-of-watch-events.patch b/0087-tools-xenstore-reduce-number-of-watch-events.patch deleted file mode 100644 index ab6cc92..0000000 --- a/0087-tools-xenstore-reduce-number-of-watch-events.patch +++ /dev/null @@ -1,201 +0,0 @@ -From 8999db805e5ef55172a85d67695429edc3d78771 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:07 +0200 -Subject: [PATCH 087/126] tools/xenstore: reduce number of watch events - -When removing a watched node outside of a transaction, two watch 
events -are being produced instead of just a single one. - -When finalizing a transaction watch events can be generated for each -node which is being modified, even if outside a transaction such -modifications might not have resulted in a watch event. - -This happens e.g.: - -- for nodes which are only modified due to added/removed child entries -- for nodes being removed or created implicitly (e.g. creation of a/b/c - is implicitly creating a/b, resulting in watch events for a, a/b and - a/b/c instead of a/b/c only) - -Avoid these additional watch events, in order to reduce the needed -memory inside Xenstore for queueing them. - -This is being achieved by adding event flags to struct accessed_node -specifying whether an event should be triggered, and whether it should -be an exact match of the modified path. Both flags can be set from -fire_watches() instead of implying them only. - -This is part of XSA-326. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 3a96013a3e17baa07410b1b9776225d1d9a74297) ---- - tools/xenstore/xenstored_core.c | 19 ++++++------ - tools/xenstore/xenstored_transaction.c | 41 +++++++++++++++++++++----- - tools/xenstore/xenstored_transaction.h | 3 ++ - tools/xenstore/xenstored_watch.c | 7 +++-- - 4 files changed, 51 insertions(+), 19 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 6498bf603666..5157a7527f58 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -1261,7 +1261,7 @@ static void delete_child(struct connection *conn, - } - - static int delete_node(struct connection *conn, const void *ctx, -- struct node *parent, struct node *node) -+ struct node *parent, struct node *node, bool watch_exact) - { - char *name; - -@@ -1273,7 +1273,7 @@ static int delete_node(struct connection *conn, const void *ctx, - node->children); - child = name ? read_node(conn, node, name) : NULL; - if (child) { -- if (delete_node(conn, ctx, node, child)) -+ if (delete_node(conn, ctx, node, child, true)) - return errno; - } else { - trace("delete_node: Error deleting child '%s/%s'!\n", -@@ -1285,7 +1285,12 @@ static int delete_node(struct connection *conn, const void *ctx, - talloc_free(name); - } - -- fire_watches(conn, ctx, node->name, node, true, NULL); -+ /* -+ * Fire the watches now, when we can still see the node permissions. -+ * This fine as we are single threaded and the next possible read will -+ * be handled only after the node has been really removed. -+ */ -+ fire_watches(conn, ctx, node->name, node, watch_exact, NULL); - delete_node_single(conn, node); - delete_child(conn, parent, basename(node->name)); - talloc_free(node); -@@ -1311,13 +1316,7 @@ static int _rm(struct connection *conn, const void *ctx, struct node *node, - return (errno == ENOMEM) ? ENOMEM : EINVAL; - node->parent = parent; - -- /* -- * Fire the watches now, when we can still see the node permissions. -- * This fine as we are single threaded and the next possible read will -- * be handled only after the node has been really removed. 
-- */ -- fire_watches(conn, ctx, name, node, false, NULL); -- return delete_node(conn, ctx, parent, node); -+ return delete_node(conn, ctx, parent, node, false); - } - - -diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c -index faf6c930e42a..54432907fc76 100644 ---- a/tools/xenstore/xenstored_transaction.c -+++ b/tools/xenstore/xenstored_transaction.c -@@ -130,6 +130,10 @@ struct accessed_node - - /* Transaction node in data base? */ - bool ta_node; -+ -+ /* Watch event flags. */ -+ bool fire_watch; -+ bool watch_exact; - }; - - struct changed_domain -@@ -323,6 +327,29 @@ err: - return ret; - } - -+/* -+ * A watch event should be fired for a node modified inside a transaction. -+ * Set the corresponding information. A non-exact event is replacing an exact -+ * one, but not the other way round. -+ */ -+void queue_watches(struct connection *conn, const char *name, bool watch_exact) -+{ -+ struct accessed_node *i; -+ -+ i = find_accessed_node(conn->transaction, name); -+ if (!i) { -+ conn->transaction->fail = true; -+ return; -+ } -+ -+ if (!i->fire_watch) { -+ i->fire_watch = true; -+ i->watch_exact = watch_exact; -+ } else if (!watch_exact) { -+ i->watch_exact = false; -+ } -+} -+ - /* - * Finalize transaction: - * Walk through accessed nodes and check generation against global data. -@@ -377,15 +404,15 @@ static int finalize_transaction(struct connection *conn, - ret = tdb_store(tdb_ctx, key, data, - TDB_REPLACE); - talloc_free(data.dptr); -- if (ret) -- goto err; -- fire_watches(conn, trans, i->node, NULL, false, -- i->perms.p ? &i->perms : NULL); - } else { -- fire_watches(conn, trans, i->node, NULL, false, -+ ret = tdb_delete(tdb_ctx, key); -+ } -+ if (ret) -+ goto err; -+ if (i->fire_watch) { -+ fire_watches(conn, trans, i->node, NULL, -+ i->watch_exact, - i->perms.p ? &i->perms : NULL); -- if (tdb_delete(tdb_ctx, key)) -- goto err; - } - } - -diff --git a/tools/xenstore/xenstored_transaction.h b/tools/xenstore/xenstored_transaction.h -index 14062730e3c9..0093cac807e3 100644 ---- a/tools/xenstore/xenstored_transaction.h -+++ b/tools/xenstore/xenstored_transaction.h -@@ -42,6 +42,9 @@ void transaction_entry_dec(struct transaction *trans, unsigned int domid); - int access_node(struct connection *conn, struct node *node, - enum node_access_type type, TDB_DATA *key); - -+/* Queue watches for a modified node. */ -+void queue_watches(struct connection *conn, const char *name, bool watch_exact); -+ - /* Prepend the transaction to name if appropriate. */ - int transaction_prepend(struct connection *conn, const char *name, - TDB_DATA *key); -diff --git a/tools/xenstore/xenstored_watch.c b/tools/xenstore/xenstored_watch.c -index a116f967dc66..bc6d833028a3 100644 ---- a/tools/xenstore/xenstored_watch.c -+++ b/tools/xenstore/xenstored_watch.c -@@ -29,6 +29,7 @@ - #include "xenstore_lib.h" - #include "utils.h" - #include "xenstored_domain.h" -+#include "xenstored_transaction.h" - - extern int quota_nb_watch_per_domain; - -@@ -143,9 +144,11 @@ void fire_watches(struct connection *conn, const void *ctx, const char *name, - struct connection *i; - struct watch *watch; - -- /* During transactions, don't fire watches. */ -- if (conn && conn->transaction) -+ /* During transactions, don't fire watches, but queue them. */ -+ if (conn && conn->transaction) { -+ queue_watches(conn, name, exact); - return; -+ } - - /* Create an event for each watch. 
*/ - list_for_each_entry(i, &connections, list) { --- -2.37.4 - diff --git a/0088-tools-xenstore-let-unread-watch-events-time-out.patch b/0088-tools-xenstore-let-unread-watch-events-time-out.patch deleted file mode 100644 index 03419c6..0000000 --- a/0088-tools-xenstore-let-unread-watch-events-time-out.patch +++ /dev/null @@ -1,309 +0,0 @@ -From 53a77b82717530d836300f1de0ad037de85477dd Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:07 +0200 -Subject: [PATCH 088/126] tools/xenstore: let unread watch events time out - -A future modification will limit the number of outstanding requests -for a domain, where "outstanding" means that the response of the -request or any resulting watch event hasn't been consumed yet. - -In order to avoid a malicious guest being capable to block other guests -by not reading watch events, add a timeout for watch events. In case a -watch event hasn't been consumed after this timeout, it is being -deleted. Set the default timeout to 20 seconds (a random value being -not too high). - -In order to support to specify other timeout values in future, use a -generic command line option for that purpose: - ---timeout|-w watch-event=<seconds> - -This is part of XSA-326 / CVE-2022-42311. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 5285dcb1a5c01695c11e6397c95d906b5e765c98) ---- - tools/xenstore/xenstored_core.c | 133 +++++++++++++++++++++++++++++++- - tools/xenstore/xenstored_core.h | 6 ++ - 2 files changed, 138 insertions(+), 1 deletion(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 5157a7527f58..ee3396fefa94 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -108,6 +108,8 @@ int quota_max_transaction = 10; - int quota_nb_perms_per_node = 5; - int quota_max_path_len = XENSTORE_REL_PATH_MAX; - -+unsigned int timeout_watch_event_msec = 20000; -+ - void trace(const char *fmt, ...) - { - va_list arglist; -@@ -211,19 +213,92 @@ void reopen_log(void) - } - } - -+static uint64_t get_now_msec(void) -+{ -+ struct timespec now_ts; -+ -+ if (clock_gettime(CLOCK_MONOTONIC, &now_ts)) -+ barf_perror("Could not find time (clock_gettime failed)"); -+ -+ return now_ts.tv_sec * 1000 + now_ts.tv_nsec / 1000000; -+} -+ - static void free_buffered_data(struct buffered_data *out, - struct connection *conn) - { -+ struct buffered_data *req; -+ - list_del(&out->list); -+ -+ /* -+ * Update conn->timeout_msec with the next found timeout value in the -+ * queued pending requests. -+ */ -+ if (out->timeout_msec) { -+ conn->timeout_msec = 0; -+ list_for_each_entry(req, &conn->out_list, list) { -+ if (req->timeout_msec) { -+ conn->timeout_msec = req->timeout_msec; -+ break; -+ } -+ } -+ } -+ - talloc_free(out); - } - -+static void check_event_timeout(struct connection *conn, uint64_t msecs, -+ int *ptimeout) -+{ -+ uint64_t delta; -+ struct buffered_data *out, *tmp; -+ -+ if (!conn->timeout_msec) -+ return; -+ -+ delta = conn->timeout_msec - msecs; -+ if (conn->timeout_msec <= msecs) { -+ delta = 0; -+ list_for_each_entry_safe(out, tmp, &conn->out_list, list) { -+ /* -+ * Only look at buffers with timeout and no data -+ * already written to the ring. 
-+ */ -+ if (out->timeout_msec && out->inhdr && !out->used) { -+ if (out->timeout_msec > msecs) { -+ conn->timeout_msec = out->timeout_msec; -+ delta = conn->timeout_msec - msecs; -+ break; -+ } -+ -+ /* -+ * Free out without updating conn->timeout_msec, -+ * as the update is done in this loop already. -+ */ -+ out->timeout_msec = 0; -+ trace("watch event path %s for domain %u timed out\n", -+ out->buffer, conn->id); -+ free_buffered_data(out, conn); -+ } -+ } -+ if (!delta) { -+ conn->timeout_msec = 0; -+ return; -+ } -+ } -+ -+ if (*ptimeout == -1 || *ptimeout > delta) -+ *ptimeout = delta; -+} -+ - void conn_free_buffered_data(struct connection *conn) - { - struct buffered_data *out; - - while ((out = list_top(&conn->out_list, struct buffered_data, list))) - free_buffered_data(out, conn); -+ -+ conn->timeout_msec = 0; - } - - static bool write_messages(struct connection *conn) -@@ -382,6 +457,7 @@ static void initialize_fds(int *p_sock_pollfd_idx, int *ptimeout) - { - struct connection *conn; - struct wrl_timestampt now; -+ uint64_t msecs; - - if (fds) - memset(fds, 0, sizeof(struct pollfd) * current_array_size); -@@ -402,10 +478,12 @@ static void initialize_fds(int *p_sock_pollfd_idx, int *ptimeout) - - wrl_gettime_now(&now); - wrl_log_periodic(now); -+ msecs = get_now_msec(); - - list_for_each_entry(conn, &connections, list) { - if (conn->domain) { - wrl_check_timeout(conn->domain, now, ptimeout); -+ check_event_timeout(conn, msecs, ptimeout); - if (domain_can_read(conn) || - (domain_can_write(conn) && - !list_empty(&conn->out_list))) -@@ -760,6 +838,7 @@ void send_reply(struct connection *conn, enum xsd_sockmsg_type type, - return; - bdata->inhdr = true; - bdata->used = 0; -+ bdata->timeout_msec = 0; - - if (len <= DEFAULT_BUFFER_SIZE) - bdata->buffer = bdata->default_buffer; -@@ -811,6 +890,12 @@ void send_event(struct connection *conn, const char *path, const char *token) - bdata->hdr.msg.type = XS_WATCH_EVENT; - bdata->hdr.msg.len = len; - -+ if (timeout_watch_event_msec && domain_is_unprivileged(conn)) { -+ bdata->timeout_msec = get_now_msec() + timeout_watch_event_msec; -+ if (!conn->timeout_msec) -+ conn->timeout_msec = bdata->timeout_msec; -+ } -+ - /* Queue for later transmission. 
*/ - list_add_tail(&bdata->list, &conn->out_list); - } -@@ -2099,6 +2184,9 @@ static void usage(void) - " -t, --transaction <nb> limit the number of transaction allowed per domain,\n" - " -A, --perm-nb <nb> limit the number of permissions per node,\n" - " -M, --path-max <chars> limit the allowed Xenstore node path length,\n" -+" -w, --timeout <what>=<seconds> set the timeout in seconds for <what>,\n" -+" allowed timeout candidates are:\n" -+" watch-event: time a watch-event is kept pending\n" - " -R, --no-recovery to request that no recovery should be attempted when\n" - " the store is corrupted (debug only),\n" - " -I, --internal-db store database in memory, not on disk\n" -@@ -2121,6 +2209,7 @@ static struct option options[] = { - { "transaction", 1, NULL, 't' }, - { "perm-nb", 1, NULL, 'A' }, - { "path-max", 1, NULL, 'M' }, -+ { "timeout", 1, NULL, 'w' }, - { "no-recovery", 0, NULL, 'R' }, - { "internal-db", 0, NULL, 'I' }, - { "verbose", 0, NULL, 'V' }, -@@ -2135,6 +2224,39 @@ int dom0_domid = 0; - int dom0_event = 0; - int priv_domid = 0; - -+static int get_optval_int(const char *arg) -+{ -+ char *end; -+ long val; -+ -+ val = strtol(arg, &end, 10); -+ if (!*arg || *end || val < 0 || val > INT_MAX) -+ barf("invalid parameter value \"%s\"\n", arg); -+ -+ return val; -+} -+ -+static bool what_matches(const char *arg, const char *what) -+{ -+ unsigned int what_len = strlen(what); -+ -+ return !strncmp(arg, what, what_len) && arg[what_len] == '='; -+} -+ -+static void set_timeout(const char *arg) -+{ -+ const char *eq = strchr(arg, '='); -+ int val; -+ -+ if (!eq) -+ barf("quotas must be specified via <what>=<seconds>\n"); -+ val = get_optval_int(eq + 1); -+ if (what_matches(arg, "watch-event")) -+ timeout_watch_event_msec = val * 1000; -+ else -+ barf("unknown timeout \"%s\"\n", arg); -+} -+ - int main(int argc, char *argv[]) - { - int opt; -@@ -2149,7 +2271,7 @@ int main(int argc, char *argv[]) - orig_argc = argc; - orig_argv = argv; - -- while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:A:M:T:RVW:U", options, -+ while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:A:M:T:RVW:w:U", options, - NULL)) != -1) { - switch (opt) { - case 'D': -@@ -2198,6 +2320,9 @@ int main(int argc, char *argv[]) - quota_max_path_len = min(XENSTORE_REL_PATH_MAX, - quota_max_path_len); - break; -+ case 'w': -+ set_timeout(optarg); -+ break; - case 'e': - dom0_event = strtol(optarg, NULL, 10); - break; -@@ -2642,6 +2767,12 @@ static void add_buffered_data(struct buffered_data *bdata, - barf("error restoring buffered data"); - - memcpy(bdata->buffer, data, len); -+ if (bdata->hdr.msg.type == XS_WATCH_EVENT && timeout_watch_event_msec && -+ domain_is_unprivileged(conn)) { -+ bdata->timeout_msec = get_now_msec() + timeout_watch_event_msec; -+ if (!conn->timeout_msec) -+ conn->timeout_msec = bdata->timeout_msec; -+ } - - /* Queue for later transmission. */ - list_add_tail(&bdata->list, &conn->out_list); -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index 0ba5b783d4d1..2db577928fc6 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -27,6 +27,7 @@ - #include <dirent.h> - #include <stdbool.h> - #include <stdint.h> -+#include <time.h> - #include <errno.h> - - #include "xenstore_lib.h" -@@ -67,6 +68,8 @@ struct buffered_data - char raw[sizeof(struct xsd_sockmsg)]; - } hdr; - -+ uint64_t timeout_msec; -+ - /* The actual data. 
*/ - char *buffer; - char default_buffer[DEFAULT_BUFFER_SIZE]; -@@ -110,6 +113,7 @@ struct connection - - /* Buffered output data */ - struct list_head out_list; -+ uint64_t timeout_msec; - - /* Transaction context for current request (NULL if none). */ - struct transaction *transaction; -@@ -237,6 +241,8 @@ extern int dom0_event; - extern int priv_domid; - extern int quota_nb_entry_per_domain; - -+extern unsigned int timeout_watch_event_msec; -+ - /* Map the kernel's xenstore page. */ - void *xenbus_map(void); - void unmap_xenbus(void *interface); --- -2.37.4 - diff --git a/0089-tools-xenstore-limit-outstanding-requests.patch b/0089-tools-xenstore-limit-outstanding-requests.patch deleted file mode 100644 index 2e110b0..0000000 --- a/0089-tools-xenstore-limit-outstanding-requests.patch +++ /dev/null @@ -1,453 +0,0 @@ -From 56300e8e1781cee1b6a514e5f2bea234a7885d55 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:08 +0200 -Subject: [PATCH 089/126] tools/xenstore: limit outstanding requests - -Add another quota for limiting the number of outstanding requests of a -guest. As the way to specify quotas on the command line is becoming -rather nasty, switch to a new scheme using [--quota|-Q] <what>=<val> -allowing to add more quotas in future easily. - -Set the default value to 20 (basically a random value not seeming to -be too high or too low). - -A request is said to be outstanding if any message generated by this -request (the direct response plus potential watch events) is not yet -completely stored into a ring buffer. The initial watch event sent as -a result of registering a watch is an exception. - -Note that across a live update the relation to buffered watch events -for other domains is lost. - -Use talloc_zero() for allocating the domain structure in order to have -all per-domain quota zeroed initially. - -This is part of XSA-326 / CVE-2022-42312. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Acked-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 36de433a273f55d614c83b89c9a8972287a1e475) ---- - tools/xenstore/xenstored_core.c | 88 +++++++++++++++++++++++++++++-- - tools/xenstore/xenstored_core.h | 20 ++++++- - tools/xenstore/xenstored_domain.c | 38 ++++++++++--- - tools/xenstore/xenstored_domain.h | 3 ++ - tools/xenstore/xenstored_watch.c | 15 ++++-- - 5 files changed, 150 insertions(+), 14 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index ee3396fefa94..d871f217af9c 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -107,6 +107,7 @@ int quota_max_entry_size = 2048; /* 2K */ - int quota_max_transaction = 10; - int quota_nb_perms_per_node = 5; - int quota_max_path_len = XENSTORE_REL_PATH_MAX; -+int quota_req_outstanding = 20; - - unsigned int timeout_watch_event_msec = 20000; - -@@ -223,12 +224,24 @@ static uint64_t get_now_msec(void) - return now_ts.tv_sec * 1000 + now_ts.tv_nsec / 1000000; - } - -+/* -+ * Remove a struct buffered_data from the list of outgoing data. -+ * A struct buffered_data related to a request having caused watch events to be -+ * sent is kept until all those events have been written out. -+ * Each watch event is referencing the related request via pend.req, while the -+ * number of watch events caused by a request is kept in pend.ref.event_cnt -+ * (those two cases are mutually exclusive, so the two fields can share memory -+ * via a union). 
-+ * The struct buffered_data is freed only if no related watch event is -+ * referencing it. The related return data can be freed right away. -+ */ - static void free_buffered_data(struct buffered_data *out, - struct connection *conn) - { - struct buffered_data *req; - - list_del(&out->list); -+ out->on_out_list = false; - - /* - * Update conn->timeout_msec with the next found timeout value in the -@@ -244,6 +257,30 @@ static void free_buffered_data(struct buffered_data *out, - } - } - -+ if (out->hdr.msg.type == XS_WATCH_EVENT) { -+ req = out->pend.req; -+ if (req) { -+ req->pend.ref.event_cnt--; -+ if (!req->pend.ref.event_cnt && !req->on_out_list) { -+ if (req->on_ref_list) { -+ domain_outstanding_domid_dec( -+ req->pend.ref.domid); -+ list_del(&req->list); -+ } -+ talloc_free(req); -+ } -+ } -+ } else if (out->pend.ref.event_cnt) { -+ /* Hang out off from conn. */ -+ talloc_steal(NULL, out); -+ if (out->buffer != out->default_buffer) -+ talloc_free(out->buffer); -+ list_add(&out->list, &conn->ref_list); -+ out->on_ref_list = true; -+ return; -+ } else -+ domain_outstanding_dec(conn); -+ - talloc_free(out); - } - -@@ -399,6 +436,7 @@ int delay_request(struct connection *conn, struct buffered_data *in, - static int destroy_conn(void *_conn) - { - struct connection *conn = _conn; -+ struct buffered_data *req; - - /* Flush outgoing if possible, but don't block. */ - if (!conn->domain) { -@@ -412,6 +450,11 @@ static int destroy_conn(void *_conn) - break; - close(conn->fd); - } -+ -+ conn_free_buffered_data(conn); -+ list_for_each_entry(req, &conn->ref_list, list) -+ req->on_ref_list = false; -+ - if (conn->target) - talloc_unlink(conn, conn->target); - list_del(&conn->list); -@@ -859,6 +902,8 @@ void send_reply(struct connection *conn, enum xsd_sockmsg_type type, - - /* Queue for later transmission. */ - list_add_tail(&bdata->list, &conn->out_list); -+ bdata->on_out_list = true; -+ domain_outstanding_inc(conn); - } - - /* -@@ -866,7 +911,8 @@ void send_reply(struct connection *conn, enum xsd_sockmsg_type type, - * As this is not directly related to the current command, errors can't be - * reported. - */ --void send_event(struct connection *conn, const char *path, const char *token) -+void send_event(struct buffered_data *req, struct connection *conn, -+ const char *path, const char *token) - { - struct buffered_data *bdata; - unsigned int len; -@@ -896,8 +942,13 @@ void send_event(struct connection *conn, const char *path, const char *token) - conn->timeout_msec = bdata->timeout_msec; - } - -+ bdata->pend.req = req; -+ if (req) -+ req->pend.ref.event_cnt++; -+ - /* Queue for later transmission. */ - list_add_tail(&bdata->list, &conn->out_list); -+ bdata->on_out_list = true; - } - - /* Some routines (write, mkdir, etc) just need a non-error return */ -@@ -1658,6 +1709,7 @@ static void handle_input(struct connection *conn) - return; - } - in = conn->in; -+ in->pend.ref.domid = conn->id; - - /* Not finished header yet? 
*/ - if (in->inhdr) { -@@ -1727,6 +1779,7 @@ struct connection *new_connection(connwritefn_t *write, connreadfn_t *read) - new->is_ignored = false; - new->transaction_started = 0; - INIT_LIST_HEAD(&new->out_list); -+ INIT_LIST_HEAD(&new->ref_list); - INIT_LIST_HEAD(&new->watches); - INIT_LIST_HEAD(&new->transaction_list); - INIT_LIST_HEAD(&new->delayed); -@@ -2184,6 +2237,9 @@ static void usage(void) - " -t, --transaction <nb> limit the number of transaction allowed per domain,\n" - " -A, --perm-nb <nb> limit the number of permissions per node,\n" - " -M, --path-max <chars> limit the allowed Xenstore node path length,\n" -+" -Q, --quota <what>=<nb> set the quota <what> to the value <nb>, allowed\n" -+" quotas are:\n" -+" outstanding: number of outstanding requests\n" - " -w, --timeout <what>=<seconds> set the timeout in seconds for <what>,\n" - " allowed timeout candidates are:\n" - " watch-event: time a watch-event is kept pending\n" -@@ -2209,6 +2265,7 @@ static struct option options[] = { - { "transaction", 1, NULL, 't' }, - { "perm-nb", 1, NULL, 'A' }, - { "path-max", 1, NULL, 'M' }, -+ { "quota", 1, NULL, 'Q' }, - { "timeout", 1, NULL, 'w' }, - { "no-recovery", 0, NULL, 'R' }, - { "internal-db", 0, NULL, 'I' }, -@@ -2257,6 +2314,20 @@ static void set_timeout(const char *arg) - barf("unknown timeout \"%s\"\n", arg); - } - -+static void set_quota(const char *arg) -+{ -+ const char *eq = strchr(arg, '='); -+ int val; -+ -+ if (!eq) -+ barf("quotas must be specified via <what>=<nb>\n"); -+ val = get_optval_int(eq + 1); -+ if (what_matches(arg, "outstanding")) -+ quota_req_outstanding = val; -+ else -+ barf("unknown quota \"%s\"\n", arg); -+} -+ - int main(int argc, char *argv[]) - { - int opt; -@@ -2271,8 +2342,8 @@ int main(int argc, char *argv[]) - orig_argc = argc; - orig_argv = argv; - -- while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:A:M:T:RVW:w:U", options, -- NULL)) != -1) { -+ while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:A:M:Q:T:RVW:w:U", -+ options, NULL)) != -1) { - switch (opt) { - case 'D': - no_domain_init = true; -@@ -2320,6 +2391,9 @@ int main(int argc, char *argv[]) - quota_max_path_len = min(XENSTORE_REL_PATH_MAX, - quota_max_path_len); - break; -+ case 'Q': -+ set_quota(optarg); -+ break; - case 'w': - set_timeout(optarg); - break; -@@ -2776,6 +2850,14 @@ static void add_buffered_data(struct buffered_data *bdata, - - /* Queue for later transmission. */ - list_add_tail(&bdata->list, &conn->out_list); -+ bdata->on_out_list = true; -+ /* -+ * Watch events are never "outstanding", but the request causing them -+ * are instead kept "outstanding" until all watch events caused by that -+ * request have been delivered. -+ */ -+ if (bdata->hdr.msg.type != XS_WATCH_EVENT) -+ domain_outstanding_inc(conn); - } - - void read_state_buffered_data(const void *ctx, struct connection *conn, -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index 2db577928fc6..fcb27399f116 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -56,6 +56,8 @@ struct xs_state_connection; - struct buffered_data - { - struct list_head list; -+ bool on_out_list; -+ bool on_ref_list; - - /* Are we still doing the header? */ - bool inhdr; -@@ -63,6 +65,17 @@ struct buffered_data - /* How far are we? */ - unsigned int used; - -+ /* Outstanding request accounting. */ -+ union { -+ /* ref is being used for requests. */ -+ struct { -+ unsigned int event_cnt; /* # of outstanding events. */ -+ unsigned int domid; /* domid of request. 
*/ -+ } ref; -+ /* req is being used for watch events. */ -+ struct buffered_data *req; /* request causing event. */ -+ } pend; -+ - union { - struct xsd_sockmsg msg; - char raw[sizeof(struct xsd_sockmsg)]; -@@ -115,6 +128,9 @@ struct connection - struct list_head out_list; - uint64_t timeout_msec; - -+ /* Referenced requests no longer pending. */ -+ struct list_head ref_list; -+ - /* Transaction context for current request (NULL if none). */ - struct transaction *transaction; - -@@ -184,7 +200,8 @@ unsigned int get_string(const struct buffered_data *data, unsigned int offset); - - void send_reply(struct connection *conn, enum xsd_sockmsg_type type, - const void *data, unsigned int len); --void send_event(struct connection *conn, const char *path, const char *token); -+void send_event(struct buffered_data *req, struct connection *conn, -+ const char *path, const char *token); - - /* Some routines (write, mkdir, etc) just need a non-error return */ - void send_ack(struct connection *conn, enum xsd_sockmsg_type type); -@@ -240,6 +257,7 @@ extern int dom0_domid; - extern int dom0_event; - extern int priv_domid; - extern int quota_nb_entry_per_domain; -+extern int quota_req_outstanding; - - extern unsigned int timeout_watch_event_msec; - -diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c -index 72a5cd3b9aaf..979f8c629835 100644 ---- a/tools/xenstore/xenstored_domain.c -+++ b/tools/xenstore/xenstored_domain.c -@@ -78,6 +78,9 @@ struct domain - /* number of watch for this domain */ - int nbwatch; - -+ /* Number of outstanding requests. */ -+ int nboutstanding; -+ - /* write rate limit */ - wrl_creditt wrl_credit; /* [ -wrl_config_writecost, +_dburst ] */ - struct wrl_timestampt wrl_timestamp; -@@ -287,8 +290,12 @@ bool domain_can_read(struct connection *conn) - { - struct xenstore_domain_interface *intf = conn->domain->interface; - -- if (domain_is_unprivileged(conn) && conn->domain->wrl_credit < 0) -- return false; -+ if (domain_is_unprivileged(conn)) { -+ if (conn->domain->wrl_credit < 0) -+ return false; -+ if (conn->domain->nboutstanding >= quota_req_outstanding) -+ return false; -+ } - - if (conn->is_ignored) - return false; -@@ -337,7 +344,7 @@ static struct domain *alloc_domain(const void *context, unsigned int domid) - { - struct domain *domain; - -- domain = talloc(context, struct domain); -+ domain = talloc_zero(context, struct domain); - if (!domain) { - errno = ENOMEM; - return NULL; -@@ -398,9 +405,6 @@ static int new_domain(struct domain *domain, int port, bool restore) - domain->conn->domain = domain; - domain->conn->id = domain->domid; - -- domain->nbentry = 0; -- domain->nbwatch = 0; -- - return 0; - } - -@@ -944,6 +948,28 @@ int domain_watch(struct connection *conn) - : 0; - } - -+void domain_outstanding_inc(struct connection *conn) -+{ -+ if (!conn || !conn->domain) -+ return; -+ conn->domain->nboutstanding++; -+} -+ -+void domain_outstanding_dec(struct connection *conn) -+{ -+ if (!conn || !conn->domain) -+ return; -+ conn->domain->nboutstanding--; -+} -+ -+void domain_outstanding_domid_dec(unsigned int domid) -+{ -+ struct domain *d = find_domain_by_domid(domid); -+ -+ if (d) -+ d->nboutstanding--; -+} -+ - static wrl_creditt wrl_config_writecost = WRL_FACTOR; - static wrl_creditt wrl_config_rate = WRL_RATE * WRL_FACTOR; - static wrl_creditt wrl_config_dburst = WRL_DBURST * WRL_FACTOR; -diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h -index dc9759171317..5757a6557146 100644 ---- 
a/tools/xenstore/xenstored_domain.h -+++ b/tools/xenstore/xenstored_domain.h -@@ -68,6 +68,9 @@ int domain_entry(struct connection *conn); - void domain_watch_inc(struct connection *conn); - void domain_watch_dec(struct connection *conn); - int domain_watch(struct connection *conn); -+void domain_outstanding_inc(struct connection *conn); -+void domain_outstanding_dec(struct connection *conn); -+void domain_outstanding_domid_dec(unsigned int domid); - - /* Special node permission handling. */ - int set_perms_special(struct connection *conn, const char *name, -diff --git a/tools/xenstore/xenstored_watch.c b/tools/xenstore/xenstored_watch.c -index bc6d833028a3..1d664e3d6b72 100644 ---- a/tools/xenstore/xenstored_watch.c -+++ b/tools/xenstore/xenstored_watch.c -@@ -142,6 +142,7 @@ void fire_watches(struct connection *conn, const void *ctx, const char *name, - struct node *node, bool exact, struct node_perms *perms) - { - struct connection *i; -+ struct buffered_data *req; - struct watch *watch; - - /* During transactions, don't fire watches, but queue them. */ -@@ -150,6 +151,8 @@ void fire_watches(struct connection *conn, const void *ctx, const char *name, - return; - } - -+ req = domain_is_unprivileged(conn) ? conn->in : NULL; -+ - /* Create an event for each watch. */ - list_for_each_entry(i, &connections, list) { - /* introduce/release domain watches */ -@@ -164,12 +167,12 @@ void fire_watches(struct connection *conn, const void *ctx, const char *name, - list_for_each_entry(watch, &i->watches, list) { - if (exact) { - if (streq(name, watch->node)) -- send_event(i, -+ send_event(req, i, - get_watch_path(watch, name), - watch->token); - } else { - if (is_child(name, watch->node)) -- send_event(i, -+ send_event(req, i, - get_watch_path(watch, name), - watch->token); - } -@@ -269,8 +272,12 @@ int do_watch(struct connection *conn, struct buffered_data *in) - trace_create(watch, "watch"); - send_ack(conn, XS_WATCH); - -- /* We fire once up front: simplifies clients and restart. */ -- send_event(conn, get_watch_path(watch, watch->node), watch->token); -+ /* -+ * We fire once up front: simplifies clients and restart. -+ * This event will not be linked to the XS_WATCH request. -+ */ -+ send_event(NULL, conn, get_watch_path(watch, watch->node), -+ watch->token); - - return 0; - } --- -2.37.4 - diff --git a/0090-tools-xenstore-don-t-buffer-multiple-identical-watch.patch b/0090-tools-xenstore-don-t-buffer-multiple-identical-watch.patch deleted file mode 100644 index 305d8ac..0000000 --- a/0090-tools-xenstore-don-t-buffer-multiple-identical-watch.patch +++ /dev/null @@ -1,93 +0,0 @@ -From 97c251f953c58aec7620499ac12924054b7cd758 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:08 +0200 -Subject: [PATCH 090/126] tools/xenstore: don't buffer multiple identical watch - events - -A guest not reading its Xenstore response buffer fast enough might -pile up lots of Xenstore watch events buffered. Reduce the generated -load by dropping new events which already have an identical copy -pending. - -The special events "@..." are excluded from that handling as there are -known use cases where the handler is relying on each event to be sent -individually. - -This is part of XSA-326. 
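As a standalone illustration of the de-duplication described above (hypothetical types and names, not the actual xenstored code), the sketch below skips queueing an event whose path/token pair is already pending, while always letting the special "@..." events through:

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    /* Simplified stand-in for a queued watch event. */
    struct pending_event {
        const char *path;
        const char *token;
        struct pending_event *next;
    };

    /*
     * Return true if an identical event is already pending so the caller
     * can drop the new one; special "@..." events are never de-duplicated.
     */
    static bool is_duplicate(const struct pending_event *head,
                             const char *path, const char *token)
    {
        if (path[0] == '@')
            return false;
        for (; head; head = head->next)
            if (!strcmp(head->path, path) && !strcmp(head->token, token))
                return true;
        return false;
    }

    int main(void)
    {
        struct pending_event e2 = { "/local/domain/1/device", "tok", NULL };
        struct pending_event e1 = { "/local/domain/1/state", "tok", &e2 };

        printf("%d\n", is_duplicate(&e1, "/local/domain/1/state", "tok")); /* 1 */
        printf("%d\n", is_duplicate(&e1, "@releaseDomain", "tok"));        /* 0 */
        return 0;
    }

The patch itself compares the serialized XS_WATCH_EVENT buffers with memcmp(); plain string comparison is used above only to keep the example self-contained.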
- -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit b5c0bdb96d33e18c324c13d8e33c08732d77eaa2) ---- - tools/xenstore/xenstored_core.c | 20 +++++++++++++++++++- - tools/xenstore/xenstored_core.h | 3 +++ - 2 files changed, 22 insertions(+), 1 deletion(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index d871f217af9c..6ea06e20df91 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -882,6 +882,7 @@ void send_reply(struct connection *conn, enum xsd_sockmsg_type type, - bdata->inhdr = true; - bdata->used = 0; - bdata->timeout_msec = 0; -+ bdata->watch_event = false; - - if (len <= DEFAULT_BUFFER_SIZE) - bdata->buffer = bdata->default_buffer; -@@ -914,7 +915,7 @@ void send_reply(struct connection *conn, enum xsd_sockmsg_type type, - void send_event(struct buffered_data *req, struct connection *conn, - const char *path, const char *token) - { -- struct buffered_data *bdata; -+ struct buffered_data *bdata, *bd; - unsigned int len; - - len = strlen(path) + 1 + strlen(token) + 1; -@@ -936,12 +937,29 @@ void send_event(struct buffered_data *req, struct connection *conn, - bdata->hdr.msg.type = XS_WATCH_EVENT; - bdata->hdr.msg.len = len; - -+ /* -+ * Check whether an identical event is pending already. -+ * Special events are excluded from that check. -+ */ -+ if (path[0] != '@') { -+ list_for_each_entry(bd, &conn->out_list, list) { -+ if (bd->watch_event && bd->hdr.msg.len == len && -+ !memcmp(bdata->buffer, bd->buffer, len)) { -+ trace("dropping duplicate watch %s %s for domain %u\n", -+ path, token, conn->id); -+ talloc_free(bdata); -+ return; -+ } -+ } -+ } -+ - if (timeout_watch_event_msec && domain_is_unprivileged(conn)) { - bdata->timeout_msec = get_now_msec() + timeout_watch_event_msec; - if (!conn->timeout_msec) - conn->timeout_msec = bdata->timeout_msec; - } - -+ bdata->watch_event = true; - bdata->pend.req = req; - if (req) - req->pend.ref.event_cnt++; -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index fcb27399f116..afbd982c2654 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -62,6 +62,9 @@ struct buffered_data - /* Are we still doing the header? */ - bool inhdr; - -+ /* Is this a watch event? */ -+ bool watch_event; -+ - /* How far are we? */ - unsigned int used; - --- -2.37.4 - diff --git a/0091-tools-xenstore-fix-connection-id-usage.patch b/0091-tools-xenstore-fix-connection-id-usage.patch deleted file mode 100644 index dd7f382..0000000 --- a/0091-tools-xenstore-fix-connection-id-usage.patch +++ /dev/null @@ -1,61 +0,0 @@ -From 3e51699fcc578c7c005fd4add70cf7c8117d0af9 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:08 +0200 -Subject: [PATCH 091/126] tools/xenstore: fix connection->id usage - -Don't use conn->id for privilege checks, but domain_is_unprivileged(). - -This is part of XSA-326. 
- -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 3047df38e1991510bc295e3e1bb6b6b6c4a97831) ---- - tools/xenstore/xenstored_control.c | 2 +- - tools/xenstore/xenstored_core.h | 2 +- - tools/xenstore/xenstored_transaction.c | 3 ++- - 3 files changed, 4 insertions(+), 3 deletions(-) - -diff --git a/tools/xenstore/xenstored_control.c b/tools/xenstore/xenstored_control.c -index 8e470f2b2056..211fe1fd9b37 100644 ---- a/tools/xenstore/xenstored_control.c -+++ b/tools/xenstore/xenstored_control.c -@@ -821,7 +821,7 @@ int do_control(struct connection *conn, struct buffered_data *in) - unsigned int cmd, num, off; - char **vec = NULL; - -- if (conn->id != 0) -+ if (domain_is_unprivileged(conn)) - return EACCES; - - off = get_string(in, 0); -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index afbd982c2654..c0a056ce13fe 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -118,7 +118,7 @@ struct connection - /* The index of pollfd in global pollfd array */ - int pollfd_idx; - -- /* Who am I? 0 for socket connections. */ -+ /* Who am I? Domid of connection. */ - unsigned int id; - - /* Is this connection ignored? */ -diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c -index 54432907fc76..ee1b09031a3b 100644 ---- a/tools/xenstore/xenstored_transaction.c -+++ b/tools/xenstore/xenstored_transaction.c -@@ -477,7 +477,8 @@ int do_transaction_start(struct connection *conn, struct buffered_data *in) - if (conn->transaction) - return EBUSY; - -- if (conn->id && conn->transaction_started > quota_max_transaction) -+ if (domain_is_unprivileged(conn) && -+ conn->transaction_started > quota_max_transaction) - return ENOSPC; - - /* Attach transaction to input for autofree until it's complete */ --- -2.37.4 - diff --git a/0092-tools-xenstore-simplify-and-fix-per-domain-node-acco.patch b/0092-tools-xenstore-simplify-and-fix-per-domain-node-acco.patch deleted file mode 100644 index 01f29b1..0000000 --- a/0092-tools-xenstore-simplify-and-fix-per-domain-node-acco.patch +++ /dev/null @@ -1,336 +0,0 @@ -From 8ee7ed7c1ef435f43edc08be07c036d81642d8e1 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:08 +0200 -Subject: [PATCH 092/126] tools/xenstore: simplify and fix per domain node - accounting - -The accounting of nodes can be simplified now that each connection -holds the associated domid. - -Fix the node accounting to cover nodes created for a domain before it -has been introduced. This requires to react properly to an allocation -failure inside domain_entry_inc() by returning an error code. - -Especially in error paths the node accounting has to be fixed in some -cases. - -This is part of XSA-326 / CVE-2022-42313. 
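A minimal sketch of the accounting idea, assuming a simple list keyed by domid (all names here are made up for illustration): incrementing the per-domain node count may require allocating the tracking structure, so the increment can fail and the caller must check the result:

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Simplified per-domain node counter, not the xenstored structures. */
    struct dom_acct {
        unsigned int domid;
        int nbentry;
        struct dom_acct *next;
    };

    static struct dom_acct *doms;

    /* Increment the node count; may have to allocate, so it can fail. */
    static int entry_inc(unsigned int domid)
    {
        struct dom_acct *d;

        for (d = doms; d; d = d->next)
            if (d->domid == domid)
                break;
        if (!d) {
            d = calloc(1, sizeof(*d));
            if (!d)
                return ENOMEM;   /* caller must undo its own work */
            d->domid = domid;
            d->next = doms;
            doms = d;
        }
        d->nbentry++;
        return 0;
    }

    int main(void)
    {
        if (entry_inc(7))
            fprintf(stderr, "accounting failed\n");
        else
            printf("domain 7 now owns %d node(s)\n", doms->nbentry);
        return 0;
    }

In the patch, create_node() reacts to such a failure by removing the node it has just written.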
- -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit dbef1f7482894c572d90cd73d99ed689c891e863) ---- - tools/xenstore/xenstored_core.c | 43 ++++++++-- - tools/xenstore/xenstored_domain.c | 105 ++++++++++++++++--------- - tools/xenstore/xenstored_domain.h | 4 +- - tools/xenstore/xenstored_transaction.c | 8 +- - 4 files changed, 109 insertions(+), 51 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 6ea06e20df91..85c0d2f38fac 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -603,7 +603,7 @@ struct node *read_node(struct connection *conn, const void *ctx, - - /* Permissions are struct xs_permissions. */ - node->perms.p = hdr->perms; -- if (domain_adjust_node_perms(node)) { -+ if (domain_adjust_node_perms(conn, node)) { - talloc_free(node); - return NULL; - } -@@ -625,7 +625,7 @@ int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, - void *p; - struct xs_tdb_record_hdr *hdr; - -- if (domain_adjust_node_perms(node)) -+ if (domain_adjust_node_perms(conn, node)) - return errno; - - data.dsize = sizeof(*hdr) -@@ -1238,13 +1238,17 @@ nomem: - return NULL; - } - --static int destroy_node(struct connection *conn, struct node *node) -+static void destroy_node_rm(struct node *node) - { - if (streq(node->name, "/")) - corrupt(NULL, "Destroying root node!"); - - tdb_delete(tdb_ctx, node->key); -+} - -+static int destroy_node(struct connection *conn, struct node *node) -+{ -+ destroy_node_rm(node); - domain_entry_dec(conn, node); - - /* -@@ -1294,8 +1298,12 @@ static struct node *create_node(struct connection *conn, const void *ctx, - goto err; - - /* Account for new node */ -- if (i->parent) -- domain_entry_inc(conn, i); -+ if (i->parent) { -+ if (domain_entry_inc(conn, i)) { -+ destroy_node_rm(i); -+ return NULL; -+ } -+ } - } - - return node; -@@ -1580,10 +1588,27 @@ static int do_set_perms(struct connection *conn, struct buffered_data *in) - old_perms = node->perms; - domain_entry_dec(conn, node); - node->perms = perms; -- domain_entry_inc(conn, node); -+ if (domain_entry_inc(conn, node)) { -+ node->perms = old_perms; -+ /* -+ * This should never fail because we had a reference on the -+ * domain before and Xenstored is single-threaded. -+ */ -+ domain_entry_inc(conn, node); -+ return ENOMEM; -+ } - -- if (write_node(conn, node, false)) -+ if (write_node(conn, node, false)) { -+ int saved_errno = errno; -+ -+ domain_entry_dec(conn, node); -+ node->perms = old_perms; -+ /* No failure possible as above. */ -+ domain_entry_inc(conn, node); -+ -+ errno = saved_errno; - return errno; -+ } - - fire_watches(conn, in, name, node, false, &old_perms); - send_ack(conn, XS_SET_PERMS); -@@ -3003,7 +3028,9 @@ void read_state_node(const void *ctx, const void *state) - set_tdb_key(name, &key); - if (write_node_raw(NULL, &key, node, true)) - barf("write node error restoring node"); -- domain_entry_inc(&conn, node); -+ -+ if (domain_entry_inc(&conn, node)) -+ barf("node accounting error restoring node"); - - talloc_free(node); - } -diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c -index 979f8c629835..3c27973fb836 100644 ---- a/tools/xenstore/xenstored_domain.c -+++ b/tools/xenstore/xenstored_domain.c -@@ -16,6 +16,7 @@ - along with this program; If not, see <http://www.gnu.org/licenses/>. 
- */ - -+#include <assert.h> - #include <stdio.h> - #include <sys/mman.h> - #include <unistd.h> -@@ -369,6 +370,18 @@ static struct domain *find_or_alloc_domain(const void *ctx, unsigned int domid) - return domain ? : alloc_domain(ctx, domid); - } - -+static struct domain *find_or_alloc_existing_domain(unsigned int domid) -+{ -+ struct domain *domain; -+ xc_dominfo_t dominfo; -+ -+ domain = find_domain_struct(domid); -+ if (!domain && get_domain_info(domid, &dominfo)) -+ domain = alloc_domain(NULL, domid); -+ -+ return domain; -+} -+ - static int new_domain(struct domain *domain, int port, bool restore) - { - int rc; -@@ -788,30 +801,28 @@ void domain_deinit(void) - xenevtchn_unbind(xce_handle, virq_port); - } - --void domain_entry_inc(struct connection *conn, struct node *node) -+int domain_entry_inc(struct connection *conn, struct node *node) - { - struct domain *d; -+ unsigned int domid; - - if (!conn) -- return; -+ return 0; - -- if (node->perms.p && node->perms.p[0].id != conn->id) { -- if (conn->transaction) { -- transaction_entry_inc(conn->transaction, -- node->perms.p[0].id); -- } else { -- d = find_domain_by_domid(node->perms.p[0].id); -- if (d) -- d->nbentry++; -- } -- } else if (conn->domain) { -- if (conn->transaction) { -- transaction_entry_inc(conn->transaction, -- conn->domain->domid); -- } else { -- conn->domain->nbentry++; -- } -+ domid = node->perms.p ? node->perms.p[0].id : conn->id; -+ -+ if (conn->transaction) { -+ transaction_entry_inc(conn->transaction, domid); -+ } else { -+ d = (domid == conn->id && conn->domain) ? conn->domain -+ : find_or_alloc_existing_domain(domid); -+ if (d) -+ d->nbentry++; -+ else -+ return ENOMEM; - } -+ -+ return 0; - } - - /* -@@ -847,7 +858,7 @@ static int chk_domain_generation(unsigned int domid, uint64_t gen) - * Remove permissions for no longer existing domains in order to avoid a new - * domain with the same domid inheriting the permissions. - */ --int domain_adjust_node_perms(struct node *node) -+int domain_adjust_node_perms(struct connection *conn, struct node *node) - { - unsigned int i; - int ret; -@@ -857,8 +868,14 @@ int domain_adjust_node_perms(struct node *node) - return errno; - - /* If the owner doesn't exist any longer give it to priv domain. */ -- if (!ret) -+ if (!ret) { -+ /* -+ * In theory we'd need to update the number of dom0 nodes here, -+ * but we could be called for a read of the node. So better -+ * avoid the risk to overflow the node count of dom0. -+ */ - node->perms.p[0].id = priv_domid; -+ } - - for (i = 1; i < node->perms.num; i++) { - if (node->perms.p[i].perms & XS_PERM_IGNORE) -@@ -877,25 +894,25 @@ int domain_adjust_node_perms(struct node *node) - void domain_entry_dec(struct connection *conn, struct node *node) - { - struct domain *d; -+ unsigned int domid; - - if (!conn) - return; - -- if (node->perms.p && node->perms.p[0].id != conn->id) { -- if (conn->transaction) { -- transaction_entry_dec(conn->transaction, -- node->perms.p[0].id); -+ domid = node->perms.p ? node->perms.p[0].id : conn->id; -+ -+ if (conn->transaction) { -+ transaction_entry_dec(conn->transaction, domid); -+ } else { -+ d = (domid == conn->id && conn->domain) ? 
conn->domain -+ : find_domain_struct(domid); -+ if (d) { -+ d->nbentry--; - } else { -- d = find_domain_by_domid(node->perms.p[0].id); -- if (d && d->nbentry) -- d->nbentry--; -- } -- } else if (conn->domain && conn->domain->nbentry) { -- if (conn->transaction) { -- transaction_entry_dec(conn->transaction, -- conn->domain->domid); -- } else { -- conn->domain->nbentry--; -+ errno = ENOENT; -+ corrupt(conn, -+ "Node \"%s\" owned by non-existing domain %u\n", -+ node->name, domid); - } - } - } -@@ -905,13 +922,23 @@ int domain_entry_fix(unsigned int domid, int num, bool update) - struct domain *d; - int cnt; - -- d = find_domain_by_domid(domid); -- if (!d) -- return 0; -+ if (update) { -+ d = find_domain_struct(domid); -+ assert(d); -+ } else { -+ /* -+ * We are called first with update == false in order to catch -+ * any error. So do a possible allocation and check for error -+ * only in this case, as in the case of update == true nothing -+ * can go wrong anymore as the allocation already happened. -+ */ -+ d = find_or_alloc_existing_domain(domid); -+ if (!d) -+ return -1; -+ } - - cnt = d->nbentry + num; -- if (cnt < 0) -- cnt = 0; -+ assert(cnt >= 0); - - if (update) - d->nbentry = cnt; -diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h -index 5757a6557146..cce13d14f016 100644 ---- a/tools/xenstore/xenstored_domain.h -+++ b/tools/xenstore/xenstored_domain.h -@@ -58,10 +58,10 @@ bool domain_can_write(struct connection *conn); - bool domain_is_unprivileged(struct connection *conn); - - /* Remove node permissions for no longer existing domains. */ --int domain_adjust_node_perms(struct node *node); -+int domain_adjust_node_perms(struct connection *conn, struct node *node); - - /* Quota manipulation */ --void domain_entry_inc(struct connection *conn, struct node *); -+int domain_entry_inc(struct connection *conn, struct node *); - void domain_entry_dec(struct connection *conn, struct node *); - int domain_entry_fix(unsigned int domid, int num, bool update); - int domain_entry(struct connection *conn); -diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c -index ee1b09031a3b..86caf6c398be 100644 ---- a/tools/xenstore/xenstored_transaction.c -+++ b/tools/xenstore/xenstored_transaction.c -@@ -519,8 +519,12 @@ static int transaction_fix_domains(struct transaction *trans, bool update) - - list_for_each_entry(d, &trans->changed_domains, list) { - cnt = domain_entry_fix(d->domid, d->nbentry, update); -- if (!update && cnt >= quota_nb_entry_per_domain) -- return ENOSPC; -+ if (!update) { -+ if (cnt >= quota_nb_entry_per_domain) -+ return ENOSPC; -+ if (cnt < 0) -+ return ENOMEM; -+ } - } - - return 0; --- -2.37.4 - diff --git a/0093-tools-xenstore-limit-max-number-of-nodes-accessed-in.patch b/0093-tools-xenstore-limit-max-number-of-nodes-accessed-in.patch deleted file mode 100644 index f064355..0000000 --- a/0093-tools-xenstore-limit-max-number-of-nodes-accessed-in.patch +++ /dev/null @@ -1,255 +0,0 @@ -From 1035371fee5552b8cfe9819c4058a4c9e695ba5e Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:09 +0200 -Subject: [PATCH 093/126] tools/xenstore: limit max number of nodes accessed in - a transaction - -Today a guest is free to access as many nodes in a single transaction -as it wants. This can lead to unbounded memory consumption in Xenstore -as there is the need to keep track of all nodes having been accessed -during a transaction. 
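As a rough sketch of the quota introduced by this patch (simplified standalone code, not the xenstored implementation), a transaction counts the nodes it has touched and refuses, for unprivileged domains, to go past the limit, which the patch defaults to 1024:

    #include <errno.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative only: count nodes touched by one transaction. */
    struct txn {
        unsigned int nodes;      /* nodes accessed so far */
        bool unprivileged;       /* quota only applies to guests */
    };

    static unsigned int quota_trans_nodes = 1024;

    /* Called when a transaction touches a node it has not seen before. */
    static int account_node(struct txn *t)
    {
        if (t->unprivileged && t->nodes >= quota_trans_nodes)
            return ENOSPC;       /* refuse to track yet another node */
        t->nodes++;
        return 0;
    }

    int main(void)
    {
        struct txn t = { .nodes = 1024, .unprivileged = true };

        printf("account_node: %s\n", account_node(&t) ? "ENOSPC" : "ok");
        return 0;
    }

The quota only applies to unprivileged domains, matching the domain_is_unprivileged() check in the patch.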
- -In oxenstored the number of requests in a transaction is being limited -via a quota maxrequests (default is 1024). As multiple accesses of a -node are not problematic in C Xenstore, limit the number of accessed -nodes. - -In order to let read_node() detect a quota error in case too many nodes -are being accessed, check the return value of access_node() and return -NULL in case an error has been seen. Introduce __must_check and add it -to the access_node() prototype. - -This is part of XSA-326 / CVE-2022-42314. - -Suggested-by: Julien Grall <julien@xen.org> -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 268369d8e322d227a74a899009c5748d7b0ea142) ---- - tools/include/xen-tools/libs.h | 4 +++ - tools/xenstore/xenstored_core.c | 50 ++++++++++++++++++-------- - tools/xenstore/xenstored_core.h | 1 + - tools/xenstore/xenstored_transaction.c | 9 +++++ - tools/xenstore/xenstored_transaction.h | 4 +-- - 5 files changed, 52 insertions(+), 16 deletions(-) - -diff --git a/tools/include/xen-tools/libs.h b/tools/include/xen-tools/libs.h -index a16e0c380709..bafc90e2f603 100644 ---- a/tools/include/xen-tools/libs.h -+++ b/tools/include/xen-tools/libs.h -@@ -63,4 +63,8 @@ - #define ROUNDUP(_x,_w) (((unsigned long)(_x)+(1UL<<(_w))-1) & ~((1UL<<(_w))-1)) - #endif - -+#ifndef __must_check -+#define __must_check __attribute__((__warn_unused_result__)) -+#endif -+ - #endif /* __XEN_TOOLS_LIBS__ */ -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 85c0d2f38fac..050d6f651ae9 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -106,6 +106,7 @@ int quota_nb_watch_per_domain = 128; - int quota_max_entry_size = 2048; /* 2K */ - int quota_max_transaction = 10; - int quota_nb_perms_per_node = 5; -+int quota_trans_nodes = 1024; - int quota_max_path_len = XENSTORE_REL_PATH_MAX; - int quota_req_outstanding = 20; - -@@ -560,6 +561,7 @@ struct node *read_node(struct connection *conn, const void *ctx, - TDB_DATA key, data; - struct xs_tdb_record_hdr *hdr; - struct node *node; -+ int err; - - node = talloc(ctx, struct node); - if (!node) { -@@ -581,14 +583,13 @@ struct node *read_node(struct connection *conn, const void *ctx, - if (data.dptr == NULL) { - if (tdb_error(tdb_ctx) == TDB_ERR_NOEXIST) { - node->generation = NO_GENERATION; -- access_node(conn, node, NODE_ACCESS_READ, NULL); -- errno = ENOENT; -+ err = access_node(conn, node, NODE_ACCESS_READ, NULL); -+ errno = err ? : ENOENT; - } else { - log("TDB error on read: %s", tdb_errorstr(tdb_ctx)); - errno = EIO; - } -- talloc_free(node); -- return NULL; -+ goto error; - } - - node->parent = NULL; -@@ -603,19 +604,36 @@ struct node *read_node(struct connection *conn, const void *ctx, - - /* Permissions are struct xs_permissions. */ - node->perms.p = hdr->perms; -- if (domain_adjust_node_perms(conn, node)) { -- talloc_free(node); -- return NULL; -- } -+ if (domain_adjust_node_perms(conn, node)) -+ goto error; - - /* Data is binary blob (usually ascii, no nul). */ - node->data = node->perms.p + hdr->num_perms; - /* Children is strings, nul separated. 
*/ - node->children = node->data + node->datalen; - -- access_node(conn, node, NODE_ACCESS_READ, NULL); -+ if (access_node(conn, node, NODE_ACCESS_READ, NULL)) -+ goto error; - - return node; -+ -+ error: -+ err = errno; -+ talloc_free(node); -+ errno = err; -+ return NULL; -+} -+ -+static bool read_node_can_propagate_errno(void) -+{ -+ /* -+ * 2 error cases for read_node() can always be propagated up: -+ * ENOMEM, because this has nothing to do with the node being in the -+ * data base or not, but is caused by a general lack of memory. -+ * ENOSPC, because this is related to hitting quota limits which need -+ * to be respected. -+ */ -+ return errno == ENOMEM || errno == ENOSPC; - } - - int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, -@@ -732,7 +750,7 @@ static int ask_parents(struct connection *conn, const void *ctx, - node = read_node(conn, ctx, name); - if (node) - break; -- if (errno == ENOMEM) -+ if (read_node_can_propagate_errno()) - return errno; - } while (!streq(name, "/")); - -@@ -795,7 +813,7 @@ static struct node *get_node(struct connection *conn, - } - } - /* Clean up errno if they weren't supposed to know. */ -- if (!node && errno != ENOMEM) -+ if (!node && !read_node_can_propagate_errno()) - errno = errno_from_parents(conn, ctx, name, errno, perm); - return node; - } -@@ -1201,7 +1219,7 @@ static struct node *construct_node(struct connection *conn, const void *ctx, - - /* If parent doesn't exist, create it. */ - parent = read_node(conn, parentname, parentname); -- if (!parent) -+ if (!parent && errno == ENOENT) - parent = construct_node(conn, ctx, parentname); - if (!parent) - return NULL; -@@ -1475,7 +1493,7 @@ static int _rm(struct connection *conn, const void *ctx, struct node *node, - - parent = read_node(conn, ctx, parentname); - if (!parent) -- return (errno == ENOMEM) ? ENOMEM : EINVAL; -+ return read_node_can_propagate_errno() ? errno : EINVAL; - node->parent = parent; - - return delete_node(conn, ctx, parent, node, false); -@@ -1505,7 +1523,7 @@ static int do_rm(struct connection *conn, struct buffered_data *in) - return 0; - } - /* Restore errno, just in case. 
*/ -- if (errno != ENOMEM) -+ if (!read_node_can_propagate_errno()) - errno = ENOENT; - } - return errno; -@@ -2282,6 +2300,8 @@ static void usage(void) - " -M, --path-max <chars> limit the allowed Xenstore node path length,\n" - " -Q, --quota <what>=<nb> set the quota <what> to the value <nb>, allowed\n" - " quotas are:\n" -+" transaction-nodes: number of accessed node per\n" -+" transaction\n" - " outstanding: number of outstanding requests\n" - " -w, --timeout <what>=<seconds> set the timeout in seconds for <what>,\n" - " allowed timeout candidates are:\n" -@@ -2367,6 +2387,8 @@ static void set_quota(const char *arg) - val = get_optval_int(eq + 1); - if (what_matches(arg, "outstanding")) - quota_req_outstanding = val; -+ else if (what_matches(arg, "transaction-nodes")) -+ quota_trans_nodes = val; - else - barf("unknown quota \"%s\"\n", arg); - } -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index c0a056ce13fe..1b3bd5ca563a 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -261,6 +261,7 @@ extern int dom0_event; - extern int priv_domid; - extern int quota_nb_entry_per_domain; - extern int quota_req_outstanding; -+extern int quota_trans_nodes; - - extern unsigned int timeout_watch_event_msec; - -diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c -index 86caf6c398be..7bd41eb475e3 100644 ---- a/tools/xenstore/xenstored_transaction.c -+++ b/tools/xenstore/xenstored_transaction.c -@@ -156,6 +156,9 @@ struct transaction - /* Connection-local identifier for this transaction. */ - uint32_t id; - -+ /* Node counter. */ -+ unsigned int nodes; -+ - /* Generation when transaction started. */ - uint64_t generation; - -@@ -260,6 +263,11 @@ int access_node(struct connection *conn, struct node *node, - - i = find_accessed_node(trans, node->name); - if (!i) { -+ if (trans->nodes >= quota_trans_nodes && -+ domain_is_unprivileged(conn)) { -+ ret = ENOSPC; -+ goto err; -+ } - i = talloc_zero(trans, struct accessed_node); - if (!i) - goto nomem; -@@ -297,6 +305,7 @@ int access_node(struct connection *conn, struct node *node, - i->ta_node = true; - } - } -+ trans->nodes++; - list_add_tail(&i->list, &trans->accessed); - } - -diff --git a/tools/xenstore/xenstored_transaction.h b/tools/xenstore/xenstored_transaction.h -index 0093cac807e3..e3cbd6b23095 100644 ---- a/tools/xenstore/xenstored_transaction.h -+++ b/tools/xenstore/xenstored_transaction.h -@@ -39,8 +39,8 @@ void transaction_entry_inc(struct transaction *trans, unsigned int domid); - void transaction_entry_dec(struct transaction *trans, unsigned int domid); - - /* This node was accessed. */ --int access_node(struct connection *conn, struct node *node, -- enum node_access_type type, TDB_DATA *key); -+int __must_check access_node(struct connection *conn, struct node *node, -+ enum node_access_type type, TDB_DATA *key); - - /* Queue watches for a modified node. 
*/ - void queue_watches(struct connection *conn, const char *name, bool watch_exact); --- -2.37.4 - diff --git a/0094-tools-xenstore-move-the-call-of-setup_structure-to-d.patch b/0094-tools-xenstore-move-the-call-of-setup_structure-to-d.patch deleted file mode 100644 index 4cebe89..0000000 --- a/0094-tools-xenstore-move-the-call-of-setup_structure-to-d.patch +++ /dev/null @@ -1,96 +0,0 @@ -From ccef72b6a885714dae0b6f1accb33042ee40e108 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:09 +0200 -Subject: [PATCH 094/126] tools/xenstore: move the call of setup_structure() to - dom0 introduction - -Setting up the basic structure when introducing dom0 has the advantage -to be able to add proper node memory accounting for the added nodes -later. - -This makes it possible to do proper node accounting, too. - -An additional requirement to make that work fine is to correct the -owner of the created nodes to be dom0_domid instead of domid 0. - -This is part of XSA-326. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Acked-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 60e2f6020dea7f616857b8fc1141b1c085d88761) ---- - tools/xenstore/xenstored_core.c | 9 ++++----- - tools/xenstore/xenstored_core.h | 1 + - tools/xenstore/xenstored_domain.c | 3 +++ - 3 files changed, 8 insertions(+), 5 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 050d6f651ae9..51af74390cbe 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -1940,7 +1940,8 @@ static int tdb_flags; - static void manual_node(const char *name, const char *child) - { - struct node *node; -- struct xs_permissions perms = { .id = 0, .perms = XS_PERM_NONE }; -+ struct xs_permissions perms = { .id = dom0_domid, -+ .perms = XS_PERM_NONE }; - - node = talloc_zero(NULL, struct node); - if (!node) -@@ -1979,7 +1980,7 @@ static void tdb_logger(TDB_CONTEXT *tdb, int level, const char * fmt, ...) - } - } - --static void setup_structure(bool live_update) -+void setup_structure(bool live_update) - { - char *tdbname; - -@@ -2002,6 +2003,7 @@ static void setup_structure(bool live_update) - manual_node("/", "tool"); - manual_node("/tool", "xenstored"); - manual_node("/tool/xenstored", NULL); -+ domain_entry_fix(dom0_domid, 3, true); - } - - check_store(); -@@ -2512,9 +2514,6 @@ int main(int argc, char *argv[]) - - init_pipe(reopen_log_pipe); - -- /* Setup the database */ -- setup_structure(live_update); -- - /* Listen to hypervisor. 
*/ - if (!no_domain_init && !live_update) { - domain_init(-1); -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index 1b3bd5ca563a..459698d8407a 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -224,6 +224,7 @@ int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, - struct node *read_node(struct connection *conn, const void *ctx, - const char *name); - -+void setup_structure(bool live_update); - struct connection *new_connection(connwritefn_t *write, connreadfn_t *read); - struct connection *get_connection_by_id(unsigned int conn_id); - void check_store(void); -diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c -index 3c27973fb836..0dd75a6a2194 100644 ---- a/tools/xenstore/xenstored_domain.c -+++ b/tools/xenstore/xenstored_domain.c -@@ -476,6 +476,9 @@ static struct domain *introduce_domain(const void *ctx, - } - domain->interface = interface; - -+ if (is_master_domain) -+ setup_structure(restore); -+ - /* Now domain belongs to its connection. */ - talloc_steal(domain->conn, domain); - --- -2.37.4 - diff --git a/0095-tools-xenstore-add-infrastructure-to-keep-track-of-p.patch b/0095-tools-xenstore-add-infrastructure-to-keep-track-of-p.patch deleted file mode 100644 index f826f80..0000000 --- a/0095-tools-xenstore-add-infrastructure-to-keep-track-of-p.patch +++ /dev/null @@ -1,289 +0,0 @@ -From aa29eb624797fb6825e4a23071c88417672868a4 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:09 +0200 -Subject: [PATCH 095/126] tools/xenstore: add infrastructure to keep track of - per domain memory usage - -The amount of memory a domain can consume in Xenstore is limited by -various quota today, but even with sane quota a domain can still -consume rather large memory quantities. - -Add the infrastructure for keeping track of the amount of memory a -domain is consuming in Xenstore. Note that this is only the memory a -domain has direct control over, so any internal administration data -needed by Xenstore only is not being accounted for. - -There are two quotas defined: a soft quota which will result in a -warning issued via syslog() when it is exceeded, and a hard quota -resulting in a stop of accepting further requests or watch events as -long as the hard quota would be violated by accepting those. - -Setting any of those quotas to 0 will disable it. - -As default values use 2MB per domain for the soft limit (this basically -covers the allowed case to create 1000 nodes needing 2kB each), and -2.5MB for the hard limit. - -This is part of XSA-326. 
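The soft/hard distinction can be illustrated with a small standalone check (the real code uses syslog() and rate-limits its messages; plain stderr is used here only to keep the sketch self-contained):

    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative soft/hard memory quota check, not the xenstored code. */
    static int quota_soft = 2 * 1024 * 1024;              /* warn above 2 MB */
    static int quota_hard = 2 * 1024 * 1024 + 512 * 1024; /* stall above 2.5 MB */

    /*
     * Returns true if the domain must be stalled (hard quota violated).
     * A value of 0 for either quota disables that check.
     */
    static bool check_quota(unsigned int domid, int memory)
    {
        if (quota_hard && memory >= quota_hard) {
            fprintf(stderr, "domain %u exceeds hard memory quota\n", domid);
            return true;
        }
        if (quota_soft && memory >= quota_soft)
            fprintf(stderr, "domain %u exceeds soft memory quota\n", domid);
        return false;
    }

    int main(void)
    {
        printf("stall: %d\n", check_quota(3, 2 * 1024 * 1024 + 100)); /* soft only */
        printf("stall: %d\n", check_quota(3, 3 * 1024 * 1024));       /* hard */
        return 0;
    }

Setting either limit to 0 disables the corresponding check, as described in the commit message.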
- -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 0d4a8ec7a93faedbe54fd197db146de628459e77) ---- - tools/xenstore/xenstored_core.c | 30 ++++++++-- - tools/xenstore/xenstored_core.h | 2 + - tools/xenstore/xenstored_domain.c | 93 +++++++++++++++++++++++++++++++ - tools/xenstore/xenstored_domain.h | 20 +++++++ - 4 files changed, 139 insertions(+), 6 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 51af74390cbe..eeb0d893e8c3 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -109,6 +109,8 @@ int quota_nb_perms_per_node = 5; - int quota_trans_nodes = 1024; - int quota_max_path_len = XENSTORE_REL_PATH_MAX; - int quota_req_outstanding = 20; -+int quota_memory_per_domain_soft = 2 * 1024 * 1024; /* 2 MB */ -+int quota_memory_per_domain_hard = 2 * 1024 * 1024 + 512 * 1024; /* 2.5 MB */ - - unsigned int timeout_watch_event_msec = 20000; - -@@ -2304,7 +2306,14 @@ static void usage(void) - " quotas are:\n" - " transaction-nodes: number of accessed node per\n" - " transaction\n" -+" memory: total used memory per domain for nodes,\n" -+" transactions, watches and requests, above\n" -+" which Xenstore will stop talking to domain\n" - " outstanding: number of outstanding requests\n" -+" -q, --quota-soft <what>=<nb> set a soft quota <what> to the value <nb>,\n" -+" causing a warning to be issued via syslog() if the\n" -+" limit is violated, allowed quotas are:\n" -+" memory: see above\n" - " -w, --timeout <what>=<seconds> set the timeout in seconds for <what>,\n" - " allowed timeout candidates are:\n" - " watch-event: time a watch-event is kept pending\n" -@@ -2331,6 +2340,7 @@ static struct option options[] = { - { "perm-nb", 1, NULL, 'A' }, - { "path-max", 1, NULL, 'M' }, - { "quota", 1, NULL, 'Q' }, -+ { "quota-soft", 1, NULL, 'q' }, - { "timeout", 1, NULL, 'w' }, - { "no-recovery", 0, NULL, 'R' }, - { "internal-db", 0, NULL, 'I' }, -@@ -2379,7 +2389,7 @@ static void set_timeout(const char *arg) - barf("unknown timeout \"%s\"\n", arg); - } - --static void set_quota(const char *arg) -+static void set_quota(const char *arg, bool soft) - { - const char *eq = strchr(arg, '='); - int val; -@@ -2387,11 +2397,16 @@ static void set_quota(const char *arg) - if (!eq) - barf("quotas must be specified via <what>=<nb>\n"); - val = get_optval_int(eq + 1); -- if (what_matches(arg, "outstanding")) -+ if (what_matches(arg, "outstanding") && !soft) - quota_req_outstanding = val; -- else if (what_matches(arg, "transaction-nodes")) -+ else if (what_matches(arg, "transaction-nodes") && !soft) - quota_trans_nodes = val; -- else -+ else if (what_matches(arg, "memory")) { -+ if (soft) -+ quota_memory_per_domain_soft = val; -+ else -+ quota_memory_per_domain_hard = val; -+ } else - barf("unknown quota \"%s\"\n", arg); - } - -@@ -2409,7 +2424,7 @@ int main(int argc, char *argv[]) - orig_argc = argc; - orig_argv = argv; - -- while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:A:M:Q:T:RVW:w:U", -+ while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:A:M:Q:q:T:RVW:w:U", - options, NULL)) != -1) { - switch (opt) { - case 'D': -@@ -2459,7 +2474,10 @@ int main(int argc, char *argv[]) - quota_max_path_len); - break; - case 'Q': -- set_quota(optarg); -+ set_quota(optarg, false); -+ break; -+ case 'q': -+ set_quota(optarg, true); - break; - case 'w': - set_timeout(optarg); -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index 
459698d8407a..2fb37dbfe847 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -263,6 +263,8 @@ extern int priv_domid; - extern int quota_nb_entry_per_domain; - extern int quota_req_outstanding; - extern int quota_trans_nodes; -+extern int quota_memory_per_domain_soft; -+extern int quota_memory_per_domain_hard; - - extern unsigned int timeout_watch_event_msec; - -diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c -index 0dd75a6a2194..ec542df6a67e 100644 ---- a/tools/xenstore/xenstored_domain.c -+++ b/tools/xenstore/xenstored_domain.c -@@ -76,6 +76,13 @@ struct domain - /* number of entry from this domain in the store */ - int nbentry; - -+ /* Amount of memory allocated for this domain. */ -+ int memory; -+ bool soft_quota_reported; -+ bool hard_quota_reported; -+ time_t mem_last_msg; -+#define MEM_WARN_MINTIME_SEC 10 -+ - /* number of watch for this domain */ - int nbwatch; - -@@ -296,6 +303,9 @@ bool domain_can_read(struct connection *conn) - return false; - if (conn->domain->nboutstanding >= quota_req_outstanding) - return false; -+ if (conn->domain->memory >= quota_memory_per_domain_hard && -+ quota_memory_per_domain_hard) -+ return false; - } - - if (conn->is_ignored) -@@ -956,6 +966,89 @@ int domain_entry(struct connection *conn) - : 0; - } - -+static bool domain_chk_quota(struct domain *domain, int mem) -+{ -+ time_t now; -+ -+ if (!domain || !domid_is_unprivileged(domain->domid) || -+ (domain->conn && domain->conn->is_ignored)) -+ return false; -+ -+ now = time(NULL); -+ -+ if (mem >= quota_memory_per_domain_hard && -+ quota_memory_per_domain_hard) { -+ if (domain->hard_quota_reported) -+ return true; -+ syslog(LOG_ERR, "Domain %u exceeds hard memory quota, Xenstore interface to domain stalled\n", -+ domain->domid); -+ domain->mem_last_msg = now; -+ domain->hard_quota_reported = true; -+ return true; -+ } -+ -+ if (now - domain->mem_last_msg >= MEM_WARN_MINTIME_SEC) { -+ if (domain->hard_quota_reported) { -+ domain->mem_last_msg = now; -+ domain->hard_quota_reported = false; -+ syslog(LOG_INFO, "Domain %u below hard memory quota again\n", -+ domain->domid); -+ } -+ if (mem >= quota_memory_per_domain_soft && -+ quota_memory_per_domain_soft && -+ !domain->soft_quota_reported) { -+ domain->mem_last_msg = now; -+ domain->soft_quota_reported = true; -+ syslog(LOG_WARNING, "Domain %u exceeds soft memory quota\n", -+ domain->domid); -+ } -+ if (mem < quota_memory_per_domain_soft && -+ domain->soft_quota_reported) { -+ domain->mem_last_msg = now; -+ domain->soft_quota_reported = false; -+ syslog(LOG_INFO, "Domain %u below soft memory quota again\n", -+ domain->domid); -+ } -+ -+ } -+ -+ return false; -+} -+ -+int domain_memory_add(unsigned int domid, int mem, bool no_quota_check) -+{ -+ struct domain *domain; -+ -+ domain = find_domain_struct(domid); -+ if (domain) { -+ /* -+ * domain_chk_quota() will print warning and also store whether -+ * the soft/hard quota has been hit. So check no_quota_check -+ * *after*. -+ */ -+ if (domain_chk_quota(domain, domain->memory + mem) && -+ !no_quota_check) -+ return ENOMEM; -+ domain->memory += mem; -+ } else { -+ /* -+ * The domain the memory is to be accounted for should always -+ * exist, as accounting is done either for a domain related to -+ * the current connection, or for the domain owning a node -+ * (which is always existing, as the owner of the node is -+ * tested to exist and replaced by domid 0 if not). 
-+ * So not finding the related domain MUST be an error in the -+ * data base. -+ */ -+ errno = ENOENT; -+ corrupt(NULL, "Accounting called for non-existing domain %u\n", -+ domid); -+ return ENOENT; -+ } -+ -+ return 0; -+} -+ - void domain_watch_inc(struct connection *conn) - { - if (!conn || !conn->domain) -diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h -index cce13d14f016..571aa46d158e 100644 ---- a/tools/xenstore/xenstored_domain.h -+++ b/tools/xenstore/xenstored_domain.h -@@ -65,6 +65,26 @@ int domain_entry_inc(struct connection *conn, struct node *); - void domain_entry_dec(struct connection *conn, struct node *); - int domain_entry_fix(unsigned int domid, int num, bool update); - int domain_entry(struct connection *conn); -+int domain_memory_add(unsigned int domid, int mem, bool no_quota_check); -+ -+/* -+ * domain_memory_add_chk(): to be used when memory quota should be checked. -+ * Not to be used when specifying a negative mem value, as lowering the used -+ * memory should always be allowed. -+ */ -+static inline int domain_memory_add_chk(unsigned int domid, int mem) -+{ -+ return domain_memory_add(domid, mem, false); -+} -+/* -+ * domain_memory_add_nochk(): to be used when memory quota should not be -+ * checked, e.g. when lowering memory usage, or in an error case for undoing -+ * a previous memory adjustment. -+ */ -+static inline void domain_memory_add_nochk(unsigned int domid, int mem) -+{ -+ domain_memory_add(domid, mem, true); -+} - void domain_watch_inc(struct connection *conn); - void domain_watch_dec(struct connection *conn); - int domain_watch(struct connection *conn); --- -2.37.4 - diff --git a/0096-tools-xenstore-add-memory-accounting-for-responses.patch b/0096-tools-xenstore-add-memory-accounting-for-responses.patch deleted file mode 100644 index 6174433..0000000 --- a/0096-tools-xenstore-add-memory-accounting-for-responses.patch +++ /dev/null @@ -1,82 +0,0 @@ -From 0113aacb3d791600668cd7703f6f12ed94fc6d03 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:09 +0200 -Subject: [PATCH 096/126] tools/xenstore: add memory accounting for responses - -Add the memory accounting for queued responses. - -In case adding a watch event for a guest is causing the hard memory -quota of that guest to be violated, the event is dropped. This will -ensure that it is impossible to drive another guest past its memory -quota by generating insane amounts of events for that guest. This is -especially important for protecting driver domains from that attack -vector. - -This is part of XSA-326 / CVE-2022-42315. 
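A condensed sketch of the two accounting paths (illustrative names, not the xenstored API): small replies are charged without a quota check so that error responses can still be delivered, while a watch event that would violate the hard quota is simply dropped:

    #include <errno.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative accounting of queued response bytes for one domain. */
    static int used, hard_quota = 1000;

    static int mem_add_chk(int len)    /* fails when over the hard quota */
    {
        if (hard_quota && used + len > hard_quota)
            return ENOMEM;
        used += len;
        return 0;
    }

    static void mem_add_nochk(int len) /* never fails; e.g. small/error replies */
    {
        used += len;
    }

    /* Watch events are dropped when the target domain is over quota. */
    static bool queue_event(int len)
    {
        if (mem_add_chk(len))
            return false;              /* dropped, quota stays intact */
        return true;
    }

    int main(void)
    {
        mem_add_nochk(990);            /* e.g. a queued error reply */
        printf("event queued: %d\n", queue_event(64));
        return 0;
    }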
- -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit f6d00133643a524d2138c9e3f192bbde719050ba) ---- - tools/xenstore/xenstored_core.c | 22 +++++++++++++++++++--- - 1 file changed, 19 insertions(+), 3 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index eeb0d893e8c3..2e02b577c912 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -260,6 +260,8 @@ static void free_buffered_data(struct buffered_data *out, - } - } - -+ domain_memory_add_nochk(conn->id, -out->hdr.msg.len - sizeof(out->hdr)); -+ - if (out->hdr.msg.type == XS_WATCH_EVENT) { - req = out->pend.req; - if (req) { -@@ -904,11 +906,14 @@ void send_reply(struct connection *conn, enum xsd_sockmsg_type type, - bdata->timeout_msec = 0; - bdata->watch_event = false; - -- if (len <= DEFAULT_BUFFER_SIZE) -+ if (len <= DEFAULT_BUFFER_SIZE) { - bdata->buffer = bdata->default_buffer; -- else { -+ /* Don't check quota, path might be used for returning error. */ -+ domain_memory_add_nochk(conn->id, len + sizeof(bdata->hdr)); -+ } else { - bdata->buffer = talloc_array(bdata, char, len); -- if (!bdata->buffer) { -+ if (!bdata->buffer || -+ domain_memory_add_chk(conn->id, len + sizeof(bdata->hdr))) { - send_error(conn, ENOMEM); - return; - } -@@ -973,6 +978,11 @@ void send_event(struct buffered_data *req, struct connection *conn, - } - } - -+ if (domain_memory_add_chk(conn->id, len + sizeof(bdata->hdr))) { -+ talloc_free(bdata); -+ return; -+ } -+ - if (timeout_watch_event_msec && domain_is_unprivileged(conn)) { - bdata->timeout_msec = get_now_msec() + timeout_watch_event_msec; - if (!conn->timeout_msec) -@@ -2940,6 +2950,12 @@ static void add_buffered_data(struct buffered_data *bdata, - */ - if (bdata->hdr.msg.type != XS_WATCH_EVENT) - domain_outstanding_inc(conn); -+ /* -+ * We are restoring the state after Live-Update and the new quota may -+ * be smaller. So ignore it. The limit will be applied for any resource -+ * after the state has been fully restored. -+ */ -+ domain_memory_add_nochk(conn->id, len + sizeof(bdata->hdr)); - } - - void read_state_buffered_data(const void *ctx, struct connection *conn, --- -2.37.4 - diff --git a/0097-tools-xenstore-add-memory-accounting-for-watches.patch b/0097-tools-xenstore-add-memory-accounting-for-watches.patch deleted file mode 100644 index dd2ed61..0000000 --- a/0097-tools-xenstore-add-memory-accounting-for-watches.patch +++ /dev/null @@ -1,96 +0,0 @@ -From 9c2e71fe0611da9ed2ebbf2362a9bb05d42bf0c3 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:10 +0200 -Subject: [PATCH 097/126] tools/xenstore: add memory accounting for watches - -Add the memory accounting for registered watches. - -When a socket connection is destroyed, the associated watches are -removed, too. In order to keep memory accounting correct the watches -must be removed explicitly via a call of conn_delete_all_watches() from -destroy_conn(). - -This is part of XSA-326 / CVE-2022-42315. 
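A minimal sketch of the watch accounting (hypothetical helpers, not the xenstored functions): each registered watch charges the length of its path and token to the domain, and tearing down a connection has to release every watch so the balance returns to zero:

    #include <stdio.h>
    #include <string.h>

    /* Illustrative per-domain accounting of registered watches. */
    static int watch_mem;              /* bytes charged to the domain */

    static void watch_add(const char *path, const char *token)
    {
        watch_mem += strlen(path) + strlen(token);
    }

    static void watch_del(const char *path, const char *token)
    {
        watch_mem -= strlen(path) + strlen(token);
    }

    int main(void)
    {
        watch_add("/local/domain/5/device", "t1");
        watch_add("@releaseDomain", "t2");

        /* Destroying the connection must release every watch it registered,
         * otherwise the accounted value never returns to zero. */
        watch_del("/local/domain/5/device", "t1");
        watch_del("@releaseDomain", "t2");

        printf("remaining accounted bytes: %d\n", watch_mem); /* 0 */
        return 0;
    }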
- -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 7f9978a2cc37aaffab2fb09593bc598c0712a69b) ---- - tools/xenstore/xenstored_core.c | 1 + - tools/xenstore/xenstored_watch.c | 13 ++++++++++--- - 2 files changed, 11 insertions(+), 3 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 2e02b577c912..b1a4575929bd 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -457,6 +457,7 @@ static int destroy_conn(void *_conn) - } - - conn_free_buffered_data(conn); -+ conn_delete_all_watches(conn); - list_for_each_entry(req, &conn->ref_list, list) - req->on_ref_list = false; - -diff --git a/tools/xenstore/xenstored_watch.c b/tools/xenstore/xenstored_watch.c -index 1d664e3d6b72..0d5858df5bdd 100644 ---- a/tools/xenstore/xenstored_watch.c -+++ b/tools/xenstore/xenstored_watch.c -@@ -211,7 +211,7 @@ static int check_watch_path(struct connection *conn, const void *ctx, - } - - static struct watch *add_watch(struct connection *conn, char *path, char *token, -- bool relative) -+ bool relative, bool no_quota_check) - { - struct watch *watch; - -@@ -222,6 +222,9 @@ static struct watch *add_watch(struct connection *conn, char *path, char *token, - watch->token = talloc_strdup(watch, token); - if (!watch->node || !watch->token) - goto nomem; -+ if (domain_memory_add(conn->id, strlen(path) + strlen(token), -+ no_quota_check)) -+ goto nomem; - - if (relative) - watch->relative_path = get_implicit_path(conn); -@@ -265,7 +268,7 @@ int do_watch(struct connection *conn, struct buffered_data *in) - if (domain_watch(conn) > quota_nb_watch_per_domain) - return E2BIG; - -- watch = add_watch(conn, vec[0], vec[1], relative); -+ watch = add_watch(conn, vec[0], vec[1], relative, false); - if (!watch) - return errno; - -@@ -296,6 +299,8 @@ int do_unwatch(struct connection *conn, struct buffered_data *in) - list_for_each_entry(watch, &conn->watches, list) { - if (streq(watch->node, node) && streq(watch->token, vec[1])) { - list_del(&watch->list); -+ domain_memory_add_nochk(conn->id, -strlen(watch->node) - -+ strlen(watch->token)); - talloc_free(watch); - domain_watch_dec(conn); - send_ack(conn, XS_UNWATCH); -@@ -311,6 +316,8 @@ void conn_delete_all_watches(struct connection *conn) - - while ((watch = list_top(&conn->watches, struct watch, list))) { - list_del(&watch->list); -+ domain_memory_add_nochk(conn->id, -strlen(watch->node) - -+ strlen(watch->token)); - talloc_free(watch); - domain_watch_dec(conn); - } -@@ -373,7 +380,7 @@ void read_state_watch(const void *ctx, const void *state) - if (!path) - barf("allocation error for read watch"); - -- if (!add_watch(conn, path, token, relative)) -+ if (!add_watch(conn, path, token, relative, true)) - barf("error adding watch"); - } - --- -2.37.4 - diff --git a/0098-tools-xenstore-add-memory-accounting-for-nodes.patch b/0098-tools-xenstore-add-memory-accounting-for-nodes.patch deleted file mode 100644 index f2f8e4f..0000000 --- a/0098-tools-xenstore-add-memory-accounting-for-nodes.patch +++ /dev/null @@ -1,342 +0,0 @@ -From 32efe29a00efab2896cc973e966a35ecad556495 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:10 +0200 -Subject: [PATCH 098/126] tools/xenstore: add memory accounting for nodes - -Add the memory accounting for Xenstore nodes. In order to make this -not too complicated allow for some sloppiness when writing nodes. 
Any -hard quota violation will result in no further requests to be accepted. - -This is part of XSA-326 / CVE-2022-42315. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 00e9e32d022be1afc144b75acdaeba8393e63315) ---- - tools/xenstore/xenstored_core.c | 140 ++++++++++++++++++++++--- - tools/xenstore/xenstored_core.h | 12 +++ - tools/xenstore/xenstored_transaction.c | 16 ++- - 3 files changed, 151 insertions(+), 17 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index b1a4575929bd..f27d5c0101bc 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -556,6 +556,117 @@ void set_tdb_key(const char *name, TDB_DATA *key) - key->dsize = strlen(name); - } - -+static void get_acc_data(TDB_DATA *key, struct node_account_data *acc) -+{ -+ TDB_DATA old_data; -+ struct xs_tdb_record_hdr *hdr; -+ -+ if (acc->memory < 0) { -+ old_data = tdb_fetch(tdb_ctx, *key); -+ /* No check for error, as the node might not exist. */ -+ if (old_data.dptr == NULL) { -+ acc->memory = 0; -+ } else { -+ hdr = (void *)old_data.dptr; -+ acc->memory = old_data.dsize; -+ acc->domid = hdr->perms[0].id; -+ } -+ talloc_free(old_data.dptr); -+ } -+} -+ -+/* -+ * Per-transaction nodes need to be accounted for the transaction owner. -+ * Those nodes are stored in the data base with the transaction generation -+ * count prepended (e.g. 123/local/domain/...). So testing for the node's -+ * key not to start with "/" is sufficient. -+ */ -+static unsigned int get_acc_domid(struct connection *conn, TDB_DATA *key, -+ unsigned int domid) -+{ -+ return (!conn || key->dptr[0] == '/') ? domid : conn->id; -+} -+ -+int do_tdb_write(struct connection *conn, TDB_DATA *key, TDB_DATA *data, -+ struct node_account_data *acc, bool no_quota_check) -+{ -+ struct xs_tdb_record_hdr *hdr = (void *)data->dptr; -+ struct node_account_data old_acc = {}; -+ unsigned int old_domid, new_domid; -+ int ret; -+ -+ if (!acc) -+ old_acc.memory = -1; -+ else -+ old_acc = *acc; -+ -+ get_acc_data(key, &old_acc); -+ old_domid = get_acc_domid(conn, key, old_acc.domid); -+ new_domid = get_acc_domid(conn, key, hdr->perms[0].id); -+ -+ /* -+ * Don't check for ENOENT, as we want to be able to switch orphaned -+ * nodes to new owners. -+ */ -+ if (old_acc.memory) -+ domain_memory_add_nochk(old_domid, -+ -old_acc.memory - key->dsize); -+ ret = domain_memory_add(new_domid, data->dsize + key->dsize, -+ no_quota_check); -+ if (ret) { -+ /* Error path, so no quota check. */ -+ if (old_acc.memory) -+ domain_memory_add_nochk(old_domid, -+ old_acc.memory + key->dsize); -+ return ret; -+ } -+ -+ /* TDB should set errno, but doesn't even set ecode AFAICT. */ -+ if (tdb_store(tdb_ctx, *key, *data, TDB_REPLACE) != 0) { -+ domain_memory_add_nochk(new_domid, -data->dsize - key->dsize); -+ /* Error path, so no quota check. */ -+ if (old_acc.memory) -+ domain_memory_add_nochk(old_domid, -+ old_acc.memory + key->dsize); -+ errno = EIO; -+ return errno; -+ } -+ -+ if (acc) { -+ /* Don't use new_domid, as it might be a transaction node. 
*/ -+ acc->domid = hdr->perms[0].id; -+ acc->memory = data->dsize; -+ } -+ -+ return 0; -+} -+ -+int do_tdb_delete(struct connection *conn, TDB_DATA *key, -+ struct node_account_data *acc) -+{ -+ struct node_account_data tmp_acc; -+ unsigned int domid; -+ -+ if (!acc) { -+ acc = &tmp_acc; -+ acc->memory = -1; -+ } -+ -+ get_acc_data(key, acc); -+ -+ if (tdb_delete(tdb_ctx, *key)) { -+ errno = EIO; -+ return errno; -+ } -+ -+ if (acc->memory) { -+ domid = get_acc_domid(conn, key, acc->domid); -+ domain_memory_add_nochk(domid, -acc->memory - key->dsize); -+ } -+ -+ return 0; -+} -+ - /* - * If it fails, returns NULL and sets errno. - * Temporary memory allocations will be done with ctx. -@@ -609,9 +720,15 @@ struct node *read_node(struct connection *conn, const void *ctx, - - /* Permissions are struct xs_permissions. */ - node->perms.p = hdr->perms; -+ node->acc.domid = node->perms.p[0].id; -+ node->acc.memory = data.dsize; - if (domain_adjust_node_perms(conn, node)) - goto error; - -+ /* If owner is gone reset currently accounted memory size. */ -+ if (node->acc.domid != node->perms.p[0].id) -+ node->acc.memory = 0; -+ - /* Data is binary blob (usually ascii, no nul). */ - node->data = node->perms.p + hdr->num_perms; - /* Children is strings, nul separated. */ -@@ -680,12 +797,9 @@ int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, - p += node->datalen; - memcpy(p, node->children, node->childlen); - -- /* TDB should set errno, but doesn't even set ecode AFAICT. */ -- if (tdb_store(tdb_ctx, *key, data, TDB_REPLACE) != 0) { -- corrupt(conn, "Write of %s failed", key->dptr); -- errno = EIO; -- return errno; -- } -+ if (do_tdb_write(conn, key, &data, &node->acc, no_quota_check)) -+ return EIO; -+ - return 0; - } - -@@ -1188,7 +1302,7 @@ static void delete_node_single(struct connection *conn, struct node *node) - if (access_node(conn, node, NODE_ACCESS_DELETE, &key)) - return; - -- if (tdb_delete(tdb_ctx, key) != 0) { -+ if (do_tdb_delete(conn, &key, &node->acc) != 0) { - corrupt(conn, "Could not delete '%s'", node->name); - return; - } -@@ -1261,6 +1375,7 @@ static struct node *construct_node(struct connection *conn, const void *ctx, - /* No children, no data */ - node->children = node->data = NULL; - node->childlen = node->datalen = 0; -+ node->acc.memory = 0; - node->parent = parent; - return node; - -@@ -1269,17 +1384,17 @@ nomem: - return NULL; - } - --static void destroy_node_rm(struct node *node) -+static void destroy_node_rm(struct connection *conn, struct node *node) - { - if (streq(node->name, "/")) - corrupt(NULL, "Destroying root node!"); - -- tdb_delete(tdb_ctx, node->key); -+ do_tdb_delete(conn, &node->key, &node->acc); - } - - static int destroy_node(struct connection *conn, struct node *node) - { -- destroy_node_rm(node); -+ destroy_node_rm(conn, node); - domain_entry_dec(conn, node); - - /* -@@ -1331,7 +1446,7 @@ static struct node *create_node(struct connection *conn, const void *ctx, - /* Account for new node */ - if (i->parent) { - if (domain_entry_inc(conn, i)) { -- destroy_node_rm(i); -+ destroy_node_rm(conn, i); - return NULL; - } - } -@@ -2192,7 +2307,7 @@ static int clean_store_(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA val, - if (!hashtable_search(reachable, name)) { - log("clean_store: '%s' is orphaned!", name); - if (recovery) { -- tdb_delete(tdb, key); -+ do_tdb_delete(NULL, &key, NULL); - } - } - -@@ -3030,6 +3145,7 @@ void read_state_node(const void *ctx, const void *state) - if (!node) - barf("allocation error restoring node"); - -+ 
node->acc.memory = 0; - node->name = name; - node->generation = ++generation; - node->datalen = sn->data_len; -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index 2fb37dbfe847..5c1b574bffe6 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -169,6 +169,11 @@ struct node_perms { - struct xs_permissions *p; - }; - -+struct node_account_data { -+ unsigned int domid; -+ int memory; /* -1 if unknown */ -+}; -+ - struct node { - const char *name; - /* Key used to update TDB */ -@@ -191,6 +196,9 @@ struct node { - /* Children, each nul-terminated. */ - unsigned int childlen; - char *children; -+ -+ /* Allocation information for node currently in store. */ -+ struct node_account_data acc; - }; - - /* Return the only argument in the input. */ -@@ -300,6 +308,10 @@ extern xengnttab_handle **xgt_handle; - int remember_string(struct hashtable *hash, const char *str); - - void set_tdb_key(const char *name, TDB_DATA *key); -+int do_tdb_write(struct connection *conn, TDB_DATA *key, TDB_DATA *data, -+ struct node_account_data *acc, bool no_quota_check); -+int do_tdb_delete(struct connection *conn, TDB_DATA *key, -+ struct node_account_data *acc); - - void conn_free_buffered_data(struct connection *conn); - -diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c -index 7bd41eb475e3..ace9a11d77bb 100644 ---- a/tools/xenstore/xenstored_transaction.c -+++ b/tools/xenstore/xenstored_transaction.c -@@ -153,6 +153,9 @@ struct transaction - /* List of all transactions active on this connection. */ - struct list_head list; - -+ /* Connection this transaction is associated with. */ -+ struct connection *conn; -+ - /* Connection-local identifier for this transaction. */ - uint32_t id; - -@@ -286,6 +289,8 @@ int access_node(struct connection *conn, struct node *node, - - introduce = true; - i->ta_node = false; -+ /* acc.memory < 0 means "unknown, get size from TDB". */ -+ node->acc.memory = -1; - - /* - * Additional transaction-specific node for read type. 
We only -@@ -410,11 +415,11 @@ static int finalize_transaction(struct connection *conn, - goto err; - hdr = (void *)data.dptr; - hdr->generation = ++generation; -- ret = tdb_store(tdb_ctx, key, data, -- TDB_REPLACE); -+ ret = do_tdb_write(conn, &key, &data, NULL, -+ true); - talloc_free(data.dptr); - } else { -- ret = tdb_delete(tdb_ctx, key); -+ ret = do_tdb_delete(conn, &key, NULL); - } - if (ret) - goto err; -@@ -425,7 +430,7 @@ static int finalize_transaction(struct connection *conn, - } - } - -- if (i->ta_node && tdb_delete(tdb_ctx, ta_key)) -+ if (i->ta_node && do_tdb_delete(conn, &ta_key, NULL)) - goto err; - list_del(&i->list); - talloc_free(i); -@@ -453,7 +458,7 @@ static int destroy_transaction(void *_transaction) - i->node); - if (trans_name) { - set_tdb_key(trans_name, &key); -- tdb_delete(tdb_ctx, key); -+ do_tdb_delete(trans->conn, &key, NULL); - } - } - list_del(&i->list); -@@ -497,6 +502,7 @@ int do_transaction_start(struct connection *conn, struct buffered_data *in) - - INIT_LIST_HEAD(&trans->accessed); - INIT_LIST_HEAD(&trans->changed_domains); -+ trans->conn = conn; - trans->fail = false; - trans->generation = ++generation; - --- -2.37.4 - diff --git a/0099-tools-xenstore-add-exports-for-quota-variables.patch b/0099-tools-xenstore-add-exports-for-quota-variables.patch deleted file mode 100644 index 98f341f..0000000 --- a/0099-tools-xenstore-add-exports-for-quota-variables.patch +++ /dev/null @@ -1,62 +0,0 @@ -From 1fc3ecc9bfead0a50d8e05de983ed2a8f02fa03c Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:10 +0200 -Subject: [PATCH 099/126] tools/xenstore: add exports for quota variables - -Some quota variables are not exported via header files. - -This is part of XSA-326. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Acked-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 1da16d5990b5f7752657fca3e948f735177ea9ad) ---- - tools/xenstore/xenstored_core.h | 5 +++++ - tools/xenstore/xenstored_transaction.c | 1 - - tools/xenstore/xenstored_watch.c | 2 -- - 3 files changed, 5 insertions(+), 3 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index 5c1b574bffe6..1eb3708f82dd 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -268,6 +268,11 @@ extern TDB_CONTEXT *tdb_ctx; - extern int dom0_domid; - extern int dom0_event; - extern int priv_domid; -+extern int quota_nb_watch_per_domain; -+extern int quota_max_transaction; -+extern int quota_max_entry_size; -+extern int quota_nb_perms_per_node; -+extern int quota_max_path_len; - extern int quota_nb_entry_per_domain; - extern int quota_req_outstanding; - extern int quota_trans_nodes; -diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c -index ace9a11d77bb..28774813de83 100644 ---- a/tools/xenstore/xenstored_transaction.c -+++ b/tools/xenstore/xenstored_transaction.c -@@ -175,7 +175,6 @@ struct transaction - bool fail; - }; - --extern int quota_max_transaction; - uint64_t generation; - - static struct accessed_node *find_accessed_node(struct transaction *trans, -diff --git a/tools/xenstore/xenstored_watch.c b/tools/xenstore/xenstored_watch.c -index 0d5858df5bdd..4970e9f1a1b9 100644 ---- a/tools/xenstore/xenstored_watch.c -+++ b/tools/xenstore/xenstored_watch.c -@@ -31,8 +31,6 @@ - #include "xenstored_domain.h" - #include "xenstored_transaction.h" - --extern int quota_nb_watch_per_domain; -- - struct watch - { - /* Watches on this connection */ --- 
-2.37.4
-
diff --git a/0100-tools-xenstore-add-control-command-for-setting-and-s.patch b/0100-tools-xenstore-add-control-command-for-setting-and-s.patch
deleted file mode 100644
index e721645..0000000
--- a/0100-tools-xenstore-add-control-command-for-setting-and-s.patch
+++ /dev/null
@@ -1,248 +0,0 @@
-From 4d30175fdadb75c55acb8abb186727eda7cd5585 Mon Sep 17 00:00:00 2001
-From: Juergen Gross <jgross@suse.com>
-Date: Tue, 13 Sep 2022 07:35:10 +0200
-Subject: [PATCH 100/126] tools/xenstore: add control command for setting and
- showing quota
-
-Add a xenstore-control command "quota" to:
-- show current quota settings
-- change quota settings
-- show current quota related values of a domain
-
-Note that in the case the new quota is lower than existing one,
-Xenstored may continue to handle requests from a domain exceeding the
-new limit (depends on which one has been broken) and the amount of
-resource used will not change. However the domain will not be able to
-create more resource (associated to the quota) until it is back to below
-the limit.
-
-This is part of XSA-326.
-
-Signed-off-by: Juergen Gross <jgross@suse.com>
-Reviewed-by: Julien Grall <jgrall@amazon.com>
-(cherry picked from commit 9c484bef83496b683b0087e3bd2a560da4aa37af)
----
- docs/misc/xenstore.txt | 11 +++
- tools/xenstore/xenstored_control.c | 111 +++++++++++++++++++++++++++++
- tools/xenstore/xenstored_domain.c | 33 +++++++++
- tools/xenstore/xenstored_domain.h | 2 +
- 4 files changed, 157 insertions(+)
-
-diff --git a/docs/misc/xenstore.txt b/docs/misc/xenstore.txt
-index 334dc8b6fdf5..a7d006519ae8 100644
---- a/docs/misc/xenstore.txt
-+++ b/docs/misc/xenstore.txt
-@@ -366,6 +366,17 @@ CONTROL <command>|[<parameters>|]
- print|<string>
- print <string> to syslog (xenstore runs as daemon) or
- to console (xenstore runs as stubdom)
-+ quota|[set <name> <val>|<domid>]
-+ without parameters: print the current quota settings
-+ with "set <name> <val>": set the quota <name> to new value
-+ <val> (The admin should make sure all the domain usage is
-+ below the quota. If it is not, then Xenstored may continue to
-+ handle requests from the domain as long as the resource
-+ violating the new quota setting isn't increased further)
-+ with "<domid>": print quota related accounting data for
-+ the domain <domid>
-+ quota-soft|[set <name> <val>]
-+ like the "quota" command, but for soft-quota.
- help <supported-commands>
- return list of supported commands for CONTROL
-
-diff --git a/tools/xenstore/xenstored_control.c b/tools/xenstore/xenstored_control.c
-index 211fe1fd9b37..980279fa53ff 100644
---- a/tools/xenstore/xenstored_control.c
-+++ b/tools/xenstore/xenstored_control.c
-@@ -148,6 +148,115 @@ static int do_control_log(void *ctx, struct connection *conn,
- return 0;
- }
-
-+struct quota {
-+ const char *name;
-+ int *quota;
-+ const char *descr;
-+};
-+
-+static const struct quota hard_quotas[] = {
-+ { "nodes", &quota_nb_entry_per_domain, "Nodes per domain" },
-+ { "watches", &quota_nb_watch_per_domain, "Watches per domain" },
-+ { "transactions", &quota_max_transaction, "Transactions per domain" },
-+ { "outstanding", &quota_req_outstanding,
-+ "Outstanding requests per domain" },
-+ { "transaction-nodes", &quota_trans_nodes,
-+ "Max. number of accessed nodes per transaction" },
-+ { "memory", &quota_memory_per_domain_hard,
-+ "Total Xenstore memory per domain (error level)" },
-+ { "node-size", &quota_max_entry_size, "Max. size of a node" },
-+ { "path-max", &quota_max_path_len, "Max. length of a node path" },
-+ { "permissions", &quota_nb_perms_per_node,
-+ "Max. number of permissions per node" },
-+ { NULL, NULL, NULL }
-+};
-+
-+static const struct quota soft_quotas[] = {
-+ { "memory", &quota_memory_per_domain_soft,
-+ "Total Xenstore memory per domain (warning level)" },
-+ { NULL, NULL, NULL }
-+};
-+
-+static int quota_show_current(const void *ctx, struct connection *conn,
-+ const struct quota *quotas)
-+{
-+ char *resp;
-+ unsigned int i;
-+
-+ resp = talloc_strdup(ctx, "Quota settings:\n");
-+ if (!resp)
-+ return ENOMEM;
-+
-+ for (i = 0; quotas[i].quota; i++) {
-+ resp = talloc_asprintf_append(resp, "%-17s: %8d %s\n",
-+ quotas[i].name, *quotas[i].quota,
-+ quotas[i].descr);
-+ if (!resp)
-+ return ENOMEM;
-+ }
-+
-+ send_reply(conn, XS_CONTROL, resp, strlen(resp) + 1);
-+
-+ return 0;
-+}
-+
-+static int quota_set(const void *ctx, struct connection *conn,
-+ char **vec, int num, const struct quota *quotas)
-+{
-+ unsigned int i;
-+ int val;
-+
-+ if (num != 2)
-+ return EINVAL;
-+
-+ val = atoi(vec[1]);
-+ if (val < 1)
-+ return EINVAL;
-+
-+ for (i = 0; quotas[i].quota; i++) {
-+ if (!strcmp(vec[0], quotas[i].name)) {
-+ *quotas[i].quota = val;
-+ send_ack(conn, XS_CONTROL);
-+ return 0;
-+ }
-+ }
-+
-+ return EINVAL;
-+}
-+
-+static int quota_get(const void *ctx, struct connection *conn,
-+ char **vec, int num)
-+{
-+ if (num != 1)
-+ return EINVAL;
-+
-+ return domain_get_quota(ctx, conn, atoi(vec[0]));
-+}
-+
-+static int do_control_quota(void *ctx, struct connection *conn,
-+ char **vec, int num)
-+{
-+ if (num == 0)
-+ return quota_show_current(ctx, conn, hard_quotas);
-+
-+ if (!strcmp(vec[0], "set"))
-+ return quota_set(ctx, conn, vec + 1, num - 1, hard_quotas);
-+
-+ return quota_get(ctx, conn, vec, num);
-+}
-+
-+static int do_control_quota_s(void *ctx, struct connection *conn,
-+ char **vec, int num)
-+{
-+ if (num == 0)
-+ return quota_show_current(ctx, conn, soft_quotas);
-+
-+ if (!strcmp(vec[0], "set"))
-+ return quota_set(ctx, conn, vec + 1, num - 1, soft_quotas);
-+
-+ return EINVAL;
-+}
-+
- #ifdef __MINIOS__
- static int do_control_memreport(void *ctx, struct connection *conn,
- char **vec, int num)
-@@ -777,6 +886,8 @@ static struct cmd_s cmds[] = {
- { "memreport", do_control_memreport, "[<file>]" },
- #endif
- { "print", do_control_print, "<string>" },
-+ { "quota", do_control_quota, "[set <name> <val>|<domid>]" },
-+ { "quota-soft", do_control_quota_s, "[set <name> <val>]" },
- { "help", do_control_help, "" },
- };
-
-diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c
-index ec542df6a67e..3d5142581332 100644
---- a/tools/xenstore/xenstored_domain.c
-+++ b/tools/xenstore/xenstored_domain.c
-@@ -31,6 +31,7 @@
- #include "xenstored_domain.h"
- #include "xenstored_transaction.h"
- #include "xenstored_watch.h"
-+#include "xenstored_control.h"
-
- #include <xenevtchn.h>
- #include <xenctrl.h>
-@@ -351,6 +352,38 @@ static struct domain *find_domain_struct(unsigned int domid)
- return NULL;
- }
-
-+int domain_get_quota(const void *ctx, struct connection *conn,
-+ unsigned int domid)
-+{
-+ struct domain *d = find_domain_struct(domid);
-+ char *resp;
-+ int ta;
-+
-+ if (!d)
-+ return ENOENT;
-+
-+ ta = d->conn ?
d->conn->transaction_started : 0; -+ resp = talloc_asprintf(ctx, "Domain %u:\n", domid); -+ if (!resp) -+ return ENOMEM; -+ -+#define ent(t, e) \ -+ resp = talloc_asprintf_append(resp, "%-16s: %8d\n", #t, e); \ -+ if (!resp) return ENOMEM -+ -+ ent(nodes, d->nbentry); -+ ent(watches, d->nbwatch); -+ ent(transactions, ta); -+ ent(outstanding, d->nboutstanding); -+ ent(memory, d->memory); -+ -+#undef ent -+ -+ send_reply(conn, XS_CONTROL, resp, strlen(resp) + 1); -+ -+ return 0; -+} -+ - static struct domain *alloc_domain(const void *context, unsigned int domid) - { - struct domain *domain; -diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h -index 571aa46d158e..0f883936f413 100644 ---- a/tools/xenstore/xenstored_domain.h -+++ b/tools/xenstore/xenstored_domain.h -@@ -91,6 +91,8 @@ int domain_watch(struct connection *conn); - void domain_outstanding_inc(struct connection *conn); - void domain_outstanding_dec(struct connection *conn); - void domain_outstanding_domid_dec(unsigned int domid); -+int domain_get_quota(const void *ctx, struct connection *conn, -+ unsigned int domid); - - /* Special node permission handling. */ - int set_perms_special(struct connection *conn, const char *name, --- -2.37.4 - diff --git a/0101-tools-ocaml-xenstored-Synchronise-defaults-with-oxen.patch b/0101-tools-ocaml-xenstored-Synchronise-defaults-with-oxen.patch deleted file mode 100644 index 7df76b1..0000000 --- a/0101-tools-ocaml-xenstored-Synchronise-defaults-with-oxen.patch +++ /dev/null @@ -1,63 +0,0 @@ -From 8fabb963e662a544a397cb2afefb2b15af07ace9 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> -Date: Wed, 12 Oct 2022 19:13:01 +0100 -Subject: [PATCH 101/126] tools/ocaml/xenstored: Synchronise defaults with - oxenstore.conf.in -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -We currently have 2 different set of defaults in upstream Xen git tree: -* defined in the source code, only used if there is no config file -* defined in the oxenstored.conf.in upstream Xen - -An oxenstored.conf file is not mandatory, and if missing, maxrequests in -particular has an unsafe default. - -Resync the defaults from oxenstored.conf.in into the source code. - -This is part of XSA-326 / CVE-2022-42316. 
- -Signed-off-by: Edwin Török <edvin.torok@citrix.com> -Acked-by: Christian Lindig <christian.lindig@citrix.com> -(cherry picked from commit 84734955d4bf629ba459a74773afcde50a52236f) ---- - tools/ocaml/xenstored/define.ml | 6 +++--- - tools/ocaml/xenstored/quota.ml | 4 ++-- - 2 files changed, 5 insertions(+), 5 deletions(-) - -diff --git a/tools/ocaml/xenstored/define.ml b/tools/ocaml/xenstored/define.ml -index ebe18b8e312c..6b06f808595b 100644 ---- a/tools/ocaml/xenstored/define.ml -+++ b/tools/ocaml/xenstored/define.ml -@@ -21,9 +21,9 @@ let xs_daemon_socket = Paths.xen_run_stored ^ "/socket" - - let default_config_dir = Paths.xen_config_dir - --let maxwatch = ref (50) --let maxtransaction = ref (20) --let maxrequests = ref (-1) (* maximum requests per transaction *) -+let maxwatch = ref (100) -+let maxtransaction = ref (10) -+let maxrequests = ref (1024) (* maximum requests per transaction *) - - let conflict_burst_limit = ref 5.0 - let conflict_max_history_seconds = ref 0.05 -diff --git a/tools/ocaml/xenstored/quota.ml b/tools/ocaml/xenstored/quota.ml -index abcac912805a..6e3d6401ae89 100644 ---- a/tools/ocaml/xenstored/quota.ml -+++ b/tools/ocaml/xenstored/quota.ml -@@ -20,8 +20,8 @@ exception Transaction_opened - - let warn fmt = Logging.warn "quota" fmt - let activate = ref true --let maxent = ref (10000) --let maxsize = ref (4096) -+let maxent = ref (1000) -+let maxsize = ref (2048) - - type t = { - maxent: int; (* max entities per domU *) --- -2.37.4 - diff --git a/0102-tools-ocaml-xenstored-Check-for-maxrequests-before-p.patch b/0102-tools-ocaml-xenstored-Check-for-maxrequests-before-p.patch deleted file mode 100644 index bc741ae..0000000 --- a/0102-tools-ocaml-xenstored-Check-for-maxrequests-before-p.patch +++ /dev/null @@ -1,101 +0,0 @@ -From 45816222bb3da04f4cd3388efc46d127d48b8906 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> -Date: Thu, 28 Jul 2022 17:08:15 +0100 -Subject: [PATCH 102/126] tools/ocaml/xenstored: Check for maxrequests before - performing operations -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Previously we'd perform the operation, record the updated tree in the -transaction record, then try to insert a watchop path and the reply packet. - -If we exceeded max requests we would've returned EQUOTA, but still: -* have performed the operation on the transaction's tree -* have recorded the watchop, making this queue effectively unbounded - -It is better if we check whether we'd have room to store the operation before -performing the transaction, and raise EQUOTA there. Then the transaction -record won't grow. - -This is part of XSA-326 / CVE-2022-42317. 
- -Signed-off-by: Edwin Török <edvin.torok@citrix.com> -Acked-by: Christian Lindig <christian.lindig@citrix.com> -(cherry picked from commit 329f4d1a6535c6c5a34025ca0d03fc5c7228fcff) ---- - tools/ocaml/xenstored/process.ml | 4 +++- - tools/ocaml/xenstored/transaction.ml | 16 ++++++++++++---- - 2 files changed, 15 insertions(+), 5 deletions(-) - -diff --git a/tools/ocaml/xenstored/process.ml b/tools/ocaml/xenstored/process.ml -index 27790d4a5c41..dd58e6979cf9 100644 ---- a/tools/ocaml/xenstored/process.ml -+++ b/tools/ocaml/xenstored/process.ml -@@ -389,6 +389,7 @@ let input_handle_error ~cons ~doms ~fct ~con ~t ~req = - let reply_error e = - Packet.Error e in - try -+ Transaction.check_quota_exn ~perm:(Connection.get_perm con) t; - fct con t doms cons req.Packet.data - with - | Define.Invalid_path -> reply_error "EINVAL" -@@ -681,9 +682,10 @@ let process_packet ~store ~cons ~doms ~con ~req = - in - - let response = try -+ Transaction.check_quota_exn ~perm:(Connection.get_perm con) t; - if tid <> Transaction.none then - (* Remember the request and response for this operation in case we need to replay the transaction *) -- Transaction.add_operation ~perm:(Connection.get_perm con) t req response; -+ Transaction.add_operation t req response; - response - with Quota.Limit_reached -> - Packet.Error "EQUOTA" -diff --git a/tools/ocaml/xenstored/transaction.ml b/tools/ocaml/xenstored/transaction.ml -index 17b1bdf2eaf9..294143e2335b 100644 ---- a/tools/ocaml/xenstored/transaction.ml -+++ b/tools/ocaml/xenstored/transaction.ml -@@ -85,6 +85,7 @@ type t = { - oldroot: Store.Node.t; - mutable paths: (Xenbus.Xb.Op.operation * Store.Path.t) list; - mutable operations: (Packet.request * Packet.response) list; -+ mutable quota_reached: bool; - mutable read_lowpath: Store.Path.t option; - mutable write_lowpath: Store.Path.t option; - } -@@ -127,6 +128,7 @@ let make ?(internal=false) id store = - oldroot = Store.get_root store; - paths = []; - operations = []; -+ quota_reached = false; - read_lowpath = None; - write_lowpath = None; - } in -@@ -143,13 +145,19 @@ let get_root t = Store.get_root t.store - - let is_read_only t = t.paths = [] - let add_wop t ty path = t.paths <- (ty, path) :: t.paths --let add_operation ~perm t request response = -+let get_operations t = List.rev t.operations -+ -+let check_quota_exn ~perm t = - if !Define.maxrequests >= 0 - && not (Perms.Connection.is_dom0 perm) -- && List.length t.operations >= !Define.maxrequests -- then raise Quota.Limit_reached; -+ && (t.quota_reached || List.length t.operations >= !Define.maxrequests) -+ then begin -+ t.quota_reached <- true; -+ raise Quota.Limit_reached; -+ end -+ -+let add_operation t request response = - t.operations <- (request, response) :: t.operations --let get_operations t = List.rev t.operations - let set_read_lowpath t path = t.read_lowpath <- get_lowest path t.read_lowpath - let set_write_lowpath t path = t.write_lowpath <- get_lowest path t.write_lowpath - --- -2.37.4 - diff --git a/0103-tools-ocaml-GC-parameter-tuning.patch b/0103-tools-ocaml-GC-parameter-tuning.patch deleted file mode 100644 index d1473df..0000000 --- a/0103-tools-ocaml-GC-parameter-tuning.patch +++ /dev/null @@ -1,126 +0,0 @@ -From 9f89883fabd53cb7873cc31778887ba2a1228dd8 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> -Date: Wed, 12 Oct 2022 19:13:07 +0100 -Subject: [PATCH 103/126] tools/ocaml: GC parameter tuning -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -By 
default the OCaml garbage collector would return memory to the OS only -after unused memory is 5x live memory. Tweak this to 120% instead, which -would match the major GC speed. - -This is part of XSA-326. - -Signed-off-by: Edwin Török <edvin.torok@citrix.com> -Acked-by: Christian Lindig <christian.lindig@citrix.com> -(cherry picked from commit 4a8bacff20b857ca0d628ef5525877ade11f2a42) ---- - tools/ocaml/xenstored/define.ml | 1 + - tools/ocaml/xenstored/xenstored.ml | 64 ++++++++++++++++++++++++++++++ - 2 files changed, 65 insertions(+) - -diff --git a/tools/ocaml/xenstored/define.ml b/tools/ocaml/xenstored/define.ml -index 6b06f808595b..ba63a8147e09 100644 ---- a/tools/ocaml/xenstored/define.ml -+++ b/tools/ocaml/xenstored/define.ml -@@ -25,6 +25,7 @@ let maxwatch = ref (100) - let maxtransaction = ref (10) - let maxrequests = ref (1024) (* maximum requests per transaction *) - -+let gc_max_overhead = ref 120 (* 120% see comment in xenstored.ml *) - let conflict_burst_limit = ref 5.0 - let conflict_max_history_seconds = ref 0.05 - let conflict_rate_limit_is_aggregate = ref true -diff --git a/tools/ocaml/xenstored/xenstored.ml b/tools/ocaml/xenstored/xenstored.ml -index d44ae673c42a..3b57ad016dfb 100644 ---- a/tools/ocaml/xenstored/xenstored.ml -+++ b/tools/ocaml/xenstored/xenstored.ml -@@ -104,6 +104,7 @@ let parse_config filename = - ("quota-maxsize", Config.Set_int Quota.maxsize); - ("quota-maxrequests", Config.Set_int Define.maxrequests); - ("quota-path-max", Config.Set_int Define.path_max); -+ ("gc-max-overhead", Config.Set_int Define.gc_max_overhead); - ("test-eagain", Config.Set_bool Transaction.test_eagain); - ("persistent", Config.Set_bool Disk.enable); - ("xenstored-log-file", Config.String Logging.set_xenstored_log_destination); -@@ -265,6 +266,67 @@ let to_file store cons fds file = - (fun () -> close_out channel) - end - -+(* -+ By default OCaml's GC only returns memory to the OS when it exceeds a -+ configurable 'max overhead' setting. -+ The default is 500%, that is 5/6th of the OCaml heap needs to be free -+ and only 1/6th live for a compaction to be triggerred that would -+ release memory back to the OS. -+ If the limit is not hit then the OCaml process can reuse that memory -+ for its own purposes, but other processes won't be able to use it. -+ -+ There is also a 'space overhead' setting that controls how much work -+ each major GC slice does, and by default aims at having no more than -+ 80% or 120% (depending on version) garbage values compared to live -+ values. -+ This doesn't have as much relevance to memory returned to the OS as -+ long as space_overhead <= max_overhead, because compaction is only -+ triggerred at the end of major GC cycles. -+ -+ The defaults are too large once the program starts using ~100MiB of -+ memory, at which point ~500MiB would be unavailable to other processes -+ (which would be fine if this was the main process in this VM, but it is -+ not). -+ -+ Max overhead can also be set to 0, however this is for testing purposes -+ only (setting it lower than 'space overhead' wouldn't help because the -+ major GC wouldn't run fast enough, and compaction does have a -+ performance cost: we can only compact contiguous regions, so memory has -+ to be moved around). -+ -+ Max overhead controls how often the heap is compacted, which is useful -+ if there are burst of activity followed by long periods of idle state, -+ or if a domain quits, etc. Compaction returns memory to the OS. 
-+ -+ wasted = live * space_overhead / 100 -+ -+ For globally overriding the GC settings one can use OCAMLRUNPARAM, -+ however we provide a config file override to be consistent with other -+ oxenstored settings. -+ -+ One might want to dynamically adjust the overhead setting based on used -+ memory, i.e. to use a fixed upper bound in bytes, not percentage. However -+ measurements show that such adjustments increase GC overhead massively, -+ while still not guaranteeing that memory is returned any more quickly -+ than with a percentage based setting. -+ -+ The allocation policy could also be tweaked, e.g. first fit would reduce -+ fragmentation and thus memory usage, but the documentation warns that it -+ can be sensibly slower, and indeed one of our own testcases can trigger -+ such a corner case where it is multiple times slower, so it is best to keep -+ the default allocation policy (next-fit/best-fit depending on version). -+ -+ There are other tweaks that can be attempted in the future, e.g. setting -+ 'ulimit -v' to 75% of RAM, however getting the kernel to actually return -+ NULL from allocations is difficult even with that setting, and without a -+ NULL the emergency GC won't be triggerred. -+ Perhaps cgroup limits could help, but for now tweak the safest only. -+*) -+ -+let tweak_gc () = -+ Gc.set { (Gc.get ()) with Gc.max_overhead = !Define.gc_max_overhead } -+ -+ - let _ = - let cf = do_argv in - let pidfile = -@@ -274,6 +336,8 @@ let _ = - default_pidfile - in - -+ tweak_gc (); -+ - (try - Unixext.mkdir_rec (Filename.dirname pidfile) 0o755 - with _ -> --- -2.37.4 - diff --git a/0104-tools-ocaml-libs-xb-hide-type-of-Xb.t.patch b/0104-tools-ocaml-libs-xb-hide-type-of-Xb.t.patch deleted file mode 100644 index 15f69b0..0000000 --- a/0104-tools-ocaml-libs-xb-hide-type-of-Xb.t.patch +++ /dev/null @@ -1,92 +0,0 @@ -From bbb4ceab25124646fa845855f3cb95ae15d0c3f2 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> -Date: Fri, 29 Jul 2022 18:53:29 +0100 -Subject: [PATCH 104/126] tools/ocaml/libs/xb: hide type of Xb.t -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Hiding the type will make it easier to change the implementation -in the future without breaking code that relies on it. - -No functional change. 
- -Signed-off-by: Edwin Török <edvin.torok@citrix.com> -Acked-by: Christian Lindig <christian.lindig@citrix.com> -(cherry picked from commit 7ade30a1451734d041363c750a65d322e25b47ba) ---- - tools/ocaml/libs/xb/xb.ml | 3 +++ - tools/ocaml/libs/xb/xb.mli | 9 ++------- - tools/ocaml/xenstored/connection.ml | 8 ++------ - 3 files changed, 7 insertions(+), 13 deletions(-) - -diff --git a/tools/ocaml/libs/xb/xb.ml b/tools/ocaml/libs/xb/xb.ml -index 104d319d7747..8404ddd8a682 100644 ---- a/tools/ocaml/libs/xb/xb.ml -+++ b/tools/ocaml/libs/xb/xb.ml -@@ -196,6 +196,9 @@ let peek_output con = Queue.peek con.pkt_out - let input_len con = Queue.length con.pkt_in - let has_in_packet con = Queue.length con.pkt_in > 0 - let get_in_packet con = Queue.pop con.pkt_in -+let has_partial_input con = match con.partial_in with -+ | HaveHdr _ -> true -+ | NoHdr (n, _) -> n < Partial.header_size () - let has_more_input con = - match con.backend with - | Fd _ -> false -diff --git a/tools/ocaml/libs/xb/xb.mli b/tools/ocaml/libs/xb/xb.mli -index 3a00da6cddc1..794e35bb343e 100644 ---- a/tools/ocaml/libs/xb/xb.mli -+++ b/tools/ocaml/libs/xb/xb.mli -@@ -66,13 +66,7 @@ type backend_mmap = { - type backend_fd = { fd : Unix.file_descr; } - type backend = Fd of backend_fd | Xenmmap of backend_mmap - type partial_buf = HaveHdr of Partial.pkt | NoHdr of int * bytes --type t = { -- backend : backend; -- pkt_in : Packet.t Queue.t; -- pkt_out : Packet.t Queue.t; -- mutable partial_in : partial_buf; -- mutable partial_out : string; --} -+type t - val init_partial_in : unit -> partial_buf - val reconnect : t -> unit - val queue : t -> Packet.t -> unit -@@ -97,6 +91,7 @@ val has_output : t -> bool - val peek_output : t -> Packet.t - val input_len : t -> int - val has_in_packet : t -> bool -+val has_partial_input : t -> bool - val get_in_packet : t -> Packet.t - val has_more_input : t -> bool - val is_selectable : t -> bool -diff --git a/tools/ocaml/xenstored/connection.ml b/tools/ocaml/xenstored/connection.ml -index 65f99ea6f28a..38b47363a173 100644 ---- a/tools/ocaml/xenstored/connection.ml -+++ b/tools/ocaml/xenstored/connection.ml -@@ -125,9 +125,7 @@ let get_perm con = - let set_target con target_domid = - con.perm <- Perms.Connection.set_target (get_perm con) ~perms:[Perms.READ; Perms.WRITE] target_domid - --let is_backend_mmap con = match con.xb.Xenbus.Xb.backend with -- | Xenbus.Xb.Xenmmap _ -> true -- | _ -> false -+let is_backend_mmap con = Xenbus.Xb.is_mmap con.xb - - let send_reply con tid rid ty data = - if (String.length data) > xenstore_payload_max && (is_backend_mmap con) then -@@ -280,9 +278,7 @@ let get_transaction con tid = - - let do_input con = Xenbus.Xb.input con.xb - let has_input con = Xenbus.Xb.has_in_packet con.xb --let has_partial_input con = match con.xb.Xenbus.Xb.partial_in with -- | HaveHdr _ -> true -- | NoHdr (n, _) -> n < Xenbus.Partial.header_size () -+let has_partial_input con = Xenbus.Xb.has_partial_input con.xb - let pop_in con = Xenbus.Xb.get_in_packet con.xb - let has_more_input con = Xenbus.Xb.has_more_input con.xb - --- -2.37.4 - diff --git a/0105-tools-ocaml-Change-Xb.input-to-return-Packet.t-optio.patch b/0105-tools-ocaml-Change-Xb.input-to-return-Packet.t-optio.patch deleted file mode 100644 index 2691ae4..0000000 --- a/0105-tools-ocaml-Change-Xb.input-to-return-Packet.t-optio.patch +++ /dev/null @@ -1,225 +0,0 @@ -From fccdca83a4425b0e30ec9e29e9a5909e1a55b80d Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> -Date: Wed, 12 Oct 2022 19:13:02 +0100 
-Subject: [PATCH 105/126] tools/ocaml: Change Xb.input to return Packet.t - option -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The queue here would only ever hold at most one element. This will simplify -follow-up patches. - -This is part of XSA-326. - -Signed-off-by: Edwin Török <edvin.torok@citrix.com> -Acked-by: Christian Lindig <christian.lindig@citrix.com> -(cherry picked from commit c0a86a462721008eca5ff733660de094d3c34bc7) ---- - tools/ocaml/libs/xb/xb.ml | 18 +++++------------- - tools/ocaml/libs/xb/xb.mli | 5 +---- - tools/ocaml/libs/xs/xsraw.ml | 20 ++++++-------------- - tools/ocaml/xenstored/connection.ml | 4 +--- - tools/ocaml/xenstored/process.ml | 15 +++++++-------- - 5 files changed, 20 insertions(+), 42 deletions(-) - -diff --git a/tools/ocaml/libs/xb/xb.ml b/tools/ocaml/libs/xb/xb.ml -index 8404ddd8a682..165fd4a1edf4 100644 ---- a/tools/ocaml/libs/xb/xb.ml -+++ b/tools/ocaml/libs/xb/xb.ml -@@ -45,7 +45,6 @@ type partial_buf = HaveHdr of Partial.pkt | NoHdr of int * bytes - type t = - { - backend: backend; -- pkt_in: Packet.t Queue.t; - pkt_out: Packet.t Queue.t; - mutable partial_in: partial_buf; - mutable partial_out: string; -@@ -62,7 +61,6 @@ let reconnect t = match t.backend with - Xs_ring.close backend.mmap; - backend.eventchn_notify (); - (* Clear our old connection state *) -- Queue.clear t.pkt_in; - Queue.clear t.pkt_out; - t.partial_in <- init_partial_in (); - t.partial_out <- "" -@@ -124,7 +122,6 @@ let output con = - - (* NB: can throw Reconnect *) - let input con = -- let newpacket = ref false in - let to_read = - match con.partial_in with - | HaveHdr partial_pkt -> Partial.to_complete partial_pkt -@@ -143,21 +140,19 @@ let input con = - if Partial.to_complete partial_pkt = 0 then ( - let pkt = Packet.of_partialpkt partial_pkt in - con.partial_in <- init_partial_in (); -- Queue.push pkt con.pkt_in; -- newpacket := true -- ) -+ Some pkt -+ ) else None - | NoHdr (i, buf) -> - (* we complete the partial header *) - if sz > 0 then - Bytes.blit b 0 buf (Partial.header_size () - i) sz; - con.partial_in <- if sz = i then -- HaveHdr (Partial.of_string (Bytes.to_string buf)) else NoHdr (i - sz, buf) -- ); -- !newpacket -+ HaveHdr (Partial.of_string (Bytes.to_string buf)) else NoHdr (i - sz, buf); -+ None -+ ) - - let newcon backend = { - backend = backend; -- pkt_in = Queue.create (); - pkt_out = Queue.create (); - partial_in = init_partial_in (); - partial_out = ""; -@@ -193,9 +188,6 @@ let has_output con = has_new_output con || has_old_output con - - let peek_output con = Queue.peek con.pkt_out - --let input_len con = Queue.length con.pkt_in --let has_in_packet con = Queue.length con.pkt_in > 0 --let get_in_packet con = Queue.pop con.pkt_in - let has_partial_input con = match con.partial_in with - | HaveHdr _ -> true - | NoHdr (n, _) -> n < Partial.header_size () -diff --git a/tools/ocaml/libs/xb/xb.mli b/tools/ocaml/libs/xb/xb.mli -index 794e35bb343e..91c682162cea 100644 ---- a/tools/ocaml/libs/xb/xb.mli -+++ b/tools/ocaml/libs/xb/xb.mli -@@ -77,7 +77,7 @@ val write_fd : backend_fd -> 'a -> string -> int -> int - val write_mmap : backend_mmap -> 'a -> string -> int -> int - val write : t -> string -> int -> int - val output : t -> bool --val input : t -> bool -+val input : t -> Packet.t option - val newcon : backend -> t - val open_fd : Unix.file_descr -> t - val open_mmap : Xenmmap.mmap_interface -> (unit -> unit) -> t -@@ -89,10 +89,7 @@ val has_new_output : t -> bool - val has_old_output : t -> bool - val 
has_output : t -> bool - val peek_output : t -> Packet.t --val input_len : t -> int --val has_in_packet : t -> bool - val has_partial_input : t -> bool --val get_in_packet : t -> Packet.t - val has_more_input : t -> bool - val is_selectable : t -> bool - val get_fd : t -> Unix.file_descr -diff --git a/tools/ocaml/libs/xs/xsraw.ml b/tools/ocaml/libs/xs/xsraw.ml -index d982fb24dbb1..451f8b38dbcc 100644 ---- a/tools/ocaml/libs/xs/xsraw.ml -+++ b/tools/ocaml/libs/xs/xsraw.ml -@@ -94,26 +94,18 @@ let pkt_send con = - done - - (* receive one packet - can sleep *) --let pkt_recv con = -- let workdone = ref false in -- while not !workdone -- do -- workdone := Xb.input con.xb -- done; -- Xb.get_in_packet con.xb -+let rec pkt_recv con = -+ match Xb.input con.xb with -+ | Some packet -> packet -+ | None -> pkt_recv con - - let pkt_recv_timeout con timeout = - let fd = Xb.get_fd con.xb in - let r, _, _ = Unix.select [ fd ] [] [] timeout in - if r = [] then - true, None -- else ( -- let workdone = Xb.input con.xb in -- if workdone then -- false, (Some (Xb.get_in_packet con.xb)) -- else -- false, None -- ) -+ else -+ false, Xb.input con.xb - - let queue_watchevent con data = - let ls = split_string ~limit:2 '\000' data in -diff --git a/tools/ocaml/xenstored/connection.ml b/tools/ocaml/xenstored/connection.ml -index 38b47363a173..cc20e047d2b9 100644 ---- a/tools/ocaml/xenstored/connection.ml -+++ b/tools/ocaml/xenstored/connection.ml -@@ -277,9 +277,7 @@ let get_transaction con tid = - Hashtbl.find con.transactions tid - - let do_input con = Xenbus.Xb.input con.xb --let has_input con = Xenbus.Xb.has_in_packet con.xb - let has_partial_input con = Xenbus.Xb.has_partial_input con.xb --let pop_in con = Xenbus.Xb.get_in_packet con.xb - let has_more_input con = Xenbus.Xb.has_more_input con.xb - - let has_output con = Xenbus.Xb.has_output con.xb -@@ -307,7 +305,7 @@ let is_bad con = match con.dom with None -> false | Some dom -> Domain.is_bad_do - Restrictions below can be relaxed once xenstored learns to dump more - of its live state in a safe way *) - let has_extra_connection_data con = -- let has_in = has_input con || has_partial_input con in -+ let has_in = has_partial_input con in - let has_out = has_output con in - let has_socket = con.dom = None in - let has_nondefault_perms = make_perm con.dom <> con.perm in -diff --git a/tools/ocaml/xenstored/process.ml b/tools/ocaml/xenstored/process.ml -index dd58e6979cf9..cbf708213796 100644 ---- a/tools/ocaml/xenstored/process.ml -+++ b/tools/ocaml/xenstored/process.ml -@@ -195,10 +195,9 @@ let parse_live_update args = - | _ when Unix.gettimeofday () < t.deadline -> false - | l -> - warn "timeout reached: have to wait, migrate or shutdown %d domains:" (List.length l); -- let msgs = List.rev_map (fun con -> Printf.sprintf "%s: %d tx, in: %b, out: %b, perm: %s" -+ let msgs = List.rev_map (fun con -> Printf.sprintf "%s: %d tx, out: %b, perm: %s" - (Connection.get_domstr con) - (Connection.number_of_transactions con) -- (Connection.has_input con) - (Connection.has_output con) - (Connection.get_perm con |> Perms.Connection.to_string) - ) l in -@@ -706,16 +705,17 @@ let do_input store cons doms con = - info "%s requests a reconnect" (Connection.get_domstr con); - History.reconnect con; - info "%s reconnection complete" (Connection.get_domstr con); -- false -+ None - | Failure exp -> - error "caught exception %s" exp; - error "got a bad client %s" (sprintf "%-8s" (Connection.get_domstr con)); - Connection.mark_as_bad con; -- false -+ None - in - -- if newpacket then ( -- 
let packet = Connection.pop_in con in -+ match newpacket with -+ | None -> () -+ | Some packet -> - let tid, rid, ty, data = Xenbus.Xb.Packet.unpack packet in - let req = {Packet.tid=tid; Packet.rid=rid; Packet.ty=ty; Packet.data=data} in - -@@ -725,8 +725,7 @@ let do_input store cons doms con = - (Xenbus.Xb.Op.to_string ty) (sanitize_data data); *) - process_packet ~store ~cons ~doms ~con ~req; - write_access_log ~ty ~tid ~con:(Connection.get_domstr con) ~data; -- Connection.incr_ops con; -- ) -+ Connection.incr_ops con - - let do_output _store _cons _doms con = - if Connection.has_output con then ( --- -2.37.4 - diff --git a/0106-tools-ocaml-xb-Add-BoundedQueue.patch b/0106-tools-ocaml-xb-Add-BoundedQueue.patch deleted file mode 100644 index c1f0385..0000000 --- a/0106-tools-ocaml-xb-Add-BoundedQueue.patch +++ /dev/null @@ -1,133 +0,0 @@ -From 9e5290daf923e84ca56a6f3d9fc6a333175ef0f9 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> -Date: Wed, 12 Oct 2022 19:13:03 +0100 -Subject: [PATCH 106/126] tools/ocaml/xb: Add BoundedQueue -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Ensures we cannot store more than [capacity] elements in a [Queue]. Replacing -all Queue with this module will then ensure at compile time that all Queues -are correctly bound checked. - -Each element in the queue has a class with its own limits. This, in a -subsequent change, will ensure that command responses can proceed during a -flood of watch events. - -No functional change. - -This is part of XSA-326. - -Signed-off-by: Edwin Török <edvin.torok@citrix.com> -Acked-by: Christian Lindig <christian.lindig@citrix.com> -(cherry picked from commit 19171fb5d888b4467a7073e8febc5e05540956e9) ---- - tools/ocaml/libs/xb/xb.ml | 92 +++++++++++++++++++++++++++++++++++++++ - 1 file changed, 92 insertions(+) - -diff --git a/tools/ocaml/libs/xb/xb.ml b/tools/ocaml/libs/xb/xb.ml -index 165fd4a1edf4..4197a3888a68 100644 ---- a/tools/ocaml/libs/xb/xb.ml -+++ b/tools/ocaml/libs/xb/xb.ml -@@ -17,6 +17,98 @@ - module Op = struct include Op end - module Packet = struct include Packet end - -+module BoundedQueue : sig -+ type ('a, 'b) t -+ -+ (** [create ~capacity ~classify ~limit] creates a queue with maximum [capacity] elements. -+ This is burst capacity, each element is further classified according to [classify], -+ and each class can have its own [limit]. -+ [capacity] is enforced as an overall limit. -+ The [limit] can be dynamic, and can be smaller than the number of elements already queued of that class, -+ in which case those elements are considered to use "burst capacity". -+ *) -+ val create: capacity:int -> classify:('a -> 'b) -> limit:('b -> int) -> ('a, 'b) t -+ -+ (** [clear q] discards all elements from [q] *) -+ val clear: ('a, 'b) t -> unit -+ -+ (** [can_push q] when [length q < capacity]. *) -+ val can_push: ('a, 'b) t -> 'b -> bool -+ -+ (** [push e q] adds [e] at the end of queue [q] if [can_push q], or returns [None]. *) -+ val push: 'a -> ('a, 'b) t -> unit option -+ -+ (** [pop q] removes and returns first element in [q], or raises [Queue.Empty]. *) -+ val pop: ('a, 'b) t -> 'a -+ -+ (** [peek q] returns the first element in [q], or raises [Queue.Empty]. *) -+ val peek : ('a, 'b) t -> 'a -+ -+ (** [length q] returns the current number of elements in [q] *) -+ val length: ('a, 'b) t -> int -+ -+ (** [debug string_of_class q] prints queue usage statistics in an unspecified internal format. 
*) -+ val debug: ('b -> string) -> (_, 'b) t -> string -+end = struct -+ type ('a, 'b) t = -+ { q: 'a Queue.t -+ ; capacity: int -+ ; classify: 'a -> 'b -+ ; limit: 'b -> int -+ ; class_count: ('b, int) Hashtbl.t -+ } -+ -+ let create ~capacity ~classify ~limit = -+ { capacity; q = Queue.create (); classify; limit; class_count = Hashtbl.create 3 } -+ -+ let get_count t classification = try Hashtbl.find t.class_count classification with Not_found -> 0 -+ -+ let can_push_internal t classification class_count = -+ Queue.length t.q < t.capacity && class_count < t.limit classification -+ -+ let ok = Some () -+ -+ let push e t = -+ let classification = t.classify e in -+ let class_count = get_count t classification in -+ if can_push_internal t classification class_count then begin -+ Queue.push e t.q; -+ Hashtbl.replace t.class_count classification (class_count + 1); -+ ok -+ end -+ else -+ None -+ -+ let can_push t classification = -+ can_push_internal t classification @@ get_count t classification -+ -+ let clear t = -+ Queue.clear t.q; -+ Hashtbl.reset t.class_count -+ -+ let pop t = -+ let e = Queue.pop t.q in -+ let classification = t.classify e in -+ let () = match get_count t classification - 1 with -+ | 0 -> Hashtbl.remove t.class_count classification (* reduces memusage *) -+ | n -> Hashtbl.replace t.class_count classification n -+ in -+ e -+ -+ let peek t = Queue.peek t.q -+ let length t = Queue.length t.q -+ -+ let debug string_of_class t = -+ let b = Buffer.create 128 in -+ Printf.bprintf b "BoundedQueue capacity: %d, used: {" t.capacity; -+ Hashtbl.iter (fun packet_class count -> -+ Printf.bprintf b " %s: %d" (string_of_class packet_class) count -+ ) t.class_count; -+ Printf.bprintf b "}"; -+ Buffer.contents b -+end -+ -+ - exception End_of_file - exception Eagain - exception Noent --- -2.37.4 - diff --git a/0107-tools-ocaml-Limit-maximum-in-flight-requests-outstan.patch b/0107-tools-ocaml-Limit-maximum-in-flight-requests-outstan.patch deleted file mode 100644 index 5f5c4b6..0000000 --- a/0107-tools-ocaml-Limit-maximum-in-flight-requests-outstan.patch +++ /dev/null @@ -1,888 +0,0 @@ -From 64048b4c218099b6adcf46cd7b4d1dc9c658009e Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> -Date: Wed, 12 Oct 2022 19:13:04 +0100 -Subject: [PATCH 107/126] tools/ocaml: Limit maximum in-flight requests / - outstanding replies -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Introduce a limit on the number of outstanding reply packets in the xenbus -queue. This limits the number of in-flight requests: when the output queue is -full we'll stop processing inputs until the output queue has room again. - -To avoid a busy loop on the Unix socket we only add it to the watched input -file descriptor set if we'd be able to call `input` on it. Even though Dom0 -is trusted and exempt from quotas a flood of events might cause a backlog -where events are produced faster than daemons in Dom0 can consume them, which -could lead to an unbounded queue size and OOM. - -Therefore the xenbus queue limit must apply to all connections, Dom0 is not -exempt from it, although if everything works correctly it will eventually -catch up. - -This prevents a malicious guest from sending more commands while it has -outstanding watch events or command replies in its input ring. However if it -can cause the generation of watch events by other means (e.g. 
by Dom0, or -another cooperative guest) and stop reading its own ring then watch events -would've queued up without limit. - -The xenstore protocol doesn't have a back-pressure mechanism, and doesn't -allow dropping watch events. In fact, dropping watch events is known to break -some pieces of normal functionality. This leaves little choice to safely -implement the xenstore protocol without exposing the xenstore daemon to -out-of-memory attacks. - -Implement the fix as pipes with bounded buffers: -* Use a bounded buffer for watch events -* The watch structure will have a bounded receiving pipe of watch events -* The source will have an "overflow" pipe of pending watch events it couldn't - deliver - -Items are queued up on one end and are sent as far along the pipe as possible: - - source domain -> watch -> xenbus of target -> xenstore ring/socket of target - -If the pipe is "full" at any point then back-pressure is applied and we prevent -more items from being queued up. For the source domain this means that we'll -stop accepting new commands as long as its pipe buffer is not empty. - -Before we try to enqueue an item we first check whether it is possible to send -it further down the pipe, by attempting to recursively flush the pipes. This -ensures that we retain the order of events as much as possible. - -We might break causality of watch events if the target domain's queue is full -and we need to start using the watch's queue. This is a breaking change in -the xenstore protocol, but only for domains which are not processing their -incoming ring as expected. - -When a watch is deleted its entire pending queue is dropped (no code is needed -for that, because it is part of the 'watch' type). - -There is a cache of watches that have pending events that we attempt to flush -at every cycle if possible. - -Introduce 3 limits here: -* quota-maxwatchevents on watch event destination: when this is hit the - source will not be allowed to queue up more watch events. -* quota-maxoustanding which is the number of responses not read from the ring: - once exceeded, no more inputs are processed until all outstanding replies - are consumed by the client. -* overflow queue on the watch event source: all watches that cannot be stored - on destination are queued up here, a single command can trigger multiple - watches (e.g. due to recursion). - -The overflow queue currently doesn't have an upper bound, it is difficult to -accurately calculate one as it depends on whether you are Dom0 and how many -watches each path has registered and how many watch events you can trigger -with a single command (e.g. a commit). However these events were already -using memory, this just moves them elsewhere, and as long as we correctly -block a domain it shouldn't result in unbounded memory usage. - -Note that Dom0 is not excluded from these checks, it is important that Dom0 is -especially not excluded when it is the source, since there are many ways in -which a guest could trigger Dom0 to send it watch events. - -This should protect against malicious frontends as long as the backend follows -the PV xenstore protocol and only exposes paths needed by the frontend, and -changes those paths at most once as a reaction to guest events, or protocol -state. - -The queue limits are per watch, and per domain-pair, so even if one -communication channel would be "blocked", others would keep working, and the -domain itself won't get blocked as long as it doesn't overflow the queue of -watch events. 
- -Similarly a malicious backend could cause the frontend to get blocked, but -this watch queue protects the frontend as well as long as it follows the PV -protocol. (Although note that protection against malicious backends is only a -best effort at the moment) - -This is part of XSA-326 / CVE-2022-42318. - -Signed-off-by: Edwin Török <edvin.torok@citrix.com> -Acked-by: Christian Lindig <christian.lindig@citrix.com> -(cherry picked from commit 9284ae0c40fb5b9606947eaaec23dc71d0540e96) ---- - tools/ocaml/libs/xb/xb.ml | 61 +++++++-- - tools/ocaml/libs/xb/xb.mli | 11 +- - tools/ocaml/libs/xs/queueop.ml | 25 ++-- - tools/ocaml/libs/xs/xsraw.ml | 4 +- - tools/ocaml/xenstored/connection.ml | 155 +++++++++++++++++++++-- - tools/ocaml/xenstored/connections.ml | 57 +++++++-- - tools/ocaml/xenstored/define.ml | 7 + - tools/ocaml/xenstored/oxenstored.conf.in | 2 + - tools/ocaml/xenstored/process.ml | 31 ++++- - tools/ocaml/xenstored/xenstored.ml | 2 + - 10 files changed, 296 insertions(+), 59 deletions(-) - -diff --git a/tools/ocaml/libs/xb/xb.ml b/tools/ocaml/libs/xb/xb.ml -index 4197a3888a68..b292ed7a874d 100644 ---- a/tools/ocaml/libs/xb/xb.ml -+++ b/tools/ocaml/libs/xb/xb.ml -@@ -134,14 +134,44 @@ type backend = Fd of backend_fd | Xenmmap of backend_mmap - - type partial_buf = HaveHdr of Partial.pkt | NoHdr of int * bytes - -+(* -+ separate capacity reservation for replies and watch events: -+ this allows a domain to keep working even when under a constant flood of -+ watch events -+*) -+type capacity = { maxoutstanding: int; maxwatchevents: int } -+ -+module Queue = BoundedQueue -+ -+type packet_class = -+ | CommandReply -+ | Watchevent -+ -+let string_of_packet_class = function -+ | CommandReply -> "command_reply" -+ | Watchevent -> "watch_event" -+ - type t = - { - backend: backend; -- pkt_out: Packet.t Queue.t; -+ pkt_out: (Packet.t, packet_class) Queue.t; - mutable partial_in: partial_buf; - mutable partial_out: string; -+ capacity: capacity - } - -+let to_read con = -+ match con.partial_in with -+ | HaveHdr partial_pkt -> Partial.to_complete partial_pkt -+ | NoHdr (i, _) -> i -+ -+let debug t = -+ Printf.sprintf "XenBus state: partial_in: %d needed, partial_out: %d bytes, pkt_out: %d packets, %s" -+ (to_read t) -+ (String.length t.partial_out) -+ (Queue.length t.pkt_out) -+ (BoundedQueue.debug string_of_packet_class t.pkt_out) -+ - let init_partial_in () = NoHdr - (Partial.header_size (), Bytes.make (Partial.header_size()) '\000') - -@@ -199,7 +229,8 @@ let output con = - let s = if String.length con.partial_out > 0 then - con.partial_out - else if Queue.length con.pkt_out > 0 then -- Packet.to_string (Queue.pop con.pkt_out) -+ let pkt = Queue.pop con.pkt_out in -+ Packet.to_string pkt - else - "" in - (* send data from s, and save the unsent data to partial_out *) -@@ -212,12 +243,15 @@ let output con = - (* after sending one packet, partial is empty *) - con.partial_out = "" - -+(* we can only process an input packet if we're guaranteed to have room -+ to store the response packet *) -+let can_input con = Queue.can_push con.pkt_out CommandReply -+ - (* NB: can throw Reconnect *) - let input con = -- let to_read = -- match con.partial_in with -- | HaveHdr partial_pkt -> Partial.to_complete partial_pkt -- | NoHdr (i, _) -> i in -+ if not (can_input con) then None -+ else -+ let to_read = to_read con in - - (* try to get more data from input stream *) - let b = Bytes.make to_read '\000' in -@@ -243,11 +277,22 @@ let input con = - None - ) - --let newcon backend = { -+let classify t = -+ 
match t.Packet.ty with -+ | Op.Watchevent -> Watchevent -+ | _ -> CommandReply -+ -+let newcon ~capacity backend = -+ let limit = function -+ | CommandReply -> capacity.maxoutstanding -+ | Watchevent -> capacity.maxwatchevents -+ in -+ { - backend = backend; -- pkt_out = Queue.create (); -+ pkt_out = Queue.create ~capacity:(capacity.maxoutstanding + capacity.maxwatchevents) ~classify ~limit; - partial_in = init_partial_in (); - partial_out = ""; -+ capacity = capacity; - } - - let open_fd fd = newcon (Fd { fd = fd; }) -diff --git a/tools/ocaml/libs/xb/xb.mli b/tools/ocaml/libs/xb/xb.mli -index 91c682162cea..71b2754ca788 100644 ---- a/tools/ocaml/libs/xb/xb.mli -+++ b/tools/ocaml/libs/xb/xb.mli -@@ -66,10 +66,11 @@ type backend_mmap = { - type backend_fd = { fd : Unix.file_descr; } - type backend = Fd of backend_fd | Xenmmap of backend_mmap - type partial_buf = HaveHdr of Partial.pkt | NoHdr of int * bytes -+type capacity = { maxoutstanding: int; maxwatchevents: int } - type t - val init_partial_in : unit -> partial_buf - val reconnect : t -> unit --val queue : t -> Packet.t -> unit -+val queue : t -> Packet.t -> unit option - val read_fd : backend_fd -> 'a -> bytes -> int -> int - val read_mmap : backend_mmap -> 'a -> bytes -> int -> int - val read : t -> bytes -> int -> int -@@ -78,13 +79,14 @@ val write_mmap : backend_mmap -> 'a -> string -> int -> int - val write : t -> string -> int -> int - val output : t -> bool - val input : t -> Packet.t option --val newcon : backend -> t --val open_fd : Unix.file_descr -> t --val open_mmap : Xenmmap.mmap_interface -> (unit -> unit) -> t -+val newcon : capacity:capacity -> backend -> t -+val open_fd : Unix.file_descr -> capacity:capacity -> t -+val open_mmap : Xenmmap.mmap_interface -> (unit -> unit) -> capacity:capacity -> t - val close : t -> unit - val is_fd : t -> bool - val is_mmap : t -> bool - val output_len : t -> int -+val can_input: t -> bool - val has_new_output : t -> bool - val has_old_output : t -> bool - val has_output : t -> bool -@@ -93,3 +95,4 @@ val has_partial_input : t -> bool - val has_more_input : t -> bool - val is_selectable : t -> bool - val get_fd : t -> Unix.file_descr -+val debug: t -> string -diff --git a/tools/ocaml/libs/xs/queueop.ml b/tools/ocaml/libs/xs/queueop.ml -index 9ff5bbd529ce..4e532cdaeacb 100644 ---- a/tools/ocaml/libs/xs/queueop.ml -+++ b/tools/ocaml/libs/xs/queueop.ml -@@ -16,9 +16,10 @@ - open Xenbus - - let data_concat ls = (String.concat "\000" ls) ^ "\000" -+let queue con pkt = let r = Xb.queue con pkt in assert (r <> None) - let queue_path ty (tid: int) (path: string) con = - let data = data_concat [ path; ] in -- Xb.queue con (Xb.Packet.create tid 0 ty data) -+ queue con (Xb.Packet.create tid 0 ty data) - - (* operations *) - let directory tid path con = queue_path Xb.Op.Directory tid path con -@@ -27,48 +28,48 @@ let read tid path con = queue_path Xb.Op.Read tid path con - let getperms tid path con = queue_path Xb.Op.Getperms tid path con - - let debug commands con = -- Xb.queue con (Xb.Packet.create 0 0 Xb.Op.Debug (data_concat commands)) -+ queue con (Xb.Packet.create 0 0 Xb.Op.Debug (data_concat commands)) - - let watch path data con = - let data = data_concat [ path; data; ] in -- Xb.queue con (Xb.Packet.create 0 0 Xb.Op.Watch data) -+ queue con (Xb.Packet.create 0 0 Xb.Op.Watch data) - - let unwatch path data con = - let data = data_concat [ path; data; ] in -- Xb.queue con (Xb.Packet.create 0 0 Xb.Op.Unwatch data) -+ queue con (Xb.Packet.create 0 0 Xb.Op.Unwatch data) - - let 
transaction_start con = -- Xb.queue con (Xb.Packet.create 0 0 Xb.Op.Transaction_start (data_concat [])) -+ queue con (Xb.Packet.create 0 0 Xb.Op.Transaction_start (data_concat [])) - - let transaction_end tid commit con = - let data = data_concat [ (if commit then "T" else "F"); ] in -- Xb.queue con (Xb.Packet.create tid 0 Xb.Op.Transaction_end data) -+ queue con (Xb.Packet.create tid 0 Xb.Op.Transaction_end data) - - let introduce domid mfn port con = - let data = data_concat [ Printf.sprintf "%u" domid; - Printf.sprintf "%nu" mfn; - string_of_int port; ] in -- Xb.queue con (Xb.Packet.create 0 0 Xb.Op.Introduce data) -+ queue con (Xb.Packet.create 0 0 Xb.Op.Introduce data) - - let release domid con = - let data = data_concat [ Printf.sprintf "%u" domid; ] in -- Xb.queue con (Xb.Packet.create 0 0 Xb.Op.Release data) -+ queue con (Xb.Packet.create 0 0 Xb.Op.Release data) - - let resume domid con = - let data = data_concat [ Printf.sprintf "%u" domid; ] in -- Xb.queue con (Xb.Packet.create 0 0 Xb.Op.Resume data) -+ queue con (Xb.Packet.create 0 0 Xb.Op.Resume data) - - let getdomainpath domid con = - let data = data_concat [ Printf.sprintf "%u" domid; ] in -- Xb.queue con (Xb.Packet.create 0 0 Xb.Op.Getdomainpath data) -+ queue con (Xb.Packet.create 0 0 Xb.Op.Getdomainpath data) - - let write tid path value con = - let data = path ^ "\000" ^ value (* no NULL at the end *) in -- Xb.queue con (Xb.Packet.create tid 0 Xb.Op.Write data) -+ queue con (Xb.Packet.create tid 0 Xb.Op.Write data) - - let mkdir tid path con = queue_path Xb.Op.Mkdir tid path con - let rm tid path con = queue_path Xb.Op.Rm tid path con - - let setperms tid path perms con = - let data = data_concat [ path; perms ] in -- Xb.queue con (Xb.Packet.create tid 0 Xb.Op.Setperms data) -+ queue con (Xb.Packet.create tid 0 Xb.Op.Setperms data) -diff --git a/tools/ocaml/libs/xs/xsraw.ml b/tools/ocaml/libs/xs/xsraw.ml -index 451f8b38dbcc..cbd17280600c 100644 ---- a/tools/ocaml/libs/xs/xsraw.ml -+++ b/tools/ocaml/libs/xs/xsraw.ml -@@ -36,8 +36,10 @@ type con = { - let close con = - Xb.close con.xb - -+let capacity = { Xb.maxoutstanding = 1; maxwatchevents = 0; } -+ - let open_fd fd = { -- xb = Xb.open_fd fd; -+ xb = Xb.open_fd ~capacity fd; - watchevents = Queue.create (); - } - -diff --git a/tools/ocaml/xenstored/connection.ml b/tools/ocaml/xenstored/connection.ml -index cc20e047d2b9..9624a5f9da2c 100644 ---- a/tools/ocaml/xenstored/connection.ml -+++ b/tools/ocaml/xenstored/connection.ml -@@ -20,12 +20,84 @@ open Stdext - - let xenstore_payload_max = 4096 (* xen/include/public/io/xs_wire.h *) - -+type 'a bounded_sender = 'a -> unit option -+(** a bounded sender accepts an ['a] item and returns: -+ None - if there is no room to accept the item -+ Some () - if it has successfully accepted/sent the item -+ *) -+ -+module BoundedPipe : sig -+ type 'a t -+ -+ (** [create ~capacity ~destination] creates a bounded pipe with a -+ local buffer holding at most [capacity] items. Once the buffer is -+ full it will not accept further items. items from the pipe are -+ flushed into [destination] as long as it accepts items. The -+ destination could be another pipe. -+ *) -+ val create: capacity:int -> destination:'a bounded_sender -> 'a t -+ -+ (** [is_empty t] returns whether the local buffer of [t] is empty. *) -+ val is_empty : _ t -> bool -+ -+ (** [length t] the number of items in the internal buffer *) -+ val length: _ t -> int -+ -+ (** [flush_pipe t] sends as many items from the local buffer as possible, -+ which could be none. 
*) -+ val flush_pipe: _ t -> unit -+ -+ (** [push t item] tries to [flush_pipe] and then push [item] -+ into the pipe if its [capacity] allows. -+ Returns [None] if there is no more room -+ *) -+ val push : 'a t -> 'a bounded_sender -+end = struct -+ (* items are enqueued in [q], and then flushed to [connect_to] *) -+ type 'a t = -+ { q: 'a Queue.t -+ ; destination: 'a bounded_sender -+ ; capacity: int -+ } -+ -+ let create ~capacity ~destination = -+ { q = Queue.create (); capacity; destination } -+ -+ let rec flush_pipe t = -+ if not Queue.(is_empty t.q) then -+ let item = Queue.peek t.q in -+ match t.destination item with -+ | None -> () (* no room *) -+ | Some () -> -+ (* successfully sent item to next stage *) -+ let _ = Queue.pop t.q in -+ (* continue trying to send more items *) -+ flush_pipe t -+ -+ let push t item = -+ (* first try to flush as many items from this pipe as possible to make room, -+ it is important to do this first to preserve the order of the items -+ *) -+ flush_pipe t; -+ if Queue.length t.q < t.capacity then begin -+ (* enqueue, instead of sending directly. -+ this ensures that [out] sees the items in the same order as we receive them -+ *) -+ Queue.push item t.q; -+ Some (flush_pipe t) -+ end else None -+ -+ let is_empty t = Queue.is_empty t.q -+ let length t = Queue.length t.q -+end -+ - type watch = { - con: t; - token: string; - path: string; - base: string; - is_relative: bool; -+ pending_watchevents: Xenbus.Xb.Packet.t BoundedPipe.t; - } - - and t = { -@@ -38,8 +110,36 @@ and t = { - anonid: int; - mutable stat_nb_ops: int; - mutable perm: Perms.Connection.t; -+ pending_source_watchevents: (watch * Xenbus.Xb.Packet.t) BoundedPipe.t - } - -+module Watch = struct -+ module T = struct -+ type t = watch -+ -+ let compare w1 w2 = -+ (* cannot compare watches from different connections *) -+ assert (w1.con == w2.con); -+ match String.compare w1.token w2.token with -+ | 0 -> String.compare w1.path w2.path -+ | n -> n -+ end -+ module Set = Set.Make(T) -+ -+ let flush_events t = -+ BoundedPipe.flush_pipe t.pending_watchevents; -+ not (BoundedPipe.is_empty t.pending_watchevents) -+ -+ let pending_watchevents t = -+ BoundedPipe.length t.pending_watchevents -+end -+ -+let source_flush_watchevents t = -+ BoundedPipe.flush_pipe t.pending_source_watchevents -+ -+let source_pending_watchevents t = -+ BoundedPipe.length t.pending_source_watchevents -+ - let mark_as_bad con = - match con.dom with - |None -> () -@@ -67,7 +167,8 @@ let watch_create ~con ~path ~token = { - token = token; - path = path; - base = get_path con; -- is_relative = path.[0] <> '/' && path.[0] <> '@' -+ is_relative = path.[0] <> '/' && path.[0] <> '@'; -+ pending_watchevents = BoundedPipe.create ~capacity:!Define.maxwatchevents ~destination:(Xenbus.Xb.queue con.xb) - } - - let get_con w = w.con -@@ -93,6 +194,9 @@ let make_perm dom = - Perms.Connection.create ~perms:[Perms.READ; Perms.WRITE] domid - - let create xbcon dom = -+ let destination (watch, pkt) = -+ BoundedPipe.push watch.pending_watchevents pkt -+ in - let id = - match dom with - | None -> let old = !anon_id_next in incr anon_id_next; old -@@ -109,6 +213,16 @@ let create xbcon dom = - anonid = id; - stat_nb_ops = 0; - perm = make_perm dom; -+ -+ (* the actual capacity will be lower, this is used as an overflow -+ buffer: anything that doesn't fit elsewhere gets put here, only -+ limited by the amount of watches that you can generate with a -+ single xenstore command (which is finite, although possibly very -+ large in theory for Dom0). 
Once the pipe here has any contents the -+ domain is blocked from sending more commands until it is empty -+ again though. -+ *) -+ pending_source_watchevents = BoundedPipe.create ~capacity:Sys.max_array_length ~destination - } - in - Logging.new_connection ~tid:Transaction.none ~con:(get_domstr con); -@@ -127,11 +241,17 @@ let set_target con target_domid = - - let is_backend_mmap con = Xenbus.Xb.is_mmap con.xb - --let send_reply con tid rid ty data = -+let packet_of con tid rid ty data = - if (String.length data) > xenstore_payload_max && (is_backend_mmap con) then -- Xenbus.Xb.queue con.xb (Xenbus.Xb.Packet.create tid rid Xenbus.Xb.Op.Error "E2BIG\000") -+ Xenbus.Xb.Packet.create tid rid Xenbus.Xb.Op.Error "E2BIG\000" - else -- Xenbus.Xb.queue con.xb (Xenbus.Xb.Packet.create tid rid ty data) -+ Xenbus.Xb.Packet.create tid rid ty data -+ -+let send_reply con tid rid ty data = -+ let result = Xenbus.Xb.queue con.xb (packet_of con tid rid ty data) in -+ (* should never happen: we only process an input packet when there is room for an output packet *) -+ (* and the limit for replies is different from the limit for watch events *) -+ assert (result <> None) - - let send_error con tid rid err = send_reply con tid rid Xenbus.Xb.Op.Error (err ^ "\000") - let send_ack con tid rid ty = send_reply con tid rid ty "OK\000" -@@ -181,11 +301,11 @@ let del_watch con path token = - apath, w - - let del_watches con = -- Hashtbl.clear con.watches; -+ Hashtbl.reset con.watches; - con.nb_watches <- 0 - - let del_transactions con = -- Hashtbl.clear con.transactions -+ Hashtbl.reset con.transactions - - let list_watches con = - let ll = Hashtbl.fold -@@ -208,21 +328,29 @@ let lookup_watch_perm path = function - let lookup_watch_perms oldroot root path = - lookup_watch_perm path oldroot @ lookup_watch_perm path (Some root) - --let fire_single_watch_unchecked watch = -+let fire_single_watch_unchecked source watch = - let data = Utils.join_by_null [watch.path; watch.token; ""] in -- send_reply watch.con Transaction.none 0 Xenbus.Xb.Op.Watchevent data -+ let pkt = packet_of watch.con Transaction.none 0 Xenbus.Xb.Op.Watchevent data in - --let fire_single_watch (oldroot, root) watch = -+ match BoundedPipe.push source.pending_source_watchevents (watch, pkt) with -+ | Some () -> () (* packet queued *) -+ | None -> -+ (* a well behaved Dom0 shouldn't be able to trigger this, -+ if it happens it is likely a Dom0 bug causing runaway memory usage -+ *) -+ failwith "watch event overflow, cannot happen" -+ -+let fire_single_watch source (oldroot, root) watch = - let abspath = get_watch_path watch.con watch.path |> Store.Path.of_string in - let perms = lookup_watch_perms oldroot root abspath in - if Perms.can_fire_watch watch.con.perm perms then -- fire_single_watch_unchecked watch -+ fire_single_watch_unchecked source watch - else - let perms = perms |> List.map (Perms.Node.to_string ~sep:" ") |> String.concat ", " in - let con = get_domstr watch.con in - Logging.watch_not_fired ~con perms (Store.Path.to_string abspath) - --let fire_watch roots watch path = -+let fire_watch source roots watch path = - let new_path = - if watch.is_relative && path.[0] = '/' - then begin -@@ -232,7 +360,7 @@ let fire_watch roots watch path = - end else - path - in -- fire_single_watch roots { watch with path = new_path } -+ fire_single_watch source roots { watch with path = new_path } - - (* Search for a valid unused transaction id. 
*) - let rec valid_transaction_id con proposed_id = -@@ -280,6 +408,7 @@ let do_input con = Xenbus.Xb.input con.xb - let has_partial_input con = Xenbus.Xb.has_partial_input con.xb - let has_more_input con = Xenbus.Xb.has_more_input con.xb - -+let can_input con = Xenbus.Xb.can_input con.xb && BoundedPipe.is_empty con.pending_source_watchevents - let has_output con = Xenbus.Xb.has_output con.xb - let has_old_output con = Xenbus.Xb.has_old_output con.xb - let has_new_output con = Xenbus.Xb.has_new_output con.xb -@@ -323,7 +452,7 @@ let prevents_live_update con = not (is_bad con) - && (has_extra_connection_data con || has_transaction_data con) - - let has_more_work con = -- has_more_input con || not (has_old_output con) && has_new_output con -+ (has_more_input con && can_input con) || not (has_old_output con) && has_new_output con - - let incr_ops con = con.stat_nb_ops <- con.stat_nb_ops + 1 - -diff --git a/tools/ocaml/xenstored/connections.ml b/tools/ocaml/xenstored/connections.ml -index 3c7429fe7f61..7d68c583b43a 100644 ---- a/tools/ocaml/xenstored/connections.ml -+++ b/tools/ocaml/xenstored/connections.ml -@@ -22,22 +22,30 @@ type t = { - domains: (int, Connection.t) Hashtbl.t; - ports: (Xeneventchn.t, Connection.t) Hashtbl.t; - mutable watches: Connection.watch list Trie.t; -+ mutable has_pending_watchevents: Connection.Watch.Set.t - } - - let create () = { - anonymous = Hashtbl.create 37; - domains = Hashtbl.create 37; - ports = Hashtbl.create 37; -- watches = Trie.create () -+ watches = Trie.create (); -+ has_pending_watchevents = Connection.Watch.Set.empty; - } - -+let get_capacity () = -+ (* not multiplied by maxwatch on purpose: 2nd queue in watch itself! *) -+ { Xenbus.Xb.maxoutstanding = !Define.maxoutstanding; maxwatchevents = !Define.maxwatchevents } -+ - let add_anonymous cons fd = -- let xbcon = Xenbus.Xb.open_fd fd in -+ let capacity = get_capacity () in -+ let xbcon = Xenbus.Xb.open_fd fd ~capacity in - let con = Connection.create xbcon None in - Hashtbl.add cons.anonymous (Xenbus.Xb.get_fd xbcon) con - - let add_domain cons dom = -- let xbcon = Xenbus.Xb.open_mmap (Domain.get_interface dom) (fun () -> Domain.notify dom) in -+ let capacity = get_capacity () in -+ let xbcon = Xenbus.Xb.open_mmap ~capacity (Domain.get_interface dom) (fun () -> Domain.notify dom) in - let con = Connection.create xbcon (Some dom) in - Hashtbl.add cons.domains (Domain.get_id dom) con; - match Domain.get_port dom with -@@ -48,7 +56,9 @@ let select ?(only_if = (fun _ -> true)) cons = - Hashtbl.fold (fun _ con (ins, outs) -> - if (only_if con) then ( - let fd = Connection.get_fd con in -- (fd :: ins, if Connection.has_output con then fd :: outs else outs) -+ let in_fds = if Connection.can_input con then fd :: ins else ins in -+ let out_fds = if Connection.has_output con then fd :: outs else outs in -+ in_fds, out_fds - ) else (ins, outs) - ) - cons.anonymous ([], []) -@@ -67,10 +77,17 @@ let del_watches_of_con con watches = - | [] -> None - | ws -> Some ws - -+let del_watches cons con = -+ Connection.del_watches con; -+ cons.watches <- Trie.map (del_watches_of_con con) cons.watches; -+ cons.has_pending_watchevents <- -+ cons.has_pending_watchevents |> Connection.Watch.Set.filter @@ fun w -> -+ Connection.get_con w != con -+ - let del_anonymous cons con = - try - Hashtbl.remove cons.anonymous (Connection.get_fd con); -- cons.watches <- Trie.map (del_watches_of_con con) cons.watches; -+ del_watches cons con; - Connection.close con - with exn -> - debug "del anonymous %s" (Printexc.to_string exn) -@@ 
-85,7 +102,7 @@ let del_domain cons id = - | Some p -> Hashtbl.remove cons.ports p - | None -> ()) - | None -> ()); -- cons.watches <- Trie.map (del_watches_of_con con) cons.watches; -+ del_watches cons con; - Connection.close con - with exn -> - debug "del domain %u: %s" id (Printexc.to_string exn) -@@ -136,31 +153,33 @@ let del_watch cons con path token = - cons.watches <- Trie.set cons.watches key watches; - watch - --let del_watches cons con = -- Connection.del_watches con; -- cons.watches <- Trie.map (del_watches_of_con con) cons.watches -- - (* path is absolute *) --let fire_watches ?oldroot root cons path recurse = -+let fire_watches ?oldroot source root cons path recurse = - let key = key_of_path path in - let path = Store.Path.to_string path in - let roots = oldroot, root in - let fire_watch _ = function - | None -> () -- | Some watches -> List.iter (fun w -> Connection.fire_watch roots w path) watches -+ | Some watches -> List.iter (fun w -> Connection.fire_watch source roots w path) watches - in - let fire_rec _x = function - | None -> () - | Some watches -> -- List.iter (Connection.fire_single_watch roots) watches -+ List.iter (Connection.fire_single_watch source roots) watches - in - Trie.iter_path fire_watch cons.watches key; - if recurse then - Trie.iter fire_rec (Trie.sub cons.watches key) - -+let send_watchevents cons con = -+ cons.has_pending_watchevents <- -+ cons.has_pending_watchevents |> Connection.Watch.Set.filter Connection.Watch.flush_events; -+ Connection.source_flush_watchevents con -+ - let fire_spec_watches root cons specpath = -+ let source = find_domain cons 0 in - iter cons (fun con -> -- List.iter (Connection.fire_single_watch (None, root)) (Connection.get_watches con specpath)) -+ List.iter (Connection.fire_single_watch source (None, root)) (Connection.get_watches con specpath)) - - let set_target cons domain target_domain = - let con = find_domain cons domain in -@@ -197,6 +216,16 @@ let debug cons = - let domains = Hashtbl.fold (fun _ con accu -> Connection.debug con :: accu) cons.domains [] in - String.concat "" (domains @ anonymous) - -+let debug_watchevents cons con = -+ (* == (physical equality) -+ has to be used here because w.con.xb.backend might contain a [unit->unit] value causing regular -+ comparison to fail due to having a 'functional value' which cannot be compared. -+ *) -+ let s = cons.has_pending_watchevents |> Connection.Watch.Set.filter (fun w -> w.con == con) in -+ let pending = s |> Connection.Watch.Set.elements -+ |> List.map (fun w -> Connection.Watch.pending_watchevents w) |> List.fold_left (+) 0 in -+ Printf.sprintf "Watches with pending events: %d, pending events total: %d" (Connection.Watch.Set.cardinal s) pending -+ - let filter ~f cons = - let fold _ v acc = if f v then v :: acc else acc in - [] -diff --git a/tools/ocaml/xenstored/define.ml b/tools/ocaml/xenstored/define.ml -index ba63a8147e09..327b6d795ec7 100644 ---- a/tools/ocaml/xenstored/define.ml -+++ b/tools/ocaml/xenstored/define.ml -@@ -24,6 +24,13 @@ let default_config_dir = Paths.xen_config_dir - let maxwatch = ref (100) - let maxtransaction = ref (10) - let maxrequests = ref (1024) (* maximum requests per transaction *) -+let maxoutstanding = ref (1024) (* maximum outstanding requests, i.e. 
in-flight requests / domain *) -+let maxwatchevents = ref (1024) -+(* -+ maximum outstanding watch events per watch, -+ recommended >= maxoutstanding to avoid blocking backend transactions due to -+ malicious frontends -+ *) - - let gc_max_overhead = ref 120 (* 120% see comment in xenstored.ml *) - let conflict_burst_limit = ref 5.0 -diff --git a/tools/ocaml/xenstored/oxenstored.conf.in b/tools/ocaml/xenstored/oxenstored.conf.in -index 4ae48e42d47d..9d034e744b4b 100644 ---- a/tools/ocaml/xenstored/oxenstored.conf.in -+++ b/tools/ocaml/xenstored/oxenstored.conf.in -@@ -62,6 +62,8 @@ quota-maxwatch = 100 - quota-transaction = 10 - quota-maxrequests = 1024 - quota-path-max = 1024 -+quota-maxoutstanding = 1024 -+quota-maxwatchevents = 1024 - - # Activate filed base backend - persistent = false -diff --git a/tools/ocaml/xenstored/process.ml b/tools/ocaml/xenstored/process.ml -index cbf708213796..ce39ce28b5f3 100644 ---- a/tools/ocaml/xenstored/process.ml -+++ b/tools/ocaml/xenstored/process.ml -@@ -57,7 +57,7 @@ let split_one_path data con = - | path :: "" :: [] -> Store.Path.create path (Connection.get_path con) - | _ -> raise Invalid_Cmd_Args - --let process_watch t cons = -+let process_watch source t cons = - let oldroot = t.Transaction.oldroot in - let newroot = Store.get_root t.store in - let ops = Transaction.get_paths t |> List.rev in -@@ -67,8 +67,9 @@ let process_watch t cons = - | Xenbus.Xb.Op.Rm -> true, None, oldroot - | Xenbus.Xb.Op.Setperms -> false, Some oldroot, newroot - | _ -> raise (Failure "huh ?") in -- Connections.fire_watches ?oldroot root cons (snd op) recurse in -- List.iter (fun op -> do_op_watch op cons) ops -+ Connections.fire_watches ?oldroot source root cons (snd op) recurse in -+ List.iter (fun op -> do_op_watch op cons) ops; -+ Connections.send_watchevents cons source - - let create_implicit_path t perm path = - let dirname = Store.Path.get_parent path in -@@ -234,6 +235,20 @@ let do_debug con t _domains cons data = - | "watches" :: _ -> - let watches = Connections.debug cons in - Some (watches ^ "\000") -+ | "xenbus" :: domid :: _ -> -+ let domid = int_of_string domid in -+ let con = Connections.find_domain cons domid in -+ let s = Printf.sprintf "xenbus: %s; overflow queue length: %d, can_input: %b, has_more_input: %b, has_old_output: %b, has_new_output: %b, has_more_work: %b. 
pending: %s" -+ (Xenbus.Xb.debug con.xb) -+ (Connection.source_pending_watchevents con) -+ (Connection.can_input con) -+ (Connection.has_more_input con) -+ (Connection.has_old_output con) -+ (Connection.has_new_output con) -+ (Connection.has_more_work con) -+ (Connections.debug_watchevents cons con) -+ in -+ Some s - | "mfn" :: domid :: _ -> - let domid = int_of_string domid in - let con = Connections.find_domain cons domid in -@@ -342,7 +357,7 @@ let reply_ack fct con t doms cons data = - fct con t doms cons data; - Packet.Ack (fun () -> - if Transaction.get_id t = Transaction.none then -- process_watch t cons -+ process_watch con t cons - ) - - let reply_data fct con t doms cons data = -@@ -501,7 +516,7 @@ let do_watch con t _domains cons data = - Packet.Ack (fun () -> - (* xenstore.txt says this watch is fired immediately, - implying even if path doesn't exist or is unreadable *) -- Connection.fire_single_watch_unchecked watch) -+ Connection.fire_single_watch_unchecked con watch) - - let do_unwatch con _t _domains cons data = - let (node, token) = -@@ -532,7 +547,7 @@ let do_transaction_end con t domains cons data = - if not success then - raise Transaction_again; - if commit then begin -- process_watch t cons; -+ process_watch con t cons; - match t.Transaction.ty with - | Transaction.No -> - () (* no need to record anything *) -@@ -700,7 +715,8 @@ let process_packet ~store ~cons ~doms ~con ~req = - let do_input store cons doms con = - let newpacket = - try -- Connection.do_input con -+ if Connection.can_input con then Connection.do_input con -+ else None - with Xenbus.Xb.Reconnect -> - info "%s requests a reconnect" (Connection.get_domstr con); - History.reconnect con; -@@ -728,6 +744,7 @@ let do_input store cons doms con = - Connection.incr_ops con - - let do_output _store _cons _doms con = -+ Connection.source_flush_watchevents con; - if Connection.has_output con then ( - if Connection.has_new_output con then ( - let packet = Connection.peek_output con in -diff --git a/tools/ocaml/xenstored/xenstored.ml b/tools/ocaml/xenstored/xenstored.ml -index 3b57ad016dfb..c799e20f1145 100644 ---- a/tools/ocaml/xenstored/xenstored.ml -+++ b/tools/ocaml/xenstored/xenstored.ml -@@ -103,6 +103,8 @@ let parse_config filename = - ("quota-maxentity", Config.Set_int Quota.maxent); - ("quota-maxsize", Config.Set_int Quota.maxsize); - ("quota-maxrequests", Config.Set_int Define.maxrequests); -+ ("quota-maxoutstanding", Config.Set_int Define.maxoutstanding); -+ ("quota-maxwatchevents", Config.Set_int Define.maxwatchevents); - ("quota-path-max", Config.Set_int Define.path_max); - ("gc-max-overhead", Config.Set_int Define.gc_max_overhead); - ("test-eagain", Config.Set_bool Transaction.test_eagain); --- -2.37.4 - diff --git a/0108-SUPPORT.md-clarify-support-of-untrusted-driver-domai.patch b/0108-SUPPORT.md-clarify-support-of-untrusted-driver-domai.patch deleted file mode 100644 index 82773df..0000000 --- a/0108-SUPPORT.md-clarify-support-of-untrusted-driver-domai.patch +++ /dev/null @@ -1,55 +0,0 @@ -From 26faa6b55881445c25e7e83613c2354090fdff18 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Thu, 29 Sep 2022 13:07:35 +0200 -Subject: [PATCH 108/126] SUPPORT.md: clarify support of untrusted driver - domains with oxenstored - -Add a support statement for the scope of support regarding different -Xenstore variants. 
Especially oxenstored does not (yet) have security -support of untrusted driver domains, as those might drive oxenstored -out of memory by creating lots of watch events for the guests they are -servicing. - -Add a statement regarding Live Update support of oxenstored. - -This is part of XSA-326. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Acked-by: George Dunlap <george.dunlap@citrix.com> -Acked-by: Julien Grall <jgrall@amazon.com> -Reviewed-by: Christian Lindig <christian.lindig@citrix.com> -(cherry picked from commit c7bc20d8d123851a468402bbfc9e3330efff21ec) ---- - SUPPORT.md | 13 +++++++++---- - 1 file changed, 9 insertions(+), 4 deletions(-) - -diff --git a/SUPPORT.md b/SUPPORT.md -index 0fb262f81f40..48fb462221cf 100644 ---- a/SUPPORT.md -+++ b/SUPPORT.md -@@ -179,13 +179,18 @@ Support for running qemu-xen device model in a linux stubdomain. - - Status: Tech Preview - --## Liveupdate of C xenstored daemon -+## Xenstore - -- Status: Tech Preview -+### C xenstored daemon - --## Liveupdate of OCaml xenstored daemon -+ Status: Supported -+ Status, Liveupdate: Tech Preview - -- Status: Tech Preview -+### OCaml xenstored daemon -+ -+ Status: Supported -+ Status, untrusted driver domains: Supported, not security supported -+ Status, Liveupdate: Not functional - - ## Toolstack/3rd party - --- -2.37.4 - diff --git a/0109-tools-xenstore-don-t-use-conn-in-as-context-for-temp.patch b/0109-tools-xenstore-don-t-use-conn-in-as-context-for-temp.patch deleted file mode 100644 index c9a2e6e..0000000 --- a/0109-tools-xenstore-don-t-use-conn-in-as-context-for-temp.patch +++ /dev/null @@ -1,716 +0,0 @@ -From 607e186fe094f8d1c78572cd3b1f7a43730203c1 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:10 +0200 -Subject: [PATCH 109/126] tools/xenstore: don't use conn->in as context for - temporary allocations - -Using the struct buffered data pointer of the current processed request -for temporary data allocations has a major drawback: the used area (and -with that the temporary data) is freed only after the response of the -request has been written to the ring page or has been read via the -socket. This can happen much later in case a guest isn't reading its -responses fast enough. - -As the temporary data can be safely freed after creating the response, -add a temporary context for that purpose and use that for allocating -the temporary memory, as it was already the case before commit -cc0612464896 ("xenstore: add small default data buffer to internal -struct"). - -Some sub-functions need to gain the "const" attribute for the talloc -context. - -This is XSA-416 / CVE-2022-42319. 
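As a rough illustration of the allocation pattern this change adopts (a minimal sketch against the plain talloc library, not oxenstored/xenstored code; handle_request() and its payload handling are hypothetical), a throw-away talloc context is created per processed request, all scratch allocations are parented to it, and it is freed as soon as the reply has been produced, independently of how long the reply itself stays queued:

    #include <stdbool.h>
    #include <talloc.h>

    static bool handle_request(const char *payload)
    {
        void *ctx = talloc_new(NULL);        /* temporary, per-request context */
        if (!ctx)
            return false;

        /* Scratch data hangs off ctx, not off the request/response buffer. */
        char *scratch = talloc_asprintf(ctx, "parsed:%s", payload);
        if (!scratch) {
            talloc_free(ctx);
            return false;
        }

        /* ... build and queue the reply from 'scratch' here ... */

        talloc_free(ctx);    /* releases scratch and everything else on ctx */
        return true;
    }

Freeing the context right after the reply is constructed bounds the lifetime of temporary data to the processing of a single request, which is exactly what parenting it to conn->in could not guarantee.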
- -Fixes: cc0612464896 ("xenstore: add small default data buffer to internal struct") -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 2a587de219cc0765330fbf9fac6827bfaf29e29b) ---- - tools/xenstore/xenstored_control.c | 31 ++++++----- - tools/xenstore/xenstored_control.h | 3 +- - tools/xenstore/xenstored_core.c | 76 ++++++++++++++++---------- - tools/xenstore/xenstored_domain.c | 29 ++++++---- - tools/xenstore/xenstored_domain.h | 21 ++++--- - tools/xenstore/xenstored_transaction.c | 14 +++-- - tools/xenstore/xenstored_transaction.h | 6 +- - tools/xenstore/xenstored_watch.c | 9 +-- - tools/xenstore/xenstored_watch.h | 6 +- - 9 files changed, 118 insertions(+), 77 deletions(-) - -diff --git a/tools/xenstore/xenstored_control.c b/tools/xenstore/xenstored_control.c -index 980279fa53ff..95a60bf57858 100644 ---- a/tools/xenstore/xenstored_control.c -+++ b/tools/xenstore/xenstored_control.c -@@ -107,7 +107,7 @@ static const char *lu_begin(struct connection *conn) - - struct cmd_s { - char *cmd; -- int (*func)(void *, struct connection *, char **, int); -+ int (*func)(const void *, struct connection *, char **, int); - char *pars; - /* - * max_pars can be used to limit the size of the parameter vector, -@@ -119,7 +119,7 @@ struct cmd_s { - unsigned int max_pars; - }; - --static int do_control_check(void *ctx, struct connection *conn, -+static int do_control_check(const void *ctx, struct connection *conn, - char **vec, int num) - { - if (num) -@@ -131,7 +131,7 @@ static int do_control_check(void *ctx, struct connection *conn, - return 0; - } - --static int do_control_log(void *ctx, struct connection *conn, -+static int do_control_log(const void *ctx, struct connection *conn, - char **vec, int num) - { - if (num != 1) -@@ -233,7 +233,7 @@ static int quota_get(const void *ctx, struct connection *conn, - return domain_get_quota(ctx, conn, atoi(vec[0])); - } - --static int do_control_quota(void *ctx, struct connection *conn, -+static int do_control_quota(const void *ctx, struct connection *conn, - char **vec, int num) - { - if (num == 0) -@@ -245,7 +245,7 @@ static int do_control_quota(void *ctx, struct connection *conn, - return quota_get(ctx, conn, vec, num); - } - --static int do_control_quota_s(void *ctx, struct connection *conn, -+static int do_control_quota_s(const void *ctx, struct connection *conn, - char **vec, int num) - { - if (num == 0) -@@ -258,7 +258,7 @@ static int do_control_quota_s(void *ctx, struct connection *conn, - } - - #ifdef __MINIOS__ --static int do_control_memreport(void *ctx, struct connection *conn, -+static int do_control_memreport(const void *ctx, struct connection *conn, - char **vec, int num) - { - if (num) -@@ -270,7 +270,7 @@ static int do_control_memreport(void *ctx, struct connection *conn, - return 0; - } - #else --static int do_control_logfile(void *ctx, struct connection *conn, -+static int do_control_logfile(const void *ctx, struct connection *conn, - char **vec, int num) - { - if (num != 1) -@@ -285,7 +285,7 @@ static int do_control_logfile(void *ctx, struct connection *conn, - return 0; - } - --static int do_control_memreport(void *ctx, struct connection *conn, -+static int do_control_memreport(const void *ctx, struct connection *conn, - char **vec, int num) - { - FILE *fp; -@@ -325,7 +325,7 @@ static int do_control_memreport(void *ctx, struct connection *conn, - } - #endif - --static int do_control_print(void *ctx, struct connection *conn, -+static int do_control_print(const void 
*ctx, struct connection *conn, - char **vec, int num) - { - if (num != 1) -@@ -802,7 +802,7 @@ static const char *lu_start(const void *ctx, struct connection *conn, - return NULL; - } - --static int do_control_lu(void *ctx, struct connection *conn, -+static int do_control_lu(const void *ctx, struct connection *conn, - char **vec, int num) - { - const char *ret = NULL; -@@ -852,7 +852,7 @@ static int do_control_lu(void *ctx, struct connection *conn, - } - #endif - --static int do_control_help(void *, struct connection *, char **, int); -+static int do_control_help(const void *, struct connection *, char **, int); - - static struct cmd_s cmds[] = { - { "check", do_control_check, "" }, -@@ -891,7 +891,7 @@ static struct cmd_s cmds[] = { - { "help", do_control_help, "" }, - }; - --static int do_control_help(void *ctx, struct connection *conn, -+static int do_control_help(const void *ctx, struct connection *conn, - char **vec, int num) - { - int cmd, len = 0; -@@ -927,7 +927,8 @@ static int do_control_help(void *ctx, struct connection *conn, - return 0; - } - --int do_control(struct connection *conn, struct buffered_data *in) -+int do_control(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - unsigned int cmd, num, off; - char **vec = NULL; -@@ -947,11 +948,11 @@ int do_control(struct connection *conn, struct buffered_data *in) - num = xs_count_strings(in->buffer, in->used); - if (cmds[cmd].max_pars) - num = min(num, cmds[cmd].max_pars); -- vec = talloc_array(in, char *, num); -+ vec = talloc_array(ctx, char *, num); - if (!vec) - return ENOMEM; - if (get_strings(in, vec, num) < num) - return EIO; - -- return cmds[cmd].func(in, conn, vec + 1, num - 1); -+ return cmds[cmd].func(ctx, conn, vec + 1, num - 1); - } -diff --git a/tools/xenstore/xenstored_control.h b/tools/xenstore/xenstored_control.h -index aac61f05908f..6430c3769361 100644 ---- a/tools/xenstore/xenstored_control.h -+++ b/tools/xenstore/xenstored_control.h -@@ -16,5 +16,6 @@ - along with this program; If not, see <http://www.gnu.org/licenses/>. - */ - --int do_control(struct connection *conn, struct buffered_data *in); -+int do_control(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - void lu_read_state(void); -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index f27d5c0101bc..806f24bbab8b 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -1214,11 +1214,13 @@ static struct node *get_node_canonicalized(struct connection *conn, - return get_node(conn, ctx, *canonical_name, perm); - } - --static int send_directory(struct connection *conn, struct buffered_data *in) -+static int send_directory(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct node *node; - -- node = get_node_canonicalized(conn, in, onearg(in), NULL, XS_PERM_READ); -+ node = get_node_canonicalized(conn, ctx, onearg(in), NULL, -+ XS_PERM_READ); - if (!node) - return errno; - -@@ -1227,7 +1229,7 @@ static int send_directory(struct connection *conn, struct buffered_data *in) - return 0; - } - --static int send_directory_part(struct connection *conn, -+static int send_directory_part(const void *ctx, struct connection *conn, - struct buffered_data *in) - { - unsigned int off, len, maxlen, genlen; -@@ -1239,7 +1241,8 @@ static int send_directory_part(struct connection *conn, - return EINVAL; - - /* First arg is node name. 
*/ -- node = get_node_canonicalized(conn, in, in->buffer, NULL, XS_PERM_READ); -+ node = get_node_canonicalized(conn, ctx, in->buffer, NULL, -+ XS_PERM_READ); - if (!node) - return errno; - -@@ -1266,7 +1269,7 @@ static int send_directory_part(struct connection *conn, - break; - } - -- data = talloc_array(in, char, genlen + len + 1); -+ data = talloc_array(ctx, char, genlen + len + 1); - if (!data) - return ENOMEM; - -@@ -1282,11 +1285,13 @@ static int send_directory_part(struct connection *conn, - return 0; - } - --static int do_read(struct connection *conn, struct buffered_data *in) -+static int do_read(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct node *node; - -- node = get_node_canonicalized(conn, in, onearg(in), NULL, XS_PERM_READ); -+ node = get_node_canonicalized(conn, ctx, onearg(in), NULL, -+ XS_PERM_READ); - if (!node) - return errno; - -@@ -1476,7 +1481,8 @@ err: - } - - /* path, data... */ --static int do_write(struct connection *conn, struct buffered_data *in) -+static int do_write(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - unsigned int offset, datalen; - struct node *node; -@@ -1490,12 +1496,12 @@ static int do_write(struct connection *conn, struct buffered_data *in) - offset = strlen(vec[0]) + 1; - datalen = in->used - offset; - -- node = get_node_canonicalized(conn, in, vec[0], &name, XS_PERM_WRITE); -+ node = get_node_canonicalized(conn, ctx, vec[0], &name, XS_PERM_WRITE); - if (!node) { - /* No permissions, invalid input? */ - if (errno != ENOENT) - return errno; -- node = create_node(conn, in, name, in->buffer + offset, -+ node = create_node(conn, ctx, name, in->buffer + offset, - datalen); - if (!node) - return errno; -@@ -1506,18 +1512,19 @@ static int do_write(struct connection *conn, struct buffered_data *in) - return errno; - } - -- fire_watches(conn, in, name, node, false, NULL); -+ fire_watches(conn, ctx, name, node, false, NULL); - send_ack(conn, XS_WRITE); - - return 0; - } - --static int do_mkdir(struct connection *conn, struct buffered_data *in) -+static int do_mkdir(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct node *node; - char *name; - -- node = get_node_canonicalized(conn, in, onearg(in), &name, -+ node = get_node_canonicalized(conn, ctx, onearg(in), &name, - XS_PERM_WRITE); - - /* If it already exists, fine. */ -@@ -1527,10 +1534,10 @@ static int do_mkdir(struct connection *conn, struct buffered_data *in) - return errno; - if (!name) - return ENOMEM; -- node = create_node(conn, in, name, NULL, 0); -+ node = create_node(conn, ctx, name, NULL, 0); - if (!node) - return errno; -- fire_watches(conn, in, name, node, false, NULL); -+ fire_watches(conn, ctx, name, node, false, NULL); - } - send_ack(conn, XS_MKDIR); - -@@ -1628,24 +1635,25 @@ static int _rm(struct connection *conn, const void *ctx, struct node *node, - } - - --static int do_rm(struct connection *conn, struct buffered_data *in) -+static int do_rm(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct node *node; - int ret; - char *name; - char *parentname; - -- node = get_node_canonicalized(conn, in, onearg(in), &name, -+ node = get_node_canonicalized(conn, ctx, onearg(in), &name, - XS_PERM_WRITE); - if (!node) { - /* Didn't exist already? Fine, if parent exists. 
*/ - if (errno == ENOENT) { - if (!name) - return ENOMEM; -- parentname = get_parent(in, name); -+ parentname = get_parent(ctx, name); - if (!parentname) - return errno; -- node = read_node(conn, in, parentname); -+ node = read_node(conn, ctx, parentname); - if (node) { - send_ack(conn, XS_RM); - return 0; -@@ -1660,7 +1668,7 @@ static int do_rm(struct connection *conn, struct buffered_data *in) - if (streq(name, "/")) - return EINVAL; - -- ret = _rm(conn, in, node, name); -+ ret = _rm(conn, ctx, node, name); - if (ret) - return ret; - -@@ -1670,13 +1678,15 @@ static int do_rm(struct connection *conn, struct buffered_data *in) - } - - --static int do_get_perms(struct connection *conn, struct buffered_data *in) -+static int do_get_perms(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct node *node; - char *strings; - unsigned int len; - -- node = get_node_canonicalized(conn, in, onearg(in), NULL, XS_PERM_READ); -+ node = get_node_canonicalized(conn, ctx, onearg(in), NULL, -+ XS_PERM_READ); - if (!node) - return errno; - -@@ -1689,7 +1699,8 @@ static int do_get_perms(struct connection *conn, struct buffered_data *in) - return 0; - } - --static int do_set_perms(struct connection *conn, struct buffered_data *in) -+static int do_set_perms(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct node_perms perms, old_perms; - char *name, *permstr; -@@ -1706,7 +1717,7 @@ static int do_set_perms(struct connection *conn, struct buffered_data *in) - - permstr = in->buffer + strlen(in->buffer) + 1; - -- perms.p = talloc_array(in, struct xs_permissions, perms.num); -+ perms.p = talloc_array(ctx, struct xs_permissions, perms.num); - if (!perms.p) - return ENOMEM; - if (!xs_strings_to_perms(perms.p, perms.num, permstr)) -@@ -1721,7 +1732,7 @@ static int do_set_perms(struct connection *conn, struct buffered_data *in) - } - - /* We must own node to do this (tools can do this too). */ -- node = get_node_canonicalized(conn, in, in->buffer, &name, -+ node = get_node_canonicalized(conn, ctx, in->buffer, &name, - XS_PERM_WRITE | XS_PERM_OWNER); - if (!node) - return errno; -@@ -1756,7 +1767,7 @@ static int do_set_perms(struct connection *conn, struct buffered_data *in) - return errno; - } - -- fire_watches(conn, in, name, node, false, &old_perms); -+ fire_watches(conn, ctx, name, node, false, &old_perms); - send_ack(conn, XS_SET_PERMS); - - return 0; -@@ -1764,7 +1775,8 @@ static int do_set_perms(struct connection *conn, struct buffered_data *in) - - static struct { - const char *str; -- int (*func)(struct connection *conn, struct buffered_data *in); -+ int (*func)(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - unsigned int flags; - #define XS_FLAG_NOTID (1U << 0) /* Ignore transaction id. */ - #define XS_FLAG_PRIV (1U << 1) /* Privileged domain only. 
*/ -@@ -1840,6 +1852,7 @@ static void process_message(struct connection *conn, struct buffered_data *in) - struct transaction *trans; - enum xsd_sockmsg_type type = in->hdr.msg.type; - int ret; -+ void *ctx; - - if ((unsigned int)type >= XS_TYPE_COUNT || !wire_funcs[type].func) { - eprintf("Client unknown operation %i", type); -@@ -1860,10 +1873,17 @@ static void process_message(struct connection *conn, struct buffered_data *in) - return; - } - -+ ctx = talloc_new(NULL); -+ if (!ctx) { -+ send_error(conn, ENOMEM); -+ return; -+ } -+ - assert(conn->transaction == NULL); - conn->transaction = trans; - -- ret = wire_funcs[type].func(conn, in); -+ ret = wire_funcs[type].func(ctx, conn, in); -+ talloc_free(ctx); - if (ret) - send_error(conn, ret); - -diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c -index 3d5142581332..d262f4e9dbdf 100644 ---- a/tools/xenstore/xenstored_domain.c -+++ b/tools/xenstore/xenstored_domain.c -@@ -336,7 +336,7 @@ bool domain_can_write(struct connection *conn) - return ((intf->rsp_prod - intf->rsp_cons) != XENSTORE_RING_SIZE); - } - --static char *talloc_domain_path(void *context, unsigned int domid) -+static char *talloc_domain_path(const void *context, unsigned int domid) - { - return talloc_asprintf(context, "/local/domain/%u", domid); - } -@@ -540,7 +540,8 @@ static struct domain *introduce_domain(const void *ctx, - } - - /* domid, gfn, evtchn, path */ --int do_introduce(struct connection *conn, struct buffered_data *in) -+int do_introduce(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct domain *domain; - char *vec[3]; -@@ -558,7 +559,7 @@ int do_introduce(struct connection *conn, struct buffered_data *in) - if (port <= 0) - return EINVAL; - -- domain = introduce_domain(in, domid, port, false); -+ domain = introduce_domain(ctx, domid, port, false); - if (!domain) - return errno; - -@@ -581,7 +582,8 @@ static struct domain *find_connected_domain(unsigned int domid) - return domain; - } - --int do_set_target(struct connection *conn, struct buffered_data *in) -+int do_set_target(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - char *vec[2]; - unsigned int domid, tdomid; -@@ -625,7 +627,8 @@ static struct domain *onearg_domain(struct connection *conn, - } - - /* domid */ --int do_release(struct connection *conn, struct buffered_data *in) -+int do_release(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct domain *domain; - -@@ -640,7 +643,8 @@ int do_release(struct connection *conn, struct buffered_data *in) - return 0; - } - --int do_resume(struct connection *conn, struct buffered_data *in) -+int do_resume(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct domain *domain; - -@@ -655,7 +659,8 @@ int do_resume(struct connection *conn, struct buffered_data *in) - return 0; - } - --int do_get_domain_path(struct connection *conn, struct buffered_data *in) -+int do_get_domain_path(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - char *path; - const char *domid_str = onearg(in); -@@ -663,18 +668,17 @@ int do_get_domain_path(struct connection *conn, struct buffered_data *in) - if (!domid_str) - return EINVAL; - -- path = talloc_domain_path(conn, atoi(domid_str)); -+ path = talloc_domain_path(ctx, atoi(domid_str)); - if (!path) - return errno; - - send_reply(conn, XS_GET_DOMAIN_PATH, path, strlen(path) + 1); - -- talloc_free(path); -- - return 0; - } - --int do_is_domain_introduced(struct 
connection *conn, struct buffered_data *in) -+int do_is_domain_introduced(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - int result; - unsigned int domid; -@@ -695,7 +699,8 @@ int do_is_domain_introduced(struct connection *conn, struct buffered_data *in) - } - - /* Allow guest to reset all watches */ --int do_reset_watches(struct connection *conn, struct buffered_data *in) -+int do_reset_watches(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - conn_delete_all_watches(conn); - conn_delete_all_transactions(conn); -diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h -index 0f883936f413..da513443cd46 100644 ---- a/tools/xenstore/xenstored_domain.h -+++ b/tools/xenstore/xenstored_domain.h -@@ -24,25 +24,32 @@ void handle_event(void); - void check_domains(bool restore); - - /* domid, mfn, eventchn, path */ --int do_introduce(struct connection *conn, struct buffered_data *in); -+int do_introduce(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - - /* domid */ --int do_is_domain_introduced(struct connection *conn, struct buffered_data *in); -+int do_is_domain_introduced(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - - /* domid */ --int do_release(struct connection *conn, struct buffered_data *in); -+int do_release(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - - /* domid */ --int do_resume(struct connection *conn, struct buffered_data *in); -+int do_resume(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - - /* domid, target */ --int do_set_target(struct connection *conn, struct buffered_data *in); -+int do_set_target(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - - /* domid */ --int do_get_domain_path(struct connection *conn, struct buffered_data *in); -+int do_get_domain_path(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - - /* Allow guest to reset all watches */ --int do_reset_watches(struct connection *conn, struct buffered_data *in); -+int do_reset_watches(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - - void domain_init(int evtfd); - void dom0_init(void); -diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c -index 28774813de83..3e3eb47326cc 100644 ---- a/tools/xenstore/xenstored_transaction.c -+++ b/tools/xenstore/xenstored_transaction.c -@@ -481,7 +481,8 @@ struct transaction *transaction_lookup(struct connection *conn, uint32_t id) - return ERR_PTR(-ENOENT); - } - --int do_transaction_start(struct connection *conn, struct buffered_data *in) -+int do_transaction_start(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct transaction *trans, *exists; - char id_str[20]; -@@ -494,8 +495,8 @@ int do_transaction_start(struct connection *conn, struct buffered_data *in) - conn->transaction_started > quota_max_transaction) - return ENOSPC; - -- /* Attach transaction to input for autofree until it's complete */ -- trans = talloc_zero(in, struct transaction); -+ /* Attach transaction to ctx for autofree until it's complete */ -+ trans = talloc_zero(ctx, struct transaction); - if (!trans) - return ENOMEM; - -@@ -544,7 +545,8 @@ static int transaction_fix_domains(struct transaction *trans, bool update) - return 0; - } - --int do_transaction_end(struct connection *conn, struct buffered_data *in) -+int do_transaction_end(const void *ctx, struct connection *conn, -+ 
struct buffered_data *in) - { - const char *arg = onearg(in); - struct transaction *trans; -@@ -562,8 +564,8 @@ int do_transaction_end(struct connection *conn, struct buffered_data *in) - if (!conn->transaction_started) - conn->ta_start_time = 0; - -- /* Attach transaction to in for auto-cleanup */ -- talloc_steal(in, trans); -+ /* Attach transaction to ctx for auto-cleanup */ -+ talloc_steal(ctx, trans); - - if (streq(arg, "T")) { - if (trans->fail) -diff --git a/tools/xenstore/xenstored_transaction.h b/tools/xenstore/xenstored_transaction.h -index e3cbd6b23095..39d7f81c5127 100644 ---- a/tools/xenstore/xenstored_transaction.h -+++ b/tools/xenstore/xenstored_transaction.h -@@ -29,8 +29,10 @@ struct transaction; - - extern uint64_t generation; - --int do_transaction_start(struct connection *conn, struct buffered_data *node); --int do_transaction_end(struct connection *conn, struct buffered_data *in); -+int do_transaction_start(const void *ctx, struct connection *conn, -+ struct buffered_data *node); -+int do_transaction_end(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - - struct transaction *transaction_lookup(struct connection *conn, uint32_t id); - -diff --git a/tools/xenstore/xenstored_watch.c b/tools/xenstore/xenstored_watch.c -index 4970e9f1a1b9..854bbcad6e45 100644 ---- a/tools/xenstore/xenstored_watch.c -+++ b/tools/xenstore/xenstored_watch.c -@@ -243,7 +243,7 @@ static struct watch *add_watch(struct connection *conn, char *path, char *token, - return NULL; - } - --int do_watch(struct connection *conn, struct buffered_data *in) -+int do_watch(const void *ctx, struct connection *conn, struct buffered_data *in) - { - struct watch *watch; - char *vec[2]; -@@ -252,7 +252,7 @@ int do_watch(struct connection *conn, struct buffered_data *in) - if (get_strings(in, vec, ARRAY_SIZE(vec)) != ARRAY_SIZE(vec)) - return EINVAL; - -- errno = check_watch_path(conn, in, &(vec[0]), &relative); -+ errno = check_watch_path(conn, ctx, &(vec[0]), &relative); - if (errno) - return errno; - -@@ -283,7 +283,8 @@ int do_watch(struct connection *conn, struct buffered_data *in) - return 0; - } - --int do_unwatch(struct connection *conn, struct buffered_data *in) -+int do_unwatch(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct watch *watch; - char *node, *vec[2]; -@@ -291,7 +292,7 @@ int do_unwatch(struct connection *conn, struct buffered_data *in) - if (get_strings(in, vec, ARRAY_SIZE(vec)) != ARRAY_SIZE(vec)) - return EINVAL; - -- node = canonicalize(conn, in, vec[0]); -+ node = canonicalize(conn, ctx, vec[0]); - if (!node) - return ENOMEM; - list_for_each_entry(watch, &conn->watches, list) { -diff --git a/tools/xenstore/xenstored_watch.h b/tools/xenstore/xenstored_watch.h -index 0e693f0839cd..091890edca96 100644 ---- a/tools/xenstore/xenstored_watch.h -+++ b/tools/xenstore/xenstored_watch.h -@@ -21,8 +21,10 @@ - - #include "xenstored_core.h" - --int do_watch(struct connection *conn, struct buffered_data *in); --int do_unwatch(struct connection *conn, struct buffered_data *in); -+int do_watch(const void *ctx, struct connection *conn, -+ struct buffered_data *in); -+int do_unwatch(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - - /* Fire all watches: !exact means all the children are affected (ie. rm). 
*/ - void fire_watches(struct connection *conn, const void *tmp, const char *name, --- -2.37.4 - diff --git a/0110-tools-xenstore-fix-checking-node-permissions.patch b/0110-tools-xenstore-fix-checking-node-permissions.patch deleted file mode 100644 index 77345f7..0000000 --- a/0110-tools-xenstore-fix-checking-node-permissions.patch +++ /dev/null @@ -1,143 +0,0 @@ -From 8012324cb9e676bd342a5adfda1700525f195e2e Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:10 +0200 -Subject: [PATCH 110/126] tools/xenstore: fix checking node permissions - -Today chk_domain_generation() is being used to check whether a node -permission entry is still valid or whether it is referring to a domain -no longer existing. This is done by comparing the node's and the -domain's generation count. - -In case no struct domain is existing for a checked domain, but the -domain itself is valid, chk_domain_generation() assumes it is being -called due to the first node created for a new domain and it will -return success. - -This might be wrong in case the checked permission is related to an -old domain, which has just been replaced with a new domain using the -same domid. - -Fix that by letting chk_domain_generation() fail in case a struct -domain isn't found. In order to cover the case of the first node for -a new domain try to allocate the needed struct domain explicitly when -processing the related SET_PERMS command. In case a referenced domain -isn't existing, flag the related permission to be ignored right away. - -This is XSA-417 / CVE-2022-42320. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit ab128218225d3542596ca3a02aee80d55494bef8) ---- - tools/xenstore/xenstored_core.c | 5 +++++ - tools/xenstore/xenstored_domain.c | 37 +++++++++++++++++++++---------- - tools/xenstore/xenstored_domain.h | 1 + - 3 files changed, 31 insertions(+), 12 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 806f24bbab8b..8aecd425f274 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -1723,6 +1723,11 @@ static int do_set_perms(const void *ctx, struct connection *conn, - if (!xs_strings_to_perms(perms.p, perms.num, permstr)) - return errno; - -+ if (domain_alloc_permrefs(&perms) < 0) -+ return ENOMEM; -+ if (perms.p[0].perms & XS_PERM_IGNORE) -+ return ENOENT; -+ - /* First arg is node name. */ - if (strstarts(in->buffer, "@")) { - if (set_perms_special(conn, in->buffer, &perms)) -diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c -index d262f4e9dbdf..8b503c2dfe07 100644 ---- a/tools/xenstore/xenstored_domain.c -+++ b/tools/xenstore/xenstored_domain.c -@@ -881,7 +881,6 @@ int domain_entry_inc(struct connection *conn, struct node *node) - * count (used for testing whether a node permission is older than a domain). - * - * Return values: -- * -1: error - * 0: domain has higher generation count (it is younger than a node with the - * given count), or domain isn't existing any longer - * 1: domain is older than the node -@@ -889,20 +888,38 @@ int domain_entry_inc(struct connection *conn, struct node *node) - static int chk_domain_generation(unsigned int domid, uint64_t gen) - { - struct domain *d; -- xc_dominfo_t dominfo; - - if (!xc_handle && domid == 0) - return 1; - - d = find_domain_struct(domid); -- if (d) -- return (d->generation <= gen) ? 
1 : 0; - -- if (!get_domain_info(domid, &dominfo)) -- return 0; -+ return (d && d->generation <= gen) ? 1 : 0; -+} - -- d = alloc_domain(NULL, domid); -- return d ? 1 : -1; -+/* -+ * Allocate all missing struct domain referenced by a permission set. -+ * Any permission entries for not existing domains will be marked to be -+ * ignored. -+ */ -+int domain_alloc_permrefs(struct node_perms *perms) -+{ -+ unsigned int i, domid; -+ struct domain *d; -+ xc_dominfo_t dominfo; -+ -+ for (i = 0; i < perms->num; i++) { -+ domid = perms->p[i].id; -+ d = find_domain_struct(domid); -+ if (!d) { -+ if (!get_domain_info(domid, &dominfo)) -+ perms->p[i].perms |= XS_PERM_IGNORE; -+ else if (!alloc_domain(NULL, domid)) -+ return ENOMEM; -+ } -+ } -+ -+ return 0; - } - - /* -@@ -915,8 +932,6 @@ int domain_adjust_node_perms(struct connection *conn, struct node *node) - int ret; - - ret = chk_domain_generation(node->perms.p[0].id, node->generation); -- if (ret < 0) -- return errno; - - /* If the owner doesn't exist any longer give it to priv domain. */ - if (!ret) { -@@ -933,8 +948,6 @@ int domain_adjust_node_perms(struct connection *conn, struct node *node) - continue; - ret = chk_domain_generation(node->perms.p[i].id, - node->generation); -- if (ret < 0) -- return errno; - if (!ret) - node->perms.p[i].perms |= XS_PERM_IGNORE; - } -diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h -index da513443cd46..0b4f56b8146c 100644 ---- a/tools/xenstore/xenstored_domain.h -+++ b/tools/xenstore/xenstored_domain.h -@@ -66,6 +66,7 @@ bool domain_is_unprivileged(struct connection *conn); - - /* Remove node permissions for no longer existing domains. */ - int domain_adjust_node_perms(struct connection *conn, struct node *node); -+int domain_alloc_permrefs(struct node_perms *perms); - - /* Quota manipulation */ - int domain_entry_inc(struct connection *conn, struct node *); --- -2.37.4 - diff --git a/0111-tools-xenstore-remove-recursion-from-construct_node.patch b/0111-tools-xenstore-remove-recursion-from-construct_node.patch deleted file mode 100644 index aa63d32..0000000 --- a/0111-tools-xenstore-remove-recursion-from-construct_node.patch +++ /dev/null @@ -1,126 +0,0 @@ -From 62755d0a90344e704062e7b6943a3fa2dc5e02e6 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:11 +0200 -Subject: [PATCH 111/126] tools/xenstore: remove recursion from - construct_node() - -In order to reduce stack usage due to recursion, switch -construct_node() to use a loop instead. - -This is part of XSA-418 / CVE-2022-42321. 
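The overall shape of the conversion, shown as a stand-alone sketch rather than the xenstored implementation (node_exists() and node_create() are hypothetical helpers, and a fixed maximum depth stands in for proper error handling): record the missing ancestors while walking up the path, then create them parent-first on the way back down, so no call recurses per path component.

    #include <stdbool.h>
    #include <stdlib.h>
    #include <string.h>

    extern bool node_exists(const char *path);
    extern void node_create(const char *path);

    static void create_with_ancestors(const char *path)
    {
        char *missing[64];   /* paths still to be created, deepest first */
        size_t n = 0;
        char *cur = strdup(path);

        /* Walk up until an existing ancestor is found ("/" assumed to exist). */
        while (cur && strcmp(cur, "/") && !node_exists(cur) && n < 64) {
            missing[n++] = cur;
            char *slash = strrchr(cur, '/');
            cur = (slash && slash != cur) ? strndup(cur, slash - cur)
                                          : strdup("/");
        }
        free(cur);

        /* Walk back down, creating the missing nodes parent-first. */
        while (n--) {
            node_create(missing[n]);
            free(missing[n]);
        }
    }

With this shape the stack usage stays constant no matter how deep the requested path is, which is the point of the change.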
- -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit da8ee25d02a5447ba39a9800ee2a710ae1f54222) ---- - tools/xenstore/xenstored_core.c | 86 +++++++++++++++++++++------------ - 1 file changed, 55 insertions(+), 31 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 8aecd425f274..46a37e5257e5 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -1343,45 +1343,69 @@ static int add_child(const void *ctx, struct node *parent, const char *name) - static struct node *construct_node(struct connection *conn, const void *ctx, - const char *name) - { -- struct node *parent, *node; -- char *parentname = get_parent(ctx, name); -+ const char **names = NULL; -+ unsigned int levels = 0; -+ struct node *node = NULL; -+ struct node *parent = NULL; -+ const char *parentname = talloc_strdup(ctx, name); - - if (!parentname) - return NULL; - -- /* If parent doesn't exist, create it. */ -- parent = read_node(conn, parentname, parentname); -- if (!parent && errno == ENOENT) -- parent = construct_node(conn, ctx, parentname); -- if (!parent) -- return NULL; -+ /* Walk the path up until an existing node is found. */ -+ while (!parent) { -+ names = talloc_realloc(ctx, names, const char *, levels + 1); -+ if (!names) -+ goto nomem; - -- /* Add child to parent. */ -- if (add_child(ctx, parent, name)) -- goto nomem; -+ /* -+ * names[0] is the name of the node to construct initially, -+ * names[1] is its parent, and so on. -+ */ -+ names[levels] = parentname; -+ parentname = get_parent(ctx, parentname); -+ if (!parentname) -+ return NULL; - -- /* Allocate node */ -- node = talloc(ctx, struct node); -- if (!node) -- goto nomem; -- node->name = talloc_strdup(node, name); -- if (!node->name) -- goto nomem; -+ /* Try to read parent node until we found an existing one. */ -+ parent = read_node(conn, ctx, parentname); -+ if (!parent && (errno != ENOENT || !strcmp(parentname, "/"))) -+ return NULL; - -- /* Inherit permissions, except unprivileged domains own what they create */ -- node->perms.num = parent->perms.num; -- node->perms.p = talloc_memdup(node, parent->perms.p, -- node->perms.num * sizeof(*node->perms.p)); -- if (!node->perms.p) -- goto nomem; -- if (domain_is_unprivileged(conn)) -- node->perms.p[0].id = conn->id; -+ levels++; -+ } -+ -+ /* Walk the path down again constructing the missing nodes. */ -+ for (; levels > 0; levels--) { -+ /* Add child to parent. */ -+ if (add_child(ctx, parent, names[levels - 1])) -+ goto nomem; -+ -+ /* Allocate node */ -+ node = talloc(ctx, struct node); -+ if (!node) -+ goto nomem; -+ node->name = talloc_steal(node, names[levels - 1]); -+ -+ /* Inherit permissions, unpriv domains own what they create. 
*/ -+ node->perms.num = parent->perms.num; -+ node->perms.p = talloc_memdup(node, parent->perms.p, -+ node->perms.num * -+ sizeof(*node->perms.p)); -+ if (!node->perms.p) -+ goto nomem; -+ if (domain_is_unprivileged(conn)) -+ node->perms.p[0].id = conn->id; -+ -+ /* No children, no data */ -+ node->children = node->data = NULL; -+ node->childlen = node->datalen = 0; -+ node->acc.memory = 0; -+ node->parent = parent; -+ -+ parent = node; -+ } - -- /* No children, no data */ -- node->children = node->data = NULL; -- node->childlen = node->datalen = 0; -- node->acc.memory = 0; -- node->parent = parent; - return node; - - nomem: --- -2.37.4 - diff --git a/0112-tools-xenstore-don-t-let-remove_child_entry-call-cor.patch b/0112-tools-xenstore-don-t-let-remove_child_entry-call-cor.patch deleted file mode 100644 index 8250ff0..0000000 --- a/0112-tools-xenstore-don-t-let-remove_child_entry-call-cor.patch +++ /dev/null @@ -1,110 +0,0 @@ -From b9a005b0b4520261c6c362fca55500782837f119 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:11 +0200 -Subject: [PATCH 112/126] tools/xenstore: don't let remove_child_entry() call - corrupt() - -In case of write_node() returning an error, remove_child_entry() will -call corrupt() today. This could result in an endless recursion, as -remove_child_entry() is called by corrupt(), too: - -corrupt() - check_store() - check_store_() - remove_child_entry() - -Fix that by letting remove_child_entry() return an error instead and -let the caller decide what to do. - -This is part of XSA-418 / CVE-2022-42321. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 0c00c51f3bc8206c7f9cf87d014650157bee2bf4) ---- - tools/xenstore/xenstored_core.c | 36 ++++++++++++++++++--------------- - 1 file changed, 20 insertions(+), 16 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 46a37e5257e5..4c3897721bdd 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -1574,15 +1574,15 @@ static void memdel(void *mem, unsigned off, unsigned len, unsigned total) - memmove(mem + off, mem + off + len, total - off - len); - } - --static void remove_child_entry(struct connection *conn, struct node *node, -- size_t offset) -+static int remove_child_entry(struct connection *conn, struct node *node, -+ size_t offset) - { - size_t childlen = strlen(node->children + offset); - - memdel(node->children, offset, childlen + 1, node->childlen); - node->childlen -= childlen + 1; -- if (write_node(conn, node, true)) -- corrupt(conn, "Can't update parent node '%s'", node->name); -+ -+ return write_node(conn, node, true); - } - - static void delete_child(struct connection *conn, -@@ -1592,7 +1592,9 @@ static void delete_child(struct connection *conn, - - for (i = 0; i < node->childlen; i += strlen(node->children+i) + 1) { - if (streq(node->children+i, childname)) { -- remove_child_entry(conn, node, i); -+ if (remove_child_entry(conn, node, i)) -+ corrupt(conn, "Can't update parent node '%s'", -+ node->name); - return; - } - } -@@ -2226,6 +2228,17 @@ int remember_string(struct hashtable *hash, const char *str) - return hashtable_insert(hash, k, (void *)1); - } - -+static int rm_child_entry(struct node *node, size_t off, size_t len) -+{ -+ if (!recovery) -+ return off; -+ -+ if (remove_child_entry(NULL, node, off)) -+ log("check_store: child entry could not be removed from '%s'", -+ node->name); -+ -+ return off - len 
- 1; -+} - - /** - * A node has a children field that names the children of the node, separated -@@ -2278,12 +2291,7 @@ static int check_store_(const char *name, struct hashtable *reachable) - if (hashtable_search(children, childname)) { - log("check_store: '%s' is duplicated!", - childname); -- -- if (recovery) { -- remove_child_entry(NULL, node, -- i); -- i -= childlen + 1; -- } -+ i = rm_child_entry(node, i, childlen); - } - else { - if (!remember_string(children, -@@ -2300,11 +2308,7 @@ static int check_store_(const char *name, struct hashtable *reachable) - } else if (errno != ENOMEM) { - log("check_store: No child '%s' found!\n", - childname); -- -- if (recovery) { -- remove_child_entry(NULL, node, i); -- i -= childlen + 1; -- } -+ i = rm_child_entry(node, i, childlen); - } else { - log("check_store: ENOMEM"); - ret = ENOMEM; --- -2.37.4 - diff --git a/0113-tools-xenstore-add-generic-treewalk-function.patch b/0113-tools-xenstore-add-generic-treewalk-function.patch deleted file mode 100644 index b80c574..0000000 --- a/0113-tools-xenstore-add-generic-treewalk-function.patch +++ /dev/null @@ -1,250 +0,0 @@ -From 83b6c511a5989a83c50daae83c5b5a683d6dc096 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:11 +0200 -Subject: [PATCH 113/126] tools/xenstore: add generic treewalk function - -Add a generic function to walk the complete node tree. It will start -at "/" and descend recursively into each child, calling a function -specified by the caller. Depending on the return value of the user -specified function the walk will be aborted, continued, or the current -child will be skipped by not descending into its children. - -This is part of XSA-418 / CVE-2022-42321. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Acked-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 0d7c5d19bc27492360196e7dad2b227908564fff) ---- - tools/xenstore/xenstored_core.c | 143 +++++++++++++++++++++++++++++--- - tools/xenstore/xenstored_core.h | 40 +++++++++ - 2 files changed, 170 insertions(+), 13 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 4c3897721bdd..7463d0a002d7 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -1804,6 +1804,135 @@ static int do_set_perms(const void *ctx, struct connection *conn, - return 0; - } - -+static char *child_name(const void *ctx, const char *s1, const char *s2) -+{ -+ if (strcmp(s1, "/")) -+ return talloc_asprintf(ctx, "%s/%s", s1, s2); -+ return talloc_asprintf(ctx, "/%s", s2); -+} -+ -+static int rm_from_parent(struct connection *conn, struct node *parent, -+ const char *name) -+{ -+ size_t off; -+ -+ if (!parent) -+ return WALK_TREE_ERROR_STOP; -+ -+ for (off = parent->childoff - 1; off && parent->children[off - 1]; -+ off--); -+ if (remove_child_entry(conn, parent, off)) { -+ log("treewalk: child entry could not be removed from '%s'", -+ parent->name); -+ return WALK_TREE_ERROR_STOP; -+ } -+ parent->childoff = off; -+ -+ return WALK_TREE_OK; -+} -+ -+static int walk_call_func(const void *ctx, struct connection *conn, -+ struct node *node, struct node *parent, void *arg, -+ int (*func)(const void *ctx, struct connection *conn, -+ struct node *node, void *arg)) -+{ -+ int ret; -+ -+ if (!func) -+ return WALK_TREE_OK; -+ -+ ret = func(ctx, conn, node, arg); -+ if (ret == WALK_TREE_RM_CHILDENTRY && parent) -+ ret = rm_from_parent(conn, parent, node->name); -+ -+ return ret; -+} -+ -+int walk_node_tree(const void *ctx, struct 
connection *conn, const char *root, -+ struct walk_funcs *funcs, void *arg) -+{ -+ int ret = 0; -+ void *tmpctx; -+ char *name; -+ struct node *node = NULL; -+ struct node *parent = NULL; -+ -+ tmpctx = talloc_new(ctx); -+ if (!tmpctx) { -+ errno = ENOMEM; -+ return WALK_TREE_ERROR_STOP; -+ } -+ name = talloc_strdup(tmpctx, root); -+ if (!name) { -+ errno = ENOMEM; -+ talloc_free(tmpctx); -+ return WALK_TREE_ERROR_STOP; -+ } -+ -+ /* Continue the walk until an error is returned. */ -+ while (ret >= 0) { -+ /* node == NULL possible only for the initial loop iteration. */ -+ if (node) { -+ /* Go one step up if ret or if last child finished. */ -+ if (ret || node->childoff >= node->childlen) { -+ parent = node->parent; -+ /* Call function AFTER processing a node. */ -+ ret = walk_call_func(ctx, conn, node, parent, -+ arg, funcs->exit); -+ /* Last node, so exit loop. */ -+ if (!parent) -+ break; -+ talloc_free(node); -+ /* Continue with parent. */ -+ node = parent; -+ continue; -+ } -+ /* Get next child of current node. */ -+ name = child_name(tmpctx, node->name, -+ node->children + node->childoff); -+ if (!name) { -+ ret = WALK_TREE_ERROR_STOP; -+ break; -+ } -+ /* Point to next child. */ -+ node->childoff += strlen(node->children + -+ node->childoff) + 1; -+ /* Descent into children. */ -+ parent = node; -+ } -+ /* Read next node (root node or next child). */ -+ node = read_node(conn, tmpctx, name); -+ if (!node) { -+ /* Child not found - should not happen! */ -+ /* ENOENT case can be handled by supplied function. */ -+ if (errno == ENOENT && funcs->enoent) -+ ret = funcs->enoent(ctx, conn, parent, name, -+ arg); -+ else -+ ret = WALK_TREE_ERROR_STOP; -+ if (!parent) -+ break; -+ if (ret == WALK_TREE_RM_CHILDENTRY) -+ ret = rm_from_parent(conn, parent, name); -+ if (ret < 0) -+ break; -+ talloc_free(name); -+ node = parent; -+ continue; -+ } -+ talloc_free(name); -+ node->parent = parent; -+ node->childoff = 0; -+ /* Call function BEFORE processing a node. */ -+ ret = walk_call_func(ctx, conn, node, parent, arg, -+ funcs->enter); -+ } -+ -+ talloc_free(tmpctx); -+ -+ return ret < 0 ? ret : WALK_TREE_OK; -+} -+ - static struct { - const char *str; - int (*func)(const void *ctx, struct connection *conn, -@@ -2206,18 +2335,6 @@ static int keys_equal_fn(void *key1, void *key2) - return 0 == strcmp((char *)key1, (char *)key2); - } - -- --static char *child_name(const char *s1, const char *s2) --{ -- if (strcmp(s1, "/")) { -- return talloc_asprintf(NULL, "%s/%s", s1, s2); -- } -- else { -- return talloc_asprintf(NULL, "/%s", s2); -- } --} -- -- - int remember_string(struct hashtable *hash, const char *str) - { - char *k = malloc(strlen(str) + 1); -@@ -2277,7 +2394,7 @@ static int check_store_(const char *name, struct hashtable *reachable) - while (i < node->childlen && !ret) { - struct node *childnode; - size_t childlen = strlen(node->children + i); -- char * childname = child_name(node->name, -+ char * childname = child_name(NULL, node->name, - node->children + i); - - if (!childname) { -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index 1eb3708f82dd..f0fd8c352857 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -195,6 +195,7 @@ struct node { - - /* Children, each nul-terminated. */ - unsigned int childlen; -+ unsigned int childoff; /* Used by walk_node_tree() internally. */ - char *children; - - /* Allocation information for node currently in store. 
*/ -@@ -334,6 +335,45 @@ void read_state_buffered_data(const void *ctx, struct connection *conn, - const struct xs_state_connection *sc); - void read_state_node(const void *ctx, const void *state); - -+/* -+ * Walk the node tree below root calling funcs->enter() and funcs->exit() for -+ * each node. funcs->enter() is being called when entering a node, so before -+ * any of the children of the node is processed. funcs->exit() is being -+ * called when leaving the node, so after all children have been processed. -+ * funcs->enoent() is being called when a node isn't existing. -+ * funcs->*() return values: -+ * < 0: tree walk is stopped, walk_node_tree() returns funcs->*() return value -+ * in case WALK_TREE_ERROR_STOP is returned, errno should be set -+ * WALK_TREE_OK: tree walk is continuing -+ * WALK_TREE_SKIP_CHILDREN: tree walk won't descend below current node, but -+ * walk continues -+ * WALK_TREE_RM_CHILDENTRY: Remove the child entry from its parent and write -+ * the modified parent node back to the data base, implies to not descend -+ * below the current node, but to continue the walk -+ * funcs->*() is allowed to modify the node it is called for in the data base. -+ * In case funcs->enter() is deleting the node, it must not return WALK_TREE_OK -+ * in order to avoid descending into no longer existing children. -+ */ -+/* Return values for funcs->*() and walk_node_tree(). */ -+#define WALK_TREE_SUCCESS_STOP -100 /* Stop walk early, no error. */ -+#define WALK_TREE_ERROR_STOP -1 /* Stop walk due to error. */ -+#define WALK_TREE_OK 0 /* No error. */ -+/* Return value for funcs->*() only. */ -+#define WALK_TREE_SKIP_CHILDREN 1 /* Don't recurse below current node. */ -+#define WALK_TREE_RM_CHILDENTRY 2 /* Remove child entry from parent. */ -+ -+struct walk_funcs { -+ int (*enter)(const void *ctx, struct connection *conn, -+ struct node *node, void *arg); -+ int (*exit)(const void *ctx, struct connection *conn, -+ struct node *node, void *arg); -+ int (*enoent)(const void *ctx, struct connection *conn, -+ struct node *parent, char *name, void *arg); -+}; -+ -+int walk_node_tree(const void *ctx, struct connection *conn, const char *root, -+ struct walk_funcs *funcs, void *arg); -+ - #endif /* _XENSTORED_CORE_H */ - - /* --- -2.37.4 - diff --git a/0114-tools-xenstore-simplify-check_store.patch b/0114-tools-xenstore-simplify-check_store.patch deleted file mode 100644 index 6247114..0000000 --- a/0114-tools-xenstore-simplify-check_store.patch +++ /dev/null @@ -1,114 +0,0 @@ -From 4096512a70fd0bb65e40ed4269a1ca74dbb16220 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:12 +0200 -Subject: [PATCH 114/126] tools/xenstore: simplify check_store() - -check_store() is using a hash table for storing all node names it has -found via walking the tree. Additionally it using another hash table -for all children of a node to detect duplicate child names. - -Simplify that by dropping the second hash table as the first one is -already holding all the needed information. - -This is part of XSA-418 / CVE-2022-42321. 
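A rough, self-contained sketch of the simplification described above, not code from the patch: once every reachable node is recorded under its full path in one set, a duplicated child entry shows up as a second hit on the same path, so the per-node children table adds nothing. A plain array stands in for the hash table here and all names are invented.

#include <stdio.h>
#include <string.h>

#define MAX_SEEN 64

static const char *seen[MAX_SEEN];   /* stand-in for the "reachable" hash table */
static unsigned int nseen;

/* Record a full node path; return 0 if it was already present. */
static int remember(const char *path)
{
    unsigned int i;

    for (i = 0; i < nseen; i++)
        if (!strcmp(seen[i], path))
            return 0;
    if (nseen < MAX_SEEN)
        seen[nseen++] = path;
    return 1;
}

int main(void)
{
    /* Child entries of one node, with "b" listed twice. */
    const char *children[] = { "/a/b", "/a/c", "/a/b" };
    unsigned int i;

    for (i = 0; i < 3; i++)
        if (!remember(children[i]))
            printf("check_store: '%s' is duplicated!\n", children[i]);
    return 0;
}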
- -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 70f719f52a220bc5bc987e4dd28e14a7039a176b) ---- - tools/xenstore/xenstored_core.c | 47 +++++++++++---------------------- - 1 file changed, 15 insertions(+), 32 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 7463d0a002d7..a48255c64cad 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -2378,50 +2378,34 @@ static int check_store_(const char *name, struct hashtable *reachable) - if (node) { - size_t i = 0; - -- struct hashtable * children = -- create_hashtable(16, hash_from_key_fn, keys_equal_fn); -- if (!children) { -- log("check_store create table: ENOMEM"); -- return ENOMEM; -- } -- - if (!remember_string(reachable, name)) { -- hashtable_destroy(children, 0); - log("check_store: ENOMEM"); - return ENOMEM; - } - - while (i < node->childlen && !ret) { -- struct node *childnode; -+ struct node *childnode = NULL; - size_t childlen = strlen(node->children + i); -- char * childname = child_name(NULL, node->name, -- node->children + i); -+ char *childname = child_name(NULL, node->name, -+ node->children + i); - - if (!childname) { - log("check_store: ENOMEM"); - ret = ENOMEM; - break; - } -+ -+ if (hashtable_search(reachable, childname)) { -+ log("check_store: '%s' is duplicated!", -+ childname); -+ i = rm_child_entry(node, i, childlen); -+ goto next; -+ } -+ - childnode = read_node(NULL, childname, childname); -- -+ - if (childnode) { -- if (hashtable_search(children, childname)) { -- log("check_store: '%s' is duplicated!", -- childname); -- i = rm_child_entry(node, i, childlen); -- } -- else { -- if (!remember_string(children, -- childname)) { -- log("check_store: ENOMEM"); -- talloc_free(childnode); -- talloc_free(childname); -- ret = ENOMEM; -- break; -- } -- ret = check_store_(childname, -- reachable); -- } -+ ret = check_store_(childname, reachable); - } else if (errno != ENOMEM) { - log("check_store: No child '%s' found!\n", - childname); -@@ -2431,19 +2415,18 @@ static int check_store_(const char *name, struct hashtable *reachable) - ret = ENOMEM; - } - -+ next: - talloc_free(childnode); - talloc_free(childname); - i += childlen + 1; - } - -- hashtable_destroy(children, 0 /* Don't free values (they are -- all (void *)1) */); - talloc_free(node); - } else if (errno != ENOMEM) { - /* Impossible, because no database should ever be without the - root, and otherwise, we've just checked in our caller - (which made a recursive call to get here). */ -- -+ - log("check_store: No child '%s' found: impossible!", name); - } else { - log("check_store: ENOMEM"); --- -2.37.4 - diff --git a/0115-tools-xenstore-use-treewalk-for-check_store.patch b/0115-tools-xenstore-use-treewalk-for-check_store.patch deleted file mode 100644 index 74d58f4..0000000 --- a/0115-tools-xenstore-use-treewalk-for-check_store.patch +++ /dev/null @@ -1,172 +0,0 @@ -From a95277ee36e1db2f67e8091f4ea401975d341659 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:12 +0200 -Subject: [PATCH 115/126] tools/xenstore: use treewalk for check_store() - -Instead of doing an open tree walk using call recursion, use -walk_node_tree() when checking the store for inconsistencies. - -This will reduce code size and avoid many nesting levels of function -calls which could potentially exhaust the stack. - -This is part of XSA-418 / CVE-2022-42321. 
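A minimal, self-contained C sketch of the recursion-free walk pattern that walk_node_tree() relies on, given purely as an illustration (none of this is xenstored code; the struct and callback names are invented): an explicit parent link plus a per-node child cursor replace the call stack, so tree depth no longer turns into nested function calls.

#include <stdio.h>

struct demo_node {
    const char *name;
    struct demo_node *parent;     /* set while walking, like node->parent */
    struct demo_node **children;  /* NULL-terminated array of children    */
    unsigned int childoff;        /* walk cursor, like node->childoff     */
};

static void walk(struct demo_node *root,
                 void (*enter)(struct demo_node *),
                 void (*leave)(struct demo_node *))
{
    struct demo_node *node = root;

    node->parent = NULL;
    node->childoff = 0;
    enter(node);

    while (node) {
        if (node->children && node->children[node->childoff]) {
            /* Descend into the next unvisited child. */
            struct demo_node *child = node->children[node->childoff++];

            child->parent = node;
            child->childoff = 0;
            enter(child);
            node = child;
        } else {
            /* All children done: leave this node and go one step up. */
            leave(node);
            node = node->parent;
        }
    }
}

static void on_enter(struct demo_node *n) { printf("enter %s\n", n->name); }
static void on_leave(struct demo_node *n) { printf("leave %s\n", n->name); }

int main(void)
{
    struct demo_node b = { "b", NULL, NULL, 0 };
    struct demo_node c = { "c", NULL, NULL, 0 };
    struct demo_node *kids[] = { &b, &c, NULL };
    struct demo_node root = { "/", NULL, kids, 0 };

    walk(&root, on_enter, on_leave);
    return 0;
}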
- -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit a07cc0ec60612f414bedf2bafb26ec38d2602e95) ---- - tools/xenstore/xenstored_core.c | 109 +++++++++----------------------- - 1 file changed, 30 insertions(+), 79 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index a48255c64cad..ed8bc9b02ed2 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -2345,18 +2345,6 @@ int remember_string(struct hashtable *hash, const char *str) - return hashtable_insert(hash, k, (void *)1); - } - --static int rm_child_entry(struct node *node, size_t off, size_t len) --{ -- if (!recovery) -- return off; -- -- if (remove_child_entry(NULL, node, off)) -- log("check_store: child entry could not be removed from '%s'", -- node->name); -- -- return off - len - 1; --} -- - /** - * A node has a children field that names the children of the node, separated - * by NULs. We check whether there are entries in there that are duplicated -@@ -2370,70 +2358,29 @@ static int rm_child_entry(struct node *node, size_t off, size_t len) - * As we go, we record each node in the given reachable hashtable. These - * entries will be used later in clean_store. - */ --static int check_store_(const char *name, struct hashtable *reachable) -+static int check_store_step(const void *ctx, struct connection *conn, -+ struct node *node, void *arg) - { -- struct node *node = read_node(NULL, name, name); -- int ret = 0; -+ struct hashtable *reachable = arg; - -- if (node) { -- size_t i = 0; -- -- if (!remember_string(reachable, name)) { -- log("check_store: ENOMEM"); -- return ENOMEM; -- } -- -- while (i < node->childlen && !ret) { -- struct node *childnode = NULL; -- size_t childlen = strlen(node->children + i); -- char *childname = child_name(NULL, node->name, -- node->children + i); -- -- if (!childname) { -- log("check_store: ENOMEM"); -- ret = ENOMEM; -- break; -- } -- -- if (hashtable_search(reachable, childname)) { -- log("check_store: '%s' is duplicated!", -- childname); -- i = rm_child_entry(node, i, childlen); -- goto next; -- } -- -- childnode = read_node(NULL, childname, childname); -- -- if (childnode) { -- ret = check_store_(childname, reachable); -- } else if (errno != ENOMEM) { -- log("check_store: No child '%s' found!\n", -- childname); -- i = rm_child_entry(node, i, childlen); -- } else { -- log("check_store: ENOMEM"); -- ret = ENOMEM; -- } -- -- next: -- talloc_free(childnode); -- talloc_free(childname); -- i += childlen + 1; -- } -- -- talloc_free(node); -- } else if (errno != ENOMEM) { -- /* Impossible, because no database should ever be without the -- root, and otherwise, we've just checked in our caller -- (which made a recursive call to get here). */ -- -- log("check_store: No child '%s' found: impossible!", name); -- } else { -- log("check_store: ENOMEM"); -- ret = ENOMEM; -+ if (hashtable_search(reachable, (void *)node->name)) { -+ log("check_store: '%s' is duplicated!", node->name); -+ return recovery ? WALK_TREE_RM_CHILDENTRY -+ : WALK_TREE_SKIP_CHILDREN; - } - -- return ret; -+ if (!remember_string(reachable, node->name)) -+ return WALK_TREE_ERROR_STOP; -+ -+ return WALK_TREE_OK; -+} -+ -+static int check_store_enoent(const void *ctx, struct connection *conn, -+ struct node *parent, char *name, void *arg) -+{ -+ log("check_store: node '%s' not found", name); -+ -+ return recovery ? 
WALK_TREE_RM_CHILDENTRY : WALK_TREE_OK; - } - - -@@ -2482,24 +2429,28 @@ static void clean_store(struct hashtable *reachable) - - void check_store(void) - { -- char * root = talloc_strdup(NULL, "/"); -- struct hashtable * reachable = -- create_hashtable(16, hash_from_key_fn, keys_equal_fn); -- -+ struct hashtable *reachable; -+ struct walk_funcs walkfuncs = { -+ .enter = check_store_step, -+ .enoent = check_store_enoent, -+ }; -+ -+ reachable = create_hashtable(16, hash_from_key_fn, keys_equal_fn); - if (!reachable) { - log("check_store: ENOMEM"); - return; - } - - log("Checking store ..."); -- if (!check_store_(root, reachable) && -- !check_transactions(reachable)) -+ if (walk_node_tree(NULL, NULL, "/", &walkfuncs, reachable)) { -+ if (errno == ENOMEM) -+ log("check_store: ENOMEM"); -+ } else if (!check_transactions(reachable)) - clean_store(reachable); - log("Checking store complete."); - - hashtable_destroy(reachable, 0 /* Don't free values (they are all - (void *)1) */); -- talloc_free(root); - } - - --- -2.37.4 - diff --git a/0116-tools-xenstore-use-treewalk-for-deleting-nodes.patch b/0116-tools-xenstore-use-treewalk-for-deleting-nodes.patch deleted file mode 100644 index 2dcf32e..0000000 --- a/0116-tools-xenstore-use-treewalk-for-deleting-nodes.patch +++ /dev/null @@ -1,180 +0,0 @@ -From 9ead5845034c04a5c6e04d9b069d9c13141f4f33 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:12 +0200 -Subject: [PATCH 116/126] tools/xenstore: use treewalk for deleting nodes - -Instead of doing an open tree walk using call recursion, use -walk_node_tree() when deleting a sub-tree of nodes. - -This will reduce code size and avoid many nesting levels of function -calls which could potentially exhaust the stack. - -This is part of XSA-418 / CVE-2022-42321. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Acked-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit ea16962053a6849a6e7cada549ba7f8c586d85c6) ---- - tools/xenstore/xenstored_core.c | 99 ++++++++++++++------------------- - 1 file changed, 43 insertions(+), 56 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index ed8bc9b02ed2..9576411757fa 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -1300,21 +1300,6 @@ static int do_read(const void *ctx, struct connection *conn, - return 0; - } - --static void delete_node_single(struct connection *conn, struct node *node) --{ -- TDB_DATA key; -- -- if (access_node(conn, node, NODE_ACCESS_DELETE, &key)) -- return; -- -- if (do_tdb_delete(conn, &key, &node->acc) != 0) { -- corrupt(conn, "Could not delete '%s'", node->name); -- return; -- } -- -- domain_entry_dec(conn, node); --} -- - /* Must not be / */ - static char *basename(const char *name) - { -@@ -1585,69 +1570,59 @@ static int remove_child_entry(struct connection *conn, struct node *node, - return write_node(conn, node, true); - } - --static void delete_child(struct connection *conn, -- struct node *node, const char *childname) -+static int delete_child(struct connection *conn, -+ struct node *node, const char *childname) - { - unsigned int i; - - for (i = 0; i < node->childlen; i += strlen(node->children+i) + 1) { - if (streq(node->children+i, childname)) { -- if (remove_child_entry(conn, node, i)) -- corrupt(conn, "Can't update parent node '%s'", -- node->name); -- return; -+ errno = remove_child_entry(conn, node, i) ? 
EIO : 0; -+ return errno; - } - } - corrupt(conn, "Can't find child '%s' in %s", childname, node->name); -+ -+ errno = EIO; -+ return errno; - } - --static int delete_node(struct connection *conn, const void *ctx, -- struct node *parent, struct node *node, bool watch_exact) -+static int delnode_sub(const void *ctx, struct connection *conn, -+ struct node *node, void *arg) - { -- char *name; -+ const char *root = arg; -+ bool watch_exact; -+ int ret; -+ TDB_DATA key; - -- /* Delete children. */ -- while (node->childlen) { -- struct node *child; -+ /* Any error here will probably be repeated for all following calls. */ -+ ret = access_node(conn, node, NODE_ACCESS_DELETE, &key); -+ if (ret > 0) -+ return WALK_TREE_SUCCESS_STOP; - -- name = talloc_asprintf(node, "%s/%s", node->name, -- node->children); -- child = name ? read_node(conn, node, name) : NULL; -- if (child) { -- if (delete_node(conn, ctx, node, child, true)) -- return errno; -- } else { -- trace("delete_node: Error deleting child '%s/%s'!\n", -- node->name, node->children); -- /* Quit deleting. */ -- errno = ENOMEM; -- return errno; -- } -- talloc_free(name); -- } -+ /* In case of error stop the walk. */ -+ if (!ret && do_tdb_delete(conn, &key, &node->acc)) -+ return WALK_TREE_SUCCESS_STOP; - - /* - * Fire the watches now, when we can still see the node permissions. - * This fine as we are single threaded and the next possible read will - * be handled only after the node has been really removed. -- */ -+ */ -+ watch_exact = strcmp(root, node->name); - fire_watches(conn, ctx, node->name, node, watch_exact, NULL); -- delete_node_single(conn, node); -- delete_child(conn, parent, basename(node->name)); -- talloc_free(node); - -- return 0; -+ domain_entry_dec(conn, node); -+ -+ return WALK_TREE_RM_CHILDENTRY; - } - --static int _rm(struct connection *conn, const void *ctx, struct node *node, -- const char *name) -+static int _rm(struct connection *conn, const void *ctx, const char *name) - { -- /* -- * Deleting node by node, so the result is always consistent even in -- * case of a failure. -- */ - struct node *parent; - char *parentname = get_parent(ctx, name); -+ struct walk_funcs walkfuncs = { .exit = delnode_sub }; -+ int ret; - - if (!parentname) - return errno; -@@ -1655,9 +1630,21 @@ static int _rm(struct connection *conn, const void *ctx, struct node *node, - parent = read_node(conn, ctx, parentname); - if (!parent) - return read_node_can_propagate_errno() ? 
errno : EINVAL; -- node->parent = parent; - -- return delete_node(conn, ctx, parent, node, false); -+ ret = walk_node_tree(ctx, conn, name, &walkfuncs, (void *)name); -+ if (ret < 0) { -+ if (ret == WALK_TREE_ERROR_STOP) { -+ corrupt(conn, "error when deleting sub-nodes of %s\n", -+ name); -+ errno = EIO; -+ } -+ return errno; -+ } -+ -+ if (delete_child(conn, parent, basename(name))) -+ return errno; -+ -+ return 0; - } - - -@@ -1694,7 +1681,7 @@ static int do_rm(const void *ctx, struct connection *conn, - if (streq(name, "/")) - return EINVAL; - -- ret = _rm(conn, ctx, node, name); -+ ret = _rm(conn, ctx, name); - if (ret) - return ret; - --- -2.37.4 - diff --git a/0117-tools-xenstore-use-treewalk-for-creating-node-record.patch b/0117-tools-xenstore-use-treewalk-for-creating-node-record.patch deleted file mode 100644 index 6271169..0000000 --- a/0117-tools-xenstore-use-treewalk-for-creating-node-record.patch +++ /dev/null @@ -1,242 +0,0 @@ -From 84674f206778e9b3d8d67c6c76aa8094a262d5ec Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:12 +0200 -Subject: [PATCH 117/126] tools/xenstore: use treewalk for creating node - records - -Instead of doing an open tree walk using call recursion, use -walk_node_tree() when creating the node records during a live update. - -This will reduce code size and avoid many nesting levels of function -calls which could potentially exhaust the stack. - -This is part of XSA-418 / CVE-2022-42321. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 297ac246a5d8ed656b349641288f3402dcc0251e) ---- - tools/xenstore/xenstored_core.c | 127 ++++++++++++------------------ - tools/xenstore/xenstored_core.h | 3 +- - tools/xenstore/xenstored_domain.c | 2 +- - 3 files changed, 54 insertions(+), 78 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 9576411757fa..e8cdfeef50c7 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -2990,132 +2990,109 @@ const char *dump_state_buffered_data(FILE *fp, const struct connection *c, - return NULL; - } - --const char *dump_state_node_perms(FILE *fp, struct xs_state_node *sn, -- const struct xs_permissions *perms, -+const char *dump_state_node_perms(FILE *fp, const struct xs_permissions *perms, - unsigned int n_perms) - { - unsigned int p; - - for (p = 0; p < n_perms; p++) { -+ struct xs_state_node_perm sp; -+ - switch ((int)perms[p].perms & ~XS_PERM_IGNORE) { - case XS_PERM_READ: -- sn->perms[p].access = XS_STATE_NODE_PERM_READ; -+ sp.access = XS_STATE_NODE_PERM_READ; - break; - case XS_PERM_WRITE: -- sn->perms[p].access = XS_STATE_NODE_PERM_WRITE; -+ sp.access = XS_STATE_NODE_PERM_WRITE; - break; - case XS_PERM_READ | XS_PERM_WRITE: -- sn->perms[p].access = XS_STATE_NODE_PERM_BOTH; -+ sp.access = XS_STATE_NODE_PERM_BOTH; - break; - default: -- sn->perms[p].access = XS_STATE_NODE_PERM_NONE; -+ sp.access = XS_STATE_NODE_PERM_NONE; - break; - } -- sn->perms[p].flags = (perms[p].perms & XS_PERM_IGNORE) -+ sp.flags = (perms[p].perms & XS_PERM_IGNORE) - ? 
XS_STATE_NODE_PERM_IGNORE : 0; -- sn->perms[p].domid = perms[p].id; -- } -+ sp.domid = perms[p].id; - -- if (fwrite(sn->perms, sizeof(*sn->perms), n_perms, fp) != n_perms) -- return "Dump node permissions error"; -+ if (fwrite(&sp, sizeof(sp), 1, fp) != 1) -+ return "Dump node permissions error"; -+ } - - return NULL; - } - --static const char *dump_state_node_tree(FILE *fp, char *path) -+struct dump_node_data { -+ FILE *fp; -+ const char *err; -+}; -+ -+static int dump_state_node_err(struct dump_node_data *data, const char *err) - { -- unsigned int pathlen, childlen, p = 0; -+ data->err = err; -+ return WALK_TREE_ERROR_STOP; -+} -+ -+static int dump_state_node(const void *ctx, struct connection *conn, -+ struct node *node, void *arg) -+{ -+ struct dump_node_data *data = arg; -+ FILE *fp = data->fp; -+ unsigned int pathlen; - struct xs_state_record_header head; - struct xs_state_node sn; -- TDB_DATA key, data; -- const struct xs_tdb_record_hdr *hdr; -- const char *child; - const char *ret; - -- pathlen = strlen(path) + 1; -- -- set_tdb_key(path, &key); -- data = tdb_fetch(tdb_ctx, key); -- if (data.dptr == NULL) -- return "Error reading node"; -- -- /* Clean up in case of failure. */ -- talloc_steal(path, data.dptr); -- -- hdr = (void *)data.dptr; -+ pathlen = strlen(node->name) + 1; - - head.type = XS_STATE_TYPE_NODE; - head.length = sizeof(sn); - sn.conn_id = 0; - sn.ta_id = 0; - sn.ta_access = 0; -- sn.perm_n = hdr->num_perms; -+ sn.perm_n = node->perms.num; - sn.path_len = pathlen; -- sn.data_len = hdr->datalen; -- head.length += hdr->num_perms * sizeof(*sn.perms); -+ sn.data_len = node->datalen; -+ head.length += node->perms.num * sizeof(*sn.perms); - head.length += pathlen; -- head.length += hdr->datalen; -+ head.length += node->datalen; - head.length = ROUNDUP(head.length, 3); - - if (fwrite(&head, sizeof(head), 1, fp) != 1) -- return "Dump node state error"; -+ return dump_state_node_err(data, "Dump node head error"); - if (fwrite(&sn, sizeof(sn), 1, fp) != 1) -- return "Dump node state error"; -+ return dump_state_node_err(data, "Dump node state error"); - -- ret = dump_state_node_perms(fp, &sn, hdr->perms, hdr->num_perms); -+ ret = dump_state_node_perms(fp, node->perms.p, node->perms.num); - if (ret) -- return ret; -+ return dump_state_node_err(data, ret); - -- if (fwrite(path, pathlen, 1, fp) != 1) -- return "Dump node path error"; -- if (hdr->datalen && -- fwrite(hdr->perms + hdr->num_perms, hdr->datalen, 1, fp) != 1) -- return "Dump node data error"; -+ if (fwrite(node->name, pathlen, 1, fp) != 1) -+ return dump_state_node_err(data, "Dump node path error"); -+ -+ if (node->datalen && fwrite(node->data, node->datalen, 1, fp) != 1) -+ return dump_state_node_err(data, "Dump node data error"); - - ret = dump_state_align(fp); - if (ret) -- return ret; -+ return dump_state_node_err(data, ret); - -- child = (char *)(hdr->perms + hdr->num_perms) + hdr->datalen; -- -- /* -- * Use path for constructing children paths. -- * As we don't write out nodes without having written their parent -- * already we will never clobber a part of the path we'll need later. 
-- */ -- pathlen--; -- if (path[pathlen - 1] != '/') { -- path[pathlen] = '/'; -- pathlen++; -- } -- while (p < hdr->childlen) { -- childlen = strlen(child) + 1; -- if (pathlen + childlen > XENSTORE_ABS_PATH_MAX) -- return "Dump node path length error"; -- strcpy(path + pathlen, child); -- ret = dump_state_node_tree(fp, path); -- if (ret) -- return ret; -- p += childlen; -- child += childlen; -- } -- -- talloc_free(data.dptr); -- -- return NULL; -+ return WALK_TREE_OK; - } - - const char *dump_state_nodes(FILE *fp, const void *ctx) - { -- char *path; -+ struct dump_node_data data = { -+ .fp = fp, -+ .err = "Dump node walk error" -+ }; -+ struct walk_funcs walkfuncs = { .enter = dump_state_node }; - -- path = talloc_size(ctx, XENSTORE_ABS_PATH_MAX); -- if (!path) -- return "Path buffer allocation error"; -+ if (walk_node_tree(ctx, NULL, "/", &walkfuncs, &data)) -+ return data.err; - -- strcpy(path, "/"); -- -- return dump_state_node_tree(fp, path); -+ return NULL; - } - - void read_state_global(const void *ctx, const void *state) -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index f0fd8c352857..3190494bbeb5 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -326,8 +326,7 @@ const char *dump_state_buffered_data(FILE *fp, const struct connection *c, - const struct connection *conn, - struct xs_state_connection *sc); - const char *dump_state_nodes(FILE *fp, const void *ctx); --const char *dump_state_node_perms(FILE *fp, struct xs_state_node *sn, -- const struct xs_permissions *perms, -+const char *dump_state_node_perms(FILE *fp, const struct xs_permissions *perms, - unsigned int n_perms); - - void read_state_global(const void *ctx, const void *state); -diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c -index 8b503c2dfe07..a91cc75ab59b 100644 ---- a/tools/xenstore/xenstored_domain.c -+++ b/tools/xenstore/xenstored_domain.c -@@ -1449,7 +1449,7 @@ static const char *dump_state_special_node(FILE *fp, const char *name, - if (fwrite(&sn, sizeof(sn), 1, fp) != 1) - return "Dump special node error"; - -- ret = dump_state_node_perms(fp, &sn, perms->p, perms->num); -+ ret = dump_state_node_perms(fp, perms->p, perms->num); - if (ret) - return ret; - --- -2.37.4 - diff --git a/0118-tools-xenstore-remove-nodes-owned-by-destroyed-domai.patch b/0118-tools-xenstore-remove-nodes-owned-by-destroyed-domai.patch deleted file mode 100644 index a95a48e..0000000 --- a/0118-tools-xenstore-remove-nodes-owned-by-destroyed-domai.patch +++ /dev/null @@ -1,299 +0,0 @@ -From da87661d058c4a6cf2ea6439771b9834f1c06223 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:12 +0200 -Subject: [PATCH 118/126] tools/xenstore: remove nodes owned by destroyed - domain - -In case a domain is removed from Xenstore, remove all nodes owned by -it per default. - -This tackles the problem that nodes might be created by a domain -outside its home path in Xenstore, leading to Xenstore hogging more -and more memory. Domain quota don't work in this case if the guest is -rebooting in between. - -Since XSA-322 ownership of such stale nodes is transferred to dom0, -which is helping against unintended access, but not against OOM of -Xenstore. - -As a fallback for weird cases add a Xenstore start parameter for -keeping today's way to handle stale nodes, adding the risk of Xenstore -hitting an OOM situation. - -This is part of XSA-419 / CVE-2022-42322. 
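A minimal sketch of the per-node decision introduced here, with invented types and helpers (the real code walks xenstored's node tree and updates TDB): orphaned nodes are deleted by default, or handed to Dom0 when the new --keep-orphans switch asks for the old behaviour.

#include <stdbool.h>
#include <stdio.h>

struct fake_node {
    const char *path;
    unsigned int owner;   /* domid of the first permission entry */
};

#define DOM0 0u

static void handle_orphan(struct fake_node *n, unsigned int dead_domid,
                          bool keep_orphans)
{
    if (n->owner != dead_domid)
        return;                        /* not owned by the vanished domain */

    if (keep_orphans) {
        n->owner = DOM0;               /* old XSA-322 behaviour: reassign  */
        printf("moved %s to Dom0\n", n->path);
    } else {
        printf("deleting %s\n", n->path);
        /* real code: rm_node(...) and skip the children of this node */
    }
}

int main(void)
{
    struct fake_node nodes[] = {
        { "/local/domain/7/data", 7 },
        { "/tool/foo/bar",        7 },   /* created outside its home path */
        { "/local/domain/1/data", 1 },
    };
    unsigned int i;

    for (i = 0; i < sizeof(nodes) / sizeof(nodes[0]); i++)
        handle_orphan(&nodes[i], 7, false);
    return 0;
}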
- -Fixes: 496306324d8d ("tools/xenstore: revoke access rights for removed domains") -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 755d3f9debf8879448211fffb018f556136f6a79) ---- - tools/xenstore/xenstored_core.c | 17 +++++-- - tools/xenstore/xenstored_core.h | 4 ++ - tools/xenstore/xenstored_domain.c | 84 +++++++++++++++++++++++-------- - tools/xenstore/xenstored_domain.h | 2 +- - 4 files changed, 80 insertions(+), 27 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index e8cdfeef50c7..d5b2e59b0db6 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -80,6 +80,7 @@ static bool verbose = false; - LIST_HEAD(connections); - int tracefd = -1; - static bool recovery = true; -+bool keep_orphans = false; - static int reopen_log_pipe[2]; - static int reopen_log_pipe0_pollfd_idx = -1; - char *tracefile = NULL; -@@ -722,7 +723,7 @@ struct node *read_node(struct connection *conn, const void *ctx, - node->perms.p = hdr->perms; - node->acc.domid = node->perms.p[0].id; - node->acc.memory = data.dsize; -- if (domain_adjust_node_perms(conn, node)) -+ if (domain_adjust_node_perms(node)) - goto error; - - /* If owner is gone reset currently accounted memory size. */ -@@ -765,7 +766,7 @@ int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, - void *p; - struct xs_tdb_record_hdr *hdr; - -- if (domain_adjust_node_perms(conn, node)) -+ if (domain_adjust_node_perms(node)) - return errno; - - data.dsize = sizeof(*hdr) -@@ -1617,7 +1618,7 @@ static int delnode_sub(const void *ctx, struct connection *conn, - return WALK_TREE_RM_CHILDENTRY; - } - --static int _rm(struct connection *conn, const void *ctx, const char *name) -+int rm_node(struct connection *conn, const void *ctx, const char *name) - { - struct node *parent; - char *parentname = get_parent(ctx, name); -@@ -1681,7 +1682,7 @@ static int do_rm(const void *ctx, struct connection *conn, - if (streq(name, "/")) - return EINVAL; - -- ret = _rm(conn, ctx, name); -+ ret = rm_node(conn, ctx, name); - if (ret) - return ret; - -@@ -2537,6 +2538,8 @@ static void usage(void) - " -R, --no-recovery to request that no recovery should be attempted when\n" - " the store is corrupted (debug only),\n" - " -I, --internal-db store database in memory, not on disk\n" -+" -K, --keep-orphans don't delete nodes owned by a domain when the\n" -+" domain is deleted (this is a security risk!)\n" - " -V, --verbose to request verbose execution.\n"); - } - -@@ -2561,6 +2564,7 @@ static struct option options[] = { - { "timeout", 1, NULL, 'w' }, - { "no-recovery", 0, NULL, 'R' }, - { "internal-db", 0, NULL, 'I' }, -+ { "keep-orphans", 0, NULL, 'K' }, - { "verbose", 0, NULL, 'V' }, - { "watch-nb", 1, NULL, 'W' }, - #ifndef NO_LIVE_UPDATE -@@ -2641,7 +2645,7 @@ int main(int argc, char *argv[]) - orig_argc = argc; - orig_argv = argv; - -- while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:A:M:Q:q:T:RVW:w:U", -+ while ((opt = getopt_long(argc, argv, "DE:F:HKNPS:t:A:M:Q:q:T:RVW:w:U", - options, NULL)) != -1) { - switch (opt) { - case 'D': -@@ -2677,6 +2681,9 @@ int main(int argc, char *argv[]) - case 'I': - tdb_flags = TDB_INTERNAL|TDB_NOLOCK; - break; -+ case 'K': -+ keep_orphans = true; -+ break; - case 'V': - verbose = true; - break; -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index 3190494bbeb5..9a9dbb2c3c86 100644 ---- a/tools/xenstore/xenstored_core.h -+++ 
b/tools/xenstore/xenstored_core.h -@@ -233,6 +233,9 @@ int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, - struct node *read_node(struct connection *conn, const void *ctx, - const char *name); - -+/* Remove a node and its children. */ -+int rm_node(struct connection *conn, const void *ctx, const char *name); -+ - void setup_structure(bool live_update); - struct connection *new_connection(connwritefn_t *write, connreadfn_t *read); - struct connection *get_connection_by_id(unsigned int conn_id); -@@ -279,6 +282,7 @@ extern int quota_req_outstanding; - extern int quota_trans_nodes; - extern int quota_memory_per_domain_soft; - extern int quota_memory_per_domain_hard; -+extern bool keep_orphans; - - extern unsigned int timeout_watch_event_msec; - -diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c -index a91cc75ab59b..ee4b19387db8 100644 ---- a/tools/xenstore/xenstored_domain.c -+++ b/tools/xenstore/xenstored_domain.c -@@ -196,10 +196,64 @@ static void unmap_interface(void *interface) - xengnttab_unmap(*xgt_handle, interface, 1); - } - -+static int domain_tree_remove_sub(const void *ctx, struct connection *conn, -+ struct node *node, void *arg) -+{ -+ struct domain *domain = arg; -+ TDB_DATA key; -+ int ret = WALK_TREE_OK; -+ -+ if (node->perms.p[0].id != domain->domid) -+ return WALK_TREE_OK; -+ -+ if (keep_orphans) { -+ set_tdb_key(node->name, &key); -+ domain->nbentry--; -+ node->perms.p[0].id = priv_domid; -+ node->acc.memory = 0; -+ domain_entry_inc(NULL, node); -+ if (write_node_raw(NULL, &key, node, true)) { -+ /* That's unfortunate. We only can try to continue. */ -+ syslog(LOG_ERR, -+ "error when moving orphaned node %s to dom0\n", -+ node->name); -+ } else -+ trace("orphaned node %s moved to dom0\n", node->name); -+ } else { -+ if (rm_node(NULL, ctx, node->name)) { -+ /* That's unfortunate. We only can try to continue. */ -+ syslog(LOG_ERR, -+ "error when deleting orphaned node %s\n", -+ node->name); -+ } else -+ trace("orphaned node %s deleted\n", node->name); -+ -+ /* Skip children in all cases in order to avoid more errors. */ -+ ret = WALK_TREE_SKIP_CHILDREN; -+ } -+ -+ return domain->nbentry > 0 ? ret : WALK_TREE_SUCCESS_STOP; -+} -+ -+static void domain_tree_remove(struct domain *domain) -+{ -+ int ret; -+ struct walk_funcs walkfuncs = { .enter = domain_tree_remove_sub }; -+ -+ if (domain->nbentry > 0) { -+ ret = walk_node_tree(domain, NULL, "/", &walkfuncs, domain); -+ if (ret == WALK_TREE_ERROR_STOP) -+ syslog(LOG_ERR, -+ "error when looking for orphaned nodes\n"); -+ } -+} -+ - static int destroy_domain(void *_domain) - { - struct domain *domain = _domain; - -+ domain_tree_remove(domain); -+ - list_del(&domain->list); - - if (!domain->introduced) -@@ -857,15 +911,15 @@ int domain_entry_inc(struct connection *conn, struct node *node) - struct domain *d; - unsigned int domid; - -- if (!conn) -+ if (!node->perms.p) - return 0; - -- domid = node->perms.p ? node->perms.p[0].id : conn->id; -+ domid = node->perms.p[0].id; - -- if (conn->transaction) { -+ if (conn && conn->transaction) { - transaction_entry_inc(conn->transaction, domid); - } else { -- d = (domid == conn->id && conn->domain) ? conn->domain -+ d = (conn && domid == conn->id && conn->domain) ? 
conn->domain - : find_or_alloc_existing_domain(domid); - if (d) - d->nbentry++; -@@ -926,23 +980,11 @@ int domain_alloc_permrefs(struct node_perms *perms) - * Remove permissions for no longer existing domains in order to avoid a new - * domain with the same domid inheriting the permissions. - */ --int domain_adjust_node_perms(struct connection *conn, struct node *node) -+int domain_adjust_node_perms(struct node *node) - { - unsigned int i; - int ret; - -- ret = chk_domain_generation(node->perms.p[0].id, node->generation); -- -- /* If the owner doesn't exist any longer give it to priv domain. */ -- if (!ret) { -- /* -- * In theory we'd need to update the number of dom0 nodes here, -- * but we could be called for a read of the node. So better -- * avoid the risk to overflow the node count of dom0. -- */ -- node->perms.p[0].id = priv_domid; -- } -- - for (i = 1; i < node->perms.num; i++) { - if (node->perms.p[i].perms & XS_PERM_IGNORE) - continue; -@@ -960,15 +1002,15 @@ void domain_entry_dec(struct connection *conn, struct node *node) - struct domain *d; - unsigned int domid; - -- if (!conn) -+ if (!node->perms.p) - return; - - domid = node->perms.p ? node->perms.p[0].id : conn->id; - -- if (conn->transaction) { -+ if (conn && conn->transaction) { - transaction_entry_dec(conn->transaction, domid); - } else { -- d = (domid == conn->id && conn->domain) ? conn->domain -+ d = (conn && domid == conn->id && conn->domain) ? conn->domain - : find_domain_struct(domid); - if (d) { - d->nbentry--; -@@ -1087,7 +1129,7 @@ int domain_memory_add(unsigned int domid, int mem, bool no_quota_check) - * exist, as accounting is done either for a domain related to - * the current connection, or for the domain owning a node - * (which is always existing, as the owner of the node is -- * tested to exist and replaced by domid 0 if not). -+ * tested to exist and deleted or replaced by domid 0 if not). - * So not finding the related domain MUST be an error in the - * data base. - */ -diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h -index 0b4f56b8146c..491d7a325bd3 100644 ---- a/tools/xenstore/xenstored_domain.h -+++ b/tools/xenstore/xenstored_domain.h -@@ -65,7 +65,7 @@ bool domain_can_write(struct connection *conn); - bool domain_is_unprivileged(struct connection *conn); - - /* Remove node permissions for no longer existing domains. */ --int domain_adjust_node_perms(struct connection *conn, struct node *node); -+int domain_adjust_node_perms(struct node *node); - int domain_alloc_permrefs(struct node_perms *perms); - - /* Quota manipulation */ --- -2.37.4 - diff --git a/0119-tools-xenstore-make-the-internal-memory-data-base-th.patch b/0119-tools-xenstore-make-the-internal-memory-data-base-th.patch deleted file mode 100644 index 8c1611b..0000000 --- a/0119-tools-xenstore-make-the-internal-memory-data-base-th.patch +++ /dev/null @@ -1,101 +0,0 @@ -From 4269999ecedf79452a3fbbfab842f045d1ece16e Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:13 +0200 -Subject: [PATCH 119/126] tools/xenstore: make the internal memory data base - the default - -Having a file backed data base has the only advantage of being capable -to dump the contents of it while Xenstore is running, and potentially -using less swap space in case the data base can't be kept in memory. 
- -It has the major disadvantage of a huge performance overhead: switching -to keep the data base in memory only speeds up live update of xenstored -with 120000 nodes from 20 minutes to 11 seconds. A complete tree walk -of this configuration will be reduced from 7 seconds to 280 msecs -(measured by "xenstore-control check"). - -So make the internal memory data base the default and enhance the -"--internal-db" command line parameter to take an optional parameter -allowing to switch the internal data base back to the file based one. - -This is part of XSA-419. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit d174fefa90487ddd25ebc618028f67b2e8a1f795) ---- - tools/helpers/init-xenstore-domain.c | 4 ++-- - tools/xenstore/xenstored_core.c | 13 ++++++++----- - 2 files changed, 10 insertions(+), 7 deletions(-) - -diff --git a/tools/helpers/init-xenstore-domain.c b/tools/helpers/init-xenstore-domain.c -index 32689abd7479..d080dae5d3b8 100644 ---- a/tools/helpers/init-xenstore-domain.c -+++ b/tools/helpers/init-xenstore-domain.c -@@ -214,9 +214,9 @@ static int build(xc_interface *xch) - } - - if ( param ) -- snprintf(cmdline, 512, "--event %d --internal-db %s", rv, param); -+ snprintf(cmdline, 512, "--event %d %s", rv, param); - else -- snprintf(cmdline, 512, "--event %d --internal-db", rv); -+ snprintf(cmdline, 512, "--event %d", rv); - - dom->cmdline = xc_dom_strdup(dom, cmdline); - dom->xenstore_domid = domid; -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index d5b2e59b0db6..9ddbd934f794 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -2230,7 +2230,7 @@ static void accept_connection(int sock) - } - #endif - --static int tdb_flags; -+static int tdb_flags = TDB_INTERNAL | TDB_NOLOCK; - - /* We create initial nodes manually. 
*/ - static void manual_node(const char *name, const char *child) -@@ -2537,7 +2537,8 @@ static void usage(void) - " watch-event: time a watch-event is kept pending\n" - " -R, --no-recovery to request that no recovery should be attempted when\n" - " the store is corrupted (debug only),\n" --" -I, --internal-db store database in memory, not on disk\n" -+" -I, --internal-db [on|off] store database in memory, not on disk, default is\n" -+" memory, with \"--internal-db off\" it is on disk\n" - " -K, --keep-orphans don't delete nodes owned by a domain when the\n" - " domain is deleted (this is a security risk!)\n" - " -V, --verbose to request verbose execution.\n"); -@@ -2563,7 +2564,7 @@ static struct option options[] = { - { "quota-soft", 1, NULL, 'q' }, - { "timeout", 1, NULL, 'w' }, - { "no-recovery", 0, NULL, 'R' }, -- { "internal-db", 0, NULL, 'I' }, -+ { "internal-db", 2, NULL, 'I' }, - { "keep-orphans", 0, NULL, 'K' }, - { "verbose", 0, NULL, 'V' }, - { "watch-nb", 1, NULL, 'W' }, -@@ -2645,7 +2646,8 @@ int main(int argc, char *argv[]) - orig_argc = argc; - orig_argv = argv; - -- while ((opt = getopt_long(argc, argv, "DE:F:HKNPS:t:A:M:Q:q:T:RVW:w:U", -+ while ((opt = getopt_long(argc, argv, -+ "DE:F:HI::KNPS:t:A:M:Q:q:T:RVW:w:U", - options, NULL)) != -1) { - switch (opt) { - case 'D': -@@ -2679,7 +2681,8 @@ int main(int argc, char *argv[]) - tracefile = optarg; - break; - case 'I': -- tdb_flags = TDB_INTERNAL|TDB_NOLOCK; -+ if (optarg && !strcmp(optarg, "off")) -+ tdb_flags = 0; - break; - case 'K': - keep_orphans = true; --- -2.37.4 - diff --git a/0120-docs-enhance-xenstore.txt-with-permissions-descripti.patch b/0120-docs-enhance-xenstore.txt-with-permissions-descripti.patch deleted file mode 100644 index e0d7d9e..0000000 --- a/0120-docs-enhance-xenstore.txt-with-permissions-descripti.patch +++ /dev/null @@ -1,51 +0,0 @@ -From bc3921135cf8590d0f587f460be431922183c4c4 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:13 +0200 -Subject: [PATCH 120/126] docs: enhance xenstore.txt with permissions - description -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The permission scheme of Xenstore nodes is not really covered by -docs/misc/xenstore.txt, other than referring to the Xen wiki. - -Add a paragraph explaining the permissions of nodes, and especially -mentioning removal of nodes when a domain has been removed from -Xenstore. - -This is part of XSA-419. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Edwin Török <edvin.torok@citrix.com> -Acked-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit d084d2c6dff7044956ebdf83a259ad6081a1d921) ---- - docs/misc/xenstore.txt | 11 +++++++++++ - 1 file changed, 11 insertions(+) - -diff --git a/docs/misc/xenstore.txt b/docs/misc/xenstore.txt -index a7d006519ae8..eccd596ee38c 100644 ---- a/docs/misc/xenstore.txt -+++ b/docs/misc/xenstore.txt -@@ -43,6 +43,17 @@ bytes are forbidden; clients specifying relative paths should keep - them to within 2048 bytes. (See XENSTORE_*_PATH_MAX in xs_wire.h.) - - -+Each node has one or multiple permission entries. Permissions are -+granted by domain-id, the first permission entry of each node specifies -+the owner of the node. Permissions of a node can be changed by the -+owner of the node, the owner can only be modified by the control -+domain (usually domain id 0). 
The owner always has the right to read -+and write the node, while other permissions can be setup to allow -+read and/or write access. When a domain is being removed from Xenstore -+nodes owned by that domain will be removed together with all of those -+nodes' children. -+ -+ - Communication with xenstore is via either sockets, or event channel - and shared memory, as specified in io/xs_wire.h: each message in - either direction is a header formatted as a struct xsd_sockmsg --- -2.37.4 - diff --git a/0121-tools-ocaml-xenstored-Fix-quota-bypass-on-domain-shu.patch b/0121-tools-ocaml-xenstored-Fix-quota-bypass-on-domain-shu.patch deleted file mode 100644 index 722700e..0000000 --- a/0121-tools-ocaml-xenstored-Fix-quota-bypass-on-domain-shu.patch +++ /dev/null @@ -1,93 +0,0 @@ -From b9ede0950b3a6526d5ccea074841f093e0580948 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> -Date: Wed, 12 Oct 2022 19:13:06 +0100 -Subject: [PATCH 121/126] tools/ocaml/xenstored: Fix quota bypass on domain - shutdown -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -XSA-322 fixed a domid reuse vulnerability by assigning Dom0 as the owner of -any nodes left after a domain is shutdown (e.g. outside its /local/domain/N -tree). - -However Dom0 has no quota on purpose, so this opened up another potential -attack vector. Avoid it by deleting these nodes instead of assigning them to -Dom0. - -This is part of XSA-419 / CVE-2022-42323. - -Fixes: c46eff921209 ("tools/ocaml/xenstored: clean up permissions for dead domains") -Signed-off-by: Edwin Török <edvin.torok@citrix.com> -Acked-by: Christian Lindig <christian.lindig@citrix.com> -(cherry picked from commit db471408edd46af403b8bd44d180a928ad7fbb80) ---- - tools/ocaml/xenstored/perms.ml | 3 +-- - tools/ocaml/xenstored/store.ml | 29 +++++++++++++++++++++-------- - 2 files changed, 22 insertions(+), 10 deletions(-) - -diff --git a/tools/ocaml/xenstored/perms.ml b/tools/ocaml/xenstored/perms.ml -index e8a16221f8fa..84f2503e8e29 100644 ---- a/tools/ocaml/xenstored/perms.ml -+++ b/tools/ocaml/xenstored/perms.ml -@@ -64,8 +64,7 @@ let get_owner perm = perm.owner - * *) - let remove_domid ~domid perm = - let acl = List.filter (fun (acl_domid, _) -> acl_domid <> domid) perm.acl in -- let owner = if perm.owner = domid then 0 else perm.owner in -- { perm with acl; owner } -+ if perm.owner = domid then None else Some { perm with acl; owner = perm.owner } - - let default0 = create 0 NONE [] - -diff --git a/tools/ocaml/xenstored/store.ml b/tools/ocaml/xenstored/store.ml -index 20e67b142746..70f0c83de404 100644 ---- a/tools/ocaml/xenstored/store.ml -+++ b/tools/ocaml/xenstored/store.ml -@@ -87,10 +87,21 @@ let check_owner node connection = - - let rec recurse fct node = fct node; SymbolMap.iter (fun _ -> recurse fct) node.children - --(** [recurse_map f tree] applies [f] on each node in the tree recursively *) --let recurse_map f = -+(** [recurse_filter_map f tree] applies [f] on each node in the tree recursively, -+ possibly removing some nodes. -+ Note that the nodes removed this way won't generate watch events. 
-+*) -+let recurse_filter_map f = -+ let invalid = -1 in -+ let is_valid _ node = node.perms.owner <> invalid in - let rec walk node = -- f { node with children = SymbolMap.map walk node.children } -+ (* Map.filter_map is Ocaml 4.11+ only *) -+ let node = -+ { node with children = -+ SymbolMap.map walk node.children |> SymbolMap.filter is_valid } in -+ match f node with -+ | Some keep -> keep -+ | None -> { node with perms = {node.perms with owner = invalid } } - in - walk - -@@ -444,11 +455,13 @@ let setperms store perm path nperms = - - let reset_permissions store domid = - Logging.info "store|node" "Cleaning up xenstore ACLs for domid %d" domid; -- store.root <- Node.recurse_map (fun node -> -- let perms = Perms.Node.remove_domid ~domid node.perms in -- if perms <> node.perms then -- Logging.debug "store|node" "Changed permissions for node %s" (Node.get_name node); -- { node with perms } -+ store.root <- Node.recurse_filter_map (fun node -> -+ match Perms.Node.remove_domid ~domid node.perms with -+ | None -> None -+ | Some perms -> -+ if perms <> node.perms then -+ Logging.debug "store|node" "Changed permissions for node %s" (Node.get_name node); -+ Some { node with perms } - ) store.root - - type ops = { --- -2.37.4 - diff --git a/0122-tools-ocaml-Ensure-packet-size-is-never-negative.patch b/0122-tools-ocaml-Ensure-packet-size-is-never-negative.patch deleted file mode 100644 index 35a14f1..0000000 --- a/0122-tools-ocaml-Ensure-packet-size-is-never-negative.patch +++ /dev/null @@ -1,75 +0,0 @@ -From d3649d33e1eae49d3925ef34a7ccf39cae8852e6 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> -Date: Wed, 12 Oct 2022 19:13:05 +0100 -Subject: [PATCH 122/126] tools/ocaml: Ensure packet size is never negative -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Integers in Ocaml have 63 or 31 bits of signed precision. - -On 64-bit builds of Ocaml, this is fine because a C uint32_t always fits -within a 63-bit signed integer. - -In 32-bit builds of Ocaml, this goes wrong. The C uint32_t is truncated -first (loses the top bit), then has a unsigned/signed mismatch. - -A "negative" value (i.e. a packet on the ring of between 1G and 2G in size) -will trigger an exception later in Bytes.make in xb.ml, and because the packet -is not removed from the ring, the exception re-triggers on every subsequent -query, creating a livelock. - -Fix both the source of the exception in Xb, and as defence in depth, mark the -domain as bad for any Invalid_argument exceptions to avoid the risk of -livelock. - -This is XSA-420 / CVE-2022-42324. 
- -Signed-off-by: Edwin Török <edvin.torok@citrix.com> -Acked-by: Christian Lindig <christian.lindig@citrix.com> -(cherry picked from commit ae34df4d82636f4c82700b447ea2c93b9f82b3f3) ---- - tools/ocaml/libs/xb/partial.ml | 6 +++--- - tools/ocaml/xenstored/process.ml | 2 +- - 2 files changed, 4 insertions(+), 4 deletions(-) - -diff --git a/tools/ocaml/libs/xb/partial.ml b/tools/ocaml/libs/xb/partial.ml -index b6e2a716e263..3aa8927eb7f0 100644 ---- a/tools/ocaml/libs/xb/partial.ml -+++ b/tools/ocaml/libs/xb/partial.ml -@@ -36,7 +36,7 @@ let of_string s = - This will leave the guest connection is a bad state and will - be hard to recover from without restarting the connection - (ie rebooting the guest) *) -- let dlen = min xenstore_payload_max dlen in -+ let dlen = max 0 (min xenstore_payload_max dlen) in - { - tid = tid; - rid = rid; -@@ -46,8 +46,8 @@ let of_string s = - } - - let append pkt s sz = -- if pkt.len > 4096 then failwith "Buffer.add: cannot grow buffer"; -- Buffer.add_string pkt.buf (String.sub s 0 sz) -+ if Buffer.length pkt.buf + sz > xenstore_payload_max then failwith "Buffer.add: cannot grow buffer"; -+ Buffer.add_substring pkt.buf s 0 sz - - let to_complete pkt = - pkt.len - (Buffer.length pkt.buf) -diff --git a/tools/ocaml/xenstored/process.ml b/tools/ocaml/xenstored/process.ml -index ce39ce28b5f3..6cb990ee7fb2 100644 ---- a/tools/ocaml/xenstored/process.ml -+++ b/tools/ocaml/xenstored/process.ml -@@ -722,7 +722,7 @@ let do_input store cons doms con = - History.reconnect con; - info "%s reconnection complete" (Connection.get_domstr con); - None -- | Failure exp -> -+ | Invalid_argument exp | Failure exp -> - error "caught exception %s" exp; - error "got a bad client %s" (sprintf "%-8s" (Connection.get_domstr con)); - Connection.mark_as_bad con; --- -2.37.4 - diff --git a/0123-tools-xenstore-fix-deleting-node-in-transaction.patch b/0123-tools-xenstore-fix-deleting-node-in-transaction.patch deleted file mode 100644 index efa7178..0000000 --- a/0123-tools-xenstore-fix-deleting-node-in-transaction.patch +++ /dev/null @@ -1,46 +0,0 @@ -From 2d3476effe3a9236867562f14dc26979a6527080 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:13 +0200 -Subject: [PATCH 123/126] tools/xenstore: fix deleting node in transaction - -In case a node has been created in a transaction and it is later -deleted in the same transaction, the transaction will be terminated -with an error. - -As this error is encountered only when handling the deleted node at -transaction finalization, the transaction will have been performed -partially and without updating the accounting information. This will -enable a malicious guest to create arbitrary number of nodes. - -This is part of XSA-421 / CVE-2022-42325. 
- -Signed-off-by: Juergen Gross <jgross@suse.com> -Tested-by: Julien Grall <jgrall@amazon.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 13ac37f1416cae88d97f7baf6cf2a827edb9a187) ---- - tools/xenstore/xenstored_transaction.c | 8 +++++++- - 1 file changed, 7 insertions(+), 1 deletion(-) - -diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c -index 3e3eb47326cc..7ffe21bb5285 100644 ---- a/tools/xenstore/xenstored_transaction.c -+++ b/tools/xenstore/xenstored_transaction.c -@@ -418,7 +418,13 @@ static int finalize_transaction(struct connection *conn, - true); - talloc_free(data.dptr); - } else { -- ret = do_tdb_delete(conn, &key, NULL); -+ /* -+ * A node having been created and later deleted -+ * in this transaction will have no generation -+ * information stored. -+ */ -+ ret = (i->generation == NO_GENERATION) -+ ? 0 : do_tdb_delete(conn, &key, NULL); - } - if (ret) - goto err; --- -2.37.4 - diff --git a/0124-tools-xenstore-harden-transaction-finalization-again.patch b/0124-tools-xenstore-harden-transaction-finalization-again.patch deleted file mode 100644 index 8279aeb..0000000 --- a/0124-tools-xenstore-harden-transaction-finalization-again.patch +++ /dev/null @@ -1,410 +0,0 @@ -From e818f4f0dabf83a6138cd77d7464495fab7bfc16 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:14 +0200 -Subject: [PATCH 124/126] tools/xenstore: harden transaction finalization - against errors - -When finalizing a transaction, any error occurring after checking for -conflicts will result in the transaction being performed only -partially today. Additionally accounting data will not be updated at -the end of the transaction, which might result in further problems -later. - -Avoid those problems by multiple modifications: - -- free any transaction specific nodes which don't need to be committed - as they haven't been written during the transaction as soon as their - generation count has been verified, this will reduce the risk of - out-of-memory situations - -- store the transaction specific node name in struct accessed_node in - order to avoid the need to allocate additional memory for it when - finalizing the transaction - -- don't stop the transaction finalization when hitting an error - condition, but try to continue to handle all modified nodes - -- in case of a detected error do the accounting update as needed and - call the data base checking only after that - -- if writing a node in a transaction is failing (e.g. due to a failed - quota check), fail the transaction, as prior changes to struct - accessed_node can't easily be undone in that case - -This is part of XSA-421 / CVE-2022-42326. 
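A minimal sketch of the error-handling pattern the patch switches to, with hypothetical helpers (the real code writes TDB records, fires watches and updates accounting): failures during finalization no longer abort half-way through, they only raise a corruption flag that is acted on once at the end.

#include <stdbool.h>
#include <stdio.h>

/* Pretend to commit one node of the transaction; node 2 fails. */
static bool commit_one(int i)
{
    return i != 2;
}

static void finalize(int nr_nodes)
{
    bool is_corrupt = false;
    int i;

    for (i = 0; i < nr_nodes; i++)
        is_corrupt |= !commit_one(i);   /* keep going instead of bailing out */

    /* Accounting updates would happen here whether or not anything failed. */

    if (is_corrupt)
        printf("data base corruption detected, run the store check\n");
}

int main(void)
{
    finalize(4);
    return 0;
}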
- -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -Tested-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 2dd823ca7237e7fb90c890642d6a3b357a26fcff) ---- - tools/xenstore/xenstored_core.c | 16 ++- - tools/xenstore/xenstored_transaction.c | 171 +++++++++++-------------- - tools/xenstore/xenstored_transaction.h | 4 +- - 3 files changed, 92 insertions(+), 99 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 9ddbd934f794..3c008c8cd455 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -692,8 +692,7 @@ struct node *read_node(struct connection *conn, const void *ctx, - return NULL; - } - -- if (transaction_prepend(conn, name, &key)) -- return NULL; -+ transaction_prepend(conn, name, &key); - - data = tdb_fetch(tdb_ctx, key); - -@@ -811,10 +810,21 @@ int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, - static int write_node(struct connection *conn, struct node *node, - bool no_quota_check) - { -+ int ret; -+ - if (access_node(conn, node, NODE_ACCESS_WRITE, &node->key)) - return errno; - -- return write_node_raw(conn, &node->key, node, no_quota_check); -+ ret = write_node_raw(conn, &node->key, node, no_quota_check); -+ if (ret && conn && conn->transaction) { -+ /* -+ * Reverting access_node() is hard, so just fail the -+ * transaction. -+ */ -+ fail_transaction(conn->transaction); -+ } -+ -+ return ret; - } - - enum xs_perm_type perm_for_conn(struct connection *conn, -diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c -index 7ffe21bb5285..ac854197cadb 100644 ---- a/tools/xenstore/xenstored_transaction.c -+++ b/tools/xenstore/xenstored_transaction.c -@@ -114,7 +114,8 @@ struct accessed_node - struct list_head list; - - /* The name of the node. */ -- char *node; -+ char *trans_name; /* Transaction specific name. */ -+ char *node; /* Main data base name. */ - - /* Generation count (or NO_GENERATION) for conflict checking. */ - uint64_t generation; -@@ -199,25 +200,20 @@ static char *transaction_get_node_name(void *ctx, struct transaction *trans, - * Prepend the transaction to name if node has been modified in the current - * transaction. 
- */ --int transaction_prepend(struct connection *conn, const char *name, -- TDB_DATA *key) -+void transaction_prepend(struct connection *conn, const char *name, -+ TDB_DATA *key) - { -- char *tdb_name; -+ struct accessed_node *i; - -- if (!conn || !conn->transaction || -- !find_accessed_node(conn->transaction, name)) { -- set_tdb_key(name, key); -- return 0; -+ if (conn && conn->transaction) { -+ i = find_accessed_node(conn->transaction, name); -+ if (i) { -+ set_tdb_key(i->trans_name, key); -+ return; -+ } - } - -- tdb_name = transaction_get_node_name(conn->transaction, -- conn->transaction, name); -- if (!tdb_name) -- return errno; -- -- set_tdb_key(tdb_name, key); -- -- return 0; -+ set_tdb_key(name, key); - } - - /* -@@ -240,7 +236,6 @@ int access_node(struct connection *conn, struct node *node, - struct accessed_node *i = NULL; - struct transaction *trans; - TDB_DATA local_key; -- const char *trans_name = NULL; - int ret; - bool introduce = false; - -@@ -259,10 +254,6 @@ int access_node(struct connection *conn, struct node *node, - - trans = conn->transaction; - -- trans_name = transaction_get_node_name(node, trans, node->name); -- if (!trans_name) -- goto nomem; -- - i = find_accessed_node(trans, node->name); - if (!i) { - if (trans->nodes >= quota_trans_nodes && -@@ -273,9 +264,10 @@ int access_node(struct connection *conn, struct node *node, - i = talloc_zero(trans, struct accessed_node); - if (!i) - goto nomem; -- i->node = talloc_strdup(i, node->name); -- if (!i->node) -+ i->trans_name = transaction_get_node_name(i, trans, node->name); -+ if (!i->trans_name) - goto nomem; -+ i->node = strchr(i->trans_name, '/') + 1; - if (node->generation != NO_GENERATION && node->perms.num) { - i->perms.p = talloc_array(i, struct xs_permissions, - node->perms.num); -@@ -302,7 +294,7 @@ int access_node(struct connection *conn, struct node *node, - i->generation = node->generation; - i->check_gen = true; - if (node->generation != NO_GENERATION) { -- set_tdb_key(trans_name, &local_key); -+ set_tdb_key(i->trans_name, &local_key); - ret = write_node_raw(conn, &local_key, node, true); - if (ret) - goto err; -@@ -321,7 +313,7 @@ int access_node(struct connection *conn, struct node *node, - return -1; - - if (key) { -- set_tdb_key(trans_name, key); -+ set_tdb_key(i->trans_name, key); - if (type == NODE_ACCESS_WRITE) - i->ta_node = true; - if (type == NODE_ACCESS_DELETE) -@@ -333,7 +325,6 @@ int access_node(struct connection *conn, struct node *node, - nomem: - ret = ENOMEM; - err: -- talloc_free((void *)trans_name); - talloc_free(i); - trans->fail = true; - errno = ret; -@@ -371,100 +362,90 @@ void queue_watches(struct connection *conn, const char *name, bool watch_exact) - * base. 
- */ - static int finalize_transaction(struct connection *conn, -- struct transaction *trans) -+ struct transaction *trans, bool *is_corrupt) - { -- struct accessed_node *i; -+ struct accessed_node *i, *n; - TDB_DATA key, ta_key, data; - struct xs_tdb_record_hdr *hdr; - uint64_t gen; -- char *trans_name; -- int ret; - -- list_for_each_entry(i, &trans->accessed, list) { -- if (!i->check_gen) -- continue; -+ list_for_each_entry_safe(i, n, &trans->accessed, list) { -+ if (i->check_gen) { -+ set_tdb_key(i->node, &key); -+ data = tdb_fetch(tdb_ctx, key); -+ hdr = (void *)data.dptr; -+ if (!data.dptr) { -+ if (tdb_error(tdb_ctx) != TDB_ERR_NOEXIST) -+ return EIO; -+ gen = NO_GENERATION; -+ } else -+ gen = hdr->generation; -+ talloc_free(data.dptr); -+ if (i->generation != gen) -+ return EAGAIN; -+ } - -- set_tdb_key(i->node, &key); -- data = tdb_fetch(tdb_ctx, key); -- hdr = (void *)data.dptr; -- if (!data.dptr) { -- if (tdb_error(tdb_ctx) != TDB_ERR_NOEXIST) -- return EIO; -- gen = NO_GENERATION; -- } else -- gen = hdr->generation; -- talloc_free(data.dptr); -- if (i->generation != gen) -- return EAGAIN; -+ /* Entries for unmodified nodes can be removed early. */ -+ if (!i->modified) { -+ if (i->ta_node) { -+ set_tdb_key(i->trans_name, &ta_key); -+ if (do_tdb_delete(conn, &ta_key, NULL)) -+ return EIO; -+ } -+ list_del(&i->list); -+ talloc_free(i); -+ } - } - - while ((i = list_top(&trans->accessed, struct accessed_node, list))) { -- trans_name = transaction_get_node_name(i, trans, i->node); -- if (!trans_name) -- /* We are doomed: the transaction is only partial. */ -- goto err; -- -- set_tdb_key(trans_name, &ta_key); -- -- if (i->modified) { -- set_tdb_key(i->node, &key); -- if (i->ta_node) { -- data = tdb_fetch(tdb_ctx, ta_key); -- if (!data.dptr) -- goto err; -+ set_tdb_key(i->node, &key); -+ if (i->ta_node) { -+ set_tdb_key(i->trans_name, &ta_key); -+ data = tdb_fetch(tdb_ctx, ta_key); -+ if (data.dptr) { - hdr = (void *)data.dptr; - hdr->generation = ++generation; -- ret = do_tdb_write(conn, &key, &data, NULL, -- true); -+ *is_corrupt |= do_tdb_write(conn, &key, &data, -+ NULL, true); - talloc_free(data.dptr); -+ if (do_tdb_delete(conn, &ta_key, NULL)) -+ *is_corrupt = true; - } else { -- /* -- * A node having been created and later deleted -- * in this transaction will have no generation -- * information stored. -- */ -- ret = (i->generation == NO_GENERATION) -- ? 0 : do_tdb_delete(conn, &key, NULL); -- } -- if (ret) -- goto err; -- if (i->fire_watch) { -- fire_watches(conn, trans, i->node, NULL, -- i->watch_exact, -- i->perms.p ? &i->perms : NULL); -+ *is_corrupt = true; - } -+ } else { -+ /* -+ * A node having been created and later deleted -+ * in this transaction will have no generation -+ * information stored. -+ */ -+ *is_corrupt |= (i->generation == NO_GENERATION) -+ ? false -+ : do_tdb_delete(conn, &key, NULL); - } -+ if (i->fire_watch) -+ fire_watches(conn, trans, i->node, NULL, i->watch_exact, -+ i->perms.p ? 
&i->perms : NULL); - -- if (i->ta_node && do_tdb_delete(conn, &ta_key, NULL)) -- goto err; - list_del(&i->list); - talloc_free(i); - } - - return 0; -- --err: -- corrupt(conn, "Partial transaction"); -- return EIO; - } - - static int destroy_transaction(void *_transaction) - { - struct transaction *trans = _transaction; - struct accessed_node *i; -- char *trans_name; - TDB_DATA key; - - wrl_ntransactions--; - trace_destroy(trans, "transaction"); - while ((i = list_top(&trans->accessed, struct accessed_node, list))) { - if (i->ta_node) { -- trans_name = transaction_get_node_name(i, trans, -- i->node); -- if (trans_name) { -- set_tdb_key(trans_name, &key); -- do_tdb_delete(trans->conn, &key, NULL); -- } -+ set_tdb_key(i->trans_name, &key); -+ do_tdb_delete(trans->conn, &key, NULL); - } - list_del(&i->list); - talloc_free(i); -@@ -556,6 +537,7 @@ int do_transaction_end(const void *ctx, struct connection *conn, - { - const char *arg = onearg(in); - struct transaction *trans; -+ bool is_corrupt = false; - int ret; - - if (!arg || (!streq(arg, "T") && !streq(arg, "F"))) -@@ -579,13 +561,17 @@ int do_transaction_end(const void *ctx, struct connection *conn, - ret = transaction_fix_domains(trans, false); - if (ret) - return ret; -- if (finalize_transaction(conn, trans)) -- return EAGAIN; -+ ret = finalize_transaction(conn, trans, &is_corrupt); -+ if (ret) -+ return ret; - - wrl_apply_debit_trans_commit(conn); - - /* fix domain entry for each changed domain */ - transaction_fix_domains(trans, true); -+ -+ if (is_corrupt) -+ corrupt(conn, "transaction inconsistency"); - } - send_ack(conn, XS_TRANSACTION_END); - -@@ -660,7 +646,7 @@ int check_transactions(struct hashtable *hash) - struct connection *conn; - struct transaction *trans; - struct accessed_node *i; -- char *tname, *tnode; -+ char *tname; - - list_for_each_entry(conn, &connections, list) { - list_for_each_entry(trans, &conn->transaction_list, list) { -@@ -672,11 +658,8 @@ int check_transactions(struct hashtable *hash) - list_for_each_entry(i, &trans->accessed, list) { - if (!i->ta_node) - continue; -- tnode = transaction_get_node_name(tname, trans, -- i->node); -- if (!tnode || !remember_string(hash, tnode)) -+ if (!remember_string(hash, i->trans_name)) - goto nomem; -- talloc_free(tnode); - } - - talloc_free(tname); -diff --git a/tools/xenstore/xenstored_transaction.h b/tools/xenstore/xenstored_transaction.h -index 39d7f81c5127..3417303f9427 100644 ---- a/tools/xenstore/xenstored_transaction.h -+++ b/tools/xenstore/xenstored_transaction.h -@@ -48,8 +48,8 @@ int __must_check access_node(struct connection *conn, struct node *node, - void queue_watches(struct connection *conn, const char *name, bool watch_exact); - - /* Prepend the transaction to name if appropriate. */ --int transaction_prepend(struct connection *conn, const char *name, -- TDB_DATA *key); -+void transaction_prepend(struct connection *conn, const char *name, -+ TDB_DATA *key); - - /* Mark the transaction as failed. This will prevent it to be committed. 
*/ - void fail_transaction(struct transaction *trans); --- -2.37.4 - diff --git a/0125-x86-spec-ctrl-Enumeration-for-IBPB_RET.patch b/0125-x86-spec-ctrl-Enumeration-for-IBPB_RET.patch deleted file mode 100644 index f1667ac..0000000 --- a/0125-x86-spec-ctrl-Enumeration-for-IBPB_RET.patch +++ /dev/null @@ -1,82 +0,0 @@ -From 07be0fe497349ed423c5201bdc410b6281ebf04f Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 14 Jun 2022 16:18:36 +0100 -Subject: [PATCH 125/126] x86/spec-ctrl: Enumeration for IBPB_RET - -The IBPB_RET bit indicates that the CPU's implementation of MSR_PRED_CMD.IBPB -does flush the RSB/RAS too. - -This is part of XSA-422 / CVE-2022-23824. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Acked-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 24496558e650535bdbd22cc04731e82276cd1b3f) ---- - tools/libs/light/libxl_cpuid.c | 1 + - tools/misc/xen-cpuid.c | 1 + - xen/arch/x86/spec_ctrl.c | 5 +++-- - xen/include/public/arch-x86/cpufeatureset.h | 1 + - 4 files changed, 6 insertions(+), 2 deletions(-) - -diff --git a/tools/libs/light/libxl_cpuid.c b/tools/libs/light/libxl_cpuid.c -index 2632efc6adb0..4cc2f211b878 100644 ---- a/tools/libs/light/libxl_cpuid.c -+++ b/tools/libs/light/libxl_cpuid.c -@@ -284,6 +284,7 @@ int libxl_cpuid_parse_config(libxl_cpuid_policy_list *cpuid, const char* str) - {"ssb-no", 0x80000008, NA, CPUID_REG_EBX, 26, 1}, - {"psfd", 0x80000008, NA, CPUID_REG_EBX, 28, 1}, - {"btc-no", 0x80000008, NA, CPUID_REG_EBX, 29, 1}, -+ {"ibpb-ret", 0x80000008, NA, CPUID_REG_EBX, 30, 1}, - - {"nc", 0x80000008, NA, CPUID_REG_ECX, 0, 8}, - {"apicidsize", 0x80000008, NA, CPUID_REG_ECX, 12, 4}, -diff --git a/tools/misc/xen-cpuid.c b/tools/misc/xen-cpuid.c -index e83bc4793d6e..5c944c24fe36 100644 ---- a/tools/misc/xen-cpuid.c -+++ b/tools/misc/xen-cpuid.c -@@ -158,6 +158,7 @@ static const char *const str_e8b[32] = - [24] = "amd-ssbd", [25] = "virt-ssbd", - [26] = "ssb-no", - [28] = "psfd", [29] = "btc-no", -+ [30] = "ibpb-ret", - }; - - static const char *const str_7d0[32] = -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 3ff602bd0281..459c64d139b6 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -419,7 +419,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - * Hardware read-only information, stating immunity to certain issues, or - * suggestions of which mitigation to use. - */ -- printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", -+ printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", - (caps & ARCH_CAPS_RDCL_NO) ? " RDCL_NO" : "", - (caps & ARCH_CAPS_IBRS_ALL) ? " IBRS_ALL" : "", - (caps & ARCH_CAPS_RSBA) ? " RSBA" : "", -@@ -436,7 +436,8 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - (e8b & cpufeat_mask(X86_FEATURE_STIBP_ALWAYS)) ? " STIBP_ALWAYS" : "", - (e8b & cpufeat_mask(X86_FEATURE_IBRS_FAST)) ? " IBRS_FAST" : "", - (e8b & cpufeat_mask(X86_FEATURE_IBRS_SAME_MODE)) ? " IBRS_SAME_MODE" : "", -- (e8b & cpufeat_mask(X86_FEATURE_BTC_NO)) ? " BTC_NO" : ""); -+ (e8b & cpufeat_mask(X86_FEATURE_BTC_NO)) ? " BTC_NO" : "", -+ (e8b & cpufeat_mask(X86_FEATURE_IBPB_RET)) ? " IBPB_RET" : ""); - - /* Hardware features which need driving to mitigate issues. 
*/ - printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s\n", -diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h -index 1bbc7da4b53c..41a358d575d3 100644 ---- a/xen/include/public/arch-x86/cpufeatureset.h -+++ b/xen/include/public/arch-x86/cpufeatureset.h -@@ -266,6 +266,7 @@ XEN_CPUFEATURE(VIRT_SSBD, 8*32+25) /* MSR_VIRT_SPEC_CTRL.SSBD */ - XEN_CPUFEATURE(SSB_NO, 8*32+26) /*A Hardware not vulnerable to SSB */ - XEN_CPUFEATURE(PSFD, 8*32+28) /*S MSR_SPEC_CTRL.PSFD */ - XEN_CPUFEATURE(BTC_NO, 8*32+29) /*A Hardware not vulnerable to Branch Type Confusion */ -+XEN_CPUFEATURE(IBPB_RET, 8*32+30) /*A IBPB clears RSB/RAS too. */ - - /* Intel-defined CPU features, CPUID level 0x00000007:0.edx, word 9 */ - XEN_CPUFEATURE(AVX512_4VNNIW, 9*32+ 2) /*A AVX512 Neural Network Instructions */ --- -2.37.4 - diff --git a/0126-x86-spec-ctrl-Mitigate-IBPB-not-flushing-the-RSB-RAS.patch b/0126-x86-spec-ctrl-Mitigate-IBPB-not-flushing-the-RSB-RAS.patch deleted file mode 100644 index 2abb0f2..0000000 --- a/0126-x86-spec-ctrl-Mitigate-IBPB-not-flushing-the-RSB-RAS.patch +++ /dev/null @@ -1,113 +0,0 @@ -From 32445f23fea6a533fc1d7ade5871246d75210bf1 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 14 Jun 2022 16:18:36 +0100 -Subject: [PATCH 126/126] x86/spec-ctrl: Mitigate IBPB not flushing the RSB/RAS - -Introduce spec_ctrl_new_guest_context() to encapsulate all logic pertaining to -using MSR_PRED_CMD for a new guest context, even if it only has one user -presently. - -Introduce X86_BUG_IBPB_NO_RET, and use it extend spec_ctrl_new_guest_context() -with a manual fixup for hardware which mis-implements IBPB. - -This is part of XSA-422 / CVE-2022-23824. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Acked-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 2b27967fb89d7904a1571a2fb963b1c9cac548db) ---- - xen/arch/x86/asm-macros.c | 1 + - xen/arch/x86/domain.c | 2 +- - xen/arch/x86/spec_ctrl.c | 8 ++++++++ - xen/include/asm-x86/cpufeatures.h | 1 + - xen/include/asm-x86/spec_ctrl.h | 22 ++++++++++++++++++++++ - 5 files changed, 33 insertions(+), 1 deletion(-) - -diff --git a/xen/arch/x86/asm-macros.c b/xen/arch/x86/asm-macros.c -index 7e536b0d82f5..891d86c7655c 100644 ---- a/xen/arch/x86/asm-macros.c -+++ b/xen/arch/x86/asm-macros.c -@@ -1,2 +1,3 @@ - #include <asm/asm-defns.h> - #include <asm/alternative-asm.h> -+#include <asm/spec_ctrl_asm.h> -diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c -index e9b8ed4c96c2..b82e18dd62d8 100644 ---- a/xen/arch/x86/domain.c -+++ b/xen/arch/x86/domain.c -@@ -2069,7 +2069,7 @@ void context_switch(struct vcpu *prev, struct vcpu *next) - */ - if ( *last_id != next_id ) - { -- wrmsrl(MSR_PRED_CMD, PRED_CMD_IBPB); -+ spec_ctrl_new_guest_context(); - *last_id = next_id; - } - } -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 459c64d139b6..5636853aae6b 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -775,6 +775,14 @@ static void __init ibpb_calculations(void) - return; - } - -+ /* -+ * AMD/Hygon CPUs to date (June 2022) don't flush the the RAS. Future -+ * CPUs are expected to enumerate IBPB_RET when this has been fixed. -+ * Until then, cover the difference with the software sequence. -+ */ -+ if ( boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_IBPB_RET) ) -+ setup_force_cpu_cap(X86_BUG_IBPB_NO_RET); -+ - /* - * IBPB-on-entry mitigations for Branch Type Confusion. 
- * -diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h -index b233e5835fb5..bdb119a34c5d 100644 ---- a/xen/include/asm-x86/cpufeatures.h -+++ b/xen/include/asm-x86/cpufeatures.h -@@ -48,6 +48,7 @@ XEN_CPUFEATURE(IBPB_ENTRY_HVM, X86_SYNTH(29)) /* MSR_PRED_CMD used by Xen for - - #define X86_BUG_FPU_PTRS X86_BUG( 0) /* (F)X{SAVE,RSTOR} doesn't save/restore FOP/FIP/FDP. */ - #define X86_BUG_CLFLUSH_MFENCE X86_BUG( 2) /* MFENCE needed to serialise CLFLUSH */ -+#define X86_BUG_IBPB_NO_RET X86_BUG( 3) /* IBPB doesn't flush the RSB/RAS */ - - /* Total number of capability words, inc synth and bug words. */ - #define NCAPINTS (FSCAPINTS + X86_NR_SYNTH + X86_NR_BUG) /* N 32-bit words worth of info */ -diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h -index 33e845991b0a..e400ff227391 100644 ---- a/xen/include/asm-x86/spec_ctrl.h -+++ b/xen/include/asm-x86/spec_ctrl.h -@@ -65,6 +65,28 @@ - void init_speculation_mitigations(void); - void spec_ctrl_init_domain(struct domain *d); - -+/* -+ * Switch to a new guest prediction context. -+ * -+ * This flushes all indirect branch predictors (BTB, RSB/RAS), so guest code -+ * which has previously run on this CPU can't attack subsequent guest code. -+ * -+ * As this flushes the RSB/RAS, it destroys the predictions of the calling -+ * context. For best performace, arrange for this to be used when we're going -+ * to jump out of the current context, e.g. with reset_stack_and_jump(). -+ * -+ * For hardware which mis-implements IBPB, fix up by flushing the RSB/RAS -+ * manually. -+ */ -+static always_inline void spec_ctrl_new_guest_context(void) -+{ -+ wrmsrl(MSR_PRED_CMD, PRED_CMD_IBPB); -+ -+ /* (ab)use alternative_input() to specify clobbers. */ -+ alternative_input("", "DO_OVERWRITE_RSB", X86_BUG_IBPB_NO_RET, -+ : "rax", "rcx"); -+} -+ - extern int8_t opt_ibpb_ctxt_switch; - extern bool opt_ssbd; - extern int8_t opt_eager_fpu; --- -2.37.4 - @@ -1,6 +1,6 @@ -Xen upstream patchset #2 for 4.15.4-pre +Xen upstream patchset #0 for 4.15.5-pre Containing patches from -RELEASE-4.15.3 (feecaf4abf733e83b7a297190819eca7a7f65168) +RELEASE-4.15.4 (460047c177fe7b0b1e5eb7d42ffb400e680f521f) to -staging-4.15 (32445f23fea6a533fc1d7ade5871246d75210bf1) +staging-4.15 (7963cdbf91d8a8d2f8338171adab3807b20f658a) |