summaryrefslogtreecommitdiff
blob: 4a46326b052386890737741215066f660f5c97c2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
From 846fb984b506135917c2862d2e4607005d6afdeb Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Tue, 2 Apr 2024 16:20:09 +0200
Subject: [PATCH 65/67] x86/boot: Improve the boot watchdog determination of
 stuck cpus
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Right now, check_nmi_watchdog() has two processing loops over all online CPUs
using prev_nmi_count as storage.

Use a cpumask_t instead (1/32nd as much initdata) and have wait_for_nmis()
determine for itself whether the CPU is stuck, rather than requiring both
functions to agree on how many ticks mean stuck.

More importantly though, it means we can use the standard cpumask
infrastructure, including turning this:

  (XEN) Brought up 512 CPUs
  (XEN) Testing NMI watchdog on all CPUs: {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,443,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,466,467,468,469,470,471,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511} stuck

into the rather more manageable:

  (XEN) Brought up 512 CPUs
  (XEN) Testing NMI watchdog on all CPUs: {0-511} stuck

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
master commit: 9e18f339830c828798aef465556d4029d83476a0
master date: 2024-03-19 18:29:37 +0000
---
 xen/arch/x86/nmi.c | 33 ++++++++++++++-------------------
 1 file changed, 14 insertions(+), 19 deletions(-)

diff --git a/xen/arch/x86/nmi.c b/xen/arch/x86/nmi.c
index 7c9591b65e..dd31034ac8 100644
--- a/xen/arch/x86/nmi.c
+++ b/xen/arch/x86/nmi.c
@@ -150,6 +150,8 @@ int nmi_active;
 
 static void __init cf_check wait_for_nmis(void *p)
 {
+    cpumask_t *stuck_cpus = p;
+    unsigned int cpu = smp_processor_id();
     unsigned int start_count = this_cpu(nmi_count);
     unsigned long ticks = 10 * 1000 * cpu_khz / nmi_hz;
     unsigned long s, e;
@@ -158,42 +160,35 @@ static void __init cf_check wait_for_nmis(void *p)
     do {
         cpu_relax();
         if ( this_cpu(nmi_count) >= start_count + 2 )
-            break;
+            return;
+
         e = rdtsc();
-    } while( e - s < ticks );
+    } while ( e - s < ticks );
+
+    /* Timeout.  Mark ourselves as stuck. */
+    cpumask_set_cpu(cpu, stuck_cpus);
 }
 
 void __init check_nmi_watchdog(void)
 {
-    static unsigned int __initdata prev_nmi_count[NR_CPUS];
-    int cpu;
-    bool ok = true;
+    static cpumask_t __initdata stuck_cpus;
 
     if ( nmi_watchdog == NMI_NONE )
         return;
 
     printk("Testing NMI watchdog on all CPUs:");
 
-    for_each_online_cpu ( cpu )
-        prev_nmi_count[cpu] = per_cpu(nmi_count, cpu);
-
     /*
      * Wait at most 10 ticks for 2 watchdog NMIs on each CPU.
      * Busy-wait on all CPUs: the LAPIC counter that the NMI watchdog
      * uses only runs while the core's not halted
      */
-    on_selected_cpus(&cpu_online_map, wait_for_nmis, NULL, 1);
-
-    for_each_online_cpu ( cpu )
-    {
-        if ( per_cpu(nmi_count, cpu) - prev_nmi_count[cpu] < 2 )
-        {
-            printk(" %d", cpu);
-            ok = false;
-        }
-    }
+    on_selected_cpus(&cpu_online_map, wait_for_nmis, &stuck_cpus, 1);
 
-    printk(" %s\n", ok ? "ok" : "stuck");
+    if ( cpumask_empty(&stuck_cpus) )
+        printk("ok\n");
+    else
+        printk("{%*pbl} stuck\n", CPUMASK_PR(&stuck_cpus));
 
     /*
      * Now that we know it works we can reduce NMI frequency to
-- 
2.44.0