Skip to content

Commit cada8ca

Browse files
jchu314atgithub authored and gregkh committed
sparc64: Measure receiver forward progress to avoid send mondo timeout
[ Upstream commit 9d53caec84c7c5700e7c1ed744ea584fff55f9ac ] A large sun4v SPARC system may have moments of intensive xcall activities, usually caused by unmapping many pages on many CPUs concurrently. This can flood receivers with CPU mondo interrupts for an extended period, causing some unlucky senders to hit send-mondo timeout. This problem gets worse as cpu count increases because sometimes mappings must be invalidated on all CPUs, and sometimes all CPUs may gang up on a single CPU. But a busy system is not a broken system. In the above scenario, as long as the receiver is making forward progress processing mondo interrupts, the sender should continue to retry. This patch implements the receiver's forward progress meter by introducing a per cpu counter 'cpu_mondo_counter[cpu]' where 'cpu' is in the range of 0..NR_CPUS. The receiver increments its counter as soon as it receives a mondo and the sender tracks the receiver's counter. If the receiver has stopped making forward progress when the retry limit is reached, the sender declares send-mondo-timeout and panic; otherwise, the receiver is allowed to keep making forward progress. In addition, it's been observed that PCIe hotplug events generate Correctable Errors that are handled by hypervisor and then OS. Hypervisor 'borrows' a guest cpu strand briefly to provide the service. If the cpu strand is simultaneously the only cpu targeted by a mondo, it may not be available for the mondo in 20msec, causing SUN4V mondo timeout. It appears that 1 second is the agreed wait time between hypervisor and guest OS, this patch makes the adjustment. Orabug: 25476541 Orabug: 26417466 Signed-off-by: Jane Chu <jane.chu@oracle.com> Reviewed-by: Steve Sistare <steven.sistare@oracle.com> Reviewed-by: Anthony Yznaga <anthony.yznaga@oracle.com> Reviewed-by: Rob Gardner <rob.gardner@oracle.com> Reviewed-by: Thomas Tai <thomas.tai@oracle.com> Signed-off-by: David S. 
Miller <davem@davemloft.net> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
1 parent 7c37101 commit cada8ca

4 files changed

Lines changed: 132 additions & 70 deletions

File tree

arch/sparc/include/asm/trap_block.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ extern struct trap_per_cpu trap_block[NR_CPUS];
5454
void init_cur_cpu_trap(struct thread_info *);
5555
void setup_tba(void);
5656
extern int ncpus_probed;
57+
extern u64 cpu_mondo_counter[NR_CPUS];
5758

5859
unsigned long real_hard_smp_processor_id(void);
5960

arch/sparc/kernel/smp_64.c

Lines changed: 115 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -617,117 +617,162 @@ static void cheetah_xcall_deliver(struct trap_per_cpu *tb, int cnt)
617617
}
618618
}
619619

620-
/* Multi-cpu list version. */
620+
#define CPU_MONDO_COUNTER(cpuid) (cpu_mondo_counter[cpuid])
621+
#define MONDO_USEC_WAIT_MIN 2
622+
#define MONDO_USEC_WAIT_MAX 100
623+
#define MONDO_RETRY_LIMIT 500000
624+
625+
/* Multi-cpu list version.
626+
*
627+
* Deliver xcalls to 'cnt' number of cpus in 'cpu_list'.
628+
* Sometimes not all cpus receive the mondo, requiring us to re-send
629+
* the mondo until all cpus have received, or cpus are truly stuck
630+
* unable to receive mondo, and we timeout.
631+
* Occasionally a target cpu strand is borrowed briefly by hypervisor to
632+
* perform guest service, such as PCIe error handling. Consider the
633+
* service time, 1 second overall wait is reasonable for 1 cpu.
634+
* Here two in-between mondo check wait time are defined: 2 usec for
635+
* single cpu quick turn around and up to 100usec for large cpu count.
636+
* Deliver mondo to large number of cpus could take longer, we adjusts
637+
* the retry count as long as target cpus are making forward progress.
638+
*/
621639
static void hypervisor_xcall_deliver(struct trap_per_cpu *tb, int cnt)
622640
{
623-
int retries, this_cpu, prev_sent, i, saw_cpu_error;
641+
int this_cpu, tot_cpus, prev_sent, i, rem;
642+
int usec_wait, retries, tot_retries;
643+
u16 first_cpu = 0xffff;
644+
unsigned long xc_rcvd = 0;
624645
unsigned long status;
646+
int ecpuerror_id = 0;
647+
int enocpu_id = 0;
625648
u16 *cpu_list;
649+
u16 cpu;
626650

627651
this_cpu = smp_processor_id();
628-
629652
cpu_list = __va(tb->cpu_list_pa);
630-
631-
saw_cpu_error = 0;
632-
retries = 0;
653+
usec_wait = cnt * MONDO_USEC_WAIT_MIN;
654+
if (usec_wait > MONDO_USEC_WAIT_MAX)
655+
usec_wait = MONDO_USEC_WAIT_MAX;
656+
retries = tot_retries = 0;
657+
tot_cpus = cnt;
633658
prev_sent = 0;
659+
634660
do {
635-
int forward_progress, n_sent;
661+
int n_sent, mondo_delivered, target_cpu_busy;
636662

637663
status = sun4v_cpu_mondo_send(cnt,
638664
tb->cpu_list_pa,
639665
tb->cpu_mondo_block_pa);
640666

641667
/* HV_EOK means all cpus received the xcall, we're done. */
642668
if (likely(status == HV_EOK))
643-
break;
669+
goto xcall_done;
670+
671+
/* If not these non-fatal errors, panic */
672+
if (unlikely((status != HV_EWOULDBLOCK) &&
673+
(status != HV_ECPUERROR) &&
674+
(status != HV_ENOCPU)))
675+
goto fatal_errors;
644676

645677
/* First, see if we made any forward progress.
678+
*
679+
* Go through the cpu_list, count the target cpus that have
680+
* received our mondo (n_sent), and those that did not (rem).
681+
* Re-pack cpu_list with the cpus remain to be retried in the
682+
* front - this simplifies tracking the truly stalled cpus.
646683
*
647684
* The hypervisor indicates successful sends by setting
648685
* cpu list entries to the value 0xffff.
686+
*
687+
* EWOULDBLOCK means some target cpus did not receive the
688+
* mondo and retry usually helps.
689+
*
690+
* ECPUERROR means at least one target cpu is in error state,
691+
* it's usually safe to skip the faulty cpu and retry.
692+
*
693+
* ENOCPU means one of the target cpu doesn't belong to the
694+
* domain, perhaps offlined which is unexpected, but not
695+
* fatal and it's okay to skip the offlined cpu.
649696
*/
697+
rem = 0;
650698
n_sent = 0;
651699
for (i = 0; i < cnt; i++) {
652-
if (likely(cpu_list[i] == 0xffff))
700+
cpu = cpu_list[i];
701+
if (likely(cpu == 0xffff)) {
653702
n_sent++;
703+
} else if ((status == HV_ECPUERROR) &&
704+
(sun4v_cpu_state(cpu) == HV_CPU_STATE_ERROR)) {
705+
ecpuerror_id = cpu + 1;
706+
} else if (status == HV_ENOCPU && !cpu_online(cpu)) {
707+
enocpu_id = cpu + 1;
708+
} else {
709+
cpu_list[rem++] = cpu;
710+
}
654711
}
655712

656-
forward_progress = 0;
657-
if (n_sent > prev_sent)
658-
forward_progress = 1;
713+
/* No cpu remained, we're done. */
714+
if (rem == 0)
715+
break;
659716

660-
prev_sent = n_sent;
717+
/* Otherwise, update the cpu count for retry. */
718+
cnt = rem;
661719

662-
/* If we get a HV_ECPUERROR, then one or more of the cpus
663-
* in the list are in error state. Use the cpu_state()
664-
* hypervisor call to find out which cpus are in error state.
720+
/* Record the overall number of mondos received by the
721+
* first of the remaining cpus.
665722
*/
666-
if (unlikely(status == HV_ECPUERROR)) {
667-
for (i = 0; i < cnt; i++) {
668-
long err;
669-
u16 cpu;
723+
if (first_cpu != cpu_list[0]) {
724+
first_cpu = cpu_list[0];
725+
xc_rcvd = CPU_MONDO_COUNTER(first_cpu);
726+
}
670727

671-
cpu = cpu_list[i];
672-
if (cpu == 0xffff)
673-
continue;
728+
/* Was any mondo delivered successfully? */
729+
mondo_delivered = (n_sent > prev_sent);
730+
prev_sent = n_sent;
674731

675-
err = sun4v_cpu_state(cpu);
676-
if (err == HV_CPU_STATE_ERROR) {
677-
saw_cpu_error = (cpu + 1);
678-
cpu_list[i] = 0xffff;
679-
}
680-
}
681-
} else if (unlikely(status != HV_EWOULDBLOCK))
682-
goto fatal_mondo_error;
732+
/* or, was any target cpu busy processing other mondos? */
733+
target_cpu_busy = (xc_rcvd < CPU_MONDO_COUNTER(first_cpu));
734+
xc_rcvd = CPU_MONDO_COUNTER(first_cpu);
683735

684-
/* Don't bother rewriting the CPU list, just leave the
685-
* 0xffff and non-0xffff entries in there and the
686-
* hypervisor will do the right thing.
687-
*
688-
* Only advance timeout state if we didn't make any
689-
* forward progress.
736+
/* Retry count is for no progress. If we're making progress,
737+
* reset the retry count.
690738
*/
691-
if (unlikely(!forward_progress)) {
692-
if (unlikely(++retries > 10000))
693-
goto fatal_mondo_timeout;
694-
695-
/* Delay a little bit to let other cpus catch up
696-
* on their cpu mondo queue work.
697-
*/
698-
udelay(2 * cnt);
739+
if (likely(mondo_delivered || target_cpu_busy)) {
740+
tot_retries += retries;
741+
retries = 0;
742+
} else if (unlikely(retries > MONDO_RETRY_LIMIT)) {
743+
goto fatal_mondo_timeout;
699744
}
700-
} while (1);
701745

702-
if (unlikely(saw_cpu_error))
703-
goto fatal_mondo_cpu_error;
746+
/* Delay a little bit to let other cpus catch up on
747+
* their cpu mondo queue work.
748+
*/
749+
if (!mondo_delivered)
750+
udelay(usec_wait);
704751

705-
return;
752+
retries++;
753+
} while (1);
706754

707-
fatal_mondo_cpu_error:
708-
printk(KERN_CRIT "CPU[%d]: SUN4V mondo cpu error, some target cpus "
709-
"(including %d) were in error state\n",
710-
this_cpu, saw_cpu_error - 1);
755+
xcall_done:
756+
if (unlikely(ecpuerror_id > 0)) {
757+
pr_crit("CPU[%d]: SUN4V mondo cpu error, target cpu(%d) was in error state\n",
758+
this_cpu, ecpuerror_id - 1);
759+
} else if (unlikely(enocpu_id > 0)) {
760+
pr_crit("CPU[%d]: SUN4V mondo cpu error, target cpu(%d) does not belong to the domain\n",
761+
this_cpu, enocpu_id - 1);
762+
}
711763
return;
712764

765+
fatal_errors:
766+
/* fatal errors include bad alignment, etc */
767+
pr_crit("CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) mondo_block_pa(%lx)\n",
768+
this_cpu, tot_cpus, tb->cpu_list_pa, tb->cpu_mondo_block_pa);
769+
panic("Unexpected SUN4V mondo error %lu\n", status);
770+
713771
fatal_mondo_timeout:
714-
printk(KERN_CRIT "CPU[%d]: SUN4V mondo timeout, no forward "
715-
" progress after %d retries.\n",
716-
this_cpu, retries);
717-
goto dump_cpu_list_and_out;
718-
719-
fatal_mondo_error:
720-
printk(KERN_CRIT "CPU[%d]: Unexpected SUN4V mondo error %lu\n",
721-
this_cpu, status);
722-
printk(KERN_CRIT "CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) "
723-
"mondo_block_pa(%lx)\n",
724-
this_cpu, cnt, tb->cpu_list_pa, tb->cpu_mondo_block_pa);
725-
726-
dump_cpu_list_and_out:
727-
printk(KERN_CRIT "CPU[%d]: CPU list [ ", this_cpu);
728-
for (i = 0; i < cnt; i++)
729-
printk("%u ", cpu_list[i]);
730-
printk("]\n");
772+
/* some cpus being non-responsive to the cpu mondo */
773+
pr_crit("CPU[%d]: SUN4V mondo timeout, cpu(%d) made no forward progress after %d retries. Total target cpus(%d).\n",
774+
this_cpu, first_cpu, (tot_retries + retries), tot_cpus);
775+
panic("SUN4V mondo timeout panic\n");
731776
}
732777

733778
static void (*xcall_deliver_impl)(struct trap_per_cpu *, int);

arch/sparc/kernel/sun4v_ivec.S

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,21 @@ sun4v_cpu_mondo:
2626
ldxa [%g0] ASI_SCRATCHPAD, %g4
2727
sub %g4, TRAP_PER_CPU_FAULT_INFO, %g4
2828

29+
/* Get smp_processor_id() into %g3 */
30+
sethi %hi(trap_block), %g5
31+
or %g5, %lo(trap_block), %g5
32+
sub %g4, %g5, %g3
33+
srlx %g3, TRAP_BLOCK_SZ_SHIFT, %g3
34+
35+
/* Increment cpu_mondo_counter[smp_processor_id()] */
36+
sethi %hi(cpu_mondo_counter), %g5
37+
or %g5, %lo(cpu_mondo_counter), %g5
38+
sllx %g3, 3, %g3
39+
add %g5, %g3, %g5
40+
ldx [%g5], %g3
41+
add %g3, 1, %g3
42+
stx %g3, [%g5]
43+
2944
/* Get CPU mondo queue base phys address into %g7. */
3045
ldx [%g4 + TRAP_PER_CPU_CPU_MONDO_PA], %g7
3146

arch/sparc/kernel/traps_64.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2659,6 +2659,7 @@ void do_getpsr(struct pt_regs *regs)
26592659
}
26602660
}
26612661

2662+
u64 cpu_mondo_counter[NR_CPUS] = {0};
26622663
struct trap_per_cpu trap_block[NR_CPUS];
26632664
EXPORT_SYMBOL(trap_block);
26642665

0 commit comments

Comments
 (0)