From 86f78d2bdd6c019c437c586734e72c5264337e98 Mon Sep 17 00:00:00 2001 From: Eric Lordahl Date: Mon, 8 Jun 2026 09:33:57 -0400 Subject: [PATCH] 99-mellanox: fix verbs/iface array skew, skip interfaceless devices Three independent sysfs globs (infiniband_verbs, infiniband, infiniband_mad) built the parallel arrays assuming equal counts and aligned ordering. When a PCI function exposed a verbs device but no infiniband/ class entry, ifaces[] ended up shorter than devices[]. The mount loop only range-checked against ${#devices[@]}, so it dereferenced an unset ifaces[id] and, under set -euo pipefail, aborted the hook: /etc/enroot/hooks.d/99-mellanox.sh: line 88: ifaces[id]: unbound variable [ERROR] /etc/enroot/hooks.d/99-mellanox.sh exited with return code 1 Before the abort, the skew also silently mis-paired devices[] with the wrong ifaces[] entry for every id past the first gap. On SR-IOV/RoCE nodes this is the normal steady state, not a hardware fault: when a pod starts, the SR-IOV + ovs-cni + rdma-cni chain moves an assigned VF's RDMA device into the pod's network namespace. The VF's infiniband_verbs char node still enumerates on the host while its ib_device leaves the host /sys/class/infiniband and the per-function infiniband/ directory, so devices[] > ifaces[] for as long as the pod holds the VF. Observed on a Blackwell Spectrum-X node where a training pod claimed all eight port-0 VFs and any concurrent MELLANOX_VISIBLE_DEVICES=all launch hit the crash; originally seen breaking NCCL alltoall_perf_mpi, all_gather_perf_mpi, all_reduce_perf_mpi, and reduce_scatter_perf_mpi. Fix: enumerate per PCI function anchored on infiniband_verbs and resolve the iface and management nodes from the same directory, so the arrays are always index-aligned regardless of which sysfs sub-entries are present. 
A device whose interface is absent in the current namespace is skipped with a warning rather than treated as a fatal error: an interfaceless verbs node cannot be mounted (its /sys/class/infiniband entry does not exist here), and on shared SR-IOV nodes its absence is expected. umad/issm entries are guarded with [ -n ] since their absence is less critical. Signed-off-by: Eric Lordahl --- conf/hooks/99-mellanox.sh | 64 +++++++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 19 deletions(-) diff --git a/conf/hooks/99-mellanox.sh b/conf/hooks/99-mellanox.sh index cd3c712..944ee5c 100755 --- a/conf/hooks/99-mellanox.sh +++ b/conf/hooks/99-mellanox.sh @@ -39,28 +39,41 @@ declare -a issms=() declare -a umads=() declare -A providers=() -# Lookup all the devices and their respective driver. +# Enumerate per PCI function anchored on infiniband_verbs, resolving the +# interface and management nodes from the same directory. Three +# independent globs over different sysfs subtrees previously left ifaces[] +# shorter than devices[] when a function had no infiniband/ entry (DPU, +# SF/SR-IOV representor, down port), causing "ifaces[id]: unbound variable" +# under set -u and aborting every container launch on the affected node. for uevent in /sys/bus/pci/drivers/mlx?_core/*/infiniband_verbs/*/uevent; do case "${uevent}" in - *mlx4*) drivers+=("mlx4") ;; - *mlx5*) drivers+=("mlx5") ;; + *mlx4*) driver="mlx4" ;; + *mlx5*) driver="mlx5" ;; *) continue ;; esac + + # .../0000:xx:00.0/infiniband_verbs/uverbsN/uevent -> .../0000:xx:00.0 + pcidir="${uevent%/infiniband_verbs/*}" + + drivers+=("${driver}") devices+=("$(. "${uevent}"; echo "/dev/${DEVNAME}")") -done -# Lookup all the interfaces. -for uevent in /sys/bus/pci/drivers/mlx?_core/*/infiniband/*/uevent; do - ifaces+=("$(. "${uevent}"; echo "${NAME}")") -done + iface="" + for ib_uevent in "${pcidir}"/infiniband/*/uevent; do + iface="$(. 
"${ib_uevent}"; echo "${NAME}")" + break + done + ifaces+=("${iface}") -# Lookup all the management devices. -for uevent in /sys/bus/pci/drivers/mlx?_core/*/infiniband_mad/*/uevent; do - case "${uevent}" in - *issm*) issms+=("$(. "${uevent}"; echo "/dev/${DEVNAME}")") ;; - *umad*) umads+=("$(. "${uevent}"; echo "/dev/${DEVNAME}")") ;; - *) continue ;; - esac + umad="" issm="" + for mad_uevent in "${pcidir}"/infiniband_mad/*/uevent; do + case "${mad_uevent}" in + *issm*) issm="$(. "${mad_uevent}"; echo "/dev/${DEVNAME}")" ;; + *umad*) umad="$(. "${mad_uevent}"; echo "/dev/${DEVNAME}")" ;; + esac + done + umads+=("${umad}") + issms+=("${issm}") done # Hide all the device entries in sysfs by default and mount RDMA CM. @@ -83,15 +96,28 @@ for id in ${MELLANOX_VISIBLE_DEVICES//,/ }; do if [[ ! "${id}" =~ ^[[:digit:]]+$ ]] || [ "${id}" -lt 0 ] || [ "${id}" -ge "${#devices[@]}" ]; then common::err "Unknown MELLANOX device id: ${id}" fi + # A verbs device with no InfiniBand interface in this namespace is normal on + # SR-IOV/RoCE nodes: the VF may be claimed by another network namespace (a + # Kubernetes pod via rdma-cni), or be a representor or a down port. Skip it + # rather than aborting the launch -- mounting it would fail anyway since + # its /sys/class/infiniband/ entry does not exist in this namespace. 
+ if [ -z "${ifaces[id]}" ]; then + common::log WARN "MELLANOX device id ${id} (${devices[id]}) has no InfiniBand interface in this namespace; skipping" + continue + fi providers["${drivers[id]}"]=true enroot-mount --root "${ENROOT_ROOTFS}" - <<< "${devices[id]} ${devices[id]} none x-create=file,bind,ro,nosuid,noexec,private" ln -s "$(common::realpath "/sys/class/infiniband/${ifaces[id]}")" "${ENROOT_ROOTFS}/sys/class/infiniband/${ifaces[id]}" ln -s "$(common::realpath "/sys/class/infiniband_verbs/${devices[id]##*/}")" "${ENROOT_ROOTFS}/sys/class/infiniband_verbs/${devices[id]##*/}" if [ -n "${ENROOT_ALLOW_SUPERUSER-}" ] && [ "$(awk '{print $2}' /proc/self/uid_map)" -eq 0 ]; then - enroot-mount --root "${ENROOT_ROOTFS}" - <<< "${umads[id]} ${umads[id]} none x-create=file,bind,ro,nosuid,noexec,private,nofail,silent" - enroot-mount --root "${ENROOT_ROOTFS}" - <<< "${issms[id]} ${issms[id]} none x-create=file,bind,ro,nosuid,noexec,private,nofail,silent" - ln -s "$(common::realpath "/sys/class/infiniband_mad/${umads[id]##*/}")" "${ENROOT_ROOTFS}/sys/class/infiniband_mad/${umads[id]##*/}" - ln -s "$(common::realpath "/sys/class/infiniband_mad/${issms[id]##*/}")" "${ENROOT_ROOTFS}/sys/class/infiniband_mad/${issms[id]##*/}" + if [ -n "${umads[id]}" ]; then + enroot-mount --root "${ENROOT_ROOTFS}" - <<< "${umads[id]} ${umads[id]} none x-create=file,bind,ro,nosuid,noexec,private,nofail,silent" + ln -s "$(common::realpath "/sys/class/infiniband_mad/${umads[id]##*/}")" "${ENROOT_ROOTFS}/sys/class/infiniband_mad/${umads[id]##*/}" + fi + if [ -n "${issms[id]}" ]; then + enroot-mount --root "${ENROOT_ROOTFS}" - <<< "${issms[id]} ${issms[id]} none x-create=file,bind,ro,nosuid,noexec,private,nofail,silent" + ln -s "$(common::realpath "/sys/class/infiniband_mad/${issms[id]##*/}")" "${ENROOT_ROOTFS}/sys/class/infiniband_mad/${issms[id]##*/}" + fi fi done