fix(commander): calibration CPU starvation on linux (#26608)

On Linux targets with high-rate external sensor data (>1000Hz), all
sensor calibrations (gyro, accel, mag) can freeze PX4 by starving
other threads of CPU. Normal flight is unaffected — only calibration
triggers the problem.

Two compounding issues in the calibration worker threads:

1. calibrate_cancel_check() creates a new uORB::Subscription on every
   call, which triggers getDeviceNodeLocked() — an O(n) linear strcmp
   scan through all uORB nodes. In gyro/mag calibration this was called
   on every sensor sample, consuming the majority of CPU in strcmp alone.

2. SubscriptionBlocking::updatedBlocking() returns immediately when data
   is already available (it only blocks when no data is pending). With
   continuous high-rate sensor data, the calibration loops never yield,
   spinning at 100% CPU.

These problems are addressed with this patch as follows:

- Throttle calibrate_cancel_check() to once per 200ms in gyro
  and mag calibration loops.

- Add 1ms px4_usleep() yield before updatedBlocking()/updateBlocking()
  in all calibration loops (gyro, accel, mag, orientation detection).
  This caps the effective loop rate at ~1000Hz — still far above what
  calibration needs (250-750 samples).

- Force Commander main loop to sleep during calibration so it does not
  compete with calibration worker threads for CPU.

Tested under Linux (x64, aarch64) both with RT and non-RT scheduling,
with sensor data arriving at ~3600Hz. Calibration completes normally
and no longer results in a deadlocked process.
This commit is contained in:
Jeff Katz 2026-03-06 20:11:24 +01:00 committed by GitHub
parent 7ee02968ac
commit 5cdf5ac482
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 60 additions and 11 deletions

View File

@ -2032,8 +2032,10 @@ void Commander::run()
perf_end(_loop_perf);
// sleep if there are no vehicle_commands or action_requests to process
if (!_vehicle_command_sub.updated() && !_action_request_sub.updated()) {
// Always sleep during calibration to avoid competing with calibration
// worker threads for CPU. Otherwise, sleep only when idle.
if (_vehicle_status.calibration_enabled
|| (!_vehicle_command_sub.updated() && !_action_request_sub.updated())) {
px4_usleep(COMMANDER_MONITORING_INTERVAL);
}
}

View File

@ -178,6 +178,10 @@ static calibrate_return read_accelerometer_avg(accel_worker_data_s *worker_data,
/* use the first sensor to pace the readout, but do per-sensor counts */
while (counts[0] < samples_num) {
// Yield CPU — updatedBlocking() returns immediately when data is
// already available, so with high-rate sensors the loop never blocks.
px4_usleep(1000);
if (accel_sub[0].updatedBlocking(100000)) {
for (unsigned accel_index = 0; accel_index < MAX_ACCEL_SENS; accel_index++) {
sensor_accel_s arp;

View File

@ -90,6 +90,10 @@ enum detect_orientation_return detect_orientation(orb_advert_t *mavlink_log_pub,
uORB::SubscriptionBlocking<vehicle_acceleration_s> vehicle_acceleration_sub{ORB_ID(vehicle_acceleration)};
while (true) {
// Yield CPU — updateBlocking() returns immediately when data is
// already available, so with high-rate sensors the loop never blocks.
px4_usleep(1000);
vehicle_acceleration_s accel;
if (vehicle_acceleration_sub.updateBlocking(accel, 100000)) {

View File

@ -59,6 +59,8 @@
#include <uORB/SubscriptionBlocking.hpp>
#include <uORB/topics/sensor_gyro.h>
using namespace time_literals;
static constexpr char sensor_name[] {"gyro"};
static constexpr unsigned MAX_GYROS = 4;
@ -91,12 +93,24 @@ static calibrate_return gyro_calibration_worker(gyro_worker_data_t &worker_data)
/* use slowest gyro to pace, but count correctly per-gyro for statistics */
unsigned slow_count = 0;
hrt_abstime last_cancel_check = hrt_absolute_time();
while (slow_count < CALIBRATION_COUNT) {
if (calibrate_cancel_check(worker_data.mavlink_log_pub, calibration_started)) {
return calibrate_return_cancelled;
// Throttle cancel check — calibrate_cancel_check() creates a new
// uORB::Subscription each call, triggering an O(n) topic lookup.
// At high sensor rates this dominates CPU. Check at most every 200ms.
if (hrt_elapsed_time(&last_cancel_check) > 200_ms) {
last_cancel_check = hrt_absolute_time();
if (calibrate_cancel_check(worker_data.mavlink_log_pub, calibration_started)) {
return calibrate_return_cancelled;
}
}
// Yield CPU — updatedBlocking() returns immediately when data is
// already available, so with high-rate sensors the loop never blocks.
px4_usleep(1000);
if (gyro_sub[0].updatedBlocking(100000)) {
unsigned update_count = CALIBRATION_COUNT;

View File

@ -278,14 +278,22 @@ static calibrate_return mag_calibration_worker(detect_orientation_return orienta
uORB::SubscriptionBlocking<sensor_gyro_s> gyro_sub{ORB_ID(sensor_gyro)};
hrt_abstime last_cancel_check = hrt_absolute_time();
while (fabsf(gyro_x_integral) < gyro_int_thresh_rad &&
fabsf(gyro_y_integral) < gyro_int_thresh_rad &&
fabsf(gyro_z_integral) < gyro_int_thresh_rad) {
/* abort on request */
if (calibrate_cancel_check(worker_data->mavlink_log_pub, calibration_started)) {
result = calibrate_return_cancelled;
return result;
// Throttle cancel check — calibrate_cancel_check() creates a new
// uORB::Subscription each call, triggering an O(n) topic lookup.
// At high sensor rates this dominates CPU. Check at most every 200ms.
if (hrt_elapsed_time(&last_cancel_check) > 200_ms) {
last_cancel_check = hrt_absolute_time();
if (calibrate_cancel_check(worker_data->mavlink_log_pub, calibration_started)) {
result = calibrate_return_cancelled;
return result;
}
}
/* abort with timeout */
@ -297,6 +305,10 @@ static calibrate_return mag_calibration_worker(detect_orientation_return orienta
}
/* Wait clocking for new data on all gyro */
// Yield CPU — updateBlocking() returns immediately when data is
// already available, so with high-rate sensors the loop never blocks.
px4_usleep(1000);
sensor_gyro_s gyro;
if (gyro_sub.updateBlocking(gyro, 1000_ms)) {
@ -326,14 +338,27 @@ static calibrate_return mag_calibration_worker(detect_orientation_return orienta
unsigned poll_errcount = 0;
unsigned calibration_counter_side = 0;
last_cancel_check = hrt_absolute_time();
while (hrt_absolute_time() < calibration_deadline &&
calibration_counter_side < worker_data->calibration_points_perside) {
if (calibrate_cancel_check(worker_data->mavlink_log_pub, calibration_started)) {
result = calibrate_return_cancelled;
break;
// Throttle cancel check — calibrate_cancel_check() creates a new
// uORB::Subscription each call, triggering an O(n) topic lookup.
// At high sensor rates this dominates CPU. Check at most every 200ms.
if (hrt_elapsed_time(&last_cancel_check) > 200_ms) {
last_cancel_check = hrt_absolute_time();
if (calibrate_cancel_check(worker_data->mavlink_log_pub, calibration_started)) {
result = calibrate_return_cancelled;
break;
}
}
// Yield CPU — updatedBlocking() returns immediately when data is
// already available, so with high-rate sensors the loop never blocks.
px4_usleep(1000);
if (mag_sub[0].updatedBlocking(1000_ms)) {
bool rejected = false;
Vector3f new_samples[MAX_MAGS] {};