Skip to content

Commit cb468b8

Browse files
committed
~20% improvements in data transfer when drawing
1 parent 6c2a5ce commit cb468b8

4 files changed

Lines changed: 183 additions & 45 deletions

File tree

ports/espressif/common-hal/qspibus/QSPIBus.c

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -194,12 +194,11 @@ static void qspibus_send_color_bytes(
194194
remaining -= chunk;
195195
}
196196

197-
// Keep Python/API semantics predictable: color transfer call returns only
198-
// after queued DMA chunks have completed.
199-
if (!qspibus_wait_all_transfers_done(self, pdMS_TO_TICKS(QSPI_COLOR_TIMEOUT_MS))) {
200-
qspibus_reset_transfer_state(self);
201-
mp_raise_OSError_msg(MP_ERROR_TEXT("Operation timed out"));
202-
}
197+
// Let DMA complete asynchronously. The next begin_transaction() will
198+
// wait for all in-flight transfers, allowing fill_area() computation
199+
// to overlap with DMA. The explicit wait is only needed for the
200+
// Python write_data() API path where callers expect the transfer to
201+
// be finished on return.
203202
}
204203

205204
static bool qspibus_is_color_payload_command(uint8_t command) {
@@ -465,12 +464,19 @@ void common_hal_qspibus_qspibus_write_data(
465464
}
466465
return;
467466
}
468-
if (!self->has_pending_command) {
467+
if (!self->has_pending_command) {
469468
mp_raise_ValueError(MP_ERROR_TEXT("No pending command"));
470469
}
471470

472471
if (qspibus_is_color_payload_command(self->pending_command)) {
473472
qspibus_send_color_bytes(self, self->pending_command, data, len);
473+
// Python API: wait for DMA to finish so callers see the transfer as
474+
// complete on return. The internal displayio path skips this wait
475+
// to allow fill_area/DMA overlap.
476+
if (!qspibus_wait_all_transfers_done(self, pdMS_TO_TICKS(QSPI_COLOR_TIMEOUT_MS))) {
477+
qspibus_reset_transfer_state(self);
478+
mp_raise_OSError_msg(MP_ERROR_TEXT("Operation timed out"));
479+
}
474480
} else {
475481
qspibus_send_command_bytes(self, self->pending_command, data, len);
476482
}
@@ -497,9 +503,20 @@ bool common_hal_qspibus_qspibus_bus_free(mp_obj_t obj) {
497503

498504
bool common_hal_qspibus_qspibus_begin_transaction(mp_obj_t obj) {
499505
qspibus_qspibus_obj_t *self = MP_OBJ_TO_PTR(obj);
500-
if (!common_hal_qspibus_qspibus_bus_free(obj)) {
506+
if (!self->bus_initialized || self->in_transaction || self->has_pending_command) {
501507
return false;
502508
}
509+
// Wait for any in-flight DMA to complete before starting a new
510+
// transaction. This replaces the old non-blocking bus_free() check
511+
// and enables fill_area()/DMA overlap: the CPU fills the next
512+
// subrectangle while the previous DMA runs, and this wait only
513+
// blocks when we actually need the bus for the next send.
514+
if (self->transfer_in_progress) {
515+
if (!qspibus_wait_all_transfers_done(self, pdMS_TO_TICKS(QSPI_COLOR_TIMEOUT_MS))) {
516+
qspibus_reset_transfer_state(self);
517+
return false;
518+
}
519+
}
503520
self->in_transaction = true;
504521
return true;
505522
}

shared-module/busdisplay/BusDisplay.c

Lines changed: 83 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ static void _send_pixels(busdisplay_busdisplay_obj_t *self, uint8_t *pixels, uin
217217
}
218218

219219
static bool _refresh_area(busdisplay_busdisplay_obj_t *self, const displayio_area_t *area) {
220-
uint16_t buffer_size = 128; // In uint32_ts
220+
uint32_t buffer_size = 128; // In uint32_ts
221221

222222
displayio_area_t clipped;
223223
// Clip the area to the display by overlapping the areas. If there is no overlap then we're done.
@@ -226,7 +226,7 @@ static bool _refresh_area(busdisplay_busdisplay_obj_t *self, const displayio_are
226226
}
227227
uint16_t rows_per_buffer = displayio_area_height(&clipped);
228228
uint8_t pixels_per_word = (sizeof(uint32_t) * 8) / self->core.colorspace.depth;
229-
uint16_t pixels_per_buffer = displayio_area_size(&clipped);
229+
uint32_t pixels_per_buffer = displayio_area_size(&clipped);
230230

231231
uint16_t subrectangles = 1;
232232
// for SH1107 and other boundary constrained controllers
@@ -265,7 +265,8 @@ static bool _refresh_area(busdisplay_busdisplay_obj_t *self, const displayio_are
265265
// upper bound for ESP32-S3's 24KB main task stack.
266266
_Static_assert(CIRCUITPY_QSPI_DISPLAY_AREA_BUFFER_SIZE <= 2048,
267267
"CIRCUITPY_QSPI_DISPLAY_AREA_BUFFER_SIZE exceeds safe stack limit (8KB)");
268-
if (mp_obj_is_type(self->bus.bus, &qspibus_qspibus_type) &&
268+
bool is_qspi_bus = mp_obj_is_type(self->bus.bus, &qspibus_qspibus_type);
269+
if (is_qspi_bus &&
269270
self->core.colorspace.depth == 16 &&
270271
!self->bus.data_as_commands &&
271272
!self->bus.SH1107_addressing &&
@@ -275,6 +276,10 @@ static bool _refresh_area(busdisplay_busdisplay_obj_t *self, const displayio_are
275276
if (rows_per_buffer == 0) {
276277
rows_per_buffer = 1;
277278
}
279+
// Clamp to actual display height.
280+
if (rows_per_buffer > displayio_area_height(&clipped)) {
281+
rows_per_buffer = displayio_area_height(&clipped);
282+
}
278283
subrectangles = displayio_area_height(&clipped) / rows_per_buffer;
279284
if (displayio_area_height(&clipped) % rows_per_buffer != 0) {
280285
subrectangles++;
@@ -284,32 +289,31 @@ static bool _refresh_area(busdisplay_busdisplay_obj_t *self, const displayio_are
284289
if (pixels_per_buffer % pixels_per_word) {
285290
buffer_size += 1;
286291
}
287-
}
288292

289-
if (mp_obj_is_type(self->bus.bus, &qspibus_qspibus_type) &&
290-
self->core.colorspace.depth == 16 &&
291-
!self->bus.data_as_commands &&
292-
displayio_area_height(&clipped) > 1 &&
293-
rows_per_buffer < 2 &&
294-
(2 * displayio_area_width(&clipped) + pixels_per_word - 1) / pixels_per_word <= CIRCUITPY_QSPI_DISPLAY_AREA_BUFFER_SIZE) {
295-
rows_per_buffer = 2;
296-
subrectangles = displayio_area_height(&clipped) / rows_per_buffer;
297-
if (displayio_area_height(&clipped) % rows_per_buffer != 0) {
298-
subrectangles++;
299-
}
300-
pixels_per_buffer = rows_per_buffer * displayio_area_width(&clipped);
301-
buffer_size = pixels_per_buffer / pixels_per_word;
302-
if (pixels_per_buffer % pixels_per_word) {
303-
buffer_size += 1;
293+
// Ensure at least 2 rows per buffer when possible.
294+
if (rows_per_buffer < 2 &&
295+
displayio_area_height(&clipped) > 1 &&
296+
(uint32_t)((2 * displayio_area_width(&clipped) + pixels_per_word - 1) / pixels_per_word) <= (uint32_t)CIRCUITPY_QSPI_DISPLAY_AREA_BUFFER_SIZE) {
297+
rows_per_buffer = 2;
298+
subrectangles = displayio_area_height(&clipped) / rows_per_buffer;
299+
if (displayio_area_height(&clipped) % rows_per_buffer != 0) {
300+
subrectangles++;
301+
}
302+
pixels_per_buffer = rows_per_buffer * displayio_area_width(&clipped);
303+
buffer_size = pixels_per_buffer / pixels_per_word;
304+
if (pixels_per_buffer % pixels_per_word) {
305+
buffer_size += 1;
306+
}
304307
}
305308
}
306309
#endif
307310

308311
// Allocated and shared as a uint32_t array so the compiler knows the
309312
// alignment everywhere.
310-
uint32_t buffer[buffer_size];
311313
uint32_t mask_length = (pixels_per_buffer / 32) + 1;
314+
uint32_t buffer[buffer_size];
312315
uint32_t mask[mask_length];
316+
313317
uint16_t remaining_rows = displayio_area_height(&clipped);
314318

315319
for (uint16_t j = 0; j < subrectangles; j++) {
@@ -324,28 +328,55 @@ static bool _refresh_area(busdisplay_busdisplay_obj_t *self, const displayio_are
324328
}
325329
remaining_rows -= rows_per_buffer;
326330

327-
displayio_display_bus_set_region_to_update(&self->bus, &self->core, &subrectangle);
331+
#if CIRCUITPY_QSPIBUS
332+
if (is_qspi_bus) {
333+
// QSPI path: fill_area first (overlaps with previous DMA),
334+
// then single-transaction set_region + RAMWR + pixels.
335+
// depth is always 16 here (guarded by is_qspi_bus check above).
336+
uint32_t subrectangle_size_bytes = (uint32_t)displayio_area_size(&subrectangle) * (self->core.colorspace.depth / 8);
328337

329-
uint16_t subrectangle_size_bytes;
330-
if (self->core.colorspace.depth >= 8) {
331-
subrectangle_size_bytes = displayio_area_size(&subrectangle) * (self->core.colorspace.depth / 8);
332-
} else {
333-
subrectangle_size_bytes = displayio_area_size(&subrectangle) / (8 / self->core.colorspace.depth);
334-
}
338+
memset(mask, 0, mask_length * sizeof(mask[0]));
339+
memset(buffer, 0, buffer_size * sizeof(buffer[0]));
340+
341+
displayio_display_core_fill_area(&self->core, &subrectangle, mask, buffer);
335342

336-
memset(mask, 0, mask_length * sizeof(mask[0]));
337-
memset(buffer, 0, buffer_size * sizeof(buffer[0]));
343+
// begin_transaction waits for any prior async DMA to finish,
344+
// so fill_area above overlaps with previous DMA.
345+
if (!displayio_display_bus_begin_transaction(&self->bus)) {
346+
// Transaction failed (bus deinitialized, timeout, etc.).
347+
// Bail out to prevent calling send() outside a transaction.
348+
return false;
349+
}
350+
displayio_display_bus_send_region_commands(&self->bus, &self->core, &subrectangle);
351+
_send_pixels(self, (uint8_t *)buffer, subrectangle_size_bytes);
352+
displayio_display_bus_end_transaction(&self->bus);
353+
} else
354+
#endif
355+
{
356+
// Non-QSPI path: original ordering preserved exactly.
357+
displayio_display_bus_set_region_to_update(&self->bus, &self->core, &subrectangle);
358+
359+
uint16_t subrectangle_size_bytes;
360+
if (self->core.colorspace.depth >= 8) {
361+
subrectangle_size_bytes = displayio_area_size(&subrectangle) * (self->core.colorspace.depth / 8);
362+
} else {
363+
subrectangle_size_bytes = displayio_area_size(&subrectangle) / (8 / self->core.colorspace.depth);
364+
}
338365

339-
displayio_display_core_fill_area(&self->core, &subrectangle, mask, buffer);
366+
memset(mask, 0, mask_length * sizeof(mask[0]));
367+
memset(buffer, 0, buffer_size * sizeof(buffer[0]));
340368

341-
// Can't acquire display bus; skip the rest of the data.
342-
if (!displayio_display_bus_is_free(&self->bus)) {
343-
return false;
344-
}
369+
displayio_display_core_fill_area(&self->core, &subrectangle, mask, buffer);
345370

346-
displayio_display_bus_begin_transaction(&self->bus);
347-
_send_pixels(self, (uint8_t *)buffer, subrectangle_size_bytes);
348-
displayio_display_bus_end_transaction(&self->bus);
371+
// Can't acquire display bus; skip the rest of the data.
372+
if (!displayio_display_bus_is_free(&self->bus)) {
373+
return false;
374+
}
375+
376+
displayio_display_bus_begin_transaction(&self->bus);
377+
_send_pixels(self, (uint8_t *)buffer, subrectangle_size_bytes);
378+
displayio_display_bus_end_transaction(&self->bus);
379+
}
349380

350381
// Run background tasks so they can run during an explicit refresh.
351382
// Auto-refresh won't run background tasks here because it is a background task itself.
@@ -356,6 +387,21 @@ static bool _refresh_area(busdisplay_busdisplay_obj_t *self, const displayio_are
356387
usb_background();
357388
#endif
358389
}
390+
391+
#if CIRCUITPY_QSPIBUS
392+
if (is_qspi_bus) {
393+
// Drain the last async DMA transfer before returning.
394+
// Within the loop, begin_transaction() waits for the PREVIOUS
395+
// subrectangle's DMA, enabling fill_area/DMA overlap. But the
396+
// LAST subrectangle's DMA is still in-flight when the loop ends.
397+
// Without this drain, bus_free() returns false on the next
398+
// refresh() call, causing it to be silently skipped.
399+
if (displayio_display_bus_begin_transaction(&self->bus)) {
400+
displayio_display_bus_end_transaction(&self->bus);
401+
}
402+
}
403+
#endif
404+
359405
return true;
360406
}
361407

shared-module/displayio/TileGrid.c

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -512,6 +512,75 @@ bool displayio_tilegrid_fill_area(displayio_tilegrid_t *self,
512512
y_shift = temp_shift;
513513
}
514514

515+
// Fast path for the common case: single-tile, 16bpp bitmap with Palette,
516+
// no transforms, full coverage. Inlines bitmap access and palette cache
517+
// lookup to eliminate per-pixel function-call overhead (~4-5x speedup).
518+
if (full_coverage &&
519+
colorspace->depth == 16 &&
520+
self->width_in_tiles == 1 &&
521+
self->height_in_tiles == 1 &&
522+
self->absolute_transform->scale == 1 &&
523+
self->absolute_transform->dx == 1 &&
524+
self->absolute_transform->dy == 1 &&
525+
!self->absolute_transform->transpose_xy &&
526+
!self->flip_x && !self->flip_y &&
527+
!self->transpose_xy &&
528+
x_stride == 1 && y_stride == displayio_area_width(area) &&
529+
x_shift == 0 && y_shift == 0 &&
530+
mp_obj_is_type(self->bitmap, &displayio_bitmap_type) &&
531+
mp_obj_is_type(self->pixel_shader, &displayio_palette_type)) {
532+
533+
displayio_bitmap_t *bmp = MP_OBJ_TO_PTR(self->bitmap);
534+
displayio_palette_t *pal = MP_OBJ_TO_PTR(self->pixel_shader);
535+
536+
if (bmp->bits_per_value == 16 && !pal->dither) {
537+
uint16_t *out = (uint16_t *)buffer;
538+
bool all_opaque = true;
539+
540+
// Process all pixels with inlined bitmap access and palette cache.
541+
uint32_t idx = 0;
542+
for (int16_t y = start_y; y < end_y; ++y) {
543+
// Direct row pointer into bitmap data (16bpp → uint16_t).
544+
// stride is in uint32_t units, so multiply by 2 for uint16_t indexing.
545+
const uint16_t *bmp_row = ((const uint16_t *)bmp->data) + y * (bmp->stride * 2);
546+
547+
for (int16_t x = start_x; x < end_x; ++x) {
548+
uint16_t px_index = bmp_row[x];
549+
550+
// Inline palette cache check (avoids function call overhead).
551+
if (px_index < pal->color_count) {
552+
_displayio_color_t *color = &pal->colors[px_index];
553+
if (color->transparent) {
554+
all_opaque = false;
555+
idx++;
556+
continue;
557+
}
558+
if (color->cached_colorspace == colorspace &&
559+
color->cached_colorspace_grayscale_bit == colorspace->grayscale_bit &&
560+
color->cached_colorspace_grayscale == colorspace->grayscale) {
561+
out[idx] = (uint16_t)color->cached_color;
562+
} else {
563+
// Cache miss — do full conversion, then cache.
564+
displayio_input_pixel_t in_px = {.pixel = px_index};
565+
displayio_output_pixel_t out_px = {.opaque = true};
566+
displayio_palette_get_color(pal, colorspace, &in_px, &out_px);
567+
out[idx] = (uint16_t)out_px.pixel;
568+
}
569+
} else {
570+
// Out of palette range — transparent.
571+
all_opaque = false;
572+
idx++;
573+
continue;
574+
}
575+
mask[idx >> 5] |= 1u << (idx & 31);
576+
idx++;
577+
}
578+
}
579+
580+
return all_opaque;
581+
}
582+
}
583+
515584
displayio_input_pixel_t input_pixel;
516585
displayio_output_pixel_t output_pixel;
517586

shared-module/displayio/bus_core.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,12 @@ void displayio_display_bus_end_transaction(displayio_display_bus_t *self);
4949

5050
void displayio_display_bus_set_region_to_update(displayio_display_bus_t *self, displayio_display_core_t *display, displayio_area_t *area);
5151

52+
#if CIRCUITPY_QSPIBUS
53+
// Send column/row window commands within an already-open transaction.
54+
// Caller must have called displayio_display_bus_begin_transaction() first.
55+
void displayio_display_bus_send_region_commands(displayio_display_bus_t *self, displayio_display_core_t *display, displayio_area_t *area);
56+
#endif
57+
5258
void release_display_bus(displayio_display_bus_t *self);
5359

5460
void displayio_display_bus_collect_ptrs(displayio_display_bus_t *self);

0 commit comments

Comments
 (0)