libcamera: debayer_cpu: Add 32bits/aligned output formats
diff mbox series

Message ID 20240611110721.10690-1-robert.mader@collabora.com
State Accepted
Headers show
Series
  • libcamera: debayer_cpu: Add 32bits/aligned output formats
Related show

Commit Message

Robert Mader June 11, 2024, 11:07 a.m. UTC
Add 32-bit aligned output formats in order to be more compatible with
modern hardware and APIs. This notably allows GL implementations to
directly import the buffers more often and seems to be required for
Wayland.

Furthermore, as we already enforce an 8-byte stride, these formats work
better for clients that don't support padding - such as libwebrtc at the
time of writing.

Tested on the Librem5 and PinePhone.

Signed-off-by: Robert Mader <robert.mader@collabora.com>
---
 src/libcamera/software_isp/debayer_cpu.cpp | 244 +++++++++++++++++++--
 src/libcamera/software_isp/debayer_cpu.h   |  10 +
 2 files changed, 238 insertions(+), 16 deletions(-)

Comments

Milan Zamazal June 11, 2024, 2:29 p.m. UTC | #1
Hi Robert,

thank you for the patch.

Robert Mader <robert.mader@collabora.com> writes:

> In order to be more compatible with modern hardware and APIs. This
> notably allows GL implementations to directly import the buffers more
> often and seems to be required for Wayland.
>
> Further more, as we already enforce a 8 byte stride, these formats work
> better for clients that don't support padding - such as libwebrtc at the
> time of writing.
>
> Tested on the Librem5 and PinePhone.
>
> Signed-off-by: Robert Mader <robert.mader@collabora.com>
> ---
>  src/libcamera/software_isp/debayer_cpu.cpp | 244 +++++++++++++++++++--
>  src/libcamera/software_isp/debayer_cpu.h   |  10 +
>  2 files changed, 238 insertions(+), 16 deletions(-)
>
> diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp
> index c038eed4..73c66a88 100644
> --- a/src/libcamera/software_isp/debayer_cpu.cpp
> +++ b/src/libcamera/software_isp/debayer_cpu.cpp
> @@ -76,6 +76,13 @@ DebayerCpu::~DebayerCpu()
>  	*dst++ = red_[(prev[x - p] + prev[x + n] + next[x - p] + next[x + n]) / (4 * (div))]; \
>  	x++;
>  
> +#define BGGR_XBGR8888(p, n, div)                                                              \
> +	*dst++ = blue_[curr[x] / (div)];                                                      \
> +	*dst++ = green_[(prev[x] + curr[x - p] + curr[x + n] + next[x]) / (4 * (div))];       \
> +	*dst++ = red_[(prev[x - p] + prev[x + n] + next[x - p] + next[x + n]) / (4 * (div))]; \
> +	*dst++ = 255;                                                                         \
> +	x++;
> +

The level of code duplication here starts to exceed reasonable limits.
Adding an argument to the macro that decides whether to append the last
`dst' assignment might have a significant performance impact.  But even
then I guess there must be a way to let the compiler generate the
duplicate code (an inline function template?) rather than doing it
manually.  What do the C++ experts around think?

If nothing better is possible then `*dst++ = 255' can be applied in the
callers rather than defining the alternative macro versions.

>  /*
>   * GBG
>   * RGR
> @@ -87,6 +94,13 @@ DebayerCpu::~DebayerCpu()
>  	*dst++ = red_[(curr[x - p] + curr[x + n]) / (2 * (div))]; \
>  	x++;
>  
> +#define GRBG_XBGR8888(p, n, div)                                  \
> +	*dst++ = blue_[(prev[x] + next[x]) / (2 * (div))];        \
> +	*dst++ = green_[curr[x] / (div)];                         \
> +	*dst++ = red_[(curr[x - p] + curr[x + n]) / (2 * (div))]; \
> +	*dst++ = 255;                                             \
> +	x++;
> +
>  /*
>   * GRG
>   * BGB
> @@ -98,6 +112,13 @@ DebayerCpu::~DebayerCpu()
>  	*dst++ = red_[(prev[x] + next[x]) / (2 * (div))];          \
>  	x++;
>  
> +#define GBRG_XBGR8888(p, n, div)                                   \
> +	*dst++ = blue_[(curr[x - p] + curr[x + n]) / (2 * (div))]; \
> +	*dst++ = green_[curr[x] / (div)];                          \
> +	*dst++ = red_[(prev[x] + next[x]) / (2 * (div))];          \
> +	*dst++ = 255;                                              \
> +	x++;
> +
>  /*
>   * BGB
>   * GRG
> @@ -109,6 +130,13 @@ DebayerCpu::~DebayerCpu()
>  	*dst++ = red_[curr[x] / (div)];                                                        \
>  	x++;
>  
> +#define RGGB_XBGR8888(p, n, div)                                                               \
> +	*dst++ = blue_[(prev[x - p] + prev[x + n] + next[x - p] + next[x + n]) / (4 * (div))]; \
> +	*dst++ = green_[(prev[x] + curr[x - p] + curr[x + n] + next[x]) / (4 * (div))];        \
> +	*dst++ = red_[curr[x] / (div)];                                                        \
> +	*dst++ = 255;                                                                          \
> +	x++;
> +
>  void DebayerCpu::debayer8_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
>  {
>  	DECLARE_SRC_POINTERS(uint8_t)
> @@ -119,6 +147,16 @@ void DebayerCpu::debayer8_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
>  	}
>  }
>  
> +void DebayerCpu::debayer8_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[])
> +{
> +	DECLARE_SRC_POINTERS(uint8_t)
> +
> +	for (int x = 0; x < (int)window_.width;) {
> +		BGGR_XBGR8888(1, 1, 1)
> +		GBRG_XBGR8888(1, 1, 1)
> +	}
> +}
> +

... and then X and non-X versions of these methods could be merged too.
Using a template for the purpose should have no performance impact, I
suppose.

>  void DebayerCpu::debayer8_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
>  {
>  	DECLARE_SRC_POINTERS(uint8_t)
> @@ -129,6 +167,16 @@ void DebayerCpu::debayer8_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
>  	}
>  }
>  
> +void DebayerCpu::debayer8_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[])
> +{
> +	DECLARE_SRC_POINTERS(uint8_t)
> +
> +	for (int x = 0; x < (int)window_.width;) {
> +		GRBG_XBGR8888(1, 1, 1)
> +		RGGB_XBGR8888(1, 1, 1)
> +	}
> +}
> +
>  void DebayerCpu::debayer10_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
>  {
>  	DECLARE_SRC_POINTERS(uint16_t)
> @@ -140,6 +188,17 @@ void DebayerCpu::debayer10_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
>  	}
>  }
>  
> +void DebayerCpu::debayer10_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[])
> +{
> +	DECLARE_SRC_POINTERS(uint16_t)
> +
> +	for (int x = 0; x < (int)window_.width;) {
> +		/* divide values by 4 for 10 -> 8 bpp value */
> +		BGGR_XBGR8888(1, 1, 4)
> +		GBRG_XBGR8888(1, 1, 4)
> +	}
> +}
> +
>  void DebayerCpu::debayer10_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
>  {
>  	DECLARE_SRC_POINTERS(uint16_t)
> @@ -151,6 +210,17 @@ void DebayerCpu::debayer10_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
>  	}
>  }
>  
> +void DebayerCpu::debayer10_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[])
> +{
> +	DECLARE_SRC_POINTERS(uint16_t)
> +
> +	for (int x = 0; x < (int)window_.width;) {
> +		/* divide values by 4 for 10 -> 8 bpp value */
> +		GRBG_XBGR8888(1, 1, 4)
> +		RGGB_XBGR8888(1, 1, 4)
> +	}
> +}
> +
>  void DebayerCpu::debayer12_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
>  {
>  	DECLARE_SRC_POINTERS(uint16_t)
> @@ -162,6 +232,17 @@ void DebayerCpu::debayer12_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
>  	}
>  }
>  
> +void DebayerCpu::debayer12_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[])
> +{
> +	DECLARE_SRC_POINTERS(uint16_t)
> +
> +	for (int x = 0; x < (int)window_.width;) {
> +		/* divide values by 16 for 12 -> 8 bpp value */
> +		BGGR_XBGR8888(1, 1, 16)
> +		GBRG_XBGR8888(1, 1, 16)
> +	}
> +}
> +
>  void DebayerCpu::debayer12_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
>  {
>  	DECLARE_SRC_POINTERS(uint16_t)
> @@ -173,6 +254,17 @@ void DebayerCpu::debayer12_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
>  	}
>  }
>  
> +void DebayerCpu::debayer12_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[])
> +{
> +	DECLARE_SRC_POINTERS(uint16_t)
> +
> +	for (int x = 0; x < (int)window_.width;) {
> +		/* divide values by 16 for 12 -> 8 bpp value */
> +		GRBG_XBGR8888(1, 1, 16)
> +		RGGB_XBGR8888(1, 1, 16)
> +	}
> +}
> +
>  void DebayerCpu::debayer10P_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
>  {
>  	const int widthInBytes = window_.width * 5 / 4;
> @@ -198,6 +290,31 @@ void DebayerCpu::debayer10P_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
>  	}
>  }
>  
> +void DebayerCpu::debayer10P_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[])
> +{
> +	const int widthInBytes = window_.width * 5 / 4;
> +	const uint8_t *prev = src[0];
> +	const uint8_t *curr = src[1];
> +	const uint8_t *next = src[2];
> +
> +	/*
> +	 * For the first pixel getting a pixel from the previous column uses
> +	 * x - 2 to skip the 5th byte with least-significant bits for 4 pixels.
> +	 * Same for last pixel (uses x + 2) and looking at the next column.
> +	 */
> +	for (int x = 0; x < widthInBytes;) {
> +		/* First pixel */
> +		BGGR_XBGR8888(2, 1, 1)
> +		/* Second pixel BGGR -> GBRG */
> +		GBRG_XBGR8888(1, 1, 1)
> +		/* Same thing for third and fourth pixels */
> +		BGGR_XBGR8888(1, 1, 1)
> +		GBRG_XBGR8888(1, 2, 1)
> +		/* Skip 5th src byte with 4 x 2 least-significant-bits */
> +		x++;
> +	}
> +}
> +
>  void DebayerCpu::debayer10P_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
>  {
>  	const int widthInBytes = window_.width * 5 / 4;
> @@ -218,6 +335,26 @@ void DebayerCpu::debayer10P_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
>  	}
>  }
>  
> +void DebayerCpu::debayer10P_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[])
> +{
> +	const int widthInBytes = window_.width * 5 / 4;
> +	const uint8_t *prev = src[0];
> +	const uint8_t *curr = src[1];
> +	const uint8_t *next = src[2];
> +
> +	for (int x = 0; x < widthInBytes;) {
> +		/* First pixel */
> +		GRBG_XBGR8888(2, 1, 1)
> +		/* Second pixel GRBG -> RGGB */
> +		RGGB_XBGR8888(1, 1, 1)
> +		/* Same thing for third and fourth pixels */
> +		GRBG_XBGR8888(1, 1, 1)
> +		RGGB_XBGR8888(1, 2, 1)
> +		/* Skip 5th src byte with 4 x 2 least-significant-bits */
> +		x++;
> +	}
> +}
> +
>  void DebayerCpu::debayer10P_GBGB_BGR888(uint8_t *dst, const uint8_t *src[])
>  {
>  	const int widthInBytes = window_.width * 5 / 4;
> @@ -238,6 +375,26 @@ void DebayerCpu::debayer10P_GBGB_BGR888(uint8_t *dst, const uint8_t *src[])
>  	}
>  }
>  
> +void DebayerCpu::debayer10P_GBGB_XBGR8888(uint8_t *dst, const uint8_t *src[])
> +{
> +	const int widthInBytes = window_.width * 5 / 4;
> +	const uint8_t *prev = src[0];
> +	const uint8_t *curr = src[1];
> +	const uint8_t *next = src[2];
> +
> +	for (int x = 0; x < widthInBytes;) {
> +		/* Even pixel */
> +		GBRG_XBGR8888(2, 1, 1)
> +		/* Odd pixel GBGR -> BGGR */
> +		BGGR_XBGR8888(1, 1, 1)
> +		/* Same thing for next 2 pixels */
> +		GBRG_XBGR8888(1, 1, 1)
> +		BGGR_XBGR8888(1, 2, 1)
> +		/* Skip 5th src byte with 4 x 2 least-significant-bits */
> +		x++;
> +	}
> +}
> +
>  void DebayerCpu::debayer10P_RGRG_BGR888(uint8_t *dst, const uint8_t *src[])
>  {
>  	const int widthInBytes = window_.width * 5 / 4;
> @@ -258,6 +415,26 @@ void DebayerCpu::debayer10P_RGRG_BGR888(uint8_t *dst, const uint8_t *src[])
>  	}
>  }
>  
> +void DebayerCpu::debayer10P_RGRG_XBGR8888(uint8_t *dst, const uint8_t *src[])
> +{
> +	const int widthInBytes = window_.width * 5 / 4;
> +	const uint8_t *prev = src[0];
> +	const uint8_t *curr = src[1];
> +	const uint8_t *next = src[2];
> +
> +	for (int x = 0; x < widthInBytes;) {
> +		/* Even pixel */
> +		RGGB_XBGR8888(2, 1, 1)
> +		/* Odd pixel RGGB -> GRBG */
> +		GRBG_XBGR8888(1, 1, 1)
> +		/* Same thing for next 2 pixels */
> +		RGGB_XBGR8888(1, 1, 1)
> +		GRBG_XBGR8888(1, 2, 1)
> +		/* Skip 5th src byte with 4 x 2 least-significant-bits */
> +		x++;
> +	}
> +}
> +
>  static bool isStandardBayerOrder(BayerFormat::Order order)
>  {
>  	return order == BayerFormat::BGGR || order == BayerFormat::GBRG ||
> @@ -280,7 +457,14 @@ int DebayerCpu::getInputConfig(PixelFormat inputFormat, DebayerInputConfig &conf
>  		config.bpp = (bayerFormat.bitDepth + 7) & ~7;
>  		config.patternSize.width = 2;
>  		config.patternSize.height = 2;
> -		config.outputFormats = std::vector<PixelFormat>({ formats::RGB888, formats::BGR888 });
> +		config.outputFormats = std::vector<PixelFormat>({
> +			formats::RGB888,
> +			formats::XRGB8888,
> +			formats::ARGB8888,
> +			formats::BGR888,
> +			formats::XBGR8888,
> +			formats::ABGR8888
> +		});
>  		return 0;
>  	}
>  
> @@ -290,7 +474,14 @@ int DebayerCpu::getInputConfig(PixelFormat inputFormat, DebayerInputConfig &conf
>  		config.bpp = 10;
>  		config.patternSize.width = 4; /* 5 bytes per *4* pixels */
>  		config.patternSize.height = 2;
> -		config.outputFormats = std::vector<PixelFormat>({ formats::RGB888, formats::BGR888 });
> +		config.outputFormats = std::vector<PixelFormat>({
> +			formats::RGB888,
> +			formats::XRGB8888,
> +			formats::ARGB8888,
> +			formats::BGR888,
> +			formats::XBGR8888,
> +			formats::ABGR8888
> +		});
>  		return 0;
>  	}
>  
> @@ -306,6 +497,12 @@ int DebayerCpu::getOutputConfig(PixelFormat outputFormat, DebayerOutputConfig &c
>  		return 0;
>  	}
>  
> +	if (outputFormat == formats::XRGB8888 || outputFormat == formats::ARGB8888 ||
> +	    outputFormat == formats::XBGR8888 || outputFormat == formats::ABGR8888) {
> +		config.bpp = 32;
> +		return 0;
> +	}
> +
>  	LOG(Debayer, Info)
>  		<< "Unsupported output format " << outputFormat.toString();
>  	return -EINVAL;
> @@ -341,6 +538,7 @@ int DebayerCpu::setDebayerFunctions(PixelFormat inputFormat, PixelFormat outputF
>  {
>  	BayerFormat bayerFormat =
>  		BayerFormat::fromPixelFormat(inputFormat);
> +	bool is_aligned = false;

camelCase should be used for variable names.

>  	xShift_ = 0;
>  	swapRedBlueGains_ = false;
> @@ -351,8 +549,16 @@ int DebayerCpu::setDebayerFunctions(PixelFormat inputFormat, PixelFormat outputF
>  	};
>  
>  	switch (outputFormat) {
> +	case formats::XRGB8888:
> +	case formats::ARGB8888:
> +	  is_aligned = true;
> +	  [[fallthrough]];
>  	case formats::RGB888:
>  		break;
> +	case formats::XBGR8888:
> +	case formats::ABGR8888:
> +	  is_aligned = true;
> +	  [[fallthrough]];
>  	case formats::BGR888:
>  		/* Swap R and B in bayer order to generate BGR888 instead of RGB888 */
>  		swapRedBlueGains_ = true;
> @@ -383,16 +589,19 @@ int DebayerCpu::setDebayerFunctions(PixelFormat inputFormat, PixelFormat outputF
>  	    isStandardBayerOrder(bayerFormat.order)) {
>  		switch (bayerFormat.bitDepth) {
>  		case 8:
> -			debayer0_ = &DebayerCpu::debayer8_BGBG_BGR888;
> -			debayer1_ = &DebayerCpu::debayer8_GRGR_BGR888;
> +		  LOG(Debayer, Warning) << "8bit no packing";

Is this a debugging leftover, or do you really mean to log something
here?  If the latter, it should probably be Debug rather than Warning
and the message should be improved.  The same applies to the other LOGs
below.

> +		  debayer0_ = is_aligned ? &DebayerCpu::debayer8_BGBG_XBGR8888 : &DebayerCpu::debayer8_BGBG_BGR888;

If the two methods were unified and extended with an isAligned argument,
I'm not sure what would be the best way to pass the argument to them:
a lambda, a template parameter, or something else?

> +		  debayer1_ = is_aligned ? &DebayerCpu::debayer8_GRGR_XBGR8888 : &DebayerCpu::debayer8_GRGR_BGR888;
>  			break;
>  		case 10:
> -			debayer0_ = &DebayerCpu::debayer10_BGBG_BGR888;
> -			debayer1_ = &DebayerCpu::debayer10_GRGR_BGR888;
> +		  LOG(Debayer, Warning) << "10bit no packing";
> +			debayer0_ = is_aligned ? &DebayerCpu::debayer10_BGBG_XBGR8888 : &DebayerCpu::debayer10_BGBG_BGR888;
> +			debayer1_ = is_aligned ? &DebayerCpu::debayer10_GRGR_XBGR8888 : &DebayerCpu::debayer10_GRGR_BGR888;
>  			break;
>  		case 12:
> -			debayer0_ = &DebayerCpu::debayer12_BGBG_BGR888;
> -			debayer1_ = &DebayerCpu::debayer12_GRGR_BGR888;
> +		  LOG(Debayer, Warning) << "12bit no packing";
> +			debayer0_ = is_aligned ? &DebayerCpu::debayer12_BGBG_XBGR8888 : &DebayerCpu::debayer12_BGBG_BGR888;
> +			debayer1_ = is_aligned ? &DebayerCpu::debayer12_GRGR_XBGR8888 : &DebayerCpu::debayer12_GRGR_BGR888;
>  			break;
>  		}
>  		setupStandardBayerOrder(bayerFormat.order);
> @@ -401,22 +610,23 @@ int DebayerCpu::setDebayerFunctions(PixelFormat inputFormat, PixelFormat outputF
>  
>  	if (bayerFormat.bitDepth == 10 &&
>  	    bayerFormat.packing == BayerFormat::Packing::CSI2) {
> +	  LOG(Debayer, Warning) << "10bit csi2";
>  		switch (bayerFormat.order) {
>  		case BayerFormat::BGGR:
> -			debayer0_ = &DebayerCpu::debayer10P_BGBG_BGR888;
> -			debayer1_ = &DebayerCpu::debayer10P_GRGR_BGR888;
> +			debayer0_ = is_aligned ? &DebayerCpu::debayer10P_BGBG_XBGR8888 : &DebayerCpu::debayer10P_BGBG_BGR888;
> +			debayer1_ = is_aligned ? &DebayerCpu::debayer10P_GRGR_XBGR8888 : &DebayerCpu::debayer10P_GRGR_BGR888;
>  			return 0;
>  		case BayerFormat::GBRG:
> -			debayer0_ = &DebayerCpu::debayer10P_GBGB_BGR888;
> -			debayer1_ = &DebayerCpu::debayer10P_RGRG_BGR888;
> +			debayer0_ = is_aligned ? &DebayerCpu::debayer10P_GBGB_XBGR8888 : &DebayerCpu::debayer10P_GBGB_BGR888;
> +			debayer1_ = is_aligned ? &DebayerCpu::debayer10P_RGRG_XBGR8888 : &DebayerCpu::debayer10P_RGRG_BGR888;
>  			return 0;
>  		case BayerFormat::GRBG:
> -			debayer0_ = &DebayerCpu::debayer10P_GRGR_BGR888;
> -			debayer1_ = &DebayerCpu::debayer10P_BGBG_BGR888;
> +			debayer0_ = is_aligned ? &DebayerCpu::debayer10P_GRGR_XBGR8888 : &DebayerCpu::debayer10P_GRGR_BGR888;
> +			debayer1_ = is_aligned ? &DebayerCpu::debayer10P_BGBG_XBGR8888 : &DebayerCpu::debayer10P_BGBG_BGR888;
>  			return 0;
>  		case BayerFormat::RGGB:
> -			debayer0_ = &DebayerCpu::debayer10P_RGRG_BGR888;
> -			debayer1_ = &DebayerCpu::debayer10P_GBGB_BGR888;
> +			debayer0_ = is_aligned ? &DebayerCpu::debayer10P_RGRG_XBGR8888 : &DebayerCpu::debayer10P_RGRG_BGR888;
> +			debayer1_ = is_aligned ? &DebayerCpu::debayer10P_GBGB_XBGR8888 : &DebayerCpu::debayer10P_GBGB_BGR888;
>  			return 0;
>  		default:
>  			break;
> @@ -533,6 +743,8 @@ DebayerCpu::strideAndFrameSize(const PixelFormat &outputFormat, const Size &size
>  	/* round up to multiple of 8 for 64 bits alignment */
>  	unsigned int stride = (size.width * config.bpp / 8 + 7) & ~7;
>  
> +	LOG(Debayer, Warning) << outputFormat.toString() << " " << size.width << " " << size.height << " " << config.bpp << " " << stride << " " << stride * size.height;
> +
>  	return std::make_tuple(stride, stride * size.height);
>  }
>  
> diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h
> index be7dcdca..c30f44aa 100644
> --- a/src/libcamera/software_isp/debayer_cpu.h
> +++ b/src/libcamera/software_isp/debayer_cpu.h
> @@ -86,18 +86,28 @@ private:
>  
>  	/* 8-bit raw bayer format */
>  	void debayer8_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]);
> +	void debayer8_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[]);
>  	void debayer8_GRGR_BGR888(uint8_t *dst, const uint8_t *src[]);
> +	void debayer8_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[]);
>  	/* unpacked 10-bit raw bayer format */
>  	void debayer10_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]);
> +	void debayer10_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[]);
>  	void debayer10_GRGR_BGR888(uint8_t *dst, const uint8_t *src[]);
> +	void debayer10_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[]);
>  	/* unpacked 12-bit raw bayer format */
>  	void debayer12_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]);
> +	void debayer12_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[]);
>  	void debayer12_GRGR_BGR888(uint8_t *dst, const uint8_t *src[]);
> +	void debayer12_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[]);
>  	/* CSI-2 packed 10-bit raw bayer format (all the 4 orders) */
>  	void debayer10P_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]);
> +	void debayer10P_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[]);
>  	void debayer10P_GRGR_BGR888(uint8_t *dst, const uint8_t *src[]);
> +	void debayer10P_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[]);
>  	void debayer10P_GBGB_BGR888(uint8_t *dst, const uint8_t *src[]);
> +	void debayer10P_GBGB_XBGR8888(uint8_t *dst, const uint8_t *src[]);
>  	void debayer10P_RGRG_BGR888(uint8_t *dst, const uint8_t *src[]);
> +	void debayer10P_RGRG_XBGR8888(uint8_t *dst, const uint8_t *src[]);
>  
>  	struct DebayerInputConfig {
>  		Size patternSize;
Laurent Pinchart June 11, 2024, 2:31 p.m. UTC | #2
On Tue, Jun 11, 2024 at 04:29:10PM +0200, Milan Zamazal wrote:
> Hi Robert,
> 
> thank you for the patch.
> 
> Robert Mader <robert.mader@collabora.com> writes:
> 
> > In order to be more compatible with modern hardware and APIs. This
> > notably allows GL implementations to directly import the buffers more
> > often and seems to be required for Wayland.
> >
> > Further more, as we already enforce a 8 byte stride, these formats work
> > better for clients that don't support padding - such as libwebrtc at the
> > time of writing.
> >
> > Tested on the Librem5 and PinePhone.
> >
> > Signed-off-by: Robert Mader <robert.mader@collabora.com>
> > ---
> >  src/libcamera/software_isp/debayer_cpu.cpp | 244 +++++++++++++++++++--
> >  src/libcamera/software_isp/debayer_cpu.h   |  10 +
> >  2 files changed, 238 insertions(+), 16 deletions(-)
> >
> > diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp
> > index c038eed4..73c66a88 100644
> > --- a/src/libcamera/software_isp/debayer_cpu.cpp
> > +++ b/src/libcamera/software_isp/debayer_cpu.cpp
> > @@ -76,6 +76,13 @@ DebayerCpu::~DebayerCpu()
> >  	*dst++ = red_[(prev[x - p] + prev[x + n] + next[x - p] + next[x + n]) / (4 * (div))]; \
> >  	x++;
> >  
> > +#define BGGR_XBGR8888(p, n, div)                                                              \
> > +	*dst++ = blue_[curr[x] / (div)];                                                      \
> > +	*dst++ = green_[(prev[x] + curr[x - p] + curr[x + n] + next[x]) / (4 * (div))];       \
> > +	*dst++ = red_[(prev[x - p] + prev[x + n] + next[x - p] + next[x + n]) / (4 * (div))]; \
> > +	*dst++ = 255;                                                                         \
> > +	x++;
> > +
> 
> The level of code duplication here starts to exceed reasonable limits.
> Maybe adding an argument to the macro deciding whether to append the
> last `dst' assignment or not would have a significant performance
> impact.  But even then I guess there must be a way to let the compiler
> generate the duplicate code (an inline function template?) rather than
> doing it manually.  What do the C++ experts around think?

C++ templates are the C++ replacement for C code-generation macros.
This should at least be considered as an option.

> If nothing better is possible then `*dst++ = 255' can be applied in the
> callers rather than defining the alternative macro versions.
> 
> >  /*
> >   * GBG
> >   * RGR
> > @@ -87,6 +94,13 @@ DebayerCpu::~DebayerCpu()
> >  	*dst++ = red_[(curr[x - p] + curr[x + n]) / (2 * (div))]; \
> >  	x++;
> >  
> > +#define GRBG_XBGR8888(p, n, div)                                  \
> > +	*dst++ = blue_[(prev[x] + next[x]) / (2 * (div))];        \
> > +	*dst++ = green_[curr[x] / (div)];                         \
> > +	*dst++ = red_[(curr[x - p] + curr[x + n]) / (2 * (div))]; \
> > +	*dst++ = 255;                                             \
> > +	x++;
> > +
> >  /*
> >   * GRG
> >   * BGB
> > @@ -98,6 +112,13 @@ DebayerCpu::~DebayerCpu()
> >  	*dst++ = red_[(prev[x] + next[x]) / (2 * (div))];          \
> >  	x++;
> >  
> > +#define GBRG_XBGR8888(p, n, div)                                   \
> > +	*dst++ = blue_[(curr[x - p] + curr[x + n]) / (2 * (div))]; \
> > +	*dst++ = green_[curr[x] / (div)];                          \
> > +	*dst++ = red_[(prev[x] + next[x]) / (2 * (div))];          \
> > +	*dst++ = 255;                                              \
> > +	x++;
> > +
> >  /*
> >   * BGB
> >   * GRG
> > @@ -109,6 +130,13 @@ DebayerCpu::~DebayerCpu()
> >  	*dst++ = red_[curr[x] / (div)];                                                        \
> >  	x++;
> >  
> > +#define RGGB_XBGR8888(p, n, div)                                                               \
> > +	*dst++ = blue_[(prev[x - p] + prev[x + n] + next[x - p] + next[x + n]) / (4 * (div))]; \
> > +	*dst++ = green_[(prev[x] + curr[x - p] + curr[x + n] + next[x]) / (4 * (div))];        \
> > +	*dst++ = red_[curr[x] / (div)];                                                        \
> > +	*dst++ = 255;                                                                          \
> > +	x++;
> > +
> >  void DebayerCpu::debayer8_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
> >  {
> >  	DECLARE_SRC_POINTERS(uint8_t)
> > @@ -119,6 +147,16 @@ void DebayerCpu::debayer8_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
> >  	}
> >  }
> >  
> > +void DebayerCpu::debayer8_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[])
> > +{
> > +	DECLARE_SRC_POINTERS(uint8_t)
> > +
> > +	for (int x = 0; x < (int)window_.width;) {
> > +		BGGR_XBGR8888(1, 1, 1)
> > +		GBRG_XBGR8888(1, 1, 1)
> > +	}
> > +}
> > +
> 
> ... and then X and non-X versions of these methods could be merged too.
> Using a template for the purpose should have no performance impact, I
> suppose.
> 
> >  void DebayerCpu::debayer8_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
> >  {
> >  	DECLARE_SRC_POINTERS(uint8_t)
> > @@ -129,6 +167,16 @@ void DebayerCpu::debayer8_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
> >  	}
> >  }
> >  
> > +void DebayerCpu::debayer8_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[])
> > +{
> > +	DECLARE_SRC_POINTERS(uint8_t)
> > +
> > +	for (int x = 0; x < (int)window_.width;) {
> > +		GRBG_XBGR8888(1, 1, 1)
> > +		RGGB_XBGR8888(1, 1, 1)
> > +	}
> > +}
> > +
> >  void DebayerCpu::debayer10_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
> >  {
> >  	DECLARE_SRC_POINTERS(uint16_t)
> > @@ -140,6 +188,17 @@ void DebayerCpu::debayer10_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
> >  	}
> >  }
> >  
> > +void DebayerCpu::debayer10_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[])
> > +{
> > +	DECLARE_SRC_POINTERS(uint16_t)
> > +
> > +	for (int x = 0; x < (int)window_.width;) {
> > +		/* divide values by 4 for 10 -> 8 bpp value */
> > +		BGGR_XBGR8888(1, 1, 4)
> > +		GBRG_XBGR8888(1, 1, 4)
> > +	}
> > +}
> > +
> >  void DebayerCpu::debayer10_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
> >  {
> >  	DECLARE_SRC_POINTERS(uint16_t)
> > @@ -151,6 +210,17 @@ void DebayerCpu::debayer10_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
> >  	}
> >  }
> >  
> > +void DebayerCpu::debayer10_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[])
> > +{
> > +	DECLARE_SRC_POINTERS(uint16_t)
> > +
> > +	for (int x = 0; x < (int)window_.width;) {
> > +		/* divide values by 4 for 10 -> 8 bpp value */
> > +		GRBG_XBGR8888(1, 1, 4)
> > +		RGGB_XBGR8888(1, 1, 4)
> > +	}
> > +}
> > +
> >  void DebayerCpu::debayer12_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
> >  {
> >  	DECLARE_SRC_POINTERS(uint16_t)
> > @@ -162,6 +232,17 @@ void DebayerCpu::debayer12_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
> >  	}
> >  }
> >  
> > +void DebayerCpu::debayer12_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[])
> > +{
> > +	DECLARE_SRC_POINTERS(uint16_t)
> > +
> > +	for (int x = 0; x < (int)window_.width;) {
> > +		/* divide values by 16 for 12 -> 8 bpp value */
> > +		BGGR_XBGR8888(1, 1, 16)
> > +		GBRG_XBGR8888(1, 1, 16)
> > +	}
> > +}
> > +
> >  void DebayerCpu::debayer12_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
> >  {
> >  	DECLARE_SRC_POINTERS(uint16_t)
> > @@ -173,6 +254,17 @@ void DebayerCpu::debayer12_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
> >  	}
> >  }
> >  
> > +void DebayerCpu::debayer12_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[])
> > +{
> > +	DECLARE_SRC_POINTERS(uint16_t)
> > +
> > +	for (int x = 0; x < (int)window_.width;) {
> > +		/* divide values by 16 for 12 -> 8 bpp value */
> > +		GRBG_XBGR8888(1, 1, 16)
> > +		RGGB_XBGR8888(1, 1, 16)
> > +	}
> > +}
> > +
> >  void DebayerCpu::debayer10P_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
> >  {
> >  	const int widthInBytes = window_.width * 5 / 4;
> > @@ -198,6 +290,31 @@ void DebayerCpu::debayer10P_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
> >  	}
> >  }
> >  
> > +void DebayerCpu::debayer10P_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[])
> > +{
> > +	const int widthInBytes = window_.width * 5 / 4;
> > +	const uint8_t *prev = src[0];
> > +	const uint8_t *curr = src[1];
> > +	const uint8_t *next = src[2];
> > +
> > +	/*
> > +	 * For the first pixel getting a pixel from the previous column uses
> > +	 * x - 2 to skip the 5th byte with least-significant bits for 4 pixels.
> > +	 * Same for last pixel (uses x + 2) and looking at the next column.
> > +	 */
> > +	for (int x = 0; x < widthInBytes;) {
> > +		/* First pixel */
> > +		BGGR_XBGR8888(2, 1, 1)
> > +		/* Second pixel BGGR -> GBRG */
> > +		GBRG_XBGR8888(1, 1, 1)
> > +		/* Same thing for third and fourth pixels */
> > +		BGGR_XBGR8888(1, 1, 1)
> > +		GBRG_XBGR8888(1, 2, 1)
> > +		/* Skip 5th src byte with 4 x 2 least-significant-bits */
> > +		x++;
> > +	}
> > +}
> > +
> >  void DebayerCpu::debayer10P_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
> >  {
> >  	const int widthInBytes = window_.width * 5 / 4;
> > @@ -218,6 +335,26 @@ void DebayerCpu::debayer10P_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
> >  	}
> >  }
> >  
> > +void DebayerCpu::debayer10P_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[])
> > +{
> > +	const int widthInBytes = window_.width * 5 / 4;
> > +	const uint8_t *prev = src[0];
> > +	const uint8_t *curr = src[1];
> > +	const uint8_t *next = src[2];
> > +
> > +	for (int x = 0; x < widthInBytes;) {
> > +		/* First pixel */
> > +		GRBG_XBGR8888(2, 1, 1)
> > +		/* Second pixel GRBG -> RGGB */
> > +		RGGB_XBGR8888(1, 1, 1)
> > +		/* Same thing for third and fourth pixels */
> > +		GRBG_XBGR8888(1, 1, 1)
> > +		RGGB_XBGR8888(1, 2, 1)
> > +		/* Skip 5th src byte with 4 x 2 least-significant-bits */
> > +		x++;
> > +	}
> > +}
> > +
> >  void DebayerCpu::debayer10P_GBGB_BGR888(uint8_t *dst, const uint8_t *src[])
> >  {
> >  	const int widthInBytes = window_.width * 5 / 4;
> > @@ -238,6 +375,26 @@ void DebayerCpu::debayer10P_GBGB_BGR888(uint8_t *dst, const uint8_t *src[])
> >  	}
> >  }
> >  
> > +void DebayerCpu::debayer10P_GBGB_XBGR8888(uint8_t *dst, const uint8_t *src[])
> > +{
> > +	const int widthInBytes = window_.width * 5 / 4;
> > +	const uint8_t *prev = src[0];
> > +	const uint8_t *curr = src[1];
> > +	const uint8_t *next = src[2];
> > +
> > +	for (int x = 0; x < widthInBytes;) {
> > +		/* Even pixel */
> > +		GBRG_XBGR8888(2, 1, 1)
> > +		/* Odd pixel GBGR -> BGGR */
> > +		BGGR_XBGR8888(1, 1, 1)
> > +		/* Same thing for next 2 pixels */
> > +		GBRG_XBGR8888(1, 1, 1)
> > +		BGGR_XBGR8888(1, 2, 1)
> > +		/* Skip 5th src byte with 4 x 2 least-significant-bits */
> > +		x++;
> > +	}
> > +}
> > +
> >  void DebayerCpu::debayer10P_RGRG_BGR888(uint8_t *dst, const uint8_t *src[])
> >  {
> >  	const int widthInBytes = window_.width * 5 / 4;
> > @@ -258,6 +415,26 @@ void DebayerCpu::debayer10P_RGRG_BGR888(uint8_t *dst, const uint8_t *src[])
> >  	}
> >  }
> >  
> > +void DebayerCpu::debayer10P_RGRG_XBGR8888(uint8_t *dst, const uint8_t *src[])
> > +{
> > +	const int widthInBytes = window_.width * 5 / 4;
> > +	const uint8_t *prev = src[0];
> > +	const uint8_t *curr = src[1];
> > +	const uint8_t *next = src[2];
> > +
> > +	for (int x = 0; x < widthInBytes;) {
> > +		/* Even pixel */
> > +		RGGB_XBGR8888(2, 1, 1)
> > +		/* Odd pixel RGGB -> GRBG */
> > +		GRBG_XBGR8888(1, 1, 1)
> > +		/* Same thing for next 2 pixels */
> > +		RGGB_XBGR8888(1, 1, 1)
> > +		GRBG_XBGR8888(1, 2, 1)
> > +		/* Skip 5th src byte with 4 x 2 least-significant-bits */
> > +		x++;
> > +	}
> > +}
> > +
> >  static bool isStandardBayerOrder(BayerFormat::Order order)
> >  {
> >  	return order == BayerFormat::BGGR || order == BayerFormat::GBRG ||
> > @@ -280,7 +457,14 @@ int DebayerCpu::getInputConfig(PixelFormat inputFormat, DebayerInputConfig &conf
> >  		config.bpp = (bayerFormat.bitDepth + 7) & ~7;
> >  		config.patternSize.width = 2;
> >  		config.patternSize.height = 2;
> > -		config.outputFormats = std::vector<PixelFormat>({ formats::RGB888, formats::BGR888 });
> > +		config.outputFormats = std::vector<PixelFormat>({
> > +			formats::RGB888,
> > +			formats::XRGB8888,
> > +			formats::ARGB8888,
> > +			formats::BGR888,
> > +			formats::XBGR8888,
> > +			formats::ABGR8888
> > +		});
> >  		return 0;
> >  	}
> >  
> > @@ -290,7 +474,14 @@ int DebayerCpu::getInputConfig(PixelFormat inputFormat, DebayerInputConfig &conf
> >  		config.bpp = 10;
> >  		config.patternSize.width = 4; /* 5 bytes per *4* pixels */
> >  		config.patternSize.height = 2;
> > -		config.outputFormats = std::vector<PixelFormat>({ formats::RGB888, formats::BGR888 });
> > +		config.outputFormats = std::vector<PixelFormat>({
> > +			formats::RGB888,
> > +			formats::XRGB8888,
> > +			formats::ARGB8888,
> > +			formats::BGR888,
> > +			formats::XBGR8888,
> > +			formats::ABGR8888
> > +		});
> >  		return 0;
> >  	}
> >  
> > @@ -306,6 +497,12 @@ int DebayerCpu::getOutputConfig(PixelFormat outputFormat, DebayerOutputConfig &c
> >  		return 0;
> >  	}
> >  
> > +	if (outputFormat == formats::XRGB8888 || outputFormat == formats::ARGB8888 ||
> > +	    outputFormat == formats::XBGR8888 || outputFormat == formats::ABGR8888) {
> > +		config.bpp = 32;
> > +		return 0;
> > +	}
> > +
> >  	LOG(Debayer, Info)
> >  		<< "Unsupported output format " << outputFormat.toString();
> >  	return -EINVAL;
> > @@ -341,6 +538,7 @@ int DebayerCpu::setDebayerFunctions(PixelFormat inputFormat, PixelFormat outputF
> >  {
> >  	BayerFormat bayerFormat =
> >  		BayerFormat::fromPixelFormat(inputFormat);
> > +	bool is_aligned = false;
> 
> camelCase should be used for variable names.
> 
> >  	xShift_ = 0;
> >  	swapRedBlueGains_ = false;
> > @@ -351,8 +549,16 @@ int DebayerCpu::setDebayerFunctions(PixelFormat inputFormat, PixelFormat outputF
> >  	};
> >  
> >  	switch (outputFormat) {
> > +	case formats::XRGB8888:
> > +	case formats::ARGB8888:
> > +	  is_aligned = true;
> > +	  [[fallthrough]];
> >  	case formats::RGB888:
> >  		break;
> > +	case formats::XBGR8888:
> > +	case formats::ABGR8888:
> > +	  is_aligned = true;
> > +	  [[fallthrough]];
> >  	case formats::BGR888:
> >  		/* Swap R and B in bayer order to generate BGR888 instead of RGB888 */
> >  		swapRedBlueGains_ = true;
> > @@ -383,16 +589,19 @@ int DebayerCpu::setDebayerFunctions(PixelFormat inputFormat, PixelFormat outputF
> >  	    isStandardBayerOrder(bayerFormat.order)) {
> >  		switch (bayerFormat.bitDepth) {
> >  		case 8:
> > -			debayer0_ = &DebayerCpu::debayer8_BGBG_BGR888;
> > -			debayer1_ = &DebayerCpu::debayer8_GRGR_BGR888;
> > +		  LOG(Debayer, Warning) << "8bit no packing";
> 
> Is this a debugging leftover or do you really mean to log something
> here?  If the latter, it should probably be Debug rather than Warning
> and the message should be improved.  The same applies to the other LOGs
> below. 
> 
> > +		  debayer0_ = is_aligned ? &DebayerCpu::debayer8_BGBG_XBGR8888 : &DebayerCpu::debayer8_BGBG_BGR888;
> 
> If the two methods were unified and extended with isAligned argument,
> I'm not sure what would be the best way to pass the argument to them.
> lambda, a template parameter or anything else?
> 
> > +		  debayer1_ = is_aligned ? &DebayerCpu::debayer8_GRGR_XBGR8888 : &DebayerCpu::debayer8_GRGR_BGR888;
> >  			break;
> >  		case 10:
> > -			debayer0_ = &DebayerCpu::debayer10_BGBG_BGR888;
> > -			debayer1_ = &DebayerCpu::debayer10_GRGR_BGR888;
> > +		  LOG(Debayer, Warning) << "10bit no packing";
> > +			debayer0_ = is_aligned ? &DebayerCpu::debayer10_BGBG_XBGR8888 : &DebayerCpu::debayer10_BGBG_BGR888;
> > +			debayer1_ = is_aligned ? &DebayerCpu::debayer10_GRGR_XBGR8888 : &DebayerCpu::debayer10_GRGR_BGR888;
> >  			break;
> >  		case 12:
> > -			debayer0_ = &DebayerCpu::debayer12_BGBG_BGR888;
> > -			debayer1_ = &DebayerCpu::debayer12_GRGR_BGR888;
> > +		  LOG(Debayer, Warning) << "12bit no packing";
> > +			debayer0_ = is_aligned ? &DebayerCpu::debayer12_BGBG_XBGR8888 : &DebayerCpu::debayer12_BGBG_BGR888;
> > +			debayer1_ = is_aligned ? &DebayerCpu::debayer12_GRGR_XBGR8888 : &DebayerCpu::debayer12_GRGR_BGR888;
> >  			break;
> >  		}
> >  		setupStandardBayerOrder(bayerFormat.order);
> > @@ -401,22 +610,23 @@ int DebayerCpu::setDebayerFunctions(PixelFormat inputFormat, PixelFormat outputF
> >  
> >  	if (bayerFormat.bitDepth == 10 &&
> >  	    bayerFormat.packing == BayerFormat::Packing::CSI2) {
> > +	  LOG(Debayer, Warning) << "10bit csi2";
> >  		switch (bayerFormat.order) {
> >  		case BayerFormat::BGGR:
> > -			debayer0_ = &DebayerCpu::debayer10P_BGBG_BGR888;
> > -			debayer1_ = &DebayerCpu::debayer10P_GRGR_BGR888;
> > +			debayer0_ = is_aligned ? &DebayerCpu::debayer10P_BGBG_XBGR8888 : &DebayerCpu::debayer10P_BGBG_BGR888;
> > +			debayer1_ = is_aligned ? &DebayerCpu::debayer10P_GRGR_XBGR8888 : &DebayerCpu::debayer10P_GRGR_BGR888;
> >  			return 0;
> >  		case BayerFormat::GBRG:
> > -			debayer0_ = &DebayerCpu::debayer10P_GBGB_BGR888;
> > -			debayer1_ = &DebayerCpu::debayer10P_RGRG_BGR888;
> > +			debayer0_ = is_aligned ? &DebayerCpu::debayer10P_GBGB_XBGR8888 : &DebayerCpu::debayer10P_GBGB_BGR888;
> > +			debayer1_ = is_aligned ? &DebayerCpu::debayer10P_RGRG_XBGR8888 : &DebayerCpu::debayer10P_RGRG_BGR888;
> >  			return 0;
> >  		case BayerFormat::GRBG:
> > -			debayer0_ = &DebayerCpu::debayer10P_GRGR_BGR888;
> > -			debayer1_ = &DebayerCpu::debayer10P_BGBG_BGR888;
> > +			debayer0_ = is_aligned ? &DebayerCpu::debayer10P_GRGR_XBGR8888 : &DebayerCpu::debayer10P_GRGR_BGR888;
> > +			debayer1_ = is_aligned ? &DebayerCpu::debayer10P_BGBG_XBGR8888 : &DebayerCpu::debayer10P_BGBG_BGR888;
> >  			return 0;
> >  		case BayerFormat::RGGB:
> > -			debayer0_ = &DebayerCpu::debayer10P_RGRG_BGR888;
> > -			debayer1_ = &DebayerCpu::debayer10P_GBGB_BGR888;
> > +			debayer0_ = is_aligned ? &DebayerCpu::debayer10P_RGRG_XBGR8888 : &DebayerCpu::debayer10P_RGRG_BGR888;
> > +			debayer1_ = is_aligned ? &DebayerCpu::debayer10P_GBGB_XBGR8888 : &DebayerCpu::debayer10P_GBGB_BGR888;
> >  			return 0;
> >  		default:
> >  			break;
> > @@ -533,6 +743,8 @@ DebayerCpu::strideAndFrameSize(const PixelFormat &outputFormat, const Size &size
> >  	/* round up to multiple of 8 for 64 bits alignment */
> >  	unsigned int stride = (size.width * config.bpp / 8 + 7) & ~7;
> >  
> > +	LOG(Debayer, Warning) << outputFormat.toString() << " " << size.width << " " << size.height << " " << config.bpp << " " << stride << " " << stride * size.height;
> > +
> >  	return std::make_tuple(stride, stride * size.height);
> >  }
> >  
> > diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h
> > index be7dcdca..c30f44aa 100644
> > --- a/src/libcamera/software_isp/debayer_cpu.h
> > +++ b/src/libcamera/software_isp/debayer_cpu.h
> > @@ -86,18 +86,28 @@ private:
> >  
> >  	/* 8-bit raw bayer format */
> >  	void debayer8_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]);
> > +	void debayer8_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[]);
> >  	void debayer8_GRGR_BGR888(uint8_t *dst, const uint8_t *src[]);
> > +	void debayer8_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[]);
> >  	/* unpacked 10-bit raw bayer format */
> >  	void debayer10_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]);
> > +	void debayer10_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[]);
> >  	void debayer10_GRGR_BGR888(uint8_t *dst, const uint8_t *src[]);
> > +	void debayer10_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[]);
> >  	/* unpacked 12-bit raw bayer format */
> >  	void debayer12_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]);
> > +	void debayer12_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[]);
> >  	void debayer12_GRGR_BGR888(uint8_t *dst, const uint8_t *src[]);
> > +	void debayer12_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[]);
> >  	/* CSI-2 packed 10-bit raw bayer format (all the 4 orders) */
> >  	void debayer10P_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]);
> > +	void debayer10P_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[]);
> >  	void debayer10P_GRGR_BGR888(uint8_t *dst, const uint8_t *src[]);
> > +	void debayer10P_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[]);
> >  	void debayer10P_GBGB_BGR888(uint8_t *dst, const uint8_t *src[]);
> > +	void debayer10P_GBGB_XBGR8888(uint8_t *dst, const uint8_t *src[]);
> >  	void debayer10P_RGRG_BGR888(uint8_t *dst, const uint8_t *src[]);
> > +	void debayer10P_RGRG_XBGR8888(uint8_t *dst, const uint8_t *src[]);
> >  
> >  	struct DebayerInputConfig {
> >  		Size patternSize;
Kieran Bingham June 12, 2024, 10:43 p.m. UTC | #3
Quoting Robert Mader (2024-06-11 12:07:01)
> In order to be more compatible with modern hardware and APIs. This
> notably allows GL implementations to directly import the buffers more
> often and seems to be required for Wayland.
> 
> Further more, as we already enforce a 8 byte stride, these formats work
> better for clients that don't support padding - such as libwebrtc at the
> time of writing.
> 
> Tested on the Librem5 and PinePhone.
> 
> Signed-off-by: Robert Mader <robert.mader@collabora.com>

There are some quite minor whitespace issues, which must be tabs/spaces
mixed up, reported in:

 - https://gitlab.freedesktop.org/camera/libcamera/-/jobs/59751572

But otherwise the CI for this is green for the compiler matrix:

 - https://gitlab.freedesktop.org/camera/libcamera/-/pipelines/1198765

I agree, there's a lot of duplication here, but I think ... I can also
add this already:

Tested-by: Kieran Bingham <kieran.bingham@ideasonboard.com>

As I've used this on the x13s with meet.jit.si / firefox / pipewire.


> ---
>  src/libcamera/software_isp/debayer_cpu.cpp | 244 +++++++++++++++++++--
>  src/libcamera/software_isp/debayer_cpu.h   |  10 +
>  2 files changed, 238 insertions(+), 16 deletions(-)
> 
> diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp
> index c038eed4..73c66a88 100644
> --- a/src/libcamera/software_isp/debayer_cpu.cpp
> +++ b/src/libcamera/software_isp/debayer_cpu.cpp
> @@ -76,6 +76,13 @@ DebayerCpu::~DebayerCpu()
>         *dst++ = red_[(prev[x - p] + prev[x + n] + next[x - p] + next[x + n]) / (4 * (div))]; \
>         x++;
>  
> +#define BGGR_XBGR8888(p, n, div)                                                              \
> +       *dst++ = blue_[curr[x] / (div)];                                                      \
> +       *dst++ = green_[(prev[x] + curr[x - p] + curr[x + n] + next[x]) / (4 * (div))];       \
> +       *dst++ = red_[(prev[x - p] + prev[x + n] + next[x - p] + next[x + n]) / (4 * (div))]; \
> +       *dst++ = 255;                                                                         \
> +       x++;
> +
>  /*
>   * GBG
>   * RGR
> @@ -87,6 +94,13 @@ DebayerCpu::~DebayerCpu()
>         *dst++ = red_[(curr[x - p] + curr[x + n]) / (2 * (div))]; \
>         x++;
>  
> +#define GRBG_XBGR8888(p, n, div)                                  \
> +       *dst++ = blue_[(prev[x] + next[x]) / (2 * (div))];        \
> +       *dst++ = green_[curr[x] / (div)];                         \
> +       *dst++ = red_[(curr[x - p] + curr[x + n]) / (2 * (div))]; \
> +       *dst++ = 255;                                             \
> +       x++;
> +
>  /*
>   * GRG
>   * BGB
> @@ -98,6 +112,13 @@ DebayerCpu::~DebayerCpu()
>         *dst++ = red_[(prev[x] + next[x]) / (2 * (div))];          \
>         x++;
>  
> +#define GBRG_XBGR8888(p, n, div)                                   \
> +       *dst++ = blue_[(curr[x - p] + curr[x + n]) / (2 * (div))]; \
> +       *dst++ = green_[curr[x] / (div)];                          \
> +       *dst++ = red_[(prev[x] + next[x]) / (2 * (div))];          \
> +       *dst++ = 255;                                              \
> +       x++;
> +
>  /*
>   * BGB
>   * GRG
> @@ -109,6 +130,13 @@ DebayerCpu::~DebayerCpu()
>         *dst++ = red_[curr[x] / (div)];                                                        \
>         x++;
>  
> +#define RGGB_XBGR8888(p, n, div)                                                               \
> +       *dst++ = blue_[(prev[x - p] + prev[x + n] + next[x - p] + next[x + n]) / (4 * (div))]; \
> +       *dst++ = green_[(prev[x] + curr[x - p] + curr[x + n] + next[x]) / (4 * (div))];        \
> +       *dst++ = red_[curr[x] / (div)];                                                        \
> +       *dst++ = 255;                                                                          \
> +       x++;
> +
>  void DebayerCpu::debayer8_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
>  {
>         DECLARE_SRC_POINTERS(uint8_t)
> @@ -119,6 +147,16 @@ void DebayerCpu::debayer8_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
>         }
>  }
>  
> +void DebayerCpu::debayer8_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[])
> +{
> +       DECLARE_SRC_POINTERS(uint8_t)
> +
> +       for (int x = 0; x < (int)window_.width;) {
> +               BGGR_XBGR8888(1, 1, 1)
> +               GBRG_XBGR8888(1, 1, 1)
> +       }
> +}
> +
>  void DebayerCpu::debayer8_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
>  {
>         DECLARE_SRC_POINTERS(uint8_t)
> @@ -129,6 +167,16 @@ void DebayerCpu::debayer8_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
>         }
>  }
>  
> +void DebayerCpu::debayer8_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[])
> +{
> +       DECLARE_SRC_POINTERS(uint8_t)
> +
> +       for (int x = 0; x < (int)window_.width;) {
> +               GRBG_XBGR8888(1, 1, 1)
> +               RGGB_XBGR8888(1, 1, 1)
> +       }
> +}
> +
>  void DebayerCpu::debayer10_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
>  {
>         DECLARE_SRC_POINTERS(uint16_t)
> @@ -140,6 +188,17 @@ void DebayerCpu::debayer10_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
>         }
>  }
>  
> +void DebayerCpu::debayer10_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[])
> +{
> +       DECLARE_SRC_POINTERS(uint16_t)
> +
> +       for (int x = 0; x < (int)window_.width;) {
> +               /* divide values by 4 for 10 -> 8 bpp value */
> +               BGGR_XBGR8888(1, 1, 4)
> +               GBRG_XBGR8888(1, 1, 4)
> +       }
> +}
> +
>  void DebayerCpu::debayer10_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
>  {
>         DECLARE_SRC_POINTERS(uint16_t)
> @@ -151,6 +210,17 @@ void DebayerCpu::debayer10_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
>         }
>  }
>  
> +void DebayerCpu::debayer10_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[])
> +{
> +       DECLARE_SRC_POINTERS(uint16_t)
> +
> +       for (int x = 0; x < (int)window_.width;) {
> +               /* divide values by 4 for 10 -> 8 bpp value */
> +               GRBG_XBGR8888(1, 1, 4)
> +               RGGB_XBGR8888(1, 1, 4)
> +       }
> +}
> +
>  void DebayerCpu::debayer12_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
>  {
>         DECLARE_SRC_POINTERS(uint16_t)
> @@ -162,6 +232,17 @@ void DebayerCpu::debayer12_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
>         }
>  }
>  
> +void DebayerCpu::debayer12_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[])
> +{
> +       DECLARE_SRC_POINTERS(uint16_t)
> +
> +       for (int x = 0; x < (int)window_.width;) {
> +               /* divide values by 16 for 12 -> 8 bpp value */
> +               BGGR_XBGR8888(1, 1, 16)
> +               GBRG_XBGR8888(1, 1, 16)
> +       }
> +}
> +
>  void DebayerCpu::debayer12_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
>  {
>         DECLARE_SRC_POINTERS(uint16_t)
> @@ -173,6 +254,17 @@ void DebayerCpu::debayer12_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
>         }
>  }
>  
> +void DebayerCpu::debayer12_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[])
> +{
> +       DECLARE_SRC_POINTERS(uint16_t)
> +
> +       for (int x = 0; x < (int)window_.width;) {
> +               /* divide values by 16 for 12 -> 8 bpp value */
> +               GRBG_XBGR8888(1, 1, 16)
> +               RGGB_XBGR8888(1, 1, 16)
> +       }
> +}
> +
>  void DebayerCpu::debayer10P_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
>  {
>         const int widthInBytes = window_.width * 5 / 4;
> @@ -198,6 +290,31 @@ void DebayerCpu::debayer10P_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
>         }
>  }
>  
> +void DebayerCpu::debayer10P_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[])
> +{
> +       const int widthInBytes = window_.width * 5 / 4;
> +       const uint8_t *prev = src[0];
> +       const uint8_t *curr = src[1];
> +       const uint8_t *next = src[2];
> +
> +       /*
> +        * For the first pixel getting a pixel from the previous column uses
> +        * x - 2 to skip the 5th byte with least-significant bits for 4 pixels.
> +        * Same for last pixel (uses x + 2) and looking at the next column.
> +        */
> +       for (int x = 0; x < widthInBytes;) {
> +               /* First pixel */
> +               BGGR_XBGR8888(2, 1, 1)
> +               /* Second pixel BGGR -> GBRG */
> +               GBRG_XBGR8888(1, 1, 1)
> +               /* Same thing for third and fourth pixels */
> +               BGGR_XBGR8888(1, 1, 1)
> +               GBRG_XBGR8888(1, 2, 1)
> +               /* Skip 5th src byte with 4 x 2 least-significant-bits */
> +               x++;
> +       }
> +}
> +
>  void DebayerCpu::debayer10P_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
>  {
>         const int widthInBytes = window_.width * 5 / 4;
> @@ -218,6 +335,26 @@ void DebayerCpu::debayer10P_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
>         }
>  }
>  
> +void DebayerCpu::debayer10P_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[])
> +{
> +       const int widthInBytes = window_.width * 5 / 4;
> +       const uint8_t *prev = src[0];
> +       const uint8_t *curr = src[1];
> +       const uint8_t *next = src[2];
> +
> +       for (int x = 0; x < widthInBytes;) {
> +               /* First pixel */
> +               GRBG_XBGR8888(2, 1, 1)
> +               /* Second pixel GRBG -> RGGB */
> +               RGGB_XBGR8888(1, 1, 1)
> +               /* Same thing for third and fourth pixels */
> +               GRBG_XBGR8888(1, 1, 1)
> +               RGGB_XBGR8888(1, 2, 1)
> +               /* Skip 5th src byte with 4 x 2 least-significant-bits */
> +               x++;
> +       }
> +}
> +
>  void DebayerCpu::debayer10P_GBGB_BGR888(uint8_t *dst, const uint8_t *src[])
>  {
>         const int widthInBytes = window_.width * 5 / 4;
> @@ -238,6 +375,26 @@ void DebayerCpu::debayer10P_GBGB_BGR888(uint8_t *dst, const uint8_t *src[])
>         }
>  }
>  
> +void DebayerCpu::debayer10P_GBGB_XBGR8888(uint8_t *dst, const uint8_t *src[])
> +{
> +       const int widthInBytes = window_.width * 5 / 4;
> +       const uint8_t *prev = src[0];
> +       const uint8_t *curr = src[1];
> +       const uint8_t *next = src[2];
> +
> +       for (int x = 0; x < widthInBytes;) {
> +               /* Even pixel */
> +               GBRG_XBGR8888(2, 1, 1)
> +               /* Odd pixel GBGR -> BGGR */
> +               BGGR_XBGR8888(1, 1, 1)
> +               /* Same thing for next 2 pixels */
> +               GBRG_XBGR8888(1, 1, 1)
> +               BGGR_XBGR8888(1, 2, 1)
> +               /* Skip 5th src byte with 4 x 2 least-significant-bits */
> +               x++;
> +       }
> +}
> +
>  void DebayerCpu::debayer10P_RGRG_BGR888(uint8_t *dst, const uint8_t *src[])
>  {
>         const int widthInBytes = window_.width * 5 / 4;
> @@ -258,6 +415,26 @@ void DebayerCpu::debayer10P_RGRG_BGR888(uint8_t *dst, const uint8_t *src[])
>         }
>  }
>  
> +void DebayerCpu::debayer10P_RGRG_XBGR8888(uint8_t *dst, const uint8_t *src[])
> +{
> +       const int widthInBytes = window_.width * 5 / 4;
> +       const uint8_t *prev = src[0];
> +       const uint8_t *curr = src[1];
> +       const uint8_t *next = src[2];
> +
> +       for (int x = 0; x < widthInBytes;) {
> +               /* Even pixel */
> +               RGGB_XBGR8888(2, 1, 1)
> +               /* Odd pixel RGGB -> GRBG */
> +               GRBG_XBGR8888(1, 1, 1)
> +               /* Same thing for next 2 pixels */
> +               RGGB_XBGR8888(1, 1, 1)
> +               GRBG_XBGR8888(1, 2, 1)
> +               /* Skip 5th src byte with 4 x 2 least-significant-bits */
> +               x++;
> +       }
> +}
> +
>  static bool isStandardBayerOrder(BayerFormat::Order order)
>  {
>         return order == BayerFormat::BGGR || order == BayerFormat::GBRG ||
> @@ -280,7 +457,14 @@ int DebayerCpu::getInputConfig(PixelFormat inputFormat, DebayerInputConfig &conf
>                 config.bpp = (bayerFormat.bitDepth + 7) & ~7;
>                 config.patternSize.width = 2;
>                 config.patternSize.height = 2;
> -               config.outputFormats = std::vector<PixelFormat>({ formats::RGB888, formats::BGR888 });
> +               config.outputFormats = std::vector<PixelFormat>({
> +                       formats::RGB888,
> +                       formats::XRGB8888,
> +                       formats::ARGB8888,
> +                       formats::BGR888,
> +                       formats::XBGR8888,
> +                       formats::ABGR8888
> +               });
>                 return 0;
>         }
>  
> @@ -290,7 +474,14 @@ int DebayerCpu::getInputConfig(PixelFormat inputFormat, DebayerInputConfig &conf
>                 config.bpp = 10;
>                 config.patternSize.width = 4; /* 5 bytes per *4* pixels */
>                 config.patternSize.height = 2;
> -               config.outputFormats = std::vector<PixelFormat>({ formats::RGB888, formats::BGR888 });
> +               config.outputFormats = std::vector<PixelFormat>({
> +                       formats::RGB888,
> +                       formats::XRGB8888,
> +                       formats::ARGB8888,
> +                       formats::BGR888,
> +                       formats::XBGR8888,
> +                       formats::ABGR8888
> +               });
>                 return 0;
>         }
>  
> @@ -306,6 +497,12 @@ int DebayerCpu::getOutputConfig(PixelFormat outputFormat, DebayerOutputConfig &c
>                 return 0;
>         }
>  
> +       if (outputFormat == formats::XRGB8888 || outputFormat == formats::ARGB8888 ||
> +           outputFormat == formats::XBGR8888 || outputFormat == formats::ABGR8888) {
> +               config.bpp = 32;
> +               return 0;
> +       }
> +
>         LOG(Debayer, Info)
>                 << "Unsupported output format " << outputFormat.toString();
>         return -EINVAL;
> @@ -341,6 +538,7 @@ int DebayerCpu::setDebayerFunctions(PixelFormat inputFormat, PixelFormat outputF
>  {
>         BayerFormat bayerFormat =
>                 BayerFormat::fromPixelFormat(inputFormat);
> +       bool is_aligned = false;

Why are we calling this 'is_aligned' rather than just differentiating on
the bits per pixel? I don't think the alignment is the key thing here -
it's producing a different output pixel format! (which ... may be more
desirable for 32 bit alignments?)

Ultimately - this code path is going to be more and more tricky the more
input and output formats we add ...

>  
>         xShift_ = 0;
>         swapRedBlueGains_ = false;
> @@ -351,8 +549,16 @@ int DebayerCpu::setDebayerFunctions(PixelFormat inputFormat, PixelFormat outputF
>         };
>  
>         switch (outputFormat) {
> +       case formats::XRGB8888:
> +       case formats::ARGB8888:
> +         is_aligned = true;
> +         [[fallthrough]];
>         case formats::RGB888:
>                 break;
> +       case formats::XBGR8888:
> +       case formats::ABGR8888:
> +         is_aligned = true;
> +         [[fallthrough]];
>         case formats::BGR888:
>                 /* Swap R and B in bayer order to generate BGR888 instead of RGB888 */
>                 swapRedBlueGains_ = true;
> @@ -383,16 +589,19 @@ int DebayerCpu::setDebayerFunctions(PixelFormat inputFormat, PixelFormat outputF
>             isStandardBayerOrder(bayerFormat.order)) {
>                 switch (bayerFormat.bitDepth) {
>                 case 8:
> -                       debayer0_ = &DebayerCpu::debayer8_BGBG_BGR888;
> -                       debayer1_ = &DebayerCpu::debayer8_GRGR_BGR888;
> +                 LOG(Debayer, Warning) << "8bit no packing";
> +                 debayer0_ = is_aligned ? &DebayerCpu::debayer8_BGBG_XBGR8888 : &DebayerCpu::debayer8_BGBG_BGR888;
> +                 debayer1_ = is_aligned ? &DebayerCpu::debayer8_GRGR_XBGR8888 : &DebayerCpu::debayer8_GRGR_BGR888;

Whitespace issues here as reported by the linter.

>                         break;
>                 case 10:
> -                       debayer0_ = &DebayerCpu::debayer10_BGBG_BGR888;
> -                       debayer1_ = &DebayerCpu::debayer10_GRGR_BGR888;
> +                 LOG(Debayer, Warning) << "10bit no packing";

and here..

But we shouldn't merge code that adds these 'Warning's. Perhaps debug
but likely we should just remove the debug prints here.

> +                       debayer0_ = is_aligned ? &DebayerCpu::debayer10_BGBG_XBGR8888 : &DebayerCpu::debayer10_BGBG_BGR888;
> +                       debayer1_ = is_aligned ? &DebayerCpu::debayer10_GRGR_XBGR8888 : &DebayerCpu::debayer10_GRGR_BGR888;
>                         break;
>                 case 12:
> -                       debayer0_ = &DebayerCpu::debayer12_BGBG_BGR888;
> -                       debayer1_ = &DebayerCpu::debayer12_GRGR_BGR888;
> +                 LOG(Debayer, Warning) << "12bit no packing";
> +                       debayer0_ = is_aligned ? &DebayerCpu::debayer12_BGBG_XBGR8888 : &DebayerCpu::debayer12_BGBG_BGR888;
> +                       debayer1_ = is_aligned ? &DebayerCpu::debayer12_GRGR_XBGR8888 : &DebayerCpu::debayer12_GRGR_BGR888;

>                         break;
>                 }
>                 setupStandardBayerOrder(bayerFormat.order);
> @@ -401,22 +610,23 @@ int DebayerCpu::setDebayerFunctions(PixelFormat inputFormat, PixelFormat outputF
>  
>         if (bayerFormat.bitDepth == 10 &&
>             bayerFormat.packing == BayerFormat::Packing::CSI2) {
> +         LOG(Debayer, Warning) << "10bit csi2";
>                 switch (bayerFormat.order) {
>                 case BayerFormat::BGGR:
> -                       debayer0_ = &DebayerCpu::debayer10P_BGBG_BGR888;
> -                       debayer1_ = &DebayerCpu::debayer10P_GRGR_BGR888;
> +                       debayer0_ = is_aligned ? &DebayerCpu::debayer10P_BGBG_XBGR8888 : &DebayerCpu::debayer10P_BGBG_BGR888;
> +                       debayer1_ = is_aligned ? &DebayerCpu::debayer10P_GRGR_XBGR8888 : &DebayerCpu::debayer10P_GRGR_BGR888;
>                         return 0;
>                 case BayerFormat::GBRG:
> -                       debayer0_ = &DebayerCpu::debayer10P_GBGB_BGR888;
> -                       debayer1_ = &DebayerCpu::debayer10P_RGRG_BGR888;
> +                       debayer0_ = is_aligned ? &DebayerCpu::debayer10P_GBGB_XBGR8888 : &DebayerCpu::debayer10P_GBGB_BGR888;
> +                       debayer1_ = is_aligned ? &DebayerCpu::debayer10P_RGRG_XBGR8888 : &DebayerCpu::debayer10P_RGRG_BGR888;
>                         return 0;
>                 case BayerFormat::GRBG:
> -                       debayer0_ = &DebayerCpu::debayer10P_GRGR_BGR888;
> -                       debayer1_ = &DebayerCpu::debayer10P_BGBG_BGR888;
> +                       debayer0_ = is_aligned ? &DebayerCpu::debayer10P_GRGR_XBGR8888 : &DebayerCpu::debayer10P_GRGR_BGR888;
> +                       debayer1_ = is_aligned ? &DebayerCpu::debayer10P_BGBG_XBGR8888 : &DebayerCpu::debayer10P_BGBG_BGR888;
>                         return 0;
>                 case BayerFormat::RGGB:
> -                       debayer0_ = &DebayerCpu::debayer10P_RGRG_BGR888;
> -                       debayer1_ = &DebayerCpu::debayer10P_GBGB_BGR888;
> +                       debayer0_ = is_aligned ? &DebayerCpu::debayer10P_RGRG_XBGR8888 : &DebayerCpu::debayer10P_RGRG_BGR888;
> +                       debayer1_ = is_aligned ? &DebayerCpu::debayer10P_GBGB_XBGR8888 : &DebayerCpu::debayer10P_GBGB_BGR888;
>                         return 0;
>                 default:
>                         break;
> @@ -533,6 +743,8 @@ DebayerCpu::strideAndFrameSize(const PixelFormat &outputFormat, const Size &size
>         /* round up to multiple of 8 for 64 bits alignment */
>         unsigned int stride = (size.width * config.bpp / 8 + 7) & ~7;
>  
> +       LOG(Debayer, Warning) << outputFormat.toString() << " " << size.width << " " << size.height << " " << config.bpp << " " << stride << " " << stride * size.height;
> +
>         return std::make_tuple(stride, stride * size.height);
>  }
>  
> diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h
> index be7dcdca..c30f44aa 100644
> --- a/src/libcamera/software_isp/debayer_cpu.h
> +++ b/src/libcamera/software_isp/debayer_cpu.h
> @@ -86,18 +86,28 @@ private:
>  
>         /* 8-bit raw bayer format */
>         void debayer8_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]);
> +       void debayer8_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[]);
>         void debayer8_GRGR_BGR888(uint8_t *dst, const uint8_t *src[]);
> +       void debayer8_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[]);
>         /* unpacked 10-bit raw bayer format */
>         void debayer10_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]);
> +       void debayer10_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[]);
>         void debayer10_GRGR_BGR888(uint8_t *dst, const uint8_t *src[]);
> +       void debayer10_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[]);
>         /* unpacked 12-bit raw bayer format */
>         void debayer12_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]);
> +       void debayer12_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[]);
>         void debayer12_GRGR_BGR888(uint8_t *dst, const uint8_t *src[]);
> +       void debayer12_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[]);
>         /* CSI-2 packed 10-bit raw bayer format (all the 4 orders) */
>         void debayer10P_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]);
> +       void debayer10P_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[]);
>         void debayer10P_GRGR_BGR888(uint8_t *dst, const uint8_t *src[]);
> +       void debayer10P_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[]);
>         void debayer10P_GBGB_BGR888(uint8_t *dst, const uint8_t *src[]);
> +       void debayer10P_GBGB_XBGR8888(uint8_t *dst, const uint8_t *src[]);
>         void debayer10P_RGRG_BGR888(uint8_t *dst, const uint8_t *src[]);
> +       void debayer10P_RGRG_XBGR8888(uint8_t *dst, const uint8_t *src[]);
>  
>         struct DebayerInputConfig {
>                 Size patternSize;
> -- 
> 2.45.2
>
Robert Mader June 17, 2024, 5:41 p.m. UTC | #4
Hi!

On 11.06.24 16:29, Milan Zamazal wrote:
> Hi Robert,
>
> thank you for the patch.
>
> Robert Mader<robert.mader@collabora.com>  writes:
>
>> In order to be more compatible with modern hardware and APIs. This
>> notably allows GL implementations to directly import the buffers more
>> often and seems to be required for Wayland.
>>
>> Further more, as we already enforce a 8 byte stride, these formats work
>> better for clients that don't support padding - such as libwebrtc at the
>> time of writing.
>>
>> Tested on the Librem5 and PinePhone.
>>
>> Signed-off-by: Robert Mader<robert.mader@collabora.com>
>> ---
>>   src/libcamera/software_isp/debayer_cpu.cpp | 244 +++++++++++++++++++--
>>   src/libcamera/software_isp/debayer_cpu.h   |  10 +
>>   2 files changed, 238 insertions(+), 16 deletions(-)
>>
>> diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp
>> index c038eed4..73c66a88 100644
>> --- a/src/libcamera/software_isp/debayer_cpu.cpp
>> +++ b/src/libcamera/software_isp/debayer_cpu.cpp
>> @@ -76,6 +76,13 @@ DebayerCpu::~DebayerCpu()
>>   	*dst++ = red_[(prev[x - p] + prev[x + n] + next[x - p] + next[x + n]) / (4 * (div))]; \
>>   	x++;
>>   
>> +#define BGGR_XBGR8888(p, n, div)                                                              \
>> +	*dst++ = blue_[curr[x] / (div)];                                                      \
>> +	*dst++ = green_[(prev[x] + curr[x - p] + curr[x + n] + next[x]) / (4 * (div))];       \
>> +	*dst++ = red_[(prev[x - p] + prev[x + n] + next[x - p] + next[x + n]) / (4 * (div))]; \
>> +	*dst++ = 255;                                                                         \
>> +	x++;
>> +
> The level of code duplication here starts to exceed reasonable limits.
> Maybe adding an argument to the macro deciding whether to append the
> last `dst' assignment or not would have a significant performance
> impact.  But even then I guess there must be a way to let the compiler
> generate the duplicate code (an inline function template?) rather than
> doing it manually.  What do the C++ experts around think?
>
> If nothing better is possible then `*dst++ = 255' can be applied in the
> callers rather than defining the alternative macro versions.
Yeah, this was exactly what I hoped to get feedback about. I'm going with the
simple solution of adding an additional variable in v2, assuming that
the performance impact will be small with modern branch prediction; I did
not see one when quickly testing.
>>   /*
>>    * GBG
>>    * RGR
>> @@ -87,6 +94,13 @@ DebayerCpu::~DebayerCpu()
>>   	*dst++ = red_[(curr[x - p] + curr[x + n]) / (2 * (div))]; \
>>   	x++;
>>   
>> +#define GRBG_XBGR8888(p, n, div)                                  \
>> +	*dst++ = blue_[(prev[x] + next[x]) / (2 * (div))];        \
>> +	*dst++ = green_[curr[x] / (div)];                         \
>> +	*dst++ = red_[(curr[x - p] + curr[x + n]) / (2 * (div))]; \
>> +	*dst++ = 255;                                             \
>> +	x++;
>> +
>>   /*
>>    * GRG
>>    * BGB
>> @@ -98,6 +112,13 @@ DebayerCpu::~DebayerCpu()
>>   	*dst++ = red_[(prev[x] + next[x]) / (2 * (div))];          \
>>   	x++;
>>   
>> +#define GBRG_XBGR8888(p, n, div)                                   \
>> +	*dst++ = blue_[(curr[x - p] + curr[x + n]) / (2 * (div))]; \
>> +	*dst++ = green_[curr[x] / (div)];                          \
>> +	*dst++ = red_[(prev[x] + next[x]) / (2 * (div))];          \
>> +	*dst++ = 255;                                              \
>> +	x++;
>> +
>>   /*
>>    * BGB
>>    * GRG
>> @@ -109,6 +130,13 @@ DebayerCpu::~DebayerCpu()
>>   	*dst++ = red_[curr[x] / (div)];                                                        \
>>   	x++;
>>   
>> +#define RGGB_XBGR8888(p, n, div)                                                               \
>> +	*dst++ = blue_[(prev[x - p] + prev[x + n] + next[x - p] + next[x + n]) / (4 * (div))]; \
>> +	*dst++ = green_[(prev[x] + curr[x - p] + curr[x + n] + next[x]) / (4 * (div))];        \
>> +	*dst++ = red_[curr[x] / (div)];                                                        \
>> +	*dst++ = 255;                                                                          \
>> +	x++;
>> +
>>   void DebayerCpu::debayer8_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
>>   {
>>   	DECLARE_SRC_POINTERS(uint8_t)
>> @@ -119,6 +147,16 @@ void DebayerCpu::debayer8_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
>>   	}
>>   }
>>   
>> +void DebayerCpu::debayer8_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[])
>> +{
>> +	DECLARE_SRC_POINTERS(uint8_t)
>> +
>> +	for (int x = 0; x < (int)window_.width;) {
>> +		BGGR_XBGR8888(1, 1, 1)
>> +		GBRG_XBGR8888(1, 1, 1)
>> +	}
>> +}
>> +
> ... and then X and non-X versions of these methods could be merged too.
> Using a template for the purpose should have no performance impact, I
> suppose.
>
>>   void DebayerCpu::debayer8_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
>>   {
>>   	DECLARE_SRC_POINTERS(uint8_t)
>> @@ -129,6 +167,16 @@ void DebayerCpu::debayer8_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
>>   	}
>>   }
>>   
>> +void DebayerCpu::debayer8_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[])
>> +{
>> +	DECLARE_SRC_POINTERS(uint8_t)
>> +
>> +	for (int x = 0; x < (int)window_.width;) {
>> +		GRBG_XBGR8888(1, 1, 1)
>> +		RGGB_XBGR8888(1, 1, 1)
>> +	}
>> +}
>> +
>>   void DebayerCpu::debayer10_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
>>   {
>>   	DECLARE_SRC_POINTERS(uint16_t)
>> @@ -140,6 +188,17 @@ void DebayerCpu::debayer10_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
>>   	}
>>   }
>>   
>> +void DebayerCpu::debayer10_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[])
>> +{
>> +	DECLARE_SRC_POINTERS(uint16_t)
>> +
>> +	for (int x = 0; x < (int)window_.width;) {
>> +		/* divide values by 4 for 10 -> 8 bpp value */
>> +		BGGR_XBGR8888(1, 1, 4)
>> +		GBRG_XBGR8888(1, 1, 4)
>> +	}
>> +}
>> +
>>   void DebayerCpu::debayer10_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
>>   {
>>   	DECLARE_SRC_POINTERS(uint16_t)
>> @@ -151,6 +210,17 @@ void DebayerCpu::debayer10_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
>>   	}
>>   }
>>   
>> +void DebayerCpu::debayer10_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[])
>> +{
>> +	DECLARE_SRC_POINTERS(uint16_t)
>> +
>> +	for (int x = 0; x < (int)window_.width;) {
>> +		/* divide values by 4 for 10 -> 8 bpp value */
>> +		GRBG_XBGR8888(1, 1, 4)
>> +		RGGB_XBGR8888(1, 1, 4)
>> +	}
>> +}
>> +
>>   void DebayerCpu::debayer12_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
>>   {
>>   	DECLARE_SRC_POINTERS(uint16_t)
>> @@ -162,6 +232,17 @@ void DebayerCpu::debayer12_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
>>   	}
>>   }
>>   
>> +void DebayerCpu::debayer12_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[])
>> +{
>> +	DECLARE_SRC_POINTERS(uint16_t)
>> +
>> +	for (int x = 0; x < (int)window_.width;) {
>> +		/* divide values by 16 for 12 -> 8 bpp value */
>> +		BGGR_XBGR8888(1, 1, 16)
>> +		GBRG_XBGR8888(1, 1, 16)
>> +	}
>> +}
>> +
>>   void DebayerCpu::debayer12_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
>>   {
>>   	DECLARE_SRC_POINTERS(uint16_t)
>> @@ -173,6 +254,17 @@ void DebayerCpu::debayer12_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
>>   	}
>>   }
>>   
>> +void DebayerCpu::debayer12_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[])
>> +{
>> +	DECLARE_SRC_POINTERS(uint16_t)
>> +
>> +	for (int x = 0; x < (int)window_.width;) {
>> +		/* divide values by 16 for 12 -> 8 bpp value */
>> +		GRBG_XBGR8888(1, 1, 16)
>> +		RGGB_XBGR8888(1, 1, 16)
>> +	}
>> +}
>> +
>>   void DebayerCpu::debayer10P_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
>>   {
>>   	const int widthInBytes = window_.width * 5 / 4;
>> @@ -198,6 +290,31 @@ void DebayerCpu::debayer10P_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
>>   	}
>>   }
>>   
>> +void DebayerCpu::debayer10P_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[])
>> +{
>> +	const int widthInBytes = window_.width * 5 / 4;
>> +	const uint8_t *prev = src[0];
>> +	const uint8_t *curr = src[1];
>> +	const uint8_t *next = src[2];
>> +
>> +	/*
>> +	 * For the first pixel getting a pixel from the previous column uses
>> +	 * x - 2 to skip the 5th byte with least-significant bits for 4 pixels.
>> +	 * Same for last pixel (uses x + 2) and looking at the next column.
>> +	 */
>> +	for (int x = 0; x < widthInBytes;) {
>> +		/* First pixel */
>> +		BGGR_XBGR8888(2, 1, 1)
>> +		/* Second pixel BGGR -> GBRG */
>> +		GBRG_XBGR8888(1, 1, 1)
>> +		/* Same thing for third and fourth pixels */
>> +		BGGR_XBGR8888(1, 1, 1)
>> +		GBRG_XBGR8888(1, 2, 1)
>> +		/* Skip 5th src byte with 4 x 2 least-significant-bits */
>> +		x++;
>> +	}
>> +}
>> +
>>   void DebayerCpu::debayer10P_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
>>   {
>>   	const int widthInBytes = window_.width * 5 / 4;
>> @@ -218,6 +335,26 @@ void DebayerCpu::debayer10P_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
>>   	}
>>   }
>>   
>> +void DebayerCpu::debayer10P_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[])
>> +{
>> +	const int widthInBytes = window_.width * 5 / 4;
>> +	const uint8_t *prev = src[0];
>> +	const uint8_t *curr = src[1];
>> +	const uint8_t *next = src[2];
>> +
>> +	for (int x = 0; x < widthInBytes;) {
>> +		/* First pixel */
>> +		GRBG_XBGR8888(2, 1, 1)
>> +		/* Second pixel GRBG -> RGGB */
>> +		RGGB_XBGR8888(1, 1, 1)
>> +		/* Same thing for third and fourth pixels */
>> +		GRBG_XBGR8888(1, 1, 1)
>> +		RGGB_XBGR8888(1, 2, 1)
>> +		/* Skip 5th src byte with 4 x 2 least-significant-bits */
>> +		x++;
>> +	}
>> +}
>> +
>>   void DebayerCpu::debayer10P_GBGB_BGR888(uint8_t *dst, const uint8_t *src[])
>>   {
>>   	const int widthInBytes = window_.width * 5 / 4;
>> @@ -238,6 +375,26 @@ void DebayerCpu::debayer10P_GBGB_BGR888(uint8_t *dst, const uint8_t *src[])
>>   	}
>>   }
>>   
>> +void DebayerCpu::debayer10P_GBGB_XBGR8888(uint8_t *dst, const uint8_t *src[])
>> +{
>> +	const int widthInBytes = window_.width * 5 / 4;
>> +	const uint8_t *prev = src[0];
>> +	const uint8_t *curr = src[1];
>> +	const uint8_t *next = src[2];
>> +
>> +	for (int x = 0; x < widthInBytes;) {
>> +		/* Even pixel */
>> +		GBRG_XBGR8888(2, 1, 1)
>> +		/* Odd pixel GBRG -> BGGR */
>> +		BGGR_XBGR8888(1, 1, 1)
>> +		/* Same thing for next 2 pixels */
>> +		GBRG_XBGR8888(1, 1, 1)
>> +		BGGR_XBGR8888(1, 2, 1)
>> +		/* Skip 5th src byte with 4 x 2 least-significant-bits */
>> +		x++;
>> +	}
>> +}
>> +
>>   void DebayerCpu::debayer10P_RGRG_BGR888(uint8_t *dst, const uint8_t *src[])
>>   {
>>   	const int widthInBytes = window_.width * 5 / 4;
>> @@ -258,6 +415,26 @@ void DebayerCpu::debayer10P_RGRG_BGR888(uint8_t *dst, const uint8_t *src[])
>>   	}
>>   }
>>   
>> +void DebayerCpu::debayer10P_RGRG_XBGR8888(uint8_t *dst, const uint8_t *src[])
>> +{
>> +	const int widthInBytes = window_.width * 5 / 4;
>> +	const uint8_t *prev = src[0];
>> +	const uint8_t *curr = src[1];
>> +	const uint8_t *next = src[2];
>> +
>> +	for (int x = 0; x < widthInBytes;) {
>> +		/* Even pixel */
>> +		RGGB_XBGR8888(2, 1, 1)
>> +		/* Odd pixel RGGB -> GRBG */
>> +		GRBG_XBGR8888(1, 1, 1)
>> +		/* Same thing for next 2 pixels */
>> +		RGGB_XBGR8888(1, 1, 1)
>> +		GRBG_XBGR8888(1, 2, 1)
>> +		/* Skip 5th src byte with 4 x 2 least-significant-bits */
>> +		x++;
>> +	}
>> +}
>> +
>>   static bool isStandardBayerOrder(BayerFormat::Order order)
>>   {
>>   	return order == BayerFormat::BGGR || order == BayerFormat::GBRG ||
>> @@ -280,7 +457,14 @@ int DebayerCpu::getInputConfig(PixelFormat inputFormat, DebayerInputConfig &conf
>>   		config.bpp = (bayerFormat.bitDepth + 7) & ~7;
>>   		config.patternSize.width = 2;
>>   		config.patternSize.height = 2;
>> -		config.outputFormats = std::vector<PixelFormat>({ formats::RGB888, formats::BGR888 });
>> +		config.outputFormats = std::vector<PixelFormat>({
>> +			formats::RGB888,
>> +			formats::XRGB8888,
>> +			formats::ARGB8888,
>> +			formats::BGR888,
>> +			formats::XBGR8888,
>> +			formats::ABGR8888
>> +		});
>>   		return 0;
>>   	}
>>   
>> @@ -290,7 +474,14 @@ int DebayerCpu::getInputConfig(PixelFormat inputFormat, DebayerInputConfig &conf
>>   		config.bpp = 10;
>>   		config.patternSize.width = 4; /* 5 bytes per *4* pixels */
>>   		config.patternSize.height = 2;
>> -		config.outputFormats = std::vector<PixelFormat>({ formats::RGB888, formats::BGR888 });
>> +		config.outputFormats = std::vector<PixelFormat>({
>> +			formats::RGB888,
>> +			formats::XRGB8888,
>> +			formats::ARGB8888,
>> +			formats::BGR888,
>> +			formats::XBGR8888,
>> +			formats::ABGR8888
>> +		});
>>   		return 0;
>>   	}
>>   
>> @@ -306,6 +497,12 @@ int DebayerCpu::getOutputConfig(PixelFormat outputFormat, DebayerOutputConfig &c
>>   		return 0;
>>   	}
>>   
>> +	if (outputFormat == formats::XRGB8888 || outputFormat == formats::ARGB8888 ||
>> +	    outputFormat == formats::XBGR8888 || outputFormat == formats::ABGR8888) {
>> +		config.bpp = 32;
>> +		return 0;
>> +	}
>> +
>>   	LOG(Debayer, Info)
>>   		<< "Unsupported output format " << outputFormat.toString();
>>   	return -EINVAL;
>> @@ -341,6 +538,7 @@ int DebayerCpu::setDebayerFunctions(PixelFormat inputFormat, PixelFormat outputF
>>   {
>>   	BayerFormat bayerFormat =
>>   		BayerFormat::fromPixelFormat(inputFormat);
>> +	bool is_aligned = false;
> camelCase should be used for variable names.
Right, fixed!
>
>>   	xShift_ = 0;
>>   	swapRedBlueGains_ = false;
>> @@ -351,8 +549,16 @@ int DebayerCpu::setDebayerFunctions(PixelFormat inputFormat, PixelFormat outputF
>>   	};
>>   
>>   	switch (outputFormat) {
>> +	case formats::XRGB8888:
>> +	case formats::ARGB8888:
>> +	  is_aligned = true;
>> +	  [[fallthrough]];
>>   	case formats::RGB888:
>>   		break;
>> +	case formats::XBGR8888:
>> +	case formats::ABGR8888:
>> +	  is_aligned = true;
>> +	  [[fallthrough]];
>>   	case formats::BGR888:
>>   		/* Swap R and B in bayer order to generate BGR888 instead of RGB888 */
>>   		swapRedBlueGains_ = true;
>> @@ -383,16 +589,19 @@ int DebayerCpu::setDebayerFunctions(PixelFormat inputFormat, PixelFormat outputF
>>   	    isStandardBayerOrder(bayerFormat.order)) {
>>   		switch (bayerFormat.bitDepth) {
>>   		case 8:
>> -			debayer0_ = &DebayerCpu::debayer8_BGBG_BGR888;
>> -			debayer1_ = &DebayerCpu::debayer8_GRGR_BGR888;
>> +		  LOG(Debayer, Warning) << "8bit no packing";
> Is this a debugging leftover or do you really mean to log something
> here?  If the latter, it should probably be Debug rather than Warning
> and the message should be improved.  The same applies to the other LOGs
> below.
Urgh, that was a left-over - removed!
>
>> +		  debayer0_ = is_aligned ? &DebayerCpu::debayer8_BGBG_XBGR8888 : &DebayerCpu::debayer8_BGBG_BGR888;
> If the two methods were unified and extended with an isAligned argument,
> I'm not sure what would be the best way to pass the argument to them.
> A lambda, a template parameter, or anything else?
>
>> +		  debayer1_ = is_aligned ? &DebayerCpu::debayer8_GRGR_XBGR8888 : &DebayerCpu::debayer8_GRGR_BGR888;
>>   			break;
>>   		case 10:
>> -			debayer0_ = &DebayerCpu::debayer10_BGBG_BGR888;
>> -			debayer1_ = &DebayerCpu::debayer10_GRGR_BGR888;
>> +		  LOG(Debayer, Warning) << "10bit no packing";
>> +			debayer0_ = is_aligned ? &DebayerCpu::debayer10_BGBG_XBGR8888 : &DebayerCpu::debayer10_BGBG_BGR888;
>> +			debayer1_ = is_aligned ? &DebayerCpu::debayer10_GRGR_XBGR8888 : &DebayerCpu::debayer10_GRGR_BGR888;
>>   			break;
>>   		case 12:
>> -			debayer0_ = &DebayerCpu::debayer12_BGBG_BGR888;
>> -			debayer1_ = &DebayerCpu::debayer12_GRGR_BGR888;
>> +		  LOG(Debayer, Warning) << "12bit no packing";
>> +			debayer0_ = is_aligned ? &DebayerCpu::debayer12_BGBG_XBGR8888 : &DebayerCpu::debayer12_BGBG_BGR888;
>> +			debayer1_ = is_aligned ? &DebayerCpu::debayer12_GRGR_XBGR8888 : &DebayerCpu::debayer12_GRGR_BGR888;
>>   			break;
>>   		}
>>   		setupStandardBayerOrder(bayerFormat.order);
>> @@ -401,22 +610,23 @@ int DebayerCpu::setDebayerFunctions(PixelFormat inputFormat, PixelFormat outputF
>>   
>>   	if (bayerFormat.bitDepth == 10 &&
>>   	    bayerFormat.packing == BayerFormat::Packing::CSI2) {
>> +	  LOG(Debayer, Warning) << "10bit csi2";
>>   		switch (bayerFormat.order) {
>>   		case BayerFormat::BGGR:
>> -			debayer0_ = &DebayerCpu::debayer10P_BGBG_BGR888;
>> -			debayer1_ = &DebayerCpu::debayer10P_GRGR_BGR888;
>> +			debayer0_ = is_aligned ? &DebayerCpu::debayer10P_BGBG_XBGR8888 : &DebayerCpu::debayer10P_BGBG_BGR888;
>> +			debayer1_ = is_aligned ? &DebayerCpu::debayer10P_GRGR_XBGR8888 : &DebayerCpu::debayer10P_GRGR_BGR888;
>>   			return 0;
>>   		case BayerFormat::GBRG:
>> -			debayer0_ = &DebayerCpu::debayer10P_GBGB_BGR888;
>> -			debayer1_ = &DebayerCpu::debayer10P_RGRG_BGR888;
>> +			debayer0_ = is_aligned ? &DebayerCpu::debayer10P_GBGB_XBGR8888 : &DebayerCpu::debayer10P_GBGB_BGR888;
>> +			debayer1_ = is_aligned ? &DebayerCpu::debayer10P_RGRG_XBGR8888 : &DebayerCpu::debayer10P_RGRG_BGR888;
>>   			return 0;
>>   		case BayerFormat::GRBG:
>> -			debayer0_ = &DebayerCpu::debayer10P_GRGR_BGR888;
>> -			debayer1_ = &DebayerCpu::debayer10P_BGBG_BGR888;
>> +			debayer0_ = is_aligned ? &DebayerCpu::debayer10P_GRGR_XBGR8888 : &DebayerCpu::debayer10P_GRGR_BGR888;
>> +			debayer1_ = is_aligned ? &DebayerCpu::debayer10P_BGBG_XBGR8888 : &DebayerCpu::debayer10P_BGBG_BGR888;
>>   			return 0;
>>   		case BayerFormat::RGGB:
>> -			debayer0_ = &DebayerCpu::debayer10P_RGRG_BGR888;
>> -			debayer1_ = &DebayerCpu::debayer10P_GBGB_BGR888;
>> +			debayer0_ = is_aligned ? &DebayerCpu::debayer10P_RGRG_XBGR8888 : &DebayerCpu::debayer10P_RGRG_BGR888;
>> +			debayer1_ = is_aligned ? &DebayerCpu::debayer10P_GBGB_XBGR8888 : &DebayerCpu::debayer10P_GBGB_BGR888;
>>   			return 0;
>>   		default:
>>   			break;
>> @@ -533,6 +743,8 @@ DebayerCpu::strideAndFrameSize(const PixelFormat &outputFormat, const Size &size
>>   	/* round up to multiple of 8 for 64 bits alignment */
>>   	unsigned int stride = (size.width * config.bpp / 8 + 7) & ~7;
>>   
>> +	LOG(Debayer, Warning) << outputFormat.toString() << " " << size.width << " " << size.height << " " << config.bpp << " " << stride << " " << stride * size.height;
>> +
>>   	return std::make_tuple(stride, stride * size.height);
>>   }
>>   
>> diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h
>> index be7dcdca..c30f44aa 100644
>> --- a/src/libcamera/software_isp/debayer_cpu.h
>> +++ b/src/libcamera/software_isp/debayer_cpu.h
>> @@ -86,18 +86,28 @@ private:
>>   
>>   	/* 8-bit raw bayer format */
>>   	void debayer8_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]);
>> +	void debayer8_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[]);
>>   	void debayer8_GRGR_BGR888(uint8_t *dst, const uint8_t *src[]);
>> +	void debayer8_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[]);
>>   	/* unpacked 10-bit raw bayer format */
>>   	void debayer10_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]);
>> +	void debayer10_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[]);
>>   	void debayer10_GRGR_BGR888(uint8_t *dst, const uint8_t *src[]);
>> +	void debayer10_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[]);
>>   	/* unpacked 12-bit raw bayer format */
>>   	void debayer12_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]);
>> +	void debayer12_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[]);
>>   	void debayer12_GRGR_BGR888(uint8_t *dst, const uint8_t *src[]);
>> +	void debayer12_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[]);
>>   	/* CSI-2 packed 10-bit raw bayer format (all the 4 orders) */
>>   	void debayer10P_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]);
>> +	void debayer10P_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[]);
>>   	void debayer10P_GRGR_BGR888(uint8_t *dst, const uint8_t *src[]);
>> +	void debayer10P_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[]);
>>   	void debayer10P_GBGB_BGR888(uint8_t *dst, const uint8_t *src[]);
>> +	void debayer10P_GBGB_XBGR8888(uint8_t *dst, const uint8_t *src[]);
>>   	void debayer10P_RGRG_BGR888(uint8_t *dst, const uint8_t *src[]);
>> +	void debayer10P_RGRG_XBGR8888(uint8_t *dst, const uint8_t *src[]);
>>   
>>   	struct DebayerInputConfig {
>>   		Size patternSize;

Patch
diff mbox series

diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp
index c038eed4..73c66a88 100644
--- a/src/libcamera/software_isp/debayer_cpu.cpp
+++ b/src/libcamera/software_isp/debayer_cpu.cpp
@@ -76,6 +76,13 @@  DebayerCpu::~DebayerCpu()
 	*dst++ = red_[(prev[x - p] + prev[x + n] + next[x - p] + next[x + n]) / (4 * (div))]; \
 	x++;
 
+#define BGGR_XBGR8888(p, n, div)                                                              \
+	*dst++ = blue_[curr[x] / (div)];                                                      \
+	*dst++ = green_[(prev[x] + curr[x - p] + curr[x + n] + next[x]) / (4 * (div))];       \
+	*dst++ = red_[(prev[x - p] + prev[x + n] + next[x - p] + next[x + n]) / (4 * (div))]; \
+	*dst++ = 255;                                                                         \
+	x++;
+
 /*
  * GBG
  * RGR
@@ -87,6 +94,13 @@  DebayerCpu::~DebayerCpu()
 	*dst++ = red_[(curr[x - p] + curr[x + n]) / (2 * (div))]; \
 	x++;
 
+#define GRBG_XBGR8888(p, n, div)                                  \
+	*dst++ = blue_[(prev[x] + next[x]) / (2 * (div))];        \
+	*dst++ = green_[curr[x] / (div)];                         \
+	*dst++ = red_[(curr[x - p] + curr[x + n]) / (2 * (div))]; \
+	*dst++ = 255;                                             \
+	x++;
+
 /*
  * GRG
  * BGB
@@ -98,6 +112,13 @@  DebayerCpu::~DebayerCpu()
 	*dst++ = red_[(prev[x] + next[x]) / (2 * (div))];          \
 	x++;
 
+#define GBRG_XBGR8888(p, n, div)                                   \
+	*dst++ = blue_[(curr[x - p] + curr[x + n]) / (2 * (div))]; \
+	*dst++ = green_[curr[x] / (div)];                          \
+	*dst++ = red_[(prev[x] + next[x]) / (2 * (div))];          \
+	*dst++ = 255;                                              \
+	x++;
+
 /*
  * BGB
  * GRG
@@ -109,6 +130,13 @@  DebayerCpu::~DebayerCpu()
 	*dst++ = red_[curr[x] / (div)];                                                        \
 	x++;
 
+#define RGGB_XBGR8888(p, n, div)                                                               \
+	*dst++ = blue_[(prev[x - p] + prev[x + n] + next[x - p] + next[x + n]) / (4 * (div))]; \
+	*dst++ = green_[(prev[x] + curr[x - p] + curr[x + n] + next[x]) / (4 * (div))];        \
+	*dst++ = red_[curr[x] / (div)];                                                        \
+	*dst++ = 255;                                                                          \
+	x++;
+
 void DebayerCpu::debayer8_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
 {
 	DECLARE_SRC_POINTERS(uint8_t)
@@ -119,6 +147,16 @@  void DebayerCpu::debayer8_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
 	}
 }
 
+void DebayerCpu::debayer8_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[])
+{
+	DECLARE_SRC_POINTERS(uint8_t)
+
+	for (int x = 0; x < (int)window_.width;) {
+		BGGR_XBGR8888(1, 1, 1)
+		GBRG_XBGR8888(1, 1, 1)
+	}
+}
+
 void DebayerCpu::debayer8_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
 {
 	DECLARE_SRC_POINTERS(uint8_t)
@@ -129,6 +167,16 @@  void DebayerCpu::debayer8_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
 	}
 }
 
+void DebayerCpu::debayer8_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[])
+{
+	DECLARE_SRC_POINTERS(uint8_t)
+
+	for (int x = 0; x < (int)window_.width;) {
+		GRBG_XBGR8888(1, 1, 1)
+		RGGB_XBGR8888(1, 1, 1)
+	}
+}
+
 void DebayerCpu::debayer10_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
 {
 	DECLARE_SRC_POINTERS(uint16_t)
@@ -140,6 +188,17 @@  void DebayerCpu::debayer10_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
 	}
 }
 
+void DebayerCpu::debayer10_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[])
+{
+	DECLARE_SRC_POINTERS(uint16_t)
+
+	for (int x = 0; x < (int)window_.width;) {
+		/* divide values by 4 for 10 -> 8 bpp value */
+		BGGR_XBGR8888(1, 1, 4)
+		GBRG_XBGR8888(1, 1, 4)
+	}
+}
+
 void DebayerCpu::debayer10_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
 {
 	DECLARE_SRC_POINTERS(uint16_t)
@@ -151,6 +210,17 @@  void DebayerCpu::debayer10_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
 	}
 }
 
+void DebayerCpu::debayer10_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[])
+{
+	DECLARE_SRC_POINTERS(uint16_t)
+
+	for (int x = 0; x < (int)window_.width;) {
+		/* divide values by 4 for 10 -> 8 bpp value */
+		GRBG_XBGR8888(1, 1, 4)
+		RGGB_XBGR8888(1, 1, 4)
+	}
+}
+
 void DebayerCpu::debayer12_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
 {
 	DECLARE_SRC_POINTERS(uint16_t)
@@ -162,6 +232,17 @@  void DebayerCpu::debayer12_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
 	}
 }
 
+void DebayerCpu::debayer12_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[])
+{
+	DECLARE_SRC_POINTERS(uint16_t)
+
+	for (int x = 0; x < (int)window_.width;) {
+		/* divide values by 16 for 12 -> 8 bpp value */
+		BGGR_XBGR8888(1, 1, 16)
+		GBRG_XBGR8888(1, 1, 16)
+	}
+}
+
 void DebayerCpu::debayer12_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
 {
 	DECLARE_SRC_POINTERS(uint16_t)
@@ -173,6 +254,17 @@  void DebayerCpu::debayer12_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
 	}
 }
 
+void DebayerCpu::debayer12_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[])
+{
+	DECLARE_SRC_POINTERS(uint16_t)
+
+	for (int x = 0; x < (int)window_.width;) {
+		/* divide values by 16 for 12 -> 8 bpp value */
+		GRBG_XBGR8888(1, 1, 16)
+		RGGB_XBGR8888(1, 1, 16)
+	}
+}
+
 void DebayerCpu::debayer10P_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
 {
 	const int widthInBytes = window_.width * 5 / 4;
@@ -198,6 +290,31 @@  void DebayerCpu::debayer10P_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
 	}
 }
 
+void DebayerCpu::debayer10P_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[])
+{
+	const int widthInBytes = window_.width * 5 / 4;
+	const uint8_t *prev = src[0];
+	const uint8_t *curr = src[1];
+	const uint8_t *next = src[2];
+
+	/*
+	 * For the first pixel getting a pixel from the previous column uses
+	 * x - 2 to skip the 5th byte with least-significant bits for 4 pixels.
+	 * Same for last pixel (uses x + 2) and looking at the next column.
+	 */
+	for (int x = 0; x < widthInBytes;) {
+		/* First pixel */
+		BGGR_XBGR8888(2, 1, 1)
+		/* Second pixel BGGR -> GBRG */
+		GBRG_XBGR8888(1, 1, 1)
+		/* Same thing for third and fourth pixels */
+		BGGR_XBGR8888(1, 1, 1)
+		GBRG_XBGR8888(1, 2, 1)
+		/* Skip 5th src byte with 4 x 2 least-significant-bits */
+		x++;
+	}
+}
+
 void DebayerCpu::debayer10P_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
 {
 	const int widthInBytes = window_.width * 5 / 4;
@@ -218,6 +335,26 @@  void DebayerCpu::debayer10P_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
 	}
 }
 
+void DebayerCpu::debayer10P_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[])
+{
+	const int widthInBytes = window_.width * 5 / 4;
+	const uint8_t *prev = src[0];
+	const uint8_t *curr = src[1];
+	const uint8_t *next = src[2];
+
+	for (int x = 0; x < widthInBytes;) {
+		/* First pixel */
+		GRBG_XBGR8888(2, 1, 1)
+		/* Second pixel GRBG -> RGGB */
+		RGGB_XBGR8888(1, 1, 1)
+		/* Same thing for third and fourth pixels */
+		GRBG_XBGR8888(1, 1, 1)
+		RGGB_XBGR8888(1, 2, 1)
+		/* Skip 5th src byte with 4 x 2 least-significant-bits */
+		x++;
+	}
+}
+
 void DebayerCpu::debayer10P_GBGB_BGR888(uint8_t *dst, const uint8_t *src[])
 {
 	const int widthInBytes = window_.width * 5 / 4;
@@ -238,6 +375,26 @@  void DebayerCpu::debayer10P_GBGB_BGR888(uint8_t *dst, const uint8_t *src[])
 	}
 }
 
+void DebayerCpu::debayer10P_GBGB_XBGR8888(uint8_t *dst, const uint8_t *src[])
+{
+	const int widthInBytes = window_.width * 5 / 4;
+	const uint8_t *prev = src[0];
+	const uint8_t *curr = src[1];
+	const uint8_t *next = src[2];
+
+	for (int x = 0; x < widthInBytes;) {
+		/* Even pixel */
+		GBRG_XBGR8888(2, 1, 1)
+		/* Odd pixel GBRG -> BGGR */
+		BGGR_XBGR8888(1, 1, 1)
+		/* Same thing for next 2 pixels */
+		GBRG_XBGR8888(1, 1, 1)
+		BGGR_XBGR8888(1, 2, 1)
+		/* Skip 5th src byte with 4 x 2 least-significant-bits */
+		x++;
+	}
+}
+
 void DebayerCpu::debayer10P_RGRG_BGR888(uint8_t *dst, const uint8_t *src[])
 {
 	const int widthInBytes = window_.width * 5 / 4;
@@ -258,6 +415,26 @@  void DebayerCpu::debayer10P_RGRG_BGR888(uint8_t *dst, const uint8_t *src[])
 	}
 }
 
+void DebayerCpu::debayer10P_RGRG_XBGR8888(uint8_t *dst, const uint8_t *src[])
+{
+	const int widthInBytes = window_.width * 5 / 4;
+	const uint8_t *prev = src[0];
+	const uint8_t *curr = src[1];
+	const uint8_t *next = src[2];
+
+	for (int x = 0; x < widthInBytes;) {
+		/* Even pixel */
+		RGGB_XBGR8888(2, 1, 1)
+		/* Odd pixel RGGB -> GRBG */
+		GRBG_XBGR8888(1, 1, 1)
+		/* Same thing for next 2 pixels */
+		RGGB_XBGR8888(1, 1, 1)
+		GRBG_XBGR8888(1, 2, 1)
+		/* Skip 5th src byte with 4 x 2 least-significant-bits */
+		x++;
+	}
+}
+
 static bool isStandardBayerOrder(BayerFormat::Order order)
 {
 	return order == BayerFormat::BGGR || order == BayerFormat::GBRG ||
@@ -280,7 +457,14 @@  int DebayerCpu::getInputConfig(PixelFormat inputFormat, DebayerInputConfig &conf
 		config.bpp = (bayerFormat.bitDepth + 7) & ~7;
 		config.patternSize.width = 2;
 		config.patternSize.height = 2;
-		config.outputFormats = std::vector<PixelFormat>({ formats::RGB888, formats::BGR888 });
+		config.outputFormats = std::vector<PixelFormat>({
+			formats::RGB888,
+			formats::XRGB8888,
+			formats::ARGB8888,
+			formats::BGR888,
+			formats::XBGR8888,
+			formats::ABGR8888
+		});
 		return 0;
 	}
 
@@ -290,7 +474,14 @@  int DebayerCpu::getInputConfig(PixelFormat inputFormat, DebayerInputConfig &conf
 		config.bpp = 10;
 		config.patternSize.width = 4; /* 5 bytes per *4* pixels */
 		config.patternSize.height = 2;
-		config.outputFormats = std::vector<PixelFormat>({ formats::RGB888, formats::BGR888 });
+		config.outputFormats = std::vector<PixelFormat>({
+			formats::RGB888,
+			formats::XRGB8888,
+			formats::ARGB8888,
+			formats::BGR888,
+			formats::XBGR8888,
+			formats::ABGR8888
+		});
 		return 0;
 	}
 
@@ -306,6 +497,12 @@  int DebayerCpu::getOutputConfig(PixelFormat outputFormat, DebayerOutputConfig &c
 		return 0;
 	}
 
+	if (outputFormat == formats::XRGB8888 || outputFormat == formats::ARGB8888 ||
+	    outputFormat == formats::XBGR8888 || outputFormat == formats::ABGR8888) {
+		config.bpp = 32;
+		return 0;
+	}
+
 	LOG(Debayer, Info)
 		<< "Unsupported output format " << outputFormat.toString();
 	return -EINVAL;
@@ -341,6 +538,7 @@  int DebayerCpu::setDebayerFunctions(PixelFormat inputFormat, PixelFormat outputF
 {
 	BayerFormat bayerFormat =
 		BayerFormat::fromPixelFormat(inputFormat);
+	bool is_aligned = false;
 
 	xShift_ = 0;
 	swapRedBlueGains_ = false;
@@ -351,8 +549,16 @@  int DebayerCpu::setDebayerFunctions(PixelFormat inputFormat, PixelFormat outputF
 	};
 
 	switch (outputFormat) {
+	case formats::XRGB8888:
+	case formats::ARGB8888:
+	  is_aligned = true;
+	  [[fallthrough]];
 	case formats::RGB888:
 		break;
+	case formats::XBGR8888:
+	case formats::ABGR8888:
+	  is_aligned = true;
+	  [[fallthrough]];
 	case formats::BGR888:
 		/* Swap R and B in bayer order to generate BGR888 instead of RGB888 */
 		swapRedBlueGains_ = true;
@@ -383,16 +589,19 @@  int DebayerCpu::setDebayerFunctions(PixelFormat inputFormat, PixelFormat outputF
 	    isStandardBayerOrder(bayerFormat.order)) {
 		switch (bayerFormat.bitDepth) {
 		case 8:
-			debayer0_ = &DebayerCpu::debayer8_BGBG_BGR888;
-			debayer1_ = &DebayerCpu::debayer8_GRGR_BGR888;
+		  LOG(Debayer, Warning) << "8bit no packing";
+		  debayer0_ = is_aligned ? &DebayerCpu::debayer8_BGBG_XBGR8888 : &DebayerCpu::debayer8_BGBG_BGR888;
+		  debayer1_ = is_aligned ? &DebayerCpu::debayer8_GRGR_XBGR8888 : &DebayerCpu::debayer8_GRGR_BGR888;
 			break;
 		case 10:
-			debayer0_ = &DebayerCpu::debayer10_BGBG_BGR888;
-			debayer1_ = &DebayerCpu::debayer10_GRGR_BGR888;
+		  LOG(Debayer, Warning) << "10bit no packing";
+			debayer0_ = is_aligned ? &DebayerCpu::debayer10_BGBG_XBGR8888 : &DebayerCpu::debayer10_BGBG_BGR888;
+			debayer1_ = is_aligned ? &DebayerCpu::debayer10_GRGR_XBGR8888 : &DebayerCpu::debayer10_GRGR_BGR888;
 			break;
 		case 12:
-			debayer0_ = &DebayerCpu::debayer12_BGBG_BGR888;
-			debayer1_ = &DebayerCpu::debayer12_GRGR_BGR888;
+		  LOG(Debayer, Warning) << "12bit no packing";
+			debayer0_ = is_aligned ? &DebayerCpu::debayer12_BGBG_XBGR8888 : &DebayerCpu::debayer12_BGBG_BGR888;
+			debayer1_ = is_aligned ? &DebayerCpu::debayer12_GRGR_XBGR8888 : &DebayerCpu::debayer12_GRGR_BGR888;
 			break;
 		}
 		setupStandardBayerOrder(bayerFormat.order);
@@ -401,22 +610,23 @@  int DebayerCpu::setDebayerFunctions(PixelFormat inputFormat, PixelFormat outputF
 
 	if (bayerFormat.bitDepth == 10 &&
 	    bayerFormat.packing == BayerFormat::Packing::CSI2) {
+	  LOG(Debayer, Warning) << "10bit csi2";
 		switch (bayerFormat.order) {
 		case BayerFormat::BGGR:
-			debayer0_ = &DebayerCpu::debayer10P_BGBG_BGR888;
-			debayer1_ = &DebayerCpu::debayer10P_GRGR_BGR888;
+			debayer0_ = is_aligned ? &DebayerCpu::debayer10P_BGBG_XBGR8888 : &DebayerCpu::debayer10P_BGBG_BGR888;
+			debayer1_ = is_aligned ? &DebayerCpu::debayer10P_GRGR_XBGR8888 : &DebayerCpu::debayer10P_GRGR_BGR888;
 			return 0;
 		case BayerFormat::GBRG:
-			debayer0_ = &DebayerCpu::debayer10P_GBGB_BGR888;
-			debayer1_ = &DebayerCpu::debayer10P_RGRG_BGR888;
+			debayer0_ = is_aligned ? &DebayerCpu::debayer10P_GBGB_XBGR8888 : &DebayerCpu::debayer10P_GBGB_BGR888;
+			debayer1_ = is_aligned ? &DebayerCpu::debayer10P_RGRG_XBGR8888 : &DebayerCpu::debayer10P_RGRG_BGR888;
 			return 0;
 		case BayerFormat::GRBG:
-			debayer0_ = &DebayerCpu::debayer10P_GRGR_BGR888;
-			debayer1_ = &DebayerCpu::debayer10P_BGBG_BGR888;
+			debayer0_ = is_aligned ? &DebayerCpu::debayer10P_GRGR_XBGR8888 : &DebayerCpu::debayer10P_GRGR_BGR888;
+			debayer1_ = is_aligned ? &DebayerCpu::debayer10P_BGBG_XBGR8888 : &DebayerCpu::debayer10P_BGBG_BGR888;
 			return 0;
 		case BayerFormat::RGGB:
-			debayer0_ = &DebayerCpu::debayer10P_RGRG_BGR888;
-			debayer1_ = &DebayerCpu::debayer10P_GBGB_BGR888;
+			debayer0_ = is_aligned ? &DebayerCpu::debayer10P_RGRG_XBGR8888 : &DebayerCpu::debayer10P_RGRG_BGR888;
+			debayer1_ = is_aligned ? &DebayerCpu::debayer10P_GBGB_XBGR8888 : &DebayerCpu::debayer10P_GBGB_BGR888;
 			return 0;
 		default:
 			break;
@@ -533,6 +743,8 @@  DebayerCpu::strideAndFrameSize(const PixelFormat &outputFormat, const Size &size
 	/* round up to multiple of 8 for 64 bits alignment */
 	unsigned int stride = (size.width * config.bpp / 8 + 7) & ~7;
 
+	LOG(Debayer, Warning) << outputFormat.toString() << " " << size.width << " " << size.height << " " << config.bpp << " " << stride << " " << stride * size.height;
+
 	return std::make_tuple(stride, stride * size.height);
 }
 
diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h
index be7dcdca..c30f44aa 100644
--- a/src/libcamera/software_isp/debayer_cpu.h
+++ b/src/libcamera/software_isp/debayer_cpu.h
@@ -86,18 +86,28 @@  private:
 
 	/* 8-bit raw bayer format */
 	void debayer8_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]);
+	void debayer8_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[]);
 	void debayer8_GRGR_BGR888(uint8_t *dst, const uint8_t *src[]);
+	void debayer8_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[]);
 	/* unpacked 10-bit raw bayer format */
 	void debayer10_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]);
+	void debayer10_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[]);
 	void debayer10_GRGR_BGR888(uint8_t *dst, const uint8_t *src[]);
+	void debayer10_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[]);
 	/* unpacked 12-bit raw bayer format */
 	void debayer12_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]);
+	void debayer12_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[]);
 	void debayer12_GRGR_BGR888(uint8_t *dst, const uint8_t *src[]);
+	void debayer12_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[]);
 	/* CSI-2 packed 10-bit raw bayer format (all the 4 orders) */
 	void debayer10P_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]);
+	void debayer10P_BGBG_XBGR8888(uint8_t *dst, const uint8_t *src[]);
 	void debayer10P_GRGR_BGR888(uint8_t *dst, const uint8_t *src[]);
+	void debayer10P_GRGR_XBGR8888(uint8_t *dst, const uint8_t *src[]);
 	void debayer10P_GBGB_BGR888(uint8_t *dst, const uint8_t *src[]);
+	void debayer10P_GBGB_XBGR8888(uint8_t *dst, const uint8_t *src[]);
 	void debayer10P_RGRG_BGR888(uint8_t *dst, const uint8_t *src[]);
+	void debayer10P_RGRG_XBGR8888(uint8_t *dst, const uint8_t *src[]);
 
 	struct DebayerInputConfig {
 		Size patternSize;