[v6,08/18] libcamera: software_isp: Add DebayerCpu class
diff mbox series

Message ID 20240319123622.675599-9-mzamazal@redhat.com
State Accepted
Headers show
Series
  • libcamera: introduce Software ISP and Software IPA
Related show

Commit Message

Milan Zamazal March 19, 2024, 12:35 p.m. UTC
From: Hans de Goede <hdegoede@redhat.com>

Add CPU based debayering implementation. This initial implementation
only supports debayering packed 10 bits per pixel bayer data in
the 4 standard bayer orders.

Doxygen documentation by Dennis Bonke.

Tested-by: Bryan O'Donoghue <bryan.odonoghue@linaro.org> # sc8280xp Lenovo x13s
Tested-by: Pavel Machek <pavel@ucw.cz>
Reviewed-by: Pavel Machek <pavel@ucw.cz>
Co-developed-by: Dennis Bonke <admin@dennisbonke.com>
Signed-off-by: Dennis Bonke <admin@dennisbonke.com>
Co-developed-by: Andrey Konovalov <andrey.konovalov@linaro.org>
Signed-off-by: Andrey Konovalov <andrey.konovalov@linaro.org>
Co-developed-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Milan Zamazal <mzamazal@redhat.com>
---
 src/libcamera/software_isp/debayer_cpu.cpp | 626 +++++++++++++++++++++
 src/libcamera/software_isp/debayer_cpu.h   | 143 +++++
 src/libcamera/software_isp/meson.build     |   1 +
 3 files changed, 770 insertions(+)
 create mode 100644 src/libcamera/software_isp/debayer_cpu.cpp
 create mode 100644 src/libcamera/software_isp/debayer_cpu.h

Comments

Laurent Pinchart March 27, 2024, 2:12 p.m. UTC | #1
Hi Milan and Hans,

Thank you for the patch.

On Tue, Mar 19, 2024 at 01:35:55PM +0100, Milan Zamazal wrote:
> From: Hans de Goede <hdegoede@redhat.com>
> 
> Add CPU based debayering implementation. This initial implementation
> only supports debayering packed 10 bits per pixel bayer data in
> the 4 standard bayer orders.
> 
> Doxygen documentation by Dennis Bonke.
> 
> Tested-by: Bryan O'Donoghue <bryan.odonoghue@linaro.org> # sc8280xp Lenovo x13s
> Tested-by: Pavel Machek <pavel@ucw.cz>
> Reviewed-by: Pavel Machek <pavel@ucw.cz>
> Co-developed-by: Dennis Bonke <admin@dennisbonke.com>
> Signed-off-by: Dennis Bonke <admin@dennisbonke.com>
> Co-developed-by: Andrey Konovalov <andrey.konovalov@linaro.org>
> Signed-off-by: Andrey Konovalov <andrey.konovalov@linaro.org>
> Co-developed-by: Pavel Machek <pavel@ucw.cz>
> Signed-off-by: Pavel Machek <pavel@ucw.cz>
> Signed-off-by: Hans de Goede <hdegoede@redhat.com>
> Reviewed-by: Milan Zamazal <mzamazal@redhat.com>
> ---
>  src/libcamera/software_isp/debayer_cpu.cpp | 626 +++++++++++++++++++++
>  src/libcamera/software_isp/debayer_cpu.h   | 143 +++++
>  src/libcamera/software_isp/meson.build     |   1 +
>  3 files changed, 770 insertions(+)
>  create mode 100644 src/libcamera/software_isp/debayer_cpu.cpp
>  create mode 100644 src/libcamera/software_isp/debayer_cpu.h
> 
> diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp
> new file mode 100644
> index 00000000..f932362c
> --- /dev/null
> +++ b/src/libcamera/software_isp/debayer_cpu.cpp
> @@ -0,0 +1,626 @@
> +/* SPDX-License-Identifier: LGPL-2.1-or-later */
> +/*
> + * Copyright (C) 2023, Linaro Ltd
> + * Copyright (C) 2023, Red Hat Inc.
> + *
> + * Authors:
> + * Hans de Goede <hdegoede@redhat.com>
> + *
> + * debayer_cpu.cpp - CPU based debayering class
> + */
> +
> +#include "debayer_cpu.h"
> +
> +#include <math.h>
> +#include <stdlib.h>
> +#include <time.h>
> +
> +#include <libcamera/formats.h>
> +
> +#include "libcamera/internal/bayer_format.h"
> +#include "libcamera/internal/framebuffer.h"
> +#include "libcamera/internal/mapped_framebuffer.h"
> +
> +namespace libcamera {
> +
> +/**
> + * \class DebayerCpu
> + * \brief Class for debayering on the CPU
> + *
> + * Implementation for CPU based debayering
> + */
> +
> +/**
> + * \brief Constructs a DebayerCpu object.
> + * \param[in] stats Pointer to the stats object to use.

No period at the end of \brief or \param.

> + */
> +DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats)
> +	: stats_(std::move(stats)), gamma_correction_(1.0)

gammaCorrection_

> +{
> +#ifdef __x86_64__
> +	enableInputMemcpy_ = false;
> +#else
> +	enableInputMemcpy_ = true;
> +#endif

This is very much *not* nice :-( It needs to at least be explained
clearly somewhere.

> +	/* Initialize gamma to 1.0 curve */
> +	for (unsigned int i = 0; i < kGammaLookupSize; i++)
> +		gamma_[i] = i / (kGammaLookupSize / kRGBLookupSize);
> +
> +	for (unsigned int i = 0; i < kMaxLineBuffers; i++)
> +		lineBuffers_[i] = nullptr;
> +}
> +
> +DebayerCpu::~DebayerCpu()
> +{
> +	for (unsigned int i = 0; i < kMaxLineBuffers; i++)
> +		free(lineBuffers_[i]);
> +}
> +
> +// RGR
> +// GBG
> +// RGR

C-style comments.

> +#define BGGR_BGR888(p, n, div)                                                                \
> +	*dst++ = blue_[curr[x] / (div)];                                                      \
> +	*dst++ = green_[(prev[x] + curr[x - p] + curr[x + n] + next[x]) / (4 * (div))];       \
> +	*dst++ = red_[(prev[x - p] + prev[x + n] + next[x - p] + next[x + n]) / (4 * (div))]; \
> +	x++;
> +
> +// GBG
> +// RGR
> +// GBG
> +#define GRBG_BGR888(p, n, div)                                    \
> +	*dst++ = blue_[(prev[x] + next[x]) / (2 * (div))];        \
> +	*dst++ = green_[curr[x] / (div)];                         \
> +	*dst++ = red_[(curr[x - p] + curr[x + n]) / (2 * (div))]; \
> +	x++;
> +
> +// GRG
> +// BGB
> +// GRG
> +#define GBRG_BGR888(p, n, div)                                     \
> +	*dst++ = blue_[(curr[x - p] + curr[x + n]) / (2 * (div))]; \
> +	*dst++ = green_[curr[x] / (div)];                          \
> +	*dst++ = red_[(prev[x] + next[x]) / (2 * (div))];          \
> +	x++;
> +
> +// BGB
> +// GRG
> +// BGB
> +#define RGGB_BGR888(p, n, div)                                                                 \
> +	*dst++ = blue_[(prev[x - p] + prev[x + n] + next[x - p] + next[x + n]) / (4 * (div))]; \
> +	*dst++ = green_[(prev[x] + curr[x - p] + curr[x + n] + next[x]) / (4 * (div))];        \
> +	*dst++ = red_[curr[x] / (div)];                                                        \
> +	x++;
> +
> +void DebayerCpu::debayer10P_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
> +{
> +	const int width_in_bytes = window_.width * 5 / 4;

snakeCase here too, ane where applicable everywhere else.

> +	const uint8_t *prev = (const uint8_t *)src[0];

Is the cast needed ? If so it should be a C++ cast.

> +	const uint8_t *curr = (const uint8_t *)src[1];
> +	const uint8_t *next = (const uint8_t *)src[2];
> +
> +	/*
> +	 * For the first pixel getting a pixel from the previous column uses
> +	 * x - 2 to skip the 5th byte with least-significant bits for 4 pixels.
> +	 * Same for last pixel (uses x + 2) and looking at the next column.
> +	 */
> +	for (int x = 0; x < width_in_bytes;) {
> +		/* First pixel */
> +		BGGR_BGR888(2, 1, 1)
> +		/* Second pixel BGGR -> GBRG */
> +		GBRG_BGR888(1, 1, 1)
> +		/* Same thing for third and fourth pixels */
> +		BGGR_BGR888(1, 1, 1)
> +		GBRG_BGR888(1, 2, 1)
> +		/* Skip 5th src byte with 4 x 2 least-significant-bits */
> +		x++;
> +	}
> +}
> +
> +void DebayerCpu::debayer10P_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
> +{
> +	const int width_in_bytes = window_.width * 5 / 4;
> +	const uint8_t *prev = (const uint8_t *)src[0];
> +	const uint8_t *curr = (const uint8_t *)src[1];
> +	const uint8_t *next = (const uint8_t *)src[2];
> +
> +	for (int x = 0; x < width_in_bytes;) {
> +		/* First pixel */
> +		GRBG_BGR888(2, 1, 1)
> +		/* Second pixel GRBG -> RGGB */
> +		RGGB_BGR888(1, 1, 1)
> +		/* Same thing for third and fourth pixels */
> +		GRBG_BGR888(1, 1, 1)
> +		RGGB_BGR888(1, 2, 1)
> +		/* Skip 5th src byte with 4 x 2 least-significant-bits */
> +		x++;
> +	}
> +}
> +
> +void DebayerCpu::debayer10P_GBGB_BGR888(uint8_t *dst, const uint8_t *src[])
> +{
> +	const int width_in_bytes = window_.width * 5 / 4;
> +	const uint8_t *prev = (const uint8_t *)src[0];
> +	const uint8_t *curr = (const uint8_t *)src[1];
> +	const uint8_t *next = (const uint8_t *)src[2];
> +
> +	for (int x = 0; x < width_in_bytes;) {
> +		/* Even pixel */
> +		GBRG_BGR888(2, 1, 1)
> +		/* Odd pixel GBGR -> BGGR */
> +		BGGR_BGR888(1, 1, 1)
> +		/* Same thing for next 2 pixels */
> +		GBRG_BGR888(1, 1, 1)
> +		BGGR_BGR888(1, 2, 1)
> +		/* Skip 5th src byte with 4 x 2 least-significant-bits */
> +		x++;
> +	}
> +}
> +
> +void DebayerCpu::debayer10P_RGRG_BGR888(uint8_t *dst, const uint8_t *src[])
> +{
> +	const int width_in_bytes = window_.width * 5 / 4;
> +	const uint8_t *prev = (const uint8_t *)src[0];
> +	const uint8_t *curr = (const uint8_t *)src[1];
> +	const uint8_t *next = (const uint8_t *)src[2];
> +
> +	for (int x = 0; x < width_in_bytes;) {
> +		/* Even pixel */
> +		RGGB_BGR888(2, 1, 1)
> +		/* Odd pixel RGGB -> GRBG */
> +		GRBG_BGR888(1, 1, 1)
> +		/* Same thing for next 2 pixels */
> +		RGGB_BGR888(1, 1, 1)
> +		GRBG_BGR888(1, 2, 1)
> +		/* Skip 5th src byte with 4 x 2 least-significant-bits */
> +		x++;
> +	}
> +}
> +
> +static bool isStandardBayerOrder(BayerFormat::Order order)
> +{
> +	return order == BayerFormat::BGGR || order == BayerFormat::GBRG ||
> +	       order == BayerFormat::GRBG || order == BayerFormat::RGGB;
> +}
> +
> +/*
> + * Setup the Debayer object according to the passed in parameters.
> + * Return 0 on success, a negative errno value on failure
> + * (unsupported parameters).
> + */
> +int DebayerCpu::getInputConfig(PixelFormat inputFormat, DebayerInputConfig &config)
> +{
> +	BayerFormat bayerFormat =
> +		BayerFormat::fromPixelFormat(inputFormat);
> +
> +	if (bayerFormat.bitDepth == 10 &&
> +	    bayerFormat.packing == BayerFormat::Packing::CSI2 &&
> +	    isStandardBayerOrder(bayerFormat.order)) {
> +		config.bpp = 10;
> +		config.patternSize.width = 4; /* 5 bytes per *4* pixels */
> +		config.patternSize.height = 2;
> +		config.outputFormats = std::vector<PixelFormat>({ formats::RGB888 });
> +		return 0;
> +	}
> +
> +	LOG(Debayer, Info)

Shouldn't this be an Error mesage ? Same below.

> +		<< "Unsupported input format " << inputFormat.toString();
> +	return -EINVAL;
> +}
> +
> +int DebayerCpu::getOutputConfig(PixelFormat outputFormat, DebayerOutputConfig &config)
> +{
> +	if (outputFormat == formats::RGB888) {
> +		config.bpp = 24;
> +		return 0;
> +	}
> +
> +	LOG(Debayer, Info)
> +		<< "Unsupported output format " << outputFormat.toString();
> +	return -EINVAL;
> +}
> +
> +/* TODO: this ignores outputFormat since there is only 1 supported outputFormat for now */
> +int DebayerCpu::setDebayerFunctions(PixelFormat inputFormat, [[maybe_unused]] PixelFormat outputFormat)
> +{
> +	BayerFormat bayerFormat =
> +		BayerFormat::fromPixelFormat(inputFormat);
> +
> +	if (bayerFormat.bitDepth == 10 &&
> +	    bayerFormat.packing == BayerFormat::Packing::CSI2) {
> +		switch (bayerFormat.order) {
> +		case BayerFormat::BGGR:
> +			debayer0_ = &DebayerCpu::debayer10P_BGBG_BGR888;
> +			debayer1_ = &DebayerCpu::debayer10P_GRGR_BGR888;
> +			return 0;
> +		case BayerFormat::GBRG:
> +			debayer0_ = &DebayerCpu::debayer10P_GBGB_BGR888;
> +			debayer1_ = &DebayerCpu::debayer10P_RGRG_BGR888;
> +			return 0;
> +		case BayerFormat::GRBG:
> +			debayer0_ = &DebayerCpu::debayer10P_GRGR_BGR888;
> +			debayer1_ = &DebayerCpu::debayer10P_BGBG_BGR888;
> +			return 0;
> +		case BayerFormat::RGGB:
> +			debayer0_ = &DebayerCpu::debayer10P_RGRG_BGR888;
> +			debayer1_ = &DebayerCpu::debayer10P_GBGB_BGR888;
> +			return 0;
> +		default:
> +			break;
> +		}
> +	}
> +
> +	LOG(Debayer, Error) << "Unsupported input output format combination";
> +	return -EINVAL;
> +}
> +
> +int DebayerCpu::configure(const StreamConfiguration &inputCfg,
> +			  const std::vector<std::reference_wrapper<StreamConfiguration>> &outputCfgs)
> +{
> +	if (getInputConfig(inputCfg.pixelFormat, inputConfig_) != 0)
> +		return -EINVAL;
> +
> +	if (stats_->configure(inputCfg) != 0)
> +		return -EINVAL;
> +
> +	const Size &stats_pattern_size = stats_->patternSize();
> +	if (inputConfig_.patternSize.width != stats_pattern_size.width ||
> +	    inputConfig_.patternSize.height != stats_pattern_size.height) {
> +		LOG(Debayer, Error)
> +			<< "mismatching stats and debayer pattern sizes for "
> +			<< inputCfg.pixelFormat.toString();
> +		return -EINVAL;
> +	}
> +
> +	inputConfig_.stride = inputCfg.stride;
> +
> +	if (outputCfgs.size() != 1) {
> +		LOG(Debayer, Error)
> +			<< "Unsupported number of output streams: "
> +			<< outputCfgs.size();
> +		return -EINVAL;
> +	}
> +
> +	const StreamConfiguration &outputCfg = outputCfgs[0];
> +	SizeRange outSizeRange = sizes(inputCfg.pixelFormat, inputCfg.size);
> +	std::tie(outputConfig_.stride, outputConfig_.frameSize) =
> +		strideAndFrameSize(outputCfg.pixelFormat, outputCfg.size);
> +
> +	if (!outSizeRange.contains(outputCfg.size) || outputConfig_.stride != outputCfg.stride) {
> +		LOG(Debayer, Error)
> +			<< "Invalid output size/stride: "
> +			<< "\n  " << outputCfg.size << " (" << outSizeRange << ")"
> +			<< "\n  " << outputCfg.stride << " (" << outputConfig_.stride << ")";
> +		return -EINVAL;
> +	}
> +
> +	if (setDebayerFunctions(inputCfg.pixelFormat, outputCfg.pixelFormat) != 0)
> +		return -EINVAL;
> +
> +	window_.x = ((inputCfg.size.width - outputCfg.size.width) / 2) &
> +		    ~(inputConfig_.patternSize.width - 1);
> +	window_.y = ((inputCfg.size.height - outputCfg.size.height) / 2) &
> +		    ~(inputConfig_.patternSize.height - 1);
> +	window_.width = outputCfg.size.width;
> +	window_.height = outputCfg.size.height;
> +
> +	/* Don't pass x,y since process() already adjusts src before passing it */
> +	stats_->setWindow(Rectangle(window_.size()));
> +
> +	/* pad with patternSize.Width on both left and right side */
> +	lineBufferPadding_ = inputConfig_.patternSize.width * inputConfig_.bpp / 8;
> +	lineBufferLength_ = window_.width * inputConfig_.bpp / 8 +
> +			    2 * lineBufferPadding_;
> +	for (unsigned int i = 0;
> +	     i < (inputConfig_.patternSize.height + 1) && enableInputMemcpy_;
> +	     i++) {
> +		free(lineBuffers_[i]);
> +		lineBuffers_[i] = (uint8_t *)malloc(lineBufferLength_);
> +		if (!lineBuffers_[i])
> +			return -ENOMEM;
> +	}
> +
> +	measuredFrames_ = 0;
> +	frameProcessTime_ = 0;
> +
> +	return 0;
> +}
> +
> +/*
> + * Get width and height at which the bayer-pattern repeats.
> + * Return pattern-size or an empty Size for an unsupported inputFormat.
> + */
> +Size DebayerCpu::patternSize(PixelFormat inputFormat)
> +{
> +	DebayerCpu::DebayerInputConfig config;
> +
> +	if (getInputConfig(inputFormat, config) != 0)
> +		return {};
> +
> +	return config.patternSize;
> +}
> +
> +std::vector<PixelFormat> DebayerCpu::formats(PixelFormat inputFormat)
> +{
> +	DebayerCpu::DebayerInputConfig config;
> +
> +	if (getInputConfig(inputFormat, config) != 0)
> +		return std::vector<PixelFormat>();
> +
> +	return config.outputFormats;
> +}
> +
> +std::tuple<unsigned int, unsigned int>
> +DebayerCpu::strideAndFrameSize(const PixelFormat &outputFormat, const Size &size)
> +{
> +	DebayerCpu::DebayerOutputConfig config;
> +
> +	if (getOutputConfig(outputFormat, config) != 0)
> +		return std::make_tuple(0, 0);
> +
> +	/* round up to multiple of 8 for 64 bits alignment */
> +	unsigned int stride = (size.width * config.bpp / 8 + 7) & ~7;
> +
> +	return std::make_tuple(stride, stride * size.height);
> +}
> +
> +void DebayerCpu::setupInputMemcpy(const uint8_t *linePointers[])
> +{
> +	const unsigned int patternHeight = inputConfig_.patternSize.height;
> +
> +	if (!enableInputMemcpy_)
> +		return;
> +
> +	for (unsigned int i = 0; i < patternHeight; i++) {
> +		memcpy(lineBuffers_[i], linePointers[i + 1] - lineBufferPadding_,
> +		       lineBufferLength_);
> +		linePointers[i + 1] = lineBuffers_[i] + lineBufferPadding_;
> +	}
> +
> +	/* Point lineBufferIndex_ to first unused lineBuffer */
> +	lineBufferIndex_ = patternHeight;
> +}
> +
> +void DebayerCpu::shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src)
> +{
> +	const unsigned int patternHeight = inputConfig_.patternSize.height;
> +
> +	for (unsigned int i = 0; i < patternHeight; i++)
> +		linePointers[i] = linePointers[i + 1];
> +
> +	linePointers[patternHeight] = src +
> +				      (patternHeight / 2) * (int)inputConfig_.stride;
> +}
> +
> +void DebayerCpu::memcpyNextLine(const uint8_t *linePointers[])
> +{
> +	const unsigned int patternHeight = inputConfig_.patternSize.height;
> +
> +	if (!enableInputMemcpy_)
> +		return;
> +
> +	memcpy(lineBuffers_[lineBufferIndex_], linePointers[patternHeight] - lineBufferPadding_,
> +	       lineBufferLength_);
> +	linePointers[patternHeight] = lineBuffers_[lineBufferIndex_] + lineBufferPadding_;
> +
> +	lineBufferIndex_ = (lineBufferIndex_ + 1) % (patternHeight + 1);
> +}
> +
> +void DebayerCpu::process2(const uint8_t *src, uint8_t *dst)
> +{
> +	unsigned int y_end = window_.y + window_.height;
> +	/* Holds [0] previous- [1] current- [2] next-line */
> +	const uint8_t *linePointers[3];
> +
> +	/* Adjust src to top left corner of the window */
> +	src += window_.y * inputConfig_.stride + window_.x * inputConfig_.bpp / 8;
> +
> +	/* [x] becomes [x - 1] after initial shiftLinePointers() call */
> +	if (window_.y) {
> +		linePointers[1] = src - inputConfig_.stride; /* previous-line */
> +		linePointers[2] = src;
> +	} else {
> +		/* window_.y == 0, use the next line as prev line */
> +		linePointers[1] = src + inputConfig_.stride;
> +		linePointers[2] = src;
> +		/* Last 2 lines also need special handling */
> +		y_end -= 2;
> +	}
> +
> +	setupInputMemcpy(linePointers);
> +
> +	for (unsigned int y = window_.y; y < y_end; y += 2) {
> +		shiftLinePointers(linePointers, src);
> +		memcpyNextLine(linePointers);
> +		stats_->processLine0(y, linePointers);
> +		(this->*debayer0_)(dst, linePointers);
> +		src += inputConfig_.stride;
> +		dst += outputConfig_.stride;
> +
> +		shiftLinePointers(linePointers, src);
> +		memcpyNextLine(linePointers);
> +		(this->*debayer1_)(dst, linePointers);
> +		src += inputConfig_.stride;
> +		dst += outputConfig_.stride;
> +	}
> +
> +	if (window_.y == 0) {
> +		shiftLinePointers(linePointers, src);
> +		memcpyNextLine(linePointers);
> +		stats_->processLine0(y_end, linePointers);
> +		(this->*debayer0_)(dst, linePointers);
> +		src += inputConfig_.stride;
> +		dst += outputConfig_.stride;
> +
> +		shiftLinePointers(linePointers, src);
> +		/* next line may point outside of src, use prev. */
> +		linePointers[2] = linePointers[0];
> +		(this->*debayer1_)(dst, linePointers);
> +		src += inputConfig_.stride;
> +		dst += outputConfig_.stride;
> +	}
> +}
> +
> +void DebayerCpu::process4(const uint8_t *src, uint8_t *dst)
> +{
> +	const unsigned int y_end = window_.y + window_.height;
> +	/*
> +	 * This holds pointers to [0] 2-lines-up [1] 1-line-up [2] current-line
> +	 * [3] 1-line-down [4] 2-lines-down.
> +	 */
> +	const uint8_t *linePointers[5];
> +
> +	/* Adjust src to top left corner of the window */
> +	src += window_.y * inputConfig_.stride + window_.x * inputConfig_.bpp / 8;
> +
> +	/* [x] becomes [x - 1] after initial shiftLinePointers() call */
> +	linePointers[1] = src - 2 * inputConfig_.stride;
> +	linePointers[2] = src - inputConfig_.stride;
> +	linePointers[3] = src;
> +	linePointers[4] = src + inputConfig_.stride;
> +
> +	setupInputMemcpy(linePointers);
> +
> +	for (unsigned int y = window_.y; y < y_end; y += 4) {
> +		shiftLinePointers(linePointers, src);
> +		memcpyNextLine(linePointers);
> +		stats_->processLine0(y, linePointers);
> +		(this->*debayer0_)(dst, linePointers);
> +		src += inputConfig_.stride;
> +		dst += outputConfig_.stride;
> +
> +		shiftLinePointers(linePointers, src);
> +		memcpyNextLine(linePointers);
> +		(this->*debayer1_)(dst, linePointers);
> +		src += inputConfig_.stride;
> +		dst += outputConfig_.stride;
> +
> +		shiftLinePointers(linePointers, src);
> +		memcpyNextLine(linePointers);
> +		stats_->processLine2(y, linePointers);
> +		(this->*debayer2_)(dst, linePointers);
> +		src += inputConfig_.stride;
> +		dst += outputConfig_.stride;
> +
> +		shiftLinePointers(linePointers, src);
> +		memcpyNextLine(linePointers);
> +		(this->*debayer3_)(dst, linePointers);
> +		src += inputConfig_.stride;
> +		dst += outputConfig_.stride;
> +	}
> +}
> +
> +static inline int64_t timeDiff(timespec &after, timespec &before)
> +{
> +	return (after.tv_sec - before.tv_sec) * 1000000000LL +
> +	       (int64_t)after.tv_nsec - (int64_t)before.tv_nsec;
> +}
> +
> +void DebayerCpu::process(FrameBuffer *input, FrameBuffer *output, DebayerParams params)
> +{
> +	timespec frameStartTime;
> +
> +	if (measuredFrames_ < DebayerCpu::kLastFrameToMeasure) {
> +		frameStartTime = {};
> +		clock_gettime(CLOCK_MONOTONIC_RAW, &frameStartTime);
> +	}
> +
> +	/* Apply DebayerParams */
> +	if (params.gamma != gamma_correction_) {
> +		for (unsigned int i = 0; i < kGammaLookupSize; i++)
> +			gamma_[i] = UINT8_MAX * powf(i / (kGammaLookupSize - 1.0), params.gamma);
> +
> +		gamma_correction_ = params.gamma;
> +	}
> +
> +	for (unsigned int i = 0; i < kRGBLookupSize; i++) {
> +		constexpr unsigned int div =
> +			kRGBLookupSize * DebayerParams::kGain10 / kGammaLookupSize;
> +		unsigned int idx;
> +
> +		/* Apply gamma after gain! */
> +		idx = std::min({ i * params.gainR / div, (kGammaLookupSize - 1) });
> +		red_[i] = gamma_[idx];
> +
> +		idx = std::min({ i * params.gainG / div, (kGammaLookupSize - 1) });
> +		green_[i] = gamma_[idx];
> +
> +		idx = std::min({ i * params.gainB / div, (kGammaLookupSize - 1) });
> +		blue_[i] = gamma_[idx];
> +	}
> +
> +	/* Copy metadata from the input buffer */
> +	FrameMetadata &metadata = output->_d()->metadata();
> +	metadata.status = input->metadata().status;
> +	metadata.sequence = input->metadata().sequence;
> +	metadata.timestamp = input->metadata().timestamp;
> +
> +	MappedFrameBuffer in(input, MappedFrameBuffer::MapFlag::Read);
> +	MappedFrameBuffer out(output, MappedFrameBuffer::MapFlag::Write);
> +	if (!in.isValid() || !out.isValid()) {
> +		LOG(Debayer, Error) << "mmap-ing buffer(s) failed";
> +		metadata.status = FrameMetadata::FrameError;
> +		return;
> +	}
> +
> +	stats_->startFrame();
> +
> +	if (inputConfig_.patternSize.height == 2)
> +		process2(in.planes()[0].data(), out.planes()[0].data());
> +	else
> +		process4(in.planes()[0].data(), out.planes()[0].data());
> +
> +	metadata.planes()[0].bytesused = out.planes()[0].size();
> +
> +	/* Measure before emitting signals */
> +	if (measuredFrames_ < DebayerCpu::kLastFrameToMeasure &&
> +	    ++measuredFrames_ > DebayerCpu::kFramesToSkip) {
> +		timespec frameEndTime = {};
> +		clock_gettime(CLOCK_MONOTONIC_RAW, &frameEndTime);
> +		frameProcessTime_ += timeDiff(frameEndTime, frameStartTime);
> +		if (measuredFrames_ == DebayerCpu::kLastFrameToMeasure) {
> +			const unsigned int measuredFrames = DebayerCpu::kLastFrameToMeasure -
> +							    DebayerCpu::kFramesToSkip;
> +			LOG(Debayer, Info)
> +				<< "Processed " << measuredFrames
> +				<< " frames in " << frameProcessTime_ / 1000 << "us, "
> +				<< frameProcessTime_ / (1000 * measuredFrames)
> +				<< " us/frame";
> +		}
> +	}

Is this debugging leftovers, or should it be cleaned up ? It seems quite
a bit of a hack. If we want to enable measurements, we shouldn't
hardcode them to frames 30 to 60.

> +
> +	stats_->finishFrame();
> +	outputBufferReady.emit(output);
> +	inputBufferReady.emit(input);
> +}
> +
> +SizeRange DebayerCpu::sizes(PixelFormat inputFormat, const Size &inputSize)
> +{
> +	Size pattern_size = patternSize(inputFormat);
> +	unsigned int border_height = pattern_size.height;
> +
> +	if (pattern_size.isNull())
> +		return {};
> +
> +	/* No need for top/bottom border with a pattern height of 2 */
> +	if (pattern_size.height == 2)
> +		border_height = 0;
> +
> +	/*
> +	 * For debayer interpolation a border is kept around the entire image
> +	 * and the minimum output size is pattern-height x pattern-width.
> +	 */
> +	if (inputSize.width < (3 * pattern_size.width) ||
> +	    inputSize.height < (2 * border_height + pattern_size.height)) {
> +		LOG(Debayer, Warning)
> +			<< "Input format size too small: " << inputSize.toString();
> +		return {};
> +	}
> +
> +	return SizeRange(Size(pattern_size.width, pattern_size.height),
> +			 Size((inputSize.width - 2 * pattern_size.width) & ~(pattern_size.width - 1),
> +			      (inputSize.height - 2 * border_height) & ~(pattern_size.height - 1)),
> +			 pattern_size.width, pattern_size.height);
> +}
> +
> +} /* namespace libcamera */
> diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h
> new file mode 100644
> index 00000000..8a51ed85
> --- /dev/null
> +++ b/src/libcamera/software_isp/debayer_cpu.h
> @@ -0,0 +1,143 @@
> +/* SPDX-License-Identifier: LGPL-2.1-or-later */
> +/*
> + * Copyright (C) 2023, Linaro Ltd
> + * Copyright (C) 2023, Red Hat Inc.
> + *
> + * Authors:
> + * Hans de Goede <hdegoede@redhat.com>
> + *
> + * debayer_cpu.h - CPU based debayering header
> + */
> +
> +#pragma once
> +
> +#include <memory>
> +#include <stdint.h>
> +#include <vector>
> +
> +#include <libcamera/base/object.h>
> +
> +#include "debayer.h"
> +#include "swstats_cpu.h"
> +
> +namespace libcamera {
> +
> +class DebayerCpu : public Debayer, public Object
> +{
> +public:
> +	DebayerCpu(std::unique_ptr<SwStatsCpu> stats);
> +	~DebayerCpu();
> +
> +	int configure(const StreamConfiguration &inputCfg,
> +		      const std::vector<std::reference_wrapper<StreamConfiguration>> &outputCfgs);
> +	Size patternSize(PixelFormat inputFormat);
> +	std::vector<PixelFormat> formats(PixelFormat input);
> +	std::tuple<unsigned int, unsigned int>
> +	strideAndFrameSize(const PixelFormat &outputFormat, const Size &size);
> +	void process(FrameBuffer *input, FrameBuffer *output, DebayerParams params);
> +	SizeRange sizes(PixelFormat inputFormat, const Size &inputSize);
> +
> +	/**
> +	 * \brief Get the file descriptor for the statistics.
> +	 *
> +	 * \return the file descriptor pointing to the statistics.
> +	 */
> +	const SharedFD &getStatsFD() { return stats_->getStatsFD(); }

This, plus the fact that this class handles colour gains and gamma,
makes me thing we have either a naming issue, or an architecture issue.

> +
> +	/**
> +	 * \brief Get the output frame size.
> +	 *
> +	 * \return The output frame size.
> +	 */
> +	unsigned int frameSize() { return outputConfig_.frameSize; }
> +
> +private:
> +	/**
> +	 * \brief Called to debayer 1 line of Bayer input data to output format
> +	 * \param[out] dst Pointer to the start of the output line to write
> +	 * \param[in] src The input data
> +	 *
> +	 * Input data is an array of (patternSize_.height + 1) src
> +	 * pointers each pointing to a line in the Bayer source. The middle
> +	 * element of the array will point to the actual line being processed.
> +	 * Earlier element(s) will point to the previous line(s) and later
> +	 * element(s) to the next line(s).
> +	 *
> +	 * These functions take an array of src pointers, rather than
> +	 * a single src pointer + a stride for the source, so that when the src
> +	 * is slow uncached memory it can be copied to faster memory before
> +	 * debayering. Debayering a standard 2x2 Bayer pattern requires access
> +	 * to the previous and next src lines for interpolating the missing
> +	 * colors. To allow copying the src lines only once 3 temporary buffers
> +	 * each holding a single line are used, re-using the oldest buffer for
> +	 * the next line and the pointers are swizzled so that:
> +	 * src[0] = previous-line, src[1] = currrent-line, src[2] = next-line.
> +	 * This way the 3 pointers passed to the debayer functions form
> +	 * a sliding window over the src avoiding the need to copy each
> +	 * line more than once.
> +	 *
> +	 * Similarly for bayer patterns which repeat every 4 lines, 5 src
> +	 * pointers are passed holding: src[0] = 2-lines-up, src[1] = 1-line-up
> +	 * src[2] = current-line, src[3] = 1-line-down, src[4] = 2-lines-down.
> +	 */
> +	using debayerFn = void (DebayerCpu::*)(uint8_t *dst, const uint8_t *src[]);
> +
> +	/* CSI-2 packed 10-bit raw bayer format (all the 4 orders) */
> +	void debayer10P_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]);
> +	void debayer10P_GRGR_BGR888(uint8_t *dst, const uint8_t *src[]);
> +	void debayer10P_GBGB_BGR888(uint8_t *dst, const uint8_t *src[]);
> +	void debayer10P_RGRG_BGR888(uint8_t *dst, const uint8_t *src[]);
> +
> +	struct DebayerInputConfig {
> +		Size patternSize;
> +		unsigned int bpp; /* Memory used per pixel, not precision */
> +		unsigned int stride;
> +		std::vector<PixelFormat> outputFormats;
> +	};
> +
> +	struct DebayerOutputConfig {
> +		unsigned int bpp; /* Memory used per pixel, not precision */
> +		unsigned int stride;
> +		unsigned int frameSize;
> +	};
> +
> +	int getInputConfig(PixelFormat inputFormat, DebayerInputConfig &config);
> +	int getOutputConfig(PixelFormat outputFormat, DebayerOutputConfig &config);
> +	int setDebayerFunctions(PixelFormat inputFormat, PixelFormat outputFormat);
> +	void setupInputMemcpy(const uint8_t *linePointers[]);
> +	void shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src);
> +	void memcpyNextLine(const uint8_t *linePointers[]);
> +	void process2(const uint8_t *src, uint8_t *dst);
> +	void process4(const uint8_t *src, uint8_t *dst);
> +
> +	static constexpr unsigned int kGammaLookupSize = 1024;
> +	static constexpr unsigned int kRGBLookupSize = 256;
> +	/* Max. supported Bayer pattern height is 4, debayering this requires 5 lines */
> +	static constexpr unsigned int kMaxLineBuffers = 5;
> +
> +	std::array<uint8_t, kGammaLookupSize> gamma_;
> +	std::array<uint8_t, kRGBLookupSize> red_;
> +	std::array<uint8_t, kRGBLookupSize> green_;
> +	std::array<uint8_t, kRGBLookupSize> blue_;
> +	debayerFn debayer0_;
> +	debayerFn debayer1_;
> +	debayerFn debayer2_;
> +	debayerFn debayer3_;
> +	Rectangle window_;
> +	DebayerInputConfig inputConfig_;
> +	DebayerOutputConfig outputConfig_;
> +	std::unique_ptr<SwStatsCpu> stats_;
> +	uint8_t *lineBuffers_[kMaxLineBuffers];
> +	unsigned int lineBufferLength_;
> +	unsigned int lineBufferPadding_;
> +	unsigned int lineBufferIndex_;
> +	bool enableInputMemcpy_;
> +	float gamma_correction_;
> +	unsigned int measuredFrames_;
> +	int64_t frameProcessTime_;
> +	/* Skip 30 frames for things to stabilize then measure 30 frames */
> +	static constexpr unsigned int kFramesToSkip = 30;
> +	static constexpr unsigned int kLastFrameToMeasure = 60;
> +};
> +
> +} /* namespace libcamera */
> diff --git a/src/libcamera/software_isp/meson.build b/src/libcamera/software_isp/meson.build
> index 62095f61..71b46539 100644
> --- a/src/libcamera/software_isp/meson.build
> +++ b/src/libcamera/software_isp/meson.build
> @@ -9,5 +9,6 @@ endif
>  
>  libcamera_sources += files([
>      'debayer.cpp',
> +    'debayer_cpu.cpp',
>      'swstats_cpu.cpp',
>  ])
Laurent Pinchart March 27, 2024, 6:01 p.m. UTC | #2
Another comment.

On Wed, Mar 27, 2024 at 04:12:25PM +0200, Laurent Pinchart wrote:
> Hi Milan and Hans,
> 
> Thank you for the patch.
> 
> On Tue, Mar 19, 2024 at 01:35:55PM +0100, Milan Zamazal wrote:
> > From: Hans de Goede <hdegoede@redhat.com>
> > 
> > Add CPU based debayering implementation. This initial implementation
> > only supports debayering packed 10 bits per pixel bayer data in
> > the 4 standard bayer orders.
> > 
> > Doxygen documentation by Dennis Bonke.
> > 
> > Tested-by: Bryan O'Donoghue <bryan.odonoghue@linaro.org> # sc8280xp Lenovo x13s
> > Tested-by: Pavel Machek <pavel@ucw.cz>
> > Reviewed-by: Pavel Machek <pavel@ucw.cz>
> > Co-developed-by: Dennis Bonke <admin@dennisbonke.com>
> > Signed-off-by: Dennis Bonke <admin@dennisbonke.com>
> > Co-developed-by: Andrey Konovalov <andrey.konovalov@linaro.org>
> > Signed-off-by: Andrey Konovalov <andrey.konovalov@linaro.org>
> > Co-developed-by: Pavel Machek <pavel@ucw.cz>
> > Signed-off-by: Pavel Machek <pavel@ucw.cz>
> > Signed-off-by: Hans de Goede <hdegoede@redhat.com>
> > Reviewed-by: Milan Zamazal <mzamazal@redhat.com>
> > ---
> >  src/libcamera/software_isp/debayer_cpu.cpp | 626 +++++++++++++++++++++
> >  src/libcamera/software_isp/debayer_cpu.h   | 143 +++++
> >  src/libcamera/software_isp/meson.build     |   1 +
> >  3 files changed, 770 insertions(+)
> >  create mode 100644 src/libcamera/software_isp/debayer_cpu.cpp
> >  create mode 100644 src/libcamera/software_isp/debayer_cpu.h
> > 
> > diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp
> > new file mode 100644
> > index 00000000..f932362c
> > --- /dev/null
> > +++ b/src/libcamera/software_isp/debayer_cpu.cpp
> > @@ -0,0 +1,626 @@
> > +/* SPDX-License-Identifier: LGPL-2.1-or-later */
> > +/*
> > + * Copyright (C) 2023, Linaro Ltd
> > + * Copyright (C) 2023, Red Hat Inc.
> > + *
> > + * Authors:
> > + * Hans de Goede <hdegoede@redhat.com>
> > + *
> > + * debayer_cpu.cpp - CPU based debayering class
> > + */
> > +
> > +#include "debayer_cpu.h"
> > +
> > +#include <math.h>
> > +#include <stdlib.h>
> > +#include <time.h>
> > +
> > +#include <libcamera/formats.h>
> > +
> > +#include "libcamera/internal/bayer_format.h"
> > +#include "libcamera/internal/framebuffer.h"
> > +#include "libcamera/internal/mapped_framebuffer.h"
> > +
> > +namespace libcamera {
> > +
> > +/**
> > + * \class DebayerCpu
> > + * \brief Class for debayering on the CPU
> > + *
> > + * Implementation for CPU based debayering
> > + */
> > +
> > +/**
> > + * \brief Constructs a DebayerCpu object.
> > + * \param[in] stats Pointer to the stats object to use.
> 
> No period at the end of \brief or \param.
> 
> > + */
> > +DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats)
> > +	: stats_(std::move(stats)), gamma_correction_(1.0)
> 
> gammaCorrection_
> 
> > +{
> > +#ifdef __x86_64__
> > +	enableInputMemcpy_ = false;
> > +#else
> > +	enableInputMemcpy_ = true;
> > +#endif
> 
> This is very much *not* nice :-( It needs to at least be explained
> clearly somewhere.
> 
> > +	/* Initialize gamma to 1.0 curve */
> > +	for (unsigned int i = 0; i < kGammaLookupSize; i++)
> > +		gamma_[i] = i / (kGammaLookupSize / kRGBLookupSize);
> > +
> > +	for (unsigned int i = 0; i < kMaxLineBuffers; i++)
> > +		lineBuffers_[i] = nullptr;
> > +}
> > +
> > +DebayerCpu::~DebayerCpu()
> > +{
> > +	for (unsigned int i = 0; i < kMaxLineBuffers; i++)
> > +		free(lineBuffers_[i]);
> > +}
> > +
> > +// RGR
> > +// GBG
> > +// RGR
> 
> C-style comments.
> 
> > +#define BGGR_BGR888(p, n, div)                                                                \
> > +	*dst++ = blue_[curr[x] / (div)];                                                      \
> > +	*dst++ = green_[(prev[x] + curr[x - p] + curr[x + n] + next[x]) / (4 * (div))];       \
> > +	*dst++ = red_[(prev[x - p] + prev[x + n] + next[x - p] + next[x + n]) / (4 * (div))]; \
> > +	x++;
> > +
> > +// GBG
> > +// RGR
> > +// GBG
> > +#define GRBG_BGR888(p, n, div)                                    \
> > +	*dst++ = blue_[(prev[x] + next[x]) / (2 * (div))];        \
> > +	*dst++ = green_[curr[x] / (div)];                         \
> > +	*dst++ = red_[(curr[x - p] + curr[x + n]) / (2 * (div))]; \
> > +	x++;
> > +
> > +// GRG
> > +// BGB
> > +// GRG
> > +#define GBRG_BGR888(p, n, div)                                     \
> > +	*dst++ = blue_[(curr[x - p] + curr[x + n]) / (2 * (div))]; \
> > +	*dst++ = green_[curr[x] / (div)];                          \
> > +	*dst++ = red_[(prev[x] + next[x]) / (2 * (div))];          \
> > +	x++;
> > +
> > +// BGB
> > +// GRG
> > +// BGB
> > +#define RGGB_BGR888(p, n, div)                                                                 \
> > +	*dst++ = blue_[(prev[x - p] + prev[x + n] + next[x - p] + next[x + n]) / (4 * (div))]; \
> > +	*dst++ = green_[(prev[x] + curr[x - p] + curr[x + n] + next[x]) / (4 * (div))];        \
> > +	*dst++ = red_[curr[x] / (div)];                                                        \
> > +	x++;
> > +
> > +void DebayerCpu::debayer10P_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
> > +{
> > +	const int width_in_bytes = window_.width * 5 / 4;
> 
> snakeCase here too, ane where applicable everywhere else.
> 
> > +	const uint8_t *prev = (const uint8_t *)src[0];
> 
> Is the cast needed ? If so it should be a C++ cast.
> 
> > +	const uint8_t *curr = (const uint8_t *)src[1];
> > +	const uint8_t *next = (const uint8_t *)src[2];
> > +
> > +	/*
> > +	 * For the first pixel getting a pixel from the previous column uses
> > +	 * x - 2 to skip the 5th byte with least-significant bits for 4 pixels.
> > +	 * Same for last pixel (uses x + 2) and looking at the next column.
> > +	 */
> > +	for (int x = 0; x < width_in_bytes;) {
> > +		/* First pixel */
> > +		BGGR_BGR888(2, 1, 1)
> > +		/* Second pixel BGGR -> GBRG */
> > +		GBRG_BGR888(1, 1, 1)
> > +		/* Same thing for third and fourth pixels */
> > +		BGGR_BGR888(1, 1, 1)
> > +		GBRG_BGR888(1, 2, 1)
> > +		/* Skip 5th src byte with 4 x 2 least-significant-bits */
> > +		x++;
> > +	}
> > +}
> > +
> > +void DebayerCpu::debayer10P_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
> > +{
> > +	const int width_in_bytes = window_.width * 5 / 4;
> > +	const uint8_t *prev = (const uint8_t *)src[0];
> > +	const uint8_t *curr = (const uint8_t *)src[1];
> > +	const uint8_t *next = (const uint8_t *)src[2];
> > +
> > +	for (int x = 0; x < width_in_bytes;) {
> > +		/* First pixel */
> > +		GRBG_BGR888(2, 1, 1)
> > +		/* Second pixel GRBG -> RGGB */
> > +		RGGB_BGR888(1, 1, 1)
> > +		/* Same thing for third and fourth pixels */
> > +		GRBG_BGR888(1, 1, 1)
> > +		RGGB_BGR888(1, 2, 1)
> > +		/* Skip 5th src byte with 4 x 2 least-significant-bits */
> > +		x++;
> > +	}
> > +}
> > +
> > +void DebayerCpu::debayer10P_GBGB_BGR888(uint8_t *dst, const uint8_t *src[])
> > +{
> > +	const int width_in_bytes = window_.width * 5 / 4;
> > +	const uint8_t *prev = (const uint8_t *)src[0];
> > +	const uint8_t *curr = (const uint8_t *)src[1];
> > +	const uint8_t *next = (const uint8_t *)src[2];
> > +
> > +	for (int x = 0; x < width_in_bytes;) {
> > +		/* Even pixel */
> > +		GBRG_BGR888(2, 1, 1)
> > +		/* Odd pixel GBGR -> BGGR */
> > +		BGGR_BGR888(1, 1, 1)
> > +		/* Same thing for next 2 pixels */
> > +		GBRG_BGR888(1, 1, 1)
> > +		BGGR_BGR888(1, 2, 1)
> > +		/* Skip 5th src byte with 4 x 2 least-significant-bits */
> > +		x++;
> > +	}
> > +}
> > +
> > +void DebayerCpu::debayer10P_RGRG_BGR888(uint8_t *dst, const uint8_t *src[])
> > +{
> > +	const int width_in_bytes = window_.width * 5 / 4;
> > +	const uint8_t *prev = (const uint8_t *)src[0];
> > +	const uint8_t *curr = (const uint8_t *)src[1];
> > +	const uint8_t *next = (const uint8_t *)src[2];
> > +
> > +	for (int x = 0; x < width_in_bytes;) {
> > +		/* Even pixel */
> > +		RGGB_BGR888(2, 1, 1)
> > +		/* Odd pixel RGGB -> GRBG */
> > +		GRBG_BGR888(1, 1, 1)
> > +		/* Same thing for next 2 pixels */
> > +		RGGB_BGR888(1, 1, 1)
> > +		GRBG_BGR888(1, 2, 1)
> > +		/* Skip 5th src byte with 4 x 2 least-significant-bits */
> > +		x++;
> > +	}
> > +}
> > +
> > +static bool isStandardBayerOrder(BayerFormat::Order order)
> > +{
> > +	return order == BayerFormat::BGGR || order == BayerFormat::GBRG ||
> > +	       order == BayerFormat::GRBG || order == BayerFormat::RGGB;
> > +}
> > +
> > +/*
> > + * Setup the Debayer object according to the passed in parameters.
> > + * Return 0 on success, a negative errno value on failure
> > + * (unsupported parameters).
> > + */
> > +int DebayerCpu::getInputConfig(PixelFormat inputFormat, DebayerInputConfig &config)
> > +{
> > +	BayerFormat bayerFormat =
> > +		BayerFormat::fromPixelFormat(inputFormat);
> > +
> > +	if (bayerFormat.bitDepth == 10 &&
> > +	    bayerFormat.packing == BayerFormat::Packing::CSI2 &&
> > +	    isStandardBayerOrder(bayerFormat.order)) {
> > +		config.bpp = 10;
> > +		config.patternSize.width = 4; /* 5 bytes per *4* pixels */
> > +		config.patternSize.height = 2;
> > +		config.outputFormats = std::vector<PixelFormat>({ formats::RGB888 });
> > +		return 0;
> > +	}
> > +
> > +	LOG(Debayer, Info)
> 
> Shouldn't this be an Error mesage ? Same below.
> 
> > +		<< "Unsupported input format " << inputFormat.toString();
> > +	return -EINVAL;
> > +}
> > +
> > +int DebayerCpu::getOutputConfig(PixelFormat outputFormat, DebayerOutputConfig &config)
> > +{
> > +	if (outputFormat == formats::RGB888) {
> > +		config.bpp = 24;
> > +		return 0;
> > +	}
> > +
> > +	LOG(Debayer, Info)
> > +		<< "Unsupported output format " << outputFormat.toString();
> > +	return -EINVAL;
> > +}
> > +
> > +/* TODO: this ignores outputFormat since there is only 1 supported outputFormat for now */

/* \todo This ignores outputFormat since there is only 1 supported outputFormat for now */

> > +int DebayerCpu::setDebayerFunctions(PixelFormat inputFormat, [[maybe_unused]] PixelFormat outputFormat)
> > +{
> > +	BayerFormat bayerFormat =
> > +		BayerFormat::fromPixelFormat(inputFormat);
> > +
> > +	if (bayerFormat.bitDepth == 10 &&
> > +	    bayerFormat.packing == BayerFormat::Packing::CSI2) {
> > +		switch (bayerFormat.order) {
> > +		case BayerFormat::BGGR:
> > +			debayer0_ = &DebayerCpu::debayer10P_BGBG_BGR888;
> > +			debayer1_ = &DebayerCpu::debayer10P_GRGR_BGR888;
> > +			return 0;
> > +		case BayerFormat::GBRG:
> > +			debayer0_ = &DebayerCpu::debayer10P_GBGB_BGR888;
> > +			debayer1_ = &DebayerCpu::debayer10P_RGRG_BGR888;
> > +			return 0;
> > +		case BayerFormat::GRBG:
> > +			debayer0_ = &DebayerCpu::debayer10P_GRGR_BGR888;
> > +			debayer1_ = &DebayerCpu::debayer10P_BGBG_BGR888;
> > +			return 0;
> > +		case BayerFormat::RGGB:
> > +			debayer0_ = &DebayerCpu::debayer10P_RGRG_BGR888;
> > +			debayer1_ = &DebayerCpu::debayer10P_GBGB_BGR888;
> > +			return 0;
> > +		default:
> > +			break;
> > +		}
> > +	}
> > +
> > +	LOG(Debayer, Error) << "Unsupported input output format combination";
> > +	return -EINVAL;
> > +}
> > +
> > +int DebayerCpu::configure(const StreamConfiguration &inputCfg,
> > +			  const std::vector<std::reference_wrapper<StreamConfiguration>> &outputCfgs)
> > +{
> > +	if (getInputConfig(inputCfg.pixelFormat, inputConfig_) != 0)
> > +		return -EINVAL;
> > +
> > +	if (stats_->configure(inputCfg) != 0)
> > +		return -EINVAL;
> > +
> > +	const Size &stats_pattern_size = stats_->patternSize();
> > +	if (inputConfig_.patternSize.width != stats_pattern_size.width ||
> > +	    inputConfig_.patternSize.height != stats_pattern_size.height) {
> > +		LOG(Debayer, Error)
> > +			<< "mismatching stats and debayer pattern sizes for "
> > +			<< inputCfg.pixelFormat.toString();
> > +		return -EINVAL;
> > +	}
> > +
> > +	inputConfig_.stride = inputCfg.stride;
> > +
> > +	if (outputCfgs.size() != 1) {
> > +		LOG(Debayer, Error)
> > +			<< "Unsupported number of output streams: "
> > +			<< outputCfgs.size();
> > +		return -EINVAL;
> > +	}
> > +
> > +	const StreamConfiguration &outputCfg = outputCfgs[0];
> > +	SizeRange outSizeRange = sizes(inputCfg.pixelFormat, inputCfg.size);
> > +	std::tie(outputConfig_.stride, outputConfig_.frameSize) =
> > +		strideAndFrameSize(outputCfg.pixelFormat, outputCfg.size);
> > +
> > +	if (!outSizeRange.contains(outputCfg.size) || outputConfig_.stride != outputCfg.stride) {
> > +		LOG(Debayer, Error)
> > +			<< "Invalid output size/stride: "
> > +			<< "\n  " << outputCfg.size << " (" << outSizeRange << ")"
> > +			<< "\n  " << outputCfg.stride << " (" << outputConfig_.stride << ")";
> > +		return -EINVAL;
> > +	}
> > +
> > +	if (setDebayerFunctions(inputCfg.pixelFormat, outputCfg.pixelFormat) != 0)
> > +		return -EINVAL;
> > +
> > +	window_.x = ((inputCfg.size.width - outputCfg.size.width) / 2) &
> > +		    ~(inputConfig_.patternSize.width - 1);
> > +	window_.y = ((inputCfg.size.height - outputCfg.size.height) / 2) &
> > +		    ~(inputConfig_.patternSize.height - 1);
> > +	window_.width = outputCfg.size.width;
> > +	window_.height = outputCfg.size.height;
> > +
> > +	/* Don't pass x,y since process() already adjusts src before passing it */
> > +	stats_->setWindow(Rectangle(window_.size()));
> > +
> > +	/* pad with patternSize.Width on both left and right side */
> > +	lineBufferPadding_ = inputConfig_.patternSize.width * inputConfig_.bpp / 8;
> > +	lineBufferLength_ = window_.width * inputConfig_.bpp / 8 +
> > +			    2 * lineBufferPadding_;
> > +	for (unsigned int i = 0;
> > +	     i < (inputConfig_.patternSize.height + 1) && enableInputMemcpy_;
> > +	     i++) {
> > +		free(lineBuffers_[i]);
> > +		lineBuffers_[i] = (uint8_t *)malloc(lineBufferLength_);
> > +		if (!lineBuffers_[i])
> > +			return -ENOMEM;
> > +	}
> > +
> > +	measuredFrames_ = 0;
> > +	frameProcessTime_ = 0;
> > +
> > +	return 0;
> > +}
> > +
> > +/*
> > + * Get width and height at which the bayer-pattern repeats.
> > + * Return pattern-size or an empty Size for an unsupported inputFormat.
> > + */
> > +Size DebayerCpu::patternSize(PixelFormat inputFormat)
> > +{
> > +	DebayerCpu::DebayerInputConfig config;
> > +
> > +	if (getInputConfig(inputFormat, config) != 0)
> > +		return {};
> > +
> > +	return config.patternSize;
> > +}
> > +
> > +std::vector<PixelFormat> DebayerCpu::formats(PixelFormat inputFormat)
> > +{
> > +	DebayerCpu::DebayerInputConfig config;
> > +
> > +	if (getInputConfig(inputFormat, config) != 0)
> > +		return std::vector<PixelFormat>();
> > +
> > +	return config.outputFormats;
> > +}
> > +
> > +std::tuple<unsigned int, unsigned int>
> > +DebayerCpu::strideAndFrameSize(const PixelFormat &outputFormat, const Size &size)
> > +{
> > +	DebayerCpu::DebayerOutputConfig config;
> > +
> > +	if (getOutputConfig(outputFormat, config) != 0)
> > +		return std::make_tuple(0, 0);
> > +
> > +	/* round up to multiple of 8 for 64 bits alignment */
> > +	unsigned int stride = (size.width * config.bpp / 8 + 7) & ~7;
> > +
> > +	return std::make_tuple(stride, stride * size.height);
> > +}
> > +
> > +void DebayerCpu::setupInputMemcpy(const uint8_t *linePointers[])
> > +{
> > +	const unsigned int patternHeight = inputConfig_.patternSize.height;
> > +
> > +	if (!enableInputMemcpy_)
> > +		return;
> > +
> > +	for (unsigned int i = 0; i < patternHeight; i++) {
> > +		memcpy(lineBuffers_[i], linePointers[i + 1] - lineBufferPadding_,
> > +		       lineBufferLength_);
> > +		linePointers[i + 1] = lineBuffers_[i] + lineBufferPadding_;
> > +	}
> > +
> > +	/* Point lineBufferIndex_ to first unused lineBuffer */
> > +	lineBufferIndex_ = patternHeight;
> > +}
> > +
> > +void DebayerCpu::shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src)
> > +{
> > +	const unsigned int patternHeight = inputConfig_.patternSize.height;
> > +
> > +	for (unsigned int i = 0; i < patternHeight; i++)
> > +		linePointers[i] = linePointers[i + 1];
> > +
> > +	linePointers[patternHeight] = src +
> > +				      (patternHeight / 2) * (int)inputConfig_.stride;
> > +}
> > +
> > +void DebayerCpu::memcpyNextLine(const uint8_t *linePointers[])
> > +{
> > +	const unsigned int patternHeight = inputConfig_.patternSize.height;
> > +
> > +	if (!enableInputMemcpy_)
> > +		return;
> > +
> > +	memcpy(lineBuffers_[lineBufferIndex_], linePointers[patternHeight] - lineBufferPadding_,
> > +	       lineBufferLength_);
> > +	linePointers[patternHeight] = lineBuffers_[lineBufferIndex_] + lineBufferPadding_;
> > +
> > +	lineBufferIndex_ = (lineBufferIndex_ + 1) % (patternHeight + 1);
> > +}
> > +
> > +void DebayerCpu::process2(const uint8_t *src, uint8_t *dst)
> > +{
> > +	unsigned int y_end = window_.y + window_.height;
> > +	/* Holds [0] previous- [1] current- [2] next-line */
> > +	const uint8_t *linePointers[3];
> > +
> > +	/* Adjust src to top left corner of the window */
> > +	src += window_.y * inputConfig_.stride + window_.x * inputConfig_.bpp / 8;
> > +
> > +	/* [x] becomes [x - 1] after initial shiftLinePointers() call */
> > +	if (window_.y) {
> > +		linePointers[1] = src - inputConfig_.stride; /* previous-line */
> > +		linePointers[2] = src;
> > +	} else {
> > +		/* window_.y == 0, use the next line as prev line */
> > +		linePointers[1] = src + inputConfig_.stride;
> > +		linePointers[2] = src;
> > +		/* Last 2 lines also need special handling */
> > +		y_end -= 2;
> > +	}
> > +
> > +	setupInputMemcpy(linePointers);
> > +
> > +	for (unsigned int y = window_.y; y < y_end; y += 2) {
> > +		shiftLinePointers(linePointers, src);
> > +		memcpyNextLine(linePointers);
> > +		stats_->processLine0(y, linePointers);
> > +		(this->*debayer0_)(dst, linePointers);
> > +		src += inputConfig_.stride;
> > +		dst += outputConfig_.stride;
> > +
> > +		shiftLinePointers(linePointers, src);
> > +		memcpyNextLine(linePointers);
> > +		(this->*debayer1_)(dst, linePointers);
> > +		src += inputConfig_.stride;
> > +		dst += outputConfig_.stride;
> > +	}
> > +
> > +	if (window_.y == 0) {
> > +		shiftLinePointers(linePointers, src);
> > +		memcpyNextLine(linePointers);
> > +		stats_->processLine0(y_end, linePointers);
> > +		(this->*debayer0_)(dst, linePointers);
> > +		src += inputConfig_.stride;
> > +		dst += outputConfig_.stride;
> > +
> > +		shiftLinePointers(linePointers, src);
> > +		/* next line may point outside of src, use prev. */
> > +		linePointers[2] = linePointers[0];
> > +		(this->*debayer1_)(dst, linePointers);
> > +		src += inputConfig_.stride;
> > +		dst += outputConfig_.stride;
> > +	}
> > +}
> > +
> > +void DebayerCpu::process4(const uint8_t *src, uint8_t *dst)
> > +{
> > +	const unsigned int y_end = window_.y + window_.height;
> > +	/*
> > +	 * This holds pointers to [0] 2-lines-up [1] 1-line-up [2] current-line
> > +	 * [3] 1-line-down [4] 2-lines-down.
> > +	 */
> > +	const uint8_t *linePointers[5];
> > +
> > +	/* Adjust src to top left corner of the window */
> > +	src += window_.y * inputConfig_.stride + window_.x * inputConfig_.bpp / 8;
> > +
> > +	/* [x] becomes [x - 1] after initial shiftLinePointers() call */
> > +	linePointers[1] = src - 2 * inputConfig_.stride;
> > +	linePointers[2] = src - inputConfig_.stride;
> > +	linePointers[3] = src;
> > +	linePointers[4] = src + inputConfig_.stride;
> > +
> > +	setupInputMemcpy(linePointers);
> > +
> > +	for (unsigned int y = window_.y; y < y_end; y += 4) {
> > +		shiftLinePointers(linePointers, src);
> > +		memcpyNextLine(linePointers);
> > +		stats_->processLine0(y, linePointers);
> > +		(this->*debayer0_)(dst, linePointers);
> > +		src += inputConfig_.stride;
> > +		dst += outputConfig_.stride;
> > +
> > +		shiftLinePointers(linePointers, src);
> > +		memcpyNextLine(linePointers);
> > +		(this->*debayer1_)(dst, linePointers);
> > +		src += inputConfig_.stride;
> > +		dst += outputConfig_.stride;
> > +
> > +		shiftLinePointers(linePointers, src);
> > +		memcpyNextLine(linePointers);
> > +		stats_->processLine2(y, linePointers);
> > +		(this->*debayer2_)(dst, linePointers);
> > +		src += inputConfig_.stride;
> > +		dst += outputConfig_.stride;
> > +
> > +		shiftLinePointers(linePointers, src);
> > +		memcpyNextLine(linePointers);
> > +		(this->*debayer3_)(dst, linePointers);
> > +		src += inputConfig_.stride;
> > +		dst += outputConfig_.stride;
> > +	}
> > +}
> > +
> > +static inline int64_t timeDiff(timespec &after, timespec &before)
> > +{
> > +	return (after.tv_sec - before.tv_sec) * 1000000000LL +
> > +	       (int64_t)after.tv_nsec - (int64_t)before.tv_nsec;
> > +}
> > +
> > +void DebayerCpu::process(FrameBuffer *input, FrameBuffer *output, DebayerParams params)
> > +{
> > +	timespec frameStartTime;
> > +
> > +	if (measuredFrames_ < DebayerCpu::kLastFrameToMeasure) {
> > +		frameStartTime = {};
> > +		clock_gettime(CLOCK_MONOTONIC_RAW, &frameStartTime);
> > +	}
> > +
> > +	/* Apply DebayerParams */
> > +	if (params.gamma != gamma_correction_) {
> > +		for (unsigned int i = 0; i < kGammaLookupSize; i++)
> > +			gamma_[i] = UINT8_MAX * powf(i / (kGammaLookupSize - 1.0), params.gamma);
> > +
> > +		gamma_correction_ = params.gamma;
> > +	}
> > +
> > +	for (unsigned int i = 0; i < kRGBLookupSize; i++) {
> > +		constexpr unsigned int div =
> > +			kRGBLookupSize * DebayerParams::kGain10 / kGammaLookupSize;
> > +		unsigned int idx;
> > +
> > +		/* Apply gamma after gain! */
> > +		idx = std::min({ i * params.gainR / div, (kGammaLookupSize - 1) });
> > +		red_[i] = gamma_[idx];
> > +
> > +		idx = std::min({ i * params.gainG / div, (kGammaLookupSize - 1) });
> > +		green_[i] = gamma_[idx];
> > +
> > +		idx = std::min({ i * params.gainB / div, (kGammaLookupSize - 1) });
> > +		blue_[i] = gamma_[idx];
> > +	}
> > +
> > +	/* Copy metadata from the input buffer */
> > +	FrameMetadata &metadata = output->_d()->metadata();
> > +	metadata.status = input->metadata().status;
> > +	metadata.sequence = input->metadata().sequence;
> > +	metadata.timestamp = input->metadata().timestamp;
> > +
> > +	MappedFrameBuffer in(input, MappedFrameBuffer::MapFlag::Read);
> > +	MappedFrameBuffer out(output, MappedFrameBuffer::MapFlag::Write);
> > +	if (!in.isValid() || !out.isValid()) {
> > +		LOG(Debayer, Error) << "mmap-ing buffer(s) failed";
> > +		metadata.status = FrameMetadata::FrameError;
> > +		return;
> > +	}
> > +
> > +	stats_->startFrame();
> > +
> > +	if (inputConfig_.patternSize.height == 2)
> > +		process2(in.planes()[0].data(), out.planes()[0].data());
> > +	else
> > +		process4(in.planes()[0].data(), out.planes()[0].data());
> > +
> > +	metadata.planes()[0].bytesused = out.planes()[0].size();
> > +
> > +	/* Measure before emitting signals */
> > +	if (measuredFrames_ < DebayerCpu::kLastFrameToMeasure &&
> > +	    ++measuredFrames_ > DebayerCpu::kFramesToSkip) {
> > +		timespec frameEndTime = {};
> > +		clock_gettime(CLOCK_MONOTONIC_RAW, &frameEndTime);
> > +		frameProcessTime_ += timeDiff(frameEndTime, frameStartTime);
> > +		if (measuredFrames_ == DebayerCpu::kLastFrameToMeasure) {
> > +			const unsigned int measuredFrames = DebayerCpu::kLastFrameToMeasure -
> > +							    DebayerCpu::kFramesToSkip;
> > +			LOG(Debayer, Info)
> > +				<< "Processed " << measuredFrames
> > +				<< " frames in " << frameProcessTime_ / 1000 << "us, "
> > +				<< frameProcessTime_ / (1000 * measuredFrames)
> > +				<< " us/frame";
> > +		}
> > +	}
> 
> Is this debugging leftovers, or should it be cleaned up ? It seems quite
> a bit of a hack. If we want to enable measurements, we shouldn't
> hardcode them to frames 30 to 60.
> 
> > +
> > +	stats_->finishFrame();
> > +	outputBufferReady.emit(output);
> > +	inputBufferReady.emit(input);
> > +}
> > +
> > +SizeRange DebayerCpu::sizes(PixelFormat inputFormat, const Size &inputSize)
> > +{
> > +	Size pattern_size = patternSize(inputFormat);
> > +	unsigned int border_height = pattern_size.height;
> > +
> > +	if (pattern_size.isNull())
> > +		return {};
> > +
> > +	/* No need for top/bottom border with a pattern height of 2 */
> > +	if (pattern_size.height == 2)
> > +		border_height = 0;
> > +
> > +	/*
> > +	 * For debayer interpolation a border is kept around the entire image
> > +	 * and the minimum output size is pattern-height x pattern-width.
> > +	 */
> > +	if (inputSize.width < (3 * pattern_size.width) ||
> > +	    inputSize.height < (2 * border_height + pattern_size.height)) {
> > +		LOG(Debayer, Warning)
> > +			<< "Input format size too small: " << inputSize.toString();
> > +		return {};
> > +	}
> > +
> > +	return SizeRange(Size(pattern_size.width, pattern_size.height),
> > +			 Size((inputSize.width - 2 * pattern_size.width) & ~(pattern_size.width - 1),
> > +			      (inputSize.height - 2 * border_height) & ~(pattern_size.height - 1)),
> > +			 pattern_size.width, pattern_size.height);
> > +}
> > +
> > +} /* namespace libcamera */
> > diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h
> > new file mode 100644
> > index 00000000..8a51ed85
> > --- /dev/null
> > +++ b/src/libcamera/software_isp/debayer_cpu.h
> > @@ -0,0 +1,143 @@
> > +/* SPDX-License-Identifier: LGPL-2.1-or-later */
> > +/*
> > + * Copyright (C) 2023, Linaro Ltd
> > + * Copyright (C) 2023, Red Hat Inc.
> > + *
> > + * Authors:
> > + * Hans de Goede <hdegoede@redhat.com>
> > + *
> > + * debayer_cpu.h - CPU based debayering header
> > + */
> > +
> > +#pragma once
> > +
> > +#include <memory>
> > +#include <stdint.h>
> > +#include <vector>
> > +
> > +#include <libcamera/base/object.h>
> > +
> > +#include "debayer.h"
> > +#include "swstats_cpu.h"
> > +
> > +namespace libcamera {
> > +
> > +class DebayerCpu : public Debayer, public Object
> > +{
> > +public:
> > +	DebayerCpu(std::unique_ptr<SwStatsCpu> stats);
> > +	~DebayerCpu();
> > +
> > +	int configure(const StreamConfiguration &inputCfg,
> > +		      const std::vector<std::reference_wrapper<StreamConfiguration>> &outputCfgs);
> > +	Size patternSize(PixelFormat inputFormat);
> > +	std::vector<PixelFormat> formats(PixelFormat input);
> > +	std::tuple<unsigned int, unsigned int>
> > +	strideAndFrameSize(const PixelFormat &outputFormat, const Size &size);
> > +	void process(FrameBuffer *input, FrameBuffer *output, DebayerParams params);
> > +	SizeRange sizes(PixelFormat inputFormat, const Size &inputSize);
> > +
> > +	/**
> > +	 * \brief Get the file descriptor for the statistics.
> > +	 *
> > +	 * \return the file descriptor pointing to the statistics.
> > +	 */
> > +	const SharedFD &getStatsFD() { return stats_->getStatsFD(); }
> 
> This, plus the fact that this class handles colour gains and gamma,
> makes me thing we have either a naming issue, or an architecture issue.
> 
> > +
> > +	/**
> > +	 * \brief Get the output frame size.
> > +	 *
> > +	 * \return The output frame size.
> > +	 */
> > +	unsigned int frameSize() { return outputConfig_.frameSize; }
> > +
> > +private:
> > +	/**
> > +	 * \brief Called to debayer 1 line of Bayer input data to output format
> > +	 * \param[out] dst Pointer to the start of the output line to write
> > +	 * \param[in] src The input data
> > +	 *
> > +	 * Input data is an array of (patternSize_.height + 1) src
> > +	 * pointers each pointing to a line in the Bayer source. The middle
> > +	 * element of the array will point to the actual line being processed.
> > +	 * Earlier element(s) will point to the previous line(s) and later
> > +	 * element(s) to the next line(s).
> > +	 *
> > +	 * These functions take an array of src pointers, rather than
> > +	 * a single src pointer + a stride for the source, so that when the src
> > +	 * is slow uncached memory it can be copied to faster memory before
> > +	 * debayering. Debayering a standard 2x2 Bayer pattern requires access
> > +	 * to the previous and next src lines for interpolating the missing
> > +	 * colors. To allow copying the src lines only once 3 temporary buffers
> > +	 * each holding a single line are used, re-using the oldest buffer for
> > +	 * the next line and the pointers are swizzled so that:
> > +	 * src[0] = previous-line, src[1] = currrent-line, src[2] = next-line.
> > +	 * This way the 3 pointers passed to the debayer functions form
> > +	 * a sliding window over the src avoiding the need to copy each
> > +	 * line more than once.
> > +	 *
> > +	 * Similarly for bayer patterns which repeat every 4 lines, 5 src
> > +	 * pointers are passed holding: src[0] = 2-lines-up, src[1] = 1-line-up
> > +	 * src[2] = current-line, src[3] = 1-line-down, src[4] = 2-lines-down.
> > +	 */
> > +	using debayerFn = void (DebayerCpu::*)(uint8_t *dst, const uint8_t *src[]);
> > +
> > +	/* CSI-2 packed 10-bit raw bayer format (all the 4 orders) */
> > +	void debayer10P_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]);
> > +	void debayer10P_GRGR_BGR888(uint8_t *dst, const uint8_t *src[]);
> > +	void debayer10P_GBGB_BGR888(uint8_t *dst, const uint8_t *src[]);
> > +	void debayer10P_RGRG_BGR888(uint8_t *dst, const uint8_t *src[]);
> > +
> > +	struct DebayerInputConfig {
> > +		Size patternSize;
> > +		unsigned int bpp; /* Memory used per pixel, not precision */
> > +		unsigned int stride;
> > +		std::vector<PixelFormat> outputFormats;
> > +	};
> > +
> > +	struct DebayerOutputConfig {
> > +		unsigned int bpp; /* Memory used per pixel, not precision */
> > +		unsigned int stride;
> > +		unsigned int frameSize;
> > +	};
> > +
> > +	int getInputConfig(PixelFormat inputFormat, DebayerInputConfig &config);
> > +	int getOutputConfig(PixelFormat outputFormat, DebayerOutputConfig &config);
> > +	int setDebayerFunctions(PixelFormat inputFormat, PixelFormat outputFormat);
> > +	void setupInputMemcpy(const uint8_t *linePointers[]);
> > +	void shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src);
> > +	void memcpyNextLine(const uint8_t *linePointers[]);
> > +	void process2(const uint8_t *src, uint8_t *dst);
> > +	void process4(const uint8_t *src, uint8_t *dst);
> > +
> > +	static constexpr unsigned int kGammaLookupSize = 1024;
> > +	static constexpr unsigned int kRGBLookupSize = 256;
> > +	/* Max. supported Bayer pattern height is 4, debayering this requires 5 lines */
> > +	static constexpr unsigned int kMaxLineBuffers = 5;
> > +
> > +	std::array<uint8_t, kGammaLookupSize> gamma_;
> > +	std::array<uint8_t, kRGBLookupSize> red_;
> > +	std::array<uint8_t, kRGBLookupSize> green_;
> > +	std::array<uint8_t, kRGBLookupSize> blue_;
> > +	debayerFn debayer0_;
> > +	debayerFn debayer1_;
> > +	debayerFn debayer2_;
> > +	debayerFn debayer3_;
> > +	Rectangle window_;
> > +	DebayerInputConfig inputConfig_;
> > +	DebayerOutputConfig outputConfig_;
> > +	std::unique_ptr<SwStatsCpu> stats_;
> > +	uint8_t *lineBuffers_[kMaxLineBuffers];
> > +	unsigned int lineBufferLength_;
> > +	unsigned int lineBufferPadding_;
> > +	unsigned int lineBufferIndex_;
> > +	bool enableInputMemcpy_;
> > +	float gamma_correction_;
> > +	unsigned int measuredFrames_;
> > +	int64_t frameProcessTime_;
> > +	/* Skip 30 frames for things to stabilize then measure 30 frames */
> > +	static constexpr unsigned int kFramesToSkip = 30;
> > +	static constexpr unsigned int kLastFrameToMeasure = 60;
> > +};
> > +
> > +} /* namespace libcamera */
> > diff --git a/src/libcamera/software_isp/meson.build b/src/libcamera/software_isp/meson.build
> > index 62095f61..71b46539 100644
> > --- a/src/libcamera/software_isp/meson.build
> > +++ b/src/libcamera/software_isp/meson.build
> > @@ -9,5 +9,6 @@ endif
> >  
> >  libcamera_sources += files([
> >      'debayer.cpp',
> > +    'debayer_cpu.cpp',
> >      'swstats_cpu.cpp',
> >  ])
Hans de Goede March 27, 2024, 6:08 p.m. UTC | #3
Hi Laurent,

On 3/27/24 3:12 PM, Laurent Pinchart wrote:
> Hi Milan and Hans,
> 
> Thank you for the patch.

Same as with the previous review: Thank you for the review.

Milan is going to prepare a v7 of this series, so I'm going to <snip>
all the trivial remarks and only comment on the remaining remarks.

> On Tue, Mar 19, 2024 at 01:35:55PM +0100, Milan Zamazal wrote:
>> From: Hans de Goede <hdegoede@redhat.com>

<snip>

>> +{
>> +#ifdef __x86_64__
>> +	enableInputMemcpy_ = false;
>> +#else
>> +	enableInputMemcpy_ = true;
>> +#endif
> 
> This is very much *not* nice :-( It needs to at least be explained
> clearly somewhere.

Yeah this really needs to be a parameter to the SoftwareISP
constructor because some non x86 designs may also have
fully cached buffers, so they can also skip the extra
memcpy step.

My suggestion would be to introduce a struct SoftwareIspParams
with only a bool enableInputMemcpy_ in there for now and
then in the simplepipeline handler instead of having a bool
flag to indicate if the SoftwareISP should be list for a specific
media-controller driver-name, have a pointer to this struct and
have that pointer being non NULL indicate the swisp should
be used.

For now we would then have 2 SoftwareIspParams structs
this pointer can point to (uncached and cached) but in the future
we may very well have 1 SoftwareIspParams struct per supported
mediactl driver.

Actually since the IPU6 kernel side has not been merged yet,
for now we can just replace this with a hardcoded:

	enableInputMemcpy_ = true;

Milan, lets do that for the v7 submission.

Then I can add some mechanism to config this as a follow up
patch when IPU6 has actually landed in the kernel and thus
we can enable it in libcamera upstream.

<snip>

>> +void DebayerCpu::debayer10P_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
>> +{
>> +	const int width_in_bytes = window_.width * 5 / 4;
> 
> snakeCase here too, ane where applicable everywhere else.
> 
>> +	const uint8_t *prev = (const uint8_t *)src[0];
> 
> Is the cast needed ? If so it should be a C++ cast.

For the set of debayer10P_* functions introduced as the first
supported bayer fmt these casts are not necessary and can
be dropped.

In "[PATCH v6 14/18] libcamera: debayer_cpu: Add support for 8, 10
and 12 bpp unpacked bayer input" there are more casts and these
are actually necessary and will need to be replaced with
static c++ style casts.

>> +	LOG(Debayer, Info)
> 
> Shouldn't this be an Error mesage ? Same below.

These functions are called when the simple pipeline handler
is discovering supported formats so on CSI receivers or sensors
supporting multiple formats this happen for one of the fmts
is sorta expected behavior.

I don't feel strongly about this though, so if you want this
to be upgraded to an Error we can do that.




> 
>> +		<< "Unsupported input format " << inputFormat.toString();
>> +	return -EINVAL;
>> +}

<snip>

>> +	/* Measure before emitting signals */
>> +	if (measuredFrames_ < DebayerCpu::kLastFrameToMeasure &&
>> +	    ++measuredFrames_ > DebayerCpu::kFramesToSkip) {
>> +		timespec frameEndTime = {};
>> +		clock_gettime(CLOCK_MONOTONIC_RAW, &frameEndTime);
>> +		frameProcessTime_ += timeDiff(frameEndTime, frameStartTime);
>> +		if (measuredFrames_ == DebayerCpu::kLastFrameToMeasure) {
>> +			const unsigned int measuredFrames = DebayerCpu::kLastFrameToMeasure -
>> +							    DebayerCpu::kFramesToSkip;
>> +			LOG(Debayer, Info)
>> +				<< "Processed " << measuredFrames
>> +				<< " frames in " << frameProcessTime_ / 1000 << "us, "
>> +				<< frameProcessTime_ / (1000 * measuredFrames)
>> +				<< " us/frame";
>> +		}
>> +	}
> 
> Is this debugging leftovers, or should it be cleaned up ? It seems quite
> a bit of a hack. If we want to enable measurements, we shouldn't
> hardcode them to frames 30 to 60.

This is meant to be kept around, also see:

[PATCH v6 16/18] libcamera: Add "Software ISP benchmarking" documentation

the SoftwareISP is sensitive to performance regressions so this is
intended as a simple builtin benchmark to check for those.

So that means this falls under the "or should it be cleaned up" part
of your question.

You indicate you don't like the hardcoded frames 30 - 60. The idea is
to warm up the caches a bit and then measure a bunch of frames, which
is just what this does. What alternative approach are you suggesting?

>> +	/**
>> +	 * \brief Get the file descriptor for the statistics.
>> +	 *
>> +	 * \return the file descriptor pointing to the statistics.
>> +	 */
>> +	const SharedFD &getStatsFD() { return stats_->getStatsFD(); }
> 
> This,

Note the statistics pass-through stuff is sort of a necessary evil
since we want one main loop going over the data line by line and
doing both debayering as well as stats while the line is still
hot in the l2 cache. And things like the process2() and process4()
loops are highly CPU debayering specific so I don't think we should
move those out of the CpuDebayer code.

> plus the fact that this class handles colour gains and gamma,
> makes me thing we have either a naming issue, or an architecture issue.

I agree that this does a bit more then debayering, although
the debayering really is the main thing it does.

I guess the calculation of the rgb lookup tables which do the
color gains and gamma could be moved outside of this class,
that might even be beneficial for GPU based debayering assuming
that that is going to use rgb lookup tables too (it could
implement actual color gains + gamma correction in some different
way).

I think this falls under the lets wait until we have a GPU
based SoftISP MVP/POC and then do some refactoring to see which
bits should go where.

Regards,

Hans
Laurent Pinchart March 27, 2024, 7:37 p.m. UTC | #4
Hi Hans,

On Wed, Mar 27, 2024 at 07:08:02PM +0100, Hans de Goede wrote:
> On 3/27/24 3:12 PM, Laurent Pinchart wrote:
> > Hi Milan and Hans,
> > 
> > Thank you for the patch.
> 
> Same as with the previous review: Thank you for the review.
> 
> Milan is going to prepare a v7 of this series, so I'm going to <snip>
> all the trivial remarks and only comment on the remaining remarks.
> 
> > On Tue, Mar 19, 2024 at 01:35:55PM +0100, Milan Zamazal wrote:
> >> From: Hans de Goede <hdegoede@redhat.com>
> 
> <snip>
> 
> >> +{
> >> +#ifdef __x86_64__
> >> +	enableInputMemcpy_ = false;
> >> +#else
> >> +	enableInputMemcpy_ = true;
> >> +#endif
> > 
> > This is very much *not* nice :-( It needs to at least be explained
> > clearly somewhere.
> 
> Yeah this really needs to be a parameter to the SoftwareISP
> constructor because some non x86 designs may also have
> fully cached buffers, so they can also skip the extra
> memcpy step.
> 
> My suggestion would be to introduce a struct SoftwareIspParams
> with only a bool enableInputMemcpy_ in there for now and
> then in the simplepipeline handler instead of having a bool
> flag to indicate if the SoftwareISP should be list for a specific
> media-controller driver-name, have a pointer to this struct and
> have that pointer being non NULL indicate the swisp should
> be used.
> 
> For now we would then have 2 SoftwareIspParams structs
> this pointer can point to (uncached and cached) but in the future
> we may very well have 1 SoftwareIspParams struct per supported
> mediactl driver.
> 
> Actually since the IPU6 kernel side has not been merged yet,
> for now we can just replace this with a hardcoded:
> 
> 	enableInputMemcpy_ = true;
> 
> Milan, lets do that for the v7 submission.

Could you please also add a comment that explains why we want the memcpy
to be configurable ?

> Then I can add some mechanism to config this as a follow up
> patch when IPU6 has actually landed in the kernel and thus
> we can enable it in libcamera upstream.
> 
> <snip>
> 
> >> +void DebayerCpu::debayer10P_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
> >> +{
> >> +	const int width_in_bytes = window_.width * 5 / 4;
> > 
> > snakeCase here too, ane where applicable everywhere else.
> > 
> >> +	const uint8_t *prev = (const uint8_t *)src[0];
> > 
> > Is the cast needed ? If so it should be a C++ cast.
> 
> For the set of debayer10P_* functions introduced as the first
> supported bayer fmt these casts are not necessary and can
> be dropped.
> 
> In "[PATCH v6 14/18] libcamera: debayer_cpu: Add support for 8, 10
> and 12 bpp unpacked bayer input" there are more casts and these
> are actually necessary and will need to be replaced with
> static c++ style casts.

Right, I hadn't seen that yet when reviewing this patch.

> >> +	LOG(Debayer, Info)
> > 
> > Shouldn't this be an Error mesage ? Same below.
> 
> These functions are called when the simple pipeline handler
> is discovering supported formats so on CSI receivers or sensors
> supporting multiple formats this happen for one of the fmts
> is sorta expected behavior.
> 
> I don't feel strongly about this though, so if you want this
> to be upgraded to an Error we can do that.

If it's an expected behaviour, I would make this a debug message then.

> >> +		<< "Unsupported input format " << inputFormat.toString();
> >> +	return -EINVAL;
> >> +}
> 
> <snip>
> 
> >> +	/* Measure before emitting signals */
> >> +	if (measuredFrames_ < DebayerCpu::kLastFrameToMeasure &&
> >> +	    ++measuredFrames_ > DebayerCpu::kFramesToSkip) {
> >> +		timespec frameEndTime = {};
> >> +		clock_gettime(CLOCK_MONOTONIC_RAW, &frameEndTime);
> >> +		frameProcessTime_ += timeDiff(frameEndTime, frameStartTime);
> >> +		if (measuredFrames_ == DebayerCpu::kLastFrameToMeasure) {
> >> +			const unsigned int measuredFrames = DebayerCpu::kLastFrameToMeasure -
> >> +							    DebayerCpu::kFramesToSkip;
> >> +			LOG(Debayer, Info)
> >> +				<< "Processed " << measuredFrames
> >> +				<< " frames in " << frameProcessTime_ / 1000 << "us, "
> >> +				<< frameProcessTime_ / (1000 * measuredFrames)
> >> +				<< " us/frame";
> >> +		}
> >> +	}
> > 
> > Is this debugging leftovers, or should it be cleaned up ? It seems quite
> > a bit of a hack. If we want to enable measurements, we shouldn't
> > hardcode them to frames 30 to 60.
> 
> This is meant to be kept around, also see:
> 
> [PATCH v6 16/18] libcamera: Add "Software ISP benchmarking" documentation
> 
> the SoftwareISP is sensitive to performance regressions so this is
> intended as a simple builtin benchmark to check for those.
> 
> So that means this falls under the "or should it be cleaned up" part
> of your question.
> 
> You indicate you don't like the hardcoded frames 30 - 60. The idea is
> to warm up the caches a bit and then measure a bunch of frames, which
> is just what this does. What alternative approach are you suggesting?

I wonder if there would be a way to control at runtime when/how to
perform those measurements. Maybe that's a bit overkill. In any case, it
would be something to address later.

> >> +	/**
> >> +	 * \brief Get the file descriptor for the statistics.
> >> +	 *
> >> +	 * \return the file descriptor pointing to the statistics.
> >> +	 */
> >> +	const SharedFD &getStatsFD() { return stats_->getStatsFD(); }
> > 
> > This,
> 
> Note the statistics pass-through stuff is sort of a necessary evil
> since we want one main loop going over the data line by line and
> doing both debayering as well as stats while the line is still
> hot in the l2 cache. And things like the process2() and process4()
> loops are highly CPU debayering specific so I don't think we should
> move those out of the CpuDebayer code.

Yes, that I understood from the review. "necessary evil" is indeed the
right term :-) I expect it will take quite some design skills to balance
the need for performances and the need for a maintainable architecture.

> > plus the fact that this class handles colour gains and gamma,
> > makes me thing we have either a naming issue, or an architecture issue.
> 
> I agree that this does a bit more then debayering, although
> the debayering really is the main thing it does.
> 
> I guess the calculation of the rgb lookup tables which do the
> color gains and gamma could be moved outside of this class,
> that might even be beneficial for GPU based debayering assuming
> that that is going to use rgb lookup tables too (it could
> implement actual color gains + gamma correction in some different
> way).
> 
> I think this falls under the lets wait until we have a GPU
> based SoftISP MVP/POC and then do some refactoring to see which
> bits should go where.

As long as we record this in the todo file (with a short summary of the
above), that's fine with me.

Patch
diff mbox series

diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp
new file mode 100644
index 00000000..f932362c
--- /dev/null
+++ b/src/libcamera/software_isp/debayer_cpu.cpp
@@ -0,0 +1,626 @@ 
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+/*
+ * Copyright (C) 2023, Linaro Ltd
+ * Copyright (C) 2023, Red Hat Inc.
+ *
+ * Authors:
+ * Hans de Goede <hdegoede@redhat.com>
+ *
+ * debayer_cpu.cpp - CPU based debayering class
+ */
+
+#include "debayer_cpu.h"
+
+#include <math.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include <libcamera/formats.h>
+
+#include "libcamera/internal/bayer_format.h"
+#include "libcamera/internal/framebuffer.h"
+#include "libcamera/internal/mapped_framebuffer.h"
+
+namespace libcamera {
+
+/**
+ * \class DebayerCpu
+ * \brief Class for debayering on the CPU
+ *
+ * Implementation for CPU based debayering
+ */
+
+/**
+ * \brief Constructs a DebayerCpu object.
+ * \param[in] stats Pointer to the stats object to use.
+ */
+DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats)
+	: stats_(std::move(stats)), gamma_correction_(1.0)
+{
+#ifdef __x86_64__
+	enableInputMemcpy_ = false;
+#else
+	enableInputMemcpy_ = true;
+#endif
+	/* Initialize gamma to 1.0 curve */
+	for (unsigned int i = 0; i < kGammaLookupSize; i++)
+		gamma_[i] = i / (kGammaLookupSize / kRGBLookupSize);
+
+	for (unsigned int i = 0; i < kMaxLineBuffers; i++)
+		lineBuffers_[i] = nullptr;
+}
+
+DebayerCpu::~DebayerCpu()
+{
+	for (unsigned int i = 0; i < kMaxLineBuffers; i++)
+		free(lineBuffers_[i]);
+}
+
+// RGR
+// GBG
+// RGR
+#define BGGR_BGR888(p, n, div)                                                                \
+	*dst++ = blue_[curr[x] / (div)];                                                      \
+	*dst++ = green_[(prev[x] + curr[x - p] + curr[x + n] + next[x]) / (4 * (div))];       \
+	*dst++ = red_[(prev[x - p] + prev[x + n] + next[x - p] + next[x + n]) / (4 * (div))]; \
+	x++;
+
+// GBG
+// RGR
+// GBG
+#define GRBG_BGR888(p, n, div)                                    \
+	*dst++ = blue_[(prev[x] + next[x]) / (2 * (div))];        \
+	*dst++ = green_[curr[x] / (div)];                         \
+	*dst++ = red_[(curr[x - p] + curr[x + n]) / (2 * (div))]; \
+	x++;
+
+// GRG
+// BGB
+// GRG
+#define GBRG_BGR888(p, n, div)                                     \
+	*dst++ = blue_[(curr[x - p] + curr[x + n]) / (2 * (div))]; \
+	*dst++ = green_[curr[x] / (div)];                          \
+	*dst++ = red_[(prev[x] + next[x]) / (2 * (div))];          \
+	x++;
+
+// BGB
+// GRG
+// BGB
+#define RGGB_BGR888(p, n, div)                                                                 \
+	*dst++ = blue_[(prev[x - p] + prev[x + n] + next[x - p] + next[x + n]) / (4 * (div))]; \
+	*dst++ = green_[(prev[x] + curr[x - p] + curr[x + n] + next[x]) / (4 * (div))];        \
+	*dst++ = red_[curr[x] / (div)];                                                        \
+	x++;
+
+void DebayerCpu::debayer10P_BGBG_BGR888(uint8_t *dst, const uint8_t *src[])
+{
+	const int width_in_bytes = window_.width * 5 / 4;
+	const uint8_t *prev = (const uint8_t *)src[0];
+	const uint8_t *curr = (const uint8_t *)src[1];
+	const uint8_t *next = (const uint8_t *)src[2];
+
+	/*
+	 * For the first pixel getting a pixel from the previous column uses
+	 * x - 2 to skip the 5th byte with least-significant bits for 4 pixels.
+	 * Same for last pixel (uses x + 2) and looking at the next column.
+	 */
+	for (int x = 0; x < width_in_bytes;) {
+		/* First pixel */
+		BGGR_BGR888(2, 1, 1)
+		/* Second pixel BGGR -> GBRG */
+		GBRG_BGR888(1, 1, 1)
+		/* Same thing for third and fourth pixels */
+		BGGR_BGR888(1, 1, 1)
+		GBRG_BGR888(1, 2, 1)
+		/* Skip 5th src byte with 4 x 2 least-significant-bits */
+		x++;
+	}
+}
+
+void DebayerCpu::debayer10P_GRGR_BGR888(uint8_t *dst, const uint8_t *src[])
+{
+	const int width_in_bytes = window_.width * 5 / 4;
+	const uint8_t *prev = (const uint8_t *)src[0];
+	const uint8_t *curr = (const uint8_t *)src[1];
+	const uint8_t *next = (const uint8_t *)src[2];
+
+	for (int x = 0; x < width_in_bytes;) {
+		/* First pixel */
+		GRBG_BGR888(2, 1, 1)
+		/* Second pixel GRBG -> RGGB */
+		RGGB_BGR888(1, 1, 1)
+		/* Same thing for third and fourth pixels */
+		GRBG_BGR888(1, 1, 1)
+		RGGB_BGR888(1, 2, 1)
+		/* Skip 5th src byte with 4 x 2 least-significant-bits */
+		x++;
+	}
+}
+
+void DebayerCpu::debayer10P_GBGB_BGR888(uint8_t *dst, const uint8_t *src[])
+{
+	const int width_in_bytes = window_.width * 5 / 4;
+	const uint8_t *prev = (const uint8_t *)src[0];
+	const uint8_t *curr = (const uint8_t *)src[1];
+	const uint8_t *next = (const uint8_t *)src[2];
+
+	for (int x = 0; x < width_in_bytes;) {
+		/* Even pixel */
+		GBRG_BGR888(2, 1, 1)
+		/* Odd pixel GBGR -> BGGR */
+		BGGR_BGR888(1, 1, 1)
+		/* Same thing for next 2 pixels */
+		GBRG_BGR888(1, 1, 1)
+		BGGR_BGR888(1, 2, 1)
+		/* Skip 5th src byte with 4 x 2 least-significant-bits */
+		x++;
+	}
+}
+
+void DebayerCpu::debayer10P_RGRG_BGR888(uint8_t *dst, const uint8_t *src[])
+{
+	const int width_in_bytes = window_.width * 5 / 4;
+	const uint8_t *prev = (const uint8_t *)src[0];
+	const uint8_t *curr = (const uint8_t *)src[1];
+	const uint8_t *next = (const uint8_t *)src[2];
+
+	for (int x = 0; x < width_in_bytes;) {
+		/* Even pixel */
+		RGGB_BGR888(2, 1, 1)
+		/* Odd pixel RGGB -> GRBG */
+		GRBG_BGR888(1, 1, 1)
+		/* Same thing for next 2 pixels */
+		RGGB_BGR888(1, 1, 1)
+		GRBG_BGR888(1, 2, 1)
+		/* Skip 5th src byte with 4 x 2 least-significant-bits */
+		x++;
+	}
+}
+
+static bool isStandardBayerOrder(BayerFormat::Order order)
+{
+	return order == BayerFormat::BGGR || order == BayerFormat::GBRG ||
+	       order == BayerFormat::GRBG || order == BayerFormat::RGGB;
+}
+
+/*
+ * Setup the Debayer object according to the passed in parameters.
+ * Return 0 on success, a negative errno value on failure
+ * (unsupported parameters).
+ */
+int DebayerCpu::getInputConfig(PixelFormat inputFormat, DebayerInputConfig &config)
+{
+	BayerFormat bayerFormat =
+		BayerFormat::fromPixelFormat(inputFormat);
+
+	if (bayerFormat.bitDepth == 10 &&
+	    bayerFormat.packing == BayerFormat::Packing::CSI2 &&
+	    isStandardBayerOrder(bayerFormat.order)) {
+		config.bpp = 10;
+		config.patternSize.width = 4; /* 5 bytes per *4* pixels */
+		config.patternSize.height = 2;
+		config.outputFormats = std::vector<PixelFormat>({ formats::RGB888 });
+		return 0;
+	}
+
+	LOG(Debayer, Info)
+		<< "Unsupported input format " << inputFormat.toString();
+	return -EINVAL;
+}
+
+int DebayerCpu::getOutputConfig(PixelFormat outputFormat, DebayerOutputConfig &config)
+{
+	if (outputFormat == formats::RGB888) {
+		config.bpp = 24;
+		return 0;
+	}
+
+	LOG(Debayer, Info)
+		<< "Unsupported output format " << outputFormat.toString();
+	return -EINVAL;
+}
+
+/* TODO: this ignores outputFormat since there is only 1 supported outputFormat for now */
+int DebayerCpu::setDebayerFunctions(PixelFormat inputFormat, [[maybe_unused]] PixelFormat outputFormat)
+{
+	BayerFormat bayerFormat =
+		BayerFormat::fromPixelFormat(inputFormat);
+
+	if (bayerFormat.bitDepth == 10 &&
+	    bayerFormat.packing == BayerFormat::Packing::CSI2) {
+		switch (bayerFormat.order) {
+		case BayerFormat::BGGR:
+			debayer0_ = &DebayerCpu::debayer10P_BGBG_BGR888;
+			debayer1_ = &DebayerCpu::debayer10P_GRGR_BGR888;
+			return 0;
+		case BayerFormat::GBRG:
+			debayer0_ = &DebayerCpu::debayer10P_GBGB_BGR888;
+			debayer1_ = &DebayerCpu::debayer10P_RGRG_BGR888;
+			return 0;
+		case BayerFormat::GRBG:
+			debayer0_ = &DebayerCpu::debayer10P_GRGR_BGR888;
+			debayer1_ = &DebayerCpu::debayer10P_BGBG_BGR888;
+			return 0;
+		case BayerFormat::RGGB:
+			debayer0_ = &DebayerCpu::debayer10P_RGRG_BGR888;
+			debayer1_ = &DebayerCpu::debayer10P_GBGB_BGR888;
+			return 0;
+		default:
+			break;
+		}
+	}
+
+	LOG(Debayer, Error) << "Unsupported input output format combination";
+	return -EINVAL;
+}
+
+int DebayerCpu::configure(const StreamConfiguration &inputCfg,
+			  const std::vector<std::reference_wrapper<StreamConfiguration>> &outputCfgs)
+{
+	if (getInputConfig(inputCfg.pixelFormat, inputConfig_) != 0)
+		return -EINVAL;
+
+	if (stats_->configure(inputCfg) != 0)
+		return -EINVAL;
+
+	const Size &stats_pattern_size = stats_->patternSize();
+	if (inputConfig_.patternSize.width != stats_pattern_size.width ||
+	    inputConfig_.patternSize.height != stats_pattern_size.height) {
+		LOG(Debayer, Error)
+			<< "mismatching stats and debayer pattern sizes for "
+			<< inputCfg.pixelFormat.toString();
+		return -EINVAL;
+	}
+
+	inputConfig_.stride = inputCfg.stride;
+
+	if (outputCfgs.size() != 1) {
+		LOG(Debayer, Error)
+			<< "Unsupported number of output streams: "
+			<< outputCfgs.size();
+		return -EINVAL;
+	}
+
+	const StreamConfiguration &outputCfg = outputCfgs[0];
+	SizeRange outSizeRange = sizes(inputCfg.pixelFormat, inputCfg.size);
+	std::tie(outputConfig_.stride, outputConfig_.frameSize) =
+		strideAndFrameSize(outputCfg.pixelFormat, outputCfg.size);
+
+	if (!outSizeRange.contains(outputCfg.size) || outputConfig_.stride != outputCfg.stride) {
+		LOG(Debayer, Error)
+			<< "Invalid output size/stride: "
+			<< "\n  " << outputCfg.size << " (" << outSizeRange << ")"
+			<< "\n  " << outputCfg.stride << " (" << outputConfig_.stride << ")";
+		return -EINVAL;
+	}
+
+	if (setDebayerFunctions(inputCfg.pixelFormat, outputCfg.pixelFormat) != 0)
+		return -EINVAL;
+
+	window_.x = ((inputCfg.size.width - outputCfg.size.width) / 2) &
+		    ~(inputConfig_.patternSize.width - 1);
+	window_.y = ((inputCfg.size.height - outputCfg.size.height) / 2) &
+		    ~(inputConfig_.patternSize.height - 1);
+	window_.width = outputCfg.size.width;
+	window_.height = outputCfg.size.height;
+
+	/* Don't pass x,y since process() already adjusts src before passing it */
+	stats_->setWindow(Rectangle(window_.size()));
+
+	/* pad with patternSize.Width on both left and right side */
+	lineBufferPadding_ = inputConfig_.patternSize.width * inputConfig_.bpp / 8;
+	lineBufferLength_ = window_.width * inputConfig_.bpp / 8 +
+			    2 * lineBufferPadding_;
+	for (unsigned int i = 0;
+	     i < (inputConfig_.patternSize.height + 1) && enableInputMemcpy_;
+	     i++) {
+		free(lineBuffers_[i]);
+		lineBuffers_[i] = (uint8_t *)malloc(lineBufferLength_);
+		if (!lineBuffers_[i])
+			return -ENOMEM;
+	}
+
+	measuredFrames_ = 0;
+	frameProcessTime_ = 0;
+
+	return 0;
+}
+
+/*
+ * Get width and height at which the bayer-pattern repeats.
+ * Return pattern-size or an empty Size for an unsupported inputFormat.
+ */
+Size DebayerCpu::patternSize(PixelFormat inputFormat)
+{
+	DebayerCpu::DebayerInputConfig config;
+
+	if (getInputConfig(inputFormat, config) != 0)
+		return {};
+
+	return config.patternSize;
+}
+
+std::vector<PixelFormat> DebayerCpu::formats(PixelFormat inputFormat)
+{
+	DebayerCpu::DebayerInputConfig config;
+
+	if (getInputConfig(inputFormat, config) != 0)
+		return std::vector<PixelFormat>();
+
+	return config.outputFormats;
+}
+
+std::tuple<unsigned int, unsigned int>
+DebayerCpu::strideAndFrameSize(const PixelFormat &outputFormat, const Size &size)
+{
+	DebayerCpu::DebayerOutputConfig config;
+
+	if (getOutputConfig(outputFormat, config) != 0)
+		return std::make_tuple(0, 0);
+
+	/* round up to multiple of 8 for 64 bits alignment */
+	unsigned int stride = (size.width * config.bpp / 8 + 7) & ~7;
+
+	return std::make_tuple(stride, stride * size.height);
+}
+
+void DebayerCpu::setupInputMemcpy(const uint8_t *linePointers[])
+{
+	const unsigned int patternHeight = inputConfig_.patternSize.height;
+
+	if (!enableInputMemcpy_)
+		return;
+
+	for (unsigned int i = 0; i < patternHeight; i++) {
+		memcpy(lineBuffers_[i], linePointers[i + 1] - lineBufferPadding_,
+		       lineBufferLength_);
+		linePointers[i + 1] = lineBuffers_[i] + lineBufferPadding_;
+	}
+
+	/* Point lineBufferIndex_ to first unused lineBuffer */
+	lineBufferIndex_ = patternHeight;
+}
+
+void DebayerCpu::shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src)
+{
+	const unsigned int patternHeight = inputConfig_.patternSize.height;
+
+	for (unsigned int i = 0; i < patternHeight; i++)
+		linePointers[i] = linePointers[i + 1];
+
+	linePointers[patternHeight] = src +
+				      (patternHeight / 2) * (int)inputConfig_.stride;
+}
+
+void DebayerCpu::memcpyNextLine(const uint8_t *linePointers[])
+{
+	const unsigned int patternHeight = inputConfig_.patternSize.height;
+
+	if (!enableInputMemcpy_)
+		return;
+
+	memcpy(lineBuffers_[lineBufferIndex_], linePointers[patternHeight] - lineBufferPadding_,
+	       lineBufferLength_);
+	linePointers[patternHeight] = lineBuffers_[lineBufferIndex_] + lineBufferPadding_;
+
+	lineBufferIndex_ = (lineBufferIndex_ + 1) % (patternHeight + 1);
+}
+
+void DebayerCpu::process2(const uint8_t *src, uint8_t *dst)
+{
+	unsigned int y_end = window_.y + window_.height;
+	/* Holds [0] previous- [1] current- [2] next-line */
+	const uint8_t *linePointers[3];
+
+	/* Adjust src to top left corner of the window */
+	src += window_.y * inputConfig_.stride + window_.x * inputConfig_.bpp / 8;
+
+	/* [x] becomes [x - 1] after initial shiftLinePointers() call */
+	if (window_.y) {
+		linePointers[1] = src - inputConfig_.stride; /* previous-line */
+		linePointers[2] = src;
+	} else {
+		/* window_.y == 0, use the next line as prev line */
+		linePointers[1] = src + inputConfig_.stride;
+		linePointers[2] = src;
+		/* Last 2 lines also need special handling */
+		y_end -= 2;
+	}
+
+	setupInputMemcpy(linePointers);
+
+	for (unsigned int y = window_.y; y < y_end; y += 2) {
+		shiftLinePointers(linePointers, src);
+		memcpyNextLine(linePointers);
+		stats_->processLine0(y, linePointers);
+		(this->*debayer0_)(dst, linePointers);
+		src += inputConfig_.stride;
+		dst += outputConfig_.stride;
+
+		shiftLinePointers(linePointers, src);
+		memcpyNextLine(linePointers);
+		(this->*debayer1_)(dst, linePointers);
+		src += inputConfig_.stride;
+		dst += outputConfig_.stride;
+	}
+
+	if (window_.y == 0) {
+		shiftLinePointers(linePointers, src);
+		memcpyNextLine(linePointers);
+		stats_->processLine0(y_end, linePointers);
+		(this->*debayer0_)(dst, linePointers);
+		src += inputConfig_.stride;
+		dst += outputConfig_.stride;
+
+		shiftLinePointers(linePointers, src);
+		/* next line may point outside of src, use prev. */
+		linePointers[2] = linePointers[0];
+		(this->*debayer1_)(dst, linePointers);
+		src += inputConfig_.stride;
+		dst += outputConfig_.stride;
+	}
+}
+
+void DebayerCpu::process4(const uint8_t *src, uint8_t *dst)
+{
+	const unsigned int y_end = window_.y + window_.height;
+	/*
+	 * This holds pointers to [0] 2-lines-up [1] 1-line-up [2] current-line
+	 * [3] 1-line-down [4] 2-lines-down.
+	 */
+	const uint8_t *linePointers[5];
+
+	/* Adjust src to top left corner of the window */
+	src += window_.y * inputConfig_.stride + window_.x * inputConfig_.bpp / 8;
+
+	/* [x] becomes [x - 1] after initial shiftLinePointers() call */
+	linePointers[1] = src - 2 * inputConfig_.stride;
+	linePointers[2] = src - inputConfig_.stride;
+	linePointers[3] = src;
+	linePointers[4] = src + inputConfig_.stride;
+
+	setupInputMemcpy(linePointers);
+
+	for (unsigned int y = window_.y; y < y_end; y += 4) {
+		shiftLinePointers(linePointers, src);
+		memcpyNextLine(linePointers);
+		stats_->processLine0(y, linePointers);
+		(this->*debayer0_)(dst, linePointers);
+		src += inputConfig_.stride;
+		dst += outputConfig_.stride;
+
+		shiftLinePointers(linePointers, src);
+		memcpyNextLine(linePointers);
+		(this->*debayer1_)(dst, linePointers);
+		src += inputConfig_.stride;
+		dst += outputConfig_.stride;
+
+		shiftLinePointers(linePointers, src);
+		memcpyNextLine(linePointers);
+		stats_->processLine2(y, linePointers);
+		(this->*debayer2_)(dst, linePointers);
+		src += inputConfig_.stride;
+		dst += outputConfig_.stride;
+
+		shiftLinePointers(linePointers, src);
+		memcpyNextLine(linePointers);
+		(this->*debayer3_)(dst, linePointers);
+		src += inputConfig_.stride;
+		dst += outputConfig_.stride;
+	}
+}
+
+static inline int64_t timeDiff(timespec &after, timespec &before)
+{
+	return (after.tv_sec - before.tv_sec) * 1000000000LL +
+	       (int64_t)after.tv_nsec - (int64_t)before.tv_nsec;
+}
+
+void DebayerCpu::process(FrameBuffer *input, FrameBuffer *output, DebayerParams params)
+{
+	timespec frameStartTime;
+
+	if (measuredFrames_ < DebayerCpu::kLastFrameToMeasure) {
+		frameStartTime = {};
+		clock_gettime(CLOCK_MONOTONIC_RAW, &frameStartTime);
+	}
+
+	/* Apply DebayerParams */
+	if (params.gamma != gamma_correction_) {
+		for (unsigned int i = 0; i < kGammaLookupSize; i++)
+			gamma_[i] = UINT8_MAX * powf(i / (kGammaLookupSize - 1.0), params.gamma);
+
+		gamma_correction_ = params.gamma;
+	}
+
+	for (unsigned int i = 0; i < kRGBLookupSize; i++) {
+		constexpr unsigned int div =
+			kRGBLookupSize * DebayerParams::kGain10 / kGammaLookupSize;
+		unsigned int idx;
+
+		/* Apply gamma after gain! */
+		idx = std::min({ i * params.gainR / div, (kGammaLookupSize - 1) });
+		red_[i] = gamma_[idx];
+
+		idx = std::min({ i * params.gainG / div, (kGammaLookupSize - 1) });
+		green_[i] = gamma_[idx];
+
+		idx = std::min({ i * params.gainB / div, (kGammaLookupSize - 1) });
+		blue_[i] = gamma_[idx];
+	}
+
+	/* Copy metadata from the input buffer */
+	FrameMetadata &metadata = output->_d()->metadata();
+	metadata.status = input->metadata().status;
+	metadata.sequence = input->metadata().sequence;
+	metadata.timestamp = input->metadata().timestamp;
+
+	MappedFrameBuffer in(input, MappedFrameBuffer::MapFlag::Read);
+	MappedFrameBuffer out(output, MappedFrameBuffer::MapFlag::Write);
+	if (!in.isValid() || !out.isValid()) {
+		LOG(Debayer, Error) << "mmap-ing buffer(s) failed";
+		metadata.status = FrameMetadata::FrameError;
+		return;
+	}
+
+	stats_->startFrame();
+
+	if (inputConfig_.patternSize.height == 2)
+		process2(in.planes()[0].data(), out.planes()[0].data());
+	else
+		process4(in.planes()[0].data(), out.planes()[0].data());
+
+	metadata.planes()[0].bytesused = out.planes()[0].size();
+
+	/* Measure before emitting signals */
+	if (measuredFrames_ < DebayerCpu::kLastFrameToMeasure &&
+	    ++measuredFrames_ > DebayerCpu::kFramesToSkip) {
+		timespec frameEndTime = {};
+		clock_gettime(CLOCK_MONOTONIC_RAW, &frameEndTime);
+		frameProcessTime_ += timeDiff(frameEndTime, frameStartTime);
+		if (measuredFrames_ == DebayerCpu::kLastFrameToMeasure) {
+			const unsigned int measuredFrames = DebayerCpu::kLastFrameToMeasure -
+							    DebayerCpu::kFramesToSkip;
+			LOG(Debayer, Info)
+				<< "Processed " << measuredFrames
+				<< " frames in " << frameProcessTime_ / 1000 << "us, "
+				<< frameProcessTime_ / (1000 * measuredFrames)
+				<< " us/frame";
+		}
+	}
+
+	stats_->finishFrame();
+	outputBufferReady.emit(output);
+	inputBufferReady.emit(input);
+}
+
+SizeRange DebayerCpu::sizes(PixelFormat inputFormat, const Size &inputSize)
+{
+	Size pattern_size = patternSize(inputFormat);
+	unsigned int border_height = pattern_size.height;
+
+	if (pattern_size.isNull())
+		return {};
+
+	/* No need for top/bottom border with a pattern height of 2 */
+	if (pattern_size.height == 2)
+		border_height = 0;
+
+	/*
+	 * For debayer interpolation a border is kept around the entire image
+	 * and the minimum output size is pattern-height x pattern-width.
+	 */
+	if (inputSize.width < (3 * pattern_size.width) ||
+	    inputSize.height < (2 * border_height + pattern_size.height)) {
+		LOG(Debayer, Warning)
+			<< "Input format size too small: " << inputSize.toString();
+		return {};
+	}
+
+	return SizeRange(Size(pattern_size.width, pattern_size.height),
+			 Size((inputSize.width - 2 * pattern_size.width) & ~(pattern_size.width - 1),
+			      (inputSize.height - 2 * border_height) & ~(pattern_size.height - 1)),
+			 pattern_size.width, pattern_size.height);
+}
+
+} /* namespace libcamera */
diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h
new file mode 100644
index 00000000..8a51ed85
--- /dev/null
+++ b/src/libcamera/software_isp/debayer_cpu.h
@@ -0,0 +1,143 @@ 
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+/*
+ * Copyright (C) 2023, Linaro Ltd
+ * Copyright (C) 2023, Red Hat Inc.
+ *
+ * Authors:
+ * Hans de Goede <hdegoede@redhat.com>
+ *
+ * debayer_cpu.h - CPU based debayering header
+ */
+
+#pragma once
+
+#include <memory>
+#include <stdint.h>
+#include <vector>
+
+#include <libcamera/base/object.h>
+
+#include "debayer.h"
+#include "swstats_cpu.h"
+
+namespace libcamera {
+
+class DebayerCpu : public Debayer, public Object
+{
+public:
+	DebayerCpu(std::unique_ptr<SwStatsCpu> stats);
+	~DebayerCpu();
+
+	int configure(const StreamConfiguration &inputCfg,
+		      const std::vector<std::reference_wrapper<StreamConfiguration>> &outputCfgs);
+	Size patternSize(PixelFormat inputFormat);
+	std::vector<PixelFormat> formats(PixelFormat input);
+	std::tuple<unsigned int, unsigned int>
+	strideAndFrameSize(const PixelFormat &outputFormat, const Size &size);
+	void process(FrameBuffer *input, FrameBuffer *output, DebayerParams params);
+	SizeRange sizes(PixelFormat inputFormat, const Size &inputSize);
+
+	/**
+	 * \brief Get the file descriptor for the statistics.
+	 *
+	 * \return the file descriptor pointing to the statistics.
+	 */
+	const SharedFD &getStatsFD() { return stats_->getStatsFD(); }
+
+	/**
+	 * \brief Get the output frame size.
+	 *
+	 * \return The output frame size.
+	 */
+	unsigned int frameSize() { return outputConfig_.frameSize; }
+
+private:
+	/**
+	 * \brief Called to debayer 1 line of Bayer input data to output format
+	 * \param[out] dst Pointer to the start of the output line to write
+	 * \param[in] src The input data
+	 *
+	 * Input data is an array of (patternSize_.height + 1) src
+	 * pointers each pointing to a line in the Bayer source. The middle
+	 * element of the array will point to the actual line being processed.
+	 * Earlier element(s) will point to the previous line(s) and later
+	 * element(s) to the next line(s).
+	 *
+	 * These functions take an array of src pointers, rather than
+	 * a single src pointer + a stride for the source, so that when the src
+	 * is slow uncached memory it can be copied to faster memory before
+	 * debayering. Debayering a standard 2x2 Bayer pattern requires access
+	 * to the previous and next src lines for interpolating the missing
+	 * colors. To allow copying the src lines only once 3 temporary buffers
+	 * each holding a single line are used, re-using the oldest buffer for
+	 * the next line and the pointers are swizzled so that:
+	 * src[0] = previous-line, src[1] = currrent-line, src[2] = next-line.
+	 * This way the 3 pointers passed to the debayer functions form
+	 * a sliding window over the src avoiding the need to copy each
+	 * line more than once.
+	 *
+	 * Similarly for bayer patterns which repeat every 4 lines, 5 src
+	 * pointers are passed holding: src[0] = 2-lines-up, src[1] = 1-line-up
+	 * src[2] = current-line, src[3] = 1-line-down, src[4] = 2-lines-down.
+	 */
+	using debayerFn = void (DebayerCpu::*)(uint8_t *dst, const uint8_t *src[]);
+
+	/* CSI-2 packed 10-bit raw bayer format (all the 4 orders) */
+	void debayer10P_BGBG_BGR888(uint8_t *dst, const uint8_t *src[]);
+	void debayer10P_GRGR_BGR888(uint8_t *dst, const uint8_t *src[]);
+	void debayer10P_GBGB_BGR888(uint8_t *dst, const uint8_t *src[]);
+	void debayer10P_RGRG_BGR888(uint8_t *dst, const uint8_t *src[]);
+
+	struct DebayerInputConfig {
+		Size patternSize;
+		unsigned int bpp; /* Memory used per pixel, not precision */
+		unsigned int stride;
+		std::vector<PixelFormat> outputFormats;
+	};
+
+	struct DebayerOutputConfig {
+		unsigned int bpp; /* Memory used per pixel, not precision */
+		unsigned int stride;
+		unsigned int frameSize;
+	};
+
+	int getInputConfig(PixelFormat inputFormat, DebayerInputConfig &config);
+	int getOutputConfig(PixelFormat outputFormat, DebayerOutputConfig &config);
+	int setDebayerFunctions(PixelFormat inputFormat, PixelFormat outputFormat);
+	void setupInputMemcpy(const uint8_t *linePointers[]);
+	void shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src);
+	void memcpyNextLine(const uint8_t *linePointers[]);
+	void process2(const uint8_t *src, uint8_t *dst);
+	void process4(const uint8_t *src, uint8_t *dst);
+
+	static constexpr unsigned int kGammaLookupSize = 1024;
+	static constexpr unsigned int kRGBLookupSize = 256;
+	/* Max. supported Bayer pattern height is 4, debayering this requires 5 lines */
+	static constexpr unsigned int kMaxLineBuffers = 5;
+
+	std::array<uint8_t, kGammaLookupSize> gamma_;
+	std::array<uint8_t, kRGBLookupSize> red_;
+	std::array<uint8_t, kRGBLookupSize> green_;
+	std::array<uint8_t, kRGBLookupSize> blue_;
+	debayerFn debayer0_;
+	debayerFn debayer1_;
+	debayerFn debayer2_;
+	debayerFn debayer3_;
+	Rectangle window_;
+	DebayerInputConfig inputConfig_;
+	DebayerOutputConfig outputConfig_;
+	std::unique_ptr<SwStatsCpu> stats_;
+	uint8_t *lineBuffers_[kMaxLineBuffers];
+	unsigned int lineBufferLength_;
+	unsigned int lineBufferPadding_;
+	unsigned int lineBufferIndex_;
+	bool enableInputMemcpy_;
+	float gamma_correction_;
+	unsigned int measuredFrames_;
+	int64_t frameProcessTime_;
+	/* Skip 30 frames for things to stabilize then measure 30 frames */
+	static constexpr unsigned int kFramesToSkip = 30;
+	static constexpr unsigned int kLastFrameToMeasure = 60;
+};
+
+} /* namespace libcamera */
diff --git a/src/libcamera/software_isp/meson.build b/src/libcamera/software_isp/meson.build
index 62095f61..71b46539 100644
--- a/src/libcamera/software_isp/meson.build
+++ b/src/libcamera/software_isp/meson.build
@@ -9,5 +9,6 @@  endif
 
 libcamera_sources += files([
     'debayer.cpp',
+    'debayer_cpu.cpp',
     'swstats_cpu.cpp',
 ])