diff --git a/src/libcamera/software_isp/debayer_cpu.cpp b/src/libcamera/software_isp/debayer_cpu.cpp
index 122bfbb05..ea1b17c1f 100644
--- a/src/libcamera/software_isp/debayer_cpu.cpp
+++ b/src/libcamera/software_isp/debayer_cpu.cpp
@@ -18,6 +18,8 @@
 
 #include <linux/dma-buf.h>
 
+#include <libcamera/base/thread.h>
+
 #include <libcamera/formats.h>
 
 #include "libcamera/internal/bayer_format.h"
@@ -50,13 +52,15 @@ public:
 	unsigned int lineBufferIndex_;
 	std::vector<uint8_t> lineBuffers_[DebayerCpu::kMaxLineBuffers];
 	bool enableInputMemcpy_;
+	Thread worker_;
 };
 
 DebayerCpuThread::DebayerCpuThread(DebayerCpu *debayer, unsigned int threadIndex,
 				   bool enableInputMemcpy)
 	: debayer_(debayer), threadIndex_(threadIndex),
-	  enableInputMemcpy_(enableInputMemcpy)
+	  enableInputMemcpy_(enableInputMemcpy), worker_("DebayerWorker")
 {
+	this->moveToThread(&worker_);
 }
 
 /**
@@ -88,8 +92,10 @@ DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfigurat
 	bool enableInputMemcpy =
 		configuration.option<bool>({ "software_isp", "copy_input_buffer" }).value_or(true);
 
-	/* Just one thread object for now, which will be called inline rather than async */
-	threads_.resize(1);
+	unsigned int threadCount =
+		configuration.option<unsigned int>({ "software_isp", "threads" }).value_or(2);
+	threadCount = std::clamp(threadCount, 1u, 8u);
+	threads_.resize(threadCount);
 
 	for (unsigned int i = 0; i < threads_.size(); i++)
 		threads_[i] = new DebayerCpuThread(this, i, enableInputMemcpy);
@@ -714,6 +720,11 @@ void DebayerCpuThread::process(uint32_t frame, const uint8_t *src, uint8_t *dst)
 		process2(frame, src, dst);
 	else
 		process4(frame, src, dst);
+
+	debayer_->workPendingMutex_.lock();
+	debayer_->workPending_ &= ~(1 << threadIndex_);
+	debayer_->workPendingMutex_.unlock();
+	debayer_->workPendingCv_.notify_one();
 }
 
 void DebayerCpuThread::process2(uint32_t frame, const uint8_t *src, uint8_t *dst)
@@ -953,7 +964,24 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output
 
 	stats_->startFrame(frame);
 
-	threads_[0]->process(frame, in.planes()[0].data(), out.planes()[0].data());
+	workPendingMutex_.lock();
+	workPending_ = (1 << threads_.size()) - 1;
+	workPendingMutex_.unlock();
+
+	for (unsigned int i = 0; i < threads_.size(); i++)
+		threads_[i]->invokeMethod(&DebayerCpuThread::process,
+					  ConnectionTypeQueued, frame,
+					  in.planes()[0].data(), out.planes()[0].data());
+
+	{
+		MutexLocker locker(workPendingMutex_);
+
+		auto workPending = ([&]() LIBCAMERA_TSA_REQUIRES(workPendingMutex_) {
+			return workPending_ == 0;
+		});
+
+		workPendingCv_.wait(locker, workPending);
+	}
 
 	metadata.planes()[0].bytesused = out.planes()[0].size();
 
@@ -972,6 +1000,23 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output
 	inputBufferReady.emit(input);
 }
 
+int DebayerCpu::start()
+{
+	for (unsigned int i = 0; i < threads_.size(); i++)
+		threads_[i]->worker_.start();
+
+	return 0;
+}
+
+void DebayerCpu::stop()
+{
+	for (unsigned int i = 0; i < threads_.size(); i++)
+		threads_[i]->worker_.exit();
+
+	for (unsigned int i = 0; i < threads_.size(); i++)
+		threads_[i]->worker_.wait();
+}
+
 SizeRange DebayerCpu::sizes(PixelFormat inputFormat, const Size &inputSize)
 {
 	Size patternSize = this->patternSize(inputFormat);
diff --git a/src/libcamera/software_isp/debayer_cpu.h b/src/libcamera/software_isp/debayer_cpu.h
index 7196dcdd0..2c84f8e40 100644
--- a/src/libcamera/software_isp/debayer_cpu.h
+++ b/src/libcamera/software_isp/debayer_cpu.h
@@ -16,6 +16,7 @@
 #include <vector>
 
 #include <libcamera/base/object.h>
+#include <libcamera/base/mutex.h>
 
 #include "libcamera/internal/bayer_format.h"
 #include "libcamera/internal/global_configuration.h"
@@ -41,6 +42,8 @@ public:
 	std::tuple<unsigned int, unsigned int>
 	strideAndFrameSize(const PixelFormat &outputFormat, const Size &size);
 	void process(uint32_t frame, FrameBuffer *input, FrameBuffer *output, const DebayerParams &params);
+	int start();
+	void stop();
 	SizeRange sizes(PixelFormat inputFormat, const Size &inputSize);
 	const SharedFD &getStatsFD() { return stats_->getStatsFD(); }
 
@@ -147,6 +150,9 @@ private:
 	std::unique_ptr<SwStatsCpu> stats_;
 	unsigned int xShift_; /* Offset of 0/1 applied to window_.x */
 
+	unsigned int workPending_ LIBCAMERA_TSA_GUARDED_BY(workPendingMutex_);
+	Mutex workPendingMutex_;
+	ConditionVariable workPendingCv_;
 	std::vector<DebayerCpuThread *>threads_;
 };
 
