Unleashing GPU Power: Metal Performance Shaders for Real-Time Image Processing
Published:
Metal Performance Shaders (MPS) transforms iOS apps by offloading computation to the GPU. This guide shows production patterns for real-time image processing, ML inference, and compute shaders—delivering 10x+ performance gains over CPU-only approaches. 🚀
Why MPS Matters in Production Apps 📱
Modern iOS devices pack incredible GPU power. The A17 Pro, for example, pairs a 6-core GPU with a Neural Engine rated at up to 35 trillion operations per second (TOPS). MPS lets you tap the GPU for:
- Real-time filters (Instagram, TikTok-style effects)
- ML model inference (Core ML acceleration)
- Computer vision (object detection, segmentation)
- Scientific computing (signal processing, simulations)
1) Foundation: MTLDevice and Command Buffers
import Metal
import MetalPerformanceShaders
/// Base class that owns the Metal device and command queue used by the
/// processors in this guide.
///
/// The class is intentionally not `final`, and `device` / `commandQueue` are
/// not `private`: the specialised processors later in this guide subclass it
/// and read both properties directly.
class GPUProcessor {
    /// GPU that textures, kernels and pipelines are created on.
    let device: MTLDevice
    /// Queue that command buffers are created from and committed to.
    let commandQueue: MTLCommandQueue

    /// Creates a processor for the system default GPU.
    ///
    /// Returns `nil` when no Metal device exists or a command queue cannot be
    /// created, so callers can fall back to a CPU path.
    init?() {
        guard let device = MTLCreateSystemDefaultDevice(),
              let commandQueue = device.makeCommandQueue() else {
            return nil
        }
        self.device = device
        self.commandQueue = commandQueue
    }

    /// Runs a single-input image kernel from `input` into `output`.
    ///
    /// The generic bound is `MPSUnaryImageKernel` because that is the class
    /// that declares `encode(commandBuffer:sourceTexture:destinationTexture:)`;
    /// the plain `MPSKernel` base class does not.
    ///
    /// - Parameters:
    ///   - kernel: The MPS image kernel to encode.
    ///   - input: Texture the kernel reads from.
    ///   - output: Texture the kernel writes to.
    ///
    /// The call blocks the calling thread until the GPU has finished. If no
    /// command buffer can be created the method returns without doing any work.
    func process<T: MPSUnaryImageKernel>(_ kernel: T,
                                         input: MTLTexture,
                                         output: MTLTexture) {
        guard let commandBuffer = commandQueue.makeCommandBuffer() else { return }
        kernel.encode(commandBuffer: commandBuffer,
                      sourceTexture: input,
                      destinationTexture: output)
        commandBuffer.commit()
        commandBuffer.waitUntilCompleted()
    }
}
2) Real-Time Image Filters with MPS 🎨
Gaussian Blur Pipeline
/// Applies a fixed-radius Gaussian blur (sigma 2.0) to textures.
final class BlurProcessor: GPUProcessor {
    // Optional because the kernel needs `device`, which only becomes available
    // after `super.init()` has run.
    private var blurKernel: MPSImageGaussianBlur?

    /// Returns `nil` when the underlying Metal setup in `GPUProcessor` fails.
    override init?() {
        super.init()
        // `device` is a non-optional property, so it is used directly; binding
        // it with `guard let` does not compile.
        blurKernel = MPSImageGaussianBlur(device: device, sigma: 2.0)
    }

    /// Blurs `texture` into a newly allocated texture of the same size and format.
    ///
    /// - Parameter texture: Source texture; it is only read.
    /// - Returns: The blurred texture, or `nil` if the kernel is missing or the
    ///   output texture could not be allocated.
    func applyBlur(to texture: MTLTexture) -> MTLTexture? {
        guard let blur = blurKernel,
              let outputTexture = makeTexture(like: texture) else { return nil }
        process(blur, input: texture, output: outputTexture)
        return outputTexture
    }

    /// Allocates a non-mipmapped 2D texture with the size and pixel format of
    /// `source`, usable for both shader reads and writes.
    private func makeTexture(like source: MTLTexture) -> MTLTexture? {
        let descriptor = MTLTextureDescriptor.texture2DDescriptor(
            pixelFormat: source.pixelFormat,
            width: source.width,
            height: source.height,
            mipmapped: false
        )
        descriptor.usage = [.shaderRead, .shaderWrite]
        return device.makeTexture(descriptor: descriptor)
    }
}
Chain Multiple Effects
/// Runs blur → sharpen → contrast on a texture inside one command buffer.
///
/// MPS ships no dedicated unsharp-mask or contrast kernels, so both stages are
/// expressed with `MPSImageConvolution`.
final class FilterChain: GPUProcessor {
    // Optional `var`s: the kernels need `device`, which is only available after
    // `super.init()`, and non-optional `let`s would have to be set before it.
    private var blur: MPSImageGaussianBlur?
    private var sharpen: MPSImageConvolution?
    private var contrast: MPSImageConvolution?

    /// Returns `nil` when the underlying Metal setup in `GPUProcessor` fails.
    override init?() {
        super.init()

        blur = MPSImageGaussianBlur(device: device, sigma: 1.0)

        // 3x3 sharpening kernel. The weights sum to 1, so flat areas keep
        // their value while local differences are amplified.
        let sharpenWeights: [Float] = [
             0, -1,  0,
            -1,  5, -1,
             0, -1,  0
        ]
        let sharpenKernel = MPSImageConvolution(device: device,
                                                kernelWidth: 3,
                                                kernelHeight: 3,
                                                weights: sharpenWeights)
        // Clamp at the borders so edge pixels are not sharpened against zeros.
        sharpenKernel.edgeMode = .clamp
        sharpen = sharpenKernel

        // Contrast about mid-grey: out = amount * in + 0.5 * (1 - amount),
        // written as a 1x1 convolution (the scale) plus a bias (the offset).
        // NOTE(review): the convolution is applied per channel, alpha included;
        // opaque input should stay opaque on normalized formats — confirm for
        // translucent content.
        let contrastAmount: Float = 1.2
        let contrastWeights: [Float] = [contrastAmount]
        let contrastKernel = MPSImageConvolution(device: device,
                                                 kernelWidth: 1,
                                                 kernelHeight: 1,
                                                 weights: contrastWeights)
        contrastKernel.bias = 0.5 * (1 - contrastAmount)
        contrast = contrastKernel
    }

    /// Applies the whole chain to `input`.
    ///
    /// - Parameter input: Source texture; it is only read.
    /// - Returns: A new texture holding the result, or `nil` if a kernel is
    ///   missing, a texture or command buffer could not be created, or the
    ///   command buffer did not complete successfully.
    ///
    /// Blocks the calling thread until the GPU has finished.
    func processImage(_ input: MTLTexture) -> MTLTexture? {
        guard let blur = blur,
              let sharpen = sharpen,
              let contrast = contrast,
              let temp1 = makeTexture(like: input),
              let temp2 = makeTexture(like: input),
              let output = makeTexture(like: input),
              let commandBuffer = commandQueue.makeCommandBuffer() else { return nil }

        // Chain: Input → Blur → Sharpen → Contrast → Output
        blur.encode(commandBuffer: commandBuffer, sourceTexture: input, destinationTexture: temp1)
        sharpen.encode(commandBuffer: commandBuffer, sourceTexture: temp1, destinationTexture: temp2)
        contrast.encode(commandBuffer: commandBuffer, sourceTexture: temp2, destinationTexture: output)

        commandBuffer.commit()
        commandBuffer.waitUntilCompleted()

        // Do not hand back a texture whose contents are undefined after a GPU error.
        guard commandBuffer.status == .completed else { return nil }
        return output
    }

    /// Allocates a non-mipmapped 2D texture with the size and pixel format of
    /// `source`, usable for both shader reads and writes.
    private func makeTexture(like source: MTLTexture) -> MTLTexture? {
        let descriptor = MTLTextureDescriptor.texture2DDescriptor(
            pixelFormat: source.pixelFormat,
            width: source.width,
            height: source.height,
            mipmapped: false
        )
        descriptor.usage = [.shaderRead, .shaderWrite]
        return device.makeTexture(descriptor: descriptor)
    }
}
3) Custom Compute Shaders 🔧
For complex algorithms, write custom Metal shaders:
Edge Detection Shader
// EdgeDetection.metal
#include <metal_stdlib>
using namespace metal;
/// Sobel edge detector.
///
/// Each thread handles one pixel: it reads the 3x3 neighbourhood from `input`
/// (coordinates clamped to the image), converts every sample to luminance,
/// accumulates the horizontal and vertical Sobel responses and writes the
/// gradient magnitude to `output` as an opaque grey value.
kernel void sobelEdgeDetection(texture2d<float, access::read> input [[texture(0)]],
                               texture2d<float, access::write> output [[texture(1)]],
                               uint2 gid [[thread_position_in_grid]]) {
    const uint width = input.get_width();
    const uint height = input.get_height();

    // The grid is rounded up to whole threadgroups, so some threads fall
    // outside the image. Check the output size as well: the write below
    // targets `output`, which is not guaranteed to match `input`.
    if (gid.x >= width || gid.y >= height ||
        gid.x >= output.get_width() || gid.y >= output.get_height()) return;

    // Sobel kernels. Metal matrices are column-major and indexed as
    // m[column][row]; with the literals below, m[dy + 1][dx + 1] yields the
    // conventional Sobel weight for the neighbour at offset (dx, dy).
    const float3x3 sobelX = float3x3(-1, 0, 1, -2, 0, 2, -1, 0, 1);
    const float3x3 sobelY = float3x3(-1, -2, -1, 0, 0, 0, 1, 2, 1);

    // Rec. 601 luma weights.
    const float3 lumaWeights = float3(0.299, 0.587, 0.114);
    const int2 maxCoord = int2(int(width) - 1, int(height) - 1);

    float gx = 0.0;
    float gy = 0.0;
    for (int dx = -1; dx <= 1; dx++) {
        for (int dy = -1; dy <= 1; dy++) {
            // Clamp-to-edge addressing keeps border pixels inside the texture.
            const int2 coord = clamp(int2(gid) + int2(dx, dy), int2(0), maxCoord);
            // Projecting to luminance before accumulating is equivalent to
            // projecting the accumulated RGB gradients, because both are linear.
            const float luma = dot(input.read(uint2(coord)).rgb, lumaWeights);
            gx += luma * sobelX[dy + 1][dx + 1];
            gy += luma * sobelY[dy + 1][dx + 1];
        }
    }

    const float magnitude = length(float2(gx, gy));
    output.write(float4(magnitude, magnitude, magnitude, 1.0), gid);
}
Swift Integration
/// Runs the custom `sobelEdgeDetection` compute kernel on textures.
final class CustomShaderProcessor: GPUProcessor {
    /// `nil` when the shader function or pipeline could not be created; in that
    /// case `detectEdges(in:)` always returns `nil`.
    private var computePipelineState: MTLComputePipelineState?

    /// Returns `nil` when the underlying Metal setup in `GPUProcessor` fails.
    /// A pipeline setup failure is logged but does not fail initialization.
    override init?() {
        super.init()
        setupPipeline()
    }

    /// Looks up `sobelEdgeDetection` in the default library and builds the
    /// compute pipeline for it, logging the reason on failure.
    private func setupPipeline() {
        guard let library = device.makeDefaultLibrary(),
              let function = library.makeFunction(name: "sobelEdgeDetection") else {
            print("Failed to create compute function")
            return
        }
        do {
            computePipelineState = try device.makeComputePipelineState(function: function)
        } catch {
            print("Failed to create pipeline state: \(error)")
        }
    }

    /// Runs edge detection on `texture`.
    ///
    /// - Parameter texture: Source texture; it is only read.
    /// - Returns: A new texture of the same size and format holding the edge
    ///   magnitude, or `nil` if the pipeline is unavailable, a Metal object
    ///   could not be created, or the command buffer did not complete
    ///   successfully.
    ///
    /// Blocks the calling thread until the GPU has finished.
    func detectEdges(in texture: MTLTexture) -> MTLTexture? {
        // The encoder is created last: once it exists nothing below can bail
        // out early, so `endEncoding()` is always reached. Abandoning an
        // encoder without ending it is an API misuse.
        guard let pipelineState = computePipelineState,
              let outputTexture = makeTexture(like: texture),
              let commandBuffer = commandQueue.makeCommandBuffer(),
              let encoder = commandBuffer.makeComputeCommandEncoder() else { return nil }

        encoder.setComputePipelineState(pipelineState)
        encoder.setTexture(texture, index: 0)
        encoder.setTexture(outputTexture, index: 1)

        // Derive the threadgroup shape from the pipeline's own limits instead
        // of assuming 16x16 threads always fit.
        let groupWidth = pipelineState.threadExecutionWidth
        let groupHeight = max(1, pipelineState.maxTotalThreadsPerThreadgroup / groupWidth)
        let threadsPerGroup = MTLSize(width: groupWidth, height: groupHeight, depth: 1)

        // Round up so partially covered tiles at the right/bottom edges are
        // still dispatched; the kernel itself discards out-of-range threads.
        let groupsPerGrid = MTLSize(
            width: (texture.width + groupWidth - 1) / groupWidth,
            height: (texture.height + groupHeight - 1) / groupHeight,
            depth: 1
        )

        encoder.dispatchThreadgroups(groupsPerGrid, threadsPerThreadgroup: threadsPerGroup)
        encoder.endEncoding()

        commandBuffer.commit()
        commandBuffer.waitUntilCompleted()

        // Do not hand back a texture whose contents are undefined after a GPU error.
        guard commandBuffer.status == .completed else { return nil }
        return outputTexture
    }

    /// Allocates a non-mipmapped 2D texture with the size and pixel format of
    /// `source`, usable for both shader reads and writes.
    private func makeTexture(like source: MTLTexture) -> MTLTexture? {
        let descriptor = MTLTextureDescriptor.texture2DDescriptor(
            pixelFormat: source.pixelFormat,
            width: source.width,
            height: source.height,
            mipmapped: false
        )
        descriptor.usage = [.shaderRead, .shaderWrite]
        return device.makeTexture(descriptor: descriptor)
    }
}
4) Performance Optimization Patterns 🏎️
Memory Pool for Textures
/// Recycles 2D read/write textures so per-frame work does not allocate.
///
/// The pool performs no locking of its own; callers that share one pool across
/// threads must serialise access themselves.
final class TexturePool {
    /// Identifies interchangeable textures: same size and pixel format.
    private struct Key: Hashable {
        let width: Int
        let height: Int
        let pixelFormatRawValue: UInt
    }

    /// Usage given to every texture the pool creates and required of every
    /// texture it takes back.
    private static let pooledUsage: MTLTextureUsage = [.shaderRead, .shaderWrite]

    private var availableTextures: [Key: [MTLTexture]] = [:]
    private let device: MTLDevice

    /// - Parameter device: Device on which new textures are allocated.
    init(device: MTLDevice) {
        self.device = device
    }

    /// Returns a pooled texture matching the request, or allocates a new one.
    ///
    /// - Parameters:
    ///   - width: Texture width in pixels; must be positive.
    ///   - height: Texture height in pixels; must be positive.
    ///   - pixelFormat: Pixel format of the texture.
    /// - Returns: A non-mipmapped 2D texture usable for shader reads and
    ///   writes, or `nil` for non-positive dimensions or when allocation fails.
    func texture(width: Int, height: Int, pixelFormat: MTLPixelFormat) -> MTLTexture? {
        // Reject sizes that cannot describe a valid texture instead of passing
        // them on to the descriptor.
        guard width > 0, height > 0 else { return nil }

        let key = Key(width: width, height: height, pixelFormatRawValue: pixelFormat.rawValue)
        if let texture = availableTextures[key]?.popLast() {
            return texture
        }

        let descriptor = MTLTextureDescriptor.texture2DDescriptor(
            pixelFormat: pixelFormat,
            width: width,
            height: height,
            mipmapped: false
        )
        descriptor.usage = Self.pooledUsage
        return device.makeTexture(descriptor: descriptor)
    }

    /// Hands `texture` back to the pool for reuse.
    ///
    /// Textures that the pool could not have produced itself (another device,
    /// not plain 2D, mipmapped, or lacking read/write shader usage) are ignored
    /// rather than stored, so `texture(width:height:pixelFormat:)` never
    /// returns something with unexpected capabilities.
    func returnTexture(_ texture: MTLTexture) {
        guard texture.device === device,
              texture.textureType == .type2D,
              texture.mipmapLevelCount == 1,
              texture.usage.isSuperset(of: Self.pooledUsage) else { return }

        let key = Key(width: texture.width,
                      height: texture.height,
                      pixelFormatRawValue: texture.pixelFormat.rawValue)
        availableTextures[key, default: []].append(texture)
    }
}
Async Processing Pipeline
/// Exposes GPU edge detection as an `async` call that runs on a dedicated
/// dispatch queue rather than on the caller's executor.
final class AsyncImageProcessor {
    private let metalQueue = DispatchQueue(label: "metal.processing", qos: .userInitiated)
    // Typed as `CustomShaderProcessor`: `detectEdges(in:)` is declared there,
    // not on the `GPUProcessor` base class.
    private let processor: CustomShaderProcessor
    private let texturePool: TexturePool

    /// Traps with "Failed to initialize Metal" when the processor cannot be created.
    init() {
        guard let processor = CustomShaderProcessor() else {
            fatalError("Failed to initialize Metal")
        }
        self.processor = processor
        self.texturePool = TexturePool(device: processor.device)
    }

    /// Runs edge detection on `image`.
    ///
    /// - Parameter image: Source image; it must be backed by a `CGImage`.
    /// - Returns: The processed image, or `nil` if any stage (CGImage access,
    ///   texture upload, edge detection, image conversion) fails.
    ///
    /// NOTE(review): `loadTexture(from:)` and `imageFromTexture(_:)` are not
    /// part of this snippet; they are assumed to be provided elsewhere in the
    /// project — confirm.
    func processImage(_ image: UIImage) async -> UIImage? {
        await withCheckedContinuation { continuation in
            metalQueue.async {
                // Every path below resumes the continuation exactly once.
                guard let cgImage = image.cgImage,
                      let inputTexture = self.loadTexture(from: cgImage),
                      let outputTexture = self.processor.detectEdges(in: inputTexture),
                      let processedImage = self.imageFromTexture(outputTexture) else {
                    continuation.resume(returning: nil)
                    return
                }
                continuation.resume(returning: processedImage)
            }
        }
    }
}
5) Production Debugging and Profiling 🔍
Metal Frame Capture
Enable Metal frame debugging in Xcode:
- Product → Scheme → Edit Scheme
- Run → Diagnostics → Metal API Validation: Enabled
- Run → Options → GPU Frame Capture: Metal
Performance Metrics
extension GPUProcessor {
    /// Measures the average wall-clock time of one blocking run of `kernel`.
    ///
    /// - Parameters:
    ///   - kernel: Single-input MPS image kernel to time. The bound is
    ///     `MPSUnaryImageKernel` because that class declares
    ///     `encode(commandBuffer:sourceTexture:destinationTexture:)`.
    ///   - input: Source texture read on every iteration.
    ///   - iterations: Number of runs to attempt; defaults to 100.
    /// - Returns: Seconds per completed run, including command buffer creation,
    ///   encoding and the wait for completion. Returns `0` when `iterations`
    ///   is not positive, the output texture cannot be allocated, or no run
    ///   could be submitted.
    func benchmarkKernel<T: MPSUnaryImageKernel>(_ kernel: T,
                                                 input: MTLTexture,
                                                 iterations: Int = 100) -> TimeInterval {
        // A non-positive count would trap when forming the range below.
        guard iterations > 0 else { return 0 }

        // Allocate the destination once, outside the timed region, so the
        // figure reflects the kernel rather than texture allocation.
        let descriptor = MTLTextureDescriptor.texture2DDescriptor(
            pixelFormat: input.pixelFormat,
            width: input.width,
            height: input.height,
            mipmapped: false
        )
        descriptor.usage = [.shaderRead, .shaderWrite]
        guard let output = device.makeTexture(descriptor: descriptor) else { return 0 }

        var completedRuns = 0
        let startTime = CFAbsoluteTimeGetCurrent()
        for _ in 0..<iterations {
            guard let commandBuffer = commandQueue.makeCommandBuffer() else { continue }
            kernel.encode(commandBuffer: commandBuffer,
                          sourceTexture: input,
                          destinationTexture: output)
            commandBuffer.commit()
            commandBuffer.waitUntilCompleted()
            completedRuns += 1
        }
        let totalTime = CFAbsoluteTimeGetCurrent() - startTime

        // Average over runs that were actually submitted, so skipped
        // iterations do not make the kernel look faster than it is.
        guard completedRuns > 0 else { return 0 }
        return totalTime / Double(completedRuns)
    }
}
6) Real-World Integration: Camera + MPS 📸
import AVFoundation
/// Feeds camera frames through `FilterChain` and forwards the result for display.
///
/// NOTE(review): `setupCamera()` and `displayTexture(_:)` are not part of this
/// snippet; they are assumed to be provided elsewhere in the project — confirm.
final class RealtimeCameraProcessor: NSObject, AVCaptureVideoDataOutputSampleBufferDelegate {
    private let processor: FilterChain
    private let textureLoader: MTKTextureLoader
    /// Wraps camera pixel buffers as Metal textures. `nil` if the cache could
    /// not be created, in which case frames are dropped.
    private let textureCache: CVMetalTextureCache?

    /// Traps with "Metal initialization failed" when the filter chain cannot be created.
    override init() {
        guard let processor = FilterChain() else {
            fatalError("Metal initialization failed")
        }

        // Create the texture cache that `captureOutput` relies on.
        var cache: CVMetalTextureCache?
        let status = CVMetalTextureCacheCreate(kCFAllocatorDefault, nil, processor.device, nil, &cache)

        self.processor = processor
        self.textureLoader = MTKTextureLoader(device: processor.device)
        self.textureCache = status == kCVReturnSuccess ? cache : nil
        super.init()
        setupCamera()
    }

    /// Sample buffer delegate callback: converts the frame to a texture, runs
    /// the filter chain and schedules display on the main queue. Frames are
    /// silently dropped when any step fails.
    func captureOutput(_ output: AVCaptureOutput,
                       didOutput sampleBuffer: CMSampleBuffer,
                       from connection: AVCaptureConnection) {
        guard let textureCache = textureCache,
              let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return }

        // Convert CVPixelBuffer to MTLTexture
        // NOTE(review): `.bgra8Unorm` presumes the capture output is configured
        // to deliver 32BGRA pixel buffers — confirm in `setupCamera()`.
        var textureRef: CVMetalTexture?
        let status = CVMetalTextureCacheCreateTextureFromImage(
            nil, textureCache, pixelBuffer, nil, .bgra8Unorm,
            CVPixelBufferGetWidth(pixelBuffer),
            CVPixelBufferGetHeight(pixelBuffer),
            0, &textureRef
        )

        // `processImage` blocks until the GPU is done, so `textureRef` stays
        // alive in this scope for as long as the GPU reads from it.
        guard status == kCVReturnSuccess,
              let textureRef = textureRef,
              let inputTexture = CVMetalTextureGetTexture(textureRef),
              let processedTexture = processor.processImage(inputTexture) else { return }

        // Display processed texture
        // Weak capture: a frame still in flight must not keep the processor alive.
        DispatchQueue.main.async { [weak self] in
            self?.displayTexture(processedTexture)
        }
    }
}
7) Checklist for Production MPS ✅
- Memory management: Use texture pools, monitor memory pressure
- Error handling: Graceful fallbacks when Metal unavailable
- Threading: Keep Metal work off the main queue
- Power efficiency: Use appropriate precision (half vs float)
- Device compatibility: Check GPU family support with `MTLDevice.supportsFamily(_:)` (the older feature-set API is deprecated)
- Frame capture: Profile with Xcode’s GPU debugger
Key takeaway: MPS unlocks massive performance gains for computationally intensive tasks. Start with built-in kernels (blur, convolution), then graduate to custom shaders for specialized algorithms.
References:
Share on
Twitter · Facebook · LinkedIn
☕ Buy me a coffee! 💝
If you found this article helpful, consider buying me a coffee to support my work! 🚀