Unleashing GPU Power: Metal Performance Shaders for Real-Time Image Processing

7 minute read

Published:

Metal Performance Shaders (MPS) transforms iOS apps by offloading computation to the GPU. This guide shows production patterns for real-time image processing, ML inference, and compute shaders—delivering 10x+ performance gains over CPU-only approaches. 🚀

Why MPS Matters in Production Apps 📱

Modern iOS devices pack incredible GPU power. The A17 Pro, for example, pairs a 6-core GPU with a Neural Engine that Apple rates at 35 TOPS. MPS lets you tap the GPU for:

  • Real-time filters (Instagram, TikTok-style effects)
  • ML model inference (Core ML acceleration)
  • Computer vision (object detection, segmentation)
  • Scientific computing (signal processing, simulations)

1) Foundation: MTLDevice and Command Buffers

import Metal
import MetalPerformanceShaders

/// Base class that owns the Metal device and command queue used by the
/// processors in this guide.
///
/// The class is intentionally not `final`, and its two properties are not
/// `private`: `BlurProcessor`, `FilterChain` and `CustomShaderProcessor`
/// inherit from it and read `device` / `commandQueue` directly.
class GPUProcessor {
    /// GPU used to create textures, kernels and pipeline states.
    let device: MTLDevice
    /// Queue from which command buffers are created.
    let commandQueue: MTLCommandQueue

    /// Creates a processor bound to the system default Metal device.
    ///
    /// Returns `nil` when no Metal device exists or a command queue cannot be
    /// created, so callers can fall back to a CPU path.
    init?() {
        guard let device = MTLCreateSystemDefaultDevice(),
              let commandQueue = device.makeCommandQueue() else {
            return nil
        }
        self.device = device
        self.commandQueue = commandQueue
    }

    /// Runs a single unary image kernel and blocks until the GPU has finished.
    ///
    /// - Parameters:
    ///   - kernel: Kernel to run. Constrained to `MPSUnaryImageKernel` because
    ///     that is the type declaring
    ///     `encode(commandBuffer:sourceTexture:destinationTexture:)`;
    ///     the `MPSKernel` base class does not.
    ///   - input: Texture the kernel reads from.
    ///   - output: Texture the kernel writes to.
    ///
    /// Does nothing if a command buffer cannot be created.
    func process<T: MPSUnaryImageKernel>(_ kernel: T,
                                         input: MTLTexture,
                                         output: MTLTexture) {
        guard let commandBuffer = commandQueue.makeCommandBuffer() else { return }

        kernel.encode(commandBuffer: commandBuffer,
                      sourceTexture: input,
                      destinationTexture: output)

        commandBuffer.commit()
        // Synchronous on purpose: `output` is safe to read as soon as this returns.
        commandBuffer.waitUntilCompleted()
    }
}

2) Real-Time Image Filters with MPS 🎨

Gaussian Blur Pipeline

/// Applies a Gaussian blur (sigma 2.0) to a texture on the GPU.
final class BlurProcessor: GPUProcessor {
    /// Optional only because the kernel needs `device`, which is not available
    /// until `super.init()` has run. It is set on every successful init.
    private var blurKernel: MPSImageGaussianBlur?

    /// Returns `nil` when the base class cannot set up Metal.
    override init?() {
        super.init()
        // `device` is a non-optional `MTLDevice`, so it is used directly;
        // optional binding on it does not compile.
        blurKernel = MPSImageGaussianBlur(device: device, sigma: 2.0)
    }

    /// Blurs `texture` into a newly allocated texture.
    ///
    /// - Parameter texture: Source image; it is not modified.
    /// - Returns: The blurred copy, or `nil` if the kernel is missing or the
    ///   output texture cannot be allocated.
    func applyBlur(to texture: MTLTexture) -> MTLTexture? {
        guard let blur = blurKernel,
              let outputTexture = makeTexture(like: texture) else { return nil }

        process(blur, input: texture, output: outputTexture)
        return outputTexture
    }

    /// Allocates a non-mipmapped 2D texture with the same size and pixel format
    /// as `source`, readable and writable from shaders.
    private func makeTexture(like source: MTLTexture) -> MTLTexture? {
        let descriptor = MTLTextureDescriptor.texture2DDescriptor(
            pixelFormat: source.pixelFormat,
            width: source.width,
            height: source.height,
            mipmapped: false
        )
        descriptor.usage = [.shaderRead, .shaderWrite]
        return device.makeTexture(descriptor: descriptor)
    }
}

Chain Multiple Effects

/// Runs a blur → sharpen → contrast chain inside a single command buffer.
///
/// No initializer is declared here: the kernels are `lazy`, so the class
/// inherits the failable `init?()` from `GPUProcessor`, and `FilterChain()`
/// still returns `nil` when Metal is unavailable.
final class FilterChain: GPUProcessor {
    /// Contrast gain applied around mid-grey; 1.0 leaves the image unchanged.
    private static let contrastGain: Float = 1.2

    // The kernels need `device`, which only exists after the base initializer
    // has run. Building them lazily avoids assigning `let` properties after
    // `super.init()`, which Swift's two-phase initialization rejects.
    private lazy var blur = MPSImageGaussianBlur(device: self.device, sigma: 1.0)

    // MetalPerformanceShaders has no dedicated unsharp-mask or contrast kernel,
    // so both steps are expressed with `MPSImageConvolution`.

    /// 3×3 sharpening kernel. The weights sum to 1, so overall brightness is kept.
    private lazy var sharpen: MPSImageConvolution = {
        let weights: [Float] = [
             0, -1,  0,
            -1,  5, -1,
             0, -1,  0
        ]
        return MPSImageConvolution(device: self.device,
                                   kernelWidth: 3,
                                   kernelHeight: 3,
                                   weights: weights)
    }()

    /// Per-pixel contrast: `out = gain * (in - 0.5) + 0.5`, written as a 1×1
    /// convolution (`gain * in`) plus a constant bias. Every channel is
    /// processed the same way.
    private lazy var contrast: MPSImageConvolution = {
        let gain = FilterChain.contrastGain
        let kernel = MPSImageConvolution(device: self.device,
                                         kernelWidth: 1,
                                         kernelHeight: 1,
                                         weights: [gain])
        kernel.bias = 0.5 * (1 - gain)
        return kernel
    }()

    /// Applies the full chain to `input`.
    ///
    /// - Parameter input: Source texture; it is not modified.
    /// - Returns: A new texture holding the result, or `nil` if a texture or
    ///   command buffer cannot be created or the GPU reports an error.
    func processImage(_ input: MTLTexture) -> MTLTexture? {
        guard let blurred = makeTexture(like: input),
              let sharpened = makeTexture(like: input),
              let output = makeTexture(like: input),
              let commandBuffer = commandQueue.makeCommandBuffer() else { return nil }

        // Chain: Input → Blur → Sharpen → Contrast → Output
        blur.encode(commandBuffer: commandBuffer, sourceTexture: input, destinationTexture: blurred)
        sharpen.encode(commandBuffer: commandBuffer, sourceTexture: blurred, destinationTexture: sharpened)
        contrast.encode(commandBuffer: commandBuffer, sourceTexture: sharpened, destinationTexture: output)

        commandBuffer.commit()
        commandBuffer.waitUntilCompleted()

        // Do not hand back a texture whose contents the GPU failed to produce.
        guard commandBuffer.status == .completed else { return nil }
        return output
    }

    /// Allocates a non-mipmapped 2D texture with the same size and pixel format
    /// as `source`, readable and writable from shaders.
    private func makeTexture(like source: MTLTexture) -> MTLTexture? {
        let descriptor = MTLTextureDescriptor.texture2DDescriptor(
            pixelFormat: source.pixelFormat,
            width: source.width,
            height: source.height,
            mipmapped: false
        )
        descriptor.usage = [.shaderRead, .shaderWrite]
        return device.makeTexture(descriptor: descriptor)
    }
}

3) Custom Compute Shaders 🔧

For complex algorithms, write custom Metal shaders:

Edge Detection Shader

// EdgeDetection.metal
#include <metal_stdlib>
using namespace metal;

/// Sobel edge detector: one thread per output pixel.
///
/// Reads the 3×3 neighbourhood of `gid` from `input` (texture 0), with
/// coordinates clamped to the image bounds, accumulates two weighted RGB sums,
/// reduces each to a single value with the weights (0.299, 0.587, 0.114), and
/// writes the length of the resulting 2D gradient to `output` (texture 1) as
/// an opaque grey pixel.
kernel void sobelEdgeDetection(texture2d<float, access::read> input [[texture(0)]],
                              texture2d<float, access::write> output [[texture(1)]],
                              uint2 gid [[thread_position_in_grid]]) {
    const uint width = input.get_width();
    const uint height = input.get_height();

    // Threads that fall outside the image have nothing to do.
    if (gid.x >= width || gid.y >= height) {
        return;
    }

    // Sobel kernels
    const float3x3 sobelX = float3x3(-1, 0, 1, -2, 0, 2, -1, 0, 1);
    const float3x3 sobelY = float3x3(-1, -2, -1, 0, 0, 0, 1, 2, 1);
    const float3 luma = float3(0.299, 0.587, 0.114);

    const int maxX = int(width - 1);
    const int maxY = int(height - 1);

    float3 gx = float3(0.0);
    float3 gy = float3(0.0);

    for (int dx = -1; dx <= 1; dx++) {
        for (int dy = -1; dy <= 1; dy++) {
            // Clamp so border pixels reuse the nearest valid neighbour.
            const int x = clamp(int(gid.x) + dx, 0, maxX);
            const int y = clamp(int(gid.y) + dy, 0, maxY);
            const float3 pixel = input.read(uint2(x, y)).rgb;

            // NOTE(review): the matrices are indexed [dx + 1][dy + 1]; the
            // final magnitude is the same whichever sum is called "x" or "y".
            gx += pixel * sobelX[dx + 1][dy + 1];
            gy += pixel * sobelY[dx + 1][dy + 1];
        }
    }

    const float2 gradient = float2(dot(gx, luma), dot(gy, luma));
    const float magnitude = length(gradient);

    output.write(float4(magnitude, magnitude, magnitude, 1.0), gid);
}

Swift Integration

/// Runs the `sobelEdgeDetection` compute kernel from the app's default Metal
/// library.
final class CustomShaderProcessor: GPUProcessor {
    /// Optional only because it needs `device`, which is not available until
    /// `super.init()` has run.
    private var computePipelineState: MTLComputePipelineState?

    /// Returns `nil` when Metal is unavailable or the compute pipeline cannot
    /// be built. Without a pipeline every `detectEdges(in:)` call would return
    /// `nil`, so construction fails up front.
    override init?() {
        super.init()
        setupPipeline()
        guard computePipelineState != nil else { return nil }
    }

    /// Looks up `sobelEdgeDetection` in the default library and compiles it
    /// into `computePipelineState`. Failures are logged and leave the state `nil`.
    private func setupPipeline() {
        guard let library = device.makeDefaultLibrary(),
              let function = library.makeFunction(name: "sobelEdgeDetection") else {
            print("Failed to create compute function")
            return
        }

        do {
            computePipelineState = try device.makeComputePipelineState(function: function)
        } catch {
            print("Failed to create pipeline state: \(error)")
        }
    }

    /// Runs edge detection on `texture`.
    ///
    /// - Parameter texture: Source image; it is not modified.
    /// - Returns: A new texture of the same size and format containing the
    ///   edge magnitude, or `nil` if a resource cannot be created or the GPU
    ///   reports an error.
    func detectEdges(in texture: MTLTexture) -> MTLTexture? {
        // The encoder is acquired last: once it exists it must reach
        // `endEncoding()`, so nothing that can fail may come after it.
        guard let pipelineState = computePipelineState,
              let outputTexture = makeTexture(like: texture),
              let commandBuffer = commandQueue.makeCommandBuffer(),
              let encoder = commandBuffer.makeComputeCommandEncoder() else { return nil }

        encoder.setComputePipelineState(pipelineState)
        encoder.setTexture(texture, index: 0)
        encoder.setTexture(outputTexture, index: 1)

        // Size the threadgroup from the pipeline's own limits instead of a
        // hard-coded 16×16, which may exceed what this pipeline supports.
        let groupWidth = pipelineState.threadExecutionWidth
        let groupHeight = max(1, pipelineState.maxTotalThreadsPerThreadgroup / groupWidth)
        let threadsPerGroup = MTLSize(width: groupWidth, height: groupHeight, depth: 1)

        // Round up so partial tiles at the right and bottom edges are covered;
        // the kernel itself discards threads outside the image.
        let groupsPerGrid = MTLSize(
            width: (texture.width + groupWidth - 1) / groupWidth,
            height: (texture.height + groupHeight - 1) / groupHeight,
            depth: 1
        )

        encoder.dispatchThreadgroups(groupsPerGrid, threadsPerThreadgroup: threadsPerGroup)
        encoder.endEncoding()

        commandBuffer.commit()
        commandBuffer.waitUntilCompleted()

        guard commandBuffer.status == .completed else { return nil }
        return outputTexture
    }

    /// Allocates a non-mipmapped 2D texture with the same size and pixel format
    /// as `source`, readable and writable from shaders.
    private func makeTexture(like source: MTLTexture) -> MTLTexture? {
        let descriptor = MTLTextureDescriptor.texture2DDescriptor(
            pixelFormat: source.pixelFormat,
            width: source.width,
            height: source.height,
            mipmapped: false
        )
        descriptor.usage = [.shaderRead, .shaderWrite]
        return device.makeTexture(descriptor: descriptor)
    }
}

4) Performance Optimization Patterns 🏎️

Memory Pool for Textures

/// Recycles 2D textures so per-frame work does not allocate new GPU memory.
///
/// The pool does no locking and never evicts; callers should use it from a
/// single queue and drop the pool when memory is tight.
final class TexturePool {
    /// Identifies interchangeable textures: same size and pixel format.
    private struct Key: Hashable {
        let width: Int
        let height: Int
        let pixelFormat: MTLPixelFormat
    }

    /// Usage flags every texture handed out by the pool is guaranteed to have.
    private static let pooledUsage: MTLTextureUsage = [.shaderRead, .shaderWrite]

    private var availableTextures: [Key: [MTLTexture]] = [:]
    private let device: MTLDevice

    /// - Parameter device: Device on which new textures are allocated.
    init(device: MTLDevice) {
        self.device = device
    }

    /// Returns a texture with the requested size and format, reusing a pooled
    /// one when available and allocating otherwise.
    ///
    /// - Returns: A shader-readable and shader-writable 2D texture, or `nil`
    ///   if allocation fails. Its contents are unspecified.
    func texture(width: Int, height: Int, pixelFormat: MTLPixelFormat) -> MTLTexture? {
        let key = Key(width: width, height: height, pixelFormat: pixelFormat)

        if let recycled = availableTextures[key]?.popLast() {
            return recycled
        }

        let descriptor = MTLTextureDescriptor.texture2DDescriptor(
            pixelFormat: pixelFormat,
            width: width,
            height: height,
            mipmapped: false
        )
        descriptor.usage = TexturePool.pooledUsage

        return device.makeTexture(descriptor: descriptor)
    }

    /// Hands `texture` back for reuse.
    ///
    /// Textures that could not have come from `texture(width:height:pixelFormat:)`
    /// (another device, not plain 2D, mipmapped, or lacking shader read/write
    /// usage) are ignored, so a later caller never receives something it
    /// cannot write to.
    func returnTexture(_ texture: MTLTexture) {
        guard texture.device === device,
              texture.textureType == .type2D,
              texture.mipmapLevelCount == 1,
              texture.usage.contains(TexturePool.pooledUsage) else { return }

        let key = Key(width: texture.width,
                      height: texture.height,
                      pixelFormat: texture.pixelFormat)
        availableTextures[key, default: []].append(texture)
    }
}

Async Processing Pipeline

/// Bridges the synchronous GPU edge detector into `async`/`await`, running the
/// work on a dedicated background queue.
final class AsyncImageProcessor {
    private let metalQueue = DispatchQueue(label: "metal.processing", qos: .userInitiated)
    /// Typed as `CustomShaderProcessor` because that is the class declaring
    /// `detectEdges(in:)`; the `GPUProcessor` base class has no such method.
    private let processor: CustomShaderProcessor
    private let texturePool: TexturePool

    /// Traps if the processor cannot be created.
    init() {
        guard let processor = CustomShaderProcessor() else {
            fatalError("Failed to initialize Metal")
        }
        self.processor = processor
        self.texturePool = TexturePool(device: processor.device)
    }

    /// Runs edge detection on `image` off the caller's thread.
    ///
    /// - Parameter image: Image to process; it must be backed by a `CGImage`.
    /// - Returns: The processed image, or `nil` if any conversion or GPU step fails.
    ///
    /// NOTE(review): `loadTexture(from:)` and `imageFromTexture(_:)` are not
    /// defined in this listing — presumably supplied elsewhere; confirm.
    func processImage(_ image: UIImage) async -> UIImage? {
        await withCheckedContinuation { continuation in
            metalQueue.async {
                // Every path below resumes the continuation exactly once.
                guard let cgImage = image.cgImage,
                      let inputTexture = self.loadTexture(from: cgImage),
                      let outputTexture = self.processor.detectEdges(in: inputTexture),
                      let processedImage = self.imageFromTexture(outputTexture) else {
                    continuation.resume(returning: nil)
                    return
                }

                continuation.resume(returning: processedImage)
            }
        }
    }
}

5) Production Debugging and Profiling 🔍

Metal Frame Capture

Enable Metal frame debugging in Xcode:

  1. Product → Scheme → Edit Scheme
  2. Run → Diagnostics → Metal API Validation: Enabled
  3. Run → Options → GPU Frame Capture: Metal

Performance Metrics

extension GPUProcessor {
    /// Measures the average wall-clock time of one encode → commit → wait
    /// round trip for `kernel`.
    ///
    /// - Parameters:
    ///   - kernel: Kernel to time. Constrained to `MPSUnaryImageKernel`, the
    ///     type that declares
    ///     `encode(commandBuffer:sourceTexture:destinationTexture:)`.
    ///   - input: Source texture; the output is sized and formatted to match.
    ///   - iterations: Number of runs to attempt.
    /// - Returns: Seconds per completed run, or `0` when nothing could be
    ///   measured (`iterations <= 0`, no output texture, or no run completed).
    func benchmarkKernel<T: MPSUnaryImageKernel>(_ kernel: T,
                                                 input: MTLTexture,
                                                 iterations: Int = 100) -> TimeInterval {
        // `0..<iterations` traps for negative counts, and zero would divide by zero.
        guard iterations > 0 else { return 0 }

        // Allocate the destination once, outside the timed region, so the
        // figure reflects the kernel and not texture allocation.
        let descriptor = MTLTextureDescriptor.texture2DDescriptor(
            pixelFormat: input.pixelFormat,
            width: input.width,
            height: input.height,
            mipmapped: false
        )
        descriptor.usage = [.shaderRead, .shaderWrite]
        guard let output = device.makeTexture(descriptor: descriptor) else { return 0 }

        var completedRuns = 0
        let startTime = CFAbsoluteTimeGetCurrent()

        for _ in 0..<iterations {
            guard let commandBuffer = commandQueue.makeCommandBuffer() else { continue }

            kernel.encode(commandBuffer: commandBuffer,
                          sourceTexture: input,
                          destinationTexture: output)
            commandBuffer.commit()
            commandBuffer.waitUntilCompleted()

            if commandBuffer.status == .completed {
                completedRuns += 1
            }
        }

        let totalTime = CFAbsoluteTimeGetCurrent() - startTime

        // Average over the runs that actually executed, not the number requested.
        guard completedRuns > 0 else { return 0 }
        return totalTime / Double(completedRuns)
    }
}

6) Real-World Integration: Camera + MPS 📸

import AVFoundation

/// Receives camera frames, runs them through `FilterChain` on the GPU, and
/// hands the result to the main queue for display.
///
/// NOTE(review): `setupCamera()` and `displayTexture(_:)` are not defined in
/// this listing — presumably supplied elsewhere; confirm.
final class RealtimeCameraProcessor: NSObject, AVCaptureVideoDataOutputSampleBufferDelegate {
    private let processor: FilterChain
    /// Wraps camera pixel buffers as Metal textures. `captureOutput` reads this
    /// cache on every frame, so it is created once here.
    private let textureCache: CVMetalTextureCache

    /// Traps if Metal or the texture cache cannot be set up.
    override init() {
        guard let processor = FilterChain() else {
            fatalError("Metal initialization failed")
        }

        var cache: CVMetalTextureCache?
        let status = CVMetalTextureCacheCreate(kCFAllocatorDefault, nil, processor.device, nil, &cache)
        guard status == kCVReturnSuccess, let textureCache = cache else {
            fatalError("Metal texture cache creation failed")
        }

        self.processor = processor
        self.textureCache = textureCache
        super.init()
        setupCamera()
    }

    /// Delegate callback for each captured frame. Frames that cannot be
    /// converted or processed are dropped silently.
    func captureOutput(_ output: AVCaptureOutput,
                       didOutput sampleBuffer: CMSampleBuffer,
                       from connection: AVCaptureConnection) {
        guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return }

        // Convert CVPixelBuffer to MTLTexture
        // NOTE(review): `.bgra8Unorm` assumes the capture output is configured
        // for 32-bit BGRA frames — confirm in setupCamera().
        var textureRef: CVMetalTexture?
        let status = CVMetalTextureCacheCreateTextureFromImage(
            nil, textureCache, pixelBuffer, nil, .bgra8Unorm,
            CVPixelBufferGetWidth(pixelBuffer),
            CVPixelBufferGetHeight(pixelBuffer),
            0, &textureRef
        )

        // `processImage` returns only after the GPU has finished, so
        // `textureRef` stays alive for as long as the GPU reads the frame.
        guard status == kCVReturnSuccess,
              let textureRef = textureRef,
              let inputTexture = CVMetalTextureGetTexture(textureRef),
              let processedTexture = processor.processImage(inputTexture) else { return }

        // Display processed texture
        DispatchQueue.main.async {
            self.displayTexture(processedTexture)
        }
    }
}

7) Checklist for Production MPS ✅

  • Memory management: Use texture pools, monitor memory pressure
  • Error handling: Graceful fallbacks when Metal is unavailable
  • Threading: Keep Metal work off the main queue
  • Power efficiency: Use appropriate precision (half vs float)
  • Device compatibility: Check Metal feature sets
  • Frame capture: Profile with Xcode’s GPU debugger

Key takeaway: MPS unlocks massive performance gains for computationally intensive tasks. Start with built-in kernels (blur, convolution), then graduate to custom shaders for specialized algorithms.

References: