# to-live-photo/scripts/convert_realesrgan_to_coreml.py

#!/usr/bin/env python3
"""
Real-ESRGAN x2plus PyTorch to Core ML Conversion Script

Requirements:
    pip install torch torchvision coremltools pillow numpy

Usage:
    1. Download RealESRGAN_x2plus.pth from:
       https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.1/RealESRGAN_x2plus.pth

    2. Place the .pth file in this scripts/ directory

    3. Run: python convert_realesrgan_to_coreml.py

    4. Output: RealESRGAN_x2plus.mlpackage in ../Sources/LivePhotoCore/Resources/

Note: The model processes tiles of 128x128 pixels.
      For larger images, the app will tile and stitch.
"""

import torch
import torch.nn as nn
import coremltools as ct
from coremltools.models.neural_network import quantization_utils
import numpy as np
from pathlib import Path
import sys

# ============================================================================
# Real-ESRGAN Model Architecture (RRDBNet)
# Simplified version matching the official implementation
# ============================================================================

def make_layer(block, n_layers, **kwargs):
    """Chain ``n_layers`` freshly built ``block`` instances into one module.

    Args:
        block: Callable (typically an ``nn.Module`` subclass) that is invoked
            once per layer as ``block(**kwargs)``.
        n_layers: How many instances to create.
        **kwargs: Keyword arguments forwarded unchanged to every ``block`` call.

    Returns:
        An ``nn.Sequential`` containing the instances in creation order.
    """
    return nn.Sequential(*(block(**kwargs) for _ in range(n_layers)))


class ResidualDenseBlock(nn.Module):
    """Densely connected stack of five 3x3 convolutions with a scaled skip.

    Every convolution receives the block input concatenated (along the channel
    axis) with the activated outputs of all earlier convolutions. The last
    convolution maps back to ``num_feat`` channels; its output is scaled by
    0.2 and added to the block input.

    Args:
        num_feat: Channel count of the block input and output.
        num_grow_ch: Channels produced by each of the first four convolutions.
    """

    def __init__(self, num_feat=64, num_grow_ch=32):
        super().__init__()
        # Input width of conv k is num_feat plus (k - 1) growth outputs.
        widths = [num_feat + k * num_grow_ch for k in range(5)]
        self.conv1 = nn.Conv2d(widths[0], num_grow_ch, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(widths[1], num_grow_ch, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(widths[2], num_grow_ch, kernel_size=3, stride=1, padding=1)
        self.conv4 = nn.Conv2d(widths[3], num_grow_ch, kernel_size=3, stride=1, padding=1)
        self.conv5 = nn.Conv2d(widths[4], num_feat, kernel_size=3, stride=1, padding=1)
        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)

    def forward(self, x):
        # `dense` accumulates the input and each activated growth output.
        dense = [x, self.lrelu(self.conv1(x))]
        for conv in (self.conv2, self.conv3, self.conv4):
            dense.append(self.lrelu(conv(torch.cat(dense, 1))))
        # The final convolution is not activated before the residual add.
        residual = self.conv5(torch.cat(dense, 1))
        return residual * 0.2 + x


class RRDB(nn.Module):
    """Three ``ResidualDenseBlock`` modules in series with an outer scaled skip.

    The input is passed through ``rdb1``, ``rdb2`` and ``rdb3`` in that order;
    the result is scaled by 0.2 and added back to the original input.

    Args:
        num_feat: Channel count of the input and output feature map.
        num_grow_ch: Growth channels handed to each dense block.
    """

    def __init__(self, num_feat, num_grow_ch=32):
        super().__init__()
        # Attribute names are fixed because they become state-dict key prefixes.
        for attr in ("rdb1", "rdb2", "rdb3"):
            setattr(self, attr, ResidualDenseBlock(num_feat, num_grow_ch))

    def forward(self, x):
        out = x
        for dense_block in (self.rdb1, self.rdb2, self.rdb3):
            out = dense_block(out)
        return out * 0.2 + x


class RRDBNet(nn.Module):
    """RRDB generator laid out like the official Real-ESRGAN ``RRDBNet``.

    The layer names and shapes must match the published checkpoints because
    the weights are loaded with ``strict=True``. In the official architecture
    the network body always upsamples by 4x (``conv_up1`` and ``conv_up2``);
    lower scales are obtained by pixel-unshuffling the input first:

    * ``scale == 4``: input is used as is (``num_in_ch`` channels).
    * ``scale == 2``: input is pixel-unshuffled by 2, so ``conv_first`` sees
      ``num_in_ch * 4`` channels at half resolution; net effect is 2x.
    * ``scale == 1``: input is pixel-unshuffled by 4, so ``conv_first`` sees
      ``num_in_ch * 16`` channels at quarter resolution; net effect is 1x.

    Args:
        num_in_ch: Channels of the image fed to ``forward``.
        num_out_ch: Channels of the produced image.
        scale: Overall upscale factor (1, 2 or 4).
        num_feat: Width of the intermediate feature maps.
        num_block: Number of ``RRDB`` blocks in the body.
        num_grow_ch: Growth channels inside each dense block.

    Note:
        For ``scale == 2`` (``scale == 1``) the input height and width must be
        divisible by 2 (4) for the pixel-unshuffle to be valid.
    """

    def __init__(self, num_in_ch=3, num_out_ch=3, scale=2, num_feat=64, num_block=23, num_grow_ch=32):
        super(RRDBNet, self).__init__()
        self.scale = scale

        # Pixel-unshuffle folds spatial positions into channels, so the first
        # convolution must accept the enlarged channel count.
        if scale == 2:
            num_in_ch = num_in_ch * 4
        elif scale == 1:
            num_in_ch = num_in_ch * 16

        # First conv
        self.conv_first = nn.Conv2d(num_in_ch, num_feat, 3, 1, 1)

        # Body (RRDB blocks)
        self.body = make_layer(RRDB, num_block, num_feat=num_feat, num_grow_ch=num_grow_ch)
        self.conv_body = nn.Conv2d(num_feat, num_feat, 3, 1, 1)

        # Upsampling: both stages exist for every scale, as in the official
        # checkpoints (which contain conv_up2 weights for x2 models too).
        self.conv_up1 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
        self.conv_up2 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)

        self.conv_hr = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
        self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)

        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)

    def forward(self, x):
        # Trade resolution for channels so the fixed 4x body yields `scale`x.
        if self.scale == 2:
            feat = nn.functional.pixel_unshuffle(x, 2)
        elif self.scale == 1:
            feat = nn.functional.pixel_unshuffle(x, 4)
        else:
            feat = x

        feat = self.conv_first(feat)
        body_feat = self.conv_body(self.body(feat))
        feat = feat + body_feat

        # Two nearest-neighbour 2x upsampling stages, each followed by a conv.
        feat = self.lrelu(self.conv_up1(nn.functional.interpolate(feat, scale_factor=2, mode='nearest')))
        feat = self.lrelu(self.conv_up2(nn.functional.interpolate(feat, scale_factor=2, mode='nearest')))

        out = self.conv_last(self.lrelu(self.conv_hr(feat)))
        return out


class _PixelRangeOutput(nn.Module):
    """Wrap a network producing [0, 1] values so it emits clamped [0, 255] pixels."""

    def __init__(self, network):
        super().__init__()
        self.network = network

    def forward(self, x):
        # Clamp first: the raw network output can leave [0, 1], and the result
        # is written into an 8-bit image on the Core ML side.
        return torch.clamp(self.network(x), 0.0, 1.0) * 255.0


def convert_to_coreml(
    weights_path: str,
    output_dir: str,
    tile_size: int = 128,
    scale: int = 2,
    quantize: bool = True
) -> Path:
    """
    Convert Real-ESRGAN PyTorch weights to Core ML format.

    The model is traced at a fixed ``tile_size`` x ``tile_size`` input and
    exported as an ML Program with image input and image output. Input pixels
    are scaled to [0, 1] by the input ``ImageType``; the conversion back to
    [0, 255] (including clamping) is performed inside the traced graph rather
    than through ``scale=`` on the output ``ImageType``.
    NOTE(review): coremltools documents ``scale``/``bias`` as input
    preprocessing; confirm whether they are honoured on outputs at all.

    Args:
        weights_path: Path to .pth weights file
        output_dir: Output directory for .mlpackage (created if missing)
        tile_size: Input tile size (128 recommended for memory efficiency)
        scale: Upscale factor (2 or 4)
        quantize: Apply INT8 weight quantization to reduce model size

    Returns:
        Path of the saved ``RealESRGAN_x{scale}plus.mlpackage``.

    Raises:
        RuntimeError: From ``load_state_dict`` if the checkpoint does not
            match the ``RRDBNet`` layout (loading is strict).
    """
    print(f"Loading PyTorch model from: {weights_path}")

    # Initialize model
    model = RRDBNet(
        num_in_ch=3,
        num_out_ch=3,
        scale=scale,
        num_feat=64,
        num_block=23,
        num_grow_ch=32
    )

    # Load weights. The checkpoint is a downloaded pickle, so restrict
    # unpickling to plain tensors/containers (needs a torch version that
    # supports `weights_only`).
    state_dict = torch.load(weights_path, map_location='cpu', weights_only=True)

    # Handle different weight formats
    if 'params_ema' in state_dict:
        state_dict = state_dict['params_ema']
    elif 'params' in state_dict:
        state_dict = state_dict['params']

    model.load_state_dict(state_dict, strict=True)
    model.eval()

    print(f"Model loaded successfully. Scale: {scale}x")

    # Trace the model together with the output clamp/rescale so the exported
    # graph directly yields displayable pixel values.
    print(f"Tracing model with input size: {tile_size}x{tile_size}")
    example_input = torch.rand(1, 3, tile_size, tile_size)
    export_model = _PixelRangeOutput(model).eval()

    with torch.no_grad():
        traced_model = torch.jit.trace(export_model, example_input)

    # Convert to Core ML
    print("Converting to Core ML...")

    output_size = tile_size * scale

    mlmodel = ct.convert(
        traced_model,
        inputs=[
            ct.ImageType(
                name="input",
                shape=(1, 3, tile_size, tile_size),
                color_layout=ct.colorlayout.RGB,
                scale=1.0 / 255.0,  # Normalize to [0, 1]
            )
        ],
        outputs=[
            ct.ImageType(
                name="output",
                color_layout=ct.colorlayout.RGB,
            )
        ],
        minimum_deployment_target=ct.target.iOS17,
        compute_units=ct.ComputeUnit.ALL,  # Enable Neural Engine
        convert_to="mlprogram",  # Use ML Program format for iOS 17+
    )

    # Apply quantization if requested
    if quantize:
        print("Applying INT8 quantization...")
        # ML Programs are compressed through coremltools.optimize.coreml
        # (the neural_network quantization_utils do not apply to them).
        # Data-free, weight-only linear quantization needs no calibration set.
        import coremltools.optimize.coreml as cto

        op_config = cto.OpLinearQuantizerConfig(mode="linear_symmetric")
        quant_config = cto.OptimizationConfig(global_config=op_config)
        mlmodel = cto.linear_quantize_weights(mlmodel, config=quant_config)

    # Set model metadata (after quantization, which returns a new model object)
    mlmodel.author = "Real-ESRGAN (xinntao) / Converted for Live Photo Maker"
    mlmodel.license = "BSD 3-Clause License"
    mlmodel.short_description = f"Real-ESRGAN x{scale} super-resolution model. Input: {tile_size}x{tile_size} RGB image tile. Output: {output_size}x{output_size} enhanced tile."
    mlmodel.version = "1.0"

    # Save
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    output_path = target_dir / f"RealESRGAN_x{scale}plus.mlpackage"
    print(f"Saving to: {output_path}")
    mlmodel.save(str(output_path))

    # Print model info; an .mlpackage is a directory, so sum its files.
    spec = mlmodel.get_spec()
    package_bytes = sum(f.stat().st_size for f in output_path.rglob('*') if f.is_file())
    print("\n=== Model Info ===")
    print(f"Input: {spec.description.input[0].name}")
    print(f"Output: {spec.description.output[0].name}")
    print(f"File size: {package_bytes / 1024 / 1024:.2f} MB")

    return output_path


def verify_conversion(mlpackage_path: str, weights_path: str, tile_size: int = 128) -> bool:
    """
    Smoke-test the converted Core ML model with one random RGB tile.

    This only checks that the package loads and that a prediction runs; it
    does not compare the result against the PyTorch model.

    Args:
        mlpackage_path: Path to the saved .mlpackage.
        weights_path: Currently unused; kept so existing callers keep working.
        tile_size: Edge length of the square test tile; must match the tile
            size the model was converted with.

    Returns:
        True if inference ran, False if any step raised. Failures are
        reported on stdout and deliberately not re-raised so that a host
        unable to run Core ML predictions does not abort the conversion.
    """
    print("\n=== Verifying Conversion ===")

    try:
        import coremltools as ct
        from PIL import Image

        # Load Core ML model
        mlmodel = ct.models.MLModel(mlpackage_path)

        # Create test input. randint's upper bound is exclusive, so 256 is
        # needed for the full 0..255 pixel range.
        test_input = np.random.randint(0, 256, (tile_size, tile_size, 3), dtype=np.uint8)
        # A uint8 HxWx3 array is interpreted as RGB without passing the
        # deprecated `mode` argument.
        test_image = Image.fromarray(test_input)

        # Run Core ML inference
        coreml_output = mlmodel.predict({'input': test_image})

        print("Core ML inference successful!")
        print(f"Output shape: {coreml_output['output'].size}")

        return True

    except Exception as e:
        print(f"Verification failed: {e}")
        return False


if __name__ == "__main__":
    # The weights are expected next to this script; the package is written to
    # ../Sources/LivePhotoCore/Resources relative to it.
    scripts_dir = Path(__file__).parent
    weights_file = scripts_dir / "RealESRGAN_x2plus.pth"
    resources_dir = scripts_dir.parent / "Sources" / "LivePhotoCore" / "Resources"

    # Bail out with download instructions when the checkpoint is missing.
    if not weights_file.exists():
        for message in (
            f"ERROR: Weights file not found at: {weights_file}",
            "\nPlease download the weights file from:",
            "https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.1/RealESRGAN_x2plus.pth",
            f"\nAnd place it in: {scripts_dir}",
        ):
            print(message)
        sys.exit(1)

    # Make sure the destination exists before converting.
    resources_dir.mkdir(parents=True, exist_ok=True)

    # Convert (quantization is handled separately, so it is disabled here).
    package_path = convert_to_coreml(
        str(weights_file),
        str(resources_dir),
        tile_size=128,
        scale=2,
        quantize=False,
    )

    # Verify; the boolean result is informational only.
    verify_conversion(str(package_path), str(weights_file))

    for message in (
        "\n=== Conversion Complete ===",
        f"Output: {package_path}",
        "\nNext steps:",
        "1. Open Xcode and add the .mlpackage to your project",
        "2. Xcode will compile it to .mlmodelc automatically",
        "3. Or compile manually: xcrun coremlcompiler compile RealESRGAN_x2plus.mlpackage .",
    ):
        print(message)