rename to iopaint

Qing
2024-01-05 15:19:23 +08:00
parent f1f18aa6cd
commit a73e2a531f
101 changed files with 180 additions and 253 deletions

iopaint/model/__init__.py Normal file

@@ -0,0 +1,35 @@
from .controlnet import ControlNet
from .fcf import FcF
from .instruct_pix2pix import InstructPix2Pix
from .kandinsky import Kandinsky22
from .lama import LaMa
from .ldm import LDM
from .manga import Manga
from .mat import MAT
from .mi_gan import MIGAN
from .opencv2 import OpenCV2
from .paint_by_example import PaintByExample
from .power_paint.power_paint import PowerPaint
from .sd import SD15, SD2, Anything4, RealisticVision14, SD
from .sdxl import SDXL
from .zits import ZITS
models = {
LaMa.name: LaMa,
LDM.name: LDM,
ZITS.name: ZITS,
MAT.name: MAT,
FcF.name: FcF,
OpenCV2.name: OpenCV2,
Manga.name: Manga,
MIGAN.name: MIGAN,
SD15.name: SD15,
Anything4.name: Anything4,
RealisticVision14.name: RealisticVision14,
SD2.name: SD2,
PaintByExample.name: PaintByExample,
InstructPix2Pix.name: InstructPix2Pix,
Kandinsky22.name: Kandinsky22,
SDXL.name: SDXL,
PowerPaint.name: PowerPaint,
}
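
# The registry above keys each model class by its name attribute. A minimal
# lookup sketch; build_model and the "cuda" device string are illustrative,
# not part of this commit:
from iopaint.model import models

def build_model(name: str, device, **kwargs):
    # Unknown names raise KeyError; callers may want a friendlier error.
    model_cls = models[name]
    return model_cls(device=device, **kwargs)

# model = build_model("lama", device="cuda")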

iopaint/model/base.py Normal file

@@ -0,0 +1,422 @@
import abc
from typing import Optional
import cv2
import torch
import numpy as np
from loguru import logger
from iopaint.helper import (
boxes_from_mask,
resize_max_size,
pad_img_to_modulo,
switch_mps_device,
)
from iopaint.model.helper.g_diffuser_bot import expand_image
from iopaint.model.utils import get_scheduler
from iopaint.schema import InpaintRequest, HDStrategy, SDSampler
class InpaintModel:
name = "base"
min_size: Optional[int] = None
pad_mod = 8
pad_to_square = False
is_erase_model = False
def __init__(self, device, **kwargs):
"""
Args:
device:
"""
device = switch_mps_device(self.name, device)
self.device = device
self.init_model(device, **kwargs)
@abc.abstractmethod
def init_model(self, device, **kwargs):
...
@staticmethod
@abc.abstractmethod
def is_downloaded() -> bool:
return False
@abc.abstractmethod
def forward(self, image, mask, config: InpaintRequest):
"""Input images and output images have same size
images: [H, W, C] RGB
masks: [H, W, 1], 255 marks the mask area
return: BGR IMAGE
"""
...
@staticmethod
def download():
...
def _pad_forward(self, image, mask, config: InpaintRequest):
origin_height, origin_width = image.shape[:2]
pad_image = pad_img_to_modulo(
image, mod=self.pad_mod, square=self.pad_to_square, min_size=self.min_size
)
pad_mask = pad_img_to_modulo(
mask, mod=self.pad_mod, square=self.pad_to_square, min_size=self.min_size
)
# logger.info(f"final forward pad size: {pad_image.shape}")
image, mask = self.forward_pre_process(image, mask, config)
result = self.forward(pad_image, pad_mask, config)
result = result[0:origin_height, 0:origin_width, :]
result, image, mask = self.forward_post_process(result, image, mask, config)
if config.sd_keep_unmasked_area:
mask = mask[:, :, np.newaxis]
result = result * (mask / 255) + image[:, :, ::-1] * (1 - (mask / 255))
return result
def forward_pre_process(self, image, mask, config):
return image, mask
def forward_post_process(self, result, image, mask, config):
return result, image, mask
@torch.no_grad()
def __call__(self, image, mask, config: InpaintRequest):
"""
images: [H, W, C] RGB, not normalized
masks: [H, W]
return: BGR IMAGE
"""
inpaint_result = None
# logger.info(f"hd_strategy: {config.hd_strategy}")
if config.hd_strategy == HDStrategy.CROP:
if max(image.shape) > config.hd_strategy_crop_trigger_size:
logger.info(f"Run crop strategy")
boxes = boxes_from_mask(mask)
crop_result = []
for box in boxes:
crop_image, crop_box = self._run_box(image, mask, box, config)
crop_result.append((crop_image, crop_box))
inpaint_result = image[:, :, ::-1]
for crop_image, crop_box in crop_result:
x1, y1, x2, y2 = crop_box
inpaint_result[y1:y2, x1:x2, :] = crop_image
elif config.hd_strategy == HDStrategy.RESIZE:
if max(image.shape) > config.hd_strategy_resize_limit:
origin_size = image.shape[:2]
downsize_image = resize_max_size(
image, size_limit=config.hd_strategy_resize_limit
)
downsize_mask = resize_max_size(
mask, size_limit=config.hd_strategy_resize_limit
)
logger.info(
f"Run resize strategy, origin size: {image.shape} forward size: {downsize_image.shape}"
)
inpaint_result = self._pad_forward(
downsize_image, downsize_mask, config
)
# only paste masked area result
inpaint_result = cv2.resize(
inpaint_result,
(origin_size[1], origin_size[0]),
interpolation=cv2.INTER_CUBIC,
)
original_pixel_indices = mask < 127
inpaint_result[original_pixel_indices] = image[:, :, ::-1][
original_pixel_indices
]
if inpaint_result is None:
inpaint_result = self._pad_forward(image, mask, config)
return inpaint_result
def _crop_box(self, image, mask, box, config: InpaintRequest):
"""
Args:
image: [H, W, C] RGB
mask: [H, W, 1]
box: [left,top,right,bottom]
Returns:
crop_img, crop_mask, [l, t, r, b]
"""
box_h = box[3] - box[1]
box_w = box[2] - box[0]
cx = (box[0] + box[2]) // 2
cy = (box[1] + box[3]) // 2
img_h, img_w = image.shape[:2]
w = box_w + config.hd_strategy_crop_margin * 2
h = box_h + config.hd_strategy_crop_margin * 2
_l = cx - w // 2
_r = cx + w // 2
_t = cy - h // 2
_b = cy + h // 2
l = max(_l, 0)
r = min(_r, img_w)
t = max(_t, 0)
b = min(_b, img_h)
# try to get more context when crop around image edge
if _l < 0:
r += abs(_l)
if _r > img_w:
l -= _r - img_w
if _t < 0:
b += abs(_t)
if _b > img_h:
t -= _b - img_h
l = max(l, 0)
r = min(r, img_w)
t = max(t, 0)
b = min(b, img_h)
crop_img = image[t:b, l:r, :]
crop_mask = mask[t:b, l:r]
# logger.info(f"box size: ({box_h},{box_w}) crop size: {crop_img.shape}")
return crop_img, crop_mask, [l, t, r, b]
def _calculate_cdf(self, histogram):
cdf = histogram.cumsum()
normalized_cdf = cdf / float(cdf.max())
return normalized_cdf
def _calculate_lookup(self, source_cdf, reference_cdf):
lookup_table = np.zeros(256)
lookup_val = 0
for source_index, source_val in enumerate(source_cdf):
for reference_index, reference_val in enumerate(reference_cdf):
if reference_val >= source_val:
lookup_val = reference_index
break
lookup_table[source_index] = lookup_val
return lookup_table
def _match_histograms(self, source, reference, mask):
transformed_channels = []
for channel in range(source.shape[-1]):
source_channel = source[:, :, channel]
reference_channel = reference[:, :, channel]
# only calculate histograms for non-masked parts
source_histogram, _ = np.histogram(source_channel[mask == 0], 256, [0, 256])
reference_histogram, _ = np.histogram(
reference_channel[mask == 0], 256, [0, 256]
)
source_cdf = self._calculate_cdf(source_histogram)
reference_cdf = self._calculate_cdf(reference_histogram)
lookup = self._calculate_lookup(source_cdf, reference_cdf)
transformed_channels.append(cv2.LUT(source_channel, lookup))
result = cv2.merge(transformed_channels)
result = cv2.convertScaleAbs(result)
return result
def _apply_cropper(self, image, mask, config: InpaintRequest):
img_h, img_w = image.shape[:2]
l, t, w, h = (
config.croper_x,
config.croper_y,
config.croper_width,
config.croper_height,
)
r = l + w
b = t + h
l = max(l, 0)
r = min(r, img_w)
t = max(t, 0)
b = min(b, img_h)
crop_img = image[t:b, l:r, :]
crop_mask = mask[t:b, l:r]
return crop_img, crop_mask, (l, t, r, b)
def _run_box(self, image, mask, box, config: InpaintRequest):
"""
Args:
image: [H, W, C] RGB
mask: [H, W, 1]
box: [left,top,right,bottom]
Returns:
BGR IMAGE
"""
crop_img, crop_mask, [l, t, r, b] = self._crop_box(image, mask, box, config)
return self._pad_forward(crop_img, crop_mask, config), [l, t, r, b]
class DiffusionInpaintModel(InpaintModel):
def __init__(self, device, **kwargs):
self.model_info = kwargs["model_info"]
self.model_id_or_path = self.model_info.path
super().__init__(device, **kwargs)
@torch.no_grad()
def __call__(self, image, mask, config: InpaintRequest):
"""
images: [H, W, C] RGB, not normalized
masks: [H, W]
return: BGR IMAGE
"""
# boxes = boxes_from_mask(mask)
if config.use_croper:
crop_img, crop_mask, (l, t, r, b) = self._apply_cropper(image, mask, config)
crop_image = self._scaled_pad_forward(crop_img, crop_mask, config)
inpaint_result = image[:, :, ::-1]
inpaint_result[t:b, l:r, :] = crop_image
elif config.use_extender:
inpaint_result = self._do_outpainting(image, config)
else:
inpaint_result = self._scaled_pad_forward(image, mask, config)
return inpaint_result
def _do_outpainting(self, image, config: InpaintRequest):
# The cropper and the image share the same coordinate system; croper_x/y may be negative
# Crop the outpainting region out of the image
image_h, image_w = image.shape[:2]
cropper_l = config.extender_x
cropper_t = config.extender_y
cropper_r = config.extender_x + config.extender_width
cropper_b = config.extender_y + config.extender_height
image_l = 0
image_t = 0
image_r = image_w
image_b = image_h
# similar to computing an IoU overlap
l = max(cropper_l, image_l)
t = max(cropper_t, image_t)
r = min(cropper_r, image_r)
b = min(cropper_b, image_b)
assert (
0 <= l < r and 0 <= t < b
), f"cropper and image not overlap, {l},{t},{r},{b}"
cropped_image = image[t:b, l:r, :]
padding_l = max(0, image_l - cropper_l)
padding_t = max(0, image_t - cropper_t)
padding_r = max(0, cropper_r - image_r)
padding_b = max(0, cropper_b - image_b)
zero_padding_count = [padding_l, padding_t, padding_r, padding_b].count(0)
if zero_padding_count not in [0, 3]:
logger.warning(
f"padding count({zero_padding_count}) not 0 or 3, may result in bad edge outpainting"
)
expanded_image, mask_image = expand_image(
cropped_image,
left=padding_l,
top=padding_t,
right=padding_r,
bottom=padding_b,
softness=config.sd_outpainting_softness,
space=config.sd_outpainting_space,
)
# the final expanded image, BGR
expanded_cropped_result_image = self._scaled_pad_forward(
expanded_image, mask_image, config
)
# RGB -> BGR
outpainting_image = cv2.copyMakeBorder(
image,
left=padding_l,
top=padding_t,
right=padding_r,
bottom=padding_b,
borderType=cv2.BORDER_CONSTANT,
value=0,
)[:, :, ::-1]
# Paste cropped_result_image onto outpainting_image; no blending is needed for this step
paste_t = 0 if config.extender_y < 0 else config.extender_y
paste_l = 0 if config.extender_x < 0 else config.extender_x
outpainting_image[
paste_t : paste_t + expanded_cropped_result_image.shape[0],
paste_l : paste_l + expanded_cropped_result_image.shape[1],
:,
] = expanded_cropped_result_image
return outpainting_image
def _scaled_pad_forward(self, image, mask, config: InpaintRequest):
longer_side_length = int(config.sd_scale * max(image.shape[:2]))
origin_size = image.shape[:2]
downsize_image = resize_max_size(image, size_limit=longer_side_length)
downsize_mask = resize_max_size(mask, size_limit=longer_side_length)
if config.sd_scale != 1:
logger.info(
f"Resize image to do sd inpainting: {image.shape} -> {downsize_image.shape}"
)
inpaint_result = self._pad_forward(downsize_image, downsize_mask, config)
# only paste masked area result
inpaint_result = cv2.resize(
inpaint_result,
(origin_size[1], origin_size[0]),
interpolation=cv2.INTER_CUBIC,
)
# blend result, copy from g_diffuser_bot
# mask_rgb = 1.0 - np_img_grey_to_rgb(mask / 255.0)
# inpaint_result = np.clip(
# inpaint_result * (1.0 - mask_rgb) + image * mask_rgb, 0.0, 255.0
# )
# original_pixel_indices = mask < 127
# inpaint_result[original_pixel_indices] = image[:, :, ::-1][
# original_pixel_indices
# ]
return inpaint_result
def set_scheduler(self, config: InpaintRequest):
scheduler_config = self.model.scheduler.config
sd_sampler = config.sd_sampler
if config.sd_lcm_lora:
sd_sampler = SDSampler.lcm
logger.info(f"LCM Lora enabled, use {sd_sampler} sampler")
scheduler = get_scheduler(sd_sampler, scheduler_config)
self.model.scheduler = scheduler
def forward_pre_process(self, image, mask, config):
if config.sd_mask_blur != 0:
k = 2 * config.sd_mask_blur + 1
mask = cv2.GaussianBlur(mask, (k, k), 0)[:, :, np.newaxis]
return image, mask
def forward_post_process(self, result, image, mask, config):
if config.sd_match_histograms:
result = self._match_histograms(result, image[:, :, ::-1], mask)
if config.sd_mask_blur != 0:
k = 2 * config.sd_mask_blur + 1
mask = cv2.GaussianBlur(mask, (k, k), 0)
return result, image, mask
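
# InpaintModel is a template class: __call__ handles the HD strategies,
# padding, and result pasting, while subclasses supply the class attributes
# plus init_model and forward. A toy subclass sketch under that contract
# (FillGray is hypothetical, not part of this commit):
from iopaint.model.base import InpaintModel
from iopaint.schema import InpaintRequest

class FillGray(InpaintModel):
    """Toy eraser: paints the masked region flat gray."""
    name = "fill_gray"
    pad_mod = 8
    is_erase_model = True

    def init_model(self, device, **kwargs):
        pass  # nothing to load

    @staticmethod
    def is_downloaded() -> bool:
        return True  # no weights required

    def forward(self, image, mask, config: InpaintRequest):
        # image: [H, W, C] RGB, mask: [H, W, 1] with 255 = repaint area
        result = image[:, :, ::-1].copy()  # RGB -> BGR, as the contract requires
        result[mask[:, :, 0] > 127] = 127
        return result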

iopaint/model/controlnet.py Normal file

@@ -0,0 +1,166 @@
import PIL.Image
import cv2
import numpy as np
import torch
from diffusers import ControlNetModel, DiffusionPipeline
from loguru import logger
from iopaint.model.base import DiffusionInpaintModel
from iopaint.model.helper.controlnet_preprocess import (
make_canny_control_image,
make_openpose_control_image,
make_depth_control_image,
make_inpaint_control_image,
)
from iopaint.model.helper.cpu_text_encoder import CPUTextEncoderWrapper
from iopaint.model.utils import get_scheduler, handle_from_pretrained_exceptions
from iopaint.schema import InpaintRequest, ModelType
class ControlNet(DiffusionInpaintModel):
name = "controlnet"
pad_mod = 8
min_size = 512
@property
def lcm_lora_id(self):
if self.model_info.model_type in [
ModelType.DIFFUSERS_SD,
ModelType.DIFFUSERS_SD_INPAINT,
]:
return "latent-consistency/lcm-lora-sdv1-5"
if self.model_info.model_type in [
ModelType.DIFFUSERS_SDXL,
ModelType.DIFFUSERS_SDXL_INPAINT,
]:
return "latent-consistency/lcm-lora-sdxl"
raise NotImplementedError(f"Unsupported controlnet lcm model {self.model_info}")
def init_model(self, device: torch.device, **kwargs):
fp16 = not kwargs.get("no_half", False)
model_info = kwargs["model_info"]
controlnet_method = kwargs["controlnet_method"]
self.model_info = model_info
self.controlnet_method = controlnet_method
model_kwargs = {}
if kwargs["disable_nsfw"] or kwargs.get("cpu_offload", False):
logger.info("Disable Stable Diffusion Model NSFW checker")
model_kwargs.update(
dict(
safety_checker=None,
feature_extractor=None,
requires_safety_checker=False,
)
)
use_gpu = device == torch.device("cuda") and torch.cuda.is_available()
torch_dtype = torch.float16 if use_gpu and fp16 else torch.float32
self.torch_dtype = torch_dtype
if model_info.model_type in [
ModelType.DIFFUSERS_SD,
ModelType.DIFFUSERS_SD_INPAINT,
]:
from diffusers import (
StableDiffusionControlNetInpaintPipeline as PipeClass,
)
elif model_info.model_type in [
ModelType.DIFFUSERS_SDXL,
ModelType.DIFFUSERS_SDXL_INPAINT,
]:
from diffusers import (
StableDiffusionXLControlNetInpaintPipeline as PipeClass,
)
controlnet = ControlNetModel.from_pretrained(
pretrained_model_name_or_path=controlnet_method,
resume_download=True,
)
if model_info.is_single_file_diffusers:
if self.model_info.model_type == ModelType.DIFFUSERS_SD:
model_kwargs["num_in_channels"] = 4
else:
model_kwargs["num_in_channels"] = 9
self.model = PipeClass.from_single_file(
model_info.path, controlnet=controlnet, **model_kwargs
).to(torch_dtype)
else:
self.model = handle_from_pretrained_exceptions(
PipeClass.from_pretrained,
pretrained_model_name_or_path=model_info.path,
controlnet=controlnet,
variant="fp16",
dtype=torch_dtype,
**model_kwargs,
)
if kwargs.get("cpu_offload", False) and use_gpu:
logger.info("Enable sequential cpu offload")
self.model.enable_sequential_cpu_offload(gpu_id=0)
else:
self.model = self.model.to(device)
if kwargs["sd_cpu_textencoder"]:
logger.info("Run Stable Diffusion TextEncoder on CPU")
self.model.text_encoder = CPUTextEncoderWrapper(
self.model.text_encoder, torch_dtype
)
self.callback = kwargs.pop("callback", None)
def switch_controlnet_method(self, new_method: str):
self.controlnet_method = new_method
controlnet = ControlNetModel.from_pretrained(
new_method, torch_dtype=self.torch_dtype, resume_download=True
).to(self.model.device)
self.model.controlnet = controlnet
def _get_control_image(self, image, mask):
if "canny" in self.controlnet_method:
control_image = make_canny_control_image(image)
elif "openpose" in self.controlnet_method:
control_image = make_openpose_control_image(image)
elif "depth" in self.controlnet_method:
control_image = make_depth_control_image(image)
elif "inpaint" in self.controlnet_method:
control_image = make_inpaint_control_image(image, mask)
else:
raise NotImplementedError(f"{self.controlnet_method} not implemented")
return control_image
def forward(self, image, mask, config: InpaintRequest):
"""Input image and output image have same size
image: [H, W, C] RGB
mask: [H, W, 1] 255 means area to repaint
return: BGR IMAGE
"""
scheduler_config = self.model.scheduler.config
scheduler = get_scheduler(config.sd_sampler, scheduler_config)
self.model.scheduler = scheduler
img_h, img_w = image.shape[:2]
control_image = self._get_control_image(image, mask)
mask_image = PIL.Image.fromarray(mask[:, :, -1], mode="L")
image = PIL.Image.fromarray(image)
output = self.model(
image=image,
mask_image=mask_image,
control_image=control_image,
prompt=config.prompt,
negative_prompt=config.negative_prompt,
num_inference_steps=config.sd_steps,
guidance_scale=config.sd_guidance_scale,
output_type="np",
callback_on_step_end=self.callback,
height=img_h,
width=img_w,
generator=torch.manual_seed(config.sd_seed),
controlnet_conditioning_scale=config.controlnet_conditioning_scale,
).images[0]
output = (output * 255).round().astype("uint8")
output = cv2.cvtColor(output, cv2.COLOR_RGB2BGR)
return output
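
# A note on num_in_channels above: for single-file checkpoints it tells
# diffusers the width of the UNet's first convolution. A plain SD UNet
# denoises 4 latent channels, while an SD inpainting UNet takes 9
# (4 noisy-latent + 4 masked-image-latent + 1 mask channel). A sketch of
# that distinction (unet_in_channels is a hypothetical helper):
def unet_in_channels(is_inpaint_unet: bool) -> int:
    # 4 noisy latent channels, plus (for inpaint UNets) 4 masked-image
    # latent channels and 1 binary mask channel = 9
    return 9 if is_inpaint_unet else 4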

iopaint/model/ddim_sampler.py Normal file

@@ -0,0 +1,193 @@
import torch
import numpy as np
from tqdm import tqdm
from iopaint.model.utils import make_ddim_timesteps, make_ddim_sampling_parameters, noise_like
from loguru import logger
class DDIMSampler(object):
def __init__(self, model, schedule="linear"):
super().__init__()
self.model = model
self.ddpm_num_timesteps = model.num_timesteps
self.schedule = schedule
def register_buffer(self, name, attr):
setattr(self, name, attr)
def make_schedule(
self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0.0, verbose=True
):
self.ddim_timesteps = make_ddim_timesteps(
ddim_discr_method=ddim_discretize,
num_ddim_timesteps=ddim_num_steps,
# array([1])
num_ddpm_timesteps=self.ddpm_num_timesteps,
verbose=verbose,
)
alphas_cumprod = self.model.alphas_cumprod # torch.Size([1000])
assert (
alphas_cumprod.shape[0] == self.ddpm_num_timesteps
), "alphas have to be defined for each timestep"
to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
self.register_buffer("betas", to_torch(self.model.betas))
self.register_buffer("alphas_cumprod", to_torch(alphas_cumprod))
self.register_buffer(
"alphas_cumprod_prev", to_torch(self.model.alphas_cumprod_prev)
)
# calculations for diffusion q(x_t | x_{t-1}) and others
self.register_buffer(
"sqrt_alphas_cumprod", to_torch(np.sqrt(alphas_cumprod.cpu()))
)
self.register_buffer(
"sqrt_one_minus_alphas_cumprod",
to_torch(np.sqrt(1.0 - alphas_cumprod.cpu())),
)
self.register_buffer(
"log_one_minus_alphas_cumprod", to_torch(np.log(1.0 - alphas_cumprod.cpu()))
)
self.register_buffer(
"sqrt_recip_alphas_cumprod", to_torch(np.sqrt(1.0 / alphas_cumprod.cpu()))
)
self.register_buffer(
"sqrt_recipm1_alphas_cumprod",
to_torch(np.sqrt(1.0 / alphas_cumprod.cpu() - 1)),
)
# ddim sampling parameters
ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(
alphacums=alphas_cumprod.cpu(),
ddim_timesteps=self.ddim_timesteps,
eta=ddim_eta,
verbose=verbose,
)
self.register_buffer("ddim_sigmas", ddim_sigmas)
self.register_buffer("ddim_alphas", ddim_alphas)
self.register_buffer("ddim_alphas_prev", ddim_alphas_prev)
self.register_buffer("ddim_sqrt_one_minus_alphas", np.sqrt(1.0 - ddim_alphas))
sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
(1 - self.alphas_cumprod_prev)
/ (1 - self.alphas_cumprod)
* (1 - self.alphas_cumprod / self.alphas_cumprod_prev)
)
self.register_buffer(
"ddim_sigmas_for_original_num_steps", sigmas_for_original_sampling_steps
)
@torch.no_grad()
def sample(self, steps, conditioning, batch_size, shape):
self.make_schedule(ddim_num_steps=steps, ddim_eta=0, verbose=False)
# sampling
C, H, W = shape
size = (batch_size, C, H, W)
# samples: 1,3,128,128
return self.ddim_sampling(
conditioning,
size,
quantize_denoised=False,
ddim_use_original_steps=False,
noise_dropout=0,
temperature=1.0,
)
@torch.no_grad()
def ddim_sampling(
self,
cond,
shape,
ddim_use_original_steps=False,
quantize_denoised=False,
temperature=1.0,
noise_dropout=0.0,
):
device = self.model.betas.device
b = shape[0]
img = torch.randn(shape, device=device, dtype=cond.dtype)
timesteps = (
self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
)
time_range = (
reversed(range(0, timesteps))
if ddim_use_original_steps
else np.flip(timesteps)
)
total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
logger.info(f"Running DDIM Sampling with {total_steps} timesteps")
iterator = tqdm(time_range, desc="DDIM Sampler", total=total_steps)
for i, step in enumerate(iterator):
index = total_steps - i - 1
ts = torch.full((b,), step, device=device, dtype=torch.long)
outs = self.p_sample_ddim(
img,
cond,
ts,
index=index,
use_original_steps=ddim_use_original_steps,
quantize_denoised=quantize_denoised,
temperature=temperature,
noise_dropout=noise_dropout,
)
img, _ = outs
return img
@torch.no_grad()
def p_sample_ddim(
self,
x,
c,
t,
index,
repeat_noise=False,
use_original_steps=False,
quantize_denoised=False,
temperature=1.0,
noise_dropout=0.0,
):
b, *_, device = *x.shape, x.device
e_t = self.model.apply_model(x, t, c)
alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
alphas_prev = (
self.model.alphas_cumprod_prev
if use_original_steps
else self.ddim_alphas_prev
)
sqrt_one_minus_alphas = (
self.model.sqrt_one_minus_alphas_cumprod
if use_original_steps
else self.ddim_sqrt_one_minus_alphas
)
sigmas = (
self.model.ddim_sigmas_for_original_num_steps
if use_original_steps
else self.ddim_sigmas
)
# select parameters corresponding to the currently considered timestep
a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
sqrt_one_minus_at = torch.full(
(b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device
)
# current prediction for x_0
pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
if quantize_denoised:  # unused
pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
# direction pointing to x_t
dir_xt = (1.0 - a_prev - sigma_t ** 2).sqrt() * e_t
noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
if noise_dropout > 0.0:  # unused
noise = torch.nn.functional.dropout(noise, p=noise_dropout)
x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
return x_prev, pred_x0
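
# Since sample() above calls make_schedule with ddim_eta=0, sigma_t is zero
# and each p_sample_ddim step is deterministic. A standalone numpy sketch of
# that update rule, using the same a_t/a_prev notation:
import numpy as np

def ddim_step(x_t, e_t, a_t, a_prev):
    # current prediction for x_0 from the noise estimate e_t
    pred_x0 = (x_t - np.sqrt(1.0 - a_t) * e_t) / np.sqrt(a_t)
    # direction pointing to x_t (sigma_t = 0, so no stochastic noise term)
    dir_xt = np.sqrt(1.0 - a_prev) * e_t
    return np.sqrt(a_prev) * pred_x0 + dir_xt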

iopaint/model/fcf.py Normal file

File diff suppressed because it is too large

iopaint/model/helper/controlnet_preprocess.py Normal file

@@ -0,0 +1,46 @@
import torch
import PIL
import cv2
from PIL import Image
import numpy as np
def make_canny_control_image(image: np.ndarray) -> Image:
canny_image = cv2.Canny(image, 100, 200)
canny_image = canny_image[:, :, None]
canny_image = np.concatenate([canny_image, canny_image, canny_image], axis=2)
canny_image = PIL.Image.fromarray(canny_image)
control_image = canny_image
return control_image
def make_openpose_control_image(image: np.ndarray) -> Image:
from controlnet_aux import OpenposeDetector
processor = OpenposeDetector.from_pretrained("lllyasviel/ControlNet")
control_image = processor(image, hand_and_face=True)
return control_image
def make_depth_control_image(image: np.ndarray) -> Image:
from transformers import pipeline
depth_estimator = pipeline("depth-estimation")
depth_image = depth_estimator(PIL.Image.fromarray(image))["depth"]
depth_image = np.array(depth_image)
depth_image = depth_image[:, :, None]
depth_image = np.concatenate([depth_image, depth_image, depth_image], axis=2)
control_image = PIL.Image.fromarray(depth_image)
return control_image
def make_inpaint_control_image(image: np.ndarray, mask: np.ndarray) -> torch.Tensor:
"""
image: [H, W, C] RGB
mask: [H, W, 1] 255 means area to repaint
"""
image = image.astype(np.float32) / 255.0
image[mask[:, :, -1] > 128] = -1.0 # set as masked pixel
image = np.expand_dims(image, 0).transpose(0, 3, 1, 2)
image = torch.from_numpy(image)
return image
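
# A quick shape check for make_inpaint_control_image on synthetic data
# (sizes and values are illustrative):
import numpy as np
from iopaint.model.helper.controlnet_preprocess import make_inpaint_control_image

image = np.zeros((64, 64, 3), dtype=np.uint8)
mask = np.zeros((64, 64, 1), dtype=np.uint8)
mask[16:48, 16:48] = 255
control = make_inpaint_control_image(image, mask)
print(control.shape)  # torch.Size([1, 3, 64, 64]); masked pixels are -1.0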

iopaint/model/helper/cpu_text_encoder.py Normal file

@@ -0,0 +1,25 @@
import torch
from iopaint.model.utils import torch_gc
class CPUTextEncoderWrapper(torch.nn.Module):
def __init__(self, text_encoder, torch_dtype):
super().__init__()
self.config = text_encoder.config
self.text_encoder = text_encoder.to(torch.device("cpu"), non_blocking=True)
self.text_encoder = self.text_encoder.to(torch.float32, non_blocking=True)
self.torch_dtype = torch_dtype
del text_encoder
torch_gc()
def __call__(self, x, **kwargs):
input_device = x.device
return [
self.text_encoder(x.to(self.text_encoder.device), **kwargs)[0]
.to(input_device)
.to(self.torch_dtype)
]
@property
def dtype(self):
return self.torch_dtype
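
# A tiny self-test sketch for the wrapper defined above; DummyEncoder is
# hypothetical and only mimics the Hugging Face text-encoder interface the
# wrapper relies on (a config attribute, a device, a tuple-returning forward):
import torch

class DummyEncoder(torch.nn.Module):
    config = {}

    @property
    def device(self):
        return torch.device("cpu")

    def forward(self, x, **kwargs):
        return (x.float(),)

wrapped = CPUTextEncoderWrapper(DummyEncoder(), torch.float16)
(out,) = wrapped(torch.ones(1, 4, dtype=torch.long))
print(out.dtype)  # torch.float16, cast back for the rest of the fp16 pipeline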

iopaint/model/helper/g_diffuser_bot.py Normal file

@@ -0,0 +1,167 @@
# code copy from: https://github.com/parlance-zz/g-diffuser-bot
import cv2
import numpy as np
def np_img_grey_to_rgb(data):
if data.ndim == 3:
return data
return np.expand_dims(data, 2) * np.ones((1, 1, 3))
def convolve(data1, data2): # fast convolution with fft
if data1.ndim != data2.ndim: # promote to rgb if mismatch
if data1.ndim < 3:
data1 = np_img_grey_to_rgb(data1)
if data2.ndim < 3:
data2 = np_img_grey_to_rgb(data2)
return ifft2(fft2(data1) * fft2(data2))
def fft2(data):
if data.ndim > 2: # multiple channels
out_fft = np.zeros(
(data.shape[0], data.shape[1], data.shape[2]), dtype=np.complex128
)
for c in range(data.shape[2]):
c_data = data[:, :, c]
out_fft[:, :, c] = np.fft.fft2(np.fft.fftshift(c_data), norm="ortho")
out_fft[:, :, c] = np.fft.ifftshift(out_fft[:, :, c])
else: # single channel
out_fft = np.zeros((data.shape[0], data.shape[1]), dtype=np.complex128)
out_fft[:, :] = np.fft.fft2(np.fft.fftshift(data), norm="ortho")
out_fft[:, :] = np.fft.ifftshift(out_fft[:, :])
return out_fft
def ifft2(data):
if data.ndim > 2: # multiple channels
out_ifft = np.zeros(
(data.shape[0], data.shape[1], data.shape[2]), dtype=np.complex128
)
for c in range(data.shape[2]):
c_data = data[:, :, c]
out_ifft[:, :, c] = np.fft.ifft2(np.fft.fftshift(c_data), norm="ortho")
out_ifft[:, :, c] = np.fft.ifftshift(out_ifft[:, :, c])
else: # single channel
out_ifft = np.zeros((data.shape[0], data.shape[1]), dtype=np.complex128)
out_ifft[:, :] = np.fft.ifft2(np.fft.fftshift(data), norm="ortho")
out_ifft[:, :] = np.fft.ifftshift(out_ifft[:, :])
return out_ifft
def get_gradient_kernel(width, height, std=3.14, mode="linear"):
window_scale_x = float(
width / min(width, height)
) # for non-square aspect ratios we still want a circular kernel
window_scale_y = float(height / min(width, height))
if mode == "gaussian":
x = (np.arange(width) / width * 2.0 - 1.0) * window_scale_x
kx = np.exp(-x * x * std)
if window_scale_x != window_scale_y:
y = (np.arange(height) / height * 2.0 - 1.0) * window_scale_y
ky = np.exp(-y * y * std)
else:
y = x
ky = kx
return np.outer(kx, ky)
elif mode == "linear":
x = (np.arange(width) / width * 2.0 - 1.0) * window_scale_x
if window_scale_x != window_scale_y:
y = (np.arange(height) / height * 2.0 - 1.0) * window_scale_y
else:
y = x
return np.clip(1.0 - np.sqrt(np.add.outer(x * x, y * y)) * std / 3.14, 0.0, 1.0)
else:
raise Exception("Error: Unknown mode in get_gradient_kernel: {0}".format(mode))
def image_blur(data, std=3.14, mode="linear"):
width = data.shape[0]
height = data.shape[1]
kernel = get_gradient_kernel(width, height, std, mode=mode)
return np.real(convolve(data, kernel / np.sqrt(np.sum(kernel * kernel))))
def soften_mask(mask_img, softness, space):
if softness == 0:
return mask_img
softness = min(softness, 1.0)
space = np.clip(space, 0.0, 1.0)
original_max_opacity = np.max(mask_img)
out_mask = mask_img <= 0.0
blurred_mask = image_blur(mask_img, 3.5 / softness, mode="linear")
blurred_mask = np.maximum(blurred_mask - np.max(blurred_mask[out_mask]), 0.0)
mask_img *= blurred_mask # preserve partial opacity in original input mask
mask_img /= np.max(mask_img) # renormalize
mask_img = np.clip(mask_img - space, 0.0, 1.0) # make space
mask_img /= np.max(mask_img) # and renormalize again
mask_img *= original_max_opacity # restore original max opacity
return mask_img
def expand_image(
cv2_img, top: int, right: int, bottom: int, left: int, softness: float, space: float
):
assert cv2_img.shape[2] == 3
origin_h, origin_w = cv2_img.shape[:2]
new_width = cv2_img.shape[1] + left + right
new_height = cv2_img.shape[0] + top + bottom
# TODO: which is better?
# new_img = np.random.randint(0, 255, (new_height, new_width, 3), np.uint8)
new_img = cv2.copyMakeBorder(
cv2_img, top, bottom, left, right, cv2.BORDER_REPLICATE
)
mask_img = np.zeros((new_height, new_width), np.uint8)
mask_img[top : top + cv2_img.shape[0], left : left + cv2_img.shape[1]] = 255
if softness > 0.0:
mask_img = soften_mask(mask_img / 255.0, softness / 100.0, space / 100.0)
mask_img = (np.clip(mask_img, 0.0, 1.0) * 255.0).astype(np.uint8)
mask_image = 255.0 - mask_img  # invert the soft mask
rgb_init_image = (
0.0 + new_img[:, :, 0:3]
) # strip mask from init_img leaving only rgb channels
hard_mask = np.zeros_like(cv2_img[:, :, 0])
if top != 0:
hard_mask[0 : origin_h // 2, :] = 255
if bottom != 0:
hard_mask[origin_h // 2 :, :] = 255
if left != 0:
hard_mask[:, 0 : origin_w // 2] = 255
if right != 0:
hard_mask[:, origin_w // 2 :] = 255
hard_mask = cv2.copyMakeBorder(
hard_mask, top, bottom, left, right, cv2.BORDER_DEFAULT, value=255
)
mask_image = np.where(hard_mask > 0, mask_image, 0)
return rgb_init_image.astype(np.uint8), mask_image.astype(np.uint8)
if __name__ == "__main__":
from pathlib import Path
current_dir = Path(__file__).parent.absolute().resolve()
image_path = current_dir.parent / "tests" / "bunny.jpeg"
init_image = cv2.imread(str(image_path))
init_image, mask_image = expand_image(
init_image,
top=100,
right=100,
bottom=100,
left=100,
softness=20,
space=20,
)
print(mask_image.dtype, mask_image.min(), mask_image.max())
print(init_image.dtype, init_image.min(), init_image.max())
mask_image = mask_image.astype(np.uint8)
init_image = init_image.astype(np.uint8)
cv2.imwrite("expanded_image.png", init_image)
cv2.imwrite("expanded_mask.png", mask_image)

iopaint/model/instruct_pix2pix.py Normal file

@@ -0,0 +1,63 @@
import PIL.Image
import cv2
import torch
from loguru import logger
from iopaint.model.base import DiffusionInpaintModel
from iopaint.schema import InpaintRequest
class InstructPix2Pix(DiffusionInpaintModel):
name = "timbrooks/instruct-pix2pix"
pad_mod = 8
min_size = 512
def init_model(self, device: torch.device, **kwargs):
from diffusers import StableDiffusionInstructPix2PixPipeline
fp16 = not kwargs.get("no_half", False)
model_kwargs = {}
if kwargs["disable_nsfw"] or kwargs.get("cpu_offload", False):
logger.info("Disable Stable Diffusion Model NSFW checker")
model_kwargs.update(
dict(
safety_checker=None,
feature_extractor=None,
requires_safety_checker=False,
)
)
use_gpu = device == torch.device("cuda") and torch.cuda.is_available()
torch_dtype = torch.float16 if use_gpu and fp16 else torch.float32
self.model = StableDiffusionInstructPix2PixPipeline.from_pretrained(
self.name, variant="fp16", torch_dtype=torch_dtype, **model_kwargs
)
if kwargs.get("cpu_offload", False) and use_gpu:
logger.info("Enable sequential cpu offload")
self.model.enable_sequential_cpu_offload(gpu_id=0)
else:
self.model = self.model.to(device)
def forward(self, image, mask, config: InpaintRequest):
"""Input image and output image have same size
image: [H, W, C] RGB
mask: [H, W, 1] 255 means area to repaint
return: BGR IMAGE
edit = pipe(prompt, image=image, num_inference_steps=20, image_guidance_scale=1.5, guidance_scale=7).images[0]
"""
output = self.model(
image=PIL.Image.fromarray(image),
prompt=config.prompt,
negative_prompt=config.negative_prompt,
num_inference_steps=config.sd_steps,
image_guidance_scale=config.p2p_image_guidance_scale,
guidance_scale=config.sd_guidance_scale,
output_type="np",
generator=torch.manual_seed(config.sd_seed),
).images[0]
output = (output * 255).round().astype("uint8")
output = cv2.cvtColor(output, cv2.COLOR_RGB2BGR)
return output

iopaint/model/kandinsky.py Normal file

@@ -0,0 +1,65 @@
import PIL.Image
import cv2
import numpy as np
import torch
from iopaint.model.base import DiffusionInpaintModel
from iopaint.model.utils import get_scheduler
from iopaint.schema import InpaintRequest
class Kandinsky(DiffusionInpaintModel):
pad_mod = 64
min_size = 512
def init_model(self, device: torch.device, **kwargs):
from diffusers import AutoPipelineForInpainting
fp16 = not kwargs.get("no_half", False)
use_gpu = device == torch.device("cuda") and torch.cuda.is_available()
torch_dtype = torch.float16 if use_gpu and fp16 else torch.float32
model_kwargs = {
"torch_dtype": torch_dtype,
}
self.model = AutoPipelineForInpainting.from_pretrained(
self.name, **model_kwargs
).to(device)
self.callback = kwargs.pop("callback", None)
def forward(self, image, mask, config: InpaintRequest):
"""Input image and output image have same size
image: [H, W, C] RGB
mask: [H, W, 1] 255 means area to repaint
return: BGR IMAGE
"""
self.set_scheduler(config)
generator = torch.manual_seed(config.sd_seed)
mask = mask.astype(np.float32) / 255
img_h, img_w = image.shape[:2]
# kandinsky has no strength parameter
output = self.model(
prompt=config.prompt,
negative_prompt=config.negative_prompt,
image=PIL.Image.fromarray(image),
mask_image=mask[:, :, 0],
height=img_h,
width=img_w,
num_inference_steps=config.sd_steps,
guidance_scale=config.sd_guidance_scale,
output_type="np",
callback_on_step_end=self.callback,
generator=generator,
).images[0]
output = (output * 255).round().astype("uint8")
output = cv2.cvtColor(output, cv2.COLOR_RGB2BGR)
return output
class Kandinsky22(Kandinsky):
name = "kandinsky-community/kandinsky-2-2-decoder-inpaint"

iopaint/model/lama.py Normal file

@@ -0,0 +1,57 @@
import os
import cv2
import numpy as np
import torch
from iopaint.helper import (
norm_img,
get_cache_path_by_url,
load_jit_model,
download_model,
)
from iopaint.model.base import InpaintModel
from iopaint.schema import InpaintRequest
LAMA_MODEL_URL = os.environ.get(
"LAMA_MODEL_URL",
"https://github.com/Sanster/models/releases/download/add_big_lama/big-lama.pt",
)
LAMA_MODEL_MD5 = os.environ.get("LAMA_MODEL_MD5", "e3aa4aaa15225a33ec84f9f4bc47e500")
class LaMa(InpaintModel):
name = "lama"
pad_mod = 8
is_erase_model = True
@staticmethod
def download():
download_model(LAMA_MODEL_URL, LAMA_MODEL_MD5)
def init_model(self, device, **kwargs):
self.model = load_jit_model(LAMA_MODEL_URL, device, LAMA_MODEL_MD5).eval()
@staticmethod
def is_downloaded() -> bool:
return os.path.exists(get_cache_path_by_url(LAMA_MODEL_URL))
def forward(self, image, mask, config: InpaintRequest):
"""Input image and output image have same size
image: [H, W, C] RGB
mask: [H, W]
return: BGR IMAGE
"""
image = norm_img(image)
mask = norm_img(mask)
mask = (mask > 0) * 1
image = torch.from_numpy(image).unsqueeze(0).to(self.device)
mask = torch.from_numpy(mask).unsqueeze(0).to(self.device)
inpainted_image = self.model(image, mask)
cur_res = inpainted_image[0].permute(1, 2, 0).detach().cpu().numpy()
cur_res = np.clip(cur_res * 255, 0, 255).astype("uint8")
cur_res = cv2.cvtColor(cur_res, cv2.COLOR_RGB2BGR)
return cur_res
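
# An end-to-end usage sketch; image.png/mask.png are hypothetical paths, and
# it assumes the weights above are downloaded and InpaintRequest's defaults
# are acceptable:
import cv2
import torch
from iopaint.model.lama import LaMa
from iopaint.schema import InpaintRequest

model = LaMa(device=torch.device("cpu"))
image = cv2.cvtColor(cv2.imread("image.png"), cv2.COLOR_BGR2RGB)  # [H, W, 3] RGB
mask = cv2.imread("mask.png", cv2.IMREAD_GRAYSCALE)               # [H, W], 255 = erase
result = model(image, mask, InpaintRequest())                     # BGR uint8
cv2.imwrite("result.png", result)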

iopaint/model/ldm.py Normal file

@@ -0,0 +1,336 @@
import os
import numpy as np
import torch
from loguru import logger
from iopaint.model.base import InpaintModel
from iopaint.model.ddim_sampler import DDIMSampler
from iopaint.model.plms_sampler import PLMSSampler
from iopaint.schema import InpaintRequest, LDMSampler
torch.manual_seed(42)
import torch.nn as nn
from iopaint.helper import (
download_model,
norm_img,
get_cache_path_by_url,
load_jit_model,
)
from iopaint.model.utils import (
make_beta_schedule,
timestep_embedding,
)
LDM_ENCODE_MODEL_URL = os.environ.get(
"LDM_ENCODE_MODEL_URL",
"https://github.com/Sanster/models/releases/download/add_ldm/cond_stage_model_encode.pt",
)
LDM_ENCODE_MODEL_MD5 = os.environ.get(
"LDM_ENCODE_MODEL_MD5", "23239fc9081956a3e70de56472b3f296"
)
LDM_DECODE_MODEL_URL = os.environ.get(
"LDM_DECODE_MODEL_URL",
"https://github.com/Sanster/models/releases/download/add_ldm/cond_stage_model_decode.pt",
)
LDM_DECODE_MODEL_MD5 = os.environ.get(
"LDM_DECODE_MODEL_MD5", "fe419cd15a750d37a4733589d0d3585c"
)
LDM_DIFFUSION_MODEL_URL = os.environ.get(
"LDM_DIFFUSION_MODEL_URL",
"https://github.com/Sanster/models/releases/download/add_ldm/diffusion.pt",
)
LDM_DIFFUSION_MODEL_MD5 = os.environ.get(
"LDM_DIFFUSION_MODEL_MD5", "b0afda12bf790c03aba2a7431f11d22d"
)
class DDPM(nn.Module):
# classic DDPM with Gaussian diffusion, in image space
def __init__(
self,
device,
timesteps=1000,
beta_schedule="linear",
linear_start=0.0015,
linear_end=0.0205,
cosine_s=0.008,
original_elbo_weight=0.0,
v_posterior=0.0, # weight for choosing posterior variance as sigma = (1-v) * beta_tilde + v * beta
l_simple_weight=1.0,
parameterization="eps", # all assuming fixed variance schedules
use_positional_encodings=False,
):
super().__init__()
self.device = device
self.parameterization = parameterization
self.use_positional_encodings = use_positional_encodings
self.v_posterior = v_posterior
self.original_elbo_weight = original_elbo_weight
self.l_simple_weight = l_simple_weight
self.register_schedule(
beta_schedule=beta_schedule,
timesteps=timesteps,
linear_start=linear_start,
linear_end=linear_end,
cosine_s=cosine_s,
)
def register_schedule(
self,
given_betas=None,
beta_schedule="linear",
timesteps=1000,
linear_start=1e-4,
linear_end=2e-2,
cosine_s=8e-3,
):
betas = make_beta_schedule(
self.device,
beta_schedule,
timesteps,
linear_start=linear_start,
linear_end=linear_end,
cosine_s=cosine_s,
)
alphas = 1.0 - betas
alphas_cumprod = np.cumprod(alphas, axis=0)
alphas_cumprod_prev = np.append(1.0, alphas_cumprod[:-1])
(timesteps,) = betas.shape
self.num_timesteps = int(timesteps)
self.linear_start = linear_start
self.linear_end = linear_end
assert (
alphas_cumprod.shape[0] == self.num_timesteps
), "alphas have to be defined for each timestep"
to_torch = lambda x: torch.tensor(x, dtype=torch.float32).to(self.device)
self.register_buffer("betas", to_torch(betas))
self.register_buffer("alphas_cumprod", to_torch(alphas_cumprod))
self.register_buffer("alphas_cumprod_prev", to_torch(alphas_cumprod_prev))
# calculations for diffusion q(x_t | x_{t-1}) and others
self.register_buffer("sqrt_alphas_cumprod", to_torch(np.sqrt(alphas_cumprod)))
self.register_buffer(
"sqrt_one_minus_alphas_cumprod", to_torch(np.sqrt(1.0 - alphas_cumprod))
)
self.register_buffer(
"log_one_minus_alphas_cumprod", to_torch(np.log(1.0 - alphas_cumprod))
)
self.register_buffer(
"sqrt_recip_alphas_cumprod", to_torch(np.sqrt(1.0 / alphas_cumprod))
)
self.register_buffer(
"sqrt_recipm1_alphas_cumprod", to_torch(np.sqrt(1.0 / alphas_cumprod - 1))
)
# calculations for posterior q(x_{t-1} | x_t, x_0)
posterior_variance = (1 - self.v_posterior) * betas * (
1.0 - alphas_cumprod_prev
) / (1.0 - alphas_cumprod) + self.v_posterior * betas
# above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)
self.register_buffer("posterior_variance", to_torch(posterior_variance))
# below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
self.register_buffer(
"posterior_log_variance_clipped",
to_torch(np.log(np.maximum(posterior_variance, 1e-20))),
)
self.register_buffer(
"posterior_mean_coef1",
to_torch(betas * np.sqrt(alphas_cumprod_prev) / (1.0 - alphas_cumprod)),
)
self.register_buffer(
"posterior_mean_coef2",
to_torch(
(1.0 - alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - alphas_cumprod)
),
)
if self.parameterization == "eps":
lvlb_weights = self.betas**2 / (
2
* self.posterior_variance
* to_torch(alphas)
* (1 - self.alphas_cumprod)
)
elif self.parameterization == "x0":
lvlb_weights = (
0.5
* np.sqrt(torch.Tensor(alphas_cumprod))
/ (2.0 * 1 - torch.Tensor(alphas_cumprod))
)
else:
raise NotImplementedError("mu not supported")
# TODO how to choose this term
lvlb_weights[0] = lvlb_weights[1]
self.register_buffer("lvlb_weights", lvlb_weights, persistent=False)
assert not torch.isnan(self.lvlb_weights).all()
class LatentDiffusion(DDPM):
def __init__(
self,
diffusion_model,
device,
cond_stage_key="image",
cond_stage_trainable=False,
concat_mode=True,
scale_factor=1.0,
scale_by_std=False,
*args,
**kwargs,
):
self.num_timesteps_cond = 1
self.scale_by_std = scale_by_std
super().__init__(device, *args, **kwargs)
self.diffusion_model = diffusion_model
self.concat_mode = concat_mode
self.cond_stage_trainable = cond_stage_trainable
self.cond_stage_key = cond_stage_key
self.num_downs = 2
self.scale_factor = scale_factor
def make_cond_schedule(
self,
):
self.cond_ids = torch.full(
size=(self.num_timesteps,),
fill_value=self.num_timesteps - 1,
dtype=torch.long,
)
ids = torch.round(
torch.linspace(0, self.num_timesteps - 1, self.num_timesteps_cond)
).long()
self.cond_ids[: self.num_timesteps_cond] = ids
def register_schedule(
self,
given_betas=None,
beta_schedule="linear",
timesteps=1000,
linear_start=1e-4,
linear_end=2e-2,
cosine_s=8e-3,
):
super().register_schedule(
given_betas, beta_schedule, timesteps, linear_start, linear_end, cosine_s
)
self.shorten_cond_schedule = self.num_timesteps_cond > 1
if self.shorten_cond_schedule:
self.make_cond_schedule()
def apply_model(self, x_noisy, t, cond):
# x_recon = self.model(x_noisy, t, cond['c_concat'][0]) # cond['c_concat'][0].shape 1,4,128,128
t_emb = timestep_embedding(x_noisy.device, t, 256, repeat_only=False)
x_recon = self.diffusion_model(x_noisy, t_emb, cond)
return x_recon
class LDM(InpaintModel):
name = "ldm"
pad_mod = 32
is_erase_model = True
def __init__(self, device, fp16: bool = True, **kwargs):
self.fp16 = fp16
super().__init__(device)
self.device = device
def init_model(self, device, **kwargs):
self.diffusion_model = load_jit_model(
LDM_DIFFUSION_MODEL_URL, device, LDM_DIFFUSION_MODEL_MD5
)
self.cond_stage_model_decode = load_jit_model(
LDM_DECODE_MODEL_URL, device, LDM_DECODE_MODEL_MD5
)
self.cond_stage_model_encode = load_jit_model(
LDM_ENCODE_MODEL_URL, device, LDM_ENCODE_MODEL_MD5
)
if self.fp16 and "cuda" in str(device):
self.diffusion_model = self.diffusion_model.half()
self.cond_stage_model_decode = self.cond_stage_model_decode.half()
self.cond_stage_model_encode = self.cond_stage_model_encode.half()
self.model = LatentDiffusion(self.diffusion_model, device)
@staticmethod
def download():
download_model(LDM_DIFFUSION_MODEL_URL, LDM_DIFFUSION_MODEL_MD5)
download_model(LDM_DECODE_MODEL_URL, LDM_DECODE_MODEL_MD5)
download_model(LDM_ENCODE_MODEL_URL, LDM_ENCODE_MODEL_MD5)
@staticmethod
def is_downloaded() -> bool:
model_paths = [
get_cache_path_by_url(LDM_DIFFUSION_MODEL_URL),
get_cache_path_by_url(LDM_DECODE_MODEL_URL),
get_cache_path_by_url(LDM_ENCODE_MODEL_URL),
]
return all([os.path.exists(it) for it in model_paths])
@torch.cuda.amp.autocast()
def forward(self, image, mask, config: InpaintRequest):
"""
image: [H, W, C] RGB
mask: [H, W, 1]
return: BGR IMAGE
"""
# image [1,3,512,512] float32
# mask: [1,1,512,512] float32
# masked_image: [1,3,512,512] float32
if config.ldm_sampler == LDMSampler.ddim:
sampler = DDIMSampler(self.model)
elif config.ldm_sampler == LDMSampler.plms:
sampler = PLMSSampler(self.model)
else:
raise ValueError()
steps = config.ldm_steps
image = norm_img(image)
mask = norm_img(mask)
mask[mask < 0.5] = 0
mask[mask >= 0.5] = 1
image = torch.from_numpy(image).unsqueeze(0).to(self.device)
mask = torch.from_numpy(mask).unsqueeze(0).to(self.device)
masked_image = (1 - mask) * image
mask = self._norm(mask)
masked_image = self._norm(masked_image)
c = self.cond_stage_model_encode(masked_image)
torch.cuda.empty_cache()
cc = torch.nn.functional.interpolate(mask, size=c.shape[-2:]) # 1,1,128,128
c = torch.cat((c, cc), dim=1) # 1,4,128,128
shape = (c.shape[1] - 1,) + c.shape[2:]
samples_ddim = sampler.sample(
steps=steps, conditioning=c, batch_size=c.shape[0], shape=shape
)
torch.cuda.empty_cache()
x_samples_ddim = self.cond_stage_model_decode(
samples_ddim
) # samples_ddim: 1, 3, 128, 128 float32
torch.cuda.empty_cache()
# image = torch.clamp((image + 1.0) / 2.0, min=0.0, max=1.0)
# mask = torch.clamp((mask + 1.0) / 2.0, min=0.0, max=1.0)
inpainted_image = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
# inpainted = (1 - mask) * image + mask * predicted_image
inpainted_image = inpainted_image.cpu().numpy().transpose(0, 2, 3, 1)[0] * 255
inpainted_image = inpainted_image.astype(np.uint8)[:, :, ::-1]
return inpainted_image
def _norm(self, tensor):
return tensor * 2.0 - 1.0
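
# For reference, a shape sketch of the conditioning assembled in forward
# above for a 512x512 input (the jit encoder downsamples by a factor of 4,
# matching the 1,4,128,128 comments):
import torch

c = torch.zeros(1, 3, 128, 128)    # cond_stage_model_encode output
mask = torch.zeros(1, 1, 512, 512)
cc = torch.nn.functional.interpolate(mask, size=c.shape[-2:])  # 1, 1, 128, 128
cond = torch.cat((c, cc), dim=1)                               # 1, 4, 128, 128
shape = (cond.shape[1] - 1,) + cond.shape[2:]                  # (3, 128, 128) latent shape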

iopaint/model/manga.py Normal file

@@ -0,0 +1,97 @@
import os
import random
import cv2
import numpy as np
import torch
import time
from loguru import logger
from iopaint.helper import get_cache_path_by_url, load_jit_model, download_model
from iopaint.model.base import InpaintModel
from iopaint.schema import InpaintRequest
MANGA_INPAINTOR_MODEL_URL = os.environ.get(
"MANGA_INPAINTOR_MODEL_URL",
"https://github.com/Sanster/models/releases/download/manga/manga_inpaintor.jit",
)
MANGA_INPAINTOR_MODEL_MD5 = os.environ.get(
"MANGA_INPAINTOR_MODEL_MD5", "7d8b269c4613b6b3768af714610da86c"
)
MANGA_LINE_MODEL_URL = os.environ.get(
"MANGA_LINE_MODEL_URL",
"https://github.com/Sanster/models/releases/download/manga/erika.jit",
)
MANGA_LINE_MODEL_MD5 = os.environ.get(
"MANGA_LINE_MODEL_MD5", "0c926d5a4af8450b0d00bc5b9a095644"
)
class Manga(InpaintModel):
name = "manga"
pad_mod = 16
is_erase_model = True
def init_model(self, device, **kwargs):
self.inpaintor_model = load_jit_model(
MANGA_INPAINTOR_MODEL_URL, device, MANGA_INPAINTOR_MODEL_MD5
)
self.line_model = load_jit_model(
MANGA_LINE_MODEL_URL, device, MANGA_LINE_MODEL_MD5
)
self.seed = 42
@staticmethod
def download():
download_model(MANGA_INPAINTOR_MODEL_URL, MANGA_INPAINTOR_MODEL_MD5)
download_model(MANGA_LINE_MODEL_URL, MANGA_LINE_MODEL_MD5)
@staticmethod
def is_downloaded() -> bool:
model_paths = [
get_cache_path_by_url(MANGA_INPAINTOR_MODEL_URL),
get_cache_path_by_url(MANGA_LINE_MODEL_URL),
]
return all([os.path.exists(it) for it in model_paths])
def forward(self, image, mask, config: InpaintRequest):
"""
image: [H, W, C] RGB
mask: [H, W, 1]
return: BGR IMAGE
"""
seed = self.seed
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
gray_img = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
gray_img = torch.from_numpy(
gray_img[np.newaxis, np.newaxis, :, :].astype(np.float32)
).to(self.device)
start = time.time()
lines = self.line_model(gray_img)
torch.cuda.empty_cache()
lines = torch.clamp(lines, 0, 255)
logger.info(f"erika_model time: {time.time() - start}")
mask = torch.from_numpy(mask[np.newaxis, :, :, :]).to(self.device)
mask = mask.permute(0, 3, 1, 2)
mask = torch.where(mask > 0.5, 1.0, 0.0)
noise = torch.randn_like(mask)
ones = torch.ones_like(mask)
gray_img = gray_img / 255 * 2 - 1.0
lines = lines / 255 * 2 - 1.0
start = time.time()
inpainted_image = self.inpaintor_model(gray_img, lines, mask, noise, ones)
logger.info(f"image_inpaintor_model time: {time.time() - start}")
cur_res = inpainted_image[0].permute(1, 2, 0).detach().cpu().numpy()
cur_res = (cur_res * 127.5 + 127.5).astype(np.uint8)
cur_res = cv2.cvtColor(cur_res, cv2.COLOR_GRAY2BGR)
return cur_res

iopaint/model/mat.py Normal file

File diff suppressed because it is too large

iopaint/model/mi_gan.py Normal file

@@ -0,0 +1,110 @@
import os
import cv2
import torch
from iopaint.helper import (
load_jit_model,
download_model,
get_cache_path_by_url,
boxes_from_mask,
resize_max_size,
norm_img,
)
from iopaint.model.base import InpaintModel
from iopaint.schema import InpaintRequest
MIGAN_MODEL_URL = os.environ.get(
"MIGAN_MODEL_URL",
"https://github.com/Sanster/models/releases/download/migan/migan_traced.pt",
)
MIGAN_MODEL_MD5 = os.environ.get("MIGAN_MODEL_MD5", "76eb3b1a71c400ee3290524f7a11b89c")
class MIGAN(InpaintModel):
name = "migan"
min_size = 512
pad_mod = 512
pad_to_square = True
is_erase_model = True
def init_model(self, device, **kwargs):
self.model = load_jit_model(MIGAN_MODEL_URL, device, MIGAN_MODEL_MD5).eval()
@staticmethod
def download():
download_model(MIGAN_MODEL_URL, MIGAN_MODEL_MD5)
@staticmethod
def is_downloaded() -> bool:
return os.path.exists(get_cache_path_by_url(MIGAN_MODEL_URL))
@torch.no_grad()
def __call__(self, image, mask, config: InpaintRequest):
"""
images: [H, W, C] RGB, not normalized
masks: [H, W]
return: BGR IMAGE
"""
if image.shape[0] == 512 and image.shape[1] == 512:
return self._pad_forward(image, mask, config)
boxes = boxes_from_mask(mask)
crop_result = []
config.hd_strategy_crop_margin = 128
for box in boxes:
crop_image, crop_mask, crop_box = self._crop_box(image, mask, box, config)
origin_size = crop_image.shape[:2]
resize_image = resize_max_size(crop_image, size_limit=512)
resize_mask = resize_max_size(crop_mask, size_limit=512)
inpaint_result = self._pad_forward(resize_image, resize_mask, config)
# only paste masked area result
inpaint_result = cv2.resize(
inpaint_result,
(origin_size[1], origin_size[0]),
interpolation=cv2.INTER_CUBIC,
)
original_pixel_indices = crop_mask < 127
inpaint_result[original_pixel_indices] = crop_image[:, :, ::-1][
original_pixel_indices
]
crop_result.append((inpaint_result, crop_box))
inpaint_result = image[:, :, ::-1].copy()
for crop_image, crop_box in crop_result:
x1, y1, x2, y2 = crop_box
inpaint_result[y1:y2, x1:x2, :] = crop_image
return inpaint_result
def forward(self, image, mask, config: InpaintRequest):
"""Input images and output images have same size
images: [H, W, C] RGB
masks: [H, W] mask area == 255
return: BGR IMAGE
"""
image = norm_img(image) # [0, 1]
image = image * 2 - 1 # [0, 1] -> [-1, 1]
mask = (mask > 120) * 255
mask = norm_img(mask)
image = torch.from_numpy(image).unsqueeze(0).to(self.device)
mask = torch.from_numpy(mask).unsqueeze(0).to(self.device)
erased_img = image * (1 - mask)
input_image = torch.cat([0.5 - mask, erased_img], dim=1)
output = self.model(input_image)
output = (
(output.permute(0, 2, 3, 1) * 127.5 + 127.5)
.round()
.clamp(0, 255)
.to(torch.uint8)
)
output = output[0].cpu().numpy()
cur_res = cv2.cvtColor(output, cv2.COLOR_RGB2BGR)
return cur_res
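
# A shape sketch of the four-channel input tensor that forward above feeds
# to the traced MI-GAN model (synthetic tensors, illustrative only):
import torch

image = torch.rand(1, 3, 512, 512) * 2 - 1         # RGB in [-1, 1]
mask = (torch.rand(1, 1, 512, 512) > 0.5).float()  # 1 = repaint
erased = image * (1 - mask)
model_input = torch.cat([0.5 - mask, erased], dim=1)
print(model_input.shape)  # torch.Size([1, 4, 512, 512])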

iopaint/model/opencv2.py Normal file

@@ -0,0 +1,29 @@
import cv2
from iopaint.model.base import InpaintModel
from iopaint.schema import InpaintRequest
flag_map = {"INPAINT_NS": cv2.INPAINT_NS, "INPAINT_TELEA": cv2.INPAINT_TELEA}
class OpenCV2(InpaintModel):
name = "cv2"
pad_mod = 1
is_erase_model = True
@staticmethod
def is_downloaded() -> bool:
return True
def forward(self, image, mask, config: InpaintRequest):
"""Input image and output image have same size
image: [H, W, C] RGB
mask: [H, W, 1]
return: BGR IMAGE
"""
cur_res = cv2.inpaint(
image[:, :, ::-1],
mask,
inpaintRadius=config.cv2_radius,
flags=flag_map[config.cv2_flag],
)
return cur_res
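
# For reference, the underlying OpenCV call on synthetic data; a sketch with
# the radius and flag that OpenCV2.forward reads from InpaintRequest
# hard-coded:
import cv2
import numpy as np

img = np.full((64, 64, 3), 200, np.uint8)  # BGR input
mask = np.zeros((64, 64), np.uint8)
mask[20:40, 20:40] = 255                   # area to inpaint
out = cv2.inpaint(img, mask, inpaintRadius=5, flags=cv2.INPAINT_NS)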

iopaint/model/paint_by_example.py Normal file

@@ -0,0 +1,66 @@
import PIL
import PIL.Image
import cv2
import torch
from loguru import logger
from iopaint.helper import decode_base64_to_image
from iopaint.model.base import DiffusionInpaintModel
from iopaint.schema import InpaintRequest
class PaintByExample(DiffusionInpaintModel):
name = "Fantasy-Studio/Paint-by-Example"
pad_mod = 8
min_size = 512
def init_model(self, device: torch.device, **kwargs):
from diffusers import DiffusionPipeline
fp16 = not kwargs.get("no_half", False)
use_gpu = device == torch.device("cuda") and torch.cuda.is_available()
torch_dtype = torch.float16 if use_gpu and fp16 else torch.float32
model_kwargs = {}
if kwargs["disable_nsfw"] or kwargs.get("cpu_offload", False):
logger.info("Disable Paint By Example Model NSFW checker")
model_kwargs.update(
dict(safety_checker=None, requires_safety_checker=False)
)
self.model = DiffusionPipeline.from_pretrained(
self.name, torch_dtype=torch_dtype, **model_kwargs
)
# TODO: gpu_id
if kwargs.get("cpu_offload", False) and use_gpu:
self.model.image_encoder = self.model.image_encoder.to(device)
self.model.enable_sequential_cpu_offload(gpu_id=0)
else:
self.model = self.model.to(device)
def forward(self, image, mask, config: InpaintRequest):
"""Input image and output image have same size
image: [H, W, C] RGB
mask: [H, W, 1] 255 means area to repaint
return: BGR IMAGE
"""
if config.paint_by_example_example_image is None:
raise ValueError("paint_by_example_example_image is required")
example_image, _, _ = decode_base64_to_image(
config.paint_by_example_example_image
)
output = self.model(
image=PIL.Image.fromarray(image),
mask_image=PIL.Image.fromarray(mask[:, :, -1], mode="L"),
example_image=PIL.Image.fromarray(example_image),
num_inference_steps=config.sd_steps,
guidance_scale=config.sd_guidance_scale,
negative_prompt="out of frame, lowres, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, disfigured, gross proportions, malformed limbs, watermark, signature",
output_type="np.array",
generator=torch.manual_seed(config.sd_seed),
).images[0]
output = (output * 255).round().astype("uint8")
output = cv2.cvtColor(output, cv2.COLOR_RGB2BGR)
return output

iopaint/model/plms_sampler.py Normal file

@@ -0,0 +1,225 @@
# From: https://github.com/CompVis/latent-diffusion/blob/main/ldm/models/diffusion/plms.py
import torch
import numpy as np
from iopaint.model.utils import make_ddim_timesteps, make_ddim_sampling_parameters, noise_like
from tqdm import tqdm
class PLMSSampler(object):
def __init__(self, model, schedule="linear", **kwargs):
super().__init__()
self.model = model
self.ddpm_num_timesteps = model.num_timesteps
self.schedule = schedule
def register_buffer(self, name, attr):
setattr(self, name, attr)
def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
if ddim_eta != 0:
raise ValueError('ddim_eta must be 0 for PLMS')
self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
num_ddpm_timesteps=self.ddpm_num_timesteps, verbose=verbose)
alphas_cumprod = self.model.alphas_cumprod
assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
self.register_buffer('betas', to_torch(self.model.betas))
self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev))
# calculations for diffusion q(x_t | x_{t-1}) and others
self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu())))
self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu())))
self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))
# ddim sampling parameters
ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
ddim_timesteps=self.ddim_timesteps,
eta=ddim_eta, verbose=verbose)
self.register_buffer('ddim_sigmas', ddim_sigmas)
self.register_buffer('ddim_alphas', ddim_alphas)
self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
(1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
1 - self.alphas_cumprod / self.alphas_cumprod_prev))
self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)
@torch.no_grad()
def sample(self,
steps,
batch_size,
shape,
conditioning=None,
callback=None,
normals_sequence=None,
img_callback=None,
quantize_x0=False,
eta=0.,
mask=None,
x0=None,
temperature=1.,
noise_dropout=0.,
score_corrector=None,
corrector_kwargs=None,
verbose=False,
x_T=None,
log_every_t=100,
unconditional_guidance_scale=1.,
unconditional_conditioning=None,
# this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
**kwargs
):
if conditioning is not None:
if isinstance(conditioning, dict):
cbs = conditioning[list(conditioning.keys())[0]].shape[0]
if cbs != batch_size:
print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
else:
if conditioning.shape[0] != batch_size:
print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")
self.make_schedule(ddim_num_steps=steps, ddim_eta=eta, verbose=verbose)
# sampling
C, H, W = shape
size = (batch_size, C, H, W)
print(f'Data shape for PLMS sampling is {size}')
samples = self.plms_sampling(conditioning, size,
callback=callback,
img_callback=img_callback,
quantize_denoised=quantize_x0,
mask=mask, x0=x0,
ddim_use_original_steps=False,
noise_dropout=noise_dropout,
temperature=temperature,
score_corrector=score_corrector,
corrector_kwargs=corrector_kwargs,
x_T=x_T,
log_every_t=log_every_t,
unconditional_guidance_scale=unconditional_guidance_scale,
unconditional_conditioning=unconditional_conditioning,
)
return samples
@torch.no_grad()
def plms_sampling(self, cond, shape,
x_T=None, ddim_use_original_steps=False,
callback=None, timesteps=None, quantize_denoised=False,
mask=None, x0=None, img_callback=None, log_every_t=100,
temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
unconditional_guidance_scale=1., unconditional_conditioning=None, ):
device = self.model.betas.device
b = shape[0]
if x_T is None:
img = torch.randn(shape, device=device)
else:
img = x_T
if timesteps is None:
timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
elif not ddim_use_original_steps:
subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1
timesteps = self.ddim_timesteps[:subset_end]
time_range = list(reversed(range(0, timesteps))) if ddim_use_original_steps else np.flip(timesteps)
total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
print(f"Running PLMS Sampling with {total_steps} timesteps")
iterator = tqdm(time_range, desc='PLMS Sampler', total=total_steps)
old_eps = []
for i, step in enumerate(iterator):
index = total_steps - i - 1
ts = torch.full((b,), step, device=device, dtype=torch.long)
ts_next = torch.full((b,), time_range[min(i + 1, len(time_range) - 1)], device=device, dtype=torch.long)
if mask is not None:
assert x0 is not None
img_orig = self.model.q_sample(x0, ts) # TODO: deterministic forward pass?
img = img_orig * mask + (1. - mask) * img
outs = self.p_sample_plms(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
quantize_denoised=quantize_denoised, temperature=temperature,
noise_dropout=noise_dropout, score_corrector=score_corrector,
corrector_kwargs=corrector_kwargs,
unconditional_guidance_scale=unconditional_guidance_scale,
unconditional_conditioning=unconditional_conditioning,
old_eps=old_eps, t_next=ts_next)
img, pred_x0, e_t = outs
old_eps.append(e_t)
if len(old_eps) >= 4:
old_eps.pop(0)
if callback: callback(i)
if img_callback: img_callback(pred_x0, i)
return img
@torch.no_grad()
def p_sample_plms(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
unconditional_guidance_scale=1., unconditional_conditioning=None, old_eps=None, t_next=None):
b, *_, device = *x.shape, x.device
def get_model_output(x, t):
if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
e_t = self.model.apply_model(x, t, c)
else:
x_in = torch.cat([x] * 2)
t_in = torch.cat([t] * 2)
c_in = torch.cat([unconditional_conditioning, c])
e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
if score_corrector is not None:
assert self.model.parameterization == "eps"
e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)
return e_t
alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
def get_x_prev_and_pred_x0(e_t, index):
# select parameters corresponding to the currently considered timestep
a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device)
# current prediction for x_0
pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
if quantize_denoised:
pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
# direction pointing to x_t
dir_xt = (1. - a_prev - sigma_t ** 2).sqrt() * e_t
noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
if noise_dropout > 0.:
noise = torch.nn.functional.dropout(noise, p=noise_dropout)
x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
return x_prev, pred_x0
e_t = get_model_output(x, t)
if len(old_eps) == 0:
# Pseudo Improved Euler (2nd order)
x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t, index)
e_t_next = get_model_output(x_prev, t_next)
e_t_prime = (e_t + e_t_next) / 2
elif len(old_eps) == 1:
# 2nd order Pseudo Linear Multistep (Adams-Bashforth)
e_t_prime = (3 * e_t - old_eps[-1]) / 2
elif len(old_eps) == 2:
# 3rd order Pseudo Linear Multistep (Adams-Bashforth)
e_t_prime = (23 * e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12
elif len(old_eps) >= 3:
# 4th order Pseudo Linear Multistep (Adams-Bashforth)
e_t_prime = (55 * e_t - 59 * old_eps[-1] + 37 * old_eps[-2] - 9 * old_eps[-3]) / 24
x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index)
return x_prev, pred_x0, e_t
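# Sanity check (illustrative, not part of the original file): each set of
# pseudo linear multistep (Adams-Bashforth) coefficients above sums to 1,
# so a constant eps history is reproduced exactly:
# >>> (3 - 1) / 2, (23 - 16 + 5) / 12, (55 - 59 + 37 - 9) / 24
# (1.0, 1.0, 1.0)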

View File

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,96 @@
from PIL import Image
import PIL.Image
import cv2
import torch
from loguru import logger
from iopaint.model.base import DiffusionInpaintModel
from iopaint.model.helper.cpu_text_encoder import CPUTextEncoderWrapper
from iopaint.model.utils import handle_from_pretrained_exceptions
from iopaint.schema import InpaintRequest
from .powerpaint_tokenizer import add_task_to_prompt
class PowerPaint(DiffusionInpaintModel):
name = "Sanster/PowerPaint-V1-stable-diffusion-inpainting"
pad_mod = 8
min_size = 512
lcm_lora_id = "latent-consistency/lcm-lora-sdv1-5"
def init_model(self, device: torch.device, **kwargs):
from .pipeline_powerpaint import StableDiffusionInpaintPipeline
from .powerpaint_tokenizer import PowerPaintTokenizer
fp16 = not kwargs.get("no_half", False)
model_kwargs = {}
if kwargs["disable_nsfw"] or kwargs.get("cpu_offload", False):
logger.info("Disable Stable Diffusion Model NSFW checker")
model_kwargs.update(
dict(
safety_checker=None,
feature_extractor=None,
requires_safety_checker=False,
)
)
use_gpu = device == torch.device("cuda") and torch.cuda.is_available()
torch_dtype = torch.float16 if use_gpu and fp16 else torch.float32
self.model = handle_from_pretrained_exceptions(
StableDiffusionInpaintPipeline.from_pretrained,
pretrained_model_name_or_path=self.name,
variant="fp16",
torch_dtype=torch_dtype,
**model_kwargs,
)
self.model.tokenizer = PowerPaintTokenizer(self.model.tokenizer)
if kwargs.get("cpu_offload", False) and use_gpu:
logger.info("Enable sequential cpu offload")
self.model.enable_sequential_cpu_offload(gpu_id=0)
else:
self.model = self.model.to(device)
if kwargs["sd_cpu_textencoder"]:
logger.info("Run Stable Diffusion TextEncoder on CPU")
self.model.text_encoder = CPUTextEncoderWrapper(
self.model.text_encoder, torch_dtype
)
self.callback = kwargs.pop("callback", None)
def forward(self, image, mask, config: InpaintRequest):
"""Input image and output image have same size
image: [H, W, C] RGB
mask: [H, W, 1] 255 means area to repaint
return: BGR IMAGE
"""
self.set_scheduler(config)
img_h, img_w = image.shape[:2]
promptA, promptB, negative_promptA, negative_promptB = add_task_to_prompt(
config.prompt, config.negative_prompt, config.powerpaint_task
)
output = self.model(
image=PIL.Image.fromarray(image),
promptA=promptA,
promptB=promptB,
tradoff=config.fitting_degree,
tradoff_nag=config.fitting_degree,
negative_promptA=negative_promptA,
negative_promptB=negative_promptB,
mask_image=PIL.Image.fromarray(mask[:, :, -1], mode="L"),
num_inference_steps=config.sd_steps,
strength=config.sd_strength,
guidance_scale=config.sd_guidance_scale,
output_type="np",
callback=self.callback,
height=img_h,
width=img_w,
generator=torch.manual_seed(config.sd_seed),
callback_steps=1,
).images[0]
output = (output * 255).round().astype("uint8")
output = cv2.cvtColor(output, cv2.COLOR_RGB2BGR)
return output

540
iopaint/model/power_paint/powerpaint_tokenizer.py Normal file
View File

@@ -0,0 +1,540 @@
import torch
import torch.nn as nn
import copy
import random
from typing import Any, List, Optional, Union
from transformers import CLIPTokenizer
from iopaint.schema import PowerPaintTask
def add_task_to_prompt(prompt, negative_prompt, task: PowerPaintTask):
if task == PowerPaintTask.object_remove:
promptA = prompt + " P_ctxt"
promptB = prompt + " P_ctxt"
negative_promptA = negative_prompt + " P_obj"
negative_promptB = negative_prompt + " P_obj"
elif task == PowerPaintTask.shape_guided:
promptA = prompt + " P_shape"
promptB = prompt + " P_ctxt"
negative_promptA = negative_prompt
negative_promptB = negative_prompt
elif task == PowerPaintTask.outpainting:
promptA = prompt + " P_ctxt"
promptB = prompt + " P_ctxt"
negative_promptA = negative_prompt + " P_obj"
negative_promptB = negative_prompt + " P_obj"
else:
promptA = prompt + " P_obj"
promptB = prompt + " P_obj"
negative_promptA = negative_prompt
negative_promptB = negative_prompt
return promptA, promptB, negative_promptA, negative_promptB
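# Illustrative example (assuming PowerPaintTask.object_remove):
# >>> add_task_to_prompt("a sofa", "blurry", PowerPaintTask.object_remove)
# ('a sofa P_ctxt', 'a sofa P_ctxt', 'blurry P_obj', 'blurry P_obj')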
class PowerPaintTokenizer:
def __init__(self, tokenizer: CLIPTokenizer):
self.wrapped = tokenizer
self.token_map = {}
placeholder_tokens = ["P_ctxt", "P_shape", "P_obj"]
num_vec_per_token = 10
for placeholder_token in placeholder_tokens:
output = []
for i in range(num_vec_per_token):
ith_token = placeholder_token + f"_{i}"
output.append(ith_token)
self.token_map[placeholder_token] = output
def __getattr__(self, name: str) -> Any:
if name == "wrapped":
return super().__getattr__("wrapped")
try:
return getattr(self.wrapped, name)
except AttributeError:
try:
return super().__getattr__(name)
except AttributeError:
raise AttributeError(
"'name' cannot be found in both "
f"'{self.__class__.__name__}' and "
f"'{self.__class__.__name__}.tokenizer'."
)
def try_adding_tokens(self, tokens: Union[str, List[str]], *args, **kwargs):
"""Attempt to add tokens to the tokenizer.
Args:
tokens (Union[str, List[str]]): The tokens to be added.
"""
num_added_tokens = self.wrapped.add_tokens(tokens, *args, **kwargs)
assert num_added_tokens != 0, (
f"The tokenizer already contains the token {tokens}. Please pass "
"a different `placeholder_token` that is not already in the "
"tokenizer."
)
def get_token_info(self, token: str) -> dict:
"""Get the information of a token, including its start and end index in
the current tokenizer.
Args:
token (str): The token to be queried.
Returns:
dict: The information of the token, including its start and end
index in current tokenizer.
"""
token_ids = self.__call__(token).input_ids
start, end = token_ids[1], token_ids[-2] + 1
return {"name": token, "start": start, "end": end}
def add_placeholder_token(
self, placeholder_token: str, *args, num_vec_per_token: int = 1, **kwargs
):
"""Add placeholder tokens to the tokenizer.
Args:
placeholder_token (str): The placeholder token to be added.
num_vec_per_token (int, optional): The number of vectors of
the added placeholder token.
*args, **kwargs: The arguments for `self.wrapped.add_tokens`.
"""
output = []
if num_vec_per_token == 1:
self.try_adding_tokens(placeholder_token, *args, **kwargs)
output.append(placeholder_token)
else:
output = []
for i in range(num_vec_per_token):
ith_token = placeholder_token + f"_{i}"
self.try_adding_tokens(ith_token, *args, **kwargs)
output.append(ith_token)
for token in self.token_map:
if token in placeholder_token:
raise ValueError(
f"The tokenizer already has placeholder token {token} "
f"that can get confused with {placeholder_token} "
"keep placeholder tokens independent"
)
self.token_map[placeholder_token] = output
def replace_placeholder_tokens_in_text(
self,
text: Union[str, List[str]],
vector_shuffle: bool = False,
prop_tokens_to_load: float = 1.0,
) -> Union[str, List[str]]:
"""Replace the keywords in text with placeholder tokens. This function
will be called in `self.__call__` and `self.encode`.
Args:
text (Union[str, List[str]]): The text to be processed.
vector_shuffle (bool, optional): Whether to shuffle the vectors.
Defaults to False.
prop_tokens_to_load (float, optional): The proportion of tokens to
be loaded. If 1.0, all tokens will be loaded. Defaults to 1.0.
Returns:
Union[str, List[str]]: The processed text.
"""
if isinstance(text, list):
output = []
for i in range(len(text)):
output.append(
self.replace_placeholder_tokens_in_text(
text[i],
vector_shuffle=vector_shuffle,
prop_tokens_to_load=prop_tokens_to_load,
)
)
return output
for placeholder_token in self.token_map:
if placeholder_token in text:
tokens = self.token_map[placeholder_token]
tokens = tokens[: 1 + int(len(tokens) * prop_tokens_to_load)]
if vector_shuffle:
tokens = copy.copy(tokens)
random.shuffle(tokens)
text = text.replace(placeholder_token, " ".join(tokens))
return text
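# Illustrative example (assuming `tok = PowerPaintTokenizer(...)`): with
# num_vec_per_token=10 (see __init__) and prop_tokens_to_load=1.0,
# tok.replace_placeholder_tokens_in_text("a sofa P_ctxt")
# returns "a sofa P_ctxt_0 P_ctxt_1 ... P_ctxt_9" (all ten sub-tokens).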
def replace_text_with_placeholder_tokens(
self, text: Union[str, List[str]]
) -> Union[str, List[str]]:
"""Replace the placeholder tokens in text with the original keywords.
This function will be called in `self.decode`.
Args:
text (Union[str, List[str]]): The text to be processed.
Returns:
Union[str, List[str]]: The processed text.
"""
if isinstance(text, list):
output = []
for i in range(len(text)):
output.append(self.replace_text_with_placeholder_tokens(text[i]))
return output
for placeholder_token, tokens in self.token_map.items():
merged_tokens = " ".join(tokens)
if merged_tokens in text:
text = text.replace(merged_tokens, placeholder_token)
return text
def __call__(
self,
text: Union[str, List[str]],
*args,
vector_shuffle: bool = False,
prop_tokens_to_load: float = 1.0,
**kwargs,
):
"""The call function of the wrapper.
Args:
text (Union[str, List[str]]): The text to be tokenized.
vector_shuffle (bool, optional): Whether to shuffle the vectors.
Defaults to False.
prop_tokens_to_load (float, optional): The proportion of tokens to
be loaded. If 1.0, all tokens will be loaded. Defaults to 1.0
*args, **kwargs: The arguments for `self.wrapped.__call__`.
"""
replaced_text = self.replace_placeholder_tokens_in_text(
text, vector_shuffle=vector_shuffle, prop_tokens_to_load=prop_tokens_to_load
)
return self.wrapped.__call__(replaced_text, *args, **kwargs)
def encode(self, text: Union[str, List[str]], *args, **kwargs):
"""Encode the passed text to token index.
Args:
text (Union[str, List[str]]): The text to be encoded.
*args, **kwargs: The arguments for `self.wrapped.__call__`.
"""
replaced_text = self.replace_placeholder_tokens_in_text(text)
return self.wrapped(replaced_text, *args, **kwargs)
def decode(
self, token_ids, return_raw: bool = False, *args, **kwargs
) -> Union[str, List[str]]:
"""Decode the token index to text.
Args:
token_ids: The token index to be decoded.
return_raw: Whether to keep the placeholder tokens in the text.
Defaults to False.
*args, **kwargs: The arguments for `self.wrapped.decode`.
Returns:
Union[str, List[str]]: The decoded text.
"""
text = self.wrapped.decode(token_ids, *args, **kwargs)
if return_raw:
return text
replaced_text = self.replace_text_with_placeholder_tokens(text)
return replaced_text
class EmbeddingLayerWithFixes(nn.Module):
"""The revised embedding layer to support external embeddings. This design
of this class is inspired by https://github.com/AUTOMATIC1111/stable-
diffusion-webui/blob/22bcc7be428c94e9408f589966c2040187245d81/modules/sd_hi
jack.py#L224 # noqa.
Args:
wrapped (nn.Embedding): The embedding layer to be wrapped.
external_embeddings (Union[dict, List[dict]], optional): The external
embeddings added to this layer. Defaults to None.
"""
def __init__(
self,
wrapped: nn.Embedding,
external_embeddings: Optional[Union[dict, List[dict]]] = None,
):
super().__init__()
self.wrapped = wrapped
self.num_embeddings = wrapped.weight.shape[0]
self.external_embeddings = []
if external_embeddings:
self.add_embeddings(external_embeddings)
self.trainable_embeddings = nn.ParameterDict()
@property
def weight(self):
"""Get the weight of wrapped embedding layer."""
return self.wrapped.weight
def check_duplicate_names(self, embeddings: List[dict]):
"""Check whether duplicate names exist in list of 'external
embeddings'.
Args:
embeddings (List[dict]): A list of embedding to be check.
"""
names = [emb["name"] for emb in embeddings]
assert len(names) == len(set(names)), (
"Found duplicated names in 'external_embeddings'. Name list: " f"'{names}'"
)
def check_ids_overlap(self, embeddings):
"""Check whether overlap exist in token ids of 'external_embeddings'.
Args:
embeddings (List[dict]): A list of embedding to be check.
"""
ids_range = [[emb["start"], emb["end"], emb["name"]] for emb in embeddings]
ids_range.sort() # sort by 'start'
# check if 'end' has overlapping
for idx in range(len(ids_range) - 1):
name1, name2 = ids_range[idx][-1], ids_range[idx + 1][-1]
assert ids_range[idx][1] <= ids_range[idx + 1][0], (
f"Found ids overlapping between embeddings '{name1}' " f"and '{name2}'."
)
def add_embeddings(self, embeddings: Optional[Union[dict, List[dict]]]):
"""Add external embeddings to this layer.
Use case:
>>> 1. Add token to tokenizer and get the token id.
>>> tokenizer = TokenizerWrapper('openai/clip-vit-base-patch32')
>>> # 'how much' in kiswahili
>>> tokenizer.add_placeholder_tokens('ngapi', num_vec_per_token=4)
>>>
>>> 2. Add external embeddings to the model.
>>> new_embedding = {
>>> 'name': 'ngapi', # 'how much' in kiswahili
>>> 'embedding': torch.ones(1, 15) * 4,
>>> 'start': tokenizer.get_token_info('kwaheri')['start'],
>>> 'end': tokenizer.get_token_info('kwaheri')['end'],
>>> 'trainable': False # if True, will registry as a parameter
>>> }
>>> embedding_layer = nn.Embedding(10, 15)
>>> embedding_layer_wrapper = EmbeddingLayerWithFixes(embedding_layer)
>>> embedding_layer_wrapper.add_embeddings(new_embedding)
>>>
>>> 3. Forward tokenizer and embedding layer!
>>> input_text = ['hello, ngapi!', 'hello my friend, ngapi?']
>>> input_ids = tokenizer(
>>> input_text, padding='max_length', truncation=True,
>>> return_tensors='pt')['input_ids']
>>> out_feat = embedding_layer_wrapper(input_ids)
>>>
>>> 4. Let's validate the result!
>>> assert (out_feat[0, 3: 7] == 2.3).all()
>>> assert (out_feat[2, 5: 9] == 2.3).all()
Args:
embeddings (Union[dict, list[dict]]): The external embeddings to
be added. Each dict must contain the following 4 fields: 'name'
(the name of this embedding), 'embedding' (the embedding
tensor), 'start' (the start token id of this embedding), 'end'
(the end token id of this embedding). For example:
`{name: NAME, start: START, end: END, embedding: torch.Tensor}`
"""
if isinstance(embeddings, dict):
embeddings = [embeddings]
self.external_embeddings += embeddings
self.check_duplicate_names(self.external_embeddings)
self.check_ids_overlap(self.external_embeddings)
# set for trainable
added_trainable_emb_info = []
for embedding in embeddings:
trainable = embedding.get("trainable", False)
if trainable:
name = embedding["name"]
embedding["embedding"] = torch.nn.Parameter(embedding["embedding"])
self.trainable_embeddings[name] = embedding["embedding"]
added_trainable_emb_info.append(name)
added_emb_info = [emb["name"] for emb in embeddings]
added_emb_info = ", ".join(added_emb_info)
print(f"Successfully add external embeddings: {added_emb_info}.", "current")
if added_trainable_emb_info:
added_trainable_emb_info = ", ".join(added_trainable_emb_info)
print(
"Successfully added trainable external embeddings: "
f"{added_trainable_emb_info}"
)
def replace_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
"""Replace external input ids to 0.
Args:
input_ids (torch.Tensor): The input ids to be replaced.
Returns:
torch.Tensor: The replaced input ids.
"""
input_ids_fwd = input_ids.clone()
input_ids_fwd[input_ids_fwd >= self.num_embeddings] = 0
return input_ids_fwd
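# Illustrative note (comments added, not in the original file): with CLIP's
# vocab size of 49408 as num_embeddings, newly added ids such as 49408/49409
# are clamped to 0 here so the wrapped lookup succeeds; their real vectors
# are spliced back in by `replace_embeddings` below.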
def replace_embeddings(
self, input_ids: torch.Tensor, embedding: torch.Tensor, external_embedding: dict
) -> torch.Tensor:
"""Replace external embedding to the embedding layer. Noted that, in
this function we use `torch.cat` to avoid inplace modification.
Args:
input_ids (torch.Tensor): The original token ids. Shape like
[LENGTH, ].
embedding (torch.Tensor): The embedding of token ids after
`replace_input_ids` function.
external_embedding (dict): The external embedding to be replaced.
Returns:
torch.Tensor: The replaced embedding.
"""
new_embedding = []
name = external_embedding["name"]
start = external_embedding["start"]
end = external_embedding["end"]
target_ids_to_replace = list(range(start, end))
ext_emb = external_embedding["embedding"]
# do not need to replace
if not (input_ids == start).any():
return embedding
# start replace
s_idx, e_idx = 0, 0
while e_idx < len(input_ids):
if input_ids[e_idx] == start:
if e_idx != 0:
# append the slice of embeddings that does not need replacing
new_embedding.append(embedding[s_idx:e_idx])
# check that the ids of the span to be replaced are valid
actually_ids_to_replace = [
int(i) for i in input_ids[e_idx : e_idx + end - start]
]
assert actually_ids_to_replace == target_ids_to_replace, (
f"Invalid 'input_ids' in position: {s_idx} to {e_idx}. "
f"Expect '{target_ids_to_replace}' for embedding "
f"'{name}' but found '{actually_ids_to_replace}'."
)
new_embedding.append(ext_emb)
s_idx = e_idx + end - start
e_idx = s_idx + 1
else:
e_idx += 1
if e_idx == len(input_ids):
new_embedding.append(embedding[s_idx:e_idx])
return torch.cat(new_embedding, dim=0)
def forward(
self, input_ids: torch.Tensor, external_embeddings: Optional[List[dict]] = None
):
"""The forward function.
Args:
input_ids (torch.Tensor): The token ids shape like [bz, LENGTH] or
[LENGTH, ].
external_embeddings (Optional[List[dict]]): The external
embeddings. If not passed, only `self.external_embeddings`
will be used. Defaults to None.
"""
assert input_ids.ndim in [1, 2]
if input_ids.ndim == 1:
input_ids = input_ids.unsqueeze(0)
if external_embeddings is None and not self.external_embeddings:
return self.wrapped(input_ids)
input_ids_fwd = self.replace_input_ids(input_ids)
inputs_embeds = self.wrapped(input_ids_fwd)
vecs = []
if external_embeddings is None:
external_embeddings = []
elif isinstance(external_embeddings, dict):
external_embeddings = [external_embeddings]
embeddings = self.external_embeddings + external_embeddings
for input_id, embedding in zip(input_ids, inputs_embeds):
new_embedding = embedding
for external_embedding in embeddings:
new_embedding = self.replace_embeddings(
input_id, new_embedding, external_embedding
)
vecs.append(new_embedding)
return torch.stack(vecs)
def add_tokens(
tokenizer,
text_encoder,
placeholder_tokens: list,
initialize_tokens: list = None,
num_vectors_per_token: int = 1,
):
"""Add token for training.
# TODO: support add tokens as dict, then we can load pretrained tokens.
"""
if initialize_tokens is not None:
assert len(initialize_tokens) == len(
placeholder_tokens
), "placeholder_token should be the same length as initialize_token"
for ii in range(len(placeholder_tokens)):
tokenizer.add_placeholder_token(
placeholder_tokens[ii], num_vec_per_token=num_vectors_per_token
)
# text_encoder.set_embedding_layer()
embedding_layer = text_encoder.text_model.embeddings.token_embedding
text_encoder.text_model.embeddings.token_embedding = EmbeddingLayerWithFixes(
embedding_layer
)
embedding_layer = text_encoder.text_model.embeddings.token_embedding
assert embedding_layer is not None, (
"Do not support get embedding layer for current text encoder. "
"Please check your configuration."
)
initialize_embedding = []
if initialize_tokens is not None:
for ii in range(len(placeholder_tokens)):
init_id = tokenizer(initialize_tokens[ii]).input_ids[1]
temp_embedding = embedding_layer.weight[init_id]
initialize_embedding.append(
temp_embedding[None, ...].repeat(num_vectors_per_token, 1)
)
else:
for ii in range(len(placeholder_tokens)):
init_id = tokenizer("a").input_ids[1]
temp_embedding = embedding_layer.weight[init_id]
len_emb = temp_embedding.shape[0]
init_weight = (torch.rand(num_vectors_per_token, len_emb) - 0.5) / 2.0
initialize_embedding.append(init_weight)
# initialize_embedding = torch.cat(initialize_embedding,dim=0)
token_info_all = []
for ii in range(len(placeholder_tokens)):
token_info = tokenizer.get_token_info(placeholder_tokens[ii])
token_info["embedding"] = initialize_embedding[ii]
token_info["trainable"] = True
token_info_all.append(token_info)
embedding_layer.add_embeddings(token_info_all)
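# Minimal usage sketch (illustrative; `text_encoder` is assumed to be a
# transformers CLIPTextModel, and "P_style" is a hypothetical new token:
# the three PowerPaint placeholders are already registered by
# PowerPaintTokenizer.__init__ and cannot be re-added):
# tokenizer = PowerPaintTokenizer(CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32"))
# add_tokens(tokenizer, text_encoder, ["P_style"], initialize_tokens=["a"], num_vectors_per_token=10)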

114
iopaint/model/sd.py Normal file
View File

@@ -0,0 +1,114 @@
import PIL.Image
import cv2
import torch
from loguru import logger
from iopaint.model.base import DiffusionInpaintModel
from iopaint.model.helper.cpu_text_encoder import CPUTextEncoderWrapper
from iopaint.model.utils import handle_from_pretrained_exceptions
from iopaint.schema import InpaintRequest, ModelType
class SD(DiffusionInpaintModel):
pad_mod = 8
min_size = 512
lcm_lora_id = "latent-consistency/lcm-lora-sdv1-5"
def init_model(self, device: torch.device, **kwargs):
from diffusers.pipelines.stable_diffusion import StableDiffusionInpaintPipeline
fp16 = not kwargs.get("no_half", False)
model_kwargs = {}
if kwargs["disable_nsfw"] or kwargs.get("cpu_offload", False):
logger.info("Disable Stable Diffusion Model NSFW checker")
model_kwargs.update(
dict(
safety_checker=None,
feature_extractor=None,
requires_safety_checker=False,
)
)
use_gpu = device == torch.device("cuda") and torch.cuda.is_available()
torch_dtype = torch.float16 if use_gpu and fp16 else torch.float32
if self.model_info.is_single_file_diffusers:
if self.model_info.model_type == ModelType.DIFFUSERS_SD:
model_kwargs["num_in_channels"] = 4
else:
model_kwargs["num_in_channels"] = 9
self.model = StableDiffusionInpaintPipeline.from_single_file(
self.model_id_or_path, torch_dtype=torch_dtype, **model_kwargs
)
else:
self.model = handle_from_pretrained_exceptions(
StableDiffusionInpaintPipeline.from_pretrained,
pretrained_model_name_or_path=self.model_id_or_path,
variant="fp16",
torch_dtype=torch_dtype,
**model_kwargs,
)
if kwargs.get("cpu_offload", False) and use_gpu:
logger.info("Enable sequential cpu offload")
self.model.enable_sequential_cpu_offload(gpu_id=0)
else:
self.model = self.model.to(device)
if kwargs["sd_cpu_textencoder"]:
logger.info("Run Stable Diffusion TextEncoder on CPU")
self.model.text_encoder = CPUTextEncoderWrapper(
self.model.text_encoder, torch_dtype
)
self.callback = kwargs.pop("callback", None)
def forward(self, image, mask, config: InpaintRequest):
"""Input image and output image have same size
image: [H, W, C] RGB
mask: [H, W, 1] 255 means area to repaint
return: BGR IMAGE
"""
self.set_scheduler(config)
img_h, img_w = image.shape[:2]
output = self.model(
image=PIL.Image.fromarray(image),
prompt=config.prompt,
negative_prompt=config.negative_prompt,
mask_image=PIL.Image.fromarray(mask[:, :, -1], mode="L"),
num_inference_steps=config.sd_steps,
strength=config.sd_strength,
guidance_scale=config.sd_guidance_scale,
output_type="np",
callback_on_step_end=self.callback,
height=img_h,
width=img_w,
generator=torch.manual_seed(config.sd_seed),
).images[0]
output = (output * 255).round().astype("uint8")
output = cv2.cvtColor(output, cv2.COLOR_RGB2BGR)
return output
class SD15(SD):
name = "runwayml/stable-diffusion-inpainting"
model_id_or_path = "runwayml/stable-diffusion-inpainting"
class Anything4(SD):
name = "Sanster/anything-4.0-inpainting"
model_id_or_path = "Sanster/anything-4.0-inpainting"
class RealisticVision14(SD):
name = "Sanster/Realistic_Vision_V1.4-inpainting"
model_id_or_path = "Sanster/Realistic_Vision_V1.4-inpainting"
class SD2(SD):
name = "stabilityai/stable-diffusion-2-inpainting"
model_id_or_path = "stabilityai/stable-diffusion-2-inpainting"

89
iopaint/model/sdxl.py Normal file
View File

@@ -0,0 +1,89 @@
import os
import PIL.Image
import cv2
import torch
from diffusers import AutoencoderKL
from loguru import logger
from iopaint.model.base import DiffusionInpaintModel
from iopaint.model.utils import handle_from_pretrained_exceptions
from iopaint.schema import InpaintRequest, ModelType
class SDXL(DiffusionInpaintModel):
name = "diffusers/stable-diffusion-xl-1.0-inpainting-0.1"
pad_mod = 8
min_size = 512
lcm_lora_id = "latent-consistency/lcm-lora-sdxl"
model_id_or_path = "diffusers/stable-diffusion-xl-1.0-inpainting-0.1"
def init_model(self, device: torch.device, **kwargs):
from diffusers.pipelines import StableDiffusionXLInpaintPipeline
fp16 = not kwargs.get("no_half", False)
use_gpu = device == torch.device("cuda") and torch.cuda.is_available()
torch_dtype = torch.float16 if use_gpu and fp16 else torch.float32
if self.model_info.model_type == ModelType.DIFFUSERS_SDXL:
num_in_channels = 4
else:
num_in_channels = 9
if os.path.isfile(self.model_id_or_path):
self.model = StableDiffusionXLInpaintPipeline.from_single_file(
self.model_id_or_path,
torch_dtype=torch_dtype,
num_in_channels=num_in_channels,
)
else:
vae = AutoencoderKL.from_pretrained(
"madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch_dtype
)
self.model = handle_from_pretrained_exceptions(
StableDiffusionXLInpaintPipeline.from_pretrained,
pretrained_model_name_or_path=self.model_id_or_path,
torch_dtype=torch_dtype,
vae=vae,
variant="fp16",
)
if kwargs.get("cpu_offload", False) and use_gpu:
logger.info("Enable sequential cpu offload")
self.model.enable_sequential_cpu_offload(gpu_id=0)
else:
self.model = self.model.to(device)
if kwargs["sd_cpu_textencoder"]:
logger.warning("Stable Diffusion XL not support run TextEncoder on CPU")
self.callback = kwargs.pop("callback", None)
def forward(self, image, mask, config: InpaintRequest):
"""Input image and output image have same size
image: [H, W, C] RGB
mask: [H, W, 1] 255 means area to repaint
return: BGR IMAGE
"""
self.set_scheduler(config)
img_h, img_w = image.shape[:2]
output = self.model(
image=PIL.Image.fromarray(image),
prompt=config.prompt,
negative_prompt=config.negative_prompt,
mask_image=PIL.Image.fromarray(mask[:, :, -1], mode="L"),
num_inference_steps=config.sd_steps,
strength=0.999 if config.sd_strength == 1.0 else config.sd_strength,
guidance_scale=config.sd_guidance_scale,
output_type="np",
callback_on_step_end=self.callback,
height=img_h,
width=img_w,
generator=torch.manual_seed(config.sd_seed),
).images[0]
output = (output * 255).round().astype("uint8")
output = cv2.cvtColor(output, cv2.COLOR_RGB2BGR)
return output

989
iopaint/model/utils.py Normal file
View File

@@ -0,0 +1,989 @@
import copy
import gc
import math
import random
import traceback
from typing import Any
import torch
import numpy as np
import collections
from itertools import repeat
from diffusers import (
DDIMScheduler,
PNDMScheduler,
LMSDiscreteScheduler,
EulerDiscreteScheduler,
EulerAncestralDiscreteScheduler,
DPMSolverMultistepScheduler,
UniPCMultistepScheduler,
LCMScheduler,
DPMSolverSinglestepScheduler,
KDPM2DiscreteScheduler,
KDPM2AncestralDiscreteScheduler,
HeunDiscreteScheduler,
)
from diffusers.configuration_utils import FrozenDict
from loguru import logger
from iopaint.schema import SDSampler
from torch import conv2d, conv_transpose2d
def make_beta_schedule(
device, schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3
):
if schedule == "linear":
betas = (
torch.linspace(
linear_start**0.5, linear_end**0.5, n_timestep, dtype=torch.float64
)
** 2
)
elif schedule == "cosine":
timesteps = (
torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s
).to(device)
alphas = timesteps / (1 + cosine_s) * np.pi / 2
alphas = torch.cos(alphas).pow(2).to(device)
alphas = alphas / alphas[0]
betas = 1 - alphas[1:] / alphas[:-1]
betas = np.clip(betas, a_min=0, a_max=0.999)
elif schedule == "sqrt_linear":
betas = torch.linspace(
linear_start, linear_end, n_timestep, dtype=torch.float64
)
elif schedule == "sqrt":
betas = (
torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64)
** 0.5
)
else:
raise ValueError(f"schedule '{schedule}' unknown.")
return betas.numpy()
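# Illustrative example (values rounded): the default linear schedule runs
# from linear_start to linear_end in beta space:
# >>> betas = make_beta_schedule("cpu", "linear", 1000)
# >>> betas.shape, round(float(betas[0]), 6), round(float(betas[-1]), 4)
# ((1000,), 0.0001, 0.02)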
def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True):
# select alphas for computing the variance schedule
alphas = alphacums[ddim_timesteps]
alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist())
# according to the formula provided in https://arxiv.org/abs/2010.02502
sigmas = eta * np.sqrt(
(1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev)
)
if verbose:
print(
f"Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}"
)
print(
f"For the chosen value of eta, which is {eta}, "
f"this results in the following sigma_t schedule for ddim sampler {sigmas}"
)
return sigmas, alphas, alphas_prev
def make_ddim_timesteps(
ddim_discr_method, num_ddim_timesteps, num_ddpm_timesteps, verbose=True
):
if ddim_discr_method == "uniform":
c = num_ddpm_timesteps // num_ddim_timesteps
ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, c)))
elif ddim_discr_method == "quad":
ddim_timesteps = (
(np.linspace(0, np.sqrt(num_ddpm_timesteps * 0.8), num_ddim_timesteps)) ** 2
).astype(int)
else:
raise NotImplementedError(
f'There is no ddim discretization method called "{ddim_discr_method}"'
)
# assert ddim_timesteps.shape[0] == num_ddim_timesteps
# add one to get the final alpha values right (the ones from first scale to data during sampling)
steps_out = ddim_timesteps + 1
if verbose:
print(f"Selected timesteps for ddim sampler: {steps_out}")
return steps_out
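# Illustrative example: 1000 DDPM steps discretized into 50 DDIM steps with
# the "uniform" method gives c=20, i.e.
# make_ddim_timesteps("uniform", 50, 1000, verbose=False)
# returns the 50 steps [1, 21, 41, ..., 961, 981] (listing abbreviated here).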
def noise_like(shape, device, repeat=False):
repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(
shape[0], *((1,) * (len(shape) - 1))
)
noise = lambda: torch.randn(shape, device=device)
return repeat_noise() if repeat else noise()
def timestep_embedding(device, timesteps, dim, max_period=10000, repeat_only=False):
"""
Create sinusoidal timestep embeddings.
:param timesteps: a 1-D Tensor of N indices, one per batch element.
These may be fractional.
:param dim: the dimension of the output.
:param max_period: controls the minimum frequency of the embeddings.
:return: an [N x dim] Tensor of positional embeddings.
"""
half = dim // 2
freqs = torch.exp(
-math.log(max_period)
* torch.arange(start=0, end=half, dtype=torch.float32)
/ half
).to(device=device)
args = timesteps[:, None].float() * freqs[None]
embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
if dim % 2:
embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
return embedding
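# Illustrative example: timestep_embedding("cpu", torch.tensor([0, 10]), 128)
# returns a [2, 128] tensor; columns 0-63 hold the cosine terms and columns
# 64-127 the sine terms of the sinusoidal embedding.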
###### MAT and FcF #######
def normalize_2nd_moment(x, dim=1):
return (
x * (x.square().mean(dim=dim, keepdim=True) + torch.finfo(x.dtype).eps).rsqrt()
)
class EasyDict(dict):
"""Convenience class that behaves like a dict but allows access with the attribute syntax."""
def __getattr__(self, name: str) -> Any:
try:
return self[name]
except KeyError:
raise AttributeError(name)
def __setattr__(self, name: str, value: Any) -> None:
self[name] = value
def __delattr__(self, name: str) -> None:
del self[name]
def _bias_act_ref(x, b=None, dim=1, act="linear", alpha=None, gain=None, clamp=None):
"""Slow reference implementation of `bias_act()` using standard TensorFlow ops."""
assert isinstance(x, torch.Tensor)
assert clamp is None or clamp >= 0
spec = activation_funcs[act]
alpha = float(alpha if alpha is not None else spec.def_alpha)
gain = float(gain if gain is not None else spec.def_gain)
clamp = float(clamp if clamp is not None else -1)
# Add bias.
if b is not None:
assert isinstance(b, torch.Tensor) and b.ndim == 1
assert 0 <= dim < x.ndim
assert b.shape[0] == x.shape[dim]
x = x + b.reshape([-1 if i == dim else 1 for i in range(x.ndim)])
# Evaluate activation function.
alpha = float(alpha)
x = spec.func(x, alpha=alpha)
# Scale by gain.
gain = float(gain)
if gain != 1:
x = x * gain
# Clamp.
if clamp >= 0:
x = x.clamp(-clamp, clamp) # pylint: disable=invalid-unary-operand-type
return x
def bias_act(
x, b=None, dim=1, act="linear", alpha=None, gain=None, clamp=None, impl="ref"
):
r"""Fused bias and activation function.
Adds bias `b` to activation tensor `x`, evaluates activation function `act`,
and scales the result by `gain`. Each of the steps is optional. In most cases,
the fused op is considerably more efficient than performing the same calculation
using standard PyTorch ops. It supports first and second order gradients,
but not third order gradients.
Args:
x: Input activation tensor. Can be of any shape.
b: Bias vector, or `None` to disable. Must be a 1D tensor of the same type
as `x`. The shape must be known, and it must match the dimension of `x`
corresponding to `dim`.
dim: The dimension in `x` corresponding to the elements of `b`.
The value of `dim` is ignored if `b` is not specified.
act: Name of the activation function to evaluate, or `"linear"` to disable.
Can be e.g. `"relu"`, `"lrelu"`, `"tanh"`, `"sigmoid"`, `"swish"`, etc.
See `activation_funcs` for a full list. `None` is not allowed.
alpha: Shape parameter for the activation function, or `None` to use the default.
gain: Scaling factor for the output tensor, or `None` to use default.
See `activation_funcs` for the default scaling of each activation function.
If unsure, consider specifying 1.
clamp: Clamp the output values to `[-clamp, +clamp]`, or `None` to disable
the clamping (default).
impl: Name of the implementation to use. Can be `"ref"` (default) or `"cuda"`.
Returns:
Tensor of the same shape and datatype as `x`.
"""
assert isinstance(x, torch.Tensor)
assert impl in ["ref", "cuda"]
return _bias_act_ref(
x=x, b=b, dim=dim, act=act, alpha=alpha, gain=gain, clamp=clamp
)
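# Illustrative sketch (not from the original file):
# >>> x = torch.randn(4, 8)
# >>> y = bias_act(x, b=torch.zeros(8), act="lrelu")
# adds the bias along dim=1, applies leaky_relu with the default alpha=0.2,
# then scales the result by the default lrelu gain sqrt(2).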
def _get_filter_size(f):
if f is None:
return 1, 1
assert isinstance(f, torch.Tensor) and f.ndim in [1, 2]
fw = f.shape[-1]
fh = f.shape[0]
fw = int(fw)
fh = int(fh)
assert fw >= 1 and fh >= 1
return fw, fh
def _get_weight_shape(w):
shape = [int(sz) for sz in w.shape]
return shape
def _parse_scaling(scaling):
if isinstance(scaling, int):
scaling = [scaling, scaling]
assert isinstance(scaling, (list, tuple))
assert all(isinstance(x, int) for x in scaling)
sx, sy = scaling
assert sx >= 1 and sy >= 1
return sx, sy
def _parse_padding(padding):
if isinstance(padding, int):
padding = [padding, padding]
assert isinstance(padding, (list, tuple))
assert all(isinstance(x, int) for x in padding)
if len(padding) == 2:
padx, pady = padding
padding = [padx, padx, pady, pady]
padx0, padx1, pady0, pady1 = padding
return padx0, padx1, pady0, pady1
def setup_filter(
f,
device=torch.device("cpu"),
normalize=True,
flip_filter=False,
gain=1,
separable=None,
):
r"""Convenience function to setup 2D FIR filter for `upfirdn2d()`.
Args:
f: Torch tensor, numpy array, or python list of the shape
`[filter_height, filter_width]` (non-separable),
`[filter_taps]` (separable),
`[]` (impulse), or
`None` (identity).
device: Result device (default: cpu).
normalize: Normalize the filter so that it retains the magnitude
for constant input signal (DC)? (default: True).
flip_filter: Flip the filter? (default: False).
gain: Overall scaling factor for signal magnitude (default: 1).
separable: Return a separable filter? (default: select automatically).
Returns:
Float32 tensor of the shape
`[filter_height, filter_width]` (non-separable) or
`[filter_taps]` (separable).
"""
# Validate.
if f is None:
f = 1
f = torch.as_tensor(f, dtype=torch.float32)
assert f.ndim in [0, 1, 2]
assert f.numel() > 0
if f.ndim == 0:
f = f[np.newaxis]
# Separable?
if separable is None:
separable = f.ndim == 1 and f.numel() >= 8
if f.ndim == 1 and not separable:
f = f.ger(f)
assert f.ndim == (1 if separable else 2)
# Apply normalize, flip, gain, and device.
if normalize:
f /= f.sum()
if flip_filter:
f = f.flip(list(range(f.ndim)))
f = f * (gain ** (f.ndim / 2))
f = f.to(device=device)
return f
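# Illustrative example: the default [1, 3, 3, 1] tap list (see Conv2dLayer)
# is too short to be treated as separable, so it is expanded via an outer
# product and normalized to unit DC gain:
# >>> f = setup_filter([1, 3, 3, 1])
# >>> f.shape # torch.Size([4, 4]); f.sum() is 1.0 up to float rounding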
def _ntuple(n):
def parse(x):
if isinstance(x, collections.abc.Iterable):
return x
return tuple(repeat(x, n))
return parse
to_2tuple = _ntuple(2)
activation_funcs = {
"linear": EasyDict(
func=lambda x, **_: x,
def_alpha=0,
def_gain=1,
cuda_idx=1,
ref="",
has_2nd_grad=False,
),
"relu": EasyDict(
func=lambda x, **_: torch.nn.functional.relu(x),
def_alpha=0,
def_gain=np.sqrt(2),
cuda_idx=2,
ref="y",
has_2nd_grad=False,
),
"lrelu": EasyDict(
func=lambda x, alpha, **_: torch.nn.functional.leaky_relu(x, alpha),
def_alpha=0.2,
def_gain=np.sqrt(2),
cuda_idx=3,
ref="y",
has_2nd_grad=False,
),
"tanh": EasyDict(
func=lambda x, **_: torch.tanh(x),
def_alpha=0,
def_gain=1,
cuda_idx=4,
ref="y",
has_2nd_grad=True,
),
"sigmoid": EasyDict(
func=lambda x, **_: torch.sigmoid(x),
def_alpha=0,
def_gain=1,
cuda_idx=5,
ref="y",
has_2nd_grad=True,
),
"elu": EasyDict(
func=lambda x, **_: torch.nn.functional.elu(x),
def_alpha=0,
def_gain=1,
cuda_idx=6,
ref="y",
has_2nd_grad=True,
),
"selu": EasyDict(
func=lambda x, **_: torch.nn.functional.selu(x),
def_alpha=0,
def_gain=1,
cuda_idx=7,
ref="y",
has_2nd_grad=True,
),
"softplus": EasyDict(
func=lambda x, **_: torch.nn.functional.softplus(x),
def_alpha=0,
def_gain=1,
cuda_idx=8,
ref="y",
has_2nd_grad=True,
),
"swish": EasyDict(
func=lambda x, **_: torch.sigmoid(x) * x,
def_alpha=0,
def_gain=np.sqrt(2),
cuda_idx=9,
ref="x",
has_2nd_grad=True,
),
}
def upfirdn2d(x, f, up=1, down=1, padding=0, flip_filter=False, gain=1, impl="cuda"):
r"""Pad, upsample, filter, and downsample a batch of 2D images.
Performs the following sequence of operations for each channel:
1. Upsample the image by inserting N-1 zeros after each pixel (`up`).
2. Pad the image with the specified number of zeros on each side (`padding`).
Negative padding corresponds to cropping the image.
3. Convolve the image with the specified 2D FIR filter (`f`), shrinking it
so that the footprint of all output pixels lies within the input image.
4. Downsample the image by keeping every Nth pixel (`down`).
This sequence of operations bears close resemblance to scipy.signal.upfirdn().
The fused op is considerably more efficient than performing the same calculation
using standard PyTorch ops. It supports gradients of arbitrary order.
Args:
x: Float32/float64/float16 input tensor of the shape
`[batch_size, num_channels, in_height, in_width]`.
f: Float32 FIR filter of the shape
`[filter_height, filter_width]` (non-separable),
`[filter_taps]` (separable), or
`None` (identity).
up: Integer upsampling factor. Can be a single int or a list/tuple
`[x, y]` (default: 1).
down: Integer downsampling factor. Can be a single int or a list/tuple
`[x, y]` (default: 1).
padding: Padding with respect to the upsampled image. Can be a single number
or a list/tuple `[x, y]` or `[x_before, x_after, y_before, y_after]`
(default: 0).
flip_filter: False = convolution, True = correlation (default: False).
gain: Overall scaling factor for signal magnitude (default: 1).
impl: Implementation to use. Can be `'ref'` or `'cuda'` (default: `'cuda'`).
Returns:
Tensor of the shape `[batch_size, num_channels, out_height, out_width]`.
"""
# assert isinstance(x, torch.Tensor)
# assert impl in ['ref', 'cuda']
return _upfirdn2d_ref(
x, f, up=up, down=down, padding=padding, flip_filter=flip_filter, gain=gain
)
def _upfirdn2d_ref(x, f, up=1, down=1, padding=0, flip_filter=False, gain=1):
"""Slow reference implementation of `upfirdn2d()` using standard PyTorch ops."""
# Validate arguments.
assert isinstance(x, torch.Tensor) and x.ndim == 4
if f is None:
f = torch.ones([1, 1], dtype=torch.float32, device=x.device)
assert isinstance(f, torch.Tensor) and f.ndim in [1, 2]
assert not f.requires_grad
batch_size, num_channels, in_height, in_width = x.shape
# upx, upy = _parse_scaling(up)
# downx, downy = _parse_scaling(down)
upx, upy = up, up
downx, downy = down, down
# padx0, padx1, pady0, pady1 = _parse_padding(padding)
padx0, padx1, pady0, pady1 = padding[0], padding[1], padding[2], padding[3]
# Upsample by inserting zeros.
x = x.reshape([batch_size, num_channels, in_height, 1, in_width, 1])
x = torch.nn.functional.pad(x, [0, upx - 1, 0, 0, 0, upy - 1])
x = x.reshape([batch_size, num_channels, in_height * upy, in_width * upx])
# Pad or crop.
x = torch.nn.functional.pad(
x, [max(padx0, 0), max(padx1, 0), max(pady0, 0), max(pady1, 0)]
)
x = x[
:,
:,
max(-pady0, 0) : x.shape[2] - max(-pady1, 0),
max(-padx0, 0) : x.shape[3] - max(-padx1, 0),
]
# Setup filter.
f = f * (gain ** (f.ndim / 2))
f = f.to(x.dtype)
if not flip_filter:
f = f.flip(list(range(f.ndim)))
# Convolve with the filter.
f = f[np.newaxis, np.newaxis].repeat([num_channels, 1] + [1] * f.ndim)
if f.ndim == 4:
x = conv2d(input=x, weight=f, groups=num_channels)
else:
x = conv2d(input=x, weight=f.unsqueeze(2), groups=num_channels)
x = conv2d(input=x, weight=f.unsqueeze(3), groups=num_channels)
# Downsample by throwing away pixels.
x = x[:, :, ::downy, ::downx]
return x
def downsample2d(x, f, down=2, padding=0, flip_filter=False, gain=1, impl="cuda"):
r"""Downsample a batch of 2D images using the given 2D FIR filter.
By default, the result is padded so that its shape is a fraction of the input.
User-specified padding is applied on top of that, with negative values
indicating cropping. Pixels outside the image are assumed to be zero.
Args:
x: Float32/float64/float16 input tensor of the shape
`[batch_size, num_channels, in_height, in_width]`.
f: Float32 FIR filter of the shape
`[filter_height, filter_width]` (non-separable),
`[filter_taps]` (separable), or
`None` (identity).
down: Integer downsampling factor. Can be a single int or a list/tuple
`[x, y]` (default: 1).
padding: Padding with respect to the input. Can be a single number or a
list/tuple `[x, y]` or `[x_before, x_after, y_before, y_after]`
(default: 0).
flip_filter: False = convolution, True = correlation (default: False).
gain: Overall scaling factor for signal magnitude (default: 1).
impl: Implementation to use. Can be `'ref'` or `'cuda'` (default: `'cuda'`).
Returns:
Tensor of the shape `[batch_size, num_channels, out_height, out_width]`.
"""
downx, downy = _parse_scaling(down)
# padx0, padx1, pady0, pady1 = _parse_padding(padding)
padx0, padx1, pady0, pady1 = padding, padding, padding, padding
fw, fh = _get_filter_size(f)
p = [
padx0 + (fw - downx + 1) // 2,
padx1 + (fw - downx) // 2,
pady0 + (fh - downy + 1) // 2,
pady1 + (fh - downy) // 2,
]
return upfirdn2d(
x, f, down=down, padding=p, flip_filter=flip_filter, gain=gain, impl=impl
)
def upsample2d(x, f, up=2, padding=0, flip_filter=False, gain=1, impl="cuda"):
r"""Upsample a batch of 2D images using the given 2D FIR filter.
By default, the result is padded so that its shape is a multiple of the input.
User-specified padding is applied on top of that, with negative values
indicating cropping. Pixels outside the image are assumed to be zero.
Args:
x: Float32/float64/float16 input tensor of the shape
`[batch_size, num_channels, in_height, in_width]`.
f: Float32 FIR filter of the shape
`[filter_height, filter_width]` (non-separable),
`[filter_taps]` (separable), or
`None` (identity).
up: Integer upsampling factor. Can be a single int or a list/tuple
`[x, y]` (default: 1).
padding: Padding with respect to the output. Can be a single number or a
list/tuple `[x, y]` or `[x_before, x_after, y_before, y_after]`
(default: 0).
flip_filter: False = convolution, True = correlation (default: False).
gain: Overall scaling factor for signal magnitude (default: 1).
impl: Implementation to use. Can be `'ref'` or `'cuda'` (default: `'cuda'`).
Returns:
Tensor of the shape `[batch_size, num_channels, out_height, out_width]`.
"""
upx, upy = _parse_scaling(up)
# upx, upy = up, up
padx0, padx1, pady0, pady1 = _parse_padding(padding)
# padx0, padx1, pady0, pady1 = padding, padding, padding, padding
fw, fh = _get_filter_size(f)
p = [
padx0 + (fw + upx - 1) // 2,
padx1 + (fw - upx) // 2,
pady0 + (fh + upy - 1) // 2,
pady1 + (fh - upy) // 2,
]
return upfirdn2d(
x,
f,
up=up,
padding=p,
flip_filter=flip_filter,
gain=gain * upx * upy,
impl=impl,
)
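# Illustrative shape example: with the default 4-tap filter and up=2 the
# padding works out to [2, 1, 2, 1], so a [1, 3, 16, 16] input becomes
# [1, 3, 32, 32]:
# >>> upsample2d(torch.randn(1, 3, 16, 16), setup_filter([1, 3, 3, 1])).shape
# torch.Size([1, 3, 32, 32])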
class MinibatchStdLayer(torch.nn.Module):
def __init__(self, group_size, num_channels=1):
super().__init__()
self.group_size = group_size
self.num_channels = num_channels
def forward(self, x):
N, C, H, W = x.shape
G = (
torch.min(torch.as_tensor(self.group_size), torch.as_tensor(N))
if self.group_size is not None
else N
)
F = self.num_channels
c = C // F
y = x.reshape(
G, -1, F, c, H, W
) # [GnFcHW] Split minibatch N into n groups of size G, and channels C into F groups of size c.
y = y - y.mean(dim=0) # [GnFcHW] Subtract mean over group.
y = y.square().mean(dim=0) # [nFcHW] Calc variance over group.
y = (y + 1e-8).sqrt() # [nFcHW] Calc stddev over group.
y = y.mean(dim=[2, 3, 4]) # [nF] Take average over channels and pixels.
y = y.reshape(-1, F, 1, 1) # [nF11] Add missing dimensions.
y = y.repeat(G, 1, H, W) # [NFHW] Replicate over group and pixels.
x = torch.cat([x, y], dim=1) # [NCHW] Append to input as new channels.
return x
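# Illustrative shape note: with num_channels=1 the layer appends a single
# stddev feature map, e.g. an [8, 64, 32, 32] input becomes [8, 65, 32, 32].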
class FullyConnectedLayer(torch.nn.Module):
def __init__(
self,
in_features, # Number of input features.
out_features, # Number of output features.
bias=True, # Apply additive bias before the activation function?
activation="linear", # Activation function: 'relu', 'lrelu', etc.
lr_multiplier=1, # Learning rate multiplier.
bias_init=0, # Initial value for the additive bias.
):
super().__init__()
self.weight = torch.nn.Parameter(
torch.randn([out_features, in_features]) / lr_multiplier
)
self.bias = (
torch.nn.Parameter(torch.full([out_features], np.float32(bias_init)))
if bias
else None
)
self.activation = activation
self.weight_gain = lr_multiplier / np.sqrt(in_features)
self.bias_gain = lr_multiplier
def forward(self, x):
w = self.weight * self.weight_gain
b = self.bias
if b is not None and self.bias_gain != 1:
b = b * self.bias_gain
if self.activation == "linear" and b is not None:
# out = torch.addmm(b.unsqueeze(0), x, w.t())
x = x.matmul(w.t())
out = x + b.reshape([-1 if i == x.ndim - 1 else 1 for i in range(x.ndim)])
else:
x = x.matmul(w.t())
out = bias_act(x, b, act=self.activation, dim=x.ndim - 1)
return out
def _conv2d_wrapper(
x, w, stride=1, padding=0, groups=1, transpose=False, flip_weight=True
):
"""Wrapper for the underlying `conv2d()` and `conv_transpose2d()` implementations."""
out_channels, in_channels_per_group, kh, kw = _get_weight_shape(w)
# Flip weight if requested.
if (
not flip_weight
): # conv2d() actually performs correlation (flip_weight=True) not convolution (flip_weight=False).
w = w.flip([2, 3])
# Workaround performance pitfall in cuDNN 8.0.5, triggered when using
# 1x1 kernel + memory_format=channels_last + less than 64 channels.
if (
kw == 1
and kh == 1
and stride == 1
and padding in [0, [0, 0], (0, 0)]
and not transpose
):
if x.stride()[1] == 1 and min(out_channels, in_channels_per_group) < 64:
if out_channels <= 4 and groups == 1:
in_shape = x.shape
x = w.squeeze(3).squeeze(2) @ x.reshape(
[in_shape[0], in_channels_per_group, -1]
)
x = x.reshape([in_shape[0], out_channels, in_shape[2], in_shape[3]])
else:
x = x.to(memory_format=torch.contiguous_format)
w = w.to(memory_format=torch.contiguous_format)
x = conv2d(x, w, groups=groups)
return x.to(memory_format=torch.channels_last)
# Otherwise => execute using conv2d_gradfix.
op = conv_transpose2d if transpose else conv2d
return op(x, w, stride=stride, padding=padding, groups=groups)
def conv2d_resample(
x, w, f=None, up=1, down=1, padding=0, groups=1, flip_weight=True, flip_filter=False
):
r"""2D convolution with optional up/downsampling.
Padding is performed only once at the beginning, not between the operations.
Args:
x: Input tensor of shape
`[batch_size, in_channels, in_height, in_width]`.
w: Weight tensor of shape
`[out_channels, in_channels//groups, kernel_height, kernel_width]`.
f: Low-pass filter for up/downsampling. Must be prepared beforehand by
calling setup_filter(). None = identity (default).
up: Integer upsampling factor (default: 1).
down: Integer downsampling factor (default: 1).
padding: Padding with respect to the upsampled image. Can be a single number
or a list/tuple `[x, y]` or `[x_before, x_after, y_before, y_after]`
(default: 0).
groups: Split input channels into N groups (default: 1).
flip_weight: False = convolution, True = correlation (default: True).
flip_filter: False = convolution, True = correlation (default: False).
Returns:
Tensor of the shape `[batch_size, num_channels, out_height, out_width]`.
"""
# Validate arguments.
assert isinstance(x, torch.Tensor) and (x.ndim == 4)
assert isinstance(w, torch.Tensor) and (w.ndim == 4) and (w.dtype == x.dtype)
assert f is None or (isinstance(f, torch.Tensor) and f.ndim in [1, 2])
assert isinstance(up, int) and (up >= 1)
assert isinstance(down, int) and (down >= 1)
# assert isinstance(groups, int) and (groups >= 1), f"!!!!!! groups: {groups} isinstance(groups, int) {isinstance(groups, int)} {type(groups)}"
out_channels, in_channels_per_group, kh, kw = _get_weight_shape(w)
fw, fh = _get_filter_size(f)
# px0, px1, py0, py1 = _parse_padding(padding)
px0, px1, py0, py1 = padding, padding, padding, padding
# Adjust padding to account for up/downsampling.
if up > 1:
px0 += (fw + up - 1) // 2
px1 += (fw - up) // 2
py0 += (fh + up - 1) // 2
py1 += (fh - up) // 2
if down > 1:
px0 += (fw - down + 1) // 2
px1 += (fw - down) // 2
py0 += (fh - down + 1) // 2
py1 += (fh - down) // 2
# Fast path: 1x1 convolution with downsampling only => downsample first, then convolve.
if kw == 1 and kh == 1 and (down > 1 and up == 1):
x = upfirdn2d(
x=x, f=f, down=down, padding=[px0, px1, py0, py1], flip_filter=flip_filter
)
x = _conv2d_wrapper(x=x, w=w, groups=groups, flip_weight=flip_weight)
return x
# Fast path: 1x1 convolution with upsampling only => convolve first, then upsample.
if kw == 1 and kh == 1 and (up > 1 and down == 1):
x = _conv2d_wrapper(x=x, w=w, groups=groups, flip_weight=flip_weight)
x = upfirdn2d(
x=x,
f=f,
up=up,
padding=[px0, px1, py0, py1],
gain=up**2,
flip_filter=flip_filter,
)
return x
# Fast path: downsampling only => use strided convolution.
if down > 1 and up == 1:
x = upfirdn2d(x=x, f=f, padding=[px0, px1, py0, py1], flip_filter=flip_filter)
x = _conv2d_wrapper(
x=x, w=w, stride=down, groups=groups, flip_weight=flip_weight
)
return x
# Fast path: upsampling with optional downsampling => use transpose strided convolution.
if up > 1:
if groups == 1:
w = w.transpose(0, 1)
else:
w = w.reshape(groups, out_channels // groups, in_channels_per_group, kh, kw)
w = w.transpose(1, 2)
w = w.reshape(
groups * in_channels_per_group, out_channels // groups, kh, kw
)
px0 -= kw - 1
px1 -= kw - up
py0 -= kh - 1
py1 -= kh - up
pxt = max(min(-px0, -px1), 0)
pyt = max(min(-py0, -py1), 0)
x = _conv2d_wrapper(
x=x,
w=w,
stride=up,
padding=[pyt, pxt],
groups=groups,
transpose=True,
flip_weight=(not flip_weight),
)
x = upfirdn2d(
x=x,
f=f,
padding=[px0 + pxt, px1 + pxt, py0 + pyt, py1 + pyt],
gain=up**2,
flip_filter=flip_filter,
)
if down > 1:
x = upfirdn2d(x=x, f=f, down=down, flip_filter=flip_filter)
return x
# Fast path: no up/downsampling, padding supported by the underlying implementation => use plain conv2d.
if up == 1 and down == 1:
if px0 == px1 and py0 == py1 and px0 >= 0 and py0 >= 0:
return _conv2d_wrapper(
x=x, w=w, padding=[py0, px0], groups=groups, flip_weight=flip_weight
)
# Fallback: Generic reference implementation.
x = upfirdn2d(
x=x,
f=(f if up > 1 else None),
up=up,
padding=[px0, px1, py0, py1],
gain=up**2,
flip_filter=flip_filter,
)
x = _conv2d_wrapper(x=x, w=w, groups=groups, flip_weight=flip_weight)
if down > 1:
x = upfirdn2d(x=x, f=f, down=down, flip_filter=flip_filter)
return x
class Conv2dLayer(torch.nn.Module):
def __init__(
self,
in_channels, # Number of input channels.
out_channels, # Number of output channels.
kernel_size, # Width and height of the convolution kernel.
bias=True, # Apply additive bias before the activation function?
activation="linear", # Activation function: 'relu', 'lrelu', etc.
up=1, # Integer upsampling factor.
down=1, # Integer downsampling factor.
resample_filter=[
1,
3,
3,
1,
], # Low-pass filter to apply when resampling activations.
conv_clamp=None, # Clamp the output to +-X, None = disable clamping.
channels_last=False, # Expect the input to have memory_format=channels_last?
trainable=True, # Update the weights of this layer during training?
):
super().__init__()
self.activation = activation
self.up = up
self.down = down
self.register_buffer("resample_filter", setup_filter(resample_filter))
self.conv_clamp = conv_clamp
self.padding = kernel_size // 2
self.weight_gain = 1 / np.sqrt(in_channels * (kernel_size**2))
self.act_gain = activation_funcs[activation].def_gain
memory_format = (
torch.channels_last if channels_last else torch.contiguous_format
)
weight = torch.randn([out_channels, in_channels, kernel_size, kernel_size]).to(
memory_format=memory_format
)
bias = torch.zeros([out_channels]) if bias else None
if trainable:
self.weight = torch.nn.Parameter(weight)
self.bias = torch.nn.Parameter(bias) if bias is not None else None
else:
self.register_buffer("weight", weight)
if bias is not None:
self.register_buffer("bias", bias)
else:
self.bias = None
def forward(self, x, gain=1):
w = self.weight * self.weight_gain
x = conv2d_resample(
x=x,
w=w,
f=self.resample_filter,
up=self.up,
down=self.down,
padding=self.padding,
)
act_gain = self.act_gain * gain
act_clamp = self.conv_clamp * gain if self.conv_clamp is not None else None
out = bias_act(
x, self.bias, act=self.activation, gain=act_gain, clamp=act_clamp
)
return out
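# Hedged example (not part of the original file): Conv2dLayer ties together
# the weight scaling, resampling, and bias_act activation. A 2x-downsampling
# layer, with shapes chosen purely for illustration:
#   layer = Conv2dLayer(in_channels=64, out_channels=128, kernel_size=3,
#                       activation="lrelu", down=2)
#   y = layer(torch.randn(1, 64, 256, 256))  # y: [1, 128, 128, 128]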
def torch_gc():
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
gc.collect()
def set_seed(seed: int):
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
def get_scheduler(sd_sampler, scheduler_config):
    # Drop config keys that some schedulers reject when re-instantiated,
    # see https://github.com/huggingface/diffusers/issues/4167
    keys_to_pop = ["use_karras_sigmas", "algorithm_type"]
    scheduler_config = dict(scheduler_config)
    for it in keys_to_pop:
        scheduler_config.pop(it, None)
# fmt: off
samplers = {
SDSampler.dpm_plus_plus_2m: [DPMSolverMultistepScheduler],
SDSampler.dpm_plus_plus_2m_karras: [DPMSolverMultistepScheduler, dict(use_karras_sigmas=True)],
SDSampler.dpm_plus_plus_2m_sde: [DPMSolverMultistepScheduler, dict(algorithm_type="sde-dpmsolver++")],
SDSampler.dpm_plus_plus_2m_sde_karras: [DPMSolverMultistepScheduler, dict(algorithm_type="sde-dpmsolver++", use_karras_sigmas=True)],
SDSampler.dpm_plus_plus_sde: [DPMSolverSinglestepScheduler],
SDSampler.dpm_plus_plus_sde_karras: [DPMSolverSinglestepScheduler, dict(use_karras_sigmas=True)],
SDSampler.dpm2: [KDPM2DiscreteScheduler],
SDSampler.dpm2_karras: [KDPM2DiscreteScheduler, dict(use_karras_sigmas=True)],
SDSampler.dpm2_a: [KDPM2AncestralDiscreteScheduler],
SDSampler.dpm2_a_karras: [KDPM2AncestralDiscreteScheduler, dict(use_karras_sigmas=True)],
SDSampler.euler: [EulerDiscreteScheduler],
SDSampler.euler_a: [EulerAncestralDiscreteScheduler],
SDSampler.heun: [HeunDiscreteScheduler],
SDSampler.lms: [LMSDiscreteScheduler],
SDSampler.lms_karras: [LMSDiscreteScheduler, dict(use_karras_sigmas=True)],
SDSampler.ddim: [DDIMScheduler],
SDSampler.pndm: [PNDMScheduler],
SDSampler.uni_pc: [UniPCMultistepScheduler],
SDSampler.lcm: [LCMScheduler],
}
# fmt: on
if sd_sampler in samplers:
if len(samplers[sd_sampler]) == 2:
scheduler_cls, kwargs = samplers[sd_sampler]
else:
scheduler_cls, kwargs = samplers[sd_sampler][0], {}
return scheduler_cls.from_config(scheduler_config, **kwargs)
else:
raise ValueError(sd_sampler)
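# Hedged usage sketch (not part of the original file): swap a diffusers
# pipeline's scheduler in place; `pipe` is an assumed pipeline variable.
#   pipe.scheduler = get_scheduler(
#       SDSampler.dpm_plus_plus_2m_karras, pipe.scheduler.config
#   )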
def handle_from_pretrained_exceptions(func, **kwargs):
    try:
        return func(**kwargs)
    except ValueError as e:
        if "You are trying to load the model files of the `variant=fp16`" in str(e):
            logger.info("variant=fp16 not found, try revision=fp16")
            return func(**{**kwargs, "variant": None, "revision": "fp16"})
        # Re-raise anything we cannot recover from instead of returning None.
        raise e
    except OSError as e:
        previous_traceback = traceback.format_exc()
        if "RevisionNotFoundError: 404 Client Error." in previous_traceback:
            logger.info("revision=fp16 not found, try revision=main")
            return func(**{**kwargs, "variant": None, "revision": "main"})
        raise e
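# Hedged usage sketch (not part of the original file): wrap a diffusers
# from_pretrained call so repos without an fp16 variant degrade gracefully.
# The model id below is illustrative only:
#   pipe = handle_from_pretrained_exceptions(
#       StableDiffusionInpaintPipeline.from_pretrained,
#       pretrained_model_name_or_path="runwayml/stable-diffusion-inpainting",
#       variant="fp16",
#       torch_dtype=torch.float16,
#   )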

476
iopaint/model/zits.py Normal file
@@ -0,0 +1,476 @@
import os
import time
import cv2
import torch
import torch.nn.functional as F
from iopaint.helper import get_cache_path_by_url, load_jit_model, download_model
from iopaint.schema import InpaintRequest
import numpy as np
from iopaint.model.base import InpaintModel
ZITS_INPAINT_MODEL_URL = os.environ.get(
"ZITS_INPAINT_MODEL_URL",
"https://github.com/Sanster/models/releases/download/add_zits/zits-inpaint-0717.pt",
)
ZITS_INPAINT_MODEL_MD5 = os.environ.get(
"ZITS_INPAINT_MODEL_MD5", "9978cc7157dc29699e42308d675b2154"
)
ZITS_EDGE_LINE_MODEL_URL = os.environ.get(
"ZITS_EDGE_LINE_MODEL_URL",
"https://github.com/Sanster/models/releases/download/add_zits/zits-edge-line-0717.pt",
)
ZITS_EDGE_LINE_MODEL_MD5 = os.environ.get(
"ZITS_EDGE_LINE_MODEL_MD5", "55e31af21ba96bbf0c80603c76ea8c5f"
)
ZITS_STRUCTURE_UPSAMPLE_MODEL_URL = os.environ.get(
"ZITS_STRUCTURE_UPSAMPLE_MODEL_URL",
"https://github.com/Sanster/models/releases/download/add_zits/zits-structure-upsample-0717.pt",
)
ZITS_STRUCTURE_UPSAMPLE_MODEL_MD5 = os.environ.get(
"ZITS_STRUCTURE_UPSAMPLE_MODEL_MD5", "3d88a07211bd41b2ec8cc0d999f29927"
)
ZITS_WIRE_FRAME_MODEL_URL = os.environ.get(
"ZITS_WIRE_FRAME_MODEL_URL",
"https://github.com/Sanster/models/releases/download/add_zits/zits-wireframe-0717.pt",
)
ZITS_WIRE_FRAME_MODEL_MD5 = os.environ.get(
"ZITS_WIRE_FRAME_MODEL_MD5", "a9727c63a8b48b65c905d351b21ce46b"
)
def resize(img, height, width, center_crop=False):
imgh, imgw = img.shape[0:2]
if center_crop and imgh != imgw:
# center crop
side = np.minimum(imgh, imgw)
j = (imgh - side) // 2
i = (imgw - side) // 2
img = img[j : j + side, i : i + side, ...]
if imgh > height and imgw > width:
inter = cv2.INTER_AREA
else:
inter = cv2.INTER_LINEAR
    # cv2.resize expects (width, height); every call site in this file passes a
    # square size, so this fix does not change existing behavior.
    img = cv2.resize(img, (width, height), interpolation=inter)
return img
def to_tensor(img, scale=True, norm=False):
if img.ndim == 2:
img = img[:, :, np.newaxis]
c = img.shape[-1]
if scale:
img_t = torch.from_numpy(img).permute(2, 0, 1).float().div(255)
else:
img_t = torch.from_numpy(img).permute(2, 0, 1).float()
    if norm:
        # Note: normalization assumes a 3-channel image (c == 3).
        mean = torch.tensor([0.5, 0.5, 0.5]).reshape(c, 1, 1)
        std = torch.tensor([0.5, 0.5, 0.5]).reshape(c, 1, 1)
        img_t = (img_t - mean) / std
return img_t
def load_masked_position_encoding(mask):
ones_filter = np.ones((3, 3), dtype=np.float32)
d_filter1 = np.array([[1, 1, 0], [1, 1, 0], [0, 0, 0]], dtype=np.float32)
d_filter2 = np.array([[0, 0, 0], [1, 1, 0], [1, 1, 0]], dtype=np.float32)
d_filter3 = np.array([[0, 1, 1], [0, 1, 1], [0, 0, 0]], dtype=np.float32)
d_filter4 = np.array([[0, 0, 0], [0, 1, 1], [0, 1, 1]], dtype=np.float32)
str_size = 256
pos_num = 128
ori_mask = mask.copy()
ori_h, ori_w = ori_mask.shape[0:2]
ori_mask = ori_mask / 255
mask = cv2.resize(mask, (str_size, str_size), interpolation=cv2.INTER_AREA)
mask[mask > 0] = 255
h, w = mask.shape[0:2]
mask3 = mask.copy()
mask3 = 1.0 - (mask3 / 255.0)
pos = np.zeros((h, w), dtype=np.int32)
direct = np.zeros((h, w, 4), dtype=np.int32)
i = 0
while np.sum(1 - mask3) > 0:
i += 1
mask3_ = cv2.filter2D(mask3, -1, ones_filter)
mask3_[mask3_ > 0] = 1
sub_mask = mask3_ - mask3
pos[sub_mask == 1] = i
m = cv2.filter2D(mask3, -1, d_filter1)
m[m > 0] = 1
m = m - mask3
direct[m == 1, 0] = 1
m = cv2.filter2D(mask3, -1, d_filter2)
m[m > 0] = 1
m = m - mask3
direct[m == 1, 1] = 1
m = cv2.filter2D(mask3, -1, d_filter3)
m[m > 0] = 1
m = m - mask3
direct[m == 1, 2] = 1
m = cv2.filter2D(mask3, -1, d_filter4)
m[m > 0] = 1
m = m - mask3
direct[m == 1, 3] = 1
mask3 = mask3_
abs_pos = pos.copy()
    rel_pos = pos / (str_size / 2)  # normalize to roughly 0~1; may exceed 1
rel_pos = (rel_pos * pos_num).astype(np.int32)
rel_pos = np.clip(rel_pos, 0, pos_num - 1)
if ori_w != w or ori_h != h:
rel_pos = cv2.resize(rel_pos, (ori_w, ori_h), interpolation=cv2.INTER_NEAREST)
rel_pos[ori_mask == 0] = 0
direct = cv2.resize(direct, (ori_w, ori_h), interpolation=cv2.INTER_NEAREST)
direct[ori_mask == 0, :] = 0
return rel_pos, abs_pos, direct
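# Illustrative check (not part of the original file): `pos` grows outward from
# the unmasked border one dilation ring per iteration, so rel_pos/abs_pos encode
# distance into the hole, while `direct` records from which diagonal
# neighborhood a masked pixel is first reached. Shapes for a 256x256 mask:
#   mask = np.zeros((256, 256), dtype=np.uint8)
#   mask[64:192, 64:192] = 255
#   rel_pos, abs_pos, direct = load_masked_position_encoding(mask)
#   # rel_pos: (256, 256) int32 in [0, 127], direct: (256, 256, 4)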
def load_image(img, mask, device, sigma256=3.0):
"""
Args:
img: [H, W, C] RGB
mask: [H, W] 255 为 masks 区域
sigma256:
Returns:
"""
    imgh, imgw = img.shape[0:2]
img_256 = resize(img, 256, 256)
mask = (mask > 127).astype(np.uint8) * 255
mask_256 = cv2.resize(mask, (256, 256), interpolation=cv2.INTER_AREA)
mask_256[mask_256 > 0] = 255
mask_512 = cv2.resize(mask, (512, 512), interpolation=cv2.INTER_AREA)
mask_512[mask_512 > 0] = 255
    # Original scikit-image implementation:
# https://scikit-image.org/docs/stable/api/skimage.feature.html#skimage.feature.canny
    # low_threshold: Lower bound for hysteresis thresholding (linking edges). If None, low_threshold is set to 10% of the dtype's max.
    # high_threshold: Upper bound for hysteresis thresholding (linking edges). If None, high_threshold is set to 20% of the dtype's max.
try:
import skimage
gray_256 = skimage.color.rgb2gray(img_256)
edge_256 = skimage.feature.canny(gray_256, sigma=3.0, mask=None).astype(float)
# cv2.imwrite("skimage_gray.jpg", (gray_256*255).astype(np.uint8))
# cv2.imwrite("skimage_edge.jpg", (edge_256*255).astype(np.uint8))
    except Exception:
        # Fall back to OpenCV if scikit-image is unavailable or fails.
gray_256 = cv2.cvtColor(img_256, cv2.COLOR_RGB2GRAY)
gray_256_blured = cv2.GaussianBlur(
gray_256, ksize=(7, 7), sigmaX=sigma256, sigmaY=sigma256
)
edge_256 = cv2.Canny(
gray_256_blured, threshold1=int(255 * 0.1), threshold2=int(255 * 0.2)
)
# cv2.imwrite("opencv_edge.jpg", edge_256)
# line
img_512 = resize(img, 512, 512)
rel_pos, abs_pos, direct = load_masked_position_encoding(mask)
batch = dict()
batch["images"] = to_tensor(img.copy()).unsqueeze(0).to(device)
batch["img_256"] = to_tensor(img_256, norm=True).unsqueeze(0).to(device)
batch["masks"] = to_tensor(mask).unsqueeze(0).to(device)
batch["mask_256"] = to_tensor(mask_256).unsqueeze(0).to(device)
batch["mask_512"] = to_tensor(mask_512).unsqueeze(0).to(device)
batch["edge_256"] = to_tensor(edge_256, scale=False).unsqueeze(0).to(device)
batch["img_512"] = to_tensor(img_512).unsqueeze(0).to(device)
batch["rel_pos"] = torch.LongTensor(rel_pos).unsqueeze(0).to(device)
batch["abs_pos"] = torch.LongTensor(abs_pos).unsqueeze(0).to(device)
batch["direct"] = torch.LongTensor(direct).unsqueeze(0).to(device)
batch["h"] = imgh
batch["w"] = imgw
return batch
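# Hedged sketch (not part of the original file): load_image packs every input
# the ZITS sub-networks consume into one batch dict. Shapes are assumptions:
#   img = np.zeros((512, 512, 3), dtype=np.uint8)   # RGB
#   mask = np.zeros((512, 512), dtype=np.uint8)     # 255 = hole
#   batch = load_image(img, mask, device="cpu")
#   # batch["img_256"]: [1, 3, 256, 256], batch["rel_pos"]: [1, 512, 512]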
def to_device(data, device):
    if isinstance(data, torch.Tensor):
        return data.to(device)
    if isinstance(data, dict):
        for key in data:
            if isinstance(data[key], torch.Tensor):
                data[key] = data[key].to(device)
        return data
    if isinstance(data, list):
        return [to_device(d, device) for d in data]
    # Pass anything that is not a tensor container through unchanged.
    return data
class ZITS(InpaintModel):
name = "zits"
min_size = 256
pad_mod = 32
pad_to_square = True
is_erase_model = True
def __init__(self, device, **kwargs):
"""
Args:
device:
"""
super().__init__(device)
self.device = device
self.sample_edge_line_iterations = 1
def init_model(self, device, **kwargs):
self.wireframe = load_jit_model(
ZITS_WIRE_FRAME_MODEL_URL, device, ZITS_WIRE_FRAME_MODEL_MD5
)
self.edge_line = load_jit_model(
ZITS_EDGE_LINE_MODEL_URL, device, ZITS_EDGE_LINE_MODEL_MD5
)
self.structure_upsample = load_jit_model(
ZITS_STRUCTURE_UPSAMPLE_MODEL_URL, device, ZITS_STRUCTURE_UPSAMPLE_MODEL_MD5
)
self.inpaint = load_jit_model(
ZITS_INPAINT_MODEL_URL, device, ZITS_INPAINT_MODEL_MD5
)
@staticmethod
def download():
download_model(ZITS_WIRE_FRAME_MODEL_URL, ZITS_WIRE_FRAME_MODEL_MD5)
download_model(ZITS_EDGE_LINE_MODEL_URL, ZITS_EDGE_LINE_MODEL_MD5)
download_model(
ZITS_STRUCTURE_UPSAMPLE_MODEL_URL, ZITS_STRUCTURE_UPSAMPLE_MODEL_MD5
)
download_model(ZITS_INPAINT_MODEL_URL, ZITS_INPAINT_MODEL_MD5)
@staticmethod
def is_downloaded() -> bool:
model_paths = [
get_cache_path_by_url(ZITS_WIRE_FRAME_MODEL_URL),
get_cache_path_by_url(ZITS_EDGE_LINE_MODEL_URL),
get_cache_path_by_url(ZITS_STRUCTURE_UPSAMPLE_MODEL_URL),
get_cache_path_by_url(ZITS_INPAINT_MODEL_URL),
]
return all([os.path.exists(it) for it in model_paths])
def wireframe_edge_and_line(self, items, enable: bool):
        # As a side effect, adds the "edge" and "line" keys to `items`.
if not enable:
items["edge"] = torch.zeros_like(items["masks"])
items["line"] = torch.zeros_like(items["masks"])
return
start = time.time()
try:
line_256 = self.wireframe_forward(
items["img_512"],
h=256,
w=256,
masks=items["mask_512"],
mask_th=0.85,
)
        except Exception:
            # Wireframe inference is best-effort; fall back to an empty line map.
            line_256 = torch.zeros_like(items["mask_256"])
print(f"wireframe_forward time: {(time.time() - start) * 1000:.2f}ms")
# np_line = (line[0][0].numpy() * 255).astype(np.uint8)
# cv2.imwrite("line.jpg", np_line)
start = time.time()
edge_pred, line_pred = self.sample_edge_line_logits(
context=[items["img_256"], items["edge_256"], line_256],
mask=items["mask_256"].clone(),
iterations=self.sample_edge_line_iterations,
add_v=0.05,
mul_v=4,
)
print(f"sample_edge_line_logits time: {(time.time() - start) * 1000:.2f}ms")
# np_edge_pred = (edge_pred[0][0].numpy() * 255).astype(np.uint8)
# cv2.imwrite("edge_pred.jpg", np_edge_pred)
# np_line_pred = (line_pred[0][0].numpy() * 255).astype(np.uint8)
# cv2.imwrite("line_pred.jpg", np_line_pred)
# exit()
input_size = min(items["h"], items["w"])
        if input_size > 256:
while edge_pred.shape[2] < input_size:
edge_pred = self.structure_upsample(edge_pred)
edge_pred = torch.sigmoid((edge_pred + 2) * 2)
line_pred = self.structure_upsample(line_pred)
line_pred = torch.sigmoid((line_pred + 2) * 2)
edge_pred = F.interpolate(
edge_pred,
size=(input_size, input_size),
mode="bilinear",
align_corners=False,
)
line_pred = F.interpolate(
line_pred,
size=(input_size, input_size),
mode="bilinear",
align_corners=False,
)
# np_edge_pred = (edge_pred[0][0].numpy() * 255).astype(np.uint8)
# cv2.imwrite("edge_pred_upsample.jpg", np_edge_pred)
# np_line_pred = (line_pred[0][0].numpy() * 255).astype(np.uint8)
# cv2.imwrite("line_pred_upsample.jpg", np_line_pred)
# exit()
items["edge"] = edge_pred.detach()
items["line"] = line_pred.detach()
@torch.no_grad()
def forward(self, image, mask, config: InpaintRequest):
"""Input images and output images have same size
images: [H, W, C] RGB
        masks: [H, W, 1] 255 marks the masked region
return: BGR IMAGE
"""
mask = mask[:, :, 0]
items = load_image(image, mask, device=self.device)
self.wireframe_edge_and_line(items, config.zits_wireframe)
inpainted_image = self.inpaint(
items["images"],
items["masks"],
items["edge"],
items["line"],
items["rel_pos"],
items["direct"],
)
inpainted_image = inpainted_image * 255.0
inpainted_image = (
inpainted_image.cpu().permute(0, 2, 3, 1)[0].numpy().astype(np.uint8)
)
inpainted_image = inpainted_image[:, :, ::-1]
# cv2.imwrite("inpainted.jpg", inpainted_image)
# exit()
return inpainted_image
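    # Hedged usage sketch (not part of the original file): ZITS is driven via
    # the base-class __call__, which pads image/mask to pad_mod=32 (square) and
    # crops the result back. InpaintRequest defaults are assumed sufficient:
    #   model = ZITS("cpu")
    #   res_bgr = model(rgb_image, mask_h_w_1, InpaintRequest())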
    def wireframe_forward(self, images, h, w, masks, mask_th=0.925):
        # Keep the normalization constants on the same device as the input so
        # this also runs on CUDA/MPS, not only on CPU.
        lcnn_mean = torch.tensor([109.730, 103.832, 98.681]).reshape(1, 3, 1, 1).to(images.device)
        lcnn_std = torch.tensor([22.275, 22.124, 23.229]).reshape(1, 3, 1, 1).to(images.device)
        images = images * 255.0
        # LCNN expects masked pixels filled with the mid-gray value 127.5.
        masked_images = images * (1 - masks) + torch.ones_like(images) * masks * 127.5
        masked_images = (masked_images - lcnn_mean) / lcnn_std
def to_int(x):
return tuple(map(int, x))
lines_tensor = []
lmap = np.zeros((h, w))
output_masked = self.wireframe(masked_images)
output_masked = to_device(output_masked, "cpu")
if output_masked["num_proposals"] == 0:
lines_masked = []
scores_masked = []
else:
lines_masked = output_masked["lines_pred"].numpy()
lines_masked = [
[line[1] * h, line[0] * w, line[3] * h, line[2] * w]
for line in lines_masked
]
scores_masked = output_masked["lines_score"].numpy()
for line, score in zip(lines_masked, scores_masked):
if score > mask_th:
try:
import skimage
rr, cc, value = skimage.draw.line_aa(
*to_int(line[0:2]), *to_int(line[2:4])
)
lmap[rr, cc] = np.maximum(lmap[rr, cc], value)
                except Exception:
                    # No scikit-image: approximate the anti-aliased line with OpenCV.
cv2.line(
lmap,
to_int(line[0:2][::-1]),
to_int(line[2:4][::-1]),
(1, 1, 1),
1,
cv2.LINE_AA,
)
lmap = np.clip(lmap * 255, 0, 255).astype(np.uint8)
lines_tensor.append(to_tensor(lmap).unsqueeze(0))
lines_tensor = torch.cat(lines_tensor, dim=0)
return lines_tensor.detach().to(self.device)
def sample_edge_line_logits(
self, context, mask=None, iterations=1, add_v=0, mul_v=4
):
[img, edge, line] = context
img = img * (1 - mask)
edge = edge * (1 - mask)
line = line * (1 - mask)
for i in range(iterations):
edge_logits, line_logits = self.edge_line(img, edge, line, masks=mask)
edge_pred = torch.sigmoid(edge_logits)
line_pred = torch.sigmoid((line_logits + add_v) * mul_v)
edge = edge + edge_pred * mask
edge[edge >= 0.25] = 1
edge[edge < 0.25] = 0
line = line + line_pred * mask
b, _, h, w = edge_pred.shape
edge_pred = edge_pred.reshape(b, -1, 1)
line_pred = line_pred.reshape(b, -1, 1)
mask = mask.reshape(b, -1)
edge_probs = torch.cat([1 - edge_pred, edge_pred], dim=-1)
line_probs = torch.cat([1 - line_pred, line_pred], dim=-1)
edge_probs[:, :, 1] += 0.5
line_probs[:, :, 1] += 0.5
edge_max_probs = edge_probs.max(dim=-1)[0] + (1 - mask) * (-100)
line_max_probs = line_probs.max(dim=-1)[0] + (1 - mask) * (-100)
indices = torch.sort(
edge_max_probs + line_max_probs, dim=-1, descending=True
)[1]
for ii in range(b):
keep = int((i + 1) / iterations * torch.sum(mask[ii, ...]))
                assert torch.sum(mask[ii][indices[ii, :keep]]) == keep, "selected indices must all be masked"
mask[ii][indices[ii, :keep]] = 0
mask = mask.reshape(b, 1, h, w)
edge = edge * (1 - mask)
line = line * (1 - mask)
edge, line = edge.to(torch.float32), line.to(torch.float32)
return edge, line
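    # Illustrative note (not part of the original file): with iterations=N,
    # pass i unmasks the most confident (i + 1) / N fraction of still-masked
    # positions, so the mask is empty after the final pass and edge/line come
    # back fully completed:
    #   edge, line = model.sample_edge_line_logits(
    #       context=[img_256, edge_256, line_256],
    #       mask=mask_256.clone(), iterations=2,
    #   )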