wip mat float16

2023-03-25 22:46:28 +08:00
parent 7e028c3908
commit eb304ba696
3 changed files with 63 additions and 181 deletions
--- a/lama_cleaner/model/mat.py
+++ b/lama_cleaner/model/mat.py
@@ -52,7 +52,7 @@ class ModulatedConv2d(nn.Module):
        )
        self.out_channels = out_channels
        self.kernel_size = kernel_size
-        self.weight_gain = 1 / np.sqrt(in_channels * (kernel_size**2))
+        self.weight_gain = 1 / np.sqrt(in_channels * (kernel_size ** 2))
        self.padding = self.kernel_size // 2
        self.up = up
        self.down = down
@@ -213,7 +213,7 @@ class DecBlockFirst(nn.Module):
        super().__init__()
        self.fc = FullyConnectedLayer(
            in_features=in_channels * 2,
-            out_features=in_channels * 4**2,
+            out_features=in_channels * 4 ** 2,
            activation=activation,
        )
        self.conv = StyleConv(
@@ -312,7 +312,7 @@ class DecBlock(nn.Module):
            in_channels=in_channels,
            out_channels=out_channels,
            style_dim=style_dim,
-            resolution=2**res,
+            resolution=2 ** res,
            kernel_size=3,
            up=2,
            use_noise=use_noise,
@@ -323,7 +323,7 @@ class DecBlock(nn.Module):
            in_channels=out_channels,
            out_channels=out_channels,
            style_dim=style_dim,
-            resolution=2**res,
+            resolution=2 ** res,
            kernel_size=3,
            use_noise=use_noise,
            activation=activation,
@@ -402,9 +402,6 @@ class MappingNet(torch.nn.Module):
    def forward(
        self, z, c, truncation_psi=1, truncation_cutoff=None, skip_w_avg_update=False
    ):
-        import ipdb
-
-        ipdb.set_trace()
        # Embed, normalize, and concat inputs.
        x = None
        if self.z_dim > 0:
@@ -510,7 +507,7 @@ class Discriminator(torch.nn.Module):
        self.img_channels = img_channels

        resolution_log2 = int(np.log2(img_resolution))
-        assert img_resolution == 2**resolution_log2 and img_resolution >= 4
+        assert img_resolution == 2 ** resolution_log2 and img_resolution >= 4
        self.resolution_log2 = resolution_log2

        def nf(stage):
@@ -546,7 +543,7 @@ class Discriminator(torch.nn.Module):
        )
        self.Dis = nn.Sequential(*Dis)

-        self.fc0 = FullyConnectedLayer(nf(2) * 4**2, nf(2), activation=activation)
+        self.fc0 = FullyConnectedLayer(nf(2) * 4 ** 2, nf(2), activation=activation)
        self.fc1 = FullyConnectedLayer(nf(2), 1 if cmap_dim == 0 else cmap_dim)

    def forward(self, images_in, masks_in, c):
@@ -565,7 +562,7 @@ class Discriminator(torch.nn.Module):

 def nf(stage, channel_base=32768, channel_decay=1.0, channel_max=512):
    NF = {512: 64, 256: 128, 128: 256, 64: 512, 32: 512, 16: 512, 8: 512, 4: 512}
-    return NF[2**stage]
+    return NF[2 ** stage]


 class Mlp(nn.Module):
@@ -662,7 +659,7 @@ class Conv2dLayerPartial(nn.Module):
        )

        self.weight_maskUpdater = torch.ones(1, 1, kernel_size, kernel_size)
-        self.slide_winsize = kernel_size**2
+        self.slide_winsize = kernel_size ** 2
        self.stride = down
        self.padding = kernel_size // 2 if kernel_size % 2 == 1 else 0

@@ -678,9 +675,9 @@ class Conv2dLayerPartial(nn.Module):
                    stride=self.stride,
                    padding=self.padding,
                )
-                mask_ratio = self.slide_winsize / (update_mask + 1e-8)
+                mask_ratio = self.slide_winsize / (update_mask.to(torch.float32) + 1e-8)
                update_mask = torch.clamp(update_mask, 0, 1)  # 0 or 1
-                mask_ratio = torch.mul(mask_ratio, update_mask)
+                mask_ratio = torch.mul(mask_ratio, update_mask).to(x.dtype)
            x = self.conv(x)
            x = torch.mul(x, mask_ratio)
            return x, update_mask
@@ -718,7 +715,7 @@ class WindowAttention(nn.Module):
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
-        self.scale = qk_scale or head_dim**-0.5
+        self.scale = qk_scale or head_dim ** -0.5

        self.q = FullyConnectedLayer(in_features=dim, out_features=dim)
        self.k = FullyConnectedLayer(in_features=dim, out_features=dim)
@@ -734,7 +731,7 @@ class WindowAttention(nn.Module):
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
        """
        B_, N, C = x.shape
-        norm_x = F.normalize(x, p=2.0, dim=-1)
+        norm_x = F.normalize(x, p=2.0, dim=-1, eps=torch.finfo(x.dtype).eps)
        q = (
            self.q(norm_x)
            .reshape(B_, N, self.num_heads, C // self.num_heads)
@@ -771,7 +768,6 @@ class WindowAttention(nn.Module):
                ).repeat(1, N, 1)

        attn = self.softmax(attn)
-
        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)
        return x, mask_windows
@@ -935,7 +931,9 @@ class SwinTransformerBlock(nn.Module):
            )  # nW*B, window_size*window_size, C
        else:
            attn_windows, mask_windows = self.attn(
-                x_windows, mask_windows, mask=self.calculate_mask(x_size).to(x.device)
+                x_windows,
+                mask_windows,
+                mask=self.calculate_mask(x_size).to(x.dtype).to(x.device),
            )  # nW*B, window_size*window_size, C

        # merge windows
@@ -1213,7 +1211,7 @@ class Encoder(nn.Module):
        self.resolution = []

        for idx, i in enumerate(range(res_log2, 3, -1)):  # from input size to 16x16
-            res = 2**i
+            res = 2 ** i
            self.resolution.append(res)
            if i == res_log2:
                block = EncFromRGB(img_channels * 2 + 1, nf(i), activation)
@@ -1298,7 +1296,7 @@ class DecBlockFirstV2(nn.Module):
            in_channels=in_channels,
            out_channels=out_channels,
            style_dim=style_dim,
-            resolution=2**res,
+            resolution=2 ** res,
            kernel_size=3,
            use_noise=use_noise,
            activation=activation,
@@ -1343,7 +1341,7 @@ class DecBlock(nn.Module):
            in_channels=in_channels,
            out_channels=out_channels,
            style_dim=style_dim,
-            resolution=2**res,
+            resolution=2 ** res,
            kernel_size=3,
            up=2,
            use_noise=use_noise,
@@ -1354,7 +1352,7 @@ class DecBlock(nn.Module):
            in_channels=out_channels,
            out_channels=out_channels,
            style_dim=style_dim,
-            resolution=2**res,
+            resolution=2 ** res,
            kernel_size=3,
            use_noise=use_noise,
            activation=activation,
@@ -1391,7 +1389,7 @@ class Decoder(nn.Module):
        for res in range(5, res_log2 + 1):
            setattr(
                self,
-                "Dec_%dx%d" % (2**res, 2**res),
+                "Dec_%dx%d" % (2 ** res, 2 ** res),
                DecBlock(
                    res,
                    nf(res - 1),
@@ -1408,7 +1406,7 @@ class Decoder(nn.Module):
    def forward(self, x, ws, gs, E_features, noise_mode="random"):
        x, img = self.Dec_16x16(x, ws, gs, E_features, noise_mode=noise_mode)
        for res in range(5, self.res_log2 + 1):
-            block = getattr(self, "Dec_%dx%d" % (2**res, 2**res))
+            block = getattr(self, "Dec_%dx%d" % (2 ** res, 2 ** res))
            x, img = block(x, img, ws, gs, E_features, noise_mode=noise_mode)

        return img
@@ -1433,7 +1431,7 @@ class DecStyleBlock(nn.Module):
            in_channels=in_channels,
            out_channels=out_channels,
            style_dim=style_dim,
-            resolution=2**res,
+            resolution=2 ** res,
            kernel_size=3,
            up=2,
            use_noise=use_noise,
@@ -1444,7 +1442,7 @@ class DecStyleBlock(nn.Module):
            in_channels=out_channels,
            out_channels=out_channels,
            style_dim=style_dim,
-            resolution=2**res,
+            resolution=2 ** res,
            kernel_size=3,
            use_noise=use_noise,
            activation=activation,
@@ -1642,7 +1640,7 @@ class SynthesisNet(nn.Module):
    ):
        super().__init__()
        resolution_log2 = int(np.log2(img_resolution))
-        assert img_resolution == 2**resolution_log2 and img_resolution >= 4
+        assert img_resolution == 2 ** resolution_log2 and img_resolution >= 4

        self.num_layers = resolution_log2 * 2 - 3 * 2
        self.img_resolution = img_resolution
@@ -1783,7 +1781,7 @@ class Discriminator(torch.nn.Module):
        self.img_channels = img_channels

        resolution_log2 = int(np.log2(img_resolution))
-        assert img_resolution == 2**resolution_log2 and img_resolution >= 4
+        assert img_resolution == 2 ** resolution_log2 and img_resolution >= 4
        self.resolution_log2 = resolution_log2

        if cmap_dim == None:
@@ -1814,7 +1812,7 @@ class Discriminator(torch.nn.Module):
        )
        self.Dis = nn.Sequential(*Dis)

-        self.fc0 = FullyConnectedLayer(nf(2) * 4**2, nf(2), activation=activation)
+        self.fc0 = FullyConnectedLayer(nf(2) * 4 ** 2, nf(2), activation=activation)
        self.fc1 = FullyConnectedLayer(nf(2), 1 if cmap_dim == 0 else cmap_dim)

        # for 64x64
@@ -1839,7 +1837,7 @@ class Discriminator(torch.nn.Module):
        self.Dis_stg1 = nn.Sequential(*Dis_stg1)

        self.fc0_stg1 = FullyConnectedLayer(
-            nf(2) // 2 * 4**2, nf(2) // 2, activation=activation
+            nf(2) // 2 * 4 ** 2, nf(2) // 2, activation=activation
        )
        self.fc1_stg1 = FullyConnectedLayer(
            nf(2) // 2, 1 if cmap_dim == 0 else cmap_dim
@@ -1874,7 +1872,7 @@ MAT_MODEL_MD5 = os.environ.get("MAT_MODEL_MD5", "8ca927835fa3f5e21d65ffcb165377e

 class MAT(InpaintModel):
    name = "mat"
-    min_size = 512
+    min_size = 1024
    pad_mod = 512
    pad_to_square = True

@@ -1890,9 +1888,9 @@ class MAT(InpaintModel):
            img_resolution=512,
            img_channels=3,
            mapping_kwargs={"torch_dtype": self.torch_dtype},
-        )
+        ).to(self.torch_dtype)
        # fmt: off
-        self.model = load_model(G, MAT_MODEL_URL, device, MAT_MODEL_MD5).to(self.torch_dtype)
+        self.model = load_model(G, MAT_MODEL_URL, device, MAT_MODEL_MD5)
        self.z = torch.from_numpy(np.random.randn(1, G.z_dim)).to(self.torch_dtype).to(device)
        self.label = torch.zeros([1, self.model.c_dim], device=device).to(self.torch_dtype)
        # fmt: on
--- a/lama_cleaner/model/utils.py
+++ b/lama_cleaner/model/utils.py
@@ -27,7 +27,7 @@ def make_beta_schedule(
    if schedule == "linear":
        betas = (
            torch.linspace(
-                linear_start**0.5, linear_end**0.5, n_timestep, dtype=torch.float64
+                linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64
            )
            ** 2
        )
@@ -134,8 +134,10 @@ def timestep_embedding(device, timesteps, dim, max_period=10000, repeat_only=Fal
 ###### MAT and FcF #######


-def normalize_2nd_moment(x, dim=1, eps=1e-8):
-    return x * (x.square().mean(dim=dim, keepdim=True) + eps).rsqrt()
+def normalize_2nd_moment(x, dim=1, eps=None):
+    """Scale x to unit mean square over dim; eps=None uses finfo(x.dtype).eps (fp16-safe)."""
+    eps = torch.finfo(x.dtype).eps if eps is None else eps  # 1e-8 rounds to 0 in float16
+    return x * (x.square().mean(dim=dim, keepdim=True) + eps).rsqrt()


 class EasyDict(dict):
@@ -460,7 +462,7 @@ def _upfirdn2d_ref(x, f, up=1, down=1, padding=0, flip_filter=False, gain=1):
    if f is None:
        f = torch.ones([1, 1], dtype=torch.float32, device=x.device)
    assert isinstance(f, torch.Tensor) and f.ndim in [1, 2]
-    assert f.dtype == torch.float32 and not f.requires_grad
+    assert not f.requires_grad
    batch_size, num_channels, in_height, in_width = x.shape
    # upx, upy = _parse_scaling(up)
    # downx, downy = _parse_scaling(down)
@@ -733,9 +735,7 @@ def conv2d_resample(
    # Validate arguments.
    assert isinstance(x, torch.Tensor) and (x.ndim == 4)
    assert isinstance(w, torch.Tensor) and (w.ndim == 4) and (w.dtype == x.dtype)
-    assert f is None or (
-        isinstance(f, torch.Tensor) and f.ndim in [1, 2] and f.dtype == torch.float32
-    )
+    assert f is None or (isinstance(f, torch.Tensor) and f.ndim in [1, 2])
    assert isinstance(up, int) and (up >= 1)
    assert isinstance(down, int) and (down >= 1)
    # assert isinstance(groups, int) and (groups >= 1), f"!!!!!! groups: {groups} isinstance(groups, int)  {isinstance(groups, int)} {type(groups)}"
@@ -772,7 +772,7 @@ def conv2d_resample(
            f=f,
            up=up,
            padding=[px0, px1, py0, py1],
-            gain=up**2,
+            gain=up ** 2,
            flip_filter=flip_filter,
        )
        return x
@@ -814,7 +814,7 @@ def conv2d_resample(
            x=x,
            f=f,
            padding=[px0 + pxt, px1 + pxt, py0 + pyt, py1 + pyt],
-            gain=up**2,
+            gain=up ** 2,
            flip_filter=flip_filter,
        )
        if down > 1:
@@ -834,7 +834,7 @@ def conv2d_resample(
        f=(f if up > 1 else None),
        up=up,
        padding=[px0, px1, py0, py1],
-        gain=up**2,
+        gain=up ** 2,
        flip_filter=flip_filter,
    )
    x = _conv2d_wrapper(x=x, w=w, groups=groups, flip_weight=flip_weight)
@@ -870,7 +870,7 @@ class Conv2dLayer(torch.nn.Module):
        self.register_buffer("resample_filter", setup_filter(resample_filter))
        self.conv_clamp = conv_clamp
        self.padding = kernel_size // 2
-        self.weight_gain = 1 / np.sqrt(in_channels * (kernel_size**2))
+        self.weight_gain = 1 / np.sqrt(in_channels * (kernel_size ** 2))
        self.act_gain = activation_funcs[activation].def_gain

        memory_format = (