Source code for opr.modules.mixvpr

"""MixVPR: Feature Mixing for Visual Place Recognition.

Source: https://github.com/amaralibey/MixVPR/blob/main/models/aggregators/mixvpr.py
"""
from torch import Tensor, nn
from torch.nn import functional as F


[docs] class FeatureMixerLayer(nn.Module): """Feature Mixer Layer.""" def __init__(self, in_dim: int, mlp_ratio: float = 1.0) -> None: """Feature Mixer Layer. Args: in_dim (int): Input dimension. mlp_ratio (float): Ratio of the mid projection layer in the mixer block. Defaults to 1.0.s """ super().__init__() self.mix = nn.Sequential( nn.LayerNorm(in_dim), nn.Linear(in_dim, int(in_dim * mlp_ratio)), nn.ReLU(), nn.Linear(int(in_dim * mlp_ratio), in_dim), ) for m in self.modules(): if isinstance(m, (nn.Linear)): nn.init.trunc_normal_(m.weight, std=0.02) if m.bias is not None: nn.init.zeros_(m.bias)
[docs] def forward(self, x: Tensor) -> Tensor: # noqa: D102 return x + self.mix(x)
[docs] class MixVPR(nn.Module): """MixVPR aggregation layer. Source: https://github.com/amaralibey/MixVPR/blob/main/models/aggregators/mixvpr.py """ def __init__( self, in_channels: int = 256, in_h: int = 10, in_w: int = 18, out_channels: int = 128, mix_depth: int = 4, mlp_ratio: float = 1, out_rows: int = 2, ) -> None: """Aggregation layer from the MixVPR paper. Args: in_channels (int): Depth of input feature maps. Defaults to 256. in_h (int): Height of input feature maps. Defaults to 10. in_w (int): Width of input feature maps. Defaults to 18. out_channels (int): Depth wise projection dimension. Defaults to 128. mix_depth (int): The number of stacked FeatureMixers. Defaults to 4. mlp_ratio (float): Ratio of the mid projection layer in the mixer block. Defaults to 1. out_rows (int): Row wise projection dimesion. Defaults to 2. """ super().__init__() self.in_h = in_h # height of input feature maps self.in_w = in_w # width of input feature maps self.in_channels = in_channels # depth of input feature maps self.out_channels = out_channels # depth wise projection dimension self.out_rows = out_rows # row wise projection dimesion self.mix_depth = mix_depth # L the number of stacked FeatureMixers self.mlp_ratio = mlp_ratio # ratio of the mid projection layer in the mixer block hw = in_h * in_w self.mix = nn.Sequential( *[FeatureMixerLayer(in_dim=hw, mlp_ratio=mlp_ratio) for _ in range(self.mix_depth)] ) self.channel_proj = nn.Linear(in_channels, out_channels) self.row_proj = nn.Linear(hw, out_rows)
[docs] def forward(self, x: Tensor) -> Tensor: # noqa: D102 x = x.flatten(2) x = self.mix(x) x = x.permute(0, 2, 1) x = self.channel_proj(x) x = x.permute(0, 2, 1) x = self.row_proj(x) x = F.normalize(x.flatten(1), p=2, dim=-1) return x