
Generic Modules

Modules implemented in slp. These modules can be used as building blocks for more complicated models.

Attention

__init__(self, attention_size=512, input_size=None, dropout=0.1) special

Single-Headed Dot-product attention module

Parameters:

Name Type Description Default
attention_size int

Number of hidden features. Defaults to 512.

512
input_size Optional[int]

Input features. Defaults to None. If None input_size is set to attention_size.

None
dropout float

Drop probability. Defaults to 0.1.

0.1
Source code in slp/modules/attention.py
def __init__(
    self,
    attention_size: int = 512,
    input_size: Optional[int] = None,
    dropout: float = 0.1,
):
    """Single-Headed Dot-product attention module

    Args:
        attention_size (int): Number of hidden features. Defaults to 512.
        input_size (Optional[int]): Input features. Defaults to None.
            If None input_size is set to attention_size.
        dropout (float): Drop probability. Defaults to 0.1.
    """
    super(Attention, self).__init__()

    if input_size is None:
        input_size = attention_size
    self.dk = input_size
    self.k = nn.Linear(input_size, attention_size, bias=False)
    self.q = nn.Linear(input_size, attention_size, bias=False)
    self.v = nn.Linear(input_size, attention_size, bias=False)
    self.dropout = dropout
    reset_parameters(self.named_parameters())

forward(self, keys, queries=None, attention_mask=None)

Single-head scaled dot-product attention forward pass

Outputs the values, where features for each sequence element are weighted by their respective attention scores

\[a = softmax(\frac{Q \cdot K^T}{\sqrt{d}}) \cdot V\]
  • B: Batch size
  • L: Keys Sequence length
  • M: Queries Sequence length
  • H: Number of heads
  • A: Feature dimension

Parameters:

Name Type Description Default
keys Tensor

[B, L, D] Keys tensor

required
queries Optional[torch.Tensor]

Optional [B, M, D] Queries tensor. If None queries = keys. Defaults to None.

None
attention_mask Optional[torch.Tensor]

Optional [B, L] or [B, M, L] zero-one mask for sequence elements. Defaults to None.

None

Returns:

Type Description
Tuple[torch.Tensor, torch.Tensor]

Tuple[torch.Tensor, torch.Tensor]: (Reweighted values [B, L, D], attention scores [B, M, L])

Source code in slp/modules/attention.py
def forward(
    self,
    keys: torch.Tensor,
    queries: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    r"""Single-head scaled dot-product attention forward pass

    Outputs the values, where features for each sequence element are weighted by their respective attention scores

    $$a = softmax(\frac{Q \cdot K^T}{\sqrt{d}}) \cdot V$$

    * B: Batch size
    * L: Keys Sequence length
    * M: Queries Sequence length
    * H: Number of heads
    * A: Feature dimension

    Args:
        keys (torch.Tensor): [B, L, D] Keys tensor
        queries (Optional[torch.Tensor]): Optional [B, M, D] Queries tensor. If None queries = keys. Defaults to None.
        attention_mask (Optional[torch.Tensor]): Optional [B, L] or [B, M, L] zero-one mask for sequence elements. Defaults to None.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: (Reweighted values [B, L, D], attention scores [B, M, L])
    """
    if attention_mask is not None:
        if len(list(attention_mask.size())) == 2:
            attention_mask = attention_mask.unsqueeze(1)

    if queries is None:
        queries = keys

    values = keys

    k = self.k(keys)  # (B, L, A)
    q = self.q(queries)
    v = self.v(values)

    # weights => (B, L, L)
    out, scores = attention(
        k,
        q,
        v,
        self.dk,
        attention_mask=attention_mask,
        dropout=self.dropout,
        training=self.training,
    )

    return out, scores
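
A minimal usage sketch (assuming Attention is importable from slp.modules.attention, as the source path above suggests); shapes follow the B/L/M conventions listed above:

import torch
from slp.modules.attention import Attention

att = Attention(attention_size=64, input_size=32, dropout=0.1)
keys = torch.randn(4, 10, 32)     # (B, L, D)
queries = torch.randn(4, 7, 32)   # (B, M, D)
mask = torch.ones(4, 10)          # (B, L) zero-one mask, 1 = keep

out, scores = att(keys, queries=queries, attention_mask=mask)
print(out.shape)     # torch.Size([4, 7, 64])  reweighted values
print(scores.shape)  # torch.Size([4, 7, 10])  attention scores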

MultiheadAttention

__init__(self, attention_size=512, num_heads=8, input_size=None, dropout=0.1, nystrom=False, num_landmarks=64, inverse_iterations=6, kernel_size=None) special

Multi-Headed Dot-product attention module

Parameters:

Name Type Description Default
attention_size int

Number of hidden features. Defaults to 512.

512
num_heads int

Number of attention heads

8
input_size Optional[int]

Input features. Defaults to None. If None input_size is set to attention_size.

None
dropout float

Drop probability. Defaults to 0.1.

0.1
nystrom bool

Use nystrom method for attention calculation. Defaults to False.

False
num_landmarks int

Number of landmark points for nystrom attention. Defaults to 64.

64
inverse_iterations int

Number of iterations to calculate the inverse in nystrom attention. Defaults to 6.

6
kernel_size Optional[int]

Use residual convolution in the output. Defaults to None.

None
Source code in slp/modules/attention.py
def __init__(
    self,
    attention_size: int = 512,
    num_heads: int = 8,
    input_size: Optional[int] = None,
    dropout: float = 0.1,
    nystrom: bool = False,
    num_landmarks: int = 64,
    inverse_iterations: int = 6,
    kernel_size: Optional[int] = None,
):
    """Multi-Headed Dot-product attention module

    Args:
        attention_size (int): Number of hidden features. Defaults to 512.
        num_heads (int): Number of attention heads
        input_size (Optional[int]): Input features. Defaults to None.
            If None input_size is set to attention_size.
        dropout (float): Drop probability. Defaults to 0.1.
        nystrom (bool, optional): Use nystrom method for attention calculation. Defaults to False.
        num_landmarks (int, optional): Number of landmark points for nystrom attention. Defaults to 64.
        inverse_iterations (int, optional): Number of iterations to calculate the inverse in nystrom attention. Defaults to 6.
        kernel_size (Optional[int], optional): Use residual convolution in the output. Defaults to None.
    """
    super(MultiheadAttention, self).__init__()

    if input_size is None:
        input_size = attention_size
    self.inverse_iterations = inverse_iterations
    self.num_landmarks = num_landmarks
    self.nystrom = nystrom
    self.num_heads = num_heads
    self.head_size = int(attention_size / num_heads)
    self.dk = self.head_size
    self.attention_size = attention_size
    self.k = nn.Linear(input_size, attention_size, bias=False)
    self.q = nn.Linear(input_size, attention_size, bias=False)
    self.v = nn.Linear(input_size, attention_size, bias=False)
    self.output = nn.Linear(attention_size, attention_size)
    self.dropout = dropout

    self.conv = None

    if kernel_size is not None:
        self.conv = nn.Conv2d(
            in_channels=self.num_heads,
            out_channels=self.num_heads,
            kernel_size=(kernel_size, 1),
            padding=(kernel_size // 2, 0),
            bias=False,
            groups=self.num_heads,
        )

    reset_parameters(self.named_parameters())

forward(self, keys, queries=None, attention_mask=None)

Multi-head scaled dot-product attention forward pass

Outputs the values, where features for each sequence element are weighted by their respective attention scores

Each head performs dot-product attention

\[a_H = softmax(\frac{Q_H \cdot K_H^T}{\sqrt{d}}) \cdot V_H\]

The outputs of multiple heads are concatenated and passed through a feedforward layer.

\[a = W (a^{(1)}_{H} \mathbin\Vert a^{(2)}_{H} \dots) + b\]
  • B: Batch size
  • L: Keys Sequence length
  • M: Queries Sequence length
  • H: Number of heads
  • A: Feature dimension

Parameters:

Name Type Description Default
keys torch.Tensor

[B, L, D] Keys tensor

required
queries Optional[torch.Tensor]

Optional [B, M, D] Queries tensor. If None queries = keys. Defaults to None.

None
attention_mask Optional[torch.Tensor]

Optional [B, M, L] zero-one mask for sequence elements. Defaults to None.

None

Returns:

Type Description
Tuple[torch.Tensor, torch.Tensor]

(Reweighted values [B, L, D], attention scores [B, H, M, L])

Source code in slp/modules/attention.py
def forward(self, keys, queries=None, attention_mask=None):
    r"""Multi-head scaled dot-product attention forward pass

    Outputs the values, where features for each sequence element are weighted by their respective attention scores

    Each head performs dot-product attention

    $$a_H = softmax(\frac{Q_H \cdot K_H^T}{\sqrt{d}}) \cdot V_H$$

    The outputs of multiple heads are concatenated and passed through a feedforward layer.

    $$a = W (a^{(1)}_{H} \mathbin\Vert a^{(2)}_{H} \dots) + b$$


    * B: Batch size
    * L: Keys Sequence length
    * M: Queries Sequence length
    * H: Number of heads
    * A: Feature dimension


    Args:
        keys (torch.Tensor): [B, L, D] Keys tensor
        queries (Optional[torch.Tensor]): Optional [B, M, D] Queries tensor. If None queries = keys. Defaults to None.
        attention_mask (Optional[torch.Tensor]): Optional [B, M, L] zero-one mask for sequence elements. Defaults to None.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: (Reweighted values [B, L, D], attention scores [B, H, M, L])
    """
    _, seq_length, _ = keys.size()

    if attention_mask is not None:
        if attention_mask.ndim == 2:
            attention_mask = attention_mask.unsqueeze(1)
        attention_mask = attention_mask.unsqueeze(1)

    if self.nystrom:
        keys, attention_mask = pad_for_nystrom(
            keys, self.num_landmarks, attention_mask=attention_mask
        )

    if queries is None:
        queries = keys

    values = keys

    k = self.k(keys)
    q = self.q(queries)
    v = self.v(values)
    k = split_heads(k, self.num_heads)
    q = split_heads(q, self.num_heads)
    v = split_heads(v, self.num_heads)

    if self.nystrom:
        # out = (B, H, L, A/H)
        # scores = Tuple
        out, scores = nystrom_attention(
            k,
            q,
            v,
            self.dk,
            self.num_landmarks,
            attention_mask=attention_mask,
            inverse_iterations=self.inverse_iterations,
            dropout=self.dropout,
            training=self.training,
        )
    else:
        # out => (B, H, L, A/H)
        # scores => (B, H, L, L)
        out, scores = attention(
            k,
            q,
            v,
            self.dk,
            attention_mask=attention_mask,
            dropout=self.dropout,
            training=self.training,
        )

    if self.conv is not None:
        if attention_mask is None or attention_mask.ndim > 2:
            out += self.conv(v)
        else:
            attention_mask = attention_mask.squeeze()
            out += self.conv(v * attention_mask[:, None, :, None])

    # out => (B, H, L, A/H)
    out = merge_heads(out)
    if out.size(1) != seq_length:
        out = out[:, :seq_length, :]
    out = self.output(out)

    return out, scores
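
A usage sketch for the multi-head module (import path assumed from the source location above); with the defaults nystrom=False and kernel_size=None this is plain multi-head scaled dot-product attention:

import torch
from slp.modules.attention import MultiheadAttention

mha = MultiheadAttention(attention_size=64, num_heads=8, input_size=32, dropout=0.1)
x = torch.randn(4, 10, 32)                 # (B, L, D)
mask = torch.ones(4, 10)                   # (B, L) zero-one pad mask

out, scores = mha(x, attention_mask=mask)  # queries default to the keys
print(out.shape)     # torch.Size([4, 10, 64])
print(scores.shape)  # torch.Size([4, 8, 10, 10])  per-head scores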

MultiheadSelfAttention

__init__(self, attention_size=512, num_heads=8, input_size=None, dropout=0.1, nystrom=False, num_landmarks=64, inverse_iterations=6, kernel_size=None) special

Multi-Headed Dot-product attention module

Parameters:

Name Type Description Default
attention_size int

Number of hidden features. Defaults to 512.

512
num_heads int

Number of attention heads

8
input_size Optional[int]

Input features. Defaults to None. If None input_size is set to attention_size.

None
dropout float

Drop probability. Defaults to 0.1.

0.1
Source code in slp/modules/attention.py
def __init__(
    self,
    attention_size: int = 512,
    num_heads: int = 8,
    input_size: Optional[int] = None,
    dropout: float = 0.1,
    nystrom: bool = False,
    num_landmarks: int = 64,
    inverse_iterations: int = 6,
    kernel_size: Optional[int] = None,
):
    """Multi-Headed Dot-product attention module

    Args:
        attention_size (int): Number of hidden features. Defaults to 512.
        num_heads (int): Number of attention heads
        input_size (Optional[int]): Input features. Defaults to None.
            If None input_size is set to attention_size.
        dropout (float): Drop probability. Defaults to 0.1.
    """
    super(MultiheadSelfAttention, self).__init__()

    if input_size is None:
        input_size = attention_size
    self.inverse_iterations = inverse_iterations
    self.num_landmarks = num_landmarks
    self.nystrom = nystrom
    self.num_heads = num_heads
    self.head_size = int(attention_size / num_heads)
    self.dk = self.head_size
    self.attention_size = attention_size
    self.kqv = nn.Linear(input_size, 3 * attention_size, bias=False)
    self.output = nn.Linear(attention_size, attention_size)
    self.dropout = dropout

    self.conv = None

    if kernel_size is not None:
        self.conv = nn.Conv2d(
            in_channels=self.num_heads,
            out_channels=self.num_heads,
            kernel_size=(kernel_size, 1),
            padding=(kernel_size // 2, 0),
            bias=False,
            groups=self.num_heads,
        )

    reset_parameters(self.named_parameters())

forward(self, x, attention_mask=None)

Multi-head scaled dot-product attention forward pass

Outputs the values, where features for each sequence element are weighted by their respective attention scores

Each head performs dot-product attention

\[a_H = softmax(\frac{Q_H \cdot K_H^T}{\sqrt{d}}) \cdot V_H\]

The outputs of multiple heads are concatenated and passed through a feedforward layer.

\[a = W (a^{(1)}_{H} \mathbin\Vert a^{(2)}_{H} \dots) + b\]
  • B: Batch size
  • L: Keys Sequence length
  • M: Queries Sequence length
  • H: Number of heads
  • A: Feature dimension

Parameters:

Name Type Description Default
x torch.Tensor

[B, L, D] Keys tensor

required
attention_mask Optional[torch.Tensor]

Optional [B, M, L] zero-one mask for sequence elements. Defaults to None.

None

Returns:

Type Description
Tuple[torch.Tensor, torch.Tensor]

(Reweighted values [B, L, D], attention scores [B, H, M, L])

Source code in slp/modules/attention.py
def forward(self, x, attention_mask=None):
    r"""Multi-head scaled dot-product attention forward pass

    Outputs the values, where features for each sequence element are weighted by their respective attention scores

    Each head performs dot-product attention

    $$a_H = softmax(\frac{Q_H \cdot K_H^T}{\sqrt{d}}) \cdot V_H$$

    The outputs of multiple heads are concatenated and passed through a feedforward layer.

    $$a = W (a^{(1)}_{H} \mathbin\Vert a^{(2)}_{H} \dots) + b$$


    * B: Batch size
    * L: Keys Sequence length
    * M: Queries Sequence length
    * H: Number of heads
    * A: Feature dimension


    Args:
        x (torch.Tensor): [B, L, D] Keys tensor
        attention_mask (Optional[torch.Tensor]): Optional [B, M, L] zero-one mask for sequence elements. Defaults to None.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: (Reweighted values [B, L, D], attention scores [B, H, M, L])
    """
    _, seq_length, _ = x.size()

    if attention_mask is not None:
        if attention_mask.ndim == 2:
            attention_mask = attention_mask.unsqueeze(1)
        attention_mask = attention_mask.unsqueeze(1)

    if self.nystrom:
        x, attention_mask = pad_for_nystrom(
            x, self.num_landmarks, attention_mask=attention_mask
        )

    k, q, v = self.kqv(x).chunk(3, dim=-1)
    k = split_heads(k, self.num_heads)
    q = split_heads(q, self.num_heads)
    v = split_heads(v, self.num_heads)

    if self.nystrom:
        # out = (B, H, L, A/H)
        # scores = Tuple
        out, scores = nystrom_attention(
            k,
            q,
            v,
            self.dk,
            self.num_landmarks,
            attention_mask=attention_mask,
            inverse_iterations=self.inverse_iterations,
            dropout=self.dropout,
            training=self.training,
        )
    else:
        # out => (B, H, L, A/H)
        # scores => (B, H, L, L)
        out, scores = attention(
            k,
            q,
            v,
            self.dk,
            attention_mask=attention_mask,
            dropout=self.dropout,
            training=self.training,
        )

    if self.conv is not None:
        if attention_mask is None or attention_mask.ndim > 2:
            out = out + self.conv(v)
        else:
            attention_mask = attention_mask.squeeze()
            out = out + self.conv(v * attention_mask[:, None, :, None])

    # out => (B, H, L, A/H)
    out = merge_heads(out)
    if out.size(1) != seq_length:
        out = out[:, -seq_length:, :]
    out = self.output(out)

    return out, scores

MultiheadTwowayAttention

__init__(self, attention_size=512, input_size=None, dropout=0.1, num_heads=8, residual=True, nystrom=False, num_landmarks=64, inverse_iterations=6, kernel_size=None) special

Multihead twoway attention for multimodal fusion

This module performs two way attention for two input modality feature sequences. If att is the MultiheadAttention operation and x, y the input modality sequences, the operation is summarized as

\[out = (att(x \rightarrow y), att(y \rightarrow x))\]

If residual is True then a Vilbert-like residual connection is applied

\[out = (att(x \rightarrow y) + x, att(y \rightarrow x) + y)\]

Parameters:

Name Type Description Default
attention_size int

Number of hidden features. Defaults to 512.

512
num_heads int

Number of attention heads

8
input_size Optional[int]

Input features. Defaults to None. If None input_size is set to attention_size.

None
dropout float

Drop probability. Defaults to 0.1.

0.1
nystrom bool

Use nystrom method for attention calculation. Defaults to False.

False
num_landmarks int

Number of landmark points for nystrom attention. Defaults to 64.

64
inverse_iterations int

Number of iterations to calculate the inverse in nystrom attention. Defaults to 6.

6
kernel_size Optional[int]

Use residual convolution in the output. Defaults to None.

None
residual bool

Use vilbert-like residual connections for fusion. Defaults to True.

True
Source code in slp/modules/attention.py
def __init__(
    self,
    attention_size: int = 512,
    input_size: Optional[int] = None,
    dropout: float = 0.1,
    num_heads: int = 8,
    residual: bool = True,
    nystrom: bool = False,
    num_landmarks: int = 64,
    inverse_iterations: int = 6,
    kernel_size: Optional[int] = None,
):
    r"""Multihead twoway attention for multimodal fusion

    This module performs two way attention for two input modality feature sequences.
    If att is the MultiheadAttention operation and x, y the input modality sequences,
    the operation is summarized as

    $$out = (att(x \rightarrow y), att(y \rightarrow x))$$

    If residual is True then a Vilbert-like residual connection is applied

    $$out = (att(x \rightarrow y) + x, att(y \rightarrow x) + y)$$


    Args:
        attention_size (int): Number of hidden features. Defaults to 512.
        num_heads (int): Number of attention heads
        input_size (Optional[int]): Input features. Defaults to None.
            If None input_size is set to attention_size.
        dropout (float): Drop probability. Defaults to 0.1.
        nystrom (bool, optional): Use nystrom method for attention calculation. Defaults to False.
        num_landmarks (int, optional): Number of landmark points for nystrom attention. Defaults to 64.
        inverse_iterations (int, optional): Number of iterations to calculate the inverse in nystrom attention. Defaults to 6.
        kernel_size (Optional[int], optional): Use residual convolution in the output. Defaults to None.
        residual (bool, optional): Use vilbert-like residual connections for fusion. Defaults to True.
    """
    super(MultiheadTwowayAttention, self).__init__()

    self.xy = MultiheadAttention(
        attention_size=attention_size,
        input_size=input_size,
        dropout=dropout,
        num_heads=num_heads,
        nystrom=nystrom,
        num_landmarks=num_landmarks,
        inverse_iterations=inverse_iterations,
        kernel_size=kernel_size,
    )
    self.yx = MultiheadAttention(
        attention_size=attention_size,
        input_size=input_size,
        dropout=dropout,
        num_heads=num_heads,
        nystrom=nystrom,
        num_landmarks=num_landmarks,
        inverse_iterations=inverse_iterations,
        kernel_size=kernel_size,
    )
    self.residual = residual

forward(self, mod1, mod2, attention_mask=None)

x : (B, L, D)
queries : (B, L, D)
values : (B, L, D)

Source code in slp/modules/attention.py
def forward(self, mod1, mod2, attention_mask=None):
    """
    x : (B, L, D)
    queries : (B, L, D)
    values : (B, L, D)
    """
    out_mod1, _ = self.xy(mod1, queries=mod2, attention_mask=attention_mask)
    out_mod2, _ = self.yx(mod2, queries=mod1, attention_mask=attention_mask)

    if not self.residual:
        return out_mod1, out_mod2
    else:
        # vilbert cross residual

        # v + attention(v->a)
        # a + attention(a->v)
        out_mod1 += mod2
        out_mod2 += mod1

        return out_mod1, out_mod2
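
A fusion sketch with two modality sequences of equal feature size (import path assumed from the source location above); attention_size is kept equal to the input feature dimension so that the residual addition is well-defined:

import torch
from slp.modules.attention import MultiheadTwowayAttention

fuse = MultiheadTwowayAttention(attention_size=32, num_heads=4, residual=True)
text = torch.randn(8, 20, 32)    # (B, L, D) modality 1
audio = torch.randn(8, 20, 32)   # (B, L, D) modality 2

out_text, out_audio = fuse(text, audio)
print(out_text.shape, out_audio.shape)  # torch.Size([8, 20, 32]) torch.Size([8, 20, 32])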

SelfAttention

__init__(self, attention_size=512, input_size=None, dropout=0.1) special

Single-Headed Dot-product self attention module

Parameters:

Name Type Description Default
attention_size int

Number of hidden features. Defaults to 512.

512
input_size Optional[int]

Input features. Defaults to None. If None input_size is set to attention_size.

None
dropout float

Drop probability. Defaults to 0.1.

0.1
Source code in slp/modules/attention.py
def __init__(
    self,
    attention_size: int = 512,
    input_size: Optional[int] = None,
    dropout: float = 0.1,
):
    """Single-Headed Dot-product self attention module

    Args:
        attention_size (int): Number of hidden features. Defaults to 512.
        input_size (Optional[int]): Input features. Defaults to None.
            If None input_size is set to attention_size.
        dropout (float): Drop probability. Defaults to 0.1.
    """
    super(SelfAttention, self).__init__()

    if input_size is None:
        input_size = attention_size
    self.dk = input_size
    self.kqv = nn.Linear(input_size, 3 * attention_size, bias=False)
    self.dropout = dropout
    reset_parameters(self.named_parameters())

forward(self, x, attention_mask=None)

Single-head scaled dot-product attention forward pass

Outputs the values, where features for each sequence element are weighted by their respective attention scores

\[a = softmax(\frac{Q \cdot K^T}{\sqrt{d}}) \cdot V\]
  • B: Batch size
  • L: Keys Sequence length
  • M: Queries Sequence length
  • H: Number of heads
  • A: Feature dimension

Parameters:

Name Type Description Default
x Tensor

[B, L, D] Input tensor

required
attention_mask Optional[torch.Tensor]

Optional [B, L] or [B, M, L] zero-one mask for sequence elements. Defaults to None.

None

Returns:

Type Description
Tuple[torch.Tensor, torch.Tensor]

Tuple[torch.Tensor, torch.Tensor]: (Reweighted values [B, L, D], attention scores [B, M, L])

Source code in slp/modules/attention.py
def forward(
    self,
    x: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    r"""Single-head scaled dot-product attention forward pass

    Outputs the values, where features for each sequence element are weighted by their respective attention scores

    $$a = softmax(\frac{Q \cdot K^T}{\sqrt{d}}) \cdot V$$

    * B: Batch size
    * L: Keys Sequence length
    * M: Queries Sequence length
    * H: Number of heads
    * A: Feature dimension

    Args:
        x (torch.Tensor): [B, L, D] Input tensor
        attention_mask (Optional[torch.Tensor]): Optional [B, L] or [B, M, L] zero-one mask for sequence elements. Defaults to None.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: (Reweighted values [B, L, D], attention scores [B, M, L])
    """
    if attention_mask is not None:
        if len(list(attention_mask.size())) == 2:
            attention_mask = attention_mask.unsqueeze(1)

    k, q, v = self.kqv(x).chunk(3, dim=-1)  # (B, L, A)

    # weights => (B, L, L)
    out, scores = attention(
        k,
        q,
        v,
        self.dk,
        attention_mask=attention_mask,
        dropout=self.dropout,
        training=self.training,
    )

    return out, scores

attention(k, q, v, dk, attention_mask=None, dropout=0.2, training=True)

Reweight values using scaled dot product attention

\[s = softmax(\frac{Q \cdot K^T}{\sqrt{d}}) V\]
  • B: Batch size
  • L: Keys Sequence length
  • M: Queries Sequence length
  • H: Number of heads
  • A: Feature dimension

Parameters:

Name Type Description Default
k Tensor

Single head [B, L, A] or multi-head [B, H, L, A/H] Keys tensor

required
q Tensor

Single head [B, M, A] or multi-head [B, H, M, A/H] Queries tensor

required
v Tensor

Single head [B, L, A] or multi-head [B, H, L, A/H] Values tensor

required
dk int

Model dimension

required
attention_mask Optional[torch.Tensor]

Optional [B, [H], 1, L] pad mask or [B, [H], M, L] pad mask + subsequent mask tensor with zeros in sequence indices that should be masked and ones in sequence indices that should be preserved. Defaults to None.

None
dropout float

Drop probability. Defaults to 0.2.

0.2
training bool

Is module in training phase? Defaults to True.

True

Returns:

Type Description
Tuple[torch.Tensor, torch.Tensor]

([B, M, A] or [B, H, M, A/H] reweighted values, [B, M, L] or [B, H, M, L] attention scores)

Source code in slp/modules/attention.py
def attention(
    k: torch.Tensor,
    q: torch.Tensor,
    v: torch.Tensor,
    dk: int,
    attention_mask: Optional[torch.Tensor] = None,
    dropout: float = 0.2,
    training: bool = True,
):
    r"""Reweight values using scaled dot product attention

    $$s = softmax(\frac{Q \cdot K^T}{\sqrt{d}}) V$$

    * B: Batch size
    * L: Keys Sequence length
    * M: Queries Sequence length
    * H: Number of heads
    * A: Feature dimension

    Args:
        k (torch.Tensor): Single head [B, L, A] or multi-head [B, H, L, A/H] Keys tensor
        q (torch.Tensor): Single head [B, M, A] or multi-head [B, H, M, A/H] Queries tensor
        v (torch.Tensor): Single head [B, L, A] or multi-head [B, H, L, A/H] Values tensor
        dk (int): Model dimension
        attention_mask (Optional[torch.Tensor]): Optional [B, [H], 1, L] pad mask or [B, [H], M, L] pad mask + subsequent mask
            tensor with zeros in sequence indices that should be masked and ones in sequence indices that should be
            preserved. Defaults to None.
        dropout (float): Drop probability. Defaults to 0.2.
        training (bool): Is module in training phase? Defaults to True.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: ([B, M, A] or [B, H, M, A/H] reweighted values, [B, M, L] or [B, H, M, L] attention scores)
    """

    scores = attention_scores(
        k, q, dk, attention_mask=attention_mask, dropout=dropout, training=training
    )
    out = torch.matmul(scores, v)

    return out, scores
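
The functional form can also be called directly on raw key / query / value tensors. A single-head sketch with dropout disabled for clarity:

import torch
from slp.modules.attention import attention

B, L, M, A = 2, 6, 4, 16
k = torch.randn(B, L, A)
q = torch.randn(B, M, A)
v = torch.randn(B, L, A)

out, scores = attention(k, q, v, dk=A, dropout=0.0, training=False)
print(out.shape)       # torch.Size([2, 4, 16])  reweighted values
print(scores.shape)    # torch.Size([2, 4, 6])
print(scores.sum(-1))  # every row of scores sums to 1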

attention_scores(k, q, dk, attention_mask=None, dropout=0.2, training=True)

Calculate attention scores for scaled dot product attention

\[s = softmax(\frac{Q \cdot K^T}{\sqrt{d}})\]
  • B: Batch size
  • L: Keys Sequence length
  • M: Queries Sequence length
  • H: Number of heads
  • A: Feature dimension

Parameters:

Name Type Description Default
k Tensor

Single head [B, L, A] or multi-head [B, H, L, A/H] Keys tensor

required
q Tensor

Single head [B, M, A] or multi-head [B, H, M, A/H] Queries tensor

required
dk int

Model dimension

required
attention_mask Optional[torch.Tensor]

Optional [B, [H], 1, L] pad mask or [B, [H], M, L] pad mask + subsequent mask tensor with zeros in sequence indices that should be masked and ones in sequence indices that should be preserved. Defaults to None.

None
dropout float

Drop probability. Defaults to 0.2.

0.2
training bool

Is module in training phase? Defaults to True.

True

Returns:

Type Description
Tensor

torch.Tensor: [B, M, L] or [B, H, M, L] attention scores

Source code in slp/modules/attention.py
def attention_scores(
    k: torch.Tensor,
    q: torch.Tensor,
    dk: int,
    attention_mask: Optional[torch.Tensor] = None,
    dropout: float = 0.2,
    training: bool = True,
) -> torch.Tensor:
    r"""Calculate attention scores for scaled dot product attention

    $$s = softmax(\frac{Q \cdot K^T}{\sqrt{d}})$$

    * B: Batch size
    * L: Keys Sequence length
    * M: Queries Sequence length
    * H: Number of heads
    * A: Feature dimension

    Args:
        k (torch.Tensor): Single head [B, L, A] or multi-head [B, H, L, A/H] Keys tensor
        q (torch.Tensor): Single head [B, M, A] or multi-head [B, H, M, A/H] Queries tensor
        dk (int): Model dimension
        attention_mask (Optional[torch.Tensor]): Optional [B, [H], 1, L] pad mask or [B, [H], M, L] pad mask + subsequent mask
            tensor with zeros in sequence indices that should be masked and ones in sequence indices that should be
            preserved. Defaults to None.
        dropout (float): Drop probability. Defaults to 0.2.
        training (bool): Is module in training phase? Defaults to True.

    Returns:
        torch.Tensor: [B, M, L] or [B, H, M, L] attention scores
    """
    scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(dk)

    if attention_mask is not None:
        scores = scores + ((1 - attention_mask) * -1e5)
    scores = F.softmax(scores, dim=-1)
    scores = F.dropout(scores, p=dropout, training=training)

    return scores
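
A small sketch of the zero-one mask convention (1 = keep, 0 = mask out); masked key positions receive near-zero weight after the softmax:

import torch
from slp.modules.attention import attention_scores

k = torch.randn(1, 5, 8)
q = torch.randn(1, 5, 8)
mask = torch.tensor([[[1.0, 1.0, 1.0, 0.0, 0.0]]])  # (B, 1, L): last two keys are padding

scores = attention_scores(k, q, dk=8, attention_mask=mask, dropout=0.0, training=False)
print(scores.shape)      # torch.Size([1, 5, 5])
print(scores[0, 0, 3:])  # ~0 weight on the masked positions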

merge_heads(x)

Merge multiple attention heads into output tensor

(Batch size, Heads, Lengths, Attention size / Heads) => (Batch size, Length, Attention size)

Parameters:

Name Type Description Default
x Tensor

[B, H, L, A/H] multi-head tensor

required

Returns:

Type Description
Tensor

torch.Tensor: [B, L, A] merged / reshaped tensor

Source code in slp/modules/attention.py
def merge_heads(x: torch.Tensor) -> torch.Tensor:
    """Merge multiple attention heads into output tensor

    (Batch size, Heads, Lengths, Attention size / Heads) => (Batch size, Length, Attention size)

    Args:
        x (torch.Tensor): [B, H, L, A/H] multi-head tensor

    Returns:
        torch.Tensor:  [B, L, A] merged / reshaped tensor
    """
    batch_size, _, max_length, _ = x.size()
    # x => (B, L, H, A/H)
    x = x.permute(0, 2, 1, 3).contiguous()

    return x.view(batch_size, max_length, -1)

nystrom_attention(k, q, v, dk, num_landmarks, attention_mask=None, inverse_iterations=6, dropout=0.2, training=True)

Calculate attention using nystrom approximation

Implementation heavily based on: https://github.com/lucidrains/nystrom-attention

Reference: https://arxiv.org/abs/2102.03902
  • B: Batch size
  • L: Keys Sequence length
  • M: Queries Sequence length
  • H: Number of heads
  • A: Feature dimension

Parameters:

Name Type Description Default
k Tensor

Single head [B, L, A] or multi-head [B, H, L, A/H] Keys tensor

required
q Tensor

Single head [B, M, A] or multi-head [B, H, M, A/H] Queries tensor

required
v Tensor

Single head [B, L, A] or multi-head [B, H, L, A/H] Values tensor

required
dk int

Model dimension

required
num_landmarks int

Number of landmark points

required
attention_mask Optional[torch.Tensor]

Optional [B, [H], 1, L] pad mask or [B, [H], M, L] pad mask + subsequent mask tensor with zeros in sequence indices that should be masked and ones in sequence indices that should be preserved. Defaults to None.

None
inverse_iterations int

Number of iterations for Moore Penrose iterative inverse approximation

6
dropout float

Drop probability. Defaults to 0.2.

0.2
training bool

Is module in training phase? Defaults to True.

True

Returns:

Type Description
Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]

([B, H, L, A/H] reweighted values, intermediate nystrom attention score matrices)

Source code in slp/modules/attention.py
def nystrom_attention(
    k: torch.Tensor,
    q: torch.Tensor,
    v: torch.Tensor,
    dk: int,
    num_landmarks: int,
    attention_mask: Optional[torch.Tensor] = None,
    inverse_iterations: int = 6,
    dropout: float = 0.2,
    training: bool = True,
):
    """Calculate attention using nystrom approximation

    Implementation heavily based on: https://github.com/lucidrains/nystrom-attention

    Reference: https://arxiv.org/abs/2102.03902
    * B: Batch size
    * L: Keys Sequence length
    * M: Queries Sequence length
    * H: Number of heads
    * A: Feature dimension

    Args:
        k (torch.Tensor): Single head [B, L, A] or multi-head [B, H, L, A/H] Keys tensor
        q (torch.Tensor): Single head [B, M, A] or multi-head [B, H, M, A/H] Queries tensor
        v (torch.Tensor): Single head [B, L, A] or multi-head [B, H, L, A/H] Values tensor
        dk (int): Model dimension
        num_landmarks (int): Number of landmark points
        attention_mask (Optional[torch.Tensor]): Optional [B, [H], 1, L] pad mask or [B, [H], M, L] pad mask + subsequent mask
            tensor with zeros in sequence indices that should be masked and ones in sequence indices that should be
            preserved. Defaults to None.
        inverse_iterations (int): Number of iterations for Moore Penrose iterative inverse
            approximation
        dropout (float): Drop probability. Defaults to 0.2.
        training (bool): Is module in training phase? Defaults to True.

    Returns:
        Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]: ([B, H, L, A/H] reweighted values, intermediate nystrom attention score matrices)
    """
    _, num_heads, seq_length, head_size = k.size()

    masked_mean_denom = seq_length // num_landmarks
    if attention_mask is not None:
        attention_mask = attention_mask.unsqueeze(1)
        masked_mean_denom = (
            attention_mask.reshape(-1, 1, num_landmarks, seq_length // num_landmarks).sum(-1) + 1e-8  # type: ignore
        )  # (B, 1, Landmarks)
        mask_landmarks = (masked_mean_denom > 0).type(torch.float)  # type: ignore
        masked_mean_denom = masked_mean_denom[..., None]  # type: ignore
        attention_mask = attention_mask.unsqueeze(-1)
        q = q * attention_mask  # (B, H, L, A/H)
        k = k * attention_mask  # (B, H, L, A/H)
        v = v * attention_mask  # (B, H, L, A/H)

        scores_1_mask = attention_mask * mask_landmarks[..., None, :]
        scores_2_mask = mask_landmarks[..., None] * mask_landmarks[..., None, :]
        scores_3_mask = scores_1_mask.transpose(-1, -2)

    q = q / math.sqrt(dk)

    q_landmarks = q.reshape(
        q.size(0),  # batch_size
        q.size(1),  # num_heads
        num_landmarks,  # landmarks
        seq_length // num_landmarks,  # reduced length
        q.size(-1),  # head_size
    ).sum(
        dim=-2
    )  # (B, H, Landmarks, A/H)

    k_landmarks = k.reshape(
        k.size(0),  # batch_size
        k.size(1),  # num_heads
        num_landmarks,  # landmarks
        seq_length // num_landmarks,  # reduced length
        k.size(-1),  # head size
    ).sum(
        dim=-2
    )  # (B, H, Landmarks, A/H)

    k_landmarks = k_landmarks / masked_mean_denom
    q_landmarks = q_landmarks / masked_mean_denom

    scores_1 = attention_scores(
        k_landmarks,
        q,
        1,  # We have already accounted for dk
        attention_mask=scores_1_mask,
        dropout=dropout,
        training=training,
    )

    scores_2 = attention_scores(
        k_landmarks,
        q_landmarks,
        1,  # We have already accounted for dk
        attention_mask=scores_2_mask,
        dropout=dropout,
        training=training,
    )

    scores_3 = attention_scores(
        k,
        q_landmarks,
        1,  # We have already accounted for dk
        attention_mask=scores_3_mask,
        dropout=dropout,
        training=training,
    )

    z_star = moore_penrose_pinv(scores_2, num_iter=inverse_iterations)
    out = (scores_1 @ z_star) @ (scores_3 @ v)

    return out, (scores_1, scores_2, scores_3)
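
A sketch combining pad_for_nystrom, split_heads and nystrom_attention on a sequence whose length is not a multiple of num_landmarks (all helpers come from this same file; dropout is disabled for clarity, and a full-ones mask is passed because the landmark masks are derived from it):

import torch
from slp.modules.attention import (
    merge_heads,
    nystrom_attention,
    pad_for_nystrom,
    split_heads,
)

B, L, A, H, landmarks = 2, 100, 64, 4, 32
x = torch.randn(B, L, A)
mask = torch.ones(B, L)

x, mask = pad_for_nystrom(x, landmarks, attention_mask=mask)  # L is padded from 100 to 128
k = split_heads(x, H)  # (B, H, L_pad, A/H)
q, v = k, k

out, (s1, s2, s3) = nystrom_attention(
    k, q, v, A // H, landmarks,
    attention_mask=mask, dropout=0.0, training=False,
)
print(out.shape)               # torch.Size([2, 4, 128, 16])
print(merge_heads(out).shape)  # torch.Size([2, 128, 64])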

pad_for_nystrom(x, num_landmarks, attention_mask=None)

Pad inputs and attention_mask to perform Nystrom Attention

Pad to nearest multiple of num_landmarks

Parameters:

Name Type Description Default
x Tensor

[B, L, A] Input tensor

required
num_landmarks int

Number of landmark points

required
attention_mask Optional[torch.Tensor]

[B, L] Padding mask

None

Returns:

Type Description
Tuple[torch.Tensor, Optional[torch.Tensor]]

Tuple[torch.Tensor, Optional[torch.Tensor]]: Padded inputs and attention_mask

Source code in slp/modules/attention.py
def pad_for_nystrom(
    x: torch.Tensor, num_landmarks: int, attention_mask: Optional[torch.Tensor] = None
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
    """Pad inputs and attention_mask to perform Nystrom Attention

    Pad to nearest multiple of num_landmarks

    Args:
        x (torch.Tensor): [B, L, A] Input tensor
        num_landmarks (int): Number of landmark points
        attention_mask (Optional[torch.Tensor]): [B, L] Padding mask

    Returns:
        Tuple[torch.Tensor, Optional[torch.Tensor]]: Padded inputs and attention_mask
    """
    if attention_mask is not None:
        attention_mask = attention_mask.squeeze()

    _, seq_length, _ = x.size()

    _, remainder = (
        math.ceil(seq_length / num_landmarks),
        seq_length % num_landmarks,
    )

    if remainder > 0:
        padding = num_landmarks - remainder
        x = F.pad(x, (0, 0, padding, 0), value=0)

        if attention_mask is not None:
            attention_mask = F.pad(attention_mask, (padding, 0))

    return x, attention_mask

reset_parameters(named_parameters)

Initialize parameters in the transformer model.

Source code in slp/modules/attention.py
def reset_parameters(named_parameters):
    """Initialize parameters in the transformer model."""

    for name, p in named_parameters:
        if "weight" in name:
            nn.init.xavier_normal_(p)

        if "bias" in name:
            nn.init.constant_(p, 0.0)

split_heads(x, num_heads)

Split input tensor into multiple attention heads

(Batch size, Length, Attention size) => (Batch size, Heads, Lengths, Attention size / Heads)

Parameters:

Name Type Description Default
x Tensor

[B, L, A] input tensor

required
num_heads int

number of heads

required

Returns:

Type Description
Tensor

torch.Tensor: [B, H, L, A/H] Split / reshaped tensor

Source code in slp/modules/attention.py
def split_heads(x: torch.Tensor, num_heads: int) -> torch.Tensor:
    """Split input tensor into multiple attention heads

    (Batch size, Length, Attention size) => (Batch size, Heads, Lengths, Attention size / Heads)

    Args:
        x (torch.Tensor): [B, L, A] input tensor
        num_heads (int): number of heads

    Returns:
        torch.Tensor: [B, H, L, A/H] Split / reshaped tensor
    """
    batch_size, max_length, attention_size = x.size()
    head_size = int(attention_size / num_heads)

    return x.view(batch_size, max_length, num_heads, head_size).permute(0, 2, 1, 3)
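
split_heads and merge_heads are exact inverses, as the following round-trip sketch checks:

import torch
from slp.modules.attention import merge_heads, split_heads

x = torch.randn(2, 10, 64)                    # (B, L, A)
heads = split_heads(x, num_heads=8)           # (B, H, L, A/H)
print(heads.shape)                            # torch.Size([2, 8, 10, 8])
print(torch.allclose(merge_heads(heads), x))  # True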

TwowayAttention

Two-way attention module for fusing two modality feature sequences.

forward(self, mod1, mod2, attention_mask=None)

x : (B, L, D)
queries : (B, L, D)
values : (B, L, D)

Source code in slp/modules/twowayattention.py
def forward(self, mod1, mod2, attention_mask=None):
    """
    x : (B, L, D)
    queries : (B, L, D)
    values : (B, L, D)
    """
    k_mod1 = self.kx(mod1)
    q_mod2 = self.qy(mod2)
    v_mod1 = self.vx(mod1)

    k_mod2 = self.ky(mod2)  # (B, L, A)
    q_mod1 = self.qx(mod1)
    v_mod2 = self.vy(mod2)

    # weights => (B, L, L)

    scores_mod1 = torch.bmm(q_mod2, k_mod1.transpose(1, 2)) / math.sqrt(self.dk)
    scores_mod2 = torch.bmm(q_mod1, k_mod2.transpose(1, 2)) / math.sqrt(self.dk)

    if attention_mask is not None:
        scores_mod1 = scores_mod1 + ((1 - attention_mask.unsqueeze(1)) * -1e5)
        scores_mod2 = scores_mod2 + ((1 - attention_mask.unsqueeze(1)) * -1e5)
    scores_mod1 = F.softmax(scores_mod1, dim=-1)
    scores_mod1 = self.drop(scores_mod1)
    scores_mod2 = F.softmax(scores_mod2, dim=-1)
    scores_mod2 = self.drop(scores_mod2)

    # out => (B, L, A)
    out_mod1 = torch.bmm(scores_mod1, v_mod1)
    out_mod2 = torch.bmm(scores_mod2, v_mod2)

    if self.layernorm:
        out_mod1 = self.lnx(out_mod1)
        out_mod2 = self.lny(out_mod2)

    if not self.residual:
        return out_mod1, out_mod2
    else:
        # vilbert cross residual

        # v + attention(v->a)
        # a + attention(a->v)
        out_mod1 += mod2
        out_mod2 += mod1

        return out_mod1, out_mod2

Classifier

__init__(self, encoder, encoded_features, num_classes, dropout=0.2) special

Classifier wrapper module

Stores a Neural Network encoder and adds a classification layer on top.

Parameters:

Name Type Description Default
encoder Module

The encoder network whose outputs are classified

required
encoded_features int

Number of features produced by the encoder

required
num_classes int

Number of output classes

required
dropout float

Drop probability

0.2
Source code in slp/modules/classifier.py
def __init__(
    self,
    encoder: nn.Module,
    encoded_features: int,
    num_classes: int,
    dropout: float = 0.2,
):
    """Classifier wrapper module

    Stores a Neural Network encoder and adds a classification layer on top.

    Args:
        encoder (nn.Module): The encoder network whose outputs are classified
        encoded_features (int): Number of features produced by the encoder
        num_classes (int): Number of output classes
        dropout (float): Drop probability
    """
    super(Classifier, self).__init__()
    self.encoder = encoder
    self.drop = nn.Dropout(dropout)
    self.clf = nn.Linear(encoded_features, num_classes)

forward(self, *args, **kwargs)

Encode inputs using the encoder network and perform classification

Returns:

Type Description
Tensor

torch.Tensor: [B, *, num_classes] Logits tensor

Source code in slp/modules/classifier.py
def forward(self, *args, **kwargs) -> torch.Tensor:
    """Encode inputs using the encoder network and perform classification

    Returns:
        torch.Tensor: [B, *, num_classes] Logits tensor
    """
    encoded: torch.Tensor = self.encoder(*args, **kwargs)  # type: ignore
    out: torch.Tensor = self.drop(encoded)
    out = self.clf(out)

    return out
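
A wrapper sketch around a toy encoder (MeanPoolEncoder below is made up for illustration; any nn.Module that returns [B, encoded_features] works):

import torch
import torch.nn as nn
from slp.modules.classifier import Classifier

class MeanPoolEncoder(nn.Module):
    """Toy encoder: project the features and mean-pool over the sequence."""
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.proj = nn.Linear(input_size, hidden_size)

    def forward(self, x):
        return self.proj(x).mean(dim=1)  # (B, hidden_size)

clf = Classifier(MeanPoolEncoder(32, 128), encoded_features=128, num_classes=5)
logits = clf(torch.randn(16, 10, 32))  # (B, L, D) -> (B, num_classes)
print(logits.shape)                    # torch.Size([16, 5])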

MOSEITextClassifier

forward(self, x, lengths)

Encode inputs using the encoder network and perform classification

Returns:

Type Description
torch.Tensor

[B, *, num_classes] Logits tensor

Source code in slp/modules/classifier.py
def forward(self, x, lengths):
    x = x["text"]
    lengths = lengths["text"]

    return super().forward(x, lengths)

RNNLateFusionClassifier

forward(self, inputs, lengths)

Defines the computation performed at every call.

Should be overridden by all subclasses.

.. note:: Although the recipe for forward pass needs to be defined within this function, one should call the :class:Module instance afterwards instead of this since the former takes care of running the registered hooks while the latter silently ignores them.

Source code in slp/modules/classifier.py
def forward(self, inputs, lengths):
    encoded = [
        self.modality_encoders[m](inputs[m], lengths[m]) for m in self.modalities
    ]
    if self.mmdrop is not None:
        encoded = self.mmdrop(*encoded)
    fused = torch.cat(encoded, dim=-1)
    fused = self.drop(fused)
    out = self.clf(fused)

    return out

TransformerLateFusionClassifier

forward(self, inputs, attention_masks=None)

Defines the computation performed at every call.

Should be overridden by all subclasses.

.. note:: Although the recipe for forward pass needs to be defined within this function, one should call the :class:Module instance afterwards instead of this since the former takes care of running the registered hooks while the latter silently ignores them.

Source code in slp/modules/classifier.py
def forward(self, inputs, attention_masks=None):
    if attention_masks is None:
        attention_masks = dict(
            zip(self.modalities, [None for _ in self.modalities])
        )

    encoded = [
        self.modality_encoders[m](inputs[m], attention_mask=attention_masks[m])
        for m in self.modalities
    ]

    if self.mmdrop is not None:
        encoded = self.mmdrop(*encoded)
    fused = torch.cat(encoded, dim=-1)
    if self.modality_drop is not None:
        fused = self.modality_drop(fused)

    out = self.clf(fused)

    return out

Embed

__init__(self, num_embeddings, embedding_dim, embeddings=None, noise=0.0, dropout=0.0, scale=1.0, trainable=False) special

Define the layer of the model and perform the initializations of the layers (wherever it is necessary)

Parameters:

Name Type Description Default
num_embeddings int

Total number of embeddings.

required
embedding_dim int

Embedding dimension.

required
embeddings Optional[numpy.ndarray]

the 2D ndarray with the word vectors.

None
noise float

Optional additive noise. Defaults to 0.0.

0.0
dropout float

Embedding dropout probability. Defaults to 0.0.

0.0
scale float

Scale word embeddings by a constant. Defaults to 1.0.

1.0
trainable bool

Finetune embeddings. Defaults to False

False
Source code in slp/modules/embed.py
def __init__(
    self,
    num_embeddings: int,
    embedding_dim: int,
    embeddings: Optional[np.ndarray] = None,
    noise: float = 0.0,
    dropout: float = 0.0,
    scale: float = 1.0,
    trainable: bool = False,
):
    """
    Define the layer of the model and perform the initializations
    of the layers (wherever it is necessary)

    Args:
        num_embeddings (int): Total number of embeddings.
        embedding_dim (int): Embedding dimension.
        embeddings (numpy.ndarray): the 2D ndarray with the word vectors.
        noise (float): Optional additive noise. Defaults to 0.0.
        dropout (float): Embedding dropout probability. Defaults to 0.0.
        scale (float): Scale word embeddings by a constant. Defaults to 1.0.
        trainable (bool): Finetune embeddings. Defaults to False
    """
    super(Embed, self).__init__()
    self.scale = scale  # scale embeddings by value. Needed for transformer
    # define the embedding layer, with the corresponding dimensions
    self.embedding = nn.Embedding(
        num_embeddings=num_embeddings, embedding_dim=embedding_dim
    )

    if embeddings is not None:
        logger.info("Initializing Embedding layer with pre-trained weights.")
        if trainable:
            logger.info("Embeddings are going to be finetuned")
        else:
            logger.info("Embeddings are frozen")
        self.init_embeddings(embeddings, trainable)

    # the dropout "layer" for the word embeddings
    self.dropout = nn.Dropout(dropout)

    # the gaussian noise "layer" for the word embeddings
    self.noise = GaussianNoise(noise)

forward(self, x)

Embed input tokens

Assign embedding that corresponds to each token. Optionally add Gaussian noise and embedding dropout and scale embeddings by a constant.

Parameters:

Name Type Description Default
x Tensor

[B, L] Input token ids.

required

Returns:

Type Description
Tensor

(torch.Tensor) -> [B, L, E] Embedded tokens.

Source code in slp/modules/embed.py
def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Embed input tokens

    Assign embedding that corresponds to each token.
    Optionally add Gaussian noise and embedding dropout and scale embeddings by a constant.

    Args:
        x (torch.Tensor): [B, L] Input token ids.

    Returns:
        (torch.Tensor) -> [B, L, E] Embedded tokens.
    """
    embeddings = self.embedding(x)

    if self.noise.stddev > 0:
        embeddings = self.noise(embeddings)

    if self.dropout.p > 0:
        embeddings = self.dropout(embeddings)

    return embeddings * self.scale  # type: ignore
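
A sketch with a stand-in pretrained embedding matrix (random here for illustration); the embeddings stay frozen because trainable defaults to False:

import numpy as np
import torch
from slp.modules.embed import Embed

pretrained = np.random.randn(1000, 50).astype(np.float32)  # stand-in for real word vectors

embed = Embed(
    num_embeddings=1000,
    embedding_dim=50,
    embeddings=pretrained,
    noise=0.05,
    dropout=0.1,
)
token_ids = torch.randint(0, 1000, (4, 12))  # (B, L)
print(embed(token_ids).shape)                # torch.Size([4, 12, 50])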

init_embeddings(self, weights, trainable)

Initialize embeddings matrix with pretrained embeddings

Parameters:

Name Type Description Default
weights ndarray

pretrained embeddings

required
trainable bool

Finetune embeddings?

required
Source code in slp/modules/embed.py
def init_embeddings(self, weights: np.ndarray, trainable: bool):
    """Initialize embeddings matrix with pretrained embeddings

    Args:
        weights (np.ndarray): pretrained embeddings
        trainable (bool): Finetune embeddings?
    """
    self.embedding.weight = nn.Parameter(
        torch.from_numpy(weights), requires_grad=trainable
    )

PositionalEncoding

__init__(self, embedding_dim=512, max_len=5000) special

Inject some information about the relative or absolute position of the tokens in the sequence.

The positional encodings have the same dimension as the embeddings, so that the two can be summed. Here, we use sine and cosine functions of different frequencies.

PE for even positions:

\[\text{PosEncoder}(pos, 2i) = sin(\frac{pos}{10000^{\frac{2i}{d}}})\]

PE for odd positions:

\[\text{PosEncoder}(pos, 2i+1) = cos(\frac{pos}{10000^{\frac{2i}{d}}})\]

where \(pos\) is the word position and \(i\) is the embedding idx

Implementation modified from pytorch/examples/word_language_model.py

Parameters:

Name Type Description Default
embedding_dim int

Embedding / model dimension. Defaults to 512.

512
max_len int

Maximum sequence length that can be encoded. Defaults to 5000.

5000
Source code in slp/modules/embed.py
def __init__(self, embedding_dim: int = 512, max_len: int = 5000):
    r"""Inject some information about the relative or absolute position of the tokens in the sequence.

    The positional encodings have the same dimension as
    the embeddings, so that the two can be summed. Here, we use sine and cosine
    functions of different frequencies.

    PE for even positions:

    $$\text{PosEncoder}(pos, 2i) = sin(\frac{pos}{10000^{\frac{2i}{d}}})$$

    PE for odd positions:

    $$\text{PosEncoder}(pos, 2i+1) = cos(\frac{pos}{10000^{\frac{2i}{d}}})$$

    where $pos$ is the word position and $i$ is the embedding idx

    Implementation modified from pytorch/examples/word_language_model.py

    Args:
        embedding_dim (int): Embedding / model dimension. Defaults to 512.
        max_len (int): Maximum sequence length that can be encoded. Defaults to 5000.
    """
    super(PositionalEncoding, self).__init__()
    pe = torch.zeros(max_len, embedding_dim)
    position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(
        torch.arange(0, embedding_dim, 2).float()
        * (-math.log(10000.0) / embedding_dim)
    )
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    pe = pe.unsqueeze(0)
    self.register_buffer("pe", pe)

forward(self, x)

Calculate positional embeddings for input and add them to input tensor

\[out = x + PosEmbed(x)\]

x is assumed to be batch first

Parameters:

Name Type Description Default
x Tensor

[B, L, D] input embeddings

required

Returns:

Type Description
Tensor

torch.Tensor: Embeddings + positional embeddings

Source code in slp/modules/embed.py
def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Calculate positional embeddings for input and add them to input tensor

    $$out = x + PosEmbed(x)$$

    x is assumed to be batch first

    Args:
        x (torch.Tensor): [B, L, D] input embeddings

    Returns:
        torch.Tensor: Embeddings + positional embeddings
    """
    x = x + self.pe[:, : x.size(1), :]  # type: ignore
    return x
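
A sketch adding sinusoidal positional information to a batch-first tensor of embeddings:

import torch
from slp.modules.embed import PositionalEncoding

pos = PositionalEncoding(embedding_dim=64, max_len=512)
x = torch.randn(8, 20, 64)  # (B, L, D) token embeddings
print(pos(x).shape)         # torch.Size([8, 20, 64]), embeddings + positional encodings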

PositionwiseFF

__init__(self, d_model, d_ff, dropout=0.1, gelu=False) special

Transformer Position-wise feed-forward layer

Linear -> ReLU/GELU -> Dropout -> Linear

Parameters:

Name Type Description Default
d_model int

Model dimension

required
d_ff int

Hidden dimension

required
dropout float

Dropout probability. Defaults to 0.1.

0.1
Source code in slp/modules/feedforward.py
def __init__(self, d_model: int, d_ff: int, dropout: float = 0.1, gelu=False):
    """Transformer Position-wise feed-forward layer

    Linear -> ReLU/GELU -> Dropout -> Linear

    Args:
        d_model (int): Model dimension
        d_ff (int): Hidden dimension
        dropout (float): Dropout probability. Defaults to 0.1.
    """
    super(PositionwiseFF, self).__init__()
    self.ff1 = nn.Linear(d_model, d_ff)
    self.ff2 = nn.Linear(d_ff, d_model)
    self.drop = nn.Dropout(dropout)
    self.activation = nn.ReLU() if not gelu else nn.GELU()

forward(self, x)

Position-wise FF forward pass

\[out = W_2 \cdot max(0, W_1 \cdot x + b_1) + b_2\]

[B, *, D] -> [B, *, H] -> [B, *, D]

  • B: Batch size
  • D: Model dim
  • H: Hidden size > Model dim (Usually \(H = 2D\))

Parameters:

Name Type Description Default
x Tensor

[B, *, D] Input features

required

Returns:

Type Description
Tensor

torch.Tensor: [B, *, D] Output features

Source code in slp/modules/feedforward.py
def forward(self, x: torch.Tensor) -> torch.Tensor:
    r"""Position-wise FF forward pass

    $$out = W_2 \cdot max(0, W_1 \cdot x + b_1) + b_2$$

    [B, *, D] -> [B, *, H] -> [B, *, D]

    * B: Batch size
    * D: Model dim
    * H: Hidden size > Model dim (Usually $H = 2D$)

    Args:
        x (torch.Tensor): [B, *, D] Input features

    Returns:
        torch.Tensor: [B, *, D] Output features
    """
    out: torch.Tensor = self.ff2(self.drop(self.activation(self.ff1(x))))
    return out
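
A sketch of the position-wise feed-forward block; the model dimension is preserved while d_ff is the wider hidden size:

import torch
from slp.modules.feedforward import PositionwiseFF

ff = PositionwiseFF(d_model=64, d_ff=256, dropout=0.1, gelu=True)
x = torch.randn(8, 20, 64)  # (B, L, d_model)
print(ff(x).shape)          # torch.Size([8, 20, 64])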

TwoLayer

forward(self, x)

Defines the computation performed at every call.

Should be overridden by all subclasses.

.. note:: Although the recipe for forward pass needs to be defined within this function, one should call the :class:Module instance afterwards instead of this since the former takes care of running the registered hooks while the latter silently ignores them.

Source code in slp/modules/feedforward.py
def forward(self, x):
    out = self.l1(x)
    out = self.drop(out)
    out = self.act(out)
    out = self.l2(out)
    out = self.drop(out)

    if self.residual:
        out = x + out

    return out

LayerNormTf

__init__(self, hidden_size, eps=1e-12) special

Construct a layernorm module in the TF style (epsilon inside the square root). Link: https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L234

Source code in slp/modules/norm.py
def __init__(self, hidden_size: int, eps: float = 1e-12):
    """Construct a layernorm module in the TF style (epsilon inside the square root).
    Link: https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L234
    """
    super(LayerNormTf, self).__init__()
    self.weight = nn.Parameter(torch.ones(hidden_size))
    self.bias = nn.Parameter(torch.zeros(hidden_size))
    self.variance_epsilon = eps

forward(self, x)

Calculate Layernorm the tf way

Source code in slp/modules/norm.py
def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Calculate Layernorm the tf way"""
    u = x.mean(-1, keepdim=True)
    s = (x - u).pow(2).mean(-1, keepdim=True)
    x = (x - u) / torch.sqrt(s + self.variance_epsilon)

    return self.weight * x + self.bias

ScaleNorm

forward(self, x)

Defines the computation performed at every call.

Should be overridden by all subclasses.

.. note:: Although the recipe for forward pass needs to be defined within this function, one should call the :class:Module instance afterwards instead of this since the former takes care of running the registered hooks while the latter silently ignores them.

Source code in slp/modules/norm.py
def forward(self, x: torch.Tensor):
    scaled_norm = self.g / safe_norm(x, dim=-1, keepdim=True).clamp(min=self.eps)

    return scaled_norm * x

GaussianNoise

__init__(self, stddev, mean=0.0) special

Additive Gaussian Noise layer

Parameters:

Name Type Description Default
stddev float

the standard deviation of the distribution

required
mean float

the mean of the distribution

0.0
Source code in slp/modules/regularization.py
def __init__(self, stddev: float, mean: float = 0.0):
    """Additive Gaussian Noise layer

    Args:
        stddev (float): the standard deviation of the distribution
        mean (float): the mean of the distribution
    """
    super().__init__()
    self.stddev = stddev
    self.mean = mean

__repr__(self) special

String representation of class

Source code in slp/modules/regularization.py
def __repr__(self):
    """String representation of class"""
    return "{} (mean={}, stddev={})".format(
        self.__class__.__name__, str(self.mean), str(self.stddev)
    )

forward(self, x)

Gaussian noise forward pass

Parameters:

Name Type Description Default
x Tensor

Input features.

required

Returns:

Type Description
Tensor

torch.Tensor: Input with additive Gaussian noise applied during training; unchanged at inference.

Source code in slp/modules/regularization.py
def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Gaussian noise forward pass

    Args:
        x (torch.Tensor): Input features.

    Returns:
        torch.Tensor: Input with additive Gaussian noise applied during training; unchanged at inference.
    """
    if self.training:
        noise = Variable(x.data.new(x.size()).normal_(self.mean, self.stddev))
        return x + noise
    return x
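
The layer is only active in training mode, as a quick sketch shows:

import torch
from slp.modules.regularization import GaussianNoise

noise = GaussianNoise(stddev=0.1)
x = torch.randn(4, 8)

noise.train()
print(torch.equal(noise(x), x))  # False: noise is added while training

noise.eval()
print(torch.equal(noise(x), x))  # True: identity at inference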

AttentiveRNN

__init__(self, input_size, hidden_size=256, batch_first=True, layers=1, bidirectional=False, merge_bi='cat', dropout=0.1, rnn_type='lstm', packed_sequence=True, attention=False, max_length=-1, num_heads=1, nystrom=True, num_landmarks=32, kernel_size=33, inverse_iterations=6, return_hidden=False) special

RNN with optional attention mechanism

Single-headed scaled dot-product attention is used when num_heads=1; multi-head (optionally nystrom-approximated) attention is used when num_heads > 1

Parameters:

Name Type Description Default
input_size int

Input features dimension

required
hidden_size int

Hidden features

256
batch_first bool

Use batch first representation type. Defaults to True.

True
layers int

Number of RNN layers. Defaults to 1.

1
bidirectional bool

Use bidirectional RNNs. Defaults to False.

False
merge_bi str

How bidirectional states are merged. Defaults to "cat".

'cat'
dropout float

Dropout probability. Defaults to 0.0.

0.1
rnn_type str

lstm or gru. Defaults to "lstm".

'lstm'
packed_sequence bool

Use packed sequences. Defaults to True.

True
max_length int

Maximum sequence length for fixed length padding. If -1 takes the largest sequence length in this batch

-1
attention bool

Use attention mechanism. Defaults to False

False
num_heads int

Number of attention heads. If 1 uses single headed attention

1
nystrom bool

Use nystrom approximation for multihead attention

True
num_landmarks int

Number of landmark sequence elements for nystrom attention

32
kernel_size Optional[int]

Kernel size for multihead attention output residual convolution

33
inverse_iterations int

Number of iterations for moore-penrose inverse approximation in nystrom attention. 6 is a good value

6
return_hidden bool

Return all hidden states. Defaults to False.

False
Source code in slp/modules/rnn.py
def __init__(
    self,
    input_size: int,
    hidden_size: int = 256,
    batch_first: bool = True,
    layers: int = 1,
    bidirectional: bool = False,
    merge_bi: str = "cat",
    dropout: float = 0.1,
    rnn_type: str = "lstm",
    packed_sequence: bool = True,
    attention: bool = False,
    max_length: int = -1,
    num_heads: int = 1,
    nystrom: bool = True,
    num_landmarks: int = 32,
    kernel_size: Optional[int] = 33,
    inverse_iterations: int = 6,
    return_hidden: bool = False,
):
    """RNN with embedding layer and optional attention mechanism

    Single-headed scaled dot-product attention is used as an attention mechanism

    Args:
        input_size (int): Input features dimension
        hidden_size (int): Hidden features
        batch_first (bool): Use batch first representation type. Defaults to True.
        layers (int): Number of RNN layers. Defaults to 1.
        bidirectional (bool): Use bidirectional RNNs. Defaults to False.
        merge_bi (str): How bidirectional states are merged. Defaults to "cat".
        dropout (float): Dropout probability. Defaults to 0.0.
        rnn_type (str): lstm or gru. Defaults to "lstm".
        packed_sequence (bool): Use packed sequences. Defaults to True.
        max_length (int): Maximum sequence length for fixed length padding. If -1 takes the
            largest sequence length in this batch
        attention (bool): Use attention mechanism. Defaults to False
        num_heads (int): Number of attention heads. If 1 uses single headed attention
        nystrom (bool): Use nystrom approximation for multihead attention
        num_landmarks (int): Number of landmark sequence elements for nystrom attention
        kernel_size (int): Kernel size for multihead attention output residual convolution
        inverse_iterations (int): Number of iterations for moore-penrose inverse approximation
            in nystrom attention. 6 is a good value
        return_hidden (bool): Return all hidden states. Defaults to False.
    """
    super(AttentiveRNN, self).__init__()
    self.rnn = RNN(
        input_size,  # type: ignore
        hidden_size,
        batch_first=batch_first,
        layers=layers,
        merge_bi=merge_bi,
        bidirectional=bidirectional,
        dropout=dropout,
        rnn_type=rnn_type,
        packed_sequence=packed_sequence,
        max_length=max_length,
    )
    self.out_size = (
        hidden_size
        if not (bidirectional and merge_bi == "cat")
        else 2 * hidden_size
    )
    self.batch_first = batch_first
    self.return_hidden = return_hidden

    self.attention = None

    if attention:
        if num_heads == 1:
            self.attention = Attention(
                attention_size=self.out_size, dropout=dropout
            )
        else:
            self.attention = MultiheadAttention(  # type: ignore
                attention_size=self.out_size,
                num_heads=num_heads,
                kernel_size=kernel_size,
                nystrom=nystrom,
                num_landmarks=num_landmarks,
                inverse_iterations=inverse_iterations,
                dropout=dropout,
            )

forward(self, x, lengths)

Attentive RNN forward pass

If self.attention=True, the output is the weighted sum of the RNN hidden states, using the attention scores as weights. Otherwise, the output is the last hidden state of the RNN.

Parameters:

Name Type Description Default
x Tensor

[B, L, D] Input features

required
lengths Tensor

[B] Original sequence lengths

required

Returns:

Type Description
Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]

Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: If return_hidden == False, returns a [B, H] or [B, 2*H] tensor of output features to be used for classification. If return_hidden == True, returns the same output features together with a tensor of all the hidden states.

Source code in slp/modules/rnn.py
def forward(
    self, x: torch.Tensor, lengths: torch.Tensor
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
    """Attentive RNN forward pass

    If self.attention=True then the outputs are the weighted sum of the RNN hidden states with the attention score weights
    Else the output is the last hidden state of the RNN.

    Args:
        x (torch.Tensor): [B, L] Input token ids
        lengths (torch.Tensor): [B] Original sequence lengths

    Returns:
        Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
            if return_hidden == False: Returns a tensor [B, H] or [B, 2*H] of output features to be used for classification
            if return_hidden == True: Returns a tensor [B, H] or [B, 2*H] of output features to
                be used for classification, and a tensor of all the hidden states
    """
    states, last_hidden, _ = self.rnn(x, lengths)

    out: torch.Tensor = last_hidden

    if self.attention is not None:
        states, _ = self.attention(
            states,
            attention_mask=pad_mask(
                lengths,
                max_length=states.size(1) if self.batch_first else states.size(0),
            ),
        )
        out = states.mean(dim=1)

    if self.return_hidden:
        return out, states
    else:
        return out
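
A minimal usage sketch, assuming AttentiveRNN is importable from slp.modules.rnn (the source path shown above). The input is a padded feature tensor with the corresponding original lengths; lengths are sorted in decreasing order here to stay on the safe side with packed sequences:

import torch
from slp.modules.rnn import AttentiveRNN

model = AttentiveRNN(
    input_size=35,      # feature dimension of the inputs
    hidden_size=256,
    bidirectional=True,
    merge_bi="cat",
    attention=True,     # single-headed attention since num_heads=1
)
x = torch.randn(8, 40, 35)  # (B, L, D) padded input features
lengths = torch.sort(torch.randint(5, 41, (8,)), descending=True).values  # (B,)
out = model(x, lengths)     # (B, 2 * hidden_size) because bidirectional with merge_bi="cat"
print(out.shape)            # torch.Size([8, 512])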

RNN

out_size: int property readonly

RNN output features size

Returns:

Type Description
int

int: RNN output features size

__init__(self, input_size, hidden_size, batch_first=True, layers=1, bidirectional=False, merge_bi='cat', dropout=0.0, rnn_type='lstm', packed_sequence=True, max_length=-1) special

LSTM - GRU wrapper with packed sequence support and handling for bidirectional / last output states

It is recommended to run with batch_first=True because the rest of the code is built with this assumption

Parameters:

Name Type Description Default
input_size int

Input features.

required
hidden_size int

Hidden features.

required
batch_first bool

Use batch first representation type. Defaults to True.

True
layers int

Number of RNN layers. Defaults to 1.

1
bidirectional bool

Use bidirectional RNNs. Defaults to False.

False
merge_bi str

How bidirectional states are merged. Defaults to "cat".

'cat'
dropout float

Dropout probability. Defaults to 0.0.

0.0
rnn_type str

lstm or gru. Defaults to "lstm".

'lstm'
packed_sequence bool

Use packed sequences. Defaults to True.

True
max_length int

Maximum sequence length for fixed length padding. If -1 takes the largest sequence length in this batch

-1
Source code in slp/modules/rnn.py
def __init__(
    self,
    input_size: int,
    hidden_size: int,
    batch_first: bool = True,
    layers: int = 1,
    bidirectional: bool = False,
    merge_bi: str = "cat",
    dropout: float = 0.0,
    rnn_type: str = "lstm",
    packed_sequence: bool = True,
    max_length: int = -1,
):
    """LSTM - GRU wrapper with packed sequence support and handling for bidirectional / last output states

    It is recommended to run with batch_first=True because the rest of the code is built with this assumption

    Args:
        input_size (int): Input features.
        hidden_size (int): Hidden features.
        batch_first (bool): Use batch first representation type. Defaults to True.
        layers (int): Number of RNN layers. Defaults to 1.
        bidirectional (bool): Use bidirectional RNNs. Defaults to False.
        merge_bi (str): How bidirectional states are merged. Defaults to "cat".
        dropout (float): Dropout probability. Defaults to 0.0.
        rnn_type (str): lstm or gru. Defaults to "lstm".
        packed_sequence (bool): Use packed sequences. Defaults to True.
    """
    super(RNN, self).__init__()
    self.bidirectional = bidirectional
    self.hidden_size = hidden_size
    self.batch_first = batch_first
    self.merge_bi = merge_bi
    self.rnn_type = rnn_type.lower()

    if not batch_first:
        logger.warning(
            "You are running RNN with batch_first=False. Make sure this is really what you want"
        )

    if not packed_sequence:
        logger.warning(
            "You have set packed_sequence=False. Running with packed_sequence=True will be much faster"
        )

    rnn_cls = nn.LSTM if self.rnn_type == "lstm" else nn.GRU
    self.rnn = rnn_cls(
        input_size,
        hidden_size,
        batch_first=batch_first,
        num_layers=layers,
        bidirectional=bidirectional,
    )
    self.drop = nn.Dropout(dropout)
    self.packed_sequence = packed_sequence

    if packed_sequence:
        self.pack = PackSequence(batch_first=batch_first)
        self.unpack = PadPackedSequence(
            batch_first=batch_first, max_length=max_length
        )

forward(self, x, lengths)

RNN forward pass

Parameters:

Name Type Description Default
x Tensor

[B, L, D] Input features

required
lengths Tensor

[B] Original sequence lengths

required

Returns:

Type Description
Tuple[torch.Tensor, torch.Tensor, Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]]

Tuple[torch.Tensor, torch.Tensor, Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]]: (merged forward and backward states [B, L, H] or [B, L, 2*H], merged last forward and backward state [B, H] or [B, 2*H], hidden states: a tuple of [num_layers * num_directions, B, H] tensors for LSTM or a single [num_layers * num_directions, B, H] tensor for GRU)

Source code in slp/modules/rnn.py
def forward(
    self, x: torch.Tensor, lengths: torch.Tensor
) -> Tuple[
    torch.Tensor,
    torch.Tensor,
    Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
]:
    """RNN forward pass

    Args:
        x (torch.Tensor): [B, L, D] Input features
        lengths (torch.Tensor): [B] Original sequence lengths

    Returns:
        Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: (
            merged forward and backward states [B, L, H] or [B, L, 2*H],
            merged last forward and backward state [B, H] or [B, 2*H],
            hidden states tuple of [num_layers * num_directions, B, H] for LSTM or tensor [num_layers * num_directions, B, H] for GRU
        )
    """
    self.rnn.flatten_parameters()

    if self.packed_sequence:
        # Latest pytorch allows only cpu tensors for packed sequence
        lengths = lengths.to("cpu")
        x, lengths = self.pack(x, lengths)
    out, hidden = self.rnn(x)

    if self.packed_sequence:
        out = self.unpack(out, lengths)
    out = self.drop(out)
    lengths = lengths.to(out.device)

    out, last_timestep = self._final_output(out, lengths)

    return out, last_timestep, hidden
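
A minimal usage sketch, assuming RNN is importable from slp.modules.rnn (the source path shown above). With packed sequences (the default), the sequence dimension of the returned states equals the largest length in the batch unless max_length is set:

import torch
from slp.modules.rnn import RNN

rnn = RNN(input_size=35, hidden_size=256, bidirectional=True, merge_bi="cat")
x = torch.randn(8, 40, 35)  # (B, L, D) padded input features
lengths = torch.sort(torch.randint(5, 41, (8,)), descending=True).values  # (B,)
states, last, hidden = rnn(x, lengths)
print(states.shape)  # (B, L', 2 * hidden_size), with L' = max(lengths) for packed sequences
print(last.shape)    # (B, 2 * hidden_size) merged last forward/backward state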

TokenRNN

__init__(self, hidden_size=256, vocab_size=None, embeddings_dim=None, embeddings=None, embeddings_dropout=0.0, finetune_embeddings=False, batch_first=True, layers=1, bidirectional=False, merge_bi='cat', dropout=0.1, rnn_type='lstm', packed_sequence=True, attention=False, max_length=-1, num_heads=1, nystrom=True, num_landmarks=32, kernel_size=33, inverse_iterations=6, return_hidden=False) special

RNN with embedding layer and optional attention mechanism

Single-headed scaled dot-product attention is used when num_heads=1; multi-head (optionally nystrom-approximated) attention is used when num_heads > 1

Parameters:

Name Type Description Default
hidden_size int

Hidden features

256
vocab_size Optional[int]

Vocabulary size. Defaults to None.

None
embeddings_dim Optional[int]

Embedding dimension. Defaults to None.

None
embeddings Optional[numpy.ndarray]

Embedding matrix. Defaults to None.

None
embeddings_dropout float

Embedding dropout probability. Defaults to 0.0.

0.0
finetune_embeddings bool

Finetune embeddings? Defaults to False.

False
batch_first bool

Use batch first representation type. Defaults to True.

True
layers int

Number of RNN layers. Defaults to 1.

1
bidirectional bool

Use bidirectional RNNs. Defaults to False.

False
merge_bi str

How bidirectional states are merged. Defaults to "cat".

'cat'
dropout float

Dropout probability. Defaults to 0.0.

0.1
rnn_type str

lstm or gru. Defaults to "lstm".

'lstm'
packed_sequence bool

Use packed sequences. Defaults to True.

True
max_length int

Maximum sequence length for fixed length padding. If -1 takes the largest sequence length in this batch

-1
attention bool

Use attention mechanism. Defaults to False

False
num_heads int

Number of attention heads. If 1 uses single headed attention

1
nystrom bool

Use nystrom approximation for multihead attention

True
num_landmarks int

Number of landmark sequence elements for nystrom attention

32
kernel_size Optional[int]

Kernel size for multihead attention output residual convolution

33
inverse_iterations int

Number of iterations for moore-penrose inverse approximation in nystrom attention. 6 is a good value

6
return_hidden bool

Return all hidden states. Defaults to False.

False
Source code in slp/modules/rnn.py
def __init__(
    self,
    hidden_size: int = 256,
    vocab_size: Optional[int] = None,
    embeddings_dim: Optional[int] = None,
    embeddings: Optional[np.ndarray] = None,
    embeddings_dropout: float = 0.0,
    finetune_embeddings: bool = False,
    batch_first: bool = True,
    layers: int = 1,
    bidirectional: bool = False,
    merge_bi: str = "cat",
    dropout: float = 0.1,
    rnn_type: str = "lstm",
    packed_sequence: bool = True,
    attention: bool = False,
    max_length: int = -1,
    num_heads: int = 1,
    nystrom: bool = True,
    num_landmarks: int = 32,
    kernel_size: Optional[int] = 33,
    inverse_iterations: int = 6,
    return_hidden=False,
):
    """RNN with embedding layer and optional attention mechanism

    Single-headed scaled dot-product attention is used as an attention mechanism

    Args:
        hidden_size (int): Hidden features
        vocab_size (Optional[int]): Vocabulary size. Defaults to None.
        embeddings_dim (Optional[int]): Embedding dimension. Defaults to None.
        embeddings (Optional[np.ndarray]): Embedding matrix. Defaults to None.
        embeddings_dropout (float): Embedding dropout probability. Defaults to 0.0.
        finetune_embeddings (bool): Finetune embeddings? Defaults to False.
        batch_first (bool): Use batch first representation type. Defaults to True.
        layers (int): Number of RNN layers. Defaults to 1.
        bidirectional (bool): Use bidirectional RNNs. Defaults to False.
        merge_bi (str): How bidirectional states are merged. Defaults to "cat".
        dropout (float): Dropout probability. Defaults to 0.0.
        rnn_type (str): lstm or gru. Defaults to "lstm".
        packed_sequence (bool): Use packed sequences. Defaults to True.
        max_length (int): Maximum sequence length for fixed length padding. If -1 takes the
            largest sequence length in this batch
        attention (bool): Use attention mechanism. Defaults to False
        num_heads (int): Number of attention heads. If 1 uses single headed attention
        nystrom (bool): Use nystrom approximation for multihead attention
        num_landmarks (int): Number of landmark sequence elements for nystrom attention
        kernel_size (int): Kernel size for multihead attention output residual convolution
        inverse_iterations (int): Number of iterations for moore-penrose inverse approximation
            in nystrom attention. 6 is a good value
    """
    super(TokenRNN, self).__init__()

    if embeddings is None:
        finetune_embeddings = True
        assert (
            vocab_size is not None
        ), "You should either pass an embeddings matrix or vocab size"
        assert (
            embeddings_dim is not None
        ), "You should either pass an embeddings matrix or embeddings_dim"
    else:
        vocab_size = embeddings.shape[0]
        embeddings_dim = embeddings.shape[1]

    self.embed = Embed(
        vocab_size,  # type: ignore
        embeddings_dim,  # type: ignore
        embeddings=embeddings,
        dropout=embeddings_dropout,
        scale=hidden_size ** 0.5,
        trainable=finetune_embeddings,
    )
    self.encoder = AttentiveRNN(
        embeddings_dim,  # type: ignore
        hidden_size,
        batch_first=batch_first,
        layers=layers,
        bidirectional=bidirectional,
        merge_bi=merge_bi,
        dropout=dropout,
        rnn_type=rnn_type,
        packed_sequence=packed_sequence,
        attention=attention,
        max_length=max_length,
        num_heads=num_heads,
        nystrom=nystrom,
        num_landmarks=num_landmarks,
        kernel_size=kernel_size,
        inverse_iterations=inverse_iterations,
        return_hidden=return_hidden,
    )

    self.out_size = self.encoder.out_size

forward(self, x, lengths)

Token RNN forward pass

If self.attention=True, the output is the weighted sum of the RNN hidden states, using the attention scores as weights. Otherwise, the output is the last hidden state of the RNN.

Parameters:

Name Type Description Default
x Tensor

[B, L] Input token ids

required
lengths Tensor

[B] Original sequence lengths

required

Returns:

Type Description
Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]

torch.Tensor: [B, H] or [B, 2*H] Output features to be used for classification. If return_hidden=True, a tuple of the output features and all hidden states is returned instead.

Source code in slp/modules/rnn.py
def forward(
    self, x: torch.Tensor, lengths: torch.Tensor
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
    """Token RNN forward pass

    If self.attention=True then the outputs are the weighted sum of the RNN hidden states with the attention score weights
    Else the output is the last hidden state of the RNN.

    Args:
        x (torch.Tensor): [B, L] Input token ids
        lengths (torch.Tensor): [B] Original sequence lengths

    Returns:
        torch.Tensor: [B, H] or [B, 2*H] Output features to be used for classification
    """
    x = self.embed(x)
    out = self.encoder(x, lengths)

    return out  # type: ignore
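
A minimal usage sketch, assuming TokenRNN is importable from slp.modules.rnn (the source path shown above). Since no pretrained embeddings matrix is passed, both vocab_size and embeddings_dim must be given and the embedding layer is trained from scratch:

import torch
from slp.modules.rnn import TokenRNN

model = TokenRNN(
    hidden_size=256,
    vocab_size=10000,    # required when embeddings is None ...
    embeddings_dim=300,  # ... together with embeddings_dim
    bidirectional=True,
    attention=True,
)
tokens = torch.randint(0, 10000, (8, 40))  # (B, L) token ids
lengths = torch.sort(torch.randint(5, 41, (8,)), descending=True).values  # (B,)
out = model(tokens, lengths)  # (B, 2 * hidden_size)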

Decoder

forward(self, target, encoded, source_mask=None, target_mask=None)

Decoder forward pass. The target sequence is passed through each decoder layer in turn, attending over the encoded source representation with the given source and target masks.

Source code in slp/modules/transformer.py
def forward(self, target, encoded, source_mask=None, target_mask=None):

    for l in self.decoder:
        target = l(
            target, encoded, source_mask=source_mask, target_mask=target_mask
        )

    return target

DecoderLayer

forward(self, targets, encoded, source_mask=None, target_mask=None)

Decoder layer forward pass: apply the input sublayer to the targets (masked with target_mask), fuse the encoded source with the transformed targets through the fuse sublayer (masked with source_mask), and pass the result through the output sublayer.

Source code in slp/modules/transformer.py
def forward(self, targets, encoded, source_mask=None, target_mask=None):
    targets = self.in_layer(targets, attention_mask=target_mask)
    out = self.fuse_layer(encoded, targets, attention_mask=source_mask)
    out = self.out_layer(out)

    return out

Encoder

forward(self, x, attention_mask=None)

Encoder forward pass. The input is passed through each encoder layer in turn, with the same attention_mask applied at every layer.

Source code in slp/modules/transformer.py
def forward(self, x, attention_mask=None):
    for layer in self.encoder:
        x = layer(x, attention_mask=attention_mask)

    return x

EncoderDecoder

forward(self, source, target, source_mask=None, target_mask=None)

Encoder-decoder forward pass: encode the source sequence, then decode the target sequence conditioned on the encoded source, using the provided source and target masks.

Source code in slp/modules/transformer.py
def forward(self, source, target, source_mask=None, target_mask=None):
    encoded = self.encoder(source, attention_mask=source_mask)
    decoded = self.decoder(
        target, encoded, source_mask=source_mask, target_mask=target_mask
    )

    return decoded

EncoderLayer

forward(self, x, attention_mask=None)

Encoder layer forward pass: apply the first sublayer (attention over x, masked with attention_mask), then pass its output through the second sublayer.

Source code in slp/modules/transformer.py
def forward(self, x, attention_mask=None):
    out = self.l1(x, attention_mask=attention_mask)
    out = self.l2(out)

    return out

Sublayer1

forward(self, x, attention_mask=None)

Apply the wrapped transform, which takes a single input and an attention mask (e.g. self-attention), through the pre-norm or post-norm residual path depending on self.prenorm.

Source code in slp/modules/transformer.py
def forward(self, x, attention_mask=None):
    return (
        self._prenorm(x, attention_mask=attention_mask)
        if self.prenorm
        else self._postnorm(x, attention_mask=attention_mask)
    )

Sublayer2

forward(self, x)

Apply the wrapped transform, which takes a single input (e.g. the position-wise feed-forward), through the pre-norm or post-norm residual path depending on self.prenorm.

Source code in slp/modules/transformer.py
def forward(self, x):
    return self._prenorm(x) if self.prenorm else self._postnorm(x)

Sublayer3

forward(self, x, y, attention_mask=None)

Apply the wrapped transform, which takes two inputs and an attention mask (e.g. encoder-decoder attention), through the pre-norm or post-norm residual path depending on self.prenorm.

Source code in slp/modules/transformer.py
def forward(self, x, y, attention_mask=None):
    return (
        self._prenorm(x, y, attention_mask=attention_mask)
        if self.prenorm
        else self._postnorm(x, y, attention_mask=attention_mask)
    )
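
All three sublayer wrappers share the same pattern: the wrapped transform is applied inside a residual connection, with normalization placed either before (pre-norm) or after (post-norm) the residual branch, depending on self.prenorm. The standalone sketch below illustrates the two paths with plain PyTorch; it is not the slp implementation, and the class name, the use of nn.LayerNorm and the single-input transform are assumptions made for the example:

import torch
import torch.nn as nn

class ResidualSublayer(nn.Module):
    # Wraps an arbitrary single-input transform with a residual connection and
    # layer normalization, applied pre-norm or post-norm.
    def __init__(self, size: int, transform: nn.Module, prenorm: bool = True):
        super().__init__()
        self.norm = nn.LayerNorm(size)
        self.transform = transform
        self.prenorm = prenorm

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.prenorm:
            return x + self.transform(self.norm(x))  # pre-norm: normalize, transform, add
        return self.norm(x + self.transform(x))      # post-norm: transform, add, normalize

ff = nn.Sequential(nn.Linear(512, 2048), nn.ReLU(), nn.Linear(2048, 512))
layer = ResidualSublayer(512, ff, prenorm=True)
out = layer(torch.randn(8, 20, 512))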

Transformer

forward(self, source, target, source_mask=None, target_mask=None)

Transformer forward pass: embed the source and target token sequences, add positional encodings, run the encoder-decoder block with the given masks, then apply dropout and the output projection.

Source code in slp/modules/transformer.py
def forward(self, source, target, source_mask=None, target_mask=None):
    source = self.embed(source)
    target = self.embed(target)
    # Adding embeddings + pos embeddings
    # is done in PositionalEncoding class
    source = self.pe(source)
    target = self.pe(target)
    out = self.transformer_block(
        source, target, source_mask=source_mask, target_mask=target_mask
    )
    out = self.drop(out)
    out = self.predict(out)

    return out
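
The source_mask and target_mask arguments follow the same zero-one mask convention as the attention modules: [B, L] for simple padding masks and [B, M, L] for full attention masks. The sketch below builds such masks with plain torch ops for illustration; it assumes, as is usual, that ones mark positions that may be attended to, and it is not one of slp's own mask utilities (slp provides e.g. pad_mask, used in the RNN modules above):

import torch

def padding_mask(lengths: torch.Tensor, max_length: int) -> torch.Tensor:
    # (B, L) zero-one mask: 1 for real tokens, 0 for padding
    positions = torch.arange(max_length).unsqueeze(0)  # (1, L)
    return (positions < lengths.unsqueeze(1)).long()   # (B, L)

def causal_mask(batch_size: int, length: int) -> torch.Tensor:
    # (B, M, L) zero-one mask that hides future positions for decoding;
    # in practice this would also be combined with the target padding mask.
    tri = torch.tril(torch.ones(length, length, dtype=torch.long))
    return tri.unsqueeze(0).expand(batch_size, -1, -1)

lengths = torch.tensor([40, 33, 17])
source_mask = padding_mask(lengths, max_length=40)  # (3, 40)
target_mask = causal_mask(batch_size=3, length=25)  # (3, 25, 25)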

TransformerSequenceEncoder

forward(self, x, attention_mask=None)

Sequence encoder forward pass: optionally normalize the input features, embed them, add positional encodings, run the encoder stack with the given attention_mask, and mean-pool over the sequence dimension.

Source code in slp/modules/transformer.py
def forward(self, x, attention_mask=None):
    if self.feature_norm:
        x = self.feature_norm(x)

    x = self.embed(x)
    x = self.pe(x)
    out = self.transformer_block(x, attention_mask=attention_mask).mean(dim=1)

    return out

TransformerTokenSequenceEncoder

forward(self, x, attention_mask=None)

Token sequence encoder forward pass: embed the input token ids, add positional encodings, run the encoder stack with the given attention_mask, and mean-pool over the sequence dimension.

Source code in slp/modules/transformer.py
def forward(self, x, attention_mask=None):
    x = self.embed(x)
    x = self.pe(x)
    out = self.transformer_block(x, attention_mask=attention_mask).mean(dim=1)

    return out

reset_parameters(named_parameters, gain=1.0)

Initialize parameters in the transformer model.

Source code in slp/modules/transformer.py
def reset_parameters(named_parameters, gain=1.0):
    """Initialize parameters in the transformer model."""

    for name, p in named_parameters:
        if p.dim() > 1:
            if "weight" in name:
                nn.init.xavier_normal_(p, gain=gain)

            if "bias" in name:
                nn.init.constant_(p, 0.0)
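
A minimal usage sketch: the function takes a named_parameters() iterator, so it can be applied to any module. Note that, per the source above, only parameters with more than one dimension are touched:

import torch.nn as nn
from slp.modules.transformer import reset_parameters

model = nn.Linear(512, 512)
reset_parameters(model.named_parameters(), gain=1.0)  # xavier-initializes the weight matrix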