Generic Modules
Modules implemented in slp. These modules can be used as building blocks for more complicated models.
Attention
__init__(self, attention_size=512, input_size=None, dropout=0.1)
special
Single-Headed Dot-product attention module
Parameters:
Name | Type | Description | Default |
---|---|---|---|
attention_size | int | Number of hidden features. Defaults to 512. | 512 |
input_size | Optional[int] | Input features. If None, input_size is set to attention_size. Defaults to None. | None |
dropout | float | Drop probability. Defaults to 0.1. | 0.1 |
Source code in slp/modules/attention.py
def __init__(
self,
attention_size: int = 512,
input_size: Optional[int] = None,
dropout: float = 0.1,
):
"""Single-Headed Dot-product attention module
Args:
attention_size (int): Number of hidden features. Defaults to 512.
input_size (Optional[int]): Input features. Defaults to None.
If None input_size is set to attention_size.
dropout (float): Drop probability. Defaults to 0.1.
"""
super(Attention, self).__init__()
if input_size is None:
input_size = attention_size
self.dk = input_size
self.k = nn.Linear(input_size, attention_size, bias=False)
self.q = nn.Linear(input_size, attention_size, bias=False)
self.v = nn.Linear(input_size, attention_size, bias=False)
self.dropout = dropout
reset_parameters(self.named_parameters())
forward(self, keys, queries=None, attention_mask=None)
Single-head scaled dot-product attention forward pass
Outputs the values, where features for each sequence element are weighted by their respective attention scores
- B: Batch size
- L: Keys Sequence length
- M: Queries Sequence length
- H: Number of heads
- A: Feature dimension
Parameters:
Name | Type | Description | Default |
---|---|---|---|
keys | Tensor | [B, L, D] Keys tensor | required |
queries | Optional[torch.Tensor] | Optional [B, M, D] Queries tensor. If None queries = keys. Defaults to None. | None |
attention_mask | Optional[torch.Tensor] | Optional [B, L] or [B, M, L] zero-one mask for sequence elements. Defaults to None. | None |
Returns:
Type | Description |
---|---|
Tuple[torch.Tensor, torch.Tensor] | (Reweighted values [B, L, D], attention scores [B, M, L]) |
Source code in slp/modules/attention.py
def forward(
self,
keys: torch.Tensor,
queries: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
r"""Single-head scaled dot-product attention forward pass
Outputs the values, where features for each sequence element are weighted by their respective attention scores
$$a = softmax(\frac{Q \cdot K^T}{\sqrt{d}}) \cdot V$$
* B: Batch size
* L: Keys Sequence length
* M: Queries Sequence length
* H: Number of heads
* A: Feature dimension
Args:
keys (torch.Tensor): [B, L, D] Keys tensor
queries (Optional[torch.Tensor]): Optional [B, M, D] Queries tensor. If None queries = keys. Defaults to None.
attention_mask (Optional[torch.Tensor]): Optional [B, L] or [B, M, L] zero-one mask for sequence elements. Defaults to None.
Returns:
Tuple[torch.Tensor, torch.Tensor]: (Reweighted values [B, L, D], attention scores [B, M, L])
"""
if attention_mask is not None:
if len(list(attention_mask.size())) == 2:
attention_mask = attention_mask.unsqueeze(1)
if queries is None:
queries = keys
values = keys
k = self.k(keys) # (B, L, A)
q = self.q(queries)
v = self.v(values)
# weights => (B, L, L)
out, scores = attention(
k,
q,
v,
self.dk,
attention_mask=attention_mask,
dropout=self.dropout,
training=self.training,
)
return out, scores
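A minimal usage sketch (assuming slp is installed and the class is importable from slp.modules.attention, per the source path above; the sizes are illustrative):

```python
import torch

from slp.modules.attention import Attention

att = Attention(attention_size=512, input_size=256, dropout=0.1)
keys = torch.rand(8, 20, 256)  # (B=8, L=20, D=256)
mask = torch.ones(8, 20)       # [B, L] zero-one mask: 1 = keep, 0 = masked
out, scores = att(keys, attention_mask=mask)
# out: (8, 20, 512) reweighted values, scores: (8, 20, 20) attention weights
```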
MultiheadAttention
__init__(self, attention_size=512, num_heads=8, input_size=None, dropout=0.1, nystrom=False, num_landmarks=64, inverse_iterations=6, kernel_size=None)
special
Multi-Headed Dot-product attention module
Parameters:
Name | Type | Description | Default |
---|---|---|---|
attention_size | int | Number of hidden features. Defaults to 512. | 512 |
num_heads | int | Number of attention heads | 8 |
input_size | Optional[int] | Input features. If None, input_size is set to attention_size. Defaults to None. | None |
dropout | float | Drop probability. Defaults to 0.1. | 0.1 |
nystrom | bool | Use nystrom method for attention calculation. Defaults to False. | False |
num_landmarks | int | Number of landmark points for nystrom attention. Defaults to 64. | 64 |
inverse_iterations | int | Number of iterations to calculate the inverse in nystrom attention. Defaults to 6. | 6 |
kernel_size | Optional[int] | Use residual convolution in the output. Defaults to None. | None |
Source code in slp/modules/attention.py
def __init__(
self,
attention_size: int = 512,
num_heads: int = 8,
input_size: Optional[int] = None,
dropout: float = 0.1,
nystrom: bool = False,
num_landmarks: int = 64,
inverse_iterations: int = 6,
kernel_size: Optional[int] = None,
):
"""Multi-Headed Dot-product attention module
Args:
attention_size (int): Number of hidden features. Defaults to 512.
num_heads (int): Number of attention heads
input_size (Optional[int]): Input features. Defaults to None.
If None input_size is set to attention_size.
dropout (float): Drop probability. Defaults to 0.1.
nystrom (bool, optional): Use nystrom method for attention calculation. Defaults to False.
num_landmarks (int, optional): Number of landmark points for nystrom attention. Defaults to 64.
inverse_iterations (int, optional): Number of iteration to calculate the inverse in nystrom attention. Defaults to 6.
kernel_size (Optional[int], optional): Use residual convolution in the output. Defaults to None.
"""
super(MultiheadAttention, self).__init__()
if input_size is None:
input_size = attention_size
self.inverse_iterations = inverse_iterations
self.num_landmarks = num_landmarks
self.nystrom = nystrom
self.num_heads = num_heads
self.head_size = int(attention_size / num_heads)
self.dk = self.head_size
self.attention_size = attention_size
self.k = nn.Linear(input_size, attention_size, bias=False)
self.q = nn.Linear(input_size, attention_size, bias=False)
self.v = nn.Linear(input_size, attention_size, bias=False)
self.output = nn.Linear(attention_size, attention_size)
self.dropout = dropout
self.conv = None
if kernel_size is not None:
self.conv = nn.Conv2d(
in_channels=self.num_heads,
out_channels=self.num_heads,
kernel_size=(kernel_size, 1),
padding=(kernel_size // 2, 0),
bias=False,
groups=self.num_heads,
)
reset_parameters(self.named_parameters())
forward(self, keys, queries=None, attention_mask=None)
Multi-head scaled dot-product attention forward pass
Outputs the values, where features for each sequence element are weighted by their respective attention scores
Each head performs dot-product attention
The outputs of multiple heads are concatenated and passed through a feedforward layer.
- B: Batch size
- L: Keys Sequence length
- M: Queries Sequence length
- H: Number of heads
- A: Feature dimension
Parameters:
Name | Type | Description | Default |
---|---|---|---|
keys | torch.Tensor | [B, L, D] Keys tensor | required |
queries | Optional[torch.Tensor] | Optional [B, M, D] Queries tensor. If None queries = keys. Defaults to None. | None |
attention_mask | Optional[torch.Tensor] | Optional [B, M, L] zero-one mask for sequence elements. Defaults to None. | None |
Returns:
Type | Description |
---|---|
Tuple[torch.Tensor, torch.Tensor] | (Reweighted values [B, L, D], attention scores [B, H, M, L]) |
Source code in slp/modules/attention.py
def forward(self, keys, queries=None, attention_mask=None):
r"""Multi-head scaled dot-product attention forward pass
Outputs the values, where features for each sequence element are weighted by their respective attention scores
Each head performs dot-product attention
$$a_H = softmax(\frac{Q_H \cdot K_H^T}{\sqrt{d}}) \cdot V_H$$
The outputs of multiple heads are concatenated and passed through a feedforward layer.
$$a = W (a^{(1)}_{H} \mathbin\Vert a^{(2)}_{H} \dots) + b$$
* B: Batch size
* L: Keys Sequence length
* M: Queries Sequence length
* H: Number of heads
* A: Feature dimension
Args:
keys (torch.Tensor): [B, L, D] Keys tensor
queries (Optional[torch.Tensor]): Optional [B, M, D] Queries tensor. If None queries = keys. Defaults to None.
attention_mask (Optional[torch.Tensor]): Optional [B, M, L] zero-one mask for sequence elements. Defaults to None.
Returns:
Tuple[torch.Tensor, torch.Tensor]: (Reweighted values [B, L, D], attention scores [B, H, M, L])
"""
_, seq_length, _ = keys.size()
if attention_mask is not None:
if attention_mask.ndim == 2:
attention_mask = attention_mask.unsqueeze(1)
attention_mask = attention_mask.unsqueeze(1)
if self.nystrom:
keys, attention_mask = pad_for_nystrom(
keys, self.num_landmarks, attention_mask=attention_mask
)
if queries is None:
queries = keys
values = keys
k = self.k(keys)
q = self.q(queries)
v = self.v(values)
k = split_heads(k, self.num_heads)
q = split_heads(q, self.num_heads)
v = split_heads(v, self.num_heads)
if self.nystrom:
# out = (B, H, L, A/H)
# scores = Tuple
out, scores = nystrom_attention(
k,
q,
v,
self.dk,
self.num_landmarks,
attention_mask=attention_mask,
inverse_iterations=self.inverse_iterations,
dropout=self.dropout,
training=self.training,
)
else:
# out => (B, H, L, A/H)
# scores => (B, H, L, L)
out, scores = attention(
k,
q,
v,
self.dk,
attention_mask=attention_mask,
dropout=self.dropout,
training=self.training,
)
if self.conv is not None:
if attention_mask is None or attention_mask.ndim > 2:
out += self.conv(v)
else:
attention_mask = attention_mask.squeeze()
out += self.conv(v * attention_mask[:, None, :, None])
# out => (B, H, L, A/H)
out = merge_heads(out)
if out.size(1) != seq_length:
out = out[:, :seq_length, :]
out = self.output(out)
return out, scores
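A hedged cross-attention sketch (queries attend over keys of a different length; the sizes are illustrative and assume the default kernel_size=None, i.e. no residual convolution):

```python
import torch

from slp.modules.attention import MultiheadAttention

mha = MultiheadAttention(attention_size=512, num_heads=8)
keys = torch.rand(4, 50, 512)     # (B, L, D)
queries = torch.rand(4, 30, 512)  # (B, M, D)
mask = torch.ones(4, 50)          # [B, L] padding mask over the keys
out, scores = mha(keys, queries=queries, attention_mask=mask)
# out: (4, 30, 512), scores: (4, 8, 30, 50) -- one score matrix per head
```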
MultiheadSelfAttention
__init__(self, attention_size=512, num_heads=8, input_size=None, dropout=0.1, nystrom=False, num_landmarks=64, inverse_iterations=6, kernel_size=None)
special
Multi-Headed Dot-product attention module
Parameters:
Name | Type | Description | Default |
---|---|---|---|
attention_size | int | Number of hidden features. Defaults to 512. | 512 |
num_heads | int | Number of attention heads | 8 |
input_size | Optional[int] | Input features. If None, input_size is set to attention_size. Defaults to None. | None |
dropout | float | Drop probability. Defaults to 0.1. | 0.1 |
Source code in slp/modules/attention.py
def __init__(
self,
attention_size: int = 512,
num_heads: int = 8,
input_size: Optional[int] = None,
dropout: float = 0.1,
nystrom: bool = False,
num_landmarks: int = 64,
inverse_iterations: int = 6,
kernel_size: Optional[int] = None,
):
"""Multi-Headed Dot-product attention module
Args:
attention_size (int): Number of hidden features. Defaults to 512.
num_heads (int): Number of attention heads
input_size (Optional[int]): Input features. Defaults to None.
If None input_size is set to attention_size.
dropout (float): Drop probability. Defaults to 0.1.
"""
super(MultiheadSelfAttention, self).__init__()
if input_size is None:
input_size = attention_size
self.inverse_iterations = inverse_iterations
self.num_landmarks = num_landmarks
self.nystrom = nystrom
self.num_heads = num_heads
self.head_size = int(attention_size / num_heads)
self.dk = self.head_size
self.attention_size = attention_size
self.kqv = nn.Linear(input_size, 3 * attention_size, bias=False)
self.output = nn.Linear(attention_size, attention_size)
self.dropout = dropout
self.conv = None
if kernel_size is not None:
self.conv = nn.Conv2d(
in_channels=self.num_heads,
out_channels=self.num_heads,
kernel_size=(kernel_size, 1),
padding=(kernel_size // 2, 0),
bias=False,
groups=self.num_heads,
)
reset_parameters(self.named_parameters())
forward(self, x, attention_mask=None)
Multi-head scaled dot-product attention forward pass
Outputs the values, where features for each sequence element are weighted by their respective attention scores
Each head performs dot-product attention
The outputs of multiple heads are concatenated and passed through a feedforward layer.
- B: Batch size
- L: Keys Sequence length
- M: Queries Sequence length
- H: Number of heads
- A: Feature dimension
Parameters:
Name | Type | Description | Default |
---|---|---|---|
x | torch.Tensor | [B, L, D] Input tensor (keys = queries = values = x) | required |
attention_mask | Optional[torch.Tensor] | Optional [B, M, L] zero-one mask for sequence elements. Defaults to None. | None |
Returns:
Type | Description |
---|---|
Tuple[torch.Tensor, torch.Tensor] | (Reweighted values [B, L, D], attention scores [B, H, M, L]) |
Source code in slp/modules/attention.py
def forward(self, x, attention_mask=None):
r"""Multi-head scaled dot-product attention forward pass
Outputs the values, where features for each sequence element are weighted by their respective attention scores
Each head performs dot-product attention
$$a_H = softmax(\frac{Q_H \cdot K_H^T}{\sqrt{d}}) \cdot V_H$$
The outputs of multiple heads are concatenated and passed through a feedforward layer.
$$a = W (a^{(1)}_{H} \mathbin\Vert a^{(2)}_{H} \dots) + b$$
* B: Batch size
* L: Keys Sequence length
* M: Queries Sequence length
* H: Number of heads
* A: Feature dimension
Args:
x (torch.Tensor): [B, L, D] Keys tensor
attention_mask (Optional[torch.Tensor]): Optional [B, M, L] zero-one mask for sequence elements. Defaults to None.
Returns:
Tuple[torch.Tensor, torch.Tensor]: (Reweighted values [B, L, D], attention scores [B, H, M, L])
"""
_, seq_length, _ = x.size()
if attention_mask is not None:
if attention_mask.ndim == 2:
attention_mask = attention_mask.unsqueeze(1)
attention_mask = attention_mask.unsqueeze(1)
if self.nystrom:
x, attention_mask = pad_for_nystrom(
x, self.num_landmarks, attention_mask=attention_mask
)
k, q, v = self.kqv(x).chunk(3, dim=-1)
k = split_heads(k, self.num_heads)
q = split_heads(q, self.num_heads)
v = split_heads(v, self.num_heads)
if self.nystrom:
# out = (B, H, L, A/H)
# scores = Tuple
out, scores = nystrom_attention(
k,
q,
v,
self.dk,
self.num_landmarks,
attention_mask=attention_mask,
inverse_iterations=self.inverse_iterations,
dropout=self.dropout,
training=self.training,
)
else:
# out => (B, H, L, A/H)
# scores => (B, H, L, L)
out, scores = attention(
k,
q,
v,
self.dk,
attention_mask=attention_mask,
dropout=self.dropout,
training=self.training,
)
if self.conv is not None:
if attention_mask is None or attention_mask.ndim > 2:
out = out + self.conv(v)
else:
attention_mask = attention_mask.squeeze()
out = out + self.conv(v * attention_mask[:, None, :, None])
# out => (B, H, L, A/H)
out = merge_heads(out)
if out.size(1) != seq_length:
out = out[:, -seq_length:, :]
out = self.output(out)
return out, scores
MultiheadTwowayAttention
__init__(self, attention_size=512, input_size=None, dropout=0.1, num_heads=8, residual=True, nystrom=False, num_landmarks=64, inverse_iterations=6, kernel_size=None)
special
Multihead twoway attention for multimodal fusion
This module performs two-way attention over two input modality feature sequences. If att is the MultiheadAttention operation and x, y the input modality sequences, the operation is summarized as
$$out = (att(x \rightarrow y), att(y \rightarrow x))$$
If residual is True, a ViLBERT-like residual connection is applied
$$out = (att(x \rightarrow y) + x, att(y \rightarrow x) + y)$$
Parameters:
Name | Type | Description | Default |
---|---|---|---|
attention_size | int | Number of hidden features. Defaults to 512. | 512 |
num_heads | int | Number of attention heads | 8 |
input_size | Optional[int] | Input features. If None, input_size is set to attention_size. Defaults to None. | None |
dropout | float | Drop probability. Defaults to 0.1. | 0.1 |
nystrom | bool | Use nystrom method for attention calculation. Defaults to False. | False |
num_landmarks | int | Number of landmark points for nystrom attention. Defaults to 64. | 64 |
inverse_iterations | int | Number of iterations to calculate the inverse in nystrom attention. Defaults to 6. | 6 |
kernel_size | Optional[int] | Use residual convolution in the output. Defaults to None. | None |
residual | bool | Use ViLBERT-like residual connections for fusion. Defaults to True. | True |
Source code in slp/modules/attention.py
def __init__(
self,
attention_size: int = 512,
input_size: Optional[int] = None,
dropout: float = 0.1,
num_heads: int = 8,
residual: bool = True,
nystrom: bool = False,
num_landmarks: int = 64,
inverse_iterations: int = 6,
kernel_size: Optional[int] = None,
):
r"""Multihead twoway attention for multimodal fusion
This module performs two way attention for two input modality feature sequences.
If att is the MultiheadAttention operation and x, y the input modality sequences,
the operation is summarized as
$$out = (att(x \rightarrow y), att(y \rightarrow x))$$
If residual is True then a Vilbert-like residual connection is applied
$$out = (att(x \rightarrow y) + x, att(y \rightarrow x) + y)$$
Args:
attention_size (int): Number of hidden features. Defaults to 512.
num_heads (int): Number of attention heads
input_size (Optional[int]): Input features. Defaults to None.
If None input_size is set to attention_size.
dropout (float): Drop probability. Defaults to 0.1.
nystrom (bool, optional): Use nystrom method for attention calculation. Defaults to False.
num_landmarks (int, optional): Number of landmark points for nystrom attention. Defaults to 64.
inverse_iterations (int, optional): Number of iteration to calculate the inverse in nystrom attention. Defaults to 6.
kernel_size (Optional[int], optional): Use residual convolution in the output. Defaults to None.
residual (bool, optional): Use vilbert-like residual connections for fusion. Defaults to True.
"""
super(MultiheadTwowayAttention, self).__init__()
self.xy = MultiheadAttention(
attention_size=attention_size,
input_size=input_size,
dropout=dropout,
num_heads=num_heads,
nystrom=nystrom,
num_landmarks=num_landmarks,
inverse_iterations=inverse_iterations,
kernel_size=kernel_size,
)
self.yx = MultiheadAttention(
attention_size=attention_size,
input_size=input_size,
dropout=dropout,
num_heads=num_heads,
nystrom=nystrom,
num_landmarks=num_landmarks,
inverse_iterations=inverse_iterations,
kernel_size=kernel_size,
)
self.residual = residual
forward(self, mod1, mod2, attention_mask=None)
mod1 : (B, L, D), mod2 : (B, L, D) input modality feature sequences
Source code in slp/modules/attention.py
def forward(self, mod1, mod2, attention_mask=None):
"""
x : (B, L, D)
queries : (B, L, D)
values : (B, L, D)
"""
out_mod1, _ = self.xy(mod1, queries=mod2, attention_mask=attention_mask)
out_mod2, _ = self.yx(mod2, queries=mod1, attention_mask=attention_mask)
if not self.residual:
return out_mod1, out_mod2
else:
# vilbert cross residual
# v + attention(v->a)
# a + attention(a->v)
out_mod1 += mod2
out_mod2 += mod1
return out_mod1, out_mod2
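A hedged sketch fusing two modality sequences (the dimensions are illustrative; with residual=True, attention_size should match the input feature size so the cross residual can be added):

```python
import torch

from slp.modules.attention import MultiheadTwowayAttention

fuse = MultiheadTwowayAttention(attention_size=256, num_heads=4, residual=True)
text = torch.rand(8, 40, 256)   # modality 1: (B, L, D)
audio = torch.rand(8, 40, 256)  # modality 2: (B, L, D)
out_text, out_audio = fuse(text, audio)
# out_text, out_audio: (8, 40, 256) each, with ViLBERT-style cross residuals
```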
SelfAttention
__init__(self, attention_size=512, input_size=None, dropout=0.1)
special
Single-Headed Dot-product self attention module
Parameters:
Name | Type | Description | Default |
---|---|---|---|
attention_size | int | Number of hidden features. Defaults to 512. | 512 |
input_size | Optional[int] | Input features. If None, input_size is set to attention_size. Defaults to None. | None |
dropout | float | Drop probability. Defaults to 0.1. | 0.1 |
Source code in slp/modules/attention.py
def __init__(
self,
attention_size: int = 512,
input_size: Optional[int] = None,
dropout: float = 0.1,
):
"""Single-Headed Dot-product self attention module
Args:
attention_size (int): Number of hidden features. Defaults to 512.
input_size (Optional[int]): Input features. Defaults to None.
If None input_size is set to attention_size.
dropout (float): Drop probability. Defaults to 0.1.
"""
super(SelfAttention, self).__init__()
if input_size is None:
input_size = attention_size
self.dk = input_size
self.kqv = nn.Linear(input_size, 3 * attention_size, bias=False)
self.dropout = dropout
reset_parameters(self.named_parameters())
forward(self, x, attention_mask=None)
Single-head scaled dot-product attention forward pass
Outputs the values, where features for each sequence element are weighted by their respective attention scores
- B: Batch size
- L: Keys Sequence length
- M: Queries Sequence length
- H: Number of heads
- A: Feature dimension
Parameters:
Name | Type | Description | Default |
---|---|---|---|
x | Tensor | [B, L, D] Input tensor | required |
attention_mask | Optional[torch.Tensor] | Optional [B, L] or [B, M, L] zero-one mask for sequence elements. Defaults to None. | None |
Returns:
Type | Description |
---|---|
Tuple[torch.Tensor, torch.Tensor] | (Reweighted values [B, L, D], attention scores [B, M, L]) |
Source code in slp/modules/attention.py
def forward(
self,
x: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
r"""Single-head scaled dot-product attention forward pass
Outputs the values, where features for each sequence element are weighted by their respective attention scores
$$a = softmax(\frac{Q \cdot K^T}{\sqrt{d}}) \cdot V$$
* B: Batch size
* L: Keys Sequence length
* M: Queries Sequence length
* H: Number of heads
* A: Feature dimension
Args:
x (torch.Tensor): [B, L, D] Input tensor
attention_mask (Optional[torch.Tensor]): Optional [B, L] or [B, M, L] zero-one mask for sequence elements. Defaults to None.
Returns:
Tuple[torch.Tensor, torch.Tensor]: (Reweighted values [B, L, D], attention scores [B, M, L])
"""
if attention_mask is not None:
if len(list(attention_mask.size())) == 2:
attention_mask = attention_mask.unsqueeze(1)
k, q, v = self.kqv(x).chunk(3, dim=-1) # (B, L, A)
# weights => (B, L, L)
out, scores = attention(
k,
q,
v,
self.dk,
attention_mask=attention_mask,
dropout=self.dropout,
training=self.training,
)
return out, scores
attention(k, q, v, dk, attention_mask=None, dropout=0.2, training=True)
Reweight values using scaled dot product attention
- B: Batch size
- L: Keys Sequence length
- M: Queries Sequence length
- H: Number of heads
- A: Feature dimension
Parameters:
Name | Type | Description | Default |
---|---|---|---|
k | Tensor | Single head [B, L, A] or multi-head [B, H, L, A/H] Keys tensor | required |
q | Tensor | Single head [B, M, A] or multi-head [B, H, M, A/H] Queries tensor | required |
v | Tensor | Single head [B, L, A] or multi-head [B, H, L, A/H] Values tensor | required |
dk | int | Model dimension | required |
attention_mask | Optional[torch.Tensor] | Optional [B, [H], 1, L] pad mask or [B, [H], M, L] pad mask + subsequent mask tensor with zeros in sequence indices that should be masked and ones in sequence indices that should be preserved. Defaults to None. | None |
dropout | float | Drop probability. Defaults to 0.2. | 0.2 |
training | bool | Is module in training phase? Defaults to True. | True |
Returns:
Type | Description |
---|---|
Tuple[torch.Tensor, torch.Tensor] | (Reweighted values [B, M, A] or [B, H, M, A/H], attention scores [B, M, L] or [B, H, M, L]) |
Source code in slp/modules/attention.py
def attention(
k: torch.Tensor,
q: torch.Tensor,
v: torch.Tensor,
dk: int,
attention_mask: Optional[torch.Tensor] = None,
dropout: float = 0.2,
training: bool = True,
):
r"""Reweight values using scaled dot product attention
$$s = softmax(\frac{Q \cdot K^T}{\sqrt{d}}) V$$
* B: Batch size
* L: Keys Sequence length
* M: Queries Sequence length
* H: Number of heads
* A: Feature dimension
Args:
k (torch.Tensor): Single head [B, L, A] or multi-head [B, H, L, A/H] Keys tensor
q (torch.Tensor): Single head [B, M, A] or multi-head [B, H, M, A/H] Keys tensor
v (torch.Tensor): Single head [B, M, A] or multi-head [B, H, M, A/H] Values tensor
dk (int): Model dimension
attention_mask (Optional[torch.Tensor]): Optional [B, [H], 1, L] pad mask or [B, [H], M, L] pad mask + subsequent mask
tensor with zeros in sequence indices that should be masked and ones in sequence indices that should be
preserved. Defaults to None.
dropout (float): Drop probability. Defaults to 0.2.
training (bool): Is module in training phase? Defaults to True.
Returns:
torch.Tensor: [B, M, L] or [B, H, M, L] attention scores
"""
scores = attention_scores(
k, q, dk, attention_mask=attention_mask, dropout=dropout, training=training
)
out = torch.matmul(scores, v)
return out, scores
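A small sketch of the functional form on multi-head shaped tensors (dropout disabled so the call is deterministic; the sizes are illustrative):

```python
import torch

from slp.modules.attention import attention

B, H, L, M, d = 2, 4, 10, 6, 16
k = torch.rand(B, H, L, d)  # keys
q = torch.rand(B, H, M, d)  # queries
v = torch.rand(B, H, L, d)  # values (same length as the keys)
out, scores = attention(k, q, v, d, dropout=0.0, training=False)
# out: (B, H, M, d), scores: (B, H, M, L)
```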
attention_scores(k, q, dk, attention_mask=None, dropout=0.2, training=True)
Calculate attention scores for scaled dot product attention
- B: Batch size
- L: Keys Sequence length
- M: Queries Sequence length
- H: Number of heads
- A: Feature dimension
Parameters:
Name | Type | Description | Default |
---|---|---|---|
k | Tensor | Single head [B, L, A] or multi-head [B, H, L, A/H] Keys tensor | required |
q | Tensor | Single head [B, M, A] or multi-head [B, H, M, A/H] Queries tensor | required |
dk | int | Model dimension | required |
attention_mask | Optional[torch.Tensor] | Optional [B, [H], 1, L] pad mask or [B, [H], M, L] pad mask + subsequent mask tensor with zeros in sequence indices that should be masked and ones in sequence indices that should be preserved. Defaults to None. | None |
dropout | float | Drop probability. Defaults to 0.2. | 0.2 |
training | bool | Is module in training phase? Defaults to True. | True |
Returns:
Type | Description |
---|---|
Tensor | [B, M, L] or [B, H, M, L] attention scores |
Source code in slp/modules/attention.py
def attention_scores(
k: torch.Tensor,
q: torch.Tensor,
dk: int,
attention_mask: Optional[torch.Tensor] = None,
dropout: float = 0.2,
training: bool = True,
) -> torch.Tensor:
r"""Calculate attention scores for scaled dot product attention
$$s = softmax(\frac{Q \cdot K^T}{\sqrt{d}})$$
* B: Batch size
* L: Keys Sequence length
* M: Queries Sequence length
* H: Number of heads
* A: Feature dimension
Args:
k (torch.Tensor): Single head [B, L, A] or multi-head [B, H, L, A/H] Keys tensor
q (torch.Tensor): Single head [B, M, A] or multi-head [B, H, M, A/H] Keys tensor
dk (int): Model dimension
attention_mask (Optional[torch.Tensor]): Optional [B, [H], 1, L] pad mask or [B, [H], M, L] pad mask + subsequent mask
tensor with zeros in sequence indices that should be masked and ones in sequence indices that should be
preserved. Defaults to None.
dropout (float): Drop probability. Defaults to 0.2.
training (bool): Is module in training phase? Defaults to True.
Returns:
torch.Tensor: [B, M, L] or [B, H, M, L] attention scores
"""
scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(dk)
if attention_mask is not None:
scores = scores + ((1 - attention_mask) * -1e5)
scores = F.softmax(scores, dim=-1)
scores = F.dropout(scores, p=dropout, training=training)
return scores
merge_heads(x)
Merge multiple attention heads into output tensor
(Batch size, Heads, Length, Attention size / Heads) => (Batch size, Length, Attention size)
Parameters:
Name | Type | Description | Default |
---|---|---|---|
x | Tensor | [B, H, L, A/H] multi-head tensor | required |
Returns:
Type | Description |
---|---|
Tensor | [B, L, A] merged / reshaped tensor |
Source code in slp/modules/attention.py
def merge_heads(x: torch.Tensor) -> torch.Tensor:
"""Merge multiple attention heads into output tensor
(Batch size, Heads, Lengths, Attention size / Heads) => (Batch size, Length, Attention size)
Args:
x (torch.Tensor): [B, H, L, A/H] multi-head tensor
Returns:
torch.Tensor: [B, L, A] merged / reshaped tensor
"""
batch_size, _, max_length, _ = x.size()
# x => (B, L, H, A/H)
x = x.permute(0, 2, 1, 3).contiguous()
return x.view(batch_size, max_length, -1)
nystrom_attention(k, q, v, dk, num_landmarks, attention_mask=None, inverse_iterations=6, dropout=0.2, training=True)
Calculate attention using nystrom approximation
Implementation heavily based on: https://github.com/lucidrains/nystrom-attention
Reference: https://arxiv.org/abs/2102.03902
- B: Batch size
- L: Keys Sequence length
- M: Queries Sequence length
- H: Number of heads
- A: Feature dimension
Parameters:
Name | Type | Description | Default |
---|---|---|---|
k | Tensor | Single head [B, L, A] or multi-head [B, H, L, A/H] Keys tensor | required |
q | Tensor | Single head [B, M, A] or multi-head [B, H, M, A/H] Queries tensor | required |
v | Tensor | Single head [B, L, A] or multi-head [B, H, L, A/H] Values tensor | required |
dk | int | Model dimension | required |
num_landmarks | int | Number of landmark points | required |
attention_mask | Optional[torch.Tensor] | Optional [B, [H], 1, L] pad mask or [B, [H], M, L] pad mask + subsequent mask tensor with zeros in sequence indices that should be masked and ones in sequence indices that should be preserved. Defaults to None. | None |
inverse_iterations | int | Number of iterations for Moore-Penrose iterative inverse approximation | 6 |
dropout | float | Drop probability. Defaults to 0.2. | 0.2 |
training | bool | Is module in training phase? Defaults to True. | True |
Returns:
Type | Description |
---|---|
Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]] | (Reweighted values [B, H, L, A/H], tuple of the intermediate Nystrom score matrices (scores_1, scores_2, scores_3)) |
Source code in slp/modules/attention.py
def nystrom_attention(
k: torch.Tensor,
q: torch.Tensor,
v: torch.Tensor,
dk: int,
num_landmarks: int,
attention_mask: Optional[torch.Tensor] = None,
inverse_iterations: int = 6,
dropout: float = 0.2,
training: bool = True,
):
"""Calculate attention using nystrom approximation
Implementation heavily based on: https://github.com/lucidrains/nystrom-attention
Reference: https://arxiv.org/abs/2102.03902
* B: Batch size
* L: Keys Sequence length
* M: Queries Sequence length
* H: Number of heads
* A: Feature dimension
Args:
k (torch.Tensor): Single head [B, L, A] or multi-head [B, H, L, A/H] Keys tensor
q (torch.Tensor): Single head [B, M, A] or multi-head [B, H, M, A/H] Keys tensor
v (torch.Tensor): Single head [B, M, A] or multi-head [B, H, M, A/H] Values tensor
dk (int): Model dimension
num_landmarks (int): Number of landmark points
attention_mask (Optional[torch.Tensor]): Optional [B, [H], 1, L] pad mask or [B, [H], M, L] pad mask + subsequent mask
tensor with zeros in sequence indices that should be masked and ones in sequence indices that should be
preserved. Defaults to None.
inverse_iterations (int): Number of iterations for Moore Penrose iterative inverse
approximation
dropout (float): Drop probability. Defaults to 0.2.
training (bool): Is module in training phase? Defaults to True.
Returns:
torch.Tensor: [B, M, L] or [B, H, M, L] attention scores
"""
_, num_heads, seq_length, head_size = k.size()
masked_mean_denom = seq_length // num_landmarks
if attention_mask is not None:
attention_mask = attention_mask.unsqueeze(1)
masked_mean_denom = (
attention_mask.reshape(-1, 1, num_landmarks, seq_length // num_landmarks).sum(-1) + 1e-8 # type: ignore
) # (B, 1, Landmarks)
mask_landmarks = (masked_mean_denom > 0).type(torch.float) # type: ignore
masked_mean_denom = masked_mean_denom[..., None] # type: ignore
attention_mask = attention_mask.unsqueeze(-1)
q = q * attention_mask # (B, H, L, A/H)
k = k * attention_mask # (B, H, L, A/H)
v = v * attention_mask # (B, H, L, A/H)
scores_1_mask = attention_mask * mask_landmarks[..., None, :]
scores_2_mask = mask_landmarks[..., None] * mask_landmarks[..., None, :]
scores_3_mask = scores_1_mask.transpose(-1, -2)
q = q / math.sqrt(dk)
q_landmarks = q.reshape(
q.size(0), # batch_size
q.size(1), # num_heads
num_landmarks, # landmarks
seq_length // num_landmarks, # reduced length
q.size(-1), # head_size
).sum(
dim=-2
) # (B, H, Landmarks, A/H)
k_landmarks = k.reshape(
k.size(0), # batch_size
k.size(1), # num_heads
num_landmarks, # landmarks
seq_length // num_landmarks, # reduced length
k.size(-1), # head size
).sum(
dim=-2
) # (B, H, Landmarks, A/H)
k_landmarks = k_landmarks / masked_mean_denom
q_landmarks = q_landmarks / masked_mean_denom
scores_1 = attention_scores(
k_landmarks,
q,
1, # We have already accounted for dk
attention_mask=scores_1_mask,
dropout=dropout,
training=training,
)
scores_2 = attention_scores(
k_landmarks,
q_landmarks,
1, # We have already accounted for dk
attention_mask=scores_2_mask,
dropout=dropout,
training=training,
)
scores_3 = attention_scores(
k,
q_landmarks,
1, # We have already accounted for dk
attention_mask=scores_3_mask,
dropout=dropout,
training=training,
)
z_star = moore_penrose_pinv(scores_2, num_iter=inverse_iterations)
out = (scores_1 @ z_star) @ (scores_3 @ v)
return out, (scores_1, scores_2, scores_3)
pad_for_nystrom(x, num_landmarks, attention_mask=None)
Pad inputs and attention_mask to perform Nystrom Attention
Pad to nearest multiple of num_landmarks
Parameters:
Name | Type | Description | Default |
---|---|---|---|
x | Tensor | [B, L, A] Input tensor | required |
num_landmarks | int | Number of landmark points | required |
attention_mask | Optional[torch.Tensor] | [B, L] Padding mask | None |
Returns:
Type | Description |
---|---|
Tuple[torch.Tensor, Optional[torch.Tensor]] | Padded inputs and attention_mask |
Source code in slp/modules/attention.py
def pad_for_nystrom(
x: torch.Tensor, num_landmarks: int, attention_mask: Optional[torch.Tensor] = None
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
"""Pad inputs and attention_mask to perform Nystrom Attention
Pad to nearest multiple of num_landmarks
Args:
x (torch.Tensor): [B, L, A] Input tensor
num_landmarks (int): Number of landmark points
attention_mask (Optional[torch.Tensor]): [B, L] Padding mask
Returns:
Tuple[torch.Tensor, Optional[torch.Tensor]]: Padded inputs and attention_mask
"""
if attention_mask is not None:
attention_mask = attention_mask.squeeze()
_, seq_length, _ = x.size()
_, remainder = (
math.ceil(seq_length / num_landmarks),
seq_length % num_landmarks,
)
if remainder > 0:
padding = num_landmarks - remainder
x = F.pad(x, (0, 0, padding, 0), value=0)
if attention_mask is not None:
attention_mask = F.pad(attention_mask, (padding, 0))
return x, attention_mask
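A sketch of how inputs are padded up to the next multiple of num_landmarks before Nystrom attention (illustrative sizes):

```python
import torch

from slp.modules.attention import pad_for_nystrom

x = torch.rand(4, 50, 256)  # (B, L, A) with L = 50
mask = torch.ones(4, 50)    # [B, L] padding mask
x_pad, mask_pad = pad_for_nystrom(x, num_landmarks=32, attention_mask=mask)
# L = 50 is padded (at the front) up to 64, the next multiple of 32:
# x_pad: (4, 64, 256), mask_pad: (4, 64)
```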
reset_parameters(named_parameters)
Initialize parameters in the transformer model.
Source code in slp/modules/attention.py
def reset_parameters(named_parameters):
"""Initialize parameters in the transformer model."""
for name, p in named_parameters:
if "weight" in name:
nn.init.xavier_normal_(p)
if "bias" in name:
nn.init.constant_(p, 0.0)
split_heads(x, num_heads)
Split input tensor into multiple attention heads
(Batch size, Length, Attention size) => (Batch size, Heads, Length, Attention size / Heads)
Parameters:
Name | Type | Description | Default |
---|---|---|---|
x | Tensor | [B, L, A] input tensor | required |
num_heads | int | number of heads | required |
Returns:
Type | Description |
---|---|
Tensor | [B, H, L, A/H] split / reshaped tensor |
Source code in slp/modules/attention.py
def split_heads(x: torch.Tensor, num_heads: int) -> torch.Tensor:
"""Split input tensor into multiple attention heads
(Batch size, Length, Attention size) => (Batch size, Heads, Lengths, Attention size / Heads)
Args:
x (torch.Tensor): [B, L, A] input tensor
num_heads (int): number of heads
Returns:
torch.Tensor: [B, H, L, A/H] Splitted / reshaped tensor
"""
batch_size, max_length, attention_size = x.size()
head_size = int(attention_size / num_heads)
return x.view(batch_size, max_length, num_heads, head_size).permute(0, 2, 1, 3)
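split_heads and merge_heads are inverse reshapes; a quick round-trip sketch:

```python
import torch

from slp.modules.attention import merge_heads, split_heads

x = torch.rand(2, 10, 512)           # (B, L, A)
heads = split_heads(x, num_heads=8)  # (B, H, L, A/H) = (2, 8, 10, 64)
restored = merge_heads(heads)        # back to (2, 10, 512)
assert torch.equal(x, restored)
```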
TwowayAttention
Single-head two-way attention module for multimodal fusion
forward(self, mod1, mod2, attention_mask=None)
mod1 : (B, L, D), mod2 : (B, L, D) input modality feature sequences
Source code in slp/modules/twowayattention.py
def forward(self, mod1, mod2, attention_mask=None):
"""
x : (B, L, D)
queries : (B, L, D)
values : (B, L, D)
"""
k_mod1 = self.kx(mod1)
q_mod2 = self.qy(mod2)
v_mod1 = self.vx(mod1)
k_mod2 = self.ky(mod2) # (B, L, A)
q_mod1 = self.qx(mod1)
v_mod2 = self.vy(mod2)
# weights => (B, L, L)
scores_mod1 = torch.bmm(q_mod2, k_mod1.transpose(1, 2)) / math.sqrt(self.dk)
scores_mod2 = torch.bmm(q_mod1, k_mod2.transpose(1, 2)) / math.sqrt(self.dk)
if attention_mask is not None:
scores_mod1 = scores_mod1 + ((1 - attention_mask.unsqueeze(1)) * -1e5)
scores_mod2 = scores_mod2 + ((1 - attention_mask.unsqueeze(1)) * -1e5)
scores_mod1 = F.softmax(scores_mod1, dim=-1)
scores_mod1 = self.drop(scores_mod1)
scores_mod2 = F.softmax(scores_mod2, dim=-1)
scores_mod2 = self.drop(scores_mod2)
# out => (B, L, A)
out_mod1 = torch.bmm(scores_mod1, v_mod1)
out_mod2 = torch.bmm(scores_mod2, v_mod2)
if self.layernorm:
out_mod1 = self.lnx(out_mod1)
out_mod2 = self.lny(out_mod2)
if not self.residual:
return out_mod1, out_mod2
else:
# vilbert cross residual
# v + attention(v->a)
# a + attention(a->v)
out_mod1 += mod2
out_mod2 += mod1
return out_mod1, out_mod2
Classifier
__init__(self, encoder, encoded_features, num_classes, dropout=0.2)
special
Classifier wrapper module
Stores a Neural Network encoder and adds a classification layer on top.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
encoder | Module | Neural network encoder that produces the feature representation | required |
encoded_features | int | Number of features in the encoder output | required |
num_classes | int | Number of target classes | required |
dropout | float | Drop probability | 0.2 |
Source code in slp/modules/classifier.py
def __init__(
self,
encoder: nn.Module,
encoded_features: int,
num_classes: int,
dropout: float = 0.2,
):
"""Classifier wrapper module
Stores a Neural Network encoder and adds a classification layer on top.
Args:
encoder (nn.Module): Neural network encoder that produces the feature representation
encoded_features (int): Number of features in the encoder output
num_classes (int): Number of target classes
dropout (float): Drop probability
"""
super(Classifier, self).__init__()
self.encoder = encoder
self.drop = nn.Dropout(dropout)
self.clf = nn.Linear(encoded_features, num_classes)
forward(self, *args, **kwargs)
Encode inputs using the encoder network and perform classification
Returns:
Type | Description |
---|---|
Tensor | [B, *, num_classes] Logits tensor |
Source code in slp/modules/classifier.py
def forward(self, *args, **kwargs) -> torch.Tensor:
"""Encode inputs using the encoder network and perform classification
Returns:
torch.Tensor: [B, *, num_classes] Logits tensor
"""
encoded: torch.Tensor = self.encoder(*args, **kwargs) # type: ignore
out: torch.Tensor = self.drop(encoded)
out = self.clf(out)
return out
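A hedged sketch wrapping an encoder from this page with a classification head (illustrative sizes; the training loop is up to the caller):

```python
import torch

from slp.modules.classifier import Classifier
from slp.modules.rnn import AttentiveRNN

encoder = AttentiveRNN(input_size=300, hidden_size=256, attention=True)
model = Classifier(encoder, encoded_features=encoder.out_size, num_classes=5)

x = torch.rand(8, 30, 300)                        # (B, L, D) input features
lengths = torch.full((8,), 30, dtype=torch.long)  # original sequence lengths
logits = model(x, lengths)                        # (B, num_classes)
```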
MOSEITextClassifier
forward(self, x, lengths)
Encode inputs using the encoder network and perform classification
Returns:
Type | Description |
---|---|
torch.Tensor | [B, *, num_classes] Logits tensor |
Source code in slp/modules/classifier.py
def forward(self, x, lengths):
x = x["text"]
lengths = lengths["text"]
return super().forward(x, lengths)
RNNLateFusionClassifier
forward(self, inputs, lengths)
Encode each modality with its encoder, optionally apply multimodal dropout, concatenate the encoded features and pass them through the classification layer.
Source code in slp/modules/classifier.py
def forward(self, inputs, lengths):
encoded = [
self.modality_encoders[m](inputs[m], lengths[m]) for m in self.modalities
]
if self.mmdrop is not None:
encoded = self.mmdrop(*encoded)
fused = torch.cat(encoded, dim=-1)
fused = self.drop(fused)
out = self.clf(fused)
return out
TransformerLateFusionClassifier
forward(self, inputs, attention_masks=None)
Encode each modality with its transformer encoder (per-modality attention masks may be given), optionally apply multimodal dropout and modality dropout, concatenate the encoded features and pass them through the classification layer.
Source code in slp/modules/classifier.py
def forward(self, inputs, attention_masks=None):
if attention_masks is None:
attention_masks = dict(
zip(self.modalities, [None for _ in self.modalities])
)
encoded = [
self.modality_encoders[m](inputs[m], attention_mask=attention_masks[m])
for m in self.modalities
]
if self.mmdrop is not None:
encoded = self.mmdrop(*encoded)
fused = torch.cat(encoded, dim=-1)
if self.modality_drop is not None:
fused = self.modality_drop(fused)
out = self.clf(fused)
return out
Embed
__init__(self, num_embeddings, embedding_dim, embeddings=None, noise=0.0, dropout=0.0, scale=1.0, trainable=False)
special
Embedding layer with optional pretrained initialization, additive Gaussian noise, dropout and scaling.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
num_embeddings | int | Total number of embeddings. | required |
embedding_dim | int | Embedding dimension. | required |
embeddings | Optional[numpy.ndarray] | The 2D ndarray with the word vectors. | None |
noise | float | Optional additive noise. Defaults to 0.0. | 0.0 |
dropout | float | Embedding dropout probability. Defaults to 0.0. | 0.0 |
scale | float | Scale word embeddings by a constant. Defaults to 1.0. | 1.0 |
trainable | bool | Finetune embeddings. Defaults to False. | False |
Source code in slp/modules/embed.py
def __init__(
self,
num_embeddings: int,
embedding_dim: int,
embeddings: Optional[np.ndarray] = None,
noise: float = 0.0,
dropout: float = 0.0,
scale: float = 1.0,
trainable: bool = False,
):
"""
Define the layer of the model and perform the initializations
of the layers (wherever it is necessary)
Args:
num_embeddings (int): Total number of embeddings.
embedding_dim (int): Embedding dimension.
embeddings (numpy.ndarray): the 2D ndarray with the word vectors.
noise (float): Optional additive noise. Defaults to 0.0.
dropout (float): Embedding dropout probability. Defaults to 0.0.
scale (float): Scale word embeddings by a constant. Defaults to 1.0.
trainable (bool): Finetune embeddings. Defaults to False
"""
super(Embed, self).__init__()
self.scale = scale # scale embeddings by value. Needed for transformer
# define the embedding layer, with the corresponding dimensions
self.embedding = nn.Embedding(
num_embeddings=num_embeddings, embedding_dim=embedding_dim
)
if embeddings is not None:
logger.info("Initializing Embedding layer with pre-trained weights.")
if trainable:
logger.info("Embeddings are going to be finetuned")
else:
logger.info("Embeddings are frozen")
self.init_embeddings(embeddings, trainable)
# the dropout "layer" for the word embeddings
self.dropout = nn.Dropout(dropout)
# the gaussian noise "layer" for the word embeddings
self.noise = GaussianNoise(noise)
forward(self, x)
Embed input tokens
Assign embedding that corresponds to each token. Optionally add Gaussian noise and embedding dropout and scale embeddings by a constant.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
x | Tensor | [B, L] Input token ids. | required |
Returns:
Type | Description |
---|---|
Tensor | [B, L, E] Embedded tokens. |
Source code in slp/modules/embed.py
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""Embed input tokens
Assign embedding that corresponds to each token.
Optionally add Gaussian noise and embedding dropout and scale embeddings by a constant.
Args:
x (torch.Tensor): [B, L] Input token ids.
Returns:
(torch.Tensor) -> [B, L, E] Embedded tokens.
"""
embeddings = self.embedding(x)
if self.noise.stddev > 0:
embeddings = self.noise(embeddings)
if self.dropout.p > 0:
embeddings = self.dropout(embeddings)
return embeddings * self.scale # type: ignore
init_embeddings(self, weights, trainable)
Initialize embeddings matrix with pretrained embeddings
Parameters:
Name | Type | Description | Default |
---|---|---|---|
weights | ndarray | pretrained embeddings | required |
trainable | bool | Finetune embeddings? | required |
Source code in slp/modules/embed.py
def init_embeddings(self, weights: np.ndarray, trainable: bool):
"""Initialize embeddings matrix with pretrained embeddings
Args:
weights (np.ndarray): pretrained embeddings
trainable (bool): Finetune embeddings?
"""
self.embedding.weight = nn.Parameter(
torch.from_numpy(weights), requires_grad=trainable
)
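A sketch of an embedding layer initialized with (here random, stand-in) pretrained vectors and kept frozen (illustrative vocabulary and dimensions):

```python
import numpy as np
import torch

from slp.modules.embed import Embed

pretrained = np.random.rand(10000, 300).astype("float32")  # stand-in for e.g. GloVe
embed = Embed(
    num_embeddings=10000,
    embedding_dim=300,
    embeddings=pretrained,
    dropout=0.1,
    trainable=False,  # keep the pretrained vectors frozen
)
tokens = torch.randint(0, 10000, (8, 30))  # (B, L) token ids
vectors = embed(tokens)                    # (B, L, 300)
```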
PositionalEncoding
__init__(self, embedding_dim=512, max_len=5000)
special
Inject some information about the relative or absolute position of the tokens in the sequence.
The positional encodings have the same dimension as the embeddings, so that the two can be summed. Here, we use sine and cosine functions of different frequencies.
PE for even positions:
$$\text{PosEncoder}(pos, 2i) = sin(\frac{pos}{10000^{\frac{2i}{d}}})$$
PE for odd positions:
$$\text{PosEncoder}(pos, 2i+1) = cos(\frac{pos}{10000^{\frac{2i}{d}}})$$
where \(pos\) is the word position and \(i\) is the embedding idx
Implementation modified from pytorch/examples/word_language_model.py
Parameters:
Name | Type | Description | Default |
---|---|---|---|
embedding_dim | int | Embedding / model dimension. Defaults to 512. | 512 |
max_len | int | Maximum sequence length that can be encoded. Defaults to 5000. | 5000 |
Source code in slp/modules/embed.py
def __init__(self, embedding_dim: int = 512, max_len: int = 5000):
r"""Inject some information about the relative or absolute position of the tokens in the sequence.
The positional encodings have the same dimension as
the embeddings, so that the two can be summed. Here, we use sine and cosine
functions of different frequencies.
PE for even positions:
$$\text{PosEncoder}(pos, 2i) = sin(\frac{pos}{10000^{\frac{2i}{d}}})$$
PE for odd positions:
$$\text{PosEncoder}(pos, 2i+1) = cos(\frac{pos}{10000^{\frac{2i}{d}}})$$
where $pos$ is the word position and $i$ is the embedding idx
Implementation modified from pytorch/examples/word_language_model.py
Args:
embedding_dim (int): Embedding / model dimension. Defaults to 512.
max_len (int): Maximum sequence length that can be encoded. Defaults to 5000.
"""
super(PositionalEncoding, self).__init__()
pe = torch.zeros(max_len, embedding_dim)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(
torch.arange(0, embedding_dim, 2).float()
* (-math.log(10000.0) / embedding_dim)
)
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0)
self.register_buffer("pe", pe)
forward(self, x)
Calculate positional embeddings for input and add them to input tensor
x is assumed to be batch first
Parameters:
Name | Type | Description | Default |
---|---|---|---|
x | Tensor | [B, L, D] input embeddings | required |
Returns:
Type | Description |
---|---|
Tensor | Embeddings + positional embeddings |
Source code in slp/modules/embed.py
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""Calculate positional embeddings for input and add them to input tensor
$$out = x + PosEmbed(x)$$
x is assumed to be batch first
Args:
x (torch.Tensor): [B, L, D] input embeddings
Returns:
torch.Tensor: Embeddings + positional embeddings
"""
x = x + self.pe[:, : x.size(1), :] # type: ignore
return x
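A minimal sketch (batch-first embeddings, as the forward pass assumes; sizes are illustrative):

```python
import torch

from slp.modules.embed import PositionalEncoding

pos = PositionalEncoding(embedding_dim=512, max_len=1000)
x = torch.rand(8, 100, 512)  # (B, L, D) token embeddings
x = pos(x)                   # same shape, sinusoidal positional encodings added
```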
PositionwiseFF
__init__(self, d_model, d_ff, dropout=0.1, gelu=False)
special
Transformer Position-wise feed-forward layer
Linear -> ReLU/GELU -> Dropout -> Linear
Parameters:
Name | Type | Description | Default |
---|---|---|---|
d_model | int | Model dimension | required |
d_ff | int | Hidden dimension | required |
dropout | float | Dropout probability. Defaults to 0.1. | 0.1 |
Source code in slp/modules/feedforward.py
def __init__(self, d_model: int, d_ff: int, dropout: float = 0.1, gelu=False):
"""Transformer Position-wise feed-forward layer
Linear -> LayerNorm -> ReLU -> Linear
Args:
d_model (int): Model dimension
d_ff (int): Hidden dimension
dropout (float): Dropout probability. Defaults to 0.1.
"""
super(PositionwiseFF, self).__init__()
self.ff1 = nn.Linear(d_model, d_ff)
self.ff2 = nn.Linear(d_ff, d_model)
self.drop = nn.Dropout(dropout)
self.activation = nn.ReLU() if not gelu else nn.GELU()
forward(self, x)
Position-wise FF forward pass
[B, *, D] -> [B, *, H] -> [B, *, D]
- B: Batch size
- D: Model dim
- H: Hidden size > Model dim (Usually \(H = 2D\))
Parameters:
Name | Type | Description | Default |
---|---|---|---|
x | Tensor | [B, *, D] Input features | required |
Returns:
Type | Description |
---|---|
Tensor | [B, *, D] Output features |
Source code in slp/modules/feedforward.py
def forward(self, x: torch.Tensor) -> torch.Tensor:
r"""Position-wise FF forward pass
$$out = W_2 \cdot \max(0, W_1 \cdot x + b_1) + b_2$$
[B, *, D] -> [B, *, H] -> [B, *, D]
* B: Batch size
* D: Model dim
* H: Hidden size > Model dim (Usually $H = 2D$)
Args:
x (torch.Tensor): [B, *, D] Input features
Returns:
torch.Tensor: [B, *, D] Output features
"""
out: torch.Tensor = self.ff2(self.drop(self.activation(self.ff1(x))))
return out
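A minimal sketch of the feed-forward block as it would sit inside a transformer layer (illustrative dimensions):

```python
import torch

from slp.modules.feedforward import PositionwiseFF

ff = PositionwiseFF(d_model=512, d_ff=2048, dropout=0.1, gelu=True)
x = torch.rand(8, 100, 512)  # (B, L, D)
y = ff(x)                    # (B, L, D), projected up to d_ff and back down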
TwoLayer
forward(self, x)
Two-layer feed-forward pass: Linear -> Dropout -> Activation -> Linear -> Dropout, with an optional residual connection to the input.
Source code in slp/modules/feedforward.py
def forward(self, x):
out = self.l1(x)
out = self.drop(out)
out = self.act(out)
out = self.l2(out)
out = self.drop(out)
if self.residual:
out = x + out
return out
LayerNormTf
__init__(self, hidden_size, eps=1e-12)
special
Construct a layernorm module in the TF style (epsilon inside the square root). Link: https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L234
Source code in slp/modules/norm.py
def __init__(self, hidden_size: int, eps: float = 1e-12):
"""Construct a layernorm module in the TF style (epsilon inside the square root).
Link: https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L234
"""
super(LayerNormTf, self).__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.bias = nn.Parameter(torch.zeros(hidden_size))
self.variance_epsilon = eps
forward(self, x)
Calculate Layernorm the tf way
Source code in slp/modules/norm.py
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""Calculate Layernorm the tf way"""
u = x.mean(-1, keepdim=True)
s = (x - u).pow(2).mean(-1, keepdim=True)
x = (x - u) / torch.sqrt(s + self.variance_epsilon)
return self.weight * x + self.bias
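A sketch comparing the TF-style layer norm with torch.nn.LayerNorm at the same epsilon; with the default affine parameters the two should agree up to floating point error:

```python
import torch
import torch.nn as nn

from slp.modules.norm import LayerNormTf

x = torch.rand(8, 100, 512)
ln_tf = LayerNormTf(hidden_size=512, eps=1e-12)
ln_pt = nn.LayerNorm(512, eps=1e-12)
# Both normalize over the last dimension with weight=1 and bias=0
assert torch.allclose(ln_tf(x), ln_pt(x), atol=1e-5)
```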
ScaleNorm
forward(self, x)
Scale the input by a learnable gain g divided by the L2 norm of x (clamped at eps).
Source code in slp/modules/norm.py
def forward(self, x: torch.Tensor):
scaled_norm = self.g / safe_norm(x, dim=-1, keepdim=True).clamp(min=self.eps)
return scaled_norm * x
GaussianNoise
__init__(self, stddev, mean=0.0)
special
Additive Gaussian Noise layer
Parameters:
Name | Type | Description | Default |
---|---|---|---|
stddev | float | the standard deviation of the distribution | required |
mean | float | the mean of the distribution | 0.0 |
Source code in slp/modules/regularization.py
def __init__(self, stddev: float, mean: float = 0.0):
"""Additive Gaussian Noise layer
Args:
stddev (float): the standard deviation of the distribution
mean (float): the mean of the distribution
"""
super().__init__()
self.stddev = stddev
self.mean = mean
__repr__(self)
special
String representation of class
Source code in slp/modules/regularization.py
def __repr__(self):
"""String representation of class"""
return "{} (mean={}, stddev={})".format(
self.__class__.__name__, str(self.mean), str(self.stddev)
)
forward(self, x)
Gaussian noise forward pass
Parameters:
Name | Type | Description | Default |
---|---|---|---|
x | Tensor | Input features. | required |
Returns:
Type | Description |
---|---|
Tensor | Input with additive Gaussian noise applied during training; returned unchanged at inference |
Source code in slp/modules/regularization.py
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""Gaussian noise forward pass
Args:
x (torch.Tensor): Input features.
Returns:
torch.Tensor: Input with additive Gaussian noise during training; unchanged input otherwise.
"""
if self.training:
noise = Variable(x.data.new(x.size()).normal_(self.mean, self.stddev))
return x + noise
return x
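A sketch showing that the noise is only injected in training mode:

```python
import torch

from slp.modules.regularization import GaussianNoise

noise = GaussianNoise(stddev=0.05)
x = torch.rand(8, 30, 300)

noise.train()
noisy = noise(x)   # x plus N(0, 0.05) samples
noise.eval()
clean = noise(x)   # returned unchanged at inference
assert torch.equal(clean, x)
```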
AttentiveRNN
__init__(self, input_size, hidden_size=256, batch_first=True, layers=1, bidirectional=False, merge_bi='cat', dropout=0.1, rnn_type='lstm', packed_sequence=True, attention=False, max_length=-1, num_heads=1, nystrom=True, num_landmarks=32, kernel_size=33, inverse_iterations=6, return_hidden=False)
special
RNN with embedding layer and optional attention mechanism
Single-headed scaled dot-product attention is used as an attention mechanism
Parameters:
Name | Type | Description | Default |
---|---|---|---|
input_size | int | Input features dimension | required |
hidden_size | int | Hidden features | 256 |
batch_first | bool | Use batch first representation type. Defaults to True. | True |
layers | int | Number of RNN layers. Defaults to 1. | 1 |
bidirectional | bool | Use bidirectional RNNs. Defaults to False. | False |
merge_bi | str | How bidirectional states are merged. Defaults to "cat". | 'cat' |
dropout | float | Dropout probability. Defaults to 0.1. | 0.1 |
rnn_type | str | lstm or gru. Defaults to "lstm". | 'lstm' |
packed_sequence | bool | Use packed sequences. Defaults to True. | True |
max_length | int | Maximum sequence length for fixed length padding. If -1 takes the largest sequence length in this batch | -1 |
attention | bool | Use attention mechanism. Defaults to False | False |
num_heads | int | Number of attention heads. If 1 uses single headed attention | 1 |
nystrom | bool | Use nystrom approximation for multihead attention | True |
num_landmarks | int | Number of landmark sequence elements for nystrom attention | 32 |
kernel_size | Optional[int] | Kernel size for multihead attention output residual convolution | 33 |
inverse_iterations | int | Number of iterations for moore-penrose inverse approximation in nystrom attention. 6 is a good value | 6 |
return_hidden | bool | Return all hidden states. Defaults to False. | False |
Source code in slp/modules/rnn.py
def __init__(
self,
input_size: int,
hidden_size: int = 256,
batch_first: bool = True,
layers: int = 1,
bidirectional: bool = False,
merge_bi: str = "cat",
dropout: float = 0.1,
rnn_type: str = "lstm",
packed_sequence: bool = True,
attention: bool = False,
max_length: int = -1,
num_heads: int = 1,
nystrom: bool = True,
num_landmarks: int = 32,
kernel_size: Optional[int] = 33,
inverse_iterations: int = 6,
return_hidden: bool = False,
):
"""RNN with embedding layer and optional attention mechanism
Single-headed scaled dot-product attention is used as an attention mechanism
Args:
input_size (int): Input features dimension
hidden_size (int): Hidden features
batch_first (bool): Use batch first representation type. Defaults to True.
layers (int): Number of RNN layers. Defaults to 1.
bidirectional (bool): Use bidirectional RNNs. Defaults to False.
merge_bi (str): How bidirectional states are merged. Defaults to "cat".
dropout (float): Dropout probability. Defaults to 0.0.
rnn_type (str): lstm or gru. Defaults to "lstm".
packed_sequence (bool): Use packed sequences. Defaults to True.
max_length (int): Maximum sequence length for fixed length padding. If -1 takes the
largest sequence length in this batch
attention (bool): Use attention mechanism. Defaults to False
num_heads (int): Number of attention heads. If 1 uses single headed attention
nystrom (bool): Use nystrom approximation for multihead attention
num_landmarks (int): Number of landmark sequence elements for nystrom attention
kernel_size (int): Kernel size for multihead attention output residual convolution
inverse_iterations (int): Number of iterations for moore-penrose inverse approximation
in nystrom attention. 6 is a good value
return_hidden (bool): Return all hidden states. Defaults to False.
"""
super(AttentiveRNN, self).__init__()
self.rnn = RNN(
input_size, # type: ignore
hidden_size,
batch_first=batch_first,
layers=layers,
merge_bi=merge_bi,
bidirectional=bidirectional,
dropout=dropout,
rnn_type=rnn_type,
packed_sequence=packed_sequence,
max_length=max_length,
)
self.out_size = (
hidden_size
if not (bidirectional and merge_bi == "cat")
else 2 * hidden_size
)
self.batch_first = batch_first
self.return_hidden = return_hidden
self.attention = None
if attention:
if num_heads == 1:
self.attention = Attention(
attention_size=self.out_size, dropout=dropout
)
else:
self.attention = MultiheadAttention( # type: ignore
attention_size=self.out_size,
num_heads=num_heads,
kernel_size=kernel_size,
nystrom=nystrom,
num_landmarks=num_landmarks,
inverse_iterations=inverse_iterations,
dropout=dropout,
)
forward(self, x, lengths)
Attentive RNN forward pass
If self.attention=True, the outputs are the weighted sum of the RNN hidden states with the attention score weights. Otherwise the output is the last hidden state of the RNN.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
x | Tensor | [B, L] Input token ids | required |
lengths | Tensor | [B] Original sequence lengths | required |
Returns:
Type | Description |
---|---|
Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]] | If return_hidden == False: a [B, H] or [B, 2*H] tensor of output features to be used for classification. If return_hidden == True: the same output features together with a tensor of all the hidden states. |
Source code in slp/modules/rnn.py
def forward(
self, x: torch.Tensor, lengths: torch.Tensor
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
"""Attentive RNN forward pass
If attention is enabled, the outputs are the RNN hidden states reweighted by their attention scores and averaged over the sequence.
Otherwise the output is the last hidden state of the RNN.
Args:
x (torch.Tensor): [B, L, D] Input features
lengths (torch.Tensor): [B] Original sequence lengths
Returns:
Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
if return_hidden == False: Returns a tensor [B, H] or [B, 2*H] of output features to be used for classification
if return_hidden == True: Returns a tensor [B, H] or [B, 2*H] of output features to
be used for classification, and a tensor of all the hidden states
"""
states, last_hidden, _ = self.rnn(x, lengths)
out: torch.Tensor = last_hidden
if self.attention is not None:
states, _ = self.attention(
states,
attention_mask=pad_mask(
lengths,
max_length=states.size(1) if self.batch_first else states.size(0),
),
)
out = states.mean(dim=1)
if self.return_hidden:
return out, states
else:
return out
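A minimal usage sketch (illustrative only; tensor values are random and the import path follows the source location above):
import torch
from slp.modules.rnn import AttentiveRNN

model = AttentiveRNN(input_size=300, hidden_size=256, bidirectional=True, attention=True)
x = torch.randn(8, 40, 300)                              # [B, L, D] input features
lengths = torch.tensor([40, 36, 30, 28, 25, 20, 12, 7])  # [B] original sequence lengths
out = model(x, lengths)                                  # [8, 512]: bidirectional states are concatenated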
RNN
out_size: int
property
readonly
RNN output features size
Returns:
Type | Description |
---|---|
int |
int: RNN output features size |
__init__(self, input_size, hidden_size, batch_first=True, layers=1, bidirectional=False, merge_bi='cat', dropout=0.0, rnn_type='lstm', packed_sequence=True, max_length=-1)
special
LSTM - GRU wrapper with packed sequence support and handling for bidirectional / last output states
It is recommended to run with batch_first=True because the rest of the code is built with this assumption
Parameters:
Name | Type | Description | Default |
---|---|---|---|
input_size |
int |
Input features. |
required |
hidden_size |
int |
Hidden features. |
required |
batch_first |
bool |
Use batch first representation type. Defaults to True. |
True |
layers |
int |
Number of RNN layers. Defaults to 1. |
1 |
bidirectional |
bool |
Use bidirectional RNNs. Defaults to False. |
False |
merge_bi |
str |
How bidirectional states are merged. Defaults to "cat". |
'cat' |
dropout |
float |
Dropout probability. Defaults to 0.0. |
0.0 |
rnn_type |
str |
lstm or gru. Defaults to "lstm". |
'lstm' |
packed_sequence |
bool |
Use packed sequences. Defaults to True. |
True |
max_length |
int |
Maximum sequence length for fixed length padding. If -1 takes the largest sequence length in this batch |
-1 |
Source code in slp/modules/rnn.py
def __init__(
self,
input_size: int,
hidden_size: int,
batch_first: bool = True,
layers: int = 1,
bidirectional: bool = False,
merge_bi: str = "cat",
dropout: float = 0.0,
rnn_type: str = "lstm",
packed_sequence: bool = True,
max_length: int = -1,
):
"""LSTM - GRU wrapper with packed sequence support and handling for bidirectional / last output states
It is recommended to run with batch_first=True because the rest of the code is built with this assumption
Args:
input_size (int): Input features.
hidden_size (int): Hidden features.
batch_first (bool): Use batch first representation type. Defaults to True.
layers (int): Number of RNN layers. Defaults to 1.
bidirectional (bool): Use bidirectional RNNs. Defaults to False.
merge_bi (str): How bidirectional states are merged. Defaults to "cat".
dropout (float): Dropout probability. Defaults to 0.0.
rnn_type (str): lstm or gru. Defaults to "lstm".
packed_sequence (bool): Use packed sequences. Defaults to True.
max_length (int): Maximum sequence length for fixed length padding. If -1 takes the largest sequence length in this batch. Defaults to -1.
"""
super(RNN, self).__init__()
self.bidirectional = bidirectional
self.hidden_size = hidden_size
self.batch_first = batch_first
self.merge_bi = merge_bi
self.rnn_type = rnn_type.lower()
if not batch_first:
logger.warning(
"You are running RNN with batch_first=False. Make sure this is really what you want"
)
if not packed_sequence:
logger.warning(
"You have set packed_sequence=False. Running with packed_sequence=True will be much faster"
)
rnn_cls = nn.LSTM if self.rnn_type == "lstm" else nn.GRU
self.rnn = rnn_cls(
input_size,
hidden_size,
batch_first=batch_first,
num_layers=layers,
bidirectional=bidirectional,
)
self.drop = nn.Dropout(dropout)
self.packed_sequence = packed_sequence
if packed_sequence:
self.pack = PackSequence(batch_first=batch_first)
self.unpack = PadPackedSequence(
batch_first=batch_first, max_length=max_length
)
forward(self, x, lengths)
RNN forward pass
Parameters:
Name | Type | Description | Default |
---|---|---|---|
x |
Tensor |
[B, L, D] Input features |
required |
lengths |
Tensor |
[B] Original sequence lengths |
required |
Returns:
Type | Description |
---|---|
Tuple[torch.Tensor, torch.Tensor, Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] |
Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: ( merged forward and backward states [B, L, H] or [B, L, 2H], merged last forward and backward state [B, H] or [B, 2H], hidden states tuple of [num_layers * num_directions, B, H] for LSTM or tensor [num_layers * num_directions, B, H] for GRU ) |
Source code in slp/modules/rnn.py
def forward(
self, x: torch.Tensor, lengths: torch.Tensor
) -> Tuple[
torch.Tensor,
torch.Tensor,
Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
]:
"""RNN forward pass
Args:
x (torch.Tensor): [B, L, D] Input features
lengths (torch.Tensor): [B] Original sequence lengths
Returns:
Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: (
merged forward and backward states [B, L, H] or [B, L, 2*H],
merged last forward and backward state [B, H] or [B, 2*H],
hidden states tuple of [num_layers * num_directions, B, H] for LSTM or tensor [num_layers * num_directions, B, H] for GRU
)
"""
self.rnn.flatten_parameters()
if self.packed_sequence:
# Latest pytorch allows only cpu tensors for packed sequence
lengths = lengths.to("cpu")
x, lengths = self.pack(x, lengths)
out, hidden = self.rnn(x)
if self.packed_sequence:
out = self.unpack(out, lengths)
out = self.drop(out)
lengths = lengths.to(out.device)
out, last_timestep = self._final_output(out, lengths)
return out, last_timestep, hidden
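A minimal usage sketch (illustrative only; tensor values are random and the import path follows the source location above):
import torch
from slp.modules.rnn import RNN

rnn = RNN(300, hidden_size=256, bidirectional=True, merge_bi="cat")
x = torch.randn(8, 40, 300)                              # [B, L, D] input features
lengths = torch.tensor([40, 35, 33, 30, 22, 18, 10, 4])  # [B] original sequence lengths
states, last, hidden = rnn(x, lengths)
# states: [8, 40, 512], last: [8, 512], hidden: (h_n, c_n) for the underlying LSTM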
TokenRNN
__init__(self, hidden_size=256, vocab_size=None, embeddings_dim=None, embeddings=None, embeddings_dropout=0.0, finetune_embeddings=False, batch_first=True, layers=1, bidirectional=False, merge_bi='cat', dropout=0.1, rnn_type='lstm', packed_sequence=True, attention=False, max_length=-1, num_heads=1, nystrom=True, num_landmarks=32, kernel_size=33, inverse_iterations=6, return_hidden=False)
special
RNN with embedding layer and optional attention mechanism
Single-headed scaled dot-product attention is used when num_heads == 1, otherwise multi-headed (optionally Nystrom-approximated) attention is used
Parameters:
Name | Type | Description | Default |
---|---|---|---|
hidden_size |
int |
Hidden features |
256 |
vocab_size |
Optional[int] |
Vocabulary size. Defaults to None. |
None |
embeddings_dim |
Optional[int] |
Embedding dimension. Defaults to None. |
None |
embeddings |
Optional[numpy.ndarray] |
Embedding matrix. Defaults to None. |
None |
embeddings_dropout |
float |
Embedding dropout probability. Defaults to 0.0. |
0.0 |
finetune_embeddings |
bool |
Finetune embeddings? Defaults to False. |
False |
batch_first |
bool |
Use batch first representation type. Defaults to True. |
True |
layers |
int |
Number of RNN layers. Defaults to 1. |
1 |
bidirectional |
bool |
Use bidirectional RNNs. Defaults to False. |
False |
merge_bi |
str |
How bidirectional states are merged. Defaults to "cat". |
'cat' |
dropout |
float |
Dropout probability. Defaults to 0.1. |
0.1 |
rnn_type |
str |
lstm or gru. Defaults to "lstm". |
'lstm' |
packed_sequence |
bool |
Use packed sequences. Defaults to True. |
True |
max_length |
int |
Maximum sequence length for fixed length padding. If -1 takes the largest sequence length in this batch |
-1 |
attention |
bool |
Use attention mechanism. Defaults to False |
False |
num_heads |
int |
Number of attention heads. If 1 uses single headed attention |
1 |
nystrom |
bool |
Use nystrom approximation for multihead attention |
True |
num_landmarks |
int |
Number of landmark sequence elements for nystrom attention |
32 |
kernel_size |
Optional[int] |
Kernel size for multihead attention output residual convolution |
33 |
inverse_iterations |
int |
Number of iterations for moore-penrose inverse approximation in nystrom attention. 6 is a good value |
6 |
return_hidden |
bool |
Return all hidden states. Defaults to False. |
False |
Source code in slp/modules/rnn.py
def __init__(
self,
hidden_size: int = 256,
vocab_size: Optional[int] = None,
embeddings_dim: Optional[int] = None,
embeddings: Optional[np.ndarray] = None,
embeddings_dropout: float = 0.0,
finetune_embeddings: bool = False,
batch_first: bool = True,
layers: int = 1,
bidirectional: bool = False,
merge_bi: str = "cat",
dropout: float = 0.1,
rnn_type: str = "lstm",
packed_sequence: bool = True,
attention: bool = False,
max_length: int = -1,
num_heads: int = 1,
nystrom: bool = True,
num_landmarks: int = 32,
kernel_size: Optional[int] = 33,
inverse_iterations: int = 6,
return_hidden: bool = False,
):
"""RNN with embedding layer and optional attention mechanism
Single-headed scaled dot-product attention is used when num_heads == 1, otherwise multi-headed (optionally Nystrom-approximated) attention is used
Args:
hidden_size (int): Hidden features
vocab_size (Optional[int]): Vocabulary size. Defaults to None.
embeddings_dim (Optional[int]): Embedding dimension. Defaults to None.
embeddings (Optional[np.ndarray]): Embedding matrix. Defaults to None.
embeddings_dropout (float): Embedding dropout probability. Defaults to 0.0.
finetune_embeddings (bool): Finetune embeddings? Defaults to False.
batch_first (bool): Use batch first representation type. Defaults to True.
layers (int): Number of RNN layers. Defaults to 1.
bidirectional (bool): Use bidirectional RNNs. Defaults to False.
merge_bi (str): How bidirectional states are merged. Defaults to "cat".
dropout (float): Dropout probability. Defaults to 0.1.
rnn_type (str): lstm or gru. Defaults to "lstm".
packed_sequence (bool): Use packed sequences. Defaults to True.
max_length (int): Maximum sequence length for fixed length padding. If -1 takes the
largest sequence length in this batch
attention (bool): Use attention mechanism. Defaults to False
num_heads (int): Number of attention heads. If 1 uses single headed attention
nystrom (bool): Use nystrom approximation for multihead attention
num_landmarks (int): Number of landmark sequence elements for nystrom attention
kernel_size (int): Kernel size for multihead attention output residual convolution
inverse_iterations (int): Number of iterations for moore-penrose inverse approximation
in nystrom attention. 6 is a good value
return_hidden (bool): Return all hidden states. Defaults to False.
"""
super(TokenRNN, self).__init__()
if embeddings is None:
finetune_embeddings = True
assert (
vocab_size is not None
), "You should either pass an embeddings matrix or vocab size"
assert (
embeddings_dim is not None
), "You should either pass an embeddings matrix or embeddings_dim"
else:
vocab_size = embeddings.shape[0]
embeddings_dim = embeddings.shape[1]
self.embed = Embed(
vocab_size, # type: ignore
embeddings_dim, # type: ignore
embeddings=embeddings,
dropout=embeddings_dropout,
scale=hidden_size ** 0.5,
trainable=finetune_embeddings,
)
self.encoder = AttentiveRNN(
embeddings_dim, # type: ignore
hidden_size,
batch_first=batch_first,
layers=layers,
bidirectional=bidirectional,
merge_bi=merge_bi,
dropout=dropout,
rnn_type=rnn_type,
packed_sequence=packed_sequence,
attention=attention,
max_length=max_length,
num_heads=num_heads,
nystrom=nystrom,
num_landmarks=num_landmarks,
kernel_size=kernel_size,
inverse_iterations=inverse_iterations,
return_hidden=return_hidden,
)
self.out_size = self.encoder.out_size
forward(self, x, lengths)
Token RNN forward pass
If attention is enabled, the outputs are the RNN hidden states reweighted by their attention scores and averaged over the sequence. Otherwise the output is the last hidden state of the RNN.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
x |
Tensor |
[B, L] Input token ids |
required |
lengths |
Tensor |
[B] Original sequence lengths |
required |
Returns:
Type | Description |
---|---|
Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]] |
torch.Tensor: [B, H] or [B, 2*H] Output features to be used for classification |
Source code in slp/modules/rnn.py
def forward(
self, x: torch.Tensor, lengths: torch.Tensor
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
"""Token RNN forward pass
If attention is enabled, the outputs are the RNN hidden states reweighted by their attention scores and averaged over the sequence.
Otherwise the output is the last hidden state of the RNN.
Args:
x (torch.Tensor): [B, L] Input token ids
lengths (torch.Tensor): [B] Original sequence lengths
Returns:
torch.Tensor: [B, H] or [B, 2*H] Output features to be used for classification
"""
x = self.embed(x)
out = self.encoder(x, lengths)
return out # type: ignore
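A minimal usage sketch (illustrative only; vocabulary size and tensor values are made up):
import torch
from slp.modules.rnn import TokenRNN

model = TokenRNN(hidden_size=256, vocab_size=10000, embeddings_dim=300, attention=True)
tokens = torch.randint(0, 10000, (8, 40))                # [B, L] input token ids
lengths = torch.tensor([40, 37, 33, 29, 25, 19, 11, 6])  # [B] original sequence lengths
out = model(tokens, lengths)                             # [8, 256] output features for classification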
Decoder
forward(self, target, encoded, source_mask=None, target_mask=None)
Transformer decoder forward pass.
The target sequence is passed through the stack of decoder layers, attending to the encoder outputs at every layer.
Source code in slp/modules/transformer.py
def forward(self, target, encoded, source_mask=None, target_mask=None):
for layer in self.decoder:
target = layer(
target, encoded, source_mask=source_mask, target_mask=target_mask
)
return target
DecoderLayer
forward(self, targets, encoded, source_mask=None, target_mask=None)
Single transformer decoder layer forward pass.
Masked self-attention over the targets, cross-attention over the encoder outputs, followed by the position-wise feed-forward sublayer.
Source code in slp/modules/transformer.py
def forward(self, targets, encoded, source_mask=None, target_mask=None):
targets = self.in_layer(targets, attention_mask=target_mask)
out = self.fuse_layer(encoded, targets, attention_mask=source_mask)
out = self.out_layer(out)
return out
Encoder
forward(self, x, attention_mask=None)
Transformer encoder forward pass.
The input sequence is passed through the stack of encoder layers.
Source code in slp/modules/transformer.py
def forward(self, x, attention_mask=None):
for layer in self.encoder:
x = layer(x, attention_mask=attention_mask)
return x
EncoderDecoder
forward(self, source, target, source_mask=None, target_mask=None)
Encoder-decoder forward pass.
The source sequence is encoded and the target sequence is decoded conditioned on the encoder outputs.
Source code in slp/modules/transformer.py
def forward(self, source, target, source_mask=None, target_mask=None):
encoded = self.encoder(source, attention_mask=source_mask)
decoded = self.decoder(
target, encoded, source_mask=source_mask, target_mask=target_mask
)
return decoded
EncoderLayer
forward(self, x, attention_mask=None)
Single transformer encoder layer forward pass.
Self-attention sublayer followed by the position-wise feed-forward sublayer.
Source code in slp/modules/transformer.py
def forward(self, x, attention_mask=None):
out = self.l1(x, attention_mask=attention_mask)
out = self.l2(out)
return out
Sublayer1
forward(self, x, attention_mask=None)
Self-attention sublayer forward pass.
Applies the pre-norm or post-norm residual variant depending on self.prenorm.
Source code in slp/modules/transformer.py
def forward(self, x, attention_mask=None):
return (
self._prenorm(x, attention_mask=attention_mask)
if self.prenorm
else self._postnorm(x, attention_mask=attention_mask)
)
Sublayer2
forward(self, x)
Position-wise feed-forward sublayer forward pass.
Applies the pre-norm or post-norm residual variant depending on self.prenorm.
Source code in slp/modules/transformer.py
def forward(self, x):
return self._prenorm(x) if self.prenorm else self._postnorm(x)
Sublayer3
forward(self, x, y, attention_mask=None)
Cross-attention sublayer forward pass over two input sequences (e.g. encoder outputs and decoder states).
Applies the pre-norm or post-norm residual variant depending on self.prenorm.
Source code in slp/modules/transformer.py
def forward(self, x, y, attention_mask=None):
return (
self._prenorm(x, y, attention_mask=attention_mask)
if self.prenorm
else self._postnorm(x, y, attention_mask=attention_mask)
)
Transformer
forward(self, source, target, source_mask=None, target_mask=None)
Sequence-to-sequence transformer forward pass.
Source and target tokens are embedded, positional encodings are added, the encoder-decoder block is applied, and the output projection produces the final predictions.
Source code in slp/modules/transformer.py
def forward(self, source, target, source_mask=None, target_mask=None):
source = self.embed(source)
target = self.embed(target)
# Adding embeddings + pos embeddings
# is done in PositionalEncoding class
source = self.pe(source)
target = self.pe(target)
out = self.transformer_block(
source, target, source_mask=source_mask, target_mask=target_mask
)
out = self.drop(out)
out = self.predict(out)
return out
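The masks passed to forward follow the zero-one convention used by the attention modules. A sketch of how such masks are commonly built (illustrative only; this construction is not part of slp and the exact shapes expected by your configuration may differ):
import torch

B, L_src, L_tgt = 8, 40, 35
src_lengths = torch.randint(5, L_src + 1, (B,))
# Padding mask for the source: 1 for real positions, 0 for padding -> [B, L_src]
source_mask = (torch.arange(L_src).unsqueeze(0) < src_lengths.unsqueeze(1)).long()
# Causal (lower triangular) mask for the target -> [B, L_tgt, L_tgt]
target_mask = torch.tril(torch.ones(L_tgt, L_tgt, dtype=torch.long)).unsqueeze(0).expand(B, -1, -1)
# out = model(source_tokens, target_tokens, source_mask=source_mask, target_mask=target_mask)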
TransformerSequenceEncoder
forward(self, x, attention_mask=None)
Transformer sequence encoder forward pass for continuous input features.
Optional feature normalization, input embedding, positional encoding and the transformer encoder block, followed by mean pooling over the sequence dimension.
Source code in slp/modules/transformer.py
def forward(self, x, attention_mask=None):
if self.feature_norm:
x = self.feature_norm(x)
x = self.embed(x)
x = self.pe(x)
out = self.transformer_block(x, attention_mask=attention_mask).mean(dim=1)
return out
TransformerTokenSequenceEncoder
forward(self, x, attention_mask=None)
Transformer sequence encoder forward pass for token ids.
Token embedding, positional encoding and the transformer encoder block, followed by mean pooling over the sequence dimension.
Source code in slp/modules/transformer.py
def forward(self, x, attention_mask=None):
x = self.embed(x)
x = self.pe(x)
out = self.transformer_block(x, attention_mask=attention_mask).mean(dim=1)
return out
reset_parameters(named_parameters, gain=1.0)
Initialize parameters in the transformer model.
Source code in slp/modules/transformer.py
def reset_parameters(named_parameters, gain=1.0):
"""Initialize parameters in the transformer model."""
for name, p in named_parameters:
# Xavier init for weight matrices, zero init for bias vectors
if "weight" in name and p.dim() > 1:
nn.init.xavier_normal_(p, gain=gain)
if "bias" in name:
nn.init.constant_(p, 0.0)
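Typical usage is to call it on the named parameters of a freshly constructed module (illustrative; the import path follows the source location above):
import torch.nn as nn
from slp.modules.transformer import reset_parameters

layer = nn.Linear(512, 512)
reset_parameters(layer.named_parameters(), gain=1.0)  # Xavier-init weights, zero biases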