Data manipulation

We provide out-of-the-box support for easy preprocessing of NLP corpora, along with helpers to work with datasets the PyTorch way.
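
For orientation, here is a rough end-to-end sketch for sentence classification. It is a minimal sketch, not a snippet from the library's tests: the import paths are assumed from the source file locations referenced on this page, the GloVe path is a placeholder, and the default spacy model is assumed to be installed.

from torch.utils.data import DataLoader

from slp.data.collators import SequenceClassificationCollator
from slp.data.corpus import WordCorpus
from slp.data.datasets import CorpusDataset
from slp.data.transforms import ToTensor

sentences = ["a really great movie", "utterly boring"]
labels = ["positive", "negative"]

# Tokenize with spacy, build the vocabulary and load pretrained embeddings.
corpus = WordCorpus(sentences, embeddings_file="glove.6B.50d.txt", embeddings_dim=50)

# Pair token ids with labels and convert each sample to a tensor.
dataset = CorpusDataset(corpus, labels).map(ToTensor(device="cpu"))

# Pad and batch with the provided collator.
loader = DataLoader(dataset, batch_size=2, collate_fn=SequenceClassificationCollator())

for inputs, targets, lengths in loader:
    print(inputs.shape, targets, lengths)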

MultimodalSequenceClassificationCollator

__call__(self, batch) special

Call collate function

Parameters:

Name Type Description Default
batch List[Dict[str, torch.Tensor]]

Batch of samples. Expects a list of dictionaries mapping modalities to torch tensors

required

Returns:

Type Description
Tuple[Dict[str, torch.Tensor], torch.Tensor, Dict[str, torch.Tensor]]

Tuple[Dict[str, torch.Tensor], torch.Tensor, Dict[str, torch.Tensor]]: tuple of (dict of batched modality tensors, labels, dict of modality sequence lengths)

Source code in slp/data/collators.py
def __call__(
    self, batch: List[Dict[str, torch.Tensor]]
) -> Tuple[Dict[str, torch.Tensor], torch.Tensor, Dict[str, torch.Tensor]]:
    """Call collate function

    Args:
        batch (List[Dict[str, torch.Tensor]]): Batch of samples.
            It expects a list of dictionaries from modalities to torch tensors

    Returns:
        Tuple[Dict[str, torch.Tensor], torch.Tensor, Dict[str, torch.Tensor]]: tuple of
            (dict batched modality tensors, labels, dict of modality sequence lengths)
    """
    inputs = {}
    lengths = {}

    for m in self.modalities:
        seq = self.extract_sequence(batch, m)
        lengths[m] = torch.tensor([s.size(0) for s in seq], device=self.device)

        if self.max_length > 0:
            lengths[m] = torch.clamp(lengths[m], min=0, max=self.max_length)

        inputs[m] = pad_sequence(
            seq,
            batch_first=True,
            padding_value=self.pad_indx,
            max_length=self.max_length,
        ).to(self.device)

    targets: List[Label] = [b[self.label_key] for b in batch]

    # Pad and convert to tensor
    ttargets: torch.Tensor = mktensor(
        targets, device=self.device, dtype=self.label_dtype
    )

    return inputs, ttargets.to(self.device), lengths
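
A minimal sketch of what the collator consumes and returns for a toy two-sample batch. The import path is assumed from the source file location above, and the printed shapes assume the default max_length=-1 (pad to the longest sequence in the batch):

import torch

from slp.data.collators import MultimodalSequenceClassificationCollator

collate_fn = MultimodalSequenceClassificationCollator(modalities={"text", "audio"})

# Two samples with different sequence lengths per modality.
batch = [
    {"text": torch.randn(5, 300), "audio": torch.randn(7, 74), "label": 1},
    {"text": torch.randn(3, 300), "audio": torch.randn(4, 74), "label": 0},
]

inputs, labels, lengths = collate_fn(batch)
print(inputs["text"].shape)   # torch.Size([2, 5, 300]) -> padded to the longest text sequence
print(inputs["audio"].shape)  # torch.Size([2, 7, 74])
print(labels)                 # tensor([1., 0.]) with the default float label_dtype
print(lengths["text"])        # tensor([5, 3])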

__init__(self, pad_indx=0, modalities={'audio', 'visual', 'text'}, label_key='label', max_length=-1, label_dtype=torch.float32, device='cpu') special

Collate function for sequence classification tasks

  • Perform padding
  • Calculate sequence lengths

Parameters:

Name Type Description Default
pad_indx int

Pad token index. Defaults to 0.

0
modalities Set

Which modalities are included in the batch dict

{'audio', 'visual', 'text'}
max_length int

Pad sequences to a fixed maximum length

-1
label_key str

String to access the label in the batch dict

'label'
device str

Device of returned tensors. Leave this as "cpu"; the LightningModule will handle the conversion.

'cpu'

Examples:

>>> dataloader = torch.utils.data.DataLoader(my_dataset, collate_fn=MultimodalSequenceClassificationCollator())
Source code in slp/data/collators.py
def __init__(
    self,
    pad_indx=0,
    modalities={"visual", "text", "audio"},
    label_key="label",
    max_length=-1,
    label_dtype=torch.float,
    device="cpu",
):
    """Collate function for sequence classification tasks

    * Perform padding
    * Calculate sequence lengths

    Args:
        pad_indx (int): Pad token index. Defaults to 0.
        modalities (Set): Which modalities are included in the batch dict
        max_length (int): Pad sequences to a fixed maximum length
        label_key (str): String to access the label in the batch dict
        device (str): device of returned tensors. Leave this as "cpu".
            The LightningModule will handle the conversion.

    Examples:
        >>> dataloader = torch.utils.data.DataLoader(my_dataset, collate_fn=MultimodalSequenceClassificationCollator())
    """
    self.pad_indx = pad_indx
    self.device = device
    self.max_length = max_length
    self.label_key = label_key
    self.modalities = modalities
    self.label_dtype = label_dtype

Seq2SeqCollator

__call__(self, batch) special

Call collate function

Parameters:

Name Type Description Default
batch List[Tuple[torch.Tensor, torch.Tensor]]

Batch of samples. Expects a list of tuples (source, target). Each source and target is a sequence of features or ids.

required

Returns:

Type Description
Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]

Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: Returns tuple of batched tensors (inputs, labels, lengths_inputs, lengths_targets)

Source code in slp/data/collators.py
def __call__(
    self, batch: List[Tuple[torch.Tensor, torch.Tensor]]
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Call collate function

    Args:
        batch (List[Tuple[torch.Tensor, torch.Tensor]]): Batch of samples.
            It expects a list of tuples (source, target)
            Each source and target is a sequence of features or ids.

    Returns:
        Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: Returns tuple of batched tensors
            (inputs, labels, lengths_inputs, lengths_targets)
    """
    inputs: List[torch.Tensor] = [b[0] for b in batch]
    targets: List[torch.Tensor] = [b[1] for b in batch]
    lengths_inputs = torch.tensor([s.size(0) for s in inputs], device=self.device)
    lengths_targets = torch.tensor([s.size(0) for s in targets], device=self.device)

    if self.max_length > 0:
        lengths_inputs = torch.clamp(lengths_inputs, min=0, max=self.max_length)
        lengths_targets = torch.clamp(lengths_targets, min=0, max=self.max_length)

    inputs_padded: torch.Tensor = pad_sequence(
        inputs,
        batch_first=True,
        padding_value=self.pad_indx,
        max_length=self.max_length,
    ).to(self.device)

    targets_padded: torch.Tensor = pad_sequence(
        targets,
        batch_first=True,
        padding_value=self.pad_indx,
        max_length=self.max_length,
    ).to(self.device)

    return inputs_padded, targets_padded, lengths_inputs, lengths_targets
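
A minimal sketch of collating a toy seq2seq batch; the import path is assumed from the source file location above:

import torch

from slp.data.collators import Seq2SeqCollator

collate_fn = Seq2SeqCollator(pad_indx=0)

# Each sample is a (source, target) tuple of token-id sequences.
batch = [
    (torch.tensor([1, 2, 3, 4]), torch.tensor([5, 6])),
    (torch.tensor([7, 8]), torch.tensor([9, 10, 11])),
]

inputs, targets, lengths_inputs, lengths_targets = collate_fn(batch)
print(inputs.shape)      # torch.Size([2, 4]) -> sources padded with pad_indx
print(targets.shape)     # torch.Size([2, 3]) -> targets padded with pad_indx
print(lengths_inputs)    # tensor([4, 2])
print(lengths_targets)   # tensor([2, 3])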

__init__(self, pad_indx=0, max_length=-1, device='cpu') special

Collate function for seq2seq tasks

  • Perform padding
  • Calculate sequence lengths

Parameters:

Name Type Description Default
pad_indx int

Pad token index. Defaults to 0.

0
max_length int

Pad sequences to a fixed maximum length

-1
device str

Device of returned tensors. Leave this as "cpu"; the LightningModule will handle the conversion.

'cpu'

Examples:

>>> dataloader = torch.utils.data.DataLoader(my_dataset, collate_fn=Seq2SeqCollator())
Source code in slp/data/collators.py
def __init__(self, pad_indx=0, max_length=-1, device="cpu"):
    """Collate function for seq2seq tasks

    * Perform padding
    * Calculate sequence lengths

    Args:
        pad_indx (int): Pad token index. Defaults to 0.
        max_length (int): Pad sequences to a fixed maximum length
        device (str): device of returned tensors. Leave this as "cpu".
            The LightningModule will handle the conversion.

    Examples:
        >>> dataloader = torch.utils.data.DataLoader(my_dataset, collate_fn=Seq2SeqCollator())
    """
    self.pad_indx = pad_indx
    self.max_length = max_length
    self.device = device

SequenceClassificationCollator

__call__(self, batch) special

Call collate function

Parameters:

Name Type Description Default
batch List[Tuple[torch.Tensor, Union[numpy.ndarray, torch.Tensor, List[~T], int]]]

Batch of samples. It expects a list of tuples (inputs, label).

required

Returns:

Type Description
Tuple[torch.Tensor, torch.Tensor, torch.Tensor]

Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Returns tuple of batched tensors (inputs, labels, lengths)

Source code in slp/data/collators.py
def __call__(
    self, batch: List[Tuple[torch.Tensor, Label]]
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Call collate function

    Args:
        batch (List[Tuple[torch.Tensor, slp.util.types.Label]]): Batch of samples.
            It expects a list of tuples (inputs, label).

    Returns:
        Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Returns tuple of batched tensors (inputs, labels, lengths)
    """
    inputs: List[torch.Tensor] = [b[0] for b in batch]
    targets: List[Label] = [b[1] for b in batch]
    #  targets: List[torch.tensor] = map(list, zip(*batch))
    lengths = torch.tensor([s.size(0) for s in inputs], device=self.device)

    if self.max_length > 0:
        lengths = torch.clamp(lengths, min=0, max=self.max_length)
    # Pad and convert to tensor
    inputs_padded: torch.Tensor = pad_sequence(
        inputs,
        batch_first=True,
        padding_value=self.pad_indx,
        max_length=self.max_length,
    ).to(self.device)

    ttargets: torch.Tensor = mktensor(targets, device=self.device, dtype=torch.long)

    return inputs_padded, ttargets.to(self.device), lengths
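
A minimal sketch of collating a toy classification batch; the import path is assumed from the source file location above:

import torch

from slp.data.collators import SequenceClassificationCollator

collate_fn = SequenceClassificationCollator(pad_indx=0)

batch = [
    (torch.tensor([4, 8, 15, 16]), 1),  # (token ids, label)
    (torch.tensor([23, 42]), 0),
]

inputs, labels, lengths = collate_fn(batch)
print(inputs)   # tensor([[ 4,  8, 15, 16], [23, 42,  0,  0]])
print(labels)   # tensor([1, 0])
print(lengths)  # tensor([4, 2])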

__init__(self, pad_indx=0, max_length=-1, device='cpu') special

Collate function for sequence classification tasks

  • Perform padding
  • Calculate sequence lengths

Parameters:

Name Type Description Default
pad_indx int

Pad token index. Defaults to 0.

0
max_length int

Pad sequences to a fixed maximum length

-1
device str

Device of returned tensors. Leave this as "cpu"; the LightningModule will handle the conversion.

'cpu'

Examples:

>>> dataloader = torch.utils.data.DataLoader(my_dataset, collate_fn=SequenceClassificationCollator())
Source code in slp/data/collators.py
def __init__(self, pad_indx=0, max_length=-1, device="cpu"):
    """Collate function for sequence classification tasks

    * Perform padding
    * Calculate sequence lengths

    Args:
        pad_indx (int): Pad token index. Defaults to 0.
        max_length (int): Pad sequences to a fixed maximum length
        device (str): device of returned tensors. Leave this as "cpu".
            The LightningModule will handle the conversion.

    Examples:
        >>> dataloader = torch.utils.data.DataLoader(my_dataset, collate_fn=SequenceClassificationCollator())
    """
    self.pad_indx = pad_indx
    self.device = device
    self.max_length = max_length

EmbeddingsLoader

__init__(self, embeddings_file, dim, vocab=None, extra_tokens=None) special

Load word embeddings in text format

Parameters:

Name Type Description Default
embeddings_file str

File where embeddings are stored (e.g. glove.6B.50d.txt)

required
dim int

Dimensionality of embeddings

required
vocab Optional[Dict[str, int]]

Load only embeddings in vocab. Defaults to None.

None
extra_tokens Optional[slp.config.nlp.SPECIAL_TOKENS]

Create random embeddings for these special tokens. Defaults to None.

None
Source code in slp/data/corpus.py
def __init__(
    self,
    embeddings_file: str,
    dim: int,
    vocab: Optional[Dict[str, int]] = None,
    extra_tokens: Optional[SPECIAL_TOKENS] = None,
) -> None:
    """Load word embeddings in text format

    Args:
        embeddings_file (str): File where embeddings are stored (e.g. glove.6B.50d.txt)
        dim (int): Dimensionality of embeddings
        vocab (Optional[Dict[str, int]]): Load only embeddings in vocab. Defaults to None.
        extra_tokens (Optional[slp.config.nlp.SPECIAL_TOKENS]): Create random embeddings for these special tokens.
            Defaults to None.
    """
    self.embeddings_file = embeddings_file
    self.vocab = vocab
    self.cache_ = self._get_cache_name()
    self.dim_ = dim
    self.extra_tokens = extra_tokens

__repr__(self) special

String representation of class

Source code in slp/data/corpus.py
def __repr__(self):
    """String representation of class"""

    return f"{self.__class__.__name__}({self.embeddings_file}, {self.dim_})"

augment_embeddings(self, word2idx, idx2word, embeddings, token, emb=None)

Create a random embedding for a special token and append it to the embeddings array

Parameters:

Name Type Description Default
word2idx Dict[str, int]

Current word2idx map

required
idx2word Dict[int, str]

Current idx2word map

required
embeddings List[numpy.ndarray]

Embeddings array as list of embeddings

required
token str

The special token (e.g. [PAD])

required
emb Optional[numpy.ndarray]

Optional value for the embedding to be appended. Defaults to None, where a random embedding is created.

None

Returns:

Type Description
Tuple[Dict[str, int], Dict[int, str], List[numpy.ndarray]]

Tuple[Dict[str, int], Dict[int, str], List[np.ndarray]]: (word2idx, idx2word, embeddings) tuple

Source code in slp/data/corpus.py
def augment_embeddings(
    self,
    word2idx: Dict[str, int],
    idx2word: Dict[int, str],
    embeddings: List[np.ndarray],
    token: str,
    emb: Optional[np.ndarray] = None,
) -> Tuple[Dict[str, int], Dict[int, str], List[np.ndarray]]:
    """Create a random embedding for a special token and append it to the embeddings array

    Args:
        word2idx (Dict[str, int]): Current word2idx map
        idx2word (Dict[int, str]): Current idx2word map
        embeddings (List[np.ndarray]): Embeddings array as list of embeddings
        token (str): The special token (e.g. [PAD])
        emb (Optional[np.ndarray]): Optional value for the embedding to be appended.
            Defaults to None, where a random embedding is created.

    Returns:
        Tuple[Dict[str, int], Dict[int, str], List[np.ndarray]]: (word2idx, idx2word, embeddings) tuple
    """
    word2idx[token] = len(embeddings)
    idx2word[len(embeddings)] = token

    if emb is None:
        emb = np.random.uniform(low=-0.05, high=0.05, size=self.dim_)
    embeddings.append(emb)

    return word2idx, idx2word, embeddings
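
A short sketch of how augment_embeddings builds up the special-token rows. The GloVe path passed to the constructor is a placeholder (it is assumed the constructor only records it), and the import path is assumed from the source file location above:

import numpy as np

from slp.data.corpus import EmbeddingsLoader

loader = EmbeddingsLoader("glove.6B.50d.txt", 50)  # placeholder path, 50-d vectors

word2idx, idx2word, embeddings = {}, {}, []

# Reserve index 0 for [PAD] with an all-zeros vector ...
word2idx, idx2word, embeddings = loader.augment_embeddings(
    word2idx, idx2word, embeddings, "[PAD]", emb=np.zeros(50)
)

# ... then append a random vector for [UNK].
word2idx, idx2word, embeddings = loader.augment_embeddings(
    word2idx, idx2word, embeddings, "[UNK]"
)

print(word2idx)         # {'[PAD]': 0, '[UNK]': 1}
print(len(embeddings))  # 2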

in_accepted_vocab(self, word)

Check if word exists in given vocabulary

Parameters:

Name Type Description Default
word str

word from embeddings file

required

Returns:

Type Description
bool

bool: Word exists

Source code in slp/data/corpus.py
def in_accepted_vocab(self, word: str) -> bool:
    """Check if word exists in given vocabulary

    Args:
        word (str): word from embeddings file

    Returns:
        bool: Word exists
    """

    return True if self.vocab is None else word in self.vocab

load(self)

Read the word vectors from a text file

  • Read embeddings
  • Filter with given vocabulary
  • Augment with special tokens

Returns:

Type Description
Tuple[Dict[str, int], Dict[int, str], numpy.ndarray]

types.Embeddings: (word2idx, idx2word, embeddings) tuple

Source code in slp/data/corpus.py
@system.timethis(method=True)
def load(self) -> types.Embeddings:
    """Read the word vectors from a text file

    * Read embeddings
    * Filter with given vocabulary
    * Augment with special tokens

    Returns:
        types.Embeddings: (word2idx, idx2word, embeddings) tuple
    """
    # in order to avoid this time consuming operation, cache the results
    try:
        cache = self._load_cache()
        logger.info("Loaded word embeddings from cache.")

        return cache
    except OSError:
        logger.warning(f"Didn't find embeddings cache file {self.embeddings_file}")
        logger.warning("Loading embeddings from file.")

    # create the necessary dictionaries and the word embeddings matrix

    if not os.path.exists(self.embeddings_file):
        logger.critical(f"{self.embeddings_file} not found!")
        raise OSError(errno.ENOENT, os.strerror(errno.ENOENT), self.embeddings_file)

    logger.info(f"Indexing file {self.embeddings_file} ...")

    # create the 2D array, which will be used for initializing
    # the Embedding layer of a NN.
    # We reserve the first row (idx=0), as the word embedding,
    # which will be used for zero padding (word with id = 0).

    if self.extra_tokens is not None:
        word2idx, idx2word, embeddings = self.augment_embeddings(
            {},
            {},
            [],
            self.extra_tokens.PAD.value,  # type: ignore
            emb=np.zeros(self.dim_),
        )

        for token in self.extra_tokens:  # type: ignore
            logger.debug(f"Adding token {token.value} to embeddings matrix")

            if token == self.extra_tokens.PAD:
                continue
            word2idx, idx2word, embeddings = self.augment_embeddings(
                word2idx, idx2word, embeddings, token.value
            )
    else:
        word2idx, idx2word, embeddings = self.augment_embeddings(
            {}, {}, [], "[PAD]", emb=np.zeros(self.dim_)
        )
    # read file, line by line
    with open(self.embeddings_file, "r") as f:
        num_lines = sum(1 for line in f)

    with open(self.embeddings_file, "r") as f:
        index = len(embeddings)

        for line in tqdm(
            f, total=num_lines, desc="Loading word embeddings...", leave=False
        ):
            # skip the first row if it is a header

            if len(line.split()) < self.dim_:
                continue

            values = line.rstrip().split(" ")
            word = values[0]

            if word in word2idx:
                continue

            if not self.in_accepted_vocab(word):
                continue

            vector = np.asarray(values[1:], dtype=np.float32)
            idx2word[index] = word
            word2idx[word] = index
            embeddings.append(vector)
            index += 1

    logger.info(f"Loaded {len(embeddings)} word vectors.")
    embeddings_out = np.array(embeddings, dtype="float32")

    # write the data to a cache file
    self._dump_cache((word2idx, idx2word, embeddings_out))

    return word2idx, idx2word, embeddings_out
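
A minimal usage sketch for load(), assuming a GloVe-style text file is available locally (the path below is a placeholder) and that slp.data.corpus is the import path:

from slp.config.nlp import SPECIAL_TOKENS
from slp.data.corpus import EmbeddingsLoader

loader = EmbeddingsLoader("glove.6B.50d.txt", 50, extra_tokens=SPECIAL_TOKENS)
word2idx, idx2word, embeddings = loader.load()

print(embeddings.shape)   # (vocabulary size, 50); special tokens occupy the first rows
print(word2idx["[PAD]"])  # 0 -> the zero-padding embedding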

HfCorpus

embeddings: None property readonly

Unused. Defined for compatibility

frequencies: Dict[str, int] property readonly

Retrieve wordpiece occurrence counts

Returns:

Type Description
Dict[str, int]

Dict[str, int]: wordpiece occurrence counts

idx2word: None property readonly

Unused. Defined for compatibility

indices: List[List[int]] property readonly

Retrieve corpus as token indices

Returns:

Type Description
List[List[int]]

List[List[int]]: Token indices for corpus

raw: List[str] property readonly

Retrieve raw corpus

Returns:

Type Description
List[str]

List[str]: Raw Corpus

tokenized: List[List[str]] property readonly

Retrieve tokenized corpus

Returns:

Type Description
List[List[str]]

List[List[str]]: tokenized corpus

vocab: Set[str] property readonly

Retrieve set of words in vocabulary

Returns:

Type Description
Set[str]

Set[str]: set of words in vocabulary

vocab_size: int property readonly

Retrieve vocabulary size

Returns:

Type Description
int

int: Vocabulary size

word2idx: None property readonly

Unused. Defined for compatibility

__getitem__(self, idx) special

Get ith element in corpus as token indices

Parameters:

Name Type Description Default
idx int

index in corpus

required

Returns:

Type Description
List[int]

List[int]: List of token indices for sentence

Source code in slp/data/corpus.py
def __getitem__(self, idx) -> List[int]:
    """Get ith element in corpus as token indices

    Args:
        idx (int): index in corpus

    Returns:
        List[int]: List of token indices for sentence
    """
    out: List[int] = (
        self.corpus_indices_[idx]
        if self.max_length <= 0
        else self.corpus_indices_[idx][: self.max_length]
    )

    return out

__init__(self, corpus, lower=True, tokenizer_model='bert-base-uncased', add_special_tokens=True, special_tokens=<enum 'SPECIAL_TOKENS'>, max_length=-1, **kwargs) special

Process a corpus using Hugging Face tokenizers

Select one of the Hugging Face tokenizers and process the corpus

Parameters:

Name Type Description Default
corpus List[str]

List of sentences

required
lower bool

Convert strings to lower case. Defaults to True.

True
tokenizer_model str

Hugging Face model to use. Defaults to "bert-base-uncased".

'bert-base-uncased'
add_special_tokens bool

Add special tokens in sentence during tokenization. Defaults to True.

True
special_tokens Optional[slp.config.nlp.SPECIAL_TOKENS]

Special tokens to include in the vocabulary. Defaults to slp.config.nlp.SPECIAL_TOKENS.

<enum 'SPECIAL_TOKENS'>
max_length int

Crop sequences above this length. Defaults to -1 where sequences are left unaltered.

-1
Source code in slp/data/corpus.py
def __init__(
    self,
    corpus: List[str],
    lower: bool = True,
    tokenizer_model: str = "bert-base-uncased",
    add_special_tokens: bool = True,
    special_tokens: Optional[SPECIAL_TOKENS] = SPECIAL_TOKENS,  # type: ignore
    max_length: int = -1,
    **kwargs,
):
    """Process a corpus using hugging face tokenizers

    Select one of hugging face tokenizers and process corpus

    Args:
        corpus (List[str]): List of sentences
        lower (bool): Convert strings to lower case. Defaults to True.
        tokenizer_model (str): Hugging face model to use. Defaults to "bert-base-uncased".
        add_special_tokens (bool): Add special tokens in sentence during tokenization. Defaults to True.
        special_tokens (Optional[SPECIAL_TOKENS]): Special tokens to include in the vocabulary.
             Defaults to slp.config.nlp.SPECIAL_TOKENS.
        max_length (int): Crop sequences above this length. Defaults to -1 where sequences are left unaltered.
    """
    self.corpus_ = corpus
    self.max_length = max_length

    logger.info(
        f"Tokenizing corpus using hugging face tokenizer from {tokenizer_model}"
    )

    self.tokenizer = HuggingFaceTokenizer(
        lower=lower, model=tokenizer_model, add_special_tokens=add_special_tokens
    )

    self.corpus_indices_ = [
        self.tokenizer(s)
        for s in tqdm(
            self.corpus_, desc="Converting tokens to indices...", leave=False
        )
    ]

    self.tokenized_corpus_ = [
        self.tokenizer.detokenize(s)
        for s in tqdm(
            self.corpus_indices_,
            desc="Mapping indices to tokens...",
            leave=False,
        )
    ]

    self.vocab_ = create_vocab(
        self.tokenized_corpus_,
        vocab_size=-1,
        special_tokens=special_tokens,
    )
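
A short usage sketch, assuming slp.data.corpus is the import path and that the bert-base-uncased tokenizer can be fetched by transformers:

from slp.data.corpus import HfCorpus

corpus = HfCorpus(
    ["The cat sat on the mat.", "A galaxy far, far away."],
    tokenizer_model="bert-base-uncased",
)

print(len(corpus))          # 2
print(corpus[0])            # wordpiece ids for the first sentence, e.g. [101, ..., 102]
print(corpus.tokenized[0])  # the corresponding wordpieces
print(corpus.vocab_size)    # number of distinct wordpieces plus special tokens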

__len__(self) special

Number of samples in corpus

Returns:

Type Description
int

int: Corpus length

Source code in slp/data/corpus.py
def __len__(self) -> int:
    """Number of samples in corpus

    Returns:
        int: Corpus length
    """

    return len(self.corpus_indices_)

TokenizedCorpus

embeddings: None property readonly

Unused. Kept for compatibility

frequencies: Dict[str, int] property readonly

Retrieve wordpiece occurrence counts

Returns:

Type Description
Dict[str, int]

Dict[str, int]: wordpiece occurrence counts

idx2word: Dict[int, str] property readonly

Retrieve idx2word mapping

Returns:

Type Description
Dict[int, str]

Dict[int, str]: idx2word mapping

indices: Union[List[int], List[List[int]]] property readonly

Retrieve corpus as token indices

Returns:

Type Description
Union[List[int], List[List[int]]]

List[List[int]]: Token indices for corpus

raw: Union[List[str], List[List[str]]] property readonly

Retrieve raw corpus

Returns:

Type Description
Union[List[str], List[List[str]]]

List[str]: Raw Corpus

tokenized: Union[List[str], List[List[str]]] property readonly

Retrieve tokenized corpus

Returns:

Type Description
Union[List[str], List[List[str]]]

List[List[str]]: Tokenized corpus

vocab: Set[str] property readonly

Retrieve set of words in vocabulary

Returns:

Type Description
Set[str]

Set[str]: set of words in vocabulary

vocab_size: int property readonly

Retrieve vocabulary size

Returns:

Type Description
int

int: Vocabulary size

word2idx: Dict[str, int] property readonly

Retrieve word2idx mapping

Returns:

Type Description
Dict[str, int]

Dict[str, int]: word2idx mapping

__getitem__(self, idx) special

Get ith element in corpus as token indices

Parameters:

Name Type Description Default
idx int

index in corpus

required

Returns:

Type Description
List[int]

List[int]: List of token indices for sentence

Source code in slp/data/corpus.py
def __getitem__(self, idx) -> List[int]:
    """Get ith element in corpus as token indices

    Args:
        idx (int): index in corpus

    Returns:
        List[int]: List of token indices for sentence
    """
    out: List[int] = (
        self.corpus_indices_[idx]
        if self.max_length <= 0
        else self.corpus_indices_[idx][: self.max_length]
    )

    return out

__init__(self, corpus, word2idx=None, special_tokens=<enum 'SPECIAL_TOKENS'>, max_length=-1, **kwargs) special

Wrap a corpus that's already tokenized

Parameters:

Name Type Description Default
corpus Union[List[str], List[List[str]]]

List of tokens or List of lists of tokens

required
word2idx Dict[str, int]

Token to index mapping. Defaults to None.

None
special_tokens Optional[slp.config.nlp.SPECIAL_TOKENS]

Special Tokens. Defaults to SPECIAL_TOKENS.

<enum 'SPECIAL_TOKENS'>
Source code in slp/data/corpus.py
def __init__(
    self,
    corpus: Union[List[str], List[List[str]]],
    word2idx: Dict[str, int] = None,
    special_tokens: Optional[SPECIAL_TOKENS] = SPECIAL_TOKENS,  # type: ignore
    max_length: int = -1,
    **kwargs,
):
    """Wrap a corpus that's already tokenized

    Args:
        corpus (Union[List[str], List[List[str]]]): List of tokens or List of lists of tokens
        word2idx (Dict[str, int], optional): Token to index mapping. Defaults to None.
        special_tokens (Optional[SPECIAL_TOKENS], optional): Special Tokens. Defaults to SPECIAL_TOKENS.
    """
    self.corpus_ = corpus
    self.tokenized_corpus_ = corpus
    self.max_length = max_length

    self.vocab_ = create_vocab(
        self.tokenized_corpus_,
        vocab_size=-1,
        special_tokens=special_tokens,
    )

    if word2idx is not None:
        logger.info("Converting tokens to ids using word2idx.")
        self.word2idx_ = word2idx
    else:
        logger.info(
            "No word2idx provided. Will convert tokens to ids using an iterative counter."
        )
        self.word2idx_ = dict(zip(self.vocab_.keys(), itertools.count()))

    self.idx2word_ = {v: k for k, v in self.word2idx_.items()}

    self.to_token_ids = ToTokenIds(
        self.word2idx_,
        specials=SPECIAL_TOKENS,  # type: ignore
    )

    if isinstance(self.tokenized_corpus_[0], list):
        self.corpus_indices_ = [
            self.to_token_ids(s)
            for s in tqdm(
                self.tokenized_corpus_,
                desc="Converting tokens to token ids...",
                leave=False,
            )
        ]
    else:
        self.corpus_indices_ = self.to_token_ids(self.tokenized_corpus_)  # type: ignore
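
A short sketch of wrapping an already tokenized corpus; slp.data.corpus is assumed as the import path:

from slp.data.corpus import TokenizedCorpus

corpus = TokenizedCorpus([["the", "cat", "sat"], ["the", "dog", "ran"]])

print(corpus.word2idx)  # token -> id mapping (special tokens first, then corpus tokens)
print(corpus[0])        # token ids for the first sentence
print(len(corpus))      # 2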

__len__(self) special

Number of samples in corpus

Returns:

Type Description
int

int: Corpus length

Source code in slp/data/corpus.py
def __len__(self) -> int:
    """Number of samples in corpus

    Returns:
        int: Corpus length
    """

    return len(self.corpus_indices_)

WordCorpus

embeddings: ndarray property readonly

Retrieve embeddings array

Returns:

Type Description
ndarray

np.ndarray: Array of pretrained word embeddings

frequencies: Dict[str, int] property readonly

Retrieve word occurrence counts

Returns:

Type Description
Dict[str, int]

Dict[str, int]: word occurrence counts

idx2word: Dict[int, str] property readonly

Retrieve idx2word mapping

Returns:

Type Description
Dict[int, str]

Dict[int, str]: idx2word mapping

indices: List[List[int]] property readonly

Retrieve corpus as token indices

Returns:

Type Description
List[List[int]]

List[List[int]]: Token indices for corpus

raw: List[str] property readonly

Retrieve raw corpus

Returns:

Type Description
List[str]

List[str]: Raw Corpus

tokenized: List[List[str]] property readonly

Retrieve tokenized corpus

Returns:

Type Description
List[List[str]]

List[List[str]]: Tokenized corpus

vocab: Set[str] property readonly

Retrieve set of words in vocabulary

Returns:

Type Description
Set[str]

Set[str]: set of words in vocabulary

vocab_size: int property readonly

Retrieve vocabulary size for corpus

Returns:

Type Description
int

int: vocabulary size

word2idx: Dict[str, int] property readonly

Retrieve word2idx mapping

Returns:

Type Description
Dict[str, int]

Dict[str, int]: word2idx mapping

__getitem__(self, idx) special

Get ith element in corpus as token indices

Parameters:

Name Type Description Default
idx int

index in corpus

required

Returns:

Type Description
List[int]

List[int]: List of token indices for sentence

Source code in slp/data/corpus.py
def __getitem__(self, idx) -> List[int]:
    """Get ith element in corpus as token indices

    Args:
        idx (int): index in corpus

    Returns:
        List[int]: List of token indices for sentence
    """
    out: List[int] = (
        self.corpus_indices_[idx]
        if self.max_length <= 0
        else self.corpus_indices_[idx][: self.max_length]
    )

    return out

__init__(self, corpus, limit_vocab_size=30000, word2idx=None, idx2word=None, embeddings=None, embeddings_file=None, embeddings_dim=300, lower=True, special_tokens=<enum 'SPECIAL_TOKENS'>, prepend_bos=False, append_eos=False, lang='en_core_web_md', max_length=-1, **kwargs) special

Load corpus embeddings, tokenize into words using spacy and convert to ids

This class handles the preprocessing of a raw corpus:

  • Tokenization into words (spacy)
  • Loading of pretrained word embeddings
  • Calculation of word frequencies / corpus statistics
  • Conversion to token ids

You can either:

  • Pass an embeddings file to load pretrained embeddings and create the word2idx mapping
  • Pass an already loaded embeddings array and word2idx. This is useful for the dev / test splits, where we want to reuse the train split embeddings / word2idx.

Parameters:

Name Type Description Default
corpus List[str]

Corpus as a list of sentences

required
limit_vocab_size int

Upper bound for number of most frequent tokens to keep. Defaults to 30000.

30000
word2idx Optional[Dict[str, int]]

Mapping of word to indices. Defaults to None.

None
idx2word Optional[Dict[int, str]]

Mapping of indices to words. Defaults to None.

None
embeddings Optional[numpy.ndarray]

Embeddings array. Defaults to None.

None
embeddings_file Optional[str]

Embeddings file to read. Defaults to None.

None
embeddings_dim int

Dimension of embeddings. Defaults to 300.

300
lower bool

Convert strings to lower case. Defaults to True.

True
special_tokens Optional[slp.config.nlp.SPECIAL_TOKENS]

Special tokens to include in the vocabulary. Defaults to slp.config.nlp.SPECIAL_TOKENS.

<enum 'SPECIAL_TOKENS'>
prepend_bos bool

Prepend Beginning of Sequence token for seq2seq tasks. Defaults to False.

False
append_eos bool

Append End of Sequence token for seq2seq tasks. Defaults to False.

False
lang str

Spacy language, e.g. el_core_web_sm, en_core_web_sm etc. Defaults to "en_core_web_md".

'en_core_web_md'
max_length int

Crop sequences above this length. Defaults to -1 where sequences are left unaltered.

-1
Source code in slp/data/corpus.py
def __init__(
    self,
    corpus: List[str],
    limit_vocab_size: int = 30000,
    word2idx: Optional[Dict[str, int]] = None,
    idx2word: Optional[Dict[int, str]] = None,
    embeddings: Optional[np.ndarray] = None,
    embeddings_file: Optional[str] = None,
    embeddings_dim: int = 300,
    lower: bool = True,
    special_tokens: Optional[SPECIAL_TOKENS] = SPECIAL_TOKENS,  # type: ignore
    prepend_bos: bool = False,
    append_eos: bool = False,
    lang: str = "en_core_web_md",
    max_length: int = -1,
    **kwargs,
):
    """Load corpus embeddings, tokenize in words using spacy and convert to ids

    This class handles the preprocessing of a raw corpus:

    * Tokenization into words (spacy)
    * Loading of pretrained word embeddings
    * Calculation of word frequencies / corpus statistics
    * Conversion to token ids

    You can pass either:

    * Pass an embeddings file to load pretrained embeddings and create the word2idx mapping
    * Pass already loaded embeddings array and word2idx. This is useful for the dev / test splits
      where we want to pass the train split embeddings / word2idx.

    Args:
        corpus (List[List[str]]): Corpus as a list of sentences
        limit_vocab_size (int): Upper bound for number of most frequent tokens to keep. Defaults to 30000.
        word2idx (Optional[Dict[str, int]]): Mapping of word to indices. Defaults to None.
        idx2word (Optional[Dict[int, str]]): Mapping of indices to words. Defaults to None.
        embeddings (Optional[np.ndarray]): Embeddings array. Defaults to None.
        embeddings_file (Optional[str]): Embeddings file to read. Defaults to None.
        embeddings_dim (int): Dimension of embeddings. Defaults to 300.
        lower (bool): Convert strings to lower case. Defaults to True.
        special_tokens (Optional[SPECIAL_TOKENS]): Special tokens to include in the vocabulary.
             Defaults to slp.config.nlp.SPECIAL_TOKENS.
        prepend_bos (bool): Prepend Beginning of Sequence token for seq2seq tasks. Defaults to False.
        append_eos (bool): Append End of Sequence token for seq2seq tasks. Defaults to False.
        lang (str): Spacy language, e.g. el_core_web_sm, en_core_web_sm etc. Defaults to "en_core_web_md".
        max_length (int): Crop sequences above this length. Defaults to -1 where sequences are left unaltered.
    """
    # FIXME: Extract super class to avoid repetition
    self.corpus_ = corpus
    self.max_length = max_length
    self.tokenizer = SpacyTokenizer(
        lower=lower,
        prepend_bos=prepend_bos,
        append_eos=append_eos,
        specials=special_tokens,
        lang=lang,
    )

    logger.info(f"Tokenizing corpus using spacy {lang}")

    self.tokenized_corpus_ = [
        self.tokenizer(s)
        for s in tqdm(self.corpus_, desc="Tokenizing corpus...", leave=False)
    ]

    self.vocab_ = create_vocab(
        self.tokenized_corpus_,
        vocab_size=limit_vocab_size if word2idx is None else -1,
        special_tokens=special_tokens,
    )

    self.word2idx_, self.idx2word_, self.embeddings_ = None, None, None
    # self.corpus_indices_ = self.tokenized_corpus_

    if word2idx is not None:
        logger.info("Word2idx was already provided. Going to use it.")

    if embeddings_file is not None and word2idx is None:
        logger.info(
            f"Going to load {len(self.vocab_)} embeddings from {embeddings_file}"
        )
        loader = EmbeddingsLoader(
            embeddings_file,
            embeddings_dim,
            vocab=self.vocab_,
            extra_tokens=special_tokens,
        )
        word2idx, idx2word, embeddings = loader.load()

    if embeddings is not None:
        self.embeddings_ = embeddings

    if idx2word is not None:
        self.idx2word_ = idx2word

    if word2idx is not None:
        self.word2idx_ = word2idx

        logger.info("Converting tokens to ids using word2idx.")
        self.to_token_ids = ToTokenIds(
            self.word2idx_,
            specials=SPECIAL_TOKENS,  # type: ignore
        )

        self.corpus_indices_ = [
            self.to_token_ids(s)
            for s in tqdm(
                self.tokenized_corpus_,
                desc="Converting tokens to token ids...",
                leave=False,
            )
        ]

        logger.info("Filtering corpus vocabulary.")

        updated_vocab = {}

        for k, v in self.vocab_.items():
            if k in self.word2idx_:
                updated_vocab[k] = v

        logger.info(
            f"Out of {len(self.vocab_)} tokens {len(self.vocab_) - len(updated_vocab)} were not found in the pretrained embeddings."
        )

        self.vocab_ = updated_vocab
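
A sketch of the train / dev pattern described above: build word2idx and embeddings from the train split, then reuse them for the dev split. The GloVe path is a placeholder, the default en_core_web_md spacy model is assumed to be installed, and the import path is assumed from the source file location above:

from slp.data.corpus import WordCorpus

train_corpus = WordCorpus(
    ["a really great movie", "utterly boring"],
    embeddings_file="glove.6B.50d.txt",
    embeddings_dim=50,
)

dev_corpus = WordCorpus(
    ["surprisingly good"],
    word2idx=train_corpus.word2idx,
    idx2word=train_corpus.idx2word,
    embeddings=train_corpus.embeddings,
)

print(train_corpus.embeddings.shape)  # (vocabulary size, 50)
print(dev_corpus[0])                  # token ids resolved with the train split's word2idx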

__len__(self) special

Number of samples in corpus

Returns:

Type Description
int

int: Corpus length

Source code in slp/data/corpus.py
def __len__(self) -> int:
    """Number of samples in corpus

    Returns:
        int: Corpus length
    """

    return len(self.corpus_indices_)

create_vocab(corpus, vocab_size=-1, special_tokens=None)

Create the vocabulary based on tokenized input corpus

  • Injects special tokens in the vocabulary
  • Calculates the occurrence count for each token
  • Limits vocabulary to vocab_size most common tokens

Parameters:

Name Type Description Default
corpus Union[List[str], List[List[str]]]

The tokenized corpus as a list of sentences or a list of tokenized sentences

required
vocab_size int

Limit vocabulary to the vocab_size most common tokens. Defaults to -1, which keeps all tokens.

-1
special_tokens Optional[slp.config.nlp.SPECIAL_TOKENS]

Special tokens to include in the vocabulary. Defaults to None.

None

Returns:

Type Description
Dict[str, int]

Dict[str, int]: Dictionary of all accepted tokens and their corresponding occurrence counts

Examples:

>>> create_vocab(["in", "a", "galaxy", "far", "far", "away"])
{'far': 2, 'away': 1, 'galaxy': 1, 'a': 1, 'in': 1}
>>> create_vocab(["in", "a", "galaxy", "far", "far", "away"], vocab_size=3)
{'far': 2, 'a': 1, 'in': 1}
>>> create_vocab(["in", "a", "galaxy", "far", "far", "away"], vocab_size=3, special_tokens=slp.config.nlp.SPECIAL_TOKENS)
{'[PAD]': 0, '[MASK]': 0, '[UNK]': 0, '[BOS]': 0, '[EOS]': 0, '[CLS]': 0, '[SEP]': 0, 'far': 2, 'a': 1, 'in': 1}
Source code in slp/data/corpus.py
def create_vocab(
    corpus: Union[List[str], List[List[str]]],
    vocab_size: int = -1,
    special_tokens: Optional[SPECIAL_TOKENS] = None,
) -> Dict[str, int]:
    """Create the vocabulary based on tokenized input corpus

    * Injects special tokens in the vocabulary
    * Calculates the occurrence count for each token
    * Limits vocabulary to vocab_size most common tokens

    Args:
        corpus (Union[List[str], List[List[str]]]): The tokenized corpus as a list of sentences or a list of tokenized sentences
        vocab_size (int): Limit vocabulary to the vocab_size most common tokens.
            Defaults to -1, which keeps all tokens.
        special_tokens (Optional[SPECIAL_TOKENS]): Special tokens to include in the vocabulary. Defaults to None.

    Returns:
        Dict[str, int]: Dictionary of all accepted tokens and their corresponding occurrence counts

    Examples:
        >>> create_vocab(["in", "a", "galaxy", "far", "far", "away"])
        {'far': 2, 'away': 1, 'galaxy': 1, 'a': 1, 'in': 1}
        >>> create_vocab(["in", "a", "galaxy", "far", "far", "away"], vocab_size=3)
        {'far': 2, 'a': 1, 'in': 1}
        >>> create_vocab(["in", "a", "galaxy", "far", "far", "away"], vocab_size=3, special_tokens=slp.config.nlp.SPECIAL_TOKENS)
        {'[PAD]': 0, '[MASK]': 0, '[UNK]': 0, '[BOS]': 0, '[EOS]': 0, '[CLS]': 0, '[SEP]': 0, 'far': 2, 'a': 1, 'in': 1}
    """

    if isinstance(corpus[0], list):
        corpus = list(itertools.chain.from_iterable(corpus))
    freq = Counter(corpus)

    if special_tokens is None:
        extra_tokens = []
    else:
        extra_tokens = special_tokens.to_list()

    if vocab_size < 0:
        vocab_size = len(freq)
    take = min(vocab_size, len(freq))
    logger.info(f"Keeping {vocab_size} most common tokens out of {len(freq)}")

    def take0(x: Tuple[Any, Any]) -> Any:
        """Take first tuple element"""

        return x[0]

    common_words = list(map(take0, freq.most_common(take)))
    common_words = list(set(common_words) - set(extra_tokens))
    words = extra_tokens + common_words

    if len(words) > vocab_size:
        words = words[: vocab_size + len(extra_tokens)]

    def token_freq(t):
        """Token frequency"""

        return 0 if t in extra_tokens else freq[t]

    vocab = dict(zip(words, map(token_freq, words)))
    logger.info(f"Vocabulary created with {len(vocab)} tokens.")
    logger.info(f"The 10 most common tokens are:\n{freq.most_common(10)}")

    return vocab

CorpusDataset

__getitem__(self, idx) special

Get a processed sentence and its label from the corpus

Parameters:

Name Type Description Default
idx int

Sample index in the corpus

required

Returns:

Type Description
Tuple[torch.Tensor, torch.Tensor]

(processed sentence, label)

Source code in slp/data/datasets.py
def __getitem__(self, idx):
    """Get a processed sentence and its label from the corpus

    Args:
        idx (int): Sample index in the corpus

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: (processed sentence, label)
    """
    text, target = self.corpus[idx], self.labels[idx]
    if self.label_encoder is not None:
        target = self.label_encoder.transform([target])[0]
    for t in self.transforms:
        text = t(text)
    return text, target

__init__(self, corpus, labels) special

Labeled corpus dataset

Parameters:

Name Type Description Default
corpus WordCorpus, HfCorpus etc.

Input corpus

required
labels List[Any]

Labels for examples

required
Source code in slp/data/datasets.py
def __init__(self, corpus, labels):
    """Labeled corpus dataset

    Args:
        corpus (WordCorpus, HfCorpus etc..): Input corpus
        labels (List[Any]): Labels for examples
    """
    self.corpus = corpus
    self.labels = labels
    assert len(self.labels) == len(self.corpus), "Incompatible labels and corpus"
    self.transforms = []
    self.label_encoder = None
    if isinstance(self.labels[0], str):
        self.label_encoder = LabelEncoder().fit(self.labels)

__len__(self) special

Length of corpus

Returns:

Type Description
int

Corpus Length

Source code in slp/data/datasets.py
def __len__(self):
    """Length of corpus

    Returns:
        int: Corpus Length
    """
    return len(self.corpus)

map(self, t)

Append a transform to self.transforms, in order to be applied to the data

Parameters:

Name Type Description Default
t Callable[[str], Any]

Transform of input token

required

Returns:

Type Description
CorpusDataset

self

Source code in slp/data/datasets.py
def map(self, t):
    """Append a transform to self.transforms, in order to be applied to the data

    Args:
        t (Callable[[str], Any]): Transform of input token

    Returns:
        CorpusDataset: self
    """
    self.transforms.append(t)
    return self
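
A short sketch pairing a corpus with labels and applying ToTensor per sample through map(); import paths are assumed from the source file locations shown on this page:

from slp.data.corpus import TokenizedCorpus
from slp.data.datasets import CorpusDataset
from slp.data.transforms import ToTensor

corpus = TokenizedCorpus([["a", "great", "movie"], ["so", "boring"]])
dataset = CorpusDataset(corpus, ["positive", "negative"]).map(ToTensor())

text, label = dataset[0]
print(text)   # tensor of token ids for the first sentence
print(label)  # integer label produced by the internal LabelEncoder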

CorpusLMDataset

__getitem__(self, idx) special

Get a source and target token from the corpus

Parameters:

Name Type Description Default
idx int

Token position

required

Returns:

Type Description
Tuple[torch.Tensor, torch.Tensor]

source=corpus[idx], target=corpus[idx+1]

Source code in slp/data/datasets.py
def __getitem__(self, idx):
    """Get a source and target token from the corpus

    Args:
        idx (int): Token position

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: source=corpus[idx], target=corpus[idx+1]
    """
    src, tgt = self.source[idx], self.target[idx]
    for t in self.transforms:
        src = t(src)
        tgt = t(tgt)
    return src, tgt

__init__(self, corpus) special

Wraps a tokenized dataset which is provided as a list of tokens

Targets = source shifted one token to the left (next token prediction)

Parameters:

Name Type Description Default
corpus List[str] or WordCorpus

List of tokens

required
Source code in slp/data/datasets.py
def __init__(self, corpus):
    """Wraps a tokenized dataset which is provided as a list of tokens

    Targets = source shifted one token to the left (next token prediction)

    Args:
        corpus (List[str] or WordCorpus): List of tokens
    """
    self.source = corpus[:-1]
    self.target = corpus[1:]
    self.transforms = []

__len__(self) special

Length of corpus

Returns:

Type Description
int

Corpus Length

Source code in slp/data/datasets.py
def __len__(self):
    """Length of corpus

    Returns:
        int: Corpus Length
    """
    return int(len(self.source))

map(self, t)

Append a transform to self.transforms, in order to be applied to the data

Parameters:

Name Type Description Default
t Callable[[str], Any]

Transform of input token

required

Returns:

Type Description
CorpusLMDataset

self

Source code in slp/data/datasets.py
def map(self, t):
    """Append a transform to self.transforms, in order to be applied to the data

    Args:
        t (Callable[[str], Any]): Transform of input token

    Returns:
        CorpusLMDataset: self
    """
    self.transforms.append(t)
    return self
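
A short sketch of the next-token prediction pairs produced over a token stream; the import path is assumed from the source file location above:

from slp.data.datasets import CorpusLMDataset

stream = ["in", "a", "galaxy", "far", "far", "away"]
dataset = CorpusLMDataset(stream)

print(len(dataset))  # 5
print(dataset[0])    # ('in', 'a') -> a source token and its next-token target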

HuggingFaceTokenizer

__call__(self, x) special

Call to tokenize function

Parameters:

Name Type Description Default
x str

Input string

required

Returns:

Type Description
List[int]

List[int]: List of token ids

Source code in slp/data/transforms.py
def __call__(self, x: str) -> List[int]:
    """Call to tokenize function

    Args:
        x (str): Input string

    Returns:
        List[int]: List of token ids
    """
    out: List[int] = self.tokenizer.encode(
        x, add_special_tokens=self.add_special_tokens, max_length=65536
    )
    return out

__init__(self, lower=True, model='bert-base-uncased', add_special_tokens=True) special

Apply one of the Hugging Face tokenizers to a string

Parameters:

Name Type Description Default
lower bool

Lowercase string. Defaults to True.

True
model str

Select transformer model. Defaults to "bert-base-uncased".

'bert-base-uncased'
add_special_tokens bool

Insert special tokens to tokenized string. Defaults to True.

True
Source code in slp/data/transforms.py
def __init__(
    self,
    lower: bool = True,
    model: str = "bert-base-uncased",
    add_special_tokens: bool = True,
):
    """Apply one of huggingface tokenizers to a string

    Args:
        lower (bool): Lowercase string. Defaults to True.
        model (str): Select transformer model. Defaults to "bert-base-uncased".
        add_special_tokens (bool): Insert special tokens to tokenized string. Defaults to True.
    """
    self.tokenizer = AutoTokenizer.from_pretrained(model, do_lower_case=lower)
    self.vocab_size = len(self.tokenizer.vocab)
    self.add_special_tokens = add_special_tokens

detokenize(self, x)

Convert list of token ids to list of tokens

Parameters:

Name Type Description Default
x List[int]

List of token ids

required

Returns:

Type Description
List[str]

List[str]: List of tokens

Source code in slp/data/transforms.py
def detokenize(self, x: List[int]) -> List[str]:
    """Convert list of token ids to list of tokens

    Args:
        x (List[int]): List of token ids

    Returns:
        List[str]: List of tokens
    """
    out: List[str] = self.tokenizer.convert_ids_to_tokens(x)
    return out
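
A round-trip sketch: encode a string to wordpiece ids, then map the ids back to wordpieces. The import path is assumed from the source file location above, and the tokenizer is downloaded by transformers:

from slp.data.transforms import HuggingFaceTokenizer

tokenizer = HuggingFaceTokenizer(model="bert-base-uncased")

ids = tokenizer("a galaxy far, far away")
print(ids)                        # e.g. [101, ..., 102] with special tokens added
print(tokenizer.detokenize(ids))  # e.g. ['[CLS]', 'a', 'galaxy', 'far', ',', 'far', 'away', '[SEP]']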

ReplaceUnknownToken

__call__(self, x) special

Convert <unk> in a list of tokens to [UNK]

Parameters:

Name Type Description Default
x List[str]

List of tokens

required

Returns:

Type Description
List[str]

List[str]: List of tokens

Source code in slp/data/transforms.py
def __call__(self, x: List[str]) -> List[str]:
    """Convert <unk> in list of tokens to [UNK]

    Args:
        x (List[str]): List of tokens

    Returns:
        List[str]: List of tokens
    """
    return [w if w != self.old_unk else self.new_unk for w in x]

__init__(self, old_unk='<unk>', new_unk='[UNK]') special

Replace existing unknown tokens in the corpus with [UNK]. Useful for wikitext

Parameters:

Name Type Description Default
old_unk str

Unk token in corpus. Defaults to "<unk>".

'<unk>'
new_unk str

Desired unk value. Defaults to SPECIAL_TOKENS.UNK.value.

'[UNK]'
Source code in slp/data/transforms.py
def __init__(
    self,
    old_unk: str = "<unk>",
    new_unk: str = SPECIAL_TOKENS.UNK.value,  # type: ignore
):
    """Replace existing unknown tokens in the vocab to [UNK]. Useful for wikitext

    Args:
        old_unk (str): Unk token in corpus. Defaults to "<unk>".
        new_unk (str): Desired unk value. Defaults to SPECIAL_TOKENS.UNK.value.
    """
    self.old_unk = old_unk
    self.new_unk = new_unk
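
A one-line sketch of the transform, assuming slp.data.transforms as the import path:

from slp.data.transforms import ReplaceUnknownToken

print(ReplaceUnknownToken()(["the", "<unk>", "sat"]))  # ['the', '[UNK]', 'sat']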

SentencepieceTokenizer

__call__(self, x) special

Call to tokenize function

Parameters:

Name Type Description Default
x str

Input string

required

Returns:

Type Description
List[int]

List[int]: List of tokens ids

Source code in slp/data/transforms.py
def __call__(self, x: str) -> List[int]:
    """Call to tokenize function

    Args:
        x (str): Input string

    Returns:
        List[int]: List of tokens ids
    """
    if self.lower:
        x = x.lower()
    ids: List[int] = self.pre_id + self.tokenizer.encode_as_ids(x) + self.post_id
    return ids

__init__(self, lower=True, model=None, prepend_bos=False, append_eos=False, specials=<enum 'SPECIAL_TOKENS'>) special

Tokenize sentence using pretrained sentencepiece model

Parameters:

Name Type Description Default
lower bool

Lowercase string. Defaults to True.

True
model Optional[Any]

Sentencepiece model. Defaults to None.

None
prepend_bos bool

Prepend BOS for seq2seq. Defaults to False.

False
append_eos bool

Append EOS for seq2seq. Defaults to False.

False
specials Optional[slp.config.nlp.SPECIAL_TOKENS]

Special tokens. Defaults to SPECIAL_TOKENS.

<enum 'SPECIAL_TOKENS'>
Source code in slp/data/transforms.py
def __init__(
    self,
    lower: bool = True,
    model: Optional[Any] = None,
    prepend_bos: bool = False,
    append_eos: bool = False,
    specials: Optional[SPECIAL_TOKENS] = SPECIAL_TOKENS,  # type: ignore
):
    """Tokenize sentence using pretrained sentencepiece model

    Args:
        lower (bool): Lowercase string. Defaults to True.
        model (Optional[Any]): Sentencepiece model. Defaults to None.
        prepend_bos (bool): Prepend BOS for seq2seq. Defaults to False.
        append_eos (bool): Append EOS for seq2seq. Defaults to False.
        specials (Optional[SPECIAL_TOKENS]): Special tokens. Defaults to SPECIAL_TOKENS.
    """
    self.tokenizer = spm.SentencePieceProcessor()
    self.tokenizer.Load(model)
    self.specials = specials
    self.lower = lower
    self.vocab_size = self.tokenizer.get_piece_size()
    self.pre_id = []
    self.post_id = []
    if prepend_bos:
        self.pre_id.append(self.tokenizer.piece_to_id(self.specials.BOS.value))  # type: ignore
    if append_eos:
        self.post_id.append(self.tokenizer.piece_to_id(self.specials.EOS.value))  # type: ignore

SpacyTokenizer

__call__(self, x) special

Call to tokenize function

Parameters:

Name Type Description Default
x str

Input string

required

Returns:

Type Description
List[str]

List[str]: List of tokens

Source code in slp/data/transforms.py
def __call__(self, x: str) -> List[str]:
    """Call to tokenize function

    Args:
        x (str): Input string

    Returns:
        List[str]: List of tokens
    """
    if self.lower:
        x = x.lower()
    out: List[str] = (
        self.pre_id + [y.text for y in self.nlp.tokenizer(x)] + self.post_id
    )
    return out

__init__(self, lower=True, prepend_bos=False, append_eos=False, specials=<enum 'SPECIAL_TOKENS'>, lang='en_core_web_sm') special

Apply the spacy tokenizer to a string

Parameters:

Name Type Description Default
lower bool

Lowercase string. Defaults to True.

True
prepend_bos bool

Prepend BOS for seq2seq. Defaults to False.

False
append_eos bool

Append EOS for seq2seq. Defaults to False.

False
specials Optional[slp.config.nlp.SPECIAL_TOKENS]

Special tokens. Defaults to SPECIAL_TOKENS.

<enum 'SPECIAL_TOKENS'>
lang str

Spacy language, e.g. el_core_web_sm, en_core_web_sm etc. Defaults to "en_core_web_sm".

'en_core_web_sm'
Source code in slp/data/transforms.py
def __init__(
    self,
    lower: bool = True,
    prepend_bos: bool = False,
    append_eos: bool = False,
    specials: Optional[SPECIAL_TOKENS] = SPECIAL_TOKENS,  # type: ignore
    lang: str = "en_core_web_sm",
):
    """Apply spacy tokenizer to str

    Args:
        lower (bool): Lowercase string. Defaults to True.
        prepend_bos (bool): Prepend BOS for seq2seq. Defaults to False.
        append_eos (bool): Append EOS for seq2seq. Defaults to False.
        specials (Optional[SPECIAL_TOKENS]): Special tokens. Defaults to SPECIAL_TOKENS.
        lang (str): Spacy language, e.g. el_core_web_sm, en_core_web_sm etc. Defaults to "en_core_web_sm".
    """
    self.lower = lower
    self.specials = SPECIAL_TOKENS
    self.lang = lang
    self.pre_id = []
    self.post_id = []
    if prepend_bos:
        self.pre_id.append(self.specials.BOS.value)
    if append_eos:
        self.post_id.append(self.specials.EOS.value)
    self.nlp = self.get_nlp(name=lang, specials=specials)

get_nlp(self, name='en_core_web_sm', specials=<enum 'SPECIAL_TOKENS'>)

Get spacy nlp object for given lang and add SPECIAL_TOKENS

Parameters:

Name Type Description Default
name str

Spacy language, e.g. el_core_web_sm, en_core_web_sm etc. Defaults to "en_core_web_sm".

'en_core_web_sm'
specials Optional[slp.config.nlp.SPECIAL_TOKENS]

Special tokens. Defaults to SPECIAL_TOKENS.

<enum 'SPECIAL_TOKENS'>

Returns:

Type Description
Language

spacy.Language: spacy text-processing pipeline

Source code in slp/data/transforms.py
def get_nlp(
    self,
    name: str = "en_core_web_sm",
    specials: Optional[SPECIAL_TOKENS] = SPECIAL_TOKENS,  # type: ignore
) -> spacy.Language:
    """Get spacy nlp object for given lang and add SPECIAL_TOKENS

    Args:
        name (str): Spacy language, e.g. el_core_web_sm, en_core_web_sm etc. Defaults to "en_core_web_sm".
        specials (Optional[SPECIAL_TOKENS]): Special tokens. Defaults to SPECIAL_TOKENS.

    Returns:
        spacy.Language: spacy text-processing pipeline
    """
    nlp = spacy.load(name)
    if specials is not None:
        for token in specials.to_list():
            nlp.tokenizer.add_special_case(token, [{ORTH: token}])
    return nlp
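
A short sketch of word tokenization with optional BOS / EOS markers, assuming the en_core_web_sm spacy model is installed and slp.data.transforms is the import path:

from slp.data.transforms import SpacyTokenizer

tokenizer = SpacyTokenizer(prepend_bos=True, append_eos=True, lang="en_core_web_sm")

print(tokenizer("A galaxy far, far away"))
# ['[BOS]', 'a', 'galaxy', 'far', ',', 'far', 'away', '[EOS]']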

ToTensor

__call__(self, x) special

Convert list of tokens or list of features to tensor

Parameters:

Name Type Description Default
x List[Any]

List of tokens or features

required

Returns:

Type Description
Tensor

torch.Tensor: Resulting tensor

Source code in slp/data/transforms.py
def __call__(self, x: List[Any]) -> torch.Tensor:
    """Convert list of tokens or list of features to tensor

    Args:
        x (List[Any]): List of tokens or features

    Returns:
        torch.Tensor: Resulting tensor
    """
    return mktensor(x, device=self.device, dtype=self.dtype)

__init__(self, device='cpu', dtype=torch.int64) special

To tensor convertor

Parameters:

Name Type Description Default
device str

Device to map the tensor. Defaults to "cpu".

'cpu'
dtype dtype

Type of resulting tensor. Defaults to torch.long.

torch.int64
Source code in slp/data/transforms.py
def __init__(self, device: str = "cpu", dtype: torch.dtype = torch.long):
    """To tensor convertor

    Args:
        device (str): Device to map the tensor. Defaults to "cpu".
        dtype (torch.dtype): Type of resulting tensor. Defaults to torch.long.
    """
    self.device = device
    self.dtype = dtype

ToTokenIds

__call__(self, x) special

Convert list of tokens to list of token ids

Parameters:

Name Type Description Default
x List[str]

List of tokens

required

Returns:

Type Description
List[int]

List[int]: List of token ids

Source code in slp/data/transforms.py
def __call__(self, x: List[str]) -> List[int]:
    """Convert list of tokens to list of token ids

    Args:
        x (List[str]): List of tokens

    Returns:
        List[int]: List of token ids
    """
    return [
        self.word2idx[w] if w in self.word2idx else self.word2idx[self.unk_value]
        for w in x
    ]

__init__(self, word2idx, specials=<enum 'SPECIAL_TOKENS'>) special

Convert a list of tokens to a list of token ids

Parameters:

Name Type Description Default
word2idx Dict[str, int]

Word to index mapping

required
specials Optional[slp.config.nlp.SPECIAL_TOKENS]

Special tokens. Defaults to SPECIAL_TOKENS.

<enum 'SPECIAL_TOKENS'>
Source code in slp/data/transforms.py
def __init__(
    self,
    word2idx: Dict[str, int],
    specials: Optional[SPECIAL_TOKENS] = SPECIAL_TOKENS,  # type: ignore
):
    """Convert List of tokens to list of token ids

    Args:
        word2idx (Dict[str, int]): Word to index mapping
        specials (Optional[SPECIAL_TOKENS]): Special tokens. Defaults to SPECIAL_TOKENS.
    """
    self.word2idx = word2idx
    self.unk_value = specials.UNK.value if specials is not None else "[UNK]"  # type: ignore
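
A short sketch chaining the transforms: tokens are mapped to ids (out-of-vocabulary words fall back to the [UNK] id) and the ids are converted to a tensor. The word2idx below is a toy mapping, and the import path is assumed from the source file location above:

from slp.data.transforms import ToTensor, ToTokenIds

word2idx = {"[PAD]": 0, "[UNK]": 1, "far": 2, "away": 3}
to_ids = ToTokenIds(word2idx)
to_tensor = ToTensor()

tokens = ["far", "far", "away", "galaxy"]
print(to_tensor(to_ids(tokens)))  # tensor([2, 2, 3, 1]) -> 'galaxy' maps to [UNK]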