Source code for fastNLP.io.embed_loader

import numpy as np
import torch

from fastNLP.core.vocabulary import Vocabulary
from fastNLP.io.base_loader import BaseLoader


class EmbedLoader(BaseLoader):
    """Load pre-trained word embeddings and align them with a Vocabulary."""

    def __init__(self):
        super(EmbedLoader, self).__init__()

    @staticmethod
    def _load_glove(emb_file):
        """Read a file in glove embedding format.

        File format: one embedding per line; within a line, the word and the float values
        are separated by spaces.

        Example::

            word_1 float_1 float_2 ... float_emb_dim
            word_2 float_1 float_2 ... float_emb_dim
            ...

        """
        emb = {}
        with open(emb_file, 'r', encoding='utf-8') as f:
            for line in f:
                line = list(filter(lambda w: len(w) > 0, line.strip().split(' ')))
                # skip malformed or header lines that carry fewer than two values
                if len(line) > 2:
                    emb[line[0]] = torch.Tensor(list(map(float, line[1:])))
        return emb

    @staticmethod
    def _load_pretrain(emb_file, emb_type):
        """Read txt data from the embedding file and convert it to a pre-trained embedding.

        :param str emb_file: the pre-trained embedding file path
        :param str emb_type: the pre-trained embedding data format
        :return: a dict of ``{str: torch.Tensor}``
        """
        if emb_type == 'glove':
            return EmbedLoader._load_glove(emb_file)
        else:
            raise Exception("embedding type {} is not supported yet".format(emb_type))
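    # A minimal sketch of what ``_load_glove`` produces (the file name and words
    # below are illustrative assumptions, not part of fastNLP): given a file
    # "toy_glove.txt" containing
    #     the 0.1 0.2 0.3
    #     cat 0.4 0.5 0.6
    # it returns {"the": tensor([0.1, 0.2, 0.3]), "cat": tensor([0.4, 0.5, 0.6])}.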
    @staticmethod
    def load_embedding(emb_dim, emb_file, emb_type, vocab):
        """Load the pre-trained embedding and combine it with the given vocabulary.

        :param int emb_dim: the dimension of the embedding. Should match the pre-trained embedding.
        :param str emb_file: the pre-trained embedding file path.
        :param str emb_type: the pre-trained embedding format; only glove is supported for now.
        :param Vocabulary vocab: a mapping from word to index. Can be provided by the user or built from the pre-trained embedding.
        :return (embedding_tensor, vocab):
            embedding_tensor - Tensor of shape (len(vocab), emb_dim);
            vocab - the input vocab, or a vocab built from the pre-trained embedding
        """
        pretrain = EmbedLoader._load_pretrain(emb_file, emb_type)
        if vocab is None:
            # build the vocabulary from the pre-trained embedding
            vocab = Vocabulary()
            for w in pretrain.keys():
                vocab.add(w)
        # words missing from the pre-trained embedding keep randomly initialized vectors
        embedding_tensor = torch.randn(len(vocab), emb_dim)
        for w, v in pretrain.items():
            if len(v.shape) > 1 or emb_dim != v.shape[0]:
                raise ValueError(
                    "Pre-trained embedding dim is {}. Dimension mismatched. Required {}.".format(v.shape, (emb_dim,)))
            if vocab.has_word(w):
                embedding_tensor[vocab[w]] = v
        return embedding_tensor, vocab
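    # A minimal usage sketch (the file name "glove.6B.50d.txt" is an assumption
    # for illustration, not part of the original module): passing vocab=None
    # builds the vocabulary from the embedding file itself.
    #
    #     embedding_tensor, vocab = EmbedLoader.load_embedding(
    #         emb_dim=50, emb_file="glove.6B.50d.txt", emb_type="glove", vocab=None)
    #     assert embedding_tensor.shape == (len(vocab), 50)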
    @staticmethod
    def parse_glove_line(line):
        """Parse one line of a glove-format file into a (word, vector) pair."""
        line = list(filter(lambda w: len(w) > 0, line.strip().split(" ")))
        if len(line) <= 2:
            raise RuntimeError("something went wrong when parsing the glove embedding file")
        return line[0], torch.Tensor(list(map(float, line[1:])))
    @staticmethod
    def fast_load_embedding(emb_dim, emb_file, vocab):
        """Fast-load the pre-trained embedding and combine it with the given vocabulary.

        This method reads the embedding file line by line, so the whole file never has to be held in memory at once.

        :param int emb_dim: the dimension of the embedding. Should match the pre-trained embedding.
        :param str emb_file: the pre-trained embedding file path.
        :param Vocabulary vocab: a mapping from word to index. Must be provided by the user.
        :return embedding_matrix: numpy.ndarray of shape (len(vocab), emb_dim)
        """
        if vocab is None:
            raise RuntimeError("You must provide a vocabulary.")
        embedding_matrix = np.zeros(shape=(len(vocab), emb_dim))
        hit_flags = np.zeros(shape=(len(vocab),), dtype=int)
        with open(emb_file, "r", encoding="utf-8") as f:
            for line in f:
                word, vector = EmbedLoader.parse_glove_line(line)
                if word in vocab:
                    if len(vector.shape) > 1 or emb_dim != vector.shape[0]:
                        raise ValueError(
                            "Pre-trained embedding dim is {}. Expected {}.".format(vector.shape, (emb_dim,)))
                    embedding_matrix[vocab[word]] = vector
                    hit_flags[vocab[word]] = 1

        if np.sum(hit_flags) < len(vocab):
            # some words in the vocab are missing from the pre-trained embedding;
            # sample their vectors dimension-wise from a normal distribution
            # fitted to the vectors that were found
            vocab_embed = embedding_matrix[np.where(hit_flags)]
            sampled_vectors = np.random.normal(vocab_embed.mean(axis=0), vocab_embed.std(axis=0),
                                               size=(len(vocab) - np.sum(hit_flags), emb_dim))
            embedding_matrix[np.where(1 - hit_flags)] = sampled_vectors
        return embedding_matrix
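
A minimal usage sketch for fast_load_embedding, not part of the original module: the example words and the glove file path "glove.6B.50d.txt" are assumptions made for illustration; any glove-format file works.

    from fastNLP.core.vocabulary import Vocabulary
    from fastNLP.io.embed_loader import EmbedLoader

    # Build a tiny vocabulary from a handful of tokens.
    vocab = Vocabulary()
    for word in ["the", "cat", "sat", "on", "the", "mat"]:
        vocab.add(word)

    # Fill an embedding matrix from a glove-format file; words absent from the
    # file are sampled from a normal distribution fitted to the found vectors.
    embedding_matrix = EmbedLoader.fast_load_embedding(
        emb_dim=50, emb_file="glove.6B.50d.txt", vocab=vocab)
    print(embedding_matrix.shape)  # (len(vocab), 50)

Note that len(vocab) may be larger than the number of added words, since Vocabulary can reserve indices for special tokens.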