from collections import Counter
from functools import wraps
def check_build_vocab(func):
    """A decorator to make sure the indexing is built before it is used.

    The instance the wrapped method is bound to must provide ``word2idx``,
    ``rebuild`` and ``build_vocab()``. ``build_vocab()`` is called before
    ``func`` whenever ``word2idx`` is still ``None`` or ``rebuild`` is ``True``.

    :param func: the method to wrap
    :return: the wrapping method, carrying ``func``'s name and docstring
    """
    @wraps(func)  # keep the wrapped method's __name__/__doc__ for help() and docs
    def _wrapper(self, *args, **kwargs):
        if self.word2idx is None or self.rebuild is True:
            self.build_vocab()
        return func(self, *args, **kwargs)

    return _wrapper
def check_build_status(func):
    """A decorator to check whether the vocabulary updates after the last build.

    The instance the wrapped method is bound to must provide ``rebuild``,
    ``max_size`` and ``word_count``. If the vocabulary was already built
    (``rebuild`` is ``False``) it is flagged for rebuilding, and a warning is
    printed when ``word_count`` already holds ``max_size`` or more entries.

    :param func: the method to wrap
    :return: the wrapping method, carrying ``func``'s name and docstring
    """
    @wraps(func)  # keep the wrapped method's __name__/__doc__ for help() and docs
    def _wrapper(self, *args, **kwargs):
        if self.rebuild is False:
            # The word counts are about to change, so the built index is stale.
            self.rebuild = True
            if self.max_size is not None and len(self.word_count) >= self.max_size:
                print("[Warning] Vocabulary has reached the max size {} when calling {} method. "
                      "Adding more words may cause unexpected behaviour of Vocabulary. ".format(
                          self.max_size, func.__name__))
        return func(self, *args, **kwargs)

    return _wrapper
class Vocabulary(object):
    """Use for word and index one to one mapping

    Words are counted as they are added. The index mappings are (re)built on
    demand by the lookup methods whenever words were added since the last
    build.

    Example::

        vocab = Vocabulary()
        word_list = "this is a word list".split()
        vocab.update(word_list)
        vocab["word"]
        vocab.to_word(5)

    :param int max_size: set the max number of words in Vocabulary. Default: None
    :param int min_freq: set the min occur frequency of words in Vocabulary. Default: None
    :param str unknown: token whose index is returned for words that are not in
        the vocabulary; ``None`` disables it. Default: '<unk>'
    :param str padding: padding token; ``None`` disables it. Default: '<pad>'
    """

    def __init__(self, max_size=None, min_freq=None, unknown='<unk>', padding='<pad>'):
        self.max_size = max_size
        self.min_freq = min_freq
        self.word_count = Counter()
        self.unknown = unknown
        self.padding = padding
        # Both mappings stay None until the first build_vocab() call.
        self.word2idx = None
        self.idx2word = None
        # True whenever word_count changed since the last build.
        self.rebuild = True

    @check_build_status
    def update(self, word_lst):
        """Add a list of words into the vocabulary.

        :param list word_lst: a list of strings
        """
        self.word_count.update(word_lst)

    @check_build_status
    def add(self, word):
        """Add a single word into the vocabulary.

        :param str word: a word or token.
        """
        self.word_count[word] += 1

    @check_build_status
    def add_word(self, word):
        """Add a single word into the vocabulary.

        :param str word: a word or token.
        """
        self.add(word)

    @check_build_status
    def add_word_lst(self, word_lst):
        """Add a list of words into the vocabulary.

        :param list word_lst: a list of strings
        """
        self.update(word_lst)

    def build_vocab(self):
        """Build a mapping from word to index, and filter the word using ``max_size`` and ``min_freq``.

        The padding token (if any) comes first, then the unknown token (if any),
        then the counted words from most to least frequent. ``max_size`` limits
        how many of the most frequent words are considered; a falsy
        ``max_size`` (``None`` or 0) means no limit. ``min_freq`` is applied
        to the words that survive the ``max_size`` cut.
        """
        self.word2idx = {}
        # Give each special token the next free index rather than a fixed one:
        # a hard-coded index 1 for ``unknown`` collides with the first regular
        # word when ``padding`` is disabled (or equal to ``unknown``).
        for special in (self.padding, self.unknown):
            if special is not None and special not in self.word2idx:
                self.word2idx[special] = len(self.word2idx)

        limit = self.max_size if self.max_size else None
        words = self.word_count.most_common(limit)
        if self.min_freq is not None:
            words = [(w, c) for w, c in words if c >= self.min_freq]

        # Special tokens that were also counted keep their reserved index.
        new_words = [w for w, _ in words if w not in self.word2idx]
        start_idx = len(self.word2idx)
        self.word2idx.update({w: i + start_idx for i, w in enumerate(new_words)})
        self.build_reverse_vocab()
        self.rebuild = False

    def build_reverse_vocab(self):
        """Build "index to word" dict based on "word to index" dict.
        """
        self.idx2word = {i: w for w, i in self.word2idx.items()}

    @check_build_vocab
    def __len__(self):
        """Return the number of entries in the built word-to-index mapping."""
        return len(self.word2idx)

    @check_build_vocab
    def __contains__(self, item):
        """Check if a word in vocabulary.

        :param item: the word
        :return: True or False
        """
        return item in self.word2idx

    def has_word(self, w):
        """Check if a word in vocabulary; same as ``w in vocab``.

        :param w: the word
        :return: True or False
        """
        return self.__contains__(w)

    @check_build_vocab
    def __getitem__(self, w):
        """To support usage like::

            vocab[w]

        :param w: the word
        :return: the index of ``w``, or the index of the unknown token when
            ``w`` is not in the vocabulary
        :raises ValueError: if ``w`` is not in the vocabulary and no unknown
            token is configured
        """
        if w in self.word2idx:
            return self.word2idx[w]
        if self.unknown is not None:
            return self.word2idx[self.unknown]
        raise ValueError("word {} not in vocabulary".format(w))

    def to_index(self, w):
        """ Turn a word to an index. If w is not in Vocabulary, return the unknown label.

        :param str w: a word
        """
        return self.__getitem__(w)

    @property
    @check_build_vocab
    def unknown_idx(self):
        """Index of the unknown token, or ``None`` when it is disabled."""
        if self.unknown is None:
            return None
        return self.word2idx[self.unknown]

    @property
    @check_build_vocab
    def padding_idx(self):
        """Index of the padding token, or ``None`` when it is disabled."""
        if self.padding is None:
            return None
        return self.word2idx[self.padding]

    @check_build_vocab
    def to_word(self, idx):
        """given a word's index, return the word itself

        :param int idx: the index
        :return str word: the indexed word
        :raises KeyError: if ``idx`` is not a known index
        """
        return self.idx2word[idx]

    def __getstate__(self):
        """Use to prepare data for pickle.
        """
        state = self.__dict__.copy()
        # no need to pickle idx2word as it can be constructed from word2idx
        state.pop('idx2word', None)
        return state

    def __setstate__(self, state):
        """Use to restore state from pickle.
        """
        self.__dict__.update(state)
        if self.word2idx is None:
            # Pickled before the first build: there is nothing to reverse yet.
            self.idx2word = None
        else:
            self.build_reverse_vocab()