Source code for fastNLP.modules.encoder.masked_rnn

__author__ = 'max'

import torch
import torch.nn as nn
import torch.nn.functional as F

from fastNLP.modules.utils import initial_parameter


def MaskedRecurrent(reverse=False):
    def forward(input, hidden, cell, mask, train=True, dropout=0):
        """
        :param input:
        :param hidden:
        :param cell:
        :param mask:
        :param dropout: step之间的dropout,对mask了的也会drop,应该是没问题的,反正没有gradient
        :param train: 控制dropout的行为,在StackedRNN的forward中调用
        :return:
        """
        output = []
        steps = range(input.size(0) - 1, -1, -1) if reverse else range(input.size(0))
        for i in steps:
            if mask is None or mask[i].data.min() > 0.5:  # no mask, or every position at this step is unmasked
                hidden = cell(input[i], hidden)
            elif mask[i].data.max() > 0.5:  # partially masked: some positions are 0, but not all
                hidden_next = cell(input[i], hidden)  # feed the whole batch for this time step
                # hack to handle LSTM
                if isinstance(hidden, tuple):  # an LSTM cell returns a tuple of (hidden, cell)
                    mask = mask.float()
                    hx, cx = hidden
                    hp1, cp1 = hidden_next
                    hidden = (
                        hx + (hp1 - hx) * mask[i],
                        cx + (cp1 - cx) * mask[i])  # masked positions keep their previous state
                else:
                    hidden = hidden + (hidden_next - hidden) * mask[i]

            # if dropout != 0 and train:  # open question: should masked steps be treated differently under dropout?
            #     if isinstance(hidden, tuple):
            #         hidden = (F.dropout(hidden[0], p=dropout, training=train),
            #                   F.dropout(hidden[1], p=dropout, training=train))
            #     else:
            #         hidden = F.dropout(hidden, p=dropout, training=train)

            # hack to handle LSTM
            output.append(hidden[0] if isinstance(hidden, tuple) else hidden)

        if reverse:
            output.reverse()
        output = torch.cat(output, 0).view(input.size(0), *output[0].size())

        return hidden, output

    return forward
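

# A minimal sketch of the update rule used above: with mask value 1 the state
# advances to the newly computed value, with 0 it stays frozen at the previous
# value (the shapes here are illustrative, not taken from the original module).
def _masked_update_example():
    h_prev = torch.tensor([[1.0, 1.0], [2.0, 2.0]])  # (batch=2, hidden=2)
    h_new = torch.tensor([[5.0, 5.0], [7.0, 7.0]])   # candidate state from the cell
    m = torch.tensor([[1.0], [0.0]])                 # second sequence is padding at this step
    h = h_prev + (h_new - h_prev) * m
    # h -> [[5., 5.], [2., 2.]]: row 0 advanced, row 1 kept its old state
    return h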


def StackedRNN(inners, num_layers, lstm=False, train=True, step_dropout=0, layer_dropout=0):
    num_directions = len(inners)  # one inner recurrence per direction
    total_layers = num_layers * num_directions

    def forward(input, hidden, cells, mask):
        assert (len(cells) == total_layers)
        next_hidden = []

        if lstm:
            hidden = list(zip(*hidden))

        for i in range(num_layers):
            all_output = []
            for j, inner in enumerate(inners):
                l = i * num_directions + j
                hy, output = inner(input, hidden[l], cells[l], mask, train=train, dropout=step_dropout)
                next_hidden.append(hy)
                all_output.append(output)

            input = torch.cat(all_output, input.dim() - 1)  # becomes the input to the next layer

            if layer_dropout != 0 and i < num_layers - 1:
                input = F.dropout(input, p=layer_dropout, training=train, inplace=False)

        if lstm:
            next_h, next_c = zip(*next_hidden)
            next_hidden = (
                torch.cat(next_h, 0).view(total_layers, *next_h[0].size()),
                torch.cat(next_c, 0).view(total_layers, *next_c[0].size())
            )
        else:
            next_hidden = torch.cat(next_hidden, 0).view(total_layers, *next_hidden[0].size())

        return next_hidden, input

    return forward
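

# A minimal sketch (assumed sizes) of the cell list StackedRNN expects: cells are
# indexed layer-major, direction-minor (l = layer * num_directions + direction);
# the first layer consumes input_size, deeper layers consume hidden_size * num_directions.
# MaskedRNNBase below builds its cells in exactly this order.
def _build_cells_example(input_size=4, hidden_size=6, num_layers=2, num_directions=2):
    cells = []
    for layer in range(num_layers):
        for direction in range(num_directions):
            in_size = input_size if layer == 0 else hidden_size * num_directions
            cells.append(nn.GRUCell(in_size, hidden_size))
    return cells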


def AutogradMaskedRNN(num_layers=1, batch_first=False, train=True, layer_dropout=0, step_dropout=0,
                      bidirectional=False, lstm=False):
    rec_factory = MaskedRecurrent

    if bidirectional:
        layer = (rec_factory(), rec_factory(reverse=True))
    else:
        layer = (rec_factory(),)  # MaskedRecurrent defines the per-layer recurrence; StackedRNN stitches the layers together

    func = StackedRNN(layer,
                      num_layers,
                      lstm=lstm,
                      layer_dropout=layer_dropout, step_dropout=step_dropout,
                      train=train)

    def forward(input, cells, hidden, mask):
        if batch_first:
            input = input.transpose(0, 1)
            if mask is not None:
                mask = mask.transpose(0, 1)

        nexth, output = func(input, hidden, cells, mask)

        if batch_first:
            output = output.transpose(0, 1)

        return output, nexth

    return forward
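

# A minimal sketch (shapes assumed) of driving AutogradMaskedRNN directly with a
# single unidirectional nn.GRUCell; MaskedRNNBase.forward below wraps this same call chain.
def _autograd_masked_rnn_example():
    seq_len, batch, input_size, hidden_size = 5, 3, 4, 6
    cells = [nn.GRUCell(input_size, hidden_size)]
    run = AutogradMaskedRNN(num_layers=1, batch_first=False, lstm=False)

    x = torch.randn(seq_len, batch, input_size)
    h0 = torch.zeros(1, batch, hidden_size)  # (num_layers * num_directions, batch, hidden_size)
    mask = torch.ones(seq_len, batch, 1)     # trailing dim of 1 broadcasts over hidden features
    mask[3:, 1:] = 0                         # pretend the last two sequences end after step 3

    output, hn = run(x, cells, h0, mask)     # output: (seq_len, batch, hidden_size)
    return output, hn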


def MaskedStep():
    def forward(input, hidden, cell, mask):
        if mask is None or mask.data.min() > 0.5:
            hidden = cell(input, hidden)
        elif mask.data.max() > 0.5:
            hidden_next = cell(input, hidden)
            # hack to handle LSTM
            if isinstance(hidden, tuple):
                hx, cx = hidden
                hp1, cp1 = hidden_next
                hidden = (hx + (hp1 - hx) * mask, cx + (cp1 - cx) * mask)
            else:
                hidden = hidden + (hidden_next - hidden) * mask
        # hack to handle LSTM
        output = hidden[0] if isinstance(hidden, tuple) else hidden

        return hidden, output

    return forward


def StackedStep(layer, num_layers, lstm=False, dropout=0, train=True):
    def forward(input, hidden, cells, mask):
        assert (len(cells) == num_layers)
        next_hidden = []

        if lstm:
            hidden = list(zip(*hidden))

        for l in range(num_layers):
            hy, output = layer(input, hidden[l], cells[l], mask)
            next_hidden.append(hy)
            input = output

            if dropout != 0 and l < num_layers - 1:
                input = F.dropout(input, p=dropout, training=train, inplace=False)

        if lstm:
            next_h, next_c = zip(*next_hidden)
            next_hidden = (
                torch.cat(next_h, 0).view(num_layers, *next_h[0].size()),
                torch.cat(next_c, 0).view(num_layers, *next_c[0].size())
            )
        else:
            next_hidden = torch.cat(next_hidden, 0).view(num_layers, *next_hidden[0].size())

        return next_hidden, input

    return forward


def AutogradMaskedStep(num_layers=1, dropout=0, train=True, lstm=False):
    layer = MaskedStep()

    func = StackedStep(layer,
                       num_layers,
                       lstm=lstm,
                       dropout=dropout,
                       train=train)

    def forward(input, cells, hidden, mask):
        nexth, output = func(input, hidden, cells, mask)
        return output, nexth

    return forward
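

# A minimal sketch (shapes assumed) of a single masked step, e.g. during decoding;
# MaskedRNNBase.step below wraps this same call chain.
def _autograd_masked_step_example():
    batch, input_size, hidden_size = 3, 4, 6
    cells = [nn.GRUCell(input_size, hidden_size)]
    step = AutogradMaskedStep(num_layers=1, lstm=False)

    x_t = torch.randn(batch, input_size)
    h = torch.zeros(1, batch, hidden_size)
    mask_t = torch.tensor([[1.0], [1.0], [0.0]])  # third sequence has already ended
    output, h = step(x_t, cells, h, mask_t)       # output: (batch, hidden_size)
    return output, h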


class MaskedRNNBase(nn.Module):
    def __init__(self, Cell, input_size, hidden_size, num_layers=1, bias=True, batch_first=False,
                 layer_dropout=0, step_dropout=0, bidirectional=False, initial_method=None, **kwargs):
        """
        :param Cell: the recurrent cell class to use (e.g. nn.RNNCell, nn.LSTMCell, nn.GRUCell)
        :param input_size: the number of expected features in the input
        :param hidden_size: the number of features in the hidden state
        :param num_layers: number of recurrent layers
        :param bias: whether the cells use bias weights b_ih and b_hh
        :param batch_first: if True, input and output tensors are (batch, seq, feature)
        :param layer_dropout: dropout applied to the output of each layer except the last
        :param step_dropout: dropout applied between time steps
        :param bidirectional: if True, becomes a bidirectional RNN
        :param initial_method: parameter initialization method passed to initial_parameter
        :param kwargs: extra arguments forwarded to the cell constructor
        """
        super(MaskedRNNBase, self).__init__()

        self.Cell = Cell
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bias = bias
        self.batch_first = batch_first
        self.layer_dropout = layer_dropout
        self.step_dropout = step_dropout
        self.bidirectional = bidirectional
        num_directions = 2 if bidirectional else 1

        self.all_cells = []
        for layer in range(num_layers):  # initialize all cells
            for direction in range(num_directions):
                layer_input_size = input_size if layer == 0 else hidden_size * num_directions
                cell = self.Cell(layer_input_size, hidden_size, self.bias, **kwargs)
                self.all_cells.append(cell)
                self.add_module('cell%d' % (layer * num_directions + direction), cell)
        initial_parameter(self, initial_method)

    def reset_parameters(self):
        for cell in self.all_cells:
            cell.reset_parameters()

    def forward(self, input, mask=None, hx=None):
        batch_size = input.size(0) if self.batch_first else input.size(1)
        lstm = self.Cell is nn.LSTMCell
        if hx is None:
            num_directions = 2 if self.bidirectional else 1
            hx = torch.autograd.Variable(
                input.data.new(self.num_layers * num_directions, batch_size, self.hidden_size).zero_())
            if lstm:
                hx = (hx, hx)

        func = AutogradMaskedRNN(num_layers=self.num_layers,
                                 batch_first=self.batch_first,
                                 step_dropout=self.step_dropout,
                                 layer_dropout=self.layer_dropout,
                                 train=self.training,
                                 bidirectional=self.bidirectional,
                                 lstm=lstm)
        # hand all_cells down to the lower-level runner; the extra trailing dimension
        # of size 1 lets the mask broadcast over the hidden features
        output, hidden = func(input, self.all_cells, hx,
                              None if mask is None else mask.view(mask.size() + (1,)))
        return output, hidden

    def step(self, input, hx=None, mask=None):
        """Execute one step forward (only for one-directional RNN).

        Args:
            input (batch, input_size): input tensor of this step.
            hx (num_layers, batch, hidden_size): the hidden state of the last step.
            mask (batch, 1): the mask tensor of this step.

        Returns:
            output (batch, hidden_size): tensor containing the output of this step from the last layer of the RNN.
            hn (num_layers, batch, hidden_size): tensor containing the hidden state of this step.
        """
        assert not self.bidirectional, "step cannot be applied to a bidirectional RNN."
        batch_size = input.size(0)
        lstm = self.Cell is nn.LSTMCell
        if hx is None:
            hx = torch.autograd.Variable(input.data.new(self.num_layers, batch_size, self.hidden_size).zero_())
            if lstm:
                hx = (hx, hx)

        func = AutogradMaskedStep(num_layers=self.num_layers,
                                  dropout=self.step_dropout,
                                  train=self.training,
                                  lstm=lstm)
        output, hidden = func(input, self.all_cells, hx, mask)
        return output, hidden


class MaskedRNN(MaskedRNNBase):
    r"""Applies a multi-layer Elman RNN with a customized non-linearity to an input sequence.

    For each element in the input sequence, each layer computes the following function:

    .. math::

        h_t = \tanh(w_{ih} * x_t + b_{ih} + w_{hh} * h_{(t-1)} + b_{hh})

    where :math:`h_t` is the hidden state at time `t`, and :math:`x_t` is the hidden state of the
    previous layer at time `t` or :math:`input_t` for the first layer. If nonlinearity='relu', then
    `ReLU` is used instead of `tanh`.

    Args:
        input_size: The number of expected features in the input x
        hidden_size: The number of features in the hidden state h
        num_layers: Number of recurrent layers.
        nonlinearity: The non-linearity to use ['tanh'|'relu']. Default: 'tanh'
        bias: If False, then the layer does not use bias weights b_ih and b_hh. Default: True
        batch_first: If True, then the input and output tensors are provided as (batch, seq, feature)
        dropout: If non-zero, introduces a dropout layer on the outputs of each RNN layer except the last layer
        bidirectional: If True, becomes a bidirectional RNN. Default: False

    Inputs: input, mask, h_0
        - **input** (seq_len, batch, input_size): tensor containing the features of the input sequence.
        - **mask** (seq_len, batch): 0-1 tensor containing the mask of the input sequence.
        - **h_0** (num_layers * num_directions, batch, hidden_size): tensor containing the initial
          hidden state for each element in the batch.

    Outputs: output, h_n
        - **output** (seq_len, batch, hidden_size * num_directions): tensor containing the output
          features (h_k) from the last layer of the RNN, for each k. If a
          :class:`torch.nn.utils.rnn.PackedSequence` has been given as the input, the output will
          also be a packed sequence.
        - **h_n** (num_layers * num_directions, batch, hidden_size): tensor containing the hidden
          state for k=seq_len.
    """

    def __init__(self, *args, **kwargs):
        super(MaskedRNN, self).__init__(nn.RNNCell, *args, **kwargs)


class MaskedLSTM(MaskedRNNBase):
    r"""Applies a multi-layer long short-term memory (LSTM) RNN to an input sequence.

    For each element in the input sequence, each layer computes the following function:

    .. math::

        \begin{array}{ll}
        i_t = \mathrm{sigmoid}(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\
        f_t = \mathrm{sigmoid}(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\
        g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{(t-1)} + b_{hg}) \\
        o_t = \mathrm{sigmoid}(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\
        c_t = f_t * c_{(t-1)} + i_t * g_t \\
        h_t = o_t * \tanh(c_t)
        \end{array}

    where :math:`h_t` is the hidden state at time `t`, :math:`c_t` is the cell state at time `t`,
    :math:`x_t` is the hidden state of the previous layer at time `t` or :math:`input_t` for the
    first layer, and :math:`i_t`, :math:`f_t`, :math:`g_t`, :math:`o_t` are the input, forget, cell,
    and out gates, respectively.

    Args:
        input_size: The number of expected features in the input x
        hidden_size: The number of features in the hidden state h
        num_layers: Number of recurrent layers.
        bias: If False, then the layer does not use bias weights b_ih and b_hh. Default: True
        batch_first: If True, then the input and output tensors are provided as (batch, seq, feature)
        dropout: If non-zero, introduces a dropout layer on the outputs of each RNN layer except the last layer
        bidirectional: If True, becomes a bidirectional RNN. Default: False

    Inputs: input, mask, (h_0, c_0)
        - **input** (seq_len, batch, input_size): tensor containing the features of the input sequence.
        - **mask** (seq_len, batch): 0-1 tensor containing the mask of the input sequence.
        - **h_0** (num_layers \* num_directions, batch, hidden_size): tensor containing the initial
          hidden state for each element in the batch.
        - **c_0** (num_layers \* num_directions, batch, hidden_size): tensor containing the initial
          cell state for each element in the batch.

    Outputs: output, (h_n, c_n)
        - **output** (seq_len, batch, hidden_size * num_directions): tensor containing the output
          features `(h_t)` from the last layer of the RNN, for each t. If a
          :class:`torch.nn.utils.rnn.PackedSequence` has been given as the input, the output will
          also be a packed sequence.
        - **h_n** (num_layers * num_directions, batch, hidden_size): tensor containing the hidden
          state for t=seq_len.
        - **c_n** (num_layers * num_directions, batch, hidden_size): tensor containing the cell
          state for t=seq_len.
    """

    def __init__(self, *args, **kwargs):
        super(MaskedLSTM, self).__init__(nn.LSTMCell, *args, **kwargs)
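

# A minimal usage sketch (sizes assumed): padded positions are masked out so the
# hidden and cell states freeze once a sequence has ended.
def _masked_lstm_example():
    rnn = MaskedLSTM(input_size=8, hidden_size=16, num_layers=2,
                     batch_first=True, bidirectional=True)
    x = torch.randn(4, 10, 8)   # (batch, seq_len, input_size)
    mask = torch.ones(4, 10)    # 1 for real tokens, 0 for padding
    mask[2, 7:] = 0             # third sequence has length 7
    output, (hn, cn) = rnn(x, mask=mask)  # output: (4, 10, 32); hn, cn: (4, 4, 16)
    return output, hn, cn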


class MaskedGRU(MaskedRNNBase):
    r"""Applies a multi-layer gated recurrent unit (GRU) RNN to an input sequence.

    For each element in the input sequence, each layer computes the following function:

    .. math::

        \begin{array}{ll}
        r_t = \mathrm{sigmoid}(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\
        z_t = \mathrm{sigmoid}(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\
        n_t = \tanh(W_{in} x_t + b_{in} + r_t * (W_{hn} h_{(t-1)} + b_{hn})) \\
        h_t = (1 - z_t) * n_t + z_t * h_{(t-1)}
        \end{array}

    where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is the hidden state of the
    previous layer at time `t` or :math:`input_t` for the first layer, and :math:`r_t`, :math:`z_t`,
    :math:`n_t` are the reset, input, and new gates, respectively.

    Args:
        input_size: The number of expected features in the input x
        hidden_size: The number of features in the hidden state h
        num_layers: Number of recurrent layers.
        bias: If False, then the layer does not use bias weights b_ih and b_hh. Default: True
        batch_first: If True, then the input and output tensors are provided as (batch, seq, feature)
        dropout: If non-zero, introduces a dropout layer on the outputs of each RNN layer except the last layer
        bidirectional: If True, becomes a bidirectional RNN. Default: False

    Inputs: input, mask, h_0
        - **input** (seq_len, batch, input_size): tensor containing the features of the input sequence.
        - **mask** (seq_len, batch): 0-1 tensor containing the mask of the input sequence.
        - **h_0** (num_layers * num_directions, batch, hidden_size): tensor containing the initial
          hidden state for each element in the batch.

    Outputs: output, h_n
        - **output** (seq_len, batch, hidden_size * num_directions): tensor containing the output
          features (h_k) from the last layer of the RNN, for each k. If a
          :class:`torch.nn.utils.rnn.PackedSequence` has been given as the input, the output will
          also be a packed sequence.
        - **h_n** (num_layers * num_directions, batch, hidden_size): tensor containing the hidden
          state for k=seq_len.
    """

    def __init__(self, *args, **kwargs):
        super(MaskedGRU, self).__init__(nn.GRUCell, *args, **kwargs)
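

# A minimal sketch (sizes assumed) of incremental decoding with MaskedGRU.step:
# feed one time step at a time while keeping already-finished sequences frozen.
def _masked_gru_step_example():
    rnn = MaskedGRU(input_size=4, hidden_size=8, num_layers=2)
    batch = 3
    x_t = torch.randn(batch, 4)
    mask_t = torch.tensor([[1.0], [1.0], [0.0]])  # third sequence has already finished
    output, h = rnn.step(x_t, mask=mask_t)        # output: (batch, 8); h: (num_layers, batch, 8)
    return output, h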