# Source code for fastNLP.core.fieldarray

import numpy as np


class FieldArray(object):
    """``FieldArray`` is the collection of ``Instance``s of the same field.
    It is the basic element of ``DataSet`` class.

    Element types are only inspected once the field is flagged as model input
    or as target (``is_input`` / ``is_target`` set to ``True``). Until then
    ``pytype`` and ``dtype`` stay ``None`` and ``append`` performs no checks.

    :param str name: the name of the FieldArray
    :param list content: a list of int, float, str or np.ndarray, or a list of list of one, or a np.ndarray.
    :param int padding_val: the integer for padding. Default: 0.
    :param bool is_target: If True, this FieldArray is used to compute loss.
    :param bool is_input: If True, this FieldArray is used to the model input.
    :raises TypeError: if ``content`` is neither a list nor a np.ndarray.

    """

    # Element types a type-checked FieldArray is allowed to hold.
    BASIC_TYPES = (int, float, str, np.ndarray)

    def __init__(self, name, content, padding_val=0, is_target=None, is_input=None):
        self.name = name
        if isinstance(content, np.ndarray):
            content = content.tolist()  # store np.ndarray as (nested) Python lists
        elif not isinstance(content, list):
            raise TypeError("content in FieldArray can only be list or numpy.ndarray, got {}.".format(type(content)))
        self.content = content
        self.padding_val = padding_val

        self._is_target = None
        self._is_input = None

        self.is_2d_list = False
        self.pytype = None  # int, float, str, or np.ndarray
        self.dtype = None  # np.int64, np.float64, str, or np.ndarray

        # Assign through the properties so that type detection is triggered.
        if is_input is not None:
            self.is_input = is_input
        if is_target is not None:
            self.is_target = is_target

    def _refresh_types(self):
        """Detect ``pytype`` from the current content and derive ``dtype`` from it."""
        self.pytype = self._type_detection(self.content)
        self.dtype = self._map_to_np_type(self.pytype)

    @property
    def is_input(self):
        """Flag telling whether this field is fed to the model (``None`` if never set)."""
        return self._is_input

    @is_input.setter
    def is_input(self, value):
        # Types are only (re-)detected when the flag is switched on.
        if value is True:
            self._refresh_types()
        self._is_input = value

    @property
    def is_target(self):
        """Flag telling whether this field is used to compute loss (``None`` if never set)."""
        return self._is_target

    @is_target.setter
    def is_target(self, value):
        # Types are only (re-)detected when the flag is switched on.
        if value is True:
            self._refresh_types()
        self._is_target = value

    def _type_detection(self, content):
        """Infer the common element type of ``content``.

        A mix of int and float is up-cast to float. When ``content`` is a list
        of lists, ``self.is_2d_list`` is set to ``True`` as a side effect.

        :param content: a list of int, float, str or np.ndarray, or a list of list of one.
        :return type: one of int, float, str, np.ndarray
        :raises TypeError: if ``content`` is not a list, mixes lists with
            non-lists, or holds unsupported / incompatible element types.
        :raises RuntimeError: if ``content`` (or one of its rows) is empty.

        """
        if isinstance(content, list) and len(content) > 0 and isinstance(content[0], list):
            # content is a 2-D list
            if not all(isinstance(row, list) for row in content):  # strict check 2-D list
                raise TypeError("Please provide 2-D list.")
            type_set = {self._type_detection(row) for row in content}
            if len(type_set) == 2 and int in type_set and float in type_set:
                # rows disagree only on int vs. float: up-cast to float
                type_set = {float}
            elif len(type_set) > 1:
                raise TypeError("Cannot create FieldArray with more than one type. Provided {}".format(type_set))
            self.is_2d_list = True
            return type_set.pop()

        elif isinstance(content, list):
            # content is a 1-D list
            if len(content) == 0:
                # the old error is not informative enough.
                raise RuntimeError("Cannot create FieldArray with an empty list. Or one element in the list is empty.")
            type_set = {type(item) for item in content}

            if len(type_set) == 1 and tuple(type_set)[0] in self.BASIC_TYPES:
                return type_set.pop()
            elif len(type_set) == 2 and float in type_set and int in type_set:
                # up-cast int to float
                return float
            else:
                raise TypeError("Cannot create FieldArray with type {}".format(*type_set))
        else:
            raise TypeError("Cannot create FieldArray with type {}".format(type(content)))

    @staticmethod
    def _map_to_np_type(basic_type):
        """Map a detected Python element type to the dtype used when batching.

        :param basic_type: one of int, float, str, np.ndarray
        :return: np.int64, np.float64, str or np.ndarray respectively
        :raises KeyError: if ``basic_type`` is not one of the supported types.
        """
        # ``np.str`` was only an alias of the builtin ``str`` and has been
        # removed from NumPy; the builtin is the exact equivalent.
        type_mapping = {int: np.int64, float: np.float64, str: str, np.ndarray: np.ndarray}
        return type_mapping[basic_type]

    def __repr__(self):
        return "FieldArray {}: {}".format(self.name, repr(self.content))

    def append(self, val):
        """Add a new item to the tail of FieldArray.

        The value is validated only when this field is flagged as input or
        target; otherwise it is appended as is. Appending a float to an int
        field up-casts the field's ``pytype``/``dtype`` to float.

        :param val: int, float, str, or a list of one.
        :raises RuntimeError: if the shape of ``val`` does not match the field
            (list into a 1-D field, non-list into a 2-D field, or empty list).
        :raises TypeError: if the element type of ``val`` is incompatible with
            the type of the field.
        """
        if self.is_target is True or self.is_input is True:
            # only check type when used as target or input

            val_type = type(val)
            if val_type is list:  # shape check
                if self.is_2d_list is False:
                    raise RuntimeError("Cannot append a list into a 1-D FieldArray. Please provide an element.")
                if len(val) == 0:
                    raise RuntimeError("Cannot append an empty list.")
                val_list_type = {type(item) for item in val}  # type check
                if len(val_list_type) == 2 and int in val_list_type and float in val_list_type:
                    # up-cast int to float
                    val_type = float
                elif len(val_list_type) == 1:
                    val_type = val_list_type.pop()
                else:
                    raise TypeError("Cannot append a list of {}".format(val_list_type))
            else:
                if self.is_2d_list is True:
                    raise RuntimeError("Cannot append a non-list into a 2-D list. Please provide a list.")

            if val_type == float and self.pytype == int:
                # up-cast the whole field
                self.pytype = float
                self.dtype = self._map_to_np_type(self.pytype)
            elif val_type == int and self.pytype == float:
                pass  # an int fits into a float field
            elif val_type == self.pytype:
                pass
            else:
                raise TypeError("Cannot append type {} into type {}".format(val_type, self.pytype))

        self.content.append(val)

    def __getitem__(self, indices):
        return self.get(indices)

    def __setitem__(self, idx, val):
        assert isinstance(idx, int)
        self.content[idx] = val

    def get(self, indices):
        """Fetch instances based on indices.

        An int index returns the stored element unchanged. A collection of
        indices is assembled into a ``np.ndarray``; numeric sequences of
        different lengths are right-padded with ``padding_val``.

        :param indices: an int, or a list of int.
        :return: the element itself for an int index, otherwise a np.ndarray.
        :raises RuntimeError: if a batch is requested while the field is
            flagged neither as input nor as target.
        """
        if isinstance(indices, int):
            return self.content[indices]
        # Both flags default to None, so test truthiness rather than
        # ``is False``: no element type has been detected for an unflagged field.
        if not self.is_input and not self.is_target:
            raise RuntimeError("Please specify either is_input or is_target is True for {}".format(self.name))
        batch_size = len(indices)

        if not is_iterable(self.content[0]):
            # scalar elements: plain 1-D array
            array = np.array([self.content[i] for i in indices], dtype=self.dtype)
        elif self.dtype in (np.int64, np.float64):
            # numeric sequences: pad every row up to the longest one in the batch
            max_len = max(len(self.content[i]) for i in indices)
            array = np.full((batch_size, max_len), self.padding_val, dtype=self.dtype)
            for row, idx in enumerate(indices):
                array[row][:len(self.content[idx])] = self.content[idx]
        else:
            # non-numeric content (str elements are themselves iterable, so
            # they end up here): let numpy infer the dtype
            array = np.array([self.content[i] for i in indices])
        return array

    def __len__(self):
        """Returns the size of FieldArray.

        :return int length:
        """
        return len(self.content)


def is_iterable(content):
    """Tell whether ``content`` can be iterated over.

    Note that strings count as iterable.

    :param content: any object.
    :return bool: True if ``iter(content)`` succeeds, False if it raises TypeError.
    """
    try:
        # Ask for an iterator directly instead of building a throwaway generator.
        iter(content)
    except TypeError:
        return False
    return True