import csv
import logging

import numpy as np
from numpy.lib.recfunctions import repack_fields, structured_to_unstructured

logger = logging.getLogger(__name__)


class Frame1D(object):
    """
    Base wrapper around a single numpy array kept in `_array`.

    Most methods forward to the wrapped array (or to the matching numpy function)
    and re-wrap array results in `self.__class__`, so subclasses get instances of
    their own type back.
    """

    def __init__(self, array):
        self._array = array

    def __len__(self):
        return len(self._array)

    def __array__(self, *args, **kwargs):
        # lets numpy functions accept a frame wherever an array is expected
        return np.asarray(self._array, *args, **kwargs)

    def __invert__(self):
        return self.__class__(~self._array)

    def __repr__(self):
        return repr(self._array)

    def __str__(self):
        return str(self._array)

    def unique(self, return_index=False, return_counts=False):
        """
        Wraps `np.unique`.

        :return: frame with the unique values; followed by the index array and/or
            the counts array (in that order) when the respective flag is set
        """
        res = np.unique(self._array, return_index=return_index, return_counts=return_counts)
        if return_index and return_counts:
            return self.__class__(res[0]), res[1], res[2]
        elif return_index or return_counts:
            return self.__class__(res[0]), res[1]
        else:
            return self.__class__(res)

    @property
    def dtype(self):
        return self._array.dtype

    def copy(self):
        return self.__class__(self._array.copy())

    def __iter__(self):
        return iter(self._array)

    def astype(self, dtype):
        return self.__class__(self._array.astype(dtype))

    def repeat(self, times):
        """Repeats every element `times` times (see `np.repeat`)."""
        return self.__class__(np.repeat(self._array, times))

    def tile(self, times):
        """Repeats the whole sequence `times` times (see `np.tile`)."""
        return self.__class__(np.tile(self._array, times))

    @staticmethod
    def _membership_key(item):
        """Hashable key for `is_in`: records/sequences become tuples, scalars are used as is."""
        try:
            return tuple(item)
        except TypeError:
            # plain scalars (ints, floats, ...) are not iterable
            return item

    def is_in(self, other):
        """
        Element-wise membership test.

        :param other: iterable of values (or of records/tuples for structured data)
        :return: boolean numpy array, True where the element of `self` occurs in `other`
        """
        # avoid np.in1d/np.isin because it freezes on large lists
        make_key = self._membership_key
        lookup = set(make_key(i) for i in other)
        return np.array([make_key(i) in lookup for i in self], dtype=bool)

    @classmethod
    def concatenate(cls, frames):
        """
        Concatenates frames in pandas-style: i.e., the result will have
        union([frame.columns for frame in frames]) columns with missing
        values masked.

        When dtypes differ, the frames must provide `append_column` and `cls` must
        provide `from_dict` (neither is defined on Frame1D itself).
        """
        # `frames` is walked more than once below, so materialize iterators/generators
        frames = list(frames)
        arrays = [frame._array for frame in frames]
        # first check if all arrays have the same dtypes: in that case concatenation
        # can easily be done structarray-style
        if len(set(array.dtype for array in arrays)) == 1:
            return cls(np.concatenate(arrays))

        # otherwise, check for dtype compatibility (concat will fail if
        # two frames have different dtypes for the same column)
        dtypes = get_compatible_dtypes(arrays)
        all_columns = set(dtypes.keys())
        padded_frames = []
        for frame in frames:
            missing_columns = all_columns.difference(set(frame.dtype.names))
            for column in missing_columns:
                dtype = dtypes[column]
                # fill the absent column with the null value of its dtype
                filler = np.repeat(get_null_value(dtype), len(frame)).astype(dtype)
                frame = frame.append_column(filler, column)
            padded_frames.append(frame)

        return cls.from_dict({col: np.concatenate([frame[col] for frame in padded_frames])
                              for col in padded_frames[0].dtype.names})

    @property
    def shape(self):
        return self._array.shape

    def __eq__(self, other):
        """Element-wise comparison; the boolean result is wrapped in a Series."""
        if isinstance(other, Frame1D):
            other = other._array
        return Series(self._array == other)

    def union1d(self, other):
        return self.__class__(np.union1d(self._array, other._array))

    def intersect1d(self, other):
        return self.__class__(np.intersect1d(self._array, other._array))


class Series(Frame1D):
    """
    One-dimensional column of values backed by a plain (non-structured) numpy array.

    Note that `==` (inherited from Frame1D), `*` and unary `-` return a Series,
    while the other comparison operators and `&` return whatever the wrapped
    numpy array returns.
    """

    def __init__(self, array):
        assert len(array.shape) == 1 and array.dtype.names is None, 'Column object should be created from 1D array'
        super(Series, self).__init__(array)

    def __getitem__(self, item):
        """
        :param item: integer index, slice, mask or index array
        :return: the stored element for an integer index, otherwise a new Series
        """
        # numpy integers (e.g. results of argmax/np.unique) are scalar indices too;
        # wrapping the scalar they select in a Series would fail the constructor check
        if isinstance(item, (int, np.integer)):
            return self._array[item]
        return Series(self._array[item])

    def __setitem__(self, item, value):
        if isinstance(value, Series):
            value = value._array
        self._array[item] = value

    def __ne__(self, other):
        return self._array.__ne__(other)

    def __lt__(self, other):
        return self._array < other

    def __gt__(self, other):
        return self._array > other

    def __ge__(self, other):
        return self._array >= other

    def __le__(self, other):
        return self._array <= other

    def __and__(self, other):
        return self._array & other

    def min(self):
        return self._array.min()

    def max(self, axis=0):
        return self._array.max(axis=axis)

    def argmax(self, axis=0):
        return self._array.argmax(axis=axis)

    def mean(self, axis=0):
        return self._array.mean(axis=axis)

    def __mul__(self, other):
        if isinstance(other, Series):
            other = other._array
        return Series(self._array * other)

    def __neg__(self):
        return Series(-self._array)

    def to_list(self):
        return self._array.tolist()

    def to_array(self):
        """Returns the wrapped numpy array itself (not a copy)."""
        return self._array

    def all(self):
        return self._array.all()


class Row(Frame1D):
    """
    Single record of a DataFrame.

    Adds nothing to Frame1D; it only marks values produced by integer indexing of
    a DataFrame and by the per-column (axis=0) reductions.
    """
    pass


class DataFrame(Frame1D):
    """
    Table-like wrapper around a 1D numpy structured array: every field of the
    array is a column, every element is a row.
    """

    # `type(u'')` is `unicode` on Python 2 and `str` on Python 3
    _STRING_TYPES = (str, type(u''))

    def __init__(self, array):
        assert len(
            array.shape) == 1 and array.dtype.names is not None, 'DataFrame object should be based on 1D structarrays'
        super(DataFrame, self).__init__(array)

    def __getitem__(self, item):
        """
        :param item: column name -> Series; integer -> Row;
            Series / mask / index array / slice / list of column names -> DataFrame
        """
        if isinstance(item, self._STRING_TYPES):
            return Series(self._array[item])
        # numpy integers are scalar indices as well as builtin ints
        if isinstance(item, (int, np.integer)):
            return Row(self._array[item])
        if isinstance(item, Series):
            return DataFrame(self._array[item._array])
        return DataFrame(self._array[item])

    def __setitem__(self, item, value):
        if isinstance(value, Series):
            value = value._array
        self._array[item] = value

    def astype(self, dtype):
        """
        Casts the frame to `dtype`, keeping the current column order.

        :param dtype: structured np.dtype, {column: dtype} dict or [(column, dtype), ...] list
            covering every column of the frame; anything else is passed to numpy unchanged
        """
        if isinstance(dtype, np.dtype):
            dtype = [(name, dtype.fields[name][0]) for name in self.columns]
        elif isinstance(dtype, dict) and 'names' not in dtype and 'formats' not in dtype:
            dtype = [(name, dtype[name]) for name in self.columns]
        elif isinstance(dtype, list):
            dtype = dict(dtype)
            dtype = [(name, dtype[name]) for name in self.columns]
        return super(DataFrame, self).astype(dtype)

    @classmethod
    def from_pandas(cls, df, dtypes=None):
        """
        make structured array from pandas dataset

        :param dtypes: optional [(column, dtype), ...] list; derived from `df.dtypes` when omitted
        """
        if dtypes is None:
            dtypes = [(str(name), dtype.str) for name, dtype in df.dtypes.items()]
        array = np.empty(len(df), dtype=dtypes)
        for name, _ in dtypes:
            array[name] = df[name]
        return cls(array)

    @classmethod
    def empty(cls, dtype):
        """
        return empty dataset with specified dtype
        """
        return cls.from_structarray(np.empty(0, dtype))

    @classmethod
    def from_dict(cls, data, dtype=None):
        """
        make structured array from dictionary (similar to pandas DataFrame constructor)

        :param data: non-empty {column name: values} dict, all values of equal length
        :param dtype: optional structured dtype of the result; when omitted it is derived
            from the values, with string columns stored as objects
        """
        assert len(data) > 0, 'Empty dict not accepted'
        count = len(next(iter(data.values())))
        assert np.all([len(v) == count for v in data.values()]), 'Values length mismatch'

        # at first convert to dict of 1d numpy arrays
        arrays = {}
        for name, item in data.items():
            if isinstance(item, DataFrame):
                arrays[name] = item._array
            else:
                arrays[name] = np.array(item)
        if dtype is None:
            dtype = [(str(name), object if is_string_type(array.dtype) else array.dtype)
                     for name, array in arrays.items()]
        else:
            assert len(dtype) == len(data.keys()), 'dtype items count mismatch'

        # build new structured array
        result = np.empty(count, dtype)
        for name, array in arrays.items():
            result[name] = array
        return cls(result)

    @classmethod
    def from_structarray(cls, array):
        return cls(array)

    @classmethod
    def from_tsv(cls, path_or_buf, **kwargs):
        """Reads a tab-separated file with pandas; `kwargs` are passed to `pd.read_csv`."""
        import pandas as pd

        return cls.from_pandas(pd.read_csv(path_or_buf, sep='\t', **kwargs))

    def append_columns(self, data, replace=True):
        """
        append_fields from recfunctions (http://pyopengl.sourceforge.net/pydoc/numpy.lib.recfunctions.html) is too slow

        Returns a new DataFrame: existing columns keep their order (a column also present
        in `data` takes its values and dtype from `data`), columns new to the frame follow
        in the order they have in `data`.

        :param data: structured array with as many rows as the frame
        :param replace: when False, raise ValueError if `data` shares a column name with the frame
        """
        assert is_structarray(data), 'Appending data must be structured array'
        assert len(self._array) == len(data), 'Array lengths mismatch'
        old_names = self._array.dtype.names
        new_names = list(data.dtype.names)
        old_name_set = set(old_names)
        new_name_set = set(new_names)
        kept_names = [name for name in old_names if name not in new_name_set]
        if not replace and len(kept_names) < len(old_names):
            raise ValueError("Some columns are going to be replaced, use replace=True if it is expected behavior")

        # `data` may contain a column with the same name but different type
        old_fields = self._array.dtype.fields
        new_fields = data.dtype.fields
        result_names = list(old_names) + [name for name in new_names if name not in old_name_set]
        result_dtype = [(name, (new_fields if name in new_name_set else old_fields)[name][0])
                        for name in result_names]

        new_array = np.empty(len(data), dtype=result_dtype)
        if kept_names:
            new_array[kept_names] = self._array[kept_names]
        new_array[new_names] = data[new_names]

        return DataFrame(new_array)

    def append_column(self, data, name, replace=True):
        """
        Returns a new DataFrame with the 1d `data` (Series or array-like) added as column `name`.
        """
        if isinstance(data, Series):
            data = data._array
        data = np.array(data)
        assert len(data.shape) == 1, '`append_column` only accepts 1d-arrays'
        if data.dtype == object:
            # object arrays are copied into a fresh one-field array instead of being viewed
            struct_data = np.empty(len(data), dtype=[(name, data.dtype)])
            struct_data[name] = data
            return self.append_columns(struct_data, replace)
        else:
            return self.append_columns(data.view(dtype=[(name, data.dtype)]), replace)

    def __contains__(self, item):
        """
        :param item: column name
        :return: True if column in DataFrame, False otherwise
        """
        return item in self._array.dtype.names

    def assert_has_columns(self, required_columns):
        """
        Checks if all of the required_columns are in DataFrame.
        Raises AssertionError if some columns are missing.
        """
        missing_columns = set(required_columns).difference(set(self.columns))
        if missing_columns:
            raise AssertionError("Array is missing the following required columns: {}".format(missing_columns))

    def _has_same_dtypes(self):
        first_column = self.columns[0]
        return all(self[first_column].dtype == self[column].dtype for column in self.columns)

    def max(self, axis=None, out=None):
        """
        :param axis: None, 0 or 1, optional
            Axis along which to operate.  By default, flattened input is used.
            None - search maximum through all columns;
            0 - return Row with maximums by columns;
            1 - return Series with maximums by rows.
            Any other value makes the method return None.
        :param out: not used
        :return: maximum
        """
        if out is not None:
            raise NotImplementedError('Writing result to out is not supported yet')
        if axis is None:
            return np.max(self.as_2d_array())
        elif axis == 0:
            # one record holding the maximum of every column; the values must be passed
            # as a tuple, otherwise numpy broadcasts each scalar over all the fields
            maxima = tuple(self[column].max() for column in self.columns)
            return Row(np.array([maxima], dtype=self.dtype)[0])
        elif axis == 1:
            return Series(np.max(self.as_2d_array(), axis=1))

    def argmax(self, axis=1, out=None):
        """
        :param axis: for now, only 1 - return Series with argmaxs by rows.
        :param out: not used
        :return: Series with argmaxs by rows
        """
        if out is not None:
            raise NotImplementedError('Writing result to out is not supported yet')
        if axis == 1:
            return Series(np.argmax(self.as_2d_array(), axis=1))
        else:
            raise NotImplementedError('Only axis=1 for argmax is supported')

    def mean(self, axis=None, out=None):
        """
        Returns mean either for the whole DataFrame or by axis
        Output dtype might differ from one in the original DataFrame
        :param axis: None, 0 or 1, optional
            Axis along which to operate.  By default, flattened input is used.
            None - search mean through all columns;
            0 - return Row with means by columns;
            1 - return Series with means by rows.
            Any other value makes the method return None.
        :param out: not used
        :return: mean
        """
        if out is not None:
            raise NotImplementedError('Writing result to out is not supported yet')
        if axis is None:
            return np.mean(self.as_2d_array())
        elif axis == 0:
            return Row(np.array([self[column].mean() for column in self.columns]))
        elif axis == 1:
            return Series(np.mean(self.as_2d_array(), axis=1))

    def as_2d_array(self):
        """
        View dataframe with the same dtypes as 2d array.
        """
        array = self._array
        if self.dtype.hasobject:
            array = repack_fields(array)
        return structured_to_unstructured(array)

    def arggroupby(self, column):
        """
        Reproduces pandas' df.groupby(), but returns group indices instead of groups
        The (slightly modified) recipe: http://esantorella.com/2016/06/16/groupby/

        :param column: column name or iterable of column names
        :return: generator of (group key, array of row indices) pairs
        """
        if not isinstance(column, self._STRING_TYPES):
            column = list(column)
        unique_values, idx = np.unique(self._array[column], return_inverse=True)
        # for each group, store its indices
        group_indices = [[] for _ in range(len(unique_values))]
        for i, group in enumerate(idx):
            group_indices[group].append(i)
        for key, indices in zip(unique_values, group_indices):
            yield key, np.array(indices)

    def groupby(self, column):
        """Same as `arggroupby`, but yields (group key, DataFrame with the group rows)."""
        for key, idx in self.arggroupby(column):
            yield key, DataFrame(self._array[idx])

    def rename_columns(self, mapping):
        """Renames columns in place; names missing from `mapping` are kept."""
        self._array.dtype.names = [mapping.get(name, name) for name in self._array.dtype.names]

    def to_list_of_dicts(self):
        """
        Represents DataFrame as list of dicts
        """

        # a workaround to make the result json and bson serializable
        def make_not_numpy_type(val):
            if isinstance(val, np.integer):
                return int(val)
            elif isinstance(val, np.floating):
                return float(val)
            elif isinstance(val, np.bool_):
                return bool(val)
            return val

        return [{name: make_not_numpy_type(row[name]) for name in self._array.dtype.names} for row in self._array]

    def drop_columns(self, columns):
        """
        Returns a DataFrame without `columns`; the remaining columns keep their order.
        Dropping every column is not allowed.
        """
        dropped = set(columns)
        new_names = [name for name in self._array.dtype.names if name not in dropped]
        assert len(new_names) > 0, 'Not possible to delete all columns in the array'
        return DataFrame(self._array[new_names])

    def change_type(self, column, dtype):
        """
        Returns a DataFrame with `column` cast to `dtype`
        (the frame itself when the column already has that dtype).
        """
        return self.change_types({column: dtype})

    def change_types(self, types_map):
        """
        Returns a DataFrame with the columns listed in `types_map` cast to the given dtypes.

        :param types_map: {column: dtype} dict; it is not modified
        :return: the frame itself when no column actually changes its dtype,
            otherwise a new DataFrame with the same column order
        """
        pending = {}
        for column, dtype in types_map.items():
            if not (self._array[column].dtype == dtype):
                pending[column] = dtype

        if not pending:
            return self

        # cast all columns at once: works even when every column changes its type
        fields = self.dtype.fields
        new_dtype = [(name, pending[name] if name in pending else fields[name][0])
                     for name in self.columns]
        return DataFrame(self._array.astype(new_dtype))

    def replace_column(self, data, name):
        """
        Returns a new DataFrame where column `name` holds `data` (and has the dtype of `data`).
        """
        other_columns = [column for column in self.columns if column != name]
        if len(other_columns) == 0:
            new_df = DataFrame(np.empty(len(data), dtype=[(name, data.dtype.str)]))
            new_df[name] = data
        else:
            if data.dtype.hasobject:
                new_df = self[other_columns].append_column(data, name, replace=True)
            else:
                array = self._array.copy()
                array[name] = data
                dtype = [(column, data.dtype) if column == name else (column, self.dtype.fields[column][0])
                         for column in self.columns]
                new_df = DataFrame(array.astype(dtype))
        return new_df

    def to_2d_array(self):
        """
        convert to numpy 2d array
        """
        return np.vstack([self._array[column] for column in self.columns]).T

    def sort(self, order):
        """Sorts the underlying array in place by the field(s) given in `order`."""
        self._array.sort(order=order)

    def to_pandas(self):
        import pandas as pd

        return pd.DataFrame(self._array)

    def to_tsv(self, filename):
        """Writes the frame to `filename` as tab-separated values with a header row."""
        # Python 2's csv writer works with byte strings, so unicode cells are encoded there;
        # on Python 3 (where `str` is not `bytes`) text cells are written as they are
        encode_text = str is bytes
        text_type = type(u'')
        with open(filename, mode='w') as f:
            writer = csv.writer(f, dialect=csv.excel_tab)
            writer.writerow(self.columns)
            for row in self:
                cells = []
                for cell in row:
                    if encode_text and isinstance(cell, text_type):
                        cell = cell.encode('utf-8')
                    cells.append(cell)
                writer.writerow(cells)

    @property
    def columns(self):
        return self._array.dtype.names

    def drop_null(self, columns=None):
        """
        Returns a DataFrame without the rows holding a null value (see `get_null_index`)
        in any of `columns` (all columns by default).
        """
        columns = columns or self.columns
        idx = np.ones(len(self), dtype=bool)
        for column in self.columns:
            if column in columns:
                idx &= ~get_null_index(self[column])
        return self[idx]


def get_compatible_dtypes(arrays):
    """
    Finds one dtype per column that every given structured array can be safely cast to.

    :param arrays: iterable of numpy structured arrays
    :return: {column name: dtype} dict over the union of the arrays' columns; for a column
        present in several arrays the dtype the others can be cast to is chosen
    :raises ValueError: if two arrays have dtypes for the same column that cannot be
        cast to one another in either direction
    """
    dtypes = {}
    for array in arrays:
        fields = array.dtype.fields
        for column in array.dtype.names:
            # a field entry is (dtype, offset) or (dtype, offset, title)
            dtype = fields[column][0]
            if (column not in dtypes) or (np.can_cast(dtypes[column], dtype)):
                dtypes[column] = dtype
            elif np.can_cast(dtype, dtypes[column]):
                # the dtype found earlier is already wide enough
                pass
            else:
                raise ValueError("Two different dtypes specified for column {} ({} and {}): cannot concat".format(
                    column, dtype, dtypes[column]
                ))
    return dtypes


def get_null_value(dtype):
    """
    Returns the value this module uses to mark a missing entry of the given dtype.

    :param dtype: anything `np.dtype` accepts
    :return: np.nan for floats, np.int32(-1) for integers, '' for strings,
        None for objects, False for booleans
    :raises ValueError: for any other dtype
    """
    dtype = np.dtype(dtype)
    if np.issubdtype(dtype, np.floating):
        return np.nan
    elif np.issubdtype(dtype, np.integer):
        return np.int32(-1)
    elif is_string_type(dtype):
        return ''
    elif dtype == object:
        return None
    elif dtype == np.bool_:
        return False
    else:
        raise ValueError("Unexpected dtype: {}".format(dtype))


def is_structarray(frame):
    """Tells whether `frame` has a structured dtype, i.e. one with named fields."""
    field_names = frame.dtype.names
    return field_names is not None


def is_string_type(dtype):
    """
    Tells whether `dtype` is a numpy string dtype: byte strings (kind 'S')
    or unicode strings (kind 'U').

    :param dtype: anything `np.dtype` accepts
    """
    return np.dtype(dtype).kind in ('S', 'U')


def make_cartesian_product(frame, array, column):
    """
    Builds the cartesian product of the rows of `frame` and the elements of `array`.

    The frame is tiled once per element of `array`, every element of `array` is
    repeated once per row of `frame`, and the repeated values become a new column.
    :param frame: DataFrame
    :param array: Series
    :param column: name of the column that receives the `array` values
    :return: new DataFrame with the additional column
    """
    n_values = len(array)
    tiled_frame = frame.tile(n_values)
    n_rows = len(frame)
    repeated_values = array.repeat(n_rows)
    return tiled_frame.append_column(data=repeated_values, name=column)


def get_null_index(array):
    """
    Marks the entries of `array` equal to the null value of its dtype
    (see `get_null_value`); NaN nulls are detected with `np.isnan`.
    """
    marker = get_null_value(array.dtype)
    if marker is not np.nan:
        return array == marker
    # NaN never compares equal to itself, so `==` cannot be used for it
    return np.isnan(array)
