import logging
from itertools import chain
from six import text_type, PY2

log = logging.getLogger(__name__)


# Python 3 has no ``unichr`` builtin; fall back to ``chr`` when the name
# lookup fails so the code below can use a single spelling.
try:
    unichr
except NameError:
    unichr = chr

if PY2:
    def make_bad_symbols():
        """Build the set of code points that sanitize_unicode() removes.

        The set holds NUL plus every high (U+D800..U+DBFF) and low
        (U+DC00..U+DFFF) surrogate code point.
        """
        high_surrogates = range(0xD800, 0xDBFF + 1)
        low_surrogates = range(0xDC00, 0xDFFF + 1)
        nul = [u'\u0000']
        return frozenset(chain(
            nul,
            (unichr(code_point) for code_point in high_surrogates),
            (unichr(code_point) for code_point in low_surrogates),
        ))

    BAD_SYMBOLS = make_bad_symbols()
else:
    # Outside Python 2 only the NUL character is filtered.
    BAD_SYMBOLS = [u'\u0000']


def sanitize_unicode(u_str):
    """Return ``u_str`` without any character listed in ``BAD_SYMBOLS``.

    When ``u_str`` contains none of those characters the very same object
    is returned, so clean input is never copied. Otherwise a new string is
    built from the characters that are not in ``BAD_SYMBOLS``.

    ``u_str`` must be an instance of ``text_type``; this is checked with an
    ``assert``.
    """
    # Background on surrogates:
    # http://unicodebook.readthedocs.org/en/latest/unicode_encodings.html#surrogates
    #
    # U+D800..U+DBFF (1,024 code points): high surrogates
    # U+DC00..U+DFFF (1,024 code points): low surrogates
    assert isinstance(u_str, text_type)

    # Fast path: stop scanning at the first offending character, and skip
    # the rebuild entirely when there is none.
    if not any(char in BAD_SYMBOLS for char in u_str):
        return u_str
    return u''.join(char for char in u_str if char not in BAD_SYMBOLS)


def safe_unicode(v, field_name):
    """Convert ``v`` to sanitized text, decoding it as UTF-8 if necessary.

    Args:
        v: Value to convert. ``None`` is returned as is; a ``text_type``
            instance is only sanitized; any other value is passed to
            ``text_type(v, 'utf-8')``.
        field_name: Label identifying the value in log messages.

    Returns:
        ``None`` when ``v`` is ``None``, otherwise the text after it has been
        passed through ``sanitize_unicode``.

    Raises:
        Exception: whatever the lenient fallback decode raises; it is logged
            at error level and re-raised unchanged.
    """
    if v is None:
        return v
    if isinstance(v, text_type):
        return sanitize_unicode(v)
    try:
        uv = text_type(v, 'utf-8')
    except Exception as strict_exc:
        # Pass the values as lazy logging arguments instead of formatting
        # eagerly: the message is only rendered when the record is emitted,
        # and a formatting problem is handled by the logging machinery
        # rather than escaping from this handler and skipping the fallback.
        log.warning("%s: %s", field_name, strict_exc)
        try:
            # Second attempt: drop the bytes that are not valid UTF-8.
            uv = text_type(v, 'utf-8', errors='ignore')
        except Exception as fallback_exc:
            log.error('Fallback unicoding string %s=\'%s\' failed with exc: %s',
                      field_name, v, fallback_exc)
            raise
    return sanitize_unicode(uv)
