from __future__ import print_function

import re
import six
import sys
import json
import atexit
import requests
import logging
import itertools
import subprocess

from collections import defaultdict

try:
    from urllib.parse import parse_qs
    unicode = str
except ImportError:
    from urlparse import parse_qs
    from itertools import imap as map, izip as zip


def read_wizard_requests(file_name):
    """Yield the request lines of a printwzrd-style request list.

    Carriage-return and line-feed characters are stripped from both ends of
    every line. Lines that are empty after that, and comment lines starting
    with ``@``, are skipped; every other line is yielded as is (including
    lines that consist of spaces or tabs only).

    :param file_name: path of the request list to read.
    """
    with open(file_name) as requests_file:
        for raw_line in requests_file:
            request = raw_line.strip('\n\r')
            if not request:
                continue
            if request.startswith('@'):
                continue
            yield request


# Ordered (pattern, converter) pairs tried against every raw field value;
# the first pattern that matches decides how the value is converted.
_RELEV_FIELD_TYPES = [
    (re.compile(r'^[+-]?\d+$'), int),
    (re.compile(r'^[+-]?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?$'), float),
]


def _parse_relev(it):
    """Parse a ``key=value;key=value`` string into a dict.

    Values that look like integers or floats are converted to ``int`` /
    ``float``; everything else stays a string. A key that occurs once maps to
    its single value, a key that occurs several times maps to the list of its
    values in order of appearance. A dict argument is returned untouched.

    :param it: the text to parse, or an already parsed dict.
    :return: dict of parsed fields.
    """
    if isinstance(it, dict):
        return it

    collected = {}
    for pair in it.split(u';'):
        name, _, raw = pair.partition(u'=')
        value = raw
        for pattern, convert in _RELEV_FIELD_TYPES:
            if pattern.match(raw):
                value = convert(raw)
                break
        collected.setdefault(name, []).append(value)

    parsed = {}
    for name, values in collected.items():
        # Unwrap single occurrences, keep repeated keys as lists.
        parsed[name] = values[0] if len(values) == 1 else values
    return parsed


# Unpacker subprocesses keyed by their '/'-joined argv. Besides a process
# object, an entry may hold False or None (see _unpack).
_tree_unpackers = {}


def _kill_unpackers():
    """Shut down every cached unpacker and empty the cache.

    Each live entry gets its stdin closed; if closing raises, the process is
    killed instead. Errors from either step are ignored. Falsy entries are
    simply dropped.
    """
    for key in list(_tree_unpackers):
        process = _tree_unpackers[key]
        if process:
            try:
                process.stdin.close()
            except Exception:
                try:
                    process.kill()
                except Exception:
                    pass
        _tree_unpackers.pop(key)


def _unpack(argv, it):
    """Send one value through a cached unpacker subprocess and return its reply line.

    The subprocess for a given ``argv`` is started on first use and kept in
    ``_tree_unpackers`` under ``'/'.join(argv)``. A ``False`` entry marks an
    unpacker that failed to start or was found exited; it is never retried.

    :param argv: command line of the unpacker tool.
    :param it: value to unpack; anything that is not a text string is returned
        untouched.
    :return: the reply line (text, trailing newline removed) on success, or a
        two-item list ``[it, 'Error: ...']`` when the unpacker could not be
        started, has exited, or answered with a line starting with ``error: ``.
    """
    global _tree_unpackers
    if not isinstance(it, six.text_type):
        return it
    argv_key = '/'.join(argv)
    while True:
        unpacker = _tree_unpackers.get(argv_key, None)
        if unpacker is False:
            # Failed to start once, no need to try again
            return [it, 'Error: Unpacker has crashed']
        if unpacker is None:
            # No process yet (or the previous one hit an IOError): start one.
            # stderr is merged into stdout, so diagnostics arrive as reply lines.
            try:
                unpacker = _tree_unpackers[argv_key] = subprocess.Popen(
                    argv, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
                )
                # Note: this registers the cleanup hook again on every successful launch.
                atexit.register(_kill_unpackers)
            except Exception as x:
                _tree_unpackers[argv_key] = False  # don't try to start it again
                return [it, 'Error: Failed to launch unpacker %s: %s' % (' '.join(argv), x)]
        try:
            if unpacker.poll() is not None:
                # The process has already exited: mark it dead for good.
                _tree_unpackers[argv_key] = False
                return [it, 'Error: Unpacker crashed']
            # Protocol: one UTF-8 line in (embedded newlines flattened to spaces),
            # one UTF-8 line out.
            unpacker.stdin.write((it.replace(u'\n', u' ').strip() + u'\n').encode('utf-8'))
            unpacker.stdin.flush()
            req = unpacker.stdout.readline().decode('utf-8').strip(u'\n')
            if req.startswith('error: '):
                return [it, 'Error: Invalid input: ' + req[7:]]
            # NOTE(review): an empty read (stdout at EOF) is returned as '' here.
            return req
        except IOError:
            # Forget the process so the next loop iteration launches a fresh one.
            _tree_unpackers[argv_key] = None  # restart on next iteration

def _unpack_tree(unpackrichtree, it, unpacked_trees):
    """Unpack a tree into its short and verbose forms, memoizing the result.

    :param unpackrichtree: pair of unpacker command lines; item 0 produces the
        ``'short'`` form and item 1 the ``'verbose'`` form.
    :param it: the packed tree, passed to ``_unpack`` as is.
    :param unpacked_trees: cache dict keyed by (joined verbose argv, tree);
        updated in place on a miss.
    :return: dict with the keys ``'short'`` and ``'verbose'``.
    """
    verbose_argv = unpackrichtree[1]
    cache_key = ('/'.join(verbose_argv), it)
    hit = unpacked_trees.get(cache_key)
    if hit is not None:
        return hit

    # The verbose output is kept as returned by the unpacker: decoding it with
    # json.loads(..., strict=False) was tried and disabled because it caused
    # timeouts.
    verbose_form = _unpack(verbose_argv, it)
    short_form = _unpack(unpackrichtree[0], it)

    entry = {'short': short_form, 'verbose': verbose_form}
    unpacked_trees[cache_key] = entry
    return entry


# Relev keys whose tree is also present as a field of some rule's response.
# postprocess() records a hash for these keys, and unpack_rule_fields() then
# replaces the rule-side field with a short placeholder that points at the
# relev entry instead of unpacking the same tree twice.
_TREES_IN_RELEV = {
    # relevname: (rule, field)
    u'qtree_business': (u'PPO', u'BusinessTree'),
    u'qtree_business_nogeo': (u'PPO', u'BusinessNoGeoTree'),
}


def logging_verbose(verbose, *args, **kwargs):
    """Emit a DEBUG record through the root logger, but only if *verbose* is truthy.

    Callers in this module write their templates with ``{}`` placeholders,
    which ``logging`` (``%``-style formatting) cannot render. When extra
    positional arguments are given, the template is therefore rendered with
    ``str.format`` first. If that changes nothing (no ``{}`` fields) or fails,
    the call is forwarded untouched so ``%``-style templates keep working.

    :param verbose: gate; nothing is logged when falsy.
    :param args: message template followed by its formatting arguments.
    :param kwargs: passed through to ``logging.debug`` (e.g. ``exc_info``).
    """
    if not verbose:
        return

    if len(args) > 1:
        template, params = args[0], args[1:]
        try:
            rendered = template.format(*params)
        except (AttributeError, IndexError, KeyError, ValueError):
            # Not a str.format template (or not a string at all).
            rendered = template
        if rendered != template:
            logging.debug(rendered, **kwargs)
            return

    logging.debug(*args, **kwargs)


def load_json_like_strings(text, verbose=False):
    """Decode JSON embedded in a string and put the result into a canonical order.

    Strings starting with ``[`` or ``{`` are decoded (best effort) with
    ``json.loads(..., strict=False)``:

    * a decoded list is sorted (by ``item['key']`` when the first item is a
      dict with that key, naturally otherwise; unsortable lists keep their
      order), every item is processed recursively, and a trailing
      ``{'__type__': 'string'}`` marker is appended;
    * a decoded dict has every value processed recursively (list values are
      sorted first when possible) and gets a ``'__type__': 'string'`` entry.

    A non-empty list argument is either turned into a dict keyed by each
    item's ``'Alias'`` (plus ``'__type__': 'list'``) when the first item is a
    dict with that key, or sorted in place when possible.

    Anything else, including strings that fail to decode, is returned as is.

    :param text: the value to normalize.
    :param verbose: enables debug logging through ``logging_verbose``.
    :return: the normalized value.
    """
    logging_verbose(verbose, "TEXT! Text type: `{}`", type(text))
    if isinstance(text, (str, u"".__class__)) and (text or '-')[0] in '[{':
        try:
            data = json.loads(text, strict=False)
        except Exception:
            # Best effort: the text merely looked like JSON. Fall through and
            # hand it back untouched.
            pass
        else:
            if isinstance(data, list):
                logging_verbose(verbose, "NOW WE SHOULD TRY TO SORT.1")
                if data:
                    if isinstance(data[0], dict) and 'key' in data[0]:
                        try:
                            data.sort(key=lambda x: x['key'])
                        except Exception:
                            pass
                    else:
                        try:
                            data.sort()
                        except Exception:
                            pass
                for i, v in enumerate(data):
                    data[i] = load_json_like_strings(v, verbose)
                data.append({'__type__': 'string'})
            elif isinstance(data, dict):
                # items() instead of the Python-2-only iteritems(); snapshot
                # the pairs because values are reassigned during the loop.
                for k, v in list(data.items()):
                    if isinstance(v, list):
                        logging_verbose(verbose, "NOW WE SHOULD TRY TO SORT.2")
                        try:
                            v = sorted(v)
                        except TypeError:
                            # Unorderable items (e.g. dicts on Python 3):
                            # keep the original order.
                            pass
                    data[k] = load_json_like_strings(v, verbose)
                data['__type__'] = 'string'
            return data

    if isinstance(text, list) and len(text) > 0:
        if isinstance(text[0], dict) and 'Alias' in text[0]:
            try:
                text = {
                    item['Alias']: item for item in text
                }
                text['__type__'] = 'list'
                return text
            except Exception:
                pass

        logging_verbose(verbose, "NOW WE SHOULD TRY TO SORT.3")
        try:
            text.sort()
        except Exception:
            logging_verbose(verbose, "TEXT! Value type: `{}`", type(text[0]))

    return text


def _unpack_cgi(text):
    """Split a CGI query string into a mapping of parameter name to value list.

    No URL-decoding is performed. A chunk without ``=`` gets an empty string
    as its value. Non-text arguments are returned untouched.

    :param text: ``name=value&name=value`` text.
    :return: ``defaultdict(list)`` of raw values in order of appearance.
    """
    if not isinstance(text, six.text_type):
        return text

    params = defaultdict(list)
    for pair in text.split('&'):
        name, _, value = pair.partition('=')
        params[name].append(value)
    # The per-name value lists are deliberately left unsorted: the order of
    # CGI arguments may matter.
    return params


def unpack_rule_fields(rules, relev_hashes, unpackrichtree, unpackreqbundle, unpacked_trees, verbose=False):
    """Clean up and unpack the per-rule section of a Wizard response, in place.

    Drops debug-only fields, unpacks packed trees and request bundles, decodes
    JSON that is embedded in string fields and removes raw fields that have a
    pretty-printed twin. Rules that are missing or mapped to ``None`` are
    treated as having no fields.

    :param rules: mapping of rule name to that rule's field dict; modified in place.
    :param relev_hashes: mapping of relev key to the hash of its packed tree for
        the keys listed in ``_TREES_IN_RELEV``; the matching rule fields are
        replaced by a short placeholder that points at the relev entry.
    :param unpackrichtree: pair of argv lists handed to ``_unpack_tree``, or a
        falsy value to leave packed trees alone.
    :param unpackreqbundle: argv handed to ``_unpack`` for request bundles, or
        a falsy value to leave bundles alone.
    :param unpacked_trees: cache dict shared between ``_unpack_tree`` calls.
    :param verbose: enables debug logging through ``logging_verbose``.
    """
    serialization = rules.get(u'BinaryTree', rules.get(u'Serialization', {})) or {}
    if u'Tree' in serialization:
        if u'Pretty' in serialization:
            del serialization[u'Tree']
        elif unpackrichtree:
            serialization[u'Tree'] = _unpack_tree(unpackrichtree, serialization[u'Tree'], unpacked_trees)

    for relev, (rule, field) in _TREES_IN_RELEV.items():
        # Avoid duplicating diffs in rrr and relevs, but still indicate there is diff.
        # Less json comparisons, less qtree unpacking, less global warming.
        response = rules.get(rule) or {}
        if field in response and relev in relev_hashes:
            response[field] = '(tree hash: %s) <see relev "%s">' % (hex(relev_hashes[relev]), relev)

    for rule, props in rules.items():
        if not isinstance(props, dict):
            # e.g. a rule mapped to None: nothing to clean up.
            continue
        props.pop(u'RuleRequest', None)
        props.pop(u'ReadableTree', None)

        for prop in list(props):
            if prop.endswith(u'_debug'):
                del props[prop]
            elif prop.startswith(u'qtree4'):
                if unpackrichtree:
                    props[prop] = _unpack_tree(unpackrichtree, props[prop], unpacked_trees)
            elif prop == u'wizqbundle' or (rule == u'Serialization' and prop == u'Bundle'):
                if unpackreqbundle:
                    unpacked = _unpack(unpackreqbundle, props[prop])
                    try:
                        props[prop] = json.loads(unpacked, strict=False)
                    except Exception:
                        props[prop] = unpacked
            else:
                logging_verbose(verbose, "load_json_like_strings CALLED for prop {}", prop)
                props[prop] = load_json_like_strings(props[prop], verbose)

    # Fields known to hold a packed tree.
    for rule, field in [(u'Afisha', 'QtreeSaas'),
                        (u'GeoRelev', u'AuxTree'),
                        (u'GeoRelev', u'ReqPhrase'),
                        (u'Misspell', u'FixedQTree'),
                        (u'ObjectIntent', u'Video'),
                        (u'QTreeFiller', u'filled_qtree'),
                        (u'Social', u'SOCIAL'),
                        (u'Video', u'VIDEO'),
                        (u'Video', u'VIDEOHOSTING'),
                        (u'Video', u'VIDEOSERIAL')]:
        response = rules.get(rule) or {}
        if unpackrichtree and field in response:
            response[field] = _unpack_tree(unpackrichtree, response[field], unpacked_trees)

    misspell = rules.get(u'Misspell') or {}
    if u'FixedRelev' in misspell:
        misspell[u'FixedRelev'] = _parse_relev(misspell[u'FixedRelev'])

    for rule, field in [(u'DataDrivenClassifiersWizard', u'Debug'),
                        (u'GeoAddr', u'Body'),
                        (u'GeoAddr', u'UnfilteredAnswer'),
                        (u'Gzt', u'Body'),
                        (u'Report', u'Body'),
                        (u'Report', u'Rules'), ]:
        # Parse all list values as json
        response = rules.get(rule) or {}
        src = response.get(field, None)
        if not isinstance(src, list):
            continue

        results = []
        for value in src:
            try:
                # strict=False belongs to json.loads; passing it to
                # list.append made every value fall into the except branch.
                d = json.loads(value, strict=False)
                d['__type__'] = 'string'
            except Exception:
                # Not JSON, or not a JSON object: keep the raw value.
                results.append(value)
            else:
                results.append(d)
        response[field] = results

    for rule, field, pretty_field in [(u'EntityFinder', u'MatchesExport', u'MatchesExportDebug'),
                                      (u'EntitySearch', u'QueryObject', u'QueryObjectClassifierResults'),
                                      (u'MobileApps', u'qtree4mobapps', u'ReadableTree'),
                                      (u'QueryFactors', u'Factors', u'PlainTextFactors'), ]:
        # remove some fields if there are pretty-printed duplicate fields in the answer
        # NOTE(review): u'ReadableTree' is popped from every rule above, so the
        # MobileApps entry can never match at this point -- confirm the intent.
        response = rules.get(rule) or {}
        if pretty_field in response:
            response.pop(field, None)

    report = rules.get(u'Report', {}) or {}  # sometimes rules.get(u'Report', {}) gives us None
    for b in report.get(u'Body', []) or []:
        if not isinstance(b, list):
            continue
        for bb in b:
            try:
                d = json.loads(bb[u'value'], strict=False)
                d['__type__'] = 'string'
                bb[u'value'] = d
            except Exception:
                pass
    for r in report.get(u'Rules', []) or []:
        # Only mappings have u'Tokens'; the old `isinstance(r, list)` check
        # skipped them and would have crashed on `.get` for real lists.
        if not isinstance(r, dict):
            continue
        for t in r.get(u'Tokens', []) or []:
            try:
                d = json.loads(t[u'Body'][u'Value'], strict=False)
                d['__type__'] = 'string'
                t[u'Body'][u'Value'] = d
            except Exception:
                pass

    (rules.get(u'LingBoost') or {}).pop('CacheHit', None)


def postprocess(resp, unpackrichtree, unpackreqbundle, rules=(), verbose=False):
    '''
        Remove redundant and unnecessary data from a Wizard response, unpack (almost) all
        qtrees, and erase the results of any rules except the ones we're interested in.

        The response dict is modified in place and also returned.

        :param resp: decoded Wizard response (a dict).
        :param unpackrichtree: argv for a tool that converts base64-encoded qtrees to readable strings
        :param unpackreqbundle: argv for a tool that converts base64-encoded reqbundles to readable strings
        :param rules: either a set of rules of interest, or an empty set (meaning "all").
        :param verbose: enables debug logging through ``logging_verbose``.
        :return: the same ``resp`` object.

    '''
    logging_verbose(verbose, "POSTPROCESS STARTED.")
    logging_verbose(verbose, 'Length of responses before all: {}', len(resp))
    logging_verbose(verbose, 'Responses keys: {}', resp.keys())
    resp.pop(u'meta', None)
    resp.pop(u'markup', None)
    resp.pop(u'begemot_nodes', None)
    resp.pop(u'deserialized_qtree', None)
    relev_hashes = {}
    unpacked_trees = {}

    if u'rules' in resp:
        if rules:
            resp[u'rules'] = {k: v for k, v in resp[u'rules'].items() if k in rules}
        else:
            resp[u'rules'].pop(u'DebugTree', None)
            resp[u'rules'].pop(u'DebugRequest', None)
            resp[u'rules'].pop(u'.version', None)
    # Fall back to an empty mapping so the membership test below cannot hit
    # None when the response carries no u'rules' at all.
    resp_rules = resp.get(u'rules') or {}

    for k in (u'relev', u'rearr', u'snip'):
        if k in resp:
            resp[k] = _parse_relev(resp[k])
            if not unpackrichtree:
                continue
            for key in resp[k]:
                if key in ['auxq', 'regphrase', 'morehl'] or key.startswith('qtree') or key.endswith('qtree'):
                    if key in _TREES_IN_RELEV and _TREES_IN_RELEV[key][0] in resp_rules:
                        # Remember the packed tree's hash so the duplicate in
                        # the rule response can be replaced by a reference.
                        # NOTE(review): str hashes differ between Python 3
                        # processes unless PYTHONHASHSEED is fixed -- confirm
                        # that this is acceptable for the consumers.
                        relev_hashes[key] = hash(resp[k][key])
                    resp[k][key] = _unpack_tree(unpackrichtree, resp[k][key], unpacked_trees)

    if resp_rules:
        logging_verbose(verbose, "unpack_rule_fields STARTED.")
        unpack_rule_fields(resp_rules, relev_hashes, unpackrichtree, unpackreqbundle, unpacked_trees, verbose)

    if u'log' in resp:
        try:
            resp[u'log'] = json.loads(resp[u'log'], strict=False)
        except Exception:
            # Not JSON: keep the raw log text.
            pass

    if u'wizarded_request' in resp:
        resp[u'qtree'] = resp.pop(u'wizarded_request')
    elif u'qtree' in resp and unpackrichtree:
        resp[u'qtree'] = _unpack_tree(unpackrichtree, resp[u'qtree'], unpacked_trees)

    if u'eventlog' in resp:
        for entry in resp[u'eventlog']:
            entry.pop(u'Timestamp', None)

    if 'global_ctx' in resp:
        statid = resp['global_ctx'].get('stat-block-id')
        if statid:
            # Replace the per-response id with a constant.
            resp['global_ctx']['stat-block-id'] = "some unique id"

    if 'content' in resp:
        resp['content'] = _unpack_cgi(resp['content'])

    logging_verbose(verbose, 'Length of responses after all: {}'.format(len(resp)))
    return resp


def parse_query(req, extra_params=None, maybe_full=True):
    '''Given a line of a printwzrd request list, construct an HTTP path + query.

    Thin wrapper around ``printwizard3.printwizard.parse_query``, imported
    lazily at call time.

    :param req: one request line.
    :param extra_params: extra query parameters; ``None`` (the default) means a
        fresh empty dict, so no dict object is shared between calls.
    :param maybe_full: forwarded unchanged to the wrapped function.
    :return: whatever the wrapped ``parse_query`` returns.
    '''
    from printwizard3.printwizard import parse_query as _parse_query

    if extra_params is None:
        extra_params = {}
    return _parse_query(req, extra_params, maybe_full)


def parse_queries(reqs, extra_params=None):
    """Parse every request line of *reqs* with ``parse_query``.

    Trailing newlines are stripped; empty lines and comment lines starting
    with ``@`` are skipped.

    :param reqs: iterable of request lines.
    :param extra_params: extra query parameters forwarded to ``parse_query``;
        ``None`` (the default) means a fresh empty dict, so no dict object is
        shared between calls.
    :return: generator of ``(request line, path, rules)`` tuples.
    """
    if extra_params is None:
        extra_params = {}
    for req in reqs:
        req = req.rstrip('\n')
        if not req or req.startswith('@'):
            continue
        path, rules = parse_query(req, extra_params)
        yield req, path, rules


def extra_cgi_parameters(service):
    """Return the preset extra CGI parameter string for *service*.

    :param service: service name; ``'default'``, ``'geo'`` and ``'geosearch'``
        have presets.
    :return: the preset query-string fragment, or ``''`` for any other name.
    """
    presets = {
        'default': 'wizextra=verifyalltrees=da',
        'geo': 'geoaddr_geometa=1&geoaddr_debug=1&wizextra=geoaddr_geometa=1&wizextra=geoaddr_debug=1',
        'geosearch': 'wizextra=verifyalltrees=da&rn=Geosearch&geoaddr_geometa=1&wizextra=geoaddr_geometa=1',
    }
    try:
        return presets[service]
    except KeyError:
        return ''


def _mp_remote(req, session=requests.Session()):
    """Fetch one request from a remote Wizard over HTTP and post-process the reply.

    :param req: tuple ``(host, global_rules, unpackrichtree, unpackreqbundle,
        (text, url, rules))``.
    :param session: ``requests`` session used for the GET; the default object
        is created once, when the function is defined.
    :return: ``(text, result)`` where ``text`` is the request line as text and
        ``result`` is the post-processed response, or a dict describing the
        failure (empty body, HTTP error, or any exception raised on the way).
    """
    host, global_rules, unpackrichtree, unpackreqbundle, query = req
    text, url, rules = query
    if not isinstance(text, unicode):
        text = text.decode('utf-8')

    response = None
    try:
        response = session.get(u'http://{}{}'.format(host, url))
        response.raise_for_status()
        if response.text:
            payload = response.json()
            wanted_rules = set(rules) | set(global_rules)
            return text, postprocess(payload, unpackrichtree, unpackreqbundle, wanted_rules)
        return text, {u'error': u'empty response'}
    except Exception as exc:
        failure = {}
        if response is not None:
            # A response did arrive: report its body and status as well.
            failure[u'error'] = unicode(response.text)
            failure[u'HTTP code'] = response.status_code
        failure[u'Python exception'] = u'{0.__class__.__name__}: {0}'.format(exc)
        return text, failure


def printwzrd(host, reqs, extra_params=None, global_rules=[], pool=None, unpackrichtree=None, unpackreqbundle=None):
    """Send every request of *reqs* to the Wizard at *host* and post-process the replies.

    :param host: ``host:port`` of the remote Wizard.
    :param reqs: iterable of printwzrd request lines.
    :param extra_params: extra query string, parsed with ``parse_qs``; falsy means none.
    :param global_rules: rules of interest that apply to every request.
    :param pool: optional object with an ``imap_unordered`` method; when given,
        the work is distributed through it (in chunks of 10) and results come
        back in arbitrary order.
    :param unpackrichtree: forwarded to ``postprocess``.
    :param unpackreqbundle: forwarded to ``postprocess``.
    :return: iterable of ``(request text, processed response)`` pairs.
    """
    if extra_params:
        extra_params = parse_qs(extra_params)
    else:
        extra_params = {}

    jobs = zip(
        itertools.repeat(host),
        itertools.repeat(global_rules),
        itertools.repeat(unpackrichtree),
        itertools.repeat(unpackreqbundle),
        parse_queries(reqs, extra_params)
    )
    if not pool:
        return map(_mp_remote, jobs)
    return pool.imap_unordered(_mp_remote, jobs, chunksize=10)


def _mp_apphost(arg):
    """Post-process one apphost request/response pair.

    :param arg: ``(index, (req, resp, kwargs))`` where ``req`` and ``resp`` are
        JSON texts and ``kwargs`` are keyword arguments for ``postprocess``.
    :return: ``(head, result)``; ``head`` is ``'#<index> | <text>'`` with the
        text of the last ``report`` item found in the request, ``result`` is
        the post-processed response or an ``{'error': ...}`` dict when the
        response is not valid JSON or does not consist of exactly one item.
    """
    index, (req, resp, kwargs) = arg

    text = None
    for source in json.loads(req):
        results = source['results']
        if not isinstance(results, list):
            # A single result is wrapped so both shapes are walked the same way.
            results = source['results'] = [results]
        for item in results:
            if item.get('type') == 'report':
                text = item.get('text')

    head = u'#{} | {}'.format(index, text)
    try:
        items = json.loads(resp)
    except ValueError:
        return head, {u'error': resp}
    if len(items) == 1:
        return head, postprocess(items[0], **kwargs)
    return head, {u'error': u'expected a json mode response consisting of one item'}


def postprocess_apphost(reqs, responses, pool=None, **kwargs):
    """Post-process apphost request/response pairs, numbering them from 1.

    :param reqs: iterable of request JSON texts.
    :param responses: iterable of response JSON texts, paired with *reqs* by position.
    :param pool: optional object with an ``imap_unordered`` method; when given,
        the work is distributed through it (in chunks of 10) and results come
        back in arbitrary order.
    :param kwargs: keyword arguments forwarded to ``postprocess`` for every pair.
    :return: iterable of ``(head, result)`` pairs as produced by ``_mp_apphost``.
    """
    numbered_jobs = enumerate(zip(reqs, responses, itertools.repeat(kwargs)), 1)
    if not pool:
        return map(_mp_apphost, numbered_jobs)
    return pool.imap_unordered(_mp_apphost, numbered_jobs, chunksize=10)


if __name__ == '__main__':
    # Command-line entry point: read printwzrd requests from stdin, query the
    # given Wizard instance and dump {request: processed response} as JSON to
    # stdout.
    import argparse

    cli = argparse.ArgumentParser(description='Obtain Wizard responses to printwzrd requests.')
    cli.add_argument('host', metavar='host:port', help='address of a remote Wizard instance')
    cli.add_argument('--cgi', help='extra query parameters')
    cli.add_argument('--rules', help='comma-separated names of rules to print the results of')
    cli.add_argument('--unpack', help='path to tools/unpackrichtree executable')
    args = cli.parse_args()

    selected_rules = []
    if args.rules:
        selected_rules = args.rules.split(',')

    tree_unpackers = None
    if args.unpack:
        # Two command lines for the same tool: item 0 yields the short form,
        # item 1 the verbose form (see _unpack_tree).
        tree_unpackers = [[args.unpack, '-urrl'], [args.unpack, '-us']]

    responses = printwzrd(
        args.host,
        sys.stdin,
        extra_params=args.cgi,
        global_rules=selected_rules,
        unpackrichtree=tree_unpackers,
    )
    json.dump(dict(responses), sys.stdout)
