import sys
import json
import hashlib
import base64

'''
Facts retriever for YT
receives settings as JSON, either base64-encoded or as a quoted string
*
|-ensureAscii (boolean, optional, false by default.
|              If true, all unicode strings are forced to ascii representation)
|-encoding (string, optional, default utf-8)
|-fields (JSON-list, required)
  |-type (string, required)
  |-sources (string list, optional.
  |          If missing or empty, all sources are processed)
  |-hash (string list, optional, contains list of keys to be hashed.
          Can contain inner data's key (e.g. "data.inner_key").
          Data fields will be finally dumped to string)

for example:

python retrieve_facts.py eyJmaWVsZHMiOlt7InR5cGUiOiJfcWEtc2lnbmF0dXJlIiwic291cmNlcyI6WyJtYWlsIl0sImhhc2giOlsiZXh0cmFjdF90aW1lIiwiZGF0YS5TaWduYXR1cmUuRmlvIl19XX0=
python retrieve_facts.py '{"ensureAscii": true, "encoding": "utf-8", "fields": [{"type": "_qa-signature", "sources": ["mail"],  "hash": ["extract_time", "data.Signature.Fio"]}]}'

'''


def do_decode(lines):
    """Lazily parse an iterable of JSON texts, yielding one value per item.

    Items are parsed only as the generator is consumed; any error raised by
    ``json.loads`` for an item propagates to the consumer.
    """
    parse = json.loads
    for raw_line in lines:
        parsed = parse(raw_line)
        yield parsed


def do_map(rows, settings):
    encoding = settings.get('encoding', 'utf-8')
    ensure_ascii = settings.get('ensureAscii', False)

    def hash_func(data):
        if isinstance(data, unicode):
            data = data.encode(encoding)
        if not isinstance(data, str):
            data = str(data)
        return hashlib.sha256(data).hexdigest()

    def hash_field_in_place(source, field):
        if field in source:
            source[field] = hash_func(source[field])

    field_sets_with_data = [True in [key.startswith('data.')
                                     for key in field_settings['hash']]
                            for field_settings in settings['fields']]

    for row in rows:
        for field_settings, hash_data in zip(settings['fields'],
                                             field_sets_with_data):
            if field_settings['type'] != row.get('type'):
                continue

            sources = field_settings.get('sources', [])
            if sources and row['source'] not in sources:
                continue

            if hash_data and 'data' in row:
                try:
                    row['data'] = json.loads(row['data'])
                except ValueError:
                    continue

            for key in field_settings.get('hash', []):
                if key.startswith('data.') and 'data' in row:
                    data_key = key[5:]
                    hash_field_in_place(row['data'], data_key)
                else:
                    hash_field_in_place(row, key)

            if hash_data and 'data' in row:
                row['data'] = json.dumps(row['data'],
                                         ensure_ascii=ensure_ascii,
                                         encoding=encoding)

            result = json.dumps(row,
                                ensure_ascii=ensure_ascii, encoding=encoding)
            if (isinstance(result, unicode)):
                result = result.encode(encoding)
            print result


def load_settings(raw):
    """Parse the settings argument given as base64-encoded or plain JSON.

    Base64 is tried first.  ``base64.b64decode`` discards characters outside
    the base64 alphabet, so a plain-JSON argument can "decode" into garbage
    without raising; the resulting JSON error must therefore trigger the
    plain-JSON fallback as well, not only the decoding error itself.

    Args:
        raw: the settings string taken from the command line.

    Returns:
        The decoded JSON value.

    Raises:
        ValueError: if ``raw`` is neither base64-encoded JSON nor JSON.
    """
    try:
        return json.loads(base64.b64decode(raw))
    except (TypeError, ValueError):
        # TypeError: bad base64 padding on Python 2.
        # ValueError: decoded bytes are not JSON (also covers binascii.Error
        # on interpreters where it subclasses ValueError).
        return json.loads(raw)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.stderr.write('usage: %s <settings as JSON or base64 JSON>\n'
                         % sys.argv[0])
        sys.exit(1)

    try:
        settings = load_settings(sys.argv[1])
    except ValueError:
        settings = None

    # Settings must be a JSON object carrying the required 'fields' list.
    if not isinstance(settings, dict) or 'fields' not in settings:
        sys.stderr.write('wrong settings format\n')
        sys.exit(1)

    # Rows are decoded lazily from stdin, one JSON document per line.
    do_map(do_decode(sys.stdin), settings)
