#! /usr/bin/env python2.7
# -*- coding: utf8 -*-
import collections
import json
import sys
import time
import traceback
import urllib
import urlparse
import threading
import re

import requests

# Python 2 only: site.py deletes sys.setdefaultencoding at interpreter start-up, so the
# module is reloaded to get the function back. UTF-8 then becomes the codec used for
# implicit str <-> unicode conversions (e.g. concatenating JSON text with request lines).
reload(sys)
sys.setdefaultencoding('utf-8')

from broken_guids import BROKEN_GROUP_IDS
from broken_paths import BROKEN_PATHS

# Base URLs of the two backends whose responses are compared; the request URI read from
# the input is appended to each of them in get_data().
MPFS_HOST = "http://mpfs.disk.yandex.net"
DJFS_HOST = "http://localhost:25699/api/legacy"

# Headers sent with every request issued by get_data().
HEADERS = {'Content-Type': 'application/json'}

# Per-endpoint set of meta field names. get_data() requests exactly these fields when the
# URI carries no explicit (or an empty) `meta` parameter, and same() ignores every meta
# field that is not listed for the endpoint being checked.
# NOTE(review): same() has a branch for 'office_online_url' while the 'info' set lists
# 'online_office_url' -- confirm which spelling is intended.
_supported_meta = {'info': {'md5', 'mimetype', 'fs_symbolic_link', 'visible', 'empty', 'group', 'file_id', 'short_url', 'angle',
                                               'mediatype', 'photoslice_album_type', 'width', 'albums_exclusions', 'video_info', 'aesthetics',
                                               'pmid', 'height', 'alias_enabled', 'resource_id', 'sha256', 'hasfolders', 'numfolders', 'numfiles',
                                               'drweb', 'etime', 'size', 'sizes', 'custom_preview', 'file_url', 'media_type', 'public', 'file_mid',
                                               'storage_type', 'photoslice_time', 'versioning_status', 'fotki_data_stid', 'fotki_data_url',
                                               'note_name', 'note_revision_created', 'note_revision_deleted', 'blocked', 'custom_properties',
                                               'download_counter', 'views_counter', 'page_blocked_items_num', 'comment_ids', 'blockings', 'revision',
                                               'public_hash', 'append_time', 'original_id', 'folder_type', 'numchildren', 'numfolders', 'online_office_url',
                                               'digest_mid', 'numfiles', 'fs_symbolic_link', 'shared_rights', 'with_shared', 'original_parent_id', 'version|urn:yandex:disk:dist',
                                               'office_online_editor_type', 'digest_url', 'fotki_image_id', 'hid'},
                   'bulk': {'md5', 'mimetype', 'fs_symbolic_link', 'visible', 'empty', 'group', 'file_id', 'short_url', 'angle',
                            'mediatype', 'photoslice_album_type', 'width', 'albums_exclusions', 'video_info', 'aesthetics',
                            'pmid', 'height', 'alias_enabled', 'resource_id', 'sha256', 'hasfolders', 'numfolders', 'numfiles',
                            'drweb', 'etime', 'size', 'sizes', 'custom_preview', 'file_url', 'media_type', 'public', 'file_mid',
                            'storage_type', 'photoslice_time', 'versioning_status', 'fotki_data_stid', 'fotki_data_url',
                            'note_name', 'note_revision_created', 'note_revision_deleted', 'blocked', 'custom_properties',
                            'download_counter', 'views_counter', 'page_blocked_items_num', 'comment_ids', 'blockings', 'revision',
                            'public_hash', 'append_time', 'original_id', 'folder_type', 'numchildren', 'autouploaded'},
                   'public': {'drweb', 'mediatype', 'mimetype', 'short_url', 'size', 'sizes', 'file_id', 'comment_ids',
                              'views_counter', 'download_counter', 'blockings', 'page_blocked_items_num'}}

# Per-endpoint whitelist of query parameter names. get_data() skips (returns None for) any
# request to a listed endpoint whose query string contains a parameter outside this set.
_supported_parameters = {'public': {'private_hash', 'uid', 'sort', 'order', 'preview_size', 'preview_crop',
                                    'preview_quality', 'preview_allow_big_size', 'amount', 'offset', 'meta', 'increment_views'}}

# Guards every write to (and the closing of) the shared error log below.
_errors_write_lock = threading.Lock()
# Opened in append mode as a side effect of importing this module; written by print_diff().
_errors_log_file = open('errors.log', 'a')


def print_diff(key, mpfs, djfs, offset, uri, is5xx=False, body=None):
    print 'meta', key
    print json.dumps(mpfs, indent=2)
    print json.dumps(djfs, indent=2)
    with _errors_write_lock:
        _errors_log_file.write(str(offset) + '\n')
        _errors_log_file.write(uri + '\n')
        if body:
            _errors_log_file.write(body + '\n')
        if is5xx:
            _errors_log_file.write('probably just a 5xx\n\n')
            _errors_log_file.flush()
            return
        _errors_log_file.write('meta ' + key + '\n')
        _errors_log_file.write('MPFS: ' + json.dumps(mpfs, indent=2) + '\n')
        _errors_log_file.write('DJFS: ' + json.dumps(djfs, indent=2) + '\n\n')
        _errors_log_file.flush()



def closelog():
    """Close the shared error log while holding the write lock."""
    _errors_write_lock.acquire()
    try:
        _errors_log_file.close()
    finally:
        _errors_write_lock.release()

def get_data(host, uri, endpoint, body=None):
    url = host + uri
    if endpoint in _supported_parameters:
        supported_parameters = _supported_parameters[endpoint]
        u = urlparse.urlparse(url)
        param_names = [param_pair.split('=')[0] for param_pair in u.query.split('&')]
        for param_name in param_names:
            if param_name not in supported_parameters:
                print param_name, param_names, supported_parameters, uri
                print 'Skipped'
                return None
    if endpoint in _supported_meta and (not 'meta' in uri or uri.strip().endswith('meta=') or 'meta=&' in uri):
        uri = uri.replace('meta=', '').strip()
        if not uri.endswith('&'):
            uri = uri + '&'
        uri = uri + 'meta=' + ','.join(_supported_meta[endpoint])
        url = host + uri

    response = requests.post(url, data=body, headers=HEADERS) if body else requests.get(url,  headers=HEADERS)
    return response


def rewrite_url(url):
    """Split a URL into a dict of normalized parts that can be compared directly.

    The result has the keys 'scheme', 'host', 'path' and 'params'. Query parameters
    are sorted and percent-decoded; `owner_uid` is dropped and '+' is treated like
    '%20'. In the path '%3D' becomes '=' and '*' becomes '%2A'.
    """
    parsed = urlparse.urlparse(url)

    # '+' and '%20' both mean a space, so unify them before sorting and decoding.
    raw_pairs = sorted(parsed.query.replace('+', '%20').split('&'))
    # owner_uid is deliberately left out of the comparison.
    params = [urllib.unquote(pair) for pair in raw_pairs if not pair.startswith('owner_uid=')]

    normalized_path = parsed.path.replace('%3D', '=').replace('*', '%2A')
    return {
        'scheme': parsed.scheme,
        'host': parsed.netloc,
        'path': normalized_path,
        'params': params
    }

def sizes_to_dict(sizes):
    """Turn a list of {'name': ..., 'url': ...} entries into a name -> url mapping."""
    return {entry['name']: entry['url'] for entry in sizes}


def same_beauty(mpfs, djfs, offset, uri, body):
    """Compare two `aesthetics` scores with an absolute tolerance of 0.001.

    mpfs, djfs        -- numeric scores from the two backends; either may be None
                         when the field is absent
    offset, uri, body -- request context forwarded to print_diff()

    Returns True when the scores match (or both are missing), otherwise logs the
    difference through print_diff() and returns False.
    """
    if mpfs is None or djfs is None:
        # Subtracting None would raise TypeError and the mismatch would never reach
        # the error log; a score missing on one side only is a real difference.
        if mpfs is None and djfs is None:
            return True
        print_diff('aesthetics', mpfs, djfs, offset, uri, False, body)
        return False

    if abs(mpfs - djfs) > 0.001:
        print_diff('aesthetics', mpfs, djfs, offset, uri, False, body)
        return False
    return True


def same_urls(key, from_trash, mpfs_url, djfs_url, offset, uri, body, mpfs_media_type=None):
    """Return True when two generated URLs are equivalent after normalization.

    key               -- label used when a difference is logged
    from_trash        -- truthy to apply the relaxed trash rules: `filename` and
                         `media_type` parameters are ignored and the third
                         '/'-separated path segment is taken from the MPFS URL
    mpfs_url,djfs_url -- the URLs to compare; an empty/missing one is a difference
    offset, uri, body -- request context forwarded to print_diff()
    mpfs_media_type   -- when given and the MPFS URL has a `media_type` parameter,
                         that parameter is replaced by this value before comparing

    Every difference is logged through print_diff() before False is returned.
    """
    if not mpfs_url or not djfs_url:
        print_diff(key, mpfs_url, djfs_url, offset, uri, body=body)
        return False

    def drop_params(parts, prefix):
        # Remove every query parameter starting with `prefix` (keeps a plain list).
        parts['params'] = [p for p in parts['params'] if not p.startswith(prefix)]

    m = rewrite_url(mpfs_url)
    d = rewrite_url(djfs_url)

    # hack filename and mediatype parameter for trash links
    if from_trash:
        drop_params(d, 'filename=')
        drop_params(m, 'filename=')
        m_parts = m['path'].split('/')
        d_parts = d['path'].split('/')
        # Only segment 2 is borrowed from MPFS. The previous code joined the MPFS
        # segments here, which replaced the whole DJFS path and made the path
        # comparison below a no-op for trash links.
        if len(m_parts) > 2 and len(d_parts) > 2:
            d_parts[2] = m_parts[2]
            d['path'] = '/'.join(d_parts)

        drop_params(d, 'media_type=')
        drop_params(m, 'media_type=')

    # hack check media_type, if mpfs doesn't resolve it
    if 'media_type=unknown' in m['params']:
        drop_params(d, 'media_type=')
        drop_params(m, 'media_type=')

    # if mpfs use in file_url incorrect MediaType, other than mediatype field, replace. only for file_url
    if mpfs_media_type and any(p.startswith('media_type=') for p in m['params']):
        drop_params(m, 'media_type=')
        m['params'].append('media_type=' + mpfs_media_type)
        m['params'] = sorted(m['params'])

    for part in ('scheme', 'host', 'params'):
        if m[part] != d[part]:
            print_diff(key, m, d, offset, uri, False, body)
            return False

    # hack paths in download links, which responds for time
    if m['host'] == 'downloader.disk.yandex.ru':
        m_parts = m['path'].split('/')
        d_parts = d['path'].split('/')
        # Segment 3 is parsed as a hexadecimal number; values less than 5 apart are
        # treated as equal and segments 2 and 3 are then both taken from MPFS.
        if (len(m_parts) > 3 and len(d_parts) > 3 and m_parts[3] != d_parts[3]
                and abs(int(d_parts[3], 16) - int(m_parts[3], 16)) < 5):
            d_parts[3] = m_parts[3]
            d_parts[2] = m_parts[2]
            d['path'] = '/'.join(d_parts)

    if m['path'] != d['path']:
        print_diff(key, m, d, offset, uri, body=body)
        return False

    return True


def same_group(mpfs_group, djfs_group, offset, uri, body):
    """Compare the `group` meta dictionaries of the two backends.

    mpfs_group, djfs_group -- dicts (either may be None/empty)
    offset, uri, body      -- request context forwarded to print_diff()

    Only keys present in `mpfs_group` are inspected. `user_count` is never
    compared, `gid` is skipped when either value is listed in BROKEN_GROUP_IDS and
    a pair of falsy values counts as equal. `owner` is compared field by field
    (again treating two falsy values as equal). Any other differing key is logged
    through print_diff(). Returns True when no difference was found.
    """
    isOk = True

    for key, mpfs_value in (mpfs_group or {}).items():
        if key == 'user_count':
            continue
        djfs_value = djfs_group.get(key) if djfs_group else None
        if key == 'gid' and (djfs_value in BROKEN_GROUP_IDS or mpfs_value in BROKEN_GROUP_IDS):
            continue
        if not mpfs_value and not djfs_value:
            continue
        if djfs_value == mpfs_value:
            continue

        if key == 'owner' and djfs_value and mpfs_value:
            for owner_key in mpfs_value:
                mpfs_owner_value = mpfs_value.get(owner_key)
                djfs_owner_value = djfs_value.get(owner_key)
                if not mpfs_owner_value and not djfs_owner_value:
                    continue
                if mpfs_owner_value != djfs_owner_value:
                    isOk = False
                    print_diff('group ' + key, mpfs_value, djfs_value, offset, uri, body=body)
                    break
        else:
            # Reached for an owner missing on one side and for every other differing
            # key. Non-owner mismatches used to be dropped without any report, which
            # also made the gid / BROKEN_GROUP_IDS exemption above meaningless.
            print_diff('group ' + key, mpfs_value, djfs_value, offset, uri, body=body)
            isOk = False

    return isOk


def filter_broken_paths(resp_array, uid):
    """Return the items of `resp_array` whose path is not under a known-broken prefix.

    An item is dropped when it has a 'path' key and '<uid>:<path>' starts with any
    entry of BROKEN_PATHS; items without a 'path' key are always kept.
    """
    def is_broken(item):
        if 'path' not in item:
            return False
        for prefix in BROKEN_PATHS:
            if (uid + ':' + item.get('path')).startswith(prefix):
                return True
        return False

    return [item for item in resp_array if not is_broken(item)]


def same(mpfs_l, djfs_l, uid, offset, uri, endpoint, body=None):
    isOk = True

    mpfs_l = mpfs_l if isinstance(mpfs_l, list) else [mpfs_l]
    djfs_l = djfs_l if isinstance(djfs_l, list) else [djfs_l]

    mpfs_l = filter_broken_paths(mpfs_l, uid) if uid else mpfs_l
    djfs_l = filter_broken_paths(djfs_l, uid) if uid else djfs_l

    if len(mpfs_l) != len(djfs_l):
        print_diff('length', len(mpfs_l), len(djfs_l), offset, uri, body=body)
        return


    for mpfs, djfs in zip(mpfs_l, djfs_l):
        mpfs = mpfs['resource'] if 'resource' in mpfs else mpfs
        djfs = djfs['resource'] if 'resource' in djfs else djfs
        djfs_path = djfs.get('path')
        mpfs_path = mpfs.get('path')
        d_resource_id = djfs['meta']['resource_id'] if 'meta' in djfs and 'resource_id' in djfs['meta'] else None
        m_resource_id = mpfs['meta']['resource_id'] if 'meta' in mpfs and 'resource_id' in mpfs['meta'] else None

        if djfs_path != mpfs_path and d_resource_id and d_resource_id == m_resource_id:
            # print "files m=" + mpfs_path + ' and d=' + djfs_path + ' have different paths, but same resource_id ' + d_resource_id
            continue

        from_trash = djfs_path and djfs_path.startswith('/trash')
        supported_meta = _supported_meta[endpoint] if endpoint in  _supported_meta else []

        for key, mpfs_value in mpfs.items():
            djfs_value = djfs.get(key)
            if (key == 'etime' or key == 'mtime') and int(djfs_value) < 0 and int(mpfs_value) < 0:
                continue

            if (key == 'utime' or key == 'ctime' or key == 'etime' or key == 'mtime') and abs(
                    int(djfs_value) - int(mpfs_value)) <= 1:
                continue

            if mpfs_value != djfs_value:
                if key == 'meta':
                    if 'sizes' in mpfs['meta']:
                        mpfs['meta']['sizes'] = sizes_to_dict(mpfs['meta']['sizes'])
                        djfs['meta']['sizes'] = sizes_to_dict(djfs['meta']['sizes']) if 'sizes' in djfs['meta'] else {}
                    mpfs_media_type = mpfs['meta'].get('mediatype', None)
                    for meta_key, mpfs_meta_value in mpfs_value.items():
                        if meta_key not in supported_meta:
                            continue
                        djfs_meta_value = djfs_value.get(meta_key)
                        if meta_key == 'sizes':
                            for size, mpfs_url in djfs_meta_value.items():
                                djfs_url = djfs_meta_value.get(size)
                                isOk = isOk and same_urls('size.' + size, from_trash, mpfs_url, djfs_url, offset, uri, body)
                        elif meta_key == 'user':
                            continue
                        elif meta_key == 'custom_preview':
                            isOk = isOk and same_urls('custom_preview', from_trash, mpfs_meta_value,
                                                      djfs_meta_value, offset, uri, body)
                        elif meta_key == 'office_online_url':
                            isOk = isOk and same_urls('office_online_url', False, mpfs_meta_value,
                                                      djfs_meta_value, offset, uri, body)
                        elif meta_key == 'group':
                            isOk = isOk and same_group(mpfs_meta_value, djfs_meta_value, offset, uri, body)
                        elif meta_key == 'fotki_data_url':
                            isOk = isOk and same_urls(meta_key, lambda x: x, mpfs_meta_value, djfs_meta_value, offset, uri, body)
                        elif meta_key == 'file_url':
                            isOk = isOk and same_urls(meta_key, from_trash, mpfs_meta_value, djfs_meta_value,
                                                      offset, uri, body, mpfs_media_type)
                        elif meta_key == 'digest_url':
                            isOk = isOk and same_urls(meta_key, from_trash, mpfs_meta_value, djfs_meta_value,
                                                      offset, uri, body, mpfs_media_type)
                        elif meta_key == 'aesthetics':
                            isOk = isOk and same_beauty(mpfs_meta_value, djfs_meta_value, offset, uri, body)
                        elif meta_key == 'autouploaded' or meta_key == 'comment_ids':
                            continue
                        elif meta_key == 'alias_enabled':
                            isOk = isOk and int(djfs_meta_value) == int(mpfs_meta_value)
                        elif (meta_key == 'media_type' or meta_key == 'mediatype') \
                                and mpfs_meta_value == 'unknown' and djfs_meta_value != 'unknown':
                            continue
                        elif (meta_key == 'media_type' or meta_key == 'mediatype') and from_trash:
                            continue
                        elif (meta_key == 'etime' or meta_key == 'photoslice_time') \
                                and int(djfs_meta_value) < 0 and int(mpfs_meta_value) < 0:
                            continue
                        elif meta_key == 'total_results_count':
                            continue
                        elif meta_key == 'shared_rights' and 'with_shared' not in mpfs_value:
                            continue
                        elif meta_key == 'file_id' and mpfs_path and is_root(mpfs_path):
                            continue
                        elif meta_key == 'views_counter' and endpoint == 'public' and djfs_meta_value - mpfs_meta_value < 5:
                            continue
                        elif meta_key == 'blockings' and endpoint == 'public' and check_blockings(mpfs_meta_value, djfs_meta_value, offset, uri, body):
                            continue
                        elif (mpfs_meta_value or djfs_meta_value) and mpfs_meta_value != djfs_meta_value:
                            print_diff(meta_key, mpfs_meta_value, djfs_meta_value, offset, uri, body=body)
                            isOk = False
                    for meta_key in djfs_value:
                        if meta_key not in supported_meta:
                            continue
                        if meta_key not in mpfs_value:
                            if meta_key == 'file_id' and mpfs_path and is_root(mpfs_path):
                                continue
                            print 'meta', meta_key, 'only in djfs', djfs_value.get(meta_key)
                            print_diff(meta_key, None, djfs_value.get(meta_key), offset, uri, body=body)
                            isOk = False
                else:
                    if key == 'id' and mpfs_value[:-1] == djfs_value:
                        continue
                    print key
                    print json.dumps(mpfs_value, indent=2)
                    print json.dumps(djfs_value, indent=2)
                    print_diff('not meta : ' + key, mpfs_value, djfs_value, offset, uri, body=body)
                    isOk = False
        for key in djfs:
            if key not in mpfs and key != '__type':
                print key, 'only in djfs'
                print_diff('not meta : ' + key, None, djfs.get(key), offset, uri, body=body)
                isOk = False

        if not isOk:
            print 'file path: ' + djfs_path if djfs_path else 'None'
            break

    return isOk

def check_blockings(mpfs, djfs, offset, uri, body):
    """Compare the `blockings` meta dictionaries of the two backends.

    mpfs, djfs        -- dicts (either may be None/empty)
    offset, uri, body -- request context forwarded to print_diff()

    Only keys present in `mpfs` are inspected and a pair of falsy values counts
    as equal. For the 'simple' and 'folder' entries the nested values are compared
    as integers and only reported when the DJFS value exceeds the MPFS value by
    more than 5. Any other differing entry is reported as is.
    Returns True when nothing was reported.
    """
    isOk = True

    for key, mpfs_value in (mpfs or {}).items():
        djfs_value = djfs.get(key) if djfs else None
        if not mpfs_value and not djfs_value:
            continue
        if djfs_value == mpfs_value:
            continue

        # Was `key == 'simple' or 'folder'`, which is always true.
        if key in ('simple', 'folder') and djfs_value and mpfs_value:
            for internal_key in mpfs_value:
                mpfs_internal_value = mpfs_value.get(internal_key)
                djfs_internal_value = djfs_value.get(internal_key)
                if not mpfs_internal_value and not djfs_internal_value:
                    continue
                if mpfs_internal_value == djfs_internal_value:
                    continue
                # A counter missing on one side cannot go through int(); report it.
                # NOTE(review): the tolerance is one-sided (DJFS lower than MPFS
                # always passes) -- presumably deliberate, confirm.
                if (mpfs_internal_value is None or djfs_internal_value is None
                        or int(djfs_internal_value) - int(mpfs_internal_value) > 5):
                    isOk = False
                    print_diff('blockings ' + key, mpfs_value, djfs_value, offset, uri, body=body)
                    break
        else:
            print_diff('blockings ' + key, mpfs_value, djfs_value, offset, uri, body=body)
            isOk = False

    return isOk


def strip_uid(str):
    """Return `str` without its leading '<uid>:' prefix.

    Everything after the FIRST colon is returned, so a value that itself contains
    colons is kept whole (splitting on every colon used to keep only the second
    piece). A string without a colon is returned unchanged.
    """
    # The parameter keeps its historical name (it shadows the builtin) so that
    # existing keyword callers continue to work.
    _prefix, separator, remainder = str.partition(':')
    return remainder if separator else str


def remove_duplicates_in_file_ids(body):
    """Drop every entry of a JSON array whose uid-stripped value occurs more than once.

    `body` is the JSON text of the array; doubled backslashes are collapsed to a
    single one before parsing. All copies of a repeated value are removed, not
    just the extra ones. Returns the remaining entries re-encoded as JSON text.
    """
    unescaped = body.replace('\\\\', '\\')
    entries = json.loads(unescaped, encoding='UTF-8')
    occurrences = collections.Counter(strip_uid(entry) for entry in entries)
    unique_entries = [entry for entry in entries if occurrences[strip_uid(entry)] == 1]
    return json.dumps(unique_entries)

def is_root(path):
    """Return True when `path` is '/' or a single top-level folder ('/name' or '/name/').

    The folder name must consist of ASCII letters only. The pattern is anchored at
    the end of the string: the earlier unanchored, single-letter pattern matched
    every path that merely started with '/' followed by a letter.
    """
    return path == '/' or re.match(r'/[a-zA-Z]+/?\Z', path) is not None


def do_check(lines):
    offset = int(sys.argv[1])
    endpoint = sys.argv[2]
    continue_on_error = sys.argv.count('--continue') > 0
    start_time = time.time()
    start_time_index = offset
    for _id, line in enumerate(lines):
        if _id < offset:
            start_time = time.time()
            continue
        print _id

        body = None
        if '\t' in line:
            body, uri = line[:-1].split('\t')
        else:
            uri = line[:-1]

        if body:
            body = remove_duplicates_in_file_ids(body)

        mpfs_response = get_data(MPFS_HOST, uri, endpoint, body)
        djfs_response = get_data(DJFS_HOST, uri, endpoint, body)
        uid = urlparse.parse_qs(urlparse.urlparse(uri).query)['uid'][0] if 'uid' in urlparse.parse_qs(urlparse.urlparse(uri).query) else None

        if not mpfs_response and not djfs_response:
            continue

        if not mpfs_response or not djfs_response:
            print 'error in one of responses, see log'
            is_same = False
        else:
            try:
                mpfs_data = json.loads(mpfs_response.text)
                djfs_data = json.loads(djfs_response.text)

                is_same = same(mpfs_data, djfs_data, uid, _id, uri, endpoint, body)
            except Exception as e:
                print 'error', uri
                print e
                traceback.print_exc()
                is_same = False

        if (_id % 100 == 0) or not is_same:
            prev_time = start_time
            start_time = time.time()
            print str(_id - start_time_index) + ' records took ' + time.strftime("%H:%M:%S",
                                                                                 time.gmtime(
                                                                                     start_time - prev_time))
            start_time_index = _id

            with open('results.log', 'a') as f:
                f.write('%d %s\n' % (_id, is_same))
        if not is_same:
            print '-------------'
            print uri
            print
            if continue_on_error:
                with open('errors.log', 'a') as f:
                    f.write(line)
            else:
                return

    print 'done'
    _errors_log_file.close()


# Script entry point: request lines are read from standard input.
if __name__ == "__main__":
    do_check(sys.stdin)
