import json
import os

import yt.wrapper as yt
from sklearn.cluster import AgglomerativeClustering
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
import scipy.spatial.distance as ssd


def module_filter(module):
    """Predicate handed to the YT pickling config as its module filter.

    Returns False for any module whose ``__name__`` contains one of the
    excluded markers, for modules without a ``__file__`` and for modules
    whose file is a shared object (``.so``). Returns True otherwise.
    """
    excluded_markers = ('yt_yson_bindings', 'numpy', 'hashlib', 'hmac')

    name = getattr(module, '__name__', '')
    if any(marker in name for marker in excluded_markers):
        return False

    # A missing, empty or None __file__ is rejected outright.
    path = getattr(module, '__file__', '')
    return bool(path) and not path.endswith('.so')


# YT client configuration used by every table read in this script.
yt.config["proxy"]["url"] = "hahn.yt.yandex.net"
yt.config["memory_limit"] = 3000000000
# Pickling setup: auto-collect dynamic libraries and filter modules through
# module_filter (defined above).
yt.config['pickling']['dynamic_libraries']['enable_auto_collection'] = True
yt.config['pickling']['module_filter'] = module_filter
yt.config["tabular_data_format"] = yt.DsvFormat()

# Credentials must not live in source control: take the token from the
# YT_TOKEN environment variable. The placeholder is kept as the fallback so
# the behaviour is unchanged when the variable is not set.
yt.config["token"] = os.environ.get('YT_TOKEN', 'YOUR_TOKEN_HERE')
yt.config["pool"] = 'sherlock'

# Materialise the 'stacktrace' column of the source table as a list.
original_stacktraces = yt.read_table(
    '//home/mobilesearch/salavat/retrace/4.16.1/errors/rebucket/2019-03-12_uniq',
    enable_read_parallel=True,
)
original_stacktraces_list = [row['stacktrace'] for row in original_stacktraces]

number_of_stacktraces = len(original_stacktraces_list)

# Square matrix initialised to 1.0 everywhere with a zero diagonal.
matrix_shape = (number_of_stacktraces, number_of_stacktraces)
distances = np.ones(matrix_shape, order='C')
np.fill_diagonal(distances, 0.0)


# Load pairwise distances from a tab-separated file whose rows are
# "distance<TAB>i<TAB>j" (first line is a header) and mirror each value so the
# matrix stays symmetric. Only the parsing needs the file handle, so the
# `with` block ends as soon as the matrix is filled.
with open('2019_03_12_distances_0.4_0.8.txt', 'r') as distances_file:
    next(distances_file, None)  # skip the header line
    for line in distances_file:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        fields = line.split('\t')
        distance = float(fields[0])
        row_idx = int(fields[1])
        col_idx = int(fields[2])
        distances[row_idx, col_idx] = distance
        distances[col_idx, row_idx] = distance

# squareform() converts the symmetric matrix to the condensed vector that
# linkage() takes as input.
distArray = ssd.squareform(distances)
# Stored under its own name: rebinding `linkage` would shadow the imported
# scipy function for the rest of the module.
linkage_matrix = linkage(distArray, method='complete')
print(linkage_matrix)

t = 0.9
# NOTE(review): fcluster's default criterion is 'inconsistent', so `t` is an
# inconsistency-coefficient threshold here, not a distance cut-off. Confirm
# whether criterion='distance' was intended.
clusters = fcluster(Z=linkage_matrix, t=t).astype(int)

# Count members per cluster id directly. Passing the unique ids to
# np.histogram as bin edges yields one bin fewer than there are clusters and
# merges the two highest ids into the closed last bin.
cluster_ids, cluster_sizes = np.unique(clusters, return_counts=True)
number_of_clusters = len(cluster_ids)
print('for t = {} number of uniq clusters = {}'.format(t * 100.0, number_of_clusters))

dirname = '0.4_0.8_{}'.format(t*100)
if not os.path.exists(dirname):
    os.mkdir(dirname)

# Clusters with more than 10 members, keyed by the real cluster id so the
# entries line up with the cluster_<id>.txt files, sorted by size ascending.
valuable_clusters = [
    (int(cluster_id), int(size))
    for cluster_id, size in zip(cluster_ids, cluster_sizes)
    if size > 10
]
valuable_clusters.sort(key=lambda pair: pair[1])

np.savetxt(os.path.join(dirname, '0.4_0.8_{}_histo.txt'.format(t*100)), cluster_sizes.astype(int), fmt='%i')
np.savetxt(os.path.join(dirname, '0.4_0.8_{}_histo_bins.txt'.format(t*100)), cluster_ids.astype(int), fmt='%i')
with open(os.path.join(dirname, '0.4_0.8_{}_big_clusters.txt'.format(t*100)), 'w') as valuable_histo_file:
    valuable_histo_file.write('\n'.join('{}:{}'.format(cluster_id, size) for cluster_id, size in valuable_clusters))
np.savetxt(os.path.join(dirname, '0.4_0.8_{}.txt'.format(t*100)), clusters, fmt='%i')

# Group stacktrace indices by cluster first, then write every cluster file
# exactly once. Opening in 'w' (instead of appending per stacktrace) keeps a
# re-run from duplicating the contents of files left by a previous run.
members_by_cluster = {}
for index, cluster_id in enumerate(clusters):
    members_by_cluster.setdefault(int(cluster_id), []).append(index)

for cluster_id, member_indices in members_by_cluster.items():
    with open(os.path.join(dirname, 'cluster_{}.txt'.format(cluster_id)), 'w') as cur_cluster_file:
        for index in member_indices:
            cur_cluster_file.write(original_stacktraces_list[index])
            cur_cluster_file.write('\n===================================================\n')
print(clusters)

# Disabled alternative: fill `distances` from a YT table instead of the local file.
# with yt.Transaction(timeout=10800):
#     table = yt.read_table(
#         table=yt.TablePath(name='//home/mobilesearch/salavat/retrace/4.16.1/errors/rebucket/2019-03-12_distances_0.4_0.2', columns=['i', 'j', 'distance']),
#         enable_read_parallel=True)
#     row_num = 0
#     for row in table:
#         row_num += 1
#         i = int(row['i'])
#         j = int(row['j'])
#         distance = row['distance']
#         distances[i, j] = float(distance)
#         if row_num % 1000000 == 0:
#             print('read {} rows'.format(row_num))
#     print('distances were read.')
#
#     np.savetxt('0.4_0.2.txt')

