import sys
import codecs



__author__ = 'cansucullu'

# Numeric score attached to each graded mark label, from 'HIGHEST' (1.0)
# down to 'LOWEST' (0.0) in steps of 0.25.
# '_404' deliberately has no score here: remove_duplicates() leaves it out
# of the average instead of counting it as 0.
mark_dict = {
    'HIGHEST': 1.00,
    'HIGH': 0.75,
    'MIDDLE': 0.50,
    'LOW': 0.25,
    'LOWEST': 0.00,
}


def main(filename, tw_filename='new-tw-raw.txt'):
    """Attach every known mark to each URL listed in ``filename``.

    ``tw_filename`` holds tab-separated ``url<TAB>mark`` lines (UTF-8).
    For every line of ``filename`` (one URL per line, UTF-8) a line
    ``url<TAB>mark1<TAB>mark2...`` is written to
    ``filename + '-marks-every'``, with the marks in the order they appear
    in ``tw_filename``. A URL without any mark is written on its own and
    reported on stdout. A running count is printed every 1000 URLs.

    Args:
        filename: path of the URL list; also the prefix of the output file.
        tw_filename: path of the mark file. Defaults to the previously
            hard-coded ``'new-tw-raw.txt'`` in the working directory.
    """
    # Index the marks once so each URL is a dictionary lookup instead of a
    # full re-scan (and re-split) of the mark file.
    marks_by_url = {}
    with codecs.open(tw_filename, 'r', 'utf-8') as f_tw:
        for tw_line in f_tw:
            items = tw_line.rstrip().split('\t')
            if len(items) < 2:
                # Blank line or line without a mark column: nothing to
                # attach (indexing items[1] here used to raise IndexError).
                continue
            marks_by_url.setdefault(items[0], []).append(items[1])

    with codecs.open(filename, 'r', 'utf-8') as f_in, \
            codecs.open(filename + '-marks-every', 'w+', 'utf-8') as f_out:
        for count, line in enumerate(f_in, 1):
            url = line.rstrip()
            marks = marks_by_url.get(url, [])

            if not marks:
                # Single parenthesised argument: valid in Python 2 and 3.
                print("no mark exists for  " + url)

            f_out.write('\t'.join([url] + marks) + '\n')

            if count % 1000 == 0:
                print(count)


def _final_mark(items, marks):
    """Reduce the marks collected for one URL to a single label.

    ``items`` is the list of mark strings of one URL; ``marks`` maps the
    graded labels to their numeric score.
    """
    # A lone mark is kept verbatim, whatever it is.
    if len(items) == 1:
        return items[0]

    # '_404' and 'not-judged' carry no score and are left out of the
    # average. Any other unknown label still raises KeyError, as before.
    graded = [marks[item] for item in items
              if item not in ('_404', 'not-judged')]

    if not graded:
        # NOTE(review): with no graded mark the result is 'not-judged' only
        # when every mark is 'not-judged'; otherwise (no marks at all, or
        # at least one '_404') it stays '_404' -- confirm this policy.
        if items and all(item == 'not-judged' for item in items):
            return 'not-judged'
        return '_404'

    average = sum(graded) / float(len(graded))
    # Closest score wins; an exact tie goes to the higher score so the
    # result does not depend on dictionary iteration order.
    return min(marks,
               key=lambda label: (abs(marks[label] - average), -marks[label]))


def remove_duplicates(filename, marks=None):
    """Collapse the marks of every URL into one final mark.

    Reads ``filename + '-marks-every'`` (``url<TAB>mark<TAB>mark...`` per
    line, UTF-8) and writes ``url<TAB>final_mark`` lines to
    ``filename + '-marks-single'``.

    * Exactly one mark: it is copied as is.
    * Several marks: the graded ones are averaged and the label whose score
      is closest to the average is chosen (ties go to the higher score).
      '_404' and 'not-judged' are ignored while averaging.
    * No graded mark (including a URL with no mark at all): '_404', or
      'not-judged' when every mark is 'not-judged'.

    A running count is written to stdout every 500 lines.

    Args:
        filename: prefix shared by the input and output file names.
        marks: optional mapping of label -> numeric score; defaults to the
            module-level ``mark_dict``.

    Raises:
        KeyError: if a line with several marks contains a label that is
            neither in ``marks`` nor one of '_404' / 'not-judged'.
    """
    if marks is None:
        marks = mark_dict

    count = 0
    # Same UTF-8 handling as main(), which produces the input file.
    with codecs.open(filename + '-marks-every', 'r', 'utf-8') as f_in, \
            codecs.open(filename + '-marks-single', 'w+', 'utf-8') as f_out:
        for line in f_in:
            fields = line.strip().split('\t')
            url = fields[0]
            f_out.write(url + '\t' + _final_mark(fields[1:], marks) + '\n')

            count += 1
            if count % 500 == 0:
                # Progress stays on one line; works on Python 2 and 3.
                sys.stdout.write('%d ' % count)

    if count >= 500:
        # Terminate the progress line.
        sys.stdout.write('\n')


if __name__ == '__main__':
    # Usage: <script> <filename>
    # <filename> is the prefix of the '-marks-every' / '-marks-single' files.
    if len(sys.argv) < 2:
        sys.stderr.write('usage: %s <filename>\n' % sys.argv[0])
        sys.exit(2)
    filename = sys.argv[1]

    # The collection step is currently disabled, so its banner is disabled
    # with it; re-enable both to rebuild '<filename>-marks-every'.
    # print("Collect Marks")
    # main(filename)

    print("Remove Duplicates")
    remove_duplicates(filename)
