#!/usr/bin/python
# -*- coding: utf-8 -*-

# Print a table for analysis.

import yt.wrapper as yt
import sys
import re

def _print_group(title, url, counts):
    """Print one URL's report: title, URL, then tab-indented count rows.

    ``counts`` maps position -> {query: number of occurrences}. Rows are
    emitted ordered by position and then by query.
    """
    # Every print takes a single string, so the call form behaves the same
    # under Python 2 (print statement) and Python 3 (print function).
    print(title)
    print(url)
    for position in sorted(counts):
        by_query = counts[position]
        for query in sorted(by_query):
            print('\t' + str(position) + '\t' + query + '\t' + str(by_query[query]))


def main(domain=None, records=None):
    """Print, for each URL of one domain, how often each query hit each position.

    Records are filtered to those whose URL parses to ``domain`` and are then
    grouped by runs of consecutive records with the same ``url``. For every
    run the stripped title of its first record and the URL are printed,
    followed by one ``\\t<position>\\t<query>\\t<count>`` row per distinct
    (position, query) pair.

    Args:
        domain: Domain to report on, compared against ``<host>.<tld>`` as
            extracted by the URL regex. Defaults to ``sys.argv[1]``.
        records: Iterable of mappings with the keys ``url``, ``title``,
            ``position`` and ``query``. Defaults to the rows of the YT table
            ``//home/catalogia/yuryz/queries_left_join``.
    """
    if domain is None:
        if len(sys.argv) < 2:
            # Fail up front instead of raising IndexError mid-scan.
            sys.exit('usage: %s DOMAIN' % sys.argv[0])
        domain = sys.argv[1]

    if records is None:
        table_path = '//home/catalogia/yuryz/queries_left_join'
        records = yt.read_table(table_path, raw=False)

    pat = r'(https?:\/\/)?([\w\.-]+)\.([a-z]{2,6}\.?)(\/[\w \.-]*)*\/?'
    pat_re = re.compile(pat)

    title = None
    url = None
    counts = None  # None until the first matching record has been seen

    for rec in records:
        url_re = pat_re.match(rec['url'])
        if url_re is None:
            continue
        if url_re.group(2) + '.' + url_re.group(3) != domain:
            continue

        if counts is None or rec['url'] != url:
            # A new run of URLs starts here: flush the finished one, if any.
            if counts is not None:
                _print_group(title, url, counts)
            title = rec['title'].strip()
            url = rec['url']
            counts = {}

        by_query = counts.setdefault(rec['position'], {})
        by_query[rec['query']] = by_query.get(rec['query'], 0) + 1

    # Flush the final run; without this the last URL would never be printed.
    if counts is not None:
        _print_group(title, url, counts)


# Run the report only when this file is executed as a script.
if __name__ == "__main__":
    main()
