#!/usr/bin/python
# -*- coding: utf-8 -*-

import os
import sys
# The script refuses to run for any user other than root (uid 0).
if os.getuid() != 0:
    # sys.exit() with a string writes it to stderr and exits with status 1,
    # keeping the diagnostic out of stdout.
    sys.exit("you must be root to run this script")

import re
import traceback
import cPickle as pickle

import mpfs.engine.process
from mpfs.common.util import filetypes
# Call the mpfs admin-script setup hook and fetch the default logger that the
# rest of this file uses as `log`.
# NOTE(review): these statements sit between two groups of imports;
# presumably the mpfs.metastorage import further down relies on
# setup_admin_script() having run first -- confirm before reordering.
mpfs.engine.process.setup_admin_script()
log = mpfs.engine.process.get_default_log()

import pymongo
import datetime
from collections import defaultdict

from mpfs.metastorage.mongo.projects.disk import db, id_for_key, decompress_data
from mpfs.common.util import format_dict_table


# Page size used when reading a user's links from db.link_data.
LIMIT = 1000
# Matches link targets of the form "<digits>:/disk/...".  Written as a raw
# string so that "\d" is a regex escape and not a string escape.
RE_DISK = re.compile(r'^\d+:/disk/.*')
# Output files: pickled raw aggregates and the formatted text report.
# NOTE(review): both are fixed, predictable paths under /tmp written by a
# root-only script -- consider a private directory if other users have
# access to this host.
dump_file_name = '/tmp/public_stat_dump.out'
result_file_name = '/tmp/public_stat_result.out'

def iter_uids(uids_file_name=None):
    """Yield the uids listed in a text file, one per line.

    uids_file_name -- path of the file to read; when omitted the first
                      command-line argument (sys.argv[1]) is used.

    Every line is stripped of surrounding whitespace.  Blank lines are
    skipped: they neither end the iteration early nor produce an empty uid,
    and no empty string is yielded at end of file.

    Raises IndexError when no path is given and the script was started
    without arguments; errors from open() propagate unchanged.
    """
    if uids_file_name is None:
        uids_file_name = sys.argv[1]
    with open(uids_file_name, 'r') as uids_file:
        for raw_line in uids_file:
            uid = raw_line.strip()
            if uid:
                yield uid
def link_iterator():
    """Yield the public file links of every uid produced by iter_uids().

    For each uid the matching db.link_data documents are read in pages of
    LIMIT documents (fields 'uid', 'type' and 'data').  Only documents whose
    'type' is 'file' and whose data['tgt'] matches RE_DISK are yielded.

    The total number of documents in db.link_data is logged once before the
    iteration starts.
    """
    total_number = db.link_data.count()
    log.info('Total number of links: %s' % total_number)
    for uid in iter_uids():
        spec = {
                'uid' : uid
                }
        links_per_user = db.link_data.find(spec).count()
        links_processed = 0
        while links_processed < links_per_user:
            page_size = 0
            for link in db.link_data.find(spec, skip=links_processed, limit=LIMIT, fields=('uid', 'type', 'data',)):
                page_size += 1
                links_processed += 1
                if link['type'] == 'file' and RE_DISK.match(link['data']['tgt']):
                    yield link
            if not page_size:
                # The collection shrank after the count was taken: an empty
                # page would never advance links_processed, so stop here
                # instead of looping forever.
                break


def iterate_by_date(data):
    """Yield the values of *data* ordered by ascending key.

    data -- mapping whose values are dicts (it must provide iteritems()).

    Each yielded value is the very dict stored in *data*; before it is
    yielded it is updated in place with a 'date' entry holding its key.
    """
    ordered = list(data.iteritems())
    ordered.sort(key=lambda pair: pair[0])
    for day, row in ordered:
        row['date'] = day
        yield row


def iterate_by_number(data, total_folders):
    """Yield one table row per entry of *data*, largest folder count first.

    data          -- mapping of element count -> number of folders
                     (it must provide iteritems()).
    total_folders -- divisor used to express each folder count as a
                     percentage.

    Every row is a dict with the keys 'elements', 'folders' and '%'; the
    percentage is a string formatted with one decimal place.
    """
    def folder_count(pair):
        return pair[1]

    for elements, folders in sorted(data.iteritems(), key=folder_count, reverse=True):
        percent = float(folders * 100) / total_folders
        yield {'elements': elements, 'folders': folders, '%': '%.1f' % percent}

def iterate_by_type(data, total_folders):
    """Yield one table row per entry of *data*, largest folder count first.

    data          -- mapping of content type name -> number of folders
                     (it must provide iteritems()).
    total_folders -- divisor used to express each folder count as a
                     percentage.

    Every row is a dict with the keys 'type', 'folders' and '%'; the
    percentage is a string formatted with one decimal place.
    """
    ranked = list(data.iteritems())
    ranked.sort(key=lambda item: item[1], reverse=True)
    for type_name, count in ranked:
        row = {'type': type_name, 'folders': count}
        row['%'] = '%.1f' % (float(count * 100) / total_folders)
        yield row

def get_media_type(resource):
    """Return the media type label for a resource document.

    resource -- dict with a 'type' entry; for anything other than a
                directory it must also have 'key' and may have a 'data'
                dict carrying a 'mimetype'.

    Directories (type 'dir') are reported as 'folder'.  For every other
    resource the result of filetypes.getGroupByName(key, mimetype) is
    returned, with mimetype being None when the resource has none.
    """
    if resource['type'] != 'dir':
        name = resource['key']
        mimetype = resource.get('data', {}).get('mimetype')
        return filetypes.getGroupByName(name, mimetype)
    return 'folder'


def process():
    """Collect statistics about public links and write them to disk.

    For every link produced by link_iterator() the linked resource is looked
    up in db.user_data and counted per link creation day and media type.
    When the resource is a folder its direct children (documents whose
    'parent' is the folder id) are inspected as well, feeding two
    histograms: folders by number of children and folders by dominant
    content type.

    The raw aggregates are pickled to dump_file_name and a formatted report
    is written to result_file_name.  Links whose resource is missing or
    whose creation time cannot be read are skipped; any other error while
    handling a link is logged and processing continues with the next link.
    """
    total_files = 0
    total_folders = 0
    pictures_by_day = defaultdict(lambda: defaultdict(int))
    folders_by_file_number = defaultdict(int)
    folders_by_file_type = defaultdict(int)
    media_types = set()
    for link in link_iterator():
        try:
            tgt = link['data']['tgt']
            uid = link['uid']
            log.info('Processing link uid: %s tgt: %s' % (uid, tgt))
            _id = id_for_key(tgt)
            spec = {
                    '_id' : _id,
                    'uid' : uid,
                    }
            resource = db.user_data.find_one(spec, fields=('key', 'type', 'data',))
            if not resource:
                continue
            try:
                day = datetime.datetime.fromtimestamp(link['data']['ctime']).strftime("%Y-%m-%d")
            except (KeyError, TypeError):
                # No usable creation time on the link: leave it out.
                continue
            media_type = get_media_type(resource)
            if resource['type'] == 'file':
                total_files += 1
            elif resource['type'] == 'dir':
                total_folders += 1
                spec = {
                        'parent' : _id,
                        'uid' : uid,
                        }
                folders = 0
                files = 0
                types = defaultdict(int)
                for each in db.user_data.find(spec, fields=('key', 'type', 'data',)):
                    child_type = get_media_type(each)
                    if child_type == 'folder':
                        folders += 1
                    else:
                        types[child_type] += 1
                        files += 1
                folders_by_file_number[files + folders] += 1
                if files:
                    # A folder is attributed to its most frequent file type
                    # only when that type covers at least 80% of all direct
                    # children (sub-folders included).
                    max_type, max_number = max(types.iteritems(), key=lambda x: x[1])
                    if max_number >= (files + folders) * 0.8:
                        folders_by_file_type[max_type] += 1
                if folders:
                    folders_by_file_type['with folders'] += 1
            media_types.add(media_type)
            pictures_by_day[day][media_type] += 1
        except Exception as e:
            # Best effort: one broken link must not abort the whole run.
            log.error(traceback.format_exc())
            log.error(e)

    # Report columns: 'date' first, then 'folder' and 'image' (always
    # present), then every other media type seen, in alphabetical order.
    column_names = ['date', ]
    for each in ['folder', 'image']:
        media_types.discard(each)
        column_names.append(each)
    column_names.extend(sorted(media_types))

    # Serialize the raw aggregates.  dict() drops the default factories of
    # the outer defaultdicts; the lambda factory of pictures_by_day could
    # not be pickled otherwise.
    dump_object = {
                   'pictures_by_day' : dict(pictures_by_day),
                   'folders_by_file_number' : dict(folders_by_file_number),
                   'folders_by_file_type' : dict(folders_by_file_type),
                   'total_folders' : total_folders,
                   'column_names' : column_names,
                   }
    with open(dump_file_name, 'w') as dump_file:
        pickle.dump(dump_object, dump_file)

    # To rebuild the report without scanning the database again, load
    # dump_object back from dump_file_name with pickle.load() and rebind
    # pictures_by_day, folders_by_file_number, folders_by_file_type,
    # total_folders and column_names from it before calling print_results().

    def print_results():
        """Write the three statistics tables and the totals to result_file_name."""
        with open(result_file_name, 'w') as result_file:
            result_file.write('\nPublic resources by days\n')
            by_days = format_dict_table(list(iterate_by_date(pictures_by_day)), column_names=column_names)
            result_file.write(by_days)

            result_file.write('\nPublic folders by the number of elements\n')
            by_file_number = format_dict_table(list(iterate_by_number(folders_by_file_number, total_folders)), column_names=('folders', 'elements', '%'))
            result_file.write(by_file_number)

            result_file.write('\nPublic folders by content types\n')
            by_file_type = format_dict_table(list(iterate_by_type(folders_by_file_type, total_folders)), column_names=('folders', 'type', '%'))
            result_file.write(by_file_type)

            result_file.write('\nnumber of public files %s\n' % total_files)
            result_file.write('number of public folders %s\n' % total_folders)

    print_results()

if __name__ == '__main__':
    # Start the statistics run only when this file is executed as a script,
    # not when it is imported.
    process()
