#!/usr/bin/env python3

from ast import arg
from tractor.util.common import extract_login
from tractor_disk.google_drive import GoogleBuildPathMappingOp
from tractor_disk.source_drive import get_path_mapping_op_by_source, get_file_by_source, SourceDrive
from tractor_disk.common import LOST_AND_FOUND_DIR, SHARED_WITH_ME_DIR, NULL_STR

from collections import defaultdict
from tractor.yandex_services.directory import Directory
from tractor.secrets import Secrets
import sys
import json
import argparse
import numpy
from humanize import naturalsize

# Column names, in output order, for the CSV report printed by main().
# Each name is a key of the dict built by make_file_info().
SCHEMA = [
    "orgid",
    "domain",
    "user_login",
    "yandex_puid",
    "file_id",
    "file_path",
    "file_size",
    "file_mime_type",
    "md5_checksum",
]


def main():
    """Build one report row per file of every user and print the requested view.

    The org data file is loaded, each user's files get a resolved path, and
    every (optionally owner-filtered) file becomes a row. Depending on the
    flags the script then prints either the duplicates, the multiple-parent
    files, or the full report (CSV or JSON) followed by an optional summary.
    """
    args = parse_args()
    secrets = Secrets()
    org_data = load_org_data(args.org_data)
    user_names = list(org_data)
    print(len(user_names), user_names, file=sys.stderr)

    directory = Directory(secrets.directory_secret(), args.org_id)
    puid_by_login = build_yandex_puid_mapping(directory.get_users())
    mapping_op_cls = get_path_mapping_op_by_source(args.source)
    wrap_file = get_file_by_source(args.source)

    report = []
    multi_parent_ids = set()
    for user, user_data in org_data.items():
        path_by_file_id, user_multi_parent_ids = build_path_mapping(mapping_op_cls, user_data)
        multi_parent_ids.update(user_multi_parent_ids)
        for file in user_data["files"]:
            # With --only-owned, files the user does not own are left out.
            if not args.only_owned or file["ownedByMe"]:
                file["path"] = path_by_file_id[file["id"]]
                row = make_file_info(
                    args.org_id, args.domain, user, puid_by_login, wrap_file(file)
                )
                report.append(row)

    # The two diagnostic modes replace the regular report entirely.
    if args.print_duplicates:
        print_duplicates(report)
    elif args.print_multiple_parent:
        print_multiple_parent_files(org_data, multi_parent_ids)
    else:
        if args.csv:
            print_csv_by_schema(report, SCHEMA)
        else:
            print_json(report)
        if args.summary:
            print_summary(report, multi_parent_ids)


def make_file_info(orgid, domain, user_email, yandex_puid_mapping, file):
    """Build one report row (a dict keyed like SCHEMA) for a single file.

    Args:
        orgid: organisation id, copied into the row as-is.
        domain: organisation domain, copied into the row as-is.
        user_email: e-mail of the user the file was listed for; its login part
            (via extract_login) is stored and used to look up the Yandex puid.
        yandex_puid_mapping: dict mapping login -> Yandex uid.
        file: wrapped file object exposing id(), path(), size(), mime_type()
            and md5_checksum() methods.

    Returns:
        dict with the keys orgid, domain, user_login, yandex_puid, file_id,
        file_path, file_size, file_mime_type and md5_checksum. yandex_puid is
        NULL_STR when the login is not present in the mapping.
    """
    # Extract the login once; it is both a column value and the lookup key.
    login = extract_login(user_email)
    return {
        "orgid": orgid,
        "domain": domain,
        "user_login": login,
        "yandex_puid": yandex_puid_mapping.get(login, NULL_STR),
        "file_id": file.id(),
        "file_path": file.path(),
        "file_size": file.size(),
        "file_mime_type": file.mime_type(),
        "md5_checksum": file.md5_checksum(),
    }


def load_org_data(file_path):
    """Load the organisation dump from a JSON file.

    Args:
        file_path: path to the JSON file.

    Returns:
        The decoded JSON document (main() expects a dict keyed by user).

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    # Explicit UTF-8: without it the decoding depends on the platform locale
    # and non-ASCII file names can fail or be mangled.
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)


def build_path_mapping(path_mapping_op_cls, user_data):
    """Run the path-mapping operation over one user's files.

    Every file dict in user_data["files"] is first tagged in place with the
    user's "root_folder_id". The operation class is then instantiated with the
    file list and called with no arguments; its result is returned unchanged
    (main() unpacks it as a (path mapping, multiple-parent files) pair).
    """
    files = user_data["files"]
    for entry in files:
        entry["root_folder_id"] = user_data["root_folder_id"]
    return path_mapping_op_cls(files)()


def print_duplicates(items):
    """Write the per-user duplicated file paths to stderr as indented JSON."""
    report = json.dumps(collect_duplicated_file_paths(items), indent=2)
    print(report, file=sys.stderr)


def print_multiple_parent_files(org_data, multiple_parent_files):
    """Print, per user, every file whose id is in multiple_parent_files.

    Each match is written to stdout as "<user> <file dict>".
    """
    for owner, payload in org_data.items():
        flagged = (entry for entry in payload["files"] if entry["id"] in multiple_parent_files)
        for entry in flagged:
            print(owner, entry)


def print_summary(items, multiple_parent_files):
    """Print aggregate statistics about the report rows to stderr.

    Args:
        items: report rows as built by make_file_info().
        multiple_parent_files: collection of ids of files with several parents;
            only its length is reported.

    The counts are always printed. The file-size percentiles, maximum and
    total are printed only when at least one row has a known size; otherwise
    a single "n/a" line is printed instead.
    """
    print("users: %d" % len(collect_user_logins(items)), file=sys.stderr)
    print("yandex users: %d" % len(collect_yandex_puids(items)), file=sys.stderr)
    print("files: %d" % len(collect_file_ids(items)), file=sys.stderr)
    shared_with_me_files = collect_shared_with_me(items)
    print("shared with me files: %d" % len(shared_with_me_files), file=sys.stderr)
    lost_and_found_files = collect_lost_and_found(items)
    print("lost+found files: %d" % len(lost_and_found_files), file=sys.stderr)
    duplicates = collect_duplicated_file_paths(items)
    # Built-in sum keeps the count an int; numpy.sum([]) would print "0.0".
    duplicated_count = sum(len(user_duplicates) for user_duplicates in duplicates.values())
    print("duplicated: %s" % duplicated_count, file=sys.stderr)
    file_size_list = collect_file_size(items)
    print("files with multiple parents: %d" % len(multiple_parent_files), file=sys.stderr)
    if not file_size_list:
        # numpy.max raises on an empty sequence, so the size statistics
        # cannot be computed when no file has a known size.
        print("file size stats: n/a (no files with a known size)", file=sys.stderr)
        return
    print("file size p50: %s" % naturalsize(numpy.quantile(file_size_list, 0.50)), file=sys.stderr)
    print("file size p90: %s" % naturalsize(numpy.quantile(file_size_list, 0.90)), file=sys.stderr)
    print("file size p99: %s" % naturalsize(numpy.quantile(file_size_list, 0.99)), file=sys.stderr)
    print("file size max: %s" % naturalsize(numpy.max(file_size_list)), file=sys.stderr)
    print("file size sum total: %s" % naturalsize(numpy.sum(file_size_list)), file=sys.stderr)


def collect_user_logins(items):
    """Return the set of distinct non-null "user_login" values in items."""
    logins = collect_not_null_values(items, "user_login")
    return set(logins)


def collect_yandex_puids(items):
    """Return the set of distinct non-null "yandex_puid" values in items."""
    puids = collect_not_null_values(items, "yandex_puid")
    return set(puids)


def collect_file_ids(items):
    """Return the non-null "file_id" values of items as a list.

    The result is not de-duplicated: an id that appears in several rows is
    listed once per row.
    """
    file_ids = collect_not_null_values(items, "file_id")
    return file_ids


def collect_shared_with_me(items):
    """Return the rows whose "file_path" lies under SHARED_WITH_ME_DIR."""
    prefix = "{}/".format(SHARED_WITH_ME_DIR)
    return [row for row in items if row["file_path"].startswith(prefix)]


def collect_lost_and_found(items):
    """Return the rows whose "file_path" lies under LOST_AND_FOUND_DIR."""
    prefix = "{}/".format(LOST_AND_FOUND_DIR)
    return [row for row in items if row["file_path"].startswith(prefix)]


def collect_file_size(items):
    """Return the non-null "file_size" values of items converted to int."""
    known_sizes = collect_not_null_values(items, "file_size")
    return list(map(int, known_sizes))


def collect_not_null_values(items, key):
    """Return item[key] for every item, dropping values equal to NULL_STR.

    Order is preserved. Every item must contain key (a missing key raises
    KeyError).
    """
    raw_values = [entry[key] for entry in items]
    return [value for value in raw_values if value != NULL_STR]


def collect_duplicated_file_paths(items):
    """Find, per user, paths that occur again with a different checksum.

    Rows are scanned in order. The first checksum seen for a (user, path) pair
    is remembered; a later row with the same user and path but a different
    checksum adds the path to that user's list. A repeat with the same
    checksum is not reported.

    Returns:
        defaultdict mapping user_login -> list of conflicting paths. Only
        users with at least one conflict have an entry.
    """
    conflicts = defaultdict(list)
    seen = defaultdict(dict)
    for entry in items:
        login = entry["user_login"]
        file_path = entry["file_path"]
        digest = entry["md5_checksum"]
        known = seen[login]
        if file_path not in known or known[file_path] == digest:
            known[file_path] = digest
            continue
        conflicts[login].append(file_path)
    return conflicts


def print_json(items):
    """Print items to stdout as a single-line JSON document."""
    payload = json.dumps(items)
    print(payload)


def print_csv_by_schema(items, schema):
    """Print items to stdout as CSV: a header row, then one row per item.

    Args:
        items: iterable of dict-like rows.
        schema: sequence of column names; it is the header and also selects
            and orders the values of each row.

    Fields containing a comma, a double quote or a line break are quoted by
    the csv module, so such values (file paths, for example) no longer break
    the column layout. Rows without special characters are printed exactly as
    a plain comma join would print them.
    """
    import csv  # local import keeps this block self-contained

    # lineterminator="\n" matches the line endings print() produced before.
    writer = csv.writer(sys.stdout, lineterminator="\n")
    writer.writerow(schema)
    for item in items:
        writer.writerow(get_str_values_by_schema(item, schema))


def get_str_values_by_schema(item, schema):
    """Return item's values for the schema fields, in schema order, as strings.

    A field missing from item is rendered as str(NULL_STR).
    """
    return [str(item.get(field, NULL_STR)) for field in schema]


def build_yandex_puid_mapping(yandex_users):
    """Map the login extracted from each user's "email" to that user's "uid".

    When several users yield the same login, the last one wins.
    """
    return {extract_login(account["email"]): account["uid"] for account in yandex_users}


def email(user, domain):
    """Return the address "<user>@<domain>"."""
    return "@".join((user, domain))


def int_or_zero(value):
    """Return int(value), or 0 when value equals NULL_STR."""
    if value == NULL_STR:
        return 0
    return int(value)


def parse_args():
    """Parse the command-line options of the report script.

    Returns:
        argparse.Namespace with the attributes org_id, domain, source,
        org_data, csv, summary, print_duplicates, print_multiple_parent and
        only_owned.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--org-id", help="org id")
    parser.add_argument("--domain", help="org domain")
    parser.add_argument(
        "--source",
        help="source drive",
        choices=list(SourceDrive),
        type=SourceDrive.from_string,
        required=True,
    )
    # The org data file is always opened by main(); requiring it here gives a
    # usage error instead of a TypeError from open(None).
    parser.add_argument("--org-data", help="path to file with org data", required=True)
    parser.add_argument("--csv", help="print CSV report", action="store_true")
    parser.add_argument("--summary", help="print users and files summary", action="store_true")
    parser.add_argument(
        "--print-duplicates",
        help="print duplicated file paths for each user",
        action="store_true",
    )
    parser.add_argument(
        "--print-multiple-parent",
        help="print files that have multiple parents",
        action="store_true",
    )
    parser.add_argument(
        "--only-owned", help="report only files owned by the user", action="store_true"
    )
    return parser.parse_args()


# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
